From 56d57d5f860b09289c4ea63bcfa64da9001060eb Mon Sep 17 00:00:00 2001
From: Weiqun Zhang
Date: Tue, 14 Jul 2020 12:08:34 -0400
Subject: [PATCH 1/3] More on Gpu kernel fusing

* Add a Gpu::KernelInfo argument to ParallelFor to allow the user to indicate whether the kernel is a candidate for fusing.
* For MFIter, if the local size is less than or equal to 3, the fuse region is turned on and small kernels marked fusable will be fused.
* Add launch macros for fusing.
* Add fusing to a number of functions used by linear solvers. Note that many more amrex functions still need to be updated for fusing.
* Optimize reduction for the bottom solve.
* Consolidate memcpy in communication functions.
* Add an option to use device memory in communication kernels for packing and unpacking buffers. It is currently turned off because it did not improve performance in testing; in fact, it was worse than using pinned memory. This might change in the future, so the option is kept.

---
 Src/Base/AMReX.cpp | 2 +-
 Src/Base/AMReX_FBI.H | 192 ++++++----
 Src/Base/AMReX_FabArray.H | 24 +-
 Src/Base/AMReX_FabArrayBase.cpp | 2 +-
 Src/Base/AMReX_FabArrayCommI.H | 10 +-
 Src/Base/AMReX_FabArrayUtility.H | 18 +-
 Src/Base/AMReX_Gpu.H | 1 +
 Src/Base/AMReX_GpuFuse.H | 69 +++-
 Src/Base/AMReX_GpuFuse.cpp | 54 ++-
 Src/Base/AMReX_GpuKernelInfo.H | 18 +
 Src/Base/AMReX_GpuLaunch.H | 32 ++
 Src/Base/AMReX_GpuLaunchFunctsC.H | 257 +++++++++++--
 Src/Base/AMReX_GpuLaunchFunctsG.H | 352 +++++++++++++++---
 Src/Base/AMReX_GpuLaunchMacrosG.H | 165 ++++++++
 Src/Base/AMReX_MFIter.H | 4 +
 Src/Base/AMReX_MFIter.cpp | 10 +-
 Src/Base/AMReX_MultiFab.cpp | 46 ++-
 Src/Base/AMReX_MultiFabUtil.cpp | 34 +-
 Src/Base/AMReX_Reduce.H | 17 +
 Src/Base/CMakeLists.txt | 1 +
 Src/Base/Make.package | 1 +
 .../MLMG/AMReX_MLABecLaplacian.cpp | 22 +-
 Src/LinearSolvers/MLMG/AMReX_MLCellLinOp.cpp | 11 +-
 Src/LinearSolvers/MLMG/AMReX_MLPoisson.cpp | 14 +-
 24 files changed, 1115 insertions(+), 241 deletions(-)
 create mode 100644 Src/Base/AMReX_GpuKernelInfo.H

diff --git a/Src/Base/AMReX.cpp index ffc0c57e372..6ba27f84b1b 100644 --- a/Src/Base/AMReX.cpp +++ b/Src/Base/AMReX.cpp @@ -522,7 +522,7 @@ amrex::Initialize (int& argc, char**& argv, bool build_parm_parse, BL_PROFILE_INITPARAMS(); #endif machine::Initialize(); -#ifdef AMREX_USE_CUDA +#ifdef AMREX_USE_GPU Gpu::Fuser::Initialize(); #endif diff --git a/Src/Base/AMReX_FBI.H index 4a33ee31f2a..6748a591613 100644 --- a/Src/Base/AMReX_FBI.H +++ b/Src/Base/AMReX_FBI.H @@ -54,19 +54,26 @@ ParallelFor (Vector<TagType> const& tags, int ncomp, F && f) } nwarps.push_back(ntotwarps); - std::size_t nbytes = ntags*sizeof(TagType); - auto d_tags = static_cast<TagType*>(The_Device_Arena()->alloc(nbytes)); - Gpu::htod_memcpy(d_tags, tags.data(), nbytes); + std::size_t sizeof_tags = ntags*sizeof(TagType); + std::size_t offset_nwarps = Arena::align(sizeof_tags); + std::size_t sizeof_nwarps = (ntags+1)*sizeof(int); + std::size_t total_buf_size = offset_nwarps + sizeof_nwarps; - nbytes = (ntags+1)*sizeof(int); - auto d_nwarps = static_cast<int*>(The_Device_Arena()->alloc(nbytes)); - Gpu::htod_memcpy(d_nwarps, nwarps.data(), nbytes); + char* h_buffer = (char*)The_Pinned_Arena()->alloc(total_buf_size); + char* d_buffer = (char*)The_Arena()->alloc(total_buf_size); - constexpr int nthreads = 128; + std::memcpy(h_buffer, tags.data(), sizeof_tags); + std::memcpy(h_buffer+offset_nwarps, nwarps.data(), sizeof_nwarps); + Gpu::htod_memcpy_async(d_buffer, h_buffer, total_buf_size); + + auto d_tags = reinterpret_cast<TagType*>(d_buffer); + auto 
d_nwarps = reinterpret_cast(d_buffer+offset_nwarps); + + constexpr int nthreads = 256; constexpr int nwarps_per_block = nthreads/Gpu::Device::warp_size; int nblocks = (ntotwarps + nwarps_per_block-1) / nwarps_per_block; #ifdef AMREX_USE_DPCPP - amrex::launch(nblocks, nthreads, Gpu::nullStream(), + amrex::launch(nblocks, nthreads, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE (sycl::nd_item<1> const& item) noexcept AMREX_REQUIRE_SUBGROUP_SIZE(Gpu::Device::warp_size) { @@ -112,7 +119,7 @@ ParallelFor (Vector > const& tags, int ncomp, F && f) } }); #else - amrex::launch(nblocks, nthreads, Gpu::nullStream(), + amrex::launch(nblocks, nthreads, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE () noexcept { int g_tid = blockDim.x*blockIdx.x + threadIdx.x; @@ -159,8 +166,8 @@ ParallelFor (Vector > const& tags, int ncomp, F && f) #endif Gpu::synchronize(); - The_Device_Arena()->free(d_nwarps); - The_Device_Arena()->free(d_tags); + The_Pinned_Arena()->free(h_buffer); + The_Arena()->free(d_buffer); } #endif @@ -221,19 +228,27 @@ fab_to_fab (Vector > const& copy_tags, int scomp, int dcomp, in } nwarps.push_back(ntotwarps); - std::size_t nbytes = N_locs*sizeof(TagType); - auto d_tags = static_cast(The_Device_Arena()->alloc(nbytes)); - Gpu::htod_memcpy(d_tags, copy_tags.data(), nbytes); + const int ntags = copy_tags.size(); + std::size_t sizeof_tags = ntags*sizeof(TagType); + std::size_t offset_nwarps = Arena::align(sizeof_tags); + std::size_t sizeof_nwarps = (ntags+1)*sizeof(int); + std::size_t total_buf_size = offset_nwarps + sizeof_nwarps; + + char* h_buffer = (char*)The_Pinned_Arena()->alloc(total_buf_size); + char* d_buffer = (char*)The_Arena()->alloc(total_buf_size); + + std::memcpy(h_buffer, copy_tags.data(), sizeof_tags); + std::memcpy(h_buffer+offset_nwarps, nwarps.data(), sizeof_nwarps); + Gpu::htod_memcpy_async(d_buffer, h_buffer, total_buf_size); - nbytes = (N_locs+1)*sizeof(int); - auto d_nwarps = static_cast(The_Device_Arena()->alloc(nbytes)); - Gpu::htod_memcpy(d_nwarps, nwarps.data(), nbytes); + auto d_tags = reinterpret_cast(d_buffer); + auto d_nwarps = reinterpret_cast(d_buffer+offset_nwarps); - constexpr int nthreads = 128; + constexpr int nthreads = 256; constexpr int nwarps_per_block = nthreads/Gpu::Device::warp_size; int nblocks = (ntotwarps + nwarps_per_block-1) / nwarps_per_block; #ifdef AMREX_USE_DPCPP - amrex::launch(nblocks, nthreads, Gpu::nullStream(), + amrex::launch(nblocks, nthreads, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE (sycl::nd_item<1> const& item) noexcept AMREX_REQUIRE_SUBGROUP_SIZE(Gpu::Device::warp_size) { @@ -280,7 +295,7 @@ fab_to_fab (Vector > const& copy_tags, int scomp, int dcomp, in } }); #else - amrex::launch(nblocks, nthreads, Gpu::nullStream(), + amrex::launch(nblocks, nthreads, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE () noexcept { int g_tid = blockDim.x*blockIdx.x + threadIdx.x; @@ -328,8 +343,8 @@ fab_to_fab (Vector > const& copy_tags, int scomp, int dcomp, in #endif Gpu::synchronize(); - The_Device_Arena()->free(d_nwarps); - The_Device_Arena()->free(d_tags); + The_Pinned_Arena()->free(h_buffer); + The_Arena()->free(d_buffer); } template @@ -353,23 +368,31 @@ fab_to_fab (Vector > const& copy_tags, int scomp, int dcomp, in } nwarps.push_back(ntotwarps); - std::size_t nbytes = N_locs*sizeof(TagType); - auto d_tags = static_cast(The_Device_Arena()->alloc(nbytes)); - Gpu::htod_memcpy(d_tags, copy_tags.data(), nbytes); + const int ntags = copy_tags.size(); + std::size_t sizeof_tags = ntags*sizeof(TagType); + std::size_t offset_nwarps = Arena::align(sizeof_tags); + std::size_t 
sizeof_nwarps = (ntags+1)*sizeof(int); + std::size_t offset_masks = Arena::align(offset_nwarps+sizeof_nwarps); + std::size_t sizeof_masks = masks.size()*sizeof(Array4); + std::size_t total_buf_size = offset_masks + sizeof_masks; - nbytes = (N_locs+1)*sizeof(int); - auto d_nwarps = static_cast(The_Device_Arena()->alloc(nbytes)); - Gpu::htod_memcpy(d_nwarps, nwarps.data(), nbytes); + char* h_buffer = (char*)The_Pinned_Arena()->alloc(total_buf_size); + char* d_buffer = (char*)The_Arena()->alloc(total_buf_size); - nbytes = masks.size()*sizeof(Array4); - auto d_masks = static_cast*>(The_Device_Arena()->alloc(nbytes)); - Gpu::htod_memcpy(d_masks, masks.data(), nbytes); + std::memcpy(h_buffer, copy_tags.data(), sizeof_tags); + std::memcpy(h_buffer+offset_nwarps, nwarps.data(), sizeof_nwarps); + std::memcpy(h_buffer+offset_masks, masks.data(), sizeof_masks); + Gpu::htod_memcpy_async(d_buffer, h_buffer, total_buf_size); - constexpr int nthreads = 128; + auto d_tags = reinterpret_cast(d_buffer); + auto d_nwarps = reinterpret_cast(d_buffer+offset_nwarps); + auto d_masks = reinterpret_cast*>(d_buffer+offset_masks); + + constexpr int nthreads = 256; constexpr int nwarps_per_block = nthreads/Gpu::Device::warp_size; int nblocks = (ntotwarps + nwarps_per_block-1) / nwarps_per_block; #ifdef AMREX_USE_DPCPP - amrex::launch(nblocks, nthreads, Gpu::nullStream(), + amrex::launch(nblocks, nthreads, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE (sycl::nd_item<1> const& item) noexcept { int g_tid = item.get_global_id(0); @@ -437,7 +460,7 @@ fab_to_fab (Vector > const& copy_tags, int scomp, int dcomp, in if (m) *m = 0; }); #else - amrex::launch(nblocks, nthreads, Gpu::nullStream(), + amrex::launch(nblocks, nthreads, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE () noexcept { int g_tid = blockDim.x*blockIdx.x + threadIdx.x; @@ -515,9 +538,8 @@ fab_to_fab (Vector > const& copy_tags, int scomp, int dcomp, in #endif Gpu::synchronize(); - The_Device_Arena()->free(d_masks); - The_Device_Arena()->free(d_nwarps); - The_Device_Arena()->free(d_tags); + The_Pinned_Arena()->free(h_buffer); + The_Arena()->free(d_buffer); } template ::value,int> = 0> @@ -661,16 +683,18 @@ FabArray::FB_local_copy_gpu (const FB& TheFB, int scomp, int ncomp) } if (maskfabs.size() > 0) { + Gpu::FuseSafeGuard fsg(maskfabs.size() >= Gpu::getFuseNumKernelsThreshold()); for (Gpu::StreamIter sit(maskfabs.size()); sit.isValid(); ++sit) { BaseFab& mskfab = maskfabs[sit()]; const Array4& msk = mskfab.array(); const Box& bx = mskfab.box(); - amrex::ParallelFor(bx, + amrex::ParallelFor(Gpu::KernelInfo{}.setFusable(true), bx, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept { msk(i,j,k) = 0; }); } + Gpu::LaunchFusedKernels(); } if (is_thread_safe) { @@ -923,7 +947,7 @@ FabArray::FB_pack_send_buffer_cuda_graph (const FB& TheFB, int scomp, int n // Is the conditional ever expected false? 
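// (Annotation, not part of the patch: the guards below change from a
// null-pointer test to a size test. With the AMReX_FabArrayCommI.H hunks
// later in this patch, send_data[i] and recv_data[i] are assigned for every
// index, even for empty messages, so a non-null pointer no longer implies a
// non-empty message; send_size[j] > 0 / recv_size[k] > 0 is now the check.)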
int launches = 0; for (int send = 0; send < N_snds; ++send) { - if (send_data[send] != nullptr) { + if (send_size[send] > 0) { launches += send_cctc[send]->size(); } } @@ -940,8 +964,7 @@ FabArray::FB_pack_send_buffer_cuda_graph (const FB& TheFB, int scomp, int n std::size_t(sizeof(CopyMemory)*launches) ); const int j = sit(); - char* dptr = send_data[j]; - if (dptr != nullptr) + if (send_size[j] > 0) { auto const& cctc = *send_cctc[j]; for (auto const& tag : cctc) @@ -971,9 +994,9 @@ FabArray::FB_pack_send_buffer_cuda_graph (const FB& TheFB, int scomp, int n for (int send = 0; send < N_snds; ++send) { const int j = send; - char* dptr = send_data[j]; - if (dptr != nullptr) + if (send_size[j] > 0) { + char* dptr = send_data[j]; auto const& cctc = *send_cctc[j]; for (auto const& tag : cctc) { @@ -1009,9 +1032,9 @@ FabArray::FB_unpack_recv_buffer_cuda_graph (const FB& TheFB, int dcomp, int LayoutData > recv_copy_tags(boxArray(),DistributionMap()); for (int k = 0; k < N_rcvs; ++k) { - const char* dptr = recv_data[k]; - if (dptr != nullptr) + if (recv_size[k] > 0) { + const char* dptr = recv_data[k]; auto const& cctc = *recv_cctc[k]; for (auto const& tag : cctc) { @@ -1082,7 +1105,7 @@ FabArray::FB_unpack_recv_buffer_cuda_graph (const FB& TheFB, int dcomp, int template void FabArray::pack_send_buffer_gpu (FabArray const& src, int scomp, int ncomp, - Vector& send_data, + Vector const& send_data, Vector const& send_size, Vector const& send_cctc) { @@ -1091,16 +1114,26 @@ FabArray::pack_send_buffer_gpu (FabArray const& src, int scomp, int nc const int N_snds = send_data.size(); if (N_snds == 0) return; + char* pbuffer = send_data[0]; + std::size_t szbuffer = 0; +#if 0 + // For linear solver test on summit, this is slower than writing to + // pinned memory directly on device. + if (not ParallelDescriptor::UseGpuAwareMpi()) { + // Memory in send_data is pinned. + szbuffer = (send_data[N_snds-1]-send_data[0]) + send_size[N_snds-1]; + pbuffer = (char*)The_Arena()->alloc(szbuffer); + } +#endif + typedef Array4CopyTag TagType; Vector snd_copy_tags; - // FIX HIP HERE -- Dim3 - Dim3 zero; - zero.x = 0; zero.y = 0; zero.z = 0; for (int j = 0; j < N_snds; ++j) { - char* dptr = send_data[j]; - if (dptr != nullptr) + if (send_size[j] > 0) { + std::size_t offset = send_data[j]-send_data[0]; + char* dptr = pbuffer + offset; auto const& cctc = *send_cctc[j]; for (auto const& tag : cctc) { @@ -1108,16 +1141,24 @@ FabArray::pack_send_buffer_gpu (FabArray const& src, int scomp, int nc amrex::makeArray4((value_type*)(dptr), tag.sbox, ncomp), src.array(tag.srcIndex), tag.sbox, - zero + Dim3{0,0,0} }); dptr += (tag.sbox.numPts() * ncomp * sizeof(value_type)); } - BL_ASSERT(dptr <= send_data[j] + send_size[j]); + BL_ASSERT(dptr <= pbuffer + offset + send_size[j]); } } detail::fab_to_fab(snd_copy_tags, scomp, 0, ncomp, detail::CellStore()); + + // There is Gpu::synchronize in fab_to_fab. + + if (pbuffer != send_data[0]) { + Gpu::copyAsync(Gpu::deviceToHost,pbuffer,pbuffer+szbuffer,send_data[0]); + Gpu::synchronize(); + The_Arena()->free(pbuffer); + } } template @@ -1133,6 +1174,20 @@ FabArray::unpack_recv_buffer_gpu (FabArray& dst, int dcomp, int ncomp, const int N_rcvs = recv_cctc.size(); if (N_rcvs == 0) return; + char* pbuffer = recv_data[0]; + std::size_t szbuffer = 0; +#if 0 + // For linear solver test on summit, this is slower than writing to + // pinned memory directly on device. + if (not ParallelDescriptor::UseGpuAwareMpi()) { + // Memory in recv_data is pinned. 
+ szbuffer = (recv_data[N_rcvs-1]-recv_data[0]) + recv_size[N_rcvs-1]; + pbuffer = (char*)The_Arena()->alloc(szbuffer); + Gpu::copyAsync(Gpu::hostToDevice,recv_data[0],recv_data[0]+szbuffer,pbuffer); + Gpu::synchronize(); + } +#endif + typedef Array4CopyTag TagType; Vector recv_copy_tags; @@ -1149,9 +1204,10 @@ FabArray::unpack_recv_buffer_gpu (FabArray& dst, int dcomp, int ncomp, for (int k = 0; k < N_rcvs; ++k) { - const char* dptr = recv_data[k]; - if (dptr != nullptr) + if (recv_size[k] > 0) { + std::size_t offset = recv_data[k]-recv_data[0]; + const char* dptr = pbuffer + offset; auto const& cctc = *recv_cctc[k]; for (auto const& tag : cctc) { @@ -1171,21 +1227,23 @@ FabArray::unpack_recv_buffer_gpu (FabArray& dst, int dcomp, int ncomp, masks.push_back(maskfabs[li].array()); } } - BL_ASSERT(dptr <= recv_data[k] + recv_size[k]); + BL_ASSERT(dptr <= pbuffer + offset + recv_size[k]); } } if (maskfabs.size() > 0) { + Gpu::FuseSafeGuard fsg(maskfabs.size() >= Gpu::getFuseNumKernelsThreshold()); for (Gpu::StreamIter sit(maskfabs.size()); sit.isValid(); ++sit) { BaseFab& mskfab = maskfabs[sit()]; const Array4& msk = mskfab.array(); const Box& bx = mskfab.box(); - amrex::ParallelFor(bx, + amrex::ParallelFor(Gpu::KernelInfo().setFusable(true), bx, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept { msk(i,j,k) = 0; }); } + Gpu::LaunchFusedKernels(); } if (op == FabArrayBase::COPY) @@ -1206,6 +1264,12 @@ FabArray::unpack_recv_buffer_gpu (FabArray& dst, int dcomp, int ncomp, detail::fab_to_fab_atomic_add(recv_copy_tags, 0, dcomp, ncomp, masks); } } + + // There is Gpu::synchronize in fab_to_fab. + + if (pbuffer != recv_data[0]) { + The_Arena()->free(pbuffer); + } } #endif /* AMREX_USE_GPU */ @@ -1213,7 +1277,7 @@ FabArray::unpack_recv_buffer_gpu (FabArray& dst, int dcomp, int ncomp, template void FabArray::pack_send_buffer_cpu (FabArray const& src, int scomp, int ncomp, - Vector& send_data, + Vector const& send_data, Vector const& send_size, Vector const& send_cctc) { @@ -1227,9 +1291,9 @@ FabArray::pack_send_buffer_cpu (FabArray const& src, int scomp, int nc #endif for (int j = 0; j < N_snds; ++j) { - char* dptr = send_data[j]; - if (dptr != nullptr) + if (send_size[j] > 0) { + char* dptr = send_data[j]; auto const& cctc = *send_cctc[j]; for (auto const& tag : cctc) { @@ -1268,9 +1332,9 @@ FabArray::unpack_recv_buffer_cpu (FabArray& dst, int dcomp, int ncomp, #endif for (int k = 0; k < N_rcvs; ++k) { - const char* dptr = recv_data[k]; - if (dptr != nullptr) + if (recv_size[k] > 0) { + const char* dptr = recv_data[k]; auto const& cctc = *recv_cctc[k]; for (auto const& tag : cctc) { @@ -1296,9 +1360,9 @@ FabArray::unpack_recv_buffer_cpu (FabArray& dst, int dcomp, int ncomp, recv_copy_tags.define(dst.boxArray(),dst.DistributionMap()); for (int k = 0; k < N_rcvs; ++k) { - const char* dptr = recv_data[k]; - if (dptr != nullptr) + if (recv_size[k] > 0) { + const char* dptr = recv_data[k]; auto const& cctc = *recv_cctc[k]; for (auto const& tag : cctc) { diff --git a/Src/Base/AMReX_FabArray.H b/Src/Base/AMReX_FabArray.H index 71e60674686..8ae7b66bd97 100644 --- a/Src/Base/AMReX_FabArray.H +++ b/Src/Base/AMReX_FabArray.H @@ -718,7 +718,7 @@ public: #endif static void pack_send_buffer_gpu (FabArray const& src, int scomp, int ncomp, - Vector& send_data, + Vector const& send_data, Vector const& send_size, Vector const& send_cctc); @@ -731,7 +731,7 @@ public: #endif static void pack_send_buffer_cpu (FabArray const& src, int scomp, int ncomp, - Vector& send_data, + Vector const& send_data, Vector const& 
send_size, Vector const& send_cctc); @@ -1582,7 +1582,7 @@ FabArray::setVal (value_type val, { const Box& bx = fai.growntilebox(nghost); auto fab = this->array(fai); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D ( bx, ncomp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSABLE ( bx, ncomp, i, j, k, n, { fab(i,j,k,n+comp) = val; }); @@ -1625,7 +1625,7 @@ FabArray::setVal (value_type val, if (b.ok()) { auto fab = this->array(fai); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D ( b, ncomp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSABLE ( b, ncomp, i, j, k, n, { fab(i,j,k,n+comp) = val; }); @@ -1655,7 +1655,7 @@ FabArray::abs (int comp, int ncomp, const IntVect& nghost) { const Box& bx = mfi.growntilebox(nghost); auto fab = this->array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D ( bx, ncomp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSABLE ( bx, ncomp, i, j, k, n, { fab(i,j,k,n+comp) = amrex::Math::abs(fab(i,j,k,n+comp)); }); @@ -1674,7 +1674,7 @@ FabArray::plus (value_type val, int comp, int num_comp, int nghost) { const Box& bx = mfi.growntilebox(nghost); auto fab = this->array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D ( bx, num_comp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSABLE ( bx, num_comp, i, j, k, n, { fab(i,j,k,n+comp) += val; }); @@ -1694,7 +1694,7 @@ FabArray::plus (value_type val, const Box& region, int comp, int num_comp, const Box& bx = mfi.growntilebox(nghost) & region; if (bx.ok()) { auto fab = this->array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D ( bx, num_comp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSABLE ( bx, num_comp, i, j, k, n, { fab(i,j,k,n+comp) += val; }); @@ -1714,7 +1714,7 @@ FabArray::mult (value_type val, int comp, int num_comp, int nghost) { const Box& bx = mfi.growntilebox(nghost); auto fab = this->array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D ( bx, num_comp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSABLE ( bx, num_comp, i, j, k, n, { fab(i,j,k,n+comp) *= val; }); @@ -1734,7 +1734,7 @@ FabArray::mult (value_type val, const Box& region, int comp, int num_comp, const Box& bx = mfi.growntilebox(nghost) & region; if (bx.ok()) { auto fab = this->array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D ( bx, num_comp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSABLE ( bx, num_comp, i, j, k, n, { fab(i,j,k,n+comp) *= val; }); @@ -1754,7 +1754,7 @@ FabArray::invert (value_type numerator, int comp, int num_comp, int nghost) { const Box& bx = mfi.growntilebox(nghost); auto fab = this->array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D ( bx, num_comp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSABLE ( bx, num_comp, i, j, k, n, { fab(i,j,k,n+comp) = numerator / fab(i,j,k,n+comp); }); @@ -1774,7 +1774,7 @@ FabArray::invert (value_type numerator, const Box& region, int comp, int nu const Box& bx = mfi.growntilebox(nghost) & region; if (bx.ok()) { auto fab = this->array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D ( bx, num_comp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSABLE ( bx, num_comp, i, j, k, n, { fab(i,j,k,n+comp) = numerator / fab(i,j,k,n+comp); }); @@ -1970,7 +1970,7 @@ FabArray::BuildMask (const Box& phys_domain, const Periodicity& period, Box const& fbx = mfi.growntilebox(); Box const& gbx = fbx & domain; Box const& vbx = mfi.validbox(); - AMREX_HOST_DEVICE_FOR_4D(fbx, ncomp, i, j, k, n, + AMREX_HOST_DEVICE_FOR_4D_FUSABLE(fbx, ncomp, i, j, k, n, { IntVect iv(AMREX_D_DECL(i,j,k)); if (vbx.contains(iv)) { diff --git a/Src/Base/AMReX_FabArrayBase.cpp b/Src/Base/AMReX_FabArrayBase.cpp index 77429049df3..50e00ad2c9c 100644 --- 
a/Src/Base/AMReX_FabArrayBase.cpp +++ b/Src/Base/AMReX_FabArrayBase.cpp @@ -118,7 +118,7 @@ FabArrayBase::Initialize () #ifdef AMREX_USE_GPU if (ParallelDescriptor::UseGpuAwareMpi()) { - the_fa_arena = The_Device_Arena(); + the_fa_arena = The_Arena(); } else { the_fa_arena = The_Pinned_Arena(); } diff --git a/Src/Base/AMReX_FabArrayCommI.H b/Src/Base/AMReX_FabArrayCommI.H index 46ecef31170..a0f05f93d24 100644 --- a/Src/Base/AMReX_FabArrayCommI.H +++ b/Src/Base/AMReX_FabArrayCommI.H @@ -143,9 +143,7 @@ FabArray::FBEP_nowait (int scomp, int ncomp, const IntVect& nghost, { the_send_data = static_cast(amrex::The_FA_Arena()->alloc(total_volume)); for (int i = 0, N = send_size.size(); i < N; ++i) { - if (send_size[i] > 0) { - send_data[i] = the_send_data + offset[i]; - } + send_data[i] = the_send_data + offset[i]; } } else { the_send_data = nullptr; @@ -495,9 +493,7 @@ FabArray::ParallelCopy (const FabArray& src, { the_send_data = static_cast(amrex::The_FA_Arena()->alloc(total_volume)); for (int i = 0, N = send_size.size(); i < N; ++i) { - if (send_size[i] > 0) { - send_data[i] = the_send_data + offset[i]; - } + send_data[i] = the_send_data + offset[i]; } } @@ -749,9 +745,9 @@ FabArray::PostRcvs (const MapOfCopyComTagContainers& m_RcvTags, for (int i = 0; i < nrecv; ++i) { + recv_data[i] = the_recv_data + offset[i]; if (recv_size[i] > 0) { - recv_data[i] = the_recv_data + offset[i]; const int rank = ParallelContext::global_to_local_rank(recv_from[i]); const int comm_data_type = ParallelDescriptor::select_comm_data_type(recv_size[i]); if (comm_data_type == 1) { diff --git a/Src/Base/AMReX_FabArrayUtility.H b/Src/Base/AMReX_FabArrayUtility.H index c31fd19b0c1..c7075d48520 100644 --- a/Src/Base/AMReX_FabArrayUtility.H +++ b/Src/Base/AMReX_FabArrayUtility.H @@ -153,11 +153,13 @@ ReduceSum_device (FabArray const& fa1, FabArray const& fa2, using value_type = typename FAB1::value_type; value_type sm = 0; + BL_PROFILE("ReduceSum_device"); + { ReduceOps reduce_op; ReduceData reduce_data(reduce_op); using ReduceTuple = typename decltype(reduce_data)::Type; - + Gpu::FuseReductionSafeGuard rsg(true); for (MFIter mfi(fa1); mfi.isValid(); ++mfi) { const Box& bx = amrex::grow(mfi.validbox(),nghost); @@ -1467,7 +1469,7 @@ Add (FabArray& dst, FabArray const& src, int srccomp, int dstcomp, int { auto const srcFab = src.array(mfi); auto dstFab = dst.array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D ( bx, numcomp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSABLE ( bx, numcomp, i, j, k, n, { dstFab(i,j,k,n+dstcomp) += srcFab(i,j,k,n+srccomp); }); @@ -1499,7 +1501,7 @@ Copy (FabArray& dst, FabArray const& src, int srccomp, int dstcomp, in { auto const srcFab = src.array(mfi); auto dstFab = dst.array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D ( bx, numcomp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSABLE ( bx, numcomp, i, j, k, n, { dstFab(i,j,k,dstcomp+n) = srcFab(i,j,k,srccomp+n); }); @@ -1531,7 +1533,7 @@ Subtract (FabArray& dst, FabArray const& src, int srccomp, int dstcomp { auto const srcFab = src.array(mfi); auto dstFab = dst.array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D ( bx, numcomp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSABLE ( bx, numcomp, i, j, k, n, { dstFab(i,j,k,n+dstcomp) -= srcFab(i,j,k,n+srccomp); }); @@ -1563,7 +1565,7 @@ Multiply (FabArray& dst, FabArray const& src, int srccomp, int dstcomp { auto const srcFab = src.array(mfi); auto dstFab = dst.array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D ( bx, numcomp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSABLE ( bx, 
numcomp, i, j, k, n, { dstFab(i,j,k,n+dstcomp) *= srcFab(i,j,k,n+srccomp); }); @@ -1595,7 +1597,7 @@ Divide (FabArray& dst, FabArray const& src, int srccomp, int dstcomp, { auto const srcFab = src.array(mfi); auto dstFab = dst.array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D ( bx, numcomp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSABLE ( bx, numcomp, i, j, k, n, { dstFab(i,j,k,n+dstcomp) /= srcFab(i,j,k,n+srccomp); }); @@ -1625,7 +1627,7 @@ Abs (FabArray& fa, int icomp, int numcomp, const IntVect& nghost) if (bx.ok()) { auto const& fab = fa.array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D ( bx, numcomp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSABLE ( bx, numcomp, i, j, k, n, { fab(i,j,k,n+icomp) = amrex::Math::abs(fab(i,j,k,n+icomp)); }); @@ -1682,7 +1684,7 @@ OverrideSync (FabArray & fa, FabArray const& msk, const Periodicity& const Box& bx = mfi.tilebox(); auto fab = fa.array(mfi); auto const ifab = msk.array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D ( bx, ncomp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSABLE ( bx, ncomp, i, j, k, n, { if (!ifab(i,j,k)) fab(i,j,k,n) = 0; }); diff --git a/Src/Base/AMReX_Gpu.H b/Src/Base/AMReX_Gpu.H index a0fd747a17a..5c59d89305b 100644 --- a/Src/Base/AMReX_Gpu.H +++ b/Src/Base/AMReX_Gpu.H @@ -9,6 +9,7 @@ namespace amrex { namespace Cuda {} } #endif #include +#include #include #include #include diff --git a/Src/Base/AMReX_GpuFuse.H b/Src/Base/AMReX_GpuFuse.H index dc3555cb39d..99467f9a956 100644 --- a/Src/Base/AMReX_GpuFuse.H +++ b/Src/Base/AMReX_GpuFuse.H @@ -13,6 +13,8 @@ namespace amrex { namespace Gpu { +#ifdef AMREX_USE_GPU + #ifdef AMREX_USE_CUDA typedef void (*Lambda1DLauncher)(char*,int); @@ -229,23 +231,6 @@ private: } }; -Long getFuseSizeThreshold (); -Long setFuseSizeThreshold (Long new_threshold); -int getFuseNumKernelsThreshold (); -int setFuseNumKernelsThreshold (int new_threshold); -bool inFuseRegion (); -bool setFuseRegion (bool flag); - -struct FuseSafeGuard -{ - explicit FuseSafeGuard (bool flag) noexcept - : m_old(setFuseRegion(flag)) {} - ~FuseSafeGuard () { setFuseRegion(m_old); } -private: - bool m_old; -}; - - template void Register (Box const& bx, F&& f) @@ -273,6 +258,56 @@ LaunchFusedKernels () Fuser::getInstance().Launch(); } +#else + +class Fuser +{ +public: + static Fuser& getInstance (); + static void Initialize (); + static void Finalize (); +private: + static std::unique_ptr m_instance; +}; + +inline void LaunchFusedKernels () {} + +#endif + +Long getFuseSizeThreshold (); +Long setFuseSizeThreshold (Long new_threshold); +int getFuseNumKernelsThreshold (); +int setFuseNumKernelsThreshold (int new_threshold); +bool inFuseRegion (); +bool setFuseRegion (bool flag); +bool inFuseReductionRegion (); +bool setFuseReductionRegion (bool flag); + +struct FuseSafeGuard +{ + explicit FuseSafeGuard (bool flag) noexcept + : m_old(setFuseRegion(flag)) {} + ~FuseSafeGuard () { setFuseRegion(m_old); } +private: + bool m_old; +}; + +struct FuseReductionSafeGuard +{ + explicit FuseReductionSafeGuard (bool flag) noexcept + : m_old(setFuseReductionRegion(flag)) {} + ~FuseReductionSafeGuard () { setFuseReductionRegion(m_old); } +private: + bool m_old; +}; + +#else + +struct FuseSafeGuard +{ + explicit FuseSafeGuard (bool) {} +}; + #endif }} diff --git a/Src/Base/AMReX_GpuFuse.cpp b/Src/Base/AMReX_GpuFuse.cpp index 26f942aa963..e3a4fe812b1 100644 --- a/Src/Base/AMReX_GpuFuse.cpp +++ b/Src/Base/AMReX_GpuFuse.cpp @@ -6,10 +6,11 @@ namespace amrex { namespace Gpu { -#ifdef AMREX_USE_CUDA +#ifdef AMREX_USE_GPU namespace { 
bool s_in_fuse_region = false; + bool s_in_fuse_reduction_region = false; // Fusing kernels with elements greater than this are not recommended based on tests on v100 Long s_fuse_size_threshold = 257*257; // If the number of kernels is less than this, fusing is not recommended based on tests on v100 @@ -18,6 +19,8 @@ namespace { std::unique_ptr<Fuser> Fuser::m_instance = nullptr; +#ifdef AMREX_USE_CUDA + Fuser::Fuser () { AMREX_ASSERT(!OpenMP::in_parallel()); @@ -86,12 +89,32 @@ void Fuser::Launch () constexpr int nwarps_per_block = nthreads/Gpu::Device::warp_size; int nblocks = (ntotwarps + nwarps_per_block-1) / nwarps_per_block; + bool is_reduction = s_in_fuse_reduction_region; + amrex::launch(nblocks, nthreads, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE () noexcept { int g_tid = blockDim.x*blockIdx.x + threadIdx.x; int g_wid = g_tid / Gpu::Device::warp_size; - if (g_wid >= ntotwarps) return; + if (g_wid >= ntotwarps) { + if (is_reduction) { + // for reduction, the assumption is that all lambdas have the same function signature + FuseHelper& helper = d_lambda_helper[0]; + char* lambda_object = d_lambda_object + helper.m_offset; + if (helper.m_bx.isEmpty()) { + helper.m_fp.L1D(lambda_object,-1); + } else { + if (helper.m_N == 0) { + helper.m_fp.L3D(lambda_object, INT_MIN, INT_MIN, INT_MIN); + } else { + for (int n = 0; n < helper.m_N; ++n) { + helper.m_fp.L4D(lambda_object, INT_MIN, INT_MIN, INT_MIN, -1); + } + } + } + } + return; + } int ilambda; { @@ -120,6 +143,8 @@ void Fuser::Launch () if (bx.isEmpty()) { if (icell < helper.m_N) { helper.m_fp.L1D(lambda_object,icell); + } else if (is_reduction) { + helper.m_fp.L1D(lambda_object,-1); } } else { int ncells = bx.numPts(); @@ -139,6 +164,14 @@ void Fuser::Launch () helper.m_fp.L4D(lambda_object,i,j,k,n); } } + } else if (is_reduction) { + if (helper.m_N == 0) { + helper.m_fp.L3D(lambda_object, INT_MIN, INT_MIN, INT_MIN); + } else { + for (int n = 0; n < helper.m_N; ++n) { + helper.m_fp.L4D(lambda_object, INT_MIN, INT_MIN, INT_MIN, -1); + } + } } } }); @@ -181,6 +214,8 @@ Fuser::resize_helper_buf () m_helper_buf = p; } +#endif + Fuser& Fuser::getInstance () { @@ -218,8 +253,7 @@ setFuseSizeThreshold (Long new_threshold) int getFuseNumKernelsThreshold () { return s_fuse_numkernels_threshold; } -int -setFuseNumKernelsThreshold (int new_threshold) +int setFuseNumKernelsThreshold (int new_threshold) { int old = s_fuse_numkernels_threshold; s_fuse_numkernels_threshold = new_threshold; @@ -228,14 +262,22 @@ setFuseNumKernelsThreshold (int new_threshold) bool inFuseRegion () { return s_in_fuse_region; } -bool -setFuseRegion (bool flag) +bool setFuseRegion (bool flag) { bool old = s_in_fuse_region; s_in_fuse_region = flag; return old; } +bool inFuseReductionRegion () { return s_in_fuse_reduction_region; } + +bool setFuseReductionRegion (bool flag) +{ + bool old = s_in_fuse_reduction_region; + s_in_fuse_reduction_region = flag; + return old; +} + #endif }} diff --git a/Src/Base/AMReX_GpuKernelInfo.H new file mode 100644 index 00000000000..2eac4054e63 --- /dev/null +++ b/Src/Base/AMReX_GpuKernelInfo.H @@ -0,0 +1,18 @@ +#ifndef AMREX_GPU_KERNEL_INFO_H_ +#define AMREX_GPU_KERNEL_INFO_H_ + +namespace amrex { +namespace Gpu { + +class KernelInfo +{ +public: + KernelInfo& setFusable (bool flag) { fusable = flag; return *this; } + bool isFusable () const { return fusable; } +private: + bool fusable = false; +}; + +}} + +#endif diff --git a/Src/Base/AMReX_GpuLaunch.H index f79901a2403..5305139f636 100--- 
a/Src/Base/AMReX_GpuLaunch.H +++ b/Src/Base/AMReX_GpuLaunch.H @@ -2,6 +2,7 @@ #define AMREX_GPU_LAUNCH_H_ #include +#include #include #include #include @@ -9,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -187,6 +189,20 @@ namespace Gpu { AMREX_WRONG_NUM_ARGS, \ AMREX_WRONG_NUM_ARGS)(__VA_ARGS__) +#ifdef AMREX_USE_CUDA +#define AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA(...) AMREX_GET_MACRO(__VA_ARGS__,\ + AMREX_GPU_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA_RANGE_3, \ + AMREX_WRONG_NUM_ARGS, \ + AMREX_WRONG_NUM_ARGS, \ + AMREX_GPU_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA_RANGE_2, \ + AMREX_WRONG_NUM_ARGS, \ + AMREX_WRONG_NUM_ARGS, \ + AMREX_GPU_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA_RANGE, \ + AMREX_WRONG_NUM_ARGS, \ + AMREX_WRONG_NUM_ARGS)(__VA_ARGS__) +#else +#define AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA(...) AMREX_LAUNCH_HOST_DEVICE_LAMBDA(__VA_ARGS__) +#endif #if (AMREX_SPACEDIM == 1) #define AMREX_LAUNCH_DEVICE_LAMBDA_DIM(a1,a2,a3,b1,b2,b3,c1,c2,c3) AMREX_GPU_LAUNCH_DEVICE_LAMBDA_RANGE (a1,a2,a2) @@ -215,6 +231,22 @@ namespace Gpu { #define AMREX_HOST_DEVICE_PARALLEL_FOR_3D(...) AMREX_GPU_HOST_DEVICE_PARALLEL_FOR_3D(__VA_ARGS__) #define AMREX_HOST_DEVICE_PARALLEL_FOR_4D(...) AMREX_GPU_HOST_DEVICE_PARALLEL_FOR_4D(__VA_ARGS__) +#ifdef AMREX_USE_CUDA +#define AMREX_HOST_DEVICE_PARALLEL_FOR_1D_FUSABLE(...) AMREX_GPU_HOST_DEVICE_FOR_1D_FUSABLE(__VA_ARGS__) +#define AMREX_HOST_DEVICE_PARALLEL_FOR_3D_FUSABLE(...) AMREX_GPU_HOST_DEVICE_FOR_3D_FUSABLE(__VA_ARGS__) +#define AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSABLE(...) AMREX_GPU_HOST_DEVICE_FOR_4D_FUSABLE(__VA_ARGS__) +#define AMREX_HOST_DEVICE_FOR_1D_FUSABLE(...) AMREX_GPU_HOST_DEVICE_FOR_1D_FUSABLE(__VA_ARGS__) +#define AMREX_HOST_DEVICE_FOR_3D_FUSABLE(...) AMREX_GPU_HOST_DEVICE_FOR_3D_FUSABLE(__VA_ARGS__) +#define AMREX_HOST_DEVICE_FOR_4D_FUSABLE(...) AMREX_GPU_HOST_DEVICE_FOR_4D_FUSABLE(__VA_ARGS__) +#else +#define AMREX_HOST_DEVICE_PARALLEL_FOR_1D_FUSABLE(...) AMREX_HOST_DEVICE_PARALLEL_FOR_1D(__VA_ARGS__) +#define AMREX_HOST_DEVICE_PARALLEL_FOR_3D_FUSABLE(...) AMREX_HOST_DEVICE_PARALLEL_FOR_3D(__VA_ARGS__) +#define AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSABLE(...) AMREX_HOST_DEVICE_PARALLEL_FOR_4D(__VA_ARGS__) +#define AMREX_HOST_DEVICE_FOR_1D_FUSABLE(...) AMREX_GPU_HOST_DEVICE_FOR_1D(__VA_ARGS__) +#define AMREX_HOST_DEVICE_FOR_3D_FUSABLE(...) AMREX_GPU_HOST_DEVICE_FOR_3D(__VA_ARGS__) +#define AMREX_HOST_DEVICE_FOR_4D_FUSABLE(...) 
AMREX_GPU_HOST_DEVICE_FOR_4D(__VA_ARGS__) +#endif + #ifdef AMREX_USE_GPU #define AMREX_HOST_DEVICE_PARALLEL_FOR_1D_FLAG(where_to_run,n,i,block) \ diff --git a/Src/Base/AMReX_GpuLaunchFunctsC.H b/Src/Base/AMReX_GpuLaunchFunctsC.H index 3b26b799841..584ad71d810 100644 --- a/Src/Base/AMReX_GpuLaunchFunctsC.H +++ b/Src/Base/AMReX_GpuLaunchFunctsC.H @@ -10,7 +10,7 @@ void launch (T const& n, L&& f, std::size_t /*shared_mem_bytes*/=0) noexcept } template ::value> > -void For (T n, L&& f, std::size_t /*shared_mem_bytes*/=0) noexcept +void For (T n, L&& f) noexcept { for (T i = 0; i < n; ++i) { f(i); @@ -18,7 +18,13 @@ void For (T n, L&& f, std::size_t /*shared_mem_bytes*/=0) noexcept } template ::value> > -void ParallelFor (T n, L&& f, std::size_t /*shared_mem_bytes*/=0) noexcept +void For (Gpu::KernelInfo const&, T n, L&& f) noexcept +{ + For(n, std::forward(f)); +} + +template ::value> > +void ParallelFor (T n, L&& f) noexcept { AMREX_PRAGMA_SIMD for (T i = 0; i < n; ++i) { @@ -26,8 +32,14 @@ void ParallelFor (T n, L&& f, std::size_t /*shared_mem_bytes*/=0) noexcept } } +template ::value> > +void ParallelFor (Gpu::KernelInfo const&, T n, L&& f) noexcept +{ + ParallelFor(n, std::forward(f)); +} + template -void For (Box const& box, L&& f, std::size_t /*shared_mem_bytes*/=0) noexcept +void For (Box const& box, L&& f) noexcept { const auto lo = amrex::lbound(box); const auto hi = amrex::ubound(box); @@ -39,7 +51,13 @@ void For (Box const& box, L&& f, std::size_t /*shared_mem_bytes*/=0) noexcept } template -void ParallelFor (Box const& box, L&& f, std::size_t /*shared_mem_bytes*/=0) noexcept +void For (Gpu::KernelInfo const&, Box const& box, L&& f) noexcept +{ + For(box, std::forward(f)); +} + +template +void ParallelFor (Box const& box, L&& f) noexcept { const auto lo = amrex::lbound(box); const auto hi = amrex::ubound(box); @@ -51,8 +69,14 @@ void ParallelFor (Box const& box, L&& f, std::size_t /*shared_mem_bytes*/=0) noe }}} } +template +void ParallelFor (Gpu::KernelInfo const&, Box const& box, L&& f) noexcept +{ + ParallelFor(box, std::forward(f)); +} + template ::value> > -void For (Box const& box, T ncomp, L&& f, std::size_t /*shared_mem_bytes*/=0) noexcept +void For (Box const& box, T ncomp, L&& f) noexcept { const auto lo = amrex::lbound(box); const auto hi = amrex::ubound(box); @@ -66,7 +90,13 @@ void For (Box const& box, T ncomp, L&& f, std::size_t /*shared_mem_bytes*/=0) no } template ::value> > -void ParallelFor (Box const& box, T ncomp, L&& f, std::size_t /*shared_mem_bytes*/=0) noexcept +void For (Gpu::KernelInfo const&, Box const& box, T ncomp, L&& f) noexcept +{ + For(box, ncomp, std::forward(f)); +} + +template ::value> > +void ParallelFor (Box const& box, T ncomp, L&& f) noexcept { const auto lo = amrex::lbound(box); const auto hi = amrex::ubound(box); @@ -80,90 +110,161 @@ void ParallelFor (Box const& box, T ncomp, L&& f, std::size_t /*shared_mem_bytes } } +template ::value> > +void ParallelFor (Gpu::KernelInfo const&, Box const& box, T ncomp, L&& f) noexcept +{ + ParallelFor(box, ncomp, std::forward(f)); +} + template -void For (Box const& box1, Box const& box2, L1&& f1, L2&& f2, - std::size_t /*shared_mem_bytes*/=0) noexcept +void For (Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept { For(box1, std::forward(f1)); For(box2, std::forward(f2)); } +template +void For (Gpu::KernelInfo const&, Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept +{ + For (box1, box2, std::forward(f1), std::forward(f2)); +} + template -void For (Box const& box1, Box const& box2, 
Box const& box3, L1&& f1, L2&& f2, L3&& f3, - std::size_t /*shared_mem_bytes*/=0) noexcept +void For (Box const& box1, Box const& box2, Box const& box3, L1&& f1, L2&& f2, L3&& f3) noexcept { For(box1, std::forward(f1)); For(box2, std::forward(f2)); For(box3, std::forward(f3)); } +template +void For (Gpu::KernelInfo const&, Box const& box1, Box const& box2, Box const& box3, L1&& f1, L2&& f2, L3&& f3) noexcept +{ + For(box1, box2, box3, std::forward(f1), std::forward(f2), std::forward(f3)); +} + template ::value>, typename M2=amrex::EnableIf_t::value> > void For (Box const& box1, T1 ncomp1, L1&& f1, - Box const& box2, T2 ncomp2, L2&& f2, - std::size_t /*shared_mem_bytes*/=0) noexcept + Box const& box2, T2 ncomp2, L2&& f2) noexcept { For(box1, ncomp1, std::forward(f1)); For(box2, ncomp2, std::forward(f2)); } +template ::value>, + typename M2=amrex::EnableIf_t::value> > +void For (Gpu::KernelInfo const&, + Box const& box1, T1 ncomp1, L1&& f1, + Box const& box2, T2 ncomp2, L2&& f2) noexcept +{ + For(box1,ncomp1,std::forward(f1),box2,ncomp2,std::forward(f2)); +} + template ::value>, typename M2=amrex::EnableIf_t::value>, typename M3=amrex::EnableIf_t::value> > void For (Box const& box1, T1 ncomp1, L1&& f1, Box const& box2, T2 ncomp2, L2&& f2, - Box const& box3, T3 ncomp3, L3&& f3, - std::size_t /*shared_mem_bytes*/=0) noexcept + Box const& box3, T3 ncomp3, L3&& f3) noexcept { For(box1, ncomp1, std::forward(f1)); For(box2, ncomp2, std::forward(f2)); For(box3, ncomp3, std::forward(f3)); } +template ::value>, + typename M2=amrex::EnableIf_t::value>, + typename M3=amrex::EnableIf_t::value> > +void For (Gpu::KernelInfo const&, + Box const& box1, T1 ncomp1, L1&& f1, + Box const& box2, T2 ncomp2, L2&& f2, + Box const& box3, T3 ncomp3, L3&& f3) noexcept +{ + For(box1,ncomp1,std::forward(f1), + box2,ncomp2,std::forward(f2), + box3,ncomp3,std::forward(f3)); +} + template -void ParallelFor (Box const& box1, Box const& box2, L1&& f1, L2&& f2, - std::size_t /*shared_mem_bytes*/=0) noexcept +void ParallelFor (Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept { ParallelFor(box1, std::forward(f1)); ParallelFor(box2, std::forward(f2)); } +template +void ParallelFor (Gpu::KernelInfo const&, Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept +{ + ParallelFor(box1,box2,f1,f2); +} + template -void ParallelFor (Box const& box1, Box const& box2, Box const& box3, L1&& f1, L2&& f2, L3&& f3, - std::size_t /*shared_mem_bytes*/=0) noexcept +void ParallelFor (Box const& box1, Box const& box2, Box const& box3, L1&& f1, L2&& f2, L3&& f3) noexcept { ParallelFor(box1, std::forward(f1)); ParallelFor(box2, std::forward(f2)); ParallelFor(box3, std::forward(f3)); } +template +void ParallelFor (Gpu::KernelInfo const&, Box const& box1, Box const& box2, Box const& box3, L1&& f1, L2&& f2, L3&& f3) noexcept +{ + ParallelFor(box1,box2,box3,std::forward(f1),std::forward(f2),std::forward(f3)); +} + template ::value>, typename M2=amrex::EnableIf_t::value> > void ParallelFor (Box const& box1, T1 ncomp1, L1&& f1, - Box const& box2, T2 ncomp2, L2&& f2, - std::size_t /*shared_mem_bytes*/=0) noexcept + Box const& box2, T2 ncomp2, L2&& f2) noexcept { ParallelFor(box1, ncomp1, std::forward(f1)); ParallelFor(box2, ncomp2, std::forward(f2)); } +template ::value>, + typename M2=amrex::EnableIf_t::value> > +void ParallelFor (Gpu::KernelInfo const&, + Box const& box1, T1 ncomp1, L1&& f1, + Box const& box2, T2 ncomp2, L2&& f2) noexcept +{ + ParallelFor(box1,ncomp1,std::forward(f1), + box2,ncomp2,std::forward(f2)); +} + template 
::value>, typename M2=amrex::EnableIf_t::value>, typename M3=amrex::EnableIf_t::value> > void ParallelFor (Box const& box1, T1 ncomp1, L1&& f1, Box const& box2, T2 ncomp2, L2&& f2, - Box const& box3, T3 ncomp3, L3&& f3, - std::size_t /*shared_mem_bytes*/=0) noexcept + Box const& box3, T3 ncomp3, L3&& f3) noexcept { ParallelFor(box1, ncomp1, std::forward(f1)); ParallelFor(box2, ncomp2, std::forward(f2)); ParallelFor(box3, ncomp3, std::forward(f3)); } +template ::value>, + typename M2=amrex::EnableIf_t::value>, + typename M3=amrex::EnableIf_t::value> > +void ParallelFor (Gpu::KernelInfo const&, + Box const& box1, T1 ncomp1, L1&& f1, + Box const& box2, T2 ncomp2, L2&& f2, + Box const& box3, T3 ncomp3, L3&& f3) noexcept +{ + ParallelFor(box1, ncomp1, std::forward(f1), + box2, ncomp2, std::forward(f2), + box3, ncomp3, std::forward(f3)); +} + template ::value> > void FabReduce (Box const& box, N ncomp, T const& init_val, @@ -315,6 +416,118 @@ void HostDeviceFor (Box const& box1, T1 ncomp1, L1&& f1, box3,ncomp3,std::forward(f3)); } +template ::value> > +void HostDeviceParallelFor (Gpu::KernelInfo const&, T n, L&& f) noexcept +{ + ParallelFor(n,std::forward(f)); +} + +template +void HostDeviceParallelFor (Gpu::KernelInfo const&, Box const& box, L&& f) noexcept +{ + ParallelFor(box,std::forward(f)); +} + +template ::value> > +void HostDeviceParallelFor (Gpu::KernelInfo const&, Box const& box, T ncomp, L&& f) noexcept +{ + ParallelFor(box,ncomp,std::forward(f)); +} + +template +void HostDeviceParallelFor (Gpu::KernelInfo const&, Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept +{ + ParallelFor(box1,box2,std::forward(f1),std::forward(f2)); +} + +template +void HostDeviceParallelFor (Gpu::KernelInfo const&, + Box const& box1, Box const& box2, Box const& box3, + L1&& f1, L2&& f2, L3&& f3) noexcept +{ + ParallelFor(box1,box2,box3,std::forward(f1),std::forward(f2),std::forward(f3)); +} + +template ::value>, + typename M2=amrex::EnableIf_t::value> > +void HostDeviceParallelFor (Gpu::KernelInfo const&, + Box const& box1, T1 ncomp1, L1&& f1, + Box const& box2, T2 ncomp2, L2&& f2) noexcept +{ + ParallelFor(box1,ncomp1,std::forward(f1),box2,ncomp2,std::forward(f2)); +} + +template ::value>, + typename M2=amrex::EnableIf_t::value>, + typename M3=amrex::EnableIf_t::value> > +void HostDeviceParallelFor (Gpu::KernelInfo const&, + Box const& box1, T1 ncomp1, L1&& f1, + Box const& box2, T2 ncomp2, L2&& f2, + Box const& box3, T3 ncomp3, L3&& f3) noexcept +{ + ParallelFor(box1,ncomp1,std::forward(f1), + box2,ncomp2,std::forward(f2), + box3,ncomp3,std::forward(f3)); +} + +template ::value> > +void HostDeviceFor (Gpu::KernelInfo const&, T n, L&& f) noexcept +{ + For(n,std::forward(f)); +} + +template +void HostDeviceFor (Gpu::KernelInfo const&, Box const& box, L&& f) noexcept +{ + For(box,std::forward(f)); +} + +template ::value> > +void HostDeviceFor (Gpu::KernelInfo const&, Box const& box, T ncomp, L&& f) noexcept +{ + For(box,ncomp,std::forward(f)); +} + +template +void HostDeviceFor (Gpu::KernelInfo const&, Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept +{ + For(box1,box2,std::forward(f1),std::forward(f2)); +} + +template +void HostDeviceFor (Gpu::KernelInfo const&, + Box const& box1, Box const& box2, Box const& box3, + L1&& f1, L2&& f2, L3&& f3) noexcept +{ + For(box1,box2,box3,std::forward(f1),std::forward(f2),std::forward(f3)); +} + +template ::value>, + typename M2=amrex::EnableIf_t::value> > +void HostDeviceFor (Gpu::KernelInfo const&, + Box const& box1, T1 ncomp1, L1&& f1, + Box 
const& box2, T2 ncomp2, L2&& f2) noexcept +{ + For(box1,ncomp1,std::forward(f1),box2,ncomp2,std::forward(f2)); +} + +template ::value>, + typename M2=amrex::EnableIf_t::value>, + typename M3=amrex::EnableIf_t::value> > +void HostDeviceFor (Gpu::KernelInfo const&, + Box const& box1, T1 ncomp1, L1&& f1, + Box const& box2, T2 ncomp2, L2&& f2, + Box const& box3, T3 ncomp3, L3&& f3) noexcept +{ + For(box1,ncomp1,std::forward(f1), + box2,ncomp2,std::forward(f2), + box3,ncomp3,std::forward(f3)); +} + } #endif diff --git a/Src/Base/AMReX_GpuLaunchFunctsG.H b/Src/Base/AMReX_GpuLaunchFunctsG.H index 920570be12f..d4a1841ce73 100644 --- a/Src/Base/AMReX_GpuLaunchFunctsG.H +++ b/Src/Base/AMReX_GpuLaunchFunctsG.H @@ -94,7 +94,7 @@ void launch (T const& n, L&& f) noexcept } template ::value> > -void ParallelFor (T n, L&& f) noexcept +void ParallelFor (Gpu::KernelInfo const& info, T n, L&& f) noexcept { if (amrex::isEmpty(n)) return; const auto ec = Gpu::ExecutionConfig(n); @@ -122,7 +122,7 @@ void ParallelFor (T n, L&& f) noexcept } template -void ParallelFor (Box const& box, L&& f) noexcept +void ParallelFor (Gpu::KernelInfo const& info, Box const& box, L&& f) noexcept { if (amrex::isEmpty(box)) return; int ncells = box.numPts(); @@ -159,7 +159,7 @@ void ParallelFor (Box const& box, L&& f) noexcept } template ::value> > -void ParallelFor (Box const& box, T ncomp, L&& f) noexcept +void ParallelFor (Gpu::KernelInfo const& info, Box const& box, T ncomp, L&& f) noexcept { if (amrex::isEmpty(box)) return; int ncells = box.numPts(); @@ -198,7 +198,7 @@ void ParallelFor (Box const& box, T ncomp, L&& f) noexcept } template -void ParallelFor (Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept +void ParallelFor (Gpu::KernelInfo const& info, Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept { // xxxxx DPCPP todo: launch separate kernel to reduce kernel size ParallelFor(box1, std::forward(f1)); @@ -255,7 +255,9 @@ void ParallelFor (Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept } template -void ParallelFor (Box const& box1, Box const& box2, Box const& box3, L1&& f1, L2&& f2, L3&& f3) noexcept +void ParallelFor (Gpu::KernelInfo const& info, + Box const& box1, Box const& box2, Box const& box3, + L1&& f1, L2&& f2, L3&& f3) noexcept { // xxxxx DPCPP todo: launch separate kernel to reduce kernel size ParallelFor(box1, std::forward(f1)); @@ -327,7 +329,8 @@ void ParallelFor (Box const& box1, Box const& box2, Box const& box3, L1&& f1, L2 template ::value>, typename M2=amrex::EnableIf_t::value> > -void ParallelFor (Box const& box1, T1 ncomp1, L1&& f1, +void ParallelFor (Gpu::KernelInfo const& info, + Box const& box1, T1 ncomp1, L1&& f1, Box const& box2, T2 ncomp2, L2&& f2) noexcept { // xxxxx DPCPP todo: launch separate kernel to reduce kernel size @@ -392,7 +395,8 @@ template ::value>, typename M2=amrex::EnableIf_t::value>, typename M3=amrex::EnableIf_t::value> > -void ParallelFor (Box const& box1, T1 ncomp1, L1&& f1, +void ParallelFor (Gpu::KernelInfo const& info, + Box const& box1, T1 ncomp1, L1&& f1, Box const& box2, T2 ncomp2, L2&& f2, Box const& box3, T3 ncomp3, L3&& f3) noexcept { @@ -628,9 +632,15 @@ void launch (T const& n, L&& f) noexcept template ::value> > amrex::EnableIf_t::value> -ParallelFor (T n, L&& f) noexcept +ParallelFor (Gpu::KernelInfo const& info, T n, L&& f) noexcept { if (amrex::isEmpty(n)) return; +#ifdef AMREX_USE_CUDA + if (Gpu::inFuseRegion() && info.isFusable() && n <= Gpu::getFuseSizeThreshold()) { + Gpu::Register(n, f); + } else +#endif + { const auto ec = 
Gpu::ExecutionConfig(n); AMREX_LAUNCH_KERNEL(ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE () noexcept { @@ -640,14 +650,21 @@ ParallelFor (T n, L&& f) noexcept } }); AMREX_GPU_ERROR_CHECK(); + } } template amrex::EnableIf_t::value> -ParallelFor (Box const& box, L&& f) noexcept +ParallelFor (Gpu::KernelInfo const& info, Box const& box, L&& f) noexcept { if (amrex::isEmpty(box)) return; int ncells = box.numPts(); +#ifdef AMREX_USE_CUDA + if (Gpu::inFuseRegion() && info.isFusable() && ncells <= Gpu::getFuseSizeThreshold()) { + Gpu::Register(box, f); + } else +#endif + { const auto lo = amrex::lbound(box); const auto len = amrex::length(box); const auto ec = Gpu::ExecutionConfig(ncells); @@ -665,14 +682,21 @@ ParallelFor (Box const& box, L&& f) noexcept } }); AMREX_GPU_ERROR_CHECK(); + } } template ::value> > amrex::EnableIf_t::value> -ParallelFor (Box const& box, T ncomp, L&& f) noexcept +ParallelFor (Gpu::KernelInfo const& info, Box const& box, T ncomp, L&& f) noexcept { if (amrex::isEmpty(box)) return; int ncells = box.numPts(); +#ifdef AMREX_USE_CUDA + if (Gpu::inFuseRegion() && info.isFusable() && ncells <= Gpu::getFuseSizeThreshold()) { + Gpu::Register(box, ncomp, f); + } else +#endif + { const auto lo = amrex::lbound(box); const auto len = amrex::length(box); const auto ec = Gpu::ExecutionConfig(ncells); @@ -692,16 +716,25 @@ ParallelFor (Box const& box, T ncomp, L&& f) noexcept } }); AMREX_GPU_ERROR_CHECK(); + } } template amrex::EnableIf_t::value and MaybeDeviceRunnable::value> -ParallelFor (Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept +ParallelFor (Gpu::KernelInfo const& info, + Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept { if (amrex::isEmpty(box1) and amrex::isEmpty(box2)) return; int ncells1 = box1.numPts(); int ncells2 = box2.numPts(); int ncells = amrex::max(ncells1, ncells2); +#ifdef AMREX_USE_CUDA + if (Gpu::inFuseRegion() && info.isFusable() && ncells <= Gpu::getFuseSizeThreshold()) { + Gpu::Register(box1, f1); + Gpu::Register(box2, f2); + } else +#endif + { const auto lo1 = amrex::lbound(box1); const auto lo2 = amrex::lbound(box2); const auto len1 = amrex::length(box1); @@ -732,17 +765,28 @@ ParallelFor (Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept } }); AMREX_GPU_ERROR_CHECK(); + } } template amrex::EnableIf_t::value and MaybeDeviceRunnable::value and MaybeDeviceRunnable::value> -ParallelFor (Box const& box1, Box const& box2, Box const& box3, L1&& f1, L2&& f2, L3&& f3) noexcept +ParallelFor (Gpu::KernelInfo const& info, + Box const& box1, Box const& box2, Box const& box3, + L1&& f1, L2&& f2, L3&& f3) noexcept { if (amrex::isEmpty(box1) and amrex::isEmpty(box2) and amrex::isEmpty(box3)) return; int ncells1 = box1.numPts(); int ncells2 = box2.numPts(); int ncells3 = box3.numPts(); int ncells = amrex::max(ncells1, ncells2, ncells3); +#ifdef AMREX_USE_CUDA + if (Gpu::inFuseRegion() && info.isFusable() && ncells <= Gpu::getFuseSizeThreshold()) { + Gpu::Register(box1, f1); + Gpu::Register(box2, f2); + Gpu::Register(box3, f3); + } else +#endif + { const auto lo1 = amrex::lbound(box1); const auto lo2 = amrex::lbound(box2); const auto lo3 = amrex::lbound(box3); @@ -784,19 +828,28 @@ ParallelFor (Box const& box1, Box const& box2, Box const& box3, L1&& f1, L2&& f2 } }); AMREX_GPU_ERROR_CHECK(); + } } template ::value>, typename M2=amrex::EnableIf_t::value> > amrex::EnableIf_t::value and MaybeDeviceRunnable::value> -ParallelFor (Box const& box1, T1 ncomp1, L1&& f1, +ParallelFor (Gpu::KernelInfo const& info, + Box 
const& box1, T1 ncomp1, L1&& f1, Box const& box2, T2 ncomp2, L2&& f2) noexcept { if (amrex::isEmpty(box1) and amrex::isEmpty(box2)) return; int ncells1 = box1.numPts(); int ncells2 = box2.numPts(); int ncells = amrex::max(ncells1, ncells2); +#ifdef AMREX_USE_CUDA + if (Gpu::inFuseRegion() && info.isFusable() && ncells <= Gpu::getFuseSizeThreshold()) { + Gpu::Register(box1, ncomp1, f1); + Gpu::Register(box2, ncomp2, f2); + } else +#endif + { const auto lo1 = amrex::lbound(box1); const auto lo2 = amrex::lbound(box2); const auto len1 = amrex::length(box1); @@ -831,6 +884,7 @@ ParallelFor (Box const& box1, T1 ncomp1, L1&& f1, } }); AMREX_GPU_ERROR_CHECK(); + } } template ::value>, typename M3=amrex::EnableIf_t::value> > amrex::EnableIf_t::value and MaybeDeviceRunnable::value and MaybeDeviceRunnable::value> -ParallelFor (Box const& box1, T1 ncomp1, L1&& f1, +ParallelFor (Gpu::KernelInfo const& info, + Box const& box1, T1 ncomp1, L1&& f1, Box const& box2, T2 ncomp2, L2&& f2, Box const& box3, T3 ncomp3, L3&& f3) noexcept { @@ -847,6 +902,14 @@ ParallelFor (Box const& box1, T1 ncomp1, L1&& f1, int ncells2 = box2.numPts(); int ncells3 = box3.numPts(); int ncells = amrex::max(ncells1, ncells2, ncells3); +#ifdef AMREX_USE_CUDA + if (Gpu::inFuseRegion() && info.isFusable() && ncells <= Gpu::getFuseSizeThreshold()) { + Gpu::Register(box1, ncomp1, f1); + Gpu::Register(box2, ncomp2, f2); + Gpu::Register(box3, ncomp3, f3); + } else +#endif + { const auto lo1 = amrex::lbound(box1); const auto lo2 = amrex::lbound(box2); const auto lo3 = amrex::lbound(box3); @@ -894,6 +957,7 @@ ParallelFor (Box const& box1, T1 ncomp1, L1&& f1, } }); AMREX_GPU_ERROR_CHECK(); + } } template @@ -983,34 +1047,147 @@ void single_task (L&& f) noexcept single_task(Gpu::gpuStream(), std::forward(f)); } +template ::value> > +void For (Gpu::KernelInfo const& info, T n, L&& f) noexcept +{ + ParallelFor(info, n,std::forward(f)); +} + +template +void For (Gpu::KernelInfo const& info, Box const& box, L&& f) noexcept +{ + ParallelFor(info, box,std::forward(f)); +} + +template ::value> > +void For (Gpu::KernelInfo const& info, Box const& box, T ncomp, L&& f) noexcept +{ + ParallelFor(info,box,ncomp,std::forward(f)); +} + +template +void For (Gpu::KernelInfo const& info, + Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept +{ + ParallelFor(info,box1,box2,std::forward(f1),std::forward(f2)); +} + +template +void For (Gpu::KernelInfo const& info, + Box const& box1, Box const& box2, Box const& box3, + L1&& f1, L2&& f2, L3&& f3) noexcept +{ + ParallelFor(info,box1,box2,box3,std::forward(f1),std::forward(f2),std::forward(f3)); +} + +template ::value>, + typename M2=amrex::EnableIf_t::value> > +void For (Gpu::KernelInfo const& info, + Box const& box1, T1 ncomp1, L1&& f1, + Box const& box2, T2 ncomp2, L2&& f2) noexcept +{ + ParallelFor(info,box1,ncomp1,std::forward(f1),box2,ncomp2,std::forward(f2)); +} + +template ::value>, + typename M2=amrex::EnableIf_t::value>, + typename M3=amrex::EnableIf_t::value> > +void For (Gpu::KernelInfo const& info, + Box const& box1, T1 ncomp1, L1&& f1, + Box const& box2, T2 ncomp2, L2&& f2, + Box const& box3, T3 ncomp3, L3&& f3) noexcept +{ + ParallelFor(info, + box1,ncomp1,std::forward(f1), + box2,ncomp2,std::forward(f2), + box3,ncomp3,std::forward(f3)); +} + +template ::value> > +void ParallelFor (T n, L&& f) noexcept +{ + ParallelFor(Gpu::KernelInfo{}, n, std::forward(f)); +} + +template +void ParallelFor (Box const& box, L&& f) noexcept +{ + ParallelFor(Gpu::KernelInfo{}, box, std::forward(f)); +} + 
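// A usage sketch (not part of this patch): the wrappers in this hunk forward
// the original, info-less signatures to the new Gpu::KernelInfo overloads, so
// existing call sites compile unchanged. A call site opts into fusing the same
// way FB_local_copy_gpu does above; `mf` here is a hypothetical MultiFab.
{
    // Inside the guard, small kernels marked fusable are registered rather
    // than launched, then run as a single fused kernel.
    Gpu::FuseSafeGuard fsg(mf.local_size() >= Gpu::getFuseNumKernelsThreshold());
    for (MFIter mfi(mf); mfi.isValid(); ++mfi) {
        Box const& bx = mfi.validbox();
        auto const& a = mf.array(mfi);
        amrex::ParallelFor(Gpu::KernelInfo{}.setFusable(true), bx,
        [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept
        {
            a(i,j,k) = 0.0;
        });
    }
    Gpu::LaunchFusedKernels(); // no-op if nothing was registered
}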
+template ::value> > +void ParallelFor (Box const& box, T ncomp, L&& f) noexcept +{ + ParallelFor(Gpu::KernelInfo{},box,ncomp,std::forward(f)); +} + +template +void ParallelFor (Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept +{ + ParallelFor(Gpu::KernelInfo{},box1,box2,std::forward(f1),std::forward(f2)); +} + +template +void ParallelFor (Box const& box1, Box const& box2, Box const& box3, + L1&& f1, L2&& f2, L3&& f3) noexcept +{ + ParallelFor(Gpu::KernelInfo{},box1,box2,box3,std::forward(f1),std::forward(f2),std::forward(f3)); +} + +template ::value>, + typename M2=amrex::EnableIf_t::value> > +void ParallelFor (Box const& box1, T1 ncomp1, L1&& f1, + Box const& box2, T2 ncomp2, L2&& f2) noexcept +{ + ParallelFor(Gpu::KernelInfo{},box1,ncomp1,std::forward(f1),box2,ncomp2,std::forward(f2)); +} + +template ::value>, + typename M2=amrex::EnableIf_t::value>, + typename M3=amrex::EnableIf_t::value> > +void ParallelFor (Box const& box1, T1 ncomp1, L1&& f1, + Box const& box2, T2 ncomp2, L2&& f2, + Box const& box3, T3 ncomp3, L3&& f3) noexcept +{ + ParallelFor(Gpu::KernelInfo{}, + box1,ncomp1,std::forward(f1), + box2,ncomp2,std::forward(f2), + box3,ncomp3,std::forward(f3)); +} + template ::value> > void For (T n, L&& f) noexcept { - ParallelFor(n,std::forward(f)); + ParallelFor(Gpu::KernelInfo{}, n,std::forward(f)); } template void For (Box const& box, L&& f) noexcept { - ParallelFor(box,std::forward(f)); + ParallelFor(Gpu::KernelInfo{}, box,std::forward(f)); } template ::value> > void For (Box const& box, T ncomp, L&& f) noexcept { - ParallelFor(box,ncomp,std::forward(f)); + ParallelFor(Gpu::KernelInfo{},box,ncomp,std::forward(f)); } template void For (Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept { - ParallelFor(box1,box2,std::forward(f1),std::forward(f2)); + ParallelFor(Gpu::KernelInfo{},box1,box2,std::forward(f1),std::forward(f2)); } template -void For (Box const& box1, Box const& box2, Box const& box3, L1&& f1, L2&& f2, L3&& f3) noexcept +void For (Box const& box1, Box const& box2, Box const& box3, + L1&& f1, L2&& f2, L3&& f3) noexcept { - ParallelFor(box1,box2,box3,std::forward(f1),std::forward(f2),std::forward(f3)); + ParallelFor(Gpu::KernelInfo{},box1,box2,box3,std::forward(f1),std::forward(f2),std::forward(f3)); } template (f1),box2,ncomp2,std::forward(f2)); + ParallelFor(Gpu::KernelInfo{},box1,ncomp1,std::forward(f1),box2,ncomp2,std::forward(f2)); } template (f1), + ParallelFor(Gpu::KernelInfo{}, + box1,ncomp1,std::forward(f1), box2,ncomp2,std::forward(f2), box3,ncomp3,std::forward(f3)); } template ::value> > amrex::EnableIf_t::value> -HostDeviceParallelFor (T n, L&& f) noexcept +HostDeviceParallelFor (Gpu::KernelInfo const& info, T n, L&& f) noexcept { if (Gpu::inLaunchRegion()) { - ParallelFor(n,std::forward(f)); + ParallelFor(info,n,std::forward(f)); } else { AMREX_PRAGMA_SIMD for (T i = 0; i < n; ++i) f(i); } } +template ::value> > +amrex::EnableIf_t::value> +HostDeviceParallelFor (T n, L&& f) noexcept +{ + HostDeviceParallelFor(Gpu::KernelInfo{}, n, std::forward(f)); +} + template amrex::EnableIf_t::value> -HostDeviceParallelFor (Box const& box, L&& f) noexcept +HostDeviceParallelFor (Gpu::KernelInfo const& info, Box const& box, L&& f) noexcept { if (Gpu::inLaunchRegion()) { - ParallelFor(box,std::forward(f)); + ParallelFor(info, box,std::forward(f)); } else { LoopConcurrentOnCpu(box,std::forward(f)); } @@ -1060,10 +1245,10 @@ HostDeviceParallelFor (Box const& box, L&& f) noexcept template ::value> > amrex::EnableIf_t::value> -HostDeviceParallelFor (Box 
const& box, T ncomp, L&& f) noexcept +HostDeviceParallelFor (Gpu::KernelInfo const& info, Box const& box, T ncomp, L&& f) noexcept { if (Gpu::inLaunchRegion()) { - ParallelFor(box,ncomp,std::forward(f)); + ParallelFor(info, box,ncomp,std::forward(f)); } else { LoopConcurrentOnCpu(box,ncomp,std::forward(f)); } @@ -1071,10 +1256,11 @@ HostDeviceParallelFor (Box const& box, T ncomp, L&& f) noexcept template amrex::EnableIf_t::value and MaybeHostDeviceRunnable::value> -HostDeviceParallelFor (Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept +HostDeviceParallelFor (Gpu::KernelInfo const& info, + Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept { if (Gpu::inLaunchRegion()) { - ParallelFor(box1,box2,std::forward(f1),std::forward(f2)); + ParallelFor(info,box1,box2,std::forward(f1),std::forward(f2)); } else { LoopConcurrentOnCpu(box1,std::forward(f1)); LoopConcurrentOnCpu(box2,std::forward(f2)); @@ -1083,11 +1269,13 @@ HostDeviceParallelFor (Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexc template amrex::EnableIf_t::value and MaybeHostDeviceRunnable::value and MaybeHostDeviceRunnable::value> -HostDeviceParallelFor (Box const& box1, Box const& box2, Box const& box3, +HostDeviceParallelFor (Gpu::KernelInfo const& info, + Box const& box1, Box const& box2, Box const& box3, L1&& f1, L2&& f2, L3&& f3) noexcept { if (Gpu::inLaunchRegion()) { - ParallelFor(box1,box2,box3,std::forward(f1),std::forward(f2),std::forward(f3)); + ParallelFor(info,box1,box2,box3, + std::forward(f1),std::forward(f2),std::forward(f3)); } else { LoopConcurrentOnCpu(box1,std::forward(f1)); LoopConcurrentOnCpu(box2,std::forward(f2)); @@ -1099,11 +1287,12 @@ template ::value>, typename M2=amrex::EnableIf_t::value> > amrex::EnableIf_t::value and MaybeHostDeviceRunnable::value> -HostDeviceParallelFor (Box const& box1, T1 ncomp1, L1&& f1, +HostDeviceParallelFor (Gpu::KernelInfo const& info, + Box const& box1, T1 ncomp1, L1&& f1, Box const& box2, T2 ncomp2, L2&& f2) noexcept { if (Gpu::inLaunchRegion()) { - ParallelFor(box1,ncomp1,std::forward(f1),box2,ncomp2,std::forward(f2)); + ParallelFor(info,box1,ncomp1,std::forward(f1),box2,ncomp2,std::forward(f2)); } else { LoopConcurrentOnCpu(box1,ncomp1,std::forward(f1)); LoopConcurrentOnCpu(box2,ncomp2,std::forward(f2)); @@ -1115,12 +1304,14 @@ template ::value>, typename M3=amrex::EnableIf_t::value> > amrex::EnableIf_t::value and MaybeHostDeviceRunnable::value and MaybeHostDeviceRunnable::value> -HostDeviceParallelFor (Box const& box1, T1 ncomp1, L1&& f1, +HostDeviceParallelFor (Gpu::KernelInfo const& info, + Box const& box1, T1 ncomp1, L1&& f1, Box const& box2, T2 ncomp2, L2&& f2, Box const& box3, T3 ncomp3, L3&& f3) noexcept { if (Gpu::inLaunchRegion()) { - ParallelFor(box1,ncomp1,std::forward(f1), + ParallelFor(info, + box1,ncomp1,std::forward(f1), box2,ncomp2,std::forward(f2), box3,ncomp3,std::forward(f3)); } else { @@ -1131,54 +1322,115 @@ HostDeviceParallelFor (Box const& box1, T1 ncomp1, L1&& f1, } template ::value> > -void HostDeviceFor (T n, L&& f) noexcept +void HostDeviceFor (Gpu::KernelInfo const& info, T n, L&& f) noexcept { - HostDeviceParallelFor(n,std::forward(f)); + HostDeviceParallelFor(info,n,std::forward(f)); } template -void HostDeviceFor (Box const& box, L&& f) noexcept +void HostDeviceFor (Gpu::KernelInfo const& info, Box const& box, L&& f) noexcept { - HostDeviceParallelFor(box,std::forward(f)); + HostDeviceParallelFor(info,box,std::forward(f)); } template ::value> > -void HostDeviceFor (Box const& box, T ncomp, L&& f) noexcept +void 
HostDeviceFor (Gpu::KernelInfo const& info, Box const& box, T ncomp, L&& f) noexcept { - HostDeviceParallelFor(box,ncomp,std::forward(f)); + HostDeviceParallelFor(info,box,ncomp,std::forward(f)); } template -void HostDeviceFor (Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept +void HostDeviceFor (Gpu::KernelInfo const& info, + Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept { - HostDeviceParallelFor(box1,box2,std::forward(f1),std::forward(f2)); + HostDeviceParallelFor(info,box1,box2,std::forward(f1),std::forward(f2)); } template -void HostDeviceFor (Box const& box1, Box const& box2, Box const& box3, +void HostDeviceFor (Gpu::KernelInfo const& info, + Box const& box1, Box const& box2, Box const& box3, L1&& f1, L2&& f2, L3&& f3) noexcept { - HostDeviceParallelFor(box1,box2,box3,std::forward(f1),std::forward(f2),std::forward(f3)); + HostDeviceParallelFor(info, box1,box2,box3, + std::forward(f1),std::forward(f2),std::forward(f3)); } template ::value>, typename M2=amrex::EnableIf_t::value> > -void HostDeviceFor (Box const& box1, T1 ncomp1, L1&& f1, +void HostDeviceFor (Gpu::KernelInfo const& info, + Box const& box1, T1 ncomp1, L1&& f1, Box const& box2, T2 ncomp2, L2&& f2) noexcept { - HostDeviceParallelFor(box1,ncomp1,std::forward(f1),box2,ncomp2,std::forward(f2)); + HostDeviceParallelFor(info,box1,ncomp1,std::forward(f1),box2,ncomp2,std::forward(f2)); } template ::value>, typename M2=amrex::EnableIf_t::value>, typename M3=amrex::EnableIf_t::value> > -void HostDeviceFor (Box const& box1, T1 ncomp1, L1&& f1, +void HostDeviceFor (Gpu::KernelInfo const& info, + Box const& box1, T1 ncomp1, L1&& f1, Box const& box2, T2 ncomp2, L2&& f2, Box const& box3, T3 ncomp3, L3&& f3) noexcept { - HostDeviceParallelFor(box1,ncomp1,std::forward(f1), + HostDeviceParallelFor(info, + box1,ncomp1,std::forward(f1), + box2,ncomp2,std::forward(f2), + box3,ncomp3,std::forward(f3)); +} + +template ::value> > +void HostDeviceParallelFor (T n, L&& f) noexcept +{ + HostDeviceParallelFor(Gpu::KernelInfo{},n,std::forward(f)); +} + +template +void HostDeviceParallelFor (Box const& box, L&& f) noexcept +{ + HostDeviceParallelFor(Gpu::KernelInfo{},box,std::forward(f)); +} + +template ::value> > +void HostDeviceParallelFor (Box const& box, T ncomp, L&& f) noexcept +{ + HostDeviceParallelFor(Gpu::KernelInfo{},box,ncomp,std::forward(f)); +} + +template +void HostDeviceParallelFor (Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept +{ + HostDeviceParallelFor(Gpu::KernelInfo{},box1,box2,std::forward(f1),std::forward(f2)); +} + +template +void HostDeviceParallelFor (Box const& box1, Box const& box2, Box const& box3, + L1&& f1, L2&& f2, L3&& f3) noexcept +{ + HostDeviceParallelFor(Gpu::KernelInfo{}, box1,box2,box3, + std::forward(f1),std::forward(f2),std::forward(f3)); +} + +template ::value>, + typename M2=amrex::EnableIf_t::value> > +void HostDeviceParallelFor (Box const& box1, T1 ncomp1, L1&& f1, + Box const& box2, T2 ncomp2, L2&& f2) noexcept +{ + HostDeviceParallelFor(Gpu::KernelInfo{},box1,ncomp1,std::forward(f1),box2,ncomp2,std::forward(f2)); +} + +template ::value>, + typename M2=amrex::EnableIf_t::value>, + typename M3=amrex::EnableIf_t::value> > +void HostDeviceParallelFor (Box const& box1, T1 ncomp1, L1&& f1, + Box const& box2, T2 ncomp2, L2&& f2, + Box const& box3, T3 ncomp3, L3&& f3) noexcept +{ + HostDeviceParallelFor(Gpu::KernelInfo{}, + box1,ncomp1,std::forward(f1), box2,ncomp2,std::forward(f2), box3,ncomp3,std::forward(f3)); } diff --git a/Src/Base/AMReX_GpuLaunchMacrosG.H 
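
The HostDevice variants keep their run-time dispatch while gaining the KernelInfo parameter: in a launch region they forward it to ParallelFor, so the kernel may be fused, and otherwise they fall back to a SIMD loop or LoopConcurrentOnCpu. A hedged sketch of a direct call, assuming extended host-device lambdas are enabled (AMReX's usual nvcc configuration); saxpy_fused and its arguments are illustrative names, not part of the patch:

    #include <AMReX.H>
    #include <AMReX_MultiFab.H>
    #include <AMReX_Gpu.H>

    // Hedged sketch: one call site serving both execution modes.
    void saxpy_fused (amrex::MultiFab& dst, amrex::Real a,
                      amrex::MultiFab const& src)
    {
        using namespace amrex;
        for (MFIter mfi(dst,TilingIfNotGPU()); mfi.isValid(); ++mfi) {
            Box const& bx = mfi.tilebox();
            Array4<Real> const& d = dst.array(mfi);
            Array4<Real const> const& s = src.const_array(mfi);
            HostDeviceParallelFor(Gpu::KernelInfo{}.setFusable(true),
                bx, dst.nComp(),
                [=] AMREX_GPU_HOST_DEVICE (int i, int j, int k, int n) noexcept
                {
                    d(i,j,k,n) += a * s(i,j,k,n);
                });
        }
    }
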
b/Src/Base/AMReX_GpuLaunchMacrosG.H index 83306b4b560..bb69202a9d9 100644 --- a/Src/Base/AMReX_GpuLaunchMacrosG.H +++ b/Src/Base/AMReX_GpuLaunchMacrosG.H @@ -53,6 +53,34 @@ block \ } \ }}} +#ifdef AMREX_USE_CUDA +#define AMREX_GPU_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA_RANGE(TN,TI,block) \ + { auto const& amrex_i_tn = TN; \ + if (!amrex::isEmpty(amrex_i_tn)) { \ + if (amrex::Gpu::inLaunchRegion()) \ + { \ + const auto amrex_i_ec = amrex::Gpu::ExecutionConfig(amrex_i_tn); \ + if (amrex::Gpu::inFuseRegion() && amrex_i_ec.numBlocks.x*amrex_i_ec.numThreads.x <= amrex::Gpu::getFuseSizeThreshold()) { \ + amrex::Gpu::Register(amrex_i_tn, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept { \ + amrex::Box TI(amrex::IntVect(AMREX_D_DECL(i,j,k)),amrex::IntVect(AMREX_D_DECL(i,j,k))); \ + block \ + }); \ + } else { \ + AMREX_LAUNCH_KERNEL(amrex_i_ec.numBlocks, amrex_i_ec.numThreads, amrex_i_ec.sharedMem, amrex::Gpu::gpuStream(), \ + [=] AMREX_GPU_DEVICE () noexcept { \ + for (auto const TI : amrex::Gpu::Range(amrex_i_tn)) { \ + block \ + } \ + }); \ + AMREX_GPU_ERROR_CHECK(); \ + } \ + } \ + else { \ + for (auto const TI : amrex::Gpu::Range(amrex_i_tn)) { \ + block \ + } \ + }}} +#endif #endif // two fused launches @@ -134,6 +162,51 @@ block2 \ } \ }}} +#ifdef AMREX_USE_CUDA +#define AMREX_GPU_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA_RANGE_2(TN1,TI1,block1,TN2,TI2,block2) \ + { auto const& amrex_i_tn1 = TN1; auto const& amrex_i_tn2 = TN2; \ + if (!amrex::isEmpty(amrex_i_tn1) or !amrex::isEmpty(amrex_i_tn2)) { \ + if (amrex::Gpu::inLaunchRegion()) \ + { \ + const auto amrex_i_ec1 = amrex::Gpu::ExecutionConfig(amrex_i_tn1); \ + const auto amrex_i_ec2 = amrex::Gpu::ExecutionConfig(amrex_i_tn2); \ + dim3 amrex_i_nblocks = amrex::max(amrex_i_ec1.numBlocks.x, \ + amrex_i_ec2.numBlocks.x); \ + amrex_i_nblocks.y = 2; \ + if (amrex::Gpu::inFuseRegion() && amrex_i_nblocks.x*amrex_i_nblocks.y*amrex_i_ec1.numThreads.x <= amrex::Gpu::getFuseSizeThreshold()) { \ + amrex::Gpu::Register(amrex_i_tn1, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept { \ + amrex::Box TI1(amrex::IntVect(AMREX_D_DECL(i,j,k)),amrex::IntVect(AMREX_D_DECL(i,j,k))); \ + block1 \ + }); \ + amrex::Gpu::Register(amrex_i_tn2, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept { \ + amrex::Box TI2(amrex::IntVect(AMREX_D_DECL(i,j,k)),amrex::IntVect(AMREX_D_DECL(i,j,k))); \ + block2 \ + }); \ + } else { \ + AMREX_LAUNCH_KERNEL(amrex_i_nblocks, amrex_i_ec1.numThreads, 0, amrex::Gpu::gpuStream(), \ + [=] AMREX_GPU_DEVICE () noexcept { \ + switch (blockIdx.y) { \ + case 0: for (auto const TI1 : amrex::Gpu::Range(amrex_i_tn1)) { \ + block1 \ + } \ + break; \ + case 1: for (auto const TI2 : amrex::Gpu::Range(amrex_i_tn2)) { \ + block2 \ + } \ + } \ + }); \ + AMREX_GPU_ERROR_CHECK(); \ + } \ + } \ + else { \ + for (auto const TI1 : amrex::Gpu::Range(amrex_i_tn1)) { \ + block1 \ + } \ + for (auto const TI2 : amrex::Gpu::Range(amrex_i_tn2)) { \ + block2 \ + } \ + }}} +#endif #endif // three fused launches @@ -233,6 +306,64 @@ block3 \ } \ }}} +#ifdef AMREX_USE_CUDA +#define AMREX_GPU_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA_RANGE_3(TN1,TI1,block1,TN2,TI2,block2,TN3,TI3,block3) \ + { auto const& amrex_i_tn1 = TN1; auto const& amrex_i_tn2 = TN2; auto const& amrex_i_tn3 = TN3; \ + if (!amrex::isEmpty(amrex_i_tn1) or !amrex::isEmpty(amrex_i_tn2) or !amrex::isEmpty(amrex_i_tn3)) { \ + if (amrex::Gpu::inLaunchRegion()) \ + { \ + const auto amrex_i_ec1 = amrex::Gpu::ExecutionConfig(amrex_i_tn1); \ + const auto amrex_i_ec2 = amrex::Gpu::ExecutionConfig(amrex_i_tn2); \ + const 
auto amrex_i_ec3 = amrex::Gpu::ExecutionConfig(amrex_i_tn3); \ + dim3 amrex_i_nblocks = amrex::max(amrex::max(amrex_i_ec1.numBlocks.x, \ + amrex_i_ec2.numBlocks.x), \ + amrex_i_ec3.numBlocks.x); \ + amrex_i_nblocks.y = 3; \ + if (amrex::Gpu::inFuseRegion() && amrex_i_nblocks.x*amrex_i_nblocks.y*amrex_i_ec1.numThreads.x <= amrex::Gpu::getFuseSizeThreshold()) { \ + amrex::Gpu::Register(amrex_i_tn1, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept { \ + amrex::Box TI1(amrex::IntVect(AMREX_D_DECL(i,j,k)),amrex::IntVect(AMREX_D_DECL(i,j,k))); \ + block1 \ + }); \ + amrex::Gpu::Register(amrex_i_tn2, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept { \ + amrex::Box TI2(amrex::IntVect(AMREX_D_DECL(i,j,k)),amrex::IntVect(AMREX_D_DECL(i,j,k))); \ + block2 \ + }); \ + amrex::Gpu::Register(amrex_i_tn3, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept { \ + amrex::Box TI3(amrex::IntVect(AMREX_D_DECL(i,j,k)),amrex::IntVect(AMREX_D_DECL(i,j,k))); \ + block3 \ + }); \ + } else { \ + AMREX_LAUNCH_KERNEL(amrex_i_nblocks, amrex_i_ec1.numThreads, 0, amrex::Gpu::gpuStream(), \ + [=] AMREX_GPU_DEVICE () noexcept { \ + switch (blockIdx.y) { \ + case 0: for (auto const TI1 : amrex::Gpu::Range(amrex_i_tn1)) { \ + block1 \ + } \ + break; \ + case 1: for (auto const TI2 : amrex::Gpu::Range(amrex_i_tn2)) { \ + block2 \ + } \ + break; \ + case 2: for (auto const TI3 : amrex::Gpu::Range(amrex_i_tn3)) { \ + block3 \ + } \ + } \ + }); \ + AMREX_GPU_ERROR_CHECK(); \ + } \ + } \ + else { \ + for (auto const TI1 : amrex::Gpu::Range(amrex_i_tn1)) { \ + block1 \ + } \ + for (auto const TI2 : amrex::Gpu::Range(amrex_i_tn2)) { \ + block2 \ + } \ + for (auto const TI3 : amrex::Gpu::Range(amrex_i_tn3)) { \ + block3 \ + } \ + }}} +#endif #endif #ifdef AMREX_USE_DPCPP @@ -452,6 +583,19 @@ } \ } +#define AMREX_GPU_HOST_DEVICE_FOR_1D_FUSABLE(n,i,block) \ +{ \ + auto const& amrex_i_n = n; \ + using amrex_i_inttype = typename std::remove_const::type; \ + if (amrex::Gpu::inLaunchRegion()) { \ + amrex::ParallelFor(amrex::Gpu::KernelInfo().setFusable(true),amrex_i_n,[=] AMREX_GPU_DEVICE (amrex_i_inttype i) noexcept block); \ + } else { \ + auto amrex_i_lambda = [=] (amrex_i_inttype i) noexcept block; \ + AMREX_PRAGMA_SIMD \ + for (amrex_i_inttype i = 0; i < amrex_i_n; ++i) amrex_i_lambda(i); \ + } \ +} + #define AMREX_GPU_DEVICE_FOR_1D(n,i,block) \ { \ using amrex_i_inttype = typename std::remove_const::type; \ @@ -470,6 +614,16 @@ } \ } +#define AMREX_GPU_HOST_DEVICE_FOR_3D_FUSABLE(box,i,j,k,block) \ +{ \ + auto const& amrex_i_box = box; \ + if (amrex::Gpu::inLaunchRegion()) { \ + amrex::ParallelFor(amrex::Gpu::KernelInfo().setFusable(true),amrex_i_box,[=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept block); \ + } else { \ + amrex::LoopConcurrentOnCpu(amrex_i_box,[=] (int i, int j, int k) noexcept block); \ + } \ +} + #define AMREX_GPU_DEVICE_FOR_3D(box,i,j,k,block) \ { \ amrex::ParallelFor(box,[=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept block); \ @@ -488,6 +642,17 @@ } \ } +#define AMREX_GPU_HOST_DEVICE_FOR_4D_FUSABLE(box,ncomp,i,j,k,n,block) \ +{ \ + auto const& amrex_i_box = box; \ + auto const& amrex_i_ncomp = ncomp; \ + if (amrex::Gpu::inLaunchRegion()) { \ + amrex::ParallelFor(amrex::Gpu::KernelInfo().setFusable(true),amrex_i_box,amrex_i_ncomp,[=] AMREX_GPU_DEVICE (int i, int j, int k, int n) noexcept block); \ + } else { \ + amrex::LoopConcurrentOnCpu(amrex_i_box,amrex_i_ncomp,[=] (int i, int j, int k, int n) noexcept block); \ + } \ +} + #define AMREX_GPU_DEVICE_FOR_4D(box,ncomp,i,j,k,n,block) \ { \ 
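
These macros are normally reached through the dispatchers added in AMReX_GpuLaunch.H (AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA and the *_FOR_*_FUSABLE family), which degrade to the existing non-fusing macros when CUDA is not in use. A hedged usage sketch in the style of the call sites changed later in this patch; work_on and zero_fab are illustrative names:

    #include <AMReX.H>
    #include <AMReX_FArrayBox.H>
    #include <AMReX_Gpu.H>

    // Illustrative host/device fab function: zero a subbox.
    AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
    void work_on (amrex::Box const& tbx, amrex::Array4<amrex::Real> const& a)
    {
        const auto lo = amrex::lbound(tbx);
        const auto hi = amrex::ubound(tbx);
        for (int k = lo.z; k <= hi.z; ++k) {
        for (int j = lo.y; j <= hi.y; ++j) {
        for (int i = lo.x; i <= hi.x; ++i) {
            a(i,j,k) = 0.0;
        }}}
    }

    void zero_fab (amrex::FArrayBox& fab)
    {
        amrex::Box const& bx = fab.box();
        amrex::Array4<amrex::Real> const& arr = fab.array();
        // May be registered for fusing instead of launched, depending on
        // the fuse region and size threshold checked inside the macro.
        AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA(bx, tbx,
        {
            work_on(tbx, arr);
        });
    }
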
amrex::ParallelFor(box,ncomp,[=] AMREX_GPU_DEVICE (int i, int j, int k, int n) noexcept block); \ diff --git a/Src/Base/AMReX_MFIter.H b/Src/Base/AMReX_MFIter.H index 3df4cc2f884..8cae1464acc 100644 --- a/Src/Base/AMReX_MFIter.H +++ b/Src/Base/AMReX_MFIter.H @@ -222,6 +222,10 @@ protected: mutable Vector > real_device_reduce_list; #endif +#ifdef AMREX_USE_GPU + std::unique_ptr gpu_fsg; +#endif + static int nextDynamicIndex; void Initialize (); diff --git a/Src/Base/AMReX_MFIter.cpp b/Src/Base/AMReX_MFIter.cpp index 3c07e07fe17..375fd3e578c 100644 --- a/Src/Base/AMReX_MFIter.cpp +++ b/Src/Base/AMReX_MFIter.cpp @@ -225,6 +225,9 @@ MFIter::~MFIter () AMREX_GPU_ERROR_CHECK(); Gpu::Device::resetStreamIndex(); Gpu::resetNumCallbacks(); + if (!OpenMP::in_parallel() && Gpu::inFuseRegion()) { + Gpu::LaunchFusedKernels(); + } #endif if (m_fa) { @@ -236,7 +239,7 @@ MFIter::~MFIter () } } -void +void MFIter::Initialize () { if (flags & SkipInit) { @@ -327,6 +330,11 @@ MFIter::Initialize () #ifdef AMREX_USE_GPU Gpu::Device::setStreamIndex((streams > 0) ? currentIndex%streams : -1); Gpu::resetNumCallbacks(); + if (!OpenMP::in_parallel()) { + if (index_map->size() >= Gpu::getFuseNumKernelsThreshold()) { + gpu_fsg.reset(new Gpu::FuseSafeGuard(true)); + } + } #endif typ = fabArray.boxArray().ixType(); diff --git a/Src/Base/AMReX_MultiFab.cpp b/Src/Base/AMReX_MultiFab.cpp index 08d945cdca2..a748bcd2eb8 100644 --- a/Src/Base/AMReX_MultiFab.cpp +++ b/Src/Base/AMReX_MultiFab.cpp @@ -43,16 +43,36 @@ MultiFab::Dot (const MultiFab& x, int xcomp, BL_PROFILE("MultiFab::Dot()"); - Real sm = amrex::ReduceSum(x, y, nghost, - [=] AMREX_GPU_HOST_DEVICE (Box const& bx, Array4 const& xfab, Array4 const& yfab) -> Real - { - Real t = 0.0; - AMREX_LOOP_4D(bx, numcomp, i, j, k, n, + Real sm = 0.0; +#ifdef AMREX_USE_GPU + if (Gpu::inLaunchRegion()) { + sm = amrex::ReduceSum(x, y, nghost, + [=] AMREX_GPU_DEVICE (Box const& bx, Array4 const& xfab, Array4 const& yfab) -> Real { - t += xfab(i,j,k,xcomp+n) * yfab(i,j,k,ycomp+n); + Real t = 0.0; + AMREX_LOOP_4D(bx, numcomp, i, j, k, n, + { + t += xfab(i,j,k,xcomp+n) * yfab(i,j,k,ycomp+n); + }); + return t; }); - return t; - }); + } else +#endif + { +#ifdef _OPENMP +#pragma omp parallel reduction(+:sm) +#endif + for (MFIter mfi(x,true); mfi.isValid(); ++mfi) + { + Box const& bx = mfi.growntilebox(nghost); + Array4 const& xfab = x.const_array(mfi); + Array4 const& yfab = y.const_array(mfi); + AMREX_LOOP_4D(bx, numcomp, i, j, k, n, + { + sm += xfab(i,j,k,xcomp+n) * yfab(i,j,k,ycomp+n); + }); + } + } if (!local) ParallelAllReduce::Sum(sm, ParallelContext::CommunicatorSub()); @@ -198,7 +218,7 @@ MultiFab::Swap (MultiFab& dst, MultiFab& src, if (bx.ok()) { auto sfab = src.array(mfi); auto dfab = dst.array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D ( bx, numcomp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSABLE ( bx, numcomp, i, j, k, n, { const amrex::Real tmp = dfab(i,j,k,n+dstcomp); dfab(i,j,k,n+dstcomp) = sfab(i,j,k,n+srccomp); @@ -298,7 +318,7 @@ MultiFab::Saxpy (MultiFab& dst, Real a, const MultiFab& src, if (bx.ok()) { auto const sfab = src.array(mfi); auto dfab = dst.array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D ( bx, numcomp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSABLE ( bx, numcomp, i, j, k, n, { dfab(i,j,k,dstcomp+n) += a * sfab(i,j,k,srccomp+n); }); @@ -332,7 +352,7 @@ MultiFab::Xpay (MultiFab& dst, Real a, const MultiFab& src, if (bx.ok()) { auto const sfab = src.array(mfi); auto dfab = dst.array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D ( bx, numcomp, i, 
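
With the MFIter changes, fusing is largely transparent at call sites: Initialize installs a Gpu::FuseSafeGuard(true) when it is not inside an OpenMP parallel region and the local box count meets Gpu::getFuseNumKernelsThreshold(), and the destructor flushes the queued kernels with Gpu::LaunchFusedKernels(). A hedged sketch of the resulting lifecycle; plus_one is an illustrative name:

    #include <AMReX.H>
    #include <AMReX_MultiFab.H>
    #include <AMReX_Gpu.H>

    void plus_one (amrex::MultiFab& mf)
    {
        using namespace amrex;
        // MFIter::Initialize may turn the fuse region on for this loop.
        for (MFIter mfi(mf); mfi.isValid(); ++mfi) {
            Box const& bx = mfi.validbox();
            Array4<Real> const& a = mf.array(mfi);
            // Inside the fuse region this small kernel is queued rather
            // than launched, because it is marked as a candidate.
            ParallelFor(Gpu::KernelInfo{}.setFusable(true), bx,
            [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept
            {
                a(i,j,k) += Real(1.0);
            });
        }
        // ~MFIter has run Gpu::LaunchFusedKernels() to flush the queue.
    }
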
j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSABLE ( bx, numcomp, i, j, k, n, { dfab(i,j,k,n+dstcomp) = sfab(i,j,k,n+srccomp) + a * dfab(i,j,k,n+dstcomp); }); @@ -374,7 +394,7 @@ MultiFab::LinComb (MultiFab& dst, auto const xfab = x.array(mfi); auto const yfab = y.array(mfi); auto dfab = dst.array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D ( bx, numcomp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSABLE ( bx, numcomp, i, j, k, n, { dfab(i,j,k,dstcomp+n) = a*xfab(i,j,k,xcomp+n) + b*yfab(i,j,k,ycomp+n); }); @@ -415,7 +435,7 @@ MultiFab::AddProduct (MultiFab& dst, auto const s1fab = src1.array(mfi); auto const s2fab = src2.array(mfi); auto dfab = dst.array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D ( bx, numcomp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSABLE ( bx, numcomp, i, j, k, n, { dfab(i,j,k,n+dstcomp) += s1fab(i,j,k,n+comp1) * s2fab(i,j,k,n+comp2); }); diff --git a/Src/Base/AMReX_MultiFabUtil.cpp b/Src/Base/AMReX_MultiFabUtil.cpp index d312cb676cf..3acbeeabe5f 100644 --- a/Src/Base/AMReX_MultiFabUtil.cpp +++ b/Src/Base/AMReX_MultiFabUtil.cpp @@ -67,7 +67,7 @@ namespace amrex Array4 const& ccarr = cc.array(mfi); Array4 const& ndarr = nd.const_array(mfi); - AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( bx, tbx, + AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA ( bx, tbx, { amrex_avg_nd_to_cc(tbx, ccarr, ndarr, dcomp, scomp, ncomp); }); @@ -91,7 +91,7 @@ namespace amrex Array4 const& eyarr = edge[1]->const_array(mfi);, Array4 const& ezarr = edge[2]->const_array(mfi);); - AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( bx, tbx, + AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA ( bx, tbx, { amrex_avg_eg_to_cc(tbx, ccarr, AMREX_D_DECL(exarr,eyarr,ezarr), dcomp); }); @@ -133,12 +133,12 @@ namespace amrex Array4 const& fzarr = fc[2]->const_array(mfi);); #if (AMREX_SPACEDIM == 1) - AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( bx, tbx, + AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA ( bx, tbx, { amrex_avg_fc_to_cc(tbx, ccarr, fxarr, dcomp, GeometryData()); }); #else - AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( bx, tbx, + AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA ( bx, tbx, { amrex_avg_fc_to_cc(tbx, ccarr, AMREX_D_DECL(fxarr,fyarr,fzarr), dcomp); }); @@ -168,12 +168,12 @@ namespace amrex Array4 const& fzarr = fc[2]->const_array(mfi);); #if (AMREX_SPACEDIM == 1) - AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( bx, tbx, + AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA ( bx, tbx, { amrex_avg_fc_to_cc(tbx, ccarr, fxarr, 0, gd); }); #else - AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( bx, tbx, + AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA ( bx, tbx, { amrex_avg_fc_to_cc(tbx, ccarr, AMREX_D_DECL(fxarr,fyarr,fzarr), 0); }); @@ -218,12 +218,12 @@ namespace amrex Array4 const& ccarr = cc.const_array(mfi); #if (AMREX_SPACEDIM == 1) - AMREX_LAUNCH_HOST_DEVICE_LAMBDA (index_bounds, tbx, + AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA (index_bounds, tbx, { amrex_avg_cc_to_fc(tbx, xbx, fxarr, ccarr, gd); }); #else - AMREX_LAUNCH_HOST_DEVICE_LAMBDA (index_bounds, tbx, + AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA (index_bounds, tbx, { amrex_avg_cc_to_fc(tbx, AMREX_D_DECL(xbx,ybx,zbx), AMREX_D_DECL(fxarr,fyarr,fzarr), ccarr); @@ -288,7 +288,7 @@ namespace amrex Array4 const& finearr = S_fine.const_array(mfi); Array4 const& finevolarr = fvolume.const_array(mfi); - AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( bx, tbx, + AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA ( bx, tbx, { amrex_avgdown_with_vol(tbx,crsearr,finearr,finevolarr, 0,scomp,ncomp,ratio); @@ -337,7 +337,7 @@ namespace amrex Array4 const& crsearr = crse_S_fine.array(mfi); Array4 const& finearr = S_fine.const_array(mfi); - AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( bx, tbx, 
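
The MultiFab::Dot change above splits the reduction explicitly: a device-side amrex::ReduceSum when GPU launches are enabled, and an OpenMP-reduced MFIter loop otherwise, instead of one shared host-device lambda. This lets each side use its natural reduction mechanism. A hedged single-component sketch of the same pattern; dot0 is an illustrative name and the MPI reduction is left to the caller:

    #include <AMReX.H>
    #include <AMReX_MultiFab.H>
    #include <AMReX_FabArrayUtility.H>

    amrex::Real dot0 (amrex::MultiFab const& x, amrex::MultiFab const& y)
    {
        using namespace amrex;
        Real sm = 0.0;
    #ifdef AMREX_USE_GPU
        if (Gpu::inLaunchRegion()) {
            sm = ReduceSum(x, y, 0,
                [=] AMREX_GPU_DEVICE (Box const& bx,
                                      Array4<Real const> const& xf,
                                      Array4<Real const> const& yf) -> Real
                {
                    Real t = 0.0;
                    AMREX_LOOP_3D(bx, i, j, k,
                    {
                        t += xf(i,j,k) * yf(i,j,k);
                    });
                    return t;
                });
        } else
    #endif
        {
    #ifdef _OPENMP
    #pragma omp parallel reduction(+:sm)
    #endif
            for (MFIter mfi(x,true); mfi.isValid(); ++mfi) {
                Box const& bx = mfi.tilebox();
                Array4<Real const> const& xf = x.const_array(mfi);
                Array4<Real const> const& yf = y.const_array(mfi);
                AMREX_LOOP_3D(bx, i, j, k,
                {
                    sm += xf(i,j,k) * yf(i,j,k);
                });
            }
        }
        return sm; // caller is responsible for the MPI reduction
    }
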
+ AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA ( bx, tbx, { amrex_avgdown(tbx,crsearr,finearr,0,scomp,ncomp,ratio); }); @@ -375,12 +375,12 @@ namespace amrex Array4 const& finearr = S_fine.const_array(mfi); if (is_cell_centered) { - AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( bx, tbx, + AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA ( bx, tbx, { amrex_avgdown(tbx,crsearr,finearr,scomp,scomp,ncomp,ratio); }); } else { - AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( bx, tbx, + AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA ( bx, tbx, { amrex_avgdown_nodes(tbx,crsearr,finearr,scomp,scomp,ncomp,ratio); }); @@ -406,12 +406,12 @@ namespace amrex // not part of the actual crse multifab which came in. if (is_cell_centered) { - AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( bx, tbx, + AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA ( bx, tbx, { amrex_avgdown(tbx,crsearr,finearr,0,scomp,ncomp,ratio); }); } else { - AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( bx, tbx, + AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA ( bx, tbx, { amrex_avgdown_nodes(tbx,crsearr,finearr,0,scomp,ncomp,ratio); }); @@ -486,7 +486,7 @@ namespace amrex Array4 const& crsearr = crse.array(mfi); Array4 const& finearr = fine.const_array(mfi); - AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( bx, tbx, + AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA ( bx, tbx, { amrex_avgdown_faces(tbx, crsearr, finearr, 0, 0, ncomp, ratio, dir); }); @@ -568,7 +568,7 @@ namespace amrex Array4 const& crsearr = crse.array(mfi); Array4 const& finearr = fine.const_array(mfi); - AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( bx, tbx, + AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA ( bx, tbx, { amrex_avgdown_edges(tbx, crsearr, finearr, 0, 0, ncomp, ratio, dir); }); @@ -640,7 +640,7 @@ namespace amrex if (interpolate) { - AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( tile_box, thread_box, + AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA ( tile_box, thread_box, { amrex_fill_slice_interp(thread_box, slice_arr, full_arr, 0, start_comp, ncomp, diff --git a/Src/Base/AMReX_Reduce.H b/Src/Base/AMReX_Reduce.H index beaeac8a218..aaef79e9253 100644 --- a/Src/Base/AMReX_Reduce.H +++ b/Src/Base/AMReX_Reduce.H @@ -309,6 +309,22 @@ public: Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(*dp,r, gh); }); #else +#ifdef AMREX_USE_CUDA + if (Gpu::inFuseRegion() && Gpu::inFuseReductionRegion() + && ec.numBlocks.x*ec.numThreads.x <= Gpu::getFuseSizeThreshold()) + { + Gpu::Register(box, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept + { + ReduceTuple r = *(dp+1); + if (box.contains(IntVect(AMREX_D_DECL(i,j,k)))) { + auto pr = Reduce::detail::call_f(f,i,j,k,ixtype); + Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r, pr); + } + Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(*dp,r); + }); + } else +#endif + { amrex::launch(ec.numBlocks.x, ec.numThreads.x, 0, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE () noexcept { ReduceTuple r = *(dp+1); @@ -325,6 +341,7 @@ public: } Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(*dp,r); }); + } #endif } diff --git a/Src/Base/CMakeLists.txt b/Src/Base/CMakeLists.txt index 17d043fa097..081ecf696ed 100644 --- a/Src/Base/CMakeLists.txt +++ b/Src/Base/CMakeLists.txt @@ -175,6 +175,7 @@ target_sources( amrex # GPU -------------------------------------------------------------------- AMReX_Gpu.H AMReX_GpuQualifiers.H + AMReX_GpuKernelInfo.H AMReX_GpuPrint.H AMReX_GpuAssert.H AMReX_GpuTypes.H diff --git a/Src/Base/Make.package b/Src/Base/Make.package index c97b264374d..edea7af44e0 100644 --- a/Src/Base/Make.package +++ b/Src/Base/Make.package @@ -66,6 +66,7 @@ C$(AMREX_BASE)_headers += AMReX_parstream.H C$(AMREX_BASE)_sources += 
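
The Reduce.H change registers the per-cell reduction body with Gpu::Register when both Gpu::inFuseRegion() and Gpu::inFuseReductionRegion() hold and the launch is small enough; caller-side ReduceOps usage is unchanged. A hedged sketch of that caller-side pattern; sum_box is an illustrative name:

    #include <AMReX.H>
    #include <AMReX_FArrayBox.H>
    #include <AMReX_Reduce.H>

    amrex::Real sum_box (amrex::Box const& bx,
                         amrex::Array4<amrex::Real const> const& a)
    {
        using namespace amrex;
        ReduceOps<ReduceOpSum> reduce_op;
        ReduceData<Real> reduce_data(reduce_op);
        using ReduceTuple = typename decltype(reduce_data)::Type;
        // Inside a fuse-reduction region this eval may be queued with
        // Gpu::Register instead of launching its own kernel.
        reduce_op.eval(bx, reduce_data,
        [=] AMREX_GPU_DEVICE (int i, int j, int k) -> ReduceTuple
        {
            return a(i,j,k);
        });
        return amrex::get<0>(reduce_data.value());
    }
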
AMReX_parstream.cpp C$(AMREX_BASE)_headers += AMReX_Gpu.H AMReX_GpuQualifiers.H AMReX_GpuPrint.H AMReX_GpuAssert.H AMReX_GpuTypes.H AMReX_GpuError.H +C$(AMREX_BASE)_headers += AMReX_GpuKernelInfo.H C$(AMREX_BASE)_headers += AMReX_GpuLaunchMacrosG.H AMReX_GpuLaunchFunctsG.H C$(AMREX_BASE)_headers += AMReX_GpuLaunchMacrosC.H AMReX_GpuLaunchFunctsC.H diff --git a/Src/LinearSolvers/MLMG/AMReX_MLABecLaplacian.cpp b/Src/LinearSolvers/MLMG/AMReX_MLABecLaplacian.cpp index d1cbedfedea..e641313c7a5 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLABecLaplacian.cpp +++ b/Src/LinearSolvers/MLMG/AMReX_MLABecLaplacian.cpp @@ -410,13 +410,13 @@ MLABecLaplacian::Fapply (int amrlev, int mglev, MultiFab& out, const MultiFab& i const auto& bzfab = bzcoef.array(mfi);); if (m_overset_mask[amrlev][mglev]) { const auto& osm = m_overset_mask[amrlev][mglev]->array(mfi); - AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( bx, tbx, + AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA ( bx, tbx, { mlabeclap_adotx_os(tbx, yfab, xfab, afab, AMREX_D_DECL(bxfab,byfab,bzfab), osm, dxinv, ascalar, bscalar, ncomp); }); } else { - AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( bx, tbx, + AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA ( bx, tbx, { mlabeclap_adotx(tbx, yfab, xfab, afab, AMREX_D_DECL(bxfab,byfab,bzfab), dxinv, ascalar, bscalar, ncomp); @@ -454,7 +454,7 @@ MLABecLaplacian::normalize (int amrlev, int mglev, MultiFab& mf) const const auto& byfab = bycoef.array(mfi);, const auto& bzfab = bzcoef.array(mfi);); - AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( bx, tbx, + AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA ( bx, tbx, { mlabeclap_normalize(tbx, fab, afab, AMREX_D_DECL(bxfab,byfab,bzfab), dxinv, ascalar, bscalar, ncomp); @@ -609,7 +609,7 @@ MLABecLaplacian::Fsmooth (int amrlev, int mglev, MultiFab& sol, const MultiFab& #else if (m_overset_mask[amrlev][mglev]) { const auto& osm = m_overset_mask[amrlev][mglev]->array(mfi); - AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( tbx, thread_box, + AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA ( tbx, thread_box, { abec_gsrb_os(thread_box, solnfab, rhsfab, alpha, afab, AMREX_D_DECL(dhx, dhy, dhz), @@ -621,7 +621,7 @@ MLABecLaplacian::Fsmooth (int amrlev, int mglev, MultiFab& sol, const MultiFab& osm, vbx, redblack, nc); }); } else if (regular_coarsening) { - AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( tbx, thread_box, + AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA ( tbx, thread_box, { abec_gsrb(thread_box, solnfab, rhsfab, alpha, afab, AMREX_D_DECL(dhx, dhy, dhz), @@ -688,7 +688,7 @@ MLABecLaplacian::FFlux (Box const& box, Real const* dxinv, Real bscalar, Real fac = bscalar*dxinv[0]; Box blo = amrex::bdryLo(box, 0); int blen = box.length(0); - AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( blo, tbox, + AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA ( blo, tbox, { mlabeclap_flux_xface(tbox, fxarr, solarr, bx, fac, blen, ncomp); }); @@ -696,7 +696,7 @@ MLABecLaplacian::FFlux (Box const& box, Real const* dxinv, Real bscalar, fac = bscalar*dxinv[1]; blo = amrex::bdryLo(box, 1); blen = box.length(1); - AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( blo, tbox, + AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA ( blo, tbox, { mlabeclap_flux_yface(tbox, fyarr, solarr, by, fac, blen, ncomp); }); @@ -705,7 +705,7 @@ MLABecLaplacian::FFlux (Box const& box, Real const* dxinv, Real bscalar, fac = bscalar*dxinv[2]; blo = amrex::bdryLo(box, 2); blen = box.length(2); - AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( blo, tbox, + AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA ( blo, tbox, { mlabeclap_flux_zface(tbox, fzarr, solarr, bz, fac, blen, ncomp); }); @@ -715,14 +715,14 @@ MLABecLaplacian::FFlux (Box const& box, Real const* dxinv, Real bscalar, { 
Real fac = bscalar*dxinv[0]; Box bflux = amrex::surroundingNodes(box, 0); - AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( bflux, tbox, + AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA ( bflux, tbox, { mlabeclap_flux_x(tbox, fxarr, solarr, bx, fac, ncomp); }); #if (AMREX_SPACEDIM >= 2) fac = bscalar*dxinv[1]; bflux = amrex::surroundingNodes(box, 1); - AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( bflux, tbox, + AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA ( bflux, tbox, { mlabeclap_flux_y(tbox, fyarr, solarr, by, fac, ncomp); }); @@ -730,7 +730,7 @@ MLABecLaplacian::FFlux (Box const& box, Real const* dxinv, Real bscalar, #if (AMREX_SPACEDIM == 3) fac = bscalar*dxinv[2]; bflux = amrex::surroundingNodes(box, 2); - AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( bflux, tbox, + AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA ( bflux, tbox, { mlabeclap_flux_z(tbox, fzarr, solarr, bz, fac, ncomp); }); diff --git a/Src/LinearSolvers/MLMG/AMReX_MLCellLinOp.cpp b/Src/LinearSolvers/MLMG/AMReX_MLCellLinOp.cpp index aadc9ee0106..c854898a693 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLCellLinOp.cpp +++ b/Src/LinearSolvers/MLMG/AMReX_MLCellLinOp.cpp @@ -314,7 +314,7 @@ MLCellLinOp::interpolation (int amrlev, int fmglev, MultiFab& fine, const MultiF const Box& bx = mfi.tilebox(); Array4 const& cfab = crse.const_array(mfi); Array4 const& ffab = fine.array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D ( bx, ncomp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSABLE ( bx, ncomp, i, j, k, n, { int ic = amrex::coarsen(i,ratio3.x); int jc = amrex::coarsen(j,ratio3.y); @@ -505,7 +505,7 @@ MLCellLinOp::applyBC (int amrlev, int mglev, MultiFab& in, BCMode bc_mode, State const Real bcllo = bdlv[icomp][olo]; const Real bclhi = bdlv[icomp][ohi]; if (idim == 0) { - AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( + AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA ( blo, tboxlo, { mllinop_apply_bc_x(0, tboxlo, blen, iofab, mlo, bctlo, bcllo, bvlo, @@ -517,7 +517,7 @@ MLCellLinOp::applyBC (int amrlev, int mglev, MultiFab& in, BCMode bc_mode, State imaxorder, dxi, flagbc, icomp); }); } else if (idim == 1) { - AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( + AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA ( blo, tboxlo, { mllinop_apply_bc_y(0, tboxlo, blen, iofab, mlo, bctlo, bcllo, bvlo, @@ -529,7 +529,7 @@ MLCellLinOp::applyBC (int amrlev, int mglev, MultiFab& in, BCMode bc_mode, State imaxorder, dyi, flagbc, icomp); }); } else { - AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( + AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA ( blo, tboxlo, { mllinop_apply_bc_z(0, tboxlo, blen, iofab, mlo, bctlo, bcllo, bvlo, @@ -619,6 +619,7 @@ MLCellLinOp::reflux (int crse_amrlev, AMREX_D_TERM(Elixir elifx = flux[0].elixir();, Elixir elify = flux[1].elixir();, Elixir elifz = flux[2].elixir();); + Gpu::FuseSafeGuard fsg(false); // Turn off fusing in FFlux FFlux(crse_amrlev, mfi, pflux, crse_sol[mfi], Location::FaceCentroid); fluxreg.CrseAdd(mfi, cpflux, crse_dx, dt, RunOn::Gpu); } @@ -640,6 +641,7 @@ MLCellLinOp::reflux (int crse_amrlev, AMREX_D_TERM(Elixir elifx = flux[0].elixir();, Elixir elify = flux[1].elixir();, Elixir elifz = flux[2].elixir();); + Gpu::FuseSafeGuard fsg(false); // Turn off fusing in FFlux FFlux(fine_amrlev, mfi, pflux, fine_sol[mfi], Location::FaceCentroid, face_only); fluxreg.FineAdd(mfi, cpflux, fine_dx, dt, RunOn::Gpu); } @@ -678,6 +680,7 @@ MLCellLinOp::compFlux (int amrlev, const Array& fluxes AMREX_D_TERM(Elixir elifx = flux[0].elixir();, Elixir elify = flux[1].elixir();, Elixir elifz = flux[2].elixir();); + Gpu::FuseSafeGuard fsg(false); // Turn off fusing in FFlux FFlux(amrlev, mfi, pflux, sol[mfi], loc); for (int idim = 0; 
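
FuseSafeGuard appears here in its other role: constructed with false it forces fusing off for the current scope, restoring the previous state on destruction (the RAII behavior its scoped use in reflux and compFlux relies on). Suppressing fusing matters there because the FFlux results are consumed immediately afterwards by the flux-register update in the same iteration, so those kernels must not be deferred into the fuse queue. A minimal sketch of the scoped use:

    #include <AMReX_Gpu.H>

    void no_fuse_section ()
    {
        amrex::Gpu::FuseSafeGuard fsg(false);
        // Kernels issued in this scope launch immediately, even if an
        // enclosing MFIter loop has turned the fuse region on; the prior
        // state is restored when fsg goes out of scope.
    }
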
idim < AMREX_SPACEDIM; ++idim) { const Box& nbx = mfi.nodaltilebox(idim); diff --git a/Src/LinearSolvers/MLMG/AMReX_MLPoisson.cpp b/Src/LinearSolvers/MLMG/AMReX_MLPoisson.cpp index 5ad48f59131..aba34c49dd5 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLPoisson.cpp +++ b/Src/LinearSolvers/MLMG/AMReX_MLPoisson.cpp @@ -77,30 +77,30 @@ MLPoisson::Fapply (int amrlev, int mglev, MultiFab& out, const MultiFab& in) con const auto& yfab = out.array(mfi); #if (AMREX_SPACEDIM == 3) - AMREX_HOST_DEVICE_PARALLEL_FOR_3D (bx, i, j, k, + AMREX_HOST_DEVICE_PARALLEL_FOR_3D_FUSABLE (bx, i, j, k, { mlpoisson_adotx(i, j, k, yfab, xfab, dhx, dhy, dhz); }); #elif (AMREX_SPACEDIM == 2) if (m_has_metric_term) { - AMREX_HOST_DEVICE_PARALLEL_FOR_3D (bx, i, j, k, + AMREX_HOST_DEVICE_PARALLEL_FOR_3D_FUSABLE (bx, i, j, k, { mlpoisson_adotx_m(i, j, yfab, xfab, dhx, dhy, dx, probxlo); }); } else { - AMREX_HOST_DEVICE_PARALLEL_FOR_3D (bx, i, j, k, + AMREX_HOST_DEVICE_PARALLEL_FOR_3D_FUSABLE (bx, i, j, k, { mlpoisson_adotx(i, j, yfab, xfab, dhx, dhy); }); } #elif (AMREX_SPACEDIM == 1) if (m_has_metric_term) { - AMREX_HOST_DEVICE_PARALLEL_FOR_3D (bx, i, j, k, + AMREX_HOST_DEVICE_PARALLEL_FOR_3D_FUSABLE (bx, i, j, k, { mlpoisson_adotx_m(i, yfab, xfab, dhx, dx, probxlo); }); } else { - AMREX_HOST_DEVICE_PARALLEL_FOR_3D (bx, i, j, k, + AMREX_HOST_DEVICE_PARALLEL_FOR_3D_FUSABLE (bx, i, j, k, { mlpoisson_adotx(i, yfab, xfab, dhx); }); @@ -134,12 +134,12 @@ MLPoisson::normalize (int amrlev, int mglev, MultiFab& mf) const const auto& fab = mf.array(mfi); #if (AMREX_SPACEDIM == 2) - AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( bx, tbx, + AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA ( bx, tbx, { mlpoisson_normalize(tbx, fab, dhx, dhy, dx, probxlo); }); #else - AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( bx, tbx, + AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA ( bx, tbx, { mlpoisson_normalize(tbx, fab, dhx, dx, probxlo); }); From 2635ca043b7b8e7d700d24f7c32645d5706a6bf9 Mon Sep 17 00:00:00 2001 From: Weiqun Zhang Date: Tue, 15 Sep 2020 09:20:52 -0700 Subject: [PATCH 2/3] fix warnings --- Src/Base/AMReX_GpuLaunchFunctsG.H | 395 +++++++++++++++--------------- 1 file changed, 201 insertions(+), 194 deletions(-) diff --git a/Src/Base/AMReX_GpuLaunchFunctsG.H b/Src/Base/AMReX_GpuLaunchFunctsG.H index d4a1841ce73..a9d3ee2bd2d 100644 --- a/Src/Base/AMReX_GpuLaunchFunctsG.H +++ b/Src/Base/AMReX_GpuLaunchFunctsG.H @@ -94,7 +94,7 @@ void launch (T const& n, L&& f) noexcept } template ::value> > -void ParallelFor (Gpu::KernelInfo const& info, T n, L&& f) noexcept +void ParallelFor (Gpu::KernelInfo const& /*info*/, T n, L&& f) noexcept { if (amrex::isEmpty(n)) return; const auto ec = Gpu::ExecutionConfig(n); @@ -122,7 +122,7 @@ void ParallelFor (Gpu::KernelInfo const& info, T n, L&& f) noexcept } template -void ParallelFor (Gpu::KernelInfo const& info, Box const& box, L&& f) noexcept +void ParallelFor (Gpu::KernelInfo const& /*info*/, Box const& box, L&& f) noexcept { if (amrex::isEmpty(box)) return; int ncells = box.numPts(); @@ -159,7 +159,7 @@ void ParallelFor (Gpu::KernelInfo const& info, Box const& box, L&& f) noexcept } template ::value> > -void ParallelFor (Gpu::KernelInfo const& info, Box const& box, T ncomp, L&& f) noexcept +void ParallelFor (Gpu::KernelInfo const& /*info*/, Box const& box, T ncomp, L&& f) noexcept { if (amrex::isEmpty(box)) return; int ncells = box.numPts(); @@ -198,7 +198,7 @@ void ParallelFor (Gpu::KernelInfo const& info, Box const& box, T ncomp, L&& f) n } template -void ParallelFor (Gpu::KernelInfo const& info, Box const& box1, Box const& box2, L1&& 
f1, L2&& f2) noexcept +void ParallelFor (Gpu::KernelInfo const& /*info*/, Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept { // xxxxx DPCPP todo: launch separate kernel to reduce kernel size ParallelFor(box1, std::forward(f1)); @@ -255,7 +255,7 @@ void ParallelFor (Gpu::KernelInfo const& info, Box const& box1, Box const& box2, } template -void ParallelFor (Gpu::KernelInfo const& info, +void ParallelFor (Gpu::KernelInfo const& /*info*/, Box const& box1, Box const& box2, Box const& box3, L1&& f1, L2&& f2, L3&& f3) noexcept { @@ -329,7 +329,7 @@ void ParallelFor (Gpu::KernelInfo const& info, template ::value>, typename M2=amrex::EnableIf_t::value> > -void ParallelFor (Gpu::KernelInfo const& info, +void ParallelFor (Gpu::KernelInfo const& /*info*/, Box const& box1, T1 ncomp1, L1&& f1, Box const& box2, T2 ncomp2, L2&& f2) noexcept { @@ -395,7 +395,7 @@ template ::value>, typename M2=amrex::EnableIf_t::value>, typename M3=amrex::EnableIf_t::value> > -void ParallelFor (Gpu::KernelInfo const& info, +void ParallelFor (Gpu::KernelInfo const& /*info*/, Box const& box1, T1 ncomp1, L1&& f1, Box const& box2, T2 ncomp2, L2&& f2, Box const& box3, T3 ncomp3, L3&& f3) noexcept @@ -641,15 +641,16 @@ ParallelFor (Gpu::KernelInfo const& info, T n, L&& f) noexcept } else #endif { - const auto ec = Gpu::ExecutionConfig(n); - AMREX_LAUNCH_KERNEL(ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), - [=] AMREX_GPU_DEVICE () noexcept { - for (T i = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x; - i < n; i += stride) { - f(i); - } - }); - AMREX_GPU_ERROR_CHECK(); + amrex::ignore_unused(info); + const auto ec = Gpu::ExecutionConfig(n); + AMREX_LAUNCH_KERNEL(ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), + [=] AMREX_GPU_DEVICE () noexcept { + for (T i = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x; + i < n; i += stride) { + f(i); + } + }); + AMREX_GPU_ERROR_CHECK(); } } @@ -665,23 +666,24 @@ ParallelFor (Gpu::KernelInfo const& info, Box const& box, L&& f) noexcept } else #endif { - const auto lo = amrex::lbound(box); - const auto len = amrex::length(box); - const auto ec = Gpu::ExecutionConfig(ncells); - AMREX_LAUNCH_KERNEL(ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), - [=] AMREX_GPU_DEVICE () noexcept { - for (int icell = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x; - icell < ncells; icell += stride) { - int k = icell / (len.x*len.y); - int j = (icell - k*(len.x*len.y)) / len.x; - int i = (icell - k*(len.x*len.y)) - j*len.x; - i += lo.x; - j += lo.y; - k += lo.z; - f(i,j,k); - } - }); - AMREX_GPU_ERROR_CHECK(); + amrex::ignore_unused(info); + const auto lo = amrex::lbound(box); + const auto len = amrex::length(box); + const auto ec = Gpu::ExecutionConfig(ncells); + AMREX_LAUNCH_KERNEL(ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), + [=] AMREX_GPU_DEVICE () noexcept { + for (int icell = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x; + icell < ncells; icell += stride) { + int k = icell / (len.x*len.y); + int j = (icell - k*(len.x*len.y)) / len.x; + int i = (icell - k*(len.x*len.y)) - j*len.x; + i += lo.x; + j += lo.y; + k += lo.z; + f(i,j,k); + } + }); + AMREX_GPU_ERROR_CHECK(); } } @@ -697,25 +699,26 @@ ParallelFor (Gpu::KernelInfo const& info, Box const& box, T ncomp, L&& f) noexce } else #endif { - const auto lo = amrex::lbound(box); - const auto len = amrex::length(box); - const auto ec = Gpu::ExecutionConfig(ncells); - AMREX_LAUNCH_KERNEL(ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), - [=] AMREX_GPU_DEVICE () 
noexcept { - for (int icell = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x; - icell < ncells; icell += stride) { - int k = icell / (len.x*len.y); - int j = (icell - k*(len.x*len.y)) / len.x; - int i = (icell - k*(len.x*len.y)) - j*len.x; - i += lo.x; - j += lo.y; - k += lo.z; - for (T n = 0; n < ncomp; ++n) { - f(i,j,k,n); + amrex::ignore_unused(info); + const auto lo = amrex::lbound(box); + const auto len = amrex::length(box); + const auto ec = Gpu::ExecutionConfig(ncells); + AMREX_LAUNCH_KERNEL(ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), + [=] AMREX_GPU_DEVICE () noexcept { + for (int icell = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x; + icell < ncells; icell += stride) { + int k = icell / (len.x*len.y); + int j = (icell - k*(len.x*len.y)) / len.x; + int i = (icell - k*(len.x*len.y)) - j*len.x; + i += lo.x; + j += lo.y; + k += lo.z; + for (T n = 0; n < ncomp; ++n) { + f(i,j,k,n); + } } - } - }); - AMREX_GPU_ERROR_CHECK(); + }); + AMREX_GPU_ERROR_CHECK(); } } @@ -735,36 +738,37 @@ ParallelFor (Gpu::KernelInfo const& info, } else #endif { - const auto lo1 = amrex::lbound(box1); - const auto lo2 = amrex::lbound(box2); - const auto len1 = amrex::length(box1); - const auto len2 = amrex::length(box2); - const auto ec = Gpu::ExecutionConfig(ncells); - AMREX_LAUNCH_KERNEL(ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), - [=] AMREX_GPU_DEVICE () noexcept { - for (int icell = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x; - icell < ncells; icell += stride) { - if (icell < ncells1) { - int k = icell / (len1.x*len1.y); - int j = (icell - k*(len1.x*len1.y)) / len1.x; - int i = (icell - k*(len1.x*len1.y)) - j*len1.x; - i += lo1.x; - j += lo1.y; - k += lo1.z; - f1(i,j,k); - } - if (icell < ncells2) { - int k = icell / (len2.x*len2.y); - int j = (icell - k*(len2.x*len2.y)) / len2.x; - int i = (icell - k*(len2.x*len2.y)) - j*len2.x; - i += lo2.x; - j += lo2.y; - k += lo2.z; - f2(i,j,k); + amrex::ignore_unused(info); + const auto lo1 = amrex::lbound(box1); + const auto lo2 = amrex::lbound(box2); + const auto len1 = amrex::length(box1); + const auto len2 = amrex::length(box2); + const auto ec = Gpu::ExecutionConfig(ncells); + AMREX_LAUNCH_KERNEL(ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), + [=] AMREX_GPU_DEVICE () noexcept { + for (int icell = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x; + icell < ncells; icell += stride) { + if (icell < ncells1) { + int k = icell / (len1.x*len1.y); + int j = (icell - k*(len1.x*len1.y)) / len1.x; + int i = (icell - k*(len1.x*len1.y)) - j*len1.x; + i += lo1.x; + j += lo1.y; + k += lo1.z; + f1(i,j,k); + } + if (icell < ncells2) { + int k = icell / (len2.x*len2.y); + int j = (icell - k*(len2.x*len2.y)) / len2.x; + int i = (icell - k*(len2.x*len2.y)) - j*len2.x; + i += lo2.x; + j += lo2.y; + k += lo2.z; + f2(i,j,k); + } } - } - }); - AMREX_GPU_ERROR_CHECK(); + }); + AMREX_GPU_ERROR_CHECK(); } } @@ -787,47 +791,48 @@ ParallelFor (Gpu::KernelInfo const& info, } else #endif { - const auto lo1 = amrex::lbound(box1); - const auto lo2 = amrex::lbound(box2); - const auto lo3 = amrex::lbound(box3); - const auto len1 = amrex::length(box1); - const auto len2 = amrex::length(box2); - const auto len3 = amrex::length(box3); - const auto ec = Gpu::ExecutionConfig(ncells); - AMREX_LAUNCH_KERNEL(ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), - [=] AMREX_GPU_DEVICE () noexcept { - for (int icell = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x; - icell < ncells; icell += stride) 
{ - if (icell < ncells1) { - int k = icell / (len1.x*len1.y); - int j = (icell - k*(len1.x*len1.y)) / len1.x; - int i = (icell - k*(len1.x*len1.y)) - j*len1.x; - i += lo1.x; - j += lo1.y; - k += lo1.z; - f1(i,j,k); - } - if (icell < ncells2) { - int k = icell / (len2.x*len2.y); - int j = (icell - k*(len2.x*len2.y)) / len2.x; - int i = (icell - k*(len2.x*len2.y)) - j*len2.x; - i += lo2.x; - j += lo2.y; - k += lo2.z; - f2(i,j,k); - } - if (icell < ncells3) { - int k = icell / (len3.x*len3.y); - int j = (icell - k*(len3.x*len3.y)) / len3.x; - int i = (icell - k*(len3.x*len3.y)) - j*len3.x; - i += lo3.x; - j += lo3.y; - k += lo3.z; - f3(i,j,k); + amrex::ignore_unused(info); + const auto lo1 = amrex::lbound(box1); + const auto lo2 = amrex::lbound(box2); + const auto lo3 = amrex::lbound(box3); + const auto len1 = amrex::length(box1); + const auto len2 = amrex::length(box2); + const auto len3 = amrex::length(box3); + const auto ec = Gpu::ExecutionConfig(ncells); + AMREX_LAUNCH_KERNEL(ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), + [=] AMREX_GPU_DEVICE () noexcept { + for (int icell = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x; + icell < ncells; icell += stride) { + if (icell < ncells1) { + int k = icell / (len1.x*len1.y); + int j = (icell - k*(len1.x*len1.y)) / len1.x; + int i = (icell - k*(len1.x*len1.y)) - j*len1.x; + i += lo1.x; + j += lo1.y; + k += lo1.z; + f1(i,j,k); + } + if (icell < ncells2) { + int k = icell / (len2.x*len2.y); + int j = (icell - k*(len2.x*len2.y)) / len2.x; + int i = (icell - k*(len2.x*len2.y)) - j*len2.x; + i += lo2.x; + j += lo2.y; + k += lo2.z; + f2(i,j,k); + } + if (icell < ncells3) { + int k = icell / (len3.x*len3.y); + int j = (icell - k*(len3.x*len3.y)) / len3.x; + int i = (icell - k*(len3.x*len3.y)) - j*len3.x; + i += lo3.x; + j += lo3.y; + k += lo3.z; + f3(i,j,k); + } } - } - }); - AMREX_GPU_ERROR_CHECK(); + }); + AMREX_GPU_ERROR_CHECK(); } } @@ -850,40 +855,41 @@ ParallelFor (Gpu::KernelInfo const& info, } else #endif { - const auto lo1 = amrex::lbound(box1); - const auto lo2 = amrex::lbound(box2); - const auto len1 = amrex::length(box1); - const auto len2 = amrex::length(box2); - const auto ec = Gpu::ExecutionConfig(ncells); - AMREX_LAUNCH_KERNEL(ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), - [=] AMREX_GPU_DEVICE () noexcept { - for (int icell = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x; - icell < ncells; icell += stride) { - if (icell < ncells1) { - int k = icell / (len1.x*len1.y); - int j = (icell - k*(len1.x*len1.y)) / len1.x; - int i = (icell - k*(len1.x*len1.y)) - j*len1.x; - i += lo1.x; - j += lo1.y; - k += lo1.z; - for (T1 n = 0; n < ncomp1; ++n) { - f1(i,j,k,n); + amrex::ignore_unused(info); + const auto lo1 = amrex::lbound(box1); + const auto lo2 = amrex::lbound(box2); + const auto len1 = amrex::length(box1); + const auto len2 = amrex::length(box2); + const auto ec = Gpu::ExecutionConfig(ncells); + AMREX_LAUNCH_KERNEL(ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), + [=] AMREX_GPU_DEVICE () noexcept { + for (int icell = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x; + icell < ncells; icell += stride) { + if (icell < ncells1) { + int k = icell / (len1.x*len1.y); + int j = (icell - k*(len1.x*len1.y)) / len1.x; + int i = (icell - k*(len1.x*len1.y)) - j*len1.x; + i += lo1.x; + j += lo1.y; + k += lo1.z; + for (T1 n = 0; n < ncomp1; ++n) { + f1(i,j,k,n); + } } - } - if (icell < ncells2) { - int k = icell / (len2.x*len2.y); - int j = (icell - k*(len2.x*len2.y)) / len2.x; - int i = 
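
Patch 2 quiets unused-parameter warnings in two ways: overloads whose bodies never consult the KernelInfo comment the parameter name out (/*info*/), while bodies that read it only in the CUDA-specific branch call amrex::ignore_unused(info) in the generic branch (the re-indentation of those branches accounts for most of this diff). A hedged sketch of the second idiom; g and fusible_hint are illustrative names:

    #include <AMReX.H>

    void g (bool fusible_hint)
    {
    #ifdef AMREX_USE_CUDA
        if (fusible_hint) {
            // CUDA-only fuse path would go here
        } else
    #endif
        {
            // The generic path does not read the parameter, so mark it
            // used to keep -Wunused-parameter quiet in non-CUDA builds.
            amrex::ignore_unused(fusible_hint);
        }
    }
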
(icell - k*(len2.x*len2.y)) - j*len2.x; - i += lo2.x; - j += lo2.y; - k += lo2.z; - for (T2 n = 0; n < ncomp2; ++n) { - f2(i,j,k,n); + if (icell < ncells2) { + int k = icell / (len2.x*len2.y); + int j = (icell - k*(len2.x*len2.y)) / len2.x; + int i = (icell - k*(len2.x*len2.y)) - j*len2.x; + i += lo2.x; + j += lo2.y; + k += lo2.z; + for (T2 n = 0; n < ncomp2; ++n) { + f2(i,j,k,n); + } } } - } - }); - AMREX_GPU_ERROR_CHECK(); + }); + AMREX_GPU_ERROR_CHECK(); } } @@ -910,53 +916,54 @@ ParallelFor (Gpu::KernelInfo const& info, } else #endif { - const auto lo1 = amrex::lbound(box1); - const auto lo2 = amrex::lbound(box2); - const auto lo3 = amrex::lbound(box3); - const auto len1 = amrex::length(box1); - const auto len2 = amrex::length(box2); - const auto len3 = amrex::length(box3); - const auto ec = Gpu::ExecutionConfig(ncells); - AMREX_LAUNCH_KERNEL(ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), - [=] AMREX_GPU_DEVICE () noexcept { - for (int icell = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x; - icell < ncells; icell += stride) { - if (icell < ncells1) { - int k = icell / (len1.x*len1.y); - int j = (icell - k*(len1.x*len1.y)) / len1.x; - int i = (icell - k*(len1.x*len1.y)) - j*len1.x; - i += lo1.x; - j += lo1.y; - k += lo1.z; - for (T1 n = 0; n < ncomp1; ++n) { - f1(i,j,k,n); + amrex::ignore_unused(info); + const auto lo1 = amrex::lbound(box1); + const auto lo2 = amrex::lbound(box2); + const auto lo3 = amrex::lbound(box3); + const auto len1 = amrex::length(box1); + const auto len2 = amrex::length(box2); + const auto len3 = amrex::length(box3); + const auto ec = Gpu::ExecutionConfig(ncells); + AMREX_LAUNCH_KERNEL(ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), + [=] AMREX_GPU_DEVICE () noexcept { + for (int icell = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x; + icell < ncells; icell += stride) { + if (icell < ncells1) { + int k = icell / (len1.x*len1.y); + int j = (icell - k*(len1.x*len1.y)) / len1.x; + int i = (icell - k*(len1.x*len1.y)) - j*len1.x; + i += lo1.x; + j += lo1.y; + k += lo1.z; + for (T1 n = 0; n < ncomp1; ++n) { + f1(i,j,k,n); + } } - } - if (icell < ncells2) { - int k = icell / (len2.x*len2.y); - int j = (icell - k*(len2.x*len2.y)) / len2.x; - int i = (icell - k*(len2.x*len2.y)) - j*len2.x; - i += lo2.x; - j += lo2.y; - k += lo2.z; - for (T2 n = 0; n < ncomp2; ++n) { - f2(i,j,k,n); + if (icell < ncells2) { + int k = icell / (len2.x*len2.y); + int j = (icell - k*(len2.x*len2.y)) / len2.x; + int i = (icell - k*(len2.x*len2.y)) - j*len2.x; + i += lo2.x; + j += lo2.y; + k += lo2.z; + for (T2 n = 0; n < ncomp2; ++n) { + f2(i,j,k,n); + } } - } - if (icell < ncells3) { - int k = icell / (len3.x*len3.y); - int j = (icell - k*(len3.x*len3.y)) / len3.x; - int i = (icell - k*(len3.x*len3.y)) - j*len3.x; - i += lo3.x; - j += lo3.y; - k += lo3.z; - for (T3 n = 0; n < ncomp3; ++n) { - f3(i,j,k,n); + if (icell < ncells3) { + int k = icell / (len3.x*len3.y); + int j = (icell - k*(len3.x*len3.y)) / len3.x; + int i = (icell - k*(len3.x*len3.y)) - j*len3.x; + i += lo3.x; + j += lo3.y; + k += lo3.z; + for (T3 n = 0; n < ncomp3; ++n) { + f3(i,j,k,n); + } } } - } - }); - AMREX_GPU_ERROR_CHECK(); + }); + AMREX_GPU_ERROR_CHECK(); } } From 3657b11d00aed5a5d519b4c0a8f7bd5c746d7f0f Mon Sep 17 00:00:00 2001 From: Weiqun Zhang Date: Tue, 15 Sep 2020 10:01:39 -0700 Subject: [PATCH 3/3] fusable -> fusible --- Src/Base/AMReX_FBI.H | 4 +-- Src/Base/AMReX_FabArray.H | 20 +++++------ Src/Base/AMReX_FabArrayUtility.H | 14 ++++---- Src/Base/AMReX_GpuKernelInfo.H | 6 
++-- Src/Base/AMReX_GpuLaunch.H | 34 +++++++++---------- Src/Base/AMReX_GpuLaunchFunctsG.H | 14 ++++---- Src/Base/AMReX_GpuLaunchMacrosG.H | 18 +++++----- Src/Base/AMReX_MultiFab.cpp | 10 +++--- Src/Base/AMReX_MultiFabUtil.cpp | 34 +++++++++---------- .../MLMG/AMReX_MLABecLaplacian.cpp | 22 ++++++------ Src/LinearSolvers/MLMG/AMReX_MLCellLinOp.cpp | 8 ++--- Src/LinearSolvers/MLMG/AMReX_MLPoisson.cpp | 14 ++++---- 12 files changed, 99 insertions(+), 99 deletions(-) diff --git a/Src/Base/AMReX_FBI.H b/Src/Base/AMReX_FBI.H index 6748a591613..970f63421e5 100644 --- a/Src/Base/AMReX_FBI.H +++ b/Src/Base/AMReX_FBI.H @@ -688,7 +688,7 @@ FabArray::FB_local_copy_gpu (const FB& TheFB, int scomp, int ncomp) BaseFab& mskfab = maskfabs[sit()]; const Array4& msk = mskfab.array(); const Box& bx = mskfab.box(); - amrex::ParallelFor(Gpu::KernelInfo{}.setFusable(true), bx, + amrex::ParallelFor(Gpu::KernelInfo{}.setFusible(true), bx, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept { msk(i,j,k) = 0; @@ -1237,7 +1237,7 @@ FabArray::unpack_recv_buffer_gpu (FabArray& dst, int dcomp, int ncomp, BaseFab& mskfab = maskfabs[sit()]; const Array4& msk = mskfab.array(); const Box& bx = mskfab.box(); - amrex::ParallelFor(Gpu::KernelInfo().setFusable(true), bx, + amrex::ParallelFor(Gpu::KernelInfo().setFusible(true), bx, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept { msk(i,j,k) = 0; diff --git a/Src/Base/AMReX_FabArray.H b/Src/Base/AMReX_FabArray.H index 8ae7b66bd97..db0b2073d2f 100644 --- a/Src/Base/AMReX_FabArray.H +++ b/Src/Base/AMReX_FabArray.H @@ -1582,7 +1582,7 @@ FabArray::setVal (value_type val, { const Box& bx = fai.growntilebox(nghost); auto fab = this->array(fai); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSABLE ( bx, ncomp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE ( bx, ncomp, i, j, k, n, { fab(i,j,k,n+comp) = val; }); @@ -1625,7 +1625,7 @@ FabArray::setVal (value_type val, if (b.ok()) { auto fab = this->array(fai); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSABLE ( b, ncomp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE ( b, ncomp, i, j, k, n, { fab(i,j,k,n+comp) = val; }); @@ -1655,7 +1655,7 @@ FabArray::abs (int comp, int ncomp, const IntVect& nghost) { const Box& bx = mfi.growntilebox(nghost); auto fab = this->array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSABLE ( bx, ncomp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE ( bx, ncomp, i, j, k, n, { fab(i,j,k,n+comp) = amrex::Math::abs(fab(i,j,k,n+comp)); }); @@ -1674,7 +1674,7 @@ FabArray::plus (value_type val, int comp, int num_comp, int nghost) { const Box& bx = mfi.growntilebox(nghost); auto fab = this->array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSABLE ( bx, num_comp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE ( bx, num_comp, i, j, k, n, { fab(i,j,k,n+comp) += val; }); @@ -1694,7 +1694,7 @@ FabArray::plus (value_type val, const Box& region, int comp, int num_comp, const Box& bx = mfi.growntilebox(nghost) & region; if (bx.ok()) { auto fab = this->array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSABLE ( bx, num_comp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE ( bx, num_comp, i, j, k, n, { fab(i,j,k,n+comp) += val; }); @@ -1714,7 +1714,7 @@ FabArray::mult (value_type val, int comp, int num_comp, int nghost) { const Box& bx = mfi.growntilebox(nghost); auto fab = this->array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSABLE ( bx, num_comp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE ( bx, num_comp, i, j, k, n, { fab(i,j,k,n+comp) *= val; }); @@ -1734,7 +1734,7 @@ 
FabArray::mult (value_type val, const Box& region, int comp, int num_comp, const Box& bx = mfi.growntilebox(nghost) & region; if (bx.ok()) { auto fab = this->array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSABLE ( bx, num_comp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE ( bx, num_comp, i, j, k, n, { fab(i,j,k,n+comp) *= val; }); @@ -1754,7 +1754,7 @@ FabArray::invert (value_type numerator, int comp, int num_comp, int nghost) { const Box& bx = mfi.growntilebox(nghost); auto fab = this->array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSABLE ( bx, num_comp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE ( bx, num_comp, i, j, k, n, { fab(i,j,k,n+comp) = numerator / fab(i,j,k,n+comp); }); @@ -1774,7 +1774,7 @@ FabArray::invert (value_type numerator, const Box& region, int comp, int nu const Box& bx = mfi.growntilebox(nghost) & region; if (bx.ok()) { auto fab = this->array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSABLE ( bx, num_comp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE ( bx, num_comp, i, j, k, n, { fab(i,j,k,n+comp) = numerator / fab(i,j,k,n+comp); }); @@ -1970,7 +1970,7 @@ FabArray::BuildMask (const Box& phys_domain, const Periodicity& period, Box const& fbx = mfi.growntilebox(); Box const& gbx = fbx & domain; Box const& vbx = mfi.validbox(); - AMREX_HOST_DEVICE_FOR_4D_FUSABLE(fbx, ncomp, i, j, k, n, + AMREX_HOST_DEVICE_FOR_4D_FUSIBLE(fbx, ncomp, i, j, k, n, { IntVect iv(AMREX_D_DECL(i,j,k)); if (vbx.contains(iv)) { diff --git a/Src/Base/AMReX_FabArrayUtility.H b/Src/Base/AMReX_FabArrayUtility.H index c7075d48520..869fd172d89 100644 --- a/Src/Base/AMReX_FabArrayUtility.H +++ b/Src/Base/AMReX_FabArrayUtility.H @@ -1469,7 +1469,7 @@ Add (FabArray& dst, FabArray const& src, int srccomp, int dstcomp, int { auto const srcFab = src.array(mfi); auto dstFab = dst.array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSABLE ( bx, numcomp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE ( bx, numcomp, i, j, k, n, { dstFab(i,j,k,n+dstcomp) += srcFab(i,j,k,n+srccomp); }); @@ -1501,7 +1501,7 @@ Copy (FabArray& dst, FabArray const& src, int srccomp, int dstcomp, in { auto const srcFab = src.array(mfi); auto dstFab = dst.array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSABLE ( bx, numcomp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE ( bx, numcomp, i, j, k, n, { dstFab(i,j,k,dstcomp+n) = srcFab(i,j,k,srccomp+n); }); @@ -1533,7 +1533,7 @@ Subtract (FabArray& dst, FabArray const& src, int srccomp, int dstcomp { auto const srcFab = src.array(mfi); auto dstFab = dst.array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSABLE ( bx, numcomp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE ( bx, numcomp, i, j, k, n, { dstFab(i,j,k,n+dstcomp) -= srcFab(i,j,k,n+srccomp); }); @@ -1565,7 +1565,7 @@ Multiply (FabArray& dst, FabArray const& src, int srccomp, int dstcomp { auto const srcFab = src.array(mfi); auto dstFab = dst.array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSABLE ( bx, numcomp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE ( bx, numcomp, i, j, k, n, { dstFab(i,j,k,n+dstcomp) *= srcFab(i,j,k,n+srccomp); }); @@ -1597,7 +1597,7 @@ Divide (FabArray& dst, FabArray const& src, int srccomp, int dstcomp, { auto const srcFab = src.array(mfi); auto dstFab = dst.array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSABLE ( bx, numcomp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE ( bx, numcomp, i, j, k, n, { dstFab(i,j,k,n+dstcomp) /= srcFab(i,j,k,n+srccomp); }); @@ -1627,7 +1627,7 @@ Abs (FabArray& fa, int icomp, int numcomp, const 
IntVect& nghost) if (bx.ok()) { auto const& fab = fa.array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSABLE ( bx, numcomp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE ( bx, numcomp, i, j, k, n, { fab(i,j,k,n+icomp) = amrex::Math::abs(fab(i,j,k,n+icomp)); }); @@ -1684,7 +1684,7 @@ OverrideSync (FabArray & fa, FabArray const& msk, const Periodicity& const Box& bx = mfi.tilebox(); auto fab = fa.array(mfi); auto const ifab = msk.array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSABLE ( bx, ncomp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE ( bx, ncomp, i, j, k, n, { if (!ifab(i,j,k)) fab(i,j,k,n) = 0; }); diff --git a/Src/Base/AMReX_GpuKernelInfo.H b/Src/Base/AMReX_GpuKernelInfo.H index 2eac4054e63..94e125e5a0c 100644 --- a/Src/Base/AMReX_GpuKernelInfo.H +++ b/Src/Base/AMReX_GpuKernelInfo.H @@ -7,10 +7,10 @@ namespace Gpu { class KernelInfo { public: - KernelInfo& setFusable (bool flag) { fusable = flag; return *this; } - bool isFusable () const { return fusable; } + KernelInfo& setFusible (bool flag) { fusible = flag; return *this; } + bool isFusible () const { return fusible; } private: - bool fusable = false; + bool fusible = false; }; }} diff --git a/Src/Base/AMReX_GpuLaunch.H b/Src/Base/AMReX_GpuLaunch.H index 5305139f636..d6c35972427 100644 --- a/Src/Base/AMReX_GpuLaunch.H +++ b/Src/Base/AMReX_GpuLaunch.H @@ -190,18 +190,18 @@ namespace Gpu { AMREX_WRONG_NUM_ARGS)(__VA_ARGS__) #ifdef AMREX_USE_CUDA -#define AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA(...) AMREX_GET_MACRO(__VA_ARGS__,\ - AMREX_GPU_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA_RANGE_3, \ +#define AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA(...) AMREX_GET_MACRO(__VA_ARGS__,\ + AMREX_GPU_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA_RANGE_3, \ AMREX_WRONG_NUM_ARGS, \ AMREX_WRONG_NUM_ARGS, \ - AMREX_GPU_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA_RANGE_2, \ + AMREX_GPU_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA_RANGE_2, \ AMREX_WRONG_NUM_ARGS, \ AMREX_WRONG_NUM_ARGS, \ - AMREX_GPU_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA_RANGE, \ + AMREX_GPU_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA_RANGE, \ AMREX_WRONG_NUM_ARGS, \ AMREX_WRONG_NUM_ARGS)(__VA_ARGS__) #else -#define AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA(...) AMREX_LAUNCH_HOST_DEVICE_LAMBDA(__VA_ARGS__) +#define AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA(...) AMREX_LAUNCH_HOST_DEVICE_LAMBDA(__VA_ARGS__) #endif #if (AMREX_SPACEDIM == 1) @@ -232,19 +232,19 @@ namespace Gpu { #define AMREX_HOST_DEVICE_PARALLEL_FOR_4D(...) AMREX_GPU_HOST_DEVICE_PARALLEL_FOR_4D(__VA_ARGS__) #ifdef AMREX_USE_CUDA -#define AMREX_HOST_DEVICE_PARALLEL_FOR_1D_FUSABLE(...) AMREX_GPU_HOST_DEVICE_FOR_1D_FUSABLE(__VA_ARGS__) -#define AMREX_HOST_DEVICE_PARALLEL_FOR_3D_FUSABLE(...) AMREX_GPU_HOST_DEVICE_FOR_3D_FUSABLE(__VA_ARGS__) -#define AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSABLE(...) AMREX_GPU_HOST_DEVICE_FOR_4D_FUSABLE(__VA_ARGS__) -#define AMREX_HOST_DEVICE_FOR_1D_FUSABLE(...) AMREX_GPU_HOST_DEVICE_FOR_1D_FUSABLE(__VA_ARGS__) -#define AMREX_HOST_DEVICE_FOR_3D_FUSABLE(...) AMREX_GPU_HOST_DEVICE_FOR_3D_FUSABLE(__VA_ARGS__) -#define AMREX_HOST_DEVICE_FOR_4D_FUSABLE(...) AMREX_GPU_HOST_DEVICE_FOR_4D_FUSABLE(__VA_ARGS__) +#define AMREX_HOST_DEVICE_PARALLEL_FOR_1D_FUSIBLE(...) AMREX_GPU_HOST_DEVICE_FOR_1D_FUSIBLE(__VA_ARGS__) +#define AMREX_HOST_DEVICE_PARALLEL_FOR_3D_FUSIBLE(...) AMREX_GPU_HOST_DEVICE_FOR_3D_FUSIBLE(__VA_ARGS__) +#define AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE(...) AMREX_GPU_HOST_DEVICE_FOR_4D_FUSIBLE(__VA_ARGS__) +#define AMREX_HOST_DEVICE_FOR_1D_FUSIBLE(...) 
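
After this rename the user-facing spelling is setFusible/isFusible, and the macro family becomes *_FUSIBLE with the same non-CUDA fallbacks. A hedged sketch of the run-time gate the launch functions apply, mirroring the checks in AMReX_GpuLaunchFunctsG.H; should_fuse is an illustrative name:

    #include <AMReX.H>
    #include <AMReX_Gpu.H>

    bool should_fuse (amrex::Gpu::KernelInfo const& info, int ncells)
    {
    #ifdef AMREX_USE_CUDA
        return amrex::Gpu::inFuseRegion() && info.isFusible()
            && ncells <= amrex::Gpu::getFuseSizeThreshold();
    #else
        amrex::ignore_unused(info, ncells);
        return false;
    #endif
    }
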
diff --git a/Src/Base/AMReX_GpuLaunch.H b/Src/Base/AMReX_GpuLaunch.H
index 5305139f636..d6c35972427 100644
--- a/Src/Base/AMReX_GpuLaunch.H
+++ b/Src/Base/AMReX_GpuLaunch.H
@@ -190,18 +190,18 @@ namespace Gpu {
                          AMREX_WRONG_NUM_ARGS)(__VA_ARGS__)
 #ifdef AMREX_USE_CUDA
-#define AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA(...) AMREX_GET_MACRO(__VA_ARGS__,\
-        AMREX_GPU_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA_RANGE_3, \
+#define AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA(...) AMREX_GET_MACRO(__VA_ARGS__,\
+        AMREX_GPU_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA_RANGE_3, \
         AMREX_WRONG_NUM_ARGS, \
         AMREX_WRONG_NUM_ARGS, \
-        AMREX_GPU_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA_RANGE_2, \
+        AMREX_GPU_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA_RANGE_2, \
         AMREX_WRONG_NUM_ARGS, \
         AMREX_WRONG_NUM_ARGS, \
-        AMREX_GPU_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA_RANGE, \
+        AMREX_GPU_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA_RANGE, \
         AMREX_WRONG_NUM_ARGS, \
         AMREX_WRONG_NUM_ARGS)(__VA_ARGS__)
 #else
-#define AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA(...) AMREX_LAUNCH_HOST_DEVICE_LAMBDA(__VA_ARGS__)
+#define AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA(...) AMREX_LAUNCH_HOST_DEVICE_LAMBDA(__VA_ARGS__)
 #endif
 #if (AMREX_SPACEDIM == 1)
@@ -232,19 +232,19 @@ namespace Gpu {
 #define AMREX_HOST_DEVICE_PARALLEL_FOR_4D(...) AMREX_GPU_HOST_DEVICE_PARALLEL_FOR_4D(__VA_ARGS__)
 #ifdef AMREX_USE_CUDA
-#define AMREX_HOST_DEVICE_PARALLEL_FOR_1D_FUSABLE(...) AMREX_GPU_HOST_DEVICE_FOR_1D_FUSABLE(__VA_ARGS__)
-#define AMREX_HOST_DEVICE_PARALLEL_FOR_3D_FUSABLE(...) AMREX_GPU_HOST_DEVICE_FOR_3D_FUSABLE(__VA_ARGS__)
-#define AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSABLE(...) AMREX_GPU_HOST_DEVICE_FOR_4D_FUSABLE(__VA_ARGS__)
-#define AMREX_HOST_DEVICE_FOR_1D_FUSABLE(...) AMREX_GPU_HOST_DEVICE_FOR_1D_FUSABLE(__VA_ARGS__)
-#define AMREX_HOST_DEVICE_FOR_3D_FUSABLE(...) AMREX_GPU_HOST_DEVICE_FOR_3D_FUSABLE(__VA_ARGS__)
-#define AMREX_HOST_DEVICE_FOR_4D_FUSABLE(...) AMREX_GPU_HOST_DEVICE_FOR_4D_FUSABLE(__VA_ARGS__)
+#define AMREX_HOST_DEVICE_PARALLEL_FOR_1D_FUSIBLE(...) AMREX_GPU_HOST_DEVICE_FOR_1D_FUSIBLE(__VA_ARGS__)
+#define AMREX_HOST_DEVICE_PARALLEL_FOR_3D_FUSIBLE(...) AMREX_GPU_HOST_DEVICE_FOR_3D_FUSIBLE(__VA_ARGS__)
+#define AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE(...) AMREX_GPU_HOST_DEVICE_FOR_4D_FUSIBLE(__VA_ARGS__)
+#define AMREX_HOST_DEVICE_FOR_1D_FUSIBLE(...) AMREX_GPU_HOST_DEVICE_FOR_1D_FUSIBLE(__VA_ARGS__)
+#define AMREX_HOST_DEVICE_FOR_3D_FUSIBLE(...) AMREX_GPU_HOST_DEVICE_FOR_3D_FUSIBLE(__VA_ARGS__)
+#define AMREX_HOST_DEVICE_FOR_4D_FUSIBLE(...) AMREX_GPU_HOST_DEVICE_FOR_4D_FUSIBLE(__VA_ARGS__)
 #else
-#define AMREX_HOST_DEVICE_PARALLEL_FOR_1D_FUSABLE(...) AMREX_HOST_DEVICE_PARALLEL_FOR_1D(__VA_ARGS__)
-#define AMREX_HOST_DEVICE_PARALLEL_FOR_3D_FUSABLE(...) AMREX_HOST_DEVICE_PARALLEL_FOR_3D(__VA_ARGS__)
-#define AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSABLE(...) AMREX_HOST_DEVICE_PARALLEL_FOR_4D(__VA_ARGS__)
-#define AMREX_HOST_DEVICE_FOR_1D_FUSABLE(...) AMREX_GPU_HOST_DEVICE_FOR_1D(__VA_ARGS__)
-#define AMREX_HOST_DEVICE_FOR_3D_FUSABLE(...) AMREX_GPU_HOST_DEVICE_FOR_3D(__VA_ARGS__)
-#define AMREX_HOST_DEVICE_FOR_4D_FUSABLE(...) AMREX_GPU_HOST_DEVICE_FOR_4D(__VA_ARGS__)
+#define AMREX_HOST_DEVICE_PARALLEL_FOR_1D_FUSIBLE(...) AMREX_HOST_DEVICE_PARALLEL_FOR_1D(__VA_ARGS__)
+#define AMREX_HOST_DEVICE_PARALLEL_FOR_3D_FUSIBLE(...) AMREX_HOST_DEVICE_PARALLEL_FOR_3D(__VA_ARGS__)
+#define AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE(...) AMREX_HOST_DEVICE_PARALLEL_FOR_4D(__VA_ARGS__)
+#define AMREX_HOST_DEVICE_FOR_1D_FUSIBLE(...) AMREX_GPU_HOST_DEVICE_FOR_1D(__VA_ARGS__)
+#define AMREX_HOST_DEVICE_FOR_3D_FUSIBLE(...) AMREX_GPU_HOST_DEVICE_FOR_3D(__VA_ARGS__)
+#define AMREX_HOST_DEVICE_FOR_4D_FUSIBLE(...) AMREX_GPU_HOST_DEVICE_FOR_4D(__VA_ARGS__)
 #endif
 #ifdef AMREX_USE_GPU
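// Editor's sketch, not part of the patch: a typical call site for the renamed
// macros. On non-CUDA builds the _FUSIBLE names simply forward to the plain
// macros, as the #else branch above shows. `fa` (a FabArray<FArrayBox>) and
// `ncomp` are assumed to exist.
for (amrex::MFIter mfi(fa, amrex::TilingIfNotGPU()); mfi.isValid(); ++mfi) {
    const amrex::Box& bx = mfi.tilebox();
    auto const& arr = fa.array(mfi);
    // Same syntax as the non-fusible macro; only the name differs.
    AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE ( bx, ncomp, i, j, k, n,
    {
        arr(i,j,k,n) += 1.0;
    });
}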
diff --git a/Src/Base/AMReX_GpuLaunchFunctsG.H b/Src/Base/AMReX_GpuLaunchFunctsG.H
index a9d3ee2bd2d..295d3320380 100644
--- a/Src/Base/AMReX_GpuLaunchFunctsG.H
+++ b/Src/Base/AMReX_GpuLaunchFunctsG.H
@@ -636,7 +636,7 @@ ParallelFor (Gpu::KernelInfo const& info, T n, L&& f) noexcept
 {
     if (amrex::isEmpty(n)) return;
 #ifdef AMREX_USE_CUDA
-    if (Gpu::inFuseRegion() && info.isFusable() && n <= Gpu::getFuseSizeThreshold()) {
+    if (Gpu::inFuseRegion() && info.isFusible() && n <= Gpu::getFuseSizeThreshold()) {
         Gpu::Register(n, f);
     } else
 #endif
@@ -661,7 +661,7 @@ ParallelFor (Gpu::KernelInfo const& info, Box const& box, L&& f) noexcept
     if (amrex::isEmpty(box)) return;
     int ncells = box.numPts();
 #ifdef AMREX_USE_CUDA
-    if (Gpu::inFuseRegion() && info.isFusable() && ncells <= Gpu::getFuseSizeThreshold()) {
+    if (Gpu::inFuseRegion() && info.isFusible() && ncells <= Gpu::getFuseSizeThreshold()) {
         Gpu::Register(box, f);
     } else
 #endif
@@ -694,7 +694,7 @@ ParallelFor (Gpu::KernelInfo const& info, Box const& box, T ncomp, L&& f) noexce
     if (amrex::isEmpty(box)) return;
     int ncells = box.numPts();
 #ifdef AMREX_USE_CUDA
-    if (Gpu::inFuseRegion() && info.isFusable() && ncells <= Gpu::getFuseSizeThreshold()) {
+    if (Gpu::inFuseRegion() && info.isFusible() && ncells <= Gpu::getFuseSizeThreshold()) {
         Gpu::Register(box, ncomp, f);
     } else
 #endif
@@ -732,7 +732,7 @@ ParallelFor (Gpu::KernelInfo const& info,
     int ncells2 = box2.numPts();
     int ncells = amrex::max(ncells1, ncells2);
 #ifdef AMREX_USE_CUDA
-    if (Gpu::inFuseRegion() && info.isFusable() && ncells <= Gpu::getFuseSizeThreshold()) {
+    if (Gpu::inFuseRegion() && info.isFusible() && ncells <= Gpu::getFuseSizeThreshold()) {
         Gpu::Register(box1, f1);
         Gpu::Register(box2, f2);
     } else
@@ -784,7 +784,7 @@ ParallelFor (Gpu::KernelInfo const& info,
     int ncells3 = box3.numPts();
     int ncells = amrex::max(ncells1, ncells2, ncells3);
 #ifdef AMREX_USE_CUDA
-    if (Gpu::inFuseRegion() && info.isFusable() && ncells <= Gpu::getFuseSizeThreshold()) {
+    if (Gpu::inFuseRegion() && info.isFusible() && ncells <= Gpu::getFuseSizeThreshold()) {
         Gpu::Register(box1, f1);
         Gpu::Register(box2, f2);
         Gpu::Register(box3, f3);
@@ -849,7 +849,7 @@ ParallelFor (Gpu::KernelInfo const& info,
     int ncells2 = box2.numPts();
     int ncells = amrex::max(ncells1, ncells2);
 #ifdef AMREX_USE_CUDA
-    if (Gpu::inFuseRegion() && info.isFusable() && ncells <= Gpu::getFuseSizeThreshold()) {
+    if (Gpu::inFuseRegion() && info.isFusible() && ncells <= Gpu::getFuseSizeThreshold()) {
         Gpu::Register(box1, ncomp1, f1);
         Gpu::Register(box2, ncomp2, f2);
     } else
@@ -909,7 +909,7 @@ ParallelFor (Gpu::KernelInfo const& info,
     int ncells3 = box3.numPts();
     int ncells = amrex::max(ncells1, ncells2, ncells3);
 #ifdef AMREX_USE_CUDA
-    if (Gpu::inFuseRegion() && info.isFusable() && ncells <= Gpu::getFuseSizeThreshold()) {
+    if (Gpu::inFuseRegion() && info.isFusible() && ncells <= Gpu::getFuseSizeThreshold()) {
         Gpu::Register(box1, ncomp1, f1);
         Gpu::Register(box2, ncomp2, f2);
         Gpu::Register(box3, ncomp3, f3);
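// Editor's note: every ParallelFor overload above repeats one guard. Restated
// outside the library for clarity (names as in the diff): a kernel is queued
// for fusing only when a fuse region is open, the caller opted in, and the
// box is small; otherwise it launches immediately as before.
if (Gpu::inFuseRegion() && info.isFusible()
    && ncells <= Gpu::getFuseSizeThreshold()) {
    Gpu::Register(box, f);   // deferred; later launched as one fused kernel
} else {
    // normal immediate launch path (unchanged by this patch)
}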
diff --git a/Src/Base/AMReX_GpuLaunchMacrosG.H b/Src/Base/AMReX_GpuLaunchMacrosG.H
index bb69202a9d9..45400858abd 100644
--- a/Src/Base/AMReX_GpuLaunchMacrosG.H
+++ b/Src/Base/AMReX_GpuLaunchMacrosG.H
@@ -54,7 +54,7 @@
         } \
     }}}
 #ifdef AMREX_USE_CUDA
-#define AMREX_GPU_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA_RANGE(TN,TI,block) \
+#define AMREX_GPU_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA_RANGE(TN,TI,block) \
     { auto const& amrex_i_tn = TN; \
     if (!amrex::isEmpty(amrex_i_tn)) { \
     if (amrex::Gpu::inLaunchRegion()) \
@@ -163,7 +163,7 @@
         } \
     }}}
 #ifdef AMREX_USE_CUDA
-#define AMREX_GPU_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA_RANGE_2(TN1,TI1,block1,TN2,TI2,block2) \
+#define AMREX_GPU_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA_RANGE_2(TN1,TI1,block1,TN2,TI2,block2) \
     { auto const& amrex_i_tn1 = TN1; auto const& amrex_i_tn2 = TN2; \
     if (!amrex::isEmpty(amrex_i_tn1) or !amrex::isEmpty(amrex_i_tn2)) { \
     if (amrex::Gpu::inLaunchRegion()) \
@@ -307,7 +307,7 @@
         } \
     }}}
 #ifdef AMREX_USE_CUDA
-#define AMREX_GPU_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA_RANGE_3(TN1,TI1,block1,TN2,TI2,block2,TN3,TI3,block3) \
+#define AMREX_GPU_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA_RANGE_3(TN1,TI1,block1,TN2,TI2,block2,TN3,TI3,block3) \
     { auto const& amrex_i_tn1 = TN1; auto const& amrex_i_tn2 = TN2; auto const& amrex_i_tn3 = TN3; \
     if (!amrex::isEmpty(amrex_i_tn1) or !amrex::isEmpty(amrex_i_tn2) or !amrex::isEmpty(amrex_i_tn3)) { \
     if (amrex::Gpu::inLaunchRegion()) \
@@ -583,12 +583,12 @@
         } \
     }
-#define AMREX_GPU_HOST_DEVICE_FOR_1D_FUSABLE(n,i,block) \
+#define AMREX_GPU_HOST_DEVICE_FOR_1D_FUSIBLE(n,i,block) \
     { \
         auto const& amrex_i_n = n; \
         using amrex_i_inttype = typename std::remove_const<decltype(amrex_i_n)>::type; \
         if (amrex::Gpu::inLaunchRegion()) { \
-            amrex::ParallelFor(amrex::Gpu::KernelInfo().setFusable(true),amrex_i_n,[=] AMREX_GPU_DEVICE (amrex_i_inttype i) noexcept block); \
+            amrex::ParallelFor(amrex::Gpu::KernelInfo().setFusible(true),amrex_i_n,[=] AMREX_GPU_DEVICE (amrex_i_inttype i) noexcept block); \
         } else { \
             auto amrex_i_lambda = [=] (amrex_i_inttype i) noexcept block; \
             AMREX_PRAGMA_SIMD \
@@ -614,11 +614,11 @@
         } \
     }
-#define AMREX_GPU_HOST_DEVICE_FOR_3D_FUSABLE(box,i,j,k,block) \
+#define AMREX_GPU_HOST_DEVICE_FOR_3D_FUSIBLE(box,i,j,k,block) \
     { \
         auto const& amrex_i_box = box; \
         if (amrex::Gpu::inLaunchRegion()) { \
-            amrex::ParallelFor(amrex::Gpu::KernelInfo().setFusable(true),amrex_i_box,[=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept block); \
+            amrex::ParallelFor(amrex::Gpu::KernelInfo().setFusible(true),amrex_i_box,[=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept block); \
         } else { \
             amrex::LoopConcurrentOnCpu(amrex_i_box,[=] (int i, int j, int k) noexcept block); \
         } \
@@ -642,12 +642,12 @@
         } \
     }
-#define AMREX_GPU_HOST_DEVICE_FOR_4D_FUSABLE(box,ncomp,i,j,k,n,block) \
+#define AMREX_GPU_HOST_DEVICE_FOR_4D_FUSIBLE(box,ncomp,i,j,k,n,block) \
     { \
         auto const& amrex_i_box = box; \
         auto const& amrex_i_ncomp = ncomp; \
         if (amrex::Gpu::inLaunchRegion()) { \
-            amrex::ParallelFor(amrex::Gpu::KernelInfo().setFusable(true),amrex_i_box,amrex_i_ncomp,[=] AMREX_GPU_DEVICE (int i, int j, int k, int n) noexcept block); \
+            amrex::ParallelFor(amrex::Gpu::KernelInfo().setFusible(true),amrex_i_box,amrex_i_ncomp,[=] AMREX_GPU_DEVICE (int i, int j, int k, int n) noexcept block); \
         } else { \
             amrex::LoopConcurrentOnCpu(amrex_i_box,amrex_i_ncomp,[=] (int i, int j, int k, int n) noexcept block); \
         } \
     }
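// Editor's sketch, not part of the patch: the CPU fallbacks above mean a
// _FUSIBLE call site is safe on both paths. `n` (element count) and the
// device-accessible pointer `p` are assumed.
AMREX_GPU_HOST_DEVICE_FOR_1D_FUSIBLE(n, i,
{
    p[i] *= 2.0;  // GPU: (possibly fused) kernel; CPU: SIMD loop per the #else
});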
diff --git a/Src/Base/AMReX_MultiFab.cpp b/Src/Base/AMReX_MultiFab.cpp
index a748bcd2eb8..2ce916e627e 100644
--- a/Src/Base/AMReX_MultiFab.cpp
+++ b/Src/Base/AMReX_MultiFab.cpp
@@ -218,7 +218,7 @@ MultiFab::Swap (MultiFab& dst, MultiFab& src,
         if (bx.ok()) {
             auto sfab = src.array(mfi);
             auto dfab = dst.array(mfi);
-            AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSABLE ( bx, numcomp, i, j, k, n,
+            AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE ( bx, numcomp, i, j, k, n,
             {
                 const amrex::Real tmp = dfab(i,j,k,n+dstcomp);
                 dfab(i,j,k,n+dstcomp) = sfab(i,j,k,n+srccomp);
@@ -318,7 +318,7 @@ MultiFab::Saxpy (MultiFab& dst, Real a, const MultiFab& src,
         if (bx.ok()) {
             auto const sfab = src.array(mfi);
             auto dfab = dst.array(mfi);
-            AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSABLE ( bx, numcomp, i, j, k, n,
+            AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE ( bx, numcomp, i, j, k, n,
             {
                 dfab(i,j,k,dstcomp+n) += a * sfab(i,j,k,srccomp+n);
             });
@@ -352,7 +352,7 @@ MultiFab::Xpay (MultiFab& dst, Real a, const MultiFab& src,
         if (bx.ok()) {
             auto const sfab = src.array(mfi);
             auto dfab = dst.array(mfi);
-            AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSABLE ( bx, numcomp, i, j, k, n,
+            AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE ( bx, numcomp, i, j, k, n,
             {
                 dfab(i,j,k,n+dstcomp) = sfab(i,j,k,n+srccomp) + a * dfab(i,j,k,n+dstcomp);
             });
@@ -394,7 +394,7 @@ MultiFab::LinComb (MultiFab& dst,
             auto const xfab = x.array(mfi);
             auto const yfab = y.array(mfi);
             auto dfab = dst.array(mfi);
-            AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSABLE ( bx, numcomp, i, j, k, n,
+            AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE ( bx, numcomp, i, j, k, n,
             {
                 dfab(i,j,k,dstcomp+n) = a*xfab(i,j,k,xcomp+n) + b*yfab(i,j,k,ycomp+n);
             });
@@ -435,7 +435,7 @@ MultiFab::AddProduct (MultiFab& dst,
             auto const s1fab = src1.array(mfi);
             auto const s2fab = src2.array(mfi);
             auto dfab = dst.array(mfi);
-            AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSABLE ( bx, numcomp, i, j, k, n,
+            AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE ( bx, numcomp, i, j, k, n,
             {
                 dfab(i,j,k,n+dstcomp) += s1fab(i,j,k,n+comp1) * s2fab(i,j,k,n+comp2);
             });
diff --git a/Src/Base/AMReX_MultiFabUtil.cpp b/Src/Base/AMReX_MultiFabUtil.cpp
index 3acbeeabe5f..fb436d6dcaf 100644
--- a/Src/Base/AMReX_MultiFabUtil.cpp
+++ b/Src/Base/AMReX_MultiFabUtil.cpp
@@ -67,7 +67,7 @@ namespace amrex
             Array4<Real> const& ccarr = cc.array(mfi);
             Array4<Real const> const& ndarr = nd.const_array(mfi);
-            AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA ( bx, tbx,
+            AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA ( bx, tbx,
             {
                 amrex_avg_nd_to_cc(tbx, ccarr, ndarr, dcomp, scomp, ncomp);
             });
@@ -91,7 +91,7 @@ namespace amrex
                          Array4<Real const> const& eyarr = edge[1]->const_array(mfi);,
                          Array4<Real const> const& ezarr = edge[2]->const_array(mfi););
-            AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA ( bx, tbx,
+            AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA ( bx, tbx,
             {
                 amrex_avg_eg_to_cc(tbx, ccarr, AMREX_D_DECL(exarr,eyarr,ezarr), dcomp);
             });
@@ -133,12 +133,12 @@ namespace amrex
                          Array4<Real const> const& fzarr = fc[2]->const_array(mfi););
 #if (AMREX_SPACEDIM == 1)
-            AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA ( bx, tbx,
+            AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA ( bx, tbx,
             {
                 amrex_avg_fc_to_cc(tbx, ccarr, fxarr, dcomp, GeometryData());
             });
 #else
-            AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA ( bx, tbx,
+            AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA ( bx, tbx,
             {
                 amrex_avg_fc_to_cc(tbx, ccarr, AMREX_D_DECL(fxarr,fyarr,fzarr), dcomp);
             });
@@ -168,12 +168,12 @@ namespace amrex
                          Array4<Real const> const& fzarr = fc[2]->const_array(mfi););
 #if (AMREX_SPACEDIM == 1)
-            AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA ( bx, tbx,
+            AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA ( bx, tbx,
             {
                 amrex_avg_fc_to_cc(tbx, ccarr, fxarr, 0, gd);
             });
 #else
-            AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA ( bx, tbx,
+            AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA ( bx, tbx,
             {
                 amrex_avg_fc_to_cc(tbx, ccarr, AMREX_D_DECL(fxarr,fyarr,fzarr), 0);
             });
@@ -218,12 +218,12 @@ namespace amrex
             Array4<Real const> const& ccarr = cc.const_array(mfi);
 #if (AMREX_SPACEDIM == 1)
-            AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA (index_bounds, tbx,
+            AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA (index_bounds, tbx,
             {
                 amrex_avg_cc_to_fc(tbx, xbx, fxarr, ccarr, gd);
             });
 #else
-            AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA (index_bounds, tbx,
+            AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA (index_bounds, tbx,
             {
                 amrex_avg_cc_to_fc(tbx, AMREX_D_DECL(xbx,ybx,zbx),
                                    AMREX_D_DECL(fxarr,fyarr,fzarr), ccarr);
@@ -288,7 +288,7 @@ namespace amrex
             Array4<Real const> const& finearr = S_fine.const_array(mfi);
             Array4<Real const> const& finevolarr = fvolume.const_array(mfi);
-            AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA ( bx, tbx,
+            AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA ( bx, tbx,
             {
                 amrex_avgdown_with_vol(tbx,crsearr,finearr,finevolarr,
                                        0,scomp,ncomp,ratio);
@@ -337,7 +337,7 @@ namespace amrex
             Array4<Real> const& crsearr = crse_S_fine.array(mfi);
             Array4<Real const> const& finearr = S_fine.const_array(mfi);
-            AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA ( bx, tbx,
+            AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA ( bx, tbx,
             {
                 amrex_avgdown(tbx,crsearr,finearr,0,scomp,ncomp,ratio);
             });
@@ -375,12 +375,12 @@ namespace amrex
             Array4<Real const> const& finearr = S_fine.const_array(mfi);
             if (is_cell_centered) {
-                AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA ( bx, tbx,
+                AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA ( bx, tbx,
                 {
                     amrex_avgdown(tbx,crsearr,finearr,scomp,scomp,ncomp,ratio);
                 });
             } else {
-                AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA ( bx, tbx,
+                AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA ( bx, tbx,
                 {
                     amrex_avgdown_nodes(tbx,crsearr,finearr,scomp,scomp,ncomp,ratio);
                 });
@@ -406,12 +406,12 @@ namespace amrex
             // not part of the actual crse multifab which came in.
             if (is_cell_centered) {
-                AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA ( bx, tbx,
+                AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA ( bx, tbx,
                 {
                     amrex_avgdown(tbx,crsearr,finearr,0,scomp,ncomp,ratio);
                 });
             } else {
-                AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA ( bx, tbx,
+                AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA ( bx, tbx,
                 {
                     amrex_avgdown_nodes(tbx,crsearr,finearr,0,scomp,ncomp,ratio);
                 });
@@ -486,7 +486,7 @@ namespace amrex
             Array4<Real> const& crsearr = crse.array(mfi);
             Array4<Real const> const& finearr = fine.const_array(mfi);
-            AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA ( bx, tbx,
+            AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA ( bx, tbx,
             {
                 amrex_avgdown_faces(tbx, crsearr, finearr, 0, 0, ncomp, ratio, dir);
             });
@@ -568,7 +568,7 @@ namespace amrex
             Array4<Real> const& crsearr = crse.array(mfi);
             Array4<Real const> const& finearr = fine.const_array(mfi);
-            AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA ( bx, tbx,
+            AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA ( bx, tbx,
             {
                 amrex_avgdown_edges(tbx, crsearr, finearr, 0, 0, ncomp, ratio, dir);
             });
@@ -640,7 +640,7 @@ namespace amrex
             if (interpolate)
             {
-                AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA ( tile_box, thread_box,
+                AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA ( tile_box, thread_box,
                 {
                     amrex_fill_slice_interp(thread_box, slice_arr, full_arr,
                                             0, start_comp, ncomp,
diff --git a/Src/LinearSolvers/MLMG/AMReX_MLABecLaplacian.cpp b/Src/LinearSolvers/MLMG/AMReX_MLABecLaplacian.cpp
index e641313c7a5..652baa1bf92 100644
--- a/Src/LinearSolvers/MLMG/AMReX_MLABecLaplacian.cpp
+++ b/Src/LinearSolvers/MLMG/AMReX_MLABecLaplacian.cpp
@@ -410,13 +410,13 @@ MLABecLaplacian::Fapply (int amrlev, int mglev, MultiFab& out, const MultiFab& i
                      const auto& bzfab = bzcoef.array(mfi););
         if (m_overset_mask[amrlev][mglev]) {
             const auto& osm = m_overset_mask[amrlev][mglev]->array(mfi);
-            AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA ( bx, tbx,
+            AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA ( bx, tbx,
             {
                 mlabeclap_adotx_os(tbx, yfab, xfab, afab, AMREX_D_DECL(bxfab,byfab,bzfab),
                                    osm, dxinv, ascalar, bscalar, ncomp);
             });
         } else {
-            AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA ( bx, tbx,
+            AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA ( bx, tbx,
             {
                 mlabeclap_adotx(tbx, yfab, xfab, afab, AMREX_D_DECL(bxfab,byfab,bzfab),
                                 dxinv, ascalar, bscalar, ncomp);
@@ -454,7 +454,7 @@ MLABecLaplacian::normalize (int amrlev, int mglev, MultiFab& mf) const
                      const auto& byfab = bycoef.array(mfi);,
                      const auto& bzfab = bzcoef.array(mfi););
-        AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA ( bx, tbx,
+        AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA ( bx, tbx,
         {
             mlabeclap_normalize(tbx, fab, afab, AMREX_D_DECL(bxfab,byfab,bzfab),
                                 dxinv, ascalar, bscalar, ncomp);
@@ -609,7 +609,7 @@ MLABecLaplacian::Fsmooth (int amrlev, int mglev, MultiFab& sol, const MultiFab&
 #else
         if (m_overset_mask[amrlev][mglev]) {
             const auto& osm = m_overset_mask[amrlev][mglev]->array(mfi);
-            AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA ( tbx, thread_box,
+            AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA ( tbx, thread_box,
             {
                 abec_gsrb_os(thread_box, solnfab, rhsfab, alpha, afab,
                              AMREX_D_DECL(dhx, dhy, dhz),
@@ -621,7 +621,7 @@ MLABecLaplacian::Fsmooth (int amrlev, int mglev, MultiFab& sol, const MultiFab&
                              osm, vbx, redblack, nc);
             });
         } else if (regular_coarsening) {
-            AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA ( tbx, thread_box,
+            AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA ( tbx, thread_box,
             {
                 abec_gsrb(thread_box, solnfab, rhsfab, alpha, afab,
                           AMREX_D_DECL(dhx, dhy, dhz),
@@ -688,7 +688,7 @@ MLABecLaplacian::FFlux (Box const& box, Real const* dxinv, Real bscalar,
         Real fac = bscalar*dxinv[0];
         Box blo = amrex::bdryLo(box, 0);
         int blen = box.length(0);
-        AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA ( blo, tbox,
+        AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA ( blo, tbox,
         {
             mlabeclap_flux_xface(tbox, fxarr, solarr, bx, fac, blen, ncomp);
         });
@@ -696,7 +696,7 @@ MLABecLaplacian::FFlux (Box const& box, Real const* dxinv, Real bscalar,
         fac = bscalar*dxinv[1];
         blo = amrex::bdryLo(box, 1);
         blen = box.length(1);
-        AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA ( blo, tbox,
+        AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA ( blo, tbox,
         {
             mlabeclap_flux_yface(tbox, fyarr, solarr, by, fac, blen, ncomp);
         });
@@ -705,7 +705,7 @@ MLABecLaplacian::FFlux (Box const& box, Real const* dxinv, Real bscalar,
         fac = bscalar*dxinv[2];
         blo = amrex::bdryLo(box, 2);
         blen = box.length(2);
-        AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA ( blo, tbox,
+        AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA ( blo, tbox,
         {
             mlabeclap_flux_zface(tbox, fzarr, solarr, bz, fac, blen, ncomp);
         });
@@ -715,14 +715,14 @@ MLABecLaplacian::FFlux (Box const& box, Real const* dxinv, Real bscalar,
     {
         Real fac = bscalar*dxinv[0];
         Box bflux = amrex::surroundingNodes(box, 0);
-        AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA ( bflux, tbox,
+        AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA ( bflux, tbox,
         {
             mlabeclap_flux_x(tbox, fxarr, solarr, bx, fac, ncomp);
         });
 #if (AMREX_SPACEDIM >= 2)
         fac = bscalar*dxinv[1];
         bflux = amrex::surroundingNodes(box, 1);
-        AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA ( bflux, tbox,
+        AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA ( bflux, tbox,
         {
             mlabeclap_flux_y(tbox, fyarr, solarr, by, fac, ncomp);
         });
@@ -730,7 +730,7 @@ MLABecLaplacian::FFlux (Box const& box, Real const* dxinv, Real bscalar,
 #if (AMREX_SPACEDIM == 3)
         fac = bscalar*dxinv[2];
         bflux = amrex::surroundingNodes(box, 2);
-        AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA ( bflux, tbox,
+        AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA ( bflux, tbox,
         {
             mlabeclap_flux_z(tbox, fzarr, solarr, bz, fac, ncomp);
         });
diff --git a/Src/LinearSolvers/MLMG/AMReX_MLCellLinOp.cpp b/Src/LinearSolvers/MLMG/AMReX_MLCellLinOp.cpp
index c854898a693..bb88308d595 100644
--- a/Src/LinearSolvers/MLMG/AMReX_MLCellLinOp.cpp
+++ b/Src/LinearSolvers/MLMG/AMReX_MLCellLinOp.cpp
@@ -314,7 +314,7 @@ MLCellLinOp::interpolation (int amrlev, int fmglev, MultiFab& fine, const MultiF
         const Box& bx = mfi.tilebox();
         Array4<Real const> const& cfab = crse.const_array(mfi);
         Array4<Real> const& ffab = fine.array(mfi);
-        AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSABLE ( bx, ncomp, i, j, k, n,
+        AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE ( bx, ncomp, i, j, k, n,
         {
             int ic = amrex::coarsen(i,ratio3.x);
             int jc = amrex::coarsen(j,ratio3.y);
@@ -505,7 +505,7 @@ MLCellLinOp::applyBC (int amrlev, int mglev, MultiFab& in, BCMode bc_mode, State
                 const Real bcllo = bdlv[icomp][olo];
                 const Real bclhi = bdlv[icomp][ohi];
                 if (idim == 0) {
-                    AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA (
+                    AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA (
                     blo, tboxlo,
                     {
                         mllinop_apply_bc_x(0, tboxlo, blen, iofab, mlo,
                                            bctlo, bcllo, bvlo,
@@ -517,7 +517,7 @@ MLCellLinOp::applyBC (int amrlev, int mglev, MultiFab& in, BCMode bc_mode, State
                                            imaxorder, dxi, flagbc, icomp);
                     });
                 } else if (idim == 1) {
-                    AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA (
+                    AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA (
                     blo, tboxlo,
                     {
                         mllinop_apply_bc_y(0, tboxlo, blen, iofab, mlo,
                                            bctlo, bcllo, bvlo,
@@ -529,7 +529,7 @@ MLCellLinOp::applyBC (int amrlev, int mglev, MultiFab& in, BCMode bc_mode, State
                                            imaxorder, dyi, flagbc, icomp);
                     });
                 } else {
-                    AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA (
+                    AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA (
                     blo, tboxlo,
                     {
                         mllinop_apply_bc_z(0, tboxlo, blen, iofab, mlo,
                                            bctlo, bcllo, bvlo,
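// Editor's note: the boundary and flux kernels above run on face boxes that
// are tiny on coarse multigrid levels, which is where fusing pays off.
// Sketch under an assumption: a Gpu::setFuseRegion(bool) toggle pairing with
// the Gpu::inFuseRegion() query used above (assumed API, not in this hunk).
bool old_flag = amrex::Gpu::setFuseRegion(true);
for (amrex::MFIter mfi(mf); mfi.isValid(); ++mfi) {
    // ... fusible launches (e.g. AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA) ...
}
amrex::Gpu::setFuseRegion(old_flag);  // restore the previous state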
diff --git a/Src/LinearSolvers/MLMG/AMReX_MLPoisson.cpp b/Src/LinearSolvers/MLMG/AMReX_MLPoisson.cpp
index aba34c49dd5..c556f22824a 100644
--- a/Src/LinearSolvers/MLMG/AMReX_MLPoisson.cpp
+++ b/Src/LinearSolvers/MLMG/AMReX_MLPoisson.cpp
@@ -77,30 +77,30 @@ MLPoisson::Fapply (int amrlev, int mglev, MultiFab& out, const MultiFab& in) con
         const auto& yfab = out.array(mfi);
 #if (AMREX_SPACEDIM == 3)
-        AMREX_HOST_DEVICE_PARALLEL_FOR_3D_FUSABLE (bx, i, j, k,
+        AMREX_HOST_DEVICE_PARALLEL_FOR_3D_FUSIBLE (bx, i, j, k,
         {
             mlpoisson_adotx(i, j, k, yfab, xfab, dhx, dhy, dhz);
         });
 #elif (AMREX_SPACEDIM == 2)
         if (m_has_metric_term) {
-            AMREX_HOST_DEVICE_PARALLEL_FOR_3D_FUSABLE (bx, i, j, k,
+            AMREX_HOST_DEVICE_PARALLEL_FOR_3D_FUSIBLE (bx, i, j, k,
             {
                 mlpoisson_adotx_m(i, j, yfab, xfab, dhx, dhy, dx, probxlo);
             });
         } else {
-            AMREX_HOST_DEVICE_PARALLEL_FOR_3D_FUSABLE (bx, i, j, k,
+            AMREX_HOST_DEVICE_PARALLEL_FOR_3D_FUSIBLE (bx, i, j, k,
             {
                 mlpoisson_adotx(i, j, yfab, xfab, dhx, dhy);
             });
         }
 #elif (AMREX_SPACEDIM == 1)
         if (m_has_metric_term) {
-            AMREX_HOST_DEVICE_PARALLEL_FOR_3D_FUSABLE (bx, i, j, k,
+            AMREX_HOST_DEVICE_PARALLEL_FOR_3D_FUSIBLE (bx, i, j, k,
             {
                 mlpoisson_adotx_m(i, yfab, xfab, dhx, dx, probxlo);
             });
         } else {
-            AMREX_HOST_DEVICE_PARALLEL_FOR_3D_FUSABLE (bx, i, j, k,
+            AMREX_HOST_DEVICE_PARALLEL_FOR_3D_FUSIBLE (bx, i, j, k,
             {
                 mlpoisson_adotx(i, yfab, xfab, dhx);
             });
@@ -134,12 +134,12 @@ MLPoisson::normalize (int amrlev, int mglev, MultiFab& mf) const
         const auto& fab = mf.array(mfi);
 #if (AMREX_SPACEDIM == 2)
-        AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA ( bx, tbx,
+        AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA ( bx, tbx,
         {
             mlpoisson_normalize(tbx, fab, dhx, dhy, dx, probxlo);
         });
 #else
-        AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA ( bx, tbx,
+        AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA ( bx, tbx,
         {
             mlpoisson_normalize(tbx, fab, dhx, dx, probxlo);
         });