AMReX-Codes · WeiqunZhang · Sep 21, 2020 · Jul 14, 2020 · Sep 15, 2020 · Sep 15, 2020
diff --git a/Src/Base/AMReX.cpp b/Src/Base/AMReX.cpp
@@ -522,7 +522,7 @@ amrex::Initialize (int& argc, char**& argv, bool build_parm_parse,
     BL_PROFILE_INITPARAMS();
 #endif
     machine::Initialize();
-#ifdef AMREX_USE_CUDA
+#ifdef AMREX_USE_GPU
     Gpu::Fuser::Initialize();
 #endif
 

diff --git a/Src/Base/AMReX_FBI.H b/Src/Base/AMReX_FBI.H
diff --git a/Src/Base/AMReX_FabArray.H b/Src/Base/AMReX_FabArray.H
@@ -718,7 +718,7 @@ public:
 #endif
 
     static void pack_send_buffer_gpu (FabArray<FAB> const& src, int scomp, int ncomp,
-                                      Vector<char*>& send_data,
+                                      Vector<char*> const& send_data,
                                       Vector<std::size_t> const& send_size,
                                       Vector<const CopyComTagsContainer*> const& send_cctc);
 
@@ -731,7 +731,7 @@ public:
 #endif
 
     static void pack_send_buffer_cpu (FabArray<FAB> const& src, int scomp, int ncomp,
-                                      Vector<char*>& send_data,
+                                      Vector<char*> const& send_data,
                                       Vector<std::size_t> const& send_size,
                                       Vector<const CopyComTagsContainer*> const& send_cctc);
 
@@ -1582,7 +1582,7 @@ FabArray<FAB>::setVal (value_type val,
     {
 	const Box& bx = fai.growntilebox(nghost);
         auto fab = this->array(fai);
-        AMREX_HOST_DEVICE_PARALLEL_FOR_4D ( bx, ncomp, i, j, k, n,
+        AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE ( bx, ncomp, i, j, k, n,
         {
             fab(i,j,k,n+comp) = val;
         });
@@ -1625,7 +1625,7 @@ FabArray<FAB>::setVal (value_type val,
 
         if (b.ok()) {
             auto fab = this->array(fai);
-            AMREX_HOST_DEVICE_PARALLEL_FOR_4D ( b, ncomp, i, j, k, n,
+            AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE ( b, ncomp, i, j, k, n,
             {
                 fab(i,j,k,n+comp) = val;
             });
@@ -1655,7 +1655,7 @@ FabArray<FAB>::abs (int comp, int ncomp, const IntVect& nghost)
     {
         const Box& bx = mfi.growntilebox(nghost);
         auto fab = this->array(mfi);
-        AMREX_HOST_DEVICE_PARALLEL_FOR_4D ( bx, ncomp, i, j, k, n,
+        AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE ( bx, ncomp, i, j, k, n,
         {
             fab(i,j,k,n+comp) = amrex::Math::abs(fab(i,j,k,n+comp));
         });
@@ -1674,7 +1674,7 @@ FabArray<FAB>::plus (value_type val, int comp, int num_comp, int nghost)
     {
         const Box& bx = mfi.growntilebox(nghost);
         auto fab = this->array(mfi);
-        AMREX_HOST_DEVICE_PARALLEL_FOR_4D ( bx, num_comp, i, j, k, n,
+        AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE ( bx, num_comp, i, j, k, n,
         {
             fab(i,j,k,n+comp) += val;
         });
@@ -1694,7 +1694,7 @@ FabArray<FAB>::plus (value_type val, const Box& region, int comp, int num_comp,
         const Box& bx = mfi.growntilebox(nghost) & region;
         if (bx.ok()) {
             auto fab = this->array(mfi);
-            AMREX_HOST_DEVICE_PARALLEL_FOR_4D ( bx, num_comp, i, j, k, n,
+            AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE ( bx, num_comp, i, j, k, n,
             {
                 fab(i,j,k,n+comp) += val;
             });
@@ -1714,7 +1714,7 @@ FabArray<FAB>::mult (value_type val, int comp, int num_comp, int nghost)
     {
         const Box& bx = mfi.growntilebox(nghost);
         auto fab = this->array(mfi);
-        AMREX_HOST_DEVICE_PARALLEL_FOR_4D ( bx, num_comp, i, j, k, n,
+        AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE ( bx, num_comp, i, j, k, n,
         {
             fab(i,j,k,n+comp) *= val;
         });
@@ -1734,7 +1734,7 @@ FabArray<FAB>::mult (value_type val, const Box& region, int comp, int num_comp,
         const Box& bx = mfi.growntilebox(nghost) & region;
         if (bx.ok()) {
             auto fab = this->array(mfi);
-            AMREX_HOST_DEVICE_PARALLEL_FOR_4D ( bx, num_comp, i, j, k, n,
+            AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE ( bx, num_comp, i, j, k, n,
             {
                 fab(i,j,k,n+comp) *= val;
             });
@@ -1754,7 +1754,7 @@ FabArray<FAB>::invert (value_type numerator, int comp, int num_comp, int nghost)
     {
         const Box& bx = mfi.growntilebox(nghost);
         auto fab = this->array(mfi);
-        AMREX_HOST_DEVICE_PARALLEL_FOR_4D ( bx, num_comp, i, j, k, n,
+        AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE ( bx, num_comp, i, j, k, n,
         {
             fab(i,j,k,n+comp) = numerator / fab(i,j,k,n+comp);
         });
@@ -1774,7 +1774,7 @@ FabArray<FAB>::invert (value_type numerator, const Box& region, int comp, int nu
         const Box& bx = mfi.growntilebox(nghost) & region;
         if (bx.ok()) {
             auto fab = this->array(mfi);
-            AMREX_HOST_DEVICE_PARALLEL_FOR_4D ( bx, num_comp, i, j, k, n,
+            AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE ( bx, num_comp, i, j, k, n,
             {
                 fab(i,j,k,n+comp) = numerator / fab(i,j,k,n+comp);
             });
@@ -1970,7 +1970,7 @@ FabArray<FAB>::BuildMask (const Box& phys_domain, const Periodicity& period,
         Box const& fbx = mfi.growntilebox();
         Box const& gbx = fbx & domain;
         Box const& vbx = mfi.validbox();
-        AMREX_HOST_DEVICE_FOR_4D(fbx, ncomp, i, j, k, n,
+        AMREX_HOST_DEVICE_FOR_4D_FUSIBLE(fbx, ncomp, i, j, k, n,
         {
             IntVect iv(AMREX_D_DECL(i,j,k));
             if (vbx.contains(iv)) {

diff --git a/Src/Base/AMReX_FabArrayBase.cpp b/Src/Base/AMReX_FabArrayBase.cpp
@@ -118,7 +118,7 @@ FabArrayBase::Initialize ()
 
 #ifdef AMREX_USE_GPU
     if (ParallelDescriptor::UseGpuAwareMpi()) {
-        the_fa_arena = The_Device_Arena();
+        the_fa_arena = The_Arena();
     } else {
         the_fa_arena = The_Pinned_Arena();
     }

diff --git a/Src/Base/AMReX_FabArrayCommI.H b/Src/Base/AMReX_FabArrayCommI.H
@@ -143,9 +143,7 @@ FabArray<FAB>::FBEP_nowait (int scomp, int ncomp, const IntVect& nghost,
         {
             the_send_data = static_cast<char*>(amrex::The_FA_Arena()->alloc(total_volume));
             for (int i = 0, N = send_size.size(); i < N; ++i) {
-                if (send_size[i] > 0) {
-                    send_data[i] = the_send_data + offset[i];
-                }
+                send_data[i] = the_send_data + offset[i];
             }
         } else {
             the_send_data = nullptr;
@@ -495,9 +493,7 @@ FabArray<FAB>::ParallelCopy (const FabArray<FAB>& src,
             {
                 the_send_data = static_cast<char*>(amrex::The_FA_Arena()->alloc(total_volume));
                 for (int i = 0, N = send_size.size(); i < N; ++i) {
-                    if (send_size[i] > 0) {
-                        send_data[i] = the_send_data + offset[i];
-                    }
+                    send_data[i] = the_send_data + offset[i];
                 }
             }
 
@@ -749,9 +745,9 @@ FabArray<FAB>::PostRcvs (const MapOfCopyComTagContainers&  m_RcvTags,
 
         for (int i = 0; i < nrecv; ++i)
         {
+            recv_data[i] = the_recv_data + offset[i];
             if (recv_size[i] > 0)
             {
-                recv_data[i] = the_recv_data + offset[i];
                 const int rank = ParallelContext::global_to_local_rank(recv_from[i]);
                 const int comm_data_type = ParallelDescriptor::select_comm_data_type(recv_size[i]);
                 if (comm_data_type == 1) {

diff --git a/Src/Base/AMReX_FabArrayUtility.H b/Src/Base/AMReX_FabArrayUtility.H
@@ -153,11 +153,13 @@ ReduceSum_device (FabArray<FAB1> const& fa1, FabArray<FAB2> const& fa2,
     using value_type = typename FAB1::value_type;
     value_type sm = 0;
 
+    BL_PROFILE("ReduceSum_device");
+
     {
         ReduceOps<ReduceOpSum> reduce_op;
         ReduceData<value_type> reduce_data(reduce_op);
         using ReduceTuple = typename decltype(reduce_data)::Type;
-
+        Gpu::FuseReductionSafeGuard rsg(true);
         for (MFIter mfi(fa1); mfi.isValid(); ++mfi)
         {
             const Box& bx = amrex::grow(mfi.validbox(),nghost);
@@ -1467,7 +1469,7 @@ Add (FabArray<FAB>& dst, FabArray<FAB> const& src, int srccomp, int dstcomp, int
         {
             auto const srcFab = src.array(mfi);
             auto       dstFab = dst.array(mfi);
-            AMREX_HOST_DEVICE_PARALLEL_FOR_4D ( bx, numcomp, i, j, k, n,
+            AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE ( bx, numcomp, i, j, k, n,
             {
                 dstFab(i,j,k,n+dstcomp) += srcFab(i,j,k,n+srccomp);
             });
@@ -1499,7 +1501,7 @@ Copy (FabArray<FAB>& dst, FabArray<FAB> const& src, int srccomp, int dstcomp, in
         {
             auto const srcFab = src.array(mfi);
             auto       dstFab = dst.array(mfi);
-            AMREX_HOST_DEVICE_PARALLEL_FOR_4D ( bx, numcomp, i, j, k, n,
+            AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE ( bx, numcomp, i, j, k, n,
             {
                 dstFab(i,j,k,dstcomp+n) = srcFab(i,j,k,srccomp+n);
             });
@@ -1531,7 +1533,7 @@ Subtract (FabArray<FAB>& dst, FabArray<FAB> const& src, int srccomp, int dstcomp
         {
             auto const srcFab = src.array(mfi);
             auto       dstFab = dst.array(mfi);
-            AMREX_HOST_DEVICE_PARALLEL_FOR_4D ( bx, numcomp, i, j, k, n,
+            AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE ( bx, numcomp, i, j, k, n,
             {
                 dstFab(i,j,k,n+dstcomp) -= srcFab(i,j,k,n+srccomp);
             });
@@ -1563,7 +1565,7 @@ Multiply (FabArray<FAB>& dst, FabArray<FAB> const& src, int srccomp, int dstcomp
         {
             auto const srcFab = src.array(mfi);
             auto       dstFab = dst.array(mfi);
-            AMREX_HOST_DEVICE_PARALLEL_FOR_4D ( bx, numcomp, i, j, k, n,
+            AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE ( bx, numcomp, i, j, k, n,
             {
                 dstFab(i,j,k,n+dstcomp) *= srcFab(i,j,k,n+srccomp);
             });
@@ -1595,7 +1597,7 @@ Divide (FabArray<FAB>& dst, FabArray<FAB> const& src, int srccomp, int dstcomp,
         {
             auto const srcFab = src.array(mfi);
             auto       dstFab = dst.array(mfi);
-            AMREX_HOST_DEVICE_PARALLEL_FOR_4D ( bx, numcomp, i, j, k, n,
+            AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE ( bx, numcomp, i, j, k, n,
             {
                 dstFab(i,j,k,n+dstcomp) /= srcFab(i,j,k,n+srccomp);
             });
@@ -1625,7 +1627,7 @@ Abs (FabArray<FAB>& fa, int icomp, int numcomp, const IntVect& nghost)
         if (bx.ok())
         {
             auto const& fab = fa.array(mfi);
-            AMREX_HOST_DEVICE_PARALLEL_FOR_4D ( bx, numcomp, i, j, k, n,
+            AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE ( bx, numcomp, i, j, k, n,
             {
                 fab(i,j,k,n+icomp) = amrex::Math::abs(fab(i,j,k,n+icomp));
             });
@@ -1682,7 +1684,7 @@ OverrideSync (FabArray<FAB> & fa, FabArray<IFAB> const& msk, const Periodicity&
         const Box& bx = mfi.tilebox();
         auto fab = fa.array(mfi);
         auto const ifab = msk.array(mfi);
-        AMREX_HOST_DEVICE_PARALLEL_FOR_4D ( bx, ncomp, i, j, k, n,
+        AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE ( bx, ncomp, i, j, k, n,
         {
             if (!ifab(i,j,k)) fab(i,j,k,n) = 0;
         });

diff --git a/Src/Base/AMReX_Gpu.H b/Src/Base/AMReX_Gpu.H
@@ -9,6 +9,7 @@ namespace amrex { namespace Cuda {} }
 #endif
 
 #include <AMReX_GpuQualifiers.H>
+#include <AMReX_GpuKernelInfo.H>
 #include <AMReX_GpuPrint.H>
 #include <AMReX_GpuAssert.H>
 #include <AMReX_GpuTypes.H>

diff --git a/Src/Base/AMReX_GpuFuse.H b/Src/Base/AMReX_GpuFuse.H
@@ -13,6 +13,8 @@
 namespace amrex {
 namespace Gpu {
 
+#ifdef AMREX_USE_GPU
+
 #ifdef AMREX_USE_CUDA
 
 typedef void (*Lambda1DLauncher)(char*,int);
@@ -229,23 +231,6 @@ private:
     }
 };
 
-Long getFuseSizeThreshold ();
-Long setFuseSizeThreshold (Long new_threshold);
-int getFuseNumKernelsThreshold ();
-int setFuseNumKernelsThreshold (int new_threshold);
-bool inFuseRegion ();
-bool setFuseRegion (bool flag);
-
-struct FuseSafeGuard
-{
-    explicit FuseSafeGuard (bool flag) noexcept
-        : m_old(setFuseRegion(flag)) {}
-    ~FuseSafeGuard () { setFuseRegion(m_old); }
-private:
-    bool m_old;
-};
-
-
 template <typename F>
 void
 Register (Box const& bx, F&& f)
@@ -273,6 +258,56 @@ LaunchFusedKernels ()
     Fuser::getInstance().Launch();
 }
 
+#else
+
+class Fuser  // Declaration-only Fuser used when AMREX_USE_GPU is defined but AMREX_USE_CUDA is not.
+{
+public:
+    static Fuser& getInstance ();  // defined out of line; definition not visible in this header
+    static void Initialize ();     // called from amrex::Initialize under AMREX_USE_GPU
+    static void Finalize ();
+private:
+    static std::unique_ptr<Fuser> m_instance;  // NOTE(review): presumably the object behind getInstance() -- confirm in the .cpp
+};
+
+inline void LaunchFusedKernels () {}  // Empty body: a no-op in the branch where AMREX_USE_CUDA is not defined.
+
+#endif
+
+Long getFuseSizeThreshold ();
+Long setFuseSizeThreshold (Long new_threshold);  // NOTE(review): presumably returns the previous threshold -- confirm in the .cpp
+int getFuseNumKernelsThreshold ();
+int setFuseNumKernelsThreshold (int new_threshold);  // NOTE(review): presumably returns the previous threshold -- confirm in the .cpp
+bool inFuseRegion ();
+bool setFuseRegion (bool flag);  // return value is stored by FuseSafeGuard and passed back in its destructor
+bool inFuseReductionRegion ();
+bool setFuseReductionRegion (bool flag);  // return value is stored by FuseReductionSafeGuard and passed back in its destructor
+
+struct FuseSafeGuard  // Scope guard: calls setFuseRegion(flag) on construction and hands the returned value back on destruction.
+{
+private:
+    bool m_saved;  // value returned by setFuseRegion(flag) in the constructor
+public:
+    explicit FuseSafeGuard (bool flag) noexcept : m_saved{setFuseRegion(flag)} {}
+    ~FuseSafeGuard () { setFuseRegion(m_saved); }
+};
+
+struct FuseReductionSafeGuard  // Scope guard: calls setFuseReductionRegion(flag) on construction and hands the returned value back on destruction.
+{
+private:
+    bool m_saved;  // value returned by setFuseReductionRegion(flag) in the constructor
+public:
+    explicit FuseReductionSafeGuard (bool flag) noexcept : m_saved{setFuseReductionRegion(flag)} {}
+    ~FuseReductionSafeGuard () { setFuseReductionRegion(m_saved); }
+};
+
+#else
+
+struct FuseSafeGuard  // Stand-in for builds without AMREX_USE_GPU: accepts the flag and ignores it.
+{
+    explicit FuseSafeGuard (bool) noexcept {}  // noexcept to match the constructor of the AMREX_USE_GPU version
+};
+
 #endif
 
 }}