BaseFab::lockAdd: Faster version of BaseFab::atomicAdd for OpenMP (AM…

…ReX-Codes#3696) For WarpX's Gordon Bell runs on Fugaku, Stephan Jaure of ATOS optimized atomicAdd using pthread spin locks. This commit implements Stephan's approach using OpenMP. In AMReX::Initialize, we create a number of OMP locks. When BaseFab::lockAdd is called, we loop over planes in z-direction and try to acquire a lock with omp_test_lock. If it's successful, we can access the data in that z-plane without worrying about race conditions. This allows us to use simd instructions instead of omp atomic adds. If it's not successful, we will try a different z-plane. The process stops till all planes are processed.
WeiqunZhang · Jan 12, 2024 · 255d30f · 255d30f
1 parent d440010
commit 255d30f
Show file tree

Hide file tree

Showing 4 changed files with 150 additions and 8 deletions.
diff --git a/Src/Base/AMReX.cpp b/Src/Base/AMReX.cpp
@@ -462,7 +462,7 @@ amrex::Initialize (int& argc, char**& argv, bool build_parm_parse,
 #endif
 
 #ifdef AMREX_USE_OMP
-    amrex::OpenMP::init_threads();
+    amrex::OpenMP::Initialize();
 
     // status output
     if (system::verbose > 0) {
@@ -817,6 +817,10 @@ amrex::Finalize (amrex::AMReX* pamrex)
     Gpu::Device::Finalize();
 #endif
 
+#ifdef AMREX_USE_OMP
+    amrex::OpenMP::Finalize();
+#endif
+
 #if defined(AMREX_USE_UPCXX)
     upcxx::finalize();
 #endif

diff --git a/Src/Base/AMReX_BaseFab.H b/Src/Base/AMReX_BaseFab.H
@@ -20,10 +20,8 @@
 #include <AMReX_Reduce.H>
 #include <AMReX_Gpu.H>
 #include <AMReX_Math.H>
-
-#ifdef AMREX_USE_OMP
-#include <omp.h>
-#endif
+#include <AMReX_OpenMP.H>
+#include <AMReX_MemPool.H>
 
 #include <cmath>
 #include <cstdlib>
@@ -979,6 +977,19 @@ public:
     BaseFab<T>& atomicAdd (const BaseFab<T>& src, const Box& srcbox, const Box& destbox,
                            int srccomp, int destcomp, int numcomp=1) noexcept;
 
+    /**
+    * \brief Atomically add srcbox region of src FAB to destbox region of this FAB.
+    * The srcbox and destbox must be same size. When OMP is on, this uses OMP locks
+    * in the implementation and it's usually faster than atomicAdd.
+    */
+#if defined(AMREX_USE_GPU)
+    template <RunOn run_on>
+#else
+    template <RunOn run_on=RunOn::Host>
+#endif
+    BaseFab<T>& lockAdd (const BaseFab<T>& src, const Box& srcbox, const Box& destbox,
+                         int srccomp, int destcomp, int numcomp) noexcept;
+
     //! FAB SAXPY (y[i] <- y[i] + a * x[i]), in place.
 #if defined(AMREX_USE_GPU)
     template <RunOn run_on>
@@ -3300,6 +3311,98 @@ BaseFab<T>::atomicAdd (const BaseFab<T>& src, const Box& srcbox, const Box& dest
     return *this;
 }
 
+template <class T>
+template <RunOn run_on>
+BaseFab<T>&
+BaseFab<T>::lockAdd (const BaseFab<T>& src, const Box& srcbox, const Box& destbox,
+                     int srccomp, int destcomp, int numcomp) noexcept
+{
+#if defined(AMREX_USE_OMP) && (AMREX_SPACEDIM > 1)
+#if defined(AMREX_USE_GPU)
+    if (run_on == RunOn::Host || Gpu::notInLaunchRegion()) {
+#endif
+        BL_ASSERT(destbox.ok());
+        BL_ASSERT(src.box().contains(srcbox));
+        BL_ASSERT(box().contains(destbox));
+        BL_ASSERT(destbox.sameSize(srcbox));
+        BL_ASSERT(srccomp >= 0 && srccomp+numcomp <= src.nComp());
+        BL_ASSERT(destcomp >= 0 && destcomp+numcomp <= nComp());
+
+        Array4<T> const& d = this->array();
+        Array4<T const> const& s = src.const_array();
+        auto const& dlo = destbox.smallEnd();
+#if (AMREX_SPACEDIM == 3)
+        auto const& dhi = destbox.bigEnd();
+#endif
+        auto const& slo = srcbox.smallEnd();
+        auto const offset = slo - dlo;
+        auto const lenx = srcbox.length(0);
+
+        auto const nplanes = srcbox.length(AMREX_SPACEDIM-1);
+        auto* mask = (bool*) amrex_mempool_alloc(sizeof(bool)*nplanes);
+        for (int ip = 0; ip < nplanes; ++ip) {
+            mask[ip] = false;
+        }
+
+        int mm = 0;
+        int planes_left = nplanes;
+        while (planes_left > 0) {
+            AMREX_ASSERT(mm < nplanes);
+            auto const m = mm + dlo[AMREX_SPACEDIM-1];
+            int ilock = m % OpenMP::nlocks;
+            if (ilock < 0) { ilock += OpenMP::nlocks; }
+            auto* lock = &(OpenMP::omp_locks[ilock]);
+            if (omp_test_lock(lock))
+            {
+                for (int n = 0; n < numcomp; ++n)
+                {
+#if (AMREX_SPACEDIM == 3)
+                    for (int j = dlo[1]; j <= dhi[1]; ++j)
+                    {
+                        IntVect div(dlo[0], j, m);
+#elif (AMREX_SPACEDIM == 2)
+                    {
+                        IntVect div(dlo[0], m);
+#endif
+                        auto      * pdst = d.ptr(div       ,n+destcomp);
+                        auto const* psrc = s.ptr(div+offset,n+srccomp);
+#pragma omp simd
+                        for (int ii = 0; ii < lenx; ++ii) {
+                            pdst[ii] += psrc[ii];
+                        }
+                    }
+                }
+
+                mask[mm] = true;
+                --planes_left;
+                omp_unset_lock(lock);
+                if (planes_left == 0) { break; }
+            }
+
+            ++mm;
+            for (int ip = 0; ip < nplanes; ++ip) {
+                int new_mm = (mm+ip) % nplanes;
+                if ( ! mask[new_mm] ) {
+                    mm = new_mm;
+                    break;
+                }
+            }
+        }
+
+        amrex_mempool_free(mask);
+
+        return *this;
+
+#if defined(AMREX_USE_GPU)
+    } else {
+        return this->template atomicAdd<run_on>(src, srcbox, destbox, srccomp, destcomp, numcomp);
+    }
+#endif
+#else
+    return this->template atomicAdd<run_on>(src, srcbox, destbox, srccomp, destcomp, numcomp);
+#endif
+}
+
 template <class T>
 template <RunOn run_on>
 BaseFab<T>&

diff --git a/Src/Base/AMReX_OpenMP.H b/Src/Base/AMReX_OpenMP.H
@@ -3,7 +3,9 @@
 #include <AMReX_Config.H>
 
 #ifdef AMREX_USE_OMP
+#include <AMReX_Extension.H>
 #include <omp.h>
+#include <array>
 
 namespace amrex::OpenMP {
 
@@ -13,7 +15,11 @@ namespace amrex::OpenMP {
     inline int in_parallel     () { return omp_in_parallel();     }
     inline void set_num_threads (int num) { omp_set_num_threads(num); }
 
-    void init_threads ();
+    void Initialize ();
+    void Finalize ();
+
+    static constexpr int nlocks = 128;
+    extern AMREX_EXPORT std::array<omp_lock_t,nlocks> omp_locks;
 }
 
 #else // AMREX_USE_OMP
@@ -25,7 +31,6 @@ namespace amrex::OpenMP {
     constexpr int get_thread_num  () { return 0; }
     constexpr int in_parallel     () { return false; }
     constexpr void set_num_threads (int) { /* nothing */ }
-    constexpr void init_threads () { /* nothing */ }
 }
 
 #endif // AMREX_USE_OMP

diff --git a/Src/Base/AMReX_OpenMP.cpp b/Src/Base/AMReX_OpenMP.cpp
@@ -135,8 +135,19 @@ namespace amrex
 #ifdef AMREX_USE_OMP
 namespace amrex::OpenMP
 {
-    void init_threads ()
+    std::array<omp_lock_t,nlocks> omp_locks;
+
+    namespace {
+        unsigned int initialized = 0;
+    }
+
+    void Initialize ()
     {
+        if (initialized) {
+            ++initialized;
+            return;
+        }
+
         amrex::ParmParse pp("amrex");
         std::string omp_threads = "system";
         pp.queryAdd("omp_threads", omp_threads);
@@ -173,6 +184,25 @@ namespace amrex::OpenMP
                 }
             }
         }
+
+        for (auto& lck : omp_locks) {
+            omp_init_lock(&lck);
+        }
+
+        ++initialized;
     }
+
+    void Finalize ()
+    {
+        if (initialized) {
+            --initialized;
+            if (initialized == 0) {
+                for (auto& lck : omp_locks) {
+                    omp_destroy_lock(&lck);
+                }
+            }
+        }
+    }
+
 } // namespace amrex::OpenMP
 #endif // AMREX_USE_OMP