From 255d30f387cf2c1a7eff5a31f703c94de803e8d8 Mon Sep 17 00:00:00 2001 From: Weiqun Zhang Date: Fri, 12 Jan 2024 14:48:35 -0800 Subject: [PATCH] BaseFab::lockAdd: Faster version of BaseFab::atomicAdd for OpenMP (#3696) For WarpX's Gordon Bell runs on Fugaku, Stephan Jaure of ATOS optimized atomicAdd using pthread spin locks. This commit implements Stephan's approach using OpenMP. In AMReX::Initialize, we create a number of OMP locks. When BaseFab::lockAdd is called, we loop over planes in z-direction and try to acquire a lock with omp_test_lock. If it's successful, we can access the data in that z-plane without worrying about race conditions. This allows us to use simd instructions instead of omp atomic adds. If it's not successful, we will try a different z-plane. The process continues until all planes are processed. --- Src/Base/AMReX.cpp | 6 ++- Src/Base/AMReX_BaseFab.H | 111 ++++++++++++++++++++++++++++++++++++-- Src/Base/AMReX_OpenMP.H | 9 +++- Src/Base/AMReX_OpenMP.cpp | 32 ++++++++++- 4 files changed, 150 insertions(+), 8 deletions(-) diff --git a/Src/Base/AMReX.cpp b/Src/Base/AMReX.cpp index 4449dab1955..90d2a4ee8cd 100644 --- a/Src/Base/AMReX.cpp +++ b/Src/Base/AMReX.cpp @@ -462,7 +462,7 @@ amrex::Initialize (int& argc, char**& argv, bool build_parm_parse, #endif #ifdef AMREX_USE_OMP - amrex::OpenMP::init_threads(); + amrex::OpenMP::Initialize(); // status output if (system::verbose > 0) { @@ -817,6 +817,10 @@ amrex::Finalize (amrex::AMReX* pamrex) Gpu::Device::Finalize(); #endif +#ifdef AMREX_USE_OMP + amrex::OpenMP::Finalize(); +#endif + #if defined(AMREX_USE_UPCXX) upcxx::finalize(); #endif diff --git a/Src/Base/AMReX_BaseFab.H b/Src/Base/AMReX_BaseFab.H index eb8e5c59615..c4820bbe923 100644 --- a/Src/Base/AMReX_BaseFab.H +++ b/Src/Base/AMReX_BaseFab.H @@ -20,10 +20,8 @@ #include #include #include - -#ifdef AMREX_USE_OMP -#include -#endif +#include +#include #include #include @@ -979,6 +977,19 @@ public: BaseFab& atomicAdd (const BaseFab& src, const Box&
srcbox, const Box& destbox, int srccomp, int destcomp, int numcomp=1) noexcept; + /** + * \brief Atomically add srcbox region of src FAB to destbox region of this FAB. + * The srcbox and destbox must be same size. When OMP is on, this uses OMP locks + * in the implementation and it's usually faster than atomicAdd. + */ +#if defined(AMREX_USE_GPU) + template +#else + template +#endif + BaseFab& lockAdd (const BaseFab& src, const Box& srcbox, const Box& destbox, + int srccomp, int destcomp, int numcomp) noexcept; + //! FAB SAXPY (y[i] <- y[i] + a * x[i]), in place. #if defined(AMREX_USE_GPU) template @@ -3300,6 +3311,98 @@ BaseFab::atomicAdd (const BaseFab& src, const Box& srcbox, const Box& dest return *this; } +template +template +BaseFab& +BaseFab::lockAdd (const BaseFab& src, const Box& srcbox, const Box& destbox, + int srccomp, int destcomp, int numcomp) noexcept +{ +#if defined(AMREX_USE_OMP) && (AMREX_SPACEDIM > 1) +#if defined(AMREX_USE_GPU) + if (run_on == RunOn::Host || Gpu::notInLaunchRegion()) { +#endif + BL_ASSERT(destbox.ok()); + BL_ASSERT(src.box().contains(srcbox)); + BL_ASSERT(box().contains(destbox)); + BL_ASSERT(destbox.sameSize(srcbox)); + BL_ASSERT(srccomp >= 0 && srccomp+numcomp <= src.nComp()); + BL_ASSERT(destcomp >= 0 && destcomp+numcomp <= nComp()); + + Array4 const& d = this->array(); + Array4 const& s = src.const_array(); + auto const& dlo = destbox.smallEnd(); +#if (AMREX_SPACEDIM == 3) + auto const& dhi = destbox.bigEnd(); +#endif + auto const& slo = srcbox.smallEnd(); + auto const offset = slo - dlo; + auto const lenx = srcbox.length(0); + + auto const nplanes = srcbox.length(AMREX_SPACEDIM-1); + auto* mask = (bool*) amrex_mempool_alloc(sizeof(bool)*nplanes); + for (int ip = 0; ip < nplanes; ++ip) { + mask[ip] = false; + } + + int mm = 0; + int planes_left = nplanes; + while (planes_left > 0) { + AMREX_ASSERT(mm < nplanes); + auto const m = mm + dlo[AMREX_SPACEDIM-1]; + int ilock = m % OpenMP::nlocks; + if (ilock < 0) { ilock += 
OpenMP::nlocks; } + auto* lock = &(OpenMP::omp_locks[ilock]); + if (omp_test_lock(lock)) + { + for (int n = 0; n < numcomp; ++n) + { +#if (AMREX_SPACEDIM == 3) + for (int j = dlo[1]; j <= dhi[1]; ++j) + { + IntVect div(dlo[0], j, m); +#elif (AMREX_SPACEDIM == 2) + { + IntVect div(dlo[0], m); +#endif + auto * pdst = d.ptr(div ,n+destcomp); + auto const* psrc = s.ptr(div+offset,n+srccomp); +#pragma omp simd + for (int ii = 0; ii < lenx; ++ii) { + pdst[ii] += psrc[ii]; + } + } + } + + mask[mm] = true; + --planes_left; + omp_unset_lock(lock); + if (planes_left == 0) { break; } + } + + ++mm; + for (int ip = 0; ip < nplanes; ++ip) { + int new_mm = (mm+ip) % nplanes; + if ( ! mask[new_mm] ) { + mm = new_mm; + break; + } + } + } + + amrex_mempool_free(mask); + + return *this; + +#if defined(AMREX_USE_GPU) + } else { + return this->template atomicAdd(src, srcbox, destbox, srccomp, destcomp, numcomp); + } +#endif +#else + return this->template atomicAdd(src, srcbox, destbox, srccomp, destcomp, numcomp); +#endif +} + template template BaseFab& diff --git a/Src/Base/AMReX_OpenMP.H b/Src/Base/AMReX_OpenMP.H index ce267b9be73..2b53ea8c6ec 100644 --- a/Src/Base/AMReX_OpenMP.H +++ b/Src/Base/AMReX_OpenMP.H @@ -3,7 +3,9 @@ #include #ifdef AMREX_USE_OMP +#include #include +#include namespace amrex::OpenMP { @@ -13,7 +15,11 @@ namespace amrex::OpenMP { inline int in_parallel () { return omp_in_parallel(); } inline void set_num_threads (int num) { omp_set_num_threads(num); } - void init_threads (); + void Initialize (); + void Finalize (); + + static constexpr int nlocks = 128; + extern AMREX_EXPORT std::array omp_locks; } #else // AMREX_USE_OMP @@ -25,7 +31,6 @@ namespace amrex::OpenMP { constexpr int get_thread_num () { return 0; } constexpr int in_parallel () { return false; } constexpr void set_num_threads (int) { /* nothing */ } - constexpr void init_threads () { /* nothing */ } } #endif // AMREX_USE_OMP diff --git a/Src/Base/AMReX_OpenMP.cpp b/Src/Base/AMReX_OpenMP.cpp index 
c0c33ce962f..15bb1246071 100644 --- a/Src/Base/AMReX_OpenMP.cpp +++ b/Src/Base/AMReX_OpenMP.cpp @@ -135,8 +135,19 @@ namespace amrex #ifdef AMREX_USE_OMP namespace amrex::OpenMP { - void init_threads () + std::array omp_locks; + + namespace { + unsigned int initialized = 0; + } + + void Initialize () { + if (initialized) { + ++initialized; + return; + } + amrex::ParmParse pp("amrex"); std::string omp_threads = "system"; pp.queryAdd("omp_threads", omp_threads); @@ -173,6 +184,25 @@ namespace amrex::OpenMP } } } + + for (auto& lck : omp_locks) { + omp_init_lock(&lck); + } + + ++initialized; } + + void Finalize () + { + if (initialized) { + --initialized; + if (initialized == 0) { + for (auto& lck : omp_locks) { + omp_destroy_lock(&lck); + } + } + } + } + } // namespace amrex::OpenMP #endif // AMREX_USE_OMP