Skip to content

Commit

Permalink
BaseFab::lockAdd: Faster version of BaseFab::atomicAdd for OpenMP (AM…
Browse files Browse the repository at this point in the history
…ReX-Codes#3696)

For WarpX's Gordon Bell runs on Fugaku, Stephan Jaure of ATOS optimized
atomicAdd using pthread spin locks. This commit implements Stephan's
approach using OpenMP.

In AMReX::Initialize, we create a number of OMP locks. When
BaseFab::lockAdd is called, we loop over planes in z-direction and try
to acquire a lock with omp_test_lock. If it's successful, we can access
the data in that z-plane without worrying about race conditions. This
allows us to use simd instructions instead of omp atomic adds. If it's
not successful, we will try a different z-plane. The process stops till
all planes are processed.
  • Loading branch information
WeiqunZhang authored Jan 12, 2024
1 parent d440010 commit 255d30f
Show file tree
Hide file tree
Showing 4 changed files with 150 additions and 8 deletions.
6 changes: 5 additions & 1 deletion Src/Base/AMReX.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -462,7 +462,7 @@ amrex::Initialize (int& argc, char**& argv, bool build_parm_parse,
#endif

#ifdef AMREX_USE_OMP
amrex::OpenMP::init_threads();
amrex::OpenMP::Initialize();

// status output
if (system::verbose > 0) {
Expand Down Expand Up @@ -817,6 +817,10 @@ amrex::Finalize (amrex::AMReX* pamrex)
Gpu::Device::Finalize();
#endif

#ifdef AMREX_USE_OMP
amrex::OpenMP::Finalize();
#endif

#if defined(AMREX_USE_UPCXX)
upcxx::finalize();
#endif
Expand Down
111 changes: 107 additions & 4 deletions Src/Base/AMReX_BaseFab.H
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,8 @@
#include <AMReX_Reduce.H>
#include <AMReX_Gpu.H>
#include <AMReX_Math.H>

#ifdef AMREX_USE_OMP
#include <omp.h>
#endif
#include <AMReX_OpenMP.H>
#include <AMReX_MemPool.H>

#include <cmath>
#include <cstdlib>
Expand Down Expand Up @@ -979,6 +977,19 @@ public:
BaseFab<T>& atomicAdd (const BaseFab<T>& src, const Box& srcbox, const Box& destbox,
int srccomp, int destcomp, int numcomp=1) noexcept;

/**
* \brief Atomically add srcbox region of src FAB to destbox region of this FAB.
* The srcbox and destbox must be same size. When OMP is on, this uses OMP locks
* in the implementation and it's usually faster than atomicAdd.
*/
#if defined(AMREX_USE_GPU)
template <RunOn run_on>
#else
template <RunOn run_on=RunOn::Host>
#endif
BaseFab<T>& lockAdd (const BaseFab<T>& src, const Box& srcbox, const Box& destbox,
int srccomp, int destcomp, int numcomp) noexcept;

//! FAB SAXPY (y[i] <- y[i] + a * x[i]), in place.
#if defined(AMREX_USE_GPU)
template <RunOn run_on>
Expand Down Expand Up @@ -3300,6 +3311,98 @@ BaseFab<T>::atomicAdd (const BaseFab<T>& src, const Box& srcbox, const Box& dest
return *this;
}

template <class T>
template <RunOn run_on>
BaseFab<T>&
BaseFab<T>::lockAdd (const BaseFab<T>& src, const Box& srcbox, const Box& destbox,
int srccomp, int destcomp, int numcomp) noexcept
{
#if defined(AMREX_USE_OMP) && (AMREX_SPACEDIM > 1)
#if defined(AMREX_USE_GPU)
if (run_on == RunOn::Host || Gpu::notInLaunchRegion()) {
#endif
BL_ASSERT(destbox.ok());
BL_ASSERT(src.box().contains(srcbox));
BL_ASSERT(box().contains(destbox));
BL_ASSERT(destbox.sameSize(srcbox));
BL_ASSERT(srccomp >= 0 && srccomp+numcomp <= src.nComp());
BL_ASSERT(destcomp >= 0 && destcomp+numcomp <= nComp());

Array4<T> const& d = this->array();
Array4<T const> const& s = src.const_array();
auto const& dlo = destbox.smallEnd();
#if (AMREX_SPACEDIM == 3)
auto const& dhi = destbox.bigEnd();
#endif
auto const& slo = srcbox.smallEnd();
auto const offset = slo - dlo;
auto const lenx = srcbox.length(0);

auto const nplanes = srcbox.length(AMREX_SPACEDIM-1);
auto* mask = (bool*) amrex_mempool_alloc(sizeof(bool)*nplanes);
for (int ip = 0; ip < nplanes; ++ip) {
mask[ip] = false;
}

int mm = 0;
int planes_left = nplanes;
while (planes_left > 0) {
AMREX_ASSERT(mm < nplanes);
auto const m = mm + dlo[AMREX_SPACEDIM-1];
int ilock = m % OpenMP::nlocks;
if (ilock < 0) { ilock += OpenMP::nlocks; }
auto* lock = &(OpenMP::omp_locks[ilock]);
if (omp_test_lock(lock))
{
for (int n = 0; n < numcomp; ++n)
{
#if (AMREX_SPACEDIM == 3)
for (int j = dlo[1]; j <= dhi[1]; ++j)
{
IntVect div(dlo[0], j, m);
#elif (AMREX_SPACEDIM == 2)
{
IntVect div(dlo[0], m);
#endif
auto * pdst = d.ptr(div ,n+destcomp);
auto const* psrc = s.ptr(div+offset,n+srccomp);
#pragma omp simd
for (int ii = 0; ii < lenx; ++ii) {
pdst[ii] += psrc[ii];
}
}
}

mask[mm] = true;
--planes_left;
omp_unset_lock(lock);
if (planes_left == 0) { break; }
}

++mm;
for (int ip = 0; ip < nplanes; ++ip) {
int new_mm = (mm+ip) % nplanes;
if ( ! mask[new_mm] ) {
mm = new_mm;
break;
}
}
}

amrex_mempool_free(mask);

return *this;

#if defined(AMREX_USE_GPU)
} else {
return this->template atomicAdd<run_on>(src, srcbox, destbox, srccomp, destcomp, numcomp);
}
#endif
#else
return this->template atomicAdd<run_on>(src, srcbox, destbox, srccomp, destcomp, numcomp);
#endif
}

template <class T>
template <RunOn run_on>
BaseFab<T>&
Expand Down
9 changes: 7 additions & 2 deletions Src/Base/AMReX_OpenMP.H
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,9 @@
#include <AMReX_Config.H>

#ifdef AMREX_USE_OMP
#include <AMReX_Extension.H>
#include <omp.h>
#include <array>

namespace amrex::OpenMP {

Expand All @@ -13,7 +15,11 @@ namespace amrex::OpenMP {
inline int in_parallel () { return omp_in_parallel(); }
inline void set_num_threads (int num) { omp_set_num_threads(num); }

void init_threads ();
void Initialize ();
void Finalize ();

static constexpr int nlocks = 128;
extern AMREX_EXPORT std::array<omp_lock_t,nlocks> omp_locks;
}

#else // AMREX_USE_OMP
Expand All @@ -25,7 +31,6 @@ namespace amrex::OpenMP {
constexpr int get_thread_num () { return 0; }
constexpr int in_parallel () { return false; }
constexpr void set_num_threads (int) { /* nothing */ }
constexpr void init_threads () { /* nothing */ }
}

#endif // AMREX_USE_OMP
Expand Down
32 changes: 31 additions & 1 deletion Src/Base/AMReX_OpenMP.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -135,8 +135,19 @@ namespace amrex
#ifdef AMREX_USE_OMP
namespace amrex::OpenMP
{
void init_threads ()
std::array<omp_lock_t,nlocks> omp_locks;

namespace {
unsigned int initialized = 0;
}

void Initialize ()
{
if (initialized) {
++initialized;
return;
}

amrex::ParmParse pp("amrex");
std::string omp_threads = "system";
pp.queryAdd("omp_threads", omp_threads);
Expand Down Expand Up @@ -173,6 +184,25 @@ namespace amrex::OpenMP
}
}
}

for (auto& lck : omp_locks) {
omp_init_lock(&lck);
}

++initialized;
}

void Finalize ()
{
if (initialized) {
--initialized;
if (initialized == 0) {
for (auto& lck : omp_locks) {
omp_destroy_lock(&lck);
}
}
}
}

} // namespace amrex::OpenMP
#endif // AMREX_USE_OMP

0 comments on commit 255d30f

Please sign in to comment.