Skip to content

Commit

Permalink
Use long integer in GPU kernels
Browse files Browse the repository at this point in the history
This can avoid integer overflow when box size is very big (e.g., more than
2^30 cells).
  • Loading branch information
WeiqunZhang committed Feb 3, 2024
1 parent 296ed40 commit d4bb76a
Show file tree
Hide file tree
Showing 6 changed files with 420 additions and 402 deletions.
29 changes: 9 additions & 20 deletions Src/Base/AMReX_BaseFabUtility.H
Original file line number Diff line number Diff line change
Expand Up @@ -36,32 +36,26 @@ void fill (BaseFab<STRUCT>& aos_fab, F && f)
"amrex::fill: sizeof(STRUCT) != sizeof(T)*STRUCTSIZE");
#ifdef AMREX_USE_GPU
if (Gpu::inLaunchRegion()) {
const auto lo = amrex::lbound(box);
const auto len = amrex::length(box);
const auto lenxy = len.x*len.y;
const auto lenx = len.x;
int ntotcells = box.numPts();
BoxIndexer indexer(box);
const auto ntotcells = std::uint64_t(box.numPts());
int nthreads_per_block = (STRUCTSIZE <= 8) ? 256 : 128;
int nblocks = (ntotcells+nthreads_per_block-1)/nthreads_per_block;
std::uint64_t nblocks_long = (ntotcells+nthreads_per_block-1)/nthreads_per_block;
AMREX_ASSERT(nblocks_long <= std::uint64_t(std::numeric_limits<int>::max()));
auto nblocks = int(nblocks_long);
std::size_t shared_mem_bytes = nthreads_per_block * sizeof(STRUCT);
T* p = (T*)aos_fab.dataPtr();
#ifdef AMREX_USE_SYCL
amrex::launch(nblocks, nthreads_per_block, shared_mem_bytes, Gpu::gpuStream(),
[=] AMREX_GPU_DEVICE (Gpu::Handler const& handler) noexcept
{
int icell = handler.globalIdx();
auto const icell = std::uint64_t(handler.globalIdx());
unsigned int blockDimx = handler.blockDim();
unsigned int threadIdxx = handler.threadIdx();
unsigned int blockIdxx = handler.blockIdx();
auto const shared = (T*)handler.sharedMemory();
if (icell < ntotcells) {
auto ga = new(shared+threadIdxx*STRUCTSIZE) STRUCT;
int k = icell / lenxy;
int j = (icell - k*lenxy) / lenx;
int i = (icell - k*lenxy) - j*lenx;
i += lo.x;
j += lo.y;
k += lo.z;
auto [i, j, k] = indexer(icell);
f(*ga, i, j, k);
}
handler.sharedBarrier();
Expand All @@ -75,17 +69,12 @@ void fill (BaseFab<STRUCT>& aos_fab, F && f)
amrex::launch(nblocks, nthreads_per_block, shared_mem_bytes, Gpu::gpuStream(),
[=] AMREX_GPU_DEVICE () noexcept
{
int icell = blockDim.x*blockIdx.x+threadIdx.x;
std::uint64_t const icell = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x;
Gpu::SharedMemory<T> gsm;
T* const shared = gsm.dataPtr();
if (icell < ntotcells) {
auto ga = new(shared+threadIdx.x*STRUCTSIZE) STRUCT;
int k = icell / lenxy;
int j = (icell - k*lenxy) / lenx;
int i = (icell - k*lenxy) - j*lenx;
i += lo.x;
j += lo.y;
k += lo.z;
auto [i, j, k] = indexer(icell);
f(*ga, i, j, k);
}
__syncthreads();
Expand Down
81 changes: 81 additions & 0 deletions Src/Base/AMReX_Box.H
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
#include <AMReX_Vector.H>
#include <AMReX_GpuQualifiers.H>
#include <AMReX_GpuControl.H>
#include <AMReX_Math.H>

#include <iosfwd>

Expand Down Expand Up @@ -1835,6 +1836,86 @@ Box makeSingleCellBox (int i, int j, int k, IndexType typ = IndexType::TheCellTy
return Box(IntVect(AMREX_D_DECL(i,j,k)),IntVect(AMREX_D_DECL(i,j,k)),typ);
}

struct BoxIndexer
{
#if (AMREX_SPACEDIM == 3)
Math::FastDivmodU64 fdxy;
Math::FastDivmodU64 fdx;
IntVect lo;

BoxIndexer (Box const& box)
: fdxy(std::uint64_t(box.length(0))*std::uint64_t(box.length(1))),
fdx (std::uint64_t(box.length(0))),
lo (box.smallEnd())
{}

[[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
Dim3 operator() (std::uint64_t icell) const
{
std::uint64_t x, y, z, rem;
fdxy(z, rem, icell);
fdx(y, x, rem);
return {int(x)+lo[0], int(y)+lo[1], int(z)+lo[2]};
}

[[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
IntVect intVect (std::uint64_t icell) const
{
std::uint64_t x, y, z, rem;
fdxy(z, rem, icell);
fdx(y, x, rem);
return {int(x)+lo[0], int(y)+lo[1], int(z)+lo[2]};
}

#elif (AMREX_SPACEDIM == 2)

Math::FastDivmodU64 fdx;
IntVect lo;

BoxIndexer (Box const& box)
: fdx (std::uint64_t(box.length(0))),
lo (box.smallEnd())
{}

[[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
Dim3 operator() (std::uint64_t icell) const
{
std::uint64_t x, y;
fdx(y, x, icell);
return {int(x)+lo[0], int(y)+lo[1], 0};
}

[[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
IntVect intVect (std::uint64_t icell) const
{
std::uint64_t x, y;
fdx(y, x, icell);
return {int(x)+lo[0], int(y)+lo[1]};
}

#elif (AMREX_SPACEDIM == 1)

int lo;

BoxIndexer (Box const& box)
: lo(box.smallEnd(0))
{}

[[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
Dim3 operator() (std::uint64_t icell) const
{
return {int(icell)+lo, 0, 0};
}

[[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
IntVect intVect (std::uint64_t icell) const
{
return IntVect{int(icell)+lo};
}

#endif
};

}

#endif /*AMREX_BOX_H*/
1 change: 1 addition & 0 deletions Src/Base/AMReX_GpuLaunch.H
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
#include <AMReX_GpuLaunchGlobal.H>
#include <AMReX_RandomEngine.H>
#include <AMReX_Algorithm.H>
#include <AMReX_Math.H>
#include <cstddef>
#include <limits>
#include <algorithm>
Expand Down
Loading

0 comments on commit d4bb76a

Please sign in to comment.