Skip to content

Commit

Permalink
Use long integer in GPU kernels
Browse files Browse the repository at this point in the history
This can avoid integer overflow when box size is very big (e.g., more than
2^30 cells).
  • Loading branch information
WeiqunZhang committed Feb 2, 2024
1 parent 296ed40 commit 80cdc06
Show file tree
Hide file tree
Showing 7 changed files with 475 additions and 416 deletions.
21 changes: 3 additions & 18 deletions Src/AmrCore/AMReX_TagBox.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -518,10 +518,7 @@ TagBoxArray::local_collate_gpu (Gpu::PinnedVector<IntVect>& v) const
}
if (count > 0) {
Box const& bx = fai.fabbox();
const auto lo = amrex::lbound(bx);
const auto len = amrex::length(bx);
const auto lenxy = len.x*len.y;
const auto lenx = len.x;
SmallBoxIndexer indexer(bx);
const int ncells = bx.numPts();
const char* tags = (*this)[fai].dataPtr();
#ifdef AMREX_USE_SYCL
Expand All @@ -543,13 +540,7 @@ TagBoxArray::local_collate_gpu (Gpu::PinnedVector<IntVect>& v) const
sycl::access::address_space::local_space>
(shared_counter, 1u);
IntVect* p = dp_tags + dp_tags_offset[iblock_begin+bid];
int k = icell / lenxy;
int j = (icell - k*lenxy) / lenx;
int i = (icell - k*lenxy) - j*lenx;
i += lo.x;
j += lo.y;
k += lo.z;
p[itag] = IntVect(AMREX_D_DECL(i,j,k));
p[itag] = indexer.intVect(icell);
}
});
#else
Expand All @@ -570,13 +561,7 @@ TagBoxArray::local_collate_gpu (Gpu::PinnedVector<IntVect>& v) const
if (icell < ncells && tags[icell] != TagBox::CLEAR) {
unsigned int itag = Gpu::Atomic::Add(shared_counter, 1u);
IntVect* p = dp_tags + dp_tags_offset[iblock_begin+bid];
int k = icell / lenxy;
int j = (icell - k*lenxy) / lenx;
int i = (icell - k*lenxy) - j*lenx;
i += lo.x;
j += lo.y;
k += lo.z;
p[itag] = IntVect(AMREX_D_DECL(i,j,k));
p[itag] = indexer.intVect(icell);
}
});
#endif
Expand Down
27 changes: 7 additions & 20 deletions Src/Base/AMReX_BaseFabUtility.H
Original file line number Diff line number Diff line change
Expand Up @@ -36,32 +36,24 @@ void fill (BaseFab<STRUCT>& aos_fab, F && f)
"amrex::fill: sizeof(STRUCT) != sizeof(T)*STRUCTSIZE");
#ifdef AMREX_USE_GPU
if (Gpu::inLaunchRegion()) {
const auto lo = amrex::lbound(box);
const auto len = amrex::length(box);
const auto lenxy = len.x*len.y;
const auto lenx = len.x;
int ntotcells = box.numPts();
BoxIndexer indexer(box);
const auto ntotcells = std::uint64_t(box.numPts());
int nthreads_per_block = (STRUCTSIZE <= 8) ? 256 : 128;
int nblocks = (ntotcells+nthreads_per_block-1)/nthreads_per_block;
auto nblocks = int((ntotcells+nthreads_per_block-1)/nthreads_per_block);
std::size_t shared_mem_bytes = nthreads_per_block * sizeof(STRUCT);
T* p = (T*)aos_fab.dataPtr();
#ifdef AMREX_USE_SYCL
amrex::launch(nblocks, nthreads_per_block, shared_mem_bytes, Gpu::gpuStream(),
[=] AMREX_GPU_DEVICE (Gpu::Handler const& handler) noexcept
{
int icell = handler.globalIdx();
auto const icell = std::uint64_t(handler.globalIdx());
unsigned int blockDimx = handler.blockDim();
unsigned int threadIdxx = handler.threadIdx();
unsigned int blockIdxx = handler.blockIdx();
auto const shared = (T*)handler.sharedMemory();
if (icell < ntotcells) {
auto ga = new(shared+threadIdxx*STRUCTSIZE) STRUCT;
int k = icell / lenxy;
int j = (icell - k*lenxy) / lenx;
int i = (icell - k*lenxy) - j*lenx;
i += lo.x;
j += lo.y;
k += lo.z;
auto [i, j, k] = indexer(icell);
f(*ga, i, j, k);
}
handler.sharedBarrier();
Expand All @@ -75,17 +67,12 @@ void fill (BaseFab<STRUCT>& aos_fab, F && f)
amrex::launch(nblocks, nthreads_per_block, shared_mem_bytes, Gpu::gpuStream(),
[=] AMREX_GPU_DEVICE () noexcept
{
int icell = blockDim.x*blockIdx.x+threadIdx.x;
auto const icell = std::uint64_t(blockDim.x*blockIdx.x+threadIdx.x);
Gpu::SharedMemory<T> gsm;
T* const shared = gsm.dataPtr();
if (icell < ntotcells) {
auto ga = new(shared+threadIdx.x*STRUCTSIZE) STRUCT;
int k = icell / lenxy;
int j = (icell - k*lenxy) / lenx;
int i = (icell - k*lenxy) - j*lenx;
i += lo.x;
j += lo.y;
k += lo.z;
auto [i, j, k] = indexer(icell);
f(*ga, i, j, k);
}
__syncthreads();
Expand Down
139 changes: 139 additions & 0 deletions Src/Base/AMReX_Box.H
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
#include <AMReX_Vector.H>
#include <AMReX_GpuQualifiers.H>
#include <AMReX_GpuControl.H>
#include <AMReX_Math.H>

#include <iosfwd>

Expand Down Expand Up @@ -1835,6 +1836,144 @@ Box makeSingleCellBox (int i, int j, int k, IndexType typ = IndexType::TheCellTy
return Box(IntVect(AMREX_D_DECL(i,j,k)),IntVect(AMREX_D_DECL(i,j,k)),typ);
}

struct BoxIndexer
{
#if (AMREX_SPACEDIM == 3)
Math::FastDivmodU64 fdxy;
Math::FastDivmodU64 fdx;
IntVect lo;

BoxIndexer (Box const& box)
: fdxy(std::uint64_t(box.length(0))*std::uint64_t(box.length(1))),
fdx (std::uint64_t(box.length(0))),
lo (box.smallEnd())
{}

[[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
Dim3 operator() (std::uint64_t icell) const
{
std::uint64_t x, y, z, rem;
fdxy(z, rem, icell);
fdx(y, x, rem);
return {int(x)+lo[0], int(y)+lo[1], int(z+lo[2])};
}

[[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
IntVect intVect (std::uint64_t icell) const
{
std::uint64_t x, y, z, rem;
fdxy(z, rem, icell);
fdx(y, x, rem);
return {int(x)+lo[0], int(y)+lo[1], int(z+lo[2])};
}

#elif (AMREX_SPACEDIM == 2)

Math::FastDivmodU64 fdx;
IntVect lo;

BoxIndexer (Box const& box)
: fdx (std::uint64_t(box.length(0))),
lo (box.smallEnd())
{}

[[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
Dim3 operator() (std::uint64_t icell) const
{
std::uint64_t x, y;
fdx(y, x, icell);
return {int(x)+lo[0], int(y)+lo[1], 0};
}

[[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
IntVect intVect (std::uint64_t icell) const
{
std::uint64_t x, y;
fdx(y, x, icell);
return {int(x)+lo[0], int(y)+lo[1]};
}

#elif (AMREX_SPACEDIM == 1)

int lo;

BoxIndexer (Box const& box)
: lo(box.smallEnd(0))
{}

[[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
Dim3 operator() (std::uint64_t icell) const
{
return {int(icell)+lo, 0, 0};
}

[[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
IntVect intVect (std::uint64_t icell) const
{
return IntVect{int(icell)+lo};
}

#endif
};

struct SmallBoxIndexer
{
IntVect lo;
#if (AMREX_SPACEDIM == 2)
int lenx;
#elif (AMREX_SPACEDIM == 3)
int lenx, lenxy;
#endif

SmallBoxIndexer (Box const& box)
: lo(box.smallEnd())
#if (AMREX_SPACEDIM == 2)
, lenx(box.length(0))
#elif (AMREX_SPACEDIM == 3)
, lenx(box.length(0)), lenxy(box.length(0)*box.length(1))
#endif
{
// We often use grid stride loop, thus the limit is only half of INT_MAX.
AMREX_ASSERT(box.numPts()*2 < Long(std::numeric_limits<int>::max()));
}

[[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
Dim3 operator() (int icell) const
{
// For simplicity, we are not doing fast divide.
#if (AMREX_SPACEDIM == 1)
return {icell + lo[0], 0, 0};
#elif (AMREX_SPACEDIM == 2)
int j = icell / lenx;
int i = icell - j*lenx;
return {i+lo[0], j+lo[1], 0};
#else
int k = icell / lenxy;
int j = (icell - k*lenxy) / lenx;
int i = (icell - k*lenxy) - j*lenx;
return {i+lo[0], j+lo[1], k+lo[2]};
#endif
}

[[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
IntVect intVect (int icell) const
{
// For simplicity, we are not doing fast divide.
#if (AMREX_SPACEDIM == 1)
return IntVect{icell + lo[0]};
#elif (AMREX_SPACEDIM == 2)
int j = icell / lenx;
int i = icell - j*lenx;
return {i+lo[0], j+lo[1]};
#else
int k = icell / lenxy;
int j = (icell - k*lenxy) / lenx;
int i = (icell - k*lenxy) - j*lenx;
return {i+lo[0], j+lo[1], k+lo[2]};
#endif
}
};

}

#endif /*AMREX_BOX_H*/
1 change: 1 addition & 0 deletions Src/Base/AMReX_GpuLaunch.H
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
#include <AMReX_GpuLaunchGlobal.H>
#include <AMReX_RandomEngine.H>
#include <AMReX_Algorithm.H>
#include <AMReX_Math.H>
#include <cstddef>
#include <limits>
#include <algorithm>
Expand Down
Loading

0 comments on commit 80cdc06

Please sign in to comment.