Use long integer in GPU kernels

This can avoid integer overflow when box size is very big (e.g., more than 2^30 cells).
WeiqunZhang · Feb 3, 2024 · d4bb76a · d4bb76a
1 parent 296ed40
commit d4bb76a
Show file tree

Hide file tree

Showing 6 changed files with 420 additions and 402 deletions.
diff --git a/Src/Base/AMReX_BaseFabUtility.H b/Src/Base/AMReX_BaseFabUtility.H
@@ -36,32 +36,26 @@ void fill (BaseFab<STRUCT>& aos_fab, F && f)
                   "amrex::fill: sizeof(STRUCT) != sizeof(T)*STRUCTSIZE");
 #ifdef AMREX_USE_GPU
     if (Gpu::inLaunchRegion()) {
-        const auto lo  = amrex::lbound(box);
-        const auto len = amrex::length(box);
-        const auto lenxy = len.x*len.y;
-        const auto lenx = len.x;
-        int ntotcells = box.numPts();
+        BoxIndexer indexer(box);
+        const auto ntotcells = std::uint64_t(box.numPts());
         int nthreads_per_block = (STRUCTSIZE <= 8) ? 256 : 128;
-        int nblocks = (ntotcells+nthreads_per_block-1)/nthreads_per_block;
+        std::uint64_t nblocks_long = (ntotcells+nthreads_per_block-1)/nthreads_per_block;
+        AMREX_ASSERT(nblocks_long <= std::uint64_t(std::numeric_limits<int>::max()));
+        auto nblocks = int(nblocks_long);
         std::size_t shared_mem_bytes = nthreads_per_block * sizeof(STRUCT);
         T* p = (T*)aos_fab.dataPtr();
 #ifdef AMREX_USE_SYCL
         amrex::launch(nblocks, nthreads_per_block, shared_mem_bytes, Gpu::gpuStream(),
         [=] AMREX_GPU_DEVICE (Gpu::Handler const& handler) noexcept
         {
-            int icell = handler.globalIdx();
+            auto const icell = std::uint64_t(handler.globalIdx());
             unsigned int blockDimx = handler.blockDim();
             unsigned int threadIdxx = handler.threadIdx();
             unsigned int blockIdxx = handler.blockIdx();
             auto const shared = (T*)handler.sharedMemory();
             if (icell < ntotcells) {
                 auto ga = new(shared+threadIdxx*STRUCTSIZE) STRUCT;
-                int k =  icell /   lenxy;
-                int j = (icell - k*lenxy) /   lenx;
-                int i = (icell - k*lenxy) - j*lenx;
-                i += lo.x;
-                j += lo.y;
-                k += lo.z;
+                auto [i, j, k] = indexer(icell);
                 f(*ga, i, j, k);
             }
             handler.sharedBarrier();
@@ -75,17 +69,12 @@ void fill (BaseFab<STRUCT>& aos_fab, F && f)
         amrex::launch(nblocks, nthreads_per_block, shared_mem_bytes, Gpu::gpuStream(),
         [=] AMREX_GPU_DEVICE () noexcept
         {
-            int icell = blockDim.x*blockIdx.x+threadIdx.x;
+            std::uint64_t const icell = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x;
             Gpu::SharedMemory<T> gsm;
             T* const shared = gsm.dataPtr();
             if (icell < ntotcells) {
                 auto ga = new(shared+threadIdx.x*STRUCTSIZE) STRUCT;
-                int k =  icell /   lenxy;
-                int j = (icell - k*lenxy) /   lenx;
-                int i = (icell - k*lenxy) - j*lenx;
-                i += lo.x;
-                j += lo.y;
-                k += lo.z;
+                auto [i, j, k] = indexer(icell);
                 f(*ga, i, j, k);
             }
             __syncthreads();

diff --git a/Src/Base/AMReX_Box.H b/Src/Base/AMReX_Box.H
@@ -15,6 +15,7 @@
 #include <AMReX_Vector.H>
 #include <AMReX_GpuQualifiers.H>
 #include <AMReX_GpuControl.H>
+#include <AMReX_Math.H>
 
 #include <iosfwd>
 
@@ -1835,6 +1836,86 @@ Box makeSingleCellBox (int i, int j, int k, IndexType typ = IndexType::TheCellTy
     return Box(IntVect(AMREX_D_DECL(i,j,k)),IntVect(AMREX_D_DECL(i,j,k)),typ);
 }
 
+struct BoxIndexer
+{
+#if (AMREX_SPACEDIM == 3)
+    Math::FastDivmodU64 fdxy;
+    Math::FastDivmodU64 fdx;
+    IntVect lo;
+
+    BoxIndexer (Box const& box)
+        : fdxy(std::uint64_t(box.length(0))*std::uint64_t(box.length(1))),
+          fdx (std::uint64_t(box.length(0))),
+          lo  (box.smallEnd())
+        {}
+
+    [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+    Dim3 operator() (std::uint64_t icell) const
+    {
+        std::uint64_t x, y, z, rem;
+        fdxy(z, rem, icell);
+        fdx(y, x, rem);
+        return {int(x)+lo[0], int(y)+lo[1], int(z)+lo[2]};
+    }
+
+    [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+    IntVect intVect (std::uint64_t icell) const
+    {
+        std::uint64_t x, y, z, rem;
+        fdxy(z, rem, icell);
+        fdx(y, x, rem);
+        return {int(x)+lo[0], int(y)+lo[1], int(z)+lo[2]};
+    }
+
+#elif (AMREX_SPACEDIM == 2)
+
+    Math::FastDivmodU64 fdx;
+    IntVect lo;
+
+    BoxIndexer (Box const& box)
+        : fdx (std::uint64_t(box.length(0))),
+          lo  (box.smallEnd())
+        {}
+
+    [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+    Dim3 operator() (std::uint64_t icell) const
+    {
+        std::uint64_t x, y;
+        fdx(y, x, icell);
+        return {int(x)+lo[0], int(y)+lo[1], 0};
+    }
+
+    [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+    IntVect intVect (std::uint64_t icell) const
+    {
+        std::uint64_t x, y;
+        fdx(y, x, icell);
+        return {int(x)+lo[0], int(y)+lo[1]};
+    }
+
+#elif (AMREX_SPACEDIM == 1)
+
+    int lo;
+
+    BoxIndexer (Box const& box)
+        : lo(box.smallEnd(0))
+        {}
+
+    [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+    Dim3 operator() (std::uint64_t icell) const
+    {
+        return {int(icell)+lo, 0, 0};
+    }
+
+    [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+    IntVect intVect (std::uint64_t icell) const
+    {
+        return IntVect{int(icell)+lo};
+    }
+
+#endif
+};
+
 }
 
 #endif /*AMREX_BOX_H*/
diff --git a/Src/Base/AMReX_GpuLaunch.H b/Src/Base/AMReX_GpuLaunch.H
@@ -20,6 +20,7 @@
 #include <AMReX_GpuLaunchGlobal.H>
 #include <AMReX_RandomEngine.H>
 #include <AMReX_Algorithm.H>
+#include <AMReX_Math.H>
 #include <cstddef>
 #include <limits>
 #include <algorithm>