Use long integer in GPU kernels

This can avoid integer overflow when box size is very big (e.g., more than 2^30 cells).
WeiqunZhang · Feb 2, 2024 · 80cdc06 · 80cdc06
1 parent 296ed40
commit 80cdc06
Show file tree

Hide file tree

Showing 7 changed files with 475 additions and 416 deletions.
diff --git a/Src/AmrCore/AMReX_TagBox.cpp b/Src/AmrCore/AMReX_TagBox.cpp
@@ -518,10 +518,7 @@ TagBoxArray::local_collate_gpu (Gpu::PinnedVector<IntVect>& v) const
         }
         if (count > 0) {
             Box const& bx = fai.fabbox();
-            const auto lo  = amrex::lbound(bx);
-            const auto len = amrex::length(bx);
-            const auto lenxy = len.x*len.y;
-            const auto lenx  = len.x;
+            SmallBoxIndexer indexer(bx);
             const int ncells = bx.numPts();
             const char* tags = (*this)[fai].dataPtr();
 #ifdef AMREX_USE_SYCL
@@ -543,13 +540,7 @@ TagBoxArray::local_collate_gpu (Gpu::PinnedVector<IntVect>& v) const
                                                          sycl::access::address_space::local_space>
                         (shared_counter, 1u);
                     IntVect* p = dp_tags + dp_tags_offset[iblock_begin+bid];
-                    int k =  icell /   lenxy;
-                    int j = (icell - k*lenxy) /   lenx;
-                    int i = (icell - k*lenxy) - j*lenx;
-                    i += lo.x;
-                    j += lo.y;
-                    k += lo.z;
-                    p[itag] = IntVect(AMREX_D_DECL(i,j,k));
+                    p[itag] = indexer.intVect(icell);
                 }
             });
 #else
@@ -570,13 +561,7 @@ TagBoxArray::local_collate_gpu (Gpu::PinnedVector<IntVect>& v) const
                 if (icell < ncells && tags[icell] != TagBox::CLEAR) {
                     unsigned int itag = Gpu::Atomic::Add(shared_counter, 1u);
                     IntVect* p = dp_tags + dp_tags_offset[iblock_begin+bid];
-                    int k =  icell /   lenxy;
-                    int j = (icell - k*lenxy) /   lenx;
-                    int i = (icell - k*lenxy) - j*lenx;
-                    i += lo.x;
-                    j += lo.y;
-                    k += lo.z;
-                    p[itag] = IntVect(AMREX_D_DECL(i,j,k));
+                    p[itag] = indexer.intVect(icell);
                 }
             });
 #endif

diff --git a/Src/Base/AMReX_BaseFabUtility.H b/Src/Base/AMReX_BaseFabUtility.H
@@ -36,32 +36,24 @@ void fill (BaseFab<STRUCT>& aos_fab, F && f)
                   "amrex::fill: sizeof(STRUCT) != sizeof(T)*STRUCTSIZE");
 #ifdef AMREX_USE_GPU
     if (Gpu::inLaunchRegion()) {
-        const auto lo  = amrex::lbound(box);
-        const auto len = amrex::length(box);
-        const auto lenxy = len.x*len.y;
-        const auto lenx = len.x;
-        int ntotcells = box.numPts();
+        BoxIndexer indexer(box);
+        const auto ntotcells = std::uint64_t(box.numPts());
         int nthreads_per_block = (STRUCTSIZE <= 8) ? 256 : 128;
-        int nblocks = (ntotcells+nthreads_per_block-1)/nthreads_per_block;
+        auto nblocks = int((ntotcells+nthreads_per_block-1)/nthreads_per_block);
         std::size_t shared_mem_bytes = nthreads_per_block * sizeof(STRUCT);
         T* p = (T*)aos_fab.dataPtr();
 #ifdef AMREX_USE_SYCL
         amrex::launch(nblocks, nthreads_per_block, shared_mem_bytes, Gpu::gpuStream(),
         [=] AMREX_GPU_DEVICE (Gpu::Handler const& handler) noexcept
         {
-            int icell = handler.globalIdx();
+            auto const icell = std::uint64_t(handler.globalIdx());
             unsigned int blockDimx = handler.blockDim();
             unsigned int threadIdxx = handler.threadIdx();
             unsigned int blockIdxx = handler.blockIdx();
             auto const shared = (T*)handler.sharedMemory();
             if (icell < ntotcells) {
                 auto ga = new(shared+threadIdxx*STRUCTSIZE) STRUCT;
-                int k =  icell /   lenxy;
-                int j = (icell - k*lenxy) /   lenx;
-                int i = (icell - k*lenxy) - j*lenx;
-                i += lo.x;
-                j += lo.y;
-                k += lo.z;
+                auto [i, j, k] = indexer(icell);
                 f(*ga, i, j, k);
             }
             handler.sharedBarrier();
@@ -75,17 +67,12 @@ void fill (BaseFab<STRUCT>& aos_fab, F && f)
         amrex::launch(nblocks, nthreads_per_block, shared_mem_bytes, Gpu::gpuStream(),
         [=] AMREX_GPU_DEVICE () noexcept
         {
-            int icell = blockDim.x*blockIdx.x+threadIdx.x;
+            auto const icell = std::uint64_t(blockDim.x*blockIdx.x+threadIdx.x);
             Gpu::SharedMemory<T> gsm;
             T* const shared = gsm.dataPtr();
             if (icell < ntotcells) {
                 auto ga = new(shared+threadIdx.x*STRUCTSIZE) STRUCT;
-                int k =  icell /   lenxy;
-                int j = (icell - k*lenxy) /   lenx;
-                int i = (icell - k*lenxy) - j*lenx;
-                i += lo.x;
-                j += lo.y;
-                k += lo.z;
+                auto [i, j, k] = indexer(icell);
                 f(*ga, i, j, k);
             }
             __syncthreads();

diff --git a/Src/Base/AMReX_Box.H b/Src/Base/AMReX_Box.H
@@ -15,6 +15,7 @@
 #include <AMReX_Vector.H>
 #include <AMReX_GpuQualifiers.H>
 #include <AMReX_GpuControl.H>
+#include <AMReX_Math.H>
 
 #include <iosfwd>
 
@@ -1835,6 +1836,144 @@ Box makeSingleCellBox (int i, int j, int k, IndexType typ = IndexType::TheCellTy
     return Box(IntVect(AMREX_D_DECL(i,j,k)),IntVect(AMREX_D_DECL(i,j,k)),typ);
 }
 
+struct BoxIndexer
+{
+#if (AMREX_SPACEDIM == 3)
+    Math::FastDivmodU64 fdxy;
+    Math::FastDivmodU64 fdx;
+    IntVect lo;
+
+    BoxIndexer (Box const& box)
+        : fdxy(std::uint64_t(box.length(0))*std::uint64_t(box.length(1))),
+          fdx (std::uint64_t(box.length(0))),
+          lo  (box.smallEnd())
+        {}
+
+    [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+    Dim3 operator() (std::uint64_t icell) const
+    {
+        std::uint64_t x, y, z, rem;
+        fdxy(z, rem, icell);
+        fdx(y, x, rem);
+        return {int(x)+lo[0], int(y)+lo[1], int(z+lo[2])};
+    }
+
+    [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+    IntVect intVect (std::uint64_t icell) const
+    {
+        std::uint64_t x, y, z, rem;
+        fdxy(z, rem, icell);
+        fdx(y, x, rem);
+        return {int(x)+lo[0], int(y)+lo[1], int(z+lo[2])};
+    }
+
+#elif (AMREX_SPACEDIM == 2)
+
+    Math::FastDivmodU64 fdx;
+    IntVect lo;
+
+    BoxIndexer (Box const& box)
+        : fdx (std::uint64_t(box.length(0))),
+          lo  (box.smallEnd())
+        {}
+
+    [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+    Dim3 operator() (std::uint64_t icell) const
+    {
+        std::uint64_t x, y;
+        fdx(y, x, icell);
+        return {int(x)+lo[0], int(y)+lo[1], 0};
+    }
+
+    [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+    IntVect intVect (std::uint64_t icell) const
+    {
+        std::uint64_t x, y;
+        fdx(y, x, icell);
+        return {int(x)+lo[0], int(y)+lo[1]};
+    }
+
+#elif (AMREX_SPACEDIM == 1)
+
+    int lo;
+
+    BoxIndexer (Box const& box)
+        : lo(box.smallEnd(0))
+        {}
+
+    [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+    Dim3 operator() (std::uint64_t icell) const
+    {
+        return {int(icell)+lo, 0, 0};
+    }
+
+    [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+    IntVect intVect (std::uint64_t icell) const
+    {
+        return IntVect{int(icell)+lo};
+    }
+
+#endif
+};
+
+struct SmallBoxIndexer
+{
+    IntVect lo;
+#if (AMREX_SPACEDIM == 2)
+    int lenx;
+#elif (AMREX_SPACEDIM == 3)
+    int lenx, lenxy;
+#endif
+
+    SmallBoxIndexer (Box const& box)
+        : lo(box.smallEnd())
+#if (AMREX_SPACEDIM  == 2)
+          , lenx(box.length(0))
+#elif (AMREX_SPACEDIM  == 3)
+          , lenx(box.length(0)), lenxy(box.length(0)*box.length(1))
+#endif
+    {
+        // We often use grid stride loop, thus the limit is only half of INT_MAX.
+        AMREX_ASSERT(box.numPts()*2 < Long(std::numeric_limits<int>::max()));
+    }
+
+    [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+    Dim3 operator() (int icell) const
+    {
+        // For simplicity, we are not doing fast divide.
+#if (AMREX_SPACEDIM == 1)
+        return {icell + lo[0], 0, 0};
+#elif (AMREX_SPACEDIM == 2)
+        int j = icell /   lenx;
+        int i = icell - j*lenx;
+        return {i+lo[0], j+lo[1], 0};
+#else
+        int k =  icell /   lenxy;
+        int j = (icell - k*lenxy) /   lenx;
+        int i = (icell - k*lenxy) - j*lenx;
+        return {i+lo[0], j+lo[1], k+lo[2]};
+#endif
+    }
+
+    [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+    IntVect intVect (int icell) const
+    {
+        // For simplicity, we are not doing fast divide.
+#if (AMREX_SPACEDIM == 1)
+        return IntVect{icell + lo[0]};
+#elif (AMREX_SPACEDIM == 2)
+        int j = icell /   lenx;
+        int i = icell - j*lenx;
+        return {i+lo[0], j+lo[1]};
+#else
+        int k =  icell /   lenxy;
+        int j = (icell - k*lenxy) /   lenx;
+        int i = (icell - k*lenxy) - j*lenx;
+        return {i+lo[0], j+lo[1], k+lo[2]};
+#endif
+    }
+};
+
 }
 
 #endif /*AMREX_BOX_H*/
diff --git a/Src/Base/AMReX_GpuLaunch.H b/Src/Base/AMReX_GpuLaunch.H
@@ -20,6 +20,7 @@
 #include <AMReX_GpuLaunchGlobal.H>
 #include <AMReX_RandomEngine.H>
 #include <AMReX_Algorithm.H>
+#include <AMReX_Math.H>
 #include <cstddef>
 #include <limits>
 #include <algorithm>