Loop Blocking for fn GPU Backend (#1787)

Implements loop blocking for the GPU fn backend. Thread block size (that is, CUDA/HIP threads per block) and loop block size (that is, loop iterations per CUDA/HIP thread) can now be specified as template parameters. Further changes: - Set `__launch_bounds__` in the fn GPU kernel based on the thread block size. - Activate vertical loop blocking in the fn nabla kernels on newer CUDA versions that support `GT_PROMISE`. Performance changes: - `__launch_bounds__` affects performance of the `fn_cartesian_vertical_advection` benchmark significantly (positively or negatively, depending on domain size). - Performance of fn nabla benchmarks improves significantly on newer CUDA versions. - Performance on Daint is currently reduced due to too old CUDA version.
GridTools · Oct 29, 2024 · 32daaa5 · 32daaa5
1 parent 805897d
commit 32daaa5
Show file tree

Hide file tree

Showing 13 changed files with 87,166 additions and 86,975 deletions.
diff --git a/include/gridtools/fn/backend/common.hpp b/include/gridtools/fn/backend/common.hpp
@@ -31,10 +31,29 @@ namespace gridtools::fn::backend {
                 meta::rename<tuple, Dims>());
         }
 
+        template <class Dims, class Sizes, class UnrollFactors>
+        constexpr GT_FUNCTION auto make_unrolled_loops(Sizes const &sizes, UnrollFactors) {
+            return tuple_util::host_device::fold(
+                [&](auto outer, auto dim) {
+                    using unroll_factor = element_at<decltype(dim), UnrollFactors>;
+                    return [outer = std::move(outer),
+                               inner = sid::make_unrolled_loop<decltype(dim), unroll_factor::value>(
+                                   host_device::at_key<decltype(dim)>(sizes))](
+                               auto &&...args) { return outer(inner(std::forward<decltype(args)>(args)...)); };
+                },
+                host_device::identity(),
+                meta::rename<tuple, Dims>());
+        }
+
         template <class Sizes>
         constexpr GT_FUNCTION auto make_loops(Sizes const &sizes) {
             return make_loops<get_keys<Sizes>>(sizes);
         }
+
+        template <class Sizes, class UnrollFactors>
+        constexpr GT_FUNCTION auto make_unrolled_loops(Sizes const &sizes, UnrollFactors unroll_factors) {
+            return make_unrolled_loops<get_keys<Sizes>>(sizes, unroll_factors);
+        }
     } // namespace common
 
     template <class T>

diff --git a/include/gridtools/fn/backend/gpu.hpp b/include/gridtools/fn/backend/gpu.hpp
diff --git a/include/gridtools/sid/loop.hpp b/include/gridtools/sid/loop.hpp
@@ -15,6 +15,7 @@
 #include <utility>
 
 #include "../common/defs.hpp"
+#include "../common/for_each.hpp"
 #include "../common/functional.hpp"
 #include "../common/host_device.hpp"
 #include "../common/integral_constant.hpp"
@@ -637,6 +638,44 @@ namespace gridtools {
             return {};
         }
 
+        template <class Key,
+            int UnrollFactor,
+            class NumSteps,
+            class Step = integral_constant<int, 1>,
+            std::enable_if_t<(UnrollFactor > 1), int> = 0>
+        constexpr GT_FUNCTION auto make_unrolled_loop(NumSteps num_steps, Step step = {}) {
+            using u = integral_constant<int, UnrollFactor>;
+            return [step,
+                       unrolled = make_loop<Key>(num_steps / u(), step * u()),
+                       epilogue = make_loop<Key>(num_steps % u(), step),
+                       epilogue_start = step * ((num_steps / u()) * u())](auto &&fun) {
+                return [unrolled =
+                               unrolled([step, fun = std::forward<decltype(fun)>(fun)](auto &&ptr, auto const strides) {
+                                   ::gridtools::host_device::for_each<meta::make_indices_c<UnrollFactor>>([&](auto) {
+                                       fun(std::forward<decltype(ptr)>(ptr), strides);
+                                       shift(std::forward<decltype(ptr)>(ptr), get_stride<Key>(strides), step);
+                                   });
+                                   shift(std::forward<decltype(ptr)>(ptr), get_stride<Key>(strides), -step * u());
+                               }),
+                           epilogue = epilogue(std::forward<decltype(fun)>(fun)),
+                           epilogue_start](auto &&ptr, auto const &strides) {
+                    unrolled(std::forward<decltype(ptr)>(ptr), strides);
+                    shift(std::forward<decltype(ptr)>(ptr), get_stride<Key>(strides), epilogue_start);
+                    epilogue(std::forward<decltype(ptr)>(ptr), strides);
+                    shift(std::forward<decltype(ptr)>(ptr), get_stride<Key>(strides), -epilogue_start);
+                };
+            };
+        }
+
+        template <class Key,
+            int UnrollFactor,
+            class NumSteps,
+            class Step = integral_constant<int, 1>,
+            std::enable_if_t<(UnrollFactor == 1), int> = 0>
+        constexpr GT_FUNCTION auto make_unrolled_loop(NumSteps num_steps, Step step = {}) {
+            return make_loop<Key>(num_steps, step);
+        }
+
         /**
          *   A helper that allows to use `SID`s with C++11 range based loop
          *