Skip to content

Commit

Permalink
Loop Blocking for fn GPU Backend (#1787)
Browse files Browse the repository at this point in the history
Implements loop blocking for the GPU fn backend. Thread block size (that
is, CUDA/HIP threads per block) and loop block size (that is, loop
iterations per CUDA/HIP thread) can now be specified as template
parameters.

Further changes:
- Set `__launch_bounds__` in the fn GPU kernel based on the thread block
size.
- Activate vertical loop blocking in the fn nabla kernels on newer CUDA
versions that support `GT_PROMISE`.

Performance changes:
- `__launch_bounds__` affects performance of the
`fn_cartesian_vertical_advection` benchmark significantly (positively or
negatively, depending on domain size).
- Performance of fn nabla benchmarks improves significantly on newer
CUDA versions.
- Performance on Daint is currently reduced due to too old CUDA version.
  • Loading branch information
fthaler authored Oct 29, 2024
1 parent 805897d commit 32daaa5
Show file tree
Hide file tree
Showing 13 changed files with 87,166 additions and 86,975 deletions.
19 changes: 19 additions & 0 deletions include/gridtools/fn/backend/common.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,10 +31,29 @@ namespace gridtools::fn::backend {
meta::rename<tuple, Dims>());
}

template <class Dims, class Sizes, class UnrollFactors>
constexpr GT_FUNCTION auto make_unrolled_loops(Sizes const &sizes, UnrollFactors) {
return tuple_util::host_device::fold(
[&](auto outer, auto dim) {
using unroll_factor = element_at<decltype(dim), UnrollFactors>;
return [outer = std::move(outer),
inner = sid::make_unrolled_loop<decltype(dim), unroll_factor::value>(
host_device::at_key<decltype(dim)>(sizes))](
auto &&...args) { return outer(inner(std::forward<decltype(args)>(args)...)); };
},
host_device::identity(),
meta::rename<tuple, Dims>());
}

template <class Sizes>
constexpr GT_FUNCTION auto make_loops(Sizes const &sizes) {
return make_loops<get_keys<Sizes>>(sizes);
}

template <class Sizes, class UnrollFactors>
constexpr GT_FUNCTION auto make_unrolled_loops(Sizes const &sizes, UnrollFactors unroll_factors) {
return make_unrolled_loops<get_keys<Sizes>>(sizes, unroll_factors);
}
} // namespace common

template <class T>
Expand Down
223 changes: 147 additions & 76 deletions include/gridtools/fn/backend/gpu.hpp

Large diffs are not rendered by default.

39 changes: 39 additions & 0 deletions include/gridtools/sid/loop.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
#include <utility>

#include "../common/defs.hpp"
#include "../common/for_each.hpp"
#include "../common/functional.hpp"
#include "../common/host_device.hpp"
#include "../common/integral_constant.hpp"
Expand Down Expand Up @@ -637,6 +638,44 @@ namespace gridtools {
return {};
}

template <class Key,
int UnrollFactor,
class NumSteps,
class Step = integral_constant<int, 1>,
std::enable_if_t<(UnrollFactor > 1), int> = 0>
constexpr GT_FUNCTION auto make_unrolled_loop(NumSteps num_steps, Step step = {}) {
using u = integral_constant<int, UnrollFactor>;
return [step,
unrolled = make_loop<Key>(num_steps / u(), step * u()),
epilogue = make_loop<Key>(num_steps % u(), step),
epilogue_start = step * ((num_steps / u()) * u())](auto &&fun) {
return [unrolled =
unrolled([step, fun = std::forward<decltype(fun)>(fun)](auto &&ptr, auto const strides) {
::gridtools::host_device::for_each<meta::make_indices_c<UnrollFactor>>([&](auto) {
fun(std::forward<decltype(ptr)>(ptr), strides);
shift(std::forward<decltype(ptr)>(ptr), get_stride<Key>(strides), step);
});
shift(std::forward<decltype(ptr)>(ptr), get_stride<Key>(strides), -step * u());
}),
epilogue = epilogue(std::forward<decltype(fun)>(fun)),
epilogue_start](auto &&ptr, auto const &strides) {
unrolled(std::forward<decltype(ptr)>(ptr), strides);
shift(std::forward<decltype(ptr)>(ptr), get_stride<Key>(strides), epilogue_start);
epilogue(std::forward<decltype(ptr)>(ptr), strides);
shift(std::forward<decltype(ptr)>(ptr), get_stride<Key>(strides), -epilogue_start);
};
};
}

template <class Key,
int UnrollFactor,
class NumSteps,
class Step = integral_constant<int, 1>,
std::enable_if_t<(UnrollFactor == 1), int> = 0>
constexpr GT_FUNCTION auto make_unrolled_loop(NumSteps num_steps, Step step = {}) {
return make_loop<Key>(num_steps, step);
}

/**
* A helper that allows to use `SID`s with C++11 range based loop
*
Expand Down
Loading

0 comments on commit 32daaa5

Please sign in to comment.