Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve function inlining for viewcopy #264

Merged
merged 5 commits into from
May 25, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions examples/viewcopy/viewcopy.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -78,8 +78,9 @@ void naive_copy(
llamaex::parallelForEachADCoord(
srcView.mapping.arrayDims(),
numThreads,
[&](auto ad) {
llama::forEachLeaf<typename DstMapping::RecordDim>([&](auto coord)
[&](auto ad) LLAMA_LAMBDA_INLINE
{
llama::forEachLeaf<typename DstMapping::RecordDim>([&](auto coord) LLAMA_LAMBDA_INLINE
{ dstView(ad)(coord) = srcView(ad)(coord); });
});
}
Expand Down
21 changes: 17 additions & 4 deletions include/llama/Core.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -289,6 +289,17 @@ namespace llama
/// Alias for the nested `type` of internal::LeafRecordCoordsImpl, instantiated with \p RecordDim and an
/// empty RecordCoord<> as the starting coordinate.
/// NOTE(review): presumably a type list with one RecordCoord per leaf of RecordDim - confirm against
/// LeafRecordCoordsImpl.
template <typename RecordDim>
using LeafRecordCoords = typename internal::LeafRecordCoordsImpl<RecordDim, RecordCoord<>>::type;

namespace internal
{
    // Adapted from boost::mp11::mp_for_each, but declared with LLAMA_FN_HOST_ACC_INLINE.
    //
    // Calls `f` once with a value-initialized instance of every element type T of the type list
    // L<T...>, in list order. The list argument itself is only used to deduce L and T...; its value
    // is never read. Return values of `f` are discarded.
    //
    // A comma fold expression replaces the classic `int[sizeof...(T)]` initializer trick: for an
    // empty list that array would have zero length, which is ill-formed in standard C++, whereas an
    // empty comma fold is simply `void()`. The cast to void keeps a user-defined `operator,` on the
    // functor's return type from being picked up, so evaluation stays strictly left to right.
    template <template <typename...> typename L, typename... T, typename F>
    LLAMA_FN_HOST_ACC_INLINE constexpr void mp_for_each_inlined(L<T...>, [[maybe_unused]] F&& f)
    {
        // `f` is unused when the list is empty, hence [[maybe_unused]] on the parameter.
        (static_cast<void>(f(T{})), ...);
    }
} // namespace internal

/// Iterates over the record dimension tree and calls a functor on each element.
/// \param functor Functor to execute at each element of the tree. Needs to have
/// `operator()` with a template parameter for the \ref RecordCoord in the
Expand All @@ -299,8 +310,10 @@ namespace llama
LLAMA_FN_HOST_ACC_INLINE constexpr void forEachLeaf(Functor&& functor, RecordCoord<Coords...> baseCoord)
{
LLAMA_FORCE_INLINE_RECURSIVE
boost::mp11::mp_for_each<LeafRecordCoords<GetType<RecordDim, RecordCoord<Coords...>>>>([&](
auto innerCoord) constexpr { functor(cat(baseCoord, innerCoord)); });
internal::mp_for_each_inlined(
LeafRecordCoords<GetType<RecordDim, RecordCoord<Coords...>>>{},
[&](auto innerCoord) LLAMA_LAMBDA_INLINE_WITH_SPECIFIERS(constexpr)
{ std::forward<Functor>(functor)(cat(baseCoord, innerCoord)); });
}

/// Iterates over the record dimension tree and calls a functor on each element.
Expand Down Expand Up @@ -468,7 +481,7 @@ namespace llama
namespace internal
{
template <std::size_t Dim>
constexpr auto popFront(ArrayDims<Dim> ad)
LLAMA_FN_HOST_ACC_INLINE constexpr auto popFront(ArrayDims<Dim> ad)
{
ArrayDims<Dim - 1> result;
for (std::size_t i = 0; i < Dim - 1; i++)
Expand All @@ -478,7 +491,7 @@ namespace llama
} // namespace internal

template <std::size_t Dim, typename Func, typename... OuterIndices>
void forEachADCoord(ArrayDims<Dim> adSize, Func&& func, OuterIndices... outerIndices)
LLAMA_FN_HOST_ACC_INLINE void forEachADCoord(ArrayDims<Dim> adSize, Func&& func, OuterIndices... outerIndices)
{
for (std::size_t i = 0; i < adSize[0]; i++)
{
Expand Down
16 changes: 8 additions & 8 deletions include/llama/VirtualRecord.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -51,17 +51,17 @@ namespace llama
typename LeftRecord::AccessibleRecordDim,
typename RightRecord::AccessibleRecordDim>)
{
forEachLeaf<typename LeftRecord::AccessibleRecordDim>([&](auto coord)
forEachLeaf<typename LeftRecord::AccessibleRecordDim>([&](auto coord) LLAMA_LAMBDA_INLINE
{ Functor{}(left(coord), right(coord)); });
}
else
{
forEachLeaf<typename LeftRecord::AccessibleRecordDim>(
[&](auto leftCoord)
[&](auto leftCoord) LLAMA_LAMBDA_INLINE
{
using LeftInnerCoord = decltype(leftCoord);
forEachLeaf<typename RightRecord::AccessibleRecordDim>(
[&](auto rightCoord)
[&](auto rightCoord) LLAMA_LAMBDA_INLINE
{
using RightInnerCoord = decltype(rightCoord);
if constexpr (hasSameTags<
Expand All @@ -81,7 +81,7 @@ namespace llama
template <typename Functor, typename LeftRecord, typename T>
LLAMA_FN_HOST_ACC_INLINE auto virtualRecordArithOperator(LeftRecord& left, const T& right) -> LeftRecord&
{
forEachLeaf<typename LeftRecord::AccessibleRecordDim>([&](auto leftCoord)
forEachLeaf<typename LeftRecord::AccessibleRecordDim>([&](auto leftCoord) LLAMA_LAMBDA_INLINE
{ Functor{}(left(leftCoord), right); });
return left;
}
Expand All @@ -105,16 +105,16 @@ namespace llama
typename RightRecord::AccessibleRecordDim>)
{
forEachLeaf<typename LeftRecord::AccessibleRecordDim>(
[&](auto coord) { result &= Functor{}(left(coord), right(coord)); });
[&](auto coord) LLAMA_LAMBDA_INLINE { result &= Functor{}(left(coord), right(coord)); });
}
else
{
forEachLeaf<typename LeftRecord::AccessibleRecordDim>(
[&](auto leftCoord)
[&](auto leftCoord) LLAMA_LAMBDA_INLINE
{
using LeftInnerCoord = decltype(leftCoord);
forEachLeaf<typename RightRecord::AccessibleRecordDim>(
[&](auto rightCoord)
[&](auto rightCoord) LLAMA_LAMBDA_INLINE
{
using RightInnerCoord = decltype(rightCoord);
if constexpr (hasSameTags<
Expand All @@ -136,7 +136,7 @@ namespace llama
{
bool result = true;
forEachLeaf<typename LeftRecord::AccessibleRecordDim>(
[&](auto leftCoord) {
[&](auto leftCoord) LLAMA_LAMBDA_INLINE {
result &= Functor{}(
left(leftCoord),
static_cast<std::remove_reference_t<decltype(left(leftCoord))>>(right));
Expand Down
17 changes: 17 additions & 0 deletions include/llama/macros.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,23 @@
# endif
#endif

/// Attaches an always_inline attribute to a lambda on compilers for which a spelling is provided below.
/// Any additional lambda specifiers (e.g. constexpr) are passed as macro arguments, so the macro can
/// emit the attribute and the specifiers in a compiler-specific order.
/// Both macros are only defined if not already defined, so they can be overridden before this point.
#ifndef LLAMA_LAMBDA_INLINE_WITH_SPECIFIERS
// clang / Intel LLVM: attribute first, then the specifiers
# if defined(__clang__) || defined(__INTEL_LLVM_COMPILER)
# define LLAMA_LAMBDA_INLINE_WITH_SPECIFIERS(...) __attribute__((always_inline)) __VA_ARGS__
// GCC / classic Intel / NVCC: specifiers first, then the attribute
# elif defined(__GNUC__) || defined(__INTEL_COMPILER) || defined(__NVCC__)
# define LLAMA_LAMBDA_INLINE_WITH_SPECIFIERS(...) __VA_ARGS__ __attribute__((always_inline))
// MSVC: only the specifiers are emitted, no inlining attribute (see FIXME)
# elif defined(_MSC_VER)
# define LLAMA_LAMBDA_INLINE_WITH_SPECIFIERS(...) \
__VA_ARGS__ /* FIXME: MSVC cannot combine constexpr and [[msvc::forceinline]] */
// Unknown compiler: only the specifiers are emitted and a diagnostic is raised
# else
# define LLAMA_LAMBDA_INLINE_WITH_SPECIFIERS(...) __VA_ARGS__
# warning LLAMA_LAMBDA_INLINE_WITH_SPECIFIERS not defined for this compiler
# endif
#endif
/// Shorthand for LLAMA_LAMBDA_INLINE_WITH_SPECIFIERS with no additional specifiers.
#ifndef LLAMA_LAMBDA_INLINE
# define LLAMA_LAMBDA_INLINE LLAMA_LAMBDA_INLINE_WITH_SPECIFIERS()
#endif

/// Suppresses nvcc warning: 'calling a __host__ function from __host__ __device__ function.'
#if defined(__NVCC__) && !defined(__clang__)
# define LLAMA_SUPPRESS_HOST_DEVICE_WARNING _Pragma("nv_exec_check_disable")
Expand Down