Skip to content

Commit

Permalink
Merge pull request #126 from bernhardmgruber/alpaka_nbody_ref
Browse files Browse the repository at this point in the history
Refactor alpaka nbody and fix compilation with nvcc
  • Loading branch information
bernhardmgruber authored Nov 13, 2020
2 parents caf78f9 + 0338a82 commit 2fa4501
Show file tree
Hide file tree
Showing 3 changed files with 71 additions and 64 deletions.
4 changes: 1 addition & 3 deletions examples/alpaka/nbody/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,13 +1,11 @@
# Build configuration for the LLAMA alpaka n-body example.
cmake_minimum_required (VERSION 3.15)
project(llama-alpaka-nbody)

# NOTE(review): C++17 is requested both globally here and per-target via
# target_compile_features below — redundant; one of the two could be dropped.
set (CMAKE_CXX_STANDARD 17)
set (CMAKE_CXX_STANDARD_REQUIRED on)

# Use the in-tree llama target when building as part of the LLAMA repository;
# otherwise locate an installed copy.
if (NOT TARGET llama::llama)
find_package(llama REQUIRED)
endif()
find_package(alpaka 0.5.0 REQUIRED)
# alpaka_add_executable selects the appropriate compiler driver (e.g. nvcc
# for the CUDA backend) instead of plain add_executable.
alpaka_add_executable(${PROJECT_NAME} nbody.cpp ../../common/alpakaHelpers.hpp ../../common/Stopwatch.hpp)
target_compile_features(${PROJECT_NAME} PRIVATE cxx_std_17)
# Map LLAMA's host/device inline macro onto alpaka's, so LLAMA functions are
# callable from device code.
target_compile_definitions(${PROJECT_NAME} PUBLIC LLAMA_FN_HOST_ACC_INLINE=ALPAKA_FN_HOST_ACC)
target_link_libraries(${PROJECT_NAME} PRIVATE llama::llama alpaka::alpaka)
120 changes: 61 additions & 59 deletions examples/alpaka/nbody/nbody.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,7 @@
#include <utility>

constexpr auto MAPPING = 0; ///< 0 native AoS, 1 native SoA, 2 native SoA (separate blobs), 3 tree AoS, 4 tree SoA
constexpr auto USE_SHARED = true; ///< defines whether shared memory shall be used
constexpr auto USE_SHARED_TREE = true; ///< defines whether the shared memory shall use tree mapping or
///< native mapping

constexpr auto USE_SHARED_MEMORY = true; ///< use a kernel using shared memory for caching
constexpr auto PROBLEM_SIZE = 16 * 1024; ///< total number of particles
constexpr auto BLOCK_SIZE = 256; ///< number of elements per block
constexpr auto STEPS = 5; ///< number of steps to calculate
Expand Down Expand Up @@ -51,56 +48,42 @@ using Particle = llama::DS<
llama::DE<tag::Mass, FP>>;
// clang-format on

/// Helper function for particle-particle interaction. Takes two virtual
/// datums and treats them as if they were real particle objects
template <typename VirtualDatum1, typename VirtualDatum2>
LLAMA_FN_HOST_ACC_INLINE void pPInteraction(VirtualDatum1 p1, VirtualDatum2 p2, FP ts)
template <typename VirtualParticleI, typename VirtualParticleJ>
LLAMA_FN_HOST_ACC_INLINE void pPInteraction(VirtualParticleI pi, VirtualParticleJ pj, FP ts)
{
// Creating temporary virtual datum object for distance on stack:
auto distance = p1(tag::Pos()) - p2(tag::Pos());
distance *= distance; // square for each element
const FP distSqr = EPS2 + distance(tag::X()) + distance(tag::Y()) + distance(tag::Z());
auto dist = pi(tag::Pos()) - pj(tag::Pos());
dist *= dist;
const FP distSqr = EPS2 + dist(tag::X()) + dist(tag::Y()) + dist(tag::Z());
const FP distSixth = distSqr * distSqr * distSqr;
const FP invDistCube = 1.0f / std::sqrt(distSixth);
const FP s = p2(tag::Mass()) * invDistCube;
distance *= s * ts;
p1(tag::Vel()) += distance;
const FP sts = pj(tag::Mass()) * invDistCube * ts;
pi(tag::Vel()) += dist * sts;
}

/// Alpaka kernel for updating the speed of every particle based on its
/// distance to, and the mass of, every other particle. Has complexity O(N²).
template <std::size_t ProblemSize, std::size_t Elems, std::size_t BlockSize>
struct UpdateKernel
struct UpdateKernelSM
{
template <typename Acc, typename View>
LLAMA_FN_HOST_ACC_INLINE void operator()(const Acc& acc, View particles, FP ts) const
{
[[maybe_unused]] auto sharedView = [&] {
if constexpr (USE_SHARED)
auto sharedView = [&] {
const auto sharedMapping = llama::mapping::SoA(
typename View::ArrayDomain{BlockSize},
typename View::DatumDomain{}); // bug: nvcc 11.1 cannot have {} to call ctor

// if there is only 1 thread per block, avoid using shared
// memory
if constexpr (BlockSize / Elems == 1)
return llama::allocViewStack<View::ArrayDomain::rank, typename View::DatumDomain>();
else
{
const auto sharedMapping = [&] {
if constexpr (USE_SHARED_TREE)
return llama::mapping::tree::Mapping{
typename View::ArrayDomain{BlockSize},
llama::Tuple{llama::mapping::tree::functor::LeafOnlyRT()},
typename View::DatumDomain{}};
else
return llama::mapping::SoA{typename View::ArrayDomain{BlockSize}, typename View::DatumDomain{}};
}();

// if there is only 1 thread per block, avoid using shared
// memory
if constexpr (BlockSize / Elems == 1)
return llama::allocViewStack<View::ArrayDomain::rank, typename View::DatumDomain>();
else
{
constexpr auto sharedMemSize = llama::sizeOf<typename View::DatumDomain> * BlockSize;
auto& sharedMem = alpaka::allocVar<std::byte[sharedMemSize], __COUNTER__>(acc);
return llama::View{sharedMapping, llama::Array{&sharedMem[0]}};
}
constexpr auto sharedMemSize = llama::sizeOf<typename View::DatumDomain> * BlockSize;
auto& sharedMem = alpaka::allocVar<std::byte[sharedMemSize], __COUNTER__>(acc);
return llama::View{
sharedMapping,
llama::Array<std::byte*, 1>{
&sharedMem[0]}}; // bug: nvcc 11.1 needs explicit template args for llama::Array
}
else
return int{}; // dummy
}();

const auto ti = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0u];
Expand All @@ -113,37 +96,51 @@ struct UpdateKernel
{
const auto start2 = b * BlockSize;
const auto end2 = alpaka::math::min(acc, start2 + BlockSize, ProblemSize) - start2;
if constexpr (USE_SHARED)

LLAMA_INDEPENDENT_DATA
for (auto pos2 = decltype(end2)(0); pos2 + ti < end2; pos2 += BlockSize / Elems)
sharedView(pos2 + tbi) = particles(start2 + pos2 + tbi);
alpaka::syncBlockThreads(acc);

LLAMA_INDEPENDENT_DATA
for (auto pos2 = decltype(end2)(0); pos2 < end2; ++pos2)
{
LLAMA_INDEPENDENT_DATA
for (auto pos2 = decltype(end2)(0); pos2 + ti < end2; pos2 += BlockSize / Elems)
sharedView(pos2 + tbi) = particles(start2 + pos2 + tbi);
alpaka::syncBlockThreads(acc);
for (auto i = start; i < end; ++i)
pPInteraction(particles(i), sharedView(pos2), ts);
}
alpaka::syncBlockThreads(acc);
}
}
};

template <std::size_t ProblemSize, std::size_t Elems, std::size_t BlockSize>
struct UpdateKernel
{
template <typename Acc, typename View>
LLAMA_FN_HOST_ACC_INLINE void operator()(const Acc& acc, View particles, FP ts) const
{
const auto ti = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0u];
const auto start = ti * Elems;
const auto end = alpaka::math::min(acc, start + Elems, ProblemSize);

LLAMA_INDEPENDENT_DATA
for (auto j = 0; j < ProblemSize; ++j)
{
LLAMA_INDEPENDENT_DATA
for (auto pos2 = decltype(end2)(0); pos2 < end2; ++pos2)
LLAMA_INDEPENDENT_DATA
for (auto i = start; i < end; ++i)
if constexpr (USE_SHARED)
pPInteraction(particles(i), sharedView(pos2), ts);
else
pPInteraction(particles(i), particles(start2 + pos2), ts);
if constexpr (USE_SHARED)
alpaka::syncBlockThreads(acc);
pPInteraction(particles(i), particles(j), ts);
}
}
};

/// Alpaka kernel for moving each particle with its speed. Has complexity
/// O(N).
template <std::size_t ProblemSize, std::size_t Elems>
struct MoveKernel
{
template <typename Acc, typename View>
LLAMA_FN_HOST_ACC_INLINE void operator()(const Acc& acc, View particles, FP ts) const
{
const auto ti = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0];

const auto start = ti * Elems;
const auto end = alpaka::math::min(acc, start + Elems, ProblemSize);

Expand All @@ -153,7 +150,7 @@ struct MoveKernel
}
};

int main(int argc, char** argv)
int main()
{
using Dim = alpaka::DimInt<1>;
using Size = std::size_t;
Expand Down Expand Up @@ -245,7 +242,12 @@ int main(int argc, char** argv)

for (std::size_t s = 0; s < STEPS; ++s)
{
UpdateKernel<PROBLEM_SIZE, elemCount, BLOCK_SIZE> updateKernel;
auto updateKernel = [&] {
if constexpr (USE_SHARED_MEMORY)
return UpdateKernelSM<PROBLEM_SIZE, elemCount, BLOCK_SIZE>{};
else
return UpdateKernel<PROBLEM_SIZE, elemCount, BLOCK_SIZE>{};
}();
alpaka::exec<Acc>(queue, workdiv, updateKernel, accView, ts);

chrono.printAndReset("Update kernel");
Expand Down
11 changes: 9 additions & 2 deletions include/llama/mapping/SoA.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@

#include "Common.hpp"

#include <limits>

namespace llama::mapping
{
/// Struct of array mapping. Used to create a \ref View via \ref allocView.
Expand Down Expand Up @@ -78,10 +80,13 @@ namespace llama::mapping
index++;
});
if (!found)
throw "Passed TargetDatumCoord must be in datum domain";
return std::numeric_limits<std::size_t>::max();
return index;
}
();
static_assert(
blob != std::numeric_limits<std::size_t>::max(),
"Passed TargetDatumCoord must be in datum domain");

LLAMA_FORCE_INLINE_RECURSIVE
const auto offset = LinearizeArrayDomainFunctor{}(coord, arrayDomainSize)
Expand All @@ -93,7 +98,9 @@ namespace llama::mapping
LLAMA_FORCE_INLINE_RECURSIVE
const auto offset = LinearizeArrayDomainFunctor{}(coord, arrayDomainSize)
* sizeof(GetType<DatumDomain, DatumCoord<DatumDomainCoord...>>)
+ offsetOf<DatumDomain, DatumCoord<DatumDomainCoord...>> * LinearizeArrayDomainFunctor{}.size(arrayDomainSize);
+ offsetOf<
DatumDomain,
DatumCoord<DatumDomainCoord...>> * LinearizeArrayDomainFunctor{}.size(arrayDomainSize);
return {0, offset};
}
}
Expand Down

0 comments on commit 2fa4501

Please sign in to comment.