Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactoring alpaka nbody and fix compilation with nvcc #126

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 1 addition & 3 deletions examples/alpaka/nbody/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,13 +1,11 @@
cmake_minimum_required (VERSION 3.15)
project(llama-alpaka-nbody)

set (CMAKE_CXX_STANDARD 17)
set (CMAKE_CXX_STANDARD_REQUIRED on)

if (NOT TARGET llama::llama)
find_package(llama REQUIRED)
endif()
find_package(alpaka 0.5.0 REQUIRED)
alpaka_add_executable(${PROJECT_NAME} nbody.cpp ../../common/alpakaHelpers.hpp ../../common/Stopwatch.hpp)
target_compile_features(${PROJECT_NAME} PRIVATE cxx_std_17)
target_compile_definitions(${PROJECT_NAME} PUBLIC LLAMA_FN_HOST_ACC_INLINE=ALPAKA_FN_HOST_ACC)
target_link_libraries(${PROJECT_NAME} PRIVATE llama::llama alpaka::alpaka)
120 changes: 61 additions & 59 deletions examples/alpaka/nbody/nbody.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,7 @@
#include <utility>

constexpr auto MAPPING = 0; ///< 0 native AoS, 1 native SoA, 2 native SoA (separate blobs), 3 tree AoS, 4 tree SoA
constexpr auto USE_SHARED = true; ///< defines whether shared memory shall be used
constexpr auto USE_SHARED_TREE = true; ///< defines whether the shared memory shall use tree mapping or
///< native mapping

constexpr auto USE_SHARED_MEMORY = true; ///< use a kernel using shared memory for caching
constexpr auto PROBLEM_SIZE = 16 * 1024; ///< total number of particles
constexpr auto BLOCK_SIZE = 256; ///< number of elements per block
constexpr auto STEPS = 5; ///< number of steps to calculate
Expand Down Expand Up @@ -51,56 +48,42 @@ using Particle = llama::DS<
llama::DE<tag::Mass, FP>>;
// clang-format on

/// Helper function for particle particle interaction. Gets two virtual
/// datums like they are real particle objects
template <typename VirtualDatum1, typename VirtualDatum2>
LLAMA_FN_HOST_ACC_INLINE void pPInteraction(VirtualDatum1 p1, VirtualDatum2 p2, FP ts)
template <typename VirtualParticleI, typename VirtualParticleJ>
LLAMA_FN_HOST_ACC_INLINE void pPInteraction(VirtualParticleI pi, VirtualParticleJ pj, FP ts)
{
// Creating tempory virtual datum object for distance on stack:
auto distance = p1(tag::Pos()) - p2(tag::Pos());
distance *= distance; // square for each element
const FP distSqr = EPS2 + distance(tag::X()) + distance(tag::Y()) + distance(tag::Z());
auto dist = pi(tag::Pos()) - pj(tag::Pos());
dist *= dist;
const FP distSqr = EPS2 + dist(tag::X()) + dist(tag::Y()) + dist(tag::Z());
const FP distSixth = distSqr * distSqr * distSqr;
const FP invDistCube = 1.0f / std::sqrt(distSixth);
const FP s = p2(tag::Mass()) * invDistCube;
distance *= s * ts;
p1(tag::Vel()) += distance;
const FP sts = pj(tag::Mass()) * invDistCube * ts;
pi(tag::Vel()) += dist * sts;
}

/// Alpaka kernel for updating the speed of every particle based on the
/// distance and mass to each other particle. Has complexity O(N²).
template <std::size_t ProblemSize, std::size_t Elems, std::size_t BlockSize>
struct UpdateKernel
struct UpdateKernelSM
{
template <typename Acc, typename View>
LLAMA_FN_HOST_ACC_INLINE void operator()(const Acc& acc, View particles, FP ts) const
{
[[maybe_unused]] auto sharedView = [&] {
if constexpr (USE_SHARED)
auto sharedView = [&] {
const auto sharedMapping = llama::mapping::SoA(
typename View::ArrayDomain{BlockSize},
typename View::DatumDomain{}); // bug: nvcc 11.1 cannot have {} to call ctor

// if there is only 1 thread per block, avoid using shared
// memory
if constexpr (BlockSize / Elems == 1)
return llama::allocViewStack<View::ArrayDomain::rank, typename View::DatumDomain>();
else
{
const auto sharedMapping = [&] {
if constexpr (USE_SHARED_TREE)
return llama::mapping::tree::Mapping{
typename View::ArrayDomain{BlockSize},
llama::Tuple{llama::mapping::tree::functor::LeafOnlyRT()},
typename View::DatumDomain{}};
else
return llama::mapping::SoA{typename View::ArrayDomain{BlockSize}, typename View::DatumDomain{}};
}();

// if there is only 1 thread per block, avoid using shared
// memory
if constexpr (BlockSize / Elems == 1)
return llama::allocViewStack<View::ArrayDomain::rank, typename View::DatumDomain>();
else
{
constexpr auto sharedMemSize = llama::sizeOf<typename View::DatumDomain> * BlockSize;
auto& sharedMem = alpaka::allocVar<std::byte[sharedMemSize], __COUNTER__>(acc);
return llama::View{sharedMapping, llama::Array{&sharedMem[0]}};
}
constexpr auto sharedMemSize = llama::sizeOf<typename View::DatumDomain> * BlockSize;
auto& sharedMem = alpaka::allocVar<std::byte[sharedMemSize], __COUNTER__>(acc);
return llama::View{
sharedMapping,
llama::Array<std::byte*, 1>{
&sharedMem[0]}}; // bug: nvcc 11.1 needs explicit template args for llama::Array
}
else
return int{}; // dummy
}();

const auto ti = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0u];
Expand All @@ -113,37 +96,51 @@ struct UpdateKernel
{
const auto start2 = b * BlockSize;
const auto end2 = alpaka::math::min(acc, start2 + BlockSize, ProblemSize) - start2;
if constexpr (USE_SHARED)

LLAMA_INDEPENDENT_DATA
for (auto pos2 = decltype(end2)(0); pos2 + ti < end2; pos2 += BlockSize / Elems)
sharedView(pos2 + tbi) = particles(start2 + pos2 + tbi);
alpaka::syncBlockThreads(acc);

LLAMA_INDEPENDENT_DATA
for (auto pos2 = decltype(end2)(0); pos2 < end2; ++pos2)
{
LLAMA_INDEPENDENT_DATA
for (auto pos2 = decltype(end2)(0); pos2 + ti < end2; pos2 += BlockSize / Elems)
sharedView(pos2 + tbi) = particles(start2 + pos2 + tbi);
alpaka::syncBlockThreads(acc);
for (auto i = start; i < end; ++i)
pPInteraction(particles(i), sharedView(pos2), ts);
}
alpaka::syncBlockThreads(acc);
}
}
};

template <std::size_t ProblemSize, std::size_t Elems, std::size_t BlockSize>
struct UpdateKernel
{
template <typename Acc, typename View>
LLAMA_FN_HOST_ACC_INLINE void operator()(const Acc& acc, View particles, FP ts) const
{
const auto ti = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0u];
const auto start = ti * Elems;
const auto end = alpaka::math::min(acc, start + Elems, ProblemSize);

LLAMA_INDEPENDENT_DATA
for (auto j = 0; j < ProblemSize; ++j)
{
LLAMA_INDEPENDENT_DATA
for (auto pos2 = decltype(end2)(0); pos2 < end2; ++pos2)
LLAMA_INDEPENDENT_DATA
for (auto i = start; i < end; ++i)
if constexpr (USE_SHARED)
pPInteraction(particles(i), sharedView(pos2), ts);
else
pPInteraction(particles(i), particles(start2 + pos2), ts);
if constexpr (USE_SHARED)
alpaka::syncBlockThreads(acc);
pPInteraction(particles(i), particles(j), ts);
}
}
};

/// Alpaka kernel for moving each particle with its speed. Has complexity
/// O(N).
template <std::size_t ProblemSize, std::size_t Elems>
struct MoveKernel
{
template <typename Acc, typename View>
LLAMA_FN_HOST_ACC_INLINE void operator()(const Acc& acc, View particles, FP ts) const
{
const auto ti = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0];

const auto start = ti * Elems;
const auto end = alpaka::math::min(acc, start + Elems, ProblemSize);

Expand All @@ -153,7 +150,7 @@ struct MoveKernel
}
};

int main(int argc, char** argv)
int main()
{
using Dim = alpaka::DimInt<1>;
using Size = std::size_t;
Expand Down Expand Up @@ -245,7 +242,12 @@ int main(int argc, char** argv)

for (std::size_t s = 0; s < STEPS; ++s)
{
UpdateKernel<PROBLEM_SIZE, elemCount, BLOCK_SIZE> updateKernel;
auto updateKernel = [&] {
if constexpr (USE_SHARED_MEMORY)
return UpdateKernelSM<PROBLEM_SIZE, elemCount, BLOCK_SIZE>{};
else
return UpdateKernel<PROBLEM_SIZE, elemCount, BLOCK_SIZE>{};
}();
alpaka::exec<Acc>(queue, workdiv, updateKernel, accView, ts);

chrono.printAndReset("Update kernel");
Expand Down
11 changes: 9 additions & 2 deletions include/llama/mapping/SoA.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@

#include "Common.hpp"

#include <limits>

namespace llama::mapping
{
/// Struct of array mapping. Used to create a \ref View via \ref allocView.
Expand Down Expand Up @@ -78,10 +80,13 @@ namespace llama::mapping
index++;
});
if (!found)
throw "Passed TargetDatumCoord must be in datum domain";
return std::numeric_limits<std::size_t>::max();
return index;
}
();
static_assert(
blob != std::numeric_limits<std::size_t>::max(),
"Passed TargetDatumCoord must be in datum domain");

LLAMA_FORCE_INLINE_RECURSIVE
const auto offset = LinearizeArrayDomainFunctor{}(coord, arrayDomainSize)
Expand All @@ -93,7 +98,9 @@ namespace llama::mapping
LLAMA_FORCE_INLINE_RECURSIVE
const auto offset = LinearizeArrayDomainFunctor{}(coord, arrayDomainSize)
* sizeof(GetType<DatumDomain, DatumCoord<DatumDomainCoord...>>)
+ offsetOf<DatumDomain, DatumCoord<DatumDomainCoord...>> * LinearizeArrayDomainFunctor{}.size(arrayDomainSize);
+ offsetOf<
DatumDomain,
DatumCoord<DatumDomainCoord...>> * LinearizeArrayDomainFunctor{}.size(arrayDomainSize);
return {0, offset};
}
}
Expand Down