diff --git a/examples/alpaka/nbody/CMakeLists.txt b/examples/alpaka/nbody/CMakeLists.txt index a4522591b0..401d304226 100644 --- a/examples/alpaka/nbody/CMakeLists.txt +++ b/examples/alpaka/nbody/CMakeLists.txt @@ -1,13 +1,11 @@ cmake_minimum_required (VERSION 3.15) project(llama-alpaka-nbody) -set (CMAKE_CXX_STANDARD 17) -set (CMAKE_CXX_STANDARD_REQUIRED on) - if (NOT TARGET llama::llama) find_package(llama REQUIRED) endif() find_package(alpaka 0.5.0 REQUIRED) alpaka_add_executable(${PROJECT_NAME} nbody.cpp ../../common/alpakaHelpers.hpp ../../common/Stopwatch.hpp) +target_compile_features(${PROJECT_NAME} PRIVATE cxx_std_17) target_compile_definitions(${PROJECT_NAME} PUBLIC LLAMA_FN_HOST_ACC_INLINE=ALPAKA_FN_HOST_ACC) target_link_libraries(${PROJECT_NAME} PRIVATE llama::llama alpaka::alpaka) diff --git a/examples/alpaka/nbody/nbody.cpp b/examples/alpaka/nbody/nbody.cpp index 3ea3f81391..46546eca7e 100644 --- a/examples/alpaka/nbody/nbody.cpp +++ b/examples/alpaka/nbody/nbody.cpp @@ -17,10 +17,7 @@ #include constexpr auto MAPPING = 0; ///< 0 native AoS, 1 native SoA, 2 native SoA (separate blobs), 3 tree AoS, 4 tree SoA -constexpr auto USE_SHARED = true; ///< defines whether shared memory shall be used -constexpr auto USE_SHARED_TREE = true; ///< defines whether the shared memory shall use tree mapping or - ///< native mapping - +constexpr auto USE_SHARED_MEMORY = true; ///< use a kernel using shared memory for caching constexpr auto PROBLEM_SIZE = 16 * 1024; ///< total number of particles constexpr auto BLOCK_SIZE = 256; ///< number of elements per block constexpr auto STEPS = 5; ///< number of steps to calculate @@ -51,56 +48,42 @@ using Particle = llama::DS< llama::DE>; // clang-format on -/// Helper function for particle particle interaction. Gets two virtual -/// datums like they are real particle objects -template -LLAMA_FN_HOST_ACC_INLINE void pPInteraction(VirtualDatum1 p1, VirtualDatum2 p2, FP ts) +template +LLAMA_FN_HOST_ACC_INLINE void pPInteraction(VirtualParticleI pi, VirtualParticleJ pj, FP ts) { - // Creating tempory virtual datum object for distance on stack: - auto distance = p1(tag::Pos()) - p2(tag::Pos()); - distance *= distance; // square for each element - const FP distSqr = EPS2 + distance(tag::X()) + distance(tag::Y()) + distance(tag::Z()); + auto dist = pi(tag::Pos()) - pj(tag::Pos()); + dist *= dist; + const FP distSqr = EPS2 + dist(tag::X()) + dist(tag::Y()) + dist(tag::Z()); const FP distSixth = distSqr * distSqr * distSqr; const FP invDistCube = 1.0f / std::sqrt(distSixth); - const FP s = p2(tag::Mass()) * invDistCube; - distance *= s * ts; - p1(tag::Vel()) += distance; + const FP sts = pj(tag::Mass()) * invDistCube * ts; + pi(tag::Vel()) += dist * sts; } -/// Alpaka kernel for updating the speed of every particle based on the -/// distance and mass to each other particle. Has complexity O(N²). template -struct UpdateKernel +struct UpdateKernelSM { template LLAMA_FN_HOST_ACC_INLINE void operator()(const Acc& acc, View particles, FP ts) const { - [[maybe_unused]] auto sharedView = [&] { - if constexpr (USE_SHARED) + auto sharedView = [&] { + const auto sharedMapping = llama::mapping::SoA( + typename View::ArrayDomain{BlockSize}, + typename View::DatumDomain{}); // bug: nvcc 11.1 cannot have {} to call ctor + + // if there is only 1 thread per block, avoid using shared + // memory + if constexpr (BlockSize / Elems == 1) + return llama::allocViewStack(); + else { - const auto sharedMapping = [&] { - if constexpr (USE_SHARED_TREE) - return llama::mapping::tree::Mapping{ - typename View::ArrayDomain{BlockSize}, - llama::Tuple{llama::mapping::tree::functor::LeafOnlyRT()}, - typename View::DatumDomain{}}; - else - return llama::mapping::SoA{typename View::ArrayDomain{BlockSize}, typename View::DatumDomain{}}; - }(); - - // if there is only 1 thread per block, avoid using shared - // memory - if constexpr (BlockSize / Elems == 1) - return llama::allocViewStack(); - else - { - constexpr auto sharedMemSize = llama::sizeOf * BlockSize; - auto& sharedMem = alpaka::allocVar(acc); - return llama::View{sharedMapping, llama::Array{&sharedMem[0]}}; - } + constexpr auto sharedMemSize = llama::sizeOf * BlockSize; + auto& sharedMem = alpaka::allocVar(acc); + return llama::View{ + sharedMapping, + llama::Array{ + &sharedMem[0]}}; // bug: nvcc 11.1 needs explicit template args for llama::Array } - else - return int{}; // dummy }(); const auto ti = alpaka::getIdx(acc)[0u]; @@ -113,29 +96,44 @@ struct UpdateKernel { const auto start2 = b * BlockSize; const auto end2 = alpaka::math::min(acc, start2 + BlockSize, ProblemSize) - start2; - if constexpr (USE_SHARED) + + LLAMA_INDEPENDENT_DATA + for (auto pos2 = decltype(end2)(0); pos2 + ti < end2; pos2 += BlockSize / Elems) + sharedView(pos2 + tbi) = particles(start2 + pos2 + tbi); + alpaka::syncBlockThreads(acc); + + LLAMA_INDEPENDENT_DATA + for (auto pos2 = decltype(end2)(0); pos2 < end2; ++pos2) { LLAMA_INDEPENDENT_DATA - for (auto pos2 = decltype(end2)(0); pos2 + ti < end2; pos2 += BlockSize / Elems) - sharedView(pos2 + tbi) = particles(start2 + pos2 + tbi); - alpaka::syncBlockThreads(acc); + for (auto i = start; i < end; ++i) + pPInteraction(particles(i), sharedView(pos2), ts); } + alpaka::syncBlockThreads(acc); + } + } +}; + +template +struct UpdateKernel +{ + template + LLAMA_FN_HOST_ACC_INLINE void operator()(const Acc& acc, View particles, FP ts) const + { + const auto ti = alpaka::getIdx(acc)[0u]; + const auto start = ti * Elems; + const auto end = alpaka::math::min(acc, start + Elems, ProblemSize); + + LLAMA_INDEPENDENT_DATA + for (auto j = 0; j < ProblemSize; ++j) + { LLAMA_INDEPENDENT_DATA - for (auto pos2 = decltype(end2)(0); pos2 < end2; ++pos2) - LLAMA_INDEPENDENT_DATA for (auto i = start; i < end; ++i) - if constexpr (USE_SHARED) - pPInteraction(particles(i), sharedView(pos2), ts); - else - pPInteraction(particles(i), particles(start2 + pos2), ts); - if constexpr (USE_SHARED) - alpaka::syncBlockThreads(acc); + pPInteraction(particles(i), particles(j), ts); } } }; -/// Alpaka kernel for moving each particle with its speed. Has complexity -/// O(N). template struct MoveKernel { @@ -143,7 +141,6 @@ struct MoveKernel LLAMA_FN_HOST_ACC_INLINE void operator()(const Acc& acc, View particles, FP ts) const { const auto ti = alpaka::getIdx(acc)[0]; - const auto start = ti * Elems; const auto end = alpaka::math::min(acc, start + Elems, ProblemSize); @@ -153,7 +150,7 @@ struct MoveKernel } }; -int main(int argc, char** argv) +int main() { using Dim = alpaka::DimInt<1>; using Size = std::size_t; @@ -245,7 +242,12 @@ int main(int argc, char** argv) for (std::size_t s = 0; s < STEPS; ++s) { - UpdateKernel updateKernel; + auto updateKernel = [&] { + if constexpr (USE_SHARED_MEMORY) + return UpdateKernelSM{}; + else + return UpdateKernel{}; + }(); alpaka::exec(queue, workdiv, updateKernel, accView, ts); chrono.printAndReset("Update kernel"); diff --git a/include/llama/mapping/SoA.hpp b/include/llama/mapping/SoA.hpp index 833f67f026..4ea75bac45 100644 --- a/include/llama/mapping/SoA.hpp +++ b/include/llama/mapping/SoA.hpp @@ -5,6 +5,8 @@ #include "Common.hpp" +#include + namespace llama::mapping { /// Struct of array mapping. Used to create a \ref View via \ref allocView. @@ -78,10 +80,13 @@ namespace llama::mapping index++; }); if (!found) - throw "Passed TargetDatumCoord must be in datum domain"; + return std::numeric_limits::max(); return index; } (); + static_assert( + blob != std::numeric_limits::max(), + "Passed TargetDatumCoord must be in datum domain"); LLAMA_FORCE_INLINE_RECURSIVE const auto offset = LinearizeArrayDomainFunctor{}(coord, arrayDomainSize) @@ -93,7 +98,9 @@ namespace llama::mapping LLAMA_FORCE_INLINE_RECURSIVE const auto offset = LinearizeArrayDomainFunctor{}(coord, arrayDomainSize) * sizeof(GetType>) - + offsetOf> * LinearizeArrayDomainFunctor{}.size(arrayDomainSize); + + offsetOf< + DatumDomain, + DatumCoord> * LinearizeArrayDomainFunctor{}.size(arrayDomainSize); return {0, offset}; } }