From 5f01b19d5172f9da766e3ed3ad7e8a699aa3ae92 Mon Sep 17 00:00:00 2001
From: Bernhard Manfred Gruber
Date: Thu, 12 Nov 2020 14:28:01 +0100
Subject: [PATCH 1/8] refactoring

---
 examples/alpaka/nbody/nbody.cpp | 24 ++++++++----------------
 1 file changed, 8 insertions(+), 16 deletions(-)

diff --git a/examples/alpaka/nbody/nbody.cpp b/examples/alpaka/nbody/nbody.cpp
index 3ea3f81391..baddb66892 100644
--- a/examples/alpaka/nbody/nbody.cpp
+++ b/examples/alpaka/nbody/nbody.cpp
@@ -51,24 +51,18 @@ using Particle = llama::DS<
     llama::DE>;
 // clang-format on
 
-/// Helper function for particle particle interaction. Gets two virtual
-/// datums like they are real particle objects
-template
-LLAMA_FN_HOST_ACC_INLINE void pPInteraction(VirtualDatum1 p1, VirtualDatum2 p2, FP ts)
+template
+LLAMA_FN_HOST_ACC_INLINE void pPInteraction(VirtualParticleI pi, VirtualParticleJ pj, FP ts)
 {
-    // Creating tempory virtual datum object for distance on stack:
-    auto distance = p1(tag::Pos()) - p2(tag::Pos());
-    distance *= distance; // square for each element
-    const FP distSqr = EPS2 + distance(tag::X()) + distance(tag::Y()) + distance(tag::Z());
+    auto dist = pi(tag::Pos()) - pj(tag::Pos());
+    dist *= dist;
+    const FP distSqr = EPS2 + dist(tag::X()) + dist(tag::Y()) + dist(tag::Z());
     const FP distSixth = distSqr * distSqr * distSqr;
     const FP invDistCube = 1.0f / std::sqrt(distSixth);
-    const FP s = p2(tag::Mass()) * invDistCube;
-    distance *= s * ts;
-    p1(tag::Vel()) += distance;
+    const FP sts = pj(tag::Mass()) * invDistCube * ts;
+    pi(tag::Vel()) += dist * sts;
 }
 
-/// Alpaka kernel for updating the speed of every particle based on the
-/// distance and mass to each other particle. Has complexity O(N²).
 template
 struct UpdateKernel
 {
@@ -134,8 +128,6 @@ struct UpdateKernel
     }
 };
 
-/// Alpaka kernel for moving each particle with its speed. Has complexity
-/// O(N).
template struct MoveKernel { @@ -153,7 +145,7 @@ struct MoveKernel } }; -int main(int argc, char** argv) +int main() { using Dim = alpaka::DimInt<1>; using Size = std::size_t; From e0edecc5a4074b9b0da75b4bf2040b1e82d713ea Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Thu, 12 Nov 2020 15:32:49 +0100 Subject: [PATCH 2/8] split UpdateKernel in a simple and shared memory version --- examples/alpaka/nbody/nbody.cpp | 94 ++++++++++++++++++++------------- 1 file changed, 58 insertions(+), 36 deletions(-) diff --git a/examples/alpaka/nbody/nbody.cpp b/examples/alpaka/nbody/nbody.cpp index baddb66892..5fd23f0910 100644 --- a/examples/alpaka/nbody/nbody.cpp +++ b/examples/alpaka/nbody/nbody.cpp @@ -64,37 +64,32 @@ LLAMA_FN_HOST_ACC_INLINE void pPInteraction(VirtualParticleI pi, VirtualParticle } template -struct UpdateKernel +struct UpdateKernelSM { template LLAMA_FN_HOST_ACC_INLINE void operator()(const Acc& acc, View particles, FP ts) const { - [[maybe_unused]] auto sharedView = [&] { - if constexpr (USE_SHARED) - { - const auto sharedMapping = [&] { - if constexpr (USE_SHARED_TREE) - return llama::mapping::tree::Mapping{ - typename View::ArrayDomain{BlockSize}, - llama::Tuple{llama::mapping::tree::functor::LeafOnlyRT()}, - typename View::DatumDomain{}}; - else - return llama::mapping::SoA{typename View::ArrayDomain{BlockSize}, typename View::DatumDomain{}}; - }(); - - // if there is only 1 thread per block, avoid using shared - // memory - if constexpr (BlockSize / Elems == 1) - return llama::allocViewStack(); + auto sharedView = [&] { + const auto sharedMapping = [&] { + if constexpr (USE_SHARED_TREE) + return llama::mapping::tree::Mapping{ + typename View::ArrayDomain{BlockSize}, + llama::Tuple{llama::mapping::tree::functor::LeafOnlyRT()}, + typename View::DatumDomain{}}; else - { - constexpr auto sharedMemSize = llama::sizeOf * BlockSize; - auto& sharedMem = alpaka::allocVar(acc); - return llama::View{sharedMapping, llama::Array{&sharedMem[0]}}; - } - } + return llama::mapping::SoA{typename View::ArrayDomain{BlockSize}, typename View::DatumDomain{}}; + }(); + + // if there is only 1 thread per block, avoid using shared + // memory + if constexpr (BlockSize / Elems == 1) + return llama::allocViewStack(); else - return int{}; // dummy + { + constexpr auto sharedMemSize = llama::sizeOf * BlockSize; + auto& sharedMem = alpaka::allocVar(acc); + return llama::View{sharedMapping, llama::Array{&sharedMem[0]}}; + } }(); const auto ti = alpaka::getIdx(acc)[0u]; @@ -107,23 +102,49 @@ struct UpdateKernel { const auto start2 = b * BlockSize; const auto end2 = alpaka::math::min(acc, start2 + BlockSize, ProblemSize) - start2; - if constexpr (USE_SHARED) + + LLAMA_INDEPENDENT_DATA + for (auto pos2 = decltype(end2)(0); pos2 + ti < end2; pos2 += BlockSize / Elems) + sharedView(pos2 + tbi) = particles(start2 + pos2 + tbi); + alpaka::syncBlockThreads(acc); + + LLAMA_INDEPENDENT_DATA + for (auto pos2 = decltype(end2)(0); pos2 < end2; ++pos2) { LLAMA_INDEPENDENT_DATA - for (auto pos2 = decltype(end2)(0); pos2 + ti < end2; pos2 += BlockSize / Elems) - sharedView(pos2 + tbi) = particles(start2 + pos2 + tbi); - alpaka::syncBlockThreads(acc); + for (auto i = start; i < end; ++i) + pPInteraction(particles(i), sharedView(pos2), ts); } + + alpaka::syncBlockThreads(acc); + } + } +}; + +template +struct UpdateKernel +{ + template + LLAMA_FN_HOST_ACC_INLINE void operator()(const Acc& acc, View particles, FP ts) const + { + const auto ti = alpaka::getIdx(acc)[0u]; + const auto tbi = alpaka::getIdx(acc)[0]; + + 
const auto start = ti * Elems; + const auto end = alpaka::math::min(acc, start + Elems, ProblemSize); + LLAMA_INDEPENDENT_DATA + for (std::size_t b = 0; b < (ProblemSize + BlockSize - 1u) / BlockSize; ++b) + { + const auto start2 = b * BlockSize; + const auto end2 = alpaka::math::min(acc, start2 + BlockSize, ProblemSize) - start2; + LLAMA_INDEPENDENT_DATA for (auto pos2 = decltype(end2)(0); pos2 < end2; ++pos2) + { LLAMA_INDEPENDENT_DATA - for (auto i = start; i < end; ++i) - if constexpr (USE_SHARED) - pPInteraction(particles(i), sharedView(pos2), ts); - else + for (auto i = start; i < end; ++i) pPInteraction(particles(i), particles(start2 + pos2), ts); - if constexpr (USE_SHARED) - alpaka::syncBlockThreads(acc); + } } } }; @@ -237,7 +258,8 @@ int main() for (std::size_t s = 0; s < STEPS; ++s) { - UpdateKernel updateKernel; + // UpdateKernel updateKernel; + UpdateKernelSM updateKernel; alpaka::exec(queue, workdiv, updateKernel, accView, ts); chrono.printAndReset("Update kernel"); From 8eb82211f8a2359192fd7281a4560b8d6cc32dda Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Thu, 12 Nov 2020 17:47:06 +0100 Subject: [PATCH 3/8] request C++17 via compiler features --- examples/alpaka/nbody/CMakeLists.txt | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/examples/alpaka/nbody/CMakeLists.txt b/examples/alpaka/nbody/CMakeLists.txt index a4522591b0..401d304226 100644 --- a/examples/alpaka/nbody/CMakeLists.txt +++ b/examples/alpaka/nbody/CMakeLists.txt @@ -1,13 +1,11 @@ cmake_minimum_required (VERSION 3.15) project(llama-alpaka-nbody) -set (CMAKE_CXX_STANDARD 17) -set (CMAKE_CXX_STANDARD_REQUIRED on) - if (NOT TARGET llama::llama) find_package(llama REQUIRED) endif() find_package(alpaka 0.5.0 REQUIRED) alpaka_add_executable(${PROJECT_NAME} nbody.cpp ../../common/alpakaHelpers.hpp ../../common/Stopwatch.hpp) +target_compile_features(${PROJECT_NAME} PRIVATE cxx_std_17) target_compile_definitions(${PROJECT_NAME} PUBLIC LLAMA_FN_HOST_ACC_INLINE=ALPAKA_FN_HOST_ACC) target_link_libraries(${PROJECT_NAME} PRIVATE llama::llama alpaka::alpaka) From bcc381dd1afed64c37cd2031d79fa469ea88312c Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Thu, 12 Nov 2020 17:50:39 +0100 Subject: [PATCH 4/8] drop shared tree mapping --- examples/alpaka/nbody/nbody.cpp | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) diff --git a/examples/alpaka/nbody/nbody.cpp b/examples/alpaka/nbody/nbody.cpp index 5fd23f0910..e2ce549342 100644 --- a/examples/alpaka/nbody/nbody.cpp +++ b/examples/alpaka/nbody/nbody.cpp @@ -17,10 +17,6 @@ #include constexpr auto MAPPING = 0; ///< 0 native AoS, 1 native SoA, 2 native SoA (separate blobs), 3 tree AoS, 4 tree SoA -constexpr auto USE_SHARED = true; ///< defines whether shared memory shall be used -constexpr auto USE_SHARED_TREE = true; ///< defines whether the shared memory shall use tree mapping or - ///< native mapping - constexpr auto PROBLEM_SIZE = 16 * 1024; ///< total number of particles constexpr auto BLOCK_SIZE = 256; ///< number of elements per block constexpr auto STEPS = 5; ///< number of steps to calculate @@ -70,15 +66,8 @@ struct UpdateKernelSM LLAMA_FN_HOST_ACC_INLINE void operator()(const Acc& acc, View particles, FP ts) const { auto sharedView = [&] { - const auto sharedMapping = [&] { - if constexpr (USE_SHARED_TREE) - return llama::mapping::tree::Mapping{ - typename View::ArrayDomain{BlockSize}, - llama::Tuple{llama::mapping::tree::functor::LeafOnlyRT()}, - typename View::DatumDomain{}}; - else - return 
llama::mapping::SoA{typename View::ArrayDomain{BlockSize}, typename View::DatumDomain{}}; - }(); + const auto sharedMapping + = llama::mapping::SoA{typename View::ArrayDomain{BlockSize}, typename View::DatumDomain{}}; // if there is only 1 thread per block, avoid using shared // memory From f8ac331c3113ce495b15451cd543b7e1af6c2b6c Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Thu, 12 Nov 2020 17:56:07 +0100 Subject: [PATCH 5/8] replace exception in constexpr code by static_assert --- include/llama/mapping/SoA.hpp | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/include/llama/mapping/SoA.hpp b/include/llama/mapping/SoA.hpp index 833f67f026..4ea75bac45 100644 --- a/include/llama/mapping/SoA.hpp +++ b/include/llama/mapping/SoA.hpp @@ -5,6 +5,8 @@ #include "Common.hpp" +#include + namespace llama::mapping { /// Struct of array mapping. Used to create a \ref View via \ref allocView. @@ -78,10 +80,13 @@ namespace llama::mapping index++; }); if (!found) - throw "Passed TargetDatumCoord must be in datum domain"; + return std::numeric_limits::max(); return index; } (); + static_assert( + blob != std::numeric_limits::max(), + "Passed TargetDatumCoord must be in datum domain"); LLAMA_FORCE_INLINE_RECURSIVE const auto offset = LinearizeArrayDomainFunctor{}(coord, arrayDomainSize) @@ -93,7 +98,9 @@ namespace llama::mapping LLAMA_FORCE_INLINE_RECURSIVE const auto offset = LinearizeArrayDomainFunctor{}(coord, arrayDomainSize) * sizeof(GetType>) - + offsetOf> * LinearizeArrayDomainFunctor{}.size(arrayDomainSize); + + offsetOf< + DatumDomain, + DatumCoord> * LinearizeArrayDomainFunctor{}.size(arrayDomainSize); return {0, offset}; } } From 2f3761d1b70b9252bbafbdedefd8e6e289308ce6 Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Thu, 12 Nov 2020 18:56:04 +0100 Subject: [PATCH 6/8] workaround nvcc bugs --- examples/alpaka/nbody/nbody.cpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/examples/alpaka/nbody/nbody.cpp b/examples/alpaka/nbody/nbody.cpp index e2ce549342..d87298e259 100644 --- a/examples/alpaka/nbody/nbody.cpp +++ b/examples/alpaka/nbody/nbody.cpp @@ -67,7 +67,7 @@ struct UpdateKernelSM { auto sharedView = [&] { const auto sharedMapping - = llama::mapping::SoA{typename View::ArrayDomain{BlockSize}, typename View::DatumDomain{}}; + = llama::mapping::SoA(typename View::ArrayDomain{BlockSize}, typename View::DatumDomain{}); // bug: nvcc 11.1 cannot have {} to call ctor // if there is only 1 thread per block, avoid using shared // memory @@ -77,7 +77,10 @@ struct UpdateKernelSM { constexpr auto sharedMemSize = llama::sizeOf * BlockSize; auto& sharedMem = alpaka::allocVar(acc); - return llama::View{sharedMapping, llama::Array{&sharedMem[0]}}; + return llama::View{ + sharedMapping, + llama::Array{ + &sharedMem[0]}}; // bug: nvcc 11.1 needs explicit template args for llama::Array } }(); From 67730139631259d72fd8bd4f0951a0a5d64c5773 Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Thu, 12 Nov 2020 19:38:23 +0100 Subject: [PATCH 7/8] simplify kernel not using shared memory --- examples/alpaka/nbody/nbody.cpp | 23 +++++++---------------- 1 file changed, 7 insertions(+), 16 deletions(-) diff --git a/examples/alpaka/nbody/nbody.cpp b/examples/alpaka/nbody/nbody.cpp index d87298e259..34003820f2 100644 --- a/examples/alpaka/nbody/nbody.cpp +++ b/examples/alpaka/nbody/nbody.cpp @@ -66,8 +66,9 @@ struct UpdateKernelSM LLAMA_FN_HOST_ACC_INLINE void operator()(const Acc& acc, View particles, FP ts) const { auto sharedView = 
[&] { - const auto sharedMapping - = llama::mapping::SoA(typename View::ArrayDomain{BlockSize}, typename View::DatumDomain{}); // bug: nvcc 11.1 cannot have {} to call ctor + const auto sharedMapping = llama::mapping::SoA( + typename View::ArrayDomain{BlockSize}, + typename View::DatumDomain{}); // bug: nvcc 11.1 cannot have {} to call ctor // if there is only 1 thread per block, avoid using shared // memory @@ -107,7 +108,6 @@ struct UpdateKernelSM for (auto i = start; i < end; ++i) pPInteraction(particles(i), sharedView(pos2), ts); } - alpaka::syncBlockThreads(acc); } } @@ -120,23 +120,15 @@ struct UpdateKernel LLAMA_FN_HOST_ACC_INLINE void operator()(const Acc& acc, View particles, FP ts) const { const auto ti = alpaka::getIdx(acc)[0u]; - const auto tbi = alpaka::getIdx(acc)[0]; - const auto start = ti * Elems; const auto end = alpaka::math::min(acc, start + Elems, ProblemSize); + LLAMA_INDEPENDENT_DATA - for (std::size_t b = 0; b < (ProblemSize + BlockSize - 1u) / BlockSize; ++b) + for (auto j = 0; j < ProblemSize; ++j) { - const auto start2 = b * BlockSize; - const auto end2 = alpaka::math::min(acc, start2 + BlockSize, ProblemSize) - start2; - LLAMA_INDEPENDENT_DATA - for (auto pos2 = decltype(end2)(0); pos2 < end2; ++pos2) - { - LLAMA_INDEPENDENT_DATA - for (auto i = start; i < end; ++i) - pPInteraction(particles(i), particles(start2 + pos2), ts); - } + for (auto i = start; i < end; ++i) + pPInteraction(particles(i), particles(j), ts); } } }; @@ -148,7 +140,6 @@ struct MoveKernel LLAMA_FN_HOST_ACC_INLINE void operator()(const Acc& acc, View particles, FP ts) const { const auto ti = alpaka::getIdx(acc)[0]; - const auto start = ti * Elems; const auto end = alpaka::math::min(acc, start + Elems, ProblemSize); From 0338a82a663d1c9ca66275c079720b636af61783 Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Fri, 13 Nov 2020 21:24:27 +0100 Subject: [PATCH 8/8] add constant to choose kernel --- examples/alpaka/nbody/nbody.cpp | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/examples/alpaka/nbody/nbody.cpp b/examples/alpaka/nbody/nbody.cpp index 34003820f2..46546eca7e 100644 --- a/examples/alpaka/nbody/nbody.cpp +++ b/examples/alpaka/nbody/nbody.cpp @@ -17,6 +17,7 @@ #include constexpr auto MAPPING = 0; ///< 0 native AoS, 1 native SoA, 2 native SoA (separate blobs), 3 tree AoS, 4 tree SoA +constexpr auto USE_SHARED_MEMORY = true; ///< use a kernel using shared memory for caching constexpr auto PROBLEM_SIZE = 16 * 1024; ///< total number of particles constexpr auto BLOCK_SIZE = 256; ///< number of elements per block constexpr auto STEPS = 5; ///< number of steps to calculate @@ -241,8 +242,12 @@ int main() for (std::size_t s = 0; s < STEPS; ++s) { - // UpdateKernel updateKernel; - UpdateKernelSM updateKernel; + auto updateKernel = [&] { + if constexpr (USE_SHARED_MEMORY) + return UpdateKernelSM{}; + else + return UpdateKernel{}; + }(); alpaka::exec(queue, workdiv, updateKernel, accView, ts); chrono.printAndReset("Update kernel");
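
Patch 1 rewrites pPInteraction, which accumulates the effect of particle j on particle i's velocity: it squares each component of the distance vector, sums them together with the EPS2 softening term, forms 1/dist^3 via distSixth and invDistCube, and folds mass and timestep into the single factor sts. The following is a minimal sketch of the same sequence of operations on a plain struct; Vec3, Particle and the EPS2 value are illustrative assumptions, not taken from the patch:

#include <cmath>

using FP = float;
constexpr FP EPS2 = 0.01f; // softening constant; value assumed, the patch does not show it

struct Vec3
{
    FP x, y, z;
};

struct Particle
{
    Vec3 pos, vel;
    FP mass;
};

inline void pPInteraction(Particle& pi, const Particle& pj, FP ts)
{
    // component-wise difference, then component-wise square, exactly as the patch does
    FP dx = pi.pos.x - pj.pos.x;
    FP dy = pi.pos.y - pj.pos.y;
    FP dz = pi.pos.z - pj.pos.z;
    dx *= dx;
    dy *= dy;
    dz *= dz;

    // softened squared distance and 1/dist^3
    const FP distSqr = EPS2 + dx + dy + dz;
    const FP distSixth = distSqr * distSqr * distSqr;
    const FP invDistCube = 1.0f / std::sqrt(distSixth);

    // the patch folds mass and timestep into the single factor sts and scales
    // the (squared) distance components with it, as the original code did
    const FP sts = pj.mass * invDistCube * ts;
    pi.vel.x += dx * sts;
    pi.vel.y += dy * sts;
    pi.vel.z += dz * sts;
}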
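
Patch 2 splits the update into UpdateKernelSM, which stages one block of BLOCK_SIZE particles in a shared-memory LLAMA view before running the pairwise loop over that tile, and a plain UpdateKernel. Below is a rough serial sketch of that tiling structure, reusing the Particle, FP and pPInteraction from the sketch above; the shared-memory view, the cooperative copy by the block's threads and the syncBlockThreads barriers exist only as comments here:

#include <algorithm>
#include <cstddef>
#include <vector>

constexpr std::size_t PROBLEM_SIZE = 16 * 1024; // values as in the example
constexpr std::size_t BLOCK_SIZE = 256;

void update(std::vector<Particle>& particles, FP ts)
{
    Particle tile[BLOCK_SIZE]; // stand-in for the shared-memory view of UpdateKernelSM
    for (std::size_t b = 0; b < (PROBLEM_SIZE + BLOCK_SIZE - 1) / BLOCK_SIZE; ++b)
    {
        const std::size_t start2 = b * BLOCK_SIZE;
        const std::size_t end2 = std::min(start2 + BLOCK_SIZE, PROBLEM_SIZE) - start2;

        // on the device this copy is done cooperatively by the threads of a block,
        // followed by alpaka::syncBlockThreads(acc)
        for (std::size_t j = 0; j < end2; ++j)
            tile[j] = particles[start2 + j];

        // every particle interacts with the staged tile; on the device each thread
        // handles a small range [start, end) of i, serially we just loop over all i
        for (std::size_t i = 0; i < PROBLEM_SIZE; ++i)
            for (std::size_t j = 0; j < end2; ++j)
                pPInteraction(particles[i], tile[j], ts);
    }
}

The plain UpdateKernel, as simplified in patch 7, drops the staging entirely and lets each thread loop j directly over all PROBLEM_SIZE particles.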
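
Patch 5 removes a throw from the immediately invoked lambda that computes the blob index in the SoA mapping: a throw-expression may not be reached while a constant expression is being evaluated, so the lambda now returns std::numeric_limits<std::size_t>::max() as a sentinel and a static_assert reports the failure. A generic sketch of that pattern follows; the array search merely stands in for the datum-domain iteration and is not LLAMA's actual code:

#include <array>
#include <cstddef>
#include <limits>

template<std::size_t N>
constexpr std::size_t indexOf(const std::array<int, N>& values, int needle)
{
    for (std::size_t i = 0; i < N; ++i)
        if (values[i] == needle)
            return i;
    return std::numeric_limits<std::size_t>::max(); // sentinel instead of `throw`
}

constexpr std::array<int, 3> blobs{10, 20, 30};
constexpr std::size_t blob = indexOf(blobs, 20);
static_assert(blob != std::numeric_limits<std::size_t>::max(), "needle must be in the array");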
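
Patch 8 selects between the two kernels with an immediately invoked lambda containing if constexpr, so the two branches can return objects of different types, which a plain if or the ternary operator could not. A small self-contained sketch of the idiom; the kernel names and bodies are placeholders:

#include <cstdio>

constexpr bool USE_SHARED_MEMORY = true;

struct KernelSM { void operator()() const { std::puts("shared-memory kernel"); } };
struct KernelSimple { void operator()() const { std::puts("simple kernel"); } };

int main()
{
    auto kernel = [] {
        if constexpr (USE_SHARED_MEMORY)
            return KernelSM{};
        else
            return KernelSimple{};
    }();
    kernel(); // the choice is resolved at compile time; no variant or virtual dispatch needed
}

Because the untaken branch is a discarded statement, only the chosen kernel type participates in the return type deduction of the lambda.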