alpaka-group · bernhardmgruber · Nov 13, 2020 · Nov 12, 2020 · Nov 12, 2020 · Nov 12, 2020
diff --git a/examples/alpaka/nbody/CMakeLists.txt b/examples/alpaka/nbody/CMakeLists.txt
@@ -1,13 +1,11 @@
 cmake_minimum_required (VERSION 3.15)
 project(llama-alpaka-nbody)
 
-set (CMAKE_CXX_STANDARD 17)
-set (CMAKE_CXX_STANDARD_REQUIRED on)
-
 if (NOT TARGET llama::llama)
 	find_package(llama REQUIRED)
 endif()
 find_package(alpaka 0.5.0 REQUIRED)
 alpaka_add_executable(${PROJECT_NAME} nbody.cpp ../../common/alpakaHelpers.hpp ../../common/Stopwatch.hpp)
+target_compile_features(${PROJECT_NAME} PRIVATE cxx_std_17)
 target_compile_definitions(${PROJECT_NAME} PUBLIC LLAMA_FN_HOST_ACC_INLINE=ALPAKA_FN_HOST_ACC)
 target_link_libraries(${PROJECT_NAME} PRIVATE llama::llama alpaka::alpaka)
diff --git a/examples/alpaka/nbody/nbody.cpp b/examples/alpaka/nbody/nbody.cpp
@@ -17,10 +17,7 @@
 #include <utility>
 
 constexpr auto MAPPING = 0; ///< 0 native AoS, 1 native SoA, 2 native SoA (separate blobs), 3 tree AoS, 4 tree SoA
-constexpr auto USE_SHARED = true; ///< defines whether shared memory shall be used
-constexpr auto USE_SHARED_TREE = true; ///< defines whether the shared memory shall use tree mapping or
-                                       ///< native mapping
-
+constexpr auto USE_SHARED_MEMORY = true; ///< use a kernel using shared memory for caching
 constexpr auto PROBLEM_SIZE = 16 * 1024; ///< total number of particles
 constexpr auto BLOCK_SIZE = 256; ///< number of elements per block
 constexpr auto STEPS = 5; ///< number of steps to calculate
@@ -51,56 +48,42 @@ using Particle = llama::DS<
     llama::DE<tag::Mass, FP>>;
 // clang-format on
 
-/// Helper function for particle particle interaction. Gets two virtual
-/// datums like they are real particle objects
-template <typename VirtualDatum1, typename VirtualDatum2>
-LLAMA_FN_HOST_ACC_INLINE void pPInteraction(VirtualDatum1 p1, VirtualDatum2 p2, FP ts)
+template <typename VirtualParticleI, typename VirtualParticleJ>
+LLAMA_FN_HOST_ACC_INLINE void pPInteraction(VirtualParticleI pi, VirtualParticleJ pj, FP ts)
 {
-    // Creating tempory virtual datum object for distance on stack:
-    auto distance = p1(tag::Pos()) - p2(tag::Pos());
-    distance *= distance; // square for each element
-    const FP distSqr = EPS2 + distance(tag::X()) + distance(tag::Y()) + distance(tag::Z());
+    auto dist = pi(tag::Pos()) - pj(tag::Pos());
+    dist *= dist;
+    const FP distSqr = EPS2 + dist(tag::X()) + dist(tag::Y()) + dist(tag::Z());
     const FP distSixth = distSqr * distSqr * distSqr;
     const FP invDistCube = 1.0f / std::sqrt(distSixth);
-    const FP s = p2(tag::Mass()) * invDistCube;
-    distance *= s * ts;
-    p1(tag::Vel()) += distance;
+    const FP sts = pj(tag::Mass()) * invDistCube * ts;
+    pi(tag::Vel()) += dist * sts;
 }
 
-/// Alpaka kernel for updating the speed of every particle based on the
-/// distance and mass to each other particle. Has complexity O(N²).
 template <std::size_t ProblemSize, std::size_t Elems, std::size_t BlockSize>
-struct UpdateKernel
+struct UpdateKernelSM
 {
     template <typename Acc, typename View>
     LLAMA_FN_HOST_ACC_INLINE void operator()(const Acc& acc, View particles, FP ts) const
     {
-        [[maybe_unused]] auto sharedView = [&] {
-            if constexpr (USE_SHARED)
+        auto sharedView = [&] {
+            const auto sharedMapping = llama::mapping::SoA(
+                typename View::ArrayDomain{BlockSize},
+                typename View::DatumDomain{}); // bug: nvcc 11.1 cannot have {} to call ctor
+
+            // if there is only 1 thread per block, avoid using shared
+            // memory
+            if constexpr (BlockSize / Elems == 1)
+                return llama::allocViewStack<View::ArrayDomain::rank, typename View::DatumDomain>();
+            else
             {
-                const auto sharedMapping = [&] {
-                    if constexpr (USE_SHARED_TREE)
-                        return llama::mapping::tree::Mapping{
-                            typename View::ArrayDomain{BlockSize},
-                            llama::Tuple{llama::mapping::tree::functor::LeafOnlyRT()},
-                            typename View::DatumDomain{}};
-                    else
-                        return llama::mapping::SoA{typename View::ArrayDomain{BlockSize}, typename View::DatumDomain{}};
-                }();
-
-                // if there is only 1 thread per block, avoid using shared
-                // memory
-                if constexpr (BlockSize / Elems == 1)
-                    return llama::allocViewStack<View::ArrayDomain::rank, typename View::DatumDomain>();
-                else
-                {
-                    constexpr auto sharedMemSize = llama::sizeOf<typename View::DatumDomain> * BlockSize;
-                    auto& sharedMem = alpaka::allocVar<std::byte[sharedMemSize], __COUNTER__>(acc);
-                    return llama::View{sharedMapping, llama::Array{&sharedMem[0]}};
-                }
+                constexpr auto sharedMemSize = llama::sizeOf<typename View::DatumDomain> * BlockSize;
+                auto& sharedMem = alpaka::allocVar<std::byte[sharedMemSize], __COUNTER__>(acc);
+                return llama::View{
+                    sharedMapping,
+                    llama::Array<std::byte*, 1>{
+                        &sharedMem[0]}}; // bug: nvcc 11.1 needs explicit template args for llama::Array
             }
-            else
-                return int{}; // dummy
         }();
 
         const auto ti = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0u];
@@ -113,37 +96,51 @@ struct UpdateKernel
         {
             const auto start2 = b * BlockSize;
             const auto end2 = alpaka::math::min(acc, start2 + BlockSize, ProblemSize) - start2;
-            if constexpr (USE_SHARED)
+
+            LLAMA_INDEPENDENT_DATA
+            for (auto pos2 = decltype(end2)(0); pos2 + ti < end2; pos2 += BlockSize / Elems)
+                sharedView(pos2 + tbi) = particles(start2 + pos2 + tbi);
+            alpaka::syncBlockThreads(acc);
+
+            LLAMA_INDEPENDENT_DATA
+            for (auto pos2 = decltype(end2)(0); pos2 < end2; ++pos2)
             {
                 LLAMA_INDEPENDENT_DATA
-                for (auto pos2 = decltype(end2)(0); pos2 + ti < end2; pos2 += BlockSize / Elems)
-                    sharedView(pos2 + tbi) = particles(start2 + pos2 + tbi);
-                alpaka::syncBlockThreads(acc);
+                for (auto i = start; i < end; ++i)
+                    pPInteraction(particles(i), sharedView(pos2), ts);
             }
+            alpaka::syncBlockThreads(acc);
+        }
+    }
+};
+
+template <std::size_t ProblemSize, std::size_t Elems, std::size_t BlockSize>
+struct UpdateKernel
+{
+    template <typename Acc, typename View>
+    LLAMA_FN_HOST_ACC_INLINE void operator()(const Acc& acc, View particles, FP ts) const
+    {
+        const auto ti = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0u];
+        const auto start = ti * Elems;
+        const auto end = alpaka::math::min(acc, start + Elems, ProblemSize);
+
+        LLAMA_INDEPENDENT_DATA
+        for (auto j = 0; j < ProblemSize; ++j)
+        {
             LLAMA_INDEPENDENT_DATA
-            for (auto pos2 = decltype(end2)(0); pos2 < end2; ++pos2)
-                LLAMA_INDEPENDENT_DATA
             for (auto i = start; i < end; ++i)
-                if constexpr (USE_SHARED)
-                    pPInteraction(particles(i), sharedView(pos2), ts);
-                else
-                    pPInteraction(particles(i), particles(start2 + pos2), ts);
-            if constexpr (USE_SHARED)
-                alpaka::syncBlockThreads(acc);
+                pPInteraction(particles(i), particles(j), ts);
         }
     }
 };
 
-/// Alpaka kernel for moving each particle with its speed. Has complexity
-/// O(N).
 template <std::size_t ProblemSize, std::size_t Elems>
 struct MoveKernel
 {
     template <typename Acc, typename View>
     LLAMA_FN_HOST_ACC_INLINE void operator()(const Acc& acc, View particles, FP ts) const
     {
         const auto ti = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0];
-
         const auto start = ti * Elems;
         const auto end = alpaka::math::min(acc, start + Elems, ProblemSize);
 
@@ -153,7 +150,7 @@ struct MoveKernel
     }
 };
 
-int main(int argc, char** argv)
+int main()
 {
     using Dim = alpaka::DimInt<1>;
     using Size = std::size_t;
@@ -245,7 +242,12 @@ int main(int argc, char** argv)
 
     for (std::size_t s = 0; s < STEPS; ++s)
     {
-        UpdateKernel<PROBLEM_SIZE, elemCount, BLOCK_SIZE> updateKernel;
+        auto updateKernel = [&] {
+            if constexpr (USE_SHARED_MEMORY)
+                return UpdateKernelSM<PROBLEM_SIZE, elemCount, BLOCK_SIZE>{};
+            else
+                return UpdateKernel<PROBLEM_SIZE, elemCount, BLOCK_SIZE>{};
+        }();
         alpaka::exec<Acc>(queue, workdiv, updateKernel, accView, ts);
 
         chrono.printAndReset("Update kernel");

diff --git a/include/llama/mapping/SoA.hpp b/include/llama/mapping/SoA.hpp
@@ -5,6 +5,8 @@
 
 #include "Common.hpp"
 
+#include <limits>
+
 namespace llama::mapping
 {
     /// Struct of array mapping. Used to create a \ref View via \ref allocView.
@@ -78,10 +80,13 @@ namespace llama::mapping
                             index++;
                     });
                     if (!found)
-                        throw "Passed TargetDatumCoord must be in datum domain";
+                        return std::numeric_limits<std::size_t>::max();
                     return index;
                 }
                 ();
+                static_assert(
+                    blob != std::numeric_limits<std::size_t>::max(),
+                    "Passed TargetDatumCoord must be in datum domain");
 
                 LLAMA_FORCE_INLINE_RECURSIVE
                 const auto offset = LinearizeArrayDomainFunctor{}(coord, arrayDomainSize)
@@ -93,7 +98,9 @@ namespace llama::mapping
                 LLAMA_FORCE_INLINE_RECURSIVE
                 const auto offset = LinearizeArrayDomainFunctor{}(coord, arrayDomainSize)
                         * sizeof(GetType<DatumDomain, DatumCoord<DatumDomainCoord...>>)
-                    + offsetOf<DatumDomain, DatumCoord<DatumDomainCoord...>> * LinearizeArrayDomainFunctor{}.size(arrayDomainSize);
+                    + offsetOf<
+                          DatumDomain,
+                          DatumCoord<DatumDomainCoord...>> * LinearizeArrayDomainFunctor{}.size(arrayDomainSize);
                 return {0, offset};
             }
         }