Commit

Implement SIMD gather/scatter

bernhardmgruber committed Jan 1, 2024
1 parent 9680e3d commit abf44ef
Showing 3 changed files with 62 additions and 54 deletions.
31 changes: 7 additions & 24 deletions docs/pages/simd.rst
@@ -48,36 +48,19 @@ SIMD library integration with LLAMA

For LLAMA to make use of a third-party SIMD library,
the class template :cpp:`llama::SimdTraits` has to be specialized for that library's SIMD types.

Here is an example integration of `std::experimental::simd<T, Abi>` with LLAMA:

.. code-block:: C++

#include <llama/llama.hpp>
#include <experimental/simd>

namespace stdx = std::experimental;
template<typename T, typename Abi>
struct llama::SimdTraits<stdx::simd<T, Abi>> {
using value_type = T;

inline static constexpr std::size_t lanes = stdx::simd<T, Abi>::size();

static auto loadUnaligned(const value_type* mem) -> stdx::simd<T, Abi> {
return {mem, stdx::element_aligned};
}

static void storeUnaligned(stdx::simd<T, Abi> simd, value_type* mem) {
simd.copy_to(mem, stdx::element_aligned);
}
};

Each specialization :cpp:`llama::SimdTraits<Simd>` must provide:

* an alias :cpp:`value_type` to indicate the element type of the Simd.
* a :cpp:`static constexpr size_t lanes` variable holding the number of SIMD lanes of the Simd.
* a :cpp:`static auto loadUnaligned(const value_type* mem) -> Simd` function, loading a Simd from the given memory address.
* a :cpp:`static void storeUnaligned(Simd simd, value_type* mem)` function, storing the given Simd to a given memory address.
* a :cpp:`static auto gather(const value_type* mem, std::array<int, lanes> indices) -> Simd` function,
gathering values into a Simd from the memory addresses identified by :cpp:`mem + indices * sizeof(value_type)`.
* a :cpp:`static void scatter(Simd simd, value_type* mem, std::array<int, lanes> indices)` function,
scattering the values from a Simd to the memory addresses identified by :cpp:`mem + indices * sizeof(value_type)`.

For an example integration of `xsimd::batch<T, A>` with LLAMA, see the nbody example.
For an example integration of `std::experimental::simd<T, Abi>` with LLAMA, see the `simd.cpp` unit tests.
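
Combining the example above with the unit tests of this commit, a complete specialization including gather and scatter could look like the following sketch
(both are emulated lane by lane, since `std::experimental::simd` offers no native gather/scatter support yet):

.. code-block:: C++

    #include <llama/llama.hpp>
    #include <array>
    #include <experimental/simd>

    namespace stdx = std::experimental;
    template<typename T, typename Abi>
    struct llama::SimdTraits<stdx::simd<T, Abi>> {
        using value_type = T;

        inline static constexpr std::size_t lanes = stdx::simd<T, Abi>::size();

        static auto loadUnaligned(const value_type* mem) -> stdx::simd<T, Abi> {
            return {mem, stdx::element_aligned};
        }

        static void storeUnaligned(stdx::simd<T, Abi> simd, value_type* mem) {
            simd.copy_to(mem, stdx::element_aligned);
        }

        static auto gather(const value_type* mem, std::array<int, lanes> indices) -> stdx::simd<T, Abi> {
            // emulated: the generator constructor loads one lane per index
            return stdx::simd<T, Abi>{[&](auto ic) { return mem[indices[ic]]; }};
        }

        static void scatter(stdx::simd<T, Abi> simd, value_type* mem, std::array<int, lanes> indices) {
            // emulated: store each lane at its own index
            for(std::size_t i = 0; i < lanes; i++)
                mem[indices[i]] = simd[i];
        }
    };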

LLAMA already provides a specialization of :cpp:`llama::SimdTraits` for the built-in scalar `arithmetic types <https://en.cppreference.com/w/c/language/arithmetic_types>`_.
In that sense, these types are SIMD types from LLAMA's perspective and can be used with the SIMD API in LLAMA.
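
Because that specialization reports a single lane, a sketch like the following should hold (using :cpp:`llama::simdLanes` from `Simd.hpp`):

.. code-block:: C++

    static_assert(llama::simdLanes<float> == 1);  // scalars count as single-lane SIMD types
    static_assert(llama::simdLanes<double> == 1);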
46 changes: 29 additions & 17 deletions include/llama/Simd.hpp
@@ -22,6 +22,10 @@ namespace llama
/// address.
/// * a `static void storeUnaligned(Simd simd, value_type* mem)` function, storing the given Simd to a given
/// memory address.
/// * a `static auto gather(const value_type* mem, std::array<int, lanes> indices) -> Simd` function, gathering
/// values into a Simd from the memory addresses identified by mem + indices * sizeof(value_type).
/// * a `static void scatter(Simd simd, value_type* mem, std::array<int, lanes> indices)` function, scattering the
/// values from a Simd to the memory addresses identified by mem + indices * sizeof(value_type).
LLAMA_EXPORT
template<typename Simd, typename SFINAE = void>
struct SimdTraits
@@ -46,6 +50,16 @@
{
*mem = t;
}

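// illustration: with lanes == 1, gather and scatter degenerate to a scalar load/store through indices[0]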
static LLAMA_FORCE_INLINE auto gather(const value_type* mem, std::array<int, lanes> indices) -> T
{
return mem[indices[0]];
}

static LLAMA_FORCE_INLINE void scatter(T t, value_type* mem, std::array<int, lanes> indices)
{
mem[indices[0]] = t;
}
};

/// The number of SIMD lanes the given SIMD vector or \ref Simd<T> has. If Simd is not a structural \ref Simd
@@ -175,6 +189,19 @@

namespace internal
{
template<typename AoSMapping, typename ElementType, std::size_t Lanes>
inline constexpr auto aosStridedIndices = []()
{
static constexpr auto stride = flatSizeOf<
typename AoSMapping::Permuter::FlatRecordDim,
AoSMapping::fieldAlignment == llama::mapping::FieldAlignment::Align>
/ sizeof(ElementType);
std::array<int, Lanes> indices{};
for(int i = 0; i < static_cast<int>(Lanes); i++)
indices[i] = i * stride;
return indices;
}();
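
// Illustration: for a flat record dim of three floats in an aligned AoS mapping, flatSizeOf
// yields 12 bytes, so stride == 12 / sizeof(float) == 3 and, with Lanes == 4, the lane
// offsets become {0, 3, 6, 9} -- one field read from each of four consecutive records.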

template<typename T, typename Simd, typename RecordCoord>
LLAMA_FN_HOST_ACC_INLINE void loadSimdRecord(const T& srcRef, Simd& dstSimd, RecordCoord rc)
{
@@ -205,15 +232,7 @@
else if constexpr(mapping::isAoS<Mapping>)
{
static_assert(mapping::isAoS<Mapping>);
static constexpr auto srcStride = flatSizeOf<
typename Mapping::Permuter::FlatRecordDim,
Mapping::fieldAlignment == llama::mapping::FieldAlignment::Align>;
const auto* srcBaseAddr = reinterpret_cast<const std::byte*>(&srcRef(rc));
ElementSimd elemSimd; // g++-12 really needs the intermediate elemSimd and memcpy
for(auto i = 0; i < Traits::lanes; i++)
reinterpret_cast<FieldType*>(&elemSimd)[i]
= *reinterpret_cast<const FieldType*>(srcBaseAddr + i * srcStride);
std::memcpy(&dstSimd(rc), &elemSimd, sizeof(elemSimd));
dstSimd(rc) = Traits::gather(&srcRef(rc), aosStridedIndices<Mapping, FieldType, Traits::lanes>);
}
else
{
@@ -245,14 +264,7 @@
}
else if constexpr(mapping::isAoS<Mapping>)
{
static constexpr auto stride = flatSizeOf<
typename Mapping::Permuter::FlatRecordDim,
Mapping::fieldAlignment == llama::mapping::FieldAlignment::Align>;
auto* dstBaseAddr = reinterpret_cast<std::byte*>(&dstRef(rc));
const ElementSimd elemSimd = srcSimd(rc);
for(auto i = 0; i < Traits::lanes; i++)
*reinterpret_cast<FieldType*>(dstBaseAddr + i * stride)
= reinterpret_cast<const FieldType*>(&elemSimd)[i];
Traits::scatter(srcSimd(rc), &dstRef(rc), aosStridedIndices<Mapping, FieldType, Traits::lanes>);
}
else
{
39 changes: 26 additions & 13 deletions tests/simd.cpp
@@ -39,6 +39,19 @@ struct llama::SimdTraits<stdx::simd<T, Abi>>
{
simd.copy_to(mem, stdx::element_aligned);
}

static LLAMA_FORCE_INLINE auto gather(const value_type* mem, std::array<int, lanes> indices) -> stdx::simd<T, Abi>
{
// no native support for gather yet
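// the generator constructor invokes the lambda once per lane with a compile-time lane index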
return stdx::simd<T, Abi>{[&](auto ic) { return mem[indices[ic]]; }};
}

static LLAMA_FORCE_INLINE void scatter(stdx::simd<T, Abi> simd, value_type* mem, std::array<int, lanes> indices)
{
// no native support for scatter yet
for(std::size_t i = 0; i < lanes; i++)
mem[indices[i]] = simd[i];
}
};

namespace
@@ -199,10 +212,10 @@ TEST_CASE("simd.loadSimd.simd.stdsimd")
CHECK(s[3] == 4.0f);
}

TEST_CASE("simd.loadSimd.record.scalar")
TEMPLATE_TEST_CASE("simd.loadSimd.record.scalar", "", llama::mapping::BindAoS<>, llama::mapping::BindSoA<>)
{
using ArrayExtents = llama::ArrayExtentsDynamic<int, 1>;
const auto mapping = llama::mapping::SoA<ArrayExtents, ParticleSimd>(ArrayExtents{1});
const auto mapping = typename TestType::template fn<ArrayExtents, ParticleSimd>(ArrayExtents{1});
auto view = llama::allocViewUninitialized(mapping);
iotaFillView(view);

@@ -222,10 +235,10 @@ TEST_CASE("simd.loadSimd.record.stdsimd")
CHECK(p(tag::Flags{}, llama::RecordCoord<3>{}) == 10);
}

TEST_CASE("simd.loadSimd.record.stdsimd")
TEMPLATE_TEST_CASE("simd.loadSimd.record.stdsimd", "", llama::mapping::BindAoS<>, llama::mapping::BindSoA<>)
{
using ArrayExtents = llama::ArrayExtentsDynamic<int, 1>;
const auto mapping = llama::mapping::SoA<ArrayExtents, ParticleSimd>(ArrayExtents{16});
const auto mapping = typename TestType::template fn<ArrayExtents, ParticleSimd>(ArrayExtents{16});
auto view = llama::allocViewUninitialized(mapping);
iotaFillView(view);

@@ -290,10 +303,10 @@ TEST_CASE("simd.storeSimd.simd.stdsimd")
CHECK(a[3] == 4.0f);
}

TEST_CASE("simd.storeSimd.record.scalar")
TEMPLATE_TEST_CASE("simd.storeSimd.record.scalar", "", llama::mapping::BindAoS<>, llama::mapping::BindSoA<>)
{
using ArrayExtents = llama::ArrayExtentsDynamic<int, 1>;
const auto mapping = llama::mapping::SoA<ArrayExtents, ParticleSimd>(ArrayExtents{1});
const auto mapping = typename TestType::template fn<ArrayExtents, ParticleSimd>(ArrayExtents{1});
auto view = llama::allocViewUninitialized(mapping);

llama::SimdN<ParticleSimd, 1, stdx::fixed_size_simd> p;
@@ -323,11 +336,11 @@ TEST_CASE("simd.storeSimd.record.scalar")
CHECK(view(0)(tag::Flags{}, llama::RecordCoord<3>{}) == 10);
}

TEST_CASE("simd.storeSimd.record.stdsimd")
TEMPLATE_TEST_CASE("simd.storeSimd.record.stdsimd", "", llama::mapping::BindAoS<>, llama::mapping::BindSoA<>)
{
using ArrayExtents = llama::ArrayExtentsDynamic<int, 1>;
const auto mapping = llama::mapping::SoA<ArrayExtents, ParticleSimd>(ArrayExtents{16});
auto view = llama::allocView(mapping);
const auto mapping = typename TestType::template fn<ArrayExtents, ParticleSimd>(ArrayExtents{16});
auto view = llama::allocViewUninitialized(mapping);

llama::SimdN<ParticleSimd, 3, stdx::fixed_size_simd> p;
auto& x = p(tag::Pos{}, tag::X{});
@@ -358,13 +371,13 @@ TEST_CASE("simd.storeSimd.record.stdsimd")
CHECK(view(3)(tag::Mass{}) == 0);
}

TEST_CASE("simd.simdForEachN.stdsimd")
TEMPLATE_TEST_CASE("simd.simdForEachN.stdsimd", "", llama::mapping::BindAoS<>, llama::mapping::BindSoA<>)
{
using ArrayExtents = llama::ArrayExtentsDynamic<int, 2>;
for(auto extents : {ArrayExtents{16, 32}, ArrayExtents{11, 7}})
{
CAPTURE(extents);
const auto mapping = llama::mapping::SoA<ArrayExtents, Vec2F>(extents);
const auto mapping = typename TestType::template fn<ArrayExtents, Vec2F>(extents);
auto view = llama::allocViewUninitialized(mapping);
for(int i = 0; i < extents[0]; i++)
for(int j = 0; j < extents[1]; j++)
@@ -388,13 +401,13 @@ TEST_CASE("simd.simdForEachN.stdsimd")
}
}

TEST_CASE("simd.simdForEach.stdsimd")
TEMPLATE_TEST_CASE("simd.simdForEach.stdsimd", "", llama::mapping::BindAoS<>, llama::mapping::BindSoA<>)
{
using ArrayExtents = llama::ArrayExtentsDynamic<int, 2>;
for(auto extents : {ArrayExtents{16, 32}, ArrayExtents{11, 7}})
{
CAPTURE(extents);
const auto mapping = llama::mapping::SoA<ArrayExtents, Vec2F>(extents);
const auto mapping = typename TestType::template fn<ArrayExtents, Vec2F>(extents);
auto view = llama::allocViewUninitialized(mapping);
for(int i = 0; i < extents[0]; i++)
for(int j = 0; j < extents[1]; j++)