diff --git a/docs/pages/simd.rst b/docs/pages/simd.rst index b7e9641967..af0369ac94 100644 --- a/docs/pages/simd.rst +++ b/docs/pages/simd.rst @@ -48,36 +48,19 @@ SIMD library integration with LLAMA In order for LLAMA to make use of a third-party SIMD library, the class template :cpp:`llama::SimdTraits` has to be specialized for the SIMD types of the SIMD library. - -Here is an exemplary integration of `std::experimental::simd` with LLAMA: - -.. code-block:: C++ - - #include - #include - - namespace stdx = std::experimental; - template - struct llama::SimdTraits> { - using value_type = T; - - inline static constexpr std::size_t lanes = stdx::simd::size(); - - static auto loadUnaligned(const value_type* mem) -> stdx::simd { - return {mem, stdx::element_aligned}; - } - - static void storeUnaligned(stdx::simd simd, value_type* mem) { - simd.copy_to(mem, stdx::element_aligned); - } - }; - Each specialization :cpp:`llama::SimdTraits` must provide: * an alias :cpp:`value_type` to indicate the element type of the Simd. * a :cpp:`static constexpr size_t lanes` variable holding the number of SIMD lanes of the Simd. * a :cpp:`static auto loadUnaligned(const value_type* mem) -> Simd` function, loading a Simd from the given memory address. * a :cpp:`static void storeUnaligned(Simd simd, value_type* mem)` function, storing the given Simd to a given memory address. +* a :cpp:`static auto gather(const value_type* mem, std::array indices) -> Simd` function, + gathering values into a Simd from the memory locations :cpp:`mem[indices[i]]` for each lane :cpp:`i` (the indices count elements of :cpp:`value_type`, not bytes). +* a :cpp:`static void scatter(Simd simd, value_type* mem, std::array indices)` function, + scattering the values from a Simd to the memory locations :cpp:`mem[indices[i]]` for each lane :cpp:`i` (the indices count elements of :cpp:`value_type`, not bytes). + +For an example integration of `xsimd::batch` with LLAMA, see the nbody example. +For an example integration of `std::experimental::simd` with LLAMA, see the `simd.cpp` unit tests. 
LLAMA already provides a specialization of :cpp:`llama::SimdTraits` for the built-in scalar `arithmetic types `_. In that sense, these types are SIMD types from LLAMA's perspective and can be used with the SIMD API in LLAMA. diff --git a/include/llama/Simd.hpp b/include/llama/Simd.hpp index 106be2148b..4a0df5f873 100644 --- a/include/llama/Simd.hpp +++ b/include/llama/Simd.hpp @@ -22,6 +22,10 @@ namespace llama /// address. /// * a `static void storeUnaligned(Simd simd, value_type* mem)` function, storing the given Simd to a given /// memory address. + /// * a `static auto gather(const value_type* mem, std::array indices) -> Simd` function, gathering + /// values into a Simd from the elements `mem[indices[i]]` for each lane i (indices count elements, not bytes). + /// * a `static void scatter(Simd simd, value_type* mem, std::array indices)` function, scattering the + /// values from a Simd to the elements `mem[indices[i]]` for each lane i (indices count elements, not bytes). LLAMA_EXPORT template struct SimdTraits @@ -46,6 +50,16 @@ namespace llama { *mem = t; } + + static LLAMA_FORCE_INLINE auto gather(const value_type* mem, std::array indices) -> T + { + return mem[indices[0]]; + } + + static LLAMA_FORCE_INLINE void scatter(T t, value_type* mem, std::array indices) + { + mem[indices[0]] = t; + } }; /// The number of SIMD simdLanes the given SIMD vector or \ref Simd has. 
If Simd is not a structural \ref Simd @@ -175,6 +189,19 @@ namespace llama namespace internal { + template + inline constexpr auto aosStridedIndices = []() + { + static constexpr auto stride = flatSizeOf< + typename AoSMapping::Permuter::FlatRecordDim, + AoSMapping::fieldAlignment == llama::mapping::FieldAlignment::Align> + / sizeof(ElementType); + std::array indices{}; + for(int i = 0; i < static_cast(Lanes); i++) + indices[i] = i * stride; + return indices; + }(); + template LLAMA_FN_HOST_ACC_INLINE void loadSimdRecord(const T& srcRef, Simd& dstSimd, RecordCoord rc) { @@ -205,15 +232,7 @@ namespace llama else if constexpr(mapping::isAoS) { static_assert(mapping::isAoS); - static constexpr auto srcStride = flatSizeOf< - typename Mapping::Permuter::FlatRecordDim, - Mapping::fieldAlignment == llama::mapping::FieldAlignment::Align>; - const auto* srcBaseAddr = reinterpret_cast(&srcRef(rc)); - ElementSimd elemSimd; // g++-12 really needs the intermediate elemSimd and memcpy - for(auto i = 0; i < Traits::lanes; i++) - reinterpret_cast(&elemSimd)[i] - = *reinterpret_cast(srcBaseAddr + i * srcStride); - std::memcpy(&dstSimd(rc), &elemSimd, sizeof(elemSimd)); + dstSimd(rc) = Traits::gather(&srcRef(rc), aosStridedIndices); } else { @@ -245,14 +264,7 @@ namespace llama } else if constexpr(mapping::isAoS) { - static constexpr auto stride = flatSizeOf< - typename Mapping::Permuter::FlatRecordDim, - Mapping::fieldAlignment == llama::mapping::FieldAlignment::Align>; - auto* dstBaseAddr = reinterpret_cast(&dstRef(rc)); - const ElementSimd elemSimd = srcSimd(rc); - for(auto i = 0; i < Traits::lanes; i++) - *reinterpret_cast(dstBaseAddr + i * stride) - = reinterpret_cast(&elemSimd)[i]; + Traits::scatter(srcSimd(rc), &dstRef(rc), aosStridedIndices); } else { diff --git a/tests/simd.cpp b/tests/simd.cpp index 5e244c6f14..c41be1fdcf 100644 --- a/tests/simd.cpp +++ b/tests/simd.cpp @@ -39,6 +39,19 @@ struct llama::SimdTraits> { simd.copy_to(mem, stdx::element_aligned); } + + static 
LLAMA_FORCE_INLINE auto gather(const value_type* mem, std::array indices) -> stdx::simd + { + // no native support for gather yet + return stdx::simd{[&](auto ic) { return mem[indices[ic]]; }}; + } + + static LLAMA_FORCE_INLINE void scatter(stdx::simd simd, value_type* mem, std::array indices) + { + // no native support for scatter yet + for(std::size_t i = 0; i < lanes; i++) + mem[indices[i]] = simd[i]; + } }; namespace @@ -199,10 +212,10 @@ TEST_CASE("simd.loadSimd.simd.stdsimd") CHECK(s[3] == 4.0f); } -TEST_CASE("simd.loadSimd.record.scalar") +TEMPLATE_TEST_CASE("simd.loadSimd.record.scalar", "", llama::mapping::BindAoS<>, llama::mapping::BindSoA<>) { using ArrayExtents = llama::ArrayExtentsDynamic; - const auto mapping = llama::mapping::SoA(ArrayExtents{1}); + const auto mapping = typename TestType::template fn(ArrayExtents{1}); auto view = llama::allocViewUninitialized(mapping); iotaFillView(view); @@ -222,10 +235,10 @@ TEST_CASE("simd.loadSimd.record.scalar") CHECK(p(tag::Flags{}, llama::RecordCoord<3>{}) == 10); } -TEST_CASE("simd.loadSimd.record.stdsimd") +TEMPLATE_TEST_CASE("simd.loadSimd.record.stdsimd", "", llama::mapping::BindAoS<>, llama::mapping::BindSoA<>) { using ArrayExtents = llama::ArrayExtentsDynamic; - const auto mapping = llama::mapping::SoA(ArrayExtents{16}); + const auto mapping = typename TestType::template fn(ArrayExtents{16}); auto view = llama::allocViewUninitialized(mapping); iotaFillView(view); @@ -290,10 +303,10 @@ TEST_CASE("simd.storeSimd.simd.stdsimd") CHECK(a[3] == 4.0f); } -TEST_CASE("simd.storeSimd.record.scalar") +TEMPLATE_TEST_CASE("simd.storeSimd.record.scalar", "", llama::mapping::BindAoS<>, llama::mapping::BindSoA<>) { using ArrayExtents = llama::ArrayExtentsDynamic; - const auto mapping = llama::mapping::SoA(ArrayExtents{1}); + const auto mapping = typename TestType::template fn(ArrayExtents{1}); auto view = llama::allocViewUninitialized(mapping); llama::SimdN p; @@ -323,11 +336,11 @@ 
TEST_CASE("simd.storeSimd.record.scalar") CHECK(view(0)(tag::Flags{}, llama::RecordCoord<3>{}) == 10); } -TEST_CASE("simd.storeSimd.record.stdsimd") +TEMPLATE_TEST_CASE("simd.storeSimd.record.stdsimd", "", llama::mapping::BindAoS<>, llama::mapping::BindSoA<>) { using ArrayExtents = llama::ArrayExtentsDynamic; - const auto mapping = llama::mapping::SoA(ArrayExtents{16}); - auto view = llama::allocView(mapping); + const auto mapping = typename TestType::template fn(ArrayExtents{16}); + auto view = llama::allocViewUninitialized(mapping); llama::SimdN p; auto& x = p(tag::Pos{}, tag::X{}); @@ -358,13 +371,13 @@ TEST_CASE("simd.storeSimd.record.stdsimd") CHECK(view(3)(tag::Mass{}) == 0); } -TEST_CASE("simd.simdForEachN.stdsimd") +TEMPLATE_TEST_CASE("simd.simdForEachN.stdsimd", "", llama::mapping::BindAoS<>, llama::mapping::BindSoA<>) { using ArrayExtents = llama::ArrayExtentsDynamic; for(auto extents : {ArrayExtents{16, 32}, ArrayExtents{11, 7}}) { CAPTURE(extents); - const auto mapping = llama::mapping::SoA(extents); + const auto mapping = typename TestType::template fn(extents); auto view = llama::allocViewUninitialized(mapping); for(int i = 0; i < extents[0]; i++) for(int j = 0; j < extents[1]; j++) @@ -388,13 +401,13 @@ TEST_CASE("simd.simdForEachN.stdsimd") } } -TEST_CASE("simd.simdForEach.stdsimd") +TEMPLATE_TEST_CASE("simd.simdForEach.stdsimd", "", llama::mapping::BindAoS<>, llama::mapping::BindSoA<>) { using ArrayExtents = llama::ArrayExtentsDynamic; for(auto extents : {ArrayExtents{16, 32}, ArrayExtents{11, 7}}) { CAPTURE(extents); - const auto mapping = llama::mapping::SoA(extents); + const auto mapping = typename TestType::template fn(extents); auto view = llama::allocViewUninitialized(mapping); for(int i = 0; i < extents[0]; i++) for(int j = 0; j < extents[1]; j++)