Commit

Implement SIMD gather/scatter

bernhardmgruber committed Jan 1, 2024
1 parent 9680e3d commit abf44ef
Showing 3 changed files with 62 additions and 54 deletions.
31 changes: 7 additions & 24 deletions docs/pages/simd.rst
@@ -48,36 +48,19 @@ SIMD library integration with LLAMA

For LLAMA to make use of a third-party SIMD library,
the class template :cpp:`llama::SimdTraits` has to be specialized for that library's SIMD types.

Here is an example integration of `std::experimental::simd<T, Abi>` with LLAMA:

.. code-block:: C++

#include <llama/llama.hpp>
#include <experimental/simd>

namespace stdx = std::experimental;
template<typename T, typename Abi>
struct llama::SimdTraits<stdx::simd<T, Abi>> {
using value_type = T;

inline static constexpr std::size_t lanes = stdx::simd<T, Abi>::size();

static auto loadUnaligned(const value_type* mem) -> stdx::simd<T, Abi> {
return {mem, stdx::element_aligned};
}

static void storeUnaligned(stdx::simd<T, Abi> simd, value_type* mem) {
simd.copy_to(mem, stdx::element_aligned);
}
};

Each specialization :cpp:`llama::SimdTraits<Simd>` must provide:

* an alias :cpp:`value_type` to indicate the element type of the Simd.
* a :cpp:`static constexpr size_t lanes` variable holding the number of SIMD lanes of the Simd.
* a :cpp:`static auto loadUnaligned(const value_type* mem) -> Simd` function, loading a Simd from the given memory address.
* a :cpp:`static void storeUnaligned(Simd simd, value_type* mem)` function, storing the given Simd to a given memory address.
* a :cpp:`static auto gather(const value_type* mem, std::array<int, lanes> indices) -> Simd` function,
gathering values into a Simd from the memory addresses identified by :cpp:`mem + indices * sizeof(value_type)`.
* a :cpp:`static void scatter(Simd simd, value_type* mem, std::array<int, lanes> indices)` function,
scattering the values from a Simd to the memory addresses identified by :cpp:`mem + indices * sizeof(value_type)`.

For an example integration of `xsimd::batch<T, A>` with LLAMA, see the nbody example.
For an example integration of `std::experimental::simd<T, Abi>` with LLAMA, see the `simd.cpp` unit tests.
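
Combining the example above with the unit tests of this commit, a complete specialization including gather and scatter could look like the following sketch
(both are emulated lane by lane, since `std::experimental::simd` offers no native gather/scatter support yet):

.. code-block:: C++

    #include <llama/llama.hpp>
    #include <array>
    #include <experimental/simd>

    namespace stdx = std::experimental;
    template<typename T, typename Abi>
    struct llama::SimdTraits<stdx::simd<T, Abi>> {
        using value_type = T;

        inline static constexpr std::size_t lanes = stdx::simd<T, Abi>::size();

        static auto loadUnaligned(const value_type* mem) -> stdx::simd<T, Abi> {
            return {mem, stdx::element_aligned};
        }

        static void storeUnaligned(stdx::simd<T, Abi> simd, value_type* mem) {
            simd.copy_to(mem, stdx::element_aligned);
        }

        static auto gather(const value_type* mem, std::array<int, lanes> indices) -> stdx::simd<T, Abi> {
            // emulated: the generator constructor loads one lane per index
            return stdx::simd<T, Abi>{[&](auto ic) { return mem[indices[ic]]; }};
        }

        static void scatter(stdx::simd<T, Abi> simd, value_type* mem, std::array<int, lanes> indices) {
            // emulated: store each lane at its own index
            for(std::size_t i = 0; i < lanes; i++)
                mem[indices[i]] = simd[i];
        }
    };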

LLAMA already provides a specialization of :cpp:`llama::SimdTraits` for the built-in scalar `arithmetic types <https://en.cppreference.com/w/c/language/arithmetic_types>`_.
In that sense, these types are SIMD types from LLAMA's perspective and can be used with the SIMD API in LLAMA.
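
Because that specialization reports a single lane, a sketch like the following should hold (using :cpp:`llama::simdLanes` from `Simd.hpp`):

.. code-block:: C++

    static_assert(llama::simdLanes<float> == 1);  // scalars count as single-lane SIMD types
    static_assert(llama::simdLanes<double> == 1);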
46 changes: 29 additions & 17 deletions include/llama/Simd.hpp
@@ -22,6 +22,10 @@ namespace llama
/// address.
/// * a `static void storeUnaligned(Simd simd, value_type* mem)` function, storing the given Simd to a given
/// memory address.
/// * a `static auto gather(const value_type* mem, std::array<int, lanes> indices) -> Simd` function, gathering
/// values into a Simd from the memory addresses identified by mem + indices * sizeof(value_type).
/// * a `static void scatter(Simd simd, value_type* mem, std::array<int, lanes> indices)` function, scattering the
/// values from a Simd to the memory addresses identified by mem + indices * sizeof(value_type).
LLAMA_EXPORT
template<typename Simd, typename SFINAE = void>
struct SimdTraits
@@ -46,6 +50,16 @@
{
*mem = t;
}

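// illustration: with lanes == 1, gather and scatter degenerate to a scalar load/store through indices[0]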
static LLAMA_FORCE_INLINE auto gather(const value_type* mem, std::array<int, lanes> indices) -> T
{
return mem[indices[0]];
}

static LLAMA_FORCE_INLINE void scatter(T t, value_type* mem, std::array<int, lanes> indices)
{
mem[indices[0]] = t;
}
};

/// The number of SIMD lanes the given SIMD vector or \ref Simd<T> has. If Simd is not a structural \ref Simd
@@ -175,6 +189,19 @@

namespace internal
{
template<typename AoSMapping, typename ElementType, std::size_t Lanes>
inline constexpr auto aosStridedIndices = []()
{
static constexpr auto stride = flatSizeOf<
typename AoSMapping::Permuter::FlatRecordDim,
AoSMapping::fieldAlignment == llama::mapping::FieldAlignment::Align>
/ sizeof(ElementType);
std::array<int, Lanes> indices{};
for(int i = 0; i < static_cast<int>(Lanes); i++)
indices[i] = i * stride;
return indices;
}();
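
// Illustration: for a flat record dim of three floats in an aligned AoS mapping, flatSizeOf
// yields 12 bytes, so stride == 12 / sizeof(float) == 3 and, with Lanes == 4, the lane
// offsets become {0, 3, 6, 9} -- one field read from each of four consecutive records.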

template<typename T, typename Simd, typename RecordCoord>
LLAMA_FN_HOST_ACC_INLINE void loadSimdRecord(const T& srcRef, Simd& dstSimd, RecordCoord rc)
{
@@ -205,15 +232,7 @@
else if constexpr(mapping::isAoS<Mapping>)
{
static_assert(mapping::isAoS<Mapping>);
static constexpr auto srcStride = flatSizeOf<
typename Mapping::Permuter::FlatRecordDim,
Mapping::fieldAlignment == llama::mapping::FieldAlignment::Align>;
const auto* srcBaseAddr = reinterpret_cast<const std::byte*>(&srcRef(rc));
ElementSimd elemSimd; // g++-12 really needs the intermediate elemSimd and memcpy
for(auto i = 0; i < Traits::lanes; i++)
reinterpret_cast<FieldType*>(&elemSimd)[i]
= *reinterpret_cast<const FieldType*>(srcBaseAddr + i * srcStride);
std::memcpy(&dstSimd(rc), &elemSimd, sizeof(elemSimd));
dstSimd(rc) = Traits::gather(&srcRef(rc), aosStridedIndices<Mapping, FieldType, Traits::lanes>);
}
else
{
@@ -245,14 +264,7 @@
}
else if constexpr(mapping::isAoS<Mapping>)
{
static constexpr auto stride = flatSizeOf<
typename Mapping::Permuter::FlatRecordDim,
Mapping::fieldAlignment == llama::mapping::FieldAlignment::Align>;
auto* dstBaseAddr = reinterpret_cast<std::byte*>(&dstRef(rc));
const ElementSimd elemSimd = srcSimd(rc);
for(auto i = 0; i < Traits::lanes; i++)
*reinterpret_cast<FieldType*>(dstBaseAddr + i * stride)
= reinterpret_cast<const FieldType*>(&elemSimd)[i];
Traits::scatter(srcSimd(rc), &dstRef(rc), aosStridedIndices<Mapping, FieldType, Traits::lanes>);
}
else
{
39 changes: 26 additions & 13 deletions tests/simd.cpp
@@ -39,6 +39,19 @@ struct llama::SimdTraits<stdx::simd<T, Abi>>
{
simd.copy_to(mem, stdx::element_aligned);
}

static LLAMA_FORCE_INLINE auto gather(const value_type* mem, std::array<int, lanes> indices) -> stdx::simd<T, Abi>
{
// no native support for gather yet
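// the generator constructor invokes the lambda once per lane with a compile-time lane index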
return stdx::simd<T, Abi>{[&](auto ic) { return mem[indices[ic]]; }};
}

static LLAMA_FORCE_INLINE void scatter(stdx::simd<T, Abi> simd, value_type* mem, std::array<int, lanes> indices)
{
// no native support for scatter yet
for(std::size_t i = 0; i < lanes; i++)
mem[indices[i]] = simd[i];
}
};

namespace
@@ -199,10 +212,10 @@ TEST_CASE("simd.loadSimd.simd.stdsimd")
CHECK(s[3] == 4.0f);
}

TEST_CASE("simd.loadSimd.record.scalar")
TEMPLATE_TEST_CASE("simd.loadSimd.record.scalar", "", llama::mapping::BindAoS<>, llama::mapping::BindSoA<>)
{
using ArrayExtents = llama::ArrayExtentsDynamic<int, 1>;
const auto mapping = llama::mapping::SoA<ArrayExtents, ParticleSimd>(ArrayExtents{1});
const auto mapping = typename TestType::template fn<ArrayExtents, ParticleSimd>(ArrayExtents{1});
auto view = llama::allocViewUninitialized(mapping);
iotaFillView(view);

@@ -222,10 +235,10 @@ TEST_CASE("simd.loadSimd.record.stdsimd")
CHECK(p(tag::Flags{}, llama::RecordCoord<3>{}) == 10);
}

TEST_CASE("simd.loadSimd.record.stdsimd")
TEMPLATE_TEST_CASE("simd.loadSimd.record.stdsimd", "", llama::mapping::BindAoS<>, llama::mapping::BindSoA<>)
{
using ArrayExtents = llama::ArrayExtentsDynamic<int, 1>;
const auto mapping = llama::mapping::SoA<ArrayExtents, ParticleSimd>(ArrayExtents{16});
const auto mapping = typename TestType::template fn<ArrayExtents, ParticleSimd>(ArrayExtents{16});
auto view = llama::allocViewUninitialized(mapping);
iotaFillView(view);

@@ -290,10 +303,10 @@ TEST_CASE("simd.storeSimd.simd.stdsimd")
CHECK(a[3] == 4.0f);
}

TEST_CASE("simd.storeSimd.record.scalar")
TEMPLATE_TEST_CASE("simd.storeSimd.record.scalar", "", llama::mapping::BindAoS<>, llama::mapping::BindSoA<>)
{
using ArrayExtents = llama::ArrayExtentsDynamic<int, 1>;
const auto mapping = llama::mapping::SoA<ArrayExtents, ParticleSimd>(ArrayExtents{1});
const auto mapping = typename TestType::template fn<ArrayExtents, ParticleSimd>(ArrayExtents{1});
auto view = llama::allocViewUninitialized(mapping);

llama::SimdN<ParticleSimd, 1, stdx::fixed_size_simd> p;
@@ -323,11 +336,11 @@ TEST_CASE("simd.storeSimd.record.scalar")
CHECK(view(0)(tag::Flags{}, llama::RecordCoord<3>{}) == 10);
}

TEST_CASE("simd.storeSimd.record.stdsimd")
TEMPLATE_TEST_CASE("simd.storeSimd.record.stdsimd", "", llama::mapping::BindAoS<>, llama::mapping::BindSoA<>)
{
using ArrayExtents = llama::ArrayExtentsDynamic<int, 1>;
const auto mapping = llama::mapping::SoA<ArrayExtents, ParticleSimd>(ArrayExtents{16});
auto view = llama::allocView(mapping);
const auto mapping = typename TestType::template fn<ArrayExtents, ParticleSimd>(ArrayExtents{16});
auto view = llama::allocViewUninitialized(mapping);

llama::SimdN<ParticleSimd, 3, stdx::fixed_size_simd> p;
auto& x = p(tag::Pos{}, tag::X{});
@@ -358,13 +371,13 @@ TEST_CASE("simd.storeSimd.record.stdsimd")
CHECK(view(3)(tag::Mass{}) == 0);
}

TEST_CASE("simd.simdForEachN.stdsimd")
TEMPLATE_TEST_CASE("simd.simdForEachN.stdsimd", "", llama::mapping::BindAoS<>, llama::mapping::BindSoA<>)
{
using ArrayExtents = llama::ArrayExtentsDynamic<int, 2>;
for(auto extents : {ArrayExtents{16, 32}, ArrayExtents{11, 7}})
{
CAPTURE(extents);
const auto mapping = llama::mapping::SoA<ArrayExtents, Vec2F>(extents);
const auto mapping = typename TestType::template fn<ArrayExtents, Vec2F>(extents);
auto view = llama::allocViewUninitialized(mapping);
for(int i = 0; i < extents[0]; i++)
for(int j = 0; j < extents[1]; j++)
@@ -388,13 +401,13 @@ TEST_CASE("simd.simdForEachN.stdsimd")
}
}

TEST_CASE("simd.simdForEach.stdsimd")
TEMPLATE_TEST_CASE("simd.simdForEach.stdsimd", "", llama::mapping::BindAoS<>, llama::mapping::BindSoA<>)
{
using ArrayExtents = llama::ArrayExtentsDynamic<int, 2>;
for(auto extents : {ArrayExtents{16, 32}, ArrayExtents{11, 7}})
{
CAPTURE(extents);
const auto mapping = llama::mapping::SoA<ArrayExtents, Vec2F>(extents);
const auto mapping = typename TestType::template fn<ArrayExtents, Vec2F>(extents);
auto view = llama::allocViewUninitialized(mapping);
for(int i = 0; i < extents[0]; i++)
for(int j = 0; j < extents[1]; j++)