diff --git a/include/llama/Simd.hpp b/include/llama/Simd.hpp index 6f4eb9e7a0..ae27244dac 100644 --- a/include/llama/Simd.hpp +++ b/include/llama/Simd.hpp @@ -206,19 +206,23 @@ namespace llama //} else if constexpr(mapping::isAoS) { - constexpr static auto stride + constexpr static auto srcStride = flatSizeOf; - auto* base = reinterpret_cast(&srcRef(rc)); + const auto* srcBaseAddr = reinterpret_cast(&srcRef(rc)); + ElementSimd elemSimd; // g++-12 really needs the intermediate elemSimd and memcpy for(auto i = 0; i < Traits::lanes; i++) - reinterpret_cast(&dstSimd(rc))[i] - = *reinterpret_cast(base + i * stride); + reinterpret_cast(&elemSimd)[i] + = *reinterpret_cast(srcBaseAddr + i * srcStride); + std::memcpy(&dstSimd(rc), &elemSimd, sizeof(elemSimd)); } else { auto b = ArrayIndexIterator{srcRef.view.mapping().extents(), srcRef.arrayIndex()}; + ElementSimd elemSimd; // g++-12 really needs the intermediate elemSimd and memcpy for(auto i = 0; i < Traits::lanes; i++) - reinterpret_cast(&dstSimd(rc))[i] + reinterpret_cast(&elemSimd)[i] = srcRef.view(*b++)(cat(typename T::BoundRecordCoord{}, rc)); // scalar loads + std::memcpy(&dstSimd(rc), &elemSimd, sizeof(elemSimd)); } }); } @@ -266,19 +270,21 @@ namespace llama { constexpr static auto stride = flatSizeOf; - auto* base = reinterpret_cast(&dstRef(rc)); + auto* dstBaseAddr = reinterpret_cast(&dstRef(rc)); + const ElementSimd elemSimd = srcSimd(rc); for(auto i = 0; i < Traits::lanes; i++) - *reinterpret_cast(base + i * stride) - = reinterpret_cast(&srcSimd(rc))[i]; + *reinterpret_cast(dstBaseAddr + i * stride) + = reinterpret_cast(&elemSimd)[i]; } else { // TODO(bgruber): how does this generalize conceptually to 2D and higher dimensions? in which // direction should we collect SIMD values? + const ElementSimd elemSimd = srcSimd(rc); auto b = ArrayIndexIterator{dstRef.view.mapping().extents(), dstRef.arrayIndex()}; for(auto i = 0; i < Traits::lanes; i++) dstRef.view (*b++)(cat(typename T::BoundRecordCoord{}, rc)) - = reinterpret_cast(&srcSimd(rc))[i]; // scalar store + = reinterpret_cast(&elemSimd)[i]; // scalar store } }); }