From d811a3344dd72bc7d2f3f2de703bdfa6d251fff4 Mon Sep 17 00:00:00 2001
From: Bernhard Manfred Gruber
Date: Thu, 7 Oct 2021 14:43:36 +0200
Subject: [PATCH 1/2] support proxy references in VirtualRecord assignment and
 arithmetic operators

---
 include/llama/VirtualRecord.hpp | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/include/llama/VirtualRecord.hpp b/include/llama/VirtualRecord.hpp
index a7370238e4..bf54be03b3 100644
--- a/include/llama/VirtualRecord.hpp
+++ b/include/llama/VirtualRecord.hpp
@@ -144,54 +144,54 @@ namespace llama
         struct Assign
         {
             template<typename A, typename B>
-            LLAMA_FN_HOST_ACC_INLINE auto operator()(A& a, const B& b) const -> decltype(auto)
+            LLAMA_FN_HOST_ACC_INLINE auto operator()(A&& a, const B& b) const -> decltype(auto)
             {
-                return a = b;
+                return std::forward<A>(a) = b;
             }
         };
 
         struct PlusAssign
         {
             template<typename A, typename B>
-            LLAMA_FN_HOST_ACC_INLINE auto operator()(A& a, const B& b) const -> decltype(auto)
+            LLAMA_FN_HOST_ACC_INLINE auto operator()(A&& a, const B& b) const -> decltype(auto)
             {
-                return a += b;
+                return std::forward<A>(a) += b;
             }
         };
 
         struct MinusAssign
         {
             template<typename A, typename B>
-            LLAMA_FN_HOST_ACC_INLINE auto operator()(A& a, const B& b) const -> decltype(auto)
+            LLAMA_FN_HOST_ACC_INLINE auto operator()(A&& a, const B& b) const -> decltype(auto)
             {
-                return a -= b;
+                return std::forward<A>(a) -= b;
             }
         };
 
         struct MultiplyAssign
         {
             template<typename A, typename B>
-            LLAMA_FN_HOST_ACC_INLINE auto operator()(A& a, const B& b) const -> decltype(auto)
+            LLAMA_FN_HOST_ACC_INLINE auto operator()(A&& a, const B& b) const -> decltype(auto)
             {
-                return a *= b;
+                return std::forward<A>(a) *= b;
             }
         };
 
         struct DivideAssign
         {
             template<typename A, typename B>
-            LLAMA_FN_HOST_ACC_INLINE auto operator()(A& a, const B& b) const -> decltype(auto)
+            LLAMA_FN_HOST_ACC_INLINE auto operator()(A&& a, const B& b) const -> decltype(auto)
             {
-                return a /= b;
+                return std::forward<A>(a) /= b;
            }
         };
 
         struct ModuloAssign
         {
             template<typename A, typename B>
-            LLAMA_FN_HOST_ACC_INLINE auto operator()(A& a, const B& b) const -> decltype(auto)
+            LLAMA_FN_HOST_ACC_INLINE auto operator()(A&& a, const B& b) const -> decltype(auto)
             {
-                return a %= b;
+                return std::forward<A>(a) %= b;
             }
         };
 
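Why the forwarding reference: a computed mapping, such as the one added in the second patch, hands out a proxy reference object by value rather than an lvalue of the field type. VirtualRecord's element-wise assignment passes that temporary to the functors above, where it cannot bind to a plain A&. A minimal sketch of the mechanism, using a hypothetical Proxy type that stands in for such a computed reference (not part of the patch):

    // Minimal sketch (hypothetical Proxy type, not part of the patch): a proxy
    // reference is a temporary whose operator= writes through to storage.
    #include <cstdio>
    #include <utility>

    struct Proxy
    {
        int* target;
        auto operator=(int v) -> Proxy& // assigning through the proxy stores to *target
        {
            *target = v;
            return *this;
        }
    };

    struct Assign
    {
        // A&& is a forwarding reference, so it also binds to rvalue proxies like Proxy{...}
        template<typename A, typename B>
        auto operator()(A&& a, const B& b) const -> decltype(auto)
        {
            return std::forward<A>(a) = b;
        }
    };

    int main()
    {
        int storage = 0;
        Assign{}(Proxy{&storage}, 42); // with an A& parameter this temporary would not bind
        std::printf("%d\n", storage);  // prints 42
    }
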
From 61ec8080ca5660e28e3e3bfcc25e838df9ea28de Mon Sep 17 00:00:00 2001
From: Bernhard Manfred Gruber
Date: Thu, 7 Oct 2021 14:58:51 +0200
Subject: [PATCH 2/2] add example using byte splitting

---
 CMakeLists.txt                    |   1 +
 examples/bytesplit/CMakeLists.txt |   9 ++
 examples/bytesplit/bytesplit.cpp  | 171 ++++++++++++++++++++++++++++++
 3 files changed, 181 insertions(+)
 create mode 100644 examples/bytesplit/CMakeLists.txt
 create mode 100644 examples/bytesplit/bytesplit.cpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index c025bcaaf2..5eb58482a4 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -72,6 +72,7 @@ if (LLAMA_BUILD_EXAMPLES)
     add_subdirectory("examples/bufferguard")
     add_subdirectory("examples/raycast")
     add_subdirectory("examples/bitpack")
+    add_subdirectory("examples/bytesplit")
 
     # alpaka examples
     find_package(alpaka 0.7.0 QUIET)
diff --git a/examples/bytesplit/CMakeLists.txt b/examples/bytesplit/CMakeLists.txt
new file mode 100644
index 0000000000..7f025935b6
--- /dev/null
+++ b/examples/bytesplit/CMakeLists.txt
@@ -0,0 +1,9 @@
+cmake_minimum_required (VERSION 3.15)
+project(llama-bytesplit CXX)
+
+if (NOT TARGET llama::llama)
+    find_package(llama REQUIRED)
+endif()
+add_executable(${PROJECT_NAME} bytesplit.cpp)
+target_compile_features(${PROJECT_NAME} PRIVATE cxx_std_17)
+target_link_libraries(${PROJECT_NAME} PRIVATE llama::llama)
diff --git a/examples/bytesplit/bytesplit.cpp b/examples/bytesplit/bytesplit.cpp
new file mode 100644
index 0000000000..87b23b194f
--- /dev/null
+++ b/examples/bytesplit/bytesplit.cpp
@@ -0,0 +1,171 @@
+#include <algorithm>
+#include <fmt/core.h>
+#include <llama/llama.hpp>
+
+// clang-format off
+namespace tag
+{
+    struct A{};
+    struct B{};
+    struct C{};
+    struct D{};
+    struct E{};
+    struct F{};
+} // namespace tag
+
+using Data = llama::Record<
+    llama::Field,
+    llama::Field,
+    llama::Field,
+    llama::Field,
+    llama::Field,
+    llama::Field
+>;
+// clang-format on
+
+template<typename T>
+using ReplaceByByteArray = std::byte[sizeof(T)];
+
+template<typename RecordDim>
+using SplitBytes = llama::TransformLeaves<RecordDim, ReplaceByByteArray>;
+
+template<typename TArrayExtents, typename TRecordDim>
+struct BytesplitSoA : private llama::mapping::SoA<TArrayExtents, SplitBytes<TRecordDim>, false>
+{
+    using Base = llama::mapping::SoA<TArrayExtents, SplitBytes<TRecordDim>, false>;
+
+    using ArrayExtents = typename Base::ArrayExtents;
+    using ArrayIndex = typename Base::ArrayIndex;
+    using RecordDim = TRecordDim; // hide Base::RecordDim
+    using Base::blobCount;
+
+    using Base::Base;
+    using Base::blobSize;
+    using Base::extents;
+
+    LLAMA_FN_HOST_ACC_INLINE
+    constexpr explicit BytesplitSoA(TArrayExtents extents, TRecordDim = {}) : Base(extents)
+    {
+    }
+
+    template<std::size_t... RecordCoords>
+    static constexpr auto isComputed(llama::RecordCoord<RecordCoords...>)
+    {
+        return true;
+    }
+
+    template<typename QualifiedBase, typename RecordCoord, typename BlobArray>
+    struct Reference
+    {
+        QualifiedBase& innerMapping;
+        ArrayIndex ai;
+        BlobArray& blobs;
+
+        using DstType = llama::GetType<TRecordDim, RecordCoord>;
+
+        // NOLINTNEXTLINE(google-explicit-constructor,hicpp-explicit-conversions)
+        operator DstType() const
+        {
+            DstType v;
+            auto* p = reinterpret_cast<std::byte*>(&v);
+            boost::mp11::mp_for_each<boost::mp11::mp_iota_c<sizeof(DstType)>>(
+                [&](auto ic)
+                {
+                    constexpr auto i = decltype(ic)::value;
+                    const auto [nr, off] = innerMapping.blobNrAndOffset(ai, llama::Cat<RecordCoord, llama::RecordCoord<i>>{});
+                    p[i] = blobs[nr][off];
+                });
+            return v;
+        }
+
+        auto operator=(DstType v) -> Reference&
+        {
+            auto* p = reinterpret_cast<const std::byte*>(&v);
+            boost::mp11::mp_for_each<boost::mp11::mp_iota_c<sizeof(DstType)>>(
+                [&](auto ic)
+                {
+                    constexpr auto i = decltype(ic)::value;
+                    const auto [nr, off] = innerMapping.blobNrAndOffset(ai, llama::Cat<RecordCoord, llama::RecordCoord<i>>{});
+                    blobs[nr][off] = p[i];
+                });
+            return *this;
+        }
+    };
+
+    template<std::size_t... RecordCoords, typename BlobArray>
+    LLAMA_FN_HOST_ACC_INLINE constexpr auto compute(
+        typename Base::ArrayIndex ai,
+        llama::RecordCoord<RecordCoords...>,
+        BlobArray& blobs) const
+    {
+        return Reference<const BytesplitSoA, llama::RecordCoord<RecordCoords...>, BlobArray>{*this, ai, blobs};
+    }
+};
+
+auto main() -> int
+{
+    constexpr auto N = 128;
+    using ArrayExtents = llama::ArrayExtentsDynamic<1>;
+    const auto mapping = BytesplitSoA<ArrayExtents, Data>{{N}};
+
+    auto view = llama::allocView(mapping);
+
+    int value = 0;
+    for(std::size_t i = 0; i < N; i++)
+        llama::forEachLeafCoord<Data>([&](auto rc) { view(i)(rc) = ++value; });
+
+    value = 0;
+    for(std::size_t i = 0; i < N; i++)
+        llama::forEachLeafCoord<Data>(
+            [&](auto rc)
+            {
+                using T = llama::GetType<Data, decltype(rc)>;
+                ++value;
+                if(view(i)(rc) != static_cast<T>(value))
+                    fmt::print("Error: value after store is corrupt. {} != {}\n", view(i)(rc), value);
+            });
+
+    // extract into a view of unsplit fields
+    auto viewExtracted = llama::allocViewUninitialized(llama::mapping::AoS<ArrayExtents, Data>{{N}});
+    llama::copy(view, viewExtracted);
+    if(!std::equal(view.begin(), view.end(), viewExtracted.begin(), viewExtracted.end()))
+        fmt::print("ERROR: unsplit view is different\n");
+
+    // compute something on the extracted view
+    for(std::size_t i = 0; i < N; i++)
+        viewExtracted(i) *= 2;
+
+    // rearrange back into split view
+    llama::copy(viewExtracted, view);
+
+    value = 0;
+    for(std::size_t i = 0; i < N; i++)
+        llama::forEachLeafCoord<Data>(
+            [&](auto rc)
+            {
+                using T = llama::GetType<Data, decltype(rc)>;
+                ++value;
+                if(view(i)(rc) != static_cast<T>(static_cast<T>(value) * 2))
+                    fmt::print("Error: value after resplit is corrupt. {} != {}\n", view(i)(rc), value);
+            });
+
+    // compute something on the split view
+    for(std::size_t i = 0; i < N; i++)
+        view(i) = view(i) * 2; // cannot do view(i) *= 2; with proxy references
+
+    value = 0;
+    for(std::size_t i = 0; i < N; i++)
+        llama::forEachLeafCoord<Data>(
+            [&](auto rc)
+            {
+                using T = llama::GetType<Data, decltype(rc)>;
+                ++value;
+                if(view(i)(rc) != static_cast<T>(static_cast<T>(value) * 4))
+                    fmt::print(
+                        "Error: value after computation on split data is corrupt. {} != {}\n",
+                        view(i)(rc),
+                        value);
+            });
+
+    fmt::print("Done\n");
+}
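The layout idea behind BytesplitSoA can also be seen without LLAMA: every leaf field of type T becomes std::byte[sizeof(T)], so the SoA mapping stores byte 0 of a field for all records contiguously, then byte 1, and so on, and the computed Reference scatters a value into those per-byte blobs on store and reassembles it on load. A standalone sketch of that round trip, using hypothetical names and plain std::vector blobs (not part of the patch):

    // Standalone sketch (not part of the patch): one array per byte position plays
    // the role of the SoA blobs over the split record dimension.
    #include <cstddef>
    #include <cstdint>
    #include <cstdio>
    #include <cstring>
    #include <vector>

    int main()
    {
        constexpr std::size_t n = 8;
        std::uint32_t values[n];
        for(std::size_t i = 0; i < n; i++)
            values[i] = static_cast<std::uint32_t>(i * 1000);

        // "split" store: byte position b of record i goes to blobs[b][i],
        // which is what Reference::operator= does through blobNrAndOffset
        std::vector<std::vector<std::byte>> blobs(sizeof(std::uint32_t), std::vector<std::byte>(n));
        for(std::size_t i = 0; i < n; i++)
        {
            std::byte bytes[sizeof(std::uint32_t)];
            std::memcpy(bytes, &values[i], sizeof(bytes));
            for(std::size_t b = 0; b < sizeof(std::uint32_t); b++)
                blobs[b][i] = bytes[b];
        }

        // "split" load: reassemble record 3 byte by byte,
        // which is what Reference::operator DstType() does
        std::byte bytes[sizeof(std::uint32_t)];
        for(std::size_t b = 0; b < sizeof(std::uint32_t); b++)
            bytes[b] = blobs[b][3];
        std::uint32_t restored = 0;
        std::memcpy(&restored, bytes, sizeof(restored));
        std::printf("record 3: %u\n", restored); // prints 3000
    }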