From 9a9625eb58165ebca9805af27755769aa31d7953 Mon Sep 17 00:00:00 2001
From: Bernhard Manfred Gruber
Date: Fri, 6 Nov 2020 13:47:17 +0100
Subject: [PATCH 1/3] add parallel version of naive_copy and memcpy

---
 examples/viewcopy/viewcopy.cpp | 47 +++++++++++++++++++++++++++++++---
 1 file changed, 44 insertions(+), 3 deletions(-)

diff --git a/examples/viewcopy/viewcopy.cpp b/examples/viewcopy/viewcopy.cpp
index 6c1688adc0..e878025142 100644
--- a/examples/viewcopy/viewcopy.cpp
+++ b/examples/viewcopy/viewcopy.cpp
@@ -1,9 +1,12 @@
 #include <algorithm>
 #include <boost/functional/hash.hpp>
+#include <boost/range/irange.hpp>
 #include <cstring>
+#include <execution>
 #include <fstream>
 #include <iostream>
 #include <llama/llama.hpp>
+#include <thread>
 
 // clang-format off
 namespace tag
@@ -31,15 +34,21 @@ using Particle = llama::DS<
 >;
 // clang-format on
 
-template <typename Mapping1, typename BlobType1, typename Mapping2, typename BlobType2>
-void naive_copy(const llama::View<Mapping1, BlobType1>& srcView, llama::View<Mapping2, BlobType2>& dstView)
+template <
+    typename Mapping1,
+    typename BlobType1,
+    typename Mapping2,
+    typename BlobType2,
+    typename Ex = std::execution::sequenced_policy>
+void naive_copy(const llama::View<Mapping1, BlobType1>& srcView, llama::View<Mapping2, BlobType2>& dstView, Ex ex = {})
 {
     static_assert(std::is_same_v<typename Mapping1::DatumDomain, typename Mapping2::DatumDomain>);
     if (srcView.mapping.arrayDomainSize != dstView.mapping.arrayDomainSize)
         throw std::runtime_error{"UserDomain sizes are different"};
 
-    for (auto ad : llama::ArrayDomainIndexRange{srcView.mapping.arrayDomainSize})
+    auto r = llama::ArrayDomainIndexRange{srcView.mapping.arrayDomainSize};
+    std::for_each(ex, std::begin(r), std::end(r), [&](auto ad) {
         llama::forEach<typename Mapping1::DatumDomain>([&](auto coord) {
             dstView(ad)(coord) = srcView(ad)(coord);
             // std::memcpy(
             //     &dstView(ad)(coord),
@@ -47,6 +56,21 @@ void naive_copy(const llama::View<Mapping1, BlobType1>& srcView, llama::View<Ma
             //     &srcView(ad)(coord),
             //     sizeof(llama::GetType<typename Mapping1::DatumDomain, decltype(coord)>));
         });
+    });
+}
+
+void parallel_memcpy(std::byte* dst, const std::byte* src, std::size_t size)
+{
+    const auto threads = std::thread::hardware_concurrency();
+    const auto sizePerThread = size / threads;
+    const auto sizeLastThread = sizePerThread + size % threads;
+    auto r = boost::irange(0u, threads);
+    std::for_each(std::execution::par, std::begin(r), std::end(r), [&](auto id) {
+        std::memcpy(
+            dst + id * sizePerThread,
+            src + id * sizePerThread,
+            id == threads - 1 ? sizeLastThread : sizePerThread);
+    });
 }
 
 template <
@@ -180,11 +204,22 @@ int main(int argc, char** argv)
         benchmarkCopy("naive_copy", srcView, srcHash, dstMapping, [](const auto& srcView, auto& dstView) {
             naive_copy(srcView, dstView);
         });
+        benchmarkCopy("naive_copy(p)", srcView, srcHash, dstMapping, [](const auto& srcView, auto& dstView) {
+            naive_copy(srcView, dstView, std::execution::par);
+        });
         benchmarkCopy("memcpy    ", srcView, srcHash, dstMapping, [](const auto& srcView, auto& dstView) {
             static_assert(srcView.storageBlobs.rank == 1);
             static_assert(dstView.storageBlobs.rank == 1);
             std::memcpy(dstView.storageBlobs[0].data(), srcView.storageBlobs[0].data(), dstView.storageBlobs[0].size());
         });
+        benchmarkCopy("memcpy(p) ", srcView, srcHash, dstMapping, [](const auto& srcView, auto& dstView) {
+            static_assert(srcView.storageBlobs.rank == 1);
+            static_assert(dstView.storageBlobs.rank == 1);
+            parallel_memcpy(
+                dstView.storageBlobs[0].data(),
+                srcView.storageBlobs[0].data(),
+                dstView.storageBlobs[0].size());
+        });
     }
 
     {
@@ -196,6 +231,9 @@ int main(int argc, char** argv)
         benchmarkCopy("naive_copy", srcView, srcHash, dstMapping, [](const auto& srcView, auto& dstView) {
             naive_copy(srcView, dstView);
         });
+        benchmarkCopy("naive_copy(p)", srcView, srcHash, dstMapping, [](const auto& srcView, auto& dstView) {
+            naive_copy(srcView, dstView, std::execution::par);
+        });
         benchmarkCopy("memcpy    ", srcView, srcHash, dstMapping, [](const auto& srcView, auto& dstView) {
             static_assert(srcView.storageBlobs.rank == 1);
             static_assert(dstView.storageBlobs.rank == 1);
@@ -220,6 +258,9 @@ int main(int argc, char** argv)
        benchmarkCopy("naive_copy ", srcView, srcHash, dstMapping, [](const auto& srcView, auto& dstView) {
            naive_copy(srcView, dstView);
        });
+       benchmarkCopy("naive_copy(p)", srcView, srcHash, dstMapping, [](const auto& srcView, auto& dstView) {
+           naive_copy(srcView, dstView, std::execution::par);
+       });
        benchmarkCopy("memcpy    ", srcView, srcHash, dstMapping, [](const auto& srcView, auto& dstView) {
            static_assert(srcView.storageBlobs.rank == 1);
            static_assert(dstView.storageBlobs.rank == 1);
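Note on [PATCH 1/3]: parallel_memcpy splits the byte range into one
contiguous slice per hardware thread; every thread copies size / threads
bytes and the last thread additionally takes the size % threads remainder.
A minimal self-contained sketch of the same chunking scheme (the name
chunked_memcpy and the 1 MiB buffers are illustrative and not part of the
patch; assumes a parallel-STL-capable toolchain and a nonzero
hardware_concurrency()):

    #include <algorithm>
    #include <cstddef>
    #include <cstring>
    #include <execution>
    #include <thread>
    #include <vector>

    #include <boost/range/irange.hpp>

    // One contiguous slice per thread; the last thread also copies the
    // remainder that size / threads leaves over.
    void chunked_memcpy(std::byte* dst, const std::byte* src, std::size_t size)
    {
        const auto threads = std::size_t{std::thread::hardware_concurrency()};
        const auto sizePerThread = size / threads;
        const auto sizeLastThread = sizePerThread + size % threads;
        auto r = boost::irange(std::size_t{0}, threads);
        std::for_each(std::execution::par, std::begin(r), std::end(r), [&](auto id) {
            std::memcpy(
                dst + id * sizePerThread,
                src + id * sizePerThread,
                id == threads - 1 ? sizeLastThread : sizePerThread);
        });
    }

    int main()
    {
        std::vector<std::byte> src(1 << 20), dst(1 << 20); // 1 MiB each, arbitrary
        chunked_memcpy(dst.data(), src.data(), src.size());
    }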
From 26876b35d0db604ef0144c4bf8ab87a52ce11010 Mon Sep 17 00:00:00 2001
From: Bernhard Manfred Gruber
Date: Fri, 6 Nov 2020 17:09:18 +0100
Subject: [PATCH 2/3] add parallel version of aosoa_copy

---
 examples/viewcopy/viewcopy.cpp | 86 +++++++++++++++++++++++-----------
 1 file changed, 58 insertions(+), 28 deletions(-)

diff --git a/examples/viewcopy/viewcopy.cpp b/examples/viewcopy/viewcopy.cpp
index e878025142..33616c76ff 100644
--- a/examples/viewcopy/viewcopy.cpp
+++ b/examples/viewcopy/viewcopy.cpp
@@ -61,10 +61,10 @@ void naive_copy(const llama::View<Mapping1, BlobType1>& srcView, llama::View<Ma
-    typename BlobType2>
+    typename BlobType2,
+    typename Ex = std::execution::sequenced_policy>
 void aosoa_copy(
     const llama::View<
         llama::mapping::AoSoA<ArrayDomain, DatumDomain, LanesSrc>,
         BlobType1>& srcView,
     llama::View<
         llama::mapping::AoSoA<ArrayDomain, DatumDomain, LanesDst>,
-        BlobType2>& dstView)
+        BlobType2>& dstView,
+    Ex ex = {})
 {
     static_assert(srcView.storageBlobs.rank == 1);
     static_assert(dstView.storageBlobs.rank == 1);
@@ -114,37 +116,59 @@ void aosoa_copy(
         return offset;
     };
 
+    const auto threads = [&]() -> std::size_t {
+        if constexpr (std::is_same_v<Ex, std::execution::sequenced_policy>)
+            return 1u;
+        else
+            return std::thread::hardware_concurrency();
+    }();
+    const auto threadIds = boost::irange(threads);
     if constexpr (ReadOpt)
     {
        // optimized for linear reading
-        for (std::size_t i = 0; i < flatSize; i += LanesSrc)
-        {
-            llama::forEach<DatumDomain>([&](auto coord) {
-                constexpr auto L = std::min(LanesSrc, LanesDst);
-                for (std::size_t j = 0; j < LanesSrc; j += L)
-                {
-                    constexpr auto bytes = L * sizeof(llama::GetType<DatumDomain, decltype(coord)>);
-                    std::memcpy(&dst[map(i + j, coord, LanesDst)], src, bytes);
-                    src += bytes;
-                }
-            });
-        }
+        const auto elementsPerThread = ((flatSize / LanesSrc) / threads) * LanesSrc;
+        std::for_each(ex, std::begin(threadIds), std::end(threadIds), [&](auto id) {
+            const auto start = id * elementsPerThread;
+            const auto stop = id == threads - 1 ? flatSize : (id + 1) * elementsPerThread;
+            auto* threadSrc = src + map(start, llama::DatumCoord<>{}, LanesSrc);
+
+            for (std::size_t i = start; i < stop; i += LanesSrc)
+            {
+                llama::forEach<DatumDomain>([&](auto coord) {
+                    constexpr auto L = std::min(LanesSrc, LanesDst);
+                    for (std::size_t j = 0; j < LanesSrc; j += L)
+                    {
+                        constexpr auto bytes = L * sizeof(llama::GetType<DatumDomain, decltype(coord)>);
+                        std::memcpy(&dst[map(i + j, coord, LanesDst)], threadSrc, bytes);
+                        threadSrc += bytes;
+                    }
+                });
+            }
+        });
     }
     else
     {
        // optimized for linear writing
-        for (std::size_t i = 0; i < flatSize; i += LanesDst)
-        {
-            llama::forEach<DatumDomain>([&](auto coord) {
-                constexpr auto L = std::min(LanesSrc, LanesDst);
-                for (std::size_t j = 0; j < LanesDst; j += L)
-                {
-                    constexpr auto bytes = L * sizeof(llama::GetType<DatumDomain, decltype(coord)>);
-                    std::memcpy(dst, &src[map(i + j, coord, LanesSrc)], bytes);
-                    dst += bytes;
-                }
-            });
-        }
+        const auto elementsPerThread = ((flatSize / LanesDst) / threads) * LanesDst;
+        std::for_each(ex, std::begin(threadIds), std::end(threadIds), [&](auto id) {
+            const auto start = id * elementsPerThread;
+            const auto stop = id == threads - 1 ? flatSize : (id + 1) * elementsPerThread;
+
+            auto* threadDst = dst + map(start, llama::DatumCoord<>{}, LanesDst);
+
+            for (std::size_t i = start; i < stop; i += LanesDst)
+            {
+                llama::forEach<DatumDomain>([&](auto coord) {
+                    constexpr auto L = std::min(LanesSrc, LanesDst);
+                    for (std::size_t j = 0; j < LanesDst; j += L)
+                    {
+                        constexpr auto bytes = L * sizeof(llama::GetType<DatumDomain, decltype(coord)>);
+                        std::memcpy(threadDst, &src[map(i + j, coord, LanesSrc)], bytes);
+                        threadDst += bytes;
+                    }
+                });
+            }
+        });
     }
 }
 
@@ -272,5 +296,11 @@ int main(int argc, char** argv)
         benchmarkCopy("aosoa_copy(w)", srcView, srcHash, dstMapping, [](const auto& srcView, auto& dstView) {
             aosoa_copy<false>(srcView, dstView);
         });
+        benchmarkCopy("aosoa_copy(r,p)", srcView, srcHash, dstMapping, [](const auto& srcView, auto& dstView) {
+            aosoa_copy<true>(srcView, dstView, std::execution::par);
+        });
+        benchmarkCopy("aosoa_copy(w,p)", srcView, srcHash, dstMapping, [](const auto& srcView, auto& dstView) {
+            aosoa_copy<false>(srcView, dstView, std::execution::par);
+        });
     });
 }
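Note on [PATCH 2/3]: the parallel aosoa_copy divides the flat array domain
between threads in whole multiples of the AoSoA lane count, so every
thread's linear read (or write) cursor starts exactly on a block boundary;
the last thread also processes the tail that the rounding leaves over. A
small sketch of that partitioning arithmetic (the helper name
elementsPerThread mirrors the patch, the concrete numbers are made up):

    #include <cstddef>

    // Round the per-thread share down to whole blocks of `lanes` elements.
    constexpr std::size_t elementsPerThread(std::size_t flatSize, std::size_t lanes, std::size_t threads)
    {
        return ((flatSize / lanes) / threads) * lanes;
    }

    // Worked example: 1000 elements, 16 lanes, 4 threads. Threads 0..2 get
    // 240 elements each (15 whole blocks); the last thread starts at
    // element 720 and runs to the end, i.e. 280 elements including the tail.
    static_assert(elementsPerThread(1000, 16, 4) == 240);
    static_assert(1000 - 3 * elementsPerThread(1000, 16, 4) == 280);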
From eeabdaca25a69b7fd2d1039e55ff7df61e4c5a18 Mon Sep 17 00:00:00 2001
From: Bernhard Manfred Gruber
Date: Mon, 9 Nov 2020 12:37:26 +0100
Subject: [PATCH 3/3] upgrade to g++-10 for Linux CI to support parallel STL

---
 .github/workflows/ci.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index 8c874a392c..d227c6742c 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -14,7 +14,7 @@ jobs:
         mkdir build
         cd build
         export BOOST_ROOT=$BOOST_ROOT_1_72_0
-        CXX=g++-9 cmake .. -DCMAKE_BUILD_TYPE=RelWithDebInfo -DASAN_FOR_TESTS=ON -DCMAKE_TOOLCHAIN_FILE=/usr/local/share/vcpkg/scripts/buildsystems/vcpkg.cmake
+        CXX=g++-10 cmake .. -DCMAKE_BUILD_TYPE=RelWithDebInfo -DASAN_FOR_TESTS=ON -DCMAKE_TOOLCHAIN_FILE=/usr/local/share/vcpkg/scripts/buildsystems/vcpkg.cmake
     - name: build
       run: cmake --build build
     - name: tests
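Note on [PATCH 3/3]: with libstdc++, the std::execution policies used above
are implemented on top of Intel TBB, so the benchmark needs the TBB headers
at compile time and typically -ltbb at link time. A quick hedged check that
a given toolchain actually provides the parallel algorithms (the file name
check.cpp and the link flag are assumptions, not part of the patch):

    // g++-10 check.cpp -ltbb && ./a.out
    #include <iostream>
    #include <vector>
    #if __has_include(<execution>)
    #    include <algorithm>
    #    include <execution>
    #    include <numeric>
    #endif

    int main()
    {
    #if defined(__cpp_lib_parallel_algorithm)
        // <execution> defines this feature-test macro when usable
        std::vector<int> v(1'000'000);
        std::iota(v.begin(), v.end(), 0);
        std::sort(std::execution::par, v.begin(), v.end());
        std::cout << "parallel algorithms available\n";
    #else
        std::cout << "parallel algorithms NOT available\n";
    #endif
    }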