Merge pull request #120 from bernhardmgruber/pcopy
Add parallel versions of copy algorithms in viewcopy example
bernhardmgruber authored Nov 9, 2020
2 parents 2f60140 + eeabdac commit 33d1409
Showing 2 changed files with 101 additions and 30 deletions.
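The new parallel variants all follow the same idea: split the work into one contiguous chunk per hardware thread and dispatch the chunks through std::for_each with std::execution::par. A minimal, self-contained sketch of that chunking pattern, for orientation only (the function name chunked_copy and the exact remainder handling are illustrative, not the committed code below):

#include <algorithm>
#include <cstddef>
#include <cstring>
#include <execution>
#include <numeric>
#include <thread>
#include <vector>

// copy `size` bytes from src to dst, one contiguous chunk per hardware thread
void chunked_copy(std::byte* dst, const std::byte* src, std::size_t size)
{
    const auto threads = std::max<std::size_t>(std::size_t{1}, std::thread::hardware_concurrency());
    const auto chunk = size / threads; // bytes per thread, rounded down
    std::vector<std::size_t> ids(threads);
    std::iota(ids.begin(), ids.end(), std::size_t{0});
    std::for_each(std::execution::par, ids.begin(), ids.end(), [&](std::size_t id) {
        const auto offset = id * chunk;
        // the last thread also picks up the remainder left by the rounding
        const auto bytes = id == threads - 1 ? size - offset : chunk;
        std::memcpy(dst + offset, src + offset, bytes);
    });
}
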
2 changes: 1 addition & 1 deletion .github/workflows/ci.yaml
@@ -14,7 +14,7 @@ jobs:
mkdir build
cd build
export BOOST_ROOT=$BOOST_ROOT_1_72_0
CXX=g++-9 cmake .. -DCMAKE_BUILD_TYPE=RelWithDebInfo -DASAN_FOR_TESTS=ON -DCMAKE_TOOLCHAIN_FILE=/usr/local/share/vcpkg/scripts/buildsystems/vcpkg.cmake
CXX=g++-10 cmake .. -DCMAKE_BUILD_TYPE=RelWithDebInfo -DASAN_FOR_TESTS=ON -DCMAKE_TOOLCHAIN_FILE=/usr/local/share/vcpkg/scripts/buildsystems/vcpkg.cmake
- name: build
run: cmake --build build
- name: tests
129 changes: 100 additions & 29 deletions examples/viewcopy/viewcopy.cpp
@@ -1,9 +1,12 @@
#include <boost/functional/hash.hpp>
#include <boost/mp11.hpp>
#include <boost/range/irange.hpp>
#include <chrono>
#include <execution>
#include <llama/llama.hpp>
#include <numeric>
#include <string_view>
#include <thread>

// clang-format off
namespace tag
@@ -31,22 +34,43 @@ using Particle = llama::DS<
>;
// clang-format on

template <typename Mapping1, typename BlobType1, typename Mapping2, typename BlobType2>
void naive_copy(const llama::View<Mapping1, BlobType1>& srcView, llama::View<Mapping2, BlobType2>& dstView)
template <
typename Mapping1,
typename BlobType1,
typename Mapping2,
typename BlobType2,
typename Ex = std::execution::sequenced_policy>
void naive_copy(const llama::View<Mapping1, BlobType1>& srcView, llama::View<Mapping2, BlobType2>& dstView, Ex ex = {})
{
static_assert(std::is_same_v<typename Mapping1::DatumDomain, typename Mapping2::DatumDomain>);

if (srcView.mapping.arrayDomainSize != dstView.mapping.arrayDomainSize)
throw std::runtime_error{"UserDomain sizes are different"};

for (auto ad : llama::ArrayDomainIndexRange{srcView.mapping.arrayDomainSize})
auto r = llama::ArrayDomainIndexRange{srcView.mapping.arrayDomainSize};
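// walk every array-domain index (sequentially or in parallel, depending on the execution policy) and copy each record field by field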
std::for_each(ex, std::begin(r), std::end(r), [&](auto ad) {
llama::forEach<typename Mapping1::DatumDomain>([&](auto coord) {
dstView(ad)(coord) = srcView(ad)(coord);
// std::memcpy(
// &dstView(ad)(coord),
// &srcView(ad)(coord),
// sizeof(llama::GetType<typename Mapping1::DatumDomain, decltype(coord)>));
});
});
}

void parallel_memcpy(std::byte* dst, const std::byte* src, std::size_t size)
{
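// split the byte range into one contiguous chunk per hardware thread; the last thread's chunk also covers the division remainder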
const auto threads = std::size_t{std::thread::hardware_concurrency()};
const auto sizePerThread = size / threads;
const auto sizeLastThread = sizePerThread + size % threads;
auto r = boost::irange(threads);
std::for_each(std::execution::par, std::begin(r), std::end(r), [&](auto id) {
std::memcpy(
dst + id * sizePerThread,
src + id * sizePerThread,
id == threads - 1 ? sizeLastThread : sizePerThread);
});
}

template <
@@ -56,14 +80,16 @@ template <
std::size_t LanesSrc,
typename BlobType1,
std::size_t LanesDst,
typename BlobType2>
typename BlobType2,
typename Ex = std::execution::sequenced_policy>
void aosoa_copy(
const llama::View<
llama::mapping::AoSoA<ArrayDomain, DatumDomain, LanesSrc, llama::mapping::LinearizeArrayDomainCpp>,
BlobType1>& srcView,
llama::View<
llama::mapping::AoSoA<ArrayDomain, DatumDomain, LanesDst, llama::mapping::LinearizeArrayDomainCpp>,
BlobType2>& dstView)
BlobType2>& dstView,
Ex ex = {})
{
static_assert(srcView.storageBlobs.rank == 1);
static_assert(dstView.storageBlobs.rank == 1);
@@ -90,37 +116,59 @@ void aosoa_copy(
return offset;
};

const auto threads = [&]() -> std::size_t {
if constexpr (std::is_same_v<Ex, std::execution::sequenced_policy>)
return 1u;
else
return std::thread::hardware_concurrency();
}();
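// a sequenced policy keeps everything in one chunk; any other policy uses one chunk per hardware thread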
const auto threadIds = boost::irange(threads);
if constexpr (ReadOpt)
{
// optimized for linear reading
for (std::size_t i = 0; i < flatSize; i += LanesSrc)
{
llama::forEach<DatumDomain>([&](auto coord) {
constexpr auto L = std::min(LanesSrc, LanesDst);
for (std::size_t j = 0; j < LanesSrc; j += L)
{
constexpr auto bytes = L * sizeof(llama::GetType<DatumDomain, decltype(coord)>);
std::memcpy(&dst[map(i + j, coord, LanesDst)], src, bytes);
src += bytes;
}
});
}
const auto elementsPerThread = ((flatSize / LanesSrc) / threads) * LanesSrc;
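// every thread copies a whole number of source blocks (a multiple of LanesSrc); the last thread runs up to flatSize and takes the remainder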
std::for_each(ex, std::begin(threadIds), std::end(threadIds), [&](auto id) {
const auto start = id * elementsPerThread;
const auto stop = id == threads - 1 ? flatSize : (id + 1) * elementsPerThread;
auto* threadSrc = src + map(start, llama::DatumCoord<>{}, LanesSrc);

for (std::size_t i = start; i < stop; i += LanesSrc)
{
llama::forEach<DatumDomain>([&](auto coord) {
constexpr auto L = std::min(LanesSrc, LanesDst);
for (std::size_t j = 0; j < LanesSrc; j += L)
{
constexpr auto bytes = L * sizeof(llama::GetType<DatumDomain, decltype(coord)>);
std::memcpy(&dst[map(i + j, coord, LanesDst)], threadSrc, bytes);
threadSrc += bytes;
}
});
}
});
}
else
{
// optimized for linear writing
for (std::size_t i = 0; i < flatSize; i += LanesDst)
{
llama::forEach<DatumDomain>([&](auto coord) {
constexpr auto L = std::min(LanesSrc, LanesDst);
for (std::size_t j = 0; j < LanesDst; j += L)
{
constexpr auto bytes = L * sizeof(llama::GetType<DatumDomain, decltype(coord)>);
std::memcpy(dst, &src[map(i + j, coord, LanesSrc)], bytes);
dst += bytes;
}
});
}
const auto elementsPerThread = ((flatSize / LanesDst) / threads) * LanesDst;
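// every thread writes a whole number of destination blocks (a multiple of LanesDst); the last thread runs up to flatSize and takes the remainder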
std::for_each(ex, std::begin(threadIds), std::end(threadIds), [&](auto id) {
const auto start = id * elementsPerThread;
const auto stop = id == threads - 1 ? flatSize : (id + 1) * elementsPerThread;

auto* threadDst = dst + map(start, llama::DatumCoord<>{}, LanesDst);

for (std::size_t i = start; i < stop; i += LanesDst)
{
llama::forEach<DatumDomain>([&](auto coord) {
constexpr auto L = std::min(LanesSrc, LanesDst);
for (std::size_t j = 0; j < LanesDst; j += L)
{
constexpr auto bytes = L * sizeof(llama::GetType<DatumDomain, decltype(coord)>);
std::memcpy(threadDst, &src[map(i + j, coord, LanesSrc)], bytes);
threadDst += bytes;
}
});
}
});
}
}

@@ -180,11 +228,22 @@ int main(int argc, char** argv)
benchmarkCopy("naive_copy", srcView, srcHash, dstMapping, [](const auto& srcView, auto& dstView) {
naive_copy(srcView, dstView);
});
benchmarkCopy("naive_copy(p)", srcView, srcHash, dstMapping, [](const auto& srcView, auto& dstView) {
naive_copy(srcView, dstView, std::execution::par);
});
benchmarkCopy("memcpy ", srcView, srcHash, dstMapping, [](const auto& srcView, auto& dstView) {
static_assert(srcView.storageBlobs.rank == 1);
static_assert(dstView.storageBlobs.rank == 1);
std::memcpy(dstView.storageBlobs[0].data(), srcView.storageBlobs[0].data(), dstView.storageBlobs[0].size());
});
benchmarkCopy("memcpy(p) ", srcView, srcHash, dstMapping, [](const auto& srcView, auto& dstView) {
static_assert(srcView.storageBlobs.rank == 1);
static_assert(dstView.storageBlobs.rank == 1);
parallel_memcpy(
dstView.storageBlobs[0].data(),
srcView.storageBlobs[0].data(),
dstView.storageBlobs[0].size());
});
}

{
Expand All @@ -196,6 +255,9 @@ int main(int argc, char** argv)
benchmarkCopy("naive_copy", srcView, srcHash, dstMapping, [](const auto& srcView, auto& dstView) {
naive_copy(srcView, dstView);
});
benchmarkCopy("naive_copy(p)", srcView, srcHash, dstMapping, [](const auto& srcView, auto& dstView) {
naive_copy(srcView, dstView, std::execution::par);
});
benchmarkCopy("memcpy ", srcView, srcHash, dstMapping, [](const auto& srcView, auto& dstView) {
static_assert(srcView.storageBlobs.rank == 1);
static_assert(dstView.storageBlobs.rank == 1);
Expand All @@ -220,6 +282,9 @@ int main(int argc, char** argv)
benchmarkCopy("naive_copy ", srcView, srcHash, dstMapping, [](const auto& srcView, auto& dstView) {
naive_copy(srcView, dstView);
});
benchmarkCopy("naive_copy(p)", srcView, srcHash, dstMapping, [](const auto& srcView, auto& dstView) {
naive_copy(srcView, dstView, std::execution::par);
});
benchmarkCopy("memcpy ", srcView, srcHash, dstMapping, [](const auto& srcView, auto& dstView) {
static_assert(srcView.storageBlobs.rank == 1);
static_assert(dstView.storageBlobs.rank == 1);
@@ -231,5 +296,11 @@ int main(int argc, char** argv)
benchmarkCopy("aosoa_copy(w)", srcView, srcHash, dstMapping, [](const auto& srcView, auto& dstView) {
aosoa_copy<false>(srcView, dstView);
});
benchmarkCopy("aosoa_copy(r,p)", srcView, srcHash, dstMapping, [](const auto& srcView, auto& dstView) {
aosoa_copy<true>(srcView, dstView, std::execution::par);
});
benchmarkCopy("aosoa_copy(w,p)", srcView, srcHash, dstMapping, [](const auto& srcView, auto& dstView) {
aosoa_copy<false>(srcView, dstView, std::execution::par);
});
});
}
