From 9a9625eb58165ebca9805af27755769aa31d7953 Mon Sep 17 00:00:00 2001
From: Bernhard Manfred Gruber
Date: Fri, 6 Nov 2020 13:47:17 +0100
Subject: [PATCH 1/3] add parallel version of naive_copy and memcpy

---
 examples/viewcopy/viewcopy.cpp | 47 +++++++++++++++++++++++++++++++---
 1 file changed, 44 insertions(+), 3 deletions(-)

diff --git a/examples/viewcopy/viewcopy.cpp b/examples/viewcopy/viewcopy.cpp
index 6c1688adc0..e878025142 100644
--- a/examples/viewcopy/viewcopy.cpp
+++ b/examples/viewcopy/viewcopy.cpp
@@ -1,9 +1,12 @@
 #include <algorithm>
 #include <boost/functional/hash.hpp>
+#include <boost/range/irange.hpp>
 #include <cstring>
+#include <execution>
 #include <fstream>
 #include <iostream>
 #include <llama/llama.hpp>
+#include <thread>
 
 // clang-format off
 namespace tag
@@ -31,15 +34,21 @@ using Particle = llama::DS<
 >;
 // clang-format on
 
-template <typename Mapping1, typename BlobType1, typename Mapping2, typename BlobType2>
-void naive_copy(const llama::View<Mapping1, BlobType1>& srcView, llama::View<Mapping2, BlobType2>& dstView)
+template <
+    typename Mapping1,
+    typename BlobType1,
+    typename Mapping2,
+    typename BlobType2,
+    typename Ex = std::execution::sequenced_policy>
+void naive_copy(const llama::View<Mapping1, BlobType1>& srcView, llama::View<Mapping2, BlobType2>& dstView, Ex ex = {})
 {
     static_assert(std::is_same_v<typename Mapping1::DatumDomain, typename Mapping2::DatumDomain>);
     if (srcView.mapping.arrayDomainSize != dstView.mapping.arrayDomainSize)
         throw std::runtime_error{"UserDomain sizes are different"};
 
-    for (auto ad : llama::ArrayDomainIndexRange{srcView.mapping.arrayDomainSize})
+    auto r = llama::ArrayDomainIndexRange{srcView.mapping.arrayDomainSize};
+    std::for_each(ex, std::begin(r), std::end(r), [&](auto ad) {
         llama::forEach<typename Mapping1::DatumDomain>([&](auto coord) {
             dstView(ad)(coord) = srcView(ad)(coord);
             // std::memcpy(
             //     &dstView(ad)(coord),
@@ -47,6 +56,21 @@ void naive_copy(const llama::View<Mapping1, BlobType1>& srcView, llama::View<Ma
             //     &srcView(ad)(coord),
             //     sizeof(llama::GetType<typename Mapping1::DatumDomain, decltype(coord)>));
         });
+    });
+}
+
+void parallel_memcpy(std::byte* dst, const std::byte* src, std::size_t size)
+{
+    const auto threads = std::thread::hardware_concurrency();
+    const auto sizePerThread = size / threads;
+    const auto sizeLastThread = sizePerThread + size % threads;
+    auto r = boost::irange(0u, threads);
+    std::for_each(std::execution::par, std::begin(r), std::end(r), [&](auto id) {
+        std::memcpy(
+            dst + id * sizePerThread,
+            src + id * sizePerThread,
+            id == threads - 1 ? sizeLastThread : sizePerThread);
+    });
 }
 
 template <
@@ -180,11 +204,22 @@ int main(int argc, char** argv)
         benchmarkCopy("naive_copy", srcView, srcHash, dstMapping, [](const auto& srcView, auto& dstView) {
             naive_copy(srcView, dstView);
         });
+        benchmarkCopy("naive_copy(p)", srcView, srcHash, dstMapping, [](const auto& srcView, auto& dstView) {
+            naive_copy(srcView, dstView, std::execution::par);
+        });
         benchmarkCopy("memcpy    ", srcView, srcHash, dstMapping, [](const auto& srcView, auto& dstView) {
             static_assert(srcView.storageBlobs.rank == 1);
             static_assert(dstView.storageBlobs.rank == 1);
             std::memcpy(dstView.storageBlobs[0].data(), srcView.storageBlobs[0].data(), dstView.storageBlobs[0].size());
         });
+        benchmarkCopy("memcpy(p) ", srcView, srcHash, dstMapping, [](const auto& srcView, auto& dstView) {
+            static_assert(srcView.storageBlobs.rank == 1);
+            static_assert(dstView.storageBlobs.rank == 1);
+            parallel_memcpy(
+                dstView.storageBlobs[0].data(),
+                srcView.storageBlobs[0].data(),
+                dstView.storageBlobs[0].size());
+        });
     }
 
     {
@@ -196,6 +231,9 @@ int main(int argc, char** argv)
         benchmarkCopy("naive_copy", srcView, srcHash, dstMapping, [](const auto& srcView, auto& dstView) {
             naive_copy(srcView, dstView);
         });
+        benchmarkCopy("naive_copy(p)", srcView, srcHash, dstMapping, [](const auto& srcView, auto& dstView) {
+            naive_copy(srcView, dstView, std::execution::par);
+        });
         benchmarkCopy("memcpy    ", srcView, srcHash, dstMapping, [](const auto& srcView, auto& dstView) {
             static_assert(srcView.storageBlobs.rank == 1);
             static_assert(dstView.storageBlobs.rank == 1);
@@ -220,6 +258,9 @@ int main(int argc, char** argv)
        benchmarkCopy("naive_copy ", srcView, srcHash, dstMapping, [](const auto& srcView, auto& dstView) {
            naive_copy(srcView, dstView);
        });
+       benchmarkCopy("naive_copy(p)", srcView, srcHash, dstMapping, [](const auto& srcView, auto& dstView) {
+           naive_copy(srcView, dstView, std::execution::par);
+       });
        benchmarkCopy("memcpy    ", srcView, srcHash, dstMapping, [](const auto& srcView, auto& dstView) {
            static_assert(srcView.storageBlobs.rank == 1);
            static_assert(dstView.storageBlobs.rank == 1);
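Note on [PATCH 1/3]: parallel_memcpy splits the byte range into one
contiguous slice per hardware thread; every thread copies size / threads
bytes and the last thread additionally takes the size % threads remainder.
A minimal self-contained sketch of the same chunking scheme (the name
chunked_memcpy and the 1 MiB buffers are illustrative and not part of the
patch; assumes a parallel-STL-capable toolchain and a nonzero
hardware_concurrency()):

    #include <algorithm>
    #include <cstddef>
    #include <cstring>
    #include <execution>
    #include <thread>
    #include <vector>

    #include <boost/range/irange.hpp>

    // One contiguous slice per thread; the last thread also copies the
    // remainder that size / threads leaves over.
    void chunked_memcpy(std::byte* dst, const std::byte* src, std::size_t size)
    {
        const auto threads = std::size_t{std::thread::hardware_concurrency()};
        const auto sizePerThread = size / threads;
        const auto sizeLastThread = sizePerThread + size % threads;
        auto r = boost::irange(std::size_t{0}, threads);
        std::for_each(std::execution::par, std::begin(r), std::end(r), [&](auto id) {
            std::memcpy(
                dst + id * sizePerThread,
                src + id * sizePerThread,
                id == threads - 1 ? sizeLastThread : sizePerThread);
        });
    }

    int main()
    {
        std::vector<std::byte> src(1 << 20), dst(1 << 20); // 1 MiB each, arbitrary
        chunked_memcpy(dst.data(), src.data(), src.size());
    }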
From 26876b35d0db604ef0144c4bf8ab87a52ce11010 Mon Sep 17 00:00:00 2001
From: Bernhard Manfred Gruber
Date: Fri, 6 Nov 2020 17:09:18 +0100
Subject: [PATCH 2/3] add parallel version of aosoa_copy

---
 examples/viewcopy/viewcopy.cpp | 86 +++++++++++++++++++++++-----------
 1 file changed, 58 insertions(+), 28 deletions(-)

diff --git a/examples/viewcopy/viewcopy.cpp b/examples/viewcopy/viewcopy.cpp
index e878025142..33616c76ff 100644
--- a/examples/viewcopy/viewcopy.cpp
+++ b/examples/viewcopy/viewcopy.cpp
@@ -61,10 +61,10 @@ void naive_copy(const llama::View<Mapping1, BlobType1>& srcView, llama::View<Ma
-    typename BlobType2>
+    typename BlobType2,
+    typename Ex = std::execution::sequenced_policy>
 void aosoa_copy(
     const llama::View<
         llama::mapping::AoSoA<ArrayDomain, DatumDomain, LanesSrc>,
         BlobType1>& srcView,
     llama::View<
         llama::mapping::AoSoA<ArrayDomain, DatumDomain, LanesDst>,
-        BlobType2>& dstView)
+        BlobType2>& dstView,
+    Ex ex = {})
 {
     static_assert(srcView.storageBlobs.rank == 1);
     static_assert(dstView.storageBlobs.rank == 1);
@@ -114,37 +116,59 @@ void aosoa_copy(
         return offset;
     };
 
+    const auto threads = [&]() -> std::size_t {
+        if constexpr (std::is_same_v<Ex, std::execution::sequenced_policy>)
+            return 1u;
+        else
+            return std::thread::hardware_concurrency();
+    }();
+    const auto threadIds = boost::irange(threads);
     if constexpr (ReadOpt)
     {
        // optimized for linear reading
-        for (std::size_t i = 0; i < flatSize; i += LanesSrc)
-        {
-            llama::forEach<DatumDomain>([&](auto coord) {
-                constexpr auto L = std::min(LanesSrc, LanesDst);
-                for (std::size_t j = 0; j < LanesSrc; j += L)
-                {
-                    constexpr auto bytes = L * sizeof(llama::GetType<DatumDomain, decltype(coord)>);
-                    std::memcpy(&dst[map(i + j, coord, LanesDst)], src, bytes);
-                    src += bytes;
-                }
-            });
-        }
+        const auto elementsPerThread = ((flatSize / LanesSrc) / threads) * LanesSrc;
+        std::for_each(ex, std::begin(threadIds), std::end(threadIds), [&](auto id) {
+            const auto start = id * elementsPerThread;
+            const auto stop = id == threads - 1 ? flatSize : (id + 1) * elementsPerThread;
+            auto* threadSrc = src + map(start, llama::DatumCoord<>{}, LanesSrc);
+
+            for (std::size_t i = start; i < stop; i += LanesSrc)
+            {
+                llama::forEach<DatumDomain>([&](auto coord) {
+                    constexpr auto L = std::min(LanesSrc, LanesDst);
+                    for (std::size_t j = 0; j < LanesSrc; j += L)
+                    {
+                        constexpr auto bytes = L * sizeof(llama::GetType<DatumDomain, decltype(coord)>);
+                        std::memcpy(&dst[map(i + j, coord, LanesDst)], threadSrc, bytes);
+                        threadSrc += bytes;
+                    }
+                });
+            }
+        });
     }
     else
     {
        // optimized for linear writing
-        for (std::size_t i = 0; i < flatSize; i += LanesDst)
-        {
-            llama::forEach<DatumDomain>([&](auto coord) {
-                constexpr auto L = std::min(LanesSrc, LanesDst);
-                for (std::size_t j = 0; j < LanesDst; j += L)
-                {
-                    constexpr auto bytes = L * sizeof(llama::GetType<DatumDomain, decltype(coord)>);
-                    std::memcpy(dst, &src[map(i + j, coord, LanesSrc)], bytes);
-                    dst += bytes;
-                }
-            });
-        }
+        const auto elementsPerThread = ((flatSize / LanesDst) / threads) * LanesDst;
+        std::for_each(ex, std::begin(threadIds), std::end(threadIds), [&](auto id) {
+            const auto start = id * elementsPerThread;
+            const auto stop = id == threads - 1 ? flatSize : (id + 1) * elementsPerThread;
+
+            auto* threadDst = dst + map(start, llama::DatumCoord<>{}, LanesDst);
+
+            for (std::size_t i = start; i < stop; i += LanesDst)
+            {
+                llama::forEach<DatumDomain>([&](auto coord) {
+                    constexpr auto L = std::min(LanesSrc, LanesDst);
+                    for (std::size_t j = 0; j < LanesDst; j += L)
+                    {
+                        constexpr auto bytes = L * sizeof(llama::GetType<DatumDomain, decltype(coord)>);
+                        std::memcpy(threadDst, &src[map(i + j, coord, LanesSrc)], bytes);
+                        threadDst += bytes;
+                    }
+                });
+            }
+        });
     }
 }
 
@@ -272,5 +296,11 @@ int main(int argc, char** argv)
         benchmarkCopy("aosoa_copy(w)", srcView, srcHash, dstMapping, [](const auto& srcView, auto& dstView) {
             aosoa_copy<false>(srcView, dstView);
         });
+        benchmarkCopy("aosoa_copy(r,p)", srcView, srcHash, dstMapping, [](const auto& srcView, auto& dstView) {
+            aosoa_copy<true>(srcView, dstView, std::execution::par);
+        });
+        benchmarkCopy("aosoa_copy(w,p)", srcView, srcHash, dstMapping, [](const auto& srcView, auto& dstView) {
+            aosoa_copy<false>(srcView, dstView, std::execution::par);
+        });
     });
 }
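Note on [PATCH 2/3]: the parallel aosoa_copy divides the flat array domain
between threads in whole multiples of the AoSoA lane count, so every
thread's linear read (or write) cursor starts exactly on a block boundary;
the last thread also processes the tail that the rounding leaves over. A
small sketch of that partitioning arithmetic (the helper name
elementsPerThread mirrors the patch, the concrete numbers are made up):

    #include <cstddef>

    // Round the per-thread share down to whole blocks of `lanes` elements.
    constexpr std::size_t elementsPerThread(std::size_t flatSize, std::size_t lanes, std::size_t threads)
    {
        return ((flatSize / lanes) / threads) * lanes;
    }

    // Worked example: 1000 elements, 16 lanes, 4 threads. Threads 0..2 get
    // 240 elements each (15 whole blocks); the last thread starts at
    // element 720 and runs to the end, i.e. 280 elements including the tail.
    static_assert(elementsPerThread(1000, 16, 4) == 240);
    static_assert(1000 - 3 * elementsPerThread(1000, 16, 4) == 280);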
From eeabdaca25a69b7fd2d1039e55ff7df61e4c5a18 Mon Sep 17 00:00:00 2001
From: Bernhard Manfred Gruber
Date: Mon, 9 Nov 2020 12:37:26 +0100
Subject: [PATCH 3/3] upgrade to g++-10 for Linux CI to support parallel STL

---
 .github/workflows/ci.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index 8c874a392c..d227c6742c 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -14,7 +14,7 @@ jobs:
         mkdir build
         cd build
         export BOOST_ROOT=$BOOST_ROOT_1_72_0
-        CXX=g++-9 cmake .. -DCMAKE_BUILD_TYPE=RelWithDebInfo -DASAN_FOR_TESTS=ON -DCMAKE_TOOLCHAIN_FILE=/usr/local/share/vcpkg/scripts/buildsystems/vcpkg.cmake
+        CXX=g++-10 cmake .. -DCMAKE_BUILD_TYPE=RelWithDebInfo -DASAN_FOR_TESTS=ON -DCMAKE_TOOLCHAIN_FILE=/usr/local/share/vcpkg/scripts/buildsystems/vcpkg.cmake
     - name: build
       run: cmake --build build
     - name: tests
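Note on [PATCH 3/3]: with libstdc++, the std::execution policies used above
are implemented on top of Intel TBB, so the benchmark needs the TBB headers
at compile time and typically -ltbb at link time. A quick hedged check that
a given toolchain actually provides the parallel algorithms (the file name
check.cpp and the link flag are assumptions, not part of the patch):

    // g++-10 check.cpp -ltbb && ./a.out
    #include <iostream>
    #include <vector>
    #if __has_include(<execution>)
    #    include <algorithm>
    #    include <execution>
    #    include <numeric>
    #endif

    int main()
    {
    #if defined(__cpp_lib_parallel_algorithm)
        // <execution> defines this feature-test macro when usable
        std::vector<int> v(1'000'000);
        std::iota(v.begin(), v.end(), 0);
        std::sort(std::execution::par, v.begin(), v.end());
        std::cout << "parallel algorithms available\n";
    #else
        std::cout << "parallel algorithms NOT available\n";
    #endif
    }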