diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index 8c874a392c..d227c6742c 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -14,7 +14,7 @@ jobs:
         mkdir build
         cd build
         export BOOST_ROOT=$BOOST_ROOT_1_72_0
-        CXX=g++-9 cmake .. -DCMAKE_BUILD_TYPE=RelWithDebInfo -DASAN_FOR_TESTS=ON -DCMAKE_TOOLCHAIN_FILE=/usr/local/share/vcpkg/scripts/buildsystems/vcpkg.cmake
+        CXX=g++-10 cmake .. -DCMAKE_BUILD_TYPE=RelWithDebInfo -DASAN_FOR_TESTS=ON -DCMAKE_TOOLCHAIN_FILE=/usr/local/share/vcpkg/scripts/buildsystems/vcpkg.cmake
     - name: build
       run: cmake --build build
     - name: tests
diff --git a/examples/viewcopy/viewcopy.cpp b/examples/viewcopy/viewcopy.cpp
index 6c1688adc0..33616c76ff 100644
--- a/examples/viewcopy/viewcopy.cpp
+++ b/examples/viewcopy/viewcopy.cpp
@@ -1,9 +1,12 @@
 #include <algorithm>
 #include <boost/functional/hash.hpp>
+#include <boost/range/irange.hpp>
 #include <cstring>
+#include <execution>
 #include <fstream>
 #include <iostream>
 #include <llama/llama.hpp>
+#include <thread>
 
 // clang-format off
 namespace tag
@@ -31,15 +34,21 @@ using Particle = llama::DS<
 >;
 // clang-format on
 
-template <typename Mapping1, typename BlobType1, typename Mapping2, typename BlobType2>
-void naive_copy(const llama::View<Mapping1, BlobType1>& srcView, llama::View<Mapping2, BlobType2>& dstView)
+template <
+    typename Mapping1,
+    typename BlobType1,
+    typename Mapping2,
+    typename BlobType2,
+    typename Ex = std::execution::sequenced_policy>
+void naive_copy(const llama::View<Mapping1, BlobType1>& srcView, llama::View<Mapping2, BlobType2>& dstView, Ex ex = {})
 {
     static_assert(std::is_same_v<typename Mapping1::DatumDomain, typename Mapping2::DatumDomain>);
 
     if (srcView.mapping.arrayDomainSize != dstView.mapping.arrayDomainSize)
         throw std::runtime_error{"UserDomain sizes are different"};
 
-    for (auto ad : llama::ArrayDomainIndexRange{srcView.mapping.arrayDomainSize})
+    auto r = llama::ArrayDomainIndexRange{srcView.mapping.arrayDomainSize};
+    std::for_each(ex, std::begin(r), std::end(r), [&](auto ad) {
         llama::forEach<typename Mapping1::DatumDomain>([&](auto coord) {
             dstView(ad)(coord) = srcView(ad)(coord);
             // std::memcpy(
@@ -47,6 +56,21 @@ void naive_copy(const llama::View<Mapping1, BlobType1>& srcView, llama::View<Mapping2, BlobType2>& dstView)
             //     &srcView(ad)(coord),
             //     sizeof(llama::GetType<typename Mapping1::DatumDomain, decltype(coord)>));
         });
+    });
+}
+
+void parallel_memcpy(std::byte* dst, const std::byte* src, std::size_t size)
+{
+    const auto threads = std::size_t{std::thread::hardware_concurrency()};
+    const auto sizePerThread = size / threads;
+    const auto sizeLastThread = sizePerThread + size % threads;
+    auto r = boost::irange(threads);
+    std::for_each(std::execution::par, std::begin(r), std::end(r), [&](auto id) {
+        std::memcpy(
+            dst + id * sizePerThread,
+            src + id * sizePerThread,
+            id == threads - 1 ? sizeLastThread : sizePerThread);
+    });
 }
 
 template <
@@ -56,14 +80,16 @@ template <
     std::size_t LanesSrc,
     typename BlobType1,
     std::size_t LanesDst,
-    typename BlobType2>
+    typename BlobType2,
+    typename Ex = std::execution::sequenced_policy>
 void aosoa_copy(
     const llama::View<
         llama::mapping::AoSoA<ArrayDomain, DatumDomain, LanesSrc>,
         BlobType1>& srcView,
     llama::View<
         llama::mapping::AoSoA<ArrayDomain, DatumDomain, LanesDst>,
-        BlobType2>& dstView)
+        BlobType2>& dstView,
+    Ex ex = {})
 {
     static_assert(srcView.storageBlobs.rank == 1);
     static_assert(dstView.storageBlobs.rank == 1);
@@ -90,37 +116,59 @@ void aosoa_copy(
         return offset;
     };
 
+    const auto threads = [&]() -> std::size_t {
+        if constexpr (std::is_same_v<Ex, std::execution::sequenced_policy>)
+            return 1u;
+        else
+            return std::thread::hardware_concurrency();
+    }();
+    const auto threadIds = boost::irange(threads);
     if constexpr (ReadOpt)
     {
         // optimized for linear reading
-        for (std::size_t i = 0; i < flatSize; i += LanesSrc)
-        {
-            llama::forEach<DatumDomain>([&](auto coord) {
-                constexpr auto L = std::min(LanesSrc, LanesDst);
-                for (std::size_t j = 0; j < LanesSrc; j += L)
-                {
-                    constexpr auto bytes = L * sizeof(llama::GetType<DatumDomain, decltype(coord)>);
-                    std::memcpy(&dst[map(i + j, coord, LanesDst)], src, bytes);
-                    src += bytes;
-                }
-            });
-        }
+        const auto elementsPerThread = ((flatSize / LanesSrc) / threads) * LanesSrc;
+        std::for_each(ex, std::begin(threadIds), std::end(threadIds), [&](auto id) {
+            const auto start = id * elementsPerThread;
+            const auto stop = id == threads - 1 ? flatSize : (id + 1) * elementsPerThread;
+            auto* threadSrc = src + map(start, llama::DatumCoord<>{}, LanesSrc);
+
+            for (std::size_t i = start; i < stop; i += LanesSrc)
+            {
+                llama::forEach<DatumDomain>([&](auto coord) {
+                    constexpr auto L = std::min(LanesSrc, LanesDst);
+                    for (std::size_t j = 0; j < LanesSrc; j += L)
+                    {
+                        constexpr auto bytes = L * sizeof(llama::GetType<DatumDomain, decltype(coord)>);
+                        std::memcpy(&dst[map(i + j, coord, LanesDst)], threadSrc, bytes);
+                        threadSrc += bytes;
+                    }
+                });
+            }
+        });
     }
     else
     {
         // optimized for linear writing
-        for (std::size_t i = 0; i < flatSize; i += LanesDst)
-        {
-            llama::forEach<DatumDomain>([&](auto coord) {
-                constexpr auto L = std::min(LanesSrc, LanesDst);
-                for (std::size_t j = 0; j < LanesDst; j += L)
-                {
-                    constexpr auto bytes = L * sizeof(llama::GetType<DatumDomain, decltype(coord)>);
-                    std::memcpy(dst, &src[map(i + j, coord, LanesSrc)], bytes);
-                    dst += bytes;
-                }
-            });
-        }
+        const auto elementsPerThread = ((flatSize / LanesDst) / threads) * LanesDst;
+        std::for_each(ex, std::begin(threadIds), std::end(threadIds), [&](auto id) {
+            const auto start = id * elementsPerThread;
+            const auto stop = id == threads - 1 ? flatSize : (id + 1) * elementsPerThread;
+
+            auto* threadDst = dst + map(start, llama::DatumCoord<>{}, LanesDst);
+
+            for (std::size_t i = start; i < stop; i += LanesDst)
+            {
+                llama::forEach<DatumDomain>([&](auto coord) {
+                    constexpr auto L = std::min(LanesSrc, LanesDst);
+                    for (std::size_t j = 0; j < LanesDst; j += L)
+                    {
+                        constexpr auto bytes = L * sizeof(llama::GetType<DatumDomain, decltype(coord)>);
+                        std::memcpy(threadDst, &src[map(i + j, coord, LanesSrc)], bytes);
+                        threadDst += bytes;
+                    }
+                });
+            }
+        });
     }
 }
 
@@ -180,11 +228,22 @@ int main(int argc, char** argv)
         benchmarkCopy("naive_copy", srcView, srcHash, dstMapping, [](const auto& srcView, auto& dstView) {
             naive_copy(srcView, dstView);
         });
+        benchmarkCopy("naive_copy(p)", srcView, srcHash, dstMapping, [](const auto& srcView, auto& dstView) {
+            naive_copy(srcView, dstView, std::execution::par);
+        });
         benchmarkCopy("memcpy    ", srcView, srcHash, dstMapping, [](const auto& srcView, auto& dstView) {
             static_assert(srcView.storageBlobs.rank == 1);
             static_assert(dstView.storageBlobs.rank == 1);
             std::memcpy(dstView.storageBlobs[0].data(), srcView.storageBlobs[0].data(), dstView.storageBlobs[0].size());
         });
+        benchmarkCopy("memcpy(p) ", srcView, srcHash, dstMapping, [](const auto& srcView, auto& dstView) {
+            static_assert(srcView.storageBlobs.rank == 1);
+            static_assert(dstView.storageBlobs.rank == 1);
+            parallel_memcpy(
+                dstView.storageBlobs[0].data(),
+                srcView.storageBlobs[0].data(),
+                dstView.storageBlobs[0].size());
+        });
     }
 
     {
@@ -196,6 +255,9 @@ int main(int argc, char** argv)
         benchmarkCopy("naive_copy", srcView, srcHash, dstMapping, [](const auto& srcView, auto& dstView) {
             naive_copy(srcView, dstView);
         });
+        benchmarkCopy("naive_copy(p)", srcView, srcHash, dstMapping, [](const auto& srcView, auto& dstView) {
+            naive_copy(srcView, dstView, std::execution::par);
+        });
         benchmarkCopy("memcpy    ", srcView, srcHash, dstMapping, [](const auto& srcView, auto& dstView) {
             static_assert(srcView.storageBlobs.rank == 1);
             static_assert(dstView.storageBlobs.rank == 1);
@@ -220,6 +282,9 @@ int main(int argc, char** argv)
         benchmarkCopy("naive_copy   ", srcView, srcHash, dstMapping, [](const auto& srcView, auto& dstView) {
             naive_copy(srcView, dstView);
         });
+        benchmarkCopy("naive_copy(p)", srcView, srcHash, dstMapping, [](const auto& srcView, auto& dstView) {
+            naive_copy(srcView, dstView, std::execution::par);
+        });
         benchmarkCopy("memcpy    ", srcView, srcHash, dstMapping, [](const auto& srcView, auto& dstView) {
             static_assert(srcView.storageBlobs.rank == 1);
             static_assert(dstView.storageBlobs.rank == 1);
@@ -231,5 +296,11 @@ int main(int argc, char** argv)
         benchmarkCopy("aosoa_copy(w)", srcView, srcHash, dstMapping, [](const auto& srcView, auto& dstView) {
             aosoa_copy<false>(srcView, dstView);
         });
+        benchmarkCopy("aosoa_copy(r,p)", srcView, srcHash, dstMapping, [](const auto& srcView, auto& dstView) {
+            aosoa_copy<true>(srcView, dstView, std::execution::par);
+        });
+        benchmarkCopy("aosoa_copy(w,p)", srcView, srcHash, dstMapping, [](const auto& srcView, auto& dstView) {
+            aosoa_copy<false>(srcView, dstView, std::execution::par);
+        });
     });
 }
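
Note (not part of the patch): naive_copy and aosoa_copy gain parallelism through the same pattern, a defaulted execution-policy parameter (typename Ex = std::execution::sequenced_policy, Ex ex = {}). Existing call sites keep compiling and stay sequential, while parallel call sites only pass std::execution::par as one extra argument. A minimal self-contained sketch of that pattern, with illustrative names:

#include <algorithm>
#include <execution>
#include <vector>

template <typename Ex = std::execution::sequenced_policy>
void scale_all(std::vector<int>& v, Ex ex = {})
{
    // Dispatches to the sequential or the parallel std::for_each overload,
    // depending on the policy the caller passes.
    std::for_each(ex, v.begin(), v.end(), [](int& x) { x *= 2; });
}

// scale_all(v);                      // sequential, as before
// scale_all(v, std::execution::par); // parallel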
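Note (not part of the patch): parallel_memcpy splits the buffer into one contiguous chunk per hardware thread; every thread copies size / threads bytes, and the last thread also absorbs the size % threads remainder. A standalone sketch of the same chunking without Boost (boost::irange is swapped for a std::iota'd index vector, and a hardware_concurrency() result of 0, which the patch does not guard against, is clamped to 1):

#include <algorithm>
#include <cstddef>
#include <cstring>
#include <execution>
#include <numeric>
#include <thread>
#include <vector>

void chunked_memcpy(std::byte* dst, const std::byte* src, std::size_t size)
{
    // One chunk per hardware thread; clamp to 1 when the count is unknown (0).
    const auto threads = std::max<std::size_t>(1, std::thread::hardware_concurrency());
    const auto sizePerThread = size / threads;
    const auto sizeLastThread = sizePerThread + size % threads; // chunk + remainder

    std::vector<std::size_t> ids(threads);
    std::iota(ids.begin(), ids.end(), std::size_t{0});
    std::for_each(std::execution::par, ids.begin(), ids.end(), [&](std::size_t id) {
        std::memcpy(
            dst + id * sizePerThread,
            src + id * sizePerThread,
            id == threads - 1 ? sizeLastThread : sizePerThread);
    });
}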
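Note (not part of the patch): in aosoa_copy, elementsPerThread = ((flatSize / LanesSrc) / threads) * LanesSrc rounds each thread's share down to a multiple of the lane count, so every thread starts exactly on an AoSoA block boundary and map(start, llama::DatumCoord<>{}, LanesSrc) yields the byte offset of that block's first field. The last thread runs up to flatSize and takes the remainder. For example, with flatSize = 1000, LanesSrc = 16 and threads = 6, integer arithmetic gives (1000 / 16) / 6 * 16 = 160, so threads 0 through 4 handle 160 elements each and thread 5 handles the remaining 200.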