diff --git a/.clang-tidy b/.clang-tidy
index 1c9d01d197..fb4d517538 100644
--- a/.clang-tidy
+++ b/.clang-tidy
@@ -50,7 +50,8 @@ Checks: >
   -misc-no-recursion,
   -llvm-header-guard,
   -cppcoreguidelines-macro-usage,
-  -fuchsia-statically-constructed-objects
+  -fuchsia-statically-constructed-objects,
+  -cppcoreguidelines-pro-type-union-access
 WarningsAsErrors: '*'
 HeaderFilterRegex: ''
diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index fb6022920a..be55db938c 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -72,9 +72,16 @@ jobs:
     - uses: actions/checkout@v2
       with:
         fetch-depth: 1
-    - name: apt install boost and lcov
+    - name: install boost
+      run: |
+        BOOST_ARCHIVE=boost_${BOOST_VERSION//./_}.tar.bz2
+        wget -q https://boostorg.jfrog.io/artifactory/main/release/$BOOST_VERSION/source/$BOOST_ARCHIVE
+        tar -xf $BOOST_ARCHIVE
+        rm $BOOST_ARCHIVE
+        mv boost_${BOOST_VERSION//./_} "${BOOST_ROOT}"
+    - name: install lcov
       run: |
-        sudo apt install libboost-all-dev lcov
+        sudo apt install lcov
     - name: vcpkg install dependencies
       run: |
        vcpkg install catch2 fmt
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 96697fbccb..d975590084 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -8,6 +8,7 @@
 add_library(${PROJECT_NAME} INTERFACE)
 target_include_directories(${PROJECT_NAME} INTERFACE $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include> $<INSTALL_INTERFACE:include>)
 target_compile_features(${PROJECT_NAME} INTERFACE cxx_std_17)
 target_link_libraries(${PROJECT_NAME} INTERFACE Boost::headers)
+add_compile_definitions(BOOST_ATOMIC_NO_LIB) # we don't need the compiled part of Boost.Atomic in LLAMA or its examples
 if (fmt_FOUND)
     target_link_libraries(${PROJECT_NAME} INTERFACE fmt::fmt)
 else()
diff --git a/cmake/llama-config.cmake.in b/cmake/llama-config.cmake.in
index ddcd8f1eb0..8e11ee4aa1 100644
--- a/cmake/llama-config.cmake.in
+++ b/cmake/llama-config.cmake.in
@@ -16,9 +16,15 @@ endif()
 set(llama_INCLUDE_DIR ${llama_INCLUDE_DIR} "${PACKAGE_PREFIX_DIR}/include")
 target_include_directories(llama INTERFACE ${llama_INCLUDE_DIR})
 
-# Boost
+# dependencies
 find_package(Boost 1.70.0 REQUIRED)
+find_package(fmt CONFIG QUIET)
 target_link_libraries(llama INTERFACE Boost::headers)
+if (fmt_FOUND)
+    target_link_libraries(llama INTERFACE fmt::fmt)
+else()
+    message(WARNING "The fmt library was not found. You cannot use llama's dumping facilities.")
+endif()
 
 INCLUDE(FindPackageHandleStandardArgs)
diff --git a/docs/pages/mappings.rst b/docs/pages/mappings.rst
index bc8af723ef..b1048cfae7 100644
--- a/docs/pages/mappings.rst
+++ b/docs/pages/mappings.rst
@@ -206,17 +206,27 @@ Trace
 -----
 
 The Trace mapping is a meta mapping that wraps over an inner mapping and counts all accesses made to the fields of the record dimension.
-A report is printed to the stdout when requested or the mapping instance is destroyed.
+A report is printed to stdout when requested.
+The mapping adds an additional blob to the blobs of the inner mapping, which is used as storage for the access counts.
 
 .. code-block:: C++
 
-    {
-        auto anyMapping = ...;
-        llama::mapping::Trace mapping{anyMapping};
-        ...
-        mapping.print(); // print report explicitly
-    } // report is printed implicitly
+    auto anyMapping = ...;
+    llama::mapping::Trace mapping{anyMapping};
+    auto view = llama::allocView(mapping);
+    ...
+    view.mapping().printFieldHits(view.storageBlobs); // print report with reads and writes to each field
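+
+The per-field counters can also be read programmatically via ``fieldHits``, which returns a reference to the array
+of ``AccessCounts`` stored in the additional blob, with one entry per field of the record dimension.
+A minimal sketch, assuming the ``view`` from the snippet above:
+
+.. code-block:: C++
+
+    const auto& hits = view.mapping().fieldHits(view.storageBlobs);
+    for(const auto& fieldCounts : hits)
+        std::cout << "R: " << fieldCounts.reads << "\tW: " << fieldCounts.writes << '\n';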
+
+The Trace mapping uses proxy references to instrument reads and writes.
+If this is problematic, Trace can also be configured to return raw C++ references.
+In that case, only the number of memory location computations can be traced,
+but not how often the program reads from or writes to those locations.
+
+.. code-block:: C++
+
+    auto anyMapping = ...;
+    llama::mapping::Trace<decltype(anyMapping), std::size_t, false> mapping{anyMapping};
+    auto view = llama::allocView(mapping);
+    ...
+    view.mapping().printFieldHits(view.storageBlobs); // print report with the number of computed memory locations
 
 Null
 ----
diff --git a/examples/cuda/nbody/nbody.cu b/examples/cuda/nbody/nbody.cu
index edf0452b73..951aaefaf5 100644
--- a/examples/cuda/nbody/nbody.cu
+++ b/examples/cuda/nbody/nbody.cu
@@ -17,6 +17,7 @@ constexpr auto SHARED_ELEMENTS_PER_BLOCK = 512;
 constexpr auto STEPS = 5; ///< number of steps to calculate
 constexpr auto ALLOW_RSQRT = true; // rsqrt can be way faster, but less accurate
 constexpr auto RUN_UPATE = true; // run update step. Useful to disable for benchmarking the move step.
+constexpr auto TRACE = true;
 constexpr FP TIMESTEP = 0.0001f;
 constexpr auto THREADS_PER_BLOCK = 256;
@@ -28,6 +29,12 @@ static_assert(SHARED_ELEMENTS_PER_BLOCK % THREADS_PER_BLOCK == 0);
 
 constexpr FP EPS2 = 0.01;
 
+#if __CUDA_ARCH__ >= 600
+using CountType = unsigned long long int;
+#else
+using CountType = unsigned;
+#endif
+
 using namespace std::literals;
 
 // clang-format off
@@ -205,12 +212,19 @@ try
             llama::mapping::BindSoA<>::fn,
             true>{extents};
     }();
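+    // if constexpr alone cannot give a variable two different types, so an immediately invoked
+    // lambda is used to select between the traced and the untraced mapping type at compile time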
+    auto tmapping = [&]
+    {
+        if constexpr(TRACE)
+            return llama::mapping::Trace<std::decay_t<decltype(mapping)>, CountType>{mapping};
+        else
+            return mapping;
+    }();
 
     Stopwatch watch;
 
-    auto hostView = llama::allocViewUninitialized(mapping);
+    auto hostView = llama::allocViewUninitialized(tmapping);
     auto accView = llama::allocViewUninitialized(
-        mapping,
+        tmapping,
         [](auto alignment, std::size_t size)
         {
             std::byte* p = nullptr;
@@ -236,6 +250,8 @@ try
             p(tag::Mass()) = distribution(engine) / FP(100);
             hostView(i) = p;
         }
+        if constexpr(TRACE)
+            hostView.mapping().fieldHits(hostView.storageBlobs) = {};
 
         watch.printAndReset("init");
 
@@ -255,12 +271,15 @@ try
         };
 
         start();
-        for(auto i = 0; i < accView.storageBlobs.size(); i++)
+        const auto blobs = hostView.storageBlobs.size();
+        for(std::size_t i = 0; i < blobs; i++)
             checkError(cudaMemcpy(
                 accView.storageBlobs[i],
                 hostView.storageBlobs[i].data(),
-                mapping.blobSize(i),
+                hostView.mapping().blobSize(i),
                 cudaMemcpyHostToDevice));
+        if constexpr(TRACE)
+            cudaMemset(accView.storageBlobs[blobs - 1], 0, accView.mapping().blobSize(blobs - 1)); // init trace count buffer
         std::cout << "copy H->D " << stop() << " s\n";
 
         const auto blocks = PROBLEM_SIZE / THREADS_PER_BLOCK;
@@ -290,15 +309,18 @@ try
         plotFile << std::quoted(title) << "\t" << sumUpdate / STEPS << '\t' << sumMove / STEPS << '\n';
 
         start();
-        for(auto i = 0; i < accView.storageBlobs.size(); i++)
+        for(std::size_t i = 0; i < blobs; i++)
             checkError(cudaMemcpy(
                 hostView.storageBlobs[i].data(),
                 accView.storageBlobs[i],
-                mapping.blobSize(i),
+                hostView.mapping().blobSize(i),
                 cudaMemcpyDeviceToHost));
         std::cout << "copy D->H " << stop() << " s\n";
 
-        for(auto i = 0; i < accView.storageBlobs.size(); i++)
+        if constexpr(TRACE)
+            hostView.mapping().printFieldHits(hostView.storageBlobs);
+
+        for(std::size_t i = 0; i < accView.storageBlobs.size(); i++)
             checkError(cudaFree(accView.storageBlobs[i]));
         checkError(cudaEventDestroy(startEvent));
         checkError(cudaEventDestroy(stopEvent));
diff --git a/examples/nbody/nbody.cpp b/examples/nbody/nbody.cpp
index 28b9d82ba2..5cf4ae2449 100644
--- a/examples/nbody/nbody.cpp
+++ b/examples/nbody/nbody.cpp
@@ -190,6 +190,8 @@ namespace usellama
             p(tag::Vel{}, tag::Z{}) = dist(engine) / FP(10);
             p(tag::Mass{}) = dist(engine) / FP(100);
         }
+        if constexpr(TRACE)
+            particles.mapping().fieldHits(particles.storageBlobs) = {};
 
         watch.printAndReset("init");
         double sumUpdate = 0;
@@ -207,7 +209,9 @@ namespace usellama
         plotFile << std::quoted(title) << "\t" << sumUpdate / STEPS << '\t' << sumMove / STEPS << '\n';
 
         if constexpr(HEATMAP)
-            std::ofstream("nbody_heatmap_" + mappingName(Mapping) + ".sh") << particles.mapping.toGnuplotScript();
+            std::ofstream("nbody_heatmap_" + mappingName(Mapping) + ".sh") << particles.mapping().toGnuplotScript();
+        if constexpr(TRACE)
+            particles.mapping().printFieldHits(particles.storageBlobs);
 
         return 0;
     }
diff --git a/include/llama/Array.hpp b/include/llama/Array.hpp
index 856235c9e2..32669f207b 100644
--- a/include/llama/Array.hpp
+++ b/include/llama/Array.hpp
@@ -24,6 +24,11 @@ namespace llama
             return N;
         }
 
+        LLAMA_FN_HOST_ACC_INLINE constexpr auto empty() const -> bool
+        {
+            return N == 0;
+        }
+
         LLAMA_FN_HOST_ACC_INLINE constexpr auto begin() -> T*
         {
             return &element[0];
@@ -100,6 +105,11 @@ namespace llama
             return 0;
         }
 
+        LLAMA_FN_HOST_ACC_INLINE constexpr auto empty() const -> bool
+        {
+            return true;
+        }
+
         LLAMA_FN_HOST_ACC_INLINE constexpr auto begin() -> T*
         {
             return nullptr;
diff --git a/include/llama/View.hpp b/include/llama/View.hpp
index a362eee89f..c057f9b1a0 100644
--- a/include/llama/View.hpp
+++ b/include/llama/View.hpp
@@ -376,11 +376,13 @@ namespace llama
             return static_cast<Mapping&>(*this);
         }
 
+#if !(defined(_MSC_VER) && defined(__NVCC__))
         template<typename V>
         auto operator()(llama::ArrayIndex<V, ArrayExtents::rank>) const
         {
             static_assert(!sizeof(V), "Passed ArrayIndex with SizeType different than Mapping::ArrayExtent");
         }
+#endif
 
         /// Retrieves the \ref VirtualRecord at the given \ref ArrayIndex index.
         LLAMA_FN_HOST_ACC_INLINE auto operator()(ArrayIndex ai) const -> decltype(auto)
@@ -451,11 +453,13 @@ namespace llama
             return (*this)(ai);
         }
 
+#if !(defined(_MSC_VER) && defined(__NVCC__))
         template<typename V>
         auto operator[](llama::ArrayIndex<V, ArrayExtents::rank>) const
         {
             static_assert(!sizeof(V), "Passed ArrayIndex with SizeType different than Mapping::ArrayExtent");
         }
+#endif
 
         /// Retrieves the \ref VirtualRecord at the 1D \ref ArrayIndex index constructed from the passed index.
         LLAMA_FN_HOST_ACC_INLINE auto operator[](size_type index) const -> decltype(auto)
diff --git a/include/llama/mapping/Trace.hpp b/include/llama/mapping/Trace.hpp
index d56062e9ff..df7661dca5 100644
--- a/include/llama/mapping/Trace.hpp
+++ b/include/llama/mapping/Trace.hpp
@@ -2,16 +2,51 @@
 
 #include "Common.hpp"
 
-#include <array>
 #include <atomic>
 #include <iostream>
 
+#ifndef __cpp_lib_atomic_ref
+#    include <boost/atomic/atomic_ref.hpp>
+#endif
+
 namespace llama::mapping
 {
-    /// Forwards all calls to the inner mapping. Traces all accesses made through this mapping and prints a summary on
-    /// destruction.
-    /// \tparam Mapping The type of the inner mapping.
-    template<typename Mapping>
+    namespace internal
+    {
+        template<typename CountType>
+        LLAMA_FN_HOST_ACC_INLINE void atomicInc(CountType& i)
+        {
+#ifdef __CUDA_ARCH__
+            // if you get an error here that there is no overload of atomicAdd, your CMAKE_CUDA_ARCHITECTURE might be
+            // too low, or you need to use a smaller CountType for the Trace mapping.
+            atomicAdd(&i, CountType{1});
+#elif defined(__cpp_lib_atomic_ref)
+            ++std::atomic_ref<CountType>{i};
+#else
+            ++boost::atomic_ref<CountType>{i};
+#endif
+        }
+    } // namespace internal
+
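+    /// Counters for the accesses to a single field. The reads counter shares storage with memLocsComputed: with
+    /// proxy references, reads and writes are counted separately; otherwise only memory location computations are counted.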
+    template<typename CountType>
+    struct AccessCounts
+    {
+        union
+        {
+            CountType memLocsComputed;
+            CountType reads;
+        };
+        CountType writes;
+    };
+
+    /// Forwards all calls to the inner mapping. Traces all accesses made through this mapping and allows printing a
+    /// summary.
+    /// \tparam Mapping The type of the inner mapping.
+    /// \tparam CountType The type used for counting the number of accesses.
+    /// \tparam MyCodeHandlesProxyReferences If false, Trace will avoid proxy references, but can then only count
+    /// the number of address computations.
+    template<typename Mapping, typename CountType = std::size_t, bool MyCodeHandlesProxyReferences = true>
     struct Trace : Mapping
     {
     private:
@@ -19,66 +54,127 @@ namespace llama::mapping
 
     public:
         using RecordDim = typename Mapping::RecordDim;
+        using FieldHitsArray = Array<AccessCounts<CountType>, flatFieldCount<RecordDim>>;
+
+        inline static constexpr auto blobCount = Mapping::blobCount + 1;
 
         constexpr Trace() = default;
 
         LLAMA_FN_HOST_ACC_INLINE
-        explicit Trace(Mapping mapping, bool printOnDestruction = true)
-            : Mapping(mapping)
-            , fieldHits{}
-            , printOnDestruction(printOnDestruction)
+        explicit Trace(Mapping mapping) : Mapping(std::move(mapping))
         {
         }
 
-        Trace(const Trace&) = delete;
-        auto operator=(const Trace&) -> Trace& = delete;
+        LLAMA_FN_HOST_ACC_INLINE
+        constexpr auto blobSize(size_type blobIndex) const -> size_type
+        {
+            if(blobIndex < size_type{Mapping::blobCount})
+                return inner().blobSize(blobIndex);
+            return sizeof(FieldHitsArray);
+        }
 
-        Trace(Trace&& other) noexcept
-            : Mapping(std::move(static_cast<Mapping&>(other)))
-            , printOnDestruction(other.printOnDestruction)
+        template<std::size_t... RecordCoords>
+        static constexpr auto isComputed(RecordCoord<RecordCoords...>)
         {
-            for(std::size_t i = 0; i < fieldHits.size(); i++)
-                fieldHits[i] = other.fieldHits[i].load();
-            other.printOnDestruction = false;
+            return true;
         }
 
-        auto operator=(Trace&& other) noexcept -> Trace&
+        template<std::size_t... RecordCoords, typename Blobs>
+        LLAMA_FN_HOST_ACC_INLINE auto compute(
+            typename Mapping::ArrayIndex ai,
+            RecordCoord<RecordCoords...> rc,
+            Blobs& blobs) const -> decltype(auto)
         {
-            static_cast<Mapping&>(*this) = std::move(static_cast<Mapping&>(other));
-            printOnDestruction = other.printOnDestruction;
-            for(std::size_t i = 0; i < fieldHits.size(); i++)
-                fieldHits[i] = other.fieldHits[i].load();
-            other.printOnDestruction = false;
-            return *this;
+            auto& hits = fieldHits(blobs)[+flatRecordCoord<RecordDim, RecordCoord<RecordCoords...>>];
+            auto&& ref = mapToMemory(inner(), ai, rc, blobs);
+            if constexpr(MyCodeHandlesProxyReferences)
+            {
+                using Ref = decltype(mapToMemory(inner(), ai, rc, blobs)); // T& or proxy reference
+                using VT = GetType<RecordDim, RecordCoord<RecordCoords...>>;
+                struct Reference : ProxyRefOpMixin<Reference, VT>
+                {
+                    using value_type = VT;
+
+                    Ref r;
+                    AccessCounts<CountType>* hits;
+
+                    LLAMA_FN_HOST_ACC_INLINE auto operator=(value_type t) -> Reference&
+                    {
+                        internal::atomicInc(hits->writes);
+                        r = t;
+                        return *this;
+                    }
+
+                    LLAMA_FN_HOST_ACC_INLINE operator value_type() const
+                    {
+                        internal::atomicInc(hits->reads);
+                        return static_cast<value_type>(r);
+                    }
+                };
+                return Reference{{}, std::forward<Ref>(ref), &hits};
+            }
+            else
+            {
+                internal::atomicInc(hits.memLocsComputed);
+                return ref;
+            }
         }
 
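+        /// Returns the array of per-field access counters, which is stored in the mapping's additional last blob.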
-        ~Trace()
+        template<typename Blobs>
+        LLAMA_FN_HOST_ACC_INLINE auto fieldHits(const Blobs& blobs) const -> const FieldHitsArray&
         {
-            if(printOnDestruction && !fieldHits.empty())
-                print();
+            return reinterpret_cast<const FieldHitsArray&>(*&blobs[blobCount - 1][0]);
         }
 
-        template<std::size_t... RecordCoords>
-        LLAMA_FN_HOST_ACC_INLINE auto blobNrAndOffset(
-            typename Mapping::ArrayIndex ai,
-            RecordCoord<RecordCoords...> rc = {}) const -> NrAndOffset<size_type>
+        template<typename Blobs>
+        LLAMA_FN_HOST_ACC_INLINE auto fieldHits(Blobs& blobs) const -> FieldHitsArray&
         {
-            ++fieldHits[flatRecordCoord<RecordDim, RecordCoord<RecordCoords...>>];
-            return Mapping::blobNrAndOffset(ai, rc);
+            return const_cast<FieldHitsArray&>(fieldHits(std::as_const(blobs)));
         }
 
-        void print() const
+        template<typename Blobs>
+        LLAMA_FN_HOST_ACC_INLINE void printFieldHits(const Blobs& blobs) const
         {
-            std::cout << "Trace mapping, number of accesses:\n";
+            const auto& hits = fieldHits(blobs);
+#ifdef __CUDA_ARCH__
+            if constexpr(MyCodeHandlesProxyReferences)
+                printf("Trace mapping, number of accesses:\n");
+            else
+                printf("Trace mapping, number of memory locations computed:\n");
+            for(int i = 0; i < static_cast<int>(hits.size()); i++)
+                if constexpr(MyCodeHandlesProxyReferences)
+                    printf(
+                        "\t%i:\tR: %lu\tW: %lu\n",
+                        i,
+                        static_cast<unsigned long>(hits[i].reads),
+                        static_cast<unsigned long>(hits[i].writes));
+                else
+                    printf("\t%i:\t%lu\n", i, static_cast<unsigned long>(hits[i].memLocsComputed));
+#else
+            if constexpr(MyCodeHandlesProxyReferences)
+                std::cout << "Trace mapping, number of accesses:\n";
+            else
+                std::cout << "Trace mapping, number of memory locations computed:\n";
             forEachLeafCoord<RecordDim>(
-                [this](auto rc)
+                [&](auto rc)
                 {
-                    std::cout << '\t' << recordCoordTags<RecordDim>(rc) << ":\t"
-                              << fieldHits[flatRecordCoord<RecordDim, decltype(rc)>] << '\n';
+                    const size_type i = flatRecordCoord<RecordDim, decltype(rc)>;
+                    if constexpr(MyCodeHandlesProxyReferences)
+                        std::cout << '\t' << recordCoordTags<RecordDim>(rc) << ":\tR: " << hits[i].reads
+                                  << "\tW: " << hits[i].writes << '\n';
+                    else
+                        std::cout << '\t' << recordCoordTags<RecordDim>(rc) << ":\t " << hits[i].memLocsComputed
+                                  << '\n';
                 });
+#endif
         }
 
-        mutable std::array<std::atomic<std::size_t>, flatFieldCount<RecordDim>> fieldHits;
-        bool printOnDestruction;
+    private:
+        LLAMA_FN_HOST_ACC_INLINE auto inner() const -> const Mapping&
+        {
+            return static_cast<const Mapping&>(*this);
+        }
     };
 } // namespace llama::mapping
diff --git a/tests/array.cpp b/tests/array.cpp
index 7f4335351d..b1324eea50 100644
--- a/tests/array.cpp
+++ b/tests/array.cpp
@@ -1,5 +1,11 @@
 #include "common.hpp"
 
+TEST_CASE("Array.empty")
+{
+    STATIC_REQUIRE(llama::Array<int, 0>{}.empty());
+    STATIC_REQUIRE(!llama::Array<int, 1>{1}.empty());
+}
+
 TEST_CASE("Array.operator<<")
 {
     auto put = [](auto array)
diff --git a/tests/mapping.HeatmapTrace.cpp b/tests/mapping.HeatmapTrace.cpp
index cf0b70dbaa..381d6eb7d8 100644
--- a/tests/mapping.HeatmapTrace.cpp
+++ b/tests/mapping.HeatmapTrace.cpp
@@ -57,46 +57,81 @@ TEST_CASE("Heatmap.nbody")
         llama::mapping::SingleBlobSoA<llama::ArrayExtents<std::size_t, 100>, ParticleHeatmap>{});
 }
 
-TEST_CASE("Trace.nbody")
+TEMPLATE_LIST_TEST_CASE("Trace.nbody.mem_locs_computed", "", SizeTypes)
 {
     auto run = [&](auto mapping)
     {
-        auto particles = llama::allocView(llama::mapping::Trace{mapping, false});
+        auto particles = llama::allocView(llama::mapping::Trace<decltype(mapping), TestType, false>{mapping});
         updateAndMove(particles);
-        auto& hits = particles.mapping().fieldHits;
+        auto& hits = particles.mapping().fieldHits(particles.storageBlobs);
         CHECK(hits.size() == 7);
-        CHECK(hits[0] == 10400);
-        CHECK(hits[1] == 10400);
-        CHECK(hits[2] == 10400);
-        CHECK(hits[3] == 400);
-        CHECK(hits[4] == 400);
-        CHECK(hits[5] == 400);
-        CHECK(hits[6] == 10300);
+        CHECK(hits[0].memLocsComputed == 10300);
+        CHECK(hits[1].memLocsComputed == 10300);
+        CHECK(hits[2].memLocsComputed == 10300);
+        CHECK(hits[3].memLocsComputed == 300);
+        CHECK(hits[4].memLocsComputed == 300);
+        CHECK(hits[5].memLocsComputed == 300);
+        CHECK(hits[6].memLocsComputed == 10200);
+
+        std::stringstream buffer;
+        std::streambuf* old = std::cout.rdbuf(buffer.rdbuf());
+        particles.mapping().printFieldHits(particles.storageBlobs);
+        std::cout.rdbuf(old);
+        CHECK(
+            buffer.str()
+            == "Trace mapping, number of memory locations computed:\n"
+               "\tPos.X:\t 10300\n"
+               "\tPos.Y:\t 10300\n"
+               "\tPos.Z:\t 10300\n"
+               "\tVel.X:\t 300\n"
+               "\tVel.Y:\t 300\n"
+               "\tVel.Z:\t 300\n"
+               "\tMass:\t 10200\n");
     };
     run(llama::mapping::AlignedAoS<llama::ArrayExtents<std::size_t, 100>, ParticleHeatmap>{});
     run(llama::mapping::SingleBlobSoA<llama::ArrayExtents<std::size_t, 100>, ParticleHeatmap>{});
 }
 
-TEST_CASE("Trace.print_dtor")
+TEMPLATE_LIST_TEST_CASE("Trace.nbody.reads_writes", "", SizeTypes)
 {
-    std::stringstream buffer;
-    std::streambuf* old = std::cout.rdbuf(buffer.rdbuf());
+    auto run = [&](auto mapping)
     {
-        auto particles = llama::allocView(
-            llama::mapping::Trace{llama::mapping::AlignedAoS<llama::ArrayExtents<std::size_t, 100>, ParticleHeatmap>{}});
+        auto particles = llama::allocView(llama::mapping::Trace<decltype(mapping), TestType>{mapping});
         updateAndMove(particles);
-    }
-    std::cout.rdbuf(old);
-    CHECK(
-        buffer.str()
-        == "Trace mapping, number of accesses:\n"
-           "\tPos.X:\t10400\n"
-           "\tPos.Y:\t10400\n"
-           "\tPos.Z:\t10400\n"
-           "\tVel.X:\t400\n"
-           "\tVel.Y:\t400\n"
-           "\tVel.Z:\t400\n"
-           "\tMass:\t10300\n");
+        auto& hits = particles.mapping().fieldHits(particles.storageBlobs);
+        CHECK(hits.size() == 7);
+        CHECK(hits[0].reads == 10200);
+        CHECK(hits[1].reads == 10200);
+        CHECK(hits[2].reads == 10200);
+        CHECK(hits[3].reads == 200);
+        CHECK(hits[4].reads == 200);
+        CHECK(hits[5].reads == 200);
+        CHECK(hits[6].reads == 10100);
+        CHECK(hits[0].writes == 200);
+        CHECK(hits[1].writes == 200);
+        CHECK(hits[2].writes == 200);
+        CHECK(hits[3].writes == 100);
+        CHECK(hits[4].writes == 100);
+        CHECK(hits[5].writes == 100);
+        CHECK(hits[6].writes == 100);
+
+        std::stringstream buffer;
+        std::streambuf* old = std::cout.rdbuf(buffer.rdbuf());
+        particles.mapping().printFieldHits(particles.storageBlobs);
+        std::cout.rdbuf(old);
+        CHECK(
+            buffer.str()
+            == "Trace mapping, number of accesses:\n"
+               "\tPos.X:\tR: 10200\tW: 200\n"
+               "\tPos.Y:\tR: 10200\tW: 200\n"
+               "\tPos.Z:\tR: 10200\tW: 200\n"
+               "\tVel.X:\tR: 200\tW: 100\n"
+               "\tVel.Y:\tR: 200\tW: 100\n"
+               "\tVel.Z:\tR: 200\tW: 100\n"
+               "\tMass:\tR: 10100\tW: 100\n");
+    };
+    run(llama::mapping::AlignedAoS<llama::ArrayExtents<std::size_t, 100>, ParticleHeatmap>{});
+    run(llama::mapping::SingleBlobSoA<llama::ArrayExtents<std::size_t, 100>, ParticleHeatmap>{});
 }
 
 namespace
diff --git a/tests/mapping.cpp b/tests/mapping.cpp
index 54a15775a9..fac2a70b22 100644
--- a/tests/mapping.cpp
+++ b/tests/mapping.cpp
@@ -21,7 +21,7 @@ TEMPLATE_LIST_TEST_CASE("mapping.concepts", "", SizeTypes)
     STATIC_REQUIRE(llama::PhysicalMapping<llama::mapping::AoSoA<llama::ArrayExtents<TestType, 8>, Particle, 8>>);
 
     using Inner = llama::mapping::AlignedAoS<llama::ArrayExtents<TestType, 8>, Particle>;
-    STATIC_REQUIRE(llama::PhysicalMapping<llama::mapping::Trace<Inner>>);
+    STATIC_REQUIRE(llama::FullyComputedMapping<llama::mapping::Trace<Inner>>);
     STATIC_REQUIRE(llama::PhysicalMapping<llama::mapping::Heatmap<Inner>>);
     STATIC_REQUIRE(