Extend Trace mapping and allow GPU usage #503

Merged: 6 commits, May 24, 2022
3 changes: 2 additions & 1 deletion .clang-tidy
@@ -50,7 +50,8 @@ Checks: >
   -misc-no-recursion,
   -llvm-header-guard,
   -cppcoreguidelines-macro-usage,
-  -fuchsia-statically-constructed-objects
+  -fuchsia-statically-constructed-objects,
+  -cppcoreguidelines-pro-type-union-access
 
 WarningsAsErrors: '*'
 HeaderFilterRegex: ''
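Note: cppcoreguidelines-pro-type-union-access flags every read or write of a union member, presumably touched somewhere by the new tracing code. A minimal illustration of the pattern the silenced check complains about (my example, not code from this PR):

    union Counter
    {
        unsigned long long full;
        unsigned narrow;
    };

    int main()
    {
        Counter c{};
        c.full = 42; // flagged: write to a union member
        return static_cast<int>(c.full % 128); // flagged: read of a union member
    }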
11 changes: 9 additions & 2 deletions .github/workflows/ci.yaml
@@ -72,9 +72,16 @@ jobs:
       - uses: actions/checkout@v2
         with:
           fetch-depth: 1
-      - name: apt install boost and lcov
+      - name: install boost
         run: |
-          sudo apt install libboost-all-dev lcov
+          BOOST_ARCHIVE=boost_${BOOST_VERSION//./_}.tar.bz2
+          wget -q https://boostorg.jfrog.io/artifactory/main/release/$BOOST_VERSION/source/$BOOST_ARCHIVE
+          tar -xf $BOOST_ARCHIVE
+          rm $BOOST_ARCHIVE
+          mv boost_${BOOST_VERSION//./_} "${BOOST_ROOT}"
+      - name: install lcov
+        run: |
+          sudo apt install lcov
       - name: vcpkg install dependencies
         run: |
           vcpkg install catch2 fmt
1 change: 1 addition & 0 deletions CMakeLists.txt
@@ -8,6 +8,7 @@ add_library(${PROJECT_NAME} INTERFACE)
 target_include_directories(${PROJECT_NAME} INTERFACE $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/include> $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>)
 target_compile_features(${PROJECT_NAME} INTERFACE cxx_std_17)
 target_link_libraries(${PROJECT_NAME} INTERFACE Boost::headers)
+add_compile_definitions(BOOST_ATOMIC_NO_LIB) # we don't need the compiled part in LLAMA or its examples
 if (fmt_FOUND)
     target_link_libraries(${PROJECT_NAME} INTERFACE fmt::fmt)
 else()
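BOOST_ATOMIC_NO_LIB disables Boost.Atomic's auto-linking against its compiled library, so header-only use suffices. A rough sketch of the kind of header-only usage this permits, under my assumption that the access counters are bumped through boost::atomic_ref (the PR does not spell this out):

    #include <boost/atomic/atomic_ref.hpp>

    int main()
    {
        unsigned long long hits = 0;
        // a lock-free fetch_add needs only the Boost.Atomic headers, no compiled part
        boost::atomic_ref<unsigned long long>{hits}.fetch_add(1, boost::memory_order_relaxed);
        return static_cast<int>(hits); // 1
    }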
8 changes: 7 additions & 1 deletion cmake/llama-config.cmake.in
@@ -16,9 +16,15 @@ endif()
 set(llama_INCLUDE_DIR ${llama_INCLUDE_DIR} "${PACKAGE_PREFIX_DIR}/include")
 target_include_directories(llama INTERFACE ${llama_INCLUDE_DIR})
 
-# Boost
+# dependencies
 find_package(Boost 1.70.0 REQUIRED)
+find_package(fmt CONFIG QUIET)
 target_link_libraries(llama INTERFACE Boost::headers)
+if (fmt_FOUND)
+    target_link_libraries(${PROJECT_NAME} INTERFACE fmt::fmt)
+else()
+    message(WARNING "The fmt library was not found. You cannot use llama's dumping facilities.")
+endif()
 
 
 INCLUDE(FindPackageHandleStandardArgs)
24 changes: 17 additions & 7 deletions docs/pages/mappings.rst
@@ -206,17 +206,27 @@ Trace
 -----
 
 The Trace mapping is a meta mapping that wraps over an inner mapping and counts all accesses made to the fields of the record dimension.
-A report is printed to the stdout when requested or the mapping instance is destroyed.
+A report is printed to stdout when requested.
+The mapping adds one additional blob, used as storage for the access counts, to the blobs of the inner mapping.
 
 .. code-block:: C++
 
-    {
-        auto anyMapping = ...;
-        llama::mapping::Trace mapping{anyMapping};
-        ...
-        mapping.print(); // print report explicitly
-    } // report is printed implicitly
+    auto anyMapping = ...;
+    llama::mapping::Trace mapping{anyMapping};
+    ...
+    mapping.printFieldHits(); // print report with reads and writes to each field
+
+The Trace mapping uses proxy references to instrument reads and writes.
+If this is problematic, Trace can also be configured to return raw C++ references.
+In that case, only the number of memory-location computations can be traced,
+but not how often the program reads from or writes to those locations.
+
+.. code-block:: C++
+
+    auto anyMapping = ...;
+    llama::mapping::Trace<decltype(anyMapping), false> mapping{anyMapping};
+    ...
+    mapping.printFieldHits(); // print report with the number of computed memory locations
 
 Null
 ----
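As a concrete companion to the doc snippets above, here is a hedged end-to-end sketch. The record dimension, the extents and AoS mapping types, and calling printFieldHits with the view's storageBlobs are my assumptions modeled on the nbody changes below, not text from this PR:

    #include <llama/llama.hpp>

    struct X{};
    struct Y{};
    using Point = llama::Record<llama::Field<X, float>, llama::Field<Y, float>>;

    int main()
    {
        const auto extents = llama::ArrayExtentsDynamic<std::size_t, 1>{1024};
        const auto inner = llama::mapping::AoS<decltype(extents), Point>{extents};
        const auto traced = llama::mapping::Trace<decltype(inner)>{inner}; // appends one blob of counters
        auto view = llama::allocView(traced);
        view.mapping().fieldHits(view.storageBlobs) = {}; // reset the counters
        for(std::size_t i = 0; i < 1024; i++)
            view(i)(X{}) = static_cast<float>(i); // each write bumps the counter for X
        view.mapping().printFieldHits(view.storageBlobs); // per-field read/write report
    }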
36 changes: 29 additions & 7 deletions examples/cuda/nbody/nbody.cu
@@ -17,6 +17,7 @@ constexpr auto SHARED_ELEMENTS_PER_BLOCK = 512;
 constexpr auto STEPS = 5; ///< number of steps to calculate
 constexpr auto ALLOW_RSQRT = true; // rsqrt can be way faster, but less accurate
 constexpr auto RUN_UPATE = true; // run update step. Useful to disable for benchmarking the move step.
+constexpr auto TRACE = true;
 constexpr FP TIMESTEP = 0.0001f;
 
 constexpr auto THREADS_PER_BLOCK = 256;
@@ -28,6 +29,12 @@ static_assert(SHARED_ELEMENTS_PER_BLOCK % THREADS_PER_BLOCK == 0);
 
 constexpr FP EPS2 = 0.01;
 
+#if __CUDA_ARCH__ >= 600
+using CountType = unsigned long long int;
+#else
+using CountType = unsigned;
+#endif
+
 using namespace std::literals;
 
 // clang-format off
@@ -205,12 +212,19 @@ try
             llama::mapping::BindSoA<>::fn,
             true>{extents};
     }();
+    auto tmapping = [&]
+    {
+        if constexpr(TRACE)
+            return llama::mapping::Trace<std::decay_t<decltype(mapping)>, CountType>{mapping};
+        else
+            return mapping;
+    }();
 
     Stopwatch watch;
 
-    auto hostView = llama::allocViewUninitialized(mapping);
+    auto hostView = llama::allocViewUninitialized(tmapping);
     auto accView = llama::allocViewUninitialized(
-        mapping,
+        tmapping,
         [](auto alignment, std::size_t size)
         {
             std::byte* p = nullptr;
@@ -236,6 +250,8 @@ try
         p(tag::Mass()) = distribution(engine) / FP(100);
         hostView(i) = p;
     }
+    if constexpr(TRACE)
+        hostView.mapping().fieldHits(hostView.storageBlobs) = {};
 
     watch.printAndReset("init");
 
@@ -255,12 +271,15 @@
     };
 
     start();
-    for(auto i = 0; i < accView.storageBlobs.size(); i++)
+    const auto blobs = hostView.storageBlobs.size();
+    for(std::size_t i = 0; i < blobs; i++)
         checkError(cudaMemcpy(
             accView.storageBlobs[i],
             hostView.storageBlobs[i].data(),
-            mapping.blobSize(i),
+            hostView.mapping().blobSize(i),
             cudaMemcpyHostToDevice));
+    if constexpr(TRACE)
+        cudaMemset(accView.storageBlobs[blobs], 0, accView.mapping().blobSize(blobs)); // init trace count buffer
     std::cout << "copy H->D " << stop() << " s\n";
 
     const auto blocks = PROBLEM_SIZE / THREADS_PER_BLOCK;
@@ -290,15 +309,18 @@
     plotFile << std::quoted(title) << "\t" << sumUpdate / STEPS << '\t' << sumMove / STEPS << '\n';
 
     start();
-    for(auto i = 0; i < accView.storageBlobs.size(); i++)
+    for(std::size_t i = 0; i < blobs; i++)
         checkError(cudaMemcpy(
             hostView.storageBlobs[i].data(),
             accView.storageBlobs[i],
-            mapping.blobSize(i),
+            hostView.mapping().blobSize(i),
             cudaMemcpyDeviceToHost));
     std::cout << "copy D->H " << stop() << " s\n";
 
-    for(auto i = 0; i < accView.storageBlobs.size(); i++)
+    if constexpr(TRACE)
+        hostView.mapping().printFieldHits(hostView.storageBlobs);
+
+    for(std::size_t i = 0; i < accView.storageBlobs.size(); i++)
         checkError(cudaFree(accView.storageBlobs[i]));
     checkError(cudaEventDestroy(startEvent));
     checkError(cudaEventDestroy(stopEvent));
6 changes: 5 additions & 1 deletion examples/nbody/nbody.cpp
@@ -190,6 +190,8 @@ namespace usellama
             p(tag::Vel{}, tag::Z{}) = dist(engine) / FP(10);
             p(tag::Mass{}) = dist(engine) / FP(100);
         }
+        if constexpr(TRACE)
+            particles.mapping().fieldHits(particles.storageBlobs) = {};
         watch.printAndReset("init");
 
         double sumUpdate = 0;
@@ -207,7 +209,9 @@ namespace usellama
         plotFile << std::quoted(title) << "\t" << sumUpdate / STEPS << '\t' << sumMove / STEPS << '\n';
 
         if constexpr(HEATMAP)
-            std::ofstream("nbody_heatmap_" + mappingName(Mapping) + ".sh") << particles.mapping.toGnuplotScript();
+            std::ofstream("nbody_heatmap_" + mappingName(Mapping) + ".sh") << particles.mapping().toGnuplotScript();
+        if constexpr(TRACE)
+            particles.mapping().printFieldHits(particles.storageBlobs);
 
         return 0;
     }
10 changes: 10 additions & 0 deletions include/llama/Array.hpp
@@ -24,6 +24,11 @@ namespace llama
             return N;
         }
 
+        LLAMA_FN_HOST_ACC_INLINE constexpr auto empty() const -> bool
+        {
+            return N == 0;
+        }
+
         LLAMA_FN_HOST_ACC_INLINE constexpr auto begin() -> T*
         {
             return &element[0];
@@ -100,6 +105,11 @@ namespace llama
             return 0;
         }
 
+        LLAMA_FN_HOST_ACC_INLINE constexpr auto empty() const -> bool
+        {
+            return true;
+        }
+
         LLAMA_FN_HOST_ACC_INLINE constexpr auto begin() -> T*
         {
             return nullptr;
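A small usage sketch for the new empty() member, covering both the general case and the zero-size specialization changed above (my example, assuming llama::Array keeps its aggregate interface):

    #include <llama/llama.hpp>
    #include <iostream>

    int main()
    {
        llama::Array<int, 3> a{1, 2, 3};
        llama::Array<int, 0> b{};
        std::cout << a.empty() << ' ' << b.empty() << '\n'; // prints "0 1"
    }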
4 changes: 4 additions & 0 deletions include/llama/View.hpp
@@ -376,11 +376,13 @@ namespace llama
             return static_cast<const Mapping&>(*this);
         }
 
+#if !(defined(_MSC_VER) && defined(__NVCC__))
         template<typename V>
         auto operator()(llama::ArrayIndex<V, ArrayIndex::rank>) const
         {
             static_assert(!sizeof(V), "Passed ArrayIndex with SizeType different than Mapping::ArrayExtent");
         }
+#endif
 
         /// Retrieves the \ref VirtualRecord at the given \ref ArrayIndex index.
         LLAMA_FN_HOST_ACC_INLINE auto operator()(ArrayIndex ai) const -> decltype(auto)
@@ -451,11 +453,13 @@ namespace llama
             return (*this)(ai);
         }
 
+#if !(defined(_MSC_VER) && defined(__NVCC__))
         template<typename V>
         auto operator[](llama::ArrayIndex<V, ArrayIndex::rank>) const
         {
             static_assert(!sizeof(V), "Passed ArrayIndex with SizeType different than Mapping::ArrayExtent");
         }
+#endif
 
         /// Retrieves the \ref VirtualRecord at the 1D \ref ArrayIndex index constructed from the passed index.
        LLAMA_FN_HOST_ACC_INLINE auto operator[](size_type index) const -> decltype(auto)
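These guarded overloads exist only to turn a SizeType mismatch into a readable static_assert; the new #if compiles them out under MSVC plus NVCC, presumably to work around a front-end problem with that combination. A sketch of what they catch, reusing the hypothetical mapping setup from the earlier example; the failing line stays commented out because it is a compile-time error by design:

    #include <llama/llama.hpp>

    struct Tag{};
    using RecordDim = llama::Record<llama::Field<Tag, float>>;

    int main()
    {
        const auto extents = llama::ArrayExtentsDynamic<std::size_t, 1>{16};
        const auto mapping = llama::mapping::AoS<decltype(extents), RecordDim>{extents};
        auto view = llama::allocView(mapping);
        view(llama::ArrayIndex<std::size_t, 1>{0})(Tag{}) = 1.0f; // OK: SizeType matches
        // view(llama::ArrayIndex<int, 1>{0}); // would trigger the static_assert above
        return 0;
    }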