Merge pull request #22 from CExA-project/barcelona_fixes

Add feedback from Barcelona training
CExA-project · Jan 20, 2025 · 88b3476 · 88b3476
2 parents eb8dd5b + 64d0fc5
commit 88b3476
Show file tree

Hide file tree

Showing 15 changed files with 24,191 additions and 53 deletions.
diff --git a/.github/workflows/exercises.yml b/.github/workflows/exercises.yml
@@ -12,6 +12,7 @@ on:
   pull_request:
     paths:
       - 'exercises/**'
+      - 'cmake/**'
 
 jobs:
   build_test_exercises:

diff --git a/.github/workflows/projects.yml b/.github/workflows/projects.yml
@@ -12,6 +12,7 @@ on:
   pull_request:
     paths:
       - 'projects/**'
+      - 'cmake/**'
 
 jobs:
   build_test_projects:

diff --git a/cmake/modules/FindKokkos.cmake b/cmake/modules/FindKokkos.cmake
@@ -49,9 +49,9 @@ include(FetchContent)
 
 FetchContent_Declare(
     kokkos
+    DOWNLOAD_EXTRACT_TIMESTAMP ON
     URL https://github.com/kokkos/kokkos/releases/download/4.5.01/kokkos-4.5.01.zip
     SOURCE_DIR ${CexaKokkosTutorials_KOKKOS_SOURCE_DIR}
-    DOWNLOAD_EXTRACT_TIMESTAMP ON
 )
 FetchContent_MakeAvailable(kokkos)
 set(Kokkos_FOUND True)
diff --git a/courses/01_beginners/main.tex b/courses/01_beginners/main.tex
@@ -226,15 +226,27 @@ \section{Introduction}
 
 % _____________________________________________________________________________
 
-\begin{frame}{Host/device model}
-    \begin{itemize}
-        \item The program is always started first on the CPU
-        \item Today's GPUs \highlight{cannot} work standalone
-        \item The CPU is often referred to as the \highlight{Host}
-        \item The CPU orchestrates when the kernels are launched on the GPU and how to make the memory transfers
-        \item The GPU waits for kernels to execute
-        \item The GPU is often referred to as the \highlight{Device}
-    \end{itemize}
+\begin{frame}{Host/Device model}
+    \begin{columns}[T]
+        \begin{column}{0.5\linewidth}
+            CPU
+
+            \begin{itemize}
+                \item Referred to as the \highlight{Host}
+                \item Programs are always started first on it
+                \item Orchestrates when the kernels are launched on the GPU and how to make the CPU/GPU memory transfers
+            \end{itemize}
+        \end{column}
+        \begin{column}{0.5\linewidth}
+            GPU
+
+            \begin{itemize}
+                \item Referred to as the \highlight{Device}
+                \item Cannot work standalone
+                \item Waits for kernels to execute (offloading)
+            \end{itemize}
+        \end{column}
+    \end{columns}
 \end{frame}
 
 % _____________________________________________________________________________
@@ -556,13 +568,13 @@ \subsection{Compilation}
 
 % _____________________________________________________________________________
 
-\begin{frame}[fragile]{Real world case}
+\begin{frame}[fragile]{Real world case: NVIDIA GPU}
     \begin{columns}
         \begin{column}{0.475\linewidth}
             \begin{center}
-                \includegraphics[width=\textwidth]{kokkos_a100_backend.png}
+                \includegraphics[width=\textwidth]{kokkos_a100_compilation.png}
             \end{center}
-            \begin{minted}{bash}
+            \begin{minted}{sh}
                 cmake \
                     -B ${build_dir} \
                     -DKokkos_ENABLE_OPENMP=ON \
@@ -573,7 +585,8 @@ \subsection{Compilation}
         \end{column}
         \begin{column}{0.525\linewidth}
             \begin{itemize}
-                \item Compiling for a multi-core Xeon Skylake CPU and a A100 NVIDIA GPU
+                \item Compiling for a multi-core Xeon Skylake CPU and an NVIDIA A100 GPU
+                \item Compiler is GCC (default)
                 \item OpenMP option for the multi-core CPU
                 \item Skylake option, mostly for vectorization (optional)
                 \item Cuda option for the GPU
@@ -585,6 +598,71 @@ \subsection{Compilation}
 
 % _____________________________________________________________________________
 
+\begin{frame}[fragile]{Real world case: AMD GPU}
+    \begin{columns}
+        \begin{column}{0.475\linewidth}
+            \begin{center}
+                \includegraphics[width=\textwidth]{kokkos_mi300x_compilation.png}
+            \end{center}
+            \begin{minted}{sh}
+                cmake \
+                    -B ${build_dir} \
+                    -DCMAKE_CXX_COMPILER=hipcc \
+                    -DKokkos_ENABLE_OPENMP=ON \
+                    -DKokkos_ARCH_SKX=ON \
+                    -DKokkos_ENABLE_HIP=ON \
+                    -DKokkos_ARCH_AMD_GFX942=ON
+            \end{minted}
+        \end{column}
+        \begin{column}{0.525\linewidth}
+            \begin{itemize}
+                \item Compiling for a multi-core Xeon Skylake CPU and an AMD MI300X GPU
+                \item Compiler is the HIP compiler
+                \item OpenMP option for the multi-core CPU
+                \item Skylake option, mostly for vectorization (optional)
+                \item HIP option for the GPU
+                \item MI300X option, for the correct architecture (mandatory)
+            \end{itemize}
+        \end{column}
+    \end{columns}
+\end{frame}
+
+% _____________________________________________________________________________
+
+\begin{frame}[fragile]{Real world case: Intel GPU}
+    \begin{columns}
+        \begin{column}{0.475\linewidth}
+            \begin{center}
+                \includegraphics[width=\textwidth]{kokkos_gpu_max_compilation.png}
+            \end{center}
+            \begin{minted}{sh}
+                cmake \
+                    -B ${build_dir} \
+                    -DCMAKE_CXX_COMPILER=icpx \
+                    -DCMAKE_CXX_FLAGS=\
+                        "-fp-model=precise" \
+                    -DKokkos_ENABLE_OPENMP=ON \
+                    -DKokkos_ARCH_SKX=ON \
+                    -DKokkos_ENABLE_SYCL=ON \
+                    -DKokkos_ARCH_INTEL_PVC=ON
+            \end{minted}
+        \end{column}
+        \begin{column}{0.525\linewidth}
+            \begin{itemize}
+                \item Compiling for a multi-core Xeon Skylake CPU and an Intel GPU Max
+                \item Compiler is the Intel LLVM compiler
+                \item Add flag to use high precision math operators
+                \item OpenMP option for the multi-core CPU
+                \item Skylake option, mostly for vectorization (optional)
+                \item SYCL option for the GPU
+                \item PVC option, for the correct architecture (mandatory)
+            \end{itemize}
+        \end{column}
+    \end{columns}
+\end{frame}
+
+% _____________________________________________________________________________
+
 \begin{frame}{How to add Kokkos to your project}
     \begin{itemize}
         \item There are several ways to do it (for lack of a standard C++ dependency manager)
@@ -665,13 +743,13 @@ \subsection{Compilation}
             \begin{block}{Pros}
                 \begin{itemize}
                     \item Good for production
+                    \item Has dev mode
                     \item Can bring the backend libs
                 \end{itemize}
             \end{block}
             \begin{alertblock}{Cons}
                 \begin{itemize}
                     \item Not trivial
-                    \item Not for dev
                 \end{itemize}
             \end{alertblock}
         \end{column}
@@ -1339,6 +1417,7 @@ \subsection{Compilation}
                         \item Timing
                         \item Memory (on Unixes only)
                     \end{itemize}
+                    \item Increase the size of the problem to make the differences more visible
                 \end{itemize}
             \end{block}
         \end{column}
@@ -1567,6 +1646,8 @@ \subsection{Parallel loops}
                 \begin{itemize}
                     \item Write a simple parallel loop
                     \item Measure the difference of performance between CPU and GPU
+                    \item Increase the size of the problem to make the difference more visible
+                    \item At which size does the GPU become more efficient?
                 \end{itemize}
             \end{block}
         \end{column}
@@ -1929,6 +2010,8 @@ \subsection{Extending loop policies}
                 \begin{itemize}
                     \item Perform a parallel reduction
                     \item Measure the difference of performance between CPU and GPU
+                    \item Increase the size of the problem to make the differences more visible
+                    \item At which size does the GPU become more efficient?
                 \end{itemize}
             \end{block}
         \end{column}

diff --git a/exercises/01_first_program/README.md b/exercises/01_first_program/README.md
@@ -33,7 +33,8 @@ Recompile your program using the OpenMP backend.
 You can use the following commands:
 
 ```sh
-cmake -B build_openmp -DKokkos_ENABLE_OPENMP=ON
+cmake -B build_openmp \
+    -DKokkos_ENABLE_OPENMP=ON
 cmake --build build_openmp
 ```
 
@@ -117,10 +118,14 @@ Recompile your program now using the Cuda backend, by instance.
 You can use the following commands:
 
 ```bash
-cmake -B build_cuda -DKokkos_ENABLE_CUDA=ON
+cmake -B build_cuda \
+    -DKokkos_ENABLE_CUDA=ON \
+    -DKokkos_ARCH_<ARCH>=ON
 cmake --build build_cuda
 ```
 
+Replace `<ARCH>` with the Kokkos architecture keyword that matches your GPU (for example `AMPERE80` for an NVIDIA A100).
+
 Run the program and check the output:
 
 ```sh

diff --git a/exercises/03_deep_copy/solution/main.cpp b/exercises/03_deep_copy/solution/main.cpp
@@ -79,11 +79,6 @@ int main(int argc, char* argv[]) {
         std::cout << " - Mirror extent: " << extent[0] << " x " << extent[1] << " x " << extent[2] << std::endl;
         std::cout << " - Mirror stride: " << stride[0] << " x " << stride[1] << " x " << stride[2] << std::endl;
 
-#ifdef __unix__
-        getrusage(RUSAGE_SELF, &usage);
-        std::cout << "Total memory usage after `create_mirror_view`: " << usage.ru_maxrss << " KB" << std::endl;
-#endif
-
         // Initialize the matrix
 
         for (int i = 0; i < Nx; i++) {
@@ -103,6 +98,11 @@ int main(int argc, char* argv[]) {
 
         std::cout << "Time to deep copy mirror to matrix: " << timer_stop - timer_start << std::endl;
 
+#ifdef __unix__
+        getrusage(RUSAGE_SELF, &usage);
+        std::cout << "Total memory usage after `create_mirror_view`: " << usage.ru_maxrss << " KB" << std::endl;
+#endif
+
         // _____________________________________________________
         // New mirror view
 

diff --git a/exercises/04_parallel_loop/README.md b/exercises/04_parallel_loop/README.md
@@ -2,46 +2,46 @@
 
 ## Objective
 
-The goal of this exercise is to manipulate the Kokkos loop.
+The goal of this exercise is to use a Kokkos parallel loop.
 
-## Steps 1: create a vector
+## Step 1: create a View
 
-In the file `main.cpp`, create a vector of size `N` of type `double`.
+In the file `main.cpp`, create a 1D View of size `N` and type `double`.
 
-# Step 2: parallel loop
+## Step 2: parallel loop
 
-Use the `Kokkos::parallel_for` function to initialize the vector with the values of your choice on the Device.
+Use the `Kokkos::parallel_for` function to initialize the View with the values of your choice on the Device.
 
 Add the `Kokkos::fence()` function after the parallel for.
 
-# Step 3: get the data back to the host
+## Step 3: get the data back to the host
 
 Create a mirror View called `mirror` using the `Kokkos::create_mirror_view` function.
 
 Use the `Kokkos::deep_copy` function to copy the data from the original View to the mirror View.
 
-# Step 4: check the result
+## Step 4: check the result
 
 Use a loop to check that the data is correct by computing the error between the `mirror` View and the expected result.
 
-# Step 5: timers
+## Step 5: timers
 
 Add timers to measure the time spent in the parallel loop and the time spent to copy the data back to the host.
 
-# Step 6: compile and run the program
+## Step 6: compile and run the program
 
 Prepare the OpenMP environment:
 
-```bash
+```sh
 export OMP_NUM_THREADS=<the number of cores you want to use>
 export OMP_PROC_BIND=spread
 export OMP_PLACES=threads
 ```
 
 Compile the program with the OpenMP backend and execute.
-Use a large vector size for more meaningful results:
+Use a large View size for more meaningful results:
 
-```bash
+```sh
 build_openmp/exe04 10000000
 ```
 

diff --git a/exercises/05_parallel_reduce/README.md b/exercises/05_parallel_reduce/README.md
@@ -2,7 +2,7 @@
 
 ## Objective
 
-The goal of this exercise is to manipulate the Kokkos parallel reduction.
+The goal of this exercise is to use a Kokkos parallel reduction.
 
 ## Step 1: create a 2D matrix
 

diff --git a/images/kokkos_a100_backend.png b/images/kokkos_a100_backend.png
diff --git a/images/kokkos_a100_compilation.png b/images/kokkos_a100_compilation.png
diff --git a/images/kokkos_a100_compilation.svg b/images/kokkos_a100_compilation.svg
diff --git a/images/kokkos_gpu_max_compilation.png b/images/kokkos_gpu_max_compilation.png