[SYCL][HIP] Add coarse-grained memory advice for HIP on AMD

GeorgeWeb · Jan 10, 2024 · f87b0e0 · f87b0e0
1 parent b32db9a
commit f87b0e0
Show file tree

Hide file tree

Showing 5 changed files with 162 additions and 3 deletions.
diff --git a/sycl/include/sycl/detail/pi.h b/sycl/include/sycl/detail/pi.h
@@ -147,9 +147,10 @@
 // 14.38 Change PI_MEM_ADVICE_* values to flags for use in bitwise operations.
 // 14.39 Added PI_EXT_INTEL_DEVICE_INFO_ESIMD_SUPPORT device info query.
 // 14.40 Add HIP _pi_mem_advice alises to match the PI_MEM_ADVICE_CUDA* ones.
+// 14.41 Add coarse-grain memory advice flag for HIP.
 
 #define _PI_H_VERSION_MAJOR 14
-#define _PI_H_VERSION_MINOR 40
+#define _PI_H_VERSION_MINOR 41
 
 #define _PI_STRING_HELPER(a) #a
 #define _PI_CONCAT(a, b) _PI_STRING_HELPER(a.b)
@@ -578,6 +579,8 @@ typedef enum {
   PI_MEM_ADVICE_CUDA_UNSET_PREFERRED_LOCATION_HOST = 1 << 7,
   PI_MEM_ADVICE_CUDA_SET_ACCESSED_BY_HOST = 1 << 8,
   PI_MEM_ADVICE_CUDA_UNSET_ACCESSED_BY_HOST = 1 << 9,
+  PI_MEM_ADVICE_HIP_SET_COARSE_GRAINED = 1 << 10,
+  PI_MEM_ADVICE_HIP_UNSET_COARSE_GRAINED = 1 << 11,
   PI_MEM_ADVICE_UNKNOWN = 0x7FFFFFFF,
 } _pi_mem_advice;
 

diff --git a/sycl/plugins/unified_runtime/CMakeLists.txt b/sycl/plugins/unified_runtime/CMakeLists.txt
@@ -4,9 +4,9 @@
 # Options to override the default behaviour of the FetchContent to include UR
 # source code.
 set(SYCL_PI_UR_OVERRIDE_FETCH_CONTENT_REPO
-  "" CACHE STRING "Override the Unified Runtime FetchContent repository")
+  "https://github.com/GeorgeWeb/unified-runtime.git" CACHE STRING "Override the Unified Runtime FetchContent repository")
 set(SYCL_PI_UR_OVERRIDE_FETCH_CONTENT_TAG
-  "" CACHE STRING "Override the Unified Runtime FetchContent tag")
+  "617f9754e7e582f60d4c8ec2359e027da234ddba" CACHE STRING "Override the Unified Runtime FetchContent tag")
 
 # Options to disable use of FetchContent to include Unified Runtime source code
 # to improve developer workflow.

diff --git a/sycl/plugins/unified_runtime/pi2ur.hpp b/sycl/plugins/unified_runtime/pi2ur.hpp
@@ -3370,6 +3370,12 @@ inline pi_result piextUSMEnqueueMemAdvise(pi_queue Queue, const void *Ptr,
   if (Advice & PI_MEM_ADVICE_CUDA_UNSET_ACCESSED_BY_HOST) {
     UrAdvice |= UR_USM_ADVICE_FLAG_CLEAR_ACCESSED_BY_HOST;
   }
+  if (Advice & PI_MEM_ADVICE_HIP_SET_COARSE_GRAINED) {
+    UrAdvice |= UR_USM_ADVICE_FLAG_SET_NON_ATOMIC_MOSTLY;
+  }
+  if (Advice & PI_MEM_ADVICE_HIP_UNSET_COARSE_GRAINED) {
+    UrAdvice |= UR_USM_ADVICE_FLAG_CLEAR_NON_ATOMIC_MOSTLY;
+  }
   if (Advice & PI_MEM_ADVICE_RESET) {
     UrAdvice |= UR_USM_ADVICE_FLAG_DEFAULT;
   }

diff --git a/sycl/test-e2e/USM/memadvise_flags.cpp b/sycl/test-e2e/USM/memadvise_flags.cpp
@@ -59,6 +59,8 @@ int main() {
     valid_advices.emplace_back(PI_MEM_ADVICE_HIP_UNSET_PREFERRED_LOCATION_HOST);
     valid_advices.emplace_back(PI_MEM_ADVICE_HIP_SET_ACCESSED_BY_HOST);
     valid_advices.emplace_back(PI_MEM_ADVICE_HIP_UNSET_ACCESSED_BY_HOST);
+    valid_advices.emplace_back(PI_MEM_ADVICE_HIP_SET_COARSE_GRAINED);
+    valid_advices.emplace_back(PI_MEM_ADVICE_HIP_UNSET_COARSE_GRAINED);
   } else {
     // Skip
     return 0;

diff --git a/sycl/test-e2e/USM/memory_coherency_hip.cpp b/sycl/test-e2e/USM/memory_coherency_hip.cpp
@@ -0,0 +1,148 @@
+// RUN: %{build} -o %t1.out
+// REQUIRES: hip_amd
+// RUN: %{run} %t1.out
+
+//==---- memory_coherency_hip.cpp  -----------------------------------------==//
+// USM coarse/fine grain memory coherency test for the HIP-AMD backend.
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <sycl/sycl.hpp>
+
+#include <chrono>
+#include <iostream>
+
+namespace kernels {
+class SquareKrnl final {
+  int *mPtr;
+
+public:
+  SquareKrnl(int *ptr) : mPtr{ptr} {}
+
+  void operator()(sycl::id<1>) const { *mPtr = (*mPtr) * (*mPtr); }
+};
+
+class CoherencyTestKrnl final {
+  int *mPtr;
+
+public:
+  CoherencyTestKrnl(int *ptr) : mPtr{ptr} {}
+
+  void operator()(sycl::id<1>) const {
+    auto atm = sycl::atomic_ref<int, sycl::memory_order::relaxed,
+                                sycl::memory_scope::device>(mPtr[0]);
+
+    // mPtr was initialized to 1 by the host, now set it to 2.
+    atm.fetch_add(1);
+
+    // spin until mPtr is 3, then change it to 4.
+    int expected{3};
+    int old = atm.load();
+    while (true) {
+      old = atm.load();
+      if (old == expected) {
+        if (atm.compare_exchange_strong(old, 4)) {
+          break;
+        }
+      }
+    }
+  }
+};
+} // namespace kernels
+
+int main() {
+  sycl::queue q{};
+  sycl::device dev = q.get_device();
+  sycl::context ctx = q.get_context();
+  if (!dev.get_info<sycl::info::device::usm_shared_allocations>()) {
+    std::cout << "Shared USM is not supported. Skipping test.\n";
+    return 0;
+  }
+
+  bool coherent{false};
+
+  int *ptr = sycl::malloc_shared<int>(1, q);
+
+  // Coherency test 1
+  //
+  // The following test validates if memory access is fine with memory allocated
+  // using malloc_managed() and COARSE_GRAINED advice set via mem_advise().
+  //
+  // Coarse grained memory is only guaranteed to be coherent outside of GPU
+  // kernels that modify it. Changes applied to coarse-grained memory by a GPU
+  // kernel are only visible to the rest of the system (CPU or other GPUs) when
+  // the kernel has completed. A GPU kernel is only guaranteed to see changes
+  // applied to coarse grained memory by the rest of the system (CPU or other
+  // GPUs) if those changes were made before the kernel launched.
+
+  // Hint to use coarse-grain memory.
+  q.mem_advise(ptr, sizeof(int), int{PI_MEM_ADVICE_HIP_SET_COARSE_GRAINED});
+
+  int init_val{9};
+  int expected{init_val * init_val};
+
+  *ptr = init_val;
+  q.parallel_for(sycl::range{1}, kernels::SquareKrnl{ptr});
+  // Synchronise the underlying stream.
+  q.wait();
+
+  // Check if caches are flushed correctly and same memory is between devices.
+  if (*ptr == expected) {
+    coherent = true;
+  } else {
+    std::cerr << "Coherency test failed. Value: " << *ptr
+              << " (expected: " << expected << ")\n";
+    coherent = false;
+  }
+
+  // Coherency test 2
+  //
+  // The following test validates if fine-grain behavior is observed or not with
+  // memory allocated using malloc_managed().
+  //
+  // Fine grained memory allows CPUs and GPUs to synchronize (via atomics) and
+  // coherently communicate with each other while the GPU kernel is running.
+
+  // Hint to use fine-grain memory.
+  q.mem_advise(ptr, sizeof(int), int{PI_MEM_ADVICE_HIP_UNSET_COARSE_GRAINED});
+
+  init_val = 1;
+  expected = 4;
+
+  *ptr = init_val;
+  q.parallel_for(sycl::range{1}, kernels::CoherencyTestKrnl{ptr});
+
+  // wait until ptr is 2 from the kernel (or 3 seconds), then increment to 3.
+  std::chrono::steady_clock::time_point start =
+      std::chrono::steady_clock::now();
+  while (std::chrono::duration_cast<std::chrono::seconds>(
+             std::chrono::steady_clock::now() - start)
+                 .count() < 3 &&
+         *ptr == 2) {
+  }
+  *ptr += 1;
+
+  // Synchronise the underlying stream.
+  q.wait();
+
+  // Check if caches are flushed correctly and same memory is between devices.
+  if (*ptr == expected) {
+    coherent &= true;
+  } else {
+    std::cerr << "Coherency test failed. Value: " << *ptr
+              << " (expected: " << expected << ")\n";
+    coherent = false;
+  }
+
+  // Cleanup
+  sycl::free(ptr, q);
+
+  // Check if all coherency tests passed.
+  assert(coherent);
+
+  return 0;
+}