Skip to content

Commit

Permalink
[SYCL][HIP] Add coarse-grained memory advice for HIP on AMD
Browse files Browse the repository at this point in the history
  • Loading branch information
GeorgeWeb committed Jan 10, 2024
1 parent b32db9a commit f87b0e0
Show file tree
Hide file tree
Showing 5 changed files with 162 additions and 3 deletions.
5 changes: 4 additions & 1 deletion sycl/include/sycl/detail/pi.h
Original file line number Diff line number Diff line change
Expand Up @@ -147,9 +147,10 @@
// 14.38 Change PI_MEM_ADVICE_* values to flags for use in bitwise operations.
// 14.39 Added PI_EXT_INTEL_DEVICE_INFO_ESIMD_SUPPORT device info query.
// 14.40 Add HIP _pi_mem_advice alises to match the PI_MEM_ADVICE_CUDA* ones.
// 14.41 Add coarse-grain memory advice flag for HIP.

#define _PI_H_VERSION_MAJOR 14
#define _PI_H_VERSION_MINOR 40
#define _PI_H_VERSION_MINOR 41

#define _PI_STRING_HELPER(a) #a
#define _PI_CONCAT(a, b) _PI_STRING_HELPER(a.b)
Expand Down Expand Up @@ -578,6 +579,8 @@ typedef enum {
PI_MEM_ADVICE_CUDA_UNSET_PREFERRED_LOCATION_HOST = 1 << 7,
PI_MEM_ADVICE_CUDA_SET_ACCESSED_BY_HOST = 1 << 8,
PI_MEM_ADVICE_CUDA_UNSET_ACCESSED_BY_HOST = 1 << 9,
PI_MEM_ADVICE_HIP_SET_COARSE_GRAINED = 1 << 10,
PI_MEM_ADVICE_HIP_UNSET_COARSE_GRAINED = 1 << 11,
PI_MEM_ADVICE_UNKNOWN = 0x7FFFFFFF,
} _pi_mem_advice;

Expand Down
4 changes: 2 additions & 2 deletions sycl/plugins/unified_runtime/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,9 @@
# Options to override the default behaviour of the FetchContent to include UR
# source code.
set(SYCL_PI_UR_OVERRIDE_FETCH_CONTENT_REPO
"" CACHE STRING "Override the Unified Runtime FetchContent repository")
"https://github.com/GeorgeWeb/unified-runtime.git" CACHE STRING "Override the Unified Runtime FetchContent repository")
set(SYCL_PI_UR_OVERRIDE_FETCH_CONTENT_TAG
"" CACHE STRING "Override the Unified Runtime FetchContent tag")
"617f9754e7e582f60d4c8ec2359e027da234ddba" CACHE STRING "Override the Unified Runtime FetchContent tag")

# Options to disable use of FetchContent to include Unified Runtime source code
# to improve developer workflow.
Expand Down
6 changes: 6 additions & 0 deletions sycl/plugins/unified_runtime/pi2ur.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -3370,6 +3370,12 @@ inline pi_result piextUSMEnqueueMemAdvise(pi_queue Queue, const void *Ptr,
if (Advice & PI_MEM_ADVICE_CUDA_UNSET_ACCESSED_BY_HOST) {
UrAdvice |= UR_USM_ADVICE_FLAG_CLEAR_ACCESSED_BY_HOST;
}
if (Advice & PI_MEM_ADVICE_HIP_SET_COARSE_GRAINED) {
UrAdvice |= UR_USM_ADVICE_FLAG_SET_NON_ATOMIC_MOSTLY;
}
if (Advice & PI_MEM_ADVICE_HIP_UNSET_COARSE_GRAINED) {
UrAdvice |= UR_USM_ADVICE_FLAG_CLEAR_NON_ATOMIC_MOSTLY;
}
if (Advice & PI_MEM_ADVICE_RESET) {
UrAdvice |= UR_USM_ADVICE_FLAG_DEFAULT;
}
Expand Down
2 changes: 2 additions & 0 deletions sycl/test-e2e/USM/memadvise_flags.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,8 @@ int main() {
valid_advices.emplace_back(PI_MEM_ADVICE_HIP_UNSET_PREFERRED_LOCATION_HOST);
valid_advices.emplace_back(PI_MEM_ADVICE_HIP_SET_ACCESSED_BY_HOST);
valid_advices.emplace_back(PI_MEM_ADVICE_HIP_UNSET_ACCESSED_BY_HOST);
valid_advices.emplace_back(PI_MEM_ADVICE_HIP_SET_COARSE_GRAINED);
valid_advices.emplace_back(PI_MEM_ADVICE_HIP_UNSET_COARSE_GRAINED);
} else {
// Skip
return 0;
Expand Down
148 changes: 148 additions & 0 deletions sycl/test-e2e/USM/memory_coherency_hip.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
// RUN: %{build} -o %t1.out
// REQUIRES: hip_amd
// RUN: %{run} %t1.out

//==---- memory_coherency_hip.cpp -----------------------------------------==//
// USM coarse/fine grain memory coherency test for the HIP-AMD backend.
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include <sycl/sycl.hpp>

#include <chrono>
#include <iostream>

namespace kernels {
class SquareKrnl final {
int *mPtr;

public:
SquareKrnl(int *ptr) : mPtr{ptr} {}

void operator()(sycl::id<1>) const { *mPtr = (*mPtr) * (*mPtr); }
};

class CoherencyTestKrnl final {
int *mPtr;

public:
CoherencyTestKrnl(int *ptr) : mPtr{ptr} {}

void operator()(sycl::id<1>) const {
auto atm = sycl::atomic_ref<int, sycl::memory_order::relaxed,
sycl::memory_scope::device>(mPtr[0]);

// mPtr was initialized to 1 by the host, now set it to 2.
atm.fetch_add(1);

// spin until mPtr is 3, then change it to 4.
int expected{3};
int old = atm.load();
while (true) {
old = atm.load();
if (old == expected) {
if (atm.compare_exchange_strong(old, 4)) {
break;
}
}
}
}
};
} // namespace kernels

int main() {
sycl::queue q{};
sycl::device dev = q.get_device();
sycl::context ctx = q.get_context();
if (!dev.get_info<sycl::info::device::usm_shared_allocations>()) {
std::cout << "Shared USM is not supported. Skipping test.\n";
return 0;
}

bool coherent{false};

int *ptr = sycl::malloc_shared<int>(1, q);

// Coherency test 1
//
// The following test validates if memory access is fine with memory allocated
// using malloc_managed() and COARSE_GRAINED advice set via mem_advise().
//
// Coarse grained memory is only guaranteed to be coherent outside of GPU
// kernels that modify it. Changes applied to coarse-grained memory by a GPU
// kernel are only visible to the rest of the system (CPU or other GPUs) when
// the kernel has completed. A GPU kernel is only guaranteed to see changes
// applied to coarse grained memory by the rest of the system (CPU or other
// GPUs) if those changes were made before the kernel launched.

// Hint to use coarse-grain memory.
q.mem_advise(ptr, sizeof(int), int{PI_MEM_ADVICE_HIP_SET_COARSE_GRAINED});

int init_val{9};
int expected{init_val * init_val};

*ptr = init_val;
q.parallel_for(sycl::range{1}, kernels::SquareKrnl{ptr});
// Synchronise the underlying stream.
q.wait();

// Check if caches are flushed correctly and same memory is between devices.
if (*ptr == expected) {
coherent = true;
} else {
std::cerr << "Coherency test failed. Value: " << *ptr
<< " (expected: " << expected << ")\n";
coherent = false;
}

// Coherency test 2
//
// The following test validates if fine-grain behavior is observed or not with
// memory allocated using malloc_managed().
//
// Fine grained memory allows CPUs and GPUs to synchronize (via atomics) and
// coherently communicate with each other while the GPU kernel is running.

// Hint to use fine-grain memory.
q.mem_advise(ptr, sizeof(int), int{PI_MEM_ADVICE_HIP_UNSET_COARSE_GRAINED});

init_val = 1;
expected = 4;

*ptr = init_val;
q.parallel_for(sycl::range{1}, kernels::CoherencyTestKrnl{ptr});

// wait until ptr is 2 from the kernel (or 3 seconds), then increment to 3.
std::chrono::steady_clock::time_point start =
std::chrono::steady_clock::now();
while (std::chrono::duration_cast<std::chrono::seconds>(
std::chrono::steady_clock::now() - start)
.count() < 3 &&
*ptr == 2) {
}
*ptr += 1;

// Synchronise the underlying stream.
q.wait();

// Check if caches are flushed correctly and same memory is between devices.
if (*ptr == expected) {
coherent &= true;
} else {
std::cerr << "Coherency test failed. Value: " << *ptr
<< " (expected: " << expected << ")\n";
coherent = false;
}

// Cleanup
sycl::free(ptr, q);

// Check if all coherency tests passed.
assert(coherent);

return 0;
}

0 comments on commit f87b0e0

Please sign in to comment.