Skip to content

Commit

Permalink
[SYCL][HIP] Add coarse-grained memory advice for HIP-AMD
Browse files Browse the repository at this point in the history
  • Loading branch information
GeorgeWeb committed Aug 8, 2023
1 parent f714d74 commit 606c4dd
Show file tree
Hide file tree
Showing 5 changed files with 161 additions and 25 deletions.
2 changes: 2 additions & 0 deletions sycl/include/sycl/detail/pi.h
Original file line number Diff line number Diff line change
Expand Up @@ -574,6 +574,8 @@ typedef enum {
PI_MEM_ADVICE_CUDA_UNSET_PREFERRED_LOCATION_HOST = 1 << 7,
PI_MEM_ADVICE_CUDA_SET_ACCESSED_BY_HOST = 1 << 8,
PI_MEM_ADVICE_CUDA_UNSET_ACCESSED_BY_HOST = 1 << 9,
PI_MEM_ADVICE_HIP_SET_COARSE_GRAINED = 1 << 10,
PI_MEM_ADVICE_HIP_UNSET_COARSE_GRAINED = 1 << 11,
PI_MEM_ADVICE_UNKNOWN = 0x7FFFFFFF,
} _pi_mem_advice;

Expand Down
6 changes: 6 additions & 0 deletions sycl/plugins/unified_runtime/pi2ur.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -3240,6 +3240,12 @@ inline pi_result piextUSMEnqueueMemAdvise(pi_queue Queue, const void *Ptr,
if (Advice & PI_MEM_ADVICE_CUDA_UNSET_ACCESSED_BY_HOST) {
UrAdvice |= UR_USM_ADVICE_FLAG_CLEAR_ACCESSED_BY_HOST;
}
if (Advice & PI_MEM_ADVICE_HIP_SET_COARSE_GRAINED) {
UrAdvice |= UR_USM_ADVICE_FLAG_SET_NON_ATOMIC_MOSTLY;
}
if (Advice & PI_MEM_ADVICE_HIP_UNSET_COARSE_GRAINED) {
UrAdvice |= UR_USM_ADVICE_FLAG_CLEAR_NON_ATOMIC_MOSTLY;
}
if (Advice & PI_MEM_ADVICE_RESET) {
UrAdvice |= UR_USM_ADVICE_FLAG_DEFAULT;
}
Expand Down
63 changes: 38 additions & 25 deletions sycl/plugins/unified_runtime/ur/adapters/hip/enqueue.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
#include "kernel.hpp"
#include "memory.hpp"
#include "queue.hpp"
#include <hip/hip_runtime_api.h>

namespace {

Expand Down Expand Up @@ -88,24 +89,35 @@ ur_result_t setHipMemAdvise(const void *DevPtr, size_t Size,
hipDevice_t Device) {
using ur_to_hip_advice_t = std::pair<ur_usm_advice_flags_t, hipMemoryAdvise>;

static constexpr std::array<ur_to_hip_advice_t, 6>
URToHIPMemAdviseDeviceFlags{
std::make_pair(UR_USM_ADVICE_FLAG_SET_READ_MOSTLY,
hipMemAdviseSetReadMostly),
std::make_pair(UR_USM_ADVICE_FLAG_CLEAR_READ_MOSTLY,
hipMemAdviseUnsetReadMostly),
std::make_pair(UR_USM_ADVICE_FLAG_SET_PREFERRED_LOCATION,
hipMemAdviseSetPreferredLocation),
std::make_pair(UR_USM_ADVICE_FLAG_CLEAR_PREFERRED_LOCATION,
hipMemAdviseUnsetPreferredLocation),
std::make_pair(UR_USM_ADVICE_FLAG_SET_ACCESSED_BY_DEVICE,
hipMemAdviseSetAccessedBy),
std::make_pair(UR_USM_ADVICE_FLAG_CLEAR_ACCESSED_BY_DEVICE,
hipMemAdviseUnsetAccessedBy),
};
for (auto &FlagPair : URToHIPMemAdviseDeviceFlags) {
if (URAdviceFlags & FlagPair.first) {
UR_CHECK_ERROR(hipMemAdvise(DevPtr, Size, FlagPair.second, Device));
#if defined(__HIP_PLATFORM_AMD__)
constexpr int DeviceFlagCount = 8;
#else
constexpr int DeviceFlagCount = 6;
#endif
static constexpr std::array<ur_to_hip_advice_t, DeviceFlagCount>
URToHIPMemAdviseDeviceFlags {
std::make_pair(UR_USM_ADVICE_FLAG_SET_READ_MOSTLY,
hipMemAdviseSetReadMostly),
std::make_pair(UR_USM_ADVICE_FLAG_CLEAR_READ_MOSTLY,
hipMemAdviseUnsetReadMostly),
std::make_pair(UR_USM_ADVICE_FLAG_SET_PREFERRED_LOCATION,
hipMemAdviseSetPreferredLocation),
std::make_pair(UR_USM_ADVICE_FLAG_CLEAR_PREFERRED_LOCATION,
hipMemAdviseUnsetPreferredLocation),
std::make_pair(UR_USM_ADVICE_FLAG_SET_ACCESSED_BY_DEVICE,
hipMemAdviseSetAccessedBy),
std::make_pair(UR_USM_ADVICE_FLAG_CLEAR_ACCESSED_BY_DEVICE,
hipMemAdviseUnsetAccessedBy),
#if defined(__HIP_PLATFORM_AMD__)
std::make_pair(UR_USM_ADVICE_FLAG_SET_NON_ATOMIC_MOSTLY,
hipMemAdviseSetCoarseGrain),
std::make_pair(UR_USM_ADVICE_FLAG_CLEAR_NON_ATOMIC_MOSTLY,
hipMemAdviseUnsetCoarseGrain),
#endif
};
for (auto &[URDevAdvice, HipDevAdvice] : URToHIPMemAdviseDeviceFlags) {
if (URAdviceFlags & URDevAdvice) {
UR_CHECK_ERROR(hipMemAdvise(DevPtr, Size, HipDevAdvice, Device));
}
}

Expand All @@ -120,18 +132,15 @@ ur_result_t setHipMemAdvise(const void *DevPtr, size_t Size,
hipMemAdviseUnsetAccessedBy),
};

for (auto &FlagPair : URToHIPMemAdviseHostFlags) {
if (URAdviceFlags & FlagPair.first) {
UR_CHECK_ERROR(
hipMemAdvise(DevPtr, Size, FlagPair.second, hipCpuDeviceId));
for (auto &[URHostAdvice, HipHostAdvice] : URToHIPMemAdviseHostFlags) {
if (URAdviceFlags & URHostAdvice) {
UR_CHECK_ERROR(hipMemAdvise(DevPtr, Size, HipHostAdvice, hipCpuDeviceId));
}
}

// Handle unmapped memory advice flags
if (URAdviceFlags &
(UR_USM_ADVICE_FLAG_SET_NON_ATOMIC_MOSTLY |
UR_USM_ADVICE_FLAG_CLEAR_NON_ATOMIC_MOSTLY |
UR_USM_ADVICE_FLAG_BIAS_CACHED | UR_USM_ADVICE_FLAG_BIAS_UNCACHED)) {
(UR_USM_ADVICE_FLAG_BIAS_CACHED | UR_USM_ADVICE_FLAG_BIAS_UNCACHED)) {
return UR_RESULT_ERROR_INVALID_ENUMERATION;
}

Expand Down Expand Up @@ -1443,6 +1452,10 @@ urEnqueueUSMAdvise(ur_queue_handle_t hQueue, const void *pMem, size_t size,
pMem, size, hipMemAdviseUnsetPreferredLocation, DeviceID));
UR_CHECK_ERROR(
hipMemAdvise(pMem, size, hipMemAdviseUnsetAccessedBy, DeviceID));
#if defined(__HIP_PLATFORM_AMD__)
UR_CHECK_ERROR(
hipMemAdvise(pMem, size, hipMemAdviseUnsetCoarseGrain, DeviceID));
#endif
} else {
Result = setHipMemAdvise(HIPDevicePtr, size, advice, DeviceID);
// UR_RESULT_ERROR_INVALID_ENUMERATION is returned when using a valid but
Expand Down
58 changes: 58 additions & 0 deletions sycl/test-e2e/USM/memadvise_hip.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
// RUN: %{build} -o %t1.out
// REQUIRES: hip_amd
// RUN: %{run} %t1.out

//==---------------- memadvise_cuda.cpp ------------------------------------==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "sycl/context.hpp"
#include "sycl/detail/pi.h"
#include "sycl/device.hpp"
#include <iostream>
#include <sycl/sycl.hpp>
#include <vector>

int main() {
sycl::queue q;
sycl::device dev = q.get_device();
sycl::context ctx = q.get_context();
if (!dev.get_info<sycl::info::device::usm_shared_allocations>()) {
std::cout << "Shared USM is not supported. Skipping test." << std::endl;
return 0;
}

constexpr size_t size = 100;
void *ptr = sycl::malloc_shared(size, dev, ctx);
if (ptr == nullptr) {
std::cout << "Allocation failed!" << std::endl;
return -1;
}

// NOTE: PI_MEM_ADVICE_CUDA_* advice values are mapped to the HIP backend too.
std::vector<int> valid_advices{
PI_MEM_ADVICE_CUDA_SET_READ_MOSTLY,
PI_MEM_ADVICE_CUDA_UNSET_READ_MOSTLY,
PI_MEM_ADVICE_CUDA_SET_PREFERRED_LOCATION,
PI_MEM_ADVICE_CUDA_UNSET_PREFERRED_LOCATION,
PI_MEM_ADVICE_CUDA_SET_ACCESSED_BY,
PI_MEM_ADVICE_CUDA_UNSET_ACCESSED_BY,
PI_MEM_ADVICE_CUDA_SET_PREFERRED_LOCATION_HOST,
PI_MEM_ADVICE_CUDA_UNSET_PREFERRED_LOCATION_HOST,
PI_MEM_ADVICE_CUDA_SET_ACCESSED_BY_HOST,
PI_MEM_ADVICE_CUDA_UNSET_ACCESSED_BY_HOST,
PI_MEM_ADVICE_HIP_SET_COARSE_GRAINED,
PI_MEM_ADVICE_HIP_UNSET_COARSE_GRAINED,
};
for (int advice : valid_advices) {
q.mem_advise(ptr, size, advice);
}

q.wait_and_throw();
std::cout << "Test passed." << std::endl;
return 0;
}
57 changes: 57 additions & 0 deletions sycl/test-e2e/USM/memadvise_hip_coherency.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
// RUN: %{build} -o %t1.out
// REQUIRES: hip_amd
// RUN: %{run} %t1.out

#include <iostream>
#include <sycl/sycl.hpp>

namespace kernels {
class SquareKrnl final {
public:
SquareKrnl(int *ptr) : mPtr(ptr) {}

void operator()(sycl::id<1>) const {
// mPtr value squared here
*mPtr = (*mPtr) * (*mPtr);
}

private:
int *mPtr;
};
} // namespace kernels

int main() {
sycl::queue q;
sycl::device dev = q.get_device();
sycl::context ctx = q.get_context();
if (!dev.get_info<sycl::info::device::usm_shared_allocations>()) {
std::cout << "Shared USM is not supported. Skipping test.\n";
return 0;
}

int *ptr = sycl::malloc_shared<int>(1, q);

// Hint that data coherency during simultaneous execution on
// both host and device is not necessary
constexpr int MemAdviseCoarseGrained = PI_MEM_ADVICE_HIP_SET_COARSE_GRAINED;
q.mem_advise(ptr, sizeof(int), MemAdviseCoarseGrained);

// Call this routine TesCoherency function!
constexpr int number = 9;
constexpr int expected = 81;
*ptr = number;
q.submit([&](sycl::handler &h) {
h.parallel_for(sycl::range{1}, kernels::SquareKrnl(ptr));
});
// Synchronise the underlying stream the work is run on before host access.
q.wait();
// Check if caches are flushed correctly and same memory is between devices.
if (*ptr != expected) {
std::cout << "Coarse-grained mode coherency failed. Value = " << *ptr
<< '\n';
return 1;
}

sycl::free(ptr, q);
return 0;
}

0 comments on commit 606c4dd

Please sign in to comment.