Skip to content

Commit

Permalink
Merge pull request intel#2277 from igchor/cooperative_fix
Browse files Browse the repository at this point in the history
[Spec] fix urKernelSuggestMaxCooperativeGroupCountExp
  • Loading branch information
martygrant authored Dec 19, 2024
2 parents bb64b3e + 4a89e1c commit ea0f3a1
Show file tree
Hide file tree
Showing 16 changed files with 69 additions and 17 deletions.
3 changes: 3 additions & 0 deletions include/ur_api.h
Original file line number Diff line number Diff line change
Expand Up @@ -9543,13 +9543,15 @@ urEnqueueCooperativeKernelLaunchExp(
/// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC
/// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE
/// + `NULL == hKernel`
/// + `NULL == hDevice`
/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER
/// + `NULL == pLocalWorkSize`
/// + `NULL == pGroupCountRet`
/// - ::UR_RESULT_ERROR_INVALID_KERNEL
UR_APIEXPORT ur_result_t UR_APICALL
urKernelSuggestMaxCooperativeGroupCountExp(
ur_kernel_handle_t hKernel, ///< [in] handle of the kernel object
ur_device_handle_t hDevice, ///< [in] handle of the device object
uint32_t workDim, ///< [in] number of dimensions, from 1 to 3, to specify the work-group
///< work-items
const size_t *pLocalWorkSize, ///< [in] pointer to an array of workDim unsigned values that specify the
Expand Down Expand Up @@ -11090,6 +11092,7 @@ typedef struct ur_kernel_set_specialization_constants_params_t {
/// allowing the callback the ability to modify the parameter's value
typedef struct ur_kernel_suggest_max_cooperative_group_count_exp_params_t {
ur_kernel_handle_t *phKernel;
ur_device_handle_t *phDevice;
uint32_t *pworkDim;
const size_t **ppLocalWorkSize;
size_t *pdynamicSharedMemorySize;
Expand Down
1 change: 1 addition & 0 deletions include/ur_ddi.h
Original file line number Diff line number Diff line change
Expand Up @@ -651,6 +651,7 @@ typedef ur_result_t(UR_APICALL *ur_pfnGetKernelProcAddrTable_t)(
/// @brief Function-pointer for urKernelSuggestMaxCooperativeGroupCountExp
typedef ur_result_t(UR_APICALL *ur_pfnKernelSuggestMaxCooperativeGroupCountExp_t)(
ur_kernel_handle_t,
ur_device_handle_t,
uint32_t,
const size_t *,
size_t,
Expand Down
6 changes: 6 additions & 0 deletions include/ur_print.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -13203,6 +13203,12 @@ inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct
ur::details::printPtr(os,
*(params->phKernel));

os << ", ";
os << ".hDevice = ";

ur::details::printPtr(os,
*(params->phDevice));

os << ", ";
os << ".workDim = ";

Expand Down
3 changes: 3 additions & 0 deletions scripts/core/exp-cooperative-kernels.yml
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,9 @@ params:
- type: $x_kernel_handle_t
name: hKernel
desc: "[in] handle of the kernel object"
- type: $x_device_handle_t
name: hDevice
desc: "[in] handle of the device object"
- type: uint32_t
name: workDim
desc: "[in] number of dimensions, from 1 to 3, to specify the work-group work-items"
Expand Down
7 changes: 5 additions & 2 deletions source/adapters/cuda/kernel.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -190,10 +190,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelGetNativeHandle(
}

UR_APIEXPORT ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp(
ur_kernel_handle_t hKernel, uint32_t workDim, const size_t *pLocalWorkSize,
size_t dynamicSharedMemorySize, uint32_t *pGroupCountRet) {
ur_kernel_handle_t hKernel, ur_device_handle_t hDevice, uint32_t workDim,
const size_t *pLocalWorkSize, size_t dynamicSharedMemorySize,
uint32_t *pGroupCountRet) {
UR_ASSERT(hKernel, UR_RESULT_ERROR_INVALID_KERNEL);

std::ignore = hDevice;

size_t localWorkSize = pLocalWorkSize[0];
localWorkSize *= (workDim >= 2 ? pLocalWorkSize[1] : 1);
localWorkSize *= (workDim == 3 ? pLocalWorkSize[2] : 1);
Expand Down
6 changes: 4 additions & 2 deletions source/adapters/hip/kernel.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -169,9 +169,11 @@ urKernelGetNativeHandle(ur_kernel_handle_t, ur_native_handle_t *) {
}

UR_APIEXPORT ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp(
ur_kernel_handle_t hKernel, uint32_t workDim, const size_t *pLocalWorkSize,
size_t dynamicSharedMemorySize, uint32_t *pGroupCountRet) {
ur_kernel_handle_t hKernel, ur_device_handle_t hDevice, uint32_t workDim,
const size_t *pLocalWorkSize, size_t dynamicSharedMemorySize,
uint32_t *pGroupCountRet) {
std::ignore = hKernel;
std::ignore = hDevice;
std::ignore = workDim;
std::ignore = pLocalWorkSize;
std::ignore = dynamicSharedMemorySize;
Expand Down
9 changes: 6 additions & 3 deletions source/adapters/level_zero/kernel.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1054,8 +1054,9 @@ ur_result_t urKernelGetNativeHandle(
}

// Level Zero adapter implementation of
// urKernelSuggestMaxCooperativeGroupCountExp, shown here as rendered diff
// output: the two-line parameter list directly below is the pre-change
// signature, the three-line parameter list after it (which adds hDevice) is the
// post-change signature, and the "Expand All" line marks lines hidden from
// this view.
//
// On success the visible code writes the count obtained from
// zeKernelSuggestMaxCooperativeGroupCount to *pGroupCountRet and returns
// UR_RESULT_SUCCESS.
ur_result_t urKernelSuggestMaxCooperativeGroupCountExp(
ur_kernel_handle_t hKernel, uint32_t workDim, const size_t *pLocalWorkSize,
size_t dynamicSharedMemorySize, uint32_t *pGroupCountRet) {
ur_kernel_handle_t hKernel, ur_device_handle_t hDevice, uint32_t workDim,
const size_t *pLocalWorkSize, size_t dynamicSharedMemorySize,
uint32_t *pGroupCountRet) {
// dynamicSharedMemorySize is not read anywhere in the visible code.
(void)dynamicSharedMemorySize;
// Take a shared lock on the kernel's mutex for the rest of the function.
std::shared_lock<ur_shared_mutex> Guard(hKernel->Mutex);

Expand All @@ -1066,8 +1067,10 @@ ur_result_t urKernelSuggestMaxCooperativeGroupCountExp(
// Apply the work-group size to hKernel->ZeKernel. WG is defined in the hidden
// lines above — presumably derived from pLocalWorkSize/workDim; confirm.
ZE2UR_CALL(zeKernelSetGroupSize, (hKernel->ZeKernel, WG[0], WG[1], WG[2]));

uint32_t TotalGroupCount = 0;
// Post-change lines: resolve the Level Zero kernel handle for hDevice.
// NOTE(review): the group size above is set on hKernel->ZeKernel, but the
// post-change query below runs on the ZeKernel returned by getZeKernel. If
// getZeKernel can return a handle other than hKernel->ZeKernel, the query does
// not see the group size configured above — confirm, and consider resolving
// ZeKernel before zeKernelSetGroupSize and using it for both calls.
ze_kernel_handle_t ZeKernel;
UR_CALL(getZeKernel(hDevice->ZeDevice, hKernel, &ZeKernel));
// Of the two argument lines that follow, the first is the pre-change form
// (hKernel->ZeKernel) and the second is the post-change form (ZeKernel).
ZE2UR_CALL(zeKernelSuggestMaxCooperativeGroupCount,
(hKernel->ZeKernel, &TotalGroupCount));
(ZeKernel, &TotalGroupCount));
*pGroupCountRet = TotalGroupCount;
return UR_RESULT_SUCCESS;
}
Expand Down
5 changes: 3 additions & 2 deletions source/adapters/level_zero/ur_interface_loader.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -691,8 +691,9 @@ ur_result_t urEnqueueCooperativeKernelLaunchExp(
const size_t *pLocalWorkSize, uint32_t numEventsInWaitList,
const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent);
ur_result_t urKernelSuggestMaxCooperativeGroupCountExp(
ur_kernel_handle_t hKernel, uint32_t workDim, const size_t *pLocalWorkSize,
size_t dynamicSharedMemorySize, uint32_t *pGroupCountRet);
ur_kernel_handle_t hKernel, ur_device_handle_t hDevice, uint32_t workDim,
const size_t *pLocalWorkSize, size_t dynamicSharedMemorySize,
uint32_t *pGroupCountRet);
ur_result_t urEnqueueTimestampRecordingExp(
ur_queue_handle_t hQueue, bool blocking, uint32_t numEventsInWaitList,
const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent);
Expand Down
5 changes: 3 additions & 2 deletions source/adapters/level_zero/v2/api.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -485,8 +485,9 @@ ur_result_t urCommandBufferCommandGetInfoExp(
}

// Level Zero v2 adapter entry point for
// urKernelSuggestMaxCooperativeGroupCountExp, shown here as rendered diff
// output: the two-line parameter list directly below is the pre-change
// signature and the three-line parameter list after it (which adds hDevice) is
// the post-change signature.
//
// The body is a stub: it reads none of its arguments, does not write
// *pGroupCountRet, logs an error naming the function, and returns
// UR_RESULT_ERROR_UNSUPPORTED_FEATURE.
ur_result_t urKernelSuggestMaxCooperativeGroupCountExp(
ur_kernel_handle_t hKernel, uint32_t workDim, const size_t *pLocalWorkSize,
size_t dynamicSharedMemorySize, uint32_t *pGroupCountRet) {
ur_kernel_handle_t hKernel, ur_device_handle_t hDevice, uint32_t workDim,
const size_t *pLocalWorkSize, size_t dynamicSharedMemorySize,
uint32_t *pGroupCountRet) {
// __FUNCTION__ supplies the function name for the "{}" placeholder.
logger::error("{} function not implemented!", __FUNCTION__);
return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
}
Expand Down
7 changes: 6 additions & 1 deletion source/adapters/mock/ur_mockddi.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10057,6 +10057,7 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp(
/// @brief Intercept function for urKernelSuggestMaxCooperativeGroupCountExp
__urdlllocal ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp(
ur_kernel_handle_t hKernel, ///< [in] handle of the kernel object
ur_device_handle_t hDevice, ///< [in] handle of the device object
uint32_t
workDim, ///< [in] number of dimensions, from 1 to 3, to specify the work-group
///< work-items
Expand All @@ -10072,7 +10073,11 @@ __urdlllocal ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp(
ur_result_t result = UR_RESULT_SUCCESS;

ur_kernel_suggest_max_cooperative_group_count_exp_params_t params = {
&hKernel, &workDim, &pLocalWorkSize, &dynamicSharedMemorySize,
&hKernel,
&hDevice,
&workDim,
&pLocalWorkSize,
&dynamicSharedMemorySize,
&pGroupCountRet};

auto beforeCallback = reinterpret_cast<ur_mock_callback_t>(
Expand Down
1 change: 1 addition & 0 deletions source/adapters/opencl/kernel.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -390,6 +390,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelGetNativeHandle(

UR_APIEXPORT ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp(
[[maybe_unused]] ur_kernel_handle_t hKernel,
[[maybe_unused]] ur_device_handle_t hDevice,
[[maybe_unused]] uint32_t workDim,
[[maybe_unused]] const size_t *pLocalWorkSize,
[[maybe_unused]] size_t dynamicSharedMemorySize,
Expand Down
9 changes: 7 additions & 2 deletions source/loader/layers/tracing/ur_trcddi.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8633,6 +8633,7 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp(
/// @brief Intercept function for urKernelSuggestMaxCooperativeGroupCountExp
__urdlllocal ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp(
ur_kernel_handle_t hKernel, ///< [in] handle of the kernel object
ur_device_handle_t hDevice, ///< [in] handle of the device object
uint32_t
workDim, ///< [in] number of dimensions, from 1 to 3, to specify the work-group
///< work-items
Expand All @@ -8654,7 +8655,11 @@ __urdlllocal ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp(
}

ur_kernel_suggest_max_cooperative_group_count_exp_params_t params = {
&hKernel, &workDim, &pLocalWorkSize, &dynamicSharedMemorySize,
&hKernel,
&hDevice,
&workDim,
&pLocalWorkSize,
&dynamicSharedMemorySize,
&pGroupCountRet};
uint64_t instance = getContext()->notify_begin(
UR_FUNCTION_KERNEL_SUGGEST_MAX_COOPERATIVE_GROUP_COUNT_EXP,
Expand All @@ -8664,7 +8669,7 @@ __urdlllocal ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp(
logger.info(" ---> urKernelSuggestMaxCooperativeGroupCountExp\n");

ur_result_t result = pfnSuggestMaxCooperativeGroupCountExp(
hKernel, workDim, pLocalWorkSize, dynamicSharedMemorySize,
hKernel, hDevice, workDim, pLocalWorkSize, dynamicSharedMemorySize,
pGroupCountRet);

getContext()->notify_end(
Expand Down
12 changes: 11 additions & 1 deletion source/loader/layers/validation/ur_valddi.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9656,6 +9656,7 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp(
/// @brief Intercept function for urKernelSuggestMaxCooperativeGroupCountExp
__urdlllocal ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp(
ur_kernel_handle_t hKernel, ///< [in] handle of the kernel object
ur_device_handle_t hDevice, ///< [in] handle of the device object
uint32_t
workDim, ///< [in] number of dimensions, from 1 to 3, to specify the work-group
///< work-items
Expand All @@ -9681,6 +9682,10 @@ __urdlllocal ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp(
return UR_RESULT_ERROR_INVALID_NULL_HANDLE;
}

if (NULL == hDevice) {
return UR_RESULT_ERROR_INVALID_NULL_HANDLE;
}

if (NULL == pLocalWorkSize) {
return UR_RESULT_ERROR_INVALID_NULL_POINTER;
}
Expand All @@ -9695,8 +9700,13 @@ __urdlllocal ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp(
getContext()->refCountContext->logInvalidReference(hKernel);
}

if (getContext()->enableLifetimeValidation &&
!getContext()->refCountContext->isReferenceValid(hDevice)) {
getContext()->refCountContext->logInvalidReference(hDevice);
}

ur_result_t result = pfnSuggestMaxCooperativeGroupCountExp(
hKernel, workDim, pLocalWorkSize, dynamicSharedMemorySize,
hKernel, hDevice, workDim, pLocalWorkSize, dynamicSharedMemorySize,
pGroupCountRet);

return result;
Expand Down
6 changes: 5 additions & 1 deletion source/loader/ur_ldrddi.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8877,6 +8877,7 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp(
/// @brief Intercept function for urKernelSuggestMaxCooperativeGroupCountExp
__urdlllocal ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp(
ur_kernel_handle_t hKernel, ///< [in] handle of the kernel object
ur_device_handle_t hDevice, ///< [in] handle of the device object
uint32_t
workDim, ///< [in] number of dimensions, from 1 to 3, to specify the work-group
///< work-items
Expand Down Expand Up @@ -8904,9 +8905,12 @@ __urdlllocal ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp(
// convert loader handle to platform handle
hKernel = reinterpret_cast<ur_kernel_object_t *>(hKernel)->handle;

// convert loader handle to platform handle
hDevice = reinterpret_cast<ur_device_object_t *>(hDevice)->handle;

// forward to device-platform
result = pfnSuggestMaxCooperativeGroupCountExp(
hKernel, workDim, pLocalWorkSize, dynamicSharedMemorySize,
hKernel, hDevice, workDim, pLocalWorkSize, dynamicSharedMemorySize,
pGroupCountRet);

return result;
Expand Down
4 changes: 3 additions & 1 deletion source/loader/ur_libapi.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8935,12 +8935,14 @@ ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp(
/// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC
/// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE
/// + `NULL == hKernel`
/// + `NULL == hDevice`
/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER
/// + `NULL == pLocalWorkSize`
/// + `NULL == pGroupCountRet`
/// - ::UR_RESULT_ERROR_INVALID_KERNEL
ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp(
ur_kernel_handle_t hKernel, ///< [in] handle of the kernel object
ur_device_handle_t hDevice, ///< [in] handle of the device object
uint32_t
workDim, ///< [in] number of dimensions, from 1 to 3, to specify the work-group
///< work-items
Expand All @@ -8961,7 +8963,7 @@ ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp(
}

return pfnSuggestMaxCooperativeGroupCountExp(
hKernel, workDim, pLocalWorkSize, dynamicSharedMemorySize,
hKernel, hDevice, workDim, pLocalWorkSize, dynamicSharedMemorySize,
pGroupCountRet);
} catch (...) {
return exceptionToResult(std::current_exception());
Expand Down
2 changes: 2 additions & 0 deletions source/ur_api.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7578,12 +7578,14 @@ ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp(
/// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC
/// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE
/// + `NULL == hKernel`
/// + `NULL == hDevice`
/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER
/// + `NULL == pLocalWorkSize`
/// + `NULL == pGroupCountRet`
/// - ::UR_RESULT_ERROR_INVALID_KERNEL
ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp(
ur_kernel_handle_t hKernel, ///< [in] handle of the kernel object
ur_device_handle_t hDevice, ///< [in] handle of the device object
uint32_t
workDim, ///< [in] number of dimensions, from 1 to 3, to specify the work-group
///< work-items
Expand Down

0 comments on commit ea0f3a1

Please sign in to comment.