From 4a89e1c69a65acd4f2792743584dfc704086da5e Mon Sep 17 00:00:00 2001 From: Igor Chorazewicz Date: Fri, 1 Nov 2024 19:05:08 +0100 Subject: [PATCH] [Spec] fix urKernelSuggestMaxCooperativeGroupCountExp Add extra param: ur_device_handle_t. It is necessary to implement this function on L0 for kernels that are built for multiple devices. Right now, the implementation only works when the kernel is created from a native handle. Ref: https://github.com/oneapi-src/unified-runtime/issues/2262 --- include/ur_api.h | 3 +++ include/ur_ddi.h | 1 + include/ur_print.hpp | 6 ++++++ scripts/core/exp-cooperative-kernels.yml | 3 +++ source/adapters/cuda/kernel.cpp | 7 +++++-- source/adapters/hip/kernel.cpp | 6 ++++-- source/adapters/level_zero/kernel.cpp | 9 ++++++--- source/adapters/level_zero/ur_interface_loader.hpp | 5 +++-- source/adapters/level_zero/v2/api.cpp | 5 +++-- source/adapters/mock/ur_mockddi.cpp | 7 ++++++- source/adapters/opencl/kernel.cpp | 1 + source/loader/layers/tracing/ur_trcddi.cpp | 9 +++++++-- source/loader/layers/validation/ur_valddi.cpp | 12 +++++++++++- source/loader/ur_ldrddi.cpp | 6 +++++- source/loader/ur_libapi.cpp | 4 +++- source/ur_api.cpp | 2 ++ 16 files changed, 69 insertions(+), 17 deletions(-) diff --git a/include/ur_api.h b/include/ur_api.h index 28569597c424c..e504a3aa88190 100644 --- a/include/ur_api.h +++ b/include/ur_api.h @@ -9536,6 +9536,7 @@ urEnqueueCooperativeKernelLaunchExp( /// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC /// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE /// + `NULL == hKernel` +/// + `NULL == hDevice` /// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER /// + `NULL == pLocalWorkSize` /// + `NULL == pGroupCountRet` @@ -9543,6 +9544,7 @@ urEnqueueCooperativeKernelLaunchExp( UR_APIEXPORT ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( ur_kernel_handle_t hKernel, ///< [in] handle of the kernel object + ur_device_handle_t hDevice, ///< [in] handle of the device object uint32_t workDim, ///< [in] number of dimensions, from 1 
to 3, to specify the work-group ///< work-items const size_t *pLocalWorkSize, ///< [in] pointer to an array of workDim unsigned values that specify the @@ -11083,6 +11085,7 @@ typedef struct ur_kernel_set_specialization_constants_params_t { /// allowing the callback the ability to modify the parameter's value typedef struct ur_kernel_suggest_max_cooperative_group_count_exp_params_t { ur_kernel_handle_t *phKernel; + ur_device_handle_t *phDevice; uint32_t *pworkDim; const size_t **ppLocalWorkSize; size_t *pdynamicSharedMemorySize; diff --git a/include/ur_ddi.h b/include/ur_ddi.h index eeb323fc5851b..2384a68ea16e7 100644 --- a/include/ur_ddi.h +++ b/include/ur_ddi.h @@ -651,6 +651,7 @@ typedef ur_result_t(UR_APICALL *ur_pfnGetKernelProcAddrTable_t)( /// @brief Function-pointer for urKernelSuggestMaxCooperativeGroupCountExp typedef ur_result_t(UR_APICALL *ur_pfnKernelSuggestMaxCooperativeGroupCountExp_t)( ur_kernel_handle_t, + ur_device_handle_t, uint32_t, const size_t *, size_t, diff --git a/include/ur_print.hpp b/include/ur_print.hpp index 5255a20f78caf..08a2fc6ce262b 100644 --- a/include/ur_print.hpp +++ b/include/ur_print.hpp @@ -13187,6 +13187,12 @@ inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct ur::details::printPtr(os, *(params->phKernel)); + os << ", "; + os << ".hDevice = "; + + ur::details::printPtr(os, + *(params->phDevice)); + os << ", "; os << ".workDim = "; diff --git a/scripts/core/exp-cooperative-kernels.yml b/scripts/core/exp-cooperative-kernels.yml index ad3ba0ffbad7b..6020ca5f45d08 100644 --- a/scripts/core/exp-cooperative-kernels.yml +++ b/scripts/core/exp-cooperative-kernels.yml @@ -78,6 +78,9 @@ params: - type: $x_kernel_handle_t name: hKernel desc: "[in] handle of the kernel object" + - type: $x_device_handle_t + name: hDevice + desc: "[in] handle of the device object" - type: uint32_t name: workDim desc: "[in] number of dimensions, from 1 to 3, to specify the work-group work-items" diff --git 
a/source/adapters/cuda/kernel.cpp b/source/adapters/cuda/kernel.cpp index 46c4907d4b487..340e5ff6344c7 100644 --- a/source/adapters/cuda/kernel.cpp +++ b/source/adapters/cuda/kernel.cpp @@ -190,10 +190,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelGetNativeHandle( } UR_APIEXPORT ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( - ur_kernel_handle_t hKernel, uint32_t workDim, const size_t *pLocalWorkSize, - size_t dynamicSharedMemorySize, uint32_t *pGroupCountRet) { + ur_kernel_handle_t hKernel, ur_device_handle_t hDevice, uint32_t workDim, + const size_t *pLocalWorkSize, size_t dynamicSharedMemorySize, + uint32_t *pGroupCountRet) { UR_ASSERT(hKernel, UR_RESULT_ERROR_INVALID_KERNEL); + std::ignore = hDevice; + size_t localWorkSize = pLocalWorkSize[0]; localWorkSize *= (workDim >= 2 ? pLocalWorkSize[1] : 1); localWorkSize *= (workDim == 3 ? pLocalWorkSize[2] : 1); diff --git a/source/adapters/hip/kernel.cpp b/source/adapters/hip/kernel.cpp index 1ba50c4360f19..a5aefb1293632 100644 --- a/source/adapters/hip/kernel.cpp +++ b/source/adapters/hip/kernel.cpp @@ -169,9 +169,11 @@ urKernelGetNativeHandle(ur_kernel_handle_t, ur_native_handle_t *) { } UR_APIEXPORT ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( - ur_kernel_handle_t hKernel, uint32_t workDim, const size_t *pLocalWorkSize, - size_t dynamicSharedMemorySize, uint32_t *pGroupCountRet) { + ur_kernel_handle_t hKernel, ur_device_handle_t hDevice, uint32_t workDim, + const size_t *pLocalWorkSize, size_t dynamicSharedMemorySize, + uint32_t *pGroupCountRet) { std::ignore = hKernel; + std::ignore = hDevice; std::ignore = workDim; std::ignore = pLocalWorkSize; std::ignore = dynamicSharedMemorySize; diff --git a/source/adapters/level_zero/kernel.cpp b/source/adapters/level_zero/kernel.cpp index b15b4ce147997..db9337289f58b 100644 --- a/source/adapters/level_zero/kernel.cpp +++ b/source/adapters/level_zero/kernel.cpp @@ -1054,8 +1054,9 @@ ur_result_t urKernelGetNativeHandle( } 
ur_result_t urKernelSuggestMaxCooperativeGroupCountExp( - ur_kernel_handle_t hKernel, uint32_t workDim, const size_t *pLocalWorkSize, - size_t dynamicSharedMemorySize, uint32_t *pGroupCountRet) { + ur_kernel_handle_t hKernel, ur_device_handle_t hDevice, uint32_t workDim, + const size_t *pLocalWorkSize, size_t dynamicSharedMemorySize, + uint32_t *pGroupCountRet) { (void)dynamicSharedMemorySize; std::shared_lock Guard(hKernel->Mutex); @@ -1066,8 +1067,10 @@ ur_result_t urKernelSuggestMaxCooperativeGroupCountExp( ZE2UR_CALL(zeKernelSetGroupSize, (hKernel->ZeKernel, WG[0], WG[1], WG[2])); uint32_t TotalGroupCount = 0; + ze_kernel_handle_t ZeKernel; + UR_CALL(getZeKernel(hDevice->ZeDevice, hKernel, &ZeKernel)); ZE2UR_CALL(zeKernelSuggestMaxCooperativeGroupCount, - (hKernel->ZeKernel, &TotalGroupCount)); + (ZeKernel, &TotalGroupCount)); *pGroupCountRet = TotalGroupCount; return UR_RESULT_SUCCESS; } diff --git a/source/adapters/level_zero/ur_interface_loader.hpp b/source/adapters/level_zero/ur_interface_loader.hpp index 5bd7c904f1a06..f2fd6a46d4d18 100644 --- a/source/adapters/level_zero/ur_interface_loader.hpp +++ b/source/adapters/level_zero/ur_interface_loader.hpp @@ -691,8 +691,9 @@ ur_result_t urEnqueueCooperativeKernelLaunchExp( const size_t *pLocalWorkSize, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent); ur_result_t urKernelSuggestMaxCooperativeGroupCountExp( - ur_kernel_handle_t hKernel, uint32_t workDim, const size_t *pLocalWorkSize, - size_t dynamicSharedMemorySize, uint32_t *pGroupCountRet); + ur_kernel_handle_t hKernel, ur_device_handle_t hDevice, uint32_t workDim, + const size_t *pLocalWorkSize, size_t dynamicSharedMemorySize, + uint32_t *pGroupCountRet); ur_result_t urEnqueueTimestampRecordingExp( ur_queue_handle_t hQueue, bool blocking, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent); diff --git a/source/adapters/level_zero/v2/api.cpp 
b/source/adapters/level_zero/v2/api.cpp index c04d4cf6ca973..5fa6478118bff 100644 --- a/source/adapters/level_zero/v2/api.cpp +++ b/source/adapters/level_zero/v2/api.cpp @@ -560,8 +560,9 @@ ur_result_t urCommandBufferCommandGetInfoExp( } ur_result_t urKernelSuggestMaxCooperativeGroupCountExp( - ur_kernel_handle_t hKernel, uint32_t workDim, const size_t *pLocalWorkSize, - size_t dynamicSharedMemorySize, uint32_t *pGroupCountRet) { + ur_kernel_handle_t hKernel, ur_device_handle_t hDevice, uint32_t workDim, + const size_t *pLocalWorkSize, size_t dynamicSharedMemorySize, + uint32_t *pGroupCountRet) { logger::error("{} function not implemented!", __FUNCTION__); return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } diff --git a/source/adapters/mock/ur_mockddi.cpp b/source/adapters/mock/ur_mockddi.cpp index b60be1d561124..b27c4efaa1a3f 100644 --- a/source/adapters/mock/ur_mockddi.cpp +++ b/source/adapters/mock/ur_mockddi.cpp @@ -10057,6 +10057,7 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp( /// @brief Intercept function for urKernelSuggestMaxCooperativeGroupCountExp __urdlllocal ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( ur_kernel_handle_t hKernel, ///< [in] handle of the kernel object + ur_device_handle_t hDevice, ///< [in] handle of the device object uint32_t workDim, ///< [in] number of dimensions, from 1 to 3, to specify the work-group ///< work-items @@ -10072,7 +10073,11 @@ __urdlllocal ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( ur_result_t result = UR_RESULT_SUCCESS; ur_kernel_suggest_max_cooperative_group_count_exp_params_t params = { - &hKernel, &workDim, &pLocalWorkSize, &dynamicSharedMemorySize, + &hKernel, + &hDevice, + &workDim, + &pLocalWorkSize, + &dynamicSharedMemorySize, &pGroupCountRet}; auto beforeCallback = reinterpret_cast( diff --git a/source/adapters/opencl/kernel.cpp b/source/adapters/opencl/kernel.cpp index df160b65eba7b..fb2c735adc4a3 100644 --- 
a/source/adapters/opencl/kernel.cpp +++ b/source/adapters/opencl/kernel.cpp @@ -390,6 +390,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelGetNativeHandle( UR_APIEXPORT ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( [[maybe_unused]] ur_kernel_handle_t hKernel, + [[maybe_unused]] ur_device_handle_t hDevice, [[maybe_unused]] uint32_t workDim, [[maybe_unused]] const size_t *pLocalWorkSize, [[maybe_unused]] size_t dynamicSharedMemorySize, diff --git a/source/loader/layers/tracing/ur_trcddi.cpp b/source/loader/layers/tracing/ur_trcddi.cpp index 55f8d00beabcd..3e8043a2580a6 100644 --- a/source/loader/layers/tracing/ur_trcddi.cpp +++ b/source/loader/layers/tracing/ur_trcddi.cpp @@ -8633,6 +8633,7 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp( /// @brief Intercept function for urKernelSuggestMaxCooperativeGroupCountExp __urdlllocal ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( ur_kernel_handle_t hKernel, ///< [in] handle of the kernel object + ur_device_handle_t hDevice, ///< [in] handle of the device object uint32_t workDim, ///< [in] number of dimensions, from 1 to 3, to specify the work-group ///< work-items @@ -8654,7 +8655,11 @@ __urdlllocal ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( } ur_kernel_suggest_max_cooperative_group_count_exp_params_t params = { - &hKernel, &workDim, &pLocalWorkSize, &dynamicSharedMemorySize, + &hKernel, + &hDevice, + &workDim, + &pLocalWorkSize, + &dynamicSharedMemorySize, &pGroupCountRet}; uint64_t instance = getContext()->notify_begin( UR_FUNCTION_KERNEL_SUGGEST_MAX_COOPERATIVE_GROUP_COUNT_EXP, @@ -8664,7 +8669,7 @@ __urdlllocal ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( logger.info(" ---> urKernelSuggestMaxCooperativeGroupCountExp\n"); ur_result_t result = pfnSuggestMaxCooperativeGroupCountExp( - hKernel, workDim, pLocalWorkSize, dynamicSharedMemorySize, + hKernel, hDevice, workDim, pLocalWorkSize, 
dynamicSharedMemorySize, pGroupCountRet); getContext()->notify_end( diff --git a/source/loader/layers/validation/ur_valddi.cpp b/source/loader/layers/validation/ur_valddi.cpp index 6e48f79edc263..d13df673cd1b9 100644 --- a/source/loader/layers/validation/ur_valddi.cpp +++ b/source/loader/layers/validation/ur_valddi.cpp @@ -9656,6 +9656,7 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp( /// @brief Intercept function for urKernelSuggestMaxCooperativeGroupCountExp __urdlllocal ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( ur_kernel_handle_t hKernel, ///< [in] handle of the kernel object + ur_device_handle_t hDevice, ///< [in] handle of the device object uint32_t workDim, ///< [in] number of dimensions, from 1 to 3, to specify the work-group ///< work-items @@ -9681,6 +9682,10 @@ __urdlllocal ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( return UR_RESULT_ERROR_INVALID_NULL_HANDLE; } + if (NULL == hDevice) { + return UR_RESULT_ERROR_INVALID_NULL_HANDLE; + } + if (NULL == pLocalWorkSize) { return UR_RESULT_ERROR_INVALID_NULL_POINTER; } @@ -9695,8 +9700,13 @@ __urdlllocal ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( getContext()->refCountContext->logInvalidReference(hKernel); } + if (getContext()->enableLifetimeValidation && + !getContext()->refCountContext->isReferenceValid(hDevice)) { + getContext()->refCountContext->logInvalidReference(hDevice); + } + ur_result_t result = pfnSuggestMaxCooperativeGroupCountExp( - hKernel, workDim, pLocalWorkSize, dynamicSharedMemorySize, + hKernel, hDevice, workDim, pLocalWorkSize, dynamicSharedMemorySize, pGroupCountRet); return result; diff --git a/source/loader/ur_ldrddi.cpp b/source/loader/ur_ldrddi.cpp index c74b9d6caff2e..480678d598577 100644 --- a/source/loader/ur_ldrddi.cpp +++ b/source/loader/ur_ldrddi.cpp @@ -8844,6 +8844,7 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp( /// @brief Intercept function for 
urKernelSuggestMaxCooperativeGroupCountExp __urdlllocal ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( ur_kernel_handle_t hKernel, ///< [in] handle of the kernel object + ur_device_handle_t hDevice, ///< [in] handle of the device object uint32_t workDim, ///< [in] number of dimensions, from 1 to 3, to specify the work-group ///< work-items @@ -8871,9 +8872,12 @@ __urdlllocal ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( // convert loader handle to platform handle hKernel = reinterpret_cast(hKernel)->handle; + // convert loader handle to platform handle + hDevice = reinterpret_cast(hDevice)->handle; + // forward to device-platform result = pfnSuggestMaxCooperativeGroupCountExp( - hKernel, workDim, pLocalWorkSize, dynamicSharedMemorySize, + hKernel, hDevice, workDim, pLocalWorkSize, dynamicSharedMemorySize, pGroupCountRet); return result; diff --git a/source/loader/ur_libapi.cpp b/source/loader/ur_libapi.cpp index e257366a7f4c7..fc24d9347b9af 100644 --- a/source/loader/ur_libapi.cpp +++ b/source/loader/ur_libapi.cpp @@ -8935,12 +8935,14 @@ ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp( /// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC /// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE /// + `NULL == hKernel` +/// + `NULL == hDevice` /// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER /// + `NULL == pLocalWorkSize` /// + `NULL == pGroupCountRet` /// - ::UR_RESULT_ERROR_INVALID_KERNEL ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( ur_kernel_handle_t hKernel, ///< [in] handle of the kernel object + ur_device_handle_t hDevice, ///< [in] handle of the device object uint32_t workDim, ///< [in] number of dimensions, from 1 to 3, to specify the work-group ///< work-items @@ -8961,7 +8963,7 @@ ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( } return pfnSuggestMaxCooperativeGroupCountExp( - hKernel, workDim, pLocalWorkSize, dynamicSharedMemorySize, + hKernel, hDevice, workDim, pLocalWorkSize, 
dynamicSharedMemorySize, pGroupCountRet); } catch (...) { return exceptionToResult(std::current_exception()); diff --git a/source/ur_api.cpp b/source/ur_api.cpp index 793045bcb48e4..eb3f20c77b50b 100644 --- a/source/ur_api.cpp +++ b/source/ur_api.cpp @@ -7578,12 +7578,14 @@ ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp( /// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC /// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE /// + `NULL == hKernel` +/// + `NULL == hDevice` /// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER /// + `NULL == pLocalWorkSize` /// + `NULL == pGroupCountRet` /// - ::UR_RESULT_ERROR_INVALID_KERNEL ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( ur_kernel_handle_t hKernel, ///< [in] handle of the kernel object + ur_device_handle_t hDevice, ///< [in] handle of the device object uint32_t workDim, ///< [in] number of dimensions, from 1 to 3, to specify the work-group ///< work-items