diff --git a/include/ur_api.h b/include/ur_api.h index 96a58e04ea..9d88eecbc6 100644 --- a/include/ur_api.h +++ b/include/ur_api.h @@ -223,6 +223,7 @@ typedef enum ur_function_t { UR_FUNCTION_COMMAND_BUFFER_GET_INFO_EXP = 221, ///< Enumerator for ::urCommandBufferGetInfoExp UR_FUNCTION_COMMAND_BUFFER_COMMAND_GET_INFO_EXP = 222, ///< Enumerator for ::urCommandBufferCommandGetInfoExp UR_FUNCTION_ENQUEUE_TIMESTAMP_RECORDING_EXP = 223, ///< Enumerator for ::urEnqueueTimestampRecordingExp + UR_FUNCTION_ENQUEUE_KERNEL_LAUNCH_CUSTOM_EXP = 224, ///< Enumerator for ::urEnqueueKernelLaunchCustomExp /// @cond UR_FUNCTION_FORCE_UINT32 = 0x7fffffff /// @endcond @@ -8935,6 +8936,133 @@ urEnqueueTimestampRecordingExp( ///< reports the timestamp recorded when the command is executed on the device. ); +#if !defined(__GNUC__) +#pragma endregion +#endif +// Intel 'oneAPI' Unified Runtime Experimental APIs for (kernel) Launch Properties +#if !defined(__GNUC__) +#pragma region launch properties(experimental) +#endif +/////////////////////////////////////////////////////////////////////////////// +#ifndef UR_LAUNCH_PROPERTIES_EXTENSION_STRING_EXP +/// @brief The extension string that defines support for the Launch Properties +/// extension, which is returned when querying device extensions. 
+#define UR_LAUNCH_PROPERTIES_EXTENSION_STRING_EXP "ur_exp_launch_properties" +#endif // UR_LAUNCH_PROPERTIES_EXTENSION_STRING_EXP + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Specifies a launch property id +/// +/// @remarks +/// _Analogues_ +/// - **CUlaunchAttributeID** +typedef enum ur_exp_launch_property_id_t { + UR_EXP_LAUNCH_PROPERTY_ID_IGNORE = 0, ///< The property has no effect + UR_EXP_LAUNCH_PROPERTY_ID_COOPERATIVE = 1, ///< Whether to launch a cooperative kernel + UR_EXP_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION = 2, ///< work-group cluster dimensions + /// @cond + UR_EXP_LAUNCH_PROPERTY_ID_FORCE_UINT32 = 0x7fffffff + /// @endcond + +} ur_exp_launch_property_id_t; + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Specifies a launch property value +/// +/// @remarks +/// _Analogues_ +/// - **CUlaunchAttributeValue** +typedef union ur_exp_launch_property_value_t { + uint32_t clusterDim[3]; ///< [in] dimensions of the cluster (units of work-group) (x, y, z). Each + ///< value must be a divisor of the corresponding global work-size + ///< dimension (in units of work-group). 
+ int cooperative; ///< [in] non-zero value indicates a cooperative kernel + +} ur_exp_launch_property_value_t; + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Kernel launch property +/// +/// @remarks +/// _Analogues_ +/// - **cuLaunchAttribute** +typedef struct ur_exp_launch_property_t { + ur_exp_launch_property_id_t id; ///< [in] launch property id + ur_exp_launch_property_value_t value; ///< [in][tagged_by(id)] launch property value + +} ur_exp_launch_property_t; + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Launch kernel with custom launch properties +/// +/// @details +/// - Launches the kernel using the specified launch properties +/// - If numPropsInLaunchPropList == 0 then a regular kernel launch is used: +/// `urEnqueueKernelLaunch` +/// - Consult the appropriate adapter driver documentation for details of +/// adapter specific behavior and native error codes that may be returned. +/// +/// @remarks +/// _Analogues_ +/// - **cuLaunchKernelEx** +/// +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_UNINITIALIZED +/// - ::UR_RESULT_ERROR_DEVICE_LOST +/// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC +/// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE +/// + `NULL == hQueue` +/// + `NULL == hKernel` +/// + NULL == hQueue +/// + NULL == hKernel +/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER +/// + `NULL == pGlobalWorkSize` +/// + `NULL == launchPropList` +/// + NULL == pGlobalWorkSize +/// + numPropsInLaunchpropList != 0 && launchPropList == NULL +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_UNINITIALIZED +/// - ::UR_RESULT_ERROR_DEVICE_LOST +/// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC +/// - ::UR_RESULT_ERROR_INVALID_QUEUE +/// - ::UR_RESULT_ERROR_INVALID_KERNEL +/// - ::UR_RESULT_ERROR_INVALID_EVENT +/// - ::UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST +/// + phEventWaitList == NULL && numEventsInWaitList > 0 +/// + phEventWaitList != NULL && numEventsInWaitList == 
0 +/// + If event objects in phEventWaitList are not valid events. +/// - ::UR_RESULT_ERROR_IN_EVENT_LIST_EXEC_STATUS +/// + An event in phEventWaitList has ::UR_EVENT_STATUS_ERROR +/// - ::UR_RESULT_ERROR_INVALID_WORK_DIMENSION +/// - ::UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE +/// - ::UR_RESULT_ERROR_INVALID_VALUE +/// - ::UR_RESULT_ERROR_OUT_OF_HOST_MEMORY +/// - ::UR_RESULT_ERROR_OUT_OF_RESOURCES +UR_APIEXPORT ur_result_t UR_APICALL +urEnqueueKernelLaunchCustomExp( + ur_queue_handle_t hQueue, ///< [in] handle of the queue object + ur_kernel_handle_t hKernel, ///< [in] handle of the kernel object + uint32_t workDim, ///< [in] number of dimensions, from 1 to 3, to specify the global and + ///< work-group work-items + const size_t *pGlobalWorkSize, ///< [in] pointer to an array of workDim unsigned values that specify the + ///< number of global work-items in workDim that will execute the kernel + ///< function + const size_t *pLocalWorkSize, ///< [in][optional] pointer to an array of workDim unsigned values that + ///< specify the number of local work-items forming a work-group that will + ///< execute the kernel function. If nullptr, the runtime implementation + ///< will choose the work-group size. + uint32_t numPropsInLaunchPropList, ///< [in] size of the launch prop list + const ur_exp_launch_property_t *launchPropList, ///< [in][range(0, numPropsInLaunchPropList)] pointer to a list of launch + ///< properties + uint32_t numEventsInWaitList, ///< [in] size of the event wait list + const ur_event_handle_t *phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the kernel execution. If nullptr, + ///< the numEventsInWaitList must be 0, indicating that no wait event. + ur_event_handle_t *phEvent ///< [out][optional] return an event object that identifies this particular + ///< kernel execution instance. 
+); + #if !defined(__GNUC__) #pragma endregion #endif @@ -10629,6 +10757,23 @@ typedef struct ur_enqueue_write_host_pipe_params_t { ur_event_handle_t **pphEvent; } ur_enqueue_write_host_pipe_params_t; +/////////////////////////////////////////////////////////////////////////////// +/// @brief Function parameters for urEnqueueKernelLaunchCustomExp +/// @details Each entry is a pointer to the parameter passed to the function; +/// allowing the callback the ability to modify the parameter's value +typedef struct ur_enqueue_kernel_launch_custom_exp_params_t { + ur_queue_handle_t *phQueue; + ur_kernel_handle_t *phKernel; + uint32_t *pworkDim; + const size_t **ppGlobalWorkSize; + const size_t **ppLocalWorkSize; + uint32_t *pnumPropsInLaunchPropList; + const ur_exp_launch_property_t **plaunchPropList; + uint32_t *pnumEventsInWaitList; + const ur_event_handle_t **pphEventWaitList; + ur_event_handle_t **pphEvent; +} ur_enqueue_kernel_launch_custom_exp_params_t; + /////////////////////////////////////////////////////////////////////////////// /// @brief Function parameters for urEnqueueCooperativeKernelLaunchExp /// @details Each entry is a pointer to the parameter passed to the function; diff --git a/include/ur_ddi.h b/include/ur_ddi.h index 52ae226c2a..fb1f1823b3 100644 --- a/include/ur_ddi.h +++ b/include/ur_ddi.h @@ -1435,6 +1435,20 @@ typedef ur_result_t(UR_APICALL *ur_pfnGetEnqueueProcAddrTable_t)( ur_api_version_t, ur_enqueue_dditable_t *); +/////////////////////////////////////////////////////////////////////////////// +/// @brief Function-pointer for urEnqueueKernelLaunchCustomExp +typedef ur_result_t(UR_APICALL *ur_pfnEnqueueKernelLaunchCustomExp_t)( + ur_queue_handle_t, + ur_kernel_handle_t, + uint32_t, + const size_t *, + const size_t *, + uint32_t, + const ur_exp_launch_property_t *, + uint32_t, + const ur_event_handle_t *, + ur_event_handle_t *); + /////////////////////////////////////////////////////////////////////////////// /// @brief Function-pointer for 
urEnqueueCooperativeKernelLaunchExp typedef ur_result_t(UR_APICALL *ur_pfnEnqueueCooperativeKernelLaunchExp_t)( @@ -1460,6 +1474,7 @@ typedef ur_result_t(UR_APICALL *ur_pfnEnqueueTimestampRecordingExp_t)( /////////////////////////////////////////////////////////////////////////////// /// @brief Table of EnqueueExp functions pointers typedef struct ur_enqueue_exp_dditable_t { + ur_pfnEnqueueKernelLaunchCustomExp_t pfnKernelLaunchCustomExp; ur_pfnEnqueueCooperativeKernelLaunchExp_t pfnCooperativeKernelLaunchExp; ur_pfnEnqueueTimestampRecordingExp_t pfnTimestampRecordingExp; } ur_enqueue_exp_dditable_t; diff --git a/include/ur_print.h b/include/ur_print.h index 3377980ce7..753875ace9 100644 --- a/include/ur_print.h +++ b/include/ur_print.h @@ -1002,6 +1002,22 @@ UR_APIEXPORT ur_result_t UR_APICALL urPrintExpCommandBufferUpdateValueArgDesc(co /// - `buff_size < out_size` UR_APIEXPORT ur_result_t UR_APICALL urPrintExpCommandBufferUpdateKernelLaunchDesc(const struct ur_exp_command_buffer_update_kernel_launch_desc_t params, char *buffer, const size_t buff_size, size_t *out_size); +/////////////////////////////////////////////////////////////////////////////// +/// @brief Print ur_exp_launch_property_id_t enum +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_INVALID_SIZE +/// - `buff_size < out_size` +UR_APIEXPORT ur_result_t UR_APICALL urPrintExpLaunchPropertyId(enum ur_exp_launch_property_id_t value, char *buffer, const size_t buff_size, size_t *out_size); + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Print ur_exp_launch_property_t struct +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_INVALID_SIZE +/// - `buff_size < out_size` +UR_APIEXPORT ur_result_t UR_APICALL urPrintExpLaunchProperty(const struct ur_exp_launch_property_t params, char *buffer, const size_t buff_size, size_t *out_size); + /////////////////////////////////////////////////////////////////////////////// /// @brief Print 
ur_exp_peer_info_t enum /// @returns @@ -1946,6 +1962,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urPrintEnqueueReadHostPipeParams(const struc /// - `buff_size < out_size` UR_APIEXPORT ur_result_t UR_APICALL urPrintEnqueueWriteHostPipeParams(const struct ur_enqueue_write_host_pipe_params_t *params, char *buffer, const size_t buff_size, size_t *out_size); +/////////////////////////////////////////////////////////////////////////////// +/// @brief Print ur_enqueue_kernel_launch_custom_exp_params_t struct +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_INVALID_SIZE +/// - `buff_size < out_size` +UR_APIEXPORT ur_result_t UR_APICALL urPrintEnqueueKernelLaunchCustomExpParams(const struct ur_enqueue_kernel_launch_custom_exp_params_t *params, char *buffer, const size_t buff_size, size_t *out_size); + /////////////////////////////////////////////////////////////////////////////// /// @brief Print ur_enqueue_cooperative_kernel_launch_exp_params_t struct /// @returns diff --git a/include/ur_print.hpp b/include/ur_print.hpp index bed5f01670..db230c91d7 100644 --- a/include/ur_print.hpp +++ b/include/ur_print.hpp @@ -207,6 +207,11 @@ inline ur_result_t printTagged(std::ostream &os, const void *ptr, ur_exp_command template <> inline ur_result_t printTagged(std::ostream &os, const void *ptr, ur_exp_command_buffer_command_info_t value, size_t size); +inline ur_result_t printUnion( + std::ostream &os, + const union ur_exp_launch_property_value_t params, + const enum ur_exp_launch_property_id_t tag); + template <> inline ur_result_t printTagged(std::ostream &os, const void *ptr, ur_exp_peer_info_t value, size_t size); @@ -335,6 +340,8 @@ inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct ur_exp_command_buffer_update_pointer_arg_desc_t params); inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct 
ur_exp_command_buffer_update_value_arg_desc_t params); inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct ur_exp_command_buffer_update_kernel_launch_desc_t params); +inline std::ostream &operator<<(std::ostream &os, enum ur_exp_launch_property_id_t value); +inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct ur_exp_launch_property_t params); inline std::ostream &operator<<(std::ostream &os, enum ur_exp_peer_info_t value); /////////////////////////////////////////////////////////////////////////////// @@ -919,6 +926,9 @@ inline std::ostream &operator<<(std::ostream &os, enum ur_function_t value) { case UR_FUNCTION_ENQUEUE_TIMESTAMP_RECORDING_EXP: os << "UR_FUNCTION_ENQUEUE_TIMESTAMP_RECORDING_EXP"; break; + case UR_FUNCTION_ENQUEUE_KERNEL_LAUNCH_CUSTOM_EXP: + os << "UR_FUNCTION_ENQUEUE_KERNEL_LAUNCH_CUSTOM_EXP"; + break; default: os << "unknown enumerator"; break; @@ -9838,6 +9848,84 @@ inline std::ostream &operator<<(std::ostream &os, const struct ur_exp_command_bu return os; } /////////////////////////////////////////////////////////////////////////////// +/// @brief Print operator for the ur_exp_launch_property_id_t type +/// @returns +/// std::ostream & +inline std::ostream &operator<<(std::ostream &os, enum ur_exp_launch_property_id_t value) { + switch (value) { + case UR_EXP_LAUNCH_PROPERTY_ID_IGNORE: + os << "UR_EXP_LAUNCH_PROPERTY_ID_IGNORE"; + break; + case UR_EXP_LAUNCH_PROPERTY_ID_COOPERATIVE: + os << "UR_EXP_LAUNCH_PROPERTY_ID_COOPERATIVE"; + break; + case UR_EXP_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION: + os << "UR_EXP_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION"; + break; + default: + os << "unknown enumerator"; + break; + } + return os; +} +namespace ur::details { + +/////////////////////////////////////////////////////////////////////////////// +// @brief Print ur_exp_launch_property_value_t union +inline ur_result_t printUnion( + std::ostream &os, + const union ur_exp_launch_property_value_t params, + 
const enum ur_exp_launch_property_id_t tag) { + os << "(union ur_exp_launch_property_value_t){"; + + switch (tag) { + case UR_EXP_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION: + + os << ".clusterDim = {"; + for (auto i = 0; i < 3; i++) { + if (i != 0) { + os << ", "; + } + + os << (params.clusterDim[i]); + } + os << "}"; + + break; + case UR_EXP_LAUNCH_PROPERTY_ID_COOPERATIVE: + + os << ".cooperative = "; + + os << (params.cooperative); + + break; + default: + os << ""; + return UR_RESULT_ERROR_INVALID_ENUMERATION; + } + os << "}"; + return UR_RESULT_SUCCESS; +} +} // namespace ur::details +/////////////////////////////////////////////////////////////////////////////// +/// @brief Print operator for the ur_exp_launch_property_t type +/// @returns +/// std::ostream & +inline std::ostream &operator<<(std::ostream &os, const struct ur_exp_launch_property_t params) { + os << "(struct ur_exp_launch_property_t){"; + + os << ".id = "; + + os << (params.id); + + os << ", "; + os << ".value = "; + ur::details::printUnion(os, (params.value), params.id); + + os << "}"; + return os; +} +/////////////////////////////////////////////////////////////////////////////// /// @brief Print operator for the ur_exp_peer_info_t type /// @returns /// std::ostream & @@ -14059,6 +14147,82 @@ inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct return os; } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Print operator for the ur_enqueue_kernel_launch_custom_exp_params_t type +/// @returns +/// std::ostream & +inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct ur_enqueue_kernel_launch_custom_exp_params_t *params) { + + os << ".hQueue = "; + + ur::details::printPtr(os, + *(params->phQueue)); + + os << ", "; + os << ".hKernel = "; + + ur::details::printPtr(os, + *(params->phKernel)); + + os << ", "; + os << ".workDim = "; + + os << *(params->pworkDim); + + os << ", "; + os << ".pGlobalWorkSize = "; + + 
ur::details::printPtr(os, + *(params->ppGlobalWorkSize)); + + os << ", "; + os << ".pLocalWorkSize = "; + + ur::details::printPtr(os, + *(params->ppLocalWorkSize)); + + os << ", "; + os << ".numPropsInLaunchPropList = "; + + os << *(params->pnumPropsInLaunchPropList); + + os << ", "; + os << ".launchPropList = {"; + for (size_t i = 0; *(params->plaunchPropList) != NULL && i < *params->pnumPropsInLaunchPropList; ++i) { + if (i != 0) { + os << ", "; + } + + os << (*(params->plaunchPropList))[i]; + } + os << "}"; + + os << ", "; + os << ".numEventsInWaitList = "; + + os << *(params->pnumEventsInWaitList); + + os << ", "; + os << ".phEventWaitList = {"; + for (size_t i = 0; *(params->pphEventWaitList) != NULL && i < *params->pnumEventsInWaitList; ++i) { + if (i != 0) { + os << ", "; + } + + ur::details::printPtr(os, + (*(params->pphEventWaitList))[i]); + } + os << "}"; + + os << ", "; + os << ".phEvent = "; + + ur::details::printPtr(os, + *(params->pphEvent)); + + return os; +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Print operator for the ur_enqueue_cooperative_kernel_launch_exp_params_t type /// @returns @@ -17174,6 +17338,9 @@ inline ur_result_t UR_APICALL printFunctionParams(std::ostream &os, ur_function_ case UR_FUNCTION_ENQUEUE_WRITE_HOST_PIPE: { os << (const struct ur_enqueue_write_host_pipe_params_t *)params; } break; + case UR_FUNCTION_ENQUEUE_KERNEL_LAUNCH_CUSTOM_EXP: { + os << (const struct ur_enqueue_kernel_launch_custom_exp_params_t *)params; + } break; case UR_FUNCTION_ENQUEUE_COOPERATIVE_KERNEL_LAUNCH_EXP: { os << (const struct ur_enqueue_cooperative_kernel_launch_exp_params_t *)params; } break; diff --git a/scripts/core/EXP-LAUNCH-PROPERTIES.rst b/scripts/core/EXP-LAUNCH-PROPERTIES.rst new file mode 100644 index 0000000000..a0a116b293 --- /dev/null +++ b/scripts/core/EXP-LAUNCH-PROPERTIES.rst @@ -0,0 +1,102 @@ +<% + OneApi=tags['$OneApi'] + x=tags['$x'] + X=x.upper() +%> + +.. 
_experimental-launch-properties: + +================================================================================ +LAUNCH Properties +================================================================================ + +.. warning:: + + Experimental features: + + * May be replaced, updated, or removed at any time. + * Do not require maintaining API/ABI stability of their own additions over + time. + * Do not require conformance testing of their own additions. + + +Terminology +-------------------------------------------------------------------------------- +"Launch Properties" is used to indicate optional kernel launch properties that +can be specified at the time of a kernel launch. Such properties can be used to +enable hardware specific kernel launch features. + +Motivation +-------------------------------------------------------------------------------- +Advances in hardware sometimes require new kernel properties. One example is +distributed shared memory as used by Nvidia Hopper GPUs. Launching a kernel +that supports distributed shared memory requires specifying a set of "cluster" +dimensions, in units of work-groups, over which the shared memory is +"distributed". Additionally some applications require specification of kernel +properties at launch-time. + +This extension is a future-proof and portable solution that supports these two +requirements. Instead of using a fixed set of kernel enqueue arguments, the +approach is to introduce the ${x}_exp_launch_property_t type that enables a +more extendable API. + +Each ${x}_exp_launch_property_t instance corresponds to a specific kernel +launch property. +Only one new function is introduced: ${x}EnqueueKernelLaunchCustomExp. +${x}EnqueueKernelLaunchCustomExp takes an array of ${x}_exp_launch_property_t +as an argument, and launches a kernel using these properties. +${x}EnqueueKernelLaunchCustomExp corresponds closely to the CUDA Driver API +``cuLaunchKernelEx``. 
+ +Many kernel launch properties can be supported, such as cooperative kernel +launches. As such, eventually this extension should be able to replace the +cooperative kernels Unified-Runtime extension. + +API +-------------------------------------------------------------------------------- + +Macros +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +* ${X}_LAUNCH_PROPERTIES_EXTENSION_STRING_EXP + +Enums +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +* ${x}_exp_launch_property_id_t + +Unions +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +* ${x}_exp_launch_property_value_t + +Structs +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +* ${x}_exp_launch_property_t + +Functions +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +* ${x}EnqueueKernelLaunchCustomExp + +Support +-------------------------------------------------------------------------------- + +Adapters which support this experimental feature *must* return the valid string +defined in ${X}_LAUNCH_PROPERTIES_EXTENSION_STRING_EXP as one of the options from +${x}DeviceGetInfo when querying for ${X}_DEVICE_INFO_EXTENSIONS. 
+ +Changelog +-------------------------------------------------------------------------------- + ++-----------+---------------------------------------------+ +| Revision | Changes | ++===========+=============================================+ +| 1.0 | Initial Draft | ++-----------+---------------------------------------------+ + +Contributors +-------------------------------------------------------------------------------- + +* JackAKirk `jack.kirk@codeplay.com <jack.kirk@codeplay.com>`_ diff --git a/scripts/core/exp-launch-properties.yml b/scripts/core/exp-launch-properties.yml new file mode 100644 index 0000000000..05f90aa7d2 --- /dev/null +++ b/scripts/core/exp-launch-properties.yml @@ -0,0 +1,131 @@ +# +# Copyright (C) 2024 Intel Corporation +# +# Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. +# See LICENSE.TXT +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# See YaML.md for syntax definition +# +--- #-------------------------------------------------------------------------- +type: header +desc: "Intel $OneApi Unified Runtime Experimental APIs for (kernel) Launch Properties" +ordinal: "99" +--- #-------------------------------------------------------------------------- +type: macro +desc: "The extension string that defines support for the Launch Properties extension, which is returned when querying device extensions." 
+name: $X_LAUNCH_PROPERTIES_EXTENSION_STRING_EXP +value: "\"$x_exp_launch_properties\"" +--- #-------------------------------------------------------------------------- +type: enum +desc: "Specifies a launch property id" +name: $x_exp_launch_property_id_t +analogue: + - "**CUlaunchAttributeID**" +etors: + - name: IGNORE + desc: "The property has no effect" + - name: COOPERATIVE + desc: "Whether to launch a cooperative kernel" + - name: CLUSTER_DIMENSION + desc: "work-group cluster dimensions" +--- #-------------------------------------------------------------------------- +type: union +desc: "Specifies a launch property value" +name: $x_exp_launch_property_value_t +tag: $x_exp_launch_property_id_t +analogue: + - "**CUlaunchAttributeValue**" +members: + - type: uint32_t[3] + name: clusterDim + desc: "[in] dimensions of the cluster (units of work-group) (x, y, z). Each value must be a divisor of the corresponding global work-size dimension (in units of work-group)." + tag: $X_EXP_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION + - type: int + name: cooperative + desc: "[in] non-zero value indicates a cooperative kernel" + tag: $X_EXP_LAUNCH_PROPERTY_ID_COOPERATIVE +--- #-------------------------------------------------------------------------- +type: struct +desc: "Kernel launch property" +name: $x_exp_launch_property_t +analogue: + - "**cuLaunchAttribute**" +members: + - type: $x_exp_launch_property_id_t + name: id + desc: "[in] launch property id" + init: $X_EXP_LAUNCH_PROPERTY_ID_IGNORE + - type: $x_exp_launch_property_value_t + name: value + desc: "[in][tagged_by(id)] launch property value" + init: nullptr +--- #-------------------------------------------------------------------------- +type: function +desc: "Launch kernel with custom launch properties" +class: $xEnqueue +name: KernelLaunchCustomExp +ordinal: "0" +analogue: + - "**cuLaunchKernelEx**" +details: + - "Launches the kernel using the specified launch properties" + - "If numPropsInLaunchPropList == 0 then a 
regular kernel launch is used: `urEnqueueKernelLaunch`" + - "Consult the appropriate adapter driver documentation for details of adapter specific behavior and native error codes that may be returned." +params: + - type: $x_queue_handle_t + name: hQueue + desc: "[in] handle of the queue object" + - type: $x_kernel_handle_t + name: hKernel + desc: "[in] handle of the kernel object" + - type: uint32_t + name: workDim + desc: "[in] number of dimensions, from 1 to 3, to specify the global and work-group work-items" + - type: const size_t* + name: pGlobalWorkSize + desc: "[in] pointer to an array of workDim unsigned values that specify the number of global work-items in workDim that will execute the kernel function" + - type: const size_t* + name: pLocalWorkSize + desc: "[in][optional] pointer to an array of workDim unsigned values that specify the number of local work-items forming a work-group that will execute the kernel function. If nullptr, the runtime implementation will choose the work-group size." + - type: uint32_t + name: numPropsInLaunchPropList + desc: "[in] size of the launch prop list" + - type: const $x_exp_launch_property_t* + name: launchPropList + desc: "[in][range(0, numPropsInLaunchPropList)] pointer to a list of launch properties" + - type: uint32_t + name: numEventsInWaitList + desc: "[in] size of the event wait list" + - type: const ur_event_handle_t* + name: phEventWaitList + desc: "[in][optional][range(0, numEventsInWaitList)] pointer to a list of events that must be complete before the kernel execution. If nullptr, the numEventsInWaitList must be 0, indicating that no wait event. " + - type: ur_event_handle_t* + name: phEvent + desc: "[out][optional] return an event object that identifies this particular kernel execution instance." 
+returns: + - $X_RESULT_SUCCESS + - $X_RESULT_ERROR_UNINITIALIZED + - $X_RESULT_ERROR_DEVICE_LOST + - $X_RESULT_ERROR_ADAPTER_SPECIFIC + - $X_RESULT_ERROR_INVALID_NULL_HANDLE: + - "NULL == hQueue" + - "NULL == hKernel" + - $X_RESULT_ERROR_INVALID_NULL_POINTER: + - "NULL == pGlobalWorkSize" + - "numPropsInLaunchpropList != 0 && launchPropList == NULL" + - $X_RESULT_ERROR_INVALID_QUEUE + - $X_RESULT_ERROR_INVALID_KERNEL + - $X_RESULT_ERROR_INVALID_EVENT + - $X_RESULT_ERROR_INVALID_EVENT_WAIT_LIST: + - "phEventWaitList == NULL && numEventsInWaitList > 0" + - "phEventWaitList != NULL && numEventsInWaitList == 0" + - "If event objects in phEventWaitList are not valid events." + - $X_RESULT_ERROR_IN_EVENT_LIST_EXEC_STATUS: + - "An event in phEventWaitList has $X_EVENT_STATUS_ERROR" + - $X_RESULT_ERROR_INVALID_WORK_DIMENSION + - $X_RESULT_ERROR_INVALID_WORK_GROUP_SIZE + - $X_RESULT_ERROR_INVALID_VALUE + - $X_RESULT_ERROR_OUT_OF_HOST_MEMORY + - $X_RESULT_ERROR_OUT_OF_RESOURCES + diff --git a/scripts/core/registry.yml b/scripts/core/registry.yml index 6a551d5821..b0a61e7f88 100644 --- a/scripts/core/registry.yml +++ b/scripts/core/registry.yml @@ -583,6 +583,9 @@ etors: - name: ENQUEUE_TIMESTAMP_RECORDING_EXP desc: Enumerator for $xEnqueueTimestampRecordingExp value: '223' +- name: ENQUEUE_KERNEL_LAUNCH_CUSTOM_EXP + desc: Enumerator for $xEnqueueKernelLaunchCustomExp + value: '224' --- type: enum desc: Defines structure types diff --git a/source/adapters/cuda/device.cpp b/source/adapters/cuda/device.cpp index 3a94587d1f..e6389c5ee2 100644 --- a/source/adapters/cuda/device.cpp +++ b/source/adapters/cuda/device.cpp @@ -628,6 +628,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, // Return supported for the UR command-buffer experimental feature SupportedExtensions += "ur_exp_command_buffer "; SupportedExtensions += "ur_exp_usm_p2p "; + SupportedExtensions += "ur_exp_launch_properties "; SupportedExtensions += " "; int Major = 0; diff --git 
a/source/adapters/cuda/enqueue.cpp b/source/adapters/cuda/enqueue.cpp
index 9627fc6da2..279426a41e 100644
--- a/source/adapters/cuda/enqueue.cpp
+++ b/source/adapters/cuda/enqueue.cpp
@@ -540,6 +540,202 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp(
       numEventsInWaitList, phEventWaitList, phEvent);
 }
 
+/// Enqueues a kernel launch that carries extra launch properties
+/// (cooperative launch, work-group cluster dimensions) by translating them
+/// into CUlaunchAttribute entries and launching through cuLaunchKernelEx.
+///
+/// With an empty property list the call is forwarded unchanged to
+/// urEnqueueKernelLaunch and that call's result is returned.
+///
+/// \return UR_RESULT_ERROR_INVALID_NULL_POINTER if properties are announced
+///         but launchPropList is null, UR_RESULT_ERROR_INVALID_ENUMERATION
+///         for an unknown property id, otherwise the result of the launch.
+UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
+    ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim,
+    const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize,
+    uint32_t numPropsInLaunchPropList,
+    const ur_exp_launch_property_t *launchPropList,
+    uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
+    ur_event_handle_t *phEvent) {
+
+  // No custom properties: this is an ordinary launch, so delegate and
+  // propagate its result instead of falling through to the code below.
+  if (numPropsInLaunchPropList == 0) {
+    return urEnqueueKernelLaunch(hQueue, hKernel, workDim, nullptr,
+                                 pGlobalWorkSize, pLocalWorkSize,
+                                 numEventsInWaitList, phEventWaitList, phEvent);
+  }
+
+  // Preconditions
+  UR_ASSERT(hQueue->getDevice() == hKernel->getProgram()->getDevice(),
+            UR_RESULT_ERROR_INVALID_KERNEL);
+  UR_ASSERT(workDim > 0, UR_RESULT_ERROR_INVALID_WORK_DIMENSION);
+  UR_ASSERT(workDim < 4, UR_RESULT_ERROR_INVALID_WORK_DIMENSION);
+
+  if (launchPropList == nullptr) {
+    return UR_RESULT_ERROR_INVALID_NULL_POINTER;
+  }
+
+  std::vector<CUlaunchAttribute> launch_attribute(numPropsInLaunchPropList);
+  for (uint32_t i = 0; i < numPropsInLaunchPropList; i++) {
+    switch (launchPropList[i].id) {
+    case UR_EXP_LAUNCH_PROPERTY_ID_IGNORE: {
+      launch_attribute[i].id = CU_LAUNCH_ATTRIBUTE_IGNORE;
+      break;
+    }
+    case UR_EXP_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION: {
+
+      launch_attribute[i].id = CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION;
+      // Note that cuda orders from right to left wrt SYCL dimensional order.
+      launch_attribute[i].value.clusterDim.x =
+          launchPropList[i].value.clusterDim[2];
+      launch_attribute[i].value.clusterDim.y =
+          launchPropList[i].value.clusterDim[1];
+      launch_attribute[i].value.clusterDim.z =
+          launchPropList[i].value.clusterDim[0];
+      break;
+    }
+    case UR_EXP_LAUNCH_PROPERTY_ID_COOPERATIVE: {
+      launch_attribute[i].id = CU_LAUNCH_ATTRIBUTE_COOPERATIVE;
+      launch_attribute[i].value.cooperative =
+          launchPropList[i].value.cooperative;
+      break;
+    }
+    default: {
+      return UR_RESULT_ERROR_INVALID_ENUMERATION;
+    }
+    }
+  }
+
+  std::vector<ur_event_handle_t> DepEvents(
+      phEventWaitList, phEventWaitList + numEventsInWaitList);
+  std::vector<std::pair<ur_mem_handle_t, ur_lock>> MemMigrationLocks;
+
+  // phEventWaitList only contains events that are handed to UR by the SYCL
+  // runtime. However since UR handles memory dependencies within a context
+  // we may need to add more events to our dependent events list if the UR
+  // context contains multiple devices
+  if (hQueue->getContext()->Devices.size() > 1) {
+    MemMigrationLocks.reserve(hKernel->Args.MemObjArgs.size());
+    for (auto &MemArg : hKernel->Args.MemObjArgs) {
+      bool PushBack = false;
+      if (auto MemDepEvent = MemArg.Mem->LastEventWritingToMemObj;
+          MemDepEvent && std::find(DepEvents.begin(), DepEvents.end(),
+                                   MemDepEvent) == DepEvents.end()) {
+        DepEvents.push_back(MemDepEvent);
+        PushBack = true;
+      }
+      if ((MemArg.AccessFlags &
+           (UR_MEM_FLAG_READ_WRITE | UR_MEM_FLAG_WRITE_ONLY)) ||
+          PushBack) {
+        if (std::find_if(MemMigrationLocks.begin(), MemMigrationLocks.end(),
+                         [MemArg](auto &Lock) {
+                           return Lock.first == MemArg.Mem;
+                         }) == MemMigrationLocks.end())
+          MemMigrationLocks.emplace_back(
+              std::pair{MemArg.Mem, ur_lock{MemArg.Mem->MemoryMigrationMutex}});
+      }
+    }
+  }
+
+  // Early exit for zero size kernel
+  if (*pGlobalWorkSize == 0) {
+    return urEnqueueEventsWaitWithBarrier(hQueue, numEventsInWaitList,
+                                          phEventWaitList, phEvent);
+  }
+
+  // Set the number of threads per block to the number of threads per warp
+  // by default unless user has provided a better number
+  size_t ThreadsPerBlock[3] = {32u, 1u, 1u};
+  size_t BlocksPerGrid[3] = {1u, 1u, 1u};
+
+  uint32_t LocalSize = hKernel->getLocalSize();
+  CUfunction CuFunc = hKernel->get();
+
+  // This might return UR_RESULT_ERROR_ADAPTER_SPECIFIC, which cannot be handled
+  // using the standard UR_CHECK_ERROR
+  if (ur_result_t Ret =
+          setKernelParams(hQueue->getContext(), hQueue->Device, workDim,
+                          nullptr, pGlobalWorkSize, pLocalWorkSize, hKernel,
+                          CuFunc, ThreadsPerBlock, BlocksPerGrid);
+      Ret != UR_RESULT_SUCCESS)
+    return Ret;
+
+  try {
+    std::unique_ptr<ur_event_handle_t_> RetImplEvent{nullptr};
+
+    ScopedContext Active(hQueue->getDevice());
+    uint32_t StreamToken;
+    ur_stream_guard_ Guard;
+    CUstream CuStream = hQueue->getNextComputeStream(
+        numEventsInWaitList, phEventWaitList, Guard, &StreamToken);
+
+    if (DepEvents.size()) {
+      UR_CHECK_ERROR(enqueueEventsWait(hQueue, CuStream, DepEvents.size(),
+                                       DepEvents.data()));
+    }
+
+    // For memory migration across devices in the same context
+    if (hQueue->getContext()->Devices.size() > 1) {
+      for (auto &MemArg : hKernel->Args.MemObjArgs) {
+        migrateMemoryToDeviceIfNeeded(MemArg.Mem, hQueue->getDevice());
+      }
+    }
+
+    if (phEvent) {
+      RetImplEvent =
+          std::unique_ptr<ur_event_handle_t_>(ur_event_handle_t_::makeNative(
+              UR_COMMAND_KERNEL_LAUNCH, hQueue, CuStream, StreamToken));
+      UR_CHECK_ERROR(RetImplEvent->start());
+    }
+
+    // Once event has been started we can unlock MemoryMigrationMutex
+    if (hQueue->getContext()->Devices.size() > 1) {
+      for (auto &MemArg : hKernel->Args.MemObjArgs) {
+        // Telling the ur_mem_handle_t that it will need to wait on this kernel
+        // if it has been written to
+        if (phEvent && (MemArg.AccessFlags &
+                        (UR_MEM_FLAG_READ_WRITE | UR_MEM_FLAG_WRITE_ONLY))) {
+          MemArg.Mem->setLastEventWritingToMemObj(RetImplEvent.get());
+        }
+      }
+      // We can release the MemoryMigrationMutexes now
+      MemMigrationLocks.clear();
+    }
+
+    auto &ArgIndices = hKernel->getArgIndices();
+
+    CUlaunchConfig launch_config;
+    launch_config.gridDimX = BlocksPerGrid[0];
+    launch_config.gridDimY = BlocksPerGrid[1];
+    launch_config.gridDimZ = BlocksPerGrid[2];
+    launch_config.blockDimX = ThreadsPerBlock[0];
+    launch_config.blockDimY = ThreadsPerBlock[1];
+    launch_config.blockDimZ = ThreadsPerBlock[2];
+
+    launch_config.sharedMemBytes = LocalSize;
+    launch_config.hStream = CuStream;
+    launch_config.attrs = launch_attribute.data();
+    launch_config.numAttrs = numPropsInLaunchPropList;
+
+    UR_CHECK_ERROR(cuLaunchKernelEx(&launch_config, CuFunc,
+                                    const_cast<void **>(ArgIndices.data()),
+                                    nullptr));
+
+    if (LocalSize != 0)
+      hKernel->clearLocalSize();
+
+    if (phEvent) {
+      UR_CHECK_ERROR(RetImplEvent->record());
+      *phEvent = RetImplEvent.release();
+    }
+
+  } catch (ur_result_t Err) {
+    return Err;
+  }
+  return UR_RESULT_SUCCESS;
+}
+
 /// Set parameters for general 3D memory copy.
 /// If the source and/or destination is on the device, SrcPtr and/or DstPtr
 /// must be a pointer to a CUdeviceptr
diff --git a/source/adapters/cuda/ur_interface_loader.cpp b/source/adapters/cuda/ur_interface_loader.cpp
index 80fd211863..b70198b227 100644
--- a/source/adapters/cuda/ur_interface_loader.cpp
+++ b/source/adapters/cuda/ur_interface_loader.cpp
@@ -408,6 +408,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable(
   pDdiTable->pfnCooperativeKernelLaunchExp =
       urEnqueueCooperativeKernelLaunchExp;
   pDdiTable->pfnTimestampRecordingExp = urEnqueueTimestampRecordingExp;
+  pDdiTable->pfnKernelLaunchCustomExp = urEnqueueKernelLaunchCustomExp;
 
   return UR_RESULT_SUCCESS;
 }
diff --git a/source/adapters/null/ur_nullddi.cpp b/source/adapters/null/ur_nullddi.cpp
index 45dd6a59c5..2278d5907e 100644
--- a/source/adapters/null/ur_nullddi.cpp
+++ b/source/adapters/null/ur_nullddi.cpp
@@ -5547,6 +5547,55 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueTimestampRecordingExp(
     return exceptionToResult(std::current_exception());
 }
 
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Intercept function for
urEnqueueKernelLaunchCustomExp
+__urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
+    ur_queue_handle_t hQueue,   ///< [in] handle of the queue object
+    ur_kernel_handle_t hKernel, ///< [in] handle of the kernel object
+    uint32_t
+        workDim, ///< [in] number of dimensions, from 1 to 3, to specify the global and
+                 ///< work-group work-items
+    const size_t *
+        pGlobalWorkSize, ///< [in] pointer to an array of workDim unsigned values that specify the
+    ///< number of global work-items in workDim that will execute the kernel
+    ///< function
+    const size_t *
+        pLocalWorkSize, ///< [in][optional] pointer to an array of workDim unsigned values that
+    ///< specify the number of local work-items forming a work-group that will
+    ///< execute the kernel function. If nullptr, the runtime implementation
+    ///< will choose the work-group size.
+    uint32_t numPropsInLaunchPropList, ///< [in] size of the launch prop list
+    const ur_exp_launch_property_t *
+        launchPropList, ///< [in][range(0, numPropsInLaunchPropList)] pointer to a list of launch
+    ///< properties
+    uint32_t numEventsInWaitList, ///< [in] size of the event wait list
+    const ur_event_handle_t *
+        phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of
+    ///< events that must be complete before the kernel execution. If nullptr,
+    ///< the numEventsInWaitList must be 0, indicating that no wait event.
+    ur_event_handle_t *
+        phEvent ///< [out][optional] return an event object that identifies this particular
+    ///< kernel execution instance.
+    ) try {
+    ur_result_t result = UR_RESULT_SUCCESS;
+
+    // if the driver has created a custom function, then call it instead of using the generic path
+    auto pfnKernelLaunchCustomExp =
+        d_context.urDdiTable.EnqueueExp.pfnKernelLaunchCustomExp;
+    if (nullptr != pfnKernelLaunchCustomExp) {
+        result = pfnKernelLaunchCustomExp(
+            hQueue, hKernel, workDim, pGlobalWorkSize, pLocalWorkSize,
+            numPropsInLaunchPropList, launchPropList, numEventsInWaitList,
+            phEventWaitList, phEvent);
+    } else {
+        // generic implementation: nothing to do, result stays UR_RESULT_SUCCESS
+    }
+
+    return result;
+} catch (...) {
+    return exceptionToResult(std::current_exception());
+}
+
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Intercept function for urProgramBuildExp
 __urdlllocal ur_result_t UR_APICALL urProgramBuildExp(
@@ -6100,6 +6149,9 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable(
 
     ur_result_t result = UR_RESULT_SUCCESS;
 
+    pDdiTable->pfnKernelLaunchCustomExp =
+        driver::urEnqueueKernelLaunchCustomExp;
+
     pDdiTable->pfnCooperativeKernelLaunchExp =
         driver::urEnqueueCooperativeKernelLaunchExp;
 
diff --git a/source/loader/layers/tracing/ur_trcddi.cpp b/source/loader/layers/tracing/ur_trcddi.cpp
index 83987e5d90..56f270a9d9 100644
--- a/source/loader/layers/tracing/ur_trcddi.cpp
+++ b/source/loader/layers/tracing/ur_trcddi.cpp
@@ -7401,6 +7401,72 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueTimestampRecordingExp(
     return result;
 }
 
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Intercept function for urEnqueueKernelLaunchCustomExp
+__urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
+    ur_queue_handle_t hQueue,   ///< [in] handle of the queue object
+    ur_kernel_handle_t hKernel, ///< [in] handle of the kernel object
+    uint32_t
+        workDim, ///< [in] number of dimensions, from 1 to 3, to specify the global and
+                 ///< work-group work-items
+    const size_t *
+        pGlobalWorkSize, ///< [in] pointer to an array of workDim unsigned values that specify the
+    ///< number of global work-items in workDim that will execute the kernel
+    ///< function
+    const size_t *
+        pLocalWorkSize, ///< [in][optional] pointer to an array of workDim unsigned values that
+    ///< specify the number of local work-items forming a work-group that will
+    ///< execute the kernel function. If nullptr, the runtime implementation
+    ///< will choose the work-group size.
+    uint32_t numPropsInLaunchPropList, ///< [in] size of the launch prop list
+    const ur_exp_launch_property_t *
+        launchPropList, ///< [in][range(0, numPropsInLaunchPropList)] pointer to a list of launch
+    ///< properties
+    uint32_t numEventsInWaitList, ///< [in] size of the event wait list
+    const ur_event_handle_t *
+        phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of
+    ///< events that must be complete before the kernel execution. If nullptr,
+    ///< the numEventsInWaitList must be 0, indicating that no wait event.
+    ur_event_handle_t *
+        phEvent ///< [out][optional] return an event object that identifies this particular
+    ///< kernel execution instance.
+) {
+    auto pfnKernelLaunchCustomExp =
+        context.urDdiTable.EnqueueExp.pfnKernelLaunchCustomExp;
+
+    if (nullptr == pfnKernelLaunchCustomExp) {
+        return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
+    }
+
+    ur_enqueue_kernel_launch_custom_exp_params_t params = {
+        &hQueue,          &hKernel,
+        &workDim,         &pGlobalWorkSize,
+        &pLocalWorkSize,  &numPropsInLaunchPropList,
+        &launchPropList,  &numEventsInWaitList,
+        &phEventWaitList, &phEvent};
+    uint64_t instance =
+        context.notify_begin(UR_FUNCTION_ENQUEUE_KERNEL_LAUNCH_CUSTOM_EXP,
+                             "urEnqueueKernelLaunchCustomExp", &params);
+
+    context.logger.info("---> urEnqueueKernelLaunchCustomExp");
+
+    ur_result_t result = pfnKernelLaunchCustomExp(
+        hQueue, hKernel, workDim, pGlobalWorkSize, pLocalWorkSize,
+        numPropsInLaunchPropList, launchPropList, numEventsInWaitList,
+        phEventWaitList, phEvent);
+
+    context.notify_end(UR_FUNCTION_ENQUEUE_KERNEL_LAUNCH_CUSTOM_EXP,
+                       "urEnqueueKernelLaunchCustomExp", &params, &result,
+                       instance);
+
+    std::ostringstream args_str;
+    ur::extras::printFunctionParams(
+        args_str, UR_FUNCTION_ENQUEUE_KERNEL_LAUNCH_CUSTOM_EXP, &params);
+    context.logger.info("({}) -> {};\n", args_str.str(), result);
+
+    return result;
+}
+
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Intercept function for urProgramBuildExp
 __urdlllocal ur_result_t UR_APICALL urProgramBuildExp(
@@ -8159,6 +8225,10 @@ __urdlllocal ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable(
 
     ur_result_t result = UR_RESULT_SUCCESS;
 
+    dditable.pfnKernelLaunchCustomExp = pDdiTable->pfnKernelLaunchCustomExp;
+    pDdiTable->pfnKernelLaunchCustomExp =
+        ur_tracing_layer::urEnqueueKernelLaunchCustomExp;
+
     dditable.pfnCooperativeKernelLaunchExp =
         pDdiTable->pfnCooperativeKernelLaunchExp;
     pDdiTable->pfnCooperativeKernelLaunchExp =
diff --git a/source/loader/layers/validation/ur_valddi.cpp b/source/loader/layers/validation/ur_valddi.cpp
index d2f63921e2..4bdd801c1a 100644
--- a/source/loader/layers/validation/ur_valddi.cpp
+++
b/source/loader/layers/validation/ur_valddi.cpp
@@ -8993,6 +8993,89 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueTimestampRecordingExp(
     return result;
 }
 
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Intercept function for urEnqueueKernelLaunchCustomExp
+__urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
+    ur_queue_handle_t hQueue,   ///< [in] handle of the queue object
+    ur_kernel_handle_t hKernel, ///< [in] handle of the kernel object
+    uint32_t
+        workDim, ///< [in] number of dimensions, from 1 to 3, to specify the global and
+                 ///< work-group work-items
+    const size_t *
+        pGlobalWorkSize, ///< [in] pointer to an array of workDim unsigned values that specify the
+    ///< number of global work-items in workDim that will execute the kernel
+    ///< function
+    const size_t *
+        pLocalWorkSize, ///< [in][optional] pointer to an array of workDim unsigned values that
+    ///< specify the number of local work-items forming a work-group that will
+    ///< execute the kernel function. If nullptr, the runtime implementation
+    ///< will choose the work-group size.
+    uint32_t numPropsInLaunchPropList, ///< [in] size of the launch prop list
+    const ur_exp_launch_property_t *
+        launchPropList, ///< [in][range(0, numPropsInLaunchPropList)] pointer to a list of launch
+    ///< properties
+    uint32_t numEventsInWaitList, ///< [in] size of the event wait list
+    const ur_event_handle_t *
+        phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of
+    ///< events that must be complete before the kernel execution. If nullptr,
+    ///< the numEventsInWaitList must be 0, indicating that no wait event.
+    ur_event_handle_t *
+        phEvent ///< [out][optional] return an event object that identifies this particular
+    ///< kernel execution instance.
+) {
+    auto pfnKernelLaunchCustomExp =
+        context.urDdiTable.EnqueueExp.pfnKernelLaunchCustomExp;
+
+    if (nullptr == pfnKernelLaunchCustomExp) {
+        return UR_RESULT_ERROR_UNINITIALIZED;
+    }
+
+    if (context.enableParameterValidation) {
+        if (NULL == hQueue) {
+            return UR_RESULT_ERROR_INVALID_NULL_HANDLE;
+        }
+
+        if (NULL == hKernel) {
+            return UR_RESULT_ERROR_INVALID_NULL_HANDLE;
+        }
+
+        if (NULL == pGlobalWorkSize) {
+            return UR_RESULT_ERROR_INVALID_NULL_POINTER;
+        }
+
+        // An empty property list is valid (the launch degenerates to a
+        // regular kernel launch), so only a non-empty list must be non-null.
+        if (numPropsInLaunchPropList != 0 && NULL == launchPropList) {
+            return UR_RESULT_ERROR_INVALID_NULL_POINTER;
+        }
+
+        if (phEventWaitList != NULL && numEventsInWaitList > 0) {
+            for (uint32_t i = 0; i < numEventsInWaitList; ++i) {
+                if (phEventWaitList[i] == NULL) {
+                    return UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST;
+                }
+            }
+        }
+    }
+
+    if (context.enableLifetimeValidation &&
+        !refCountContext.isReferenceValid(hQueue)) {
+        refCountContext.logInvalidReference(hQueue);
+    }
+
+    if (context.enableLifetimeValidation &&
+        !refCountContext.isReferenceValid(hKernel)) {
+        refCountContext.logInvalidReference(hKernel);
+    }
+
+    ur_result_t result = pfnKernelLaunchCustomExp(
+        hQueue, hKernel, workDim, pGlobalWorkSize, pLocalWorkSize,
+        numPropsInLaunchPropList, launchPropList, numEventsInWaitList,
+        phEventWaitList, phEvent);
+
+    return result;
+}
+
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Intercept function for urProgramBuildExp
 __urdlllocal ur_result_t UR_APICALL urProgramBuildExp(
@@ -9797,6 +9880,10 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable(
 
     ur_result_t result = UR_RESULT_SUCCESS;
 
+    dditable.pfnKernelLaunchCustomExp = pDdiTable->pfnKernelLaunchCustomExp;
+    pDdiTable->pfnKernelLaunchCustomExp =
+        ur_validation_layer::urEnqueueKernelLaunchCustomExp;
+
     dditable.pfnCooperativeKernelLaunchExp =
         pDdiTable->pfnCooperativeKernelLaunchExp;
     pDdiTable->pfnCooperativeKernelLaunchExp =
diff --git a/source/loader/ur_ldrddi.cpp b/source/loader/ur_ldrddi.cpp
index
a8c9dc8dcc..d7a9447b06 100644
--- a/source/loader/ur_ldrddi.cpp
+++ b/source/loader/ur_ldrddi.cpp
@@ -7698,6 +7698,65 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueTimestampRecordingExp(
     return result;
 }
 
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Intercept function for urEnqueueKernelLaunchCustomExp
+__urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
+    ur_queue_handle_t hQueue,   ///< [in] handle of the queue object
+    ur_kernel_handle_t hKernel, ///< [in] handle of the kernel object
+    uint32_t
+        workDim, ///< [in] number of dimensions, from 1 to 3, to specify the global and
+                 ///< work-group work-items
+    const size_t *
+        pGlobalWorkSize, ///< [in] pointer to an array of workDim unsigned values that specify the
+    ///< number of global work-items in workDim that will execute the kernel
+    ///< function
+    const size_t *
+        pLocalWorkSize, ///< [in][optional] pointer to an array of workDim unsigned values that
+    ///< specify the number of local work-items forming a work-group that will
+    ///< execute the kernel function. If nullptr, the runtime implementation
+    ///< will choose the work-group size.
+    uint32_t numPropsInLaunchPropList, ///< [in] size of the launch prop list
+    const ur_exp_launch_property_t *
+        launchPropList, ///< [in][range(0, numPropsInLaunchPropList)] pointer to a list of launch
+    ///< properties
+    uint32_t numEventsInWaitList, ///< [in] size of the event wait list
+    const ur_event_handle_t *
+        phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of
+    ///< events that must be complete before the kernel execution. If nullptr,
+    ///< the numEventsInWaitList must be 0, indicating that no wait event.
+    ur_event_handle_t *
+        phEvent ///< [out][optional] return an event object that identifies this particular
+    ///< kernel execution instance.
+) {
+    ur_result_t result = UR_RESULT_SUCCESS;
+
+    // extract platform's function pointer table
+    auto dditable = reinterpret_cast<ur_queue_object_t *>(hQueue)->dditable;
+    auto pfnKernelLaunchCustomExp =
+        dditable->ur.EnqueueExp.pfnKernelLaunchCustomExp;
+    if (nullptr == pfnKernelLaunchCustomExp) {
+        return UR_RESULT_ERROR_UNINITIALIZED;
+    }
+
+    // convert loader handle to platform handle
+    hQueue = reinterpret_cast<ur_queue_object_t *>(hQueue)->handle;
+
+    // convert loader handle to platform handle
+    hKernel = reinterpret_cast<ur_kernel_object_t *>(hKernel)->handle;
+
+    // NOTE(review): hQueue and hKernel are unwrapped above, but the entries of
+    // phEventWaitList and the event returned through phEvent are forwarded
+    // as-is. If events are loader-wrapped like the other handles they would
+    // need the same conversion - confirm against the generator/spec.
+    // forward to device-platform
+    result = pfnKernelLaunchCustomExp(hQueue, hKernel, workDim, pGlobalWorkSize,
+                                      pLocalWorkSize, numPropsInLaunchPropList,
+                                      launchPropList, numEventsInWaitList,
+                                      phEventWaitList, phEvent);
+
+    return result;
+}
+
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Intercept function for urProgramBuildExp
 __urdlllocal ur_result_t UR_APICALL urProgramBuildExp(
@@ -8416,6 +8475,8 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable(
     if (ur_loader::context->platforms.size() != 1 ||
         ur_loader::context->forceIntercept) {
         // return pointers to loader's DDIs
+        pDdiTable->pfnKernelLaunchCustomExp =
+            ur_loader::urEnqueueKernelLaunchCustomExp;
         pDdiTable->pfnCooperativeKernelLaunchExp =
             ur_loader::urEnqueueCooperativeKernelLaunchExp;
         pDdiTable->pfnTimestampRecordingExp =
diff --git a/source/loader/ur_libapi.cpp b/source/loader/ur_libapi.cpp
index 62b502095c..35e5d68e36 100644
--- a/source/loader/ur_libapi.cpp
+++ b/source/loader/ur_libapi.cpp
@@ -8360,6 +8360,88 @@ ur_result_t UR_APICALL urEnqueueTimestampRecordingExp(
     return exceptionToResult(std::current_exception());
 }
 
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Launch kernel with custom launch properties
+///
+/// @details
+///     - Launches the kernel using the specified launch properties
+///     - If numPropsInLaunchPropList == 0 then a regular kernel launch is used:
+///       `urEnqueueKernelLaunch`
+///     - Consult the appropriate adapter driver documentation for details of
+///       adapter specific behavior and native error codes that may be returned.
+///
+/// @remarks
+///   _Analogues_
+///     - **cuLaunchKernelEx**
+///
+/// @returns
+///     - ::UR_RESULT_SUCCESS
+///     - ::UR_RESULT_ERROR_UNINITIALIZED
+///     - ::UR_RESULT_ERROR_DEVICE_LOST
+///     - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC
+///     - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE
+///         + `NULL == hQueue`
+///         + `NULL == hKernel`
+///     - ::UR_RESULT_ERROR_INVALID_NULL_POINTER
+///         + `NULL == pGlobalWorkSize`
+///         + `NULL == launchPropList`
+///         + numPropsInLaunchPropList != 0 && launchPropList == NULL
+///     - ::UR_RESULT_ERROR_INVALID_QUEUE
+///     - ::UR_RESULT_ERROR_INVALID_KERNEL
+///     - ::UR_RESULT_ERROR_INVALID_EVENT
+///     - ::UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST
+///         + phEventWaitList == NULL && numEventsInWaitList > 0
+///         + phEventWaitList != NULL && numEventsInWaitList == 0
+///         + If event objects in phEventWaitList are not valid events.
+///     - ::UR_RESULT_ERROR_IN_EVENT_LIST_EXEC_STATUS
+///         + An event in phEventWaitList has ::UR_EVENT_STATUS_ERROR
+///     - ::UR_RESULT_ERROR_INVALID_WORK_DIMENSION
+///     - ::UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE
+///     - ::UR_RESULT_ERROR_INVALID_VALUE
+///     - ::UR_RESULT_ERROR_OUT_OF_HOST_MEMORY
+///     - ::UR_RESULT_ERROR_OUT_OF_RESOURCES
+ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
+    ur_queue_handle_t hQueue,   ///< [in] handle of the queue object
+    ur_kernel_handle_t hKernel, ///< [in] handle of the kernel object
+    uint32_t
+        workDim, ///< [in] number of dimensions, from 1 to 3, to specify the global and
+                 ///< work-group work-items
+    const size_t *
+        pGlobalWorkSize, ///< [in] pointer to an array of workDim unsigned values that specify the
+    ///< number of global work-items in workDim that will execute the kernel
+    ///< function
+    const size_t *
+        pLocalWorkSize, ///< [in][optional] pointer to an array of workDim unsigned values that
+    ///< specify the number of local work-items forming a work-group that will
+    ///< execute the kernel function. If nullptr, the runtime implementation
+    ///< will choose the work-group size.
+    uint32_t numPropsInLaunchPropList, ///< [in] size of the launch prop list
+    const ur_exp_launch_property_t *
+        launchPropList, ///< [in][range(0, numPropsInLaunchPropList)] pointer to a list of launch
+    ///< properties
+    uint32_t numEventsInWaitList, ///< [in] size of the event wait list
+    const ur_event_handle_t *
+        phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of
+    ///< events that must be complete before the kernel execution. If nullptr,
+    ///< the numEventsInWaitList must be 0, indicating that no wait event.
+    ur_event_handle_t *
+        phEvent ///< [out][optional] return an event object that identifies this particular
+    ///< kernel execution instance.
+    ) try {
+    auto pfnKernelLaunchCustomExp =
+        ur_lib::context->urDdiTable.EnqueueExp.pfnKernelLaunchCustomExp;
+    if (nullptr == pfnKernelLaunchCustomExp) {
+        return UR_RESULT_ERROR_UNINITIALIZED;
+    }
+
+    return pfnKernelLaunchCustomExp(hQueue, hKernel, workDim, pGlobalWorkSize,
+                                    pLocalWorkSize, numPropsInLaunchPropList,
+                                    launchPropList, numEventsInWaitList,
+                                    phEventWaitList, phEvent);
+} catch (...) {
+    return exceptionToResult(std::current_exception());
+}
+
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Produces an executable program from one program, negates need for the
 ///        linking step.
diff --git a/source/loader/ur_print.cpp b/source/loader/ur_print.cpp
index 3b144c87ad..5af2165ea4 100644
--- a/source/loader/ur_print.cpp
+++ b/source/loader/ur_print.cpp
@@ -1011,6 +1011,23 @@ ur_result_t urPrintExpCommandBufferUpdateKernelLaunchDesc(
     return str_copy(&ss, buffer, buff_size, out_size);
 }
 
+ur_result_t urPrintExpLaunchPropertyId(enum ur_exp_launch_property_id_t value,
+                                       char *buffer, const size_t buff_size,
+                                       size_t *out_size) {
+    std::stringstream ss;
+    ss << value;
+    return str_copy(&ss, buffer, buff_size, out_size);
+}
+
+ur_result_t
+urPrintExpLaunchProperty(const struct ur_exp_launch_property_t params,
+                         char *buffer, const size_t buff_size,
+                         size_t *out_size) {
+    std::stringstream ss;
+    ss << params;
+    return str_copy(&ss, buffer, buff_size, out_size);
+}
+
 ur_result_t urPrintExpPeerInfo(enum ur_exp_peer_info_t value, char *buffer,
                                const size_t buff_size, size_t *out_size) {
     std::stringstream ss;
@@ -1645,6 +1662,14 @@ ur_result_t urPrintEnqueueWriteHostPipeParams(
     return str_copy(&ss, buffer, buff_size, out_size);
 }
 
+ur_result_t urPrintEnqueueKernelLaunchCustomExpParams(
+    const struct ur_enqueue_kernel_launch_custom_exp_params_t *params,
+    char *buffer, const size_t buff_size, size_t *out_size) {
+    std::stringstream ss;
+    ss << params;
+    return str_copy(&ss, buffer, buff_size, out_size);
+}
+
 ur_result_t
urPrintEnqueueCooperativeKernelLaunchExpParams(
     const struct ur_enqueue_cooperative_kernel_launch_exp_params_t *params,
     char *buffer, const size_t buff_size, size_t *out_size) {
diff --git a/source/ur_api.cpp b/source/ur_api.cpp
index 1ed70e0494..bf90700e7d 100644
--- a/source/ur_api.cpp
+++ b/source/ur_api.cpp
@@ -7071,6 +7071,78 @@ ur_result_t UR_APICALL urEnqueueTimestampRecordingExp(
     return result;
 }
 
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Launch kernel with custom launch properties
+///
+/// @details
+///     - Launches the kernel using the specified launch properties
+///     - If numPropsInLaunchPropList == 0 then a regular kernel launch is used:
+///       `urEnqueueKernelLaunch`
+///     - Consult the appropriate adapter driver documentation for details of
+///       adapter specific behavior and native error codes that may be returned.
+///
+/// @remarks
+///   _Analogues_
+///     - **cuLaunchKernelEx**
+///
+/// @returns
+///     - ::UR_RESULT_SUCCESS
+///     - ::UR_RESULT_ERROR_UNINITIALIZED
+///     - ::UR_RESULT_ERROR_DEVICE_LOST
+///     - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC
+///     - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE
+///         + `NULL == hQueue`
+///         + `NULL == hKernel`
+///     - ::UR_RESULT_ERROR_INVALID_NULL_POINTER
+///         + `NULL == pGlobalWorkSize`
+///         + `NULL == launchPropList`
+///         + numPropsInLaunchPropList != 0 && launchPropList == NULL
+///     - ::UR_RESULT_ERROR_INVALID_QUEUE
+///     - ::UR_RESULT_ERROR_INVALID_KERNEL
+///     - ::UR_RESULT_ERROR_INVALID_EVENT
+///     - ::UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST
+///         + phEventWaitList == NULL && numEventsInWaitList > 0
+///         + phEventWaitList != NULL && numEventsInWaitList == 0
+///         + If event objects in phEventWaitList are not valid events.
+///     - ::UR_RESULT_ERROR_IN_EVENT_LIST_EXEC_STATUS
+///         + An event in phEventWaitList has ::UR_EVENT_STATUS_ERROR
+///     - ::UR_RESULT_ERROR_INVALID_WORK_DIMENSION
+///     - ::UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE
+///     - ::UR_RESULT_ERROR_INVALID_VALUE
+///     - ::UR_RESULT_ERROR_OUT_OF_HOST_MEMORY
+///     - ::UR_RESULT_ERROR_OUT_OF_RESOURCES
+ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
+    ur_queue_handle_t hQueue,   ///< [in] handle of the queue object
+    ur_kernel_handle_t hKernel, ///< [in] handle of the kernel object
+    uint32_t
+        workDim, ///< [in] number of dimensions, from 1 to 3, to specify the global and
+                 ///< work-group work-items
+    const size_t *
+        pGlobalWorkSize, ///< [in] pointer to an array of workDim unsigned values that specify the
+    ///< number of global work-items in workDim that will execute the kernel
+    ///< function
+    const size_t *
+        pLocalWorkSize, ///< [in][optional] pointer to an array of workDim unsigned values that
+    ///< specify the number of local work-items forming a work-group that will
+    ///< execute the kernel function. If nullptr, the runtime implementation
+    ///< will choose the work-group size.
+    uint32_t numPropsInLaunchPropList, ///< [in] size of the launch prop list
+    const ur_exp_launch_property_t *
+        launchPropList, ///< [in][range(0, numPropsInLaunchPropList)] pointer to a list of launch
+    ///< properties
+    uint32_t numEventsInWaitList, ///< [in] size of the event wait list
+    const ur_event_handle_t *
+        phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of
+    ///< events that must be complete before the kernel execution. If nullptr,
+    ///< the numEventsInWaitList must be 0, indicating that no wait event.
+    ur_event_handle_t *
+        phEvent ///< [out][optional] return an event object that identifies this particular
+    ///< kernel execution instance.
+) {
+    ur_result_t result = UR_RESULT_SUCCESS;
+    return result;
+}
+
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Produces an executable program from one program, negates need for the
 ///        linking step.
diff --git a/test/conformance/CMakeLists.txt b/test/conformance/CMakeLists.txt
index 9a80c5345e..79cefdd06f 100644
--- a/test/conformance/CMakeLists.txt
+++ b/test/conformance/CMakeLists.txt
@@ -139,6 +139,7 @@ if(UR_DPCXX)
     add_subdirectory(integration)
     add_subdirectory(exp_command_buffer)
     add_subdirectory(exp_usm_p2p)
+    add_subdirectory(exp_launch_properties)
 else()
     message(WARNING
         "UR_DPCXX is not defined, the following conformance test executables \
diff --git a/test/conformance/exp_launch_properties/CMakeLists.txt b/test/conformance/exp_launch_properties/CMakeLists.txt
new file mode 100644
index 0000000000..db59883149
--- /dev/null
+++ b/test/conformance/exp_launch_properties/CMakeLists.txt
@@ -0,0 +1,9 @@
+# Copyright (C) 2024 Intel Corporation
+# Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions.
+# See LICENSE.TXT
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+add_conformance_test_with_kernels_environment(exp_launch_properties
+    launch_properties.cpp
+    )
+
diff --git a/test/conformance/exp_launch_properties/exp_launch_properties_adapter_cuda.match b/test/conformance/exp_launch_properties/exp_launch_properties_adapter_cuda.match
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/test/conformance/exp_launch_properties/exp_launch_properties_adapter_hip.match b/test/conformance/exp_launch_properties/exp_launch_properties_adapter_hip.match
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/test/conformance/exp_launch_properties/exp_launch_properties_adapter_level_zero.match b/test/conformance/exp_launch_properties/exp_launch_properties_adapter_level_zero.match
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/test/conformance/exp_launch_properties/exp_launch_properties_adapter_native_cpu.match b/test/conformance/exp_launch_properties/exp_launch_properties_adapter_native_cpu.match
new file mode 100644
index 0000000000..235921db1e
--- /dev/null
+++ b/test/conformance/exp_launch_properties/exp_launch_properties_adapter_native_cpu.match
@@ -0,0 +1 @@
+urEnqueueKernelLaunchCustomTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU_
diff --git a/test/conformance/exp_launch_properties/exp_launch_properties_adapter_opencl.match b/test/conformance/exp_launch_properties/exp_launch_properties_adapter_opencl.match
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/test/conformance/exp_launch_properties/launch_properties.cpp b/test/conformance/exp_launch_properties/launch_properties.cpp
new file mode 100644
index 0000000000..bc252392eb
--- /dev/null
+++ b/test/conformance/exp_launch_properties/launch_properties.cpp
@@ -0,0 +1,100 @@
+// Copyright (C) 2024 Intel Corporation
+// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM
+// Exceptions. See LICENSE.TXT
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include <uur/fixtures.h>
+
+struct urEnqueueKernelLaunchCustomTest : uur::urKernelExecutionTest {
+    void SetUp() override {
+        program_name = "fill";
+        UUR_RETURN_ON_FATAL_FAILURE(urKernelExecutionTest::SetUp());
+    }
+
+    uint32_t val = 42;
+    size_t global_size = 32;
+    size_t global_offset = 0;
+    size_t n_dimensions = 1;
+};
+UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(urEnqueueKernelLaunchCustomTest);
+
+TEST_P(urEnqueueKernelLaunchCustomTest, Success) {
+
+    size_t returned_size;
+    ASSERT_SUCCESS(urDeviceGetInfo(device, UR_DEVICE_INFO_EXTENSIONS, 0,
+                                   nullptr, &returned_size));
+
+    std::unique_ptr<char[]> returned_extensions(new char[returned_size]);
+
+    ASSERT_SUCCESS(urDeviceGetInfo(device, UR_DEVICE_INFO_EXTENSIONS,
+                                   returned_size, returned_extensions.get(),
+                                   nullptr));
+
+    std::string_view extensions_string(returned_extensions.get());
+    const bool launch_properties_support =
+        extensions_string.find(UR_LAUNCH_PROPERTIES_EXTENSION_STRING_EXP) !=
+        std::string_view::npos;
+
+    if (!launch_properties_support) {
+        GTEST_SKIP() << "EXP launch properties feature is not supported.";
+    }
+
+    std::vector<ur_exp_launch_property_t> props(1);
+    props[0].id = UR_EXP_LAUNCH_PROPERTY_ID_IGNORE;
+
+    ASSERT_SUCCESS(urDeviceGetInfo(device, UR_DEVICE_INFO_PROFILE, 0, nullptr,
+                                   &returned_size));
+
+    std::unique_ptr<char[]> returned_backend(new char[returned_size]);
+
+    ASSERT_SUCCESS(urDeviceGetInfo(device, UR_DEVICE_INFO_PROFILE,
+                                   returned_size, returned_backend.get(),
+                                   nullptr));
+
+    std::string_view backend_string(returned_backend.get());
+    const bool cuda_backend =
+        backend_string.find("CUDA") != std::string_view::npos;
+
+    if (cuda_backend) {
+        ASSERT_SUCCESS(urDeviceGetInfo(device, UR_DEVICE_INFO_VERSION, 0,
+                                       nullptr, &returned_size));
+
+        std::unique_ptr<char[]> returned_compute_capability(
+            new char[returned_size]);
+
+        ASSERT_SUCCESS(
+            urDeviceGetInfo(device, UR_DEVICE_INFO_VERSION, returned_size,
+                            returned_compute_capability.get(), nullptr));
+
+        auto compute_capability =
+            std::stof(std::string(returned_compute_capability.get()));
+
+        if (compute_capability >= 6.0) {
+            ur_exp_launch_property_t coop_prop;
+            coop_prop.id = UR_EXP_LAUNCH_PROPERTY_ID_COOPERATIVE;
+            coop_prop.value.cooperative = 1;
+            props.push_back(coop_prop);
+        }
+
+        if (compute_capability >= 9.0) {
+            ur_exp_launch_property_t cluster_dims_prop;
+            cluster_dims_prop.id = UR_EXP_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION;
+            cluster_dims_prop.value.clusterDim[0] = 1;
+            cluster_dims_prop.value.clusterDim[1] = 1;
+            cluster_dims_prop.value.clusterDim[2] = 1;
+
+            props.push_back(cluster_dims_prop);
+        }
+    }
+    ur_mem_handle_t buffer = nullptr;
+    AddBuffer1DArg(sizeof(val) * global_size, &buffer);
+    AddPodArg(val);
+
+    // Pass every property collected above (not just the first), otherwise
+    // the cooperative / cluster-dimension entries are never exercised.
+    ASSERT_SUCCESS(urEnqueueKernelLaunchCustomExp(
+        queue, kernel, n_dimensions, &global_size, nullptr,
+        static_cast<uint32_t>(props.size()), props.data(), 0, nullptr,
+        nullptr));
+    ASSERT_SUCCESS(urQueueFinish(queue));
+    ValidateBuffer(buffer, sizeof(val) * global_size, val);
+}