diff --git a/numba_dpex/core/runtime/CMakeLists.txt b/numba_dpex/core/runtime/CMakeLists.txt index 3d00278fcb..abee96af2a 100644 --- a/numba_dpex/core/runtime/CMakeLists.txt +++ b/numba_dpex/core/runtime/CMakeLists.txt @@ -109,7 +109,7 @@ python_add_library(${PROJECT_NAME} MODULE ${SOURCES}) # Add SYCL to target, this must come after python_add_library() # FIXME: sources incompatible with sycl include? -# add_sycl_to_target(TARGET ${PROJECT_NAME} SOURCES ${KERNEL_SOURCES}) +add_sycl_to_target(TARGET ${PROJECT_NAME} SOURCES ${KERNEL_SOURCES}) # Link the DPCTLSyclInterface library to target target_link_libraries(${PROJECT_NAME} PRIVATE DPCTLSyclInterface) diff --git a/numba_dpex/core/runtime/_dpexrt_python.c b/numba_dpex/core/runtime/_dpexrt_python.c index 3dcdaf8000..b10079d33f 100644 --- a/numba_dpex/core/runtime/_dpexrt_python.c +++ b/numba_dpex/core/runtime/_dpexrt_python.c @@ -24,6 +24,7 @@ #include "_queuestruct.h" #include "_usmarraystruct.h" +#include "experimental/nrt_reserve_meminfo.h" #include "numba/core/runtime/nrt_external.h" // forward declarations @@ -1490,6 +1491,8 @@ static PyObject *build_c_helpers_dict(void) &DPEXRT_sycl_event_from_python); _declpointer("DPEXRT_sycl_event_to_python", &DPEXRT_sycl_event_to_python); _declpointer("DPEXRT_sycl_event_init", &DPEXRT_sycl_event_init); + _declpointer("DPEXRT_nrt_acquire_meminfo_and_schedule_release", + &DPEXRT_nrt_acquire_meminfo_and_schedule_release); #undef _declpointer return dct; @@ -1557,6 +1560,9 @@ MOD_INIT(_dpexrt_python) PyLong_FromVoidPtr(&DPEXRT_MemInfo_alloc)); PyModule_AddObject(m, "DPEXRT_MemInfo_fill", PyLong_FromVoidPtr(&DPEXRT_MemInfo_fill)); + PyModule_AddObject( + m, "DPEXRT_nrt_acquire_meminfo_and_schedule_release", + PyLong_FromVoidPtr(&DPEXRT_nrt_acquire_meminfo_and_schedule_release)); PyModule_AddObject(m, "c_helpers", build_c_helpers_dict()); return MOD_SUCCESS_VAL(m); } diff --git a/numba_dpex/core/runtime/context.py b/numba_dpex/core/runtime/context.py index e6cab3e2f2..1d9f30dad7 100644 --- a/numba_dpex/core/runtime/context.py +++ b/numba_dpex/core/runtime/context.py @@ -433,3 +433,41 @@ def submit_ndrange( ) return ret + + def acquire_meminfo_and_schedule_release( + self, builder: llvmir.IRBuilder, args + ): + """Inserts LLVM IR to call nrt_acquire_meminfo_and_schedule_release. + + DPCTLSyclEventRef + DPEXRT_nrt_acquire_meminfo_and_schedule_release( + NRT_api_functions *nrt, + DPCTLSyclQueueRef QRef, + NRT_MemInfo **meminfo_array, + size_t meminfo_array_size, + DPCTLSyclEventRef *depERefs, + size_t nDepERefs, + int *status, + ); + + """ + mod = builder.module + + func_ty = llvmir.FunctionType( + cgutils.voidptr_t, + [ + cgutils.voidptr_t, + cgutils.voidptr_t, + cgutils.voidptr_t.as_pointer(), + llvmir.IntType(64), + cgutils.voidptr_t.as_pointer(), + llvmir.IntType(64), + llvmir.IntType(64).as_pointer(), + ], + ) + fn = cgutils.get_or_insert_function( + mod, func_ty, "DPEXRT_nrt_acquire_meminfo_and_schedule_release" + ) + ret = builder.call(fn, args) + + return ret diff --git a/numba_dpex/core/runtime/experimental/nrt_reserve_meminfo.cpp b/numba_dpex/core/runtime/experimental/nrt_reserve_meminfo.cpp new file mode 100644 index 0000000000..9beae11ac9 --- /dev/null +++ b/numba_dpex/core/runtime/experimental/nrt_reserve_meminfo.cpp @@ -0,0 +1,71 @@ +// SPDX-FileCopyrightText: 2023 Intel Corporation +// +// SPDX-License-Identifier: Apache-2.0 + +#include "nrt_reserve_meminfo.h" + +#include "_dbg_printer.h" +#include "syclinterface/dpctl_sycl_type_casters.hpp" +#include + +extern "C" +{ + DPCTLSyclEventRef + DPEXRT_nrt_acquire_meminfo_and_schedule_release(NRT_api_functions *nrt, + DPCTLSyclQueueRef QRef, + NRT_MemInfo **meminfo_array, + size_t meminfo_array_size, + DPCTLSyclEventRef *depERefs, + size_t nDepERefs, + int *status) + { + DPEXRT_DEBUG(drt_debug_print( + "DPEXRT-DEBUG: scheduling nrt meminfo release.\n");); + + using dpctl::syclinterface::unwrap; + using dpctl::syclinterface::wrap; + + sycl::queue *q = unwrap(QRef); + + std::vector meminfo_vec( + meminfo_array, meminfo_array + meminfo_array_size); + + for (size_t i = 0; i < meminfo_array_size; ++i) { + nrt->acquire(meminfo_vec[i]); + } + + DPEXRT_DEBUG(drt_debug_print("DPEXRT-DEBUG: acquired meminfo.\n");); + + try { + sycl::event ht_ev = q->submit([&](sycl::handler &cgh) { + for (size_t ev_id = 0; ev_id < nDepERefs; ++ev_id) { + cgh.depends_on(*(unwrap(depERefs[ev_id]))); + } + cgh.host_task([meminfo_array_size, meminfo_vec, nrt]() { + for (size_t i = 0; i < meminfo_array_size; ++i) { + nrt->release(meminfo_vec[i]); + DPEXRT_DEBUG( + drt_debug_print("DPEXRT-DEBUG: released meminfo " + "from host_task.\n");); + } + }); + }); + + constexpr int result_ok = 0; + + *status = result_ok; + auto e_ptr = new sycl::event(ht_ev); + return wrap(e_ptr); + } catch (const std::exception &e) { + constexpr int result_std_exception = 1; + + *status = result_std_exception; + return nullptr; + } + + constexpr int result_other_abnormal = 2; + + *status = result_other_abnormal; + return nullptr; + } +} diff --git a/numba_dpex/core/runtime/experimental/nrt_reserve_meminfo.h b/numba_dpex/core/runtime/experimental/nrt_reserve_meminfo.h new file mode 100644 index 0000000000..0afef82f02 --- /dev/null +++ b/numba_dpex/core/runtime/experimental/nrt_reserve_meminfo.h @@ -0,0 +1,27 @@ +// SPDX-FileCopyrightText: 2023 Intel Corporation +// +// SPDX-License-Identifier: Apache-2.0 + +#ifndef _EXPERIMENTAL_H_ +#define _EXPERIMENTAL_H_ + +#include "dpctl_capi.h" +#include "numba/core/runtime/nrt_external.h" + +#ifdef __cplusplus +extern "C" +{ +#endif + DPCTLSyclEventRef + DPEXRT_nrt_acquire_meminfo_and_schedule_release(NRT_api_functions *nrt, + DPCTLSyclQueueRef QRef, + NRT_MemInfo **meminfo_array, + size_t meminfo_array_size, + DPCTLSyclEventRef *depERefs, + size_t nDepERefs, + int *status); +#ifdef __cplusplus +} +#endif + +#endif /* _EXPERIMENTAL_H_ */ diff --git a/numba_dpex/experimental/__init__.py b/numba_dpex/experimental/__init__.py index 589d10df5d..7782fb0e76 100644 --- a/numba_dpex/experimental/__init__.py +++ b/numba_dpex/experimental/__init__.py @@ -10,7 +10,7 @@ from .decorators import kernel from .kernel_dispatcher import KernelDispatcher -from .launcher import call_kernel +from .launcher import call_kernel, call_kernel_async from .models import * from .types import KernelDispatcherType @@ -26,4 +26,4 @@ def dpex_dispatcher_const(context): return context.get_dummy_value() -__all__ = ["kernel", "KernelDispatcher", "call_kernel"] +__all__ = ["kernel", "KernelDispatcher", "call_kernel", "call_kernel_async"] diff --git a/numba_dpex/experimental/kernel_dispatcher.py b/numba_dpex/experimental/kernel_dispatcher.py index 3e93ce4736..4488f835ac 100644 --- a/numba_dpex/experimental/kernel_dispatcher.py +++ b/numba_dpex/experimental/kernel_dispatcher.py @@ -254,7 +254,7 @@ def get_overload_device_ir(self, sig): args, _ = sigutils.normalize_signature(sig) return self.overloads[tuple(args)].kernel_device_ir_module - def compile(self, sig) -> _KernelCompileResult: + def compile(self, sig) -> any: disp = self._get_dispatcher_for_current_target() if disp is not self: return disp.compile(sig) diff --git a/numba_dpex/experimental/launcher.py b/numba_dpex/experimental/launcher.py index e3bad6b1e5..4435946354 100644 --- a/numba_dpex/experimental/launcher.py +++ b/numba_dpex/experimental/launcher.py @@ -9,17 +9,25 @@ from collections import namedtuple from typing import Union +import dpctl from llvmlite import ir as llvmir from numba.core import cgutils, cpu, types from numba.core.datamodel import default_manager as numba_default_dmm -from numba.extending import intrinsic, overload +from numba.extending import intrinsic -from numba_dpex import config, dpjit +from numba_dpex import config, dpjit, utils from numba_dpex.core.exceptions import UnreachableError +from numba_dpex.core.runtime.context import DpexRTContext from numba_dpex.core.targets.kernel_target import DpexKernelTargetContext -from numba_dpex.core.types import DpnpNdArray, NdRangeType, RangeType +from numba_dpex.core.types import ( + DpctlSyclEvent, + DpnpNdArray, + NdRangeType, + RangeType, +) from numba_dpex.core.utils import kernel_launcher as kl from numba_dpex.dpctl_iface import libsyclinterface_bindings as sycl +from numba_dpex.dpctl_iface.wrappers import wrap_event_reference from numba_dpex.experimental.kernel_dispatcher import _KernelModule from numba_dpex.utils import create_null_ptr @@ -79,7 +87,10 @@ def __init__( self._cpu_codegen_targetctx = codegen_targetctx self._kernel_targetctx = kernel_targetctx self._builder = builder - self._klbuilder = kl.KernelLaunchIRBuilder(kernel_targetctx, builder) + if kernel_targetctx: + self._klbuilder = kl.KernelLaunchIRBuilder( + kernel_targetctx, builder + ) if config.DEBUG_KERNEL_LAUNCHER: cgutils.printf( @@ -98,6 +109,26 @@ def insert_kernel_bitcode_as_byte_str( bytes=kernel_module.kernel_bitcode, ) + def allocate_meminfos_array(self, num_meminfos): + """Allocates an array to store nrt memory infos. + + Args: + num_meminfos (int): The number of memory infos to allocate. + + Returns: An LLVM IR value pointing to an array to store the memory + infos. + """ + builder = self._builder + context = self._cpu_codegen_targetctx + + meminfo_list = cgutils.alloca_once( + builder, + utils.get_llvm_type(context=context, type=types.voidptr), + size=context.get_constant(types.uintp, num_meminfos), + ) + + return meminfo_list + def populate_kernel_args_and_argsty_arrays( self, kernel_argtys: tuple[types.Type, ...], @@ -145,9 +176,47 @@ def populate_kernel_args_and_argsty_arrays( array_of_kernel_arg_types=args_ty_list, ) + def allocate_meminfo_array( + self, + kernel_argtys: tuple[types.Type, ...], + kernel_args: [llvmir.Instruction, ...], + ) -> tuple[int, list[llvmir.Instruction]]: + """Allocates an LLVM array value to store each memory info from all + kernel arguments. The array is the populated with the LLVM value for + every meminfo of the kernel arguments. + """ + builder = self._builder + context = self._cpu_codegen_targetctx + + meminfos = [] + for arg_num, argtype in enumerate(kernel_argtys): + llvm_val = kernel_args[arg_num] + + meminfos += [ + meminfo + for ty, meminfo in context.nrt.get_meminfos( + builder, argtype, llvm_val + ) + ] + + meminfo_list = self.allocate_meminfos_array(len(meminfos)) + + for meminfo_num, meminfo in enumerate(meminfos): + meminfo_arg_dst = builder.gep( + meminfo_list, + [context.get_constant(types.int32, meminfo_num)], + ) + meminfo_ptr = builder.bitcast( + meminfo, + utils.get_llvm_type(context=context, type=types.voidptr), + ) + builder.store(meminfo_ptr, meminfo_arg_dst) + + return len(meminfos), meminfo_list + def get_queue_ref_val( self, - kernel_argtys: [types.Type, ...], + kernel_argtys: tuple[types.Type, ...], kernel_args: [llvmir.Instruction, ...], ): """ @@ -160,8 +229,8 @@ def get_queue_ref_val( for arg_num, argty in enumerate(kernel_argtys): if isinstance(argty, DpnpNdArray): llvm_val = kernel_args[arg_num] - datamodel = self._kernel_targetctx.data_model_manager.lookup( - argty + datamodel = ( + self._cpu_codegen_targetctx.data_model_manager.lookup(argty) ) sycl_queue_attr_pos = datamodel.get_field_position("sycl_queue") ptr_to_queue_ref = self._builder.extract_value( @@ -257,9 +326,11 @@ def create_kernel_bundle_from_spirv( return kbref - def submit_and_wait(self, submit_call_args: _KernelSubmissionArgs) -> None: - """Generates LLVM IR CallInst to submit a kernel to specified SYCL queue - and then call DPCTLEvent_Wait on the returned event. + def submit( + self, submit_call_args: _KernelSubmissionArgs + ) -> llvmir.PointerType(llvmir.IntType(8)): + """Generates LLVM IR CallInst to submit a kernel to specified SYCL + queue. """ if config.DEBUG_KERNEL_LAUNCHER: cgutils.printf( @@ -279,8 +350,42 @@ def submit_and_wait(self, submit_call_args: _KernelSubmissionArgs) -> None: if config.DEBUG_KERNEL_LAUNCHER: cgutils.printf(self._builder, "DPEX-DEBUG: Wait on event.\n") - sycl.dpctl_event_wait(self._builder, eref) - sycl.dpctl_event_delete(self._builder, eref) + return eref + + def acquire_meminfo_and_schedule_release( + self, + qref, + eref, + total_meminfos, + meminfo_list, + ): + """Schedule sycl host task to release nrt meminfo of the arguments used + to run job. Use it to keep arguments alive during kernel execution.""" + ctx = self._cpu_codegen_targetctx + builder = self._builder + + eref_ptr = builder.alloca(eref.type) + builder.store(eref, eref_ptr) + + status_ptr = cgutils.alloca_once( + builder, ctx.get_value_type(types.uint64) + ) + # TODO: get dpex RT from cached property once the PR is merged + # https://github.com/IntelPython/numba-dpex/pull/1027 + # host_eref = ctx.dpexrt.acquire_meminfo_and_schedule_release( # noqa: W0621 + host_eref = DpexRTContext(ctx).acquire_meminfo_and_schedule_release( + builder, + [ + ctx.nrt.get_nrt_api(builder), + qref, + meminfo_list, + ctx.get_constant(types.uintp, total_meminfos), + eref_ptr, + ctx.get_constant(types.uintp, 1), + status_ptr, + ], + ) + return host_eref def cleanup( self, @@ -297,10 +402,13 @@ def cleanup( @intrinsic(target="cpu") -def intrin_launch_trampoline( - typingctx, kernel_fn, index_space, kernel_args # pylint: disable=W0613 +def _submit_kernel( + typingctx, # pylint: disable=W0613 + kernel_fn, + index_space, + kernel_args, ): - """Generates the body of the launch_trampoline overload. + """Generates IR code for call_kernel dpjit function. The intrinsic first compiles the kernel function to SPIRV, and then to a sycl kernel bundle. The arguments to the kernel are also packed into @@ -310,7 +418,8 @@ def intrin_launch_trampoline( """ kernel_args_list = list(kernel_args) # signature of this intrinsic - sig = types.void(kernel_fn, index_space, kernel_args) + ty_event = DpctlSyclEvent() + sig = ty_event(kernel_fn, index_space, kernel_args) # signature of the kernel_fn kernel_sig = types.void(*kernel_args_list) kernel_fn.dispatcher.compile(kernel_sig) @@ -319,7 +428,8 @@ def intrin_launch_trampoline( ) kernel_targetctx = kernel_fn.dispatcher.targetctx - def codegen(cgctx, builder, sig, llargs): + # TODO: refactor so there are no too many locals + def codegen(cgctx, builder, sig, llargs): # pylint: disable=R0914 kernel_argtys = kernel_sig.args kernel_args_unpacked = [] for pos in range(len(kernel_args)): @@ -342,7 +452,7 @@ def codegen(cgctx, builder, sig, llargs): ) qref = fn_body_gen.get_queue_ref_val( - kernel_argtys=kernel_argtys, + kernel_argtys=kernel_args_list, kernel_args=kernel_args_unpacked, ) @@ -367,32 +477,101 @@ def codegen(cgctx, builder, sig, llargs): local_range_extents=index_space_values.local_range_extents, ) - fn_body_gen.submit_and_wait(submit_call_args) - - fn_body_gen.cleanup(kernel_bundle_ref=kbref, kernel_ref=kref) + eref = fn_body_gen.submit(submit_call_args) + device_event = wrap_event_reference(cgctx, builder, eref) + return device_event return sig, codegen -# pylint: disable=W0613 -def _launch_trampoline(kernel_fn, index_space, *kernel_args): - pass +@intrinsic(target="cpu") +def _acquire_meminfo_and_schedule_release( + typingctx, # pylint: disable=W0613 + ty_device_event, # pylint: disable=W0613 + ty_kernel_args, +): + """Generates IR code to keep arguments alive during kernel execution. + The intrinsic collects all memory infos from the kernel arguments, acquires + them and schecules host task to release them. Returns host task's event. + """ + # signature of this intrinsic + ty_event = DpctlSyclEvent() + sig = ty_event(ty_event, ty_kernel_args) -@overload(_launch_trampoline, target="cpu") -def _ol_launch_trampoline(kernel_fn, index_space, *kernel_args): - def impl(kernel_fn, index_space, *kernel_args): - intrin_launch_trampoline( # pylint: disable=E1120 - kernel_fn, index_space, kernel_args + def codegen(cgctx, builder, sig, llargs): + device_event = cgutils.create_struct_proxy(sig.args[0])( + cgctx, builder, value=llargs[0] + ) + + kernel_args_tuple = llargs[1] + ty_kernel_args = sig.args[1] + + kernel_args = [] + for pos in range(len(ty_kernel_args)): + kernel_args.append(builder.extract_value(kernel_args_tuple, pos)) + + fn_body_gen = _LaunchTrampolineFunctionBodyGenerator( + codegen_targetctx=cgctx, + kernel_targetctx=None, + builder=builder, + ) + + total_meminfos, meminfo_list = fn_body_gen.allocate_meminfo_array( + ty_kernel_args, kernel_args ) - return impl + qref = fn_body_gen.get_queue_ref_val( + kernel_argtys=ty_kernel_args, + kernel_args=kernel_args, + ) + + host_eref = fn_body_gen.acquire_meminfo_and_schedule_release( + qref=qref, + eref=device_event.event_ref, + total_meminfos=total_meminfos, + meminfo_list=meminfo_list, + ) + + host_event = wrap_event_reference(cgctx, builder, host_eref) + + return host_event + + return sig, codegen + + +@dpjit +def call_kernel(kernel_fn, index_space, *kernel_args) -> None: + """Calls a numba_dpex.kernel decorated function from CPython or from another + dpjit function. Kernel execution happens in syncronous way, so the thread + will be blocked till the kernel done exectuion. + + Args: + kernel_fn (numba_dpex.experimental.KernelDispatcher): A + numba_dpex.kernel decorated function that is compiled to a + KernelDispatcher by numba_dpex. + index_space (Range | NdRange): A numba_dpex.Range or numba_dpex.NdRange + type object that specifies the index space for the kernel. + kernel_args : List of objects that are passed to the numba_dpex.kernel + decorated function. + """ + device_event = _submit_kernel( # pylint: disable=E1120 + kernel_fn, + index_space, + kernel_args, + ) + device_event.wait() # pylint: disable=E1101 @dpjit -def call_kernel(kernel_fn, index_space, *kernel_args): +def call_kernel_async( + kernel_fn, index_space, *kernel_args +) -> tuple[dpctl.SyclEvent, dpctl.SyclEvent]: """Calls a numba_dpex.kernel decorated function from CPython or from another - dpjit function. + dpjit function. Kernel execution happens in asyncronous way, so the thread + will not be blocked till the kernel done exectuion. That means that it is + user responsiblity to properly use any memory used by kernel until the + kernel execution is completed. Args: kernel_fn (numba_dpex.experimental.KernelDispatcher): A @@ -402,5 +581,18 @@ def call_kernel(kernel_fn, index_space, *kernel_args): type object that specifies the index space for the kernel. kernel_args : List of objects that are passed to the numba_dpex.kernel decorated function. + + Returns: + pair of host event and device event. Host event represent host task + that releases use of any kernel argument so it can be deallocated. + This task may be executed only after device task is done. """ - _launch_trampoline(kernel_fn, index_space, *kernel_args) + device_event = _submit_kernel( # pylint: disable=E1120 + kernel_fn, + index_space, + kernel_args, + ) + host_event = _acquire_meminfo_and_schedule_release( # pylint: disable=E1120 + device_event, kernel_args + ) + return host_event, device_event diff --git a/numba_dpex/tests/experimental/test_async_kernel.py b/numba_dpex/tests/experimental/test_async_kernel.py new file mode 100644 index 0000000000..53a962588b --- /dev/null +++ b/numba_dpex/tests/experimental/test_async_kernel.py @@ -0,0 +1,49 @@ +# SPDX-FileCopyrightText: 2023 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 + +import dpctl +import dpnp + +import numba_dpex as dpex +import numba_dpex.experimental as exp_dpex +from numba_dpex import Range + + +@exp_dpex.kernel( + release_gil=False, + no_compile=True, + no_cpython_wrapper=True, + no_cfunc_wrapper=True, +) +def add(a, b, c): + i = dpex.get_global_id(0) + c[i] = b[i] + a[i] + + +def test_async_add(): + size = 10 + a = dpnp.ones(size) + b = dpnp.ones(size) + c = dpnp.zeros(size) + + r = Range(size) + + host_ref, event_ref = exp_dpex.call_kernel_async( + add, + r, + a, + b, + c, + ) + + assert type(host_ref) == dpctl.SyclEvent + assert type(event_ref) == dpctl.SyclEvent + assert host_ref is not None + assert event_ref is not None + + event_ref.wait() + host_ref.wait() + + d = a + b + assert dpnp.array_equal(c, d)