Skip to content

Commit

Permalink
Initial async kernel support
Browse files Browse the repository at this point in the history
  • Loading branch information
ZzEeKkAa committed Nov 16, 2023
1 parent 1fe4ad0 commit 5c1a60d
Show file tree
Hide file tree
Showing 8 changed files with 254 additions and 10 deletions.
10 changes: 8 additions & 2 deletions numba_dpex/core/runtime/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -94,12 +94,18 @@ include_directories(${Python_INCLUDE_DIRS})
include_directories(${NumPy_INCLUDE_DIRS})
include_directories(${Numba_INCLUDE_DIRS})
include_directories(${Dpctl_INCLUDE_DIRS})
# include_directories(${CMAKE_CURRENT_SOURCE_DIR})
# include_directories(${CMAKE_CURRENT_SOURCE_DIR}/kernels/tensor/include)
include_directories(.)

# Source files, *.c
file(GLOB_RECURSE DPEXRT_SOURCES CONFIGURE_DEPENDS "*.c")
file(GLOB_RECURSE KERNEL_SOURCES CONFIGURE_DEPENDS "*.cpp")
set(SOURCES ${DPEXRT_SOURCES} ${KERNEL_SOURCES})
# set(SOURCES ${DPEXRT_SOURCES} ${KERNEL_SOURCES})
set(SOURCES ${KERNEL_SOURCES} ${DPEXRT_SOURCES})

message(KERNEL_SOURCES="${KERNEL_SOURCES}")
message(SOURCES="${SOURCES}")

# Link dpctl library path with -L
link_directories(${DPCTL_LIBRARY_PATH})
Expand All @@ -109,7 +115,7 @@ python_add_library(${PROJECT_NAME} MODULE ${SOURCES})

# Add SYCL to target, this must come after python_add_library()
# FIXME: sources incompatible with sycl include?
# add_sycl_to_target(TARGET ${PROJECT_NAME} SOURCES ${KERNEL_SOURCES})
add_sycl_to_target(TARGET ${PROJECT_NAME} SOURCES ${KERNEL_SOURCES})

# Link the DPCTLSyclInterface library to target
target_link_libraries(${PROJECT_NAME} PRIVATE DPCTLSyclInterface)
Expand Down
6 changes: 6 additions & 0 deletions numba_dpex/core/runtime/_dpexrt_python.c
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
#include "_queuestruct.h"
#include "_usmarraystruct.h"

#include "experimental/experimental.h"
#include "numba/core/runtime/nrt_external.h"

// forward declarations
Expand Down Expand Up @@ -1490,6 +1491,8 @@ static PyObject *build_c_helpers_dict(void)
&DPEXRT_sycl_event_from_python);
_declpointer("DPEXRT_sycl_event_to_python", &DPEXRT_sycl_event_to_python);
_declpointer("DPEXRT_sycl_event_init", &DPEXRT_sycl_event_init);
_declpointer("DPEXRT_schedule_nrt_meminfo_release",
&DPEXRT_schedule_nrt_meminfo_release);

#undef _declpointer
return dct;
Expand Down Expand Up @@ -1557,6 +1560,9 @@ MOD_INIT(_dpexrt_python)
PyLong_FromVoidPtr(&DPEXRT_MemInfo_alloc));
PyModule_AddObject(m, "DPEXRT_MemInfo_fill",
PyLong_FromVoidPtr(&DPEXRT_MemInfo_fill));
PyModule_AddObject(
m, "DPEXRT_schedule_nrt_meminfo_release",
PyLong_FromVoidPtr(&DPEXRT_schedule_nrt_meminfo_release));
PyModule_AddObject(m, "c_helpers", build_c_helpers_dict());
return MOD_SUCCESS_VAL(m);
}
34 changes: 34 additions & 0 deletions numba_dpex/core/runtime/context.py
Original file line number Diff line number Diff line change
Expand Up @@ -433,3 +433,37 @@ def submit_ndrange(
)

return ret

def schedule_nrt_meminfo_release(self, builder: llvmir.IRBuilder, *args):
"""Inserts LLVM IR to call schedule_nrt_release.
DPCTLSyclEventRef
schedule_nrt_memino_release(
NRT_api_functions *nrt,
DPCTLSyclQueueRef QRef,
NRT_MemInfo **meminfo_array,
size_t meminfo_array_size,
DPCTLSyclEventRef *depERefs,
size_t nDepERefs,
int *status,
);
"""
mod = builder.module
fn = _build_dpctl_function(
llvm_module=mod,
return_ty=cgutils.voidptr_t,
arg_list=[
cgutils.voidptr_t,
cgutils.voidptr_t,
cgutils.voidptr_t.as_pointer(),
llvmir.IntType(64),
cgutils.voidptr_t,
llvmir.IntType(64),
llvmir.IntType(64).as_pointer(),
],
func_name="DPEXRT_schedule_nrt_meminfo_release",
)
ret = builder.call(fn, args)

return ret
23 changes: 23 additions & 0 deletions numba_dpex/core/runtime/experimental/experimental.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
#ifndef _EXPERIMENTAL_H_
#define _EXPERIMENTAL_H_

#include "dpctl_capi.h"
#include "numba/core/runtime/nrt_external.h"

#ifdef __cplusplus
extern "C"
{
#endif
DPCTLSyclEventRef
DPEXRT_schedule_nrt_meminfo_release(NRT_api_functions *nrt,
DPCTLSyclQueueRef QRef,
NRT_MemInfo **meminfo_array,
size_t meminfo_array_size,
DPCTLSyclEventRef *depERefs,
size_t nDepERefs,
int *status);
#ifdef __cplusplus
}
#endif

#endif /* _EXPERIMENTAL_H_ */
59 changes: 59 additions & 0 deletions numba_dpex/core/runtime/experimental/sycl_release_nrt.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
#include "experimental.h"

#include "_dbg_printer.h"
#include "syclinterface/dpctl_sycl_type_casters.hpp"
#include <CL/sycl.hpp>

extern "C"
{
DPCTLSyclEventRef
DPEXRT_schedule_nrt_meminfo_release(NRT_api_functions *nrt,
DPCTLSyclQueueRef QRef,
NRT_MemInfo **meminfo_array,
size_t meminfo_array_size,
DPCTLSyclEventRef *depERefs,
size_t nDepERefs,
int *status)
{
using dpctl::syclinterface::unwrap;
using dpctl::syclinterface::wrap;

sycl::queue *q = unwrap<sycl::queue>(QRef);

std::vector<NRT_MemInfo *> meminfo_vec(
meminfo_array, meminfo_array + meminfo_array_size);

for (size_t i = 0; i < meminfo_array_size; ++i) {
nrt->acquire(meminfo_vec[i]);
}

try {
sycl::event ht_ev = q->submit([&](sycl::handler &cgh) {
for (size_t ev_id = 0; ev_id < nDepERefs; ++ev_id) {
cgh.depends_on(*(unwrap<sycl::event>(depERefs[ev_id])));
}
cgh.host_task([meminfo_array_size, meminfo_vec, nrt]() {
for (size_t i = 0; i < meminfo_array_size; ++i) {
nrt->release(meminfo_vec[i]);
}
});
});

constexpr int result_ok = 0;

*status = result_ok;
auto e_ptr = new sycl::event(ht_ev);
return wrap<sycl::event>(e_ptr);
} catch (const std::exception &e) {
constexpr int result_std_exception = 1;

*status = result_std_exception;
return nullptr;
}

constexpr int result_other_abnormal = 2;

*status = result_other_abnormal;
return nullptr;
}
}
19 changes: 19 additions & 0 deletions numba_dpex/core/utils/kernel_launcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -375,6 +375,7 @@ def submit_sycl_kernel(
"""
Submits the kernel to the specified queue, waits.
"""

eref = None
gr = self._create_sycl_range(global_range)
args1 = [
Expand Down Expand Up @@ -409,6 +410,24 @@ def submit_sycl_kernel(
sycl.dpctl_event_delete(self.builder, eref)
return None
else:
# Making sure that arguments not released before the end of
# execution.
meminfo_list = arg_list # FIXME: extract meminfos
total_meminfos = total_kernel_args # FIXME: -||-
dpexrtCtx = DpexRTContext(self.context)
host_eref = dpexrtCtx.schedule_nrt_meminfo_release(
self.builder,
[
dpexrtCtx._context.nrt.get_nrt_api(self.builder),
sycl_queue_ref,
meminfo_list,
self.context.get_constant(types.uintp, total_meminfos),
eref, # ???? should I get pointer of it
1, # ???? should I wrap it as a constant?
None, # put pointer for the status
],
)
# FIXME: should we return host event instead
return eref

def populate_kernel_args_and_args_ty_arrays(
Expand Down
50 changes: 42 additions & 8 deletions numba_dpex/experimental/launcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,21 @@
from collections import namedtuple
from typing import Union

import dpctl
from llvmlite import ir as llvmir
from numba.core import cgutils, cpu, types
from numba.extending import intrinsic, overload

from numba_dpex import config, dpjit
from numba_dpex.core.exceptions import UnreachableError
from numba_dpex.core.runtime import context as dpexrt
from numba_dpex.core.targets.kernel_target import DpexKernelTargetContext
from numba_dpex.core.types import DpnpNdArray, NdRangeType, RangeType
from numba_dpex.core.types import (
DpctlSyclEvent,
DpnpNdArray,
NdRangeType,
RangeType,
)
from numba_dpex.core.utils import kernel_launcher as kl
from numba_dpex.dpctl_iface import libsyclinterface_bindings as sycl
from numba_dpex.experimental.kernel_dispatcher import _KernelModule
Expand Down Expand Up @@ -256,6 +263,16 @@ def submit_and_wait(self, submit_call_args: _KernelSubmissionArgs) -> None:
"""Generates LLVM IR CallInst to submit a kernel to specified SYCL queue
and then call DPCTLEvent_Wait on the returned event.
"""
eref = self.submit(submit_call_args)
sycl.dpctl_event_wait(self._builder, eref)
sycl.dpctl_event_delete(self._builder, eref)

def submit(
self, submit_call_args: _KernelSubmissionArgs
) -> llvmir.PointerType(llvmir.IntType(8)):
"""Generates LLVM IR CallInst to submit a kernel to specified SYCL
queue.
"""
if config.DEBUG_KERNEL_LAUNCHER:
cgutils.printf(
self._builder, "DPEX-DEBUG: Submit sync range kernel.\n"
Expand All @@ -274,8 +291,7 @@ def submit_and_wait(self, submit_call_args: _KernelSubmissionArgs) -> None:
if config.DEBUG_KERNEL_LAUNCHER:
cgutils.printf(self._builder, "DPEX-DEBUG: Wait on event.\n")

sycl.dpctl_event_wait(self._builder, eref)
sycl.dpctl_event_delete(self._builder, eref)
return eref

def cleanup(
self,
Expand Down Expand Up @@ -305,7 +321,8 @@ def intrin_launch_trampoline(
"""
kernel_args_list = list(kernel_args)
# signature of this intrinsic
sig = types.void(kernel_fn, index_space, kernel_args)
ty_event = DpctlSyclEvent()
sig = ty_event(kernel_fn, index_space, kernel_args)
# signature of the kernel_fn
kernel_sig = types.void(*kernel_args_list)
kmodule: _KernelModule = kernel_fn.dispatcher.compile(kernel_sig)
Expand Down Expand Up @@ -359,10 +376,27 @@ def codegen(cgctx, builder, sig, llargs):
local_range_extents=index_space_values.local_range_extents,
)

fn_body_gen.submit_and_wait(submit_call_args)
eref = fn_body_gen.submit(submit_call_args)

fn_body_gen.cleanup(kernel_bundle_ref=kbref, kernel_ref=kref)

pyapi = cgctx.get_python_api(builder)

event_struct_proxy = cgutils.create_struct_proxy(ty_event)(
cgctx, builder
)

dpexrtCtx = dpexrt.DpexRTContext(cgctx)

# Ref count after the call is equal to 1.
dpexrtCtx.eventstruct_init(
pyapi, eref, event_struct_proxy._getpointer()
)

event_value = event_struct_proxy._getvalue()

return event_value

return sig, codegen


Expand All @@ -374,15 +408,15 @@ def _launch_trampoline(kernel_fn, index_space, *kernel_args):
@overload(_launch_trampoline, target="cpu")
def _ol_launch_trampoline(kernel_fn, index_space, *kernel_args):
def impl(kernel_fn, index_space, *kernel_args):
intrin_launch_trampoline( # pylint: disable=E1120
return intrin_launch_trampoline( # pylint: disable=E1120
kernel_fn, index_space, kernel_args
)

return impl


@dpjit
def call_kernel(kernel_fn, index_space, *kernel_args):
def call_kernel(kernel_fn, index_space, *kernel_args) -> dpctl.SyclEvent:
"""Calls a numba_dpex.kernel decorated function from CPython or from another
dpjit function.
Expand All @@ -395,4 +429,4 @@ def call_kernel(kernel_fn, index_space, *kernel_args):
kernel_args : List of objects that are passed to the numba_dpex.kernel
decorated function.
"""
_launch_trampoline(kernel_fn, index_space, *kernel_args)
return _launch_trampoline(kernel_fn, index_space, *kernel_args)
63 changes: 63 additions & 0 deletions numba_dpex/tests/experimental/test_async_kernel.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
import time

import dpnp

import numba_dpex as dpex
import numba_dpex.experimental as exp_dpex
from numba_dpex import Range


@exp_dpex.kernel(
release_gil=False,
no_compile=True,
no_cpython_wrapper=True,
no_cfunc_wrapper=True,
)
def add(a, b, c):
i = dpex.get_global_id(0)
c[i] = b[i] + a[i]


@exp_dpex.kernel(
release_gil=False,
no_compile=True,
no_cpython_wrapper=True,
no_cfunc_wrapper=True,
)
def count_of_two(a, count):
i = dpex.get_global_id(0)
# if a[i]==2:
dpex.atomic.add(count, a[i], 1)


def test_async_add():
size = 1000_000_000
# size = 1000
a = dpnp.ones(size)
b = dpnp.ones(size)
c = dpnp.zeros(size)
count = dpnp.zeros(10)
print(a)
print(b)
print(c)

r = Range(size)
r2 = Range(480)

event_ref = exp_dpex.call_kernel(add, r, a, b, c)
print(event_ref)
event_ref.wait()

# time.sleep(3)

event_ref2 = exp_dpex.call_kernel(count_of_two, r2, b, count)
event_ref2.wait()
print(count)

count[0] = 0

print(c)


if __name__ == "__main__":
test_async_add()

0 comments on commit 5c1a60d

Please sign in to comment.