MNT bump dependencies and update, including removing hacks required by 0.19.0 #93

Merged · 3 commits · Mar 1, 2023
15 changes: 10 additions & 5 deletions README.md
@@ -113,19 +113,24 @@ You can setup a [`conda`](https://docs.conda.io/en/latest/) environment, and ins
dependencies (`numba-dpex` and `intel::dpcpp_linux-64`) distributed on the
`conda-forge`, `intel` and experimental `dppy/label/dev` channels with:

```bash
export CONDA_DPEX_ENV_NAME=my-dpex-env
```

(where you can replace the name of the environment `my-dpex-env` with a name of your
liking) followed by

```bash
export CONDA_DPEX_ENV_NAME=my-dpex-env
conda create --yes --name $CONDA_DPEX_ENV_NAME \
--channel dppy/label/dev \
--channel conda-forge \
--channel intel \
`# NB: different versions of `sklearn_numba_dpex` can require to pin` \
`# different versions, builds or channels here.` \
numba-dpex=0.19.0=py39hfc4b9b4_5 "intel::dpcpp_linux-64"
numba-dpex=0.20.0dev3=py310hfc4b9b4_4 "intel::dpcpp_linux-64"
```

(where you can replace the name of the environment `my-dpex-env` with a name of your
liking)
Note that different versions of `sklearn_numba_dpex` can require to pin different
versions, builds or channels in this last command.
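Once the environment above is created and activated, a quick sanity check from Python can confirm that the pinned builds resolved correctly and that a SYCL device is visible. This snippet is not part of the PR; it is a minimal sketch that assumes `numba_dpex` and `dpctl` expose a `__version__` attribute and that `dpctl.get_devices()` is available, as in the releases targeted by this PR.

```python
# Minimal post-install sanity check (run inside the activated conda environment).
import dpctl
import numba_dpex

print("numba_dpex:", numba_dpex.__version__)  # expected to match the pinned 0.20.0dev3
print("dpctl:", dpctl.__version__)

# An empty device list typically means the OpenCL / Level Zero runtime is not
# activated; see the workaround described just below in the README.
for device in dpctl.get_devices():
    print(device)
```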

An additional command is currently required to work around missing Intel CPU OpenCL
runtime activation. To resolve it, one needs to set environment variables for the
8 changes: 4 additions & 4 deletions docker/Dockerfile
@@ -24,7 +24,7 @@
#
# TODO: make a custom build with other base images to test compatibility
# with other OSes.
ARG BASE_IMAGE_VERSION=latest-1299e21c883616aedd48495f77429f3d235cfaa4
ARG BASE_IMAGE_VERSION=latest-3a7f783934955c5caaf61d41cce613804100c72a
ARG BASE_IMAGE_TAG=ghcr.io/intel/llvm/ubuntu2204_intel_drivers

ARG BASE=${BASE_IMAGE_TAG}:${BASE_IMAGE_VERSION}
@@ -81,18 +81,18 @@ ARG LLVM_SPIRV_INSTALL_DIR=/opt/llvm-spirv
# Bump it if necessary.

ARG CMAKE_VERSION=3.25
ARG CMAKE_VERSION_BUILD=1
ARG CMAKE_VERSION_BUILD=2


# Versions of the intel python packages

ARG DPCTL_GIT_BRANCH=0.14.1dev1
ARG DPCTL_GIT_BRANCH=0.14.1dev2
ARG DPCTL_GIT_URL=https://github.com/IntelPython/dpctl.git

ARG DPNP_GIT_BRANCH=0.11.0
ARG DPNP_GIT_URL=https://github.com/IntelPython/dpnp.git

ARG NUMBA_DPEX_GIT_BRANCH=0.19.0
ARG NUMBA_DPEX_GIT_BRANCH=0.20.0dev3
ARG NUMBA_DPEX_GIT_URL=https://github.com/IntelPython/numba-dpex.git

# Version of other python packages explicitly installed either within the
4 changes: 2 additions & 2 deletions pyproject.toml
@@ -37,8 +37,8 @@ classifiers = [
# see https://github.com/IntelPython/dpctl/issues/886
dependencies = [
"scikit-learn",
"numba-dpex>=0.19.0",
"dpctl>=0.14.1dev0",
"numba-dpex>=0.20.0dev3",
"dpctl>=0.14.1dev2",
]


3 changes: 0 additions & 3 deletions sklearn_numba_dpex/__init__.py
@@ -1,3 +0,0 @@
from .patches.load_numba_dpex import _load_numba_dpex_with_patches

_load_numba_dpex_with_patches()
36 changes: 9 additions & 27 deletions sklearn_numba_dpex/common/_utils.py
@@ -12,41 +12,23 @@ def get_maximum_power_of_2_smaller_than(x):
return 2 ** (math.floor(math.log2(x)))


# HACK: the following function are defined as closures to work around a `numba_dpex`
# bug.
# Revert it (and everything related, see
# https://github.com/soda-inria/sklearn-numba-dpex/pull/82 )
# when the bug is fixed. The bugfix can be tracked at
# https://github.com/IntelPython/numba-dpex/issues/867
def _square():
def _square_closure(x):
return x * x
def _square(x):
return x * x

return _square_closure

def _minus(x, y):
return x - y

def _minus():
def _minus_closure(x, y):
return x - y

return _minus_closure


def _plus():
def _plus_closure(x, y):
return x + y

return _plus_closure
def _plus(x, y):
return x + y


def _divide_by(divisor):
def _divide_by_fn():
def _divide_closure(x):
return x / divisor

return _divide_closure
def _divide_closure(x):
return x / divisor

return _divide_by_fn
return _divide_closure


def _check_max_work_group_size(
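For context on the change above: before this PR these helpers were closure factories (a workaround for IntelPython/numba-dpex#867), so the kernel factories received a zero-argument wrapper and called it to obtain the actual operator; after this PR the plain scalar functions are passed directly and wrapped with `dpex.func` inside the factories (see the `kernels.py` changes below). A hedged, caller-side sketch of the difference follows; the shape, work-group size and dtype values are placeholder assumptions, not taken from this diff.

```python
# Illustrative caller-side sketch; shape / work_group_size / dtype values are
# placeholder assumptions, not taken from this PR.
import numpy as np

from sklearn_numba_dpex.common._utils import _square
from sklearn_numba_dpex.common.kernels import make_apply_elementwise_func

shape, work_group_size, dtype = (1024,), 128, np.float32

# Before this PR: `_square` was a closure factory, and `dtype` was threaded
# through only to avoid sharing one `dpex.func` instance between kernels
# specialized for different argument types (numba-dpex issue #867):
# square = make_apply_elementwise_func(shape, _square, work_group_size, dtype)

# After this PR: `_square` is a plain scalar function and the extra `dtype`
# workaround argument is gone:
square = make_apply_elementwise_func(shape, _square, work_group_size)
```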
78 changes: 29 additions & 49 deletions sklearn_numba_dpex/common/kernels.py
@@ -31,22 +31,9 @@
zero_idx = np.int64(0)


# HACK: make_* functions that take an operator as input (named `op`, `ops`,
# `fused_unary_func`,...) now expect those operators to be wrapped in another function
# that takes no arguments and returns the said operator. See the notice in the
# `sklearn_numba_dpex.common._utils` where some operators are pre-defined for more
# information.


# HACK: dtype argument is passed to prevent sharing a device function instance
# between kernels specialized for different argument types.
# This is a workaround for:
# https://github.com/IntelPython/numba-dpex/issues/867. Revert changes in
# https://github.com/soda-inria/sklearn-numba-dpex/pull/82 when
# fixed.
@lru_cache
def make_apply_elementwise_func(shape, func, work_group_size, dtype):
func_ = dpex.func(func())
def make_apply_elementwise_func(shape, func, work_group_size):
func = dpex.func(func)
n_items = math.prod(shape)

@dpex.kernel
@@ -60,7 +47,7 @@ def elementwise_ops_kernel(
return

item = data[item_idx]
data[item_idx] = func_(item)
data[item_idx] = func(item)

global_size = math.ceil(n_items / work_group_size) * work_group_size

@@ -73,7 +60,6 @@ def elementwise_ops(data):

@lru_cache
def make_initialize_to_zeros_kernel(shape, work_group_size, dtype):

n_items = math.prod(shape)
global_size = math.ceil(n_items / work_group_size) * work_group_size
zero = dtype(0.0)
@@ -120,7 +106,7 @@ def broadcast_division(dividend_array, divisor_vector):


@lru_cache
def make_broadcast_ops_1d_2d_axis1_kernel(shape, ops, work_group_size, dtype):
def make_broadcast_ops_1d_2d_axis1_kernel(shape, ops, work_group_size):
"""
ops must be a function that will be interpreted as a dpex.func and is subject to
the same rules. It is expected to take two scalar arguments and return one scalar
@@ -130,7 +116,7 @@ def make_broadcast_ops_1d_2d_axis1_kernel(shape, ops, work_group_size, dtype):
n_rows, n_cols = shape

global_size = math.ceil(n_cols / work_group_size) * work_group_size
ops = dpex.func(ops())
ops = dpex.func(ops)

# NB: the left operand is modified inplace, the right operand is only read into.
# Optimized for C-contiguous array and for
@@ -336,10 +322,9 @@ def make_sum_reduction_2d_kernel(
result_shape = get_result_shape(result_sum_axis_size)
result = dpt.empty(result_shape, dtype=dtype, device=device)

global_size = get_global_size(result_sum_axis_size)
kernel = kernel[global_size, work_group_size]
sizes = (get_global_size(result_sum_axis_size), work_group_size)

kernels_and_empty_tensors_pairs.append((kernel, result))
kernels_and_empty_tensors_pairs.append((kernel, sizes, result))
kernel = nofunc_kernel

next_input_size = result_sum_axis_size
@@ -355,8 +340,8 @@ def sum_reduction(summands):
summands = dpt.zeros(sh=get_result_shape(1))

# TODO: manually dispatch the kernels with a SyclQueue
for kernel, result in kernels_and_empty_tensors_pairs:
kernel(summands, result)
for kernel, sizes, result in kernels_and_empty_tensors_pairs:
kernel[sizes](summands, result)
summands = result

if is_1d:
@@ -379,7 +364,7 @@ def fused_elementwise_func_(x):
return x

else:
fused_elementwise_func_ = dpex.func(fused_elementwise_func())
fused_elementwise_func_ = dpex.func(fused_elementwise_func)

input_work_group_size = work_group_size
work_group_size = _check_max_work_group_size(
@@ -439,10 +424,7 @@ def _make_partial_sum_reduction_2d_axis1_kernel(
n_local_iterations = np.int64(math.log2(work_group_size) - 1)
reduction_block_size = 2 * work_group_size

# HACK: must define twice to work around the bug highlighted in
# test_regression_fix
local_sum_and_set_items_if = _make_sum_and_set_items_if_kernel_func()
global_sum_and_set_items_if = _make_sum_and_set_items_if_kernel_func()
_sum_and_set_items_if = _make_sum_and_set_items_if_kernel_func()

@dpex.kernel
# fmt: off
@@ -527,7 +509,7 @@ def partial_sum_reduction(
local_values
)

dpex.barrier(dpex.CLK_LOCAL_MEM_FENCE)
dpex.barrier(dpex.LOCAL_MEM_FENCE)

# Then, the sums of two scalars that have been written in `local_array` are
# further summed together into `local_array[0]`. At each iteration, half
@@ -544,7 +526,7 @@
# Yet again, the remaining work items choose two values to sum such that
# contiguous work items read and write into contiguous slots of
# `local_values`.
local_sum_and_set_items_if(
_sum_and_set_items_if(
(
(local_work_id < n_active_work_items) and
(work_item_idx < sum_axis_size)
@@ -557,12 +539,12 @@
local_values
)

dpex.barrier(dpex.CLK_LOCAL_MEM_FENCE)
dpex.barrier(dpex.LOCAL_MEM_FENCE)

# At this point local_values[0] + local_values[1] is equal to the sum of all
# elements in summands that have been covered by the work group, we write it
# into global memory
global_sum_and_set_items_if(
_sum_and_set_items_if(
local_work_id == zero_idx,
(row_idx, local_work_group_id_in_row),
zero_idx,
@@ -612,7 +594,7 @@ def fused_elementwise_func_(x):
return x

else:
fused_elementwise_func_ = dpex.func(fused_elementwise_func())
fused_elementwise_func_ = dpex.func(fused_elementwise_func)

input_work_group_size = work_group_size
work_group_size = _check_max_work_group_size(
@@ -681,10 +663,7 @@ def _make_partial_sum_reduction_2d_axis0_kernel(
local_values_size = (n_sub_groups_per_work_group, sub_group_size)
reduction_block_size = 2 * n_sub_groups_per_work_group

# HACK: must define twice to work around the bug highlighted in
# test_regression_fix
local_sum_and_set_items_if = _make_sum_and_set_items_if_kernel_func()
global_sum_and_set_items_if = _make_sum_and_set_items_if_kernel_func()
_sum_and_set_items_if = _make_sum_and_set_items_if_kernel_func()

# ???: how does this strategy compares to having each thread reducing N contiguous
# items ?
@@ -778,7 +757,7 @@ def partial_sum_reduction(
local_values
)

dpex.barrier(dpex.CLK_LOCAL_MEM_FENCE)
dpex.barrier(dpex.LOCAL_MEM_FENCE)

# Then, the sums of two scalars that have been written in `local_array` are
# further summed together into `local_array[0, :]`. At each iteration, half
@@ -792,7 +771,7 @@
n_active_sub_groups = n_active_sub_groups // two_as_a_long
work_item_row_idx = first_row_idx + local_row_idx + n_active_sub_groups

local_sum_and_set_items_if(
_sum_and_set_items_if(
(
(local_row_idx < n_active_sub_groups) and
(col_idx < n_cols) and
@@ -806,12 +785,12 @@
local_values
)

dpex.barrier(dpex.CLK_LOCAL_MEM_FENCE)
dpex.barrier(dpex.LOCAL_MEM_FENCE)

# At this point local_values[0, :] + local_values[1, :] is equal to the sum of
# all elements in summands that have been covered by the work group, we write
# it into global memory
global_sum_and_set_items_if(
_sum_and_set_items_if(
(local_row_idx == zero_idx) and (col_idx < n_cols),
(local_block_id_in_col, col_idx),
(zero_idx, local_col_idx),
@@ -934,7 +913,7 @@ def partial_argmin_reduction(
local_values,
)

dpex.barrier(dpex.CLK_LOCAL_MEM_FENCE)
dpex.barrier(dpex.LOCAL_MEM_FENCE)
n_active_work_items = work_group_size
for i in range(n_local_iterations):
n_active_work_items = n_active_work_items // two_as_a_long
@@ -945,7 +924,7 @@
local_values,
local_argmin
)
dpex.barrier(dpex.CLK_LOCAL_MEM_FENCE)
dpex.barrier(dpex.LOCAL_MEM_FENCE)

_register_result(
first_work_id,
@@ -1051,15 +1030,16 @@ def _register_result(
previous_result = dpt.empty((1,), dtype=np.int32, device=device)
while n_groups > 1:
n_groups = math.ceil(n_groups / (2 * work_group_size))
global_size = n_groups * work_group_size
kernel = partial_argmin_reduction[global_size, work_group_size]
sizes = (n_groups * work_group_size, work_group_size)
result = dpt.empty(n_groups, dtype=np.int32, device=device)
kernels_and_empty_tensors_tuples.append((kernel, previous_result, result))
kernels_and_empty_tensors_tuples.append(
(partial_argmin_reduction, sizes, previous_result, result)
)
previous_result = result

def argmin_reduction(values):
for kernel, previous_result, result in kernels_and_empty_tensors_tuples:
kernel(values, previous_result, result)
for kernel, sizes, previous_result, result in kernels_and_empty_tensors_tuples:
kernel[sizes](values, previous_result, result)
return result

return argmin_reduction
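Two patterns recur throughout the `kernels.py` changes above: kernels are no longer pre-specialized once with `kernel[global_size, work_group_size]` when the reduction pipeline is built, but instead the launch sizes are stored next to the kernel and applied at call time with `kernel[sizes](...)`; and local barriers now use `dpex.LOCAL_MEM_FENCE` instead of `dpex.CLK_LOCAL_MEM_FENCE`. The following is a minimal, hedged sketch of the deferred-launch pattern with a toy copy kernel; the array sizes are placeholders, and it assumes the numba-dpex 0.20 kernel API used elsewhere in this repository (`dpex.kernel`, `dpex.get_global_id`) plus a working SYCL device.

```python
# Toy sketch of the deferred kernel-launch pattern adopted in this PR.
# Sizes and dtypes below are placeholder assumptions.
import math

import dpctl.tensor as dpt
import numba_dpex as dpex
import numpy as np

n_items = 10_000
work_group_size = 128
global_size = math.ceil(n_items / work_group_size) * work_group_size


@dpex.kernel
def copy_kernel(src, dst):
    item_idx = dpex.get_global_id(0)
    if item_idx >= n_items:  # guard the padding items of the last work group
        return
    dst[item_idx] = src[item_idx]


src = dpt.ones(n_items, dtype=np.float32)
dst = dpt.zeros(n_items, dtype=np.float32)

# Build the pipeline first, storing (kernel, sizes, arguments) tuples, as done
# for `kernels_and_empty_tensors_pairs` in the diff above...
pipeline = [(copy_kernel, (global_size, work_group_size), src, dst)]

# ...and only specialize at call time with `kernel[sizes](...)`.
for kernel, sizes, arg_in, arg_out in pipeline:
    kernel[sizes](arg_in, arg_out)
```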