MNT bump dependencies and update, including removing hacks required by 0.19.0 #93

Merged · 3 commits · Mar 1, 2023
15 changes: 10 additions & 5 deletions README.md
@@ -113,19 +113,24 @@ You can setup a [`conda`](https://docs.conda.io/en/latest/) environment, and ins
dependencies (`numba-dpex` and `intel::dpcpp_linux-64`) distributed on the
`conda-forge`, `intel` and experimental `dppy/label/dev` channels with:

```bash
export CONDA_DPEX_ENV_NAME=my-dpex-env
```

(where you can replace the name of the environment `my-dpex-env` with a name of your
liking) followed by

```bash
export CONDA_DPEX_ENV_NAME=my-dpex-env
conda create --yes --name $CONDA_DPEX_ENV_NAME \
--channel dppy/label/dev \
--channel conda-forge \
--channel intel \
`# NB: different versions of `sklearn_numba_dpex` can require to pin` \
`# different versions, builds or channels here.` \
numba-dpex=0.19.0=py39hfc4b9b4_5 "intel::dpcpp_linux-64"
numba-dpex=0.20.0dev3=py310hfc4b9b4_4 "intel::dpcpp_linux-64"
```

(where you can replace the name of the environment `my-dpex-env` with a name of your
liking)
Note that different versions of `sklearn_numba_dpex` can require to pin different
versions, builds or channels in this last command.
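Once the environment above is created and activated, a quick sanity check from Python can confirm that the pinned builds resolved correctly and that a SYCL device is visible. This snippet is not part of the PR; it is a minimal sketch that assumes `numba_dpex` and `dpctl` expose a `__version__` attribute and that `dpctl.get_devices()` is available, as in the releases targeted by this PR.

```python
# Minimal post-install sanity check (run inside the activated conda environment).
import dpctl
import numba_dpex

print("numba_dpex:", numba_dpex.__version__)  # expected to match the pinned 0.20.0dev3
print("dpctl:", dpctl.__version__)

# An empty device list typically means the OpenCL / Level Zero runtime is not
# activated; see the workaround described just below in the README.
for device in dpctl.get_devices():
    print(device)
```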

An additional command is currently required to work around missing Intel CPU OpenCL
runtime activation. To resolve it, one needs to set environment variables for the
8 changes: 4 additions & 4 deletions docker/Dockerfile
@@ -24,7 +24,7 @@
#
# TODO: make a custom build with other base images to test compatibility
# with other OSes.
ARG BASE_IMAGE_VERSION=latest-1299e21c883616aedd48495f77429f3d235cfaa4
ARG BASE_IMAGE_VERSION=latest-3a7f783934955c5caaf61d41cce613804100c72a
ARG BASE_IMAGE_TAG=ghcr.io/intel/llvm/ubuntu2204_intel_drivers

ARG BASE=${BASE_IMAGE_TAG}:${BASE_IMAGE_VERSION}
@@ -81,18 +81,18 @@ ARG LLVM_SPIRV_INSTALL_DIR=/opt/llvm-spirv
# Bump it if necessary.

ARG CMAKE_VERSION=3.25
ARG CMAKE_VERSION_BUILD=1
ARG CMAKE_VERSION_BUILD=2


# Versions of the intel python packages

ARG DPCTL_GIT_BRANCH=0.14.1dev1
ARG DPCTL_GIT_BRANCH=0.14.1dev2
ARG DPCTL_GIT_URL=https://github.com/IntelPython/dpctl.git

ARG DPNP_GIT_BRANCH=0.11.0
ARG DPNP_GIT_URL=https://github.com/IntelPython/dpnp.git

ARG NUMBA_DPEX_GIT_BRANCH=0.19.0
ARG NUMBA_DPEX_GIT_BRANCH=0.20.0dev3
ARG NUMBA_DPEX_GIT_URL=https://github.com/IntelPython/numba-dpex.git

# Version of other python packages explicitly installed either within the
4 changes: 2 additions & 2 deletions pyproject.toml
@@ -37,8 +37,8 @@ classifiers = [
# see https://github.com/IntelPython/dpctl/issues/886
dependencies = [
"scikit-learn",
"numba-dpex>=0.19.0",
"dpctl>=0.14.1dev0",
"numba-dpex>=0.20.0dev3",
"dpctl>=0.14.1dev2",
]


3 changes: 0 additions & 3 deletions sklearn_numba_dpex/__init__.py
@@ -1,3 +0,0 @@
from .patches.load_numba_dpex import _load_numba_dpex_with_patches

_load_numba_dpex_with_patches()
36 changes: 9 additions & 27 deletions sklearn_numba_dpex/common/_utils.py
@@ -12,41 +12,23 @@ def get_maximum_power_of_2_smaller_than(x):
return 2 ** (math.floor(math.log2(x)))


# HACK: the following function are defined as closures to work around a `numba_dpex`
# bug.
# Revert it (and everything related, see
# https://github.com/soda-inria/sklearn-numba-dpex/pull/82 )
# when the bug is fixed. The bugfix can be tracked at
# https://github.com/IntelPython/numba-dpex/issues/867
def _square():
def _square_closure(x):
return x * x
def _square(x):
return x * x

return _square_closure

def _minus(x, y):
return x - y

def _minus():
def _minus_closure(x, y):
return x - y

return _minus_closure


def _plus():
def _plus_closure(x, y):
return x + y

return _plus_closure
def _plus(x, y):
return x + y


def _divide_by(divisor):
def _divide_by_fn():
def _divide_closure(x):
return x / divisor

return _divide_closure
def _divide_closure(x):
return x / divisor

return _divide_by_fn
return _divide_closure


def _check_max_work_group_size(
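For context on the change above: before this PR these helpers were closure factories (a workaround for IntelPython/numba-dpex#867), so the kernel factories received a zero-argument wrapper and called it to obtain the actual operator; after this PR the plain scalar functions are passed directly and wrapped with `dpex.func` inside the factories (see the `kernels.py` changes below). A hedged, caller-side sketch of the difference follows; the shape, work-group size and dtype values are placeholder assumptions, not taken from this diff.

```python
# Illustrative caller-side sketch; shape / work_group_size / dtype values are
# placeholder assumptions, not taken from this PR.
import numpy as np

from sklearn_numba_dpex.common._utils import _square
from sklearn_numba_dpex.common.kernels import make_apply_elementwise_func

shape, work_group_size, dtype = (1024,), 128, np.float32

# Before this PR: `_square` was a closure factory, and `dtype` was threaded
# through only to avoid sharing one `dpex.func` instance between kernels
# specialized for different argument types (numba-dpex issue #867):
# square = make_apply_elementwise_func(shape, _square, work_group_size, dtype)

# After this PR: `_square` is a plain scalar function and the extra `dtype`
# workaround argument is gone:
square = make_apply_elementwise_func(shape, _square, work_group_size)
```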
78 changes: 29 additions & 49 deletions sklearn_numba_dpex/common/kernels.py
@@ -31,22 +31,9 @@
zero_idx = np.int64(0)


# HACK: make_* functions that take an operator as input (named `op`, `ops`,
# `fused_unary_func`,...) now expect those operators to be wrapped in another function
# that takes no arguments and returns the said operator. See the notice in the
# `sklearn_numba_dpex.common._utils` where some operators are pre-defined for more
# information.


# HACK: dtype argument is passed to prevent sharing a device function instance
# between kernels specialized for different argument types.
# This is a workaround for:
# https://github.com/IntelPython/numba-dpex/issues/867. Revert changes in
# https://github.com/soda-inria/sklearn-numba-dpex/pull/82 when
# fixed.
@lru_cache
def make_apply_elementwise_func(shape, func, work_group_size, dtype):
func_ = dpex.func(func())
def make_apply_elementwise_func(shape, func, work_group_size):
func = dpex.func(func)
n_items = math.prod(shape)

@dpex.kernel
@@ -60,7 +47,7 @@ def elementwise_ops_kernel(
return

item = data[item_idx]
data[item_idx] = func_(item)
data[item_idx] = func(item)

global_size = math.ceil(n_items / work_group_size) * work_group_size

@@ -73,7 +60,6 @@ def elementwise_ops(data):

@lru_cache
def make_initialize_to_zeros_kernel(shape, work_group_size, dtype):

n_items = math.prod(shape)
global_size = math.ceil(n_items / work_group_size) * work_group_size
zero = dtype(0.0)
@@ -120,7 +106,7 @@ def broadcast_division(dividend_array, divisor_vector):


@lru_cache
def make_broadcast_ops_1d_2d_axis1_kernel(shape, ops, work_group_size, dtype):
def make_broadcast_ops_1d_2d_axis1_kernel(shape, ops, work_group_size):
"""
ops must be a function that will be interpreted as a dpex.func and is subject to
the same rules. It is expected to take two scalar arguments and return one scalar
@@ -130,7 +116,7 @@ def make_broadcast_ops_1d_2d_axis1_kernel(shape, ops, work_group_size, dtype):
n_rows, n_cols = shape

global_size = math.ceil(n_cols / work_group_size) * work_group_size
ops = dpex.func(ops())
ops = dpex.func(ops)

# NB: the left operand is modified inplace, the right operand is only read into.
# Optimized for C-contiguous array and for
@@ -336,10 +322,9 @@ def make_sum_reduction_2d_kernel(
result_shape = get_result_shape(result_sum_axis_size)
result = dpt.empty(result_shape, dtype=dtype, device=device)

global_size = get_global_size(result_sum_axis_size)
kernel = kernel[global_size, work_group_size]
sizes = (get_global_size(result_sum_axis_size), work_group_size)

kernels_and_empty_tensors_pairs.append((kernel, result))
kernels_and_empty_tensors_pairs.append((kernel, sizes, result))
kernel = nofunc_kernel

next_input_size = result_sum_axis_size
@@ -355,8 +340,8 @@ def sum_reduction(summands):
summands = dpt.zeros(sh=get_result_shape(1))

# TODO: manually dispatch the kernels with a SyclQueue
for kernel, result in kernels_and_empty_tensors_pairs:
kernel(summands, result)
for kernel, sizes, result in kernels_and_empty_tensors_pairs:
kernel[sizes](summands, result)
summands = result

if is_1d:
@@ -379,7 +364,7 @@ def fused_elementwise_func_(x):
return x

else:
fused_elementwise_func_ = dpex.func(fused_elementwise_func())
fused_elementwise_func_ = dpex.func(fused_elementwise_func)

input_work_group_size = work_group_size
work_group_size = _check_max_work_group_size(
@@ -439,10 +424,7 @@ def _make_partial_sum_reduction_2d_axis1_kernel(
n_local_iterations = np.int64(math.log2(work_group_size) - 1)
reduction_block_size = 2 * work_group_size

# HACK: must define twice to work around the bug highlighted in
# test_regression_fix
local_sum_and_set_items_if = _make_sum_and_set_items_if_kernel_func()
global_sum_and_set_items_if = _make_sum_and_set_items_if_kernel_func()
_sum_and_set_items_if = _make_sum_and_set_items_if_kernel_func()

@dpex.kernel
# fmt: off
@@ -527,7 +509,7 @@ def partial_sum_reduction(
local_values
)

dpex.barrier(dpex.CLK_LOCAL_MEM_FENCE)
dpex.barrier(dpex.LOCAL_MEM_FENCE)

# Then, the sums of two scalars that have been written in `local_array` are
# further summed together into `local_array[0]`. At each iteration, half
@@ -544,7 +526,7 @@
# Yet again, the remaining work items choose two values to sum such that
# contiguous work items read and write into contiguous slots of
# `local_values`.
local_sum_and_set_items_if(
_sum_and_set_items_if(
(
(local_work_id < n_active_work_items) and
(work_item_idx < sum_axis_size)
@@ -557,12 +539,12 @@
local_values
)

dpex.barrier(dpex.CLK_LOCAL_MEM_FENCE)
dpex.barrier(dpex.LOCAL_MEM_FENCE)

# At this point local_values[0] + local_values[1] is equal to the sum of all
# elements in summands that have been covered by the work group, we write it
# into global memory
global_sum_and_set_items_if(
_sum_and_set_items_if(
local_work_id == zero_idx,
(row_idx, local_work_group_id_in_row),
zero_idx,
@@ -612,7 +594,7 @@ def fused_elementwise_func_(x):
return x

else:
fused_elementwise_func_ = dpex.func(fused_elementwise_func())
fused_elementwise_func_ = dpex.func(fused_elementwise_func)

input_work_group_size = work_group_size
work_group_size = _check_max_work_group_size(
@@ -681,10 +663,7 @@ def _make_partial_sum_reduction_2d_axis0_kernel(
local_values_size = (n_sub_groups_per_work_group, sub_group_size)
reduction_block_size = 2 * n_sub_groups_per_work_group

# HACK: must define twice to work around the bug highlighted in
# test_regression_fix
local_sum_and_set_items_if = _make_sum_and_set_items_if_kernel_func()
global_sum_and_set_items_if = _make_sum_and_set_items_if_kernel_func()
_sum_and_set_items_if = _make_sum_and_set_items_if_kernel_func()

# ???: how does this strategy compares to having each thread reducing N contiguous
# items ?
@@ -778,7 +757,7 @@ def partial_sum_reduction(
local_values
)

dpex.barrier(dpex.CLK_LOCAL_MEM_FENCE)
dpex.barrier(dpex.LOCAL_MEM_FENCE)

# Then, the sums of two scalars that have been written in `local_array` are
# further summed together into `local_array[0, :]`. At each iteration, half
@@ -792,7 +771,7 @@
n_active_sub_groups = n_active_sub_groups // two_as_a_long
work_item_row_idx = first_row_idx + local_row_idx + n_active_sub_groups

local_sum_and_set_items_if(
_sum_and_set_items_if(
(
(local_row_idx < n_active_sub_groups) and
(col_idx < n_cols) and
@@ -806,12 +785,12 @@
local_values
)

dpex.barrier(dpex.CLK_LOCAL_MEM_FENCE)
dpex.barrier(dpex.LOCAL_MEM_FENCE)

# At this point local_values[0, :] + local_values[1, :] is equal to the sum of
# all elements in summands that have been covered by the work group, we write
# it into global memory
global_sum_and_set_items_if(
_sum_and_set_items_if(
(local_row_idx == zero_idx) and (col_idx < n_cols),
(local_block_id_in_col, col_idx),
(zero_idx, local_col_idx),
@@ -934,7 +913,7 @@ def partial_argmin_reduction(
local_values,
)

dpex.barrier(dpex.CLK_LOCAL_MEM_FENCE)
dpex.barrier(dpex.LOCAL_MEM_FENCE)
n_active_work_items = work_group_size
for i in range(n_local_iterations):
n_active_work_items = n_active_work_items // two_as_a_long
@@ -945,7 +924,7 @@
local_values,
local_argmin
)
dpex.barrier(dpex.CLK_LOCAL_MEM_FENCE)
dpex.barrier(dpex.LOCAL_MEM_FENCE)

_register_result(
first_work_id,
@@ -1051,15 +1030,16 @@ def _register_result(
previous_result = dpt.empty((1,), dtype=np.int32, device=device)
while n_groups > 1:
n_groups = math.ceil(n_groups / (2 * work_group_size))
global_size = n_groups * work_group_size
kernel = partial_argmin_reduction[global_size, work_group_size]
sizes = (n_groups * work_group_size, work_group_size)
result = dpt.empty(n_groups, dtype=np.int32, device=device)
kernels_and_empty_tensors_tuples.append((kernel, previous_result, result))
kernels_and_empty_tensors_tuples.append(
(partial_argmin_reduction, sizes, previous_result, result)
)
previous_result = result

def argmin_reduction(values):
for kernel, previous_result, result in kernels_and_empty_tensors_tuples:
kernel(values, previous_result, result)
for kernel, sizes, previous_result, result in kernels_and_empty_tensors_tuples:
kernel[sizes](values, previous_result, result)
return result

return argmin_reduction
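Two patterns recur throughout the `kernels.py` changes above: kernels are no longer pre-specialized once with `kernel[global_size, work_group_size]` when the reduction pipeline is built, but instead the launch sizes are stored next to the kernel and applied at call time with `kernel[sizes](...)`; and local barriers now use `dpex.LOCAL_MEM_FENCE` instead of `dpex.CLK_LOCAL_MEM_FENCE`. The following is a minimal, hedged sketch of the deferred-launch pattern with a toy copy kernel; the array sizes are placeholders, and it assumes the numba-dpex 0.20 kernel API used elsewhere in this repository (`dpex.kernel`, `dpex.get_global_id`) plus a working SYCL device.

```python
# Toy sketch of the deferred kernel-launch pattern adopted in this PR.
# Sizes and dtypes below are placeholder assumptions.
import math

import dpctl.tensor as dpt
import numba_dpex as dpex
import numpy as np

n_items = 10_000
work_group_size = 128
global_size = math.ceil(n_items / work_group_size) * work_group_size


@dpex.kernel
def copy_kernel(src, dst):
    item_idx = dpex.get_global_id(0)
    if item_idx >= n_items:  # guard the padding items of the last work group
        return
    dst[item_idx] = src[item_idx]


src = dpt.ones(n_items, dtype=np.float32)
dst = dpt.zeros(n_items, dtype=np.float32)

# Build the pipeline first, storing (kernel, sizes, arguments) tuples, as done
# for `kernels_and_empty_tensors_pairs` in the diff above...
pipeline = [(copy_kernel, (global_size, work_group_size), src, dst)]

# ...and only specialize at call time with `kernel[sizes](...)`.
for kernel, sizes, arg_in, arg_out in pipeline:
    kernel[sizes](arg_in, arg_out)
```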