Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Solving perferomance regression issue by caching the kernel_bundle #896

Merged
merged 2 commits into from
Feb 13, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 7 additions & 3 deletions numba_dpex/core/caching.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,11 @@
from numba_dpex.core.types import USMNdArray


def build_key(argtypes, pyfunc, codegen, backend=None, device_type=None):
"""Constructs a key from python function, context, backend and the device
type.
def build_key(
argtypes, pyfunc, codegen, backend=None, device_type=None, exec_queue=None
):
"""Constructs a key from python function, context, backend, the device
type and execution queue.

Compute index key for the given argument types and codegen. It includes a
description of the OS, target architecture and hashes of the bytecode for
Expand All @@ -32,6 +34,7 @@ def build_key(argtypes, pyfunc, codegen, backend=None, device_type=None):
Defaults to None.
device_type (enum, optional): A 'device_type' enum.
Defaults to None.
exec_queue (dpctl._sycl_queue.SyclQueue', optional): A SYCL queue object.

Returns:
tuple: A tuple of return type, argtpes, magic_tuple of codegen
Expand Down Expand Up @@ -64,6 +67,7 @@ def build_key(argtypes, pyfunc, codegen, backend=None, device_type=None):
codegen.magic_tuple(),
backend,
device_type,
exec_queue,
(
hashlib.sha256(codebytes).hexdigest(),
hashlib.sha256(cvarbytes).hexdigest(),
Expand Down
31 changes: 26 additions & 5 deletions numba_dpex/core/kernel_interface/dispatcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,14 +88,21 @@ def __init__(
# caching related attributes
if not config.ENABLE_CACHE:
self._cache = NullCache()
self._kernel_bundle_cache = NullCache()
elif enable_cache:
self._cache = LRUCache(
name="SPIRVKernelCache",
capacity=config.CACHE_SIZE,
pyfunc=self.pyfunc,
)
self._kernel_bundle_cache = LRUCache(
name="KernelBundleCache",
capacity=config.CACHE_SIZE,
pyfunc=self.pyfunc,
)
chudur-budur marked this conversation as resolved.
Show resolved Hide resolved
else:
self._cache = NullCache()
self._kernel_bundle_cache = NullCache()
self._cache_hits = 0
chudur-budur marked this conversation as resolved.
Show resolved Hide resolved

if array_access_specifiers:
Expand Down Expand Up @@ -627,12 +634,26 @@ def __call__(self, *args):
cache=self._cache,
)

# create a sycl::KernelBundle
kernel_bundle = dpctl_prog.create_program_from_spirv(
exec_queue,
device_driver_ir_module,
" ".join(self._create_sycl_kernel_bundle_flags),
kernel_bundle_key = build_key(
tuple(argtypes),
self.pyfunc,
dpex_kernel_target.target_context.codegen(),
exec_queue=exec_queue,
)

artifact = self._kernel_bundle_cache.get(kernel_bundle_key)

if artifact is None:
# create a sycl::KernelBundle
kernel_bundle = dpctl_prog.create_program_from_spirv(
exec_queue,
device_driver_ir_module,
" ".join(self._create_sycl_kernel_bundle_flags),
)
self._kernel_bundle_cache.put(kernel_bundle_key, kernel_bundle)
else:
kernel_bundle = artifact

# get the sycl::kernel
sycl_kernel = kernel_bundle.get_sycl_kernel(kernel_module_name)

Expand Down
76 changes: 58 additions & 18 deletions numba_dpex/tests/kernel_tests/test_arg_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,14 @@
#
# SPDX-License-Identifier: Apache-2.0

import sys

import dpctl
import dpctl.tensor as dpt
import numpy as np
import pytest

import numba_dpex as dpex
from numba_dpex.tests._helper import filter_strings

global_size = 1054
local_size = 1
Expand Down Expand Up @@ -35,15 +37,39 @@ def input_arrays(request):
return a, b, c[0]


@pytest.mark.parametrize("filter_str", filter_strings)
def test_kernel_arg_types(filter_str, input_arrays):
kernel = dpex.kernel(mul_kernel)
a, actual, c = input_arrays
def test_kernel_arg_types(input_arrays):
usm_type = "device"

a, b, c = input_arrays
expected = a * c
device = dpctl.SyclDevice(filter_str)
with dpctl.device_context(device):
kernel[global_size, local_size](a, actual, c)
np.testing.assert_allclose(actual, expected, rtol=1e-5, atol=0)

queue = dpctl.SyclQueue(dpctl.select_default_device())

da = dpt.usm_ndarray(
a.shape,
dtype=a.dtype,
buffer=usm_type,
buffer_ctor_kwargs={"queue": queue},
)
da.usm_data.copy_from_host(a.reshape((-1)).view("|u1"))

db = dpt.usm_ndarray(
b.shape,
dtype=b.dtype,
buffer=usm_type,
buffer_ctor_kwargs={"queue": queue},
)
db.usm_data.copy_from_host(b.reshape((-1)).view("|u1"))

kernel = dpex.kernel(mul_kernel)
kernel[dpex.NdRange(dpex.Range(global_size), dpex.Range(local_size))](
da, db, c
chudur-budur marked this conversation as resolved.
Show resolved Hide resolved
)

result = np.zeros_like(b)
db.usm_data.copy_to_host(result.reshape((-1)).view("|u1"))

np.testing.assert_allclose(result, expected, rtol=1e-5, atol=0)


def check_bool_kernel(A, test):
Expand All @@ -53,14 +79,28 @@ def check_bool_kernel(A, test):
A[0] = 222


@pytest.mark.parametrize("filter_str", filter_strings)
def test_bool_type(filter_str):
kernel = dpex.kernel(check_bool_kernel)
def test_bool_type():
usm_type = "device"
a = np.array([2], np.int64)

device = dpctl.SyclDevice(filter_str)
with dpctl.device_context(device):
kernel[a.size, dpex.DEFAULT_LOCAL_SIZE](a, True)
assert a[0] == 111
kernel[a.size, dpex.DEFAULT_LOCAL_SIZE](a, False)
assert a[0] == 222
queue = dpctl.SyclQueue(dpctl.select_default_device())

da = dpt.usm_ndarray(
a.shape,
dtype=a.dtype,
buffer=usm_type,
buffer_ctor_kwargs={"queue": queue},
)
da.usm_data.copy_from_host(a.reshape((-1)).view("|u1"))

kernel = dpex.kernel(check_bool_kernel)

kernel[dpex.Range(a.size)](da, True)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This test can also be done using a scalar and setting range to 1, in effect launching a sycl single_task.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also, you can directly use dpctl constructors and do not need to copy data from NumPy.

result = np.zeros_like(a)
da.usm_data.copy_to_host(result.reshape((-1)).view("|u1"))
assert result[0] == 111

kernel[dpex.Range(a.size)](da, False)
result = np.zeros_like(a)
da.usm_data.copy_to_host(result.reshape((-1)).view("|u1"))
assert result[0] == 222
4 changes: 2 additions & 2 deletions numba_dpex/tests/test_device_array_args.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ def data_parallel_sum(a, b, c):


@skip_no_opencl_cpu
class TestArrayArgsGPU:
class TestArrayArgsCPU:
def test_device_array_args_cpu(self):
c = np.ones_like(a)

Expand All @@ -37,7 +37,7 @@ def test_device_array_args_cpu(self):


@skip_no_opencl_gpu
class TestArrayArgsCPU:
class TestArrayArgsGPU:
def test_device_array_args_gpu(self):
c = np.ones_like(a)

Expand Down