From 603961ea6e5663fa47c210c32c04a5bac23cd38b Mon Sep 17 00:00:00 2001 From: Sergey Pokhodenko Date: Tue, 17 Nov 2020 17:48:37 +0300 Subject: [PATCH 01/32] Add versioneer (#11) * Add versioneer Installed versioneer: pip install versioneer Added code in setup.py. Added code in setup.cfg. Install: versioneer install * Add tag_prefix in setup.cfg --- .gitattributes | 1 + MANIFEST.in | 2 + numba_dppy/__init__.py | 4 + numba_dppy/_version.py | 525 ++++++++++++ setup.cfg | 13 + setup.py | 5 +- versioneer.py | 1855 ++++++++++++++++++++++++++++++++++++++++ 7 files changed, 2404 insertions(+), 1 deletion(-) create mode 100644 .gitattributes create mode 100644 numba_dppy/_version.py create mode 100644 setup.cfg create mode 100644 versioneer.py diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000000..3123046333 --- /dev/null +++ b/.gitattributes @@ -0,0 +1 @@ +numba_dppy/_version.py export-subst diff --git a/MANIFEST.in b/MANIFEST.in index e9635a5f83..74d44bdc67 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -3,3 +3,5 @@ include README.md setup.py LICENSE recursive-include numba_dppy *.cl +include versioneer.py +include numba_dppy/_version.py diff --git a/numba_dppy/__init__.py b/numba_dppy/__init__.py index 7d52138691..35c5e0a9f5 100644 --- a/numba_dppy/__init__.py +++ b/numba_dppy/__init__.py @@ -517,3 +517,7 @@ def test(*args, **kwargs): dppl_error() return numba.testing.test("numba_dppy.tests", *args, **kwargs) + +from ._version import get_versions +__version__ = get_versions()['version'] +del get_versions diff --git a/numba_dppy/_version.py b/numba_dppy/_version.py new file mode 100644 index 0000000000..165dbf4d17 --- /dev/null +++ b/numba_dppy/_version.py @@ -0,0 +1,525 @@ + +# This file helps to compute a version number in source trees obtained from +# git-archive tarball (such as those provided by githubs download-from-tag +# feature). Distribution tarballs (built by setup.py sdist) and build +# directories (produced by setup.py build) will contain a much shorter file +# that just contains the computed version number. + +# This file is released into the public domain. Generated by +# versioneer-0.19 (https://github.com/python-versioneer/python-versioneer) + +"""Git implementation of _version.py.""" + +import errno +import os +import re +import subprocess +import sys + + +def get_keywords(): + """Get the keywords needed to look up the version information.""" + # these strings will be replaced by git during git-archive. + # setup.py/versioneer.py will grep for the variable names, so they must + # each be defined on a line of their own. _version.py will just call + # get_keywords(). 
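+    # As a hypothetical illustration: in a 'git archive' tarball made from a
+    # tagged commit, export-subst expands the three $Format$ placeholders
+    # below to something like
+    #   git_refnames = " (HEAD -> main, tag: 0.12)"
+    #   git_full = "1076c978a8d3cfc70f408fe5974aa6c092c949ac"
+    #   git_date = "2020-11-17 17:48:37 +0300"
+    # (refs, hash, and date above are invented for the example).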
+ git_refnames = "$Format:%d$" + git_full = "$Format:%H$" + git_date = "$Format:%ci$" + keywords = {"refnames": git_refnames, "full": git_full, "date": git_date} + return keywords + + +class VersioneerConfig: + """Container for Versioneer configuration parameters.""" + + +def get_config(): + """Create, populate and return the VersioneerConfig() object.""" + # these strings are filled in when 'setup.py versioneer' creates + # _version.py + cfg = VersioneerConfig() + cfg.VCS = "git" + cfg.style = "pep440" + cfg.tag_prefix = "None" + cfg.parentdir_prefix = "None" + cfg.versionfile_source = "numba_dppy/_version.py" + cfg.verbose = False + return cfg + + +class NotThisMethod(Exception): + """Exception raised if a method is not valid for the current scenario.""" + + +LONG_VERSION_PY = {} +HANDLERS = {} + + +def register_vcs_handler(vcs, method): # decorator + """Create decorator to mark a method as the handler of a VCS.""" + def decorate(f): + """Store f in HANDLERS[vcs][method].""" + if vcs not in HANDLERS: + HANDLERS[vcs] = {} + HANDLERS[vcs][method] = f + return f + return decorate + + +def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, + env=None): + """Call the given command(s).""" + assert isinstance(commands, list) + p = None + for c in commands: + try: + dispcmd = str([c] + args) + # remember shell=False, so use git.cmd on windows, not just git + p = subprocess.Popen([c] + args, cwd=cwd, env=env, + stdout=subprocess.PIPE, + stderr=(subprocess.PIPE if hide_stderr + else None)) + break + except EnvironmentError: + e = sys.exc_info()[1] + if e.errno == errno.ENOENT: + continue + if verbose: + print("unable to run %s" % dispcmd) + print(e) + return None, None + else: + if verbose: + print("unable to find command, tried %s" % (commands,)) + return None, None + stdout = p.communicate()[0].strip().decode() + if p.returncode != 0: + if verbose: + print("unable to run %s (error)" % dispcmd) + print("stdout was %s" % stdout) + return None, p.returncode + return stdout, p.returncode + + +def versions_from_parentdir(parentdir_prefix, root, verbose): + """Try to determine the version from the parent directory name. + + Source tarballs conventionally unpack into a directory that includes both + the project name and a version string. We will also support searching up + two directory levels for an appropriately named parent directory + """ + rootdirs = [] + + for i in range(3): + dirname = os.path.basename(root) + if dirname.startswith(parentdir_prefix): + return {"version": dirname[len(parentdir_prefix):], + "full-revisionid": None, + "dirty": False, "error": None, "date": None} + else: + rootdirs.append(root) + root = os.path.dirname(root) # up a level + + if verbose: + print("Tried directories %s but none started with prefix %s" % + (str(rootdirs), parentdir_prefix)) + raise NotThisMethod("rootdir doesn't start with parentdir_prefix") + + +@register_vcs_handler("git", "get_keywords") +def git_get_keywords(versionfile_abs): + """Extract version information from the given file.""" + # the code embedded in _version.py can just fetch the value of these + # keywords. When used from setup.py, we don't want to import _version.py, + # so we do it with a regexp instead. This function is not used from + # _version.py. 
+ keywords = {} + try: + f = open(versionfile_abs, "r") + for line in f.readlines(): + if line.strip().startswith("git_refnames ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["refnames"] = mo.group(1) + if line.strip().startswith("git_full ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["full"] = mo.group(1) + if line.strip().startswith("git_date ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["date"] = mo.group(1) + f.close() + except EnvironmentError: + pass + return keywords + + +@register_vcs_handler("git", "keywords") +def git_versions_from_keywords(keywords, tag_prefix, verbose): + """Get version information from git keywords.""" + if not keywords: + raise NotThisMethod("no keywords at all, weird") + date = keywords.get("date") + if date is not None: + # Use only the last line. Previous lines may contain GPG signature + # information. + date = date.splitlines()[-1] + + # git-2.2.0 added "%cI", which expands to an ISO-8601 -compliant + # datestamp. However we prefer "%ci" (which expands to an "ISO-8601 + # -like" string, which we must then edit to make compliant), because + # it's been around since git-1.5.3, and it's too difficult to + # discover which version we're using, or to work around using an + # older one. + date = date.strip().replace(" ", "T", 1).replace(" ", "", 1) + refnames = keywords["refnames"].strip() + if refnames.startswith("$Format"): + if verbose: + print("keywords are unexpanded, not using") + raise NotThisMethod("unexpanded keywords, not a git-archive tarball") + refs = set([r.strip() for r in refnames.strip("()").split(",")]) + # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of + # just "foo-1.0". If we see a "tag: " prefix, prefer those. + TAG = "tag: " + tags = set([r[len(TAG):] for r in refs if r.startswith(TAG)]) + if not tags: + # Either we're using git < 1.8.3, or there really are no tags. We use + # a heuristic: assume all version tags have a digit. The old git %d + # expansion behaves like git log --decorate=short and strips out the + # refs/heads/ and refs/tags/ prefixes that would let us distinguish + # between branches and tags. By ignoring refnames without digits, we + # filter out many common branch names like "release" and + # "stabilization", as well as "HEAD" and "master". + tags = set([r for r in refs if re.search(r'\d', r)]) + if verbose: + print("discarding '%s', no digits" % ",".join(refs - tags)) + if verbose: + print("likely tags: %s" % ",".join(sorted(tags))) + for ref in sorted(tags): + # sorting will prefer e.g. "2.0" over "2.0rc1" + if ref.startswith(tag_prefix): + r = ref[len(tag_prefix):] + if verbose: + print("picking %s" % r) + return {"version": r, + "full-revisionid": keywords["full"].strip(), + "dirty": False, "error": None, + "date": date} + # no suitable tags, so version is "0+unknown", but full hex is still there + if verbose: + print("no suitable tags, using unknown + full revision id") + return {"version": "0+unknown", + "full-revisionid": keywords["full"].strip(), + "dirty": False, "error": "no suitable tags", "date": None} + + +@register_vcs_handler("git", "pieces_from_vcs") +def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): + """Get version from 'git describe' in the root of the source tree. + + This only gets called if the git-archive 'subst' keywords were *not* + expanded, and _version.py hasn't already been rewritten with a short + version string, meaning we're inside a checked out source tree. 
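+
+    As a hypothetical illustration: a checkout two commits past tag "0.11"
+    with uncommitted changes makes 'git describe' print
+    "0.11-2-g1076c97-dirty", which the code below parses into
+    closest-tag "0.11", distance 2, short "1076c97", and dirty True.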
+ """ + GITS = ["git"] + if sys.platform == "win32": + GITS = ["git.cmd", "git.exe"] + + out, rc = run_command(GITS, ["rev-parse", "--git-dir"], cwd=root, + hide_stderr=True) + if rc != 0: + if verbose: + print("Directory %s not under git control" % root) + raise NotThisMethod("'git rev-parse --git-dir' returned error") + + # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] + # if there isn't one, this yields HEX[-dirty] (no NUM) + describe_out, rc = run_command(GITS, ["describe", "--tags", "--dirty", + "--always", "--long", + "--match", "%s*" % tag_prefix], + cwd=root) + # --long was added in git-1.5.5 + if describe_out is None: + raise NotThisMethod("'git describe' failed") + describe_out = describe_out.strip() + full_out, rc = run_command(GITS, ["rev-parse", "HEAD"], cwd=root) + if full_out is None: + raise NotThisMethod("'git rev-parse' failed") + full_out = full_out.strip() + + pieces = {} + pieces["long"] = full_out + pieces["short"] = full_out[:7] # maybe improved later + pieces["error"] = None + + # parse describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty] + # TAG might have hyphens. + git_describe = describe_out + + # look for -dirty suffix + dirty = git_describe.endswith("-dirty") + pieces["dirty"] = dirty + if dirty: + git_describe = git_describe[:git_describe.rindex("-dirty")] + + # now we have TAG-NUM-gHEX or HEX + + if "-" in git_describe: + # TAG-NUM-gHEX + mo = re.search(r'^(.+)-(\d+)-g([0-9a-f]+)$', git_describe) + if not mo: + # unparseable. Maybe git-describe is misbehaving? + pieces["error"] = ("unable to parse git-describe output: '%s'" + % describe_out) + return pieces + + # tag + full_tag = mo.group(1) + if not full_tag.startswith(tag_prefix): + if verbose: + fmt = "tag '%s' doesn't start with prefix '%s'" + print(fmt % (full_tag, tag_prefix)) + pieces["error"] = ("tag '%s' doesn't start with prefix '%s'" + % (full_tag, tag_prefix)) + return pieces + pieces["closest-tag"] = full_tag[len(tag_prefix):] + + # distance: number of commits since tag + pieces["distance"] = int(mo.group(2)) + + # commit: short hex revision ID + pieces["short"] = mo.group(3) + + else: + # HEX: no tags + pieces["closest-tag"] = None + count_out, rc = run_command(GITS, ["rev-list", "HEAD", "--count"], + cwd=root) + pieces["distance"] = int(count_out) # total number of commits + + # commit date: see ISO-8601 comment in git_versions_from_keywords() + date = run_command(GITS, ["show", "-s", "--format=%ci", "HEAD"], + cwd=root)[0].strip() + # Use only the last line. Previous lines may contain GPG signature + # information. + date = date.splitlines()[-1] + pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1) + + return pieces + + +def plus_or_dot(pieces): + """Return a + if we don't already have one, else return a .""" + if "+" in pieces.get("closest-tag", ""): + return "." + return "+" + + +def render_pep440(pieces): + """Build up version string, with post-release "local version identifier". + + Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you + get a tagged build and then dirty it, you'll get TAG+0.gHEX.dirty + + Exceptions: + 1: no tags. git_describe was just HEX. 
0+untagged.DISTANCE.gHEX[.dirty] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += plus_or_dot(pieces) + rendered += "%d.g%s" % (pieces["distance"], pieces["short"]) + if pieces["dirty"]: + rendered += ".dirty" + else: + # exception #1 + rendered = "0+untagged.%d.g%s" % (pieces["distance"], + pieces["short"]) + if pieces["dirty"]: + rendered += ".dirty" + return rendered + + +def render_pep440_pre(pieces): + """TAG[.post0.devDISTANCE] -- No -dirty. + + Exceptions: + 1: no tags. 0.post0.devDISTANCE + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"]: + rendered += ".post0.dev%d" % pieces["distance"] + else: + # exception #1 + rendered = "0.post0.dev%d" % pieces["distance"] + return rendered + + +def render_pep440_post(pieces): + """TAG[.postDISTANCE[.dev0]+gHEX] . + + The ".dev0" means dirty. Note that .dev0 sorts backwards + (a dirty tree will appear "older" than the corresponding clean one), + but you shouldn't be releasing software with -dirty anyways. + + Exceptions: + 1: no tags. 0.postDISTANCE[.dev0] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += ".post%d" % pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + rendered += plus_or_dot(pieces) + rendered += "g%s" % pieces["short"] + else: + # exception #1 + rendered = "0.post%d" % pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + rendered += "+g%s" % pieces["short"] + return rendered + + +def render_pep440_old(pieces): + """TAG[.postDISTANCE[.dev0]] . + + The ".dev0" means dirty. + + Exceptions: + 1: no tags. 0.postDISTANCE[.dev0] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += ".post%d" % pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + else: + # exception #1 + rendered = "0.post%d" % pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + return rendered + + +def render_git_describe(pieces): + """TAG[-DISTANCE-gHEX][-dirty]. + + Like 'git describe --tags --dirty --always'. + + Exceptions: + 1: no tags. HEX[-dirty] (note: no 'g' prefix) + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"]: + rendered += "-%d-g%s" % (pieces["distance"], pieces["short"]) + else: + # exception #1 + rendered = pieces["short"] + if pieces["dirty"]: + rendered += "-dirty" + return rendered + + +def render_git_describe_long(pieces): + """TAG-DISTANCE-gHEX[-dirty]. + + Like 'git describe --tags --dirty --always -long'. + The distance/hash is unconditional. + + Exceptions: + 1: no tags. 
HEX[-dirty] (note: no 'g' prefix) + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + rendered += "-%d-g%s" % (pieces["distance"], pieces["short"]) + else: + # exception #1 + rendered = pieces["short"] + if pieces["dirty"]: + rendered += "-dirty" + return rendered + + +def render(pieces, style): + """Render the given version pieces into the requested style.""" + if pieces["error"]: + return {"version": "unknown", + "full-revisionid": pieces.get("long"), + "dirty": None, + "error": pieces["error"], + "date": None} + + if not style or style == "default": + style = "pep440" # the default + + if style == "pep440": + rendered = render_pep440(pieces) + elif style == "pep440-pre": + rendered = render_pep440_pre(pieces) + elif style == "pep440-post": + rendered = render_pep440_post(pieces) + elif style == "pep440-old": + rendered = render_pep440_old(pieces) + elif style == "git-describe": + rendered = render_git_describe(pieces) + elif style == "git-describe-long": + rendered = render_git_describe_long(pieces) + else: + raise ValueError("unknown style '%s'" % style) + + return {"version": rendered, "full-revisionid": pieces["long"], + "dirty": pieces["dirty"], "error": None, + "date": pieces.get("date")} + + +def get_versions(): + """Get version information or return default if unable to do so.""" + # I am in _version.py, which lives at ROOT/VERSIONFILE_SOURCE. If we have + # __file__, we can work backwards from there to the root. Some + # py2exe/bbfreeze/non-CPython implementations don't do __file__, in which + # case we can only use expanded keywords. + + cfg = get_config() + verbose = cfg.verbose + + try: + return git_versions_from_keywords(get_keywords(), cfg.tag_prefix, + verbose) + except NotThisMethod: + pass + + try: + root = os.path.realpath(__file__) + # versionfile_source is the relative path from the top of the source + # tree (where the .git directory might live) to this file. Invert + # this to find the root from __file__. + for i in cfg.versionfile_source.split('/'): + root = os.path.dirname(root) + except NameError: + return {"version": "0+unknown", "full-revisionid": None, + "dirty": None, + "error": "unable to find root of source tree", + "date": None} + + try: + pieces = git_pieces_from_vcs(cfg.tag_prefix, root, verbose) + return render(pieces, cfg.style) + except NotThisMethod: + pass + + try: + if cfg.parentdir_prefix: + return versions_from_parentdir(cfg.parentdir_prefix, root, verbose) + except NotThisMethod: + pass + + return {"version": "0+unknown", "full-revisionid": None, + "dirty": None, + "error": "unable to compute version", "date": None} diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000000..a853949500 --- /dev/null +++ b/setup.cfg @@ -0,0 +1,13 @@ + +# See the docstring in versioneer.py for instructions. Note that you must +# re-run 'versioneer.py setup' after changing this section, and commit the +# resulting files. 
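+#
+# With VCS = git, style = pep440, and the empty tag_prefix set below, a
+# build at tag "0.12" reports version "0.12", while a dirty checkout two
+# commits past that tag would report something like "0.12+2.g1234abc.dirty"
+# (hash hypothetical).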
+ +[versioneer] +VCS = git +style = pep440 +versionfile_source = numba_dppy/_version.py +versionfile_build = numba_dppy/_version.py +tag_prefix = +#parentdir_prefix = + diff --git a/setup.py b/setup.py index 2a7efb7205..37ad0bfc68 100644 --- a/setup.py +++ b/setup.py @@ -2,6 +2,8 @@ from setuptools import Extension, find_packages, setup from Cython.Build import cythonize +import versioneer + def get_ext_modules(): ext_modules = [] @@ -41,7 +43,7 @@ def get_ext_modules(): metadata = dict( name="numba-dppy", - version="0.0.1", + version=versioneer.get_version(), description="Numba extension for Intel CPU and GPU backend", url="https://github.com/IntelPython/numba-dppy", packages=packages, @@ -61,6 +63,7 @@ def get_ext_modules(): "Programming Language :: Python :: Implementation :: CPython", "Topic :: Software Development :: Compilers", ], + cmdclass=versioneer.get_cmdclass(), ) setup(**metadata) diff --git a/versioneer.py b/versioneer.py new file mode 100644 index 0000000000..1040c21892 --- /dev/null +++ b/versioneer.py @@ -0,0 +1,1855 @@ + +# Version: 0.19 + +"""The Versioneer - like a rocketeer, but for versions. + +The Versioneer +============== + +* like a rocketeer, but for versions! +* https://github.com/python-versioneer/python-versioneer +* Brian Warner +* License: Public Domain +* Compatible with: Python 3.6, 3.7, 3.8, 3.9 and pypy3 +* [![Latest Version][pypi-image]][pypi-url] +* [![Build Status][travis-image]][travis-url] + +This is a tool for managing a recorded version number in distutils-based +python projects. The goal is to remove the tedious and error-prone "update +the embedded version string" step from your release process. Making a new +release should be as easy as recording a new tag in your version-control +system, and maybe making new tarballs. + + +## Quick Install + +* `pip install versioneer` to somewhere in your $PATH +* add a `[versioneer]` section to your setup.cfg (see [Install](INSTALL.md)) +* run `versioneer install` in your source tree, commit the results +* Verify version information with `python setup.py version` + +## Version Identifiers + +Source trees come from a variety of places: + +* a version-control system checkout (mostly used by developers) +* a nightly tarball, produced by build automation +* a snapshot tarball, produced by a web-based VCS browser, like github's + "tarball from tag" feature +* a release tarball, produced by "setup.py sdist", distributed through PyPI + +Within each source tree, the version identifier (either a string or a number, +this tool is format-agnostic) can come from a variety of places: + +* ask the VCS tool itself, e.g. "git describe" (for checkouts), which knows + about recent "tags" and an absolute revision-id +* the name of the directory into which the tarball was unpacked +* an expanded VCS keyword ($Id$, etc) +* a `_version.py` created by some earlier build step + +For released software, the version identifier is closely related to a VCS +tag. Some projects use tag names that include more than just the version +string (e.g. "myproject-1.2" instead of just "1.2"), in which case the tool +needs to strip the tag prefix to extract the version identifier. For +unreleased software (between tags), the version identifier should provide +enough information to help developers recreate the same tree, while also +giving them an idea of roughly how old the tree is (after version 1.2, before +version 1.3). 
Many VCS systems can report a description that captures this, +for example `git describe --tags --dirty --always` reports things like +"0.7-1-g574ab98-dirty" to indicate that the checkout is one revision past the +0.7 tag, has a unique revision id of "574ab98", and is "dirty" (it has +uncommitted changes). + +The version identifier is used for multiple purposes: + +* to allow the module to self-identify its version: `myproject.__version__` +* to choose a name and prefix for a 'setup.py sdist' tarball + +## Theory of Operation + +Versioneer works by adding a special `_version.py` file into your source +tree, where your `__init__.py` can import it. This `_version.py` knows how to +dynamically ask the VCS tool for version information at import time. + +`_version.py` also contains `$Revision$` markers, and the installation +process marks `_version.py` to have this marker rewritten with a tag name +during the `git archive` command. As a result, generated tarballs will +contain enough information to get the proper version. + +To allow `setup.py` to compute a version too, a `versioneer.py` is added to +the top level of your source tree, next to `setup.py` and the `setup.cfg` +that configures it. This overrides several distutils/setuptools commands to +compute the version when invoked, and changes `setup.py build` and `setup.py +sdist` to replace `_version.py` with a small static file that contains just +the generated version data. + +## Installation + +See [INSTALL.md](./INSTALL.md) for detailed installation instructions. + +## Version-String Flavors + +Code which uses Versioneer can learn about its version string at runtime by +importing `_version` from your main `__init__.py` file and running the +`get_versions()` function. From the "outside" (e.g. in `setup.py`), you can +import the top-level `versioneer.py` and run `get_versions()`. + +Both functions return a dictionary with different flavors of version +information: + +* `['version']`: A condensed version string, rendered using the selected + style. This is the most commonly used value for the project's version + string. The default "pep440" style yields strings like `0.11`, + `0.11+2.g1076c97`, or `0.11+2.g1076c97.dirty`. See the "Styles" section + below for alternative styles. + +* `['full-revisionid']`: detailed revision identifier. For Git, this is the + full SHA1 commit id, e.g. "1076c978a8d3cfc70f408fe5974aa6c092c949ac". + +* `['date']`: Date and time of the latest `HEAD` commit. For Git, it is the + commit date in ISO 8601 format. This will be None if the date is not + available. + +* `['dirty']`: a boolean, True if the tree has uncommitted changes. Note that + this is only accurate if run in a VCS checkout, otherwise it is likely to + be False or None + +* `['error']`: if the version string could not be computed, this will be set + to a string describing the problem, otherwise it will be None. It may be + useful to throw an exception in setup.py if this is set, to avoid e.g. + creating tarballs with a version string of "unknown". + +Some variants are more useful than others. Including `full-revisionid` in a +bug report should allow developers to reconstruct the exact code being tested +(or indicate the presence of local changes that should be shared with the +developers). `version` is suitable for display in an "about" box or a CLI +`--version` output: it can be easily compared against release notes and lists +of bugs fixed in various releases. 
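+
+As a hypothetical sketch, the "outside" query from `setup.py` looks like:
+
+    import versioneer
+    info = versioneer.get_versions()
+    # e.g. {'version': '0.11+2.g1076c97.dirty',
+    #       'full-revisionid': '1076c978a8d3cfc70f408fe5974aa6c092c949ac',
+    #       'dirty': True, 'error': None,
+    #       'date': '2020-11-17T17:48:37+0300'}
+    # (values invented for the example)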
+ +The installer adds the following text to your `__init__.py` to place a basic +version in `YOURPROJECT.__version__`: + + from ._version import get_versions + __version__ = get_versions()['version'] + del get_versions + +## Styles + +The setup.cfg `style=` configuration controls how the VCS information is +rendered into a version string. + +The default style, "pep440", produces a PEP440-compliant string, equal to the +un-prefixed tag name for actual releases, and containing an additional "local +version" section with more detail for in-between builds. For Git, this is +TAG[+DISTANCE.gHEX[.dirty]] , using information from `git describe --tags +--dirty --always`. For example "0.11+2.g1076c97.dirty" indicates that the +tree is like the "1076c97" commit but has uncommitted changes (".dirty"), and +that this commit is two revisions ("+2") beyond the "0.11" tag. For released +software (exactly equal to a known tag), the identifier will only contain the +stripped tag, e.g. "0.11". + +Other styles are available. See [details.md](details.md) in the Versioneer +source tree for descriptions. + +## Debugging + +Versioneer tries to avoid fatal errors: if something goes wrong, it will tend +to return a version of "0+unknown". To investigate the problem, run `setup.py +version`, which will run the version-lookup code in a verbose mode, and will +display the full contents of `get_versions()` (including the `error` string, +which may help identify what went wrong). + +## Known Limitations + +Some situations are known to cause problems for Versioneer. This details the +most significant ones. More can be found on Github +[issues page](https://github.com/python-versioneer/python-versioneer/issues). + +### Subprojects + +Versioneer has limited support for source trees in which `setup.py` is not in +the root directory (e.g. `setup.py` and `.git/` are *not* siblings). The are +two common reasons why `setup.py` might not be in the root: + +* Source trees which contain multiple subprojects, such as + [Buildbot](https://github.com/buildbot/buildbot), which contains both + "master" and "slave" subprojects, each with their own `setup.py`, + `setup.cfg`, and `tox.ini`. Projects like these produce multiple PyPI + distributions (and upload multiple independently-installable tarballs). +* Source trees whose main purpose is to contain a C library, but which also + provide bindings to Python (and perhaps other languages) in subdirectories. + +Versioneer will look for `.git` in parent directories, and most operations +should get the right version string. However `pip` and `setuptools` have bugs +and implementation details which frequently cause `pip install .` from a +subproject directory to fail to find a correct version string (so it usually +defaults to `0+unknown`). + +`pip install --editable .` should work correctly. `setup.py install` might +work too. + +Pip-8.1.1 is known to have this problem, but hopefully it will get fixed in +some later version. + +[Bug #38](https://github.com/python-versioneer/python-versioneer/issues/38) is tracking +this issue. The discussion in +[PR #61](https://github.com/python-versioneer/python-versioneer/pull/61) describes the +issue from the Versioneer side in more detail. +[pip PR#3176](https://github.com/pypa/pip/pull/3176) and +[pip PR#3615](https://github.com/pypa/pip/pull/3615) contain work to improve +pip to let Versioneer work correctly. 
+ +Versioneer-0.16 and earlier only looked for a `.git` directory next to the +`setup.cfg`, so subprojects were completely unsupported with those releases. + +### Editable installs with setuptools <= 18.5 + +`setup.py develop` and `pip install --editable .` allow you to install a +project into a virtualenv once, then continue editing the source code (and +test) without re-installing after every change. + +"Entry-point scripts" (`setup(entry_points={"console_scripts": ..})`) are a +convenient way to specify executable scripts that should be installed along +with the python package. + +These both work as expected when using modern setuptools. When using +setuptools-18.5 or earlier, however, certain operations will cause +`pkg_resources.DistributionNotFound` errors when running the entrypoint +script, which must be resolved by re-installing the package. This happens +when the install happens with one version, then the egg_info data is +regenerated while a different version is checked out. Many setup.py commands +cause egg_info to be rebuilt (including `sdist`, `wheel`, and installing into +a different virtualenv), so this can be surprising. + +[Bug #83](https://github.com/python-versioneer/python-versioneer/issues/83) describes +this one, but upgrading to a newer version of setuptools should probably +resolve it. + + +## Updating Versioneer + +To upgrade your project to a new release of Versioneer, do the following: + +* install the new Versioneer (`pip install -U versioneer` or equivalent) +* edit `setup.cfg`, if necessary, to include any new configuration settings + indicated by the release notes. See [UPGRADING](./UPGRADING.md) for details. +* re-run `versioneer install` in your source tree, to replace + `SRC/_version.py` +* commit any changed files + +## Future Directions + +This tool is designed to make it easily extended to other version-control +systems: all VCS-specific components are in separate directories like +src/git/ . The top-level `versioneer.py` script is assembled from these +components by running make-versioneer.py . In the future, make-versioneer.py +will take a VCS name as an argument, and will construct a version of +`versioneer.py` that is specific to the given VCS. It might also take the +configuration arguments that are currently provided manually during +installation by editing setup.py . Alternatively, it might go the other +direction and include code from all supported VCS systems, reducing the +number of intermediate scripts. + +## Similar projects + +* [setuptools_scm](https://github.com/pypa/setuptools_scm/) - a non-vendored build-time + dependency +* [minver](https://github.com/jbweston/miniver) - a lightweight reimplementation of + versioneer + +## License + +To make Versioneer easier to embed, all its code is dedicated to the public +domain. The `_version.py` that it creates is also in the public domain. +Specifically, both are released under the Creative Commons "Public Domain +Dedication" license (CC0-1.0), as described in +https://creativecommons.org/publicdomain/zero/1.0/ . 
+ +[pypi-image]: https://img.shields.io/pypi/v/versioneer.svg +[pypi-url]: https://pypi.python.org/pypi/versioneer/ +[travis-image]: +https://img.shields.io/travis/com/python-versioneer/python-versioneer.svg +[travis-url]: https://travis-ci.com/github/python-versioneer/python-versioneer + +""" + +import configparser +import errno +import json +import os +import re +import subprocess +import sys + + +class VersioneerConfig: + """Container for Versioneer configuration parameters.""" + + +def get_root(): + """Get the project root directory. + + We require that all commands are run from the project root, i.e. the + directory that contains setup.py, setup.cfg, and versioneer.py . + """ + root = os.path.realpath(os.path.abspath(os.getcwd())) + setup_py = os.path.join(root, "setup.py") + versioneer_py = os.path.join(root, "versioneer.py") + if not (os.path.exists(setup_py) or os.path.exists(versioneer_py)): + # allow 'python path/to/setup.py COMMAND' + root = os.path.dirname(os.path.realpath(os.path.abspath(sys.argv[0]))) + setup_py = os.path.join(root, "setup.py") + versioneer_py = os.path.join(root, "versioneer.py") + if not (os.path.exists(setup_py) or os.path.exists(versioneer_py)): + err = ("Versioneer was unable to run the project root directory. " + "Versioneer requires setup.py to be executed from " + "its immediate directory (like 'python setup.py COMMAND'), " + "or in a way that lets it use sys.argv[0] to find the root " + "(like 'python path/to/setup.py COMMAND').") + raise VersioneerBadRootError(err) + try: + # Certain runtime workflows (setup.py install/develop in a setuptools + # tree) execute all dependencies in a single python process, so + # "versioneer" may be imported multiple times, and python's shared + # module-import table will cache the first one. So we can't use + # os.path.dirname(__file__), as that will find whichever + # versioneer.py was first imported, even in later projects. + me = os.path.realpath(os.path.abspath(__file__)) + me_dir = os.path.normcase(os.path.splitext(me)[0]) + vsr_dir = os.path.normcase(os.path.splitext(versioneer_py)[0]) + if me_dir != vsr_dir: + print("Warning: build in %s is using versioneer.py from %s" + % (os.path.dirname(me), versioneer_py)) + except NameError: + pass + return root + + +def get_config_from_root(root): + """Read the project setup.cfg file to determine Versioneer config.""" + # This might raise EnvironmentError (if setup.cfg is missing), or + # configparser.NoSectionError (if it lacks a [versioneer] section), or + # configparser.NoOptionError (if it lacks "VCS="). See the docstring at + # the top of versioneer.py for instructions on writing your setup.cfg . 
+ setup_cfg = os.path.join(root, "setup.cfg") + parser = configparser.ConfigParser() + with open(setup_cfg, "r") as f: + parser.read_file(f) + VCS = parser.get("versioneer", "VCS") # mandatory + + def get(parser, name): + if parser.has_option("versioneer", name): + return parser.get("versioneer", name) + return None + cfg = VersioneerConfig() + cfg.VCS = VCS + cfg.style = get(parser, "style") or "" + cfg.versionfile_source = get(parser, "versionfile_source") + cfg.versionfile_build = get(parser, "versionfile_build") + cfg.tag_prefix = get(parser, "tag_prefix") + if cfg.tag_prefix in ("''", '""'): + cfg.tag_prefix = "" + cfg.parentdir_prefix = get(parser, "parentdir_prefix") + cfg.verbose = get(parser, "verbose") + return cfg + + +class NotThisMethod(Exception): + """Exception raised if a method is not valid for the current scenario.""" + + +# these dictionaries contain VCS-specific tools +LONG_VERSION_PY = {} +HANDLERS = {} + + +def register_vcs_handler(vcs, method): # decorator + """Create decorator to mark a method as the handler of a VCS.""" + def decorate(f): + """Store f in HANDLERS[vcs][method].""" + if vcs not in HANDLERS: + HANDLERS[vcs] = {} + HANDLERS[vcs][method] = f + return f + return decorate + + +def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, + env=None): + """Call the given command(s).""" + assert isinstance(commands, list) + p = None + for c in commands: + try: + dispcmd = str([c] + args) + # remember shell=False, so use git.cmd on windows, not just git + p = subprocess.Popen([c] + args, cwd=cwd, env=env, + stdout=subprocess.PIPE, + stderr=(subprocess.PIPE if hide_stderr + else None)) + break + except EnvironmentError: + e = sys.exc_info()[1] + if e.errno == errno.ENOENT: + continue + if verbose: + print("unable to run %s" % dispcmd) + print(e) + return None, None + else: + if verbose: + print("unable to find command, tried %s" % (commands,)) + return None, None + stdout = p.communicate()[0].strip().decode() + if p.returncode != 0: + if verbose: + print("unable to run %s (error)" % dispcmd) + print("stdout was %s" % stdout) + return None, p.returncode + return stdout, p.returncode + + +LONG_VERSION_PY['git'] = r''' +# This file helps to compute a version number in source trees obtained from +# git-archive tarball (such as those provided by githubs download-from-tag +# feature). Distribution tarballs (built by setup.py sdist) and build +# directories (produced by setup.py build) will contain a much shorter file +# that just contains the computed version number. + +# This file is released into the public domain. Generated by +# versioneer-0.19 (https://github.com/python-versioneer/python-versioneer) + +"""Git implementation of _version.py.""" + +import errno +import os +import re +import subprocess +import sys + + +def get_keywords(): + """Get the keywords needed to look up the version information.""" + # these strings will be replaced by git during git-archive. + # setup.py/versioneer.py will grep for the variable names, so they must + # each be defined on a line of their own. _version.py will just call + # get_keywords(). 
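+    # NB: this file is rendered from a %%-substituted template in
+    # versioneer.py, where a literal '%%' is doubled and '$' is spelled
+    # %%(DOLLAR)s; after substitution the three lines below carry the same
+    # $Format$ keywords as numba_dppy/_version.py above.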
+ git_refnames = "%(DOLLAR)sFormat:%%d%(DOLLAR)s" + git_full = "%(DOLLAR)sFormat:%%H%(DOLLAR)s" + git_date = "%(DOLLAR)sFormat:%%ci%(DOLLAR)s" + keywords = {"refnames": git_refnames, "full": git_full, "date": git_date} + return keywords + + +class VersioneerConfig: + """Container for Versioneer configuration parameters.""" + + +def get_config(): + """Create, populate and return the VersioneerConfig() object.""" + # these strings are filled in when 'setup.py versioneer' creates + # _version.py + cfg = VersioneerConfig() + cfg.VCS = "git" + cfg.style = "%(STYLE)s" + cfg.tag_prefix = "%(TAG_PREFIX)s" + cfg.parentdir_prefix = "%(PARENTDIR_PREFIX)s" + cfg.versionfile_source = "%(VERSIONFILE_SOURCE)s" + cfg.verbose = False + return cfg + + +class NotThisMethod(Exception): + """Exception raised if a method is not valid for the current scenario.""" + + +LONG_VERSION_PY = {} +HANDLERS = {} + + +def register_vcs_handler(vcs, method): # decorator + """Create decorator to mark a method as the handler of a VCS.""" + def decorate(f): + """Store f in HANDLERS[vcs][method].""" + if vcs not in HANDLERS: + HANDLERS[vcs] = {} + HANDLERS[vcs][method] = f + return f + return decorate + + +def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, + env=None): + """Call the given command(s).""" + assert isinstance(commands, list) + p = None + for c in commands: + try: + dispcmd = str([c] + args) + # remember shell=False, so use git.cmd on windows, not just git + p = subprocess.Popen([c] + args, cwd=cwd, env=env, + stdout=subprocess.PIPE, + stderr=(subprocess.PIPE if hide_stderr + else None)) + break + except EnvironmentError: + e = sys.exc_info()[1] + if e.errno == errno.ENOENT: + continue + if verbose: + print("unable to run %%s" %% dispcmd) + print(e) + return None, None + else: + if verbose: + print("unable to find command, tried %%s" %% (commands,)) + return None, None + stdout = p.communicate()[0].strip().decode() + if p.returncode != 0: + if verbose: + print("unable to run %%s (error)" %% dispcmd) + print("stdout was %%s" %% stdout) + return None, p.returncode + return stdout, p.returncode + + +def versions_from_parentdir(parentdir_prefix, root, verbose): + """Try to determine the version from the parent directory name. + + Source tarballs conventionally unpack into a directory that includes both + the project name and a version string. We will also support searching up + two directory levels for an appropriately named parent directory + """ + rootdirs = [] + + for i in range(3): + dirname = os.path.basename(root) + if dirname.startswith(parentdir_prefix): + return {"version": dirname[len(parentdir_prefix):], + "full-revisionid": None, + "dirty": False, "error": None, "date": None} + else: + rootdirs.append(root) + root = os.path.dirname(root) # up a level + + if verbose: + print("Tried directories %%s but none started with prefix %%s" %% + (str(rootdirs), parentdir_prefix)) + raise NotThisMethod("rootdir doesn't start with parentdir_prefix") + + +@register_vcs_handler("git", "get_keywords") +def git_get_keywords(versionfile_abs): + """Extract version information from the given file.""" + # the code embedded in _version.py can just fetch the value of these + # keywords. When used from setup.py, we don't want to import _version.py, + # so we do it with a regexp instead. This function is not used from + # _version.py. 
+ keywords = {} + try: + f = open(versionfile_abs, "r") + for line in f.readlines(): + if line.strip().startswith("git_refnames ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["refnames"] = mo.group(1) + if line.strip().startswith("git_full ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["full"] = mo.group(1) + if line.strip().startswith("git_date ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["date"] = mo.group(1) + f.close() + except EnvironmentError: + pass + return keywords + + +@register_vcs_handler("git", "keywords") +def git_versions_from_keywords(keywords, tag_prefix, verbose): + """Get version information from git keywords.""" + if not keywords: + raise NotThisMethod("no keywords at all, weird") + date = keywords.get("date") + if date is not None: + # Use only the last line. Previous lines may contain GPG signature + # information. + date = date.splitlines()[-1] + + # git-2.2.0 added "%%cI", which expands to an ISO-8601 -compliant + # datestamp. However we prefer "%%ci" (which expands to an "ISO-8601 + # -like" string, which we must then edit to make compliant), because + # it's been around since git-1.5.3, and it's too difficult to + # discover which version we're using, or to work around using an + # older one. + date = date.strip().replace(" ", "T", 1).replace(" ", "", 1) + refnames = keywords["refnames"].strip() + if refnames.startswith("$Format"): + if verbose: + print("keywords are unexpanded, not using") + raise NotThisMethod("unexpanded keywords, not a git-archive tarball") + refs = set([r.strip() for r in refnames.strip("()").split(",")]) + # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of + # just "foo-1.0". If we see a "tag: " prefix, prefer those. + TAG = "tag: " + tags = set([r[len(TAG):] for r in refs if r.startswith(TAG)]) + if not tags: + # Either we're using git < 1.8.3, or there really are no tags. We use + # a heuristic: assume all version tags have a digit. The old git %%d + # expansion behaves like git log --decorate=short and strips out the + # refs/heads/ and refs/tags/ prefixes that would let us distinguish + # between branches and tags. By ignoring refnames without digits, we + # filter out many common branch names like "release" and + # "stabilization", as well as "HEAD" and "master". + tags = set([r for r in refs if re.search(r'\d', r)]) + if verbose: + print("discarding '%%s', no digits" %% ",".join(refs - tags)) + if verbose: + print("likely tags: %%s" %% ",".join(sorted(tags))) + for ref in sorted(tags): + # sorting will prefer e.g. "2.0" over "2.0rc1" + if ref.startswith(tag_prefix): + r = ref[len(tag_prefix):] + if verbose: + print("picking %%s" %% r) + return {"version": r, + "full-revisionid": keywords["full"].strip(), + "dirty": False, "error": None, + "date": date} + # no suitable tags, so version is "0+unknown", but full hex is still there + if verbose: + print("no suitable tags, using unknown + full revision id") + return {"version": "0+unknown", + "full-revisionid": keywords["full"].strip(), + "dirty": False, "error": "no suitable tags", "date": None} + + +@register_vcs_handler("git", "pieces_from_vcs") +def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): + """Get version from 'git describe' in the root of the source tree. + + This only gets called if the git-archive 'subst' keywords were *not* + expanded, and _version.py hasn't already been rewritten with a short + version string, meaning we're inside a checked out source tree. 
+ """ + GITS = ["git"] + if sys.platform == "win32": + GITS = ["git.cmd", "git.exe"] + + out, rc = run_command(GITS, ["rev-parse", "--git-dir"], cwd=root, + hide_stderr=True) + if rc != 0: + if verbose: + print("Directory %%s not under git control" %% root) + raise NotThisMethod("'git rev-parse --git-dir' returned error") + + # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] + # if there isn't one, this yields HEX[-dirty] (no NUM) + describe_out, rc = run_command(GITS, ["describe", "--tags", "--dirty", + "--always", "--long", + "--match", "%%s*" %% tag_prefix], + cwd=root) + # --long was added in git-1.5.5 + if describe_out is None: + raise NotThisMethod("'git describe' failed") + describe_out = describe_out.strip() + full_out, rc = run_command(GITS, ["rev-parse", "HEAD"], cwd=root) + if full_out is None: + raise NotThisMethod("'git rev-parse' failed") + full_out = full_out.strip() + + pieces = {} + pieces["long"] = full_out + pieces["short"] = full_out[:7] # maybe improved later + pieces["error"] = None + + # parse describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty] + # TAG might have hyphens. + git_describe = describe_out + + # look for -dirty suffix + dirty = git_describe.endswith("-dirty") + pieces["dirty"] = dirty + if dirty: + git_describe = git_describe[:git_describe.rindex("-dirty")] + + # now we have TAG-NUM-gHEX or HEX + + if "-" in git_describe: + # TAG-NUM-gHEX + mo = re.search(r'^(.+)-(\d+)-g([0-9a-f]+)$', git_describe) + if not mo: + # unparseable. Maybe git-describe is misbehaving? + pieces["error"] = ("unable to parse git-describe output: '%%s'" + %% describe_out) + return pieces + + # tag + full_tag = mo.group(1) + if not full_tag.startswith(tag_prefix): + if verbose: + fmt = "tag '%%s' doesn't start with prefix '%%s'" + print(fmt %% (full_tag, tag_prefix)) + pieces["error"] = ("tag '%%s' doesn't start with prefix '%%s'" + %% (full_tag, tag_prefix)) + return pieces + pieces["closest-tag"] = full_tag[len(tag_prefix):] + + # distance: number of commits since tag + pieces["distance"] = int(mo.group(2)) + + # commit: short hex revision ID + pieces["short"] = mo.group(3) + + else: + # HEX: no tags + pieces["closest-tag"] = None + count_out, rc = run_command(GITS, ["rev-list", "HEAD", "--count"], + cwd=root) + pieces["distance"] = int(count_out) # total number of commits + + # commit date: see ISO-8601 comment in git_versions_from_keywords() + date = run_command(GITS, ["show", "-s", "--format=%%ci", "HEAD"], + cwd=root)[0].strip() + # Use only the last line. Previous lines may contain GPG signature + # information. + date = date.splitlines()[-1] + pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1) + + return pieces + + +def plus_or_dot(pieces): + """Return a + if we don't already have one, else return a .""" + if "+" in pieces.get("closest-tag", ""): + return "." + return "+" + + +def render_pep440(pieces): + """Build up version string, with post-release "local version identifier". + + Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you + get a tagged build and then dirty it, you'll get TAG+0.gHEX.dirty + + Exceptions: + 1: no tags. git_describe was just HEX. 
0+untagged.DISTANCE.gHEX[.dirty] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += plus_or_dot(pieces) + rendered += "%%d.g%%s" %% (pieces["distance"], pieces["short"]) + if pieces["dirty"]: + rendered += ".dirty" + else: + # exception #1 + rendered = "0+untagged.%%d.g%%s" %% (pieces["distance"], + pieces["short"]) + if pieces["dirty"]: + rendered += ".dirty" + return rendered + + +def render_pep440_pre(pieces): + """TAG[.post0.devDISTANCE] -- No -dirty. + + Exceptions: + 1: no tags. 0.post0.devDISTANCE + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"]: + rendered += ".post0.dev%%d" %% pieces["distance"] + else: + # exception #1 + rendered = "0.post0.dev%%d" %% pieces["distance"] + return rendered + + +def render_pep440_post(pieces): + """TAG[.postDISTANCE[.dev0]+gHEX] . + + The ".dev0" means dirty. Note that .dev0 sorts backwards + (a dirty tree will appear "older" than the corresponding clean one), + but you shouldn't be releasing software with -dirty anyways. + + Exceptions: + 1: no tags. 0.postDISTANCE[.dev0] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += ".post%%d" %% pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + rendered += plus_or_dot(pieces) + rendered += "g%%s" %% pieces["short"] + else: + # exception #1 + rendered = "0.post%%d" %% pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + rendered += "+g%%s" %% pieces["short"] + return rendered + + +def render_pep440_old(pieces): + """TAG[.postDISTANCE[.dev0]] . + + The ".dev0" means dirty. + + Exceptions: + 1: no tags. 0.postDISTANCE[.dev0] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += ".post%%d" %% pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + else: + # exception #1 + rendered = "0.post%%d" %% pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + return rendered + + +def render_git_describe(pieces): + """TAG[-DISTANCE-gHEX][-dirty]. + + Like 'git describe --tags --dirty --always'. + + Exceptions: + 1: no tags. HEX[-dirty] (note: no 'g' prefix) + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"]: + rendered += "-%%d-g%%s" %% (pieces["distance"], pieces["short"]) + else: + # exception #1 + rendered = pieces["short"] + if pieces["dirty"]: + rendered += "-dirty" + return rendered + + +def render_git_describe_long(pieces): + """TAG-DISTANCE-gHEX[-dirty]. + + Like 'git describe --tags --dirty --always -long'. + The distance/hash is unconditional. + + Exceptions: + 1: no tags. 
HEX[-dirty] (note: no 'g' prefix) + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + rendered += "-%%d-g%%s" %% (pieces["distance"], pieces["short"]) + else: + # exception #1 + rendered = pieces["short"] + if pieces["dirty"]: + rendered += "-dirty" + return rendered + + +def render(pieces, style): + """Render the given version pieces into the requested style.""" + if pieces["error"]: + return {"version": "unknown", + "full-revisionid": pieces.get("long"), + "dirty": None, + "error": pieces["error"], + "date": None} + + if not style or style == "default": + style = "pep440" # the default + + if style == "pep440": + rendered = render_pep440(pieces) + elif style == "pep440-pre": + rendered = render_pep440_pre(pieces) + elif style == "pep440-post": + rendered = render_pep440_post(pieces) + elif style == "pep440-old": + rendered = render_pep440_old(pieces) + elif style == "git-describe": + rendered = render_git_describe(pieces) + elif style == "git-describe-long": + rendered = render_git_describe_long(pieces) + else: + raise ValueError("unknown style '%%s'" %% style) + + return {"version": rendered, "full-revisionid": pieces["long"], + "dirty": pieces["dirty"], "error": None, + "date": pieces.get("date")} + + +def get_versions(): + """Get version information or return default if unable to do so.""" + # I am in _version.py, which lives at ROOT/VERSIONFILE_SOURCE. If we have + # __file__, we can work backwards from there to the root. Some + # py2exe/bbfreeze/non-CPython implementations don't do __file__, in which + # case we can only use expanded keywords. + + cfg = get_config() + verbose = cfg.verbose + + try: + return git_versions_from_keywords(get_keywords(), cfg.tag_prefix, + verbose) + except NotThisMethod: + pass + + try: + root = os.path.realpath(__file__) + # versionfile_source is the relative path from the top of the source + # tree (where the .git directory might live) to this file. Invert + # this to find the root from __file__. + for i in cfg.versionfile_source.split('/'): + root = os.path.dirname(root) + except NameError: + return {"version": "0+unknown", "full-revisionid": None, + "dirty": None, + "error": "unable to find root of source tree", + "date": None} + + try: + pieces = git_pieces_from_vcs(cfg.tag_prefix, root, verbose) + return render(pieces, cfg.style) + except NotThisMethod: + pass + + try: + if cfg.parentdir_prefix: + return versions_from_parentdir(cfg.parentdir_prefix, root, verbose) + except NotThisMethod: + pass + + return {"version": "0+unknown", "full-revisionid": None, + "dirty": None, + "error": "unable to compute version", "date": None} +''' + + +@register_vcs_handler("git", "get_keywords") +def git_get_keywords(versionfile_abs): + """Extract version information from the given file.""" + # the code embedded in _version.py can just fetch the value of these + # keywords. When used from setup.py, we don't want to import _version.py, + # so we do it with a regexp instead. This function is not used from + # _version.py. 
+ keywords = {} + try: + f = open(versionfile_abs, "r") + for line in f.readlines(): + if line.strip().startswith("git_refnames ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["refnames"] = mo.group(1) + if line.strip().startswith("git_full ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["full"] = mo.group(1) + if line.strip().startswith("git_date ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["date"] = mo.group(1) + f.close() + except EnvironmentError: + pass + return keywords + + +@register_vcs_handler("git", "keywords") +def git_versions_from_keywords(keywords, tag_prefix, verbose): + """Get version information from git keywords.""" + if not keywords: + raise NotThisMethod("no keywords at all, weird") + date = keywords.get("date") + if date is not None: + # Use only the last line. Previous lines may contain GPG signature + # information. + date = date.splitlines()[-1] + + # git-2.2.0 added "%cI", which expands to an ISO-8601 -compliant + # datestamp. However we prefer "%ci" (which expands to an "ISO-8601 + # -like" string, which we must then edit to make compliant), because + # it's been around since git-1.5.3, and it's too difficult to + # discover which version we're using, or to work around using an + # older one. + date = date.strip().replace(" ", "T", 1).replace(" ", "", 1) + refnames = keywords["refnames"].strip() + if refnames.startswith("$Format"): + if verbose: + print("keywords are unexpanded, not using") + raise NotThisMethod("unexpanded keywords, not a git-archive tarball") + refs = set([r.strip() for r in refnames.strip("()").split(",")]) + # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of + # just "foo-1.0". If we see a "tag: " prefix, prefer those. + TAG = "tag: " + tags = set([r[len(TAG):] for r in refs if r.startswith(TAG)]) + if not tags: + # Either we're using git < 1.8.3, or there really are no tags. We use + # a heuristic: assume all version tags have a digit. The old git %d + # expansion behaves like git log --decorate=short and strips out the + # refs/heads/ and refs/tags/ prefixes that would let us distinguish + # between branches and tags. By ignoring refnames without digits, we + # filter out many common branch names like "release" and + # "stabilization", as well as "HEAD" and "master". + tags = set([r for r in refs if re.search(r'\d', r)]) + if verbose: + print("discarding '%s', no digits" % ",".join(refs - tags)) + if verbose: + print("likely tags: %s" % ",".join(sorted(tags))) + for ref in sorted(tags): + # sorting will prefer e.g. "2.0" over "2.0rc1" + if ref.startswith(tag_prefix): + r = ref[len(tag_prefix):] + if verbose: + print("picking %s" % r) + return {"version": r, + "full-revisionid": keywords["full"].strip(), + "dirty": False, "error": None, + "date": date} + # no suitable tags, so version is "0+unknown", but full hex is still there + if verbose: + print("no suitable tags, using unknown + full revision id") + return {"version": "0+unknown", + "full-revisionid": keywords["full"].strip(), + "dirty": False, "error": "no suitable tags", "date": None} + + +@register_vcs_handler("git", "pieces_from_vcs") +def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): + """Get version from 'git describe' in the root of the source tree. + + This only gets called if the git-archive 'subst' keywords were *not* + expanded, and _version.py hasn't already been rewritten with a short + version string, meaning we're inside a checked out source tree. 
+ """ + GITS = ["git"] + if sys.platform == "win32": + GITS = ["git.cmd", "git.exe"] + + out, rc = run_command(GITS, ["rev-parse", "--git-dir"], cwd=root, + hide_stderr=True) + if rc != 0: + if verbose: + print("Directory %s not under git control" % root) + raise NotThisMethod("'git rev-parse --git-dir' returned error") + + # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] + # if there isn't one, this yields HEX[-dirty] (no NUM) + describe_out, rc = run_command(GITS, ["describe", "--tags", "--dirty", + "--always", "--long", + "--match", "%s*" % tag_prefix], + cwd=root) + # --long was added in git-1.5.5 + if describe_out is None: + raise NotThisMethod("'git describe' failed") + describe_out = describe_out.strip() + full_out, rc = run_command(GITS, ["rev-parse", "HEAD"], cwd=root) + if full_out is None: + raise NotThisMethod("'git rev-parse' failed") + full_out = full_out.strip() + + pieces = {} + pieces["long"] = full_out + pieces["short"] = full_out[:7] # maybe improved later + pieces["error"] = None + + # parse describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty] + # TAG might have hyphens. + git_describe = describe_out + + # look for -dirty suffix + dirty = git_describe.endswith("-dirty") + pieces["dirty"] = dirty + if dirty: + git_describe = git_describe[:git_describe.rindex("-dirty")] + + # now we have TAG-NUM-gHEX or HEX + + if "-" in git_describe: + # TAG-NUM-gHEX + mo = re.search(r'^(.+)-(\d+)-g([0-9a-f]+)$', git_describe) + if not mo: + # unparseable. Maybe git-describe is misbehaving? + pieces["error"] = ("unable to parse git-describe output: '%s'" + % describe_out) + return pieces + + # tag + full_tag = mo.group(1) + if not full_tag.startswith(tag_prefix): + if verbose: + fmt = "tag '%s' doesn't start with prefix '%s'" + print(fmt % (full_tag, tag_prefix)) + pieces["error"] = ("tag '%s' doesn't start with prefix '%s'" + % (full_tag, tag_prefix)) + return pieces + pieces["closest-tag"] = full_tag[len(tag_prefix):] + + # distance: number of commits since tag + pieces["distance"] = int(mo.group(2)) + + # commit: short hex revision ID + pieces["short"] = mo.group(3) + + else: + # HEX: no tags + pieces["closest-tag"] = None + count_out, rc = run_command(GITS, ["rev-list", "HEAD", "--count"], + cwd=root) + pieces["distance"] = int(count_out) # total number of commits + + # commit date: see ISO-8601 comment in git_versions_from_keywords() + date = run_command(GITS, ["show", "-s", "--format=%ci", "HEAD"], + cwd=root)[0].strip() + # Use only the last line. Previous lines may contain GPG signature + # information. + date = date.splitlines()[-1] + pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1) + + return pieces + + +def do_vcs_install(manifest_in, versionfile_source, ipy): + """Git-specific installation logic for Versioneer. + + For Git, this means creating/changing .gitattributes to mark _version.py + for export-subst keyword substitution. 
+ """ + GITS = ["git"] + if sys.platform == "win32": + GITS = ["git.cmd", "git.exe"] + files = [manifest_in, versionfile_source] + if ipy: + files.append(ipy) + try: + me = __file__ + if me.endswith(".pyc") or me.endswith(".pyo"): + me = os.path.splitext(me)[0] + ".py" + versioneer_file = os.path.relpath(me) + except NameError: + versioneer_file = "versioneer.py" + files.append(versioneer_file) + present = False + try: + f = open(".gitattributes", "r") + for line in f.readlines(): + if line.strip().startswith(versionfile_source): + if "export-subst" in line.strip().split()[1:]: + present = True + f.close() + except EnvironmentError: + pass + if not present: + f = open(".gitattributes", "a+") + f.write("%s export-subst\n" % versionfile_source) + f.close() + files.append(".gitattributes") + run_command(GITS, ["add", "--"] + files) + + +def versions_from_parentdir(parentdir_prefix, root, verbose): + """Try to determine the version from the parent directory name. + + Source tarballs conventionally unpack into a directory that includes both + the project name and a version string. We will also support searching up + two directory levels for an appropriately named parent directory + """ + rootdirs = [] + + for i in range(3): + dirname = os.path.basename(root) + if dirname.startswith(parentdir_prefix): + return {"version": dirname[len(parentdir_prefix):], + "full-revisionid": None, + "dirty": False, "error": None, "date": None} + else: + rootdirs.append(root) + root = os.path.dirname(root) # up a level + + if verbose: + print("Tried directories %s but none started with prefix %s" % + (str(rootdirs), parentdir_prefix)) + raise NotThisMethod("rootdir doesn't start with parentdir_prefix") + + +SHORT_VERSION_PY = """ +# This file was generated by 'versioneer.py' (0.19) from +# revision-control system data, or from the parent directory name of an +# unpacked source archive. Distribution tarballs contain a pre-generated copy +# of this file. + +import json + +version_json = ''' +%s +''' # END VERSION_JSON + + +def get_versions(): + return json.loads(version_json) +""" + + +def versions_from_file(filename): + """Try to determine the version from _version.py if present.""" + try: + with open(filename) as f: + contents = f.read() + except EnvironmentError: + raise NotThisMethod("unable to read _version.py") + mo = re.search(r"version_json = '''\n(.*)''' # END VERSION_JSON", + contents, re.M | re.S) + if not mo: + mo = re.search(r"version_json = '''\r\n(.*)''' # END VERSION_JSON", + contents, re.M | re.S) + if not mo: + raise NotThisMethod("no version_json in _version.py") + return json.loads(mo.group(1)) + + +def write_to_version_file(filename, versions): + """Write the given version number to the given _version.py file.""" + os.unlink(filename) + contents = json.dumps(versions, sort_keys=True, + indent=1, separators=(",", ": ")) + with open(filename, "w") as f: + f.write(SHORT_VERSION_PY % contents) + + print("set %s to '%s'" % (filename, versions["version"])) + + +def plus_or_dot(pieces): + """Return a + if we don't already have one, else return a .""" + if "+" in pieces.get("closest-tag", ""): + return "." + return "+" + + +def render_pep440(pieces): + """Build up version string, with post-release "local version identifier". + + Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you + get a tagged build and then dirty it, you'll get TAG+0.gHEX.dirty + + Exceptions: + 1: no tags. git_describe was just HEX. 
0+untagged.DISTANCE.gHEX[.dirty] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += plus_or_dot(pieces) + rendered += "%d.g%s" % (pieces["distance"], pieces["short"]) + if pieces["dirty"]: + rendered += ".dirty" + else: + # exception #1 + rendered = "0+untagged.%d.g%s" % (pieces["distance"], + pieces["short"]) + if pieces["dirty"]: + rendered += ".dirty" + return rendered + + +def render_pep440_pre(pieces): + """TAG[.post0.devDISTANCE] -- No -dirty. + + Exceptions: + 1: no tags. 0.post0.devDISTANCE + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"]: + rendered += ".post0.dev%d" % pieces["distance"] + else: + # exception #1 + rendered = "0.post0.dev%d" % pieces["distance"] + return rendered + + +def render_pep440_post(pieces): + """TAG[.postDISTANCE[.dev0]+gHEX] . + + The ".dev0" means dirty. Note that .dev0 sorts backwards + (a dirty tree will appear "older" than the corresponding clean one), + but you shouldn't be releasing software with -dirty anyways. + + Exceptions: + 1: no tags. 0.postDISTANCE[.dev0] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += ".post%d" % pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + rendered += plus_or_dot(pieces) + rendered += "g%s" % pieces["short"] + else: + # exception #1 + rendered = "0.post%d" % pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + rendered += "+g%s" % pieces["short"] + return rendered + + +def render_pep440_old(pieces): + """TAG[.postDISTANCE[.dev0]] . + + The ".dev0" means dirty. + + Exceptions: + 1: no tags. 0.postDISTANCE[.dev0] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += ".post%d" % pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + else: + # exception #1 + rendered = "0.post%d" % pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + return rendered + + +def render_git_describe(pieces): + """TAG[-DISTANCE-gHEX][-dirty]. + + Like 'git describe --tags --dirty --always'. + + Exceptions: + 1: no tags. HEX[-dirty] (note: no 'g' prefix) + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"]: + rendered += "-%d-g%s" % (pieces["distance"], pieces["short"]) + else: + # exception #1 + rendered = pieces["short"] + if pieces["dirty"]: + rendered += "-dirty" + return rendered + + +def render_git_describe_long(pieces): + """TAG-DISTANCE-gHEX[-dirty]. + + Like 'git describe --tags --dirty --always -long'. + The distance/hash is unconditional. + + Exceptions: + 1: no tags. 
HEX[-dirty] (note: no 'g' prefix) + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + rendered += "-%d-g%s" % (pieces["distance"], pieces["short"]) + else: + # exception #1 + rendered = pieces["short"] + if pieces["dirty"]: + rendered += "-dirty" + return rendered + + +def render(pieces, style): + """Render the given version pieces into the requested style.""" + if pieces["error"]: + return {"version": "unknown", + "full-revisionid": pieces.get("long"), + "dirty": None, + "error": pieces["error"], + "date": None} + + if not style or style == "default": + style = "pep440" # the default + + if style == "pep440": + rendered = render_pep440(pieces) + elif style == "pep440-pre": + rendered = render_pep440_pre(pieces) + elif style == "pep440-post": + rendered = render_pep440_post(pieces) + elif style == "pep440-old": + rendered = render_pep440_old(pieces) + elif style == "git-describe": + rendered = render_git_describe(pieces) + elif style == "git-describe-long": + rendered = render_git_describe_long(pieces) + else: + raise ValueError("unknown style '%s'" % style) + + return {"version": rendered, "full-revisionid": pieces["long"], + "dirty": pieces["dirty"], "error": None, + "date": pieces.get("date")} + + +class VersioneerBadRootError(Exception): + """The project root directory is unknown or missing key files.""" + + +def get_versions(verbose=False): + """Get the project version from whatever source is available. + + Returns dict with two keys: 'version' and 'full'. + """ + if "versioneer" in sys.modules: + # see the discussion in cmdclass.py:get_cmdclass() + del sys.modules["versioneer"] + + root = get_root() + cfg = get_config_from_root(root) + + assert cfg.VCS is not None, "please set [versioneer]VCS= in setup.cfg" + handlers = HANDLERS.get(cfg.VCS) + assert handlers, "unrecognized VCS '%s'" % cfg.VCS + verbose = verbose or cfg.verbose + assert cfg.versionfile_source is not None, \ + "please set versioneer.versionfile_source" + assert cfg.tag_prefix is not None, "please set versioneer.tag_prefix" + + versionfile_abs = os.path.join(root, cfg.versionfile_source) + + # extract version from first of: _version.py, VCS command (e.g. 'git + # describe'), parentdir. This is meant to work for developers using a + # source checkout, for users of a tarball created by 'setup.py sdist', + # and for users of a tarball/zipball created by 'git archive' or github's + # download-from-tag feature or the equivalent in other VCSes. 
+ + get_keywords_f = handlers.get("get_keywords") + from_keywords_f = handlers.get("keywords") + if get_keywords_f and from_keywords_f: + try: + keywords = get_keywords_f(versionfile_abs) + ver = from_keywords_f(keywords, cfg.tag_prefix, verbose) + if verbose: + print("got version from expanded keyword %s" % ver) + return ver + except NotThisMethod: + pass + + try: + ver = versions_from_file(versionfile_abs) + if verbose: + print("got version from file %s %s" % (versionfile_abs, ver)) + return ver + except NotThisMethod: + pass + + from_vcs_f = handlers.get("pieces_from_vcs") + if from_vcs_f: + try: + pieces = from_vcs_f(cfg.tag_prefix, root, verbose) + ver = render(pieces, cfg.style) + if verbose: + print("got version from VCS %s" % ver) + return ver + except NotThisMethod: + pass + + try: + if cfg.parentdir_prefix: + ver = versions_from_parentdir(cfg.parentdir_prefix, root, verbose) + if verbose: + print("got version from parentdir %s" % ver) + return ver + except NotThisMethod: + pass + + if verbose: + print("unable to compute version") + + return {"version": "0+unknown", "full-revisionid": None, + "dirty": None, "error": "unable to compute version", + "date": None} + + +def get_version(): + """Get the short version string for this project.""" + return get_versions()["version"] + + +def get_cmdclass(cmdclass=None): + """Get the custom setuptools/distutils subclasses used by Versioneer. + + If the package uses a different cmdclass (e.g. one from numpy), it + should be provide as an argument. + """ + if "versioneer" in sys.modules: + del sys.modules["versioneer"] + # this fixes the "python setup.py develop" case (also 'install' and + # 'easy_install .'), in which subdependencies of the main project are + # built (using setup.py bdist_egg) in the same python process. Assume + # a main project A and a dependency B, which use different versions + # of Versioneer. A's setup.py imports A's Versioneer, leaving it in + # sys.modules by the time B's setup.py is executed, causing B to run + # with the wrong versioneer. Setuptools wraps the sub-dep builds in a + # sandbox that restores sys.modules to it's pre-build state, so the + # parent is protected against the child's "import versioneer". By + # removing ourselves from sys.modules here, before the child build + # happens, we protect the child from the parent's versioneer too. + # Also see https://github.com/python-versioneer/python-versioneer/issues/52 + + cmds = {} if cmdclass is None else cmdclass.copy() + + # we add "version" to both distutils and setuptools + from distutils.core import Command + + class cmd_version(Command): + description = "report generated version string" + user_options = [] + boolean_options = [] + + def initialize_options(self): + pass + + def finalize_options(self): + pass + + def run(self): + vers = get_versions(verbose=True) + print("Version: %s" % vers["version"]) + print(" full-revisionid: %s" % vers.get("full-revisionid")) + print(" dirty: %s" % vers.get("dirty")) + print(" date: %s" % vers.get("date")) + if vers["error"]: + print(" error: %s" % vers["error"]) + cmds["version"] = cmd_version + + # we override "build_py" in both distutils and setuptools + # + # most invocation pathways end up running build_py: + # distutils/build -> build_py + # distutils/install -> distutils/build ->.. + # setuptools/bdist_wheel -> distutils/install ->.. + # setuptools/bdist_egg -> distutils/install_lib -> build_py + # setuptools/install -> bdist_egg ->.. + # setuptools/develop -> ? 
+ # pip install: + # copies source tree to a tempdir before running egg_info/etc + # if .git isn't copied too, 'git describe' will fail + # then does setup.py bdist_wheel, or sometimes setup.py install + # setup.py egg_info -> ? + + # we override different "build_py" commands for both environments + if 'build_py' in cmds: + _build_py = cmds['build_py'] + elif "setuptools" in sys.modules: + from setuptools.command.build_py import build_py as _build_py + else: + from distutils.command.build_py import build_py as _build_py + + class cmd_build_py(_build_py): + def run(self): + root = get_root() + cfg = get_config_from_root(root) + versions = get_versions() + _build_py.run(self) + # now locate _version.py in the new build/ directory and replace + # it with an updated value + if cfg.versionfile_build: + target_versionfile = os.path.join(self.build_lib, + cfg.versionfile_build) + print("UPDATING %s" % target_versionfile) + write_to_version_file(target_versionfile, versions) + cmds["build_py"] = cmd_build_py + + if "setuptools" in sys.modules: + from setuptools.command.build_ext import build_ext as _build_ext + else: + from distutils.command.build_ext import build_ext as _build_ext + + class cmd_build_ext(_build_ext): + def run(self): + root = get_root() + cfg = get_config_from_root(root) + versions = get_versions() + _build_ext.run(self) + if self.inplace: + # build_ext --inplace will only build extensions in + # build/lib<..> dir with no _version.py to write to. + # As in place builds will already have a _version.py + # in the module dir, we do not need to write one. + return + # now locate _version.py in the new build/ directory and replace + # it with an updated value + target_versionfile = os.path.join(self.build_lib, + cfg.versionfile_source) + print("UPDATING %s" % target_versionfile) + write_to_version_file(target_versionfile, versions) + cmds["build_ext"] = cmd_build_ext + + if "cx_Freeze" in sys.modules: # cx_freeze enabled? + from cx_Freeze.dist import build_exe as _build_exe + # nczeczulin reports that py2exe won't like the pep440-style string + # as FILEVERSION, but it can be used for PRODUCTVERSION, e.g. + # setup(console=[{ + # "version": versioneer.get_version().split("+", 1)[0], # FILEVERSION + # "product_version": versioneer.get_version(), + # ... + + class cmd_build_exe(_build_exe): + def run(self): + root = get_root() + cfg = get_config_from_root(root) + versions = get_versions() + target_versionfile = cfg.versionfile_source + print("UPDATING %s" % target_versionfile) + write_to_version_file(target_versionfile, versions) + + _build_exe.run(self) + os.unlink(target_versionfile) + with open(cfg.versionfile_source, "w") as f: + LONG = LONG_VERSION_PY[cfg.VCS] + f.write(LONG % + {"DOLLAR": "$", + "STYLE": cfg.style, + "TAG_PREFIX": cfg.tag_prefix, + "PARENTDIR_PREFIX": cfg.parentdir_prefix, + "VERSIONFILE_SOURCE": cfg.versionfile_source, + }) + cmds["build_exe"] = cmd_build_exe + del cmds["build_py"] + + if 'py2exe' in sys.modules: # py2exe enabled? 
+ from py2exe.distutils_buildexe import py2exe as _py2exe + + class cmd_py2exe(_py2exe): + def run(self): + root = get_root() + cfg = get_config_from_root(root) + versions = get_versions() + target_versionfile = cfg.versionfile_source + print("UPDATING %s" % target_versionfile) + write_to_version_file(target_versionfile, versions) + + _py2exe.run(self) + os.unlink(target_versionfile) + with open(cfg.versionfile_source, "w") as f: + LONG = LONG_VERSION_PY[cfg.VCS] + f.write(LONG % + {"DOLLAR": "$", + "STYLE": cfg.style, + "TAG_PREFIX": cfg.tag_prefix, + "PARENTDIR_PREFIX": cfg.parentdir_prefix, + "VERSIONFILE_SOURCE": cfg.versionfile_source, + }) + cmds["py2exe"] = cmd_py2exe + + # we override different "sdist" commands for both environments + if 'sdist' in cmds: + _sdist = cmds['sdist'] + elif "setuptools" in sys.modules: + from setuptools.command.sdist import sdist as _sdist + else: + from distutils.command.sdist import sdist as _sdist + + class cmd_sdist(_sdist): + def run(self): + versions = get_versions() + self._versioneer_generated_versions = versions + # unless we update this, the command will keep using the old + # version + self.distribution.metadata.version = versions["version"] + return _sdist.run(self) + + def make_release_tree(self, base_dir, files): + root = get_root() + cfg = get_config_from_root(root) + _sdist.make_release_tree(self, base_dir, files) + # now locate _version.py in the new base_dir directory + # (remembering that it may be a hardlink) and replace it with an + # updated value + target_versionfile = os.path.join(base_dir, cfg.versionfile_source) + print("UPDATING %s" % target_versionfile) + write_to_version_file(target_versionfile, + self._versioneer_generated_versions) + cmds["sdist"] = cmd_sdist + + return cmds + + +CONFIG_ERROR = """ +setup.cfg is missing the necessary Versioneer configuration. You need +a section like: + + [versioneer] + VCS = git + style = pep440 + versionfile_source = src/myproject/_version.py + versionfile_build = myproject/_version.py + tag_prefix = + parentdir_prefix = myproject- + +You will also need to edit your setup.py to use the results: + + import versioneer + setup(version=versioneer.get_version(), + cmdclass=versioneer.get_cmdclass(), ...) + +Please read the docstring in ./versioneer.py for configuration instructions, +edit setup.cfg, and re-run the installer or 'python versioneer.py setup'. +""" + +SAMPLE_CONFIG = """ +# See the docstring in versioneer.py for instructions. Note that you must +# re-run 'versioneer.py setup' after changing this section, and commit the +# resulting files. 
+ +[versioneer] +#VCS = git +#style = pep440 +#versionfile_source = +#versionfile_build = +#tag_prefix = +#parentdir_prefix = + +""" + +INIT_PY_SNIPPET = """ +from ._version import get_versions +__version__ = get_versions()['version'] +del get_versions +""" + + +def do_setup(): + """Do main VCS-independent setup function for installing Versioneer.""" + root = get_root() + try: + cfg = get_config_from_root(root) + except (EnvironmentError, configparser.NoSectionError, + configparser.NoOptionError) as e: + if isinstance(e, (EnvironmentError, configparser.NoSectionError)): + print("Adding sample versioneer config to setup.cfg", + file=sys.stderr) + with open(os.path.join(root, "setup.cfg"), "a") as f: + f.write(SAMPLE_CONFIG) + print(CONFIG_ERROR, file=sys.stderr) + return 1 + + print(" creating %s" % cfg.versionfile_source) + with open(cfg.versionfile_source, "w") as f: + LONG = LONG_VERSION_PY[cfg.VCS] + f.write(LONG % {"DOLLAR": "$", + "STYLE": cfg.style, + "TAG_PREFIX": cfg.tag_prefix, + "PARENTDIR_PREFIX": cfg.parentdir_prefix, + "VERSIONFILE_SOURCE": cfg.versionfile_source, + }) + + ipy = os.path.join(os.path.dirname(cfg.versionfile_source), + "__init__.py") + if os.path.exists(ipy): + try: + with open(ipy, "r") as f: + old = f.read() + except EnvironmentError: + old = "" + if INIT_PY_SNIPPET not in old: + print(" appending to %s" % ipy) + with open(ipy, "a") as f: + f.write(INIT_PY_SNIPPET) + else: + print(" %s unmodified" % ipy) + else: + print(" %s doesn't exist, ok" % ipy) + ipy = None + + # Make sure both the top-level "versioneer.py" and versionfile_source + # (PKG/_version.py, used by runtime code) are in MANIFEST.in, so + # they'll be copied into source distributions. Pip won't be able to + # install the package without this. + manifest_in = os.path.join(root, "MANIFEST.in") + simple_includes = set() + try: + with open(manifest_in, "r") as f: + for line in f: + if line.startswith("include "): + for include in line.split()[1:]: + simple_includes.add(include) + except EnvironmentError: + pass + # That doesn't cover everything MANIFEST.in can do + # (http://docs.python.org/2/distutils/sourcedist.html#commands), so + # it might give some false negatives. Appending redundant 'include' + # lines is safe, though. + if "versioneer.py" not in simple_includes: + print(" appending 'versioneer.py' to MANIFEST.in") + with open(manifest_in, "a") as f: + f.write("include versioneer.py\n") + else: + print(" 'versioneer.py' already in MANIFEST.in") + if cfg.versionfile_source not in simple_includes: + print(" appending versionfile_source ('%s') to MANIFEST.in" % + cfg.versionfile_source) + with open(manifest_in, "a") as f: + f.write("include %s\n" % cfg.versionfile_source) + else: + print(" versionfile_source already in MANIFEST.in") + + # Make VCS-specific changes. For git, this means creating/changing + # .gitattributes to mark _version.py for export-subst keyword + # substitution. 
+    do_vcs_install(manifest_in, cfg.versionfile_source, ipy)
+    return 0
+
+
+def scan_setup_py():
+    """Validate the contents of setup.py against Versioneer's expectations."""
+    found = set()
+    setters = False
+    errors = 0
+    with open("setup.py", "r") as f:
+        for line in f.readlines():
+            if "import versioneer" in line:
+                found.add("import")
+            if "versioneer.get_cmdclass()" in line:
+                found.add("cmdclass")
+            if "versioneer.get_version()" in line:
+                found.add("get_version")
+            if "versioneer.VCS" in line:
+                setters = True
+            if "versioneer.versionfile_source" in line:
+                setters = True
+    if len(found) != 3:
+        print("")
+        print("Your setup.py appears to be missing some important items")
+        print("(but I might be wrong). Please make sure it has something")
+        print("roughly like the following:")
+        print("")
+        print(" import versioneer")
+        print(" setup( version=versioneer.get_version(),")
+        print("        cmdclass=versioneer.get_cmdclass(),  ...)")
+        print("")
+        errors += 1
+    if setters:
+        print("You should remove lines like 'versioneer.VCS = ' and")
+        print("'versioneer.versionfile_source = ' . This configuration")
+        print("now lives in setup.cfg, and should be removed from setup.py")
+        print("")
+        errors += 1
+    return errors
+
+
+if __name__ == "__main__":
+    cmd = sys.argv[1]
+    if cmd == "setup":
+        errors = do_setup()
+        errors += scan_setup_py()
+        if errors:
+            sys.exit(1)

From 2d4dc3cc7e542e36dd648ad7448857db7cae119c Mon Sep 17 00:00:00 2001
From: Sergey Pokhodenko
Date: Wed, 18 Nov 2020 13:36:24 +0300
Subject: [PATCH 02/32] Add llvmdev in conda-recipe (#13)

`llvmdev` is necessary for `numba-dppy`; previously IntelPython/Numba
required it. Also, `llvm-spirv` is not used during the build, so it is
removed from the build requirements.
---
 conda-recipe/meta.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/conda-recipe/meta.yaml b/conda-recipe/meta.yaml
index 920f79cfe7..4775e4b00d 100644
--- a/conda-recipe/meta.yaml
+++ b/conda-recipe/meta.yaml
@@ -17,7 +17,6 @@ requirements:
     - python
     - setuptools
     - cython
-    - llvm-spirv
     - numba
     - dpctl
     - dpnp
@@ -27,6 +26,7 @@ requirements:
     - dpctl
     - spirv-tools
     - llvm-spirv
+    - llvmdev
     - dpnp
 
 about:

From f0d9ebd955572ac3103198fbef29692888878646 Mon Sep 17 00:00:00 2001
From: Sergey Pokhodenko
Date: Wed, 18 Nov 2020 13:36:24 +0300
Subject: [PATCH 03/32] Move numba/dppl_config.py to numba_dppy/config.py (#14)

dppl_config.py contains the flag `dppl_present`, which indicates that
`dpCtl` and devices are available. Since `numba-dppy` will depend on
`dpCtl`, the check is not necessary here. Checking for an available
platform and device can also be done in `numba-dppy`, and it can still
raise an exception on import, as before.

`dppl_present` is also renamed to `dppy_present`.
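For context, a minimal usage sketch of the check being moved here (not part
of the patch; the flag name, `device_context`, and the ImportError text are
taken from the diffs below and from later tests in this series):

```python
# Hedged sketch: gate GPU offload on the dppy_present flag that this
# patch introduces in numba_dppy/config.py.
import dpctl
from numba_dppy.config import dppy_present

if dppy_present:
    # A SYCL platform and a GPU queue were found; offloading is safe.
    with dpctl.device_context("opencl:gpu"):
        pass  # launch numba-dppy kernels here
else:
    raise ImportError("Importing dppl failed")
```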
--- numba_dppy/__init__.py | 6 +++--- numba_dppy/config.py | 6 ++++++ numba_dppy/dppl_offload_dispatcher.py | 4 ++-- numba_dppy/tests/__init__.py | 4 ++-- 4 files changed, 13 insertions(+), 7 deletions(-) create mode 100644 numba_dppy/config.py diff --git a/numba_dppy/__init__.py b/numba_dppy/__init__.py index 35c5e0a9f5..a9fe7f4a31 100644 --- a/numba_dppy/__init__.py +++ b/numba_dppy/__init__.py @@ -506,14 +506,14 @@ def main(): from numba import config import numba.testing -from numba.dppl_config import * -if dppl_present: +from .config import dppy_present +if dppy_present: from .device_init import * else: raise ImportError("Importing dppl failed") def test(*args, **kwargs): - if not dppl_present and not is_available(): + if not dppy_present and not is_available(): dppl_error() return numba.testing.test("numba_dppy.tests", *args, **kwargs) diff --git a/numba_dppy/config.py b/numba_dppy/config.py new file mode 100644 index 0000000000..d1393daf0f --- /dev/null +++ b/numba_dppy/config.py @@ -0,0 +1,6 @@ +try: + import dpctl + + dppy_present = dpctl.has_sycl_platforms() and dpctl.has_gpu_queues() +except: + dppy_present = False diff --git a/numba_dppy/dppl_offload_dispatcher.py b/numba_dppy/dppl_offload_dispatcher.py index 49a599589e..db841bef06 100644 --- a/numba_dppy/dppl_offload_dispatcher.py +++ b/numba_dppy/dppl_offload_dispatcher.py @@ -1,13 +1,13 @@ from numba.core import dispatcher, compiler from numba.core.registry import cpu_target, dispatcher_registry -import numba.dppl_config as dppl_config +import numba_dppy.config as dppy_config class DpplOffloadDispatcher(dispatcher.Dispatcher): targetdescr = cpu_target def __init__(self, py_func, locals={}, targetoptions={}, impl_kind='direct', pipeline_class=compiler.Compiler): - if dppl_config.dppl_present: + if dppy_config.dppy_present: from numba_dppy.compiler import DPPLCompiler targetoptions['parallel'] = True dispatcher.Dispatcher.__init__(self, py_func, locals=locals, diff --git a/numba_dppy/tests/__init__.py b/numba_dppy/tests/__init__.py index c9e582dac3..d29208fb91 100644 --- a/numba_dppy/tests/__init__.py +++ b/numba_dppy/tests/__init__.py @@ -3,14 +3,14 @@ from os.path import dirname, join -import numba.dppl_config as dppl_config +import numba_dppy.config as dppy_config def load_tests(loader, tests, pattern): suite = SerialSuite() this_dir = dirname(__file__) - if dppl_config.dppl_present: + if dppy_config.dppy_present: suite.addTests(load_testsuite(loader, join(this_dir, 'dppl'))) else: print("skipped DPPL tests") From 00a771eb1b97cef98df17756f78e2f677d6fc5a4 Mon Sep 17 00:00:00 2001 From: Sergey Pokhodenko Date: Wed, 18 Nov 2020 16:08:20 +0300 Subject: [PATCH 04/32] Move config variables from numba to numba_dppy (#15) Variables in module numba_dppy.config are renamed. 
They do not use the `DPPY_` prefix:
SAVE_DPPL_IR_FILES -> SAVE_IR_FILES

Environment variables should have the `NUMBA_DPPY_` prefix:
NUMBA_SAVE_DPPL_IR_FILES -> NUMBA_DPPY_SAVE_IR_FILES
NUMBA_SPIRV_VAL -> NUMBA_DPPY_SPIRV_VAL
---
 numba_dppy/config.py          | 30 ++++++++++++++++++++++++++++++
 numba_dppy/spirv_generator.py |  9 +++++----
 2 files changed, 35 insertions(+), 4 deletions(-)

diff --git a/numba_dppy/config.py b/numba_dppy/config.py
index d1393daf0f..880d18d7b5 100644
--- a/numba_dppy/config.py
+++ b/numba_dppy/config.py
@@ -1,6 +1,36 @@
+import os
+
+
 try:
     import dpctl
 
     dppy_present = dpctl.has_sycl_platforms() and dpctl.has_gpu_queues()
 except:
     dppy_present = False
+
+
+def _readenv(name, ctor, default):
+    """Original version from numba\core\config.py
+    class _EnvReloader():
+        ...
+        def process_environ():
+            def _readenv(): ...
+    """
+    value = os.environ.get(name)
+    if value is None:
+        return default() if callable(default) else default
+    try:
+        return ctor(value)
+    except Exception:
+        warnings.warn(
+            "environ %s defined but failed to parse '%s'" % (name, value),
+            RuntimeWarning,
+        )
+        return default
+
+
+# Save intermediate files being generated by DPPY
+SAVE_IR_FILES = _readenv("NUMBA_DPPY_SAVE_IR_FILES", int, 0)
+
+# Turn SPIRV-VALIDATION ON/OFF switch
+SPIRV_VAL = _readenv("NUMBA_DPPY_SPIRV_VAL", int, 0)
diff --git a/numba_dppy/spirv_generator.py b/numba_dppy/spirv_generator.py
index cee4672ded..5bac98e014 100644
--- a/numba_dppy/spirv_generator.py
+++ b/numba_dppy/spirv_generator.py
@@ -7,6 +7,7 @@
 import tempfile
 
 from numba import config
+from numba_dppy import config as dppy_config
 from numba_dppy.target import LINK_ATOMIC
 
 
@@ -61,7 +62,7 @@ def generate(self, ipath, opath):
         # b) hoist all allocas to the enty block of the module
         check_call(["opt","-O1","-o",ipath+'.bc',ipath])
         check_call(["llvm-spirv","-o",opath,ipath+'.bc'])
-        if config.SAVE_DPPL_IR_FILES == 0:
+        if dppy_config.SAVE_IR_FILES == 0:
             os.unlink(ipath + '.bc')
 
     def link(self, opath, binaries):
@@ -84,12 +85,12 @@ def __init__(self, context):
     def __del__(self):
         # Remove all temporary files
         for afile in self._tempfiles:
-            if config.SAVE_DPPL_IR_FILES != 0:
+            if dppy_config.SAVE_IR_FILES != 0:
                 print(afile)
             else:
                 os.unlink(afile)
         # Remove directory
-        if config.SAVE_DPPL_IR_FILES == 0:
+        if dppy_config.SAVE_IR_FILES == 0:
             os.rmdir(self._tmpdir)
 
     def _create_temp_file(self, name, mode='wb'):
@@ -136,7 +137,7 @@ def finalize(self):
         self._cmd.link(spirv_path, binary_paths)
 
         # Validate the SPIR-V code
-        if config.SPIRV_VAL == 1:
+        if dppy_config.SPIRV_VAL == 1:
             try:
                 self._cmd.validate(ipath=spirv_path)
             except CalledProcessError:

From 9804a1226b4370e5bb2ff4db4ad5f9f503e02f46 Mon Sep 17 00:00:00 2001
From: Sergey Pokhodenko
Date: Thu, 19 Nov 2020 18:32:24 +0300
Subject: [PATCH 05/32] Build with dpNP only on Linux (#17)

dpNP is not available for Windows yet.
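Before the platform-specific recipe change below, a usage sketch for the
configuration flags renamed in PATCH 04 above (the values here are
illustrative; the flags are parsed once, when `numba_dppy.config` is first
imported):

```python
# Hedged sketch: the renamed flags carry the NUMBA_DPPY_ prefix and are
# read from the environment by _readenv() at import time.
import os

os.environ["NUMBA_DPPY_SAVE_IR_FILES"] = "1"  # keep intermediate IR files
os.environ["NUMBA_DPPY_SPIRV_VAL"] = "1"      # validate generated SPIR-V

from numba_dppy import config as dppy_config  # parsing happens here

assert dppy_config.SAVE_IR_FILES == 1
assert dppy_config.SPIRV_VAL == 1
```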
--- conda-recipe/meta.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/conda-recipe/meta.yaml b/conda-recipe/meta.yaml index 4775e4b00d..d8f6c1ecbb 100644 --- a/conda-recipe/meta.yaml +++ b/conda-recipe/meta.yaml @@ -19,7 +19,7 @@ requirements: - cython - numba - dpctl - - dpnp + - dpnp # [linux] run: - python - numba >=0.51 @@ -27,7 +27,7 @@ requirements: - spirv-tools - llvm-spirv - llvmdev - - dpnp + - dpnp # [linux] about: home: https://github.com/IntelPython/numba-dppy From 76eba4414039a18acc10deb0e93f28759e8bbfc9 Mon Sep 17 00:00:00 2001 From: Sergey Pokhodenko Date: Fri, 20 Nov 2020 23:30:56 +0300 Subject: [PATCH 06/32] Fixing tests on CI (#19) * Use GPU device_context in tests Test `test_parfor_message` skip unless GPU device is available. But GPU device will be used only with `device_context`. * Use ctypes c_longlong for numba int64 On Windows ctypes.c_longlong is 8 bytes as on Linux, but on Windows ctypes.c_long is 4 bytes and on Linux it is 8 bytes. * Activate oneAPI compiler env for using OpenCL CPU Without OpenCL CPU some tests are skipped. * Expected failure for test_with_dppl_context_cpu * Not import numba.config in numba_dppy/__init__.py * Activate CPU for conda recipe tests on Linux * Add expectedFailureIf for some tests on Windows * Print captured stdout on error --- conda-recipe/run_test.bat | 5 +++++ conda-recipe/run_test.sh | 8 +++++++- numba_dppy/__init__.py | 1 - numba_dppy/compiler.py | 14 +++++++------- numba_dppy/testing.py | 13 +++++++++++++ .../tests/dppl/test_parfor_lower_message.py | 17 +++++++++-------- numba_dppy/tests/dppl/test_prange.py | 15 ++++++++------- numba_dppy/tests/dppl/test_with_context.py | 5 ++++- 8 files changed, 53 insertions(+), 25 deletions(-) diff --git a/conda-recipe/run_test.bat b/conda-recipe/run_test.bat index 17e8140115..031bc6e69a 100644 --- a/conda-recipe/run_test.bat +++ b/conda-recipe/run_test.bat @@ -1,3 +1,8 @@ +REM For activating OpenCL CPU +call "%ONEAPI_ROOT%\compiler\latest\env\vars.bat" + +@echo on + python -m numba.runtests -b -v -m -- numba_dppy.tests IF %ERRORLEVEL% NEQ 0 exit /B 1 diff --git a/conda-recipe/run_test.sh b/conda-recipe/run_test.sh index 27b1d4722f..8a30af0c51 100644 --- a/conda-recipe/run_test.sh +++ b/conda-recipe/run_test.sh @@ -1,6 +1,12 @@ #!/bin/bash -set -ex +set -e + +# For activating OpenCL CPU +source ${ONEAPI_ROOT}/compiler/latest/env/vars.sh +source ${ONEAPI_ROOT}/tbb/latest/env/vars.sh + +set -x python -m numba.runtests -b -v -m -- numba_dppy.tests diff --git a/numba_dppy/__init__.py b/numba_dppy/__init__.py index a9fe7f4a31..6eff949d16 100644 --- a/numba_dppy/__init__.py +++ b/numba_dppy/__init__.py @@ -503,7 +503,6 @@ def main(): from __future__ import print_function, absolute_import, division -from numba import config import numba.testing from .config import dppy_present diff --git a/numba_dppy/compiler.py b/numba_dppy/compiler.py index 7f0f7c8411..cf7bca2822 100644 --- a/numba_dppy/compiler.py +++ b/numba_dppy/compiler.py @@ -402,15 +402,15 @@ def _unpack_device_array_argument(self, val, kernelargs): # parent kernelargs.append(ctypes.c_size_t(0)) - kernelargs.append(ctypes.c_long(val.size)) - kernelargs.append(ctypes.c_long(val.dtype.itemsize)) + kernelargs.append(ctypes.c_longlong(val.size)) + kernelargs.append(ctypes.c_longlong(val.dtype.itemsize)) kernelargs.append(val.base) for ax in range(val.ndim): - kernelargs.append(ctypes.c_long(val.shape[ax])) + kernelargs.append(ctypes.c_longlong(val.shape[ax])) for ax in range(val.ndim): - 
kernelargs.append(ctypes.c_long(val.strides[ax])) + kernelargs.append(ctypes.c_longlong(val.strides[ax])) def _unpack_argument(self, ty, val, sycl_queue, retr, kernelargs, @@ -439,16 +439,16 @@ def _unpack_argument(self, ty, val, sycl_queue, retr, kernelargs, self._unpack_device_array_argument(usm_ndarr, kernelargs) elif ty == types.int64: - cval = ctypes.c_long(val) + cval = ctypes.c_longlong(val) kernelargs.append(cval) elif ty == types.uint64: - cval = ctypes.c_long(val) + cval = ctypes.c_ulonglong(val) kernelargs.append(cval) elif ty == types.int32: cval = ctypes.c_int(val) kernelargs.append(cval) elif ty == types.uint32: - cval = ctypes.c_int(val) + cval = ctypes.c_uint(val) kernelargs.append(cval) elif ty == types.float64: cval = ctypes.c_double(val) diff --git a/numba_dppy/testing.py b/numba_dppy/testing.py index 11090ebedc..8da0b7b91e 100644 --- a/numba_dppy/testing.py +++ b/numba_dppy/testing.py @@ -46,3 +46,16 @@ def captured_dppl_stdout(): import numba_dppy, numba_dppy as dppl with redirect_c_stdout() as stream: yield DPPLTextCapture(stream) + + +def _id(obj): + return obj + + +def expectedFailureIf(condition): + """ + Expected failure for a test if the condition is true. + """ + if condition: + return unittest.expectedFailure + return _id diff --git a/numba_dppy/tests/dppl/test_parfor_lower_message.py b/numba_dppy/tests/dppl/test_parfor_lower_message.py index 728d46ddf3..fe8c85d356 100644 --- a/numba_dppy/tests/dppl/test_parfor_lower_message.py +++ b/numba_dppy/tests/dppl/test_parfor_lower_message.py @@ -4,7 +4,7 @@ import numba_dppy, numba_dppy as dppl from numba_dppy.testing import unittest, DPPLTestCase from numba.tests.support import captured_stdout -import dpctl.ocldrv as ocldrv +import dpctl def prange_example(): @@ -18,17 +18,18 @@ def prange_example(): return a -@unittest.skipUnless(ocldrv.has_gpu_device, 'test only on GPU system') +@unittest.skipUnless(dpctl.has_gpu_queues(), "test only on GPU system") class TestParforMessage(DPPLTestCase): def test_parfor_message(self): - numba_dppy.compiler.DEBUG = 1 - jitted = njit(parallel={'offload':True})(prange_example) + with dpctl.device_context("opencl:gpu") as gpu_queue: + numba_dppy.compiler.DEBUG = 1 + jitted = njit(parallel={"offload": True})(prange_example) - with captured_stdout() as got: - jitted() + with captured_stdout() as got: + jitted() - numba_dppy.compiler.DEBUG = 0 - self.assertTrue('Parfor lowered on DPPL-device' in got.getvalue()) + numba_dppy.compiler.DEBUG = 0 + self.assertTrue("Parfor lowered on DPPL-device" in got.getvalue()) if __name__ == '__main__': diff --git a/numba_dppy/tests/dppl/test_prange.py b/numba_dppy/tests/dppl/test_prange.py index f1ceb3b2ce..317c2cbb2f 100644 --- a/numba_dppy/tests/dppl/test_prange.py +++ b/numba_dppy/tests/dppl/test_prange.py @@ -7,7 +7,7 @@ import numba from numba import njit, prange import numba_dppy, numba_dppy as dppl -from numba_dppy.testing import unittest +from numba_dppy.testing import unittest, expectedFailureIf from numba_dppy.testing import DPPLTestCase from numba.tests.support import captured_stdout @@ -95,6 +95,7 @@ def f(a, b): self.assertTrue(np.all(b == 12)) + @expectedFailureIf(sys.platform.startswith('win')) def test_two_consequent_prange(self): def prange_example(): n = 10 @@ -110,15 +111,15 @@ def prange_example(): numba_dppy.compiler.DEBUG = 1 jitted = njit(parallel={'offload':True})(prange_example) - with captured_stdout() as got: + with captured_stdout() as stdout: jitted_res = jitted() res = prange_example() numba_dppy.compiler.DEBUG = old_debug - 
self.assertEqual(got.getvalue().count('Parfor lowered on DPPL-device'), 2) - self.assertEqual(got.getvalue().count('Failed to lower parfor on DPPL-device'), 0) + self.assertEqual(stdout.getvalue().count('Parfor lowered on DPPL-device'), 2, stdout.getvalue()) + self.assertEqual(stdout.getvalue().count('Failed to lower parfor on DPPL-device'), 0, stdout.getvalue()) np.testing.assert_equal(res, jitted_res) @@ -138,15 +139,15 @@ def prange_example(): numba_dppy.compiler.DEBUG = 1 jitted = njit(parallel={'offload':True})(prange_example) - with captured_stdout() as got: + with captured_stdout() as stdout: jitted_res = jitted() res = prange_example() numba_dppy.compiler.DEBUG = old_debug - self.assertEqual(got.getvalue().count('Parfor lowered on DPPL-device'), 2) - self.assertEqual(got.getvalue().count('Failed to lower parfor on DPPL-device'), 0) + self.assertEqual(stdout.getvalue().count('Parfor lowered on DPPL-device'), 2, stdout.getvalue()) + self.assertEqual(stdout.getvalue().count('Failed to lower parfor on DPPL-device'), 0, stdout.getvalue()) np.testing.assert_equal(res, jitted_res) diff --git a/numba_dppy/tests/dppl/test_with_context.py b/numba_dppy/tests/dppl/test_with_context.py index 4e34c939cb..0749ff3e89 100644 --- a/numba_dppy/tests/dppl/test_with_context.py +++ b/numba_dppy/tests/dppl/test_with_context.py @@ -1,16 +1,18 @@ +import sys import numba import numpy as np from numba import njit import numba_dppy, numba_dppy as dppl from numba.core import errors from numba.tests.support import captured_stdout -from numba_dppy.testing import DPPLTestCase, unittest +from numba_dppy.testing import DPPLTestCase, unittest, expectedFailureIf import dpctl class TestWithDPPLContext(DPPLTestCase): @unittest.skipIf(not dpctl.has_gpu_queues(), "No GPU platforms available") + @expectedFailureIf(sys.platform.startswith('win')) def test_with_dppl_context_gpu(self): @njit @@ -37,6 +39,7 @@ def func(b): self.assertTrue('Parfor lowered on DPPL-device' in got_gpu_message.getvalue()) @unittest.skipIf(not dpctl.has_cpu_queues(), "No CPU platforms available") + @unittest.expectedFailure def test_with_dppl_context_cpu(self): @njit From 49a172dd908856c0bebc1017b1cf02eafb1004a6 Mon Sep 17 00:00:00 2001 From: Sergey Pokhodenko Date: Tue, 24 Nov 2020 11:05:31 +0300 Subject: [PATCH 07/32] Fix README about runing test command (#24) Fixed #22 --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 255f9085df..ca36b1a758 100644 --- a/README.md +++ b/README.md @@ -43,7 +43,7 @@ See folder `numba_dppy/tests`. 
Run tests: ```bash -python -m numba.runtests numba_dppy.tests +python -m unittest numba_dppy.tests ``` ## Examples From 3444a65770fcbaf2e318563ea908a1d7a7c6bb5e Mon Sep 17 00:00:00 2001 From: Sergey Pokhodenko Date: Fri, 4 Dec 2020 10:44:42 +0300 Subject: [PATCH 08/32] Update to new API of dpCtl 0.4.0 related to USM (#44) * Update to new API of dpCtl 0.4.0 related to USM * numba-dppy requires cffi --- conda-recipe/meta.yaml | 1 + numba_dppy/compiler.py | 4 ++-- setup.py | 1 + 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/conda-recipe/meta.yaml b/conda-recipe/meta.yaml index d8f6c1ecbb..4967295c05 100644 --- a/conda-recipe/meta.yaml +++ b/conda-recipe/meta.yaml @@ -23,6 +23,7 @@ requirements: run: - python - numba >=0.51 + - cffi - dpctl - spirv-tools - llvm-spirv diff --git a/numba_dppy/compiler.py b/numba_dppy/compiler.py index cf7bca2822..736cd96a26 100644 --- a/numba_dppy/compiler.py +++ b/numba_dppy/compiler.py @@ -11,7 +11,7 @@ from inspect import signature import dpctl -import dpctl._memory as dpctl_mem +import dpctl.memory as dpctl_mem import numpy as np from . import spirv_generator @@ -422,7 +422,7 @@ def _unpack_argument(self, ty, val, sycl_queue, retr, kernelargs, device_arrs.append(None) if isinstance(ty, types.Array): - if isinstance(val.base, dpctl_mem.Memory): + if hasattr(val.base, "__sycl_usm_array_interface__"): self._unpack_device_array_argument(val, kernelargs) else: default_behavior = self.check_for_invalid_access_type(access_type) diff --git a/setup.py b/setup.py index 37ad0bfc68..13f3d782d9 100644 --- a/setup.py +++ b/setup.py @@ -38,6 +38,7 @@ def get_ext_modules(): build_requires = ["cython"] install_requires = [ "numba", + "cffi", "dpctl", ] From 59faa938db0fb678358ac976bdb306edfe18a48c Mon Sep 17 00:00:00 2001 From: Sergey Pokhodenko Date: Fri, 4 Dec 2020 11:40:45 +0300 Subject: [PATCH 09/32] Update README.md dpCtl >=0.4.0 --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index ca36b1a758..92c61fc1bd 100644 --- a/README.md +++ b/README.md @@ -20,7 +20,7 @@ https://intelpython.github.io/dpnp/ ## Dependencies * numba >=0.51 (IntelPython/numba) -* dpCtl >=0.3.8 +* dpCtl >=0.4.0 * dpNP >=0.3 (optional) * llvm-spirv (SPIRV generation from LLVM IR) * llvmdev (LLVM IR generation) From cefb6ef360fc330fabd82db527b85a2e104653dc Mon Sep 17 00:00:00 2001 From: Elena Totmenina Date: Fri, 4 Dec 2020 15:39:40 +0300 Subject: [PATCH 10/32] Delete old backup file (#45) Co-authored-by: etotmeni --- .../parfor_loop_invariant_hoisting.py.bkp | 213 ------------------ 1 file changed, 213 deletions(-) delete mode 100644 numba_dppy/parfor_loop_invariant_hoisting.py.bkp diff --git a/numba_dppy/parfor_loop_invariant_hoisting.py.bkp b/numba_dppy/parfor_loop_invariant_hoisting.py.bkp deleted file mode 100644 index fb37a1c97b..0000000000 --- a/numba_dppy/parfor_loop_invariant_hoisting.py.bkp +++ /dev/null @@ -1,213 +0,0 @@ -from __future__ import print_function, division, absolute_import - -def add_to_def_once_sets(a_def, def_once, def_more): - '''If the variable is already defined more than once, do nothing. - Else if defined exactly once previously then transition this - variable to the defined more than once set (remove it from - def_once set and add to def_more set). - Else this must be the first time we've seen this variable defined - so add to def_once set. 
- ''' - if a_def in def_more: - pass - elif a_def in def_once: - def_more.add(a_def) - def_once.remove(a_def) - else: - def_once.add(a_def) - -def compute_def_once_block(block, def_once, def_more, getattr_taken, typemap, module_assigns): - '''Effect changes to the set of variables defined once or more than once - for a single block. - block - the block to process - def_once - set of variable names known to be defined exactly once - def_more - set of variable names known to be defined more than once - getattr_taken - dict mapping variable name to tuple of object and attribute taken - module_assigns - dict mapping variable name to the Global that they came from - ''' - # The only "defs" occur in assignments, so find such instructions. - assignments = block.find_insts(ir.Assign) - # For each assignment... - for one_assign in assignments: - # Get the LHS/target of the assignment. - a_def = one_assign.target.name - # Add variable to def sets. - add_to_def_once_sets(a_def, def_once, def_more) - - rhs = one_assign.value - if isinstance(rhs, ir.Global): - # Remember assignments of the form "a = Global(...)" - # Is this a module? - if isinstance(rhs.value, pytypes.ModuleType): - module_assigns[a_def] = rhs.value.__name__ - if isinstance(rhs, ir.Expr) and rhs.op == 'getattr' and rhs.value.name in def_once: - # Remember assignments of the form "a = b.c" - getattr_taken[a_def] = (rhs.value.name, rhs.attr) - if isinstance(rhs, ir.Expr) and rhs.op == 'call' and rhs.func.name in getattr_taken: - # If "a" is being called then lookup the getattr definition of "a" - # as above, getting the module variable "b" (base_obj) - # and the attribute "c" (base_attr). - base_obj, base_attr = getattr_taken[rhs.func.name] - if base_obj in module_assigns: - # If we know the definition of the module variable then get the module - # name from module_assigns. - base_mod_name = module_assigns[base_obj] - if not is_const_call(base_mod_name, base_attr): - # Calling a method on an object could modify the object and is thus - # like a def of that object. We call is_const_call to see if this module/attribute - # combination is known to not modify the module state. If we don't know that - # the combination is safe then we have to assume there could be a modification to - # the module and thus add the module variable as defined more than once. - add_to_def_once_sets(base_obj, def_once, def_more) - else: - # Assume the worst and say that base_obj could be modified by the call. - add_to_def_once_sets(base_obj, def_once, def_more) - if isinstance(rhs, ir.Expr) and rhs.op == 'call': - # If a mutable object is passed to a function, then it may be changed and - # therefore can't be hoisted. - # For each argument to the function... - for argvar in rhs.args: - # Get the argument's type. - if isinstance(argvar, ir.Var): - argvar = argvar.name - avtype = typemap[argvar] - # If that type doesn't have a mutable attribute or it does and it's set to - # not mutable then this usage is safe for hoisting. - if getattr(avtype, 'mutable', False): - # Here we have a mutable variable passed to a function so add this variable - # to the def lists. - add_to_def_once_sets(argvar, def_once, def_more) - -def compute_def_once_internal(loop_body, def_once, def_more, getattr_taken, typemap, module_assigns): - '''Compute the set of variables defined exactly once in the given set of blocks - and use the given sets for storing which variables are defined once, more than - once and which have had a getattr call on them. - ''' - # For each block... 
- for label, block in loop_body.items(): - # Scan this block and effect changes to def_once, def_more, and getattr_taken - # based on the instructions in that block. - compute_def_once_block(block, def_once, def_more, getattr_taken, typemap, module_assigns) - # Have to recursively process parfors manually here. - for inst in block.body: - if isinstance(inst, parfor.Parfor): - # Recursively compute for the parfor's init block. - compute_def_once_block(inst.init_block, def_once, def_more, getattr_taken, typemap, module_assigns) - # Recursively compute for the parfor's loop body. - compute_def_once_internal(inst.loop_body, def_once, def_more, getattr_taken, typemap, module_assigns) - -def compute_def_once(loop_body, typemap): - '''Compute the set of variables defined exactly once in the given set of blocks. - ''' - def_once = set() # set to hold variables defined exactly once - def_more = set() # set to hold variables defined more than once - getattr_taken = {} - module_assigns = {} - compute_def_once_internal(loop_body, def_once, def_more, getattr_taken, typemap, module_assigns) - return def_once - -def find_vars(var, varset): - assert isinstance(var, ir.Var) - varset.add(var.name) - return var - -def _hoist_internal(inst, dep_on_param, call_table, hoisted, not_hoisted, - typemap, stored_arrays): - if inst.target.name in stored_arrays: - not_hoisted.append((inst, "stored array")) - if config.DEBUG_ARRAY_OPT >= 1: - print("Instruction", inst, " could not be hoisted because the created array is stored.") - return False - - uses = set() - visit_vars_inner(inst.value, find_vars, uses) - diff = uses.difference(dep_on_param) - if config.DEBUG_ARRAY_OPT >= 1: - print("_hoist_internal:", inst, "uses:", uses, "diff:", diff) - if len(diff) == 0 and is_pure(inst.value, None, call_table): - if config.DEBUG_ARRAY_OPT >= 1: - print("Will hoist instruction", inst, typemap[inst.target.name]) - hoisted.append(inst) - if not isinstance(typemap[inst.target.name], types.npytypes.Array): - dep_on_param += [inst.target.name] - return True - else: - if len(diff) > 0: - not_hoisted.append((inst, "dependency")) - if config.DEBUG_ARRAY_OPT >= 1: - print("Instruction", inst, " could not be hoisted because of a dependency.") - else: - not_hoisted.append((inst, "not pure")) - if config.DEBUG_ARRAY_OPT >= 1: - print("Instruction", inst, " could not be hoisted because it isn't pure.") - return False - -def find_setitems_block(setitems, itemsset, block, typemap): - for inst in block.body: - if isinstance(inst, ir.StaticSetItem) or isinstance(inst, ir.SetItem): - setitems.add(inst.target.name) - # If we store a non-mutable object into an array then that is safe to hoist. - # If the stored object is mutable and you hoist then multiple entries in the - # outer array could reference the same object and changing one index would then - # change other indices. - if getattr(typemap[inst.value.name], "mutable", False): - itemsset.add(inst.value.name) - elif isinstance(inst, parfor.Parfor): - find_setitems_block(setitems, itemsset, inst.init_block, typemap) - find_setitems_body(setitems, itemsset, inst.loop_body, typemap) - -def find_setitems_body(setitems, itemsset, loop_body, typemap): - """ - Find the arrays that are written into (goes into setitems) and the - mutable objects (mostly arrays) that are written into other arrays - (goes into itemsset). 
- """ - for label, block in loop_body.items(): - find_setitems_block(setitems, itemsset, block, typemap) - -def hoist(parfor_params, loop_body, typemap, wrapped_blocks): - dep_on_param = copy.copy(parfor_params) - hoisted = [] - not_hoisted = [] - - # Compute the set of variable defined exactly once in the loop body. - def_once = compute_def_once(loop_body, typemap) - (call_table, reverse_call_table) = get_call_table(wrapped_blocks) - - setitems = set() - itemsset = set() - find_setitems_body(setitems, itemsset, loop_body, typemap) - dep_on_param = list(set(dep_on_param).difference(setitems)) - if config.DEBUG_ARRAY_OPT >= 1: - print("hoist - def_once:", def_once, "setitems:", - setitems, "itemsset:", itemsset, "dep_on_param:", - dep_on_param, "parfor_params:", parfor_params) - - for label, block in loop_body.items(): - new_block = [] - for inst in block.body: - if isinstance(inst, ir.Assign) and inst.target.name in def_once: - if _hoist_internal(inst, dep_on_param, call_table, - hoisted, not_hoisted, typemap, itemsset): - # don't add this instruction to the block since it is - # hoisted - continue - elif isinstance(inst, parfor.Parfor): - new_init_block = [] - if config.DEBUG_ARRAY_OPT >= 1: - print("parfor") - inst.dump() - for ib_inst in inst.init_block.body: - if (isinstance(ib_inst, ir.Assign) and - ib_inst.target.name in def_once): - if _hoist_internal(ib_inst, dep_on_param, call_table, - hoisted, not_hoisted, typemap, itemsset): - # don't add this instuction to the block since it is hoisted - continue - new_init_block.append(ib_inst) - inst.init_block.body = new_init_block - - new_block.append(inst) - block.body = new_block - return hoisted, not_hoisted - From d727a0a11c20aad318d9f0384df050a0e881a3fd Mon Sep 17 00:00:00 2001 From: Elena Totmenina Date: Fri, 4 Dec 2020 15:41:15 +0300 Subject: [PATCH 11/32] Del dppl dir in tests (#43) * Del dppl dir in tests * Del unused var Co-authored-by: etotmeni --- numba_dppy/tests/__init__.py | 3 +-- numba_dppy/tests/dppl/__init__.py | 6 ------ numba_dppy/tests/{dppl => }/test_arg_accessor.py | 0 numba_dppy/tests/{dppl => }/test_arg_types.py | 0 numba_dppy/tests/{dppl => }/test_atomic_op.py | 0 numba_dppy/tests/{dppl => }/test_barrier.py | 0 numba_dppy/tests/{dppl => }/test_black_scholes.py | 0 numba_dppy/tests/{dppl => }/test_caching.py | 0 numba_dppy/tests/{dppl => }/test_device_array_args.py | 0 numba_dppy/tests/{dppl => }/test_dpctl_api.py | 0 numba_dppy/tests/{dppl => }/test_dpnp_functions.py | 0 numba_dppy/tests/{dppl => }/test_dppl_fallback.py | 0 numba_dppy/tests/{dppl => }/test_dppl_func.py | 0 numba_dppy/tests/{dppl => }/test_math_functions.py | 0 .../tests/{dppl => }/test_numpy_bit_twiddling_functions.py | 0 .../tests/{dppl => }/test_numpy_comparison_functions.py | 0 .../tests/{dppl => }/test_numpy_floating_functions.py | 0 numba_dppy/tests/{dppl => }/test_numpy_math_functions.py | 0 .../tests/{dppl => }/test_numpy_trigonomteric_functions.py | 0 numba_dppy/tests/{dppl => }/test_parfor_lower_message.py | 0 numba_dppy/tests/{dppl => }/test_prange.py | 0 numba_dppy/tests/{dppl => }/test_print.py | 0 numba_dppy/tests/{dppl => }/test_sum_reduction.py | 0 numba_dppy/tests/{dppl => }/test_vectorize.py | 0 numba_dppy/tests/{dppl => }/test_with_context.py | 0 25 files changed, 1 insertion(+), 8 deletions(-) delete mode 100644 numba_dppy/tests/dppl/__init__.py rename numba_dppy/tests/{dppl => }/test_arg_accessor.py (100%) rename numba_dppy/tests/{dppl => }/test_arg_types.py (100%) rename numba_dppy/tests/{dppl => }/test_atomic_op.py (100%) 
rename numba_dppy/tests/{dppl => }/test_barrier.py (100%) rename numba_dppy/tests/{dppl => }/test_black_scholes.py (100%) rename numba_dppy/tests/{dppl => }/test_caching.py (100%) rename numba_dppy/tests/{dppl => }/test_device_array_args.py (100%) rename numba_dppy/tests/{dppl => }/test_dpctl_api.py (100%) rename numba_dppy/tests/{dppl => }/test_dpnp_functions.py (100%) rename numba_dppy/tests/{dppl => }/test_dppl_fallback.py (100%) rename numba_dppy/tests/{dppl => }/test_dppl_func.py (100%) rename numba_dppy/tests/{dppl => }/test_math_functions.py (100%) rename numba_dppy/tests/{dppl => }/test_numpy_bit_twiddling_functions.py (100%) rename numba_dppy/tests/{dppl => }/test_numpy_comparison_functions.py (100%) rename numba_dppy/tests/{dppl => }/test_numpy_floating_functions.py (100%) rename numba_dppy/tests/{dppl => }/test_numpy_math_functions.py (100%) rename numba_dppy/tests/{dppl => }/test_numpy_trigonomteric_functions.py (100%) rename numba_dppy/tests/{dppl => }/test_parfor_lower_message.py (100%) rename numba_dppy/tests/{dppl => }/test_prange.py (100%) rename numba_dppy/tests/{dppl => }/test_print.py (100%) rename numba_dppy/tests/{dppl => }/test_sum_reduction.py (100%) rename numba_dppy/tests/{dppl => }/test_vectorize.py (100%) rename numba_dppy/tests/{dppl => }/test_with_context.py (100%) diff --git a/numba_dppy/tests/__init__.py b/numba_dppy/tests/__init__.py index d29208fb91..5a2199f149 100644 --- a/numba_dppy/tests/__init__.py +++ b/numba_dppy/tests/__init__.py @@ -8,10 +8,9 @@ def load_tests(loader, tests, pattern): suite = SerialSuite() - this_dir = dirname(__file__) if dppy_config.dppy_present: - suite.addTests(load_testsuite(loader, join(this_dir, 'dppl'))) + suite.addTests(load_testsuite(loader, dirname(__file__))) else: print("skipped DPPL tests") diff --git a/numba_dppy/tests/dppl/__init__.py b/numba_dppy/tests/dppl/__init__.py deleted file mode 100644 index cff5a36cc2..0000000000 --- a/numba_dppy/tests/dppl/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -from numba.testing import SerialSuite -from numba.testing import load_testsuite -import os - -def load_tests(loader, tests, pattern): - return SerialSuite(load_testsuite(loader, os.path.dirname(__file__))) diff --git a/numba_dppy/tests/dppl/test_arg_accessor.py b/numba_dppy/tests/test_arg_accessor.py similarity index 100% rename from numba_dppy/tests/dppl/test_arg_accessor.py rename to numba_dppy/tests/test_arg_accessor.py diff --git a/numba_dppy/tests/dppl/test_arg_types.py b/numba_dppy/tests/test_arg_types.py similarity index 100% rename from numba_dppy/tests/dppl/test_arg_types.py rename to numba_dppy/tests/test_arg_types.py diff --git a/numba_dppy/tests/dppl/test_atomic_op.py b/numba_dppy/tests/test_atomic_op.py similarity index 100% rename from numba_dppy/tests/dppl/test_atomic_op.py rename to numba_dppy/tests/test_atomic_op.py diff --git a/numba_dppy/tests/dppl/test_barrier.py b/numba_dppy/tests/test_barrier.py similarity index 100% rename from numba_dppy/tests/dppl/test_barrier.py rename to numba_dppy/tests/test_barrier.py diff --git a/numba_dppy/tests/dppl/test_black_scholes.py b/numba_dppy/tests/test_black_scholes.py similarity index 100% rename from numba_dppy/tests/dppl/test_black_scholes.py rename to numba_dppy/tests/test_black_scholes.py diff --git a/numba_dppy/tests/dppl/test_caching.py b/numba_dppy/tests/test_caching.py similarity index 100% rename from numba_dppy/tests/dppl/test_caching.py rename to numba_dppy/tests/test_caching.py diff --git a/numba_dppy/tests/dppl/test_device_array_args.py 
b/numba_dppy/tests/test_device_array_args.py similarity index 100% rename from numba_dppy/tests/dppl/test_device_array_args.py rename to numba_dppy/tests/test_device_array_args.py diff --git a/numba_dppy/tests/dppl/test_dpctl_api.py b/numba_dppy/tests/test_dpctl_api.py similarity index 100% rename from numba_dppy/tests/dppl/test_dpctl_api.py rename to numba_dppy/tests/test_dpctl_api.py diff --git a/numba_dppy/tests/dppl/test_dpnp_functions.py b/numba_dppy/tests/test_dpnp_functions.py similarity index 100% rename from numba_dppy/tests/dppl/test_dpnp_functions.py rename to numba_dppy/tests/test_dpnp_functions.py diff --git a/numba_dppy/tests/dppl/test_dppl_fallback.py b/numba_dppy/tests/test_dppl_fallback.py similarity index 100% rename from numba_dppy/tests/dppl/test_dppl_fallback.py rename to numba_dppy/tests/test_dppl_fallback.py diff --git a/numba_dppy/tests/dppl/test_dppl_func.py b/numba_dppy/tests/test_dppl_func.py similarity index 100% rename from numba_dppy/tests/dppl/test_dppl_func.py rename to numba_dppy/tests/test_dppl_func.py diff --git a/numba_dppy/tests/dppl/test_math_functions.py b/numba_dppy/tests/test_math_functions.py similarity index 100% rename from numba_dppy/tests/dppl/test_math_functions.py rename to numba_dppy/tests/test_math_functions.py diff --git a/numba_dppy/tests/dppl/test_numpy_bit_twiddling_functions.py b/numba_dppy/tests/test_numpy_bit_twiddling_functions.py similarity index 100% rename from numba_dppy/tests/dppl/test_numpy_bit_twiddling_functions.py rename to numba_dppy/tests/test_numpy_bit_twiddling_functions.py diff --git a/numba_dppy/tests/dppl/test_numpy_comparison_functions.py b/numba_dppy/tests/test_numpy_comparison_functions.py similarity index 100% rename from numba_dppy/tests/dppl/test_numpy_comparison_functions.py rename to numba_dppy/tests/test_numpy_comparison_functions.py diff --git a/numba_dppy/tests/dppl/test_numpy_floating_functions.py b/numba_dppy/tests/test_numpy_floating_functions.py similarity index 100% rename from numba_dppy/tests/dppl/test_numpy_floating_functions.py rename to numba_dppy/tests/test_numpy_floating_functions.py diff --git a/numba_dppy/tests/dppl/test_numpy_math_functions.py b/numba_dppy/tests/test_numpy_math_functions.py similarity index 100% rename from numba_dppy/tests/dppl/test_numpy_math_functions.py rename to numba_dppy/tests/test_numpy_math_functions.py diff --git a/numba_dppy/tests/dppl/test_numpy_trigonomteric_functions.py b/numba_dppy/tests/test_numpy_trigonomteric_functions.py similarity index 100% rename from numba_dppy/tests/dppl/test_numpy_trigonomteric_functions.py rename to numba_dppy/tests/test_numpy_trigonomteric_functions.py diff --git a/numba_dppy/tests/dppl/test_parfor_lower_message.py b/numba_dppy/tests/test_parfor_lower_message.py similarity index 100% rename from numba_dppy/tests/dppl/test_parfor_lower_message.py rename to numba_dppy/tests/test_parfor_lower_message.py diff --git a/numba_dppy/tests/dppl/test_prange.py b/numba_dppy/tests/test_prange.py similarity index 100% rename from numba_dppy/tests/dppl/test_prange.py rename to numba_dppy/tests/test_prange.py diff --git a/numba_dppy/tests/dppl/test_print.py b/numba_dppy/tests/test_print.py similarity index 100% rename from numba_dppy/tests/dppl/test_print.py rename to numba_dppy/tests/test_print.py diff --git a/numba_dppy/tests/dppl/test_sum_reduction.py b/numba_dppy/tests/test_sum_reduction.py similarity index 100% rename from numba_dppy/tests/dppl/test_sum_reduction.py rename to numba_dppy/tests/test_sum_reduction.py diff --git 
a/numba_dppy/tests/dppl/test_vectorize.py b/numba_dppy/tests/test_vectorize.py similarity index 100% rename from numba_dppy/tests/dppl/test_vectorize.py rename to numba_dppy/tests/test_vectorize.py diff --git a/numba_dppy/tests/dppl/test_with_context.py b/numba_dppy/tests/test_with_context.py similarity index 100% rename from numba_dppy/tests/dppl/test_with_context.py rename to numba_dppy/tests/test_with_context.py From 2b17ae8a05cdc922fdb16b6bd7e57dc24646df8f Mon Sep 17 00:00:00 2001 From: Sergey Pokhodenko Date: Fri, 4 Dec 2020 09:04:32 -0600 Subject: [PATCH 12/32] Revert "numba-dppy requires cffi" This reverts commit 776bf2228e2aef77ea9767ce2ae90ff204482230. --- conda-recipe/meta.yaml | 1 - setup.py | 1 - 2 files changed, 2 deletions(-) diff --git a/conda-recipe/meta.yaml b/conda-recipe/meta.yaml index 4967295c05..d8f6c1ecbb 100644 --- a/conda-recipe/meta.yaml +++ b/conda-recipe/meta.yaml @@ -23,7 +23,6 @@ requirements: run: - python - numba >=0.51 - - cffi - dpctl - spirv-tools - llvm-spirv diff --git a/setup.py b/setup.py index 13f3d782d9..37ad0bfc68 100644 --- a/setup.py +++ b/setup.py @@ -38,7 +38,6 @@ def get_ext_modules(): build_requires = ["cython"] install_requires = [ "numba", - "cffi", "dpctl", ] From 9cf5aeb990ec9ecd57f1fa9a1dd3674f3e5ff091 Mon Sep 17 00:00:00 2001 From: Sergey Pokhodenko Date: Fri, 4 Dec 2020 09:06:12 -0600 Subject: [PATCH 13/32] Remove use of cffi --- numba_dppy/dppl_lowerer.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/numba_dppy/dppl_lowerer.py b/numba_dppy/dppl_lowerer.py index 51fb072551..a317c990a6 100644 --- a/numba_dppy/dppl_lowerer.py +++ b/numba_dppy/dppl_lowerer.py @@ -979,14 +979,13 @@ def relatively_deep_copy(obj, memo): from numba.core.compiler import CompileResult from numba.np.ufunc.dufunc import DUFunc from ctypes import _CFuncPtr - from cffi.api import FFI from types import ModuleType from numba.core.types.abstract import Type # objects which shouldn't or can't be copied and it's ok not to copy it. 
if isinstance(obj, (FunctionIdentity, _DispatcherBase, Function, Type, Dispatcher, ModuleType, Signature, DPPLFunctionTemplate, CompileResult, - DUFunc, _CFuncPtr, FFI, + DUFunc, _CFuncPtr, type, str, bool, type(None))): return obj From ff8fe55f50c14b0e010f32d570d974f572cdd956 Mon Sep 17 00:00:00 2001 From: Elena Totmenina Date: Mon, 7 Dec 2020 22:44:44 +0300 Subject: [PATCH 14/32] Rename dppl to dppy (#42) Co-authored-by: etotmeni Co-authored-by: Diptorup Deb --- .gitignore | 27 +++ HowTo.rst | 6 +- numba_dppy/CHANGE_LOG | 4 +- numba_dppy/__init__.py | 52 +++--- numba_dppy/compiler.py | 103 +++++------ numba_dppy/decorators.py | 14 +- numba_dppy/descriptor.py | 20 +-- numba_dppy/dispatcher.py | 22 +-- ...n_call_gen.py => dppy_host_fn_call_gen.py} | 14 +- .../{dppl_lowerer.py => dppy_lowerer.py} | 52 +++--- ...spatcher.py => dppy_offload_dispatcher.py} | 12 +- ...ppl_passbuilder.py => dppy_passbuilder.py} | 34 ++-- numba_dppy/{dppl_passes.py => dppy_passes.py} | 30 ++-- .../examples/{dppl_func.py => dppy_func.py} | 10 +- ...l_with_context.py => dppy_with_context.py} | 2 +- numba_dppy/examples/matmul.py | 12 +- numba_dppy/examples/pairwise_distance.py | 6 +- numba_dppy/examples/sum-hybrid.py | 10 +- numba_dppy/examples/sum.py | 8 +- numba_dppy/examples/sum2D.py | 10 +- numba_dppy/examples/sum_ndarray.py | 6 +- numba_dppy/examples/sum_reduction.py | 8 +- numba_dppy/examples/sum_reduction_ocl.py | 16 +- .../examples/sum_reduction_recursive_ocl.py | 16 +- .../experimental_numpy_lowering_overload.py | 12 +- numba_dppy/initialize.py | 8 +- numba_dppy/ocl/atomics/atomic_ops.cl | 56 +++--- numba_dppy/ocl/ocldecl.py | 44 ++--- numba_dppy/ocl/oclimpl.py | 22 +-- numba_dppy/ocl/stubs.py | 6 +- numba_dppy/printimpl.py | 4 +- numba_dppy/target.py | 26 +-- numba_dppy/target_dispatcher.py | 12 +- numba_dppy/testing.py | 12 +- numba_dppy/tests/__init__.py | 6 +- numba_dppy/tests/test_arg_accessor.py | 18 +- numba_dppy/tests/test_arg_types.py | 26 +-- numba_dppy/tests/test_atomic_op.py | 168 +++++++++--------- numba_dppy/tests/test_barrier.py | 34 ++-- numba_dppy/tests/test_black_scholes.py | 14 +- numba_dppy/tests/test_caching.py | 14 +- numba_dppy/tests/test_device_array_args.py | 16 +- numba_dppy/tests/test_dpctl_api.py | 4 +- numba_dppy/tests/test_dpnp_functions.py | 6 +- numba_dppy/tests/test_dppl_fallback.py | 26 +-- numba_dppy/tests/test_dppl_func.py | 32 ++-- numba_dppy/tests/test_math_functions.py | 76 ++++---- .../test_numpy_bit_twiddling_functions.py | 6 +- .../tests/test_numpy_comparison_functions.py | 6 +- .../tests/test_numpy_floating_functions.py | 6 +- numba_dppy/tests/test_numpy_math_functions.py | 6 +- .../test_numpy_trigonomteric_functions.py | 6 +- numba_dppy/tests/test_parfor_lower_message.py | 8 +- numba_dppy/tests/test_prange.py | 14 +- numba_dppy/tests/test_print.py | 16 +- numba_dppy/tests/test_sum_reduction.py | 12 +- numba_dppy/tests/test_vectorize.py | 6 +- numba_dppy/tests/test_with_context.py | 16 +- 58 files changed, 634 insertions(+), 604 deletions(-) create mode 100644 .gitignore rename numba_dppy/{dppl_host_fn_call_gen.py => dppy_host_fn_call_gen.py} (98%) rename numba_dppy/{dppl_lowerer.py => dppy_lowerer.py} (97%) rename numba_dppy/{dppl_offload_dispatcher.py => dppy_offload_dispatcher.py} (73%) rename numba_dppy/{dppl_passbuilder.py => dppy_passbuilder.py} (82%) rename numba_dppy/{dppl_passes.py => dppy_passes.py} (95%) rename numba_dppy/examples/{dppl_func.py => dppy_func.py} (81%) rename numba_dppy/examples/{dppl_with_context.py => dppy_with_context.py} (94%) diff 
--git a/.gitignore new file mode 100644 index 0000000000..340ae2678b --- /dev/null +++ b/.gitignore @@ -0,0 +1,27 @@ +*.pyc +*.o +*.so +*.dylib +*.pyd +*.pdb +*.egg-info +*.sw[po] +*.out +*.ll +.coverage +.nfs* +tags +MANIFEST + +build/ +docs/_build/ +docs/gh-pages/ +dist/ +htmlcov/ +.idea/ +.vscode/ +.mypy_cache/ +.ipynb_checkpoints/ +__pycache__/ + +docs/source/developer/autogen* diff --git a/HowTo.rst b/HowTo.rst index 03927c0ea7..7689bc52bf 100644 --- a/HowTo.rst +++ b/HowTo.rst @@ -7,7 +7,7 @@ are listed below with the help of sample code snippets. In this release we have the implementation of the OAK approach described in MS138 in section 4.3.2. The new decorator is described below. -To access the features, the driver module has to be imported from numba_dppy.dppl_driver +To access the features, the driver module has to be imported from numba_dppy.dppy_driver New Decorator ============= @@ -61,7 +61,7 @@ Primitive types are passed by value to the kernel, currently supported are int, Math Kernels ============ -This release has support for math kernels. See numba_dppy/tests/dppl/test_math_functions.py +This release has support for math kernels. See numba_dppy/tests/dppy/test_math_functions.py for more details. @@ -170,6 +170,6 @@ Testing All examples can be found in numba_dppy/examples/ -All tests can be found in numba_dppy/tests/dppl and can be triggered by the following command: +All tests can be found in numba_dppy/tests/dppy and can be triggered by the following command: ``python -m numba.runtests numba_dppy.tests`` diff --git a/numba_dppy/CHANGE_LOG b/numba_dppy/CHANGE_LOG index e3cb06522c..2a1fcdee40 100644 --- a/numba_dppy/CHANGE_LOG +++ b/numba_dppy/CHANGE_LOG @@ -1,7 +1,7 @@ -NUMBA Version 0.48.0 + DPPL Version 0.3.0 (June 29, 2020) +NUMBA Version 0.48.0 + DPPY Version 0.3.0 (June 29, 2020) -------------------------------------------------------- This release includes: -* Caching of dppl.kernels which will improve performance. +* Caching of dppy.kernels which will improve performance. * Addition of support for Intel Advisor which will help in profiling applications. diff --git a/numba_dppy/__init__.py b/numba_dppy/__init__.py index 6eff949d16..ac4e898889 100644 --- a/numba_dppy/__init__.py +++ b/numba_dppy/__init__.py @@ -4,9 +4,9 @@ Extensions to Numba for Intel GPUs introduce two new features into Numba: - a. A new backend that has a new decorator called @dppl.kernel that + a. A new backend that has a new decorator called @dppy.kernel that exposes an explicit kernel programming interface similar to the - existing Numba GPU code-generation backends. The @dppl.kernel + existing Numba GPU code-generation backends. The @dppy.kernel decorator currently implements a subset of OpenCL’s API through Numba’s intrinsic functions. @@ -20,48 +20,48 @@ Explicit Kernel Programming with new Decorators: -@dppl.kernel +@dppy.kernel - The @dppl.kernel decorator can be used with or without extra arguments. + The @dppy.kernel decorator can be used with or without extra arguments. Optionally, users can pass the signature of the arguments to the decorator. When a signature is provided to the DK decorator the version of the OpenCL kernel generated gets specialized for that type signature.
--------------------------------------------------------------------------- - @dppl.kernel + @dppy.kernel def data_parallel_sum(a, b, c): - i = dppl.get_global_id(0) + i = dppy.get_global_id(0) c[i] = a[i] + b[i] --------------------------------------------------------------------------- To invoke the above function users will need to provide a global size (OpenCL) which is the size of a (same as b and c) and a - local size (dppl.DEFAULT_LOCAL_SIZE if the user doesn't want to specify). + local size (dppy.DEFAULT_LOCAL_SIZE if the user doesn't want to specify). Example shown below: --------------------------------------------------------------------------- - data_parallel_sum[len(a), dppl.DEFAULT_LOCAL_SIZE](dA, dB, dC) + data_parallel_sum[len(a), dppy.DEFAULT_LOCAL_SIZE](dA, dB, dC) --------------------------------------------------------------------------- -@dppl.func +@dppy.func - The @dppl.func decorator is the other decorator provided in the explicit + The @dppy.func decorator is the other decorator provided in the explicit kernel programming model. This decorator allows users to write “device” functions that can be invoked from inside DK functions but cannot be invoked from the host. The decorator also supports type specialization as with the - DK decorator. Functions decorated with @dppl.func will also be JIT compiled - and inlined into the OpenCL Program containing the @dppl.kernel function - calling it. A @dppl.func will not be launched as an OpenCL kernel. + DK decorator. Functions decorated with @dppy.func will also be JIT compiled + and inlined into the OpenCL Program containing the @dppy.kernel function + calling it. A @dppy.func will not be launched as an OpenCL kernel. --------------------------------------------------------------------------- - @dppl.func + @dppy.func def bar(a): return a*a - @dppl.kernel + @dppy.kernel def foo(in, out): - i = dppl.get_global_id(0) + i = dppy.get_global_id(0) out[i] = bar(in[i]) --------------------------------------------------------------------------- @@ -71,13 +71,13 @@ def foo(in, out): The following table has the list of intrinsic functions that can be directly used inside a DK function. All the functions are equivalent to the similarly named OpenCL function. Wherever there is an implementation difference - between the Numba-PyDPPL version and the OpenCL version, the difference is + between the Numba-DPPY version and the OpenCL version, the difference is explained in the table. Note that these functions cannot be used anywhere else outside of a DK function in a Numba application. Readers are referred to the OpenCL API specs to review the functionality of each function. +----------------------+----------------------------+----------------------+ - | Numba-DPPL intrinsic | Equivalent OpenCL function | Notes | + | Numba-DPPY intrinsic | Equivalent OpenCL function | Notes | +----------------------+----------------------------+----------------------+ | get_global_id | get_global_id | | +----------------------+----------------------------+----------------------+ @@ -121,7 +121,7 @@ def foo(in, out): |print |print(varargs) |The print function is a | | | |subset of the OpenCL | | | |printf function. The | - | | |Numba-DPPL version of | + | | |Numba-DPPY version of | | | |print supports only int, | | | |string, and float | | | |arguments.
| @@ -160,16 +160,16 @@ def foo(in, out): -Complete Example using @dppl.kernel: +Complete Example using @dppy.kernel: --------------------------------------------------------------------------- import numpy as np - import numba_dppy, numba_dppy as dppl + import numba_dppy, numba_dppy as dppy import dpctl - @dppl.kernel + @dppy.kernel def data_parallel_sum(a, b, c): - i = dppl.get_global_id(0) + i = dppy.get_global_id(0) c[i] = a[i] + b[i] def driver(device_env, a, b, c, global_size): @@ -181,7 +181,7 @@ def driver(device_env, a, b, c, global_size): print("before : ", dA._ndarray) print("before : ", dB._ndarray) print("before : ", dC._ndarray) - data_parallel_sum[global_size, dppl.DEFAULT_LOCAL_SIZE](dA, dB, dC) + data_parallel_sum[global_size, dppy.DEFAULT_LOCAL_SIZE](dA, dB, dC) device_env.copy_array_from_device(dC) print("after : ", dC._ndarray) @@ -509,11 +509,11 @@ def main(): if dppy_present: from .device_init import * else: - raise ImportError("Importing dppl failed") + raise ImportError("Importing numba-dppy failed") def test(*args, **kwargs): if not dppy_present and not is_available(): - dppl_error() + dppy_error() return numba.testing.test("numba_dppy.tests", *args, **kwargs) diff --git a/numba_dppy/compiler.py b/numba_dppy/compiler.py index 736cd96a26..c8a329738a 100644 --- a/numba_dppy/compiler.py +++ b/numba_dppy/compiler.py @@ -2,7 +2,7 @@ import copy from collections import namedtuple -from .dppl_passbuilder import DPPLPassBuilder +from .dppy_passbuilder import DPPYPassBuilder from numba.core.typing.templates import ConcreteTemplate from numba.core import types, compiler, ir from numba.core.typing.templates import AbstractTemplate @@ -12,6 +12,7 @@ import dpctl import dpctl.memory as dpctl_mem +import dpctl.program as dpctl_prog import numpy as np from . import spirv_generator @@ -19,10 +20,10 @@ import os from numba.core.compiler import DefaultPassBuilder, CompilerBase -DEBUG=os.environ.get('NUMBA_DPPL_DEBUG', None) -_NUMBA_DPPL_READ_ONLY = "read_only" -_NUMBA_DPPL_WRITE_ONLY = "write_only" -_NUMBA_DPPL_READ_WRITE = "read_write" +DEBUG=os.environ.get('NUMBA_DPPY_DEBUG', None) +_NUMBA_DPPY_READ_ONLY = "read_only" +_NUMBA_DPPY_WRITE_ONLY = "write_only" +_NUMBA_DPPY_READ_WRITE = "read_write" def _raise_no_device_found_error(): error_message = ("No OpenCL device specified. " @@ -30,7 +31,7 @@ def _raise_no_device_found_error(): raise ValueError(error_message) def _raise_invalid_kernel_enqueue_args(): - error_message = ("Incorrect number of arguments for enquing dppl.kernel. " + error_message = ("Incorrect number of arguments for enquing dppy.kernel. " "Usage: device_env, global size, local size. 
" "The local size argument is optional.") raise ValueError(error_message) @@ -51,15 +52,15 @@ def get_ordered_arg_access_types(pyfunc, access_types): return ordered_arg_access_types -class DPPLCompiler(CompilerBase): - """ DPPL Compiler """ +class DPPYCompiler(CompilerBase): + """ DPPY Compiler """ def define_pipelines(self): # this maintains the objmode fallback behaviour pms = [] if not self.state.flags.force_pyobject: - #print("Numba-DPPL [INFO]: Using Numba-DPPL pipeline") - pms.append(DPPLPassBuilder.define_nopython_pipeline(self.state)) + #print("Numba-DPPY [INFO]: Using Numba-DPPY pipeline") + pms.append(DPPYPassBuilder.define_nopython_pipeline(self.state)) if self.state.status.can_fallback or self.state.flags.force_pyobject: pms.append( DefaultPassBuilder.define_objectmode_pipeline(self.state) @@ -71,12 +72,12 @@ def define_pipelines(self): return pms -def compile_with_dppl(pyfunc, return_type, args, debug): +def compile_with_dppy(pyfunc, return_type, args, debug): # First compilation will trigger the initialization of the OpenCL backend. - from .descriptor import dppl_target + from .descriptor import dppy_target - typingctx = dppl_target.typing_context - targetctx = dppl_target.target_context + typingctx = dppy_target.typing_context + targetctx = dppy_target.target_context # TODO handle debug flag flags = compiler.Flags() # Do not compile (generate native code), just lower (to LLVM) @@ -93,7 +94,7 @@ def compile_with_dppl(pyfunc, return_type, args, debug): return_type=return_type, flags=flags, locals={}, - pipeline_class=DPPLCompiler) + pipeline_class=DPPYCompiler) elif isinstance(pyfunc, ir.FunctionIR): cres = compiler.compile_ir(typingctx=typingctx, targetctx=targetctx, @@ -102,7 +103,7 @@ def compile_with_dppl(pyfunc, return_type, args, debug): return_type=return_type, flags=flags, locals={}, - pipeline_class=DPPLCompiler) + pipeline_class=DPPYCompiler) else: assert(0) # Linking depending libraries @@ -120,7 +121,7 @@ def compile_kernel(sycl_queue, pyfunc, args, access_types, debug=False): # This will be get_current_queue sycl_queue = dpctl.get_current_queue() - cres = compile_with_dppl(pyfunc, None, args, debug=debug) + cres = compile_with_dppy(pyfunc, None, args, debug=debug) func = cres.library.get_function(cres.fndesc.llvm_func_name) kernel = cres.target_context.prepare_ocl_kernel(func, cres.signature.args) # The kernel objet should have a reference to the target context it is compiled for. @@ -128,7 +129,7 @@ def compile_kernel(sycl_queue, pyfunc, args, access_types, debug=False): # depending on the target context. For example, we want to link our kernel object # with implementation containing atomic operations only when atomic operations # are being used in the kernel. 
- oclkern = DPPLKernel(context=cres.target_context, + oclkern = DPPYKernel(context=cres.target_context, sycl_queue=sycl_queue, llvm_module=kernel.module, name=kernel.name, @@ -146,7 +147,7 @@ def compile_kernel_parfor(sycl_queue, func_ir, args, args_with_addrspaces, if isinstance(a, types.npytypes.Array): print("addrspace:", a.addrspace) - cres = compile_with_dppl(func_ir, None, args_with_addrspaces, + cres = compile_with_dppy(func_ir, None, args_with_addrspaces, debug=debug) func = cres.library.get_function(cres.fndesc.llvm_func_name) @@ -159,7 +160,7 @@ def compile_kernel_parfor(sycl_queue, func_ir, args, args_with_addrspaces, kernel = cres.target_context.prepare_ocl_kernel(func, cres.signature.args) #kernel = cres.target_context.prepare_ocl_kernel(func, args_with_addrspaces) - oclkern = DPPLKernel(context=cres.target_context, + oclkern = DPPYKernel(context=cres.target_context, sycl_queue=sycl_queue, llvm_module=kernel.module, name=kernel.name, @@ -168,44 +169,44 @@ def compile_kernel_parfor(sycl_queue, func_ir, args, args_with_addrspaces, return oclkern -def compile_dppl_func(pyfunc, return_type, args, debug=False): - cres = compile_with_dppl(pyfunc, return_type, args, debug=debug) +def compile_dppy_func(pyfunc, return_type, args, debug=False): + cres = compile_with_dppy(pyfunc, return_type, args, debug=debug) func = cres.library.get_function(cres.fndesc.llvm_func_name) cres.target_context.mark_ocl_device(func) - devfn = DPPLFunction(cres) + devfn = DPPYFunction(cres) - class dppl_function_template(ConcreteTemplate): + class dppy_function_template(ConcreteTemplate): key = devfn cases = [cres.signature] - cres.typing_context.insert_user_function(devfn, dppl_function_template) + cres.typing_context.insert_user_function(devfn, dppy_function_template) libs = [cres.library] cres.target_context.insert_user_function(devfn, cres.fndesc, libs) return devfn -# Compile dppl function template -def compile_dppl_func_template(pyfunc): - """Compile a DPPLFunctionTemplate +# Compile dppy function template +def compile_dppy_func_template(pyfunc): + """Compile a DPPYFunctionTemplate """ - from .descriptor import dppl_target + from .descriptor import dppy_target - dft = DPPLFunctionTemplate(pyfunc) + dft = DPPYFunctionTemplate(pyfunc) - class dppl_function_template(AbstractTemplate): + class dppy_function_template(AbstractTemplate): key = dft def generic(self, args, kws): assert not kws return dft.compile(args) - typingctx = dppl_target.typing_context - typingctx.insert_user_function(dft, dppl_function_template) + typingctx = dppy_target.typing_context + typingctx.insert_user_function(dft, dppy_function_template) return dft -class DPPLFunctionTemplate(object): - """Unmaterialized dppl function +class DPPYFunctionTemplate(object): + """Unmaterialized dppy function """ def __init__(self, pyfunc, debug=False): self.py_func = pyfunc @@ -220,7 +221,7 @@ def compile(self, args): this object. 
""" if args not in self._compileinfos: - cres = compile_with_dppl(self.py_func, None, args, debug=self.debug) + cres = compile_with_dppy(self.py_func, None, args, debug=self.debug) func = cres.library.get_function(cres.fndesc.llvm_func_name) cres.target_context.mark_ocl_device(func) first_definition = not self._compileinfos @@ -240,7 +241,7 @@ def compile(self, args): return cres.signature -class DPPLFunction(object): +class DPPYFunction(object): def __init__(self, cres): self.cres = cres @@ -282,7 +283,7 @@ def _ensure_valid_work_group_size(val, work_item_grid): return list(val[::-1]) # reversing due to sycl and opencl interop kernel range mismatch semantic -class DPPLKernelBase(object): +class DPPYKernelBase(object): """Define interface for configurable kernels """ @@ -293,9 +294,9 @@ def __init__(self): # list of supported access types, stored in dict for fast lookup self.valid_access_types = { - _NUMBA_DPPL_READ_ONLY: _NUMBA_DPPL_READ_ONLY, - _NUMBA_DPPL_WRITE_ONLY: _NUMBA_DPPL_WRITE_ONLY, - _NUMBA_DPPL_READ_WRITE: _NUMBA_DPPL_READ_WRITE} + _NUMBA_DPPY_READ_ONLY: _NUMBA_DPPY_READ_ONLY, + _NUMBA_DPPY_WRITE_ONLY: _NUMBA_DPPY_WRITE_ONLY, + _NUMBA_DPPY_READ_WRITE: _NUMBA_DPPY_READ_WRITE} def copy(self): return copy.copy(self) @@ -331,14 +332,14 @@ def __getitem__(self, args): return self.configure(sycl_queue, gs, ls) -class DPPLKernel(DPPLKernelBase): +class DPPYKernel(DPPYKernelBase): """ A OCL kernel object """ def __init__(self, context, sycl_queue, llvm_module, name, argtypes, ordered_arg_access_types=None): - super(DPPLKernel, self).__init__() + super(DPPYKernel, self).__init__() self._llvm_module = llvm_module self.assembly = self.binary = llvm_module.__str__() self.entry_name = name @@ -355,7 +356,7 @@ def __init__(self, context, sycl_queue, llvm_module, name, argtypes, self.spirv_bc = spirv_generator.llvm_to_spirv(self.context, self.binary) # create a program - self.program = dpctl.create_program_from_spirv(self.sycl_queue, self.spirv_bc) + self.program = dpctl_prog.create_program_from_spirv(self.sycl_queue, self.spirv_bc) # create a kernel self.kernel = self.program.get_sycl_kernel(self.entry_name) @@ -385,7 +386,7 @@ def _pack_argument(self, ty, val, sycl_queue, device_arr, access_type): """ if (device_arr and (access_type not in self.valid_access_types or access_type in self.valid_access_types and - self.valid_access_types[access_type] != _NUMBA_DPPL_READ_ONLY)): + self.valid_access_types[access_type] != _NUMBA_DPPY_READ_ONLY)): # we get the date back to host if have created a # device_array or if access_type of this device_array # is not of type read_only and read_write @@ -431,8 +432,8 @@ def _unpack_argument(self, ty, val, sycl_queue, retr, kernelargs, usm_ndarr = np.ndarray(val.shape, buffer=usm_buf, dtype=val.dtype) if (default_behavior or - self.valid_access_types[access_type] == _NUMBA_DPPL_READ_ONLY or - self.valid_access_types[access_type] == _NUMBA_DPPL_READ_WRITE): + self.valid_access_types[access_type] == _NUMBA_DPPY_READ_ONLY or + self.valid_access_types[access_type] == _NUMBA_DPPY_READ_WRITE): np.copyto(usm_ndarr, val) device_arrs[-1] = (usm_buf, usm_ndarr, val) @@ -486,18 +487,18 @@ def check_for_invalid_access_type(self, access_type): return False -class JitDPPLKernel(DPPLKernelBase): +class JitDPPYKernel(DPPYKernelBase): def __init__(self, func, access_types): - super(JitDPPLKernel, self).__init__() + super(JitDPPYKernel, self).__init__() self.py_func = func self.definitions = {} self.access_types = access_types - from .descriptor import dppl_target + from 
.descriptor import dppy_target - self.typingctx = dppl_target.typing_context + self.typingctx = dppy_target.typing_context def __call__(self, *args, **kwargs): assert not kwargs, "Keyword Arguments are not supported" diff --git a/numba_dppy/decorators.py b/numba_dppy/decorators.py index a8b6bbba36..641d924134 100644 --- a/numba_dppy/decorators.py +++ b/numba_dppy/decorators.py @@ -1,11 +1,11 @@ from __future__ import print_function, absolute_import, division from numba.core import sigutils, types -from .compiler import (compile_kernel, JitDPPLKernel, compile_dppl_func_template, - compile_dppl_func, get_ordered_arg_access_types) +from .compiler import (compile_kernel, JitDPPYKernel, compile_dppy_func_template, - compile_dppy_func, get_ordered_arg_access_types) def kernel(signature=None, access_types=None, debug=False): - """JIT compile a python function using the DPPL backend. + """JIT compile a python function using the DPPY backend. A kernel is equivalent to an OpenCL kernel function, and has the same restrictions as defined by the SPIR_KERNEL calling convention. @@ -22,14 +22,14 @@ def autojit(debug=False, access_types=None): def _kernel_autojit(pyfunc): ordered_arg_access_types = get_ordered_arg_access_types(pyfunc, access_types) - return JitDPPLKernel(pyfunc, ordered_arg_access_types) + return JitDPPYKernel(pyfunc, ordered_arg_access_types) return _kernel_autojit def _kernel_jit(signature, debug, access_types): argtypes, restype = sigutils.normalize_signature(signature) if restype is not None and restype != types.void: - msg = ("DPPL kernel must have void return type but got {restype}") + msg = ("DPPY kernel must have void return type but got {restype}") raise TypeError(msg.format(restype=restype)) def _wrapped(pyfunc): @@ -54,9 +54,9 @@ def _func_jit(signature): argtypes, restype = sigutils.normalize_signature(signature) def _wrapped(pyfunc): - return compile_dppl_func(pyfunc, restype, argtypes) + return compile_dppy_func(pyfunc, restype, argtypes) return _wrapped def _func_autojit(pyfunc): - return compile_dppl_func_template(pyfunc) + return compile_dppy_func_template(pyfunc) diff --git a/numba_dppy/descriptor.py b/numba_dppy/descriptor.py index c0a24868c2..c8e6a58ec7 100644 --- a/numba_dppy/descriptor.py +++ b/numba_dppy/descriptor.py @@ -3,41 +3,41 @@ from numba.core.options import TargetOptions from numba.core import dispatcher, utils, typing -from .target import DPPLTargetContext, DPPLTypingContext +from .target import DPPYTargetContext, DPPYTypingContext from numba.core.cpu import CPUTargetOptions -class DPPLTarget(TargetDescriptor): +class DPPYTarget(TargetDescriptor): options = CPUTargetOptions - #typingctx = DPPLTypingContext() - #targetctx = DPPLTargetContext(typingctx) + #typingctx = DPPYTypingContext() + #targetctx = DPPYTargetContext(typingctx) @utils.cached_property def _toplevel_target_context(self): # Lazily-initialized top-level target context, for all threads - return DPPLTargetContext(self.typing_context) + return DPPYTargetContext(self.typing_context) @utils.cached_property def _toplevel_typing_context(self): # Lazily-initialized top-level typing context, for all threads - return DPPLTypingContext() + return DPPYTypingContext() @property def target_context(self): """ - The target context for DPPL targets. + The target context for DPPY targets. """ return self._toplevel_target_context @property def typing_context(self): """ - The typing context for DPPL targets.
+ The typing context for DPPY targets. """ return self._toplevel_typing_context -# The global DPPL target -dppl_target = DPPLTarget() +# The global DPPY target +dppy_target = DPPYTarget() diff --git a/numba_dppy/dispatcher.py b/numba_dppy/dispatcher.py index a4c32ec7ec..d00a597875 100644 --- a/numba_dppy/dispatcher.py +++ b/numba_dppy/dispatcher.py @@ -4,17 +4,17 @@ #from numba.targets.descriptors import TargetDescriptor #from numba.targets.options import TargetOptions -#import numba_dppy, numba_dppy as dppl +#import numba_dppy, numba_dppy as dppy from numba_dppy import kernel, autojit -from .descriptor import dppl_target +from .descriptor import dppy_target #from numba.npyufunc.deviceufunc import (UFuncMechanism, GenerializedUFunc, # GUFuncCallSteps) from .. import dispatcher, utils, typing -from .compiler import DPPLCompiler +from .compiler import DPPYCompiler -class DPPLDispatcher(dispatcher.Dispatcher): - targetdescr = dppl_target +class DPPYDispatcher(dispatcher.Dispatcher): + targetdescr = dppy_target def __init__(self, py_func, locals={}, targetoptions={}): @@ -58,7 +58,7 @@ def __getitem__(self, *args): def __getattr__(self, key): return getattr(self.compiled, key) -class DPPLUFuncDispatcher(object): +class DPPYUFuncDispatcher(object): """ Invoke the OpenCL ufunc specialization for the given inputs. """ @@ -86,7 +86,7 @@ def __call__(self, *args, **kws): depending on the input arguments. Type must match the input arguments. """ - return DPPLUFuncMechanism.call(self.functions, args, kws) + return DPPYUFuncMechanism.call(self.functions, args, kws) def reduce(self, arg, stream=0): assert len(list(self.functions.keys())[0]) == 2, "must be a binary " \ @@ -142,7 +142,7 @@ def __reduce(self, mem, gpu_mems, stream): return left -class _DPPLGUFuncCallSteps(GUFuncCallSteps): +class _DPPYGUFuncCallSteps(GUFuncCallSteps): __slots__ = [ '_stream', ] @@ -167,10 +167,10 @@ def launch_kernel(self, kernel, nelem, args): kernel.forall(nelem, queue=self._stream)(*args) -class DPPLGenerializedUFunc(GenerializedUFunc): +class DPPYGenerializedUFunc(GenerializedUFunc): @property def _call_steps(self): - return _DPPLGUFuncCallSteps + return _DPPYGUFuncCallSteps def _broadcast_scalar_input(self, ary, shape): return devicearray.DeviceNDArray(shape=shape, @@ -188,7 +188,7 @@ def _broadcast_add_axis(self, ary, newshape): gpu_data=ary.gpu_data) -class DPPLUFuncMechanism(UFuncMechanism): +class DPPYUFuncMechanism(UFuncMechanism): """ Provide OpenCL specialization """ diff --git a/numba_dppy/dppl_host_fn_call_gen.py b/numba_dppy/dppy_host_fn_call_gen.py similarity index 98% rename from numba_dppy/dppl_host_fn_call_gen.py rename to numba_dppy/dppy_host_fn_call_gen.py index 10a4820906..7d1c9bcea4 100644 --- a/numba_dppy/dppl_host_fn_call_gen.py +++ b/numba_dppy/dppy_host_fn_call_gen.py @@ -9,7 +9,7 @@ from numba.core.ir_utils import legalize_names -class DPPLHostFunctionCallsGenerator(object): +class DPPYHostFunctionCallsGenerator(object): def __init__(self, lowerer, cres, num_inputs): self.lowerer = lowerer self.context = self.lowerer.context @@ -70,31 +70,31 @@ def _init_llvm_types_and_constants(self): def _declare_functions(self): get_queue_fnty = lc.Type.function(self.void_ptr_t, ()) self.get_queue = self.builder.module.get_or_insert_function(get_queue_fnty, - name="DPPLQueueMgr_GetCurrentQueue") + name="DPCTLQueueMgr_GetCurrentQueue") submit_range_fnty = lc.Type.function(self.void_ptr_t, [self.void_ptr_t, self.void_ptr_t, self.void_ptr_ptr_t, self.int32_ptr_t, self.intp_t, self.intp_ptr_t, self.intp_t, 
self.void_ptr_t, self.intp_t]) self.submit_range = self.builder.module.get_or_insert_function(submit_range_fnty, - name="DPPLQueue_SubmitRange") + name="DPCTLQueue_SubmitRange") queue_memcpy_fnty = lc.Type.function(lir.VoidType(), [self.void_ptr_t, self.void_ptr_t, self.void_ptr_t, self.intp_t]) self.queue_memcpy = self.builder.module.get_or_insert_function(queue_memcpy_fnty, - name="DPPLQueue_Memcpy") + name="DPCTLQueue_Memcpy") queue_wait_fnty = lc.Type.function(lir.VoidType(), [self.void_ptr_t]) self.queue_wait = self.builder.module.get_or_insert_function(queue_wait_fnty, - name="DPPLQueue_Wait") + name="DPCTLQueue_Wait") usm_shared_fnty = lc.Type.function(self.void_ptr_t, [self.intp_t, self.void_ptr_t]) self.usm_shared = self.builder.module.get_or_insert_function(usm_shared_fnty, - name="DPPLmalloc_shared") + name="DPCTLmalloc_shared") usm_free_fnty = lc.Type.function(lir.VoidType(), [self.void_ptr_t, self.void_ptr_t]) self.usm_free = self.builder.module.get_or_insert_function(usm_free_fnty, - name="DPPLfree_with_queue") + name="DPCTLfree_with_queue") def allocate_kenrel_arg_array(self, num_kernel_args): self.sycl_queue_val = cgutils.alloca_once(self.builder, self.void_ptr_t) diff --git a/numba_dppy/dppl_lowerer.py b/numba_dppy/dppy_lowerer.py similarity index 97% rename from numba_dppy/dppl_lowerer.py rename to numba_dppy/dppy_lowerer.py index a317c990a6..1561a6d85e 100644 --- a/numba_dppy/dppl_lowerer.py +++ b/numba_dppy/dppy_lowerer.py @@ -12,7 +12,7 @@ from numba.core import (compiler, ir, types, sigutils, lowering, funcdesc, config) from numba.parfors import parfor -import numba_dppy, numba_dppy as dppl +import numba_dppy, numba_dppy as dppy from numba.core.ir_utils import (add_offset_to_labels, replace_var_names, remove_dels, @@ -38,9 +38,9 @@ from numba.core.errors import NumbaParallelSafetyWarning, NumbaPerformanceWarning from .dufunc_inliner import dufunc_inliner -from . import dppl_host_fn_call_gen as dppl_call_gen +from . import dppy_host_fn_call_gen as dppy_call_gen import dpctl -from numba_dppy.target import DPPLTargetContext +from numba_dppy.target import DPPYTargetContext def _print_block(block): @@ -72,7 +72,7 @@ def _schedule_loop(parfor_dim, legal_loop_indices, loop_ranges, param_dict): for eachdim in range(global_id_dim): gufunc_txt += (" " + legal_loop_indices[eachdim] + " = " - + "dppl.get_global_id(" + str(eachdim) + ")\n") + + "dppy.get_global_id(" + str(eachdim) + ")\n") for eachdim in range(global_id_dim, for_loop_dim): @@ -444,7 +444,7 @@ def print_arg_with_addrspaces(args): print("gufunc_txt = ", type(gufunc_txt), "\n", gufunc_txt) sys.stdout.flush() # Force gufunc outline into existence. - globls = {"np": np, "numba": numba, "dppl": dppl} + globls = {"np": np, "numba": numba, "dppy": dppy} locls = {} exec(gufunc_txt, globls, locls) gufunc_func = locls[gufunc_name] @@ -740,7 +740,7 @@ def _lower_parfor_gufunc(lowerer, parfor): parfor.races, typemap) - generate_dppl_host_wrapper( + generate_dppy_host_wrapper( lowerer, func, gu_signature, @@ -828,10 +828,10 @@ def bump_alpha(c, class_map): return (gu_sin, gu_sout) -# Keep all the dppl kernels and programs created alive indefinitely. +# Keep all the dppy kernels and programs created alive indefinitely. 
keep_alive_kernels = [] -def generate_dppl_host_wrapper(lowerer, +def generate_dppy_host_wrapper(lowerer, cres, gu_signature, outer_sig, @@ -852,7 +852,7 @@ def generate_dppl_host_wrapper(lowerer, num_dim = len(loop_ranges) if config.DEBUG_ARRAY_OPT: - print("generate_dppl_host_wrapper") + print("generate_dppy_host_wrapper") print("args = ", expr_args) print("outer_sig = ", outer_sig.args, outer_sig.return_type, outer_sig.recvr, outer_sig.pysig) @@ -868,8 +868,8 @@ def generate_dppl_host_wrapper(lowerer, # print("cres.fndesc", cres.fndesc, type(cres.fndesc)) - # get dppl_cpu_portion_lowerer object - dppl_cpu_lowerer = dppl_call_gen.DPPLHostFunctionCallsGenerator( + # get dppy_cpu_portion_lowerer object + dppy_cpu_lowerer = dppy_call_gen.DPPYHostFunctionCallsGenerator( lowerer, cres, num_inputs) # Compute number of args ------------------------------------------------ @@ -886,7 +886,7 @@ def generate_dppl_host_wrapper(lowerer, # now that we know the total number of kernel args, lets allocate # a kernel_arg array - dppl_cpu_lowerer.allocate_kenrel_arg_array(num_expanded_args) + dppy_cpu_lowerer.allocate_kenrel_arg_array(num_expanded_args) ninouts = len(expr_args) @@ -931,7 +931,7 @@ def val_type_or_none(context, lowerer, x): "\n\tval_type:", val_type, type(val_type), "\n\tindex:", index) - dppl_cpu_lowerer.process_kernel_arg(var, llvm_arg, arg_type, gu_sig, + dppy_cpu_lowerer.process_kernel_arg(var, llvm_arg, arg_type, gu_sig, val_type, index, modified_arrays) # ----------------------------------------------------------------------- @@ -951,7 +951,7 @@ def load_range(v): step = load_range(step) loop_ranges[i] = (start, stop, step) - dppl_cpu_lowerer.enqueue_kernel_and_read_back(loop_ranges) + dppy_cpu_lowerer.enqueue_kernel_and_read_back(loop_ranges) from numba.core.lowering import Lower @@ -975,7 +975,7 @@ def relatively_deep_copy(obj, memo): from numba.core.types.functions import Function, Dispatcher from numba.core.bytecode import FunctionIdentity from numba.core.typing.templates import Signature - from numba_dppy.compiler import DPPLFunctionTemplate + from numba_dppy.compiler import DPPYFunctionTemplate from numba.core.compiler import CompileResult from numba.np.ufunc.dufunc import DUFunc from ctypes import _CFuncPtr @@ -983,9 +983,9 @@ def relatively_deep_copy(obj, memo): from numba.core.types.abstract import Type # objects which shouldn't or can't be copied and it's ok not to copy it. 
- if isinstance(obj, (FunctionIdentity, _DispatcherBase, Function, Type, Dispatcher, ModuleType, - Signature, DPPLFunctionTemplate, CompileResult, - DUFunc, _CFuncPtr, + if isinstance(obj, (FunctionIdentity, _DispatcherBase, Function, Type, + Dispatcher, ModuleType, Signature, + DPPYFunctionTemplate, CompileResult, DUFunc, _CFuncPtr, type, str, bool, type(None))): return obj @@ -1132,7 +1132,7 @@ def get_slots_members(obj): return cpy -class DPPLLower(Lower): +class DPPYLower(Lower): def __init__(self, context, library, fndesc, func_ir, metadata=None): Lower.__init__(self, context, library, fndesc, func_ir, metadata) memo = {} @@ -1141,7 +1141,7 @@ def __init__(self, context, library, fndesc, func_ir, metadata=None): func_ir_cpu = relatively_deep_copy(func_ir, memo) - cpu_context = context.cpu_context if isinstance(context, DPPLTargetContext) else context + cpu_context = context.cpu_context if isinstance(context, DPPYTargetContext) else context self.gpu_lower = Lower(context, library, fndesc, func_ir, metadata) self.cpu_lower = Lower(cpu_context, library, fndesc_cpu, func_ir_cpu, metadata) @@ -1151,11 +1151,11 @@ def lower(self): # 1. Start lowering of parent function # 2. Try to lower parfor on GPU # 2.a. enter lower_parfor_rollback and prepare function to lower on GPU - insert get_global_id. - # 2.a.a. starting lower parfor body - enter this point (DPPLLower.lower()) second time. + # 2.a.a. starting lower parfor body - enter this point (DPPYLower.lower()) second time. # 2.a.b. If lowering on GPU failed - try on CPU. # 2.a.c. Since get_global_id is NOT supported with CPU context - fail and throw exception # 2.b. in lower_parfor_rollback catch exception and restore the parfor body and other state to its initial state - # 2.c. in lower_parfor_rollback throw an exception to catch it here (DPPLLower.lower()) + # 2.c. in lower_parfor_rollback throw an exception to catch it here (DPPYLower.lower()) # 3. Catch exception and start parfor lowering with CPU context. # WARNING: this approach only works in case no device specific modifications were added to @@ -1169,7 +1169,7 @@ def lower(self): lowering.lower_extensions[parfor.Parfor].pop() except Exception as e: if numba_dppy.compiler.DEBUG: - print("Failed to lower parfor on DPPL-device. Due to:\n", e) + print("Failed to lower parfor on DPPY-device.
Due to:\n", e) lowering.lower_extensions[parfor.Parfor].pop() if (lowering.lower_extensions[parfor.Parfor][-1] == numba.parfors.parfor_lowering._lower_parfor_parallel): self.cpu_lower.lower() @@ -1195,13 +1195,13 @@ def lower_parfor_rollback(lowerer, parfor): try: _lower_parfor_gufunc(lowerer, parfor) if numba_dppy.compiler.DEBUG: - msg = "Parfor lowered on DPPL-device" + msg = "Parfor lowered on DPPY-device" print(msg, parfor.loc) except Exception as e: - msg = "Failed to lower parfor on DPPL-device.\nTo see details set environment variable NUMBA_DPPL_DEBUG=1" + msg = "Failed to lower parfor on DPPY-device.\nTo see details set environment variable NUMBA_DPPY_DEBUG=1" warnings.warn(NumbaPerformanceWarning(msg, parfor.loc)) raise e -def dppl_lower_array_expr(lowerer, expr): +def dppy_lower_array_expr(lowerer, expr): raise NotImplementedError(expr) diff --git a/numba_dppy/dppl_offload_dispatcher.py b/numba_dppy/dppy_offload_dispatcher.py similarity index 73% rename from numba_dppy/dppl_offload_dispatcher.py rename to numba_dppy/dppy_offload_dispatcher.py index db841bef06..0c5fe10f5e 100644 --- a/numba_dppy/dppl_offload_dispatcher.py +++ b/numba_dppy/dppy_offload_dispatcher.py @@ -3,21 +3,21 @@ import numba_dppy.config as dppy_config -class DpplOffloadDispatcher(dispatcher.Dispatcher): +class DppyOffloadDispatcher(dispatcher.Dispatcher): targetdescr = cpu_target def __init__(self, py_func, locals={}, targetoptions={}, impl_kind='direct', pipeline_class=compiler.Compiler): if dppy_config.dppy_present: - from numba_dppy.compiler import DPPLCompiler + from numba_dppy.compiler import DPPYCompiler targetoptions['parallel'] = True dispatcher.Dispatcher.__init__(self, py_func, locals=locals, - targetoptions=targetoptions, impl_kind=impl_kind, pipeline_class=DPPLCompiler) + targetoptions=targetoptions, impl_kind=impl_kind, pipeline_class=DPPYCompiler) else: print("---------------------------------------------------------------------") - print("WARNING : DPPL pipeline ignored. Ensure OpenCL drivers are installed.") + print("WARNING : DPPY pipeline ignored. 
Ensure OpenCL drivers are installed.") print("---------------------------------------------------------------------") dispatcher.Dispatcher.__init__(self, py_func, locals=locals, targetoptions=targetoptions, impl_kind=impl_kind, pipeline_class=pipeline_class) -dispatcher_registry['__dppl_offload_gpu__'] = DpplOffloadDispatcher -dispatcher_registry['__dppl_offload_cpu__'] = DpplOffloadDispatcher +dispatcher_registry['__dppy_offload_gpu__'] = DppyOffloadDispatcher +dispatcher_registry['__dppy_offload_cpu__'] = DppyOffloadDispatcher diff --git a/numba_dppy/dppl_passbuilder.py b/numba_dppy/dppy_passbuilder.py similarity index 82% rename from numba_dppy/dppl_passbuilder.py rename to numba_dppy/dppy_passbuilder.py index 0ddaea6d0b..0a32a099cf 100644 --- a/numba_dppy/dppl_passbuilder.py +++ b/numba_dppy/dppy_passbuilder.py @@ -17,19 +17,19 @@ DumpParforDiagnostics, IRLegalization, InlineOverloads, PreLowerStripPhis) -from .dppl_passes import ( - DPPLConstantSizeStaticLocalMemoryPass, - DPPLPreParforPass, - DPPLParforPass, +from .dppy_passes import ( + DPPYConstantSizeStaticLocalMemoryPass, + DPPYPreParforPass, + DPPYParforPass, SpirvFriendlyLowering, - DPPLAddNumpyOverloadPass, - DPPLAddNumpyRemoveOverloadPass, - DPPLNoPythonBackend + DPPYAddNumpyOverloadPass, + DPPYAddNumpyRemoveOverloadPass, + DPPYNoPythonBackend ) -class DPPLPassBuilder(object): +class DPPYPassBuilder(object): """ - This is the DPPL pass builder to run Intel GPU/CPU specific + This is the DPPY pass builder to run Intel GPU/CPU specific code-generation and optimization passes. This pass builder does not offer objectmode and interpreted passes. """ @@ -46,12 +46,12 @@ def default_numba_nopython_pipeline(state, pm): # this pass adds required logic to overload default implementation of # Numpy functions - pm.add_pass(DPPLAddNumpyOverloadPass, "dppl add typing template for Numpy functions") + pm.add_pass(DPPYAddNumpyOverloadPass, "dppy add typing template for Numpy functions") # Add pass to ensure when users are allocating static # constant memory the size is a constant and can not # come from a closure variable - pm.add_pass(DPPLConstantSizeStaticLocalMemoryPass, "dppl constant size for static local memory") + pm.add_pass(DPPYConstantSizeStaticLocalMemoryPass, "dppy constant size for static local memory") # pre typing if not state.flags.no_rewrites: @@ -90,24 +90,24 @@ def default_numba_nopython_pipeline(state, pm): @staticmethod - def define_nopython_pipeline(state, name='dppl_nopython'): + def define_nopython_pipeline(state, name='dppy_nopython'): """Returns an nopython mode pipeline based PassManager """ pm = PassManager(name) - DPPLPassBuilder.default_numba_nopython_pipeline(state, pm) + DPPYPassBuilder.default_numba_nopython_pipeline(state, pm) # Intel GPU/CPU specific optimizations - pm.add_pass(DPPLPreParforPass, "Preprocessing for parfors") + pm.add_pass(DPPYPreParforPass, "Preprocessing for parfors") if not state.flags.no_rewrites: pm.add_pass(NopythonRewrites, "nopython rewrites") - pm.add_pass(DPPLParforPass, "convert to parfors") + pm.add_pass(DPPYParforPass, "convert to parfors") # legalise pm.add_pass(IRLegalization, "ensure IR is legal prior to lowering") # lower pm.add_pass(SpirvFriendlyLowering, "SPIRV-friendly lowering pass") - pm.add_pass(DPPLNoPythonBackend, "nopython mode backend") - pm.add_pass(DPPLAddNumpyRemoveOverloadPass, "dppl remove typing template for Numpy functions") + pm.add_pass(DPPYNoPythonBackend, "nopython mode backend") + pm.add_pass(DPPYAddNumpyRemoveOverloadPass, "dppy remove typing template 
for Numpy functions") pm.finalize() return pm diff --git a/numba_dppy/dppl_passes.py b/numba_dppy/dppy_passes.py similarity index 95% rename from numba_dppy/dppl_passes.py rename to numba_dppy/dppy_passes.py index f9e2633c3c..0bb2eadb48 100644 --- a/numba_dppy/dppl_passes.py +++ b/numba_dppy/dppy_passes.py @@ -24,7 +24,7 @@ from numba.core.compiler_machinery import FunctionPass, LoweringPass, register_pass -from .dppl_lowerer import DPPLLower +from .dppy_lowerer import DPPYLower from numba.parfors.parfor import PreParforPass as _parfor_PreParforPass, replace_functions_map from numba.parfors.parfor import ParforPass as _parfor_ParforPass @@ -40,8 +40,8 @@ def dpnp_available(): @register_pass(mutates_CFG=False, analysis_only=True) -class DPPLAddNumpyOverloadPass(FunctionPass): - _name = "dppl_add_numpy_overload_pass" +class DPPYAddNumpyOverloadPass(FunctionPass): + _name = "dppy_add_numpy_overload_pass" def __init__(self): FunctionPass.__init__(self) @@ -122,8 +122,8 @@ def generic(self, args, kws): return True @register_pass(mutates_CFG=False, analysis_only=True) -class DPPLAddNumpyRemoveOverloadPass(FunctionPass): - _name = "dppl_remove_numpy_overload_pass" +class DPPYAddNumpyRemoveOverloadPass(FunctionPass): + _name = "dppy_remove_numpy_overload_pass" def __init__(self): FunctionPass.__init__(self) @@ -143,9 +143,9 @@ def run_pass(self, state): return True @register_pass(mutates_CFG=True, analysis_only=False) -class DPPLConstantSizeStaticLocalMemoryPass(FunctionPass): +class DPPYConstantSizeStaticLocalMemoryPass(FunctionPass): - _name = "dppl_constant_size_static_local_memory_pass" + _name = "dppy_constant_size_static_local_memory_pass" def __init__(self): FunctionPass.__init__(self) @@ -218,9 +218,9 @@ def run_pass(self, state): @register_pass(mutates_CFG=True, analysis_only=False) -class DPPLPreParforPass(FunctionPass): +class DPPYPreParforPass(FunctionPass): - _name = "dppl_pre_parfor_pass" + _name = "dppy_pre_parfor_pass" def __init__(self): FunctionPass.__init__(self) @@ -262,9 +262,9 @@ def run_pass(self, state): @register_pass(mutates_CFG=True, analysis_only=False) -class DPPLParforPass(FunctionPass): +class DPPYParforPass(FunctionPass): - _name = "dppl_parfor_pass" + _name = "dppy_parfor_pass" def __init__(self): FunctionPass.__init__(self) @@ -343,9 +343,9 @@ def run_pass(self, state): targetctx = state.targetctx # This should not happen here, after we have the notion of context in Numba - # we should have specialized dispatcher for dppl context and that dispatcher + # we should have specialized dispatcher for dppy context and that dispatcher # should be a cpu dispatcher that will overload the lowering functions for - # linalg for dppl.cpu_dispatcher and the dppl.gpu_dipatcher should be the + # linalg for dppy.cpu_dispatcher and the dppy.gpu_dipatcher should be the # current target context we have to launch kernels. 
 # This is broken as this essentially adds the new lowering in a list which
 # means it does not get replaced with the new lowering_buitins
@@ -373,7 +373,7 @@ def run_pass(self, state):
                                       noalias=flags.noalias)
 
         with targetctx.push_code_library(library):
-            lower = DPPLLower(targetctx, library, fndesc, interp,
+            lower = DPPYLower(targetctx, library, fndesc, interp,
                               metadata=metadata)
             lower.lower()
             if not flags.no_cpython_wrapper:
@@ -400,7 +400,7 @@ def run_pass(self, state):
 
 
 @register_pass(mutates_CFG=True, analysis_only=False)
-class DPPLNoPythonBackend(FunctionPass):
+class DPPYNoPythonBackend(FunctionPass):
 
     _name = "nopython_backend"
 
diff --git a/numba_dppy/examples/dppl_func.py b/numba_dppy/examples/dppy_func.py
similarity index 81%
rename from numba_dppy/examples/dppl_func.py
rename to numba_dppy/examples/dppy_func.py
index ec86681457..353ba48995 100644
--- a/numba_dppy/examples/dppl_func.py
+++ b/numba_dppy/examples/dppy_func.py
@@ -1,26 +1,26 @@
 import sys
 import numpy as np
-import numba_dppy, numba_dppy as dppl
+import numba_dppy, numba_dppy as dppy
 import math
 import dpctl
 
 
-@dppl.func
+@dppy.func
 def g(a):
     return a + 1
 
 
-@dppl.kernel
+@dppy.kernel
 def f(a, b):
-    i = dppl.get_global_id(0)
+    i = dppy.get_global_id(0)
     b[i] = g(a[i])
 
 
 def driver(a, b, N):
     print(b)
     print("--------")
-    f[N, dppl.DEFAULT_LOCAL_SIZE](a, b)
+    f[N, dppy.DEFAULT_LOCAL_SIZE](a, b)
     print(b)
 
diff --git a/numba_dppy/examples/dppl_with_context.py b/numba_dppy/examples/dppy_with_context.py
similarity index 94%
rename from numba_dppy/examples/dppl_with_context.py
rename to numba_dppy/examples/dppy_with_context.py
index c830e81ec6..6df025f5ca 100644
--- a/numba_dppy/examples/dppl_with_context.py
+++ b/numba_dppy/examples/dppy_with_context.py
@@ -1,6 +1,6 @@
 import numpy as np
 from numba import njit, prange
-import numba_dppy, numba_dppy as dppl
+import numba_dppy, numba_dppy as dppy
 import dpctl
 
 @njit
diff --git a/numba_dppy/examples/matmul.py b/numba_dppy/examples/matmul.py
index 35bef5be8a..b97ac49ca1 100644
--- a/numba_dppy/examples/matmul.py
+++ b/numba_dppy/examples/matmul.py
@@ -4,14 +4,14 @@
 import sys
 import numpy as np
-import numba_dppy, numba_dppy as dppl
+import numba_dppy, numba_dppy as dppy
 import dpctl
 
 
-@dppl.kernel
-def dppl_gemm(a, b, c):
-    i = dppl.get_global_id(0)
-    j = dppl.get_global_id(1)
+@dppy.kernel
+def dppy_gemm(a, b, c):
+    i = dppy.get_global_id(0)
+    j = dppy.get_global_id(1)
     if i >= c.shape[0] or j >= c.shape[1]:
         return
     c[i,j] = 0
@@ -30,7 +30,7 @@ def dppl_gemm(a, b, c):
 
 def driver(a, b, c):
     # Invoke the kernel
-    dppl_gemm[griddim,blockdim](a, b, c)
+    dppy_gemm[griddim,blockdim](a, b, c)
 
 
 def main():
diff --git a/numba_dppy/examples/pairwise_distance.py b/numba_dppy/examples/pairwise_distance.py
index cc5c232c92..b72c41ba9c 100644
--- a/numba_dppy/examples/pairwise_distance.py
+++ b/numba_dppy/examples/pairwise_distance.py
@@ -6,7 +6,7 @@
 import argparse
 import timeit
 
-import numba_dppy, numba_dppy as dppl
+import numba_dppy, numba_dppy as dppy
 import dpctl
 import dpctl._memory as dpctl_mem
 
@@ -28,9 +28,9 @@
 D = np.empty((args.n, args.n))
 
 
-@dppl.kernel
+@dppy.kernel
 def pairwise_distance(X, D, xshape0, xshape1):
-    idx = dppl.get_global_id(0)
+    idx = dppy.get_global_id(0)
 
     #for i in range(xshape0):
     for j in range(X.shape[0]):
diff --git a/numba_dppy/examples/sum-hybrid.py b/numba_dppy/examples/sum-hybrid.py
index 418976f53a..e66c51ae2c 100644
--- a/numba_dppy/examples/sum-hybrid.py
+++ b/numba_dppy/examples/sum-hybrid.py
@@ -4,13 +4,13 @@
 import sys
 import numpy as np
-import numba_dppy, numba_dppy as dppl
+import numba_dppy, numba_dppy as dppy
 import dpctl
 
 
-@dppl.kernel
+@dppy.kernel
 def data_parallel_sum(a, b, c):
-    i = dppl.get_global_id(0)
+    i = dppy.get_global_id(0)
     c[i] = a[i] + b[i]
 
 
@@ -27,7 +27,7 @@ def main():
         c = np.ones_like(a)
         print("before A: ", a)
         print("before B: ", b)
-        data_parallel_sum[global_size, dppl.DEFAULT_LOCAL_SIZE](a, b, c)
+        data_parallel_sum[global_size, dppy.DEFAULT_LOCAL_SIZE](a, b, c)
         print("after C: ", c)
     else:
         print("CPU device not found")
@@ -40,7 +40,7 @@ def main():
         c = np.ones_like(a)
         print("before A: ", a)
         print("before B: ", b)
-        data_parallel_sum[global_size, dppl.DEFAULT_LOCAL_SIZE](a, b, c)
+        data_parallel_sum[global_size, dppy.DEFAULT_LOCAL_SIZE](a, b, c)
         print("after C: ", c)
     else:
         print("GPU device not found")
diff --git a/numba_dppy/examples/sum.py b/numba_dppy/examples/sum.py
index f97b8243cb..fdc1623fa7 100644
--- a/numba_dppy/examples/sum.py
+++ b/numba_dppy/examples/sum.py
@@ -4,13 +4,13 @@
 import sys
 import numpy as np
-import numba_dppy, numba_dppy as dppl
+import numba_dppy, numba_dppy as dppy
 import dpctl
 
 
-@dppl.kernel
+@dppy.kernel
 def data_parallel_sum(a, b, c):
-    i = dppl.get_global_id(0)
+    i = dppy.get_global_id(0)
     c[i] = a[i] + b[i]
 
 
@@ -18,7 +18,7 @@ def driver(a, b, c, global_size):
     print("before : ", a)
     print("before : ", b)
     print("before : ", c)
-    data_parallel_sum[global_size, dppl.DEFAULT_LOCAL_SIZE](a, b, c)
+    data_parallel_sum[global_size, dppy.DEFAULT_LOCAL_SIZE](a, b, c)
     print("after : ", c)
 
diff --git a/numba_dppy/examples/sum2D.py b/numba_dppy/examples/sum2D.py
index 00be613d2b..90959c8bdf 100644
--- a/numba_dppy/examples/sum2D.py
+++ b/numba_dppy/examples/sum2D.py
@@ -4,21 +4,21 @@
 import sys
 import numpy as np
-import numba_dppy, numba_dppy as dppl
+import numba_dppy, numba_dppy as dppy
 import dpctl
 
 
-@dppl.kernel
+@dppy.kernel
 def data_parallel_sum(a, b, c):
-    i = dppl.get_global_id(0)
-    j = dppl.get_global_id(1)
+    i = dppy.get_global_id(0)
+    j = dppy.get_global_id(1)
     c[i,j] = a[i,j] + b[i,j]
 
 
 def driver(a, b, c, global_size):
     print("before A: ", a)
     print("before B: ", b)
-    data_parallel_sum[global_size, dppl.DEFAULT_LOCAL_SIZE](a, b, c)
+    data_parallel_sum[global_size, dppy.DEFAULT_LOCAL_SIZE](a, b, c)
     print("after C : ", c)
 
diff --git a/numba_dppy/examples/sum_ndarray.py b/numba_dppy/examples/sum_ndarray.py
index 6486be0275..2aea8e080a 100644
--- a/numba_dppy/examples/sum_ndarray.py
+++ b/numba_dppy/examples/sum_ndarray.py
@@ -4,13 +4,13 @@
 import sys
 import numpy as np
-import numba_dppy, numba_dppy as dppl
+import numba_dppy, numba_dppy as dppy
 import dpctl
 
 
-@dppl.kernel(access_types={"read_only": ['a', 'b'], "write_only": ['c'], "read_write": []})
+@dppy.kernel(access_types={"read_only": ['a', 'b'], "write_only": ['c'], "read_write": []})
 def data_parallel_sum(a, b, c):
-    i = dppl.get_global_id(0)
+    i = dppy.get_global_id(0)
     c[i] = a[i] + b[i]
 
diff --git a/numba_dppy/examples/sum_reduction.py b/numba_dppy/examples/sum_reduction.py
index 3e00f95631..367fa37952 100644
--- a/numba_dppy/examples/sum_reduction.py
+++ b/numba_dppy/examples/sum_reduction.py
@@ -4,13 +4,13 @@
 import math
 import time
 
-import numba_dppy, numba_dppy as dppl
+import numba_dppy, numba_dppy as dppy
 import dpctl
 
 
-@dppl.kernel
+@dppy.kernel
 def reduction_kernel(A, R, stride):
-    i = dppl.get_global_id(0)
+    i = dppy.get_global_id(0)
     # sum two element
     R[i] = A[i] + A[i+stride]
     # store the sum to be used in nex iteration
@@ -34,7 +34,7 @@ def test_sum_reduction():
     while (total > 1):
         # call kernel
         global_size = total // 2
-        reduction_kernel[global_size, dppl.DEFAULT_LOCAL_SIZE](A, R, global_size)
+        reduction_kernel[global_size, dppy.DEFAULT_LOCAL_SIZE](A, R, global_size)
         total = total // 2
 
     else:
diff --git a/numba_dppy/examples/sum_reduction_ocl.py b/numba_dppy/examples/sum_reduction_ocl.py
index e2605a7bbc..8d8e0411aa 100644
--- a/numba_dppy/examples/sum_reduction_ocl.py
+++ b/numba_dppy/examples/sum_reduction_ocl.py
@@ -1,20 +1,20 @@
 import sys
 import numpy as np
 from numba import int32
-import numba_dppy, numba_dppy as dppl
+import numba_dppy, numba_dppy as dppy
 import math
 
 import dpctl
 
 
 def sum_reduction_device_plus_host():
-    @dppl.kernel
+    @dppy.kernel
     def sum_reduction_kernel(inp, partial_sums):
-        local_id = dppl.get_local_id(0)
-        global_id = dppl.get_global_id(0)
-        group_size = dppl.get_local_size(0)
-        group_id = dppl.get_group_id(0)
+        local_id = dppy.get_local_id(0)
+        global_id = dppy.get_global_id(0)
+        group_size = dppy.get_local_size(0)
+        group_id = dppy.get_group_id(0)
 
-        local_sums = dppl.local.static_alloc(64, int32)
+        local_sums = dppy.local.static_alloc(64, int32)
 
         # Copy from global to local memory
         local_sums[local_id] = inp[global_id]
@@ -23,7 +23,7 @@ def sum_reduction_kernel(inp, partial_sums):
         stride = group_size // 2
         while (stride > 0):
             # Waiting for each 2x2 addition into given workgroup
-            dppl.barrier(dppl.CLK_LOCAL_MEM_FENCE)
+            dppy.barrier(dppy.CLK_LOCAL_MEM_FENCE)
 
             # Add elements 2 by 2 between local_id and local_id + stride
             if (local_id < stride):
diff --git a/numba_dppy/examples/sum_reduction_recursive_ocl.py b/numba_dppy/examples/sum_reduction_recursive_ocl.py
index 11f5023a3b..c5dd6daa47 100644
--- a/numba_dppy/examples/sum_reduction_recursive_ocl.py
+++ b/numba_dppy/examples/sum_reduction_recursive_ocl.py
@@ -1,7 +1,7 @@
 import sys
 import numpy as np
 from numba import int32
-import numba_dppy, numba_dppy as dppl
+import numba_dppy, numba_dppy as dppy
 import math
 
 import dpctl
 
@@ -11,15 +11,15 @@
 
 def recursive_reduction(size, group_size,
                         Dinp, Dpartial_sums):
-    @dppl.kernel
+    @dppy.kernel
     def sum_reduction_kernel(inp, input_size,
                              partial_sums):
-        local_id = dppl.get_local_id(0)
-        global_id = dppl.get_global_id(0)
-        group_size = dppl.get_local_size(0)
-        group_id = dppl.get_group_id(0)
+        local_id = dppy.get_local_id(0)
+        global_id = dppy.get_global_id(0)
+        group_size = dppy.get_local_size(0)
+        group_id = dppy.get_group_id(0)
 
-        local_sums = dppl.local.static_alloc(64, int32)
+        local_sums = dppy.local.static_alloc(64, int32)
 
         local_sums[local_id] = 0
 
@@ -30,7 +30,7 @@ def sum_reduction_kernel(inp, input_size,
         stride = group_size // 2
         while (stride > 0):
             # Waiting for each 2x2 addition into given workgroup
-            dppl.barrier(dppl.CLK_LOCAL_MEM_FENCE)
+            dppy.barrier(dppy.CLK_LOCAL_MEM_FENCE)
 
             # Add elements 2 by 2 between local_id and local_id + stride
             if (local_id < stride):
diff --git a/numba_dppy/experimental_numpy_lowering_overload.py b/numba_dppy/experimental_numpy_lowering_overload.py
index 2123e6667d..dd1e2a1eb6 100644
--- a/numba_dppy/experimental_numpy_lowering_overload.py
+++ b/numba_dppy/experimental_numpy_lowering_overload.py
@@ -77,7 +77,7 @@ def get_sycl_queue(context, builder):
     void_ptr_t = context.get_value_type(types.voidptr)
     get_queue_fnty = lc.Type.function(void_ptr_t, ())
     get_queue = builder.module.get_or_insert_function(get_queue_fnty,
-                                                      name="DPPLQueueMgr_GetCurrentQueue")
+                                                      name="DPCTLQueueMgr_GetCurrentQueue")
     sycl_queue_val = cgutils.alloca_once(builder, void_ptr_t)
     builder.store(builder.call(get_queue, []), sycl_queue_val)
 
@@ -87,7 +87,7 @@ def allocate_usm(context, builder, size, sycl_queue):
     void_ptr_t = context.get_value_type(types.voidptr)
     usm_shared_fnty = lc.Type.function(void_ptr_t, [ll_intp_t, void_ptr_t])
     usm_shared = builder.module.get_or_insert_function(usm_shared_fnty,
-                                                       name="DPPLmalloc_shared")
+                                                       name="DPCTLmalloc_shared")
 
     buffer_ptr = cgutils.alloca_once(builder, void_ptr_t)
     args = [size, builder.load(sycl_queue)]
@@ -100,7 +100,7 @@ def copy_usm(context, builder, src, dst, size, sycl_queue):
     queue_memcpy_fnty = lc.Type.function(ir.VoidType(), [void_ptr_t, void_ptr_t,
                                                          void_ptr_t, ll_intp_t])
     queue_memcpy = builder.module.get_or_insert_function(queue_memcpy_fnty,
-                                                         name="DPPLQueue_Memcpy")
+                                                         name="DPCTLQueue_Memcpy")
     args = [builder.load(sycl_queue),
             builder.bitcast(dst, void_ptr_t),
             builder.bitcast(src, void_ptr_t),
@@ -113,7 +113,7 @@ def free_usm(context, builder, usm_buf, sycl_queue):
     usm_free_fnty = lc.Type.function(ir.VoidType(), [void_ptr_t, void_ptr_t])
     usm_free = builder.module.get_or_insert_function(usm_free_fnty,
-                                                     name="DPPLfree_with_queue")
+                                                     name="DPCTLfree_with_queue")
 
     builder.call(usm_free, [usm_buf, builder.load(sycl_queue)])
 
@@ -350,7 +350,7 @@ def make_res(a, b):
 
 
 @lower_builtin(np.dot, types.Array, types.Array)
-def dot_dppl(context, builder, sig, args):
+def dot_dppy(context, builder, sig, args):
     """
     np.dot(a, b)
     a @ b
@@ -374,7 +374,7 @@ def dot_dppl(context, builder, sig, args):
 
 
 @lower_builtin("np.matmul", types.Array, types.Array)
-def matmul_dppl(context, builder, sig, args):
+def matmul_dppy(context, builder, sig, args):
     """
     np.matmul(matrix, matrix)
     """
diff --git a/numba_dppy/initialize.py b/numba_dppy/initialize.py
index c8ba56220a..745e8031eb 100644
--- a/numba_dppy/initialize.py
+++ b/numba_dppy/initialize.py
@@ -5,8 +5,8 @@
 
 
 def init_jit():
-    from numba_dppy.dispatcher import DPPLDispatcher
-    return DPPLDispatcher
+    from numba_dppy.dispatcher import DPPYDispatcher
+    return DPPYDispatcher
 
 def initialize_all():
     from numba.core.registry import dispatcher_registry
@@ -17,9 +17,9 @@ def initialize_all():
     import platform as plt
     platform = plt.system()
     if platform == 'Windows':
-        paths = glob.glob(os.path.join(os.path.dirname(dpctl.__file__), '*DPPLSyclInterface.dll'))
+        paths = glob.glob(os.path.join(os.path.dirname(dpctl.__file__), '*DPCTLSyclInterface.dll'))
     else:
-        paths = glob.glob(os.path.join(os.path.dirname(dpctl.__file__), '*DPPLSyclInterface*'))
+        paths = glob.glob(os.path.join(os.path.dirname(dpctl.__file__), '*DPCTLSyclInterface*'))
 
     if len(paths) == 1:
         ll.load_library_permanently(find_library(paths[0]))
diff --git a/numba_dppy/ocl/atomics/atomic_ops.cl b/numba_dppy/ocl/atomics/atomic_ops.cl
index ad581716de..56228d8bf5 100644
--- a/numba_dppy/ocl/atomics/atomic_ops.cl
+++ b/numba_dppy/ocl/atomics/atomic_ops.cl
@@ -5,7 +5,7 @@
 #ifdef cl_khr_int64_base_atomics
   #pragma OPENCL EXTENSION cl_khr_int64_base_atomics: enable
 
-  long numba_dppl_atomic_add_i64_local(volatile __generic long *p, long val) {
+  long numba_dppy_atomic_add_i64_local(volatile __generic long *p, long val) {
     long found = *p;
     long expected;
     do {
@@ -15,7 +15,7 @@
     return found;
   }
 
-  long numba_dppl_atomic_add_i64_global(volatile __generic long *p, long val) {
+  long numba_dppy_atomic_add_i64_global(volatile __generic long *p, long val) {
    long found = *p;
    long expected;
    do {
@@ -25,7 +25,7 @@
     return found;
   }
 
-  long numba_dppl_atomic_sub_i64_local(volatile __generic long *p, long val) {
+  long numba_dppy_atomic_sub_i64_local(volatile __generic long *p, long val) {
     long found = *p;
     long expected;
     do {
@@ -35,7 +35,7 @@
     return found;
   }
 
-  long numba_dppl_atomic_sub_i64_global(volatile __generic long *p, long val) {
+  long numba_dppy_atomic_sub_i64_global(volatile __generic long *p, long val) {
     long found = *p;
     long expected;
     do {
@@ -48,7 +48,7 @@
 #ifdef cl_khr_fp64
   #pragma OPENCL EXTENSION cl_khr_fp64: enable
 
-  double numba_dppl_atomic_cmpxchg_f64_local(volatile __generic double *p, double cmp, double val) {
+  double numba_dppy_atomic_cmpxchg_f64_local(volatile __generic double *p, double cmp, double val) {
     union {
       ulong u64;
       double f64;
@@ -60,7 +60,7 @@
     return old_union.f64;
   }
 
-  double numba_dppl_atomic_cmpxchg_f64_global(volatile __generic double *p, double cmp, double val) {
+  double numba_dppy_atomic_cmpxchg_f64_global(volatile __generic double *p, double cmp, double val) {
     union {
       ulong u64;
      double f64;
@@ -72,50 +72,50 @@
     return old_union.f64;
   }
 
-  double numba_dppl_atomic_add_f64_local(volatile __generic double *p, double val) {
+  double numba_dppy_atomic_add_f64_local(volatile __generic double *p, double val) {
     double found = *p;
     double expected;
     do {
       expected = found;
-      found = numba_dppl_atomic_cmpxchg_f64_local(p, expected, expected + val);
+      found = numba_dppy_atomic_cmpxchg_f64_local(p, expected, expected + val);
     } while (found != expected);
     return found;
   }
 
-  double numba_dppl_atomic_add_f64_global(volatile __generic double *p, double val) {
+  double numba_dppy_atomic_add_f64_global(volatile __generic double *p, double val) {
     double found = *p;
     double expected;
     do {
       expected = found;
-      found = numba_dppl_atomic_cmpxchg_f64_global(p, expected, expected + val);
+      found = numba_dppy_atomic_cmpxchg_f64_global(p, expected, expected + val);
     } while (found != expected);
     return found;
   }
 
-  double numba_dppl_atomic_sub_f64_local(volatile __generic double *p, double val) {
+  double numba_dppy_atomic_sub_f64_local(volatile __generic double *p, double val) {
     double found = *p;
     double expected;
     do {
       expected = found;
-      found = numba_dppl_atomic_cmpxchg_f64_local(p, expected, expected - val);
+      found = numba_dppy_atomic_cmpxchg_f64_local(p, expected, expected - val);
     } while (found != expected);
     return found;
   }
 
-  double numba_dppl_atomic_sub_f64_global(volatile __generic double *p, double val) {
+  double numba_dppy_atomic_sub_f64_global(volatile __generic double *p, double val) {
     double found = *p;
     double expected;
     do {
       expected = found;
-      found = numba_dppl_atomic_cmpxchg_f64_global(p, expected, expected - val);
+      found = numba_dppy_atomic_cmpxchg_f64_global(p, expected, expected - val);
     } while (found != expected);
     return found;
   }
 #endif
 #endif
 
-float numba_dppl_atomic_cmpxchg_f32_local(volatile __generic float *p, float cmp, float val) {
+float numba_dppy_atomic_cmpxchg_f32_local(volatile __generic float *p, float cmp, float val) {
   union {
     unsigned int u32;
     float f32;
@@ -127,7 +127,7 @@ float numba_dppl_atomic_cmpxchg_f32_local(volatile __generic float *p, float cmp
   return old_union.f32;
 }
 
-float numba_dppl_atomic_cmpxchg_f32_global(volatile __generic float *p, float cmp, float val) {
+float numba_dppy_atomic_cmpxchg_f32_global(volatile __generic float *p, float cmp, float val) {
   union {
     unsigned int u32;
     float f32;
@@ -139,47 +139,47 @@ float numba_dppl_atomic_cmpxchg_f32_global(volatile __generic float *p, float cm
   return old_union.f32;
 }
 
-float numba_dppl_atomic_add_f32_local(volatile __generic float *p, float val) {
+float numba_dppy_atomic_add_f32_local(volatile __generic float *p, float val) {
   float found = *p;
  float expected;
  do {
    expected = found;
-    found = numba_dppl_atomic_cmpxchg_f32_local(p, expected, expected + val);
+    found = numba_dppy_atomic_cmpxchg_f32_local(p, expected, expected + val);
  } while (found != expected);
  return found;
 }
 
-float numba_dppl_atomic_add_f32_global(volatile __generic float *p, float val) {
+float numba_dppy_atomic_add_f32_global(volatile __generic float *p, float val) {
   float found = *p;
   float expected;
   do {
     expected = found;
-    found = numba_dppl_atomic_cmpxchg_f32_global(p, expected, expected + val);
+    found = numba_dppy_atomic_cmpxchg_f32_global(p, expected, expected + val);
   } while (found != expected);
   return found;
 }
 
-float numba_dppl_atomic_sub_f32_local(volatile __generic float *p, float val) {
+float numba_dppy_atomic_sub_f32_local(volatile __generic float *p, float val) {
   float found = *p;
   float expected;
   do {
     expected = found;
-    found = numba_dppl_atomic_cmpxchg_f32_local(p, expected, expected - val);
+    found = numba_dppy_atomic_cmpxchg_f32_local(p, expected, expected - val);
   } while (found != expected);
   return found;
 }
 
-float numba_dppl_atomic_sub_f32_global(volatile __generic float *p, float val) {
+float numba_dppy_atomic_sub_f32_global(volatile __generic float *p, float val) {
   float found = *p;
   float expected;
   do {
     expected = found;
-    found = numba_dppl_atomic_cmpxchg_f32_global(p, expected, expected - val);
+    found = numba_dppy_atomic_cmpxchg_f32_global(p, expected, expected - val);
   } while (found != expected);
   return found;
 }
 
-int numba_dppl_atomic_add_i32_local(volatile __generic int *p, int val) {
+int numba_dppy_atomic_add_i32_local(volatile __generic int *p, int val) {
   int found = *p;
   int expected;
   do {
@@ -189,7 +189,7 @@ int numba_dppl_atomic_add_i32_local(volatile __generic int *p, int val) {
   return found;
 }
 
-int numba_dppl_atomic_add_i32_global(volatile __generic int *p, int val) {
+int numba_dppy_atomic_add_i32_global(volatile __generic int *p, int val) {
   int found = *p;
   int expected;
   do {
@@ -199,7 +199,7 @@ int numba_dppl_atomic_add_i32_global(volatile __generic int *p, int val) {
   return found;
 }
 
-int numba_dppl_atomic_sub_i32_local(volatile __generic int *p, int val) {
+int numba_dppy_atomic_sub_i32_local(volatile __generic int *p, int val) {
   int found = *p;
   int expected;
   do {
@@ -209,7 +209,7 @@ int numba_dppl_atomic_sub_i32_local(volatile __generic int *p, int val) {
   return found;
 }
 
-int numba_dppl_atomic_sub_i32_global(volatile __generic int *p, int val) {
+int numba_dppy_atomic_sub_i32_global(volatile __generic int *p, int val) {
   int found = *p;
   int expected;
   do {
diff --git a/numba_dppy/ocl/ocldecl.py b/numba_dppy/ocl/ocldecl.py
index 1af90a6884..adf14a1815 100644
--- a/numba_dppy/ocl/ocldecl.py
+++ b/numba_dppy/ocl/ocldecl.py
@@ -4,7 +4,7 @@
 from numba.core.typing.templates import (AttributeTemplate, ConcreteTemplate,
                                          AbstractTemplate, MacroTemplate,
                                          signature, Registry)
-import numba_dppy, numba_dppy as dppl
+import numba_dppy, numba_dppy as dppy
 
 registry = Registry()
 intrinsic = registry.register
@@ -15,71 +15,71 @@
 
 @intrinsic
 class Ocl_get_global_id(ConcreteTemplate):
-    key = dppl.get_global_id
+    key = dppy.get_global_id
     cases = [signature(types.intp, types.uint32)]
 
 
 @intrinsic
 class Ocl_get_local_id(ConcreteTemplate):
-    key = dppl.get_local_id
+    key = dppy.get_local_id
     cases = [signature(types.intp, types.uint32)]
 
 
 @intrinsic
 class Ocl_get_group_id(ConcreteTemplate):
-    key = dppl.get_group_id
+    key = dppy.get_group_id
     cases = [signature(types.intp, types.uint32)]
 
 
 @intrinsic
 class Ocl_get_num_groups(ConcreteTemplate):
-    key = dppl.get_num_groups
+    key = dppy.get_num_groups
     cases = [signature(types.intp, types.uint32)]
 
 
 @intrinsic
 class Ocl_get_work_dim(ConcreteTemplate):
-    key = dppl.get_work_dim
+    key = dppy.get_work_dim
     cases = [signature(types.uint32)]
 
 
 @intrinsic
 class Ocl_get_global_size(ConcreteTemplate):
-    key = dppl.get_global_size
+    key = dppy.get_global_size
     cases = [signature(types.intp, types.uint32)]
 
 
 @intrinsic
 class Ocl_get_local_size(ConcreteTemplate):
-    key = dppl.get_local_size
+    key = dppy.get_local_size
     cases = [signature(types.intp, types.uint32)]
 
 
 @intrinsic
 class Ocl_barrier(ConcreteTemplate):
-    key = dppl.barrier
+    key = dppy.barrier
     cases = [signature(types.void, types.uint32),
              signature(types.void)]
 
 
 @intrinsic
 class Ocl_mem_fence(ConcreteTemplate):
-    key = dppl.mem_fence
+    key = dppy.mem_fence
     cases = [signature(types.void, types.uint32)]
 
 
 @intrinsic
 class Ocl_sub_group_barrier(ConcreteTemplate):
-    key = dppl.sub_group_barrier
+    key = dppy.sub_group_barrier
     cases = [signature(types.void)]
 
 
-# dppl.atomic submodule -------------------------------------------------------
+# dppy.atomic submodule -------------------------------------------------------
 
 @intrinsic
 class Ocl_atomic_add(AbstractTemplate):
-    key = dppl.atomic.add
+    key = dppy.atomic.add
 
     def generic(self, args, kws):
         assert not kws
@@ -92,7 +92,7 @@ def generic(self, args, kws):
 
 @intrinsic
 class Ocl_atomic_sub(AbstractTemplate):
-    key = dppl.atomic.sub
+    key = dppy.atomic.sub
 
     def generic(self, args, kws):
         assert not kws
@@ -106,7 +106,7 @@ def generic(self, args, kws):
 
 @intrinsic_attr
 class OclAtomicTemplate(AttributeTemplate):
-    key = types.Module(dppl.atomic)
+    key = types.Module(dppy.atomic)
 
     def resolve_add(self, mod):
         return types.Function(Ocl_atomic_add)
@@ -115,15 +115,15 @@ def resolve_sub(self, mod):
         return types.Function(Ocl_atomic_sub)
 
 
-# dppl.local submodule -------------------------------------------------------
+# dppy.local submodule -------------------------------------------------------
 
 class Ocl_local_alloc(MacroTemplate):
-    key = dppl.local.static_alloc
+    key = dppy.local.static_alloc
 
 
 @intrinsic_attr
 class OclLocalTemplate(AttributeTemplate):
-    key = types.Module(dppl.local)
+    key = types.Module(dppy.local)
 
     def resolve_static_alloc(self, mod):
         return types.Macro(Ocl_local_alloc)
@@ -133,7 +133,7 @@ def resolve_static_alloc(self, mod):
 
 @intrinsic_attr
 class OclModuleTemplate(AttributeTemplate):
-    key = types.Module(dppl)
+    key = types.Module(dppy)
 
     def resolve_get_global_id(self, mod):
         return types.Function(Ocl_get_global_id)
@@ -166,11 +166,11 @@ def resolve_sub_group_barrier(self, mod):
         return types.Function(Ocl_sub_group_barrier)
 
     def resolve_atomic(self, mod):
-        return types.Module(dppl.atomic)
+        return types.Module(dppy.atomic)
 
     def resolve_local(self, mod):
-        return types.Module(dppl.local)
+        return types.Module(dppy.local)
 
 # intrinsic
 
-#intrinsic_global(dppl, types.Module(dppl))
+#intrinsic_global(dppy, types.Module(dppy))
diff --git a/numba_dppy/ocl/oclimpl.py b/numba_dppy/ocl/oclimpl.py
index b92dca7bae..26f8482799 100644
--- a/numba_dppy/ocl/oclimpl.py
+++ b/numba_dppy/ocl/oclimpl.py
@@ -169,9 +169,9 @@ def insert_and_call_atomic_fn(context, builder, sig, fn_type,
         ll_val = ir.IntType(32)
         ll_p = ll_val.as_pointer()
         if fn_type == "add":
-            name = "numba_dppl_atomic_add_i32"
+            name = "numba_dppy_atomic_add_i32"
         elif fn_type == "sub":
-            name = "numba_dppl_atomic_sub_i32"
+            name = "numba_dppy_atomic_sub_i32"
         else:
             raise TypeError("Operation type is not supported %s" % (fn_type))
@@ -182,9 +182,9 @@ def insert_and_call_atomic_fn(context, builder, sig, fn_type,
         ll_val = ir.IntType(64)
         ll_p = ll_val.as_pointer()
         if fn_type == "add":
-            name = "numba_dppl_atomic_add_i64"
+            name = "numba_dppy_atomic_add_i64"
         elif fn_type == "sub":
-            name = "numba_dppl_atomic_sub_i64"
+            name = "numba_dppy_atomic_sub_i64"
         else:
             raise TypeError("Operation type is not supported %s" % (fn_type))
@@ -195,9 +195,9 @@ def insert_and_call_atomic_fn(context, builder, sig, fn_type,
         ll_val = ir.FloatType()
         ll_p = ll_val.as_pointer()
         if fn_type == "add":
-            name = "numba_dppl_atomic_add_f32"
+            name = "numba_dppy_atomic_add_f32"
         elif fn_type == "sub":
-            name = "numba_dppl_atomic_sub_f32"
+            name = "numba_dppy_atomic_sub_f32"
         else:
             raise TypeError("Operation type is not supported %s" % (fn_type))
@@ -208,9 +208,9 @@ def insert_and_call_atomic_fn(context, builder, sig, fn_type,
         ll_val = ir.DoubleType()
         ll_p = ll_val.as_pointer()
         if fn_type == "add":
-            name = "numba_dppl_atomic_add_f64"
+            name = "numba_dppy_atomic_add_f64"
         elif fn_type == "sub":
-            name = "numba_dppl_atomic_sub_f64"
+            name = "numba_dppy_atomic_sub_f64"
         else:
             raise TypeError("Operation type is not supported %s" % (fn_type))
@@ -331,11 +331,11 @@ def atomic_sub_tuple(context, builder, sig, args):
         raise ImportError("Atomic support is not present, can not perform atomic_add")
 
 
-@lower('dppl.lmem.alloc', types.UniTuple, types.Any)
-def dppl_lmem_alloc_array(context, builder, sig, args):
+@lower('dppy.lmem.alloc', types.UniTuple, types.Any)
+def dppy_lmem_alloc_array(context, builder, sig, args):
     shape, dtype = args
     return _generic_array(context, builder, shape=shape, dtype=dtype,
-                          symbol_name='_dppl_lmem',
+                          symbol_name='_dppy_lmem',
                           addrspace=target.SPIR_LOCAL_ADDRSPACE)
diff --git a/numba_dppy/ocl/stubs.py b/numba_dppy/ocl/stubs.py
index 2ec95fa9c8..190b685955 100644
--- a/numba_dppy/ocl/stubs.py
+++ b/numba_dppy/ocl/stubs.py
@@ -83,9 +83,9 @@ def sub_group_barrier():
 
 
 class Stub(object):
     """A stub object to represent special objects which is meaningless
-    outside the context of DPPL compilation context.
+    outside the context of DPPY compilation context.
""" - _description_ = '' + _description_ = '' __slots__ = () # don't allocate __dict__ def __new__(cls): @@ -100,7 +100,7 @@ def __repr__(self): def local_alloc(shape, dtype): shape = _legalize_shape(shape) ndim = len(shape) - fname = "dppl.lmem.alloc" + fname = "dppy.lmem.alloc" restype = types.Array(dtype, ndim, 'C', addrspace=SPIR_LOCAL_ADDRSPACE) sig = typing.signature(restype, types.UniTuple(types.intp, ndim), types.Any) return ir.Intrinsic(fname, sig, args=(shape, dtype)) diff --git a/numba_dppy/printimpl.py b/numba_dppy/printimpl.py index 74319b1bdd..e5c9d4f793 100644 --- a/numba_dppy/printimpl.py +++ b/numba_dppy/printimpl.py @@ -79,8 +79,8 @@ def print_varargs(context, builder, sig, args): va_arg.extend(values) va_arg = tuple(va_arg) - dppl_print = declare_print(builder.module) + dppy_print = declare_print(builder.module) - builder.call(dppl_print, va_arg) + builder.call(dppy_print, va_arg) return context.get_dummy_value() diff --git a/numba_dppy/target.py b/numba_dppy/target.py index aac4efcd4b..6444a6e601 100644 --- a/numba_dppy/target.py +++ b/numba_dppy/target.py @@ -24,7 +24,7 @@ # Typing -class DPPLTypingContext(typing.BaseContext): +class DPPYTypingContext(typing.BaseContext): def load_additional_registries(self): # Declarations for OpenCL API functions and OpenCL Math functions from .ocl import ocldecl, mathdecl @@ -91,7 +91,7 @@ def _replace_numpy_ufunc_with_opencl_supported_functions(): ufunc_db[ufunc][sig] = lower_ocl_impl[(name, sig_mapper[sig])] -class DPPLTargetContext(BaseContext): +class DPPYTargetContext(BaseContext): implement_powi_as_math_call = True generic_addrspace = SPIR_GENERIC_ADDRSPACE @@ -153,7 +153,7 @@ def load_additional_registries(self): @cached_property def call_conv(self): - return DPPLCallConv(self) + return DPPYCallConv(self) def codegen(self): return self._internal_codegen @@ -169,7 +169,7 @@ def repl(m): qualified = name + '.' + '.'.join(str(a) for a in argtypes) mangled = VALID_CHARS.sub(repl, qualified) - return 'dppl_py_devfn_' + mangled + return 'dppy_py_devfn_' + mangled def prepare_ocl_kernel(self, func, argtypes): module = func.module @@ -208,8 +208,8 @@ def sub_gen_with_global(lty): llargtys = changed = () wrapperfnty = lc.Type.function(lc.Type.void(), llargtys) - wrapper_module = self.create_module("dppl.kernel.wrapper") - wrappername = 'dpplPy_{name}'.format(name=func.name) + wrapper_module = self.create_module("dppy.kernel.wrapper") + wrappername = 'dppyPy_{name}'.format(name=func.name) argtys = list(arginfo.argument_types) fnty = lc.Type.function(lc.Type.int(), @@ -239,7 +239,7 @@ def sub_gen_with_global(lty): argtypes, callargs) builder.ret_void() - set_dppl_kernel(wrapper) + set_dppy_kernel(wrapper) #print(str(wrapper_module)) # Link @@ -255,9 +255,9 @@ def declare_function(self, module, fndesc): fnty = self.call_conv.get_function_type(fndesc.restype, fndesc.argtypes) fn = module.get_or_insert_function(fnty, name=fndesc.mangled_name) fn.attributes.add('alwaysinline') - ret = super(DPPLTargetContext, self).declare_function(module, fndesc) + ret = super(DPPYTargetContext, self).declare_function(module, fndesc) # XXX: Refactor fndesc instead of this special case - if fndesc.llvm_func_name.startswith('dppl_py_devfn'): + if fndesc.llvm_func_name.startswith('dppy_py_devfn'): ret.calling_convention = CC_SPIR_FUNC return ret @@ -305,7 +305,7 @@ def addrspacecast(self, builder, src, addrspace): return builder.addrspacecast(src, ptras) -def set_dppl_kernel(fn): +def set_dppy_kernel(fn): """ Ensure `fn` is usable as a SPIR kernel. 
- Fix calling convention @@ -332,11 +332,11 @@ def set_dppl_kernel(fn): make_constant = lambda x: lc.Constant.int(lc.Type.int(), x) spir_version_constant = [make_constant(x) for x in SPIR_VERSION] - spir_version = mod.get_or_insert_named_metadata("dppl.spir.version") + spir_version = mod.get_or_insert_named_metadata("dppy.spir.version") if not spir_version.operands: spir_version.add(lc.MetaData.get(mod, spir_version_constant)) - ocl_version = mod.get_or_insert_named_metadata("dppl.ocl.version") + ocl_version = mod.get_or_insert_named_metadata("dppy.ocl.version") if not ocl_version.operands: ocl_version.add(lc.MetaData.get(mod, spir_version_constant)) @@ -414,7 +414,7 @@ def gen_arg_base_type(fn): return lc.MetaData.get(mod, [name] + consts) -class DPPLCallConv(MinimalCallConv): +class DPPYCallConv(MinimalCallConv): def call_function(self, builder, callee, resty, argtys, args, env=None): """ Call the Numba-compiled *callee*. diff --git a/numba_dppy/target_dispatcher.py b/numba_dppy/target_dispatcher.py index 40b9d589d9..dde38eb75b 100644 --- a/numba_dppy/target_dispatcher.py +++ b/numba_dppy/target_dispatcher.py @@ -8,9 +8,9 @@ class TargetDispatcher(serialize.ReduceMixin, metaclass=dispatcher.DispatcherMeta): __numba__ = 'py_func' - target_offload_gpu = '__dppl_offload_gpu__' - target_offload_cpu = '__dppl_offload_cpu__' - target_dppl = 'dppy' + target_offload_gpu = '__dppy_offload_gpu__' + target_offload_cpu = '__dppy_offload_cpu__' + target_dppy = 'dppy' def __init__(self, py_func, wrapper, target, parallel_options, compiled=None): @@ -53,7 +53,7 @@ def get_compiled(self, target=None): return self.__compiled[disp] def __is_with_context_target(self, target): - return target is None or target == TargetDispatcher.target_dppl + return target is None or target == TargetDispatcher.target_dppy def get_current_disp(self): target = self.__target @@ -66,7 +66,7 @@ def get_current_disp(self): if parallel is False or (isinstance(parallel, dict) and parallel.get('offload') is False): raise UnsupportedError(f"Can't use 'with' context with parallel option '{parallel}'") - from numba_dppy import dppl_offload_dispatcher + from numba_dppy import dppy_offload_dispatcher if target is None: if dpctl.get_current_device_type() == dpctl.device_type.gpu: @@ -75,7 +75,7 @@ def get_current_disp(self): return registry.dispatcher_registry[TargetDispatcher.target_offload_cpu] else: if dpctl.is_in_device_context(): - raise UnsupportedError('Unknown dppl device type') + raise UnsupportedError('Unknown dppy device type') if offload: if dpctl.has_gpu_queues(): return registry.dispatcher_registry[TargetDispatcher.target_offload_gpu] diff --git a/numba_dppy/testing.py b/numba_dppy/testing.py index 8da0b7b91e..e309b7f0c9 100644 --- a/numba_dppy/testing.py +++ b/numba_dppy/testing.py @@ -11,7 +11,7 @@ redirect_c_stdout, ) -class DPPLTestCase(SerialMixin, unittest.TestCase): +class DPPYTestCase(SerialMixin, unittest.TestCase): def setUp(self): #init() #TODO @@ -21,7 +21,7 @@ def tearDown(self): #TODO pass -class DPPLTextCapture(object): +class DPPYTextCapture(object): def __init__(self, stream): self._stream = stream @@ -36,16 +36,16 @@ def getvalue(self): return self._stream.getvalue() @contextlib.contextmanager -def captured_dppl_stdout(): +def captured_dppy_stdout(): """ - Return a minimal stream-like object capturing the text output of dppl + Return a minimal stream-like object capturing the text output of dppy """ # Prevent accidentally capturing previously output text sys.stdout.flush() - import numba_dppy, numba_dppy 
as dppl + import numba_dppy, numba_dppy as dppy with redirect_c_stdout() as stream: - yield DPPLTextCapture(stream) + yield DPPYTextCapture(stream) def _id(obj): diff --git a/numba_dppy/tests/__init__.py b/numba_dppy/tests/__init__.py index 5a2199f149..939c95c567 100644 --- a/numba_dppy/tests/__init__.py +++ b/numba_dppy/tests/__init__.py @@ -2,9 +2,11 @@ from numba.testing import load_testsuite from os.path import dirname, join - +import numba_dppy import numba_dppy.config as dppy_config +# from numba_dppy.tests.dppy import * + def load_tests(loader, tests, pattern): suite = SerialSuite() @@ -12,6 +14,6 @@ def load_tests(loader, tests, pattern): if dppy_config.dppy_present: suite.addTests(load_testsuite(loader, dirname(__file__))) else: - print("skipped DPPL tests") + print("skipped DPPY tests") return suite diff --git a/numba_dppy/tests/test_arg_accessor.py b/numba_dppy/tests/test_arg_accessor.py index ecc5d839bb..3de2d31770 100644 --- a/numba_dppy/tests/test_arg_accessor.py +++ b/numba_dppy/tests/test_arg_accessor.py @@ -2,25 +2,25 @@ import numpy as np -import numba_dppy, numba_dppy as dppl +import numba_dppy, numba_dppy as dppy from numba_dppy.testing import unittest -from numba_dppy.testing import DPPLTestCase +from numba_dppy.testing import DPPYTestCase import dpctl -@dppl.kernel(access_types={"read_only": ['a', 'b'], "write_only": ['c'], "read_write": []}) +@dppy.kernel(access_types={"read_only": ['a', 'b'], "write_only": ['c'], "read_write": []}) def sum_with_accessor(a, b, c): - i = dppl.get_global_id(0) + i = dppy.get_global_id(0) c[i] = a[i] + b[i] -@dppl.kernel +@dppy.kernel def sum_without_accessor(a, b, c): - i = dppl.get_global_id(0) + i = dppy.get_global_id(0) c[i] = a[i] + b[i] def call_kernel(global_size, local_size, A, B, C, func): - func[global_size, dppl.DEFAULT_LOCAL_SIZE](A, B, C) + func[global_size, dppy.DEFAULT_LOCAL_SIZE](A, B, C) global_size = 10 @@ -33,7 +33,7 @@ def call_kernel(global_size, local_size, @unittest.skipUnless(dpctl.has_cpu_queues(), 'test only on CPU system') -class TestDPPLArgAccessorCPU(DPPLTestCase): +class TestDPPYArgAccessorCPU(DPPYTestCase): def test_arg_with_accessor(self): C = np.ones_like(A) with dpctl.device_context("opencl:cpu") as cpu_queue: @@ -50,7 +50,7 @@ def test_arg_without_accessor(self): @unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system') -class TestDPPLArgAccessorGPU(DPPLTestCase): +class TestDPPYArgAccessorGPU(DPPYTestCase): def test_arg_with_accessor(self): C = np.ones_like(A) with dpctl.device_context("opencl:gpu") as gpu_queue: diff --git a/numba_dppy/tests/test_arg_types.py b/numba_dppy/tests/test_arg_types.py index fc2eae105d..7b06ef11f8 100644 --- a/numba_dppy/tests/test_arg_types.py +++ b/numba_dppy/tests/test_arg_types.py @@ -2,19 +2,19 @@ import numpy as np -import numba_dppy, numba_dppy as dppl +import numba_dppy, numba_dppy as dppy from numba_dppy.testing import unittest -from numba_dppy.testing import DPPLTestCase +from numba_dppy.testing import DPPYTestCase import dpctl -@dppl.kernel +@dppy.kernel def mul_kernel(A, B, test): - i = dppl.get_global_id(0) + i = dppy.get_global_id(0) B[i] = A[i] * test def call_mul_device_kernel(global_size, A, B, test): - mul_kernel[global_size, dppl.DEFAULT_LOCAL_SIZE](A, B, test) + mul_kernel[global_size, dppy.DEFAULT_LOCAL_SIZE](A, B, test) global_size = 10 @@ -24,7 +24,7 @@ def call_mul_device_kernel(global_size, A, B, test): @unittest.skipUnless(dpctl.has_cpu_queues(), 'test only on CPU system') -class TestDPPLArrayArgCPU(DPPLTestCase): +class 
TestDPPYArrayArgCPU(DPPYTestCase): def test_integer_arg(self): x = np.int32(2) with dpctl.device_context("opencl:cpu") as cpu_queue: @@ -42,7 +42,7 @@ def test_float_arg(self): self.assertTrue(np.all(A * x == B)) def test_bool_arg(self): - @dppl.kernel + @dppy.kernel def check_bool_kernel(A, test): if test: A[0] = 111 @@ -52,14 +52,14 @@ def check_bool_kernel(A, test): A = np.array([0], dtype='float64') with dpctl.device_context("opencl:cpu") as cpu_queue: - check_bool_kernel[global_size, dppl.DEFAULT_LOCAL_SIZE](A, True) + check_bool_kernel[global_size, dppy.DEFAULT_LOCAL_SIZE](A, True) self.assertTrue(A[0] == 111) - check_bool_kernel[global_size, dppl.DEFAULT_LOCAL_SIZE](A, False) + check_bool_kernel[global_size, dppy.DEFAULT_LOCAL_SIZE](A, False) self.assertTrue(A[0] == 222) @unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system') -class TestDPPLArrayArgGPU(DPPLTestCase): +class TestDPPYArrayArgGPU(DPPYTestCase): def test_integer_arg(self): x = np.int32(2) with dpctl.device_context("opencl:gpu") as gpu_queue: @@ -77,7 +77,7 @@ def test_float_arg(self): self.assertTrue(np.all(A * x == B)) def test_bool_arg(self): - @dppl.kernel + @dppy.kernel def check_bool_kernel(A, test): if test: A[0] = 111 @@ -87,9 +87,9 @@ def check_bool_kernel(A, test): A = np.array([0], dtype='float64') with dpctl.device_context("opencl:gpu") as gpu_queue: - check_bool_kernel[global_size, dppl.DEFAULT_LOCAL_SIZE](A, True) + check_bool_kernel[global_size, dppy.DEFAULT_LOCAL_SIZE](A, True) self.assertTrue(A[0] == 111) - check_bool_kernel[global_size, dppl.DEFAULT_LOCAL_SIZE](A, False) + check_bool_kernel[global_size, dppy.DEFAULT_LOCAL_SIZE](A, False) self.assertTrue(A[0] == 222) if __name__ == '__main__': diff --git a/numba_dppy/tests/test_atomic_op.py b/numba_dppy/tests/test_atomic_op.py index 9825c707d1..9d8e88def1 100644 --- a/numba_dppy/tests/test_atomic_op.py +++ b/numba_dppy/tests/test_atomic_op.py @@ -3,106 +3,106 @@ import numpy as np import numba -import numba_dppy, numba_dppy as dppl +import numba_dppy, numba_dppy as dppy from numba_dppy.testing import unittest -from numba_dppy.testing import DPPLTestCase +from numba_dppy.testing import DPPYTestCase import dpctl def atomic_add_int32(ary): - tid = dppl.get_local_id(0) - lm = dppl.local.static_alloc(32, numba.uint32) + tid = dppy.get_local_id(0) + lm = dppy.local.static_alloc(32, numba.uint32) lm[tid] = 0 - dppl.barrier(dppl.CLK_GLOBAL_MEM_FENCE) + dppy.barrier(dppy.CLK_GLOBAL_MEM_FENCE) bin = ary[tid] % 32 - dppl.atomic.add(lm, bin, 1) - dppl.barrier(dppl.CLK_GLOBAL_MEM_FENCE) + dppy.atomic.add(lm, bin, 1) + dppy.barrier(dppy.CLK_GLOBAL_MEM_FENCE) ary[tid] = lm[tid] def atomic_sub_int32(ary): - tid = dppl.get_local_id(0) - lm = dppl.local.static_alloc(32, numba.uint32) + tid = dppy.get_local_id(0) + lm = dppy.local.static_alloc(32, numba.uint32) lm[tid] = 0 - dppl.barrier(dppl.CLK_GLOBAL_MEM_FENCE) + dppy.barrier(dppy.CLK_GLOBAL_MEM_FENCE) bin = ary[tid] % 32 - dppl.atomic.sub(lm, bin, 1) - dppl.barrier(dppl.CLK_GLOBAL_MEM_FENCE) + dppy.atomic.sub(lm, bin, 1) + dppy.barrier(dppy.CLK_GLOBAL_MEM_FENCE) ary[tid] = lm[tid] def atomic_add_float32(ary): - lm = dppl.local.static_alloc(1, numba.float32) + lm = dppy.local.static_alloc(1, numba.float32) lm[0] = ary[0] - dppl.barrier(dppl.CLK_GLOBAL_MEM_FENCE) - dppl.atomic.add(lm, 0, 1) - dppl.barrier(dppl.CLK_GLOBAL_MEM_FENCE) + dppy.barrier(dppy.CLK_GLOBAL_MEM_FENCE) + dppy.atomic.add(lm, 0, 1) + dppy.barrier(dppy.CLK_GLOBAL_MEM_FENCE) ary[0] = lm[0] def atomic_sub_float32(ary): - lm = 
dppl.local.static_alloc(1, numba.float32) + lm = dppy.local.static_alloc(1, numba.float32) lm[0] = ary[0] - dppl.barrier(dppl.CLK_GLOBAL_MEM_FENCE) - dppl.atomic.sub(lm, 0, 1) - dppl.barrier(dppl.CLK_GLOBAL_MEM_FENCE) + dppy.barrier(dppy.CLK_GLOBAL_MEM_FENCE) + dppy.atomic.sub(lm, 0, 1) + dppy.barrier(dppy.CLK_GLOBAL_MEM_FENCE) ary[0] = lm[0] def atomic_add_int64(ary): - lm = dppl.local.static_alloc(1, numba.int64) + lm = dppy.local.static_alloc(1, numba.int64) lm[0] = ary[0] - dppl.barrier(dppl.CLK_GLOBAL_MEM_FENCE) - dppl.atomic.add(lm, 0, 1) - dppl.barrier(dppl.CLK_GLOBAL_MEM_FENCE) + dppy.barrier(dppy.CLK_GLOBAL_MEM_FENCE) + dppy.atomic.add(lm, 0, 1) + dppy.barrier(dppy.CLK_GLOBAL_MEM_FENCE) ary[0] = lm[0] def atomic_sub_int64(ary): - lm = dppl.local.static_alloc(1, numba.int64) + lm = dppy.local.static_alloc(1, numba.int64) lm[0] = ary[0] - dppl.barrier(dppl.CLK_GLOBAL_MEM_FENCE) - dppl.atomic.sub(lm, 0, 1) - dppl.barrier(dppl.CLK_GLOBAL_MEM_FENCE) + dppy.barrier(dppy.CLK_GLOBAL_MEM_FENCE) + dppy.atomic.sub(lm, 0, 1) + dppy.barrier(dppy.CLK_GLOBAL_MEM_FENCE) ary[0] = lm[0] def atomic_add_float64(ary): - lm = dppl.local.static_alloc(1, numba.float64) + lm = dppy.local.static_alloc(1, numba.float64) lm[0] = ary[0] - dppl.barrier(dppl.CLK_GLOBAL_MEM_FENCE) - dppl.atomic.add(lm, 0, 1) - dppl.barrier(dppl.CLK_GLOBAL_MEM_FENCE) + dppy.barrier(dppy.CLK_GLOBAL_MEM_FENCE) + dppy.atomic.add(lm, 0, 1) + dppy.barrier(dppy.CLK_GLOBAL_MEM_FENCE) ary[0] = lm[0] def atomic_sub_float64(ary): - lm = dppl.local.static_alloc(1, numba.float64) + lm = dppy.local.static_alloc(1, numba.float64) lm[0] = ary[0] - dppl.barrier(dppl.CLK_GLOBAL_MEM_FENCE) - dppl.atomic.sub(lm, 0, 1) - dppl.barrier(dppl.CLK_GLOBAL_MEM_FENCE) + dppy.barrier(dppy.CLK_GLOBAL_MEM_FENCE) + dppy.atomic.sub(lm, 0, 1) + dppy.barrier(dppy.CLK_GLOBAL_MEM_FENCE) ary[0] = lm[0] def atomic_add2(ary): - tx = dppl.get_local_id(0) - ty = dppl.get_local_id(1) - lm = dppl.local.static_alloc((4, 8), numba.uint32) + tx = dppy.get_local_id(0) + ty = dppy.get_local_id(1) + lm = dppy.local.static_alloc((4, 8), numba.uint32) lm[tx, ty] = ary[tx, ty] - dppl.barrier(dppl.CLK_GLOBAL_MEM_FENCE) - dppl.atomic.add(lm, (tx, ty), 1) - dppl.barrier(dppl.CLK_GLOBAL_MEM_FENCE) + dppy.barrier(dppy.CLK_GLOBAL_MEM_FENCE) + dppy.atomic.add(lm, (tx, ty), 1) + dppy.barrier(dppy.CLK_GLOBAL_MEM_FENCE) ary[tx, ty] = lm[tx, ty] def atomic_add3(ary): - tx = dppl.get_local_id(0) - ty = dppl.get_local_id(1) - lm = dppl.local.static_alloc((4, 8), numba.uint32) + tx = dppy.get_local_id(0) + ty = dppy.get_local_id(1) + lm = dppy.local.static_alloc((4, 8), numba.uint32) lm[tx, ty] = ary[tx, ty] - dppl.barrier(dppl.CLK_GLOBAL_MEM_FENCE) - dppl.atomic.add(lm, (tx, numba.uint64(ty)), 1) - dppl.barrier(dppl.CLK_GLOBAL_MEM_FENCE) + dppy.barrier(dppy.CLK_GLOBAL_MEM_FENCE) + dppy.atomic.add(lm, (tx, numba.uint64(ty)), 1) + dppy.barrier(dppy.CLK_GLOBAL_MEM_FENCE) ary[tx, ty] = lm[tx, ty] @@ -118,18 +118,18 @@ def call_fn_for_datatypes(fn, result, input, global_size): # continue #if dtype == np.int64 and not device_env.device_support_int64_atomics(): # continue - fn[global_size, dppl.DEFAULT_LOCAL_SIZE](a) + fn[global_size, dppy.DEFAULT_LOCAL_SIZE](a) assert(a[0] == result) @unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system') @unittest.skipUnless(numba_dppy.ocl.atomic_support_present(), 'test only when atomic support is present') -class TestAtomicOp(DPPLTestCase): +class TestAtomicOp(DPPYTestCase): def test_atomic_add_global(self): - @dppl.kernel + @dppy.kernel def 
atomic_add(B): - dppl.atomic.add(B, 0, 1) + dppy.atomic.add(B, 0, 1) N = 100 B = np.array([0]) @@ -138,9 +138,9 @@ def atomic_add(B): def test_atomic_sub_global(self): - @dppl.kernel + @dppy.kernel def atomic_sub(B): - dppl.atomic.sub(B, 0, 1) + dppy.atomic.sub(B, 0, 1) N = 100 B = np.array([100]) @@ -152,10 +152,10 @@ def test_atomic_add_local_int32(self): ary = np.random.randint(0, 32, size=32).astype(np.uint32) orig = ary.copy() - #dppl_atomic_add = dppl.kernel('void(uint32[:])')(atomic_add_int32) - dppl_atomic_add = dppl.kernel(atomic_add_int32) + #dppy_atomic_add = dppy.kernel('void(uint32[:])')(atomic_add_int32) + dppy_atomic_add = dppy.kernel(atomic_add_int32) with dpctl.device_context("opencl:gpu") as gpu_queue: - dppl_atomic_add[32, dppl.DEFAULT_LOCAL_SIZE](ary) + dppy_atomic_add[32, dppy.DEFAULT_LOCAL_SIZE](ary) gold = np.zeros(32, dtype=np.uint32) for i in range(orig.size): @@ -168,10 +168,10 @@ def test_atomic_sub_local_int32(self): ary = np.random.randint(0, 32, size=32).astype(np.uint32) orig = ary.copy() - #dppl_atomic_sub = dppl.kernel('void(uint32[:])')(atomic_sub_int32) - dppl_atomic_sub = dppl.kernel(atomic_sub_int32) + #dppy_atomic_sub = dppy.kernel('void(uint32[:])')(atomic_sub_int32) + dppy_atomic_sub = dppy.kernel(atomic_sub_int32) with dpctl.device_context("opencl:gpu") as gpu_queue: - dppl_atomic_sub[32, dppl.DEFAULT_LOCAL_SIZE](ary) + dppy_atomic_sub[32, dppy.DEFAULT_LOCAL_SIZE](ary) gold = np.zeros(32, dtype=np.uint32) for i in range(orig.size): @@ -183,10 +183,10 @@ def test_atomic_sub_local_int32(self): def test_atomic_add_local_float32(self): ary = np.array([0], dtype=np.float32) - #dppl_atomic_add = dppl.kernel('void(float32[:])')(atomic_add_float32) - dppl_atomic_add = dppl.kernel(atomic_add_float32) + #dppy_atomic_add = dppy.kernel('void(float32[:])')(atomic_add_float32) + dppy_atomic_add = dppy.kernel(atomic_add_float32) with dpctl.device_context("opencl:gpu") as gpu_queue: - dppl_atomic_add[32, dppl.DEFAULT_LOCAL_SIZE](ary) + dppy_atomic_add[32, dppy.DEFAULT_LOCAL_SIZE](ary) self.assertTrue(ary[0] == 32) @@ -194,11 +194,11 @@ def test_atomic_add_local_float32(self): def test_atomic_sub_local_float32(self): ary = np.array([32], dtype=np.float32) - #dppl_atomic_sub = dppl.kernel('void(float32[:])')(atomic_sub_float32) - dppl_atomic_sub = dppl.kernel(atomic_sub_float32) + #dppy_atomic_sub = dppy.kernel('void(float32[:])')(atomic_sub_float32) + dppy_atomic_sub = dppy.kernel(atomic_sub_float32) with dpctl.device_context("opencl:gpu") as gpu_queue: - dppl_atomic_sub[32, dppl.DEFAULT_LOCAL_SIZE](ary) + dppy_atomic_sub[32, dppy.DEFAULT_LOCAL_SIZE](ary) self.assertTrue(ary[0] == 0) @@ -206,12 +206,12 @@ def test_atomic_sub_local_float32(self): def test_atomic_add_local_int64(self): ary = np.array([0], dtype=np.int64) - #dppl_atomic_add = dppl.kernel('void(int64[:])')(atomic_add_int64) - dppl_atomic_add = dppl.kernel(atomic_add_int64) + #dppy_atomic_add = dppy.kernel('void(int64[:])')(atomic_add_int64) + dppy_atomic_add = dppy.kernel(atomic_add_int64) with dpctl.device_context("opencl:gpu") as gpu_queue: # TODO: dpctl needs to expose this functions #if device_env.device_support_int64_atomics(): - dppl_atomic_add[32, dppl.DEFAULT_LOCAL_SIZE](ary) + dppy_atomic_add[32, dppy.DEFAULT_LOCAL_SIZE](ary) self.assertTrue(ary[0] == 32) #else: # return @@ -220,12 +220,12 @@ def test_atomic_add_local_int64(self): def test_atomic_sub_local_int64(self): ary = np.array([32], dtype=np.int64) - #fn = dppl.kernel('void(int64[:])')(atomic_sub_int64) - fn = 
dppl.kernel(atomic_sub_int64) + #fn = dppy.kernel('void(int64[:])')(atomic_sub_int64) + fn = dppy.kernel(atomic_sub_int64) with dpctl.device_context("opencl:gpu") as gpu_queue: # TODO: dpctl needs to expose this functions #if device_env.device_support_int64_atomics(): - fn[32, dppl.DEFAULT_LOCAL_SIZE](ary) + fn[32, dppy.DEFAULT_LOCAL_SIZE](ary) self.assertTrue(ary[0] == 0) #else: # return @@ -234,12 +234,12 @@ def test_atomic_sub_local_int64(self): def test_atomic_add_local_float64(self): ary = np.array([0], dtype=np.double) - #fn = dppl.kernel('void(float64[:])')(atomic_add_float64) - fn = dppl.kernel(atomic_add_float64) + #fn = dppy.kernel('void(float64[:])')(atomic_add_float64) + fn = dppy.kernel(atomic_add_float64) with dpctl.device_context("opencl:gpu") as gpu_queue: # TODO: dpctl needs to expose this functions #if device_env.device_support_float64_atomics(): - fn[32, dppl.DEFAULT_LOCAL_SIZE](ary) + fn[32, dppy.DEFAULT_LOCAL_SIZE](ary) self.assertTrue(ary[0] == 32) #else: # return @@ -248,12 +248,12 @@ def test_atomic_add_local_float64(self): def test_atomic_sub_local_float64(self): ary = np.array([32], dtype=np.double) - #fn = dppl.kernel('void(float64[:])')(atomic_sub_int64) - fn = dppl.kernel(atomic_sub_int64) + #fn = dppy.kernel('void(float64[:])')(atomic_sub_int64) + fn = dppy.kernel(atomic_sub_int64) with dpctl.device_context("opencl:gpu") as gpu_queue: # TODO: dpctl needs to expose this functions #if device_env.device_support_float64_atomics(): - fn[32, dppl.DEFAULT_LOCAL_SIZE](ary) + fn[32, dppy.DEFAULT_LOCAL_SIZE](ary) self.assertTrue(ary[0] == 0) #else: # return @@ -262,20 +262,20 @@ def test_atomic_sub_local_float64(self): def test_atomic_add2(self): ary = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8) orig = ary.copy() - #dppl_atomic_add2 = dppl.kernel('void(uint32[:,:])')(atomic_add2) - dppl_atomic_add2 = dppl.kernel(atomic_add2) + #dppy_atomic_add2 = dppy.kernel('void(uint32[:,:])')(atomic_add2) + dppy_atomic_add2 = dppy.kernel(atomic_add2) with dpctl.device_context("opencl:gpu") as gpu_queue: - dppl_atomic_add2[(4, 8), dppl.DEFAULT_LOCAL_SIZE](ary) + dppy_atomic_add2[(4, 8), dppy.DEFAULT_LOCAL_SIZE](ary) self.assertTrue(np.all(ary == orig + 1)) def test_atomic_add3(self): ary = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8) orig = ary.copy() - #dppl_atomic_add3 = dppl.kernel('void(uint32[:,:])')(atomic_add3) - dppl_atomic_add3 = dppl.kernel(atomic_add3) + #dppy_atomic_add3 = dppy.kernel('void(uint32[:,:])')(atomic_add3) + dppy_atomic_add3 = dppy.kernel(atomic_add3) with dpctl.device_context("opencl:gpu") as gpu_queue: - dppl_atomic_add3[(4, 8), dppl.DEFAULT_LOCAL_SIZE](ary) + dppy_atomic_add3[(4, 8), dppy.DEFAULT_LOCAL_SIZE](ary) self.assertTrue(np.all(ary == orig + 1)) diff --git a/numba_dppy/tests/test_barrier.py b/numba_dppy/tests/test_barrier.py index aeff16dd40..3657672240 100644 --- a/numba_dppy/tests/test_barrier.py +++ b/numba_dppy/tests/test_barrier.py @@ -3,21 +3,21 @@ import numpy as np from numba_dppy.testing import unittest -from numba_dppy.testing import DPPLTestCase +from numba_dppy.testing import DPPYTestCase from numba import float32 -import numba_dppy, numba_dppy as dppl +import numba_dppy, numba_dppy as dppy import dpctl @unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system') class TestBarrier(unittest.TestCase): def test_proper_lowering(self): - #@dppl.kernel("void(float32[::1])") - @dppl.kernel + #@dppy.kernel("void(float32[::1])") + @dppy.kernel def twice(A): - i = dppl.get_global_id(0) + i = 
dppy.get_global_id(0) d = A[i] - dppl.barrier(dppl.CLK_LOCAL_MEM_FENCE) # local mem fence + dppy.barrier(dppy.CLK_LOCAL_MEM_FENCE) # local mem fence A[i] = d * 2 N = 256 @@ -31,13 +31,13 @@ def twice(A): np.testing.assert_allclose(orig * 2, arr) def test_no_arg_barrier_support(self): - #@dppl.kernel("void(float32[::1])") - @dppl.kernel + #@dppy.kernel("void(float32[::1])") + @dppy.kernel def twice(A): - i = dppl.get_global_id(0) + i = dppy.get_global_id(0) d = A[i] # no argument defaults to global mem fence - dppl.barrier() + dppy.barrier() A[i] = d * 2 N = 256 @@ -45,7 +45,7 @@ def twice(A): orig = arr.copy() with dpctl.device_context("opencl:gpu") as gpu_queue: - twice[N, dppl.DEFAULT_LOCAL_SIZE](arr) + twice[N, dppy.DEFAULT_LOCAL_SIZE](arr) # The computation is correct? np.testing.assert_allclose(orig * 2, arr) @@ -54,16 +54,16 @@ def twice(A): def test_local_memory(self): blocksize = 10 - #@dppl.kernel("void(float32[::1])") - @dppl.kernel + #@dppy.kernel("void(float32[::1])") + @dppy.kernel def reverse_array(A): - lm = dppl.local.static_alloc(shape=10, dtype=float32) - i = dppl.get_global_id(0) + lm = dppy.local.static_alloc(shape=10, dtype=float32) + i = dppy.get_global_id(0) # preload lm[i] = A[i] # barrier local or global will both work as we only have one work group - dppl.barrier(dppl.CLK_LOCAL_MEM_FENCE) # local mem fence + dppy.barrier(dppy.CLK_LOCAL_MEM_FENCE) # local mem fence # write A[i] += lm[blocksize - 1 - i] @@ -71,7 +71,7 @@ def reverse_array(A): orig = arr.copy() with dpctl.device_context("opencl:gpu") as gpu_queue: - reverse_array[blocksize, dppl.DEFAULT_LOCAL_SIZE](arr) + reverse_array[blocksize, dppy.DEFAULT_LOCAL_SIZE](arr) expected = orig[::-1] + orig np.testing.assert_allclose(expected, arr) diff --git a/numba_dppy/tests/test_black_scholes.py b/numba_dppy/tests/test_black_scholes.py index 3d9581bb54..312536d33a 100644 --- a/numba_dppy/tests/test_black_scholes.py +++ b/numba_dppy/tests/test_black_scholes.py @@ -4,9 +4,9 @@ import math import time -import numba_dppy, numba_dppy as dppl +import numba_dppy, numba_dppy as dppy from numba_dppy.testing import unittest -from numba_dppy.testing import DPPLTestCase +from numba_dppy.testing import DPPYTestCase import dpctl @@ -49,7 +49,7 @@ def randfloat(rand_var, low, high): @unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system') -class TestDPPLBlackScholes(DPPLTestCase): +class TestDPPYBlackScholes(DPPYTestCase): def test_black_scholes(self): OPT_N = 400 iterations = 2 @@ -70,9 +70,9 @@ def test_black_scholes(self): optionStrike, optionYears, RISKFREE, VOLATILITY) - @dppl.kernel - def black_scholes_dppl(callResult, putResult, S, X, T, R, V): - i = dppl.get_global_id(0) + @dppy.kernel + def black_scholes_dppy(callResult, putResult, S, X, T, R, V): + i = dppy.get_global_id(0) if i >= S.shape[0]: return sqrtT = math.sqrt(T[i]) @@ -103,7 +103,7 @@ def black_scholes_dppl(callResult, putResult, S, X, T, R, V): with dpctl.device_context("opencl:gpu") as gpu_queue: time1 = time.time() for i in range(iterations): - black_scholes_dppl[blockdim, griddim]( + black_scholes_dppy[blockdim, griddim]( callResultNumbapro, putResultNumbapro, stockPrice, optionStrike, optionYears, RISKFREE, VOLATILITY) diff --git a/numba_dppy/tests/test_caching.py b/numba_dppy/tests/test_caching.py index 6a6a7967a5..ae693190a3 100644 --- a/numba_dppy/tests/test_caching.py +++ b/numba_dppy/tests/test_caching.py @@ -3,18 +3,18 @@ import sys import numpy as np -import numba_dppy, numba_dppy as dppl +import numba_dppy, numba_dppy as dppy import 
dpctl from numba_dppy.testing import unittest -from numba_dppy.testing import DPPLTestCase +from numba_dppy.testing import DPPYTestCase def data_parallel_sum(a, b, c): - i = dppl.get_global_id(0) + i = dppy.get_global_id(0) c[i] = a[i] + b[i] -class TestCaching(DPPLTestCase): +class TestCaching(DPPYTestCase): def test_caching_kernel(self): global_size = 10 N = global_size @@ -25,11 +25,11 @@ def test_caching_kernel(self): with dpctl.device_context("opencl:gpu") as gpu_queue: - func = dppl.kernel(data_parallel_sum) - caching_kernel = func[global_size, dppl.DEFAULT_LOCAL_SIZE].specialize(a, b, c) + func = dppy.kernel(data_parallel_sum) + caching_kernel = func[global_size, dppy.DEFAULT_LOCAL_SIZE].specialize(a, b, c) for i in range(10): - cached_kernel = func[global_size, dppl.DEFAULT_LOCAL_SIZE].specialize(a, b, c) + cached_kernel = func[global_size, dppy.DEFAULT_LOCAL_SIZE].specialize(a, b, c) self.assertIs(caching_kernel, cached_kernel) diff --git a/numba_dppy/tests/test_device_array_args.py b/numba_dppy/tests/test_device_array_args.py index 024e3723a9..b38eac12fe 100644 --- a/numba_dppy/tests/test_device_array_args.py +++ b/numba_dppy/tests/test_device_array_args.py @@ -4,14 +4,14 @@ import sys import numpy as np -import numba_dppy, numba_dppy as dppl +import numba_dppy, numba_dppy as dppy import dpctl from numba_dppy.testing import unittest -from numba_dppy.testing import DPPLTestCase +from numba_dppy.testing import DPPYTestCase -@dppl.kernel +@dppy.kernel def data_parallel_sum(a, b, c): - i = dppl.get_global_id(0) + i = dppy.get_global_id(0) c[i] = a[i] + b[i] @@ -24,23 +24,23 @@ def data_parallel_sum(a, b, c): @unittest.skipUnless(dpctl.has_cpu_queues(), 'test only on CPU system') -class TestDPPLDeviceArrayArgsGPU(DPPLTestCase): +class TestDPPYDeviceArrayArgsGPU(DPPYTestCase): def test_device_array_args_cpu(self): c = np.ones_like(a) with dpctl.device_context("opencl:cpu") as cpu_queue: - data_parallel_sum[global_size, dppl.DEFAULT_LOCAL_SIZE](a, b, c) + data_parallel_sum[global_size, dppy.DEFAULT_LOCAL_SIZE](a, b, c) self.assertTrue(np.all(c == d)) @unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system') -class TestDPPLDeviceArrayArgsCPU(DPPLTestCase): +class TestDPPYDeviceArrayArgsCPU(DPPYTestCase): def test_device_array_args_gpu(self): c = np.ones_like(a) with dpctl.device_context("opencl:gpu") as gpu_queue: - data_parallel_sum[global_size, dppl.DEFAULT_LOCAL_SIZE](a, b, c) + data_parallel_sum[global_size, dppy.DEFAULT_LOCAL_SIZE](a, b, c) self.assertTrue(np.all(c == d)) diff --git a/numba_dppy/tests/test_dpctl_api.py b/numba_dppy/tests/test_dpctl_api.py index bb72a35cf2..dcbb95e163 100644 --- a/numba_dppy/tests/test_dpctl_api.py +++ b/numba_dppy/tests/test_dpctl_api.py @@ -3,12 +3,12 @@ import numpy as np from numba_dppy.testing import unittest -from numba_dppy.testing import DPPLTestCase +from numba_dppy.testing import DPPYTestCase import dpctl @unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system') -class TestDPCTLAPI(DPPLTestCase): +class TestDPCTLAPI(DPPYTestCase): def test_dpctl_api(self): with dpctl.device_context("opencl:gpu") as gpu_queue: dpctl.dump() diff --git a/numba_dppy/tests/test_dpnp_functions.py b/numba_dppy/tests/test_dpnp_functions.py index bbffb30c3f..b0837f5ba6 100644 --- a/numba_dppy/tests/test_dpnp_functions.py +++ b/numba_dppy/tests/test_dpnp_functions.py @@ -5,9 +5,9 @@ import sys import numpy as np from numba import njit -import numba_dppy, numba_dppy as dppl +import numba_dppy, numba_dppy as dppy from numba_dppy.testing 
 import unittest
-from numba_dppy.testing import DPPLTestCase
+from numba_dppy.testing import DPPYTestCase
 
 
 def test_for_different_datatypes(fn, test_fn, dims, arg_count, tys, np_all=False, matrix=None):
@@ -76,7 +76,7 @@ def ensure_dpnp():
 
 
 @unittest.skipUnless(ensure_dpnp(), 'test only when dpNP is available')
-class Testdpnp_functions(DPPLTestCase):
+class Testdpnp_functions(DPPYTestCase):
     N = 10
 
     a = np.array(np.random.random(N), dtype=np.float32)
diff --git a/numba_dppy/tests/test_dppl_fallback.py b/numba_dppy/tests/test_dppl_fallback.py
index adb7ae868b..8519f4fb14 100644
--- a/numba_dppy/tests/test_dppl_fallback.py
+++ b/numba_dppy/tests/test_dppl_fallback.py
@@ -3,9 +3,9 @@
 import numpy as np
 
 import numba
-import numba_dppy, numba_dppy as dppl
+import numba_dppy, numba_dppy as dppy
 from numba_dppy.testing import unittest
-from numba_dppy.testing import DPPLTestCase
+from numba_dppy.testing import DPPYTestCase
 from numba.tests.support import captured_stderr
 import dpctl
 import sys
@@ -13,8 +13,8 @@
 
 
 @unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system')
-class TestDPPLFallback(DPPLTestCase):
-    def test_dppl_fallback_inner_call(self):
+class TestDPPYFallback(DPPYTestCase):
+    def test_dppy_fallback_inner_call(self):
         @numba.jit
         def fill_value(i):
             return i
@@ -29,27 +29,27 @@ def inner_call_fallback():
             return a
 
         with captured_stderr() as msg:
-            dppl = numba.njit(parallel={'offload':True})(inner_call_fallback)
-            dppl_result = dppl()
+            dppy = numba.njit(parallel={'offload':True})(inner_call_fallback)
+            dppy_result = dppy()
 
         ref_result = inner_call_fallback()
 
-        np.testing.assert_array_equal(dppl_result, ref_result)
-        self.assertTrue('Failed to lower parfor on DPPL-device' in msg.getvalue())
+        np.testing.assert_array_equal(dppy_result, ref_result)
+        self.assertTrue('Failed to lower parfor on DPPY-device' in msg.getvalue())
 
-    def test_dppl_fallback_reductions(self):
+    def test_dppy_fallback_reductions(self):
         def reduction(a):
             return np.amax(a)
 
         a = np.ones(10)
         with captured_stderr() as msg:
-            dppl = numba.njit(parallel={'offload':True})(reduction)
-            dppl_result = dppl(a)
+            dppy = numba.njit(parallel={'offload':True})(reduction)
+            dppy_result = dppy(a)
 
         ref_result = reduction(a)
 
-        np.testing.assert_array_equal(dppl_result, ref_result)
-        self.assertTrue('Failed to lower parfor on DPPL-device' in msg.getvalue())
+        np.testing.assert_array_equal(dppy_result, ref_result)
+        self.assertTrue('Failed to lower parfor on DPPY-device' in msg.getvalue())
 
 
 if __name__ == '__main__':
diff --git a/numba_dppy/tests/test_dppl_func.py b/numba_dppy/tests/test_dppl_func.py
index 0f64046082..c58908554e 100644
--- a/numba_dppy/tests/test_dppl_func.py
+++ b/numba_dppy/tests/test_dppl_func.py
@@ -2,59 +2,59 @@
 
 import numpy as np
 
-import numba_dppy, numba_dppy as dppl
+import numba_dppy, numba_dppy as dppy
 from numba_dppy.testing import unittest
-from numba_dppy.testing import DPPLTestCase
+from numba_dppy.testing import DPPYTestCase
 import dpctl
 
 
 @unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system')
-class TestDPPLFunc(DPPLTestCase):
+class TestDPPYFunc(DPPYTestCase):
     N = 257
 
-    def test_dppl_func_device_array(self):
-        @dppl.func
+    def test_dppy_func_device_array(self):
+        @dppy.func
         def g(a):
             return a + 1
 
-        @dppl.kernel
+        @dppy.kernel
         def f(a, b):
-            i = dppl.get_global_id(0)
+            i = dppy.get_global_id(0)
             b[i] = g(a[i])
 
         a = np.ones(self.N)
         b = np.ones(self.N)
 
         with dpctl.device_context("opencl:gpu") as gpu_queue:
-            f[self.N, dppl.DEFAULT_LOCAL_SIZE](a, b)
+            f[self.N, dppy.DEFAULT_LOCAL_SIZE](a, b)
 
             self.assertTrue(np.all(b == 2))
 
-    def test_dppl_func_ndarray(self):
-        @dppl.func
+    def test_dppy_func_ndarray(self):
+        @dppy.func
         def g(a):
             return a + 1
 
-        @dppl.kernel
+        @dppy.kernel
         def f(a, b):
-            i = dppl.get_global_id(0)
+            i = dppy.get_global_id(0)
             b[i] = g(a[i])
 
-        @dppl.kernel
+        @dppy.kernel
         def h(a, b):
-            i = dppl.get_global_id(0)
+            i = dppy.get_global_id(0)
             b[i] = g(a[i]) + 1
 
         a = np.ones(self.N)
         b = np.ones(self.N)
 
         with dpctl.device_context("opencl:gpu") as gpu_queue:
-            f[self.N, dppl.DEFAULT_LOCAL_SIZE](a, b)
+            f[self.N, dppy.DEFAULT_LOCAL_SIZE](a, b)
 
             self.assertTrue(np.all(b == 2))
 
-            h[self.N, dppl.DEFAULT_LOCAL_SIZE](a, b)
+            h[self.N, dppy.DEFAULT_LOCAL_SIZE](a, b)
 
             self.assertTrue(np.all(b == 3))
diff --git a/numba_dppy/tests/test_math_functions.py b/numba_dppy/tests/test_math_functions.py
index 977fe85fef..f83fdd30ee 100644
--- a/numba_dppy/tests/test_math_functions.py
+++ b/numba_dppy/tests/test_math_functions.py
@@ -4,45 +4,45 @@
 import sys
 import numpy as np
-import numba_dppy, numba_dppy as dppl
+import numba_dppy, numba_dppy as dppy
 import dpctl
 from numba_dppy.testing import unittest
-from numba_dppy.testing import DPPLTestCase
+from numba_dppy.testing import DPPYTestCase
 import math
 
-@dppl.kernel
-def dppl_fabs(a,b):
-    i = dppl.get_global_id(0)
+@dppy.kernel
+def dppy_fabs(a,b):
+    i = dppy.get_global_id(0)
     b[i] = math.fabs(a[i])
 
-@dppl.kernel
-def dppl_exp(a,b):
-    i = dppl.get_global_id(0)
+@dppy.kernel
+def dppy_exp(a,b):
+    i = dppy.get_global_id(0)
     b[i] = math.exp(a[i])
 
-@dppl.kernel
-def dppl_log(a,b):
-    i = dppl.get_global_id(0)
+@dppy.kernel
+def dppy_log(a,b):
+    i = dppy.get_global_id(0)
     b[i] = math.log(a[i])
 
-@dppl.kernel
-def dppl_sqrt(a,b):
-    i = dppl.get_global_id(0)
+@dppy.kernel
+def dppy_sqrt(a,b):
+    i = dppy.get_global_id(0)
     b[i] = math.sqrt(a[i])
 
-@dppl.kernel
-def dppl_sin(a,b):
-    i = dppl.get_global_id(0)
+@dppy.kernel
+def dppy_sin(a,b):
+    i = dppy.get_global_id(0)
     b[i] = math.sin(a[i])
 
-@dppl.kernel
-def dppl_cos(a,b):
-    i = dppl.get_global_id(0)
+@dppy.kernel
+def dppy_cos(a,b):
+    i = dppy.get_global_id(0)
     b[i] = math.cos(a[i])
 
-@dppl.kernel
-def dppl_tan(a,b):
-    i = dppl.get_global_id(0)
+@dppy.kernel
+def dppy_tan(a,b):
+    i = dppy.get_global_id(0)
     b[i] = math.tan(a[i])
 
 global_size = 10
@@ -53,7 +53,7 @@ def dppl_tan(a,b):
 
 def driver(a, jitfunc):
     b = np.ones_like(a)  # Device buffers
-    jitfunc[global_size, dppl.DEFAULT_LOCAL_SIZE](a, b)
+    jitfunc[global_size, dppy.DEFAULT_LOCAL_SIZE](a, b)
     return b
 
 
@@ -73,67 +73,67 @@ def test_driver(input_arr, device_ty, jitfunc):
 
 
 @unittest.skipUnless(dpctl.has_cpu_queues(), 'test only on CPU system')
-class TestDPPLMathFunctionsCPU(DPPLTestCase):
+class TestDPPYMathFunctionsCPU(DPPYTestCase):
     def test_fabs_cpu(self):
-        b_actual = test_driver(a, "CPU", dppl_fabs)
+        b_actual = test_driver(a, "CPU", dppy_fabs)
         b_expected = np.fabs(a)
         self.assertTrue(np.all(b_actual == b_expected))
 
     def test_sin_cpu(self):
-        b_actual = test_driver(a, "CPU", dppl_sin)
+        b_actual = test_driver(a, "CPU", dppy_sin)
         b_expected = np.sin(a)
         self.assertTrue(np.allclose(b_actual,b_expected))
 
     def test_cos_cpu(self):
-        b_actual = test_driver(a, "CPU", dppl_cos)
+        b_actual = test_driver(a, "CPU", dppy_cos)
         b_expected = np.cos(a)
         self.assertTrue(np.allclose(b_actual,b_expected))
 
     def test_exp_cpu(self):
-        b_actual = test_driver(a, "CPU", dppl_exp)
+        b_actual = test_driver(a, "CPU", dppy_exp)
         b_expected = np.exp(a)
         self.assertTrue(np.allclose(b_actual,b_expected))
 
     def test_sqrt_cpu(self):
-        b_actual = test_driver(a, "CPU", dppl_sqrt)
+        b_actual = test_driver(a, "CPU", dppy_sqrt)
         b_expected = np.sqrt(a)
         self.assertTrue(np.allclose(b_actual,b_expected))
 
     def test_log_cpu(self):
-        b_actual = test_driver(a, "CPU", dppl_log)
+        b_actual = test_driver(a, "CPU", dppy_log)
         b_expected = np.log(a)
         self.assertTrue(np.allclose(b_actual,b_expected))
 
 
 @unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system')
-class TestDPPLMathFunctionsGPU(DPPLTestCase):
+class TestDPPYMathFunctionsGPU(DPPYTestCase):
     def test_fabs_gpu(self):
-        b_actual = test_driver(a, "GPU", dppl_fabs)
+        b_actual = test_driver(a, "GPU", dppy_fabs)
         b_expected = np.fabs(a)
         self.assertTrue(np.all(b_actual == b_expected))
 
     def test_sin_gpu(self):
-        b_actual = test_driver(a, "GPU", dppl_sin)
+        b_actual = test_driver(a, "GPU", dppy_sin)
         b_expected = np.sin(a)
         self.assertTrue(np.allclose(b_actual,b_expected))
 
     def test_cos_gpu(self):
-        b_actual = test_driver(a, "GPU", dppl_cos)
+        b_actual = test_driver(a, "GPU", dppy_cos)
         b_expected = np.cos(a)
         self.assertTrue(np.allclose(b_actual,b_expected))
 
     def test_exp_gpu(self):
-        b_actual = test_driver(a, "GPU", dppl_exp)
+        b_actual = test_driver(a, "GPU", dppy_exp)
         b_expected = np.exp(a)
         self.assertTrue(np.allclose(b_actual,b_expected))
 
     def test_sqrt_gpu(self):
-        b_actual = test_driver(a, "GPU", dppl_sqrt)
+        b_actual = test_driver(a, "GPU", dppy_sqrt)
         b_expected = np.sqrt(a)
         self.assertTrue(np.allclose(b_actual,b_expected))
 
     def test_log_gpu(self):
-        b_actual = test_driver(a, "GPU", dppl_log)
+        b_actual = test_driver(a, "GPU", dppy_log)
         b_expected = np.log(a)
         self.assertTrue(np.allclose(b_actual,b_expected))
diff --git a/numba_dppy/tests/test_numpy_bit_twiddling_functions.py b/numba_dppy/tests/test_numpy_bit_twiddling_functions.py
index 5e3cd9ba24..de6b7bc963 100644
--- a/numba_dppy/tests/test_numpy_bit_twiddling_functions.py
+++ b/numba_dppy/tests/test_numpy_bit_twiddling_functions.py
@@ -5,12 +5,12 @@
 import sys
 import numpy as np
 from numba import njit
-import numba_dppy, numba_dppy as dppl
+import numba_dppy, numba_dppy as dppy
 from numba_dppy.testing import unittest
-from numba_dppy.testing import DPPLTestCase
+from numba_dppy.testing import DPPYTestCase
 
 
-class TestNumpy_bit_twiddling_functions(DPPLTestCase):
+class TestNumpy_bit_twiddling_functions(DPPYTestCase):
     def test_bitwise_and(self):
         @njit(parallel={'offload':True})
         def f(a, b):
diff --git a/numba_dppy/tests/test_numpy_comparison_functions.py b/numba_dppy/tests/test_numpy_comparison_functions.py
index 0bd7dcbb69..5daf1fc813 100644
--- a/numba_dppy/tests/test_numpy_comparison_functions.py
+++ b/numba_dppy/tests/test_numpy_comparison_functions.py
@@ -5,11 +5,11 @@
 import sys
 import numpy as np
 from numba import njit
-import numba_dppy, numba_dppy as dppl
+import numba_dppy, numba_dppy as dppy
 from numba_dppy.testing import unittest
-from numba_dppy.testing import DPPLTestCase
+from numba_dppy.testing import DPPYTestCase
 
-class TestNumpy_comparison_functions(DPPLTestCase):
+class TestNumpy_comparison_functions(DPPYTestCase):
     a = np.array([4,5,6])
     b = np.array([2,6,6])
     def test_greater(self):
diff --git a/numba_dppy/tests/test_numpy_floating_functions.py b/numba_dppy/tests/test_numpy_floating_functions.py
index 62b76b1ade..c05c10498d 100644
--- a/numba_dppy/tests/test_numpy_floating_functions.py
+++ b/numba_dppy/tests/test_numpy_floating_functions.py
@@ -4,12 +4,12 @@
 import sys
 import numpy as np
 from numba import njit
-import numba_dppy, numba_dppy as dppl
+import numba_dppy, numba_dppy as dppy
 from numba_dppy.testing import unittest
-from numba_dppy.testing import DPPLTestCase
+from numba_dppy.testing import DPPYTestCase
 
-class TestNumpy_floating_functions(DPPLTestCase):
+class TestNumpy_floating_functions(DPPYTestCase):
     def test_isfinite(self):
         @njit(parallel={'offload':True})
         def f(a):
diff --git a/numba_dppy/tests/test_numpy_math_functions.py b/numba_dppy/tests/test_numpy_math_functions.py
index ddbb568ede..155b352c7e 100644
--- a/numba_dppy/tests/test_numpy_math_functions.py
+++ b/numba_dppy/tests/test_numpy_math_functions.py
@@ -5,12 +5,12 @@
 import sys
 import numpy as np
 from numba import njit
-import numba_dppy, numba_dppy as dppl
+import numba_dppy, numba_dppy as dppy
 from numba_dppy.testing import unittest
-from numba_dppy.testing import DPPLTestCase
+from numba_dppy.testing import DPPYTestCase
 
-class TestNumpy_math_functions(DPPLTestCase):
+class TestNumpy_math_functions(DPPYTestCase):
     N = 10
     a = np.array(np.random.random(N), dtype=np.float32)
     b = np.array(np.random.random(N), dtype=np.float32)
diff --git a/numba_dppy/tests/test_numpy_trigonomteric_functions.py b/numba_dppy/tests/test_numpy_trigonomteric_functions.py
index 8f61f941c9..7ce18b870a 100644
--- a/numba_dppy/tests/test_numpy_trigonomteric_functions.py
+++ b/numba_dppy/tests/test_numpy_trigonomteric_functions.py
@@ -5,12 +5,12 @@
 import sys
 import numpy as np
 from numba import njit
-import numba_dppy, numba_dppy as dppl
+import numba_dppy, numba_dppy as dppy
 from numba_dppy.testing import unittest
-from numba_dppy.testing import DPPLTestCase
+from numba_dppy.testing import DPPYTestCase
 
-class TestNumpy_math_functions(DPPLTestCase):
+class TestNumpy_math_functions(DPPYTestCase):
     N = 10
     a = np.array(np.random.random(N), dtype=np.float32)
     b = np.array(np.random.random(N), dtype=np.float32)
diff --git a/numba_dppy/tests/test_parfor_lower_message.py b/numba_dppy/tests/test_parfor_lower_message.py
index fe8c85d356..591fd2cb0e 100644
--- a/numba_dppy/tests/test_parfor_lower_message.py
+++ b/numba_dppy/tests/test_parfor_lower_message.py
@@ -1,8 +1,8 @@
 import numpy as np
 import numba
 from numba import njit, prange
-import numba_dppy, numba_dppy as dppl
-from numba_dppy.testing import unittest, DPPLTestCase
+import numba_dppy, numba_dppy as dppy
+from numba_dppy.testing import unittest, DPPYTestCase
 from numba.tests.support import captured_stdout
 import dpctl
 
@@ -19,7 +19,7 @@ def prange_example():
 
 
 @unittest.skipUnless(dpctl.has_gpu_queues(), "test only on GPU system")
-class TestParforMessage(DPPLTestCase):
+class TestParforMessage(DPPYTestCase):
     def test_parfor_message(self):
         with dpctl.device_context("opencl:gpu") as gpu_queue:
             numba_dppy.compiler.DEBUG = 1
@@ -29,7 +29,7 @@ def test_parfor_message(self):
             jitted()
             numba_dppy.compiler.DEBUG = 0
 
-            self.assertTrue("Parfor lowered on DPPL-device" in got.getvalue())
+            self.assertTrue("Parfor lowered on DPPY-device" in got.getvalue())
 
 
 if __name__ == '__main__':
diff --git a/numba_dppy/tests/test_prange.py b/numba_dppy/tests/test_prange.py
index 317c2cbb2f..f4c13c4b1f 100644
--- a/numba_dppy/tests/test_prange.py
+++ b/numba_dppy/tests/test_prange.py
@@ -6,13 +6,13 @@
 import numpy as np
 import numba
 from numba import njit, prange
-import numba_dppy, numba_dppy as dppl
+import numba_dppy, numba_dppy as dppy
 from numba_dppy.testing import unittest, expectedFailureIf
-from numba_dppy.testing import DPPLTestCase
+from numba_dppy.testing import DPPYTestCase
 from numba.tests.support import captured_stdout
 
 
-class TestPrange(DPPLTestCase):
+class TestPrange(DPPYTestCase):
     def test_one_prange(self):
         @njit(parallel={'offload':True})
         def f(a, b):
@@ -118,8 +118,8 @@ def prange_example():
 
         numba_dppy.compiler.DEBUG = old_debug
 
-        self.assertEqual(stdout.getvalue().count('Parfor lowered on DPPL-device'), 2, stdout.getvalue())
-        self.assertEqual(stdout.getvalue().count('Failed to lower parfor on DPPL-device'), 0, stdout.getvalue())
+        self.assertEqual(stdout.getvalue().count('Parfor lowered on DPPY-device'), 2, stdout.getvalue())
+        self.assertEqual(stdout.getvalue().count('Failed to lower parfor on DPPY-device'), 0, stdout.getvalue())
 
         np.testing.assert_equal(res, jitted_res)
 
@@ -146,8 +146,8 @@ def prange_example():
 
         numba_dppy.compiler.DEBUG = old_debug
 
-        self.assertEqual(stdout.getvalue().count('Parfor lowered on DPPL-device'), 2, stdout.getvalue())
-        self.assertEqual(stdout.getvalue().count('Failed to lower parfor on DPPL-device'), 0, stdout.getvalue())
+        self.assertEqual(stdout.getvalue().count('Parfor lowered on DPPY-device'), 2, stdout.getvalue())
+        self.assertEqual(stdout.getvalue().count('Failed to lower parfor on DPPY-device'), 0, stdout.getvalue())
 
         np.testing.assert_equal(res, jitted_res)
diff --git a/numba_dppy/tests/test_print.py b/numba_dppy/tests/test_print.py
index ca1e47978a..0bc4a7cc2b 100644
--- a/numba_dppy/tests/test_print.py
+++ b/numba_dppy/tests/test_print.py
@@ -5,24 +5,24 @@
 import sys
 import numpy as np
 from numba import njit, prange
-import numba_dppy, numba_dppy as dppl
+import numba_dppy, numba_dppy as dppy
 from numba_dppy.testing import unittest
-from numba_dppy.testing import DPPLTestCase
+from numba_dppy.testing import DPPYTestCase
 import dpctl
 
 
 @unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system')
-class TestPrint(DPPLTestCase):
-    def test_print_dppl_kernel(self):
-        @dppl.func
+class TestPrint(DPPYTestCase):
+    def test_print_dppy_kernel(self):
+        @dppy.func
         def g(a):
             print("value of a:", a)
             return a + 1
 
-        @dppl.kernel
+        @dppy.kernel
         def f(a, b):
-            i = dppl.get_global_id(0)
+            i = dppy.get_global_id(0)
             b[i] = g(a[i])
             print("value of b at:", i, "is", b[i])
 
@@ -32,7 +32,7 @@ def f(a, b):
         b = np.ones(N)
 
         with dpctl.device_context("opencl:gpu") as gpu_queue:
-            f[N, dppl.DEFAULT_LOCAL_SIZE](a, b)
+            f[N, dppy.DEFAULT_LOCAL_SIZE](a, b)
 
 
 if __name__ == '__main__':
diff --git a/numba_dppy/tests/test_sum_reduction.py b/numba_dppy/tests/test_sum_reduction.py
index 3095497a66..8ec7b3d5a9 100644
--- a/numba_dppy/tests/test_sum_reduction.py
+++ b/numba_dppy/tests/test_sum_reduction.py
@@ -4,14 +4,14 @@
 import math
 import time
 
-import numba_dppy, numba_dppy as dppl
+import numba_dppy, numba_dppy as dppy
 from numba_dppy.testing import unittest
-from numba_dppy.testing import DPPLTestCase
+from numba_dppy.testing import DPPYTestCase
 import dpctl
 
-@dppl.kernel
+@dppy.kernel
 def reduction_kernel(A, R, stride):
-    i = dppl.get_global_id(0)
+    i = dppy.get_global_id(0)
     # sum two element
     R[i] = A[i] + A[i+stride]
     # store the sum to be used in nex iteration
@@ -19,7 +19,7 @@ def reduction_kernel(A, R, stride):
 
 
 @unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system')
-class TestDPPLSumReduction(DPPLTestCase):
+class TestDPPYSumReduction(DPPYTestCase):
     def test_sum_reduction(self):
         # This test will only work for even case
         N = 1024
@@ -36,7 +36,7 @@ def test_sum_reduction(self):
             while (total > 1):
                 # call kernel
                 global_size = total // 2
-                reduction_kernel[global_size, dppl.DEFAULT_LOCAL_SIZE](A, R, global_size)
+                reduction_kernel[global_size, dppy.DEFAULT_LOCAL_SIZE](A, R, global_size)
                 total = total // 2
 
         result = A_copy.sum()
diff --git a/numba_dppy/tests/test_vectorize.py b/numba_dppy/tests/test_vectorize.py
index 12dc7b5ed3..04891ca296 100644
--- a/numba_dppy/tests/test_vectorize.py
+++ b/numba_dppy/tests/test_vectorize.py
@@ -5,12 +5,12 @@
 import sys
 import numpy as np
 from numba import njit, vectorize
-import numba_dppy, numba_dppy as dppl
+import numba_dppy, numba_dppy as dppy
 from numba_dppy.testing import unittest
-from numba_dppy.testing import DPPLTestCase
+from numba_dppy.testing import DPPYTestCase
 
 
-class TestVectorize(DPPLTestCase):
+class TestVectorize(DPPYTestCase):
     def test_vectorize(self):
 
         @vectorize(nopython=True)
diff --git a/numba_dppy/tests/test_with_context.py b/numba_dppy/tests/test_with_context.py
index 0749ff3e89..e025a77784 100644
--- a/numba_dppy/tests/test_with_context.py
+++ b/numba_dppy/tests/test_with_context.py
@@ -2,18 +2,18 @@
 import numba
 import numpy as np
 from numba import njit
-import numba_dppy, numba_dppy as dppl
+import numba_dppy, numba_dppy as dppy
 from numba.core import errors
 from numba.tests.support import captured_stdout
-from numba_dppy.testing import DPPLTestCase, unittest, expectedFailureIf
+from numba_dppy.testing import DPPYTestCase, unittest, expectedFailureIf
 import dpctl
 
 
-class TestWithDPPLContext(DPPLTestCase):
+class TestWithDPPYContext(DPPYTestCase):
 
     @unittest.skipIf(not dpctl.has_gpu_queues(), "No GPU platforms available")
     @expectedFailureIf(sys.platform.startswith('win'))
-    def test_with_dppl_context_gpu(self):
+    def test_with_dppy_context_gpu(self):
 
         @njit
         def nested_func(a, b):
@@ -36,11 +36,11 @@ def func(b):
             func(expected)
 
         np.testing.assert_array_equal(expected, got_gpu)
-        self.assertTrue('Parfor lowered on DPPL-device' in got_gpu_message.getvalue())
+        self.assertTrue('Parfor lowered on DPPY-device' in got_gpu_message.getvalue())
 
     @unittest.skipIf(not dpctl.has_cpu_queues(), "No CPU platforms available")
     @unittest.expectedFailure
-    def test_with_dppl_context_cpu(self):
+    def test_with_dppy_context_cpu(self):
 
         @njit
         def nested_func(a, b):
@@ -63,11 +63,11 @@ def func(b):
             func(expected)
 
         np.testing.assert_array_equal(expected, got_cpu)
-        self.assertTrue('Parfor lowered on DPPL-device' in got_cpu_message.getvalue())
+        self.assertTrue('Parfor lowered on DPPY-device' in got_cpu_message.getvalue())
 
     @unittest.skipIf(not dpctl.has_gpu_queues(), "No GPU platforms available")
-    def test_with_dppl_context_target(self):
+    def test_with_dppy_context_target(self):
 
         @njit(target='cpu')
         def nested_func_target(a, b):
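A note before the next patch: the rename above is purely a spelling change from `dppl` to `dppy`; kernel submission, queue selection, and the testing helpers keep their behavior. A minimal sketch of the renamed API, assuming numba_dppy is installed and dpctl reports an OpenCL GPU queue (the kernel and array names here are illustrative, not taken from the test suite):

    import numpy as np
    import dpctl
    import numba_dppy as dppy

    @dppy.kernel
    def twice(a, b):
        i = dppy.get_global_id(0)  # one work-item per array element
        b[i] = a[i] * 2

    a = np.arange(16, dtype=np.float32)
    b = np.empty_like(a)
    with dpctl.device_context("opencl:gpu"):
        twice[a.size, dppy.DEFAULT_LOCAL_SIZE](a, b)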
From 0105e830a42829ea7206cf4d87d9f9d9246253f2 Mon Sep 17 00:00:00 2001
From: Reazul Hoque
Date: Wed, 9 Dec 2020 00:46:00 -0600
Subject: [PATCH 15/32] Pass to rewrite Numpy function names to be able to overload them for Numba-dppy pipeline (#52)

* Sum example
* Moved from infer_type, lower_builtin to overload
* Added two level module name functions
* Remove cython generated file
* Module name fix for moving to new extension
* Incomplete linalg.eig implementation
* Updated all dppl to dppy and moved rewrite_numpy_function_pass to its own file
* Import module at correct locations
* Added comments
* Added test and updated comments
* Revert unneeded changes
* Update Eigen implementation
* Remove eig implementation
* Add checking equivalent IR

Co-authored-by: reazul.hoque
---
 numba_dppy/device_init.py                     |  13 +-
 numba_dppy/dpctl_functions.py                 |  30 +++++
 numba_dppy/dpnp_glue/__init__.py              |   0
 numba_dppy/dpnp_glue/dpnp_fptr_interface.pyx  |  44 +++++-
 numba_dppy/dpnp_glue/dpnpdecl.py              |  10 ++
 numba_dppy/dpnp_glue/dpnpimpl.py              |  89 +++++++++++
 numba_dppy/dpnp_glue/stubs.py                 |   9 ++
 numba_dppy/dppy_passbuilder.py                |   7 +
 numba_dppy/dppy_passes.py                     |   3 +-
 numba_dppy/rename_numpy_functions_pass.py     | 125 ++++++++++++++++++
 .../tests/test_rename_numpy_function_pass.py  |  67 ++++++++++
 11 files changed, 389 insertions(+), 8 deletions(-)
 create mode 100644 numba_dppy/dpctl_functions.py
 create mode 100644 numba_dppy/dpnp_glue/__init__.py
 create mode 100644 numba_dppy/dpnp_glue/dpnpdecl.py
 create mode 100644 numba_dppy/dpnp_glue/dpnpimpl.py
 create mode 100644 numba_dppy/dpnp_glue/stubs.py
 create mode 100644 numba_dppy/rename_numpy_functions_pass.py
 create mode 100644 numba_dppy/tests/test_rename_numpy_function_pass.py

diff --git a/numba_dppy/device_init.py b/numba_dppy/device_init.py
index c4506014a8..efec55ba83 100644
--- a/numba_dppy/device_init.py
+++ b/numba_dppy/device_init.py
@@ -18,6 +18,14 @@
     CLK_GLOBAL_MEM_FENCE,
 )
 
+"""
+We are importing dpnp stub module to make Numba recognize the
+module when we rename Numpy functions.
+"""
+from .dpnp_glue.stubs import (
+    dpnp
+)
+
 DEFAULT_LOCAL_SIZE = []
 
 from . import initialize
@@ -35,9 +43,4 @@ def is_available():
     return dpctl.has_gpu_queues()
 
 
-#def ocl_error():
-#    """Returns None or an exception if the OpenCL driver fails to initialize.
-#    """
-#    return driver.driver.initialization_error
-
 initialize.initialize_all()
diff --git a/numba_dppy/dpctl_functions.py b/numba_dppy/dpctl_functions.py
new file mode 100644
index 0000000000..67bc358185
--- /dev/null
+++ b/numba_dppy/dpctl_functions.py
@@ -0,0 +1,30 @@
+from numba import types
+from numba.core.typing import signature
+
+
+class _DPCTL_FUNCTIONS:
+    @classmethod
+    def dpctl_get_current_queue(cls):
+        ret_type = types.voidptr
+        sig = signature(ret_type)
+        return types.ExternalFunction("DPCTLQueueMgr_GetCurrentQueue", sig)
+
+    @classmethod
+    def dpctl_malloc_shared(cls):
+        ret_type = types.voidptr
+        sig = signature(ret_type, types.int64, types.voidptr)
+        return types.ExternalFunction("DPCTLmalloc_shared", sig)
+
+    @classmethod
+    def dpctl_queue_memcpy(cls):
+        ret_type = types.void
+        sig = signature(
+            ret_type, types.voidptr, types.voidptr, types.voidptr, types.int64
+        )
+        return types.ExternalFunction("DPCTLQueue_Memcpy", sig)
+
+    @classmethod
+    def dpctl_free_with_queue(cls):
+        ret_type = types.void
+        sig = signature(ret_type, types.voidptr, types.voidptr)
+        return types.ExternalFunction("DPCTLfree_with_queue", sig)
diff --git a/numba_dppy/dpnp_glue/__init__.py b/numba_dppy/dpnp_glue/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/numba_dppy/dpnp_glue/dpnp_fptr_interface.pyx b/numba_dppy/dpnp_glue/dpnp_fptr_interface.pyx
index 8eba8bf74c..a63d4fdafa 100644
--- a/numba_dppy/dpnp_glue/dpnp_fptr_interface.pyx
+++ b/numba_dppy/dpnp_glue/dpnp_fptr_interface.pyx
@@ -8,6 +8,7 @@ cdef extern from "backend_iface_fptr.hpp" namespace "DPNPFuncName":  # need this
     cdef enum DPNPFuncName "DPNPFuncName":
         DPNP_FN_ABSOLUTE
         DPNP_FN_ADD
+        DPNP_FN_ARANGE
         DPNP_FN_ARCCOS
         DPNP_FN_ARCCOSH
         DPNP_FN_ARCSIN
@@ -18,40 +19,77 @@ cdef extern from "backend_iface_fptr.hpp" namespace "DPNPFuncName":  # need this
         DPNP_FN_ARGMAX
         DPNP_FN_ARGMIN
         DPNP_FN_ARGSORT
+        DPNP_FN_BITWISE_AND
+        DPNP_FN_BITWISE_OR
+        DPNP_FN_BITWISE_XOR
         DPNP_FN_CBRT
         DPNP_FN_CEIL
+        DPNP_FN_CHOLESKY
+        DPNP_FN_COPYSIGN
+        DPNP_FN_CORRELATE
         DPNP_FN_COS
         DPNP_FN_COSH
         DPNP_FN_COV
         DPNP_FN_DEGREES
+        DPNP_FN_DET
         DPNP_FN_DIVIDE
         DPNP_FN_DOT
         DPNP_FN_EIG
+        DPNP_FN_EIGVALS
         DPNP_FN_EXP
         DPNP_FN_EXP2
         DPNP_FN_EXPM1
         DPNP_FN_FABS
+        DPNP_FN_FFT_FFT
         DPNP_FN_FLOOR
+        DPNP_FN_FLOOR_DIVIDE
         DPNP_FN_FMOD
-        DPNP_FN_GAUSSIAN
         DPNP_FN_HYPOT
+        DPNP_FN_INVERT
+        DPNP_FN_LEFT_SHIFT
         DPNP_FN_LOG
         DPNP_FN_LOG10
         DPNP_FN_LOG1P
         DPNP_FN_LOG2
         DPNP_FN_MATMUL
+        DPNP_FN_MATRIX_RANK
         DPNP_FN_MAX
         DPNP_FN_MAXIMUM
         DPNP_FN_MEAN
         DPNP_FN_MEDIAN
         DPNP_FN_MIN
         DPNP_FN_MINIMUM
+        DPNP_FN_MODF
         DPNP_FN_MULTIPLY
         DPNP_FN_POWER
         DPNP_FN_PROD
-        DPNP_FN_UNIFORM
         DPNP_FN_RADIANS
+        DPNP_FN_REMAINDER
         DPNP_FN_RECIP
+        DPNP_FN_RIGHT_SHIFT
+        DPNP_FN_RNG_BETA
+        DPNP_FN_RNG_BINOMIAL
+        DPNP_FN_RNG_CHISQUARE
+        DPNP_FN_RNG_EXPONENTIAL
+        DPNP_FN_RNG_GAMMA
+        DPNP_FN_RNG_GAUSSIAN
+        DPNP_FN_RNG_GEOMETRIC
+        DPNP_FN_RNG_GUMBEL
+        DPNP_FN_RNG_HYPERGEOMETRIC
+        DPNP_FN_RNG_LAPLACE
+        DPNP_FN_RNG_LOGNORMAL
+        DPNP_FN_RNG_MULTINOMIAL
+        DPNP_FN_RNG_MULTIVARIATE_NORMAL
+        DPNP_FN_RNG_NEGATIVE_BINOMIAL
+        DPNP_FN_RNG_NORMAL
+        DPNP_FN_RNG_POISSON
+        DPNP_FN_RNG_RAYLEIGH
+        DPNP_FN_RNG_STANDARD_CAUCHY
+        DPNP_FN_RNG_STANDARD_EXPONENTIAL
+        DPNP_FN_RNG_STANDARD_GAMMA
+        DPNP_FN_RNG_STANDARD_NORMAL
+        DPNP_FN_RNG_UNIFORM
+        DPNP_FN_RNG_WEIBULL
         DPNP_FN_SIGN
         DPNP_FN_SIN
         DPNP_FN_SINH
@@ -109,6 +147,8 @@ cdef DPNPFuncName get_DPNPFuncName_from_str(name):
         return DPNPFuncName.DPNP_FN_ARGSORT
     elif name == "dpnp_cov":
         return DPNPFuncName.DPNP_FN_COV
+    elif name == "dpnp_eig":
+        return DPNPFuncName.DPNP_FN_EIG
     else:
         return DPNPFuncName.DPNP_FN_DOT
diff --git a/numba_dppy/dpnp_glue/dpnpdecl.py b/numba_dppy/dpnp_glue/dpnpdecl.py
new file mode 100644
index 0000000000..e77739eeda
--- /dev/null
+++ b/numba_dppy/dpnp_glue/dpnpdecl.py
@@ -0,0 +1,10 @@
+from numba.core.typing.templates import (AttributeTemplate, infer_getattr)
+import numba_dppy
+from numba import types
+
+@infer_getattr
+class DppyDpnpTemplate(AttributeTemplate):
+    key = types.Module(numba_dppy)
+
+    def resolve_dpnp(self, mod):
+        return types.Module(numba_dppy.dpnp)
diff --git a/numba_dppy/dpnp_glue/dpnpimpl.py b/numba_dppy/dpnp_glue/dpnpimpl.py
new file mode 100644
index 0000000000..d6e53c4b99
--- /dev/null
+++ b/numba_dppy/dpnp_glue/dpnpimpl.py
@@ -0,0 +1,89 @@
+from numba.core.imputils import lower_builtin
+import numba_dppy.experimental_numpy_lowering_overload as dpnp_lowering
+from numba import types
+from numba.core.typing import signature
+from numba.core.extending import overload, register_jitable
+from . import stubs
+import numpy as np
+from numba_dppy.dpctl_functions import _DPCTL_FUNCTIONS
+
+
+def get_dpnp_fptr(fn_name, type_names):
+    from . import dpnp_fptr_interface as dpnp_glue
+
+    f_ptr = dpnp_glue.get_dpnp_fn_ptr(fn_name, type_names)
+    return f_ptr
+
+
+@register_jitable
+def _check_finite_matrix(a):
+    for v in np.nditer(a):
+        if not np.isfinite(v.item()):
+            raise np.linalg.LinAlgError("Array must not contain infs or NaNs.")
+
+
+@register_jitable
+def _dummy_liveness_func(a):
+    """pass a list of variables to be preserved through dead code elimination"""
+    return a[0]
+
+
+class RetrieveDpnpFnPtr(types.ExternalFunctionPointer):
+    def __init__(self, fn_name, type_names, sig, get_pointer):
+        self.fn_name = fn_name
+        self.type_names = type_names
+        super(RetrieveDpnpFnPtr, self).__init__(sig, get_pointer)
+
+
+class _DPNP_EXTENSION:
+    def __init__(self, name):
+        dpnp_lowering.ensure_dpnp(name)
+
+    @classmethod
+    def dpnp_sum(cls, fn_name, type_names):
+        ret_type = types.void
+        sig = signature(ret_type, types.voidptr, types.voidptr, types.int64)
+        f_ptr = get_dpnp_fptr(fn_name, type_names)
+
+        def get_pointer(obj):
+            return f_ptr
+
+        return types.ExternalFunctionPointer(sig, get_pointer=get_pointer)
+
+
+@overload(stubs.dpnp.sum)
+def dpnp_sum_impl(a):
+    dpnp_extension = _DPNP_EXTENSION("sum")
+    dpctl_functions = _DPCTL_FUNCTIONS()
+
+    dpnp_sum = dpnp_extension.dpnp_sum("dpnp_sum", [a.dtype.name, "NONE"])
+
+    get_sycl_queue = dpctl_functions.dpctl_get_current_queue()
+    allocate_usm_shared = dpctl_functions.dpctl_malloc_shared()
+    copy_usm = dpctl_functions.dpctl_queue_memcpy()
+    free_usm = dpctl_functions.dpctl_free_with_queue()
+
+    def dpnp_sum_impl(a):
+        if a.size == 0:
+            raise ValueError("Passed Empty array")
+
+        sycl_queue = get_sycl_queue()
+        a_usm = allocate_usm_shared(a.size * a.itemsize, sycl_queue)
+        copy_usm(sycl_queue, a_usm, a.ctypes, a.size * a.itemsize)
+
+        out_usm = allocate_usm_shared(a.itemsize, sycl_queue)
+
+        dpnp_sum(a_usm, out_usm, a.size)
+
+        out = np.empty(1, dtype=a.dtype)
+        copy_usm(sycl_queue, out.ctypes, out_usm, out.size * out.itemsize)
+
+        free_usm(a_usm, sycl_queue)
+        free_usm(out_usm, sycl_queue)
+
+        _dummy_liveness_func([out.size])
+
+        return out[0]
+
+    return dpnp_sum_impl
diff --git a/numba_dppy/dpnp_glue/stubs.py b/numba_dppy/dpnp_glue/stubs.py
new file mode 100644
index 0000000000..d51cd28ead
--- /dev/null
+++ b/numba_dppy/dpnp_glue/stubs.py
@@ -0,0 +1,9 @@
+from numba_dppy.ocl.stubs import Stub
+
+class dpnp(Stub):
+    """dpnp namespace
+    """
+    _description_ = ''
+
+    class sum(Stub):
+        pass
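Taken together, the pieces introduced so far compose as follows: the `dpnp` stub gives Numba a symbol to type, `dpnpimpl` overloads that stub with an implementation that shuttles data through USM shared memory, and `_DPCTL_FUNCTIONS` supplies the queue and memory primitives that implementation calls. A sketch of the intended end-to-end flow, assuming dpnp and a SYCL device are available (the renaming pass that makes plain `np.sum` reach this overload is registered in `dppy_passbuilder` just below):

    import numpy as np
    from numba import njit

    @njit(parallel={'offload': True})
    def total(a):
        return np.sum(a)  # rewritten to numba_dppy.dpnp.sum by the pass

    a = np.arange(1024, dtype=np.float64)
    res = total(a)  # dpnp_sum_impl runs: USM alloc, copy in, sum, copy out
    assert res == a.sum()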
diff --git a/numba_dppy/dppy_passbuilder.py b/numba_dppy/dppy_passbuilder.py
index 0a32a099cf..b3c632a85a 100644
--- a/numba_dppy/dppy_passbuilder.py
+++ b/numba_dppy/dppy_passbuilder.py
@@ -27,6 +27,8 @@
     DPPYNoPythonBackend
 )
 
+from .rename_numpy_functions_pass import DPPYRewriteOverloadedFunctions
+
 class DPPYPassBuilder(object):
     """
     This is the DPPY pass builder to run Intel GPU/CPU specific
@@ -44,6 +46,11 @@ def default_numba_nopython_pipeline(state, pm):
         pm.add_pass(IRProcessing, "processing IR")
         pm.add_pass(WithLifting, "Handle with contexts")
 
+        # this pass rewrites name of NumPy functions we intend to overload
+        pm.add_pass(DPPYRewriteOverloadedFunctions,
+                    "Rewrite name of Numpy functions to overload already overloaded function",
+                    )
+
         # this pass adds required logic to overload default implementation of
         # Numpy functions
         pm.add_pass(DPPYAddNumpyOverloadPass, "dppy add typing template for Numpy functions")
diff --git a/numba_dppy/dppy_passes.py b/numba_dppy/dppy_passes.py
index 0bb2eadb48..c73f5a7736 100644
--- a/numba_dppy/dppy_passes.py
+++ b/numba_dppy/dppy_passes.py
@@ -3,6 +3,7 @@
 import warnings
 
 import numpy as np
+import numba
 from numba.core import ir
 import weakref
 from collections import namedtuple, deque
@@ -49,7 +50,7 @@ def __init__(self):
     def run_pass(self, state):
         if dpnp_available():
             typingctx = state.typingctx
-            from numba.core.typing.templates import builtin_registry as reg, infer_global
+            from numba.core.typing.templates import (builtin_registry as reg, infer_global)
             from numba.core.typing.templates import (AbstractTemplate, CallableTemplate, signature)
             from numba.core.typing.npydecl import MatMulTyperMixin
 
diff --git a/numba_dppy/rename_numpy_functions_pass.py b/numba_dppy/rename_numpy_functions_pass.py
new file mode 100644
index 0000000000..a0c4b89b3e
--- /dev/null
+++ b/numba_dppy/rename_numpy_functions_pass.py
@@ -0,0 +1,125 @@
+from numba.core import ir
+from numba.core.compiler_machinery import FunctionPass, register_pass
+from numba.core.ir_utils import (
+    find_topo_order,
+    mk_unique_var,
+    remove_dead,
+    simplify_CFG,
+)
+import numba_dppy
+
+rewrite_function_name_map = {"sum": (["np"], "sum")}
+
+
+class RewriteNumPyOverloadedFunctions(object):
+    def __init__(self, state, rewrite_function_name_map=rewrite_function_name_map):
+        self.state = state
+        self.function_name_map = rewrite_function_name_map
+
+    def run(self):
+        """
+        This function rewrites the name of NumPy functions that exist in self.function_name_map
+        e.g np.sum(a) would produce the following:
+
+        np.sum() --> numba_dppy.dpnp.sum()
+
+        ---------------------------------------------------------------------------------------
+        Numba IR Before Rewrite:
+        ---------------------------------------------------------------------------------------
+
+        $2load_global.0 = global(np: ) ['$2load_global.0']
+        $4load_method.1 = getattr(value=$2load_global.0, attr=sum) ['$2load_global.0', '$4load_method.1']
+        $8call_method.3 = call $4load_method.1(a, func=$4load_method.1, args=[Var(a, test_rewrite.py:7)],
+                          kws=(), vararg=None) ['$4load_method.1', '$8call_method.3', 'a']
+
+        ---------------------------------------------------------------------------------------
+        Numba IR After Rewrite:
+        ---------------------------------------------------------------------------------------
+
+        $dppy_replaced_var.0 = global(numba_dppy: ) ['$dppy_replaced_var.0']
+        $dpnp_var.1 = getattr(value=$dppy_replaced_var.0, attr=dpnp) ['$dpnp_var.1', '$dppy_replaced_var.0']
+        $4load_method.1 = getattr(value=$dpnp_var.1, attr=sum) ['$4load_method.1', '$dpnp_var.1']
+        $8call_method.3 = call $4load_method.1(a, func=$4load_method.1, args=[Var(a, test_rewrite.py:7)],
+                          kws=(), vararg=None) ['$4load_method.1', '$8call_method.3', 'a']
+
+        ---------------------------------------------------------------------------------------
+        """
+        func_ir = self.state.func_ir
+        blocks = func_ir.blocks
+        topo_order = find_topo_order(blocks)
+
+        for label in topo_order:
+            block = blocks[label]
+            saved_arr_arg = {}
+            new_body = []
+            for stmt in block.body:
+                if isinstance(stmt, ir.Assign) and isinstance(stmt.value, ir.Expr):
+                    lhs = stmt.target.name
+                    rhs = stmt.value
+                    # replace np.FOO with name from self.function_name_map["FOO"]
+                    # e.g np.sum will be replaced with numba_dppy.dpnp.sum
+                    if rhs.op == "getattr" and rhs.attr in self.function_name_map:
+                        module_node = block.find_variable_assignment(
+                            rhs.value.name
+                        ).value
+                        if (
+                            isinstance(module_node, ir.Global)
+                            and module_node.name in self.function_name_map[rhs.attr][0]
+                        ) or (
+                            isinstance(module_node, ir.Expr)
+                            and module_node.attr in self.function_name_map[rhs.attr][0]
+                        ):
+                            rhs = stmt.value
+                            rhs.attr = self.function_name_map[rhs.attr][1]
+
+                            global_module = rhs.value
+                            saved_arr_arg[lhs] = global_module
+
+                            scope = global_module.scope
+                            loc = global_module.loc
+
+                            g_dppy_var = ir.Var(
+                                scope, mk_unique_var("$2load_global"), loc
+                            )
+                            # We are trying to rename np.function_name/np.linalg.function_name with
+                            # numba_dppy.dpnp.function_name.
+                            # Hence, we need to have a global variable representing module numba_dppy.
+                            # Next, we add attribute dpnp to global module numba_dppy to
+                            # represent numba_dppy.dpnp.
+                            g_dppy = ir.Global("numba_dppy", numba_dppy, loc)
+                            g_dppy_assign = ir.Assign(g_dppy, g_dppy_var, loc)
+
+                            dpnp_var = ir.Var(scope, mk_unique_var("$4load_attr"), loc)
+                            getattr_dpnp = ir.Expr.getattr(g_dppy_var, "dpnp", loc)
+                            dpnp_assign = ir.Assign(getattr_dpnp, dpnp_var, loc)
+
+                            rhs.value = dpnp_var
+                            new_body.append(g_dppy_assign)
+                            new_body.append(dpnp_assign)
+                            func_ir._definitions[dpnp_var.name] = [getattr_dpnp]
+                            func_ir._definitions[g_dppy_var.name] = [g_dppy]
+
+                new_body.append(stmt)
+            block.body = new_body
+
+
+@register_pass(mutates_CFG=True, analysis_only=False)
+class DPPYRewriteOverloadedFunctions(FunctionPass):
+    _name = "dppy_rewrite_overloaded_functions_pass"
+
+    def __init__(self):
+        FunctionPass.__init__(self)
+        import numba_dppy.dpnp_glue.dpnpdecl
+        import numba_dppy.dpnp_glue.dpnpimpl
+
+    def run_pass(self, state):
+        rewrite_function_name_pass = RewriteNumPyOverloadedFunctions(
+            state, rewrite_function_name_map
+        )
+
+        rewrite_function_name_pass.run()
+
+        remove_dead(state.func_ir.blocks, state.func_ir.arg_names, state.func_ir)
+        state.func_ir.blocks = simplify_CFG(state.func_ir.blocks)
+
+        return True
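The pass is driven entirely by `rewrite_function_name_map`, so supporting another function is, in principle, a map entry plus a matching stub and overload. A hypothetical extension (not part of this patch) might look like:

    # Hypothetical: offload np.prod as well; this additionally requires a
    # `prod` stub in dpnp_glue/stubs.py and an @overload in dpnpimpl.py.
    rewrite_function_name_map = {
        "sum": (["np"], "sum"),
        "prod": (["np"], "prod"),
    }

The unit test that follows checks the rewrite at the IR level, comparing against a function that calls `numba_dppy.dpnp.sum` directly.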
diff --git a/numba_dppy/tests/test_rename_numpy_function_pass.py b/numba_dppy/tests/test_rename_numpy_function_pass.py
new file mode 100644
index 0000000000..b06a03b5e0
--- /dev/null
+++ b/numba_dppy/tests/test_rename_numpy_function_pass.py
@@ -0,0 +1,67 @@
+#!/usr/bin/env python
+
+import unittest
+import numpy as np
+
+import numba
+from numba import njit, prange
+import numba_dppy, numba_dppy as dppy
+
+
+from numba.core import compiler
+from numba_dppy.rename_numpy_functions_pass import DPPYRewriteOverloadedFunctions
+
+
+class MyPipeline(object):
+    def __init__(self, test_ir):
+        self.state = compiler.StateDict()
+        self.state.func_ir = test_ir
+
+
+def check_equivalent(expected_ir, got_ir):
+    expected_block_body = expected_ir.blocks[0].body
+    got_block_body = got_ir.blocks[0].body
+
+    if len(expected_block_body) != len(got_block_body):
+        return False
+
+    for i in range(len(expected_block_body)):
+        expected_stmt = expected_block_body[i]
+        got_stmt = got_block_body[i]
+        if type(expected_stmt) != type(got_stmt):
+            return False
+        else:
+            if isinstance(expected_stmt, numba.core.ir.Assign):
+                if isinstance(expected_stmt.value, numba.core.ir.Global):
+                    if (expected_stmt.value.name != got_stmt.value.name and
+                            expected_stmt.value.name != "numba_dppy"):
+                        return False
+                elif isinstance(expected_stmt.value, numba.core.ir.Expr):
+                    # should get "dpnp" and "sum" as attr
+                    if expected_stmt.value.op == "getattr":
+                        if expected_stmt.value.attr != got_stmt.value.attr:
+                            return False
+    return True
+
+
+class TestRenameNumpyFunctionsPass(unittest.TestCase):
+    def test_rename(self):
+        def expected(a):
+            return numba_dppy.dpnp.sum(a)
+
+        def got(a):
+            return np.sum(a)
+
+        expected_ir = compiler.run_frontend(expected)
+        got_ir = compiler.run_frontend(got)
+
+        pipeline = MyPipeline(got_ir)
+
+        rewrite_numpy_functions_pass = DPPYRewriteOverloadedFunctions()
+        rewrite_numpy_functions_pass.run_pass(pipeline.state)
+
+        self.assertTrue(check_equivalent(expected_ir, pipeline.state.func_ir))
+
+
+if __name__ == "__main__":
+    unittest.main()

From d84fcb42d161c7480d0106adfce6654d315b5040 Mon Sep 17 00:00:00 2001
From: vlad-perevezentsev <68376232+vlad-perevezentsev@users.noreply.github.com>
Date: Fri, 11 Dec 2020 15:27:58 +0300
Subject: [PATCH 16/32] numba-dppy tests and examples use `with device_context` (#46)

* change the test_prange
* fix all tests and examples
* Replace DPPLTestCase with DPPYTestCase
* Fix typos
* Use explicit device selection in the examples
* Replace dppl with dppy
* Fix int64 to long long conversion on windows
* Fixed test_with_dppy_context_cpu

Co-authored-by: Pokhodenko
---
 numba_dppy/dppy_host_fn_call_gen.py           |  25 +--
 numba_dppy/examples/pa_examples/test1-2d.py   |  16 +-
 numba_dppy/examples/pa_examples/test1-3d.py   |  16 +-
 numba_dppy/examples/pa_examples/test1-4d.py   |  16 +-
 numba_dppy/examples/pa_examples/test1-5d.py   |  16 +-
 numba_dppy/examples/pa_examples/test1.py      |   8 +-
 numba_dppy/tests/test_dpnp_functions.py       | 113 +++++++-----
 numba_dppy/tests/test_dppl_fallback.py        |  17 +-
 .../test_numpy_bit_twiddling_functions.py     |  77 ++++----
 .../tests/test_numpy_comparison_functions.py  | 124 +++++++------
 .../tests/test_numpy_floating_functions.py    |  52 +++---
 numba_dppy/tests/test_numpy_math_functions.py | 164 ++++++++++++------
 .../test_numpy_trigonomteric_functions.py     | 123 +++++++------
 numba_dppy/tests/test_parfor_lower_message.py |   8 +-
 numba_dppy/tests/test_prange.py               |  56 +++---
 numba_dppy/tests/test_vectorize.py            |  14 +-
 numba_dppy/tests/test_with_context.py         |   2 -
 17 files changed, 522 insertions(+), 325 deletions(-)
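The mechanical change repeated across every file below is easiest to see side by side. Schematically, under the assumption that a GPU queue is available (function and array names here are illustrative):

    import dpctl
    import numpy as np
    from numba import njit

    # Before: offload requested per function through a parallel option.
    @njit(parallel={'offload': True})
    def f_old(a, b):
        return a + b

    # After: a plain @njit function, with the device chosen by the caller.
    @njit
    def f_new(a, b):
        return a + b

    a = b = np.ones(10, dtype=np.float32)
    with dpctl.device_context("opencl:gpu"):
        c = f_new(a, b)

This moves device selection out of the function definition and into the call site, which is why so many tests below also gain a `dpctl.has_gpu_queues()` skip guard.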
diff --git a/numba_dppy/dppy_host_fn_call_gen.py b/numba_dppy/dppy_host_fn_call_gen.py
index 7d1c9bcea4..2808ddf90d 100644
--- a/numba_dppy/dppy_host_fn_call_gen.py
+++ b/numba_dppy/dppy_host_fn_call_gen.py
@@ -52,7 +52,7 @@ def _init_llvm_types_and_constants(self):
         self.byte_ptr_t = lc.Type.pointer(self.byte_t)
         self.byte_ptr_ptr_t = lc.Type.pointer(self.byte_ptr_t)
         self.intp_t = self.context.get_value_type(types.intp)
-        self.long_t = self.context.get_value_type(types.int64)
+        self.int64_t = self.context.get_value_type(types.int64)
         self.int32_t = self.context.get_value_type(types.int32)
         self.int32_ptr_t = lc.Type.pointer(self.int32_t)
         self.uintp_t = self.context.get_value_type(types.uintp)
@@ -113,23 +113,26 @@ def allocate_kenrel_arg_array(self, num_kernel_args):
 
     def resolve_and_return_dpctl_type(self, ty):
+        """This function looks up the dpctl defined enum values from DPCTLKernelArgType.
+        """
+        val = None
         if ty == types.int32 or isinstance(ty, types.scalars.IntegerLiteral):
-            val = self.context.get_constant(types.int32, 4)
+            val = self.context.get_constant(types.int32, 9)  # DPCTL_LONG_LONG
         elif ty == types.uint32:
-            val = self.context.get_constant(types.int32, 5)
+            val = self.context.get_constant(types.int32, 10)  # DPCTL_UNSIGNED_LONG_LONG
        elif ty == types.boolean:
-            val = self.context.get_constant(types.int32, 5)
+            val = self.context.get_constant(types.int32, 5)  # DPCTL_UNSIGNED_INT
         elif ty == types.int64:
-            val = self.context.get_constant(types.int32, 7)
+            val = self.context.get_constant(types.int32, 9)  # DPCTL_LONG_LONG
         elif ty == types.uint64:
-            val = self.context.get_constant(types.int32, 8)
+            val = self.context.get_constant(types.int32, 11)  # DPCTL_SIZE_T
         elif ty == types.float32:
-            val = self.context.get_constant(types.int32, 12)
+            val = self.context.get_constant(types.int32, 12)  # DPCTL_FLOAT
         elif ty == types.float64:
-            val = self.context.get_constant(types.int32, 13)
+            val = self.context.get_constant(types.int32, 13)  # DPCTL_DOUBLE
         elif ty == types.voidptr:
-            val = self.context.get_constant(types.int32, 15)
+            val = self.context.get_constant(types.int32, 15)  # DPCTL_VOID_PTR
         else:
             raise NotImplementedError
 
@@ -151,12 +154,12 @@ def process_kernel_arg(self, var, llvm_arg, arg_type, gu_sig, val_type, index, m
             if llvm_arg is None:
                 raise NotImplementedError(arg_type, var)
 
-            storage = cgutils.alloca_once(self.builder, self.long_t)
+            storage = cgutils.alloca_once(self.builder, self.int64_t)
             self.builder.store(self.context.get_constant(types.int64, 0), storage)
             ty = self.resolve_and_return_dpctl_type(types.int64)
             self.form_kernel_arg_and_arg_ty(self.builder.bitcast(storage, self.void_ptr_t), ty)
 
-            storage = cgutils.alloca_once(self.builder, self.long_t)
+            storage = cgutils.alloca_once(self.builder, self.int64_t)
             self.builder.store(self.context.get_constant(types.int64, 0), storage)
             ty = self.resolve_and_return_dpctl_type(types.int64)
             self.form_kernel_arg_and_arg_ty(self.builder.bitcast(storage, self.void_ptr_t), ty)
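The literal constants above now mirror dpctl's DPCTLKernelArgType enum; note in particular that both int32 and int64 map to DPCTL_LONG_LONG (9), which is what fixes the int64-to-long-long conversion on Windows mentioned in the commit message. For reference, the mapping used by the diff, written out as a plain table (values copied from the comments above; the dict name is illustrative, the generated code embeds the values directly):

    # numba type -> DPCTLKernelArgType value
    DPCTL_ARG_TYPES = {
        "int32": 9,     # DPCTL_LONG_LONG
        "uint32": 10,   # DPCTL_UNSIGNED_LONG_LONG
        "boolean": 5,   # DPCTL_UNSIGNED_INT
        "int64": 9,     # DPCTL_LONG_LONG
        "uint64": 11,   # DPCTL_SIZE_T
        "float32": 12,  # DPCTL_FLOAT
        "float64": 13,  # DPCTL_DOUBLE
        "voidptr": 15,  # DPCTL_VOID_PTR
    }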
diff --git a/numba_dppy/examples/pa_examples/test1-2d.py b/numba_dppy/examples/pa_examples/test1-2d.py
index 7985216aba..df3849b30d 100644
--- a/numba_dppy/examples/pa_examples/test1-2d.py
+++ b/numba_dppy/examples/pa_examples/test1-2d.py
@@ -1,23 +1,29 @@
 from numba import njit, gdb
 import numpy as np
+import dpctl
 
-@njit(parallel={'offload':True})
+
+@njit
 def f1(a, b):
     c = a + b
     return c
 
+
 N = 1000
 print("N", N)
 
-a = np.ones((N,N), dtype=np.float32)
-b = np.ones((N,N), dtype=np.float32)
+a = np.ones((N, N), dtype=np.float32)
+b = np.ones((N, N), dtype=np.float32)
 
 print("a:", a, hex(a.ctypes.data))
 print("b:", b, hex(b.ctypes.data))
-c = f1(a,b)
+
+with dpctl.device_context("opencl:gpu:0"):
+    c = f1(a, b)
+
 print("BIG RESULT c:", c, hex(c.ctypes.data))
 for i in range(N):
     for j in range(N):
-        if c[i,j] != 2.0:
+        if c[i, j] != 2.0:
             print("First index not equal to 2.0 was", i, j)
             break
diff --git a/numba_dppy/examples/pa_examples/test1-3d.py b/numba_dppy/examples/pa_examples/test1-3d.py
index 1304c0762a..a69aa0cbc5 100644
--- a/numba_dppy/examples/pa_examples/test1-3d.py
+++ b/numba_dppy/examples/pa_examples/test1-3d.py
@@ -1,24 +1,30 @@
 from numba import njit, gdb
 import numpy as np
+import dpctl
 
-@njit(parallel={'offload':True})
+
+@njit
 def f1(a, b):
     c = a + b
     return c
 
+
 N = 10
 print("N", N)
 
-a = np.ones((N,N,N), dtype=np.float32)
-b = np.ones((N,N,N), dtype=np.float32)
+a = np.ones((N, N, N), dtype=np.float32)
+b = np.ones((N, N, N), dtype=np.float32)
 
 print("a:", a, hex(a.ctypes.data))
 print("b:", b, hex(b.ctypes.data))
-c = f1(a,b)
+
+with dpctl.device_context("opencl:gpu:0"):
+    c = f1(a, b)
+
 print("BIG RESULT c:", c, hex(c.ctypes.data))
 for i in range(N):
     for j in range(N):
         for k in range(N):
-            if c[i,j,k] != 2.0:
+            if c[i, j, k] != 2.0:
                 print("First index not equal to 2.0 was", i, j, k)
                 break
diff --git a/numba_dppy/examples/pa_examples/test1-4d.py b/numba_dppy/examples/pa_examples/test1-4d.py
index bb52da28de..2647d0e66e 100644
--- a/numba_dppy/examples/pa_examples/test1-4d.py
+++ b/numba_dppy/examples/pa_examples/test1-4d.py
@@ -1,25 +1,31 @@
 from numba import njit, gdb
 import numpy as np
+import dpctl
 
-@njit(parallel={'offload':True})
+
+@njit
 def f1(a, b):
     c = a + b
     return c
 
+
 N = 10
 print("N", N)
 
-a = np.ones((N,N,N,N), dtype=np.float32)
-b = np.ones((N,N,N,N), dtype=np.float32)
+a = np.ones((N, N, N, N), dtype=np.float32)
+b = np.ones((N, N, N, N), dtype=np.float32)
 
 print("a:", a, hex(a.ctypes.data))
 print("b:", b, hex(b.ctypes.data))
-c = f1(a,b)
+
+with dpctl.device_context("opencl:gpu:0"):
+    c = f1(a, b)
+
 print("BIG RESULT c:", c, hex(c.ctypes.data))
 for i in range(N):
     for j in range(N):
         for k in range(N):
             for l in range(N):
-                if c[i,j,k,l] != 2.0:
+                if c[i, j, k, l] != 2.0:
                     print("First index not equal to 2.0 was", i, j, k, l)
                     break
diff --git a/numba_dppy/examples/pa_examples/test1-5d.py b/numba_dppy/examples/pa_examples/test1-5d.py
index e795dbe602..893fe3b6a6 100644
--- a/numba_dppy/examples/pa_examples/test1-5d.py
+++ b/numba_dppy/examples/pa_examples/test1-5d.py
@@ -1,26 +1,32 @@
 from numba import njit, gdb
 import numpy as np
+import dpctl
 
-@njit(parallel={'offload':True})
+
+@njit
 def f1(a, b):
     c = a + b
     return c
 
+
 N = 5
 print("N", N)
 
-a = np.ones((N,N,N,N,N), dtype=np.float32)
-b = np.ones((N,N,N,N,N), dtype=np.float32)
+a = np.ones((N, N, N, N, N), dtype=np.float32)
+b = np.ones((N, N, N, N, N), dtype=np.float32)
 
 print("a:", a, hex(a.ctypes.data))
 print("b:", b, hex(b.ctypes.data))
-c = f1(a,b)
+
+with dpctl.device_context("opencl:gpu:0"):
+    c = f1(a, b)
+
 print("BIG RESULT c:", c, hex(c.ctypes.data))
 for i in range(N):
     for j in range(N):
         for k in range(N):
             for l in range(N):
                 for m in range(N):
-                    if c[i,j,k,l,m] != 2.0:
+                    if c[i, j, k, l, m] != 2.0:
                         print("First index not equal to 2.0 was", i, j, k, l, m)
                         break
diff --git a/numba_dppy/examples/pa_examples/test1.py b/numba_dppy/examples/pa_examples/test1.py
index 1620654cf8..01209b3309 100644
--- a/numba_dppy/examples/pa_examples/test1.py
+++ b/numba_dppy/examples/pa_examples/test1.py
@@ -1,8 +1,9 @@
 from numba import njit
 import numpy as np
+import dpctl
 
 
-@njit(parallel={'offload':True})
+@njit
 def f1(a, b):
     c = a + b
     return c
@@ -19,7 +20,10 @@ def main():
 
     print("a:", a, hex(a.ctypes.data))
     print("b:", b, hex(b.ctypes.data))
-    c = f1(a,b)
+
+    with dpctl.device_context("opencl:gpu:0"):
+        c = f1(a, b)
+
     print("RESULT c:", c, hex(c.ctypes.data))
     for i in range(N):
         if c[i] != 2.0:
diff --git a/numba_dppy/tests/test_dpnp_functions.py b/numba_dppy/tests/test_dpnp_functions.py
index b0837f5ba6..c4749885ba 100644
--- a/numba_dppy/tests/test_dpnp_functions.py
+++ b/numba_dppy/tests/test_dpnp_functions.py
@@ -5,7 +5,9 @@
 import sys
 import numpy as np
 from numba import njit
-import numba_dppy, numba_dppy as dppy
+import numba_dppy
+import numba_dppy as dppy
+import dpctl
 from numba_dppy.testing import unittest
 from numba_dppy.testing import DPPYTestCase
 
@@ -14,10 +16,14 @@ def test_for_different_datatypes(fn, test_fn, dims, arg_count, tys, np_all=False
     if arg_count == 1:
         for ty in tys:
             if matrix and matrix[0]:
-                a = np.array(np.random.random(dims[0] * dims[1]), dtype=ty).reshape(dims[0], dims[1])
+                a = np.array(np.random.random(
+                    dims[0] * dims[1]), dtype=ty).reshape(dims[0], dims[1])
             else:
                 a = np.array(np.random.random(dims[0]), dtype=ty)
-            c = fn(a)
+
+            with dpctl.device_context("opencl:gpu"):
+                c = fn(a)
+
             d = test_fn(a)
             if np_all:
                 max_abs_err = np.all(c - d)
@@ -29,15 +35,19 @@
     elif arg_count == 2:
         for ty in tys:
             if matrix and matrix[0]:
-                a = np.array(np.random.random(dims[0] * dims[1]), dtype=ty).reshape(dims[0], dims[1])
+                a = np.array(np.random.random(
+                    dims[0] * dims[1]), dtype=ty).reshape(dims[0], dims[1])
             else:
                 a = np.array(np.random.random(dims[0] * dims[1]), dtype=ty)
             if matrix and matrix[1]:
-                b = np.array(np.random.random(dims[2] * dims[3]), dtype=ty).reshape(dims[2], dims[3])
+                b = np.array(np.random.random(
+                    dims[2] * dims[3]), dtype=ty).reshape(dims[2], dims[3])
             else:
                 b = np.array(np.random.random(dims[2] * dims[3]), dtype=ty)
 
-            c = fn(a, b)
+            with dpctl.device_context("opencl:gpu"):
+                c = fn(a, b)
+
             d = test_fn(a, b)
             if np_all:
                 max_abs_err = np.sum(c - d)
@@ -48,6 +58,7 @@
 
     return True
 
+
 def test_for_dimensions(fn, test_fn, dims, tys, np_all=False):
     total_size = 1
     for d in dims:
@@ -55,7 +66,10 @@
 
     for ty in tys:
         a = np.array(np.random.random(total_size), dtype=ty).reshape(dims)
-        c = fn(a)
+
+        with dpctl.device_context("opencl:gpu"):
+            c = fn(a)
+
         d = test_fn(a)
         if np_all:
             max_abs_err = np.all(c - d)
@@ -66,6 +80,7 @@
 
     return True
 
+
 def ensure_dpnp():
     try:
         # import dpnp
@@ -75,8 +90,9 @@
         return False
 
 
-@unittest.skipUnless(ensure_dpnp(), 'test only when dpNP is available')
+@unittest.skipUnless(ensure_dpnp() and dpctl.has_gpu_queues(), 'test only when dpNP and GPU is available')
 class Testdpnp_functions(DPPYTestCase):
+
     N = 10
 
     a = np.array(np.random.random(N), dtype=np.float32)
@@ -84,123 +100,140 @@ class Testdpnp_functions(DPPYTestCase):
     tys = [np.int32, np.uint32, np.int64, np.uint64, np.float, np.double]
 
     def test_sum(self):
-        @njit(parallel={'offload':True})
+        @njit
        def f(a):
             c = np.sum(a)
             return c
 
-        self.assertTrue(test_for_different_datatypes(f, np.sum, [10], 1, self.tys))
+        self.assertTrue(test_for_different_datatypes(
+            f, np.sum, [10], 1, self.tys))
         self.assertTrue(test_for_dimensions(f, np.sum, [10, 2], self.tys))
         self.assertTrue(test_for_dimensions(f, np.sum, [10, 2, 3], self.tys))
 
     def test_prod(self):
-        @njit(parallel={'offload':True})
+        @njit
         def f(a):
             c = np.prod(a)
             return c
 
-        self.assertTrue(test_for_different_datatypes(f, np.prod, [10], 1, self.tys))
+        self.assertTrue(test_for_different_datatypes(
+            f, np.prod, [10], 1, self.tys))
         self.assertTrue(test_for_dimensions(f, np.prod, [10, 2], self.tys))
         self.assertTrue(test_for_dimensions(f, np.prod, [10, 2, 3], self.tys))
 
     def test_argmax(self):
-        @njit(parallel={'offload':True})
+        @njit
         def f(a):
             c = np.argmax(a)
             return c
 
-        self.assertTrue(test_for_different_datatypes(f, np.argmax, [10], 1, self.tys))
+        self.assertTrue(test_for_different_datatypes(
+            f, np.argmax, [10], 1, self.tys))
         self.assertTrue(test_for_dimensions(f, np.argmax, [10, 2], self.tys))
-        self.assertTrue(test_for_dimensions(f, np.argmax, [10, 2, 3], self.tys))
+        self.assertTrue(test_for_dimensions(
+            f, np.argmax, [10, 2, 3], self.tys))
 
     def test_max(self):
-        @njit(parallel={'offload':True})
+        @njit
         def f(a):
             c = np.max(a)
             return c
 
-        self.assertTrue(test_for_different_datatypes(f, np.max, [10], 1, self.tys))
+        self.assertTrue(test_for_different_datatypes(
+            f, np.max, [10], 1, self.tys))
         self.assertTrue(test_for_dimensions(f, np.max, [10, 2], self.tys))
         self.assertTrue(test_for_dimensions(f, np.max, [10, 2, 3], self.tys))
 
     def test_argmin(self):
-        @njit(parallel={'offload':True})
+        @njit
         def f(a):
             c = np.argmin(a)
             return c
 
-        self.assertTrue(test_for_different_datatypes(f, np.argmin, [10], 1, self.tys))
+        self.assertTrue(test_for_different_datatypes(
+            f, np.argmin, [10], 1, self.tys))
         self.assertTrue(test_for_dimensions(f, np.argmin, [10, 2], self.tys))
-        self.assertTrue(test_for_dimensions(f, np.argmin, [10, 2, 3], self.tys))
+        self.assertTrue(test_for_dimensions(
+            f, np.argmin, [10, 2, 3], self.tys))
 
     def test_min(self):
-        @njit(parallel={'offload':True})
+        @njit
         def f(a):
             c = np.min(a)
             return c
 
-        self.assertTrue(test_for_different_datatypes(f, np.min, [10], 1, self.tys))
+        self.assertTrue(test_for_different_datatypes(
+            f, np.min, [10], 1, self.tys))
         self.assertTrue(test_for_dimensions(f, np.min, [10, 2], self.tys))
         self.assertTrue(test_for_dimensions(f, np.min, [10, 2, 3], self.tys))
 
     def test_argsort(self):
-        @njit(parallel={'offload':True})
+        @njit
         def f(a):
             c = np.argsort(a)
             return c
 
-        self.assertTrue(test_for_different_datatypes(f, np.argmin, [10], 1, self.tys, np_all=True))
+        self.assertTrue(test_for_different_datatypes(
+            f, np.argmin, [10], 1, self.tys, np_all=True))
 
     def test_median(self):
-        @njit(parallel={'offload':True})
+        @njit
         def f(a):
             c = np.median(a)
             return c
 
-        self.assertTrue(test_for_different_datatypes(f, np.median, [10], 1, self.tys))
+        self.assertTrue(test_for_different_datatypes(
+            f, np.median, [10], 1, self.tys))
         self.assertTrue(test_for_dimensions(f, np.median, [10, 2], self.tys))
-        self.assertTrue(test_for_dimensions(f, np.median, [10, 2, 3], self.tys))
+        self.assertTrue(test_for_dimensions(
+            f, np.median, [10, 2, 3], self.tys))
 
     def test_mean(self):
-        @njit(parallel={'offload':True})
+        @njit
         def f(a):
             c = np.mean(a)
             return c
 
-        self.assertTrue(test_for_different_datatypes(f, np.mean, [10], 1, self.tys))
+        self.assertTrue(test_for_different_datatypes(
+            f, np.mean, [10], 1, self.tys))
         self.assertTrue(test_for_dimensions(f, np.mean, [10, 2], self.tys))
         self.assertTrue(test_for_dimensions(f, np.mean, [10, 2, 3], self.tys))
 
     def test_matmul(self):
-        @njit(parallel={'offload':True})
+        @njit
         def f(a, b):
             c = np.matmul(a, b)
             return c
 
-        self.assertTrue(test_for_different_datatypes(f, np.matmul, [10, 5, 5, 10], 2, [np.float, np.double], np_all=True, matrix=[True, True]))
+        self.assertTrue(test_for_different_datatypes(f, np.matmul, [10, 5, 5, 10], 2, [
+                        np.float, np.double], np_all=True, matrix=[True, True]))
 
     def test_dot(self):
-        @njit(parallel={'offload':True})
+        @njit
         def f(a, b):
             c = np.dot(a, b)
             return c
 
-        self.assertTrue(test_for_different_datatypes(f, np.dot, [10, 1, 10, 1], 2, [np.float, np.double]))
-        self.assertTrue(test_for_different_datatypes(f, np.dot, [10, 1, 10, 2], 2, [np.float, np.double], matrix=[False, True], np_all=True))
-        self.assertTrue(test_for_different_datatypes(f, np.dot, [2, 10, 10, 1], 2, [np.float, np.double], matrix=[True, False], np_all=True))
-        self.assertTrue(test_for_different_datatypes(f, np.dot, [10, 2, 2, 10], 2, [np.float, np.double], matrix=[True, True], np_all=True))
-
+        self.assertTrue(test_for_different_datatypes(
+            f, np.dot, [10, 1, 10, 1], 2, [np.float, np.double]))
+        self.assertTrue(test_for_different_datatypes(f, np.dot, [10, 1, 10, 2], 2, [
+                        np.float, np.double], matrix=[False, True], np_all=True))
+        self.assertTrue(test_for_different_datatypes(f, np.dot, [2, 10, 10, 1], 2, [
+                        np.float, np.double], matrix=[True, False], np_all=True))
+        self.assertTrue(test_for_different_datatypes(f, np.dot, [10, 2, 2, 10], 2, [
+                        np.float, np.double], matrix=[True, True], np_all=True))
 
     def test_cov(self):
-        @njit(parallel={'offload':True})
+        @njit
         def f(a):
             c = np.cov(a)
             return c
 
-        self.assertTrue(test_for_different_datatypes(f, np.cov, [10, 7], 1, self.tys, matrix=[True], np_all=True))
+        self.assertTrue(test_for_different_datatypes(
+            f, np.cov, [10, 7], 1, self.tys, matrix=[True], np_all=True))
 
     def test_dpnp_interacting_with_parfor(self):
-        @njit(parallel={'offload':True})
+        @njit
         def f(a, b):
             c = np.sum(a)
             e = np.add(b, a)
diff --git a/numba_dppy/tests/test_dppl_fallback.py b/numba_dppy/tests/test_dppl_fallback.py
index 8519f4fb14..76792f5744 100644
--- a/numba_dppy/tests/test_dppl_fallback.py
+++ b/numba_dppy/tests/test_dppl_fallback.py
@@ -3,7 +3,8 @@
 import numpy as np
 
 import numba
-import numba_dppy, numba_dppy as dppy
+import numba_dppy
+import numba_dppy as dppy
 from numba_dppy.testing import unittest
 from numba_dppy.testing import DPPYTestCase
@@ -28,28 +29,30 @@ def inner_call_fallback():
 
             return a
 
-        with captured_stderr() as msg:
-            dppy = numba.njit(parallel={'offload':True})(inner_call_fallback)
+        with captured_stderr() as msg, dpctl.device_context("opencl:gpu"):
+            dppy = numba.njit(inner_call_fallback)
             dppy_result = dppy()
 
         ref_result = inner_call_fallback()
 
         np.testing.assert_array_equal(dppy_result, ref_result)
-        self.assertTrue('Failed to lower parfor on DPPY-device' in msg.getvalue())
+        self.assertTrue(
+            'Failed to lower parfor on DPPY-device' in msg.getvalue())
 
     def test_dppy_fallback_reductions(self):
         def reduction(a):
             return np.amax(a)
 
         a = np.ones(10)
-        with captured_stderr() as msg:
-            dppy = numba.njit(parallel={'offload':True})(reduction)
+        with captured_stderr() as msg, dpctl.device_context("opencl:gpu"):
+            dppy = numba.njit(reduction)
             dppy_result = dppy(a)
 
         ref_result = reduction(a)
 
         np.testing.assert_array_equal(dppy_result, ref_result)
-        self.assertTrue('Failed to lower parfor on DPPY-device' in msg.getvalue())
+        self.assertTrue(
+            'Failed to lower parfor on DPPY-device' in msg.getvalue())
 
 
 if __name__ == '__main__':
diff --git a/numba_dppy/tests/test_numpy_bit_twiddling_functions.py b/numba_dppy/tests/test_numpy_bit_twiddling_functions.py
index de6b7bc963..8d022a0bb1 100644
--- a/numba_dppy/tests/test_numpy_bit_twiddling_functions.py
+++ b/numba_dppy/tests/test_numpy_bit_twiddling_functions.py
@@ -5,104 +5,115 @@
 import sys
 import numpy as np
 from numba import njit
-import numba_dppy, numba_dppy as dppy
+import numba_dppy
+import numba_dppy as dppy
+import dpctl
 from numba_dppy.testing import unittest
 from numba_dppy.testing import DPPYTestCase
 
 
+@unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system')
 class TestNumpy_bit_twiddling_functions(DPPYTestCase):
     def test_bitwise_and(self):
-        @njit(parallel={'offload':True})
+        @njit
         def f(a, b):
             c = np.bitwise_and(a, b)
             return c
 
-        a = np.array([2,5,255])
-        b = np.array([3,14,16])
+        a = np.array([2, 5, 255])
+        b = np.array([3, 14, 16])
+
+        with dpctl.device_context("opencl:gpu"):
+            c = f(a, b)
 
-        c = f(a, b)
         d = np.bitwise_and(a, b)
         self.assertTrue(np.all(c == d))
 
-
     def test_bitwise_or(self):
-        @njit(parallel={'offload':True})
+        @njit
         def f(a, b):
             c = np.bitwise_or(a, b)
             return c
 
-        a = np.array([2,5,255])
-        b = np.array([4,4,4])
+        a = np.array([2, 5, 255])
+        b = np.array([4, 4, 4])
+
+        with dpctl.device_context("opencl:gpu"):
+            c = f(a, b)
 
-        c = f(a, b)
         d = np.bitwise_or(a, b)
         self.assertTrue(np.all(c == d))
 
-
     def test_bitwise_xor(self):
-        @njit(parallel={'offload':True})
+        @njit
         def f(a, b):
             c = np.bitwise_xor(a, b)
             return c
 
-        a = np.array([2,5,255])
-        b = np.array([4,4,4])
+        a = np.array([2, 5, 255])
+        b = np.array([4, 4, 4])
+
+        with dpctl.device_context("opencl:gpu"):
+            c = f(a, b)
 
-        c = f(a, b)
         d = np.bitwise_xor(a, b)
         self.assertTrue(np.all(c == d))
 
-
     def test_bitwise_not(self):
-        @njit(parallel={'offload':True})
+        @njit
         def f(a):
             c = np.bitwise_not(a)
             return c
 
-        a = np.array([2,5,255])
+        a = np.array([2, 5, 255])
+
+        with dpctl.device_context("opencl:gpu"):
+            c = f(a)
 
-        c = f(a)
         d = np.bitwise_not(a)
         self.assertTrue(np.all(c == d))
 
-
     def test_invert(self):
-        @njit(parallel={'offload':True})
+        @njit
         def f(a):
             c = np.invert(a)
             return c
 
-        a = np.array([2,5,255])
+        a = np.array([2, 5, 255])
+
+        with dpctl.device_context("opencl:gpu"):
+            c = f(a)
 
-        c = f(a)
         d = np.invert(a)
         self.assertTrue(np.all(c == d))
 
-
     def test_left_shift(self):
-        @njit(parallel={'offload':True})
+        @njit
         def f(a, b):
             c = np.left_shift(a, b)
             return c
 
-        a = np.array([2,3,4])
-        b = np.array([1,2,3])
+        a = np.array([2, 3, 4])
+        b = np.array([1, 2, 3])
+
+        with dpctl.device_context("opencl:gpu"):
+            c = f(a, b)
 
-        c = f(a, b)
         d = np.left_shift(a, b)
         self.assertTrue(np.all(c == d))
 
-
     def test_right_shift(self):
-        @njit(parallel={'offload':True})
+        @njit
         def f(a, b):
             c = np.right_shift(a, b)
             return c
 
-        a = np.array([2,3,4])
-        b = np.array([1,2,3])
+        a = np.array([2, 3, 4])
+        b = np.array([1, 2, 3])
+
+        with dpctl.device_context("opencl:gpu"):
+            c = f(a, b)
 
-        c = f(a, b)
         d = np.right_shift(a, b)
         self.assertTrue(np.all(c == d))
diff --git a/numba_dppy/tests/test_numpy_comparison_functions.py b/numba_dppy/tests/test_numpy_comparison_functions.py
index 5daf1fc813..53a8eed890 100644
--- a/numba_dppy/tests/test_numpy_comparison_functions.py
+++ b/numba_dppy/tests/test_numpy_comparison_functions.py
@@ -5,81 +5,92 @@
 import sys
 import numpy as np
 from numba import njit
-import numba_dppy, numba_dppy as dppy
+import numba_dppy
+import numba_dppy as dppy
+import dpctl
 from numba_dppy.testing import unittest
 from numba_dppy.testing import DPPYTestCase
 
+
+@unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system')
 class TestNumpy_comparison_functions(DPPYTestCase):
-    a = np.array([4,5,6])
-    b = np.array([2,6,6])
+    a = np.array([4, 5, 6])
+    b = np.array([2, 6, 6])
+
     def test_greater(self):
-        @njit(parallel={'offload':True})
+        @njit
         def f(a, b):
             c = np.greater(a, b)
             return c
 
-        c = f(self.a, self.b)
+        with dpctl.device_context("opencl:gpu"):
+            c = f(self.a, self.b)
+
         d = np.greater(self.a, self.b)
         self.assertTrue(np.all(c == d))
 
-
     def test_greater_equal(self):
-        @njit(parallel={'offload':True})
+        @njit
         def f(a, b):
             c = np.greater_equal(a, b)
             return c
 
-        c = f(self.a, self.b)
+        with dpctl.device_context("opencl:gpu"):
+            c = f(self.a, self.b)
+
         d = np.greater_equal(self.a, self.b)
         self.assertTrue(np.all(c == d))
 
-
     def test_less(self):
-        @njit(parallel={'offload':True})
+        @njit
         def f(a, b):
             c = np.less(a, b)
             return c
 
-        c = f(self.a, self.b)
+        with dpctl.device_context("opencl:gpu"):
+            c = f(self.a, self.b)
+
         d = np.less(self.a, self.b)
         self.assertTrue(np.all(c == d))
 
-
     def test_less_equal(self):
-        @njit(parallel={'offload':True})
+        @njit
         def f(a, b):
             c = np.less_equal(a, b)
             return c
 
-        c = f(self.a, self.b)
+        with dpctl.device_context("opencl:gpu"):
+            c = f(self.a, self.b)
+
         d = np.less_equal(self.a, self.b)
         self.assertTrue(np.all(c == d))
 
-
     def test_not_equal(self):
-        @njit(parallel={'offload':True})
+        @njit
         def f(a, b):
             c = np.not_equal(a, b)
             return c
 
-        c = f(self.a, self.b)
+        with dpctl.device_context("opencl:gpu"):
+            c = f(self.a, self.b)
+
         d = np.not_equal(self.a, self.b)
         self.assertTrue(np.all(c == d))
 
-
     def test_equal(self):
-        @njit(parallel={'offload':True})
+        @njit
         def f(a, b):
             c = np.equal(a, b)
             return c
 
-        c = f(self.a, self.b)
+        with dpctl.device_context("opencl:gpu"):
            c = f(self.a, self.b)
+
         d = np.equal(self.a, self.b)
         self.assertTrue(np.all(c == d))
 
-
     def test_logical_and(self):
-        @njit(parallel={'offload':True})
+        @njit
         def f(a, b):
             c = np.logical_and(a, b)
             return c
@@ -87,13 +98,14 @@ def f(a, b):
         a = np.array([True, True, False])
         b = np.array([True, False, False])
 
-        c = f(a, b)
+        with dpctl.device_context("opencl:gpu"):
+            c = f(a, b)
+
         d = np.logical_and(a, b)
         self.assertTrue(np.all(c == d))
 
-
     def test_logical_or(self):
-        @njit(parallel={'offload':True})
+        @njit
         def f(a, b):
             c = np.logical_or(a, b)
             return c
@@ -101,13 +113,14 @@ def f(a, b):
         a = np.array([True, True, False])
         b = np.array([True, False, False])
 
-        c = f(a, b)
+        with dpctl.device_context("opencl:gpu"):
+            c = f(a, b)
+
         d = np.logical_or(a, b)
         self.assertTrue(np.all(c == d))
 
-
     def test_logical_xor(self):
-        @njit(parallel={'offload':True})
+        @njit
         def f(a, b):
             c = np.logical_xor(a, b)
             return c
@@ -115,76 +128,83 @@ def f(a, b):
         a = np.array([True, True, False])
         b = np.array([True, False, False])
 
-        c = f(a, b)
+        with dpctl.device_context("opencl:gpu"):
+            c = f(a, b)
+
         d = np.logical_xor(a, b)
         self.assertTrue(np.all(c == d))
 
-
     def test_logical_not(self):
-        @njit(parallel={'offload':True})
+        @njit
         def f(a):
             c = np.logical_not(a)
             return c
 
         a = np.array([True, True, False])
 
-        c = f(a)
+        with dpctl.device_context("opencl:gpu"):
+            c = f(a)
+
         d = np.logical_not(a)
         self.assertTrue(np.all(c == d))
 
-
     def test_maximum(self):
-        @njit(parallel={'offload':True})
+        @njit
         def f(a, b):
             c = np.maximum(a, b)
             return c
 
-        a = np.array([5,6,7,np.nan], dtype=np.float32)
-        b = np.array([5,7,6,100], dtype=np.float32)
+        a = np.array([5, 6, 7, np.nan], dtype=np.float32)
+        b = np.array([5, 7, 6, 100], dtype=np.float32)
+
+        with dpctl.device_context("opencl:gpu"):
+            c = f(a, b)
 
-        c = f(a, b)
         d = np.maximum(a, b)
         np.testing.assert_equal(c, d)
 
-
     def test_minimum(self):
-        @njit(parallel={'offload':True})
+        @njit
         def f(a, b):
             c = np.minimum(a, b)
             return c
 
-        a = np.array([5,6,7,np.nan], dtype=np.float32)
-        b = np.array([5,7,6,100], dtype=np.float32)
+        a = np.array([5, 6, 7, np.nan], dtype=np.float32)
+        b = np.array([5, 7, 6, 100], dtype=np.float32)
+
+        with dpctl.device_context("opencl:gpu"):
+            c = f(a, b)
 
-        c = f(a, b)
         d = np.minimum(a, b)
         np.testing.assert_equal(c, d)
 
-
     def test_fmax(self):
-        @njit(parallel={'offload':True})
+        @njit
         def f(a, b):
             c = np.fmax(a, b)
             return c
 
-        a = np.array([5,6,7,np.nan], dtype=np.float32)
-        b = np.array([5,7,6,100], dtype=np.float32)
+        a = np.array([5, 6, 7, np.nan], dtype=np.float32)
+        b = np.array([5, 7, 6, 100], dtype=np.float32)
+
+        with dpctl.device_context("opencl:gpu"):
+            c = f(a, b)
 
-        c = f(a, b)
         d = np.fmax(a, b)
         np.testing.assert_equal(c, d)
 
-
     def test_fmin(self):
-        @njit(parallel={'offload':True})
+        @njit
         def f(a, b):
             c = np.fmin(a, b)
             return c
 
-        a = np.array([5,6,7,np.nan], dtype=np.float32)
-        b = np.array([5,7,6,100], dtype=np.float32)
+        a = np.array([5, 6, 7, np.nan], dtype=np.float32)
+        b = np.array([5, 7, 6, 100], dtype=np.float32)
+
+        with dpctl.device_context("opencl:gpu"):
+            c = f(a, b)
 
-        c = f(a, b)
         d = np.fmin(a, b)
         np.testing.assert_equal(c, d)
diff --git a/numba_dppy/tests/test_numpy_floating_functions.py b/numba_dppy/tests/test_numpy_floating_functions.py
index c05c10498d..fb7c1b98e8 100644
--- a/numba_dppy/tests/test_numpy_floating_functions.py
+++ b/numba_dppy/tests/test_numpy_floating_functions.py
@@ -4,92 +4,102 @@
 import sys
 import numpy as np
 from numba import njit
-import numba_dppy, numba_dppy as dppy
+import numba_dppy
+import numba_dppy as dppy
+import dpctl
 from numba_dppy.testing import unittest
 from numba_dppy.testing import DPPYTestCase
 
 
+@unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system')
 class TestNumpy_floating_functions(DPPYTestCase):
     def test_isfinite(self):
-        @njit(parallel={'offload':True})
+        @njit
         def f(a):
             c = np.isfinite(a)
             return c
 
-        test_arr = [np.log(-1.),1.,np.log(0)]
+        test_arr = [np.log(-1.), 1., np.log(0)]
         input_arr = np.asarray(test_arr, dtype=np.float32)
 
-        c = f(input_arr)
+        with dpctl.device_context("opencl:gpu"):
+            c = f(input_arr)
+
         d = np.isfinite(input_arr)
         self.assertTrue(np.all(c == d))
 
-
     def test_isinf(self):
-        @njit(parallel={'offload':True})
+        @njit
         def f(a):
             c = np.isinf(a)
             return c
 
-        test_arr = [np.log(-1.),1.,np.log(0)]
+        test_arr = [np.log(-1.), 1., np.log(0)]
         input_arr = np.asarray(test_arr, dtype=np.float32)
 
-        c = f(input_arr)
+        with dpctl.device_context("opencl:gpu"):
+            c = f(input_arr)
+
         d = np.isinf(input_arr)
         self.assertTrue(np.all(c == d))
 
     def test_isnan(self):
-        @njit(parallel={'offload':True})
+        @njit
         def f(a):
             c = np.isnan(a)
             return c
 
-        test_arr = [np.log(-1.),1.,np.log(0)]
+        test_arr = [np.log(-1.), 1., np.log(0)]
         input_arr = np.asarray(test_arr, dtype=np.float32)
 
-        c = f(input_arr)
+        with dpctl.device_context("opencl:gpu"):
+            c = f(input_arr)
+
         d = np.isnan(input_arr)
         self.assertTrue(np.all(c == d))
 
-
     def test_floor(self):
-        @njit(parallel={'offload':True})
+        @njit
         def f(a):
             c = np.floor(a)
             return c
 
         input_arr = np.array([-1.7, -1.5, -0.2, 0.2, 1.5, 1.7, 2.0])
 
-        c = f(input_arr)
+        with dpctl.device_context("opencl:gpu"):
+            c = f(input_arr)
+
         d = np.floor(input_arr)
         self.assertTrue(np.all(c == d))
 
-
     def test_ceil(self):
-        @njit(parallel={'offload':True})
+        @njit
         def f(a):
             c = np.ceil(a)
             return c
 
         input_arr = np.array([-1.7, -1.5, -0.2, 0.2, 1.5, 1.7, 2.0])
 
-        c = f(input_arr)
+        with dpctl.device_context("opencl:gpu"):
+            c = f(input_arr)
+
         d = np.ceil(input_arr)
         self.assertTrue(np.all(c == d))
 
-
     def test_trunc(self):
-        @njit(parallel={'offload':True})
+        @njit
         def f(a):
             c = np.trunc(a)
             return c
 
         input_arr = np.array([-1.7, -1.5, -0.2, 0.2, 1.5, 1.7, 2.0])
 
-        c = f(input_arr)
+        with dpctl.device_context("opencl:gpu"):
+            c = f(input_arr)
+
         d = np.trunc(input_arr)
         self.assertTrue(np.all(c == d))
 
-
 if __name__ == '__main__':
     unittest.main()
diff --git a/numba_dppy/tests/test_numpy_math_functions.py b/numba_dppy/tests/test_numpy_math_functions.py
index 155b352c7e..7af014d4d8 100644
--- a/numba_dppy/tests/test_numpy_math_functions.py
+++ b/numba_dppy/tests/test_numpy_math_functions.py
@@ -5,79 +5,95 @@
 import sys
 import numpy as np
 from numba import njit
-import numba_dppy, numba_dppy as dppy
+import numba_dppy
+import numba_dppy as dppy
+import dpctl
 from numba_dppy.testing import unittest
 from numba_dppy.testing import DPPYTestCase
 
 
+@unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system')
 class TestNumpy_math_functions(DPPYTestCase):
+
     N = 10
     a = np.array(np.random.random(N), dtype=np.float32)
     b = np.array(np.random.random(N), dtype=np.float32)
 
     def test_add(self):
-        @njit(parallel={'offload':True})
+        @njit
         def f(a, b):
             c = np.add(a, b)
             return c
 
-        c = f(self.a, self.b)
+        with dpctl.device_context("opencl:gpu"):
+            c = f(self.a, self.b)
+
         d = self.a + self.b
         self.assertTrue(np.all(c == d))
 
     def test_subtract(self):
-        @njit(parallel={'offload':True})
+        @njit
         def f(a, b):
             c = np.subtract(a, b)
             return c
 
-        c = f(self.a, self.b)
+        with dpctl.device_context("opencl:gpu"):
+            c = f(self.a, self.b)
+
         d = self.a - self.b
         self.assertTrue(np.all(c == d))
 
     def test_multiply(self):
-        @njit(parallel={'offload':True})
+        @njit
         def f(a, b):
             c = np.multiply(a, b)
             return c
 
-        c = f(self.a, self.b)
+        with dpctl.device_context("opencl:gpu"):
+            c = f(self.a, self.b)
+
         d = self.a * self.b
         self.assertTrue(np.all(c == d))
 
     def test_divide(self):
-        @njit(parallel={'offload':True})
+        @njit
         def f(a, b):
             c = np.divide(a, b)
             return c
 
-        c = f(self.a, self.b)
+        with dpctl.device_context("opencl:gpu"):
+            c = f(self.a, self.b)
+
         d = self.a / self.b
         max_abs_err = c.sum() - d.sum()
         self.assertTrue(max_abs_err < 1e-2)
 
     def test_true_divide(self):
-        @njit(parallel={'offload':True})
+        @njit
         def f(a, b):
             c = np.true_divide(a, b)
             return c
 
-        c = f(self.a, self.b)
+        with dpctl.device_context("opencl:gpu"):
+            c = f(self.a, self.b)
+
         d = np.true_divide(self.a, self.b)
         max_abs_err = c.sum() - d.sum()
         self.assertTrue(max_abs_err < 1e-2)
 
     def test_negative(self):
-        @njit(parallel={'offload':True})
+        @njit
         def f(a):
             c = np.negative(a)
             return c
 
-        c = f(self.a)
+        with dpctl.device_context("opencl:gpu"):
+            c = f(self.a)
+
         self.assertTrue(np.all(c == -self.a))
 
     def test_power(self):
-        @njit(parallel={'offload':True})
+        @njit
         def f(a, b):
             c = np.power(a, b)
             return c
@@ -85,11 +101,13 @@ def f(a, b):
         input_arr = np.random.randint(self.N, size=(self.N))
         exp = np.full((self.N), 2, dtype=np.int)
 
-        c = f(input_arr, exp)
+        with dpctl.device_context("opencl:gpu"):
+            c = f(input_arr, exp)
+
         self.assertTrue(np.all(c == input_arr * input_arr))
 
     def test_remainder(self):
-        @njit(parallel={'offload':True})
+        @njit
         def f(a, b):
             c = np.remainder(a, b)
             return c
@@ -97,11 +115,13 @@ def f(a, b):
         input_arr = np.full((self.N), 3, dtype=np.int)
         divisor = np.full((self.N), 2, dtype=np.int)
 
-        c = f(input_arr, divisor)
+        with dpctl.device_context("opencl:gpu"):
+            c = f(input_arr, divisor)
+
         self.assertTrue(np.all(c == 1))
 
     def test_mod(self):
-        @njit(parallel={'offload':True})
+        @njit
         def f(a, b):
             c = np.mod(a, b)
             return c
@@ -109,11 +129,13 @@ def f(a, b):
         input_arr = np.full((self.N), 3, dtype=np.int)
         divisor = np.full((self.N), 2, dtype=np.int)
 
-        c = f(input_arr, divisor)
+        with dpctl.device_context("opencl:gpu"):
+            c = f(input_arr, divisor)
+
         self.assertTrue(np.all(c == 1))
 
     def test_fmod(self):
-
@njit(parallel={'offload':True}) + @njit def f(a, b): c = np.fmod(a, b) return c @@ -121,173 +143,201 @@ def f(a, b): input_arr = np.full((self.N), 3, dtype=np.float32) divisor = np.full((self.N), 2, dtype=np.int) - c = f(input_arr, divisor) + with dpctl.device_context("opencl:gpu"): + c = f(input_arr, divisor) + self.assertTrue(np.all(c == 1.)) def test_abs(self): - @njit(parallel={'offload':True}) + @njit def f(a): c = np.abs(a) return c input_arr = 5 * np.random.random_sample(self.N) - 5 - c = f(input_arr) + with dpctl.device_context("opencl:gpu"): + c = f(input_arr) + self.assertTrue(np.all(c == -input_arr)) def test_absolute(self): - @njit(parallel={'offload':True}) + @njit def f(a): c = np.absolute(a) return c input_arr = 5 * np.random.random_sample(self.N) - 5 - c = f(input_arr) - self.assertTrue(np.all(c == -input_arr)) + with dpctl.device_context("opencl:gpu"): + c = f(input_arr) + self.assertTrue(np.all(c == -input_arr)) def test_fabs(self): - @njit(parallel={'offload':True}) + @njit def f(a): c = np.fabs(a) return c input_arr = 5 * np.random.random_sample(self.N) - 5 - c = f(input_arr) - self.assertTrue(np.all(c == -input_arr)) + with dpctl.device_context("opencl:gpu"): + c = f(input_arr) + self.assertTrue(np.all(c == -input_arr)) def test_sign(self): - @njit(parallel={'offload':True}) + @njit def f(a): c = np.sign(a) return c input_arr = 5 * np.random.random_sample(self.N) - 5 - c = f(input_arr) + with dpctl.device_context("opencl:gpu"): + c = f(input_arr) + self.assertTrue(np.all(c == -1.)) def test_conj(self): - @njit(parallel={'offload':True}) + @njit def f(a): c = np.conj(a) return c input_arr = np.eye(self.N) + 1j * np.eye(self.N) - c = f(input_arr) + with dpctl.device_context("opencl:gpu"): + c = f(input_arr) + d = np.conj(input_arr) self.assertTrue(np.all(c == d)) def test_exp(self): - @njit(parallel={'offload':True}) + @njit def f(a): c = np.exp(a) return c input_arr = np.random.randint(self.N, size=(self.N)) - c = f(input_arr) + + with dpctl.device_context("opencl:gpu"): + c = f(input_arr) + d = np.exp(input_arr) self.assertTrue(np.all(c == d)) - def test_log(self): - @njit(parallel={'offload':True}) + @njit def f(a): c = np.log(a) return c input_arr = np.random.randint(1, self.N, size=(self.N)) - c = f(input_arr) + + with dpctl.device_context("opencl:gpu"): + c = f(input_arr) + d = np.log(input_arr) max_abs_err = c.sum() - d.sum() self.assertTrue(max_abs_err < 1e-5) - def test_log10(self): - @njit(parallel={'offload':True}) + @njit def f(a): c = np.log10(a) return c input_arr = np.random.randint(1, self.N, size=(self.N)) - c = f(input_arr) + + with dpctl.device_context("opencl:gpu"): + c = f(input_arr) + d = np.log10(input_arr) max_abs_err = c.sum() - d.sum() self.assertTrue(max_abs_err < 1e-5) - def test_expm1(self): - @njit(parallel={'offload':True}) + @njit def f(a): c = np.expm1(a) return c input_arr = np.random.randint(1, self.N, size=(self.N)) - c = f(input_arr) + + with dpctl.device_context("opencl:gpu"): + c = f(input_arr) + d = np.expm1(input_arr) max_abs_err = c.sum() - d.sum() self.assertTrue(max_abs_err < 1e-5) - def test_log1p(self): - @njit(parallel={'offload':True}) + @njit def f(a): c = np.log1p(a) return c input_arr = np.random.randint(1, self.N, size=(self.N)) - c = f(input_arr) + + with dpctl.device_context("opencl:gpu"): + c = f(input_arr) + d = np.log1p(input_arr) max_abs_err = c.sum() - d.sum() self.assertTrue(max_abs_err < 1e-5) def test_sqrt(self): - @njit(parallel={'offload':True}) + @njit def f(a): c = np.sqrt(a) return c - c = f(self.a) + 
with dpctl.device_context("opencl:gpu"): + c = f(self.a) + d = np.sqrt(self.a) max_abs_err = c.sum() - d.sum() self.assertTrue(max_abs_err < 1e-5) - def test_square(self): - @njit(parallel={'offload':True}) + @njit def f(a): c = np.square(a) return c input_arr = np.random.randint(self.N, size=(self.N)) - c = f(input_arr) + with dpctl.device_context("opencl:gpu"): + c = f(input_arr) + self.assertTrue(np.all(c == input_arr * input_arr)) def test_reciprocal(self): - @njit(parallel={'offload':True}) + @njit def f(a): c = np.reciprocal(a) return c - input_arr = 5 * np.random.random_sample(self.N) + 5 + input_arr = 5 * np.random.random_sample(self.N) + 5 + + with dpctl.device_context("opencl:gpu"): + c = f(input_arr) - c = f(input_arr) self.assertTrue(np.all(c == 1/input_arr)) def test_conjugate(self): - @njit(parallel={'offload':True}) + @njit def f(a): c = np.conjugate(a) return c input_arr = np.eye(self.N) + 1j * np.eye(self.N) - c = f(input_arr) + with dpctl.device_context("opencl:gpu"): + c = f(input_arr) + d = np.conj(input_arr) self.assertTrue(np.all(c == d)) diff --git a/numba_dppy/tests/test_numpy_trigonomteric_functions.py b/numba_dppy/tests/test_numpy_trigonomteric_functions.py index 7ce18b870a..361273cdee 100644 --- a/numba_dppy/tests/test_numpy_trigonomteric_functions.py +++ b/numba_dppy/tests/test_numpy_trigonomteric_functions.py @@ -5,214 +5,239 @@ import sys import numpy as np from numba import njit -import numba_dppy, numba_dppy as dppy +import numba_dppy +import numba_dppy as dppy +import dpctl from numba_dppy.testing import unittest from numba_dppy.testing import DPPYTestCase +@unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system') class TestNumpy_math_functions(DPPYTestCase): + N = 10 a = np.array(np.random.random(N), dtype=np.float32) b = np.array(np.random.random(N), dtype=np.float32) def test_sin(self): - @njit(parallel={'offload':True}) + @njit def f(a): c = np.sin(a) return c - c = f(self.a) + with dpctl.device_context("opencl:gpu"): + c = f(self.a) + d = np.sin(self.a) max_abs_err = c.sum() - d.sum() self.assertTrue(max_abs_err < 1e-5) - def test_cos(self): - @njit(parallel={'offload':True}) + @njit def f(a): c = np.cos(a) return c - c = f(self.a) + with dpctl.device_context("opencl:gpu"): + c = f(self.a) + d = np.cos(self.a) max_abs_err = c.sum() - d.sum() self.assertTrue(max_abs_err < 1e-5) - def test_tan(self): - @njit(parallel={'offload':True}) + @njit def f(a): c = np.tan(a) return c - c = f(self.a) + with dpctl.device_context("opencl:gpu"): + c = f(self.a) + d = np.tan(self.a) max_abs_err = c.sum() - d.sum() self.assertTrue(max_abs_err < 1e-5) - def test_arcsin(self): - @njit(parallel={'offload':True}) + @njit def f(a): c = np.arcsin(a) return c - c = f(self.a) + with dpctl.device_context("opencl:gpu"): + c = f(self.a) + d = np.arcsin(self.a) max_abs_err = c.sum() - d.sum() self.assertTrue(max_abs_err < 1e-5) - def test_arccos(self): - @njit(parallel={'offload':True}) + @njit def f(a): c = np.arccos(a) return c - c = f(self.a) + with dpctl.device_context("opencl:gpu"): + c = f(self.a) + d = np.arccos(self.a) max_abs_err = c.sum() - d.sum() self.assertTrue(max_abs_err < 1e-5) - def test_arctan(self): - @njit(parallel={'offload':True}) + @njit def f(a): c = np.arctan(a) return c - c = f(self.a) + with dpctl.device_context("opencl:gpu"): + c = f(self.a) + d = np.arctan(self.a) max_abs_err = c.sum() - d.sum() self.assertTrue(max_abs_err < 1e-5) - def test_arctan2(self): - @njit(parallel={'offload':True}) + @njit def f(a, b): c = np.arctan2(a, b) return 
c - c = f(self.a, self.b) + with dpctl.device_context("opencl:gpu"): + c = f(self.a, self.b) + d = np.arctan2(self.a, self.b) max_abs_err = c.sum() - d.sum() self.assertTrue(max_abs_err < 1e-5) - def test_sinh(self): - @njit(parallel={'offload':True}) + @njit def f(a): c = np.sinh(a) return c - c = f(self.a) + with dpctl.device_context("opencl:gpu"): + c = f(self.a) + d = np.sinh(self.a) max_abs_err = c.sum() - d.sum() self.assertTrue(max_abs_err < 1e-5) - def test_cosh(self): - @njit(parallel={'offload':True}) + @njit def f(a): c = np.cosh(a) return c - c = f(self.a) + with dpctl.device_context("opencl:gpu"): + c = f(self.a) + d = np.cosh(self.a) max_abs_err = c.sum() - d.sum() self.assertTrue(max_abs_err < 1e-5) - def test_tanh(self): - @njit(parallel={'offload':True}) + @njit def f(a): c = np.tanh(a) return c - c = f(self.a) + with dpctl.device_context("opencl:gpu"): + c = f(self.a) + d = np.tanh(self.a) max_abs_err = c.sum() - d.sum() self.assertTrue(max_abs_err < 1e-5) - def test_arcsinh(self): - @njit(parallel={'offload':True}) + @njit def f(a): c = np.arcsinh(a) return c - c = f(self.a) + with dpctl.device_context("opencl:gpu"): + c = f(self.a) + d = np.arcsinh(self.a) max_abs_err = c.sum() - d.sum() self.assertTrue(max_abs_err < 1e-5) - def test_arccosh(self): - @njit(parallel={'offload':True}) + @njit def f(a): c = np.arccosh(a) return c input_arr = np.random.randint(1, self.N, size=(self.N)) - c = f(input_arr) + + with dpctl.device_context("opencl:gpu"): + c = f(input_arr) + d = np.arccosh(input_arr) max_abs_err = c.sum() - d.sum() self.assertTrue(max_abs_err < 1e-5) - def test_arctanh(self): - @njit(parallel={'offload':True}) + @njit def f(a): c = np.arctanh(a) return c - c = f(self.a) + with dpctl.device_context("opencl:gpu"): + c = f(self.a) + d = np.arctanh(self.a) max_abs_err = c.sum() - d.sum() self.assertTrue(max_abs_err < 1e-5) - def test_deg2rad(self): - @njit(parallel={'offload':True}) + @njit def f(a): c = np.deg2rad(a) return c - c = f(self.a) + with dpctl.device_context("opencl:gpu"): + c = f(self.a) + d = np.deg2rad(self.a) max_abs_err = c.sum() - d.sum() self.assertTrue(max_abs_err < 1e-5) - def test_rad2deg(self): - @njit(parallel={'offload':True}) + @njit def f(a): c = np.rad2deg(a) return c - c = f(self.a) + with dpctl.device_context("opencl:gpu"): + c = f(self.a) + d = np.rad2deg(self.a) max_abs_err = c.sum() - d.sum() self.assertTrue(max_abs_err < 1e-2) def test_degrees(self): - @njit(parallel={'offload':True}) + @njit def f(a): c = np.degrees(a) return c - c = f(self.a) + with dpctl.device_context("opencl:gpu"): + c = f(self.a) + d = np.degrees(self.a) max_abs_err = c.sum() - d.sum() self.assertTrue(max_abs_err < 1e-2) def test_radians(self): - @njit(parallel={'offload':True}) + @njit def f(a): c = np.radians(a) return c - c = f(self.a) + with dpctl.device_context("opencl:gpu"): + c = f(self.a) + d = np.radians(self.a) max_abs_err = c.sum() - d.sum() self.assertTrue(max_abs_err < 1e-5) diff --git a/numba_dppy/tests/test_parfor_lower_message.py b/numba_dppy/tests/test_parfor_lower_message.py index 591fd2cb0e..9f4660e01f 100644 --- a/numba_dppy/tests/test_parfor_lower_message.py +++ b/numba_dppy/tests/test_parfor_lower_message.py @@ -1,8 +1,10 @@ import numpy as np import numba from numba import njit, prange -import numba_dppy, numba_dppy as dppy -from numba_dppy.testing import unittest, DPPYTestCase +import numba_dppy +import numba_dppy as dppy +from numba_dppy.testing import unittest +from numba_dppy.testing import DPPYTestCase from numba.tests.support import 
captured_stdout import dpctl @@ -23,7 +25,7 @@ class TestParforMessage(DPPYTestCase): def test_parfor_message(self): with dpctl.device_context("opencl:gpu") as gpu_queue: numba_dppy.compiler.DEBUG = 1 - jitted = njit(parallel={"offload": True})(prange_example) + jitted = njit(prange_example) with captured_stdout() as got: jitted() diff --git a/numba_dppy/tests/test_prange.py b/numba_dppy/tests/test_prange.py index f4c13c4b1f..3a8948d716 100644 --- a/numba_dppy/tests/test_prange.py +++ b/numba_dppy/tests/test_prange.py @@ -5,16 +5,19 @@ import sys import numpy as np import numba +import dpctl from numba import njit, prange -import numba_dppy, numba_dppy as dppy +import numba_dppy +import numba_dppy as dppy from numba_dppy.testing import unittest, expectedFailureIf from numba_dppy.testing import DPPYTestCase from numba.tests.support import captured_stdout +@unittest.skipUnless(dpctl.has_gpu_queues(), "test only on GPU system") class TestPrange(DPPYTestCase): def test_one_prange(self): - @njit(parallel={'offload':True}) + @njit def f(a, b): for i in prange(4): b[i, 0] = a[i, 0] * 10 @@ -24,14 +27,14 @@ def f(a, b): a = np.ones((m, n)) b = np.ones((m, n)) - f(a, b) + with dpctl.device_context("opencl:gpu"): + f(a, b) for i in range(4): self.assertTrue(b[i, 0] == a[i, 0] * 10) - def test_nested_prange(self): - @njit(parallel={'offload':True}) + @njit def f(a, b): # dimensions must be provided as scalar m, n = a.shape @@ -44,12 +47,13 @@ def f(a, b): a = np.ones((m, n)) b = np.ones((m, n)) - f(a, b) - self.assertTrue(np.all(b == 10)) + with dpctl.device_context("opencl:gpu"): + f(a, b) + self.assertTrue(np.all(b == 10)) def test_multiple_prange(self): - @njit(parallel={'offload':True}) + @njit def f(a, b): # dimensions must be provided as scalar m, n = a.shape @@ -58,7 +62,6 @@ def f(a, b): for j in prange(n): b[i, j] = a[i, j] * val - for i in prange(m): for j in prange(n): a[i, j] = a[i, j] * 10 @@ -68,13 +71,14 @@ def f(a, b): a = np.ones((m, n)) b = np.ones((m, n)) - f(a, b) + with dpctl.device_context("opencl:gpu"): + f(a, b) + self.assertTrue(np.all(b == 10)) self.assertTrue(np.all(a == 10)) - def test_three_prange(self): - @njit(parallel={'offload':True}) + @njit def f(a, b): # dimensions must be provided as scalar m, n, o = a.shape @@ -91,9 +95,10 @@ def f(a, b): a = np.ones((m, n, o)) b = np.ones((m, n, o)) - f(a, b) - self.assertTrue(np.all(b == 12)) + with dpctl.device_context("opencl:gpu"): + f(a, b) + self.assertTrue(np.all(b == 12)) @expectedFailureIf(sys.platform.startswith('win')) def test_two_consequent_prange(self): @@ -110,19 +115,21 @@ def prange_example(): old_debug = numba_dppy.compiler.DEBUG numba_dppy.compiler.DEBUG = 1 - jitted = njit(parallel={'offload':True})(prange_example) - with captured_stdout() as stdout: + jitted = njit(prange_example) + + with captured_stdout() as stdout, dpctl.device_context("opencl:gpu"): jitted_res = jitted() res = prange_example() numba_dppy.compiler.DEBUG = old_debug - self.assertEqual(stdout.getvalue().count('Parfor lowered on DPPY-device'), 2, stdout.getvalue()) - self.assertEqual(stdout.getvalue().count('Failed to lower parfor on DPPY-device'), 0, stdout.getvalue()) + self.assertEqual(stdout.getvalue().count( + 'Parfor lowered on DPPY-device'), 2, stdout.getvalue()) + self.assertEqual(stdout.getvalue().count( + 'Failed to lower parfor on DPPY-device'), 0, stdout.getvalue()) np.testing.assert_equal(res, jitted_res) - @unittest.skip('NRT required but not enabled') def test_2d_arrays(self): def prange_example(): @@ -138,16 +145,19 @@ def 
prange_example(): old_debug = numba_dppy.compiler.DEBUG numba_dppy.compiler.DEBUG = 1 - jitted = njit(parallel={'offload':True})(prange_example) - with captured_stdout() as stdout: + jitted = njit(prange_example) + + with captured_stdout() as stdout, dpctl.device_context("opencl:gpu"): jitted_res = jitted() res = prange_example() numba_dppy.compiler.DEBUG = old_debug - self.assertEqual(stdout.getvalue().count('Parfor lowered on DPPY-device'), 2, stdout.getvalue()) - self.assertEqual(stdout.getvalue().count('Failed to lower parfor on DPPY-device'), 0, stdout.getvalue()) + self.assertEqual(stdout.getvalue().count( + 'Parfor lowered on DPPY-device'), 2, stdout.getvalue()) + self.assertEqual(stdout.getvalue().count( + 'Failed to lower parfor on DPPY-device'), 0, stdout.getvalue()) np.testing.assert_equal(res, jitted_res) diff --git a/numba_dppy/tests/test_vectorize.py b/numba_dppy/tests/test_vectorize.py index 04891ca296..2fed0fc65f 100644 --- a/numba_dppy/tests/test_vectorize.py +++ b/numba_dppy/tests/test_vectorize.py @@ -5,11 +5,14 @@ import sys import numpy as np from numba import njit, vectorize -import numba_dppy, numba_dppy as dppy +import numba_dppy +import numba_dppy as dppy +import dpctl from numba_dppy.testing import unittest from numba_dppy.testing import DPPYTestCase +@unittest.skipUnless(dpctl.has_gpu_queues(), "test only on GPU system") class TestVectorize(DPPYTestCase): def test_vectorize(self): @@ -17,9 +20,9 @@ def test_vectorize(self): def axy(a, x, y): return a * x + y - @njit(parallel={'offload':True}) + @njit def f(a0, a1): - return np.cos(axy(a0, np.sin(a1) - 1., 1.) ) + return np.cos(axy(a0, np.sin(a1) - 1., 1.)) def f_np(a0, a1): sin_res = np.sin(a1) @@ -28,11 +31,12 @@ def f_np(a0, a1): res.append(axy(a0[i], sin_res[i] - 1., 1.)) return np.cos(np.array(res)) - A = np.random.random(10) B = np.random.random(10) - expected = f(A, B) + with dpctl.device_context("opencl:gpu"): + expected = f(A, B) + actual = f_np(A, B) max_abs_err = expected.sum() - actual.sum() diff --git a/numba_dppy/tests/test_with_context.py b/numba_dppy/tests/test_with_context.py index e025a77784..1f733829b6 100644 --- a/numba_dppy/tests/test_with_context.py +++ b/numba_dppy/tests/test_with_context.py @@ -12,7 +12,6 @@ class TestWithDPPYContext(DPPYTestCase): @unittest.skipIf(not dpctl.has_gpu_queues(), "No GPU platforms available") - @expectedFailureIf(sys.platform.startswith('win')) def test_with_dppy_context_gpu(self): @njit @@ -39,7 +38,6 @@ def func(b): self.assertTrue('Parfor lowered on DPPY-device' in got_gpu_message.getvalue()) @unittest.skipIf(not dpctl.has_cpu_queues(), "No CPU platforms available") - @unittest.expectedFailure def test_with_dppy_context_cpu(self): @njit From 2e81acbfc06dd0c1e0e3ef0de226bf4b040f7ac5 Mon Sep 17 00:00:00 2001 From: vlad-perevezentsev <68376232+vlad-perevezentsev@users.noreply.github.com> Date: Fri, 11 Dec 2020 16:41:56 +0300 Subject: [PATCH 17/32] Improve tests code (#57) * change the test_prange * fix all tests and examples * Replace DPPLTestCase with DPPYTestCase * Fix typos * To update all the tests in numba_dppy/tests/dppl * Use explicit device selection in the examples * Replace dppl with dppy * Fix int64 to long long conversion on windows * Fixed test_with_dppy_context_cpu * Remove unused imports in tests Co-authored-by: Pokhodenko --- numba_dppy/testing.py | 26 ------------------- numba_dppy/tests/test_arg_accessor.py | 9 +++---- numba_dppy/tests/test_arg_types.py | 9 +++---- numba_dppy/tests/test_atomic_op.py | 7 ++--- numba_dppy/tests/test_barrier.py | 
5 +--- numba_dppy/tests/test_black_scholes.py | 7 ++--- numba_dppy/tests/test_caching.py | 8 ++---- numba_dppy/tests/test_device_array_args.py | 10 +++---- numba_dppy/tests/test_dpctl_api.py | 9 ++----- numba_dppy/tests/test_dpnp_functions.py | 8 +++--- numba_dppy/tests/test_dppl_fallback.py | 11 ++------ numba_dppy/tests/test_dppl_func.py | 7 ++--- numba_dppy/tests/test_math_functions.py | 11 +++----- .../test_numpy_bit_twiddling_functions.py | 11 ++------ .../tests/test_numpy_comparison_functions.py | 11 ++------ .../tests/test_numpy_floating_functions.py | 11 ++------ numba_dppy/tests/test_numpy_math_functions.py | 12 ++------- .../test_numpy_trigonomteric_functions.py | 11 ++------ numba_dppy/tests/test_parfor_lower_message.py | 5 ++-- numba_dppy/tests/test_prange.py | 10 +++---- numba_dppy/tests/test_print.py | 9 ++----- .../tests/test_rename_numpy_function_pass.py | 6 ----- numba_dppy/tests/test_sum_reduction.py | 9 ++----- numba_dppy/tests/test_vectorize.py | 11 ++------ numba_dppy/tests/test_with_context.py | 8 +++--- 25 files changed, 52 insertions(+), 189 deletions(-) diff --git a/numba_dppy/testing.py b/numba_dppy/testing.py index e309b7f0c9..89d012f72f 100644 --- a/numba_dppy/testing.py +++ b/numba_dppy/testing.py @@ -1,5 +1,3 @@ -from __future__ import print_function, absolute_import, division - import contextlib import sys @@ -7,33 +5,9 @@ import unittest from numba.tests.support import ( captured_stdout, - SerialMixin, redirect_c_stdout, ) -class DPPYTestCase(SerialMixin, unittest.TestCase): - def setUp(self): - #init() - #TODO - pass - def tearDown(self): - #reset() - #TODO - pass - -class DPPYTextCapture(object): - def __init__(self, stream): - self._stream = stream - - def getvalue(self): - return self._stream.read() - -class PythonTextCapture(object): - def __init__(self, stream): - self._stream = stream - - def getvalue(self): - return self._stream.getvalue() @contextlib.contextmanager def captured_dppy_stdout(): diff --git a/numba_dppy/tests/test_arg_accessor.py b/numba_dppy/tests/test_arg_accessor.py index 3de2d31770..494f269c59 100644 --- a/numba_dppy/tests/test_arg_accessor.py +++ b/numba_dppy/tests/test_arg_accessor.py @@ -1,10 +1,7 @@ -from __future__ import print_function, division, absolute_import - import numpy as np import numba_dppy, numba_dppy as dppy -from numba_dppy.testing import unittest -from numba_dppy.testing import DPPYTestCase +import unittest import dpctl @@ -33,7 +30,7 @@ def call_kernel(global_size, local_size, @unittest.skipUnless(dpctl.has_cpu_queues(), 'test only on CPU system') -class TestDPPYArgAccessorCPU(DPPYTestCase): +class TestDPPYArgAccessorCPU(unittest.TestCase): def test_arg_with_accessor(self): C = np.ones_like(A) with dpctl.device_context("opencl:cpu") as cpu_queue: @@ -50,7 +47,7 @@ def test_arg_without_accessor(self): @unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system') -class TestDPPYArgAccessorGPU(DPPYTestCase): +class TestDPPYArgAccessorGPU(unittest.TestCase): def test_arg_with_accessor(self): C = np.ones_like(A) with dpctl.device_context("opencl:gpu") as gpu_queue: diff --git a/numba_dppy/tests/test_arg_types.py b/numba_dppy/tests/test_arg_types.py index 7b06ef11f8..ed55e12e16 100644 --- a/numba_dppy/tests/test_arg_types.py +++ b/numba_dppy/tests/test_arg_types.py @@ -1,10 +1,7 @@ -from __future__ import print_function, division, absolute_import - import numpy as np import numba_dppy, numba_dppy as dppy -from numba_dppy.testing import unittest -from numba_dppy.testing import DPPYTestCase +import unittest 
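
Taken together, these hunks apply one mechanical conversion: the `DPPYTestCase`/`numba_dppy.testing` shims give way to plain `unittest.TestCase`, while device selection stays explicit at the call site (a carry-over from the earlier removal of `parallel={'offload': True}`). A minimal sketch of the module shape the converted tests share; the class, test, and array names here are illustrative, not taken from the patch:

```python
import unittest

import numpy as np
import dpctl
from numba import njit


@unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system')
class TestOffloadPattern(unittest.TestCase):
    def test_add(self):
        @njit
        def f(a, b):
            return a + b

        a = np.ones(10)
        b = np.ones(10)
        # Select the device explicitly instead of @njit(parallel={'offload': True}).
        with dpctl.device_context("opencl:gpu"):
            c = f(a, b)

        self.assertTrue(np.all(c == a + b))


if __name__ == '__main__':
    unittest.main()
```
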
import dpctl @@ -24,7 +21,7 @@ def call_mul_device_kernel(global_size, A, B, test): @unittest.skipUnless(dpctl.has_cpu_queues(), 'test only on CPU system') -class TestDPPYArrayArgCPU(DPPYTestCase): +class TestDPPYArrayArgCPU(unittest.TestCase): def test_integer_arg(self): x = np.int32(2) with dpctl.device_context("opencl:cpu") as cpu_queue: @@ -59,7 +56,7 @@ def check_bool_kernel(A, test): @unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system') -class TestDPPYArrayArgGPU(DPPYTestCase): +class TestDPPYArrayArgGPU(unittest.TestCase): def test_integer_arg(self): x = np.int32(2) with dpctl.device_context("opencl:gpu") as gpu_queue: diff --git a/numba_dppy/tests/test_atomic_op.py b/numba_dppy/tests/test_atomic_op.py index 9d8e88def1..27a810ba08 100644 --- a/numba_dppy/tests/test_atomic_op.py +++ b/numba_dppy/tests/test_atomic_op.py @@ -1,11 +1,8 @@ -from __future__ import print_function, division, absolute_import - import numpy as np import numba import numba_dppy, numba_dppy as dppy -from numba_dppy.testing import unittest -from numba_dppy.testing import DPPYTestCase +import unittest import dpctl def atomic_add_int32(ary): @@ -125,7 +122,7 @@ def call_fn_for_datatypes(fn, result, input, global_size): @unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system') @unittest.skipUnless(numba_dppy.ocl.atomic_support_present(), 'test only when atomic support is present') -class TestAtomicOp(DPPYTestCase): +class TestAtomicOp(unittest.TestCase): def test_atomic_add_global(self): @dppy.kernel def atomic_add(B): diff --git a/numba_dppy/tests/test_barrier.py b/numba_dppy/tests/test_barrier.py index 3657672240..7cedc18f13 100644 --- a/numba_dppy/tests/test_barrier.py +++ b/numba_dppy/tests/test_barrier.py @@ -1,9 +1,6 @@ -from __future__ import print_function, division, absolute_import - import numpy as np -from numba_dppy.testing import unittest -from numba_dppy.testing import DPPYTestCase +import unittest from numba import float32 import numba_dppy, numba_dppy as dppy import dpctl diff --git a/numba_dppy/tests/test_black_scholes.py b/numba_dppy/tests/test_black_scholes.py index 312536d33a..7baecbeda5 100644 --- a/numba_dppy/tests/test_black_scholes.py +++ b/numba_dppy/tests/test_black_scholes.py @@ -1,12 +1,9 @@ -from __future__ import print_function, division, absolute_import - import numpy as np import math import time import numba_dppy, numba_dppy as dppy -from numba_dppy.testing import unittest -from numba_dppy.testing import DPPYTestCase +import unittest import dpctl @@ -49,7 +46,7 @@ def randfloat(rand_var, low, high): @unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system') -class TestDPPYBlackScholes(DPPYTestCase): +class TestDPPYBlackScholes(unittest.TestCase): def test_black_scholes(self): OPT_N = 400 iterations = 2 diff --git a/numba_dppy/tests/test_caching.py b/numba_dppy/tests/test_caching.py index ae693190a3..268401ce98 100644 --- a/numba_dppy/tests/test_caching.py +++ b/numba_dppy/tests/test_caching.py @@ -1,12 +1,8 @@ -from __future__ import print_function -from timeit import default_timer as time - import sys import numpy as np import numba_dppy, numba_dppy as dppy import dpctl -from numba_dppy.testing import unittest -from numba_dppy.testing import DPPYTestCase +import unittest def data_parallel_sum(a, b, c): @@ -14,7 +10,7 @@ def data_parallel_sum(a, b, c): c[i] = a[i] + b[i] -class TestCaching(DPPYTestCase): +class TestCaching(unittest.TestCase): def test_caching_kernel(self): global_size = 10 N = global_size diff --git 
a/numba_dppy/tests/test_device_array_args.py b/numba_dppy/tests/test_device_array_args.py index b38eac12fe..eb47cd28bc 100644 --- a/numba_dppy/tests/test_device_array_args.py +++ b/numba_dppy/tests/test_device_array_args.py @@ -1,13 +1,9 @@ #! /usr/bin/env python -from __future__ import print_function -from timeit import default_timer as time - import sys import numpy as np import numba_dppy, numba_dppy as dppy import dpctl -from numba_dppy.testing import unittest -from numba_dppy.testing import DPPYTestCase +import unittest @dppy.kernel def data_parallel_sum(a, b, c): @@ -24,7 +20,7 @@ def data_parallel_sum(a, b, c): @unittest.skipUnless(dpctl.has_cpu_queues(), 'test only on CPU system') -class TestDPPYDeviceArrayArgsGPU(DPPYTestCase): +class TestDPPYDeviceArrayArgsGPU(unittest.TestCase): def test_device_array_args_cpu(self): c = np.ones_like(a) @@ -35,7 +31,7 @@ def test_device_array_args_cpu(self): @unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system') -class TestDPPYDeviceArrayArgsCPU(DPPYTestCase): +class TestDPPYDeviceArrayArgsCPU(unittest.TestCase): def test_device_array_args_gpu(self): c = np.ones_like(a) diff --git a/numba_dppy/tests/test_dpctl_api.py b/numba_dppy/tests/test_dpctl_api.py index dcbb95e163..59ddd16f65 100644 --- a/numba_dppy/tests/test_dpctl_api.py +++ b/numba_dppy/tests/test_dpctl_api.py @@ -1,14 +1,9 @@ -from __future__ import print_function, division, absolute_import - -import numpy as np - -from numba_dppy.testing import unittest -from numba_dppy.testing import DPPYTestCase +import unittest import dpctl @unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system') -class TestDPCTLAPI(DPPYTestCase): +class TestDPCTLAPI(unittest.TestCase): def test_dpctl_api(self): with dpctl.device_context("opencl:gpu") as gpu_queue: dpctl.dump() diff --git a/numba_dppy/tests/test_dpnp_functions.py b/numba_dppy/tests/test_dpnp_functions.py index c4749885ba..7dcd5407b5 100644 --- a/numba_dppy/tests/test_dpnp_functions.py +++ b/numba_dppy/tests/test_dpnp_functions.py @@ -1,5 +1,4 @@ #! 
/usr/bin/env python -from __future__ import print_function from timeit import default_timer as time import sys @@ -8,8 +7,8 @@ import numba_dppy import numba_dppy as dppy import dpctl -from numba_dppy.testing import unittest -from numba_dppy.testing import DPPYTestCase +import unittest + def test_for_different_datatypes(fn, test_fn, dims, arg_count, tys, np_all=False, matrix=None): @@ -83,7 +82,6 @@ def test_for_dimensions(fn, test_fn, dims, tys, np_all=False): def ensure_dpnp(): try: - # import dpnp from numba_dppy.dpnp_glue import dpnp_fptr_interface as dpnp_glue return True except: @@ -91,7 +89,7 @@ def ensure_dpnp(): @unittest.skipUnless(ensure_dpnp() and dpctl.has_gpu_queues(), 'test only when dpNP and GPU is available') -class Testdpnp_functions(DPPYTestCase): +class Testdpnp_functions(unittest.TestCase): N = 10 diff --git a/numba_dppy/tests/test_dppl_fallback.py b/numba_dppy/tests/test_dppl_fallback.py index 76792f5744..3a7e668f02 100644 --- a/numba_dppy/tests/test_dppl_fallback.py +++ b/numba_dppy/tests/test_dppl_fallback.py @@ -1,20 +1,13 @@ -from __future__ import print_function, division, absolute_import - import numpy as np import numba -import numba_dppy -import numba_dppy as dppy -from numba_dppy.testing import unittest -from numba_dppy.testing import DPPYTestCase +import unittest from numba.tests.support import captured_stderr import dpctl -import sys -import io @unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system') -class TestDPPYFallback(DPPYTestCase): +class TestDPPYFallback(unittest.TestCase): def test_dppy_fallback_inner_call(self): @numba.jit def fill_value(i): diff --git a/numba_dppy/tests/test_dppl_func.py b/numba_dppy/tests/test_dppl_func.py index c58908554e..729030e153 100644 --- a/numba_dppy/tests/test_dppl_func.py +++ b/numba_dppy/tests/test_dppl_func.py @@ -1,15 +1,12 @@ -from __future__ import print_function, division, absolute_import - import numpy as np import numba_dppy, numba_dppy as dppy -from numba_dppy.testing import unittest -from numba_dppy.testing import DPPYTestCase +import unittest import dpctl @unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system') -class TestDPPYFunc(DPPYTestCase): +class TestDPPYFunc(unittest.TestCase): N = 257 def test_dppy_func_device_array(self): diff --git a/numba_dppy/tests/test_math_functions.py b/numba_dppy/tests/test_math_functions.py index f83fdd30ee..6336c63759 100644 --- a/numba_dppy/tests/test_math_functions.py +++ b/numba_dppy/tests/test_math_functions.py @@ -1,13 +1,8 @@ #! 
/usr/bin/env python -from __future__ import print_function -from timeit import default_timer as time - -import sys import numpy as np import numba_dppy, numba_dppy as dppy import dpctl -from numba_dppy.testing import unittest -from numba_dppy.testing import DPPYTestCase +import unittest import math @dppy.kernel @@ -73,7 +68,7 @@ def test_driver(input_arr, device_ty, jitfunc): @unittest.skipUnless(dpctl.has_cpu_queues(), 'test only on CPU system') -class TestDPPYMathFunctionsCPU(DPPYTestCase): +class TestDPPYMathFunctionsCPU(unittest.TestCase): def test_fabs_cpu(self): b_actual = test_driver(a, "CPU", dppy_fabs) b_expected = np.fabs(a) @@ -106,7 +101,7 @@ def test_log_cpu(self): @unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system') -class TestDPPYMathFunctionsGPU(DPPYTestCase): +class TestDPPYMathFunctionsGPU(unittest.TestCase): def test_fabs_gpu(self): b_actual = test_driver(a, "GPU", dppy_fabs) b_expected = np.fabs(a) diff --git a/numba_dppy/tests/test_numpy_bit_twiddling_functions.py b/numba_dppy/tests/test_numpy_bit_twiddling_functions.py index 8d022a0bb1..21a8fc8444 100644 --- a/numba_dppy/tests/test_numpy_bit_twiddling_functions.py +++ b/numba_dppy/tests/test_numpy_bit_twiddling_functions.py @@ -1,19 +1,12 @@ #! /usr/bin/env python -from __future__ import print_function -from timeit import default_timer as time - -import sys import numpy as np from numba import njit -import numba_dppy -import numba_dppy as dppy import dpctl -from numba_dppy.testing import unittest -from numba_dppy.testing import DPPYTestCase +import unittest @unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system') -class TestNumpy_bit_twiddling_functions(DPPYTestCase): +class TestNumpy_bit_twiddling_functions(unittest.TestCase): def test_bitwise_and(self): @njit def f(a, b): diff --git a/numba_dppy/tests/test_numpy_comparison_functions.py b/numba_dppy/tests/test_numpy_comparison_functions.py index 53a8eed890..9d56e94374 100644 --- a/numba_dppy/tests/test_numpy_comparison_functions.py +++ b/numba_dppy/tests/test_numpy_comparison_functions.py @@ -1,19 +1,12 @@ #! 
/usr/bin/env python -from __future__ import print_function -from timeit import default_timer as time - -import sys import numpy as np from numba import njit -import numba_dppy -import numba_dppy as dppy import dpctl -from numba_dppy.testing import unittest -from numba_dppy.testing import DPPYTestCase +import unittest @unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system') -class TestNumpy_comparison_functions(DPPYTestCase): +class TestNumpy_comparison_functions(unittest.TestCase): a = np.array([4, 5, 6]) b = np.array([2, 6, 6]) diff --git a/numba_dppy/tests/test_numpy_floating_functions.py b/numba_dppy/tests/test_numpy_floating_functions.py index fb7c1b98e8..8df7e2b5d4 100644 --- a/numba_dppy/tests/test_numpy_floating_functions.py +++ b/numba_dppy/tests/test_numpy_floating_functions.py @@ -1,18 +1,11 @@ -from __future__ import print_function -from timeit import default_timer as time - -import sys import numpy as np from numba import njit -import numba_dppy -import numba_dppy as dppy import dpctl -from numba_dppy.testing import unittest -from numba_dppy.testing import DPPYTestCase +import unittest @unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system') -class TestNumpy_floating_functions(DPPYTestCase): +class TestNumpy_floating_functions(unittest.TestCase): def test_isfinite(self): @njit def f(a): diff --git a/numba_dppy/tests/test_numpy_math_functions.py b/numba_dppy/tests/test_numpy_math_functions.py index 7af014d4d8..0d19193cb7 100644 --- a/numba_dppy/tests/test_numpy_math_functions.py +++ b/numba_dppy/tests/test_numpy_math_functions.py @@ -1,20 +1,12 @@ #! /usr/bin/env python -from __future__ import print_function -from timeit import default_timer as time - -import sys import numpy as np from numba import njit -import numba_dppy -import numba_dppy as dppy import dpctl -from numba_dppy.testing import unittest -from numba_dppy.testing import DPPYTestCase +import unittest @unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system') -class TestNumpy_math_functions(DPPYTestCase): - +class TestNumpy_math_functions(unittest.TestCase): N = 10 a = np.array(np.random.random(N), dtype=np.float32) b = np.array(np.random.random(N), dtype=np.float32) diff --git a/numba_dppy/tests/test_numpy_trigonomteric_functions.py b/numba_dppy/tests/test_numpy_trigonomteric_functions.py index 361273cdee..7c0a95d323 100644 --- a/numba_dppy/tests/test_numpy_trigonomteric_functions.py +++ b/numba_dppy/tests/test_numpy_trigonomteric_functions.py @@ -1,19 +1,12 @@ #! 
/usr/bin/env python -from __future__ import print_function -from timeit import default_timer as time - -import sys import numpy as np from numba import njit -import numba_dppy -import numba_dppy as dppy import dpctl -from numba_dppy.testing import unittest -from numba_dppy.testing import DPPYTestCase +import unittest @unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system') -class TestNumpy_math_functions(DPPYTestCase): +class TestNumpy_math_functions(unittest.TestCase): N = 10 a = np.array(np.random.random(N), dtype=np.float32) diff --git a/numba_dppy/tests/test_parfor_lower_message.py b/numba_dppy/tests/test_parfor_lower_message.py index 9f4660e01f..17f1456bb6 100644 --- a/numba_dppy/tests/test_parfor_lower_message.py +++ b/numba_dppy/tests/test_parfor_lower_message.py @@ -3,8 +3,7 @@ from numba import njit, prange import numba_dppy import numba_dppy as dppy -from numba_dppy.testing import unittest -from numba_dppy.testing import DPPYTestCase +import unittest from numba.tests.support import captured_stdout import dpctl @@ -21,7 +20,7 @@ def prange_example(): @unittest.skipUnless(dpctl.has_gpu_queues(), "test only on GPU system") -class TestParforMessage(DPPYTestCase): +class TestParforMessage(unittest.TestCase): def test_parfor_message(self): with dpctl.device_context("opencl:gpu") as gpu_queue: numba_dppy.compiler.DEBUG = 1 diff --git a/numba_dppy/tests/test_prange.py b/numba_dppy/tests/test_prange.py index 3a8948d716..1af03f1cde 100644 --- a/numba_dppy/tests/test_prange.py +++ b/numba_dppy/tests/test_prange.py @@ -1,21 +1,17 @@ #! /usr/bin/env python -from __future__ import print_function -from timeit import default_timer as time - import sys import numpy as np import numba import dpctl from numba import njit, prange import numba_dppy -import numba_dppy as dppy -from numba_dppy.testing import unittest, expectedFailureIf -from numba_dppy.testing import DPPYTestCase +import unittest +from numba_dppy.testing import expectedFailureIf from numba.tests.support import captured_stdout @unittest.skipUnless(dpctl.has_gpu_queues(), "test only on GPU system") -class TestPrange(DPPYTestCase): +class TestPrange(unittest.TestCase): def test_one_prange(self): @njit def f(a, b): diff --git a/numba_dppy/tests/test_print.py b/numba_dppy/tests/test_print.py index 0bc4a7cc2b..af19658048 100644 --- a/numba_dppy/tests/test_print.py +++ b/numba_dppy/tests/test_print.py @@ -1,19 +1,14 @@ #! /usr/bin/env python -from __future__ import print_function -from timeit import default_timer as time - -import sys import numpy as np from numba import njit, prange import numba_dppy, numba_dppy as dppy -from numba_dppy.testing import unittest -from numba_dppy.testing import DPPYTestCase +import unittest import dpctl @unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system') -class TestPrint(DPPYTestCase): +class TestPrint(unittest.TestCase): def test_print_dppy_kernel(self): @dppy.func def g(a): diff --git a/numba_dppy/tests/test_rename_numpy_function_pass.py b/numba_dppy/tests/test_rename_numpy_function_pass.py index b06a03b5e0..e568358dad 100644 --- a/numba_dppy/tests/test_rename_numpy_function_pass.py +++ b/numba_dppy/tests/test_rename_numpy_function_pass.py @@ -1,13 +1,7 @@ #! 
/usr/bin/env python - import unittest import numpy as np - import numba -from numba import njit, prange -import numba_dppy, numba_dppy as dppy - - from numba.core import compiler from numba_dppy.rename_numpy_functions_pass import DPPYRewriteOverloadedFunctions diff --git a/numba_dppy/tests/test_sum_reduction.py b/numba_dppy/tests/test_sum_reduction.py index 8ec7b3d5a9..37ca38a12a 100644 --- a/numba_dppy/tests/test_sum_reduction.py +++ b/numba_dppy/tests/test_sum_reduction.py @@ -1,12 +1,7 @@ -from __future__ import print_function, division, absolute_import - import numpy as np import math -import time - import numba_dppy, numba_dppy as dppy -from numba_dppy.testing import unittest -from numba_dppy.testing import DPPYTestCase +import unittest import dpctl @dppy.kernel @@ -19,7 +14,7 @@ def reduction_kernel(A, R, stride): @unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system') -class TestDPPYSumReduction(DPPYTestCase): +class TestDPPYSumReduction(unittest.TestCase): def test_sum_reduction(self): # This test will only work for even case N = 1024 diff --git a/numba_dppy/tests/test_vectorize.py b/numba_dppy/tests/test_vectorize.py index 2fed0fc65f..5b3a41629c 100644 --- a/numba_dppy/tests/test_vectorize.py +++ b/numba_dppy/tests/test_vectorize.py @@ -1,19 +1,12 @@ #! /usr/bin/env python -from __future__ import print_function -from timeit import default_timer as time - -import sys import numpy as np from numba import njit, vectorize -import numba_dppy -import numba_dppy as dppy import dpctl -from numba_dppy.testing import unittest -from numba_dppy.testing import DPPYTestCase +import unittest @unittest.skipUnless(dpctl.has_gpu_queues(), "test only on GPU system") -class TestVectorize(DPPYTestCase): +class TestVectorize(unittest.TestCase): def test_vectorize(self): @vectorize(nopython=True) diff --git a/numba_dppy/tests/test_with_context.py b/numba_dppy/tests/test_with_context.py index 1f733829b6..693c155ab2 100644 --- a/numba_dppy/tests/test_with_context.py +++ b/numba_dppy/tests/test_with_context.py @@ -1,15 +1,13 @@ -import sys -import numba import numpy as np from numba import njit -import numba_dppy, numba_dppy as dppy +import numba_dppy +import unittest from numba.core import errors from numba.tests.support import captured_stdout -from numba_dppy.testing import DPPYTestCase, unittest, expectedFailureIf import dpctl -class TestWithDPPYContext(DPPYTestCase): +class TestWithDPPYContext(unittest.TestCase): @unittest.skipIf(not dpctl.has_gpu_queues(), "No GPU platforms available") def test_with_dppy_context_gpu(self): From c5c338169cea7a92b140a2388484a54af6c86221 Mon Sep 17 00:00:00 2001 From: Angelina Kharchevnikova Date: Fri, 11 Dec 2020 17:13:58 +0300 Subject: [PATCH 18/32] Add flags to generate debug symbols (#27) * Add flags to generate debug symbols * Merge main to akharche/enable_kernel_debug * Add documentation about how to use gdb Co-authored-by: Sergey Pokhodenko --- DEBUGGING.md | 21 +++++++++++++++++++++ README.md | 6 +++++- numba_dppy/compiler.py | 10 ++++++++-- numba_dppy/target.py | 6 +++++- 4 files changed, 39 insertions(+), 4 deletions(-) create mode 100644 DEBUGGING.md diff --git a/DEBUGGING.md b/DEBUGGING.md new file mode 100644 index 0000000000..44027b318d --- /dev/null +++ b/DEBUGGING.md @@ -0,0 +1,21 @@ +## Debugging with GDB + +Setting the debug environment variable `NUMBA_DPPY_DEBUG` (e.g. `export NUMBA_DPPY_DEBUG=True`) enables the emission of debug info to +the llvm and spirv IR. To disable debugging set this variable to None: (e.g. 
`export NUMBA_DPPY_DEBUG=`).
+Currently, the following debug info is available:
+- Source location (filename and line number) is available.
+- Setting break points by the line number.
+- Stepping over break points.
+
+### Requirements
+
+Intel GDB must be installed on the system;
+follow the instructions at: https://software.intel.com/content/www/us/en/develop/tools/oneapi/components/distribution-for-gdb.html
+
+### Example debug usage
+
+```bash
+$ gdb -q python
+(gdb) break sum.py:13 # Assumes the kernel is in file sum.py, at line 13
+(gdb) run sum.py
+```
diff --git a/README.md b/README.md
index 92c61fc1bd..483ad7bdad 100644
--- a/README.md
+++ b/README.md
@@ -57,9 +57,13 @@ python numba_dppy/examples/sum.py
 
 ## How Tos
 
-Refer the HowTo.rst guide for an overview of the programming semantics,
+Refer to the [HowTo.rst](HowTo.rst) guide for an overview of the programming semantics,
 examples, supported functionalities, and known issues.
 
+## Debugging
+
+Please follow the instructions in [DEBUGGING.md](DEBUGGING.md).
+
 ## Reporting issues
 
 Please use https://github.com/IntelPython/numba-dppy/issues to report issues and bugs.
diff --git a/numba_dppy/compiler.py b/numba_dppy/compiler.py
index c8a329738a..b93f29801a 100644
--- a/numba_dppy/compiler.py
+++ b/numba_dppy/compiler.py
@@ -20,16 +20,19 @@
 import os
 from numba.core.compiler import DefaultPassBuilder, CompilerBase
 
-DEBUG=os.environ.get('NUMBA_DPPY_DEBUG', None)
+
+DEBUG = os.environ.get('NUMBA_DPPY_DEBUG', None)
 
 _NUMBA_DPPY_READ_ONLY = "read_only"
 _NUMBA_DPPY_WRITE_ONLY = "write_only"
 _NUMBA_DPPY_READ_WRITE = "read_write"
 
+
 def _raise_no_device_found_error():
     error_message = ("No OpenCL device specified. "
                      "Usage : jit_fn[device, globalsize, localsize](...)")
     raise ValueError(error_message)
 
+
 def _raise_invalid_kernel_enqueue_args():
     error_message = ("Incorrect number of arguments for enquing dppy.kernel. "
                      "Usage: device_env, global size, local size. 
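
For the GDB session in DEBUGGING.md to hit a source line, the script must contain a kernel compiled with debug info (run with `NUMBA_DPPY_DEBUG` set, which makes `compile_kernel` pass `debug=True` per the hunk below). A sketch of the kind of kernel such a sum.py is assumed to contain; the shipped numba_dppy/examples/sum.py may differ in detail, and `dppy.DEFAULT_LOCAL_SIZE` is assumed from the package's kernel-launch convention:

```python
import numpy as np
import dpctl
import numba_dppy as dppy


@dppy.kernel
def data_parallel_sum(a, b, c):
    i = dppy.get_global_id(0)
    c[i] = a[i] + b[i]  # a breakpoint on this line is what "break sum.py:13" targets


global_size = 10
a = np.ones(global_size, dtype=np.float32)
b = np.ones(global_size, dtype=np.float32)
c = np.zeros(global_size, dtype=np.float32)

# Kernels must be launched under an explicit device context.
with dpctl.device_context("opencl:gpu"):
    data_parallel_sum[global_size, dppy.DEFAULT_LOCAL_SIZE](a, b, c)
```
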
" @@ -78,9 +81,11 @@ def compile_with_dppy(pyfunc, return_type, args, debug): typingctx = dppy_target.typing_context targetctx = dppy_target.target_context - # TODO handle debug flag + flags = compiler.Flags() # Do not compile (generate native code), just lower (to LLVM) + if debug: + flags.set('debuginfo') flags.set('no_compile') flags.set('no_cpython_wrapper') flags.unset('nrt') @@ -117,6 +122,7 @@ def compile_with_dppy(pyfunc, return_type, args, debug): def compile_kernel(sycl_queue, pyfunc, args, access_types, debug=False): if DEBUG: print("compile_kernel", args) + debug = True if not sycl_queue: # This will be get_current_queue sycl_queue = dpctl.get_current_queue() diff --git a/numba_dppy/target.py b/numba_dppy/target.py index 6444a6e601..147b229e77 100644 --- a/numba_dppy/target.py +++ b/numba_dppy/target.py @@ -254,8 +254,12 @@ def sub_gen_with_global(lty): def declare_function(self, module, fndesc): fnty = self.call_conv.get_function_type(fndesc.restype, fndesc.argtypes) fn = module.get_or_insert_function(fnty, name=fndesc.mangled_name) - fn.attributes.add('alwaysinline') + + if not self.enable_debuginfo: + fn.attributes.add('alwaysinline') + ret = super(DPPYTargetContext, self).declare_function(module, fndesc) + # XXX: Refactor fndesc instead of this special case if fndesc.llvm_func_name.startswith('dppy_py_devfn'): ret.calling_convention = CC_SPIR_FUNC From 6a7fbc1181ca26756a51309011f4200628f16e8b Mon Sep 17 00:00:00 2001 From: Elena Totmenina Date: Fri, 11 Dec 2020 17:51:50 +0300 Subject: [PATCH 19/32] Controllable fallback (#40) Add configuration variable `NUMBA_DPPY_FALLBACK_ON_CPU` --- numba_dppy/config.py | 2 + numba_dppy/dppy_lowerer.py | 3 +- .../tests/test_controllable_fallback.py | 72 +++++++++++++++++++ 3 files changed, 76 insertions(+), 1 deletion(-) create mode 100644 numba_dppy/tests/test_controllable_fallback.py diff --git a/numba_dppy/config.py b/numba_dppy/config.py index 880d18d7b5..76849ba00c 100644 --- a/numba_dppy/config.py +++ b/numba_dppy/config.py @@ -34,3 +34,5 @@ def _readenv(): ... # Turn SPIRV-VALIDATION ON/OFF switch SPIRV_VAL = _readenv("NUMBA_DPPY_SPIRV_VAL", int, 0) + +FALLBACK_ON_CPU = _readenv("NUMBA_DPPY_FALLBACK_ON_CPU", int, 1) diff --git a/numba_dppy/dppy_lowerer.py b/numba_dppy/dppy_lowerer.py index 1561a6d85e..b7f591d296 100644 --- a/numba_dppy/dppy_lowerer.py +++ b/numba_dppy/dppy_lowerer.py @@ -1171,7 +1171,8 @@ def lower(self): if numba_dppy.compiler.DEBUG: print("Failed to lower parfor on DPPY-device. 
Due to:\n", e) lowering.lower_extensions[parfor.Parfor].pop() - if (lowering.lower_extensions[parfor.Parfor][-1] == numba.parfors.parfor_lowering._lower_parfor_parallel): + if ((lowering.lower_extensions[parfor.Parfor][-1] == numba.parfors.parfor_lowering._lower_parfor_parallel) and + numba_dppy.config.FALLBACK_ON_CPU == 1): self.cpu_lower.lower() self.base_lower = self.cpu_lower else: diff --git a/numba_dppy/tests/test_controllable_fallback.py b/numba_dppy/tests/test_controllable_fallback.py new file mode 100644 index 0000000000..45405b9958 --- /dev/null +++ b/numba_dppy/tests/test_controllable_fallback.py @@ -0,0 +1,72 @@ +import numpy as np + +import numba +import numba_dppy +from numba_dppy.testing import unittest +from numba_dppy.testing import DPPYTestCase +from numba.tests.support import captured_stderr +import dpctl + + +@unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system') +class TestDPPYFallback(DPPYTestCase): + def test_dppy_fallback_true(self): + @numba.jit + def fill_value(i): + return i + + def inner_call_fallback(): + x = 10 + a = np.empty(shape=x, dtype=np.float32) + + for i in numba.prange(x): + a[i] = fill_value(i) + + return a + + numba_dppy.compiler.DEBUG = 1 + with captured_stderr() as msg_fallback_true: + with dpctl.device_context("opencl:gpu") as gpu_queue: + dppy = numba.njit(parallel=True)(inner_call_fallback) + dppy_fallback_true = dppy() + + ref_result = inner_call_fallback() + numba_dppy.compiler.DEBUG = 0 + + np.testing.assert_array_equal(dppy_fallback_true, ref_result) + self.assertTrue('Failed to lower parfor on DPPY-device' in msg_fallback_true.getvalue()) + + @unittest.expectedFailure + def test_dppy_fallback_false(self): + @numba.jit + def fill_value(i): + return i + + def inner_call_fallback(): + x = 10 + a = np.empty(shape=x, dtype=np.float32) + + for i in numba.prange(x): + a[i] = fill_value(i) + + return a + + try: + numba_dppy.compiler.DEBUG = 1 + numba_dppy.config.FALLBACK_ON_CPU = 0 + with captured_stderr() as msg_fallback_true: + with dpctl.device_context("opencl:gpu") as gpu_queue: + dppy = numba.njit(parallel=True)(inner_call_fallback) + dppy_fallback_false = dppy() + + finally: + ref_result = inner_call_fallback() + numba_dppy.config.FALLBACK_ON_CPU = 1 + numba_dppy.compiler.DEBUG = 0 + + not np.testing.assert_array_equal(dppy_fallback_false, ref_result) + not self.assertTrue('Failed to lower parfor on DPPY-device' in msg_fallback_true.getvalue()) + + +if __name__ == '__main__': + unittest.main() From 1e5346b63efef9af13bb52813510437a1ca805ea Mon Sep 17 00:00:00 2001 From: Elena Totmenina Date: Fri, 11 Dec 2020 18:27:59 +0300 Subject: [PATCH 20/32] Fix fallback tests (#68) --- numba_dppy/tests/test_controllable_fallback.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/numba_dppy/tests/test_controllable_fallback.py b/numba_dppy/tests/test_controllable_fallback.py index 45405b9958..357f0b5e20 100644 --- a/numba_dppy/tests/test_controllable_fallback.py +++ b/numba_dppy/tests/test_controllable_fallback.py @@ -3,13 +3,12 @@ import numba import numba_dppy from numba_dppy.testing import unittest -from numba_dppy.testing import DPPYTestCase from numba.tests.support import captured_stderr import dpctl @unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system') -class TestDPPYFallback(DPPYTestCase): +class TestDPPYFallback(unittest.TestCase): def test_dppy_fallback_true(self): @numba.jit def fill_value(i): From cbe25b498040a7c056d341142531acef1bc0e228 Mon Sep 17 00:00:00 2001 From: vlad-perevezentsev 
<68376232+vlad-perevezentsev@users.noreply.github.com> Date: Mon, 14 Dec 2020 16:41:40 +0300 Subject: [PATCH 21/32] Remove find_library in the dpctl library loading (#79) --- numba_dppy/initialize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/numba_dppy/initialize.py b/numba_dppy/initialize.py index 745e8031eb..2a2c70f796 100644 --- a/numba_dppy/initialize.py +++ b/numba_dppy/initialize.py @@ -22,7 +22,7 @@ def initialize_all(): paths = glob.glob(os.path.join(os.path.dirname(dpctl.__file__), '*DPCTLSyclInterface*')) if len(paths) == 1: - ll.load_library_permanently(find_library(paths[0])) + ll.load_library_permanently(paths[0]) else: raise ImportError From d7a433b71db9d4a56e298cfd59303cd68861f977 Mon Sep 17 00:00:00 2001 From: Reazul Hoque Date: Mon, 14 Dec 2020 18:16:33 -0600 Subject: [PATCH 22/32] Implementation of np.linalg.eig (#61) * Sum example * Moved from infer_type, lower_builtin to overload * Added two level module name functions * Remove Cython generated file * Module name fix for moving to new extension * Updated all dppl to dppy and moved rewrite_numpy_function_pass to it's own file * Import module at correct locations * Added comments * Added test and updated comments * Revert unneeded changes * Update Eigen implementation * Separate the implementations into their own category files * Fix float32 precision * Added test for eigen Co-authored-by: reazul.hoque --- numba_dppy/dpnp_glue/dpnp_linalgimpl.py | 75 +++++++++++++++++++ .../dpnp_glue/dpnp_transcendentalsimpl.py | 58 ++++++++++++++ numba_dppy/dpnp_glue/dpnpimpl.py | 54 ------------- numba_dppy/dpnp_glue/stubs.py | 3 + numba_dppy/rename_numpy_functions_pass.py | 5 +- numba_dppy/tests/test_dpnp_functions.py | 51 ++++++++++++- 6 files changed, 189 insertions(+), 57 deletions(-) create mode 100644 numba_dppy/dpnp_glue/dpnp_linalgimpl.py create mode 100644 numba_dppy/dpnp_glue/dpnp_transcendentalsimpl.py diff --git a/numba_dppy/dpnp_glue/dpnp_linalgimpl.py b/numba_dppy/dpnp_glue/dpnp_linalgimpl.py new file mode 100644 index 0000000000..1bee46918b --- /dev/null +++ b/numba_dppy/dpnp_glue/dpnp_linalgimpl.py @@ -0,0 +1,75 @@ +import numba_dppy.dpnp_glue.dpnpimpl as dpnp_ext +from numba import types +from numba.core.typing import signature +from . import stubs +import numba_dppy.experimental_numpy_lowering_overload as dpnp_lowering +from numba.core.extending import overload, register_jitable +import numpy as np + +class _DPNP_LINALG_EXTENSION: + @classmethod + def dpnp_eig(cls, fn_name, type_names): + ret_type = types.void + sig = signature( + ret_type, types.voidptr, types.voidptr, types.voidptr, types.int64 + ) + f_ptr = dpnp_ext.get_dpnp_fptr(fn_name, type_names) + + def get_pointer(obj): + return f_ptr + + return types.ExternalFunctionPointer(sig, get_pointer=get_pointer) + + +@overload(stubs.dpnp.eig) +def dpnp_eig_impl(a): + dpnp_lowering.ensure_dpnp("eig") + dpnp_extension = _DPNP_LINALG_EXTENSION() + dpctl_functions = dpnp_ext._DPCTL_FUNCTIONS() + + dpnp_eig = dpnp_extension.dpnp_eig("dpnp_eig", [a.dtype.name, "NONE"]) + + get_sycl_queue = dpctl_functions.dpctl_get_current_queue() + allocate_usm_shared = dpctl_functions.dpctl_malloc_shared() + copy_usm = dpctl_functions.dpctl_queue_memcpy() + free_usm = dpctl_functions.dpctl_free_with_queue() + + res_dtype = np.float64 + if a.dtype == types.float32: + res_dtype = np.float32 + + def dpnp_eig_impl(a): + n = a.shape[-1] + if a.shape[-2] != n: + msg = "Last 2 dimensions of the array must be square." 
+ raise ValueError(msg) + + dpnp_ext._check_finite_matrix(a) + + wr = np.empty(n, dtype=res_dtype) + vr = np.empty((n, n), dtype=res_dtype) + + if n == 0: + return (wr, vr) + + sycl_queue = get_sycl_queue() + a_usm = allocate_usm_shared(a.size * a.itemsize, sycl_queue) + copy_usm(sycl_queue, a_usm, a.ctypes, a.size * a.itemsize) + + wr_usm = allocate_usm_shared(wr.size * wr.itemsize, sycl_queue) + vr_usm = allocate_usm_shared(vr.size * vr.itemsize, sycl_queue) + + dpnp_eig(a_usm, wr_usm, vr_usm, n) + + copy_usm(sycl_queue, wr.ctypes, wr_usm, wr.size * wr.itemsize) + copy_usm(sycl_queue, vr.ctypes, vr_usm, vr.size * vr.itemsize) + + free_usm(a_usm, sycl_queue) + free_usm(wr_usm, sycl_queue) + free_usm(vr_usm, sycl_queue) + + dpnp_ext._dummy_liveness_func([wr.size, vr.size]) + + return (wr, vr) + + return dpnp_eig_impl diff --git a/numba_dppy/dpnp_glue/dpnp_transcendentalsimpl.py b/numba_dppy/dpnp_glue/dpnp_transcendentalsimpl.py new file mode 100644 index 0000000000..562b16798b --- /dev/null +++ b/numba_dppy/dpnp_glue/dpnp_transcendentalsimpl.py @@ -0,0 +1,58 @@ +import numba_dppy.dpnp_glue.dpnpimpl as dpnp_ext +from numba import types +from numba.core.typing import signature +from . import stubs +import numba_dppy.experimental_numpy_lowering_overload as dpnp_lowering +from numba.core.extending import overload, register_jitable +import numpy as np + +class _DPNP_TRANSCENDENTALS_EXTENSION: + @classmethod + def dpnp_sum(cls, fn_name, type_names): + ret_type = types.void + sig = signature(ret_type, types.voidptr, types.voidptr, types.int64) + f_ptr = dpnp_ext.get_dpnp_fptr(fn_name, type_names) + + def get_pointer(obj): + return f_ptr + + return types.ExternalFunctionPointer(sig, get_pointer=get_pointer) + + +@overload(stubs.dpnp.sum) +def dpnp_sum_impl(a): + dpnp_lowering.ensure_dpnp("sum") + dpnp_extension = _DPNP_TRANSCENDENTALS_EXTENSION() + dpctl_functions = dpnp_ext._DPCTL_FUNCTIONS() + + dpnp_sum = dpnp_extension.dpnp_sum("dpnp_sum", [a.dtype.name, "NONE"]) + + get_sycl_queue = dpctl_functions.dpctl_get_current_queue() + allocate_usm_shared = dpctl_functions.dpctl_malloc_shared() + copy_usm = dpctl_functions.dpctl_queue_memcpy() + free_usm = dpctl_functions.dpctl_free_with_queue() + + def dpnp_sum_impl(a): + if a.size == 0: + raise ValueError("Passed Empty array") + + sycl_queue = get_sycl_queue() + a_usm = allocate_usm_shared(a.size * a.itemsize, sycl_queue) + copy_usm(sycl_queue, a_usm, a.ctypes, a.size * a.itemsize) + + out_usm = allocate_usm_shared(a.itemsize, sycl_queue) + + dpnp_sum(a_usm, out_usm, a.size) + + out = np.empty(1, dtype=a.dtype) + copy_usm(sycl_queue, out.ctypes, out_usm, out.size * out.itemsize) + + free_usm(a_usm, sycl_queue) + free_usm(out_usm, sycl_queue) + + + dpnp_ext._dummy_liveness_func([out.size]) + + return out[0] + + return dpnp_sum_impl diff --git a/numba_dppy/dpnp_glue/dpnpimpl.py b/numba_dppy/dpnp_glue/dpnpimpl.py index d6e53c4b99..97f6d0a7ac 100644 --- a/numba_dppy/dpnp_glue/dpnpimpl.py +++ b/numba_dppy/dpnp_glue/dpnpimpl.py @@ -33,57 +33,3 @@ def __init__(self, fn_name, type_names, sig, get_pointer): self.fn_name = fn_name self.type_names = type_names super(RetrieveDpnpFnPtr, self).__init__(sig, get_pointer) - - -class _DPNP_EXTENSION: - def __init__(self, name): - dpnp_lowering.ensure_dpnp(name) - - @classmethod - def dpnp_sum(cls, fn_name, type_names): - ret_type = types.void - sig = signature(ret_type, types.voidptr, types.voidptr, types.int64) - f_ptr = get_dpnp_fptr(fn_name, type_names) - - def get_pointer(obj): - return f_ptr - - return 
types.ExternalFunctionPointer(sig, get_pointer=get_pointer) - - -@overload(stubs.dpnp.sum) -def dpnp_sum_impl(a): - dpnp_extension = _DPNP_EXTENSION("sum") - dpctl_functions = _DPCTL_FUNCTIONS() - - dpnp_sum = dpnp_extension.dpnp_sum("dpnp_sum", [a.dtype.name, "NONE"]) - - get_sycl_queue = dpctl_functions.dpctl_get_current_queue() - allocate_usm_shared = dpctl_functions.dpctl_malloc_shared() - copy_usm = dpctl_functions.dpctl_queue_memcpy() - free_usm = dpctl_functions.dpctl_free_with_queue() - - def dpnp_sum_impl(a): - if a.size == 0: - raise ValueError("Passed Empty array") - - sycl_queue = get_sycl_queue() - a_usm = allocate_usm_shared(a.size * a.itemsize, sycl_queue) - copy_usm(sycl_queue, a_usm, a.ctypes, a.size * a.itemsize) - - out_usm = allocate_usm_shared(a.itemsize, sycl_queue) - - dpnp_sum(a_usm, out_usm, a.size) - - out = np.empty(1, dtype=a.dtype) - copy_usm(sycl_queue, out.ctypes, out_usm, out.size * out.itemsize) - - free_usm(a_usm, sycl_queue) - free_usm(out_usm, sycl_queue) - - - _dummy_liveness_func([out.size]) - - return out[0] - - return dpnp_sum_impl diff --git a/numba_dppy/dpnp_glue/stubs.py b/numba_dppy/dpnp_glue/stubs.py index d51cd28ead..041a30c074 100644 --- a/numba_dppy/dpnp_glue/stubs.py +++ b/numba_dppy/dpnp_glue/stubs.py @@ -7,3 +7,6 @@ class dpnp(Stub): class sum(Stub): pass + + class eig(Stub): + pass diff --git a/numba_dppy/rename_numpy_functions_pass.py b/numba_dppy/rename_numpy_functions_pass.py index a0c4b89b3e..329003f881 100644 --- a/numba_dppy/rename_numpy_functions_pass.py +++ b/numba_dppy/rename_numpy_functions_pass.py @@ -8,7 +8,7 @@ ) import numba_dppy -rewrite_function_name_map = {"sum": (["np"], "sum")} +rewrite_function_name_map = {"sum": (["np"], "sum"), "eig": (["linalg"], "eig")} class RewriteNumPyOverloadedFunctions(object): @@ -110,7 +110,8 @@ class DPPYRewriteOverloadedFunctions(FunctionPass): def __init__(self): FunctionPass.__init__(self) import numba_dppy.dpnp_glue.dpnpdecl - import numba_dppy.dpnp_glue.dpnpimpl + import numba_dppy.dpnp_glue.dpnp_linalgimpl + import numba_dppy.dpnp_glue.dpnp_transcendentalsimpl def run_pass(self, state): rewrite_function_name_pass = RewriteNumPyOverloadedFunctions( diff --git a/numba_dppy/tests/test_dpnp_functions.py b/numba_dppy/tests/test_dpnp_functions.py index 7dcd5407b5..75ff19f54a 100644 --- a/numba_dppy/tests/test_dpnp_functions.py +++ b/numba_dppy/tests/test_dpnp_functions.py @@ -10,6 +10,7 @@ import unittest +import dpctl def test_for_different_datatypes(fn, test_fn, dims, arg_count, tys, np_all=False, matrix=None): if arg_count == 1: @@ -87,10 +88,58 @@ def ensure_dpnp(): except: return False +# From https://github.com/IntelPython/dpnp/blob/master/tests/test_linalg.py +def vvsort(val, vec, size): + for i in range(size): + imax = i + for j in range(i + 1, size): + if np.abs(val[imax]) < np.abs(val[j]): + imax = j + + temp = val[i] + val[i] = val[imax] + val[imax] = temp + + for k in range(size): + temp = vec[k, i] + vec[k, i] = vec[k, imax] + vec[k, imax] = temp + + +@unittest.skipUnless(ensure_dpnp(), 'test only when dpNP is available') +class Testdpnp_linalg_functions(unittest.TestCase): + tys = [np.int32, np.uint32, np.int64, np.uint64, np.float, np.double] + def test_eig(self): + @njit + def f(a): + return np.linalg.eig(a) + + size = 3 + for ty in self.tys: + a = np.arange(size * size, dtype=ty).reshape((size, size)) + symm_a = np.tril(a) + np.tril(a, -1).T + np.diag(np.full((size,), size * size, dtype=ty)) + + with dpctl.device_context("opencl:gpu"): + got_val, got_vec = f(symm_a) 
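+            # Eigenpairs come back in no guaranteed order and each
+            # eigenvector is only defined up to sign, so both results are
+            # sorted by |eigenvalue| (vvsort) and the reference vectors are
+            # sign-flipped to match before the comparison below.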
+ + np_val, np_vec = np.linalg.eig(symm_a) + + # sort val/vec by abs value + vvsort(got_val, got_vec, size) + vvsort(np_val, np_vec, size) + + + # NP change sign of vectors + for i in range(np_vec.shape[1]): + if np_vec[0, i] * got_vec[0, i] < 0: + np_vec[:, i] = -np_vec[:, i] + + self.assertTrue(np.allclose(got_val, np_val)) + self.assertTrue(np.allclose(got_vec, np_vec)) + @unittest.skipUnless(ensure_dpnp() and dpctl.has_gpu_queues(), 'test only when dpNP and GPU is available') class Testdpnp_functions(unittest.TestCase): - N = 10 a = np.array(np.random.random(N), dtype=np.float32) From fed3c2f2be82aa2a8873ab90876bf6cb1ad58f47 Mon Sep 17 00:00:00 2001 From: Elena Totmenina Date: Tue, 15 Dec 2020 16:59:47 +0300 Subject: [PATCH 23/32] Offload diagnostics (#29) Co-authored-by: etotmeni Co-authored-by: Sergey Pokhodenko --- numba_dppy/compiler.py | 3 + numba_dppy/config.py | 3 + numba_dppy/dppy_lowerer.py | 6 ++ numba_dppy/dppy_parfor_diagnostics.py | 106 +++++++++++++++++++ numba_dppy/dppy_passbuilder.py | 4 +- numba_dppy/dppy_passes.py | 21 +++- numba_dppy/tests/test_offload_diagnostics.py | 60 +++++++++++ numba_dppy/tests/test_prange.py | 2 +- 8 files changed, 202 insertions(+), 3 deletions(-) create mode 100644 numba_dppy/dppy_parfor_diagnostics.py create mode 100644 numba_dppy/tests/test_offload_diagnostics.py diff --git a/numba_dppy/compiler.py b/numba_dppy/compiler.py index b93f29801a..37b9e25e9f 100644 --- a/numba_dppy/compiler.py +++ b/numba_dppy/compiler.py @@ -19,6 +19,7 @@ import os from numba.core.compiler import DefaultPassBuilder, CompilerBase +from numba_dppy.dppy_parfor_diagnostics import ExtendedParforDiagnostics DEBUG = os.environ.get('NUMBA_DPPY_DEBUG', None) @@ -61,6 +62,8 @@ class DPPYCompiler(CompilerBase): def define_pipelines(self): # this maintains the objmode fallback behaviour pms = [] + self.state.parfor_diagnostics = ExtendedParforDiagnostics() + self.state.metadata['parfor_diagnostics'] = self.state.parfor_diagnostics if not self.state.flags.force_pyobject: #print("Numba-DPPY [INFO]: Using Numba-DPPY pipeline") pms.append(DPPYPassBuilder.define_nopython_pipeline(self.state)) diff --git a/numba_dppy/config.py b/numba_dppy/config.py index 76849ba00c..84df7913c3 100644 --- a/numba_dppy/config.py +++ b/numba_dppy/config.py @@ -35,4 +35,7 @@ def _readenv(): ... 
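# The offload knobs in this module are plain integers read from the
# environment once at import time via _readenv(); a minimal usage sketch
# from a shell (assuming bash, with your_script.py as a stand-in name):
#
#   NUMBA_DPPY_OFFLOAD_DIAGNOSTICS=1 python your_script.py  # dump the offload report
#   NUMBA_DPPY_FALLBACK_ON_CPU=0 python your_script.py      # disable the CPU re-lowering fallback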
# Turn SPIRV-VALIDATION ON/OFF switch SPIRV_VAL = _readenv("NUMBA_DPPY_SPIRV_VAL", int, 0) +# Dump offload diagnostics +OFFLOAD_DIAGNOSTICS = _readenv("NUMBA_DPPY_OFFLOAD_DIAGNOSTICS", int, 0) + FALLBACK_ON_CPU = _readenv("NUMBA_DPPY_FALLBACK_ON_CPU", int, 1) diff --git a/numba_dppy/dppy_lowerer.py b/numba_dppy/dppy_lowerer.py index b7f591d296..3040362592 100644 --- a/numba_dppy/dppy_lowerer.py +++ b/numba_dppy/dppy_lowerer.py @@ -520,6 +520,8 @@ def print_arg_with_addrspaces(args): diagnostics.hoist_info[parfor.id] = {'hoisted': hoisted, 'not_hoisted': not_hoisted} + lowerer.metadata['parfor_diagnostics'].extra_info[str(parfor.id)] = str(dpctl.get_current_queue().get_sycl_device().get_device_name()) + if config.DEBUG_ARRAY_OPT: print("After hoisting") _print_body(loop_body) @@ -1165,6 +1167,10 @@ def lower(self): try: lowering.lower_extensions[parfor.Parfor].append(lower_parfor_rollback) self.gpu_lower.lower() + # if lower dont crash, and parfor_diagnostics is empty then it is kernel + if not self.gpu_lower.metadata['parfor_diagnostics'].extra_info: + str_name = str(dpctl.get_current_queue().get_sycl_device().get_device_name()) + self.gpu_lower.metadata['parfor_diagnostics'].extra_info["kernel"] = str_name self.base_lower = self.gpu_lower lowering.lower_extensions[parfor.Parfor].pop() except Exception as e: diff --git a/numba_dppy/dppy_parfor_diagnostics.py b/numba_dppy/dppy_parfor_diagnostics.py new file mode 100644 index 0000000000..50e19a1cb1 --- /dev/null +++ b/numba_dppy/dppy_parfor_diagnostics.py @@ -0,0 +1,106 @@ +from numba.parfors.parfor import ParforDiagnostics, _termwidth, print_wrapped + + +class ExtendedParforDiagnostics(ParforDiagnostics): + def __init__(self): + ParforDiagnostics.__init__(self) + self.extra_info = {} + + def dump(self, level=1): + if level == 0: + level = 1 + super().dump(level) + + if self.extra_info: + parfors_simple = self.get_parfors_simple(False) + all_lines = self.get_all_lines(parfors_simple) + print(' Auto-offloading '.center(_termwidth,'-')) + self.print_auto_offloading(all_lines) + if 'kernel' in self.extra_info.keys(): + print_wrapped("Device - '%s'" % self.extra_info['kernel']) + print(_termwidth * '-') + + def print_auto_offloading(self, lines): + sword = '+--' + fac = len(sword) + + summary = dict() + # region : {fused, serialized} + + def print_nest(fadj_, nadj_, theroot, reported, region_id): + def print_g(fadj_, nadj_, nroot, depth): + for k in nadj_[nroot]: + msg = fac * depth * ' ' + '%s%s %s' % (sword, k, '(serial') + if nadj_[k] == []: + fused = [] + if fadj_[k] != [] and k not in reported: + fused = sorted(self.reachable_nodes(fadj_, k)) + msg += ", fused with loop(s): " + msg += ', '.join([str(x) for x in fused]) + msg += ')' + reported.append(k) + print_wrapped(msg) + summary[region_id]['fused'] += len(fused) + else: + print_wrapped(msg + ')') + print_g(fadj_, nadj_, k, depth + 1) + summary[region_id]['serialized'] += 1 + + if nadj_[theroot] != []: + print_wrapped("Parallel region %s:" % region_id) + print_wrapped('%s%s %s' % (sword, theroot, '(parallel)')) + summary[region_id] = {'root': theroot, 'fused': 0, 'serialized': 0} + print_g(fadj_, nadj_, theroot, 1) + print("\n") + region_id = region_id + 1 + return region_id + + def print_fuse(ty, pf_id, adj, depth, region_id): + print_wrapped("Parallel region %s:" % region_id) + msg = fac * depth * ' ' + '%s%s %s' % (sword, pf_id, '(parallel') + fused = [] + if adj[pf_id] != []: + fused = sorted(self.reachable_nodes(adj, pf_id)) + msg += ", fused with loop(s): " + msg += ', 
'.join([str(x) for x in fused]) + + summary[region_id] = {'root': pf_id, 'fused': len(fused), 'serialized': 0} + msg += ')' + print_wrapped(msg) + extra_info = self.extra_info.get(str(region_id)) + if extra_info: + print_wrapped("Device - '%s'" % extra_info) + region_id = region_id + 1 + return region_id + + # Walk the parfors by src line and print optimised structure + region_id = 0 + reported = [] + for line, info in sorted(lines.items()): + opt_ty, pf_id, adj = info + if opt_ty == 'fuse': + if pf_id not in reported: + region_id = print_fuse('f', pf_id, adj, 0, region_id) + elif opt_ty == 'nest': + region_id = print_nest(fadj, nadj, pf_id, reported, region_id) + else: + assert 0 + + # print the summary of the fuse/serialize rewrite + if summary: + for k, v in sorted(summary.items()): + msg = ('\n \nParallel region %s (loop #%s) had %s ' + 'loop(s) fused') + root = v['root'] + fused = v['fused'] + serialized = v['serialized'] + if serialized != 0: + msg += (' and %s loop(s) ' + 'serialized as part of the larger ' + 'parallel loop (#%s).') + print_wrapped(msg % (k, root, fused, serialized, root)) + else: + msg += '.' + print_wrapped(msg % (k, root, fused)) + else: + print_wrapped("Parallel structure is already optimal.") diff --git a/numba_dppy/dppy_passbuilder.py b/numba_dppy/dppy_passbuilder.py index b3c632a85a..13a675f254 100644 --- a/numba_dppy/dppy_passbuilder.py +++ b/numba_dppy/dppy_passbuilder.py @@ -24,7 +24,8 @@ SpirvFriendlyLowering, DPPYAddNumpyOverloadPass, DPPYAddNumpyRemoveOverloadPass, - DPPYNoPythonBackend + DPPYNoPythonBackend, + DPPYDumpParforDiagnostics ) from .rename_numpy_functions_pass import DPPYRewriteOverloadedFunctions @@ -116,5 +117,6 @@ def define_nopython_pipeline(state, name='dppy_nopython'): pm.add_pass(SpirvFriendlyLowering, "SPIRV-friendly lowering pass") pm.add_pass(DPPYNoPythonBackend, "nopython mode backend") pm.add_pass(DPPYAddNumpyRemoveOverloadPass, "dppy remove typing template for Numpy functions") + pm.add_pass(DPPYDumpParforDiagnostics, "dump parfor diagnostics") pm.finalize() return pm diff --git a/numba_dppy/dppy_passes.py b/numba_dppy/dppy_passes.py index c73f5a7736..925b7dfd8a 100644 --- a/numba_dppy/dppy_passes.py +++ b/numba_dppy/dppy_passes.py @@ -23,9 +23,10 @@ from numba.core.errors import (LoweringError, new_error_context, TypingError, LiteralTypingError) -from numba.core.compiler_machinery import FunctionPass, LoweringPass, register_pass +from numba.core.compiler_machinery import FunctionPass, LoweringPass, register_pass, AnalysisPass from .dppy_lowerer import DPPYLower +from numba_dppy import config as dppy_config from numba.parfors.parfor import PreParforPass as _parfor_PreParforPass, replace_functions_map from numba.parfors.parfor import ParforPass as _parfor_ParforPass @@ -438,3 +439,21 @@ def run_pass(self, state): remove_dels(state.func_ir.blocks) return True + + +@register_pass(mutates_CFG=False, analysis_only=True) +class DPPYDumpParforDiagnostics(AnalysisPass): + + _name = "dump_parfor_diagnostics" + + def __init__(self): + AnalysisPass.__init__(self) + + def run_pass(self, state): + # if state.flags.auto_parallel.enabled: //add in condition flag for kernels + if dppy_config.OFFLOAD_DIAGNOSTICS: + if state.parfor_diagnostics is not None: + state.parfor_diagnostics.dump(config.PARALLEL_DIAGNOSTICS) + else: + raise RuntimeError("Diagnostics failed.") + return True diff --git a/numba_dppy/tests/test_offload_diagnostics.py b/numba_dppy/tests/test_offload_diagnostics.py new file mode 100644 index 0000000000..6b41252fc6 --- /dev/null 
+++ b/numba_dppy/tests/test_offload_diagnostics.py @@ -0,0 +1,60 @@ +import numpy as np +import numba +from numba import njit, prange +import numba_dppy, numba_dppy as dppy +from numba_dppy import config as dppy_config +from numba_dppy.testing import unittest +from numba.tests.support import captured_stdout +import dpctl + + +@unittest.skipUnless(dpctl.has_gpu_queues(), "test only on GPU system") +class TestOffloadDiagnostics(unittest.TestCase): + def test_parfor(self): + def prange_func(): + n = 10 + a = np.ones((n), dtype=np.float64) + b = np.ones((n), dtype=np.float64) + c = np.ones((n), dtype=np.float64) + for i in prange(n//2): + a[i] = b[i] + c[i] + + return a + + with dpctl.device_context("opencl:gpu"): + dppy_config.OFFLOAD_DIAGNOSTICS = 1 + jitted = njit(parallel=True)(prange_func) + + with captured_stdout() as got: + jitted() + + dppy_config.OFFLOAD_DIAGNOSTICS = 0 + self.assertTrue("Auto-offloading" in got.getvalue()) + self.assertTrue("Device -" in got.getvalue()) + + def test_kernel(self): + @dppy.kernel + def parallel_sum(a, b, c): + i = dppy.get_global_id(0) + c[i] = a[i] + b[i] + + global_size = 10 + N = global_size + + a = np.array(np.random.random(N), dtype=np.float32) + b = np.array(np.random.random(N), dtype=np.float32) + c = np.ones_like(a) + + with dpctl.device_context("opencl:gpu"): + dppy_config.OFFLOAD_DIAGNOSTICS = 1 + + with captured_stdout() as got: + parallel_sum[global_size, dppy.DEFAULT_LOCAL_SIZE](a, b, c) + + dppy_config.OFFLOAD_DIAGNOSTICS = 0 + self.assertTrue("Auto-offloading" in got.getvalue()) + self.assertTrue("Device -" in got.getvalue()) + + +if __name__ == '__main__': + unittest.main() diff --git a/numba_dppy/tests/test_prange.py b/numba_dppy/tests/test_prange.py index 1af03f1cde..eda9ccebbc 100644 --- a/numba_dppy/tests/test_prange.py +++ b/numba_dppy/tests/test_prange.py @@ -96,7 +96,7 @@ def f(a, b): self.assertTrue(np.all(b == 12)) - @expectedFailureIf(sys.platform.startswith('win')) + @unittest.skip('numba-dppy issue 110') def test_two_consequent_prange(self): def prange_example(): n = 10 From f0725854aff0f8e5ef00a18d9fe5fb0686432e06 Mon Sep 17 00:00:00 2001 From: Elena Totmenina Date: Tue, 15 Dec 2020 17:32:16 +0300 Subject: [PATCH 24/32] Rename test files dppl to dppy (#117) Co-authored-by: etotmeni --- numba_dppy/tests/{test_dppl_fallback.py => test_dppy_fallback.py} | 0 numba_dppy/tests/{test_dppl_func.py => test_dppy_func.py} | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename numba_dppy/tests/{test_dppl_fallback.py => test_dppy_fallback.py} (100%) rename numba_dppy/tests/{test_dppl_func.py => test_dppy_func.py} (100%) diff --git a/numba_dppy/tests/test_dppl_fallback.py b/numba_dppy/tests/test_dppy_fallback.py similarity index 100% rename from numba_dppy/tests/test_dppl_fallback.py rename to numba_dppy/tests/test_dppy_fallback.py diff --git a/numba_dppy/tests/test_dppl_func.py b/numba_dppy/tests/test_dppy_func.py similarity index 100% rename from numba_dppy/tests/test_dppl_func.py rename to numba_dppy/tests/test_dppy_func.py From 886b6bf706b771f5ebdc0d2fb6ddb8dd6b3e668a Mon Sep 17 00:00:00 2001 From: Angelina Kharchevnikova Date: Tue, 15 Dec 2020 17:38:49 +0300 Subject: [PATCH 25/32] Extended docs for debugger (#114) * Extended docs for debugger --- DEBUGGING.md | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/DEBUGGING.md b/DEBUGGING.md index 44027b318d..6199d0f431 100644 --- a/DEBUGGING.md +++ b/DEBUGGING.md @@ -1,7 +1,8 @@ ## Debugging with GDB -Setting the debug environment variable 
`NUMBA_DPPY_DEBUG` (e.g. `export NUMBA_DPPY_DEBUG=True`) enables the emission of debug info to -the llvm and spirv IR. To disable debugging set this variable to None: (e.g. `export NUMBA_DPPL_DEBUG=`). +Setting the debug environment variable `NUMBA_DPPY_DEBUG` (e.g. `export NUMBA_DPPY_DEBUG=True`) enables +the emission of debug info to the llvm and spirv IR. +To disable debugging set this variable to None: (e.g. `export NUMBA_DPPY_DEBUG= `). Currently, the following debug info is available: - Source location (filename and line number) is available. - Setting break points by the line number. @@ -9,13 +10,22 @@ Currently, the following debug info is available: ### Requirements -Intel GDB installed to the system -follow the instruction: https://software.intel.com/content/www/us/en/develop/tools/oneapi/components/distribution-for-gdb.html +Intel® Distribution for GDB installed to the system. +Documentation for this debugger can be found in the +[Intel® Distribution for GDB documentation](https://software.intel.com/content/www/us/en/develop/tools/oneapi/components/distribution-for-gdb.html). ### Example debug usage ```bash -$ gdb -q python -(gdb) break sum.py:13 # Assumes the kernel is in file sum.py, at line 13 +$ export NUMBA_DPPY_DEBUG=True +$ gdb-oneapi -q python +(gdb) break numba_dppy/examples/sum.py:14 # Assumes the kernel is in file sum.py, at line 14 (gdb) run sum.py ``` + +### Limitations + +Currently, Numba-dppy provides only initial support of debugging GPU kernels. +The following functionality is **not supported** : +- Printing kernel local variables (e.g. ```info locals```). +- Stepping over several off-loaded functions. From 0f5e72d01e6c2099c2478cdf0384c72fbf02b3e2 Mon Sep 17 00:00:00 2001 From: Sergey Pokhodenko Date: Wed, 16 Dec 2020 14:50:55 +0300 Subject: [PATCH 26/32] dpNP 0.4.* and dpCtl 0.5.* (#123) --- README.md | 4 ++-- conda-recipe/meta.yaml | 12 ++++++++---- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 483ad7bdad..180ac952e7 100644 --- a/README.md +++ b/README.md @@ -20,8 +20,8 @@ https://intelpython.github.io/dpnp/ ## Dependencies * numba >=0.51 (IntelPython/numba) -* dpCtl >=0.4.0 -* dpNP >=0.3 (optional) +* dpCtl 0.5.* +* dpNP 0.4.* (optional) * llvm-spirv (SPIRV generation from LLVM IR) * llvmdev (LLVM IR generation) * spirv-tools diff --git a/conda-recipe/meta.yaml b/conda-recipe/meta.yaml index d8f6c1ecbb..5e5b61a25c 100644 --- a/conda-recipe/meta.yaml +++ b/conda-recipe/meta.yaml @@ -18,16 +18,20 @@ requirements: - setuptools - cython - numba - - dpctl - - dpnp # [linux] + - dpctl 0.5.* + - dpnp 0.4.* # [linux] run: - python - numba >=0.51 - - dpctl + - dpctl 0.5.* - spirv-tools - llvm-spirv - llvmdev - - dpnp # [linux] + - dpnp 0.4.* # [linux] + +test: + requires: + - scipy # [linux] about: home: https://github.com/IntelPython/numba-dppy From 18fa309502b082a0dd034cdbff598e26f24a5ba4 Mon Sep 17 00:00:00 2001 From: Reazul Hoque Date: Wed, 16 Dec 2020 06:55:29 -0600 Subject: [PATCH 27/32] Rewrite supported ndarray.function_name (#63) * Sum example * Moved from infer_type, lower_builtin to overload * Added two level module name functions * Remove cython generated file * Module name fix for moving to new extension * Incomplete linalg.eig implementation * Updted all dppl to dppy and moved rewrite_numpy_function_pass to it's own file * Import module at correct locations * Added comments * Added test and updated comments * Revert unneeded changes * Update Eigen implementation * Separate the implementations into their 
own category files * Fix float32 precision * Added test for eigen * Added typed pass to rename ndarray.function_name * Added test for ndarray.sum * Update test to match result with tolerance * Move ensure_dpnp() to testing.py * Skip test test_rename_ndarray if no dpNP Co-authored-by: reazul.hoque Co-authored-by: Sergey Pokhodenko --- .../dpnp_glue/dpnp_transcendentalsimpl.py | 1 - numba_dppy/dppy_passbuilder.py | 9 +- numba_dppy/rename_numpy_functions_pass.py | 110 +++++++++++++++++- numba_dppy/testing.py | 8 ++ numba_dppy/tests/test_dpnp_functions.py | 26 +++-- numba_dppy/tests/test_numpy_math_functions.py | 3 +- .../tests/test_rename_numpy_function_pass.py | 61 +++++++++- 7 files changed, 200 insertions(+), 18 deletions(-) diff --git a/numba_dppy/dpnp_glue/dpnp_transcendentalsimpl.py b/numba_dppy/dpnp_glue/dpnp_transcendentalsimpl.py index 562b16798b..d4c91ae794 100644 --- a/numba_dppy/dpnp_glue/dpnp_transcendentalsimpl.py +++ b/numba_dppy/dpnp_glue/dpnp_transcendentalsimpl.py @@ -50,7 +50,6 @@ def dpnp_sum_impl(a): free_usm(a_usm, sycl_queue) free_usm(out_usm, sycl_queue) - dpnp_ext._dummy_liveness_func([out.size]) return out[0] diff --git a/numba_dppy/dppy_passbuilder.py b/numba_dppy/dppy_passbuilder.py index 13a675f254..da54ad4b3e 100644 --- a/numba_dppy/dppy_passbuilder.py +++ b/numba_dppy/dppy_passbuilder.py @@ -28,7 +28,8 @@ DPPYDumpParforDiagnostics ) -from .rename_numpy_functions_pass import DPPYRewriteOverloadedFunctions +from .rename_numpy_functions_pass import (DPPYRewriteOverloadedNumPyFunctions, + DPPYRewriteNdarrayFunctions) class DPPYPassBuilder(object): """ @@ -48,7 +49,7 @@ def default_numba_nopython_pipeline(state, pm): pm.add_pass(WithLifting, "Handle with contexts") # this pass rewrites name of NumPy functions we intend to overload - pm.add_pass(DPPYRewriteOverloadedFunctions, + pm.add_pass(DPPYRewriteOverloadedNumPyFunctions, "Rewrite name of Numpy functions to overload already overloaded function", ) @@ -89,6 +90,10 @@ def default_numba_nopython_pipeline(state, pm): pm.add_pass(NopythonTypeInference, "nopython frontend") pm.add_pass(AnnotateTypes, "annotate types") + pm.add_pass(DPPYRewriteNdarrayFunctions, + "Rewrite ndarray functions to dppy supported functions", + ) + # strip phis pm.add_pass(PreLowerStripPhis, "remove phis nodes") diff --git a/numba_dppy/rename_numpy_functions_pass.py b/numba_dppy/rename_numpy_functions_pass.py index 329003f881..bf8af9b661 100644 --- a/numba_dppy/rename_numpy_functions_pass.py +++ b/numba_dppy/rename_numpy_functions_pass.py @@ -7,6 +7,7 @@ simplify_CFG, ) import numba_dppy +from numba.core import types rewrite_function_name_map = {"sum": (["np"], "sum"), "eig": (["linalg"], "eig")} @@ -104,7 +105,7 @@ def run(self): @register_pass(mutates_CFG=True, analysis_only=False) -class DPPYRewriteOverloadedFunctions(FunctionPass): +class DPPYRewriteOverloadedNumPyFunctions(FunctionPass): _name = "dppy_rewrite_overloaded_functions_pass" def __init__(self): @@ -124,3 +125,110 @@ def run_pass(self, state): state.func_ir.blocks = simplify_CFG(state.func_ir.blocks) return True + + +def get_dpnp_func_typ(func): + from numba.core.typing.templates import builtin_registry + for (k, v) in builtin_registry.globals: + if k == func: + return v + raise RuntimeError("type for func ", func, " not found") + + +class RewriteNdarrayFunctions(object): + def __init__(self, state, rewrite_function_name_map=rewrite_function_name_map): + self.state = state + self.function_name_map = rewrite_function_name_map + self.typemap = state.type_annotation.typemap + 
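+        # typemap and calltypes are taken from the completed type-inference
+        # results: this rewrite runs post-typing so it can check that a
+        # getattr target really is an ndarray before redirecting
+        # a.func(...) to numba_dppy.dpnp.func(a, ...).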
self.calltypes = state.type_annotation.calltypes + + def run(self): + typingctx = self.state.typingctx + + # save array arg to call + # call_varname -> array + func_ir = self.state.func_ir + blocks = func_ir.blocks + saved_arr_arg = {} + topo_order = find_topo_order(blocks) + + for label in topo_order: + block = blocks[label] + new_body = [] + for stmt in block.body: + if isinstance(stmt, ir.Assign) and isinstance(stmt.value, ir.Expr): + lhs = stmt.target.name + rhs = stmt.value + # replace A.func with np.func, and save A in saved_arr_arg + if (rhs.op == 'getattr' and rhs.attr in self.function_name_map + and isinstance( + self.typemap[rhs.value.name], types.npytypes.Array)): + rhs = stmt.value + arr = rhs.value + saved_arr_arg[lhs] = arr + scope = arr.scope + loc = arr.loc + + g_dppy_var = ir.Var(scope, mk_unique_var("$load_global"), loc) + self.typemap[g_dppy_var.name] = types.misc.Module(numba_dppy) + g_dppy = ir.Global("numba_dppy", numba_dppy, loc) + g_dppy_assign = ir.Assign(g_dppy, g_dppy_var, loc) + + dpnp_var = ir.Var(scope, mk_unique_var("$load_attr"), loc) + self.typemap[dpnp_var.name] = types.misc.Module(numba_dppy.dpnp) + getattr_dpnp = ir.Expr.getattr(g_dppy_var, "dpnp", loc) + dpnp_assign = ir.Assign(getattr_dpnp, dpnp_var, loc) + + rhs.value = dpnp_var + new_body.append(g_dppy_assign) + new_body.append(dpnp_assign) + + func_ir._definitions[g_dppy_var.name] = [getattr_dpnp] + func_ir._definitions[dpnp_var.name] = [getattr_dpnp] + + # update func var type + func = getattr(numba_dppy.dpnp, rhs.attr) + func_typ = get_dpnp_func_typ(func) + + self.typemap.pop(lhs) + self.typemap[lhs] = func_typ + + if rhs.op == 'call' and rhs.func.name in saved_arr_arg: + # add array as first arg + arr = saved_arr_arg[rhs.func.name] + # update call type signature to include array arg + old_sig = self.calltypes.pop(rhs) + # argsort requires kws for typing so sig.args can't be used + # reusing sig.args since some types become Const in sig + argtyps = old_sig.args[:len(rhs.args)] + kwtyps = {name: self.typemap[v.name] for name, v in rhs.kws} + self.calltypes[rhs] = self.typemap[rhs.func.name].get_call_type( + typingctx, [self.typemap[arr.name]] + list(argtyps), kwtyps) + rhs.args = [arr] + rhs.args + + new_body.append(stmt) + block.body = new_body + return + + + +@register_pass(mutates_CFG=True, analysis_only=False) +class DPPYRewriteNdarrayFunctions(FunctionPass): + _name = "dppy_rewrite_ndarray_functions_pass" + + def __init__(self): + FunctionPass.__init__(self) + + def run_pass(self, state): + rewrite_ndarray_function_name_pass = RewriteNdarrayFunctions( + state, rewrite_function_name_map + ) + + rewrite_ndarray_function_name_pass.run() + + remove_dead(state.func_ir.blocks, state.func_ir.arg_names, state.func_ir) + state.func_ir.blocks = simplify_CFG(state.func_ir.blocks) + + return True + + diff --git a/numba_dppy/testing.py b/numba_dppy/testing.py index 89d012f72f..e6ff1e3ab3 100644 --- a/numba_dppy/testing.py +++ b/numba_dppy/testing.py @@ -33,3 +33,11 @@ def expectedFailureIf(condition): if condition: return unittest.expectedFailure return _id + + +def ensure_dpnp(): + try: + from numba_dppy.dpnp_glue import dpnp_fptr_interface as dpnp_glue + return True + except: + return False diff --git a/numba_dppy/tests/test_dpnp_functions.py b/numba_dppy/tests/test_dpnp_functions.py index 75ff19f54a..6b33a25dfe 100644 --- a/numba_dppy/tests/test_dpnp_functions.py +++ b/numba_dppy/tests/test_dpnp_functions.py @@ -8,6 +8,7 @@ import numba_dppy as dppy import dpctl import unittest +from numba_dppy.testing 
import ensure_dpnp import dpctl @@ -81,13 +82,6 @@ def test_for_dimensions(fn, test_fn, dims, tys, np_all=False): return True -def ensure_dpnp(): - try: - from numba_dppy.dpnp_glue import dpnp_fptr_interface as dpnp_glue - return True - except: - return False - # From https://github.com/IntelPython/dpnp/blob/master/tests/test_linalg.py def vvsort(val, vec, size): for i in range(size): @@ -138,6 +132,24 @@ def f(a): self.assertTrue(np.allclose(got_vec, np_vec)) +@unittest.skipUnless(ensure_dpnp(), 'test only when dpNP is available') +class Testdpnp_ndarray_functions(unittest.TestCase): + tys = [np.int32, np.uint32, np.int64, np.uint64, np.float, np.double] + def test_ndarray_sum(self): + @njit + def f(a): + return a.sum() + + size = 3 + for ty in self.tys: + a = np.arange(size * size, dtype=ty).reshape((size, size)) + + with dpctl.device_context("opencl:gpu"): + got = f(a) + expected = a.sum() + + self.assertTrue(expected == got) + @unittest.skipUnless(ensure_dpnp() and dpctl.has_gpu_queues(), 'test only when dpNP and GPU is available') class Testdpnp_functions(unittest.TestCase): N = 10 diff --git a/numba_dppy/tests/test_numpy_math_functions.py b/numba_dppy/tests/test_numpy_math_functions.py index 0d19193cb7..cf5174ac96 100644 --- a/numba_dppy/tests/test_numpy_math_functions.py +++ b/numba_dppy/tests/test_numpy_math_functions.py @@ -218,7 +218,8 @@ def f(a): c = f(input_arr) d = np.exp(input_arr) - self.assertTrue(np.all(c == d)) + max_abs_err = c.sum() - d.sum() + self.assertTrue(max_abs_err < 1e-5) def test_log(self): @njit diff --git a/numba_dppy/tests/test_rename_numpy_function_pass.py b/numba_dppy/tests/test_rename_numpy_function_pass.py index e568358dad..cfeff09b8d 100644 --- a/numba_dppy/tests/test_rename_numpy_function_pass.py +++ b/numba_dppy/tests/test_rename_numpy_function_pass.py @@ -2,14 +2,33 @@ import unittest import numpy as np import numba -from numba.core import compiler -from numba_dppy.rename_numpy_functions_pass import DPPYRewriteOverloadedFunctions +from numba import njit, typeof +import numba_dppy, numba_dppy as dppy +from numba_dppy.testing import ensure_dpnp + + +from numba.core import (compiler, typing, cpu) +from numba_dppy.rename_numpy_functions_pass import (DPPYRewriteOverloadedNumPyFunctions, + DPPYRewriteNdarrayFunctions) +from numba.core.typed_passes import (NopythonTypeInference, AnnotateTypes) class MyPipeline(object): - def __init__(self, test_ir): + def __init__(self, test_ir, args): self.state = compiler.StateDict() + self.state.typingctx = typing.Context() + self.state.targetctx = cpu.CPUContext(self.state.typingctx) self.state.func_ir = test_ir + self.state.func_id = test_ir.func_id + self.state.args = args + self.state.return_type = None + self.state.locals = dict() + self.state.status = None + self.state.lifted = dict() + self.state.lifted_from = None + + self.state.typingctx.refresh() + self.state.targetctx.refresh() def check_equivalent(expected_ir, got_ir): @@ -39,7 +58,7 @@ def check_equivalent(expected_ir, got_ir): class TestRenameNumpyFunctionsPass(unittest.TestCase): - def test_rename(self): + def test_rename_numpy(self): def expected(a): return numba_dppy.dpnp.sum(a) @@ -49,13 +68,43 @@ def got(a): expected_ir = compiler.run_frontend(expected) got_ir = compiler.run_frontend(got) - pipeline = MyPipeline(got_ir) + pipeline = MyPipeline(got_ir, None) - rewrite_numpy_functions_pass = DPPYRewriteOverloadedFunctions() + rewrite_numpy_functions_pass = DPPYRewriteOverloadedNumPyFunctions() rewrite_numpy_functions_pass.run_pass(pipeline.state) 
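         # After the pass runs, the IR built from np.sum(a) should match the
         # IR built directly from numba_dppy.dpnp.sum(a), which is what
         # check_equivalent verifies.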
self.assertTrue(check_equivalent(expected_ir, pipeline.state.func_ir)) +@unittest.skipUnless(ensure_dpnp(), 'test only when dpNP is available') +class TestRenameNdarrayFunctionsPass(unittest.TestCase): + def test_rename_ndarray(self): + def expected(a): + return numba_dppy.dpnp.sum(a) + + def got(a): + return a.sum() + + expected_ir = compiler.run_frontend(expected) + got_ir = compiler.run_frontend(got) + + a = np.arange(10) + args = [a] + argtypes = [typeof(x) for x in args] + + pipeline = MyPipeline(got_ir, argtypes) + + tyinfer_pass = NopythonTypeInference() + tyinfer_pass.run_pass(pipeline.state) + + annotate_ty_pass = AnnotateTypes() + annotate_ty_pass.run_pass(pipeline.state) + + rewrite_ndarray_functions_pass = DPPYRewriteNdarrayFunctions() + rewrite_ndarray_functions_pass.run_pass(pipeline.state) + + self.assertTrue(check_equivalent(expected_ir, pipeline.state.func_ir)) + + if __name__ == "__main__": unittest.main() From 36a179dc725b119527b8ba88d10e4d9f9ce47bc7 Mon Sep 17 00:00:00 2001 From: Reazul Hoque Date: Wed, 16 Dec 2020 12:14:52 -0600 Subject: [PATCH 28/32] Port existing functions enabled through dpnp (#115) Co-authored-by: reazul.hoque --- numba_dppy/dpnp_glue/__init__.py | 6 + numba_dppy/dpnp_glue/dpnp_linalgimpl.py | 237 +++++- .../dpnp_glue/dpnp_sort_search_countimpl.py | 157 ++++ numba_dppy/dpnp_glue/dpnp_statisticsimpl.py | 312 ++++++++ .../dpnp_glue/dpnp_transcendentalsimpl.py | 81 +- numba_dppy/dpnp_glue/dpnpdecl.py | 13 + numba_dppy/dpnp_glue/dpnpimpl.py | 36 +- numba_dppy/dpnp_glue/stubs.py | 39 + numba_dppy/dppy_passbuilder.py | 8 - numba_dppy/dppy_passes.py | 124 --- .../experimental_numpy_lowering_overload.py | 723 ------------------ numba_dppy/rename_numpy_functions_pass.py | 23 +- numba_dppy/tests/test_dpnp_functions.py | 130 ++++ numba_dppy/tests/test_dppy_fallback.py | 5 +- 14 files changed, 985 insertions(+), 909 deletions(-) create mode 100644 numba_dppy/dpnp_glue/dpnp_sort_search_countimpl.py create mode 100644 numba_dppy/dpnp_glue/dpnp_statisticsimpl.py delete mode 100644 numba_dppy/experimental_numpy_lowering_overload.py diff --git a/numba_dppy/dpnp_glue/__init__.py b/numba_dppy/dpnp_glue/__init__.py index e69de29bb2..17d6b5ad6a 100644 --- a/numba_dppy/dpnp_glue/__init__.py +++ b/numba_dppy/dpnp_glue/__init__.py @@ -0,0 +1,6 @@ +def ensure_dpnp(name): + try: + # import dpnp + from . import dpnp_fptr_interface as dpnp_glue + except ImportError: + raise ImportError("dpNP is needed to call np.%s" % name) diff --git a/numba_dppy/dpnp_glue/dpnp_linalgimpl.py b/numba_dppy/dpnp_glue/dpnp_linalgimpl.py index 1bee46918b..9146299b05 100644 --- a/numba_dppy/dpnp_glue/dpnp_linalgimpl.py +++ b/numba_dppy/dpnp_glue/dpnp_linalgimpl.py @@ -2,32 +2,30 @@ from numba import types from numba.core.typing import signature from . 
import stubs -import numba_dppy.experimental_numpy_lowering_overload as dpnp_lowering +import numba_dppy.dpnp_glue as dpnp_lowering from numba.core.extending import overload, register_jitable import numpy as np - -class _DPNP_LINALG_EXTENSION: - @classmethod - def dpnp_eig(cls, fn_name, type_names): - ret_type = types.void - sig = signature( - ret_type, types.voidptr, types.voidptr, types.voidptr, types.int64 - ) - f_ptr = dpnp_ext.get_dpnp_fptr(fn_name, type_names) - - def get_pointer(obj): - return f_ptr - - return types.ExternalFunctionPointer(sig, get_pointer=get_pointer) - +from numba_dppy.dpctl_functions import _DPCTL_FUNCTIONS @overload(stubs.dpnp.eig) def dpnp_eig_impl(a): - dpnp_lowering.ensure_dpnp("eig") - dpnp_extension = _DPNP_LINALG_EXTENSION() + name = "eig" + dpnp_lowering.ensure_dpnp(name) dpctl_functions = dpnp_ext._DPCTL_FUNCTIONS() - dpnp_eig = dpnp_extension.dpnp_eig("dpnp_eig", [a.dtype.name, "NONE"]) + ret_type = types.void + """ + dpnp source: + https://github.com/IntelPython/dpnp/blob/0.4.0/dpnp/backend/custom_kernels.cpp#L180 + + Function declaration: + void dpnp_eig_c(const void* array_in, void* result1, void* result2, size_t size) + + """ + sig = signature( + ret_type, types.voidptr, types.voidptr, types.voidptr, types.intp + ) + dpnp_eig = dpnp_ext.dpnp_func("dpnp_"+name, [a.dtype.name, "NONE"], sig) get_sycl_queue = dpctl_functions.dpctl_get_current_queue() allocate_usm_shared = dpctl_functions.dpctl_malloc_shared() @@ -73,3 +71,204 @@ def dpnp_eig_impl(a): return (wr, vr) return dpnp_eig_impl + + +@overload(stubs.dpnp.matmul) +@overload(stubs.dpnp.dot) +def dpnp_dot_impl(a, b): + dpnp_lowering.ensure_dpnp("dot") + dpctl_functions = dpnp_ext._DPCTL_FUNCTIONS() + + ret_type = types.void + """ + dpnp source: + https://github.com/IntelPython/dpnp/blob/0.4.0/dpnp/backend/custom_kernels.cpp#L42 + https://github.com/IntelPython/dpnp/blob/0.4.0/dpnp/backend/custom_kernels.cpp#L118 + + Function declaration: + void dpnp_matmul_c(void* array1_in, void* array2_in, void* result1, size_t size_m, + size_t size_n, size_t size_k) + void dpnp_dot_c(void* array1_in, void* array2_in, void* result1, size_t size) + + """ + sig = signature( + ret_type, types.voidptr, types.voidptr, types.voidptr, + types.intp, types.intp, types.intp) + + get_sycl_queue = dpctl_functions.dpctl_get_current_queue() + allocate_usm_shared = dpctl_functions.dpctl_malloc_shared() + copy_usm = dpctl_functions.dpctl_queue_memcpy() + free_usm = dpctl_functions.dpctl_free_with_queue() + + res_dtype = np.float64 + if a.dtype == types.int32 and b.dtype == types.int32: + res_dtype = np.int32 + elif a.dtype == types.int32 and b.dtype == types.int64: + res_dtype = np.int64 + elif a.dtype == types.int32 and b.dtype == types.float32: + res_dtype = np.float64 + elif a.dtype == types.int32 and b.dtype == types.float64: + res_dtype = np.float64 + elif a.dtype == types.int64 and b.dtype == types.int32: + res_dtype = np.int64 + elif a.dtype == types.int64 and b.dtype == types.int64: + res_dtype = np.int64 + elif a.dtype == types.int64 and b.dtype == types.float32: + res_dtype = np.float64 + elif a.dtype == types.int64 and b.dtype == types.float64: + res_dtype = np.float64 + elif a.dtype == types.float32 and b.dtype == types.int32: + res_dtype = np.float64 + elif a.dtype == types.float32 and b.dtype == types.int64: + res_dtype = np.float64 + elif a.dtype == types.float32 and b.dtype == types.float32: + res_dtype = np.float32 + elif a.dtype == types.float32 and b.dtype == types.float64: + res_dtype = 
np.float64 + elif a.dtype == types.float64 and b.dtype == types.int32: + res_dtype = np.float64 + elif a.dtype == types.float64 and b.dtype == types.int64: + res_dtype = np.float64 + elif a.dtype == types.float64 and b.dtype == types.float32: + res_dtype = np.float32 + elif a.dtype == types.float64 and b.dtype == types.float64: + res_dtype = np.float64 + + ndims = [a.ndim, b.ndim] + if ndims == [2, 2]: + dpnp_func = dpnp_ext.dpnp_func("dpnp_matmul", [a.dtype.name, "NONE"], sig) + def dot_2_mm(a, b): + sycl_queue = get_sycl_queue() + + m, k = a.shape + _k, n = b.shape + + if _k != k: + raise ValueError("Incompatible array sizes for np.dot(a, b)") + + a_usm = allocate_usm_shared(a.size * a.itemsize, sycl_queue) + copy_usm(sycl_queue, a_usm, a.ctypes, a.size * a.itemsize) + + b_usm = allocate_usm_shared(b.size * b.itemsize, sycl_queue) + copy_usm(sycl_queue, b_usm, b.ctypes, b.size * b.itemsize) + + out = np.empty((m, n), dtype=res_dtype) + out_usm = allocate_usm_shared(out.size * out.itemsize, sycl_queue) + + dpnp_func(a_usm, b_usm, out_usm, m, n, k) + + copy_usm(sycl_queue, out.ctypes, out_usm, out.size * out.itemsize) + + free_usm(a_usm, sycl_queue) + free_usm(b_usm, sycl_queue) + free_usm(out_usm, sycl_queue) + + dpnp_ext._dummy_liveness_func([a.size, b.size, out.size]) + + return out + + return dot_2_mm + elif ndims == [2, 1]: + dpnp_func = dpnp_ext.dpnp_func("dpnp_matmul", [a.dtype.name, "NONE"], sig) + def dot_2_mv(a, b): + sycl_queue = get_sycl_queue() + + m, k = a.shape + _n, = b.shape + n = 1 + + if _n != k: + raise ValueError("Incompatible array sizes for np.dot(a, b)") + + a_usm = allocate_usm_shared(a.size * a.itemsize, sycl_queue) + copy_usm(sycl_queue, a_usm, a.ctypes, a.size * a.itemsize) + + b_usm = allocate_usm_shared(b.size * b.itemsize, sycl_queue) + copy_usm(sycl_queue, b_usm, b.ctypes, b.size * b.itemsize) + + out = np.empty((m, ), dtype=res_dtype) + out_usm = allocate_usm_shared(out.size * out.itemsize, sycl_queue) + + dpnp_func(a_usm, b_usm, out_usm, m, n, k) + + copy_usm(sycl_queue, out.ctypes, out_usm, out.size * out.itemsize) + + free_usm(a_usm, sycl_queue) + free_usm(b_usm, sycl_queue) + free_usm(out_usm, sycl_queue) + + dpnp_ext._dummy_liveness_func([a.size, b.size, out.size]) + + return out + + return dot_2_mv + elif ndims == [1, 2]: + dpnp_func = dpnp_ext.dpnp_func("dpnp_matmul", [a.dtype.name, "NONE"], sig) + def dot_2_vm(a, b): + sycl_queue = get_sycl_queue() + + m, = a.shape + k, n = b.shape + + if m != k: + raise ValueError("Incompatible array sizes for np.dot(a, b)") + + a_usm = allocate_usm_shared(a.size * a.itemsize, sycl_queue) + copy_usm(sycl_queue, a_usm, a.ctypes, a.size * a.itemsize) + + b_usm = allocate_usm_shared(b.size * b.itemsize, sycl_queue) + copy_usm(sycl_queue, b_usm, b.ctypes, b.size * b.itemsize) + + out = np.empty((n, ), dtype=res_dtype) + out_usm = allocate_usm_shared(out.size * out.itemsize, sycl_queue) + + dpnp_func(a_usm, b_usm, out_usm, m, n, k) + + copy_usm(sycl_queue, out.ctypes, out_usm, out.size * out.itemsize) + + free_usm(a_usm, sycl_queue) + free_usm(b_usm, sycl_queue) + free_usm(out_usm, sycl_queue) + + dpnp_ext._dummy_liveness_func([a.size, b.size, out.size]) + + return out + + return dot_2_vm + elif ndims == [1, 1]: + sig = signature(ret_type, types.voidptr, types.voidptr, types.voidptr, + types.intp) + dpnp_func = dpnp_ext.dpnp_func("dpnp_dot", [a.dtype.name, "NONE"], sig) + def dot_2_vv(a, b): + sycl_queue = get_sycl_queue() + + m, = a.shape + n, = b.shape + + if m != n: + raise ValueError("Incompatible array sizes for 
np.dot(a, b)") + + a_usm = allocate_usm_shared(a.size * a.itemsize, sycl_queue) + copy_usm(sycl_queue, a_usm, a.ctypes, a.size * a.itemsize) + + b_usm = allocate_usm_shared(b.size * b.itemsize, sycl_queue) + copy_usm(sycl_queue, b_usm, b.ctypes, b.size * b.itemsize) + + out = np.empty(1, dtype=res_dtype) + out_usm = allocate_usm_shared(out.size * out.itemsize, sycl_queue) + + dpnp_func(a_usm, b_usm, out_usm, m) + + copy_usm(sycl_queue, out.ctypes, out_usm, out.size * out.itemsize) + + free_usm(a_usm, sycl_queue) + free_usm(b_usm, sycl_queue) + free_usm(out_usm, sycl_queue) + + dpnp_ext._dummy_liveness_func([a.size, b.size, out.size]) + + return out[0] + + return dot_2_vv + else: + assert 0 diff --git a/numba_dppy/dpnp_glue/dpnp_sort_search_countimpl.py b/numba_dppy/dpnp_glue/dpnp_sort_search_countimpl.py new file mode 100644 index 0000000000..8ec200059b --- /dev/null +++ b/numba_dppy/dpnp_glue/dpnp_sort_search_countimpl.py @@ -0,0 +1,157 @@ +import numba_dppy.dpnp_glue.dpnpimpl as dpnp_ext +from numba.core import types, cgutils +from numba.core.typing import signature +from . import stubs +import numba_dppy.dpnp_glue as dpnp_lowering +from numba.core.extending import overload, register_jitable +import numpy as np + + +@overload(stubs.dpnp.argmax) +def dpnp_argmax_impl(a): + name = "argmax" + dpnp_lowering.ensure_dpnp(name) + dpctl_functions = dpnp_ext._DPCTL_FUNCTIONS() + + ret_type = types.void + """ + dpnp source: + https://github.com/IntelPython/dpnp/blob/0.4.0/dpnp/backend/custom_kernels_searching.cpp#L36 + + Function declaration: + void custom_argmax_c(void* array1_in, void* result1, size_t size) + """ + sig = signature(ret_type, types.voidptr, types.voidptr, types.intp) + dpnp_func = dpnp_ext.dpnp_func("dpnp_"+name, [a.dtype.name, np.dtype(np.int64).name], sig) + + get_sycl_queue = dpctl_functions.dpctl_get_current_queue() + allocate_usm_shared = dpctl_functions.dpctl_malloc_shared() + copy_usm = dpctl_functions.dpctl_queue_memcpy() + free_usm = dpctl_functions.dpctl_free_with_queue() + + res_dtype = np.int64 + + def dpnp_impl(a): + if a.size == 0: + raise ValueError("Passed Empty array") + + sycl_queue = get_sycl_queue() + + a_usm = allocate_usm_shared(a.size * a.itemsize, sycl_queue) + copy_usm(sycl_queue, a_usm, a.ctypes, a.size * a.itemsize) + + out = np.empty(1, dtype=res_dtype) + out_usm = allocate_usm_shared(out.itemsize, sycl_queue) + + dpnp_func(a_usm, out_usm, a.size) + + copy_usm(sycl_queue, out.ctypes, out_usm, out.size * out.itemsize) + + free_usm(a_usm, sycl_queue) + free_usm(out_usm, sycl_queue) + + dpnp_ext._dummy_liveness_func([a.size, out.size]) + + return out[0] + + return dpnp_impl + + +@overload(stubs.dpnp.argmin) +def dpnp_argmin_impl(a): + name = "argmin" + dpnp_lowering.ensure_dpnp(name) + dpctl_functions = dpnp_ext._DPCTL_FUNCTIONS() + + ret_type = types.void + """ + dpnp source: + https://github.com/IntelPython/dpnp/blob/0.4.0/dpnp/backend/custom_kernels_searching.cpp#L56 + + Function declaration: + void custom_argmin_c(void* array1_in, void* result1, size_t size) + """ + sig = signature(ret_type, types.voidptr, types.voidptr, types.intp) + dpnp_func = dpnp_ext.dpnp_func("dpnp_"+name, [a.dtype.name, np.dtype(np.int64).name], sig) + + get_sycl_queue = dpctl_functions.dpctl_get_current_queue() + allocate_usm_shared = dpctl_functions.dpctl_malloc_shared() + copy_usm = dpctl_functions.dpctl_queue_memcpy() + free_usm = dpctl_functions.dpctl_free_with_queue() + + res_dtype = np.int64 + + def dpnp_impl(a): + if a.size == 0: + raise ValueError("Passed 
Empty array") + + sycl_queue = get_sycl_queue() + + a_usm = allocate_usm_shared(a.size * a.itemsize, sycl_queue) + copy_usm(sycl_queue, a_usm, a.ctypes, a.size * a.itemsize) + + out = np.empty(1, dtype=res_dtype) + out_usm = allocate_usm_shared(out.itemsize, sycl_queue) + + dpnp_func(a_usm, out_usm, a.size) + + copy_usm(sycl_queue, out.ctypes, out_usm, out.size * out.itemsize) + + free_usm(a_usm, sycl_queue) + free_usm(out_usm, sycl_queue) + + dpnp_ext._dummy_liveness_func([a.size, out.size]) + + return out[0] + + return dpnp_impl + + +@overload(stubs.dpnp.argsort) +def dpnp_argsort_impl(a): + name = "argsort" + dpnp_lowering.ensure_dpnp(name) + dpctl_functions = dpnp_ext._DPCTL_FUNCTIONS() + + ret_type = types.void + """ + dpnp source: + https://github.com/IntelPython/dpnp/blob/0.4.0/dpnp/backend/custom_kernels_searching.cpp#L56 + + Function declaration: + void custom_argmin_c(void* array1_in, void* result1, size_t size) + """ + sig = signature(ret_type, types.voidptr, types.voidptr, types.intp) + dpnp_func = dpnp_ext.dpnp_func("dpnp_"+name, [a.dtype.name, "NONE"], sig) + + get_sycl_queue = dpctl_functions.dpctl_get_current_queue() + allocate_usm_shared = dpctl_functions.dpctl_malloc_shared() + copy_usm = dpctl_functions.dpctl_queue_memcpy() + free_usm = dpctl_functions.dpctl_free_with_queue() + + res_dtype = np.int64 + + def dpnp_impl(a): + if a.size == 0: + raise ValueError("Passed Empty array") + + sycl_queue = get_sycl_queue() + + a_usm = allocate_usm_shared(a.size * a.itemsize, sycl_queue) + copy_usm(sycl_queue, a_usm, a.ctypes, a.size * a.itemsize) + + out = np.arange(a.size, dtype=res_dtype) + out_usm = allocate_usm_shared(out.size * out.itemsize, sycl_queue) + + dpnp_func(a_usm, out_usm, a.size) + + copy_usm(sycl_queue, out.ctypes, out_usm, out.size * out.itemsize) + + free_usm(a_usm, sycl_queue) + free_usm(out_usm, sycl_queue) + + dpnp_ext._dummy_liveness_func([a.size, out.size]) + + return out + + return dpnp_impl diff --git a/numba_dppy/dpnp_glue/dpnp_statisticsimpl.py b/numba_dppy/dpnp_glue/dpnp_statisticsimpl.py new file mode 100644 index 0000000000..cae9507902 --- /dev/null +++ b/numba_dppy/dpnp_glue/dpnp_statisticsimpl.py @@ -0,0 +1,312 @@ +import numba_dppy.dpnp_glue.dpnpimpl as dpnp_ext +from numba.core import types, cgutils +from numba.core.typing import signature +from . import stubs +import numba_dppy.dpnp_glue as dpnp_lowering +from numba.core.extending import overload, register_jitable +import numpy as np + + +@overload(stubs.dpnp.max) +@overload(stubs.dpnp.amax) +def dpnp_amax_impl(a): + name = "max" + dpnp_lowering.ensure_dpnp(name) + dpctl_functions = dpnp_ext._DPCTL_FUNCTIONS() + + ret_type = types.void + """ + dpnp source: + https://github.com/IntelPython/dpnp/blob/0.4.0/dpnp/backend/custom_kernels_statistics.cpp#L129 + + Function declaration: + void custom_max_c(void* array1_in, void* result1, const size_t* shape, + size_t ndim, const size_t* axis, size_t naxis) + + We are using void * in case of size_t * as Numba currently does not have + any type to represent size_t *. Since, both the types are pointers, + if the compiler allows there should not be any mismatch in the size of + the container to hold different types of pointer. 
+ """ + sig = signature(ret_type, types.voidptr, types.voidptr, + types.voidptr, types.intp, + types.voidptr, types.intp) + dpnp_func = dpnp_ext.dpnp_func("dpnp_"+name, [a.dtype.name, "NONE"], sig) + + get_sycl_queue = dpctl_functions.dpctl_get_current_queue() + allocate_usm_shared = dpctl_functions.dpctl_malloc_shared() + copy_usm = dpctl_functions.dpctl_queue_memcpy() + free_usm = dpctl_functions.dpctl_free_with_queue() + + def dpnp_impl(a): + if a.size == 0: + raise ValueError("Passed Empty array") + + sycl_queue = get_sycl_queue() + + a_usm = allocate_usm_shared(a.size * a.itemsize, sycl_queue) + copy_usm(sycl_queue, a_usm, a.ctypes, a.size * a.itemsize) + + out_usm = allocate_usm_shared(a.itemsize, sycl_queue) + + dpnp_func(a_usm, out_usm, a.shapeptr, a.ndim, a.shapeptr, a.ndim) + + out = np.empty(1, dtype=a.dtype) + copy_usm(sycl_queue, out.ctypes, out_usm, out.size * out.itemsize) + + free_usm(a_usm, sycl_queue) + free_usm(out_usm, sycl_queue) + + dpnp_ext._dummy_liveness_func([out.size]) + + return out[0] + + return dpnp_impl + + +@overload(stubs.dpnp.min) +@overload(stubs.dpnp.amin) +def dpnp_amin_impl(a): + name = "min" + dpnp_lowering.ensure_dpnp(name) + dpctl_functions = dpnp_ext._DPCTL_FUNCTIONS() + + ret_type = types.void + """ + dpnp source: + https://github.com/IntelPython/dpnp/blob/0.4.0/dpnp/backend/custom_kernels_statistics.cpp#L247 + + Function declaration: + void custom_min_c(void* array1_in, void* result1, const size_t* shape, + size_t ndim, const size_t* axis, size_t naxis) + + We are using void * in case of size_t * as Numba currently does not have + any type to represent size_t *. Since, both the types are pointers, + if the compiler allows there should not be any mismatch in the size of + the container to hold different types of pointer. + """ + sig = signature(ret_type, types.voidptr, types.voidptr, + types.voidptr, types.intp, + types.voidptr, types.intp) + dpnp_func = dpnp_ext.dpnp_func("dpnp_"+name, [a.dtype.name, "NONE"], sig) + + get_sycl_queue = dpctl_functions.dpctl_get_current_queue() + allocate_usm_shared = dpctl_functions.dpctl_malloc_shared() + copy_usm = dpctl_functions.dpctl_queue_memcpy() + free_usm = dpctl_functions.dpctl_free_with_queue() + + def dpnp_impl(a): + if a.size == 0: + raise ValueError("Passed Empty array") + + sycl_queue = get_sycl_queue() + + a_usm = allocate_usm_shared(a.size * a.itemsize, sycl_queue) + copy_usm(sycl_queue, a_usm, a.ctypes, a.size * a.itemsize) + + out_usm = allocate_usm_shared(a.itemsize, sycl_queue) + + dpnp_func(a_usm, out_usm, a.shapeptr, a.ndim, a.shapeptr, 0) + + out = np.empty(1, dtype=a.dtype) + copy_usm(sycl_queue, out.ctypes, out_usm, out.size * out.itemsize) + + free_usm(a_usm, sycl_queue) + free_usm(out_usm, sycl_queue) + + dpnp_ext._dummy_liveness_func([out.size]) + + return out[0] + + return dpnp_impl + + +@overload(stubs.dpnp.mean) +def dpnp_mean_impl(a): + name = "mean" + dpnp_lowering.ensure_dpnp(name) + dpctl_functions = dpnp_ext._DPCTL_FUNCTIONS() + + ret_type = types.void + """ + dpnp source: + https://github.com/IntelPython/dpnp/blob/0.4.0/dpnp/backend/custom_kernels_statistics.cpp#L169 + + Function declaration: + void custom_mean_c(void* array1_in, void* result1, const size_t* shape, + size_t ndim, const size_t* axis, size_t naxis) + + We are using void * in case of size_t * as Numba currently does not have + any type to represent size_t *. 
Since, both the types are pointers, + if the compiler allows there should not be any mismatch in the size of + the container to hold different types of pointer. + """ + sig = signature(ret_type, types.voidptr, types.voidptr, + types.voidptr, types.intp, + types.voidptr, types.intp) + dpnp_func = dpnp_ext.dpnp_func("dpnp_"+name, [a.dtype.name, "NONE"], sig) + + get_sycl_queue = dpctl_functions.dpctl_get_current_queue() + allocate_usm_shared = dpctl_functions.dpctl_malloc_shared() + copy_usm = dpctl_functions.dpctl_queue_memcpy() + free_usm = dpctl_functions.dpctl_free_with_queue() + + res_dtype = np.float64 + if a.dtype == types.float32: + res_dtype = np.float32 + + def dpnp_impl(a): + if a.size == 0: + raise ValueError("Passed Empty array") + + sycl_queue = get_sycl_queue() + + a_usm = allocate_usm_shared(a.size * a.itemsize, sycl_queue) + copy_usm(sycl_queue, a_usm, a.ctypes, a.size * a.itemsize) + + out = np.empty(1, dtype=res_dtype) + out_usm = allocate_usm_shared(out.itemsize, sycl_queue) + + dpnp_func(a_usm, out_usm, a.shapeptr, a.ndim, a.shapeptr, a.ndim) + + copy_usm(sycl_queue, out.ctypes, out_usm, out.size * out.itemsize) + + free_usm(a_usm, sycl_queue) + free_usm(out_usm, sycl_queue) + + dpnp_ext._dummy_liveness_func([a.size, out.size]) + return out[0] + + return dpnp_impl + + +@overload(stubs.dpnp.median) +def dpnp_median_impl(a): + name = "median" + dpnp_lowering.ensure_dpnp(name) + dpctl_functions = dpnp_ext._DPCTL_FUNCTIONS() + + ret_type = types.void + """ + dpnp source: + https://github.com/IntelPython/dpnp/blob/0.4.0/dpnp/backend/custom_kernels_statistics.cpp#L213 + + Function declaration: + void custom_median_c(void* array1_in, void* result1, const size_t* shape, + size_t ndim, const size_t* axis, size_t naxis) + + We are using void * in case of size_t * as Numba currently does not have + any type to represent size_t *. Since, both the types are pointers, + if the compiler allows there should not be any mismatch in the size of + the container to hold different types of pointer. 
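+    Like mean above, median produces a floating-point result, so res_dtype
+    below maps float32 input to float32 and every other dtype to float64.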
+ """ + sig = signature(ret_type, types.voidptr, types.voidptr, + types.voidptr, types.intp, + types.voidptr, types.intp) + dpnp_func = dpnp_ext.dpnp_func("dpnp_"+name, [a.dtype.name, "NONE"], sig) + + get_sycl_queue = dpctl_functions.dpctl_get_current_queue() + allocate_usm_shared = dpctl_functions.dpctl_malloc_shared() + copy_usm = dpctl_functions.dpctl_queue_memcpy() + free_usm = dpctl_functions.dpctl_free_with_queue() + + res_dtype = np.float64 + if a.dtype == types.float32: + res_dtype = np.float32 + + def dpnp_impl(a): + if a.size == 0: + raise ValueError("Passed Empty array") + + sycl_queue = get_sycl_queue() + + a_usm = allocate_usm_shared(a.size * a.itemsize, sycl_queue) + copy_usm(sycl_queue, a_usm, a.ctypes, a.size * a.itemsize) + + out = np.empty(1, dtype=res_dtype) + out_usm = allocate_usm_shared(out.itemsize, sycl_queue) + + dpnp_func(a_usm, out_usm, a.shapeptr, a.ndim, a.shapeptr, a.ndim) + + copy_usm(sycl_queue, out.ctypes, out_usm, out.size * out.itemsize) + + free_usm(a_usm, sycl_queue) + free_usm(out_usm, sycl_queue) + + dpnp_ext._dummy_liveness_func([a.size, out.size]) + + return out[0] + + return dpnp_impl + + +@overload(stubs.dpnp.cov) +def dpnp_cov_impl(a): + name = "cov" + dpnp_lowering.ensure_dpnp(name) + dpctl_functions = dpnp_ext._DPCTL_FUNCTIONS() + + ret_type = types.void + """ + dpnp source: + https://github.com/IntelPython/dpnp/blob/0.4.0/dpnp/backend/custom_kernels_statistics.cpp#L51 + + Function declaration: + void custom_cov_c(void* array1_in, void* result1, size_t nrows, size_t ncols) + """ + sig = signature(ret_type, types.voidptr, types.voidptr, + types.intp, types.intp) + dpnp_func = dpnp_ext.dpnp_func("dpnp_"+name, [a.dtype.name, "NONE"], sig) + + get_sycl_queue = dpctl_functions.dpctl_get_current_queue() + allocate_usm_shared = dpctl_functions.dpctl_malloc_shared() + copy_usm = dpctl_functions.dpctl_queue_memcpy() + free_usm = dpctl_functions.dpctl_free_with_queue() + + res_dtype = np.float64 + copy_input_to_double = True + if a.dtype == types.float64: + copy_input_to_double = False + + + def dpnp_impl(a): + if a.size == 0: + raise ValueError("Passed Empty array") + + sycl_queue = get_sycl_queue() + + """ We have to pass a array in double precision to DpNp """ + if copy_input_to_double: + a_copy_in_double = a.astype(np.float64) + else: + a_copy_in_double = a + a_usm = allocate_usm_shared(a_copy_in_double.size * a_copy_in_double.itemsize, sycl_queue) + copy_usm(sycl_queue, a_usm, a_copy_in_double.ctypes, + a_copy_in_double.size * a_copy_in_double.itemsize) + + if a.ndim == 2: + rows = a.shape[0] + cols = a.shape[1] + out = np.empty((rows, rows), dtype=res_dtype) + elif a.ndim == 1: + rows = 1 + cols = a.shape[0] + out = np.empty(rows, dtype=res_dtype) + + out_usm = allocate_usm_shared(out.size * out.itemsize, sycl_queue) + + dpnp_func(a_usm, out_usm, rows, cols) + + copy_usm(sycl_queue, out.ctypes, out_usm, out.size * out.itemsize) + + free_usm(a_usm, sycl_queue) + free_usm(out_usm, sycl_queue) + + dpnp_ext._dummy_liveness_func([a_copy_in_double.size, a.size, out.size]) + + if a.ndim == 2: + return out + elif a.ndim == 1: + return out[0] + + return dpnp_impl diff --git a/numba_dppy/dpnp_glue/dpnp_transcendentalsimpl.py b/numba_dppy/dpnp_glue/dpnp_transcendentalsimpl.py index d4c91ae794..f7ba425206 100644 --- a/numba_dppy/dpnp_glue/dpnp_transcendentalsimpl.py +++ b/numba_dppy/dpnp_glue/dpnp_transcendentalsimpl.py @@ -2,37 +2,82 @@ from numba import types from numba.core.typing import signature from . 
import stubs -import numba_dppy.experimental_numpy_lowering_overload as dpnp_lowering +import numba_dppy.dpnp_glue as dpnp_lowering from numba.core.extending import overload, register_jitable import numpy as np -class _DPNP_TRANSCENDENTALS_EXTENSION: - @classmethod - def dpnp_sum(cls, fn_name, type_names): - ret_type = types.void - sig = signature(ret_type, types.voidptr, types.voidptr, types.int64) - f_ptr = dpnp_ext.get_dpnp_fptr(fn_name, type_names) - def get_pointer(obj): - return f_ptr +@overload(stubs.dpnp.sum) +def dpnp_sum_impl(a): + name = "sum" + dpnp_lowering.ensure_dpnp(name) + dpctl_functions = dpnp_ext._DPCTL_FUNCTIONS() - return types.ExternalFunctionPointer(sig, get_pointer=get_pointer) + ret_type = types.void + """ + dpnp source: + https://github.com/IntelPython/dpnp/blob/0.4.0/dpnp/backend/custom_kernels_reduction.cpp#L39 + Function declaration: + void custom_sum_c(void* array1_in, void* result1, size_t size) -@overload(stubs.dpnp.sum) -def dpnp_sum_impl(a): - dpnp_lowering.ensure_dpnp("sum") - dpnp_extension = _DPNP_TRANSCENDENTALS_EXTENSION() + """ + sig = signature(ret_type, types.voidptr, types.voidptr, types.intp) + dpnp_func = dpnp_ext.dpnp_func("dpnp_"+name, [a.dtype.name, "NONE"], sig) + + get_sycl_queue = dpctl_functions.dpctl_get_current_queue() + allocate_usm_shared = dpctl_functions.dpctl_malloc_shared() + copy_usm = dpctl_functions.dpctl_queue_memcpy() + free_usm = dpctl_functions.dpctl_free_with_queue() + + def dpnp_impl(a): + if a.size == 0: + raise ValueError("Passed Empty array") + + sycl_queue = get_sycl_queue() + a_usm = allocate_usm_shared(a.size * a.itemsize, sycl_queue) + copy_usm(sycl_queue, a_usm, a.ctypes, a.size * a.itemsize) + + out_usm = allocate_usm_shared(a.itemsize, sycl_queue) + + dpnp_func(a_usm, out_usm, a.size) + + out = np.empty(1, dtype=a.dtype) + copy_usm(sycl_queue, out.ctypes, out_usm, out.size * out.itemsize) + + free_usm(a_usm, sycl_queue) + free_usm(out_usm, sycl_queue) + + dpnp_ext._dummy_liveness_func([out.size]) + + return out[0] + + return dpnp_impl + + +@overload(stubs.dpnp.prod) +def dpnp_prod_impl(a): + name = "prod" + dpnp_lowering.ensure_dpnp(name) dpctl_functions = dpnp_ext._DPCTL_FUNCTIONS() - dpnp_sum = dpnp_extension.dpnp_sum("dpnp_sum", [a.dtype.name, "NONE"]) + ret_type = types.void + """ + dpnp source: + https://github.com/IntelPython/dpnp/blob/0.4.0/dpnp/backend/custom_kernels_reduction.cpp#L83 + + Function declaration: + void custom_prod_c(void* array1_in, void* result1, size_t size) + """ + sig = signature(ret_type, types.voidptr, types.voidptr, types.intp) + dpnp_func = dpnp_ext.dpnp_func("dpnp_"+name, [a.dtype.name, "NONE"], sig) get_sycl_queue = dpctl_functions.dpctl_get_current_queue() allocate_usm_shared = dpctl_functions.dpctl_malloc_shared() copy_usm = dpctl_functions.dpctl_queue_memcpy() free_usm = dpctl_functions.dpctl_free_with_queue() - def dpnp_sum_impl(a): + def dpnp_impl(a): if a.size == 0: raise ValueError("Passed Empty array") @@ -42,7 +87,7 @@ def dpnp_sum_impl(a): out_usm = allocate_usm_shared(a.itemsize, sycl_queue) - dpnp_sum(a_usm, out_usm, a.size) + dpnp_func(a_usm, out_usm, a.size) out = np.empty(1, dtype=a.dtype) copy_usm(sycl_queue, out.ctypes, out_usm, out.size * out.itemsize) @@ -54,4 +99,4 @@ def dpnp_sum_impl(a): return out[0] - return dpnp_sum_impl + return dpnp_impl diff --git a/numba_dppy/dpnp_glue/dpnpdecl.py b/numba_dppy/dpnp_glue/dpnpdecl.py index e77739eeda..ce1f7d3583 100644 --- a/numba_dppy/dpnp_glue/dpnpdecl.py +++ b/numba_dppy/dpnp_glue/dpnpdecl.py @@ -1,6 
+1,7 @@ from numba.core.typing.templates import (AttributeTemplate, infer_getattr) import numba_dppy from numba import types +from numba.core.types.misc import RawPointer @infer_getattr class DppyDpnpTemplate(AttributeTemplate): @@ -8,3 +9,15 @@ class DppyDpnpTemplate(AttributeTemplate): def resolve_dpnp(self, mod): return types.Module(numba_dppy.dpnp) + +""" +This adds a shapeptr attribute to the Numba type representing np.ndarray. +It allows an overloaded implementation to get the raw pointer to the +structure where the shape of an ndarray is stored. +""" +@infer_getattr +class ArrayAttribute(AttributeTemplate): + key = types.Array + + def resolve_shapeptr(self, ary): + return types.voidptr diff --git a/numba_dppy/dpnp_glue/dpnpimpl.py b/numba_dppy/dpnp_glue/dpnpimpl.py index 97f6d0a7ac..fa429f923f 100644 --- a/numba_dppy/dpnp_glue/dpnpimpl.py +++ b/numba_dppy/dpnp_glue/dpnpimpl.py @@ -1,12 +1,12 @@ from numba.core.imputils import lower_builtin -import numba_dppy.experimental_numpy_lowering_overload as dpnp_lowering -from numba import types -from numba.core.typing import signature -from numba.core.extending import overload, register_jitable -from . import stubs +from numba.core import types +from numba.core.extending import register_jitable import numpy as np +from llvmlite import ir +from numba.core.imputils import lower_getattr from numba_dppy.dpctl_functions import _DPCTL_FUNCTIONS +ll_void_p = ir.IntType(8).as_pointer() def get_dpnp_fptr(fn_name, type_names): from . import dpnp_fptr_interface as dpnp_glue @@ -14,22 +14,34 @@ def get_dpnp_fptr(fn_name, type_names): f_ptr = dpnp_glue.get_dpnp_fn_ptr(fn_name, type_names) return f_ptr - @register_jitable def _check_finite_matrix(a): for v in np.nditer(a): if not np.isfinite(v.item()): raise np.linalg.LinAlgError("Array must not contain infs or NaNs.") - @register_jitable def _dummy_liveness_func(a): """pass a list of variables to be preserved through dead code elimination""" return a[0] +def dpnp_func(fn_name, type_names, sig): + f_ptr = get_dpnp_fptr(fn_name, type_names) + + def get_pointer(obj): + return f_ptr + + return types.ExternalFunctionPointer(sig, get_pointer=get_pointer) + +""" +This function retrieves the pointer to the structure where the shape +of an ndarray is stored. We cast it to void * to make it easier to +pass around.
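+
+Note: the getattr lowering below indexes member 5 of Numba's array
+struct because, in Numba's array data model, the members are laid out
+as (meminfo, parent, nitems, itemsize, data, shape, strides); index 5
+is therefore the shape tuple. This layout is a Numba implementation
+detail rather than a public API, so the hard-coded index would need
+revisiting if the data model ever changed.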
+""" +@lower_getattr(types.Array, "shapeptr") +def array_shape(context, builder, typ, value): + shape_ptr = builder.gep(value.operands[0], + [context.get_constant(types.int32, 0), + context.get_constant(types.int32, 5)]) -class RetrieveDpnpFnPtr(types.ExternalFunctionPointer): - def __init__(self, fn_name, type_names, sig, get_pointer): - self.fn_name = fn_name - self.type_names = type_names - super(RetrieveDpnpFnPtr, self).__init__(sig, get_pointer) + return builder.bitcast(shape_ptr, ll_void_p) diff --git a/numba_dppy/dpnp_glue/stubs.py b/numba_dppy/dpnp_glue/stubs.py index 041a30c074..2fdd6ecbe3 100644 --- a/numba_dppy/dpnp_glue/stubs.py +++ b/numba_dppy/dpnp_glue/stubs.py @@ -10,3 +10,42 @@ class sum(Stub): class eig(Stub): pass + + class prod(Stub): + pass + + class max(Stub): + pass + + class amax(Stub): + pass + + class min(Stub): + pass + + class amin(Stub): + pass + + class mean(Stub): + pass + + class median(Stub): + pass + + class argmax(Stub): + pass + + class argmin(Stub): + pass + + class argsort(Stub): + pass + + class cov(Stub): + pass + + class dot(Stub): + pass + + class matmul(Stub): + pass diff --git a/numba_dppy/dppy_passbuilder.py b/numba_dppy/dppy_passbuilder.py index da54ad4b3e..994351d509 100644 --- a/numba_dppy/dppy_passbuilder.py +++ b/numba_dppy/dppy_passbuilder.py @@ -22,8 +22,6 @@ DPPYPreParforPass, DPPYParforPass, SpirvFriendlyLowering, - DPPYAddNumpyOverloadPass, - DPPYAddNumpyRemoveOverloadPass, DPPYNoPythonBackend, DPPYDumpParforDiagnostics ) @@ -53,10 +51,6 @@ def default_numba_nopython_pipeline(state, pm): "Rewrite name of Numpy functions to overload already overloaded function", ) - # this pass adds required logic to overload default implementation of - # Numpy functions - pm.add_pass(DPPYAddNumpyOverloadPass, "dppy add typing template for Numpy functions") - # Add pass to ensure when users are allocating static # constant memory the size is a constant and can not # come from a closure variable @@ -101,7 +95,6 @@ def default_numba_nopython_pipeline(state, pm): pm.add_pass(InlineOverloads, "inline overloaded functions") - @staticmethod def define_nopython_pipeline(state, name='dppy_nopython'): """Returns an nopython mode pipeline based PassManager @@ -121,7 +114,6 @@ def define_nopython_pipeline(state, name='dppy_nopython'): # lower pm.add_pass(SpirvFriendlyLowering, "SPIRV-friendly lowering pass") pm.add_pass(DPPYNoPythonBackend, "nopython mode backend") - pm.add_pass(DPPYAddNumpyRemoveOverloadPass, "dppy remove typing template for Numpy functions") pm.add_pass(DPPYDumpParforDiagnostics, "dump parfor diagnostics") pm.finalize() return pm diff --git a/numba_dppy/dppy_passes.py b/numba_dppy/dppy_passes.py index 925b7dfd8a..be9423230b 100644 --- a/numba_dppy/dppy_passes.py +++ b/numba_dppy/dppy_passes.py @@ -32,117 +32,6 @@ from numba.parfors.parfor import ParforPass as _parfor_ParforPass from numba.parfors.parfor import Parfor -def dpnp_available(): - try: - # import dpnp - from numba_dppy.dpnp_glue import dpnp_fptr_interface as dpnp_glue - return True - except: - return False - - -@register_pass(mutates_CFG=False, analysis_only=True) -class DPPYAddNumpyOverloadPass(FunctionPass): - _name = "dppy_add_numpy_overload_pass" - - def __init__(self): - FunctionPass.__init__(self) - - def run_pass(self, state): - if dpnp_available(): - typingctx = state.typingctx - from numba.core.typing.templates import (builtin_registry as reg, infer_global) - from numba.core.typing.templates import (AbstractTemplate, CallableTemplate, signature) - from 
numba.core.typing.npydecl import MatMulTyperMixin - - @infer_global(np.cov) - class NPCov(AbstractTemplate): - def generic(self, args, kws): - assert not kws - if args[0].ndim > 2: - return - - nb_dtype = types.float64 - return_type = types.Array(dtype=nb_dtype, ndim=args[0].ndim, layout='C') - return signature(return_type, *args) - - @infer_global(np.matmul, typing_key="np.matmul") - class matmul(MatMulTyperMixin, AbstractTemplate): - key = np.matmul - func_name = "np.matmul()" - - def generic(self, args, kws): - assert not kws - restype = self.matmul_typer(*args) - if restype is not None: - return signature(restype, *args) - - @infer_global(np.median) - class NPMedian(AbstractTemplate): - def generic(self, args, kws): - assert not kws - - retty = args[0].dtype - return signature(retty, *args) - - @infer_global(np.mean) - #@infer_global("array.mean") - class NPMean(AbstractTemplate): - def generic(self, args, kws): - assert not kws - - if args[0].dtype == types.float32: - retty = types.float32 - else: - retty = types.float64 - return signature(retty, *args) - - - prev_cov = None - prev_median = None - prev_mean = None - for idx, g in enumerate(reg.globals): - if g[0] == np.cov: - if not prev_cov: - prev_cov = g[1] - else: - prev_cov.templates = g[1].templates - - if g[0] == np.median: - if not prev_median: - prev_median = g[1] - else: - prev_median.templates = g[1].templates - - if g[0] == np.mean: - if not prev_mean: - prev_mean = g[1] - else: - prev_mean.templates = g[1].templates - - typingctx.refresh() - return True - -@register_pass(mutates_CFG=False, analysis_only=True) -class DPPYAddNumpyRemoveOverloadPass(FunctionPass): - _name = "dppy_remove_numpy_overload_pass" - - def __init__(self): - FunctionPass.__init__(self) - - def run_pass(self, state): - if dpnp_available(): - typingctx = state.typingctx - targetctx = state.targetctx - - from importlib import reload - from numba.np import npyimpl, arrayobj, arraymath - reload(npyimpl) - reload(arrayobj) - reload(arraymath) - targetctx.refresh() - - return True @register_pass(mutates_CFG=True, analysis_only=False) class DPPYConstantSizeStaticLocalMemoryPass(FunctionPass): @@ -341,21 +230,8 @@ def run_pass(self, state): # be later serialized. state.library.enable_object_caching() - targetctx = state.targetctx - # This should not happen here, after we have the notion of context in Numba - # we should have specialized dispatcher for dppy context and that dispatcher - # should be a cpu dispatcher that will overload the lowering functions for - # linalg for dppy.cpu_dispatcher and the dppy.gpu_dipatcher should be the - # current target context we have to launch kernels. - # This is broken as this essentially adds the new lowering in a list which - # means it does not get replaced with the new lowering_buitins - - if dpnp_available(): - from . import experimental_numpy_lowering_overload - targetctx.refresh() - library = state.library interp = state.func_ir # why is it called this?! 
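# (likely a historical name: state.func_ir is produced by Numba's
# bytecode Interpreter and holds the Numba IR of the function)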
typemap = state.typemap diff --git a/numba_dppy/experimental_numpy_lowering_overload.py b/numba_dppy/experimental_numpy_lowering_overload.py deleted file mode 100644 index dd1e2a1eb6..0000000000 --- a/numba_dppy/experimental_numpy_lowering_overload.py +++ /dev/null @@ -1,723 +0,0 @@ -import numpy as np -from numba.core import types, cgutils -from numba.core.imputils import (lower_builtin) -from numba.core.typing import signature -from numba.np.arrayobj import make_array, _empty_nd_impl, array_copy -from numba.core import itanium_mangler -from llvmlite import ir -import llvmlite.llvmpy.core as lc -import contextlib - -from numba import int32, int64, uint32, uint64, float32, float64 - - -@contextlib.contextmanager -def make_contiguous(context, builder, sig, args): - """ - Ensure that all array arguments are contiguous, if necessary by - copying them. - A new (sig, args) tuple is yielded. - """ - newtys = [] - newargs = [] - copies = [] - for ty, val in zip(sig.args, args): - if not isinstance(ty, types.Array) or ty.layout in 'CF': - newty, newval = ty, val - else: - newty = ty.copy(layout='C') - copysig = signature(newty, ty) - newval = array_copy(context, builder, copysig, (val,)) - copies.append((newty, newval)) - newtys.append(newty) - newargs.append(newval) - yield signature(sig.return_type, *newtys), tuple(newargs) - for ty, val in copies: - context.nrt.decref(builder, ty, val) - -def check_c_int(context, builder, n): - """ - Check whether *n* fits in a C `int`. - """ - _maxint = 2**31 - 1 - - def impl(n): - if n > _maxint: - raise OverflowError("array size too large to fit in C int") - - context.compile_internal(builder, impl, - signature(types.none, types.intp), (n,)) - - -ll_char = ir.IntType(8) -ll_char_p = ll_char.as_pointer() -ll_void = ir.VoidType() -ll_void_p = ll_char_p -ll_intc = ir.IntType(32) -ll_intc_p = ll_intc.as_pointer() -intp_t = cgutils.intp_t -ll_intp_t = ir.IntType(64) -ll_intp_p = intp_t.as_pointer() - - -def ensure_dpnp(name): - try: - # import dpnp - from .dpnp_glue import dpnp_fptr_interface as dpnp_glue - except ImportError: - raise ImportError("dpNP is needed to call np.%s" % name) - -def get_total_size_of_array(context, builder, aty, ary): - total_size = cgutils.alloca_once(builder, ll_intp_t) - builder.store(builder.sext(builder.mul(ary.nitems, - context.get_constant(types.intp, context.get_abi_sizeof(context.get_value_type(aty)))), ll_intp_t), total_size) - return builder.load(total_size) - -def get_sycl_queue(context, builder): - void_ptr_t = context.get_value_type(types.voidptr) - get_queue_fnty = lc.Type.function(void_ptr_t, ()) - get_queue = builder.module.get_or_insert_function(get_queue_fnty, - name="DPCTLQueueMgr_GetCurrentQueue") - sycl_queue_val = cgutils.alloca_once(builder, void_ptr_t) - builder.store(builder.call(get_queue, []), sycl_queue_val) - - return sycl_queue_val - -def allocate_usm(context, builder, size, sycl_queue): - void_ptr_t = context.get_value_type(types.voidptr) - usm_shared_fnty = lc.Type.function(void_ptr_t, [ll_intp_t, void_ptr_t]) - usm_shared = builder.module.get_or_insert_function(usm_shared_fnty, - name="DPCTLmalloc_shared") - - buffer_ptr = cgutils.alloca_once(builder, void_ptr_t) - args = [size, builder.load(sycl_queue)] - builder.store(builder.call(usm_shared, args), buffer_ptr) - - return builder.load(buffer_ptr) - -def copy_usm(context, builder, src, dst, size, sycl_queue): - void_ptr_t = context.get_value_type(types.voidptr) - queue_memcpy_fnty = lc.Type.function(ir.VoidType(), [void_ptr_t, void_ptr_t, void_ptr_t, - 
ll_intp_t]) - queue_memcpy = builder.module.get_or_insert_function(queue_memcpy_fnty, - name="DPCTLQueue_Memcpy") - args = [builder.load(sycl_queue), - builder.bitcast(dst, void_ptr_t), - builder.bitcast(src, void_ptr_t), - size] - builder.call(queue_memcpy, args) - - -def free_usm(context, builder, usm_buf, sycl_queue): - void_ptr_t = context.get_value_type(types.voidptr) - - usm_free_fnty = lc.Type.function(ir.VoidType(), [void_ptr_t, void_ptr_t]) - usm_free = builder.module.get_or_insert_function(usm_free_fnty, - name="DPCTLfree_with_queue") - - builder.call(usm_free, [usm_buf, builder.load(sycl_queue)]) - - -def call_dpnp(context, builder, fn_name, type_names, params, param_tys, ret_ty): - from .dpnp_glue import dpnp_fptr_interface as dpnp_glue - f_ptr = dpnp_glue.get_dpnp_fn_ptr(fn_name, type_names) - - fnty = ir.FunctionType(ret_ty, param_tys) - addr_constant = context.get_constant(int64, f_ptr) - fn_ptr = builder.inttoptr(addr_constant, fnty.as_pointer()) - - res = builder.call(fn_ptr, params) - - -def dot_2_vv(context, builder, sig, args, conjugate=False): - """ - np.dot(vector, vector) - np.vdot(vector, vector) - """ - def check_args(a, b): - m, = a.shape - n, = b.shape - if m != n: - print("SIZES ", m, n) - raise ValueError("incompatible array sizes for np.dot(a, b) " - "(vector * vector)") - - context.compile_internal(builder, check_args, - signature(types.none, *sig.args), args) - - - sycl_queue = get_sycl_queue(context, builder) - aty, bty = sig.args - a = make_array(aty)(context, builder, args[0]) - b = make_array(bty)(context, builder, args[1]) - size, = cgutils.unpack_tuple(builder, a.shape) - - check_c_int(context, builder, size) - - total_size_a = get_total_size_of_array(context, builder, aty.dtype, a) - a_usm = allocate_usm(context, builder, total_size_a, sycl_queue) - copy_usm(context, builder, a.data, a_usm, total_size_a, sycl_queue) - - total_size_b = get_total_size_of_array(context, builder, bty.dtype, b) - b_usm = allocate_usm(context, builder, total_size_b, sycl_queue) - copy_usm(context, builder, b.data, b_usm, total_size_b, sycl_queue) - - out = cgutils.alloca_once(builder, context.get_value_type(sig.return_type)) - builder.store(context.get_constant(sig.return_type, 0), out) - out_usm = allocate_usm(context, builder, - context.get_constant(types.intp, context.get_abi_sizeof(context.get_value_type(sig.return_type))), sycl_queue) - - # arguments are : a->void*, b->void*, result->void*, size->int64 - param_tys = [ll_void_p, ll_void_p, ll_void_p, ir.IntType(64)] - params = (a_usm, b_usm, out_usm, size) - - type_names = [] - type_names.append(aty.dtype.name) - type_names.append(sig.return_type.name) - - call_dpnp(context, builder, "dpnp_dot", type_names, params, param_tys, ll_void) - - copy_usm(context, builder, out_usm, out, - context.get_constant(types.intp, context.get_abi_sizeof(context.get_value_type(sig.return_type))), sycl_queue) - - free_usm(context, builder, a_usm, sycl_queue) - free_usm(context, builder, out_usm, sycl_queue) - - return builder.load(out) - - -def dot_2_mv(context, builder, sig, args): - """ - np.dot(matrix, matrix) - """ - def make_res(a, b): - m, n = a.shape - _n, = b.shape - if _n != n: - raise ValueError("incompatible array sizes for np.dot(a, b)") - return np.empty((m, ), a.dtype) - - sycl_queue = get_sycl_queue(context, builder) - - aty, bty = sig.args - a = make_array(aty)(context, builder, args[0]) - b = make_array(bty)(context, builder, args[1]) - m, k = cgutils.unpack_tuple(builder, a.shape) - _n, = cgutils.unpack_tuple(builder, 
b.shape) - n = context.get_constant(types.int64, 1) - - total_size_a = get_total_size_of_array(context, builder, aty.dtype, a) - a_usm = allocate_usm(context, builder, total_size_a, sycl_queue) - copy_usm(context, builder, a.data, a_usm, total_size_a, sycl_queue) - - total_size_b = get_total_size_of_array(context, builder, bty.dtype, b) - b_usm = allocate_usm(context, builder, total_size_b, sycl_queue) - copy_usm(context, builder, b.data, b_usm, total_size_b, sycl_queue) - - out = context.compile_internal(builder, make_res, - signature(sig.return_type, *sig.args), args) - - outary = make_array(sig.return_type)(context, builder, out) - - total_size_out = get_total_size_of_array(context, builder, sig.return_type.dtype, outary) - out_usm = allocate_usm(context, builder, total_size_out, sycl_queue) - - # arguments are : a->void*, b->void*, result->void*, m->int64, n->int64, k->int64 - param_tys = [ll_void_p, ll_void_p, ll_void_p, ir.IntType(64), ir.IntType(64), ir.IntType(64)] - params = (a_usm, b_usm, out_usm, m, n, k) - - type_names = [] - type_names.append(aty.dtype.name) - type_names.append(sig.return_type.name) - - call_dpnp(context, builder, "dpnp_matmul", type_names, params, param_tys, ll_void) - - copy_usm(context, builder, out_usm, outary.data, total_size_out, sycl_queue) - - free_usm(context, builder, a_usm, sycl_queue) - free_usm(context, builder, b_usm, sycl_queue) - free_usm(context, builder, out_usm, sycl_queue) - return out - - -def dot_2_vm(context, builder, sig, args): - """ - np.dot(vector, matrix) - """ - def make_res(a, b): - m, = a.shape - _m, n = b.shape - if m != _m: - raise ValueError("incompatible array sizes for np.dot(a, b)") - return np.empty((n, ), a.dtype) - - sycl_queue = get_sycl_queue(context, builder) - aty, bty = sig.args - a = make_array(aty)(context, builder, args[0]) - b = make_array(bty)(context, builder, args[1]) - m, = cgutils.unpack_tuple(builder, a.shape) - k, n = cgutils.unpack_tuple(builder, b.shape) - - total_size_a = get_total_size_of_array(context, builder, aty.dtype, a) - a_usm = allocate_usm(context, builder, total_size_a, sycl_queue) - copy_usm(context, builder, a.data, a_usm, total_size_a, sycl_queue) - - total_size_b = get_total_size_of_array(context, builder, bty.dtype, b) - b_usm = allocate_usm(context, builder, total_size_b, sycl_queue) - copy_usm(context, builder, b.data, b_usm, total_size_b, sycl_queue) - - out = context.compile_internal(builder, make_res, - signature(sig.return_type, *sig.args), args) - - outary = make_array(sig.return_type)(context, builder, out) - - total_size_out = get_total_size_of_array(context, builder, sig.return_type.dtype, outary) - out_usm = allocate_usm(context, builder, total_size_out, sycl_queue) - - - # arguments are : a->void*, b->void*, result->void*, m->int64, n->int64, k->int64 - param_tys = [ll_void_p, ll_void_p, ll_void_p, ir.IntType(64), ir.IntType(64), ir.IntType(64)] - params = (a_usm, b_usm, out_usm, m, n, k) - - type_names = [] - type_names.append(aty.dtype.name) - type_names.append(sig.return_type.name) - - call_dpnp(context, builder, "dpnp_matmul", type_names, params, param_tys, ll_void) - - copy_usm(context, builder, out_usm, outary.data, total_size_out, sycl_queue) - - free_usm(context, builder, a_usm, sycl_queue) - free_usm(context, builder, b_usm, sycl_queue) - free_usm(context, builder, out_usm, sycl_queue) - return out - - -def dot_2_mm(context, builder, sig, args): - """ - np.dot(matrix, matrix), np.matmul(matrix, matrix) - """ - def make_res(a, b): - m, k = a.shape - _k, n = b.shape - 
if _k != k: - raise ValueError("incompatible array sizes for np.dot(a, b)") - return np.empty((m, n), a.dtype) - - sycl_queue = get_sycl_queue(context, builder) - aty, bty = sig.args - a = make_array(aty)(context, builder, args[0]) - b = make_array(bty)(context, builder, args[1]) - m, k = cgutils.unpack_tuple(builder, a.shape) - _k, n = cgutils.unpack_tuple(builder, b.shape) - - total_size_a = get_total_size_of_array(context, builder, aty.dtype, a) - a_usm = allocate_usm(context, builder, total_size_a, sycl_queue) - copy_usm(context, builder, a.data, a_usm, total_size_a, sycl_queue) - - total_size_b = get_total_size_of_array(context, builder, bty.dtype, b) - b_usm = allocate_usm(context, builder, total_size_b, sycl_queue) - copy_usm(context, builder, b.data, b_usm, total_size_b, sycl_queue) - - - out = context.compile_internal(builder, make_res, - signature(sig.return_type, *sig.args), args) - - outary = make_array(sig.return_type)(context, builder, out) - total_size_out = get_total_size_of_array(context, builder, sig.return_type.dtype, outary) - out_usm = allocate_usm(context, builder, total_size_out, sycl_queue) - - - # arguments are : a->void*, b->void*, result->void*, m->int64, n->int64, k->int64 - param_tys = [ll_void_p, ll_void_p, ll_void_p, ir.IntType(64), ir.IntType(64), ir.IntType(64)] - params = (a_usm, - b_usm, - out_usm, - m, n, k) - - type_names = [] - type_names.append(aty.dtype.name) - type_names.append(sig.return_type.name) - - call_dpnp(context, builder, "dpnp_matmul", type_names, params, param_tys, ll_void) - - copy_usm(context, builder, out_usm, outary.data, total_size_out, sycl_queue) - - free_usm(context, builder, a_usm, sycl_queue) - free_usm(context, builder, b_usm, sycl_queue) - free_usm(context, builder, out_usm, sycl_queue) - return out - - -@lower_builtin(np.dot, types.Array, types.Array) -def dot_dppy(context, builder, sig, args): - """ - np.dot(a, b) - a @ b - """ - - ensure_dpnp("dot") - - with make_contiguous(context, builder, sig, args) as (sig, args): - ndims = [x.ndim for x in sig.args[:2]] - if ndims == [2, 2]: - return dot_2_mm(context, builder, sig, args) - elif ndims == [2, 1]: - return dot_2_mv(context, builder, sig, args) - elif ndims == [1, 2]: - return dot_2_vm(context, builder, sig, args) - elif ndims == [1, 1]: - return dot_2_vv(context, builder, sig, args) - else: - assert 0 - raise ImportError("scipy 0.16+ is required for linear algebra") - - -@lower_builtin("np.matmul", types.Array, types.Array) -def matmul_dppy(context, builder, sig, args): - """ - np.matmul(matrix, matrix) - """ - ensure_dpnp("matmul") - with make_contiguous(context, builder, sig, args) as (sig, args): - ndims = [x.ndim for x in sig.args[:2]] - if ndims != [2, 2]: - raise ValueError("array dimension has to be 2 for np.matmul(a, b)") - - return dot_2_mm(context, builder, sig, args) - - -def common_sum_prod_impl(context, builder, sig, args, fn_type): - def array_size_checker(arry): - if arry.size == 0: - raise ValueError("Passed Empty array") - - context.compile_internal(builder, array_size_checker, - signature(types.none, *sig.args), args) - - sycl_queue = get_sycl_queue(context, builder) - - aty = sig.args[0] - a = make_array(aty)(context, builder, args[0]) - size = a.nitems - - total_size_a = get_total_size_of_array(context, builder, aty.dtype, a) - a_usm = allocate_usm(context, builder, total_size_a, sycl_queue) - copy_usm(context, builder, a.data, a_usm, total_size_a, sycl_queue) - - out = cgutils.alloca_once(builder, context.get_value_type(sig.return_type)) - 
builder.store(context.get_constant(sig.return_type, 0), out) - out_usm = allocate_usm(context, builder, - context.get_constant(types.intp, context.get_abi_sizeof(context.get_value_type(aty.dtype))), sycl_queue) - - # arguments are : a ->void*, result->void*, size->int64 - param_tys = [ll_void_p, ll_void_p, ir.IntType(64)] - params = (a_usm, out_usm, size) - - type_names = [] - type_names.append(aty.dtype.name) - type_names.append("NONE") - - call_dpnp(context, builder, fn_type, type_names, params, param_tys, ll_void) - - copy_usm(context, builder, out_usm, out, - context.get_constant(types.intp, context.get_abi_sizeof(context.get_value_type(aty.dtype))), sycl_queue) - - free_usm(context, builder, a_usm, sycl_queue) - free_usm(context, builder, out_usm, sycl_queue) - - return builder.load(out) - - - -@lower_builtin(np.sum, types.Array) -def array_sum(context, builder, sig, args): - ensure_dpnp("sum") - return common_sum_prod_impl(context, builder, sig, args, "dpnp_sum") - - -@lower_builtin(np.prod, types.Array) -def array_prod(context, builder, sig, args): - ensure_dpnp("prod") - - return common_sum_prod_impl(context, builder, sig, args, "dpnp_prod") - - -def common_max_min_impl(context, builder, sig, args, fn_type): - def array_size_checker(arry): - if arry.size == 0: - raise ValueError("Passed Empty array") - - context.compile_internal(builder, array_size_checker, - signature(types.none, *sig.args), args) - - sycl_queue = get_sycl_queue(context, builder) - - aty = sig.args[0] - a = make_array(aty)(context, builder, args[0]) - a_shape = builder.gep(args[0].operands[0], [context.get_constant(types.int32, 0), context.get_constant(types.int32, 5)]) - a_ndim = context.get_constant(types.intp, aty.ndim) - - total_size_a = get_total_size_of_array(context, builder, aty.dtype, a) - a_usm = allocate_usm(context, builder, total_size_a, sycl_queue) - copy_usm(context, builder, a.data, a_usm, total_size_a, sycl_queue) - - out = cgutils.alloca_once(builder, context.get_value_type(sig.return_type)) - builder.store(context.get_constant(sig.return_type, 0), out) - out_usm = allocate_usm(context, builder, - context.get_constant(types.intp, context.get_abi_sizeof(context.get_value_type(sig.return_type))), sycl_queue) - - # arguments are : a ->void*, result->void* - param_tys = [ll_void_p, ll_void_p, ll_intp_p, ir.IntType(64), ll_intp_p, ir.IntType(64)] - params = (a_usm, out_usm, builder.bitcast(a_shape, ll_intp_p), a_ndim, - builder.bitcast(a_shape, ll_intp_p), a_ndim) - - type_names = [] - type_names.append(aty.dtype.name) - if fn_type == "dpnp_mean": - type_names.append(aty.dtype.name) - else: - type_names.append(sig.return_type.name) - - call_dpnp(context, builder, fn_type, type_names, params, param_tys, ll_void) - - copy_usm(context, builder, out_usm, out, - context.get_constant(types.intp, context.get_abi_sizeof(context.get_value_type(sig.return_type))), sycl_queue) - - free_usm(context, builder, a_usm, sycl_queue) - free_usm(context, builder, out_usm, sycl_queue) - - return builder.load(out) - - -@lower_builtin(np.max, types.Array) -@lower_builtin("array.max", types.Array) -def array_max(context, builder, sig, args): - ensure_dpnp("max") - - return common_max_min_impl(context, builder, sig, args, "dpnp_max") - -@lower_builtin(np.min, types.Array) -@lower_builtin("array.min", types.Array) -def array_min(context, builder, sig, args): - ensure_dpnp("min") - - return common_max_min_impl(context, builder, sig, args, "dpnp_min") - -@lower_builtin(np.mean, types.Array) -@lower_builtin("array.mean", 
types.Array) -def array_mean(context, builder, sig, args): - ensure_dpnp("mean") - - return common_max_min_impl(context, builder, sig, args, "dpnp_mean") - -@lower_builtin(np.median, types.Array) -def array_median(context, builder, sig, args): - ensure_dpnp("median") - - return common_max_min_impl(context, builder, sig, args, "dpnp_median") - - -def common_argmax_argmin_impl(context, builder, sig, args, fn_type): - def array_size_checker(arry): - if arry.size == 0: - raise ValueError("Passed Empty array") - - context.compile_internal(builder, array_size_checker, - signature(types.none, *sig.args), args) - - sycl_queue = get_sycl_queue(context, builder) - - aty = sig.args[0] - a = make_array(aty)(context, builder, args[0]) - size = a.nitems - - total_size_a = get_total_size_of_array(context, builder, aty.dtype, a) - a_usm = allocate_usm(context, builder, total_size_a, sycl_queue) - copy_usm(context, builder, a.data, a_usm, total_size_a, sycl_queue) - - out = cgutils.alloca_once(builder, context.get_value_type(sig.return_type)) - builder.store(context.get_constant(sig.return_type, 0), out) - out_usm = allocate_usm(context, builder, - context.get_constant(types.intp, context.get_abi_sizeof(context.get_value_type(sig.return_type))), sycl_queue) - - # arguments are : a ->void*, result->void*, size->int64 - param_tys = [ll_void_p, ll_void_p, ir.IntType(64)] - params = (a_usm, out_usm, size) - - type_names = [] - type_names.append(aty.dtype.name) - type_names.append(sig.return_type.name) - - call_dpnp(context, builder, fn_type, type_names, params, param_tys, ll_void) - - copy_usm(context, builder, out_usm, out, - context.get_constant(types.intp, context.get_abi_sizeof(context.get_value_type(sig.return_type))), sycl_queue) - - free_usm(context, builder, a_usm, sycl_queue) - free_usm(context, builder, out_usm, sycl_queue) - - return builder.load(out) - - - -@lower_builtin(np.argmax, types.Array) -def array_argmax(context, builder, sig, args): - ensure_dpnp("argmax") - - return common_argmax_argmin_impl(context, builder, sig, args, "dpnp_argmax") - - -@lower_builtin(np.argmin, types.Array) -def array_argmin(context, builder, sig, args): - ensure_dpnp("argmin") - - return common_argmax_argmin_impl(context, builder, sig, args, "dpnp_argmin") - - -@lower_builtin(np.argsort, types.Array, types.StringLiteral) -def array_argsort(context, builder, sig, args): - ensure_dpnp("argsort") - - def make_res(A): - return np.arange(A.size) - - def array_dim_checker(arry): - if arry.ndim > 1: - raise ValueError("Argsort is only supported for 1D array") - - context.compile_internal(builder, array_dim_checker, - signature(types.none, *sig.args[:1]), args[:1]) - - sycl_queue = get_sycl_queue(context, builder) - aty = sig.args[0] - a = make_array(aty)(context, builder, args[0]) - size, = cgutils.unpack_tuple(builder, a.shape) - - total_size_a = get_total_size_of_array(context, builder, aty.dtype, a) - a_usm = allocate_usm(context, builder, total_size_a, sycl_queue) - copy_usm(context, builder, a.data, a_usm, total_size_a, sycl_queue) - - out = context.compile_internal(builder, make_res, - signature(sig.return_type, *sig.args[:1]), args[:1]) - outary = make_array(sig.return_type)(context, builder, out) - - total_size_out = get_total_size_of_array(context, builder, sig.return_type.dtype, outary) - out_usm = allocate_usm(context, builder, total_size_out, sycl_queue) - - - # arguments are : a ->void*, result->void*, size->int64 - param_tys = [ll_void_p, ll_void_p, ir.IntType(64)] - params = (a_usm, out_usm, size) - - 
type_names = [] - for argty in sig.args[:1]: - type_names.append(argty.dtype.name) - type_names.append(sig.return_type.name) - - call_dpnp(context, builder, "dpnp_argsort", type_names, params, param_tys, ll_void) - - copy_usm(context, builder, out_usm, outary.data, total_size_out, sycl_queue) - - free_usm(context, builder, a_usm, sycl_queue) - free_usm(context, builder, out_usm, sycl_queue) - - return out - - -@lower_builtin(np.cov, types.Array) -def array_cov(context, builder, sig, args): - ensure_dpnp("cov") - def make_1D_res(size): - return np.empty(1, dtype=np.float64) - - def make_2D_res(size): - return np.empty((size, size), dtype=np.float64) - - sycl_queue = get_sycl_queue(context, builder) - aty = sig.args[0] - aty = sig.args[0] - a = make_array(aty)(context, builder, args[0]) - - total_size_a = get_total_size_of_array(context, builder, aty.dtype, a) - a_usm = allocate_usm(context, builder, total_size_a, sycl_queue) - copy_usm(context, builder, a.data, a_usm, total_size_a, sycl_queue) - - if aty.ndim == 2: - m, n = cgutils.unpack_tuple(builder, a.shape) - out = context.compile_internal(builder, make_2D_res, - signature(sig.return_type, types.int64), (m,)) - elif aty.ndim == 1: - m, = cgutils.unpack_tuple(builder, a.shape) - out = context.compile_internal(builder, make_1D_res, - signature(sig.return_type, types.int64), (m,)) - else: - #TODO: Throw error, cov is supported for only 1D and 2D array - pass - - outary = make_array(sig.return_type)(context, builder, out) - - total_size_out = get_total_size_of_array(context, builder, sig.return_type.dtype, outary) - out_usm = allocate_usm(context, builder, total_size_out, sycl_queue) - - nrows = cgutils.alloca_once(builder, context.get_value_type(types.int64)) - ncols = cgutils.alloca_once(builder, context.get_value_type(types.int64)) - - if aty.ndim == 2: - builder.store(m, nrows) - builder.store(n, ncols) - - elif aty.ndim == 1: - builder.store(context.get_constant(types.int64, 1), nrows) - builder.store(m, ncols) - - - # arguments are : a ->void*, result->void*, nrows->int64, ncols->int64 - param_tys = [ll_void_p, ll_void_p, ir.IntType(64), ir.IntType(64)] - params = (a_usm, out_usm, builder.load(nrows), builder.load(ncols)) - - type_names = [] - type_names.append(aty.dtype.name) - type_names.append("NONE") - - - call_dpnp(context, builder, "dpnp_cov", type_names, params, param_tys, ll_void) - - copy_usm(context, builder, out_usm, outary.data, total_size_out, sycl_queue) - - free_usm(context, builder, a_usm, sycl_queue) - free_usm(context, builder, out_usm, sycl_queue) - - return out - - -''' -@lower_builtin(np.linalg.eig, types.Array) -def array_cov(context, builder, sig, args): - pass - -@lower_builtin("np.random.sample") -def random_impl(context, builder, sig, args): - - def make_res(shape): - return np.empty(shape, dtype=np.float64) - - import pdb - pdb.set_trace() - out = context.compile_internal(builder, make_res, - signature(sig.return_type, *sig.args), args) - - outary = make_array(sig.return_type)(context, builder, out) - - # arguments are : result->void*, size->int64 - param_tys = [ll_void_p, ll_intp_p] - params = (builder.bitcast(outary.data, ll_void_p), ) - - - type_names = [] - for argty in sig.args[:1]: - type_names.append(argty.dtype.name.encode('utf-8')) - type_names.append(sig.return_type.name.encode('utf-8')) - - call_dpnp(context, builder, b"dpnp_cov", type_names, params, param_tys, ll_void) -''' diff --git a/numba_dppy/rename_numpy_functions_pass.py b/numba_dppy/rename_numpy_functions_pass.py index 
bf8af9b661..c1d58ce036 100644 --- a/numba_dppy/rename_numpy_functions_pass.py +++ b/numba_dppy/rename_numpy_functions_pass.py @@ -9,7 +9,21 @@ import numba_dppy from numba.core import types -rewrite_function_name_map = {"sum": (["np"], "sum"), "eig": (["linalg"], "eig")} +rewrite_function_name_map = {"sum": (["np"], "sum"), + "eig": (["linalg"], "eig"), + "prod": (["np"], "prod"), + "max": (["np"], "max"), + "amax": (["np"], "amax"), + "min": (["np"], "min"), + "amin": (["np"], "amin"), + "mean": (["np"], "mean"), + "median": (["np"], "median"), + "argmax": (["np"], "argmax"), + "argmin": (["np"], "argmin"), + "argsort": (["np"], "argsort"), + "cov": (["np"], "cov"), + "dot": (["np"], "dot"), + "matmul": (["np"], "matmul")} class RewriteNumPyOverloadedFunctions(object): @@ -110,9 +124,13 @@ class DPPYRewriteOverloadedNumPyFunctions(FunctionPass): def __init__(self): FunctionPass.__init__(self) + import numba_dppy.dpnp_glue.dpnpdecl + import numba_dppy.dpnp_glue.dpnpimpl import numba_dppy.dpnp_glue.dpnp_linalgimpl import numba_dppy.dpnp_glue.dpnp_transcendentalsimpl + import numba_dppy.dpnp_glue.dpnp_statisticsimpl + import numba_dppy.dpnp_glue.dpnp_sort_search_countimpl def run_pass(self, state): rewrite_function_name_pass = RewriteNumPyOverloadedFunctions( @@ -211,7 +229,6 @@ def run(self): return - @register_pass(mutates_CFG=True, analysis_only=False) class DPPYRewriteNdarrayFunctions(FunctionPass): _name = "dppy_rewrite_ndarray_functions_pass" @@ -230,5 +247,3 @@ def run_pass(self, state): state.func_ir.blocks = simplify_CFG(state.func_ir.blocks) return True - - diff --git a/numba_dppy/tests/test_dpnp_functions.py b/numba_dppy/tests/test_dpnp_functions.py index 6b33a25dfe..166937c275 100644 --- a/numba_dppy/tests/test_dpnp_functions.py +++ b/numba_dppy/tests/test_dpnp_functions.py @@ -150,6 +150,113 @@ def f(a): self.assertTrue(expected == got) + def test_ndarray_prod(self): + @njit + def f(a): + return a.prod() + + size = 3 + for ty in self.tys: + a = np.arange(1, (size * size) + 1, dtype=ty).reshape((size, size)) + + with dpctl.device_context("opencl:gpu"): + got = f(a) + expected = a.prod() + + self.assertTrue(expected == got) + + def test_ndarray_max(self): + @njit + def f(a): + return a.max() + + size = 3 + for ty in self.tys: + a = np.arange(1, (size * size) + 1, dtype=ty).reshape((size, size)) + + with dpctl.device_context("opencl:gpu"): + got = f(a) + expected = a.max() + + self.assertTrue(expected == got) + + def test_ndarray_min(self): + @njit + def f(a): + return a.min() + + size = 3 + for ty in self.tys: + a = np.arange(1, (size * size) + 1, dtype=ty).reshape((size, size)) + + with dpctl.device_context("opencl:gpu"): + got = f(a) + expected = a.min() + + self.assertTrue(expected == got) + + def test_ndarray_mean(self): + @njit + def f(a): + return a.mean() + + size = 3 + for ty in self.tys: + a = np.arange(1, (size * size) + 1, dtype=ty).reshape((size, size)) + + with dpctl.device_context("opencl:gpu"): + got = f(a) + expected = a.mean() + + self.assertTrue(expected == got) + + def test_ndarray_argmax(self): + @njit + def f(a): + return a.argmax() + + size = 3 + for ty in self.tys: + a = np.arange(1, (size * size) + 1, dtype=ty).reshape((size, size)) + + with dpctl.device_context("opencl:gpu"): + got = f(a) + expected = a.argmax() + + self.assertTrue(expected == got) + + + def test_ndarray_argmin(self): + @njit + def f(a): + return a.argmin() + + size = 3 + for ty in self.tys: + a = np.arange(1, (size * size) + 1, dtype=ty).reshape((size, size)) + + with 
dpctl.device_context("opencl:gpu"): + got = f(a) + expected = a.argmin() + + self.assertTrue(expected == got) + + def test_ndarray_argsort(self): + @njit + def f(a): + return a.argsort() + + size = 3 + for ty in self.tys: + a = np.arange(1, (size * size) + 1, dtype=ty) + + with dpctl.device_context("opencl:gpu"): + got = f(a) + expected = a.argsort() + + self.assertTrue(np.array_equal(expected, got)) + + @unittest.skipUnless(ensure_dpnp() and dpctl.has_gpu_queues(), 'test only when dpNP and GPU is available') class Testdpnp_functions(unittest.TestCase): N = 10 @@ -203,6 +310,18 @@ def f(a): self.assertTrue(test_for_dimensions(f, np.max, [10, 2], self.tys)) self.assertTrue(test_for_dimensions(f, np.max, [10, 2, 3], self.tys)) + def test_amax(self): + @njit + def f(a): + c = np.amax(a) + return c + + self.assertTrue(test_for_different_datatypes( + f, np.amax, [10], 1, self.tys)) + self.assertTrue(test_for_dimensions(f, np.amax, [10, 2], self.tys)) + self.assertTrue(test_for_dimensions(f, np.amax, [10, 2, 3], self.tys)) + + def test_argmin(self): @njit def f(a): @@ -226,6 +345,17 @@ def f(a): self.assertTrue(test_for_dimensions(f, np.min, [10, 2], self.tys)) self.assertTrue(test_for_dimensions(f, np.min, [10, 2, 3], self.tys)) + def test_amin(self): + @njit + def f(a): + c = np.amin(a) + return c + + self.assertTrue(test_for_different_datatypes( + f, np.min, [10], 1, self.tys)) + self.assertTrue(test_for_dimensions(f, np.min, [10, 2], self.tys)) + self.assertTrue(test_for_dimensions(f, np.min, [10, 2, 3], self.tys)) + def test_argsort(self): @njit def f(a): diff --git a/numba_dppy/tests/test_dppy_fallback.py b/numba_dppy/tests/test_dppy_fallback.py index 3a7e668f02..dd05bbdc84 100644 --- a/numba_dppy/tests/test_dppy_fallback.py +++ b/numba_dppy/tests/test_dppy_fallback.py @@ -34,7 +34,10 @@ def inner_call_fallback(): def test_dppy_fallback_reductions(self): def reduction(a): - return np.amax(a) + b = 1 + for i in numba.prange(len(a)): + b += a[i] + return b a = np.ones(10) with captured_stderr() as msg, dpctl.device_context("opencl:gpu"): From 2f3451f7409561d3c59b914fc1e293e0866d27c3 Mon Sep 17 00:00:00 2001 From: Sergey Pokhodenko Date: Wed, 16 Dec 2020 21:52:35 +0300 Subject: [PATCH 29/32] Add scipy to dependency to README.md (#124) --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 180ac952e7..d372a65c9e 100644 --- a/README.md +++ b/README.md @@ -25,6 +25,7 @@ https://intelpython.github.io/dpnp/ * llvm-spirv (SPIRV generation from LLVM IR) * llvmdev (LLVM IR generation) * spirv-tools +* scipy (for testing) ## dpPy From 11c4dbbb0b45deb2e22114ececd94594f0e15b9b Mon Sep 17 00:00:00 2001 From: Elena Totmenina Date: Thu, 17 Dec 2020 00:58:31 +0300 Subject: [PATCH 30/32] Added docs for Offload Diagnostics and Controllable Fallback (#113) Co-authored-by: etotmeni --- README.md | 4 ++++ docs/INDEX.md | 18 ++++++++++++++++++ 2 files changed, 22 insertions(+) create mode 100644 docs/INDEX.md diff --git a/README.md b/README.md index d372a65c9e..9d1969fe3f 100644 --- a/README.md +++ b/README.md @@ -68,3 +68,7 @@ Please follow instructions in the [DEBUGGING.md](DEBUGGING.md) ## Reporting issues Please use https://github.com/IntelPython/numba-dppy/issues to report issues and bugs. 
+ +## Features + +For a guide to additional features, read [INDEX.md](docs/INDEX.md). diff --git a/docs/INDEX.md b/docs/INDEX.md new file mode 100644 index 0000000000..2c8b990c6c --- /dev/null +++ b/docs/INDEX.md @@ -0,0 +1,18 @@ +# numba-dppy + +Below is the functionality implemented in numba-dppy, with detailed descriptions of some of the features. + +## Offload Diagnostics + +Setting the debug environment variable `NUMBA_DPPY_OFFLOAD_DIAGNOSTICS` +(e.g. `export NUMBA_DPPY_OFFLOAD_DIAGNOSTICS=1`) enables parallel and offload diagnostics information. + +If set to an integer value between 1 and 4 (inclusive), diagnostic information about parallel transforms undertaken by Numba is written to STDOUT; the higher the value, the more detailed the information produced. +The "Auto-offloading" section of the output reports the device (by name) to which each parfor or kernel was offloaded. + +## Controllable Fallback + +By default, if a section of code cannot be offloaded to the GPU, numba-dppy automatically executes it on the CPU and prints a warning. This behavior only applies to njit functions and auto-offloading of numpy functions, array expressions, and prange loops. + +Setting the debug environment variable `NUMBA_DPPY_FALLBACK_OPTION` +(e.g. `export NUMBA_DPPY_FALLBACK_OPTION=0`) disables the automatic fallback to the CPU and raises an error instead. This makes it possible to find out at an early stage which parts of the code do not work on the GPU, rather than waiting for the program to finish executing on the CPU when that is not wanted. From d8345cf0509a6e29cddfcd3f0b7259ffbe8329c6 Mon Sep 17 00:00:00 2001 From: vlad-perevezentsev <68376232+vlad-perevezentsev@users.noreply.github.com> Date: Thu, 17 Dec 2020 02:52:01 +0300 Subject: [PATCH 31/32] update CHANGELOG (#126) Co-authored-by: Sergey Pokhodenko Co-authored-by: Diptorup Deb <3046810+diptorupd@users.noreply.github.com> --- CHANGELOG.md | 38 ++++++++++++++++++++++++++++++++++++++ numba_dppy/CHANGE_LOG | 7 ------- 2 files changed, 38 insertions(+), 7 deletions(-) create mode 100644 CHANGELOG.md delete mode 100644 numba_dppy/CHANGE_LOG diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000000..fa7e9350f5 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,38 @@ +# Changelog +All notable changes to this project will be documented in this file. + +The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), +and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). + +## [Unreleased] + +## [0.12.0] - 2020-12-17 ### Added +- numba-dppy is now a standalone package. Added setup.py and conda recipe. +- Offload diagnostics. +- Controllable fallback. +- Add flags to generate debug symbols. +- Implementation of `np.linalg.eig`, `np.ndarray.sum`, `np.ndarray.max`, `np.ndarray.min`, `np.ndarray.mean`. +- Two new re-write passes to convert NumPy calls into a pseudo `numba_dppy` call site to allow target-specific + overload of NumPy functions. The rewrite passes are a temporary fix until Numba gains support for target-specific overloads. +- Updated to dpCtl 0.5.* and dpNP 0.4.* + ### Changed +- The `dpnp` interface now uses Numba's `@overload` functionality as opposed to the previous `@lower_builtin` method. +- Rename `DPPL` to `DPPY`. +- Cleaned up test code. +- `DPPLTestCase` replaced with `unittest.TestCase`. +- All tests and examples use `with device_context`. +- Config environment variables start with `NUMBA_DPPY_` +(i.e.
NUMBA_DPPY_SAVE_IR_FILES and NUMBA_DPPY_SPIRV_VAL) +- Remove nested folder `dppl` in `tests`. +- No dependency on `cffi`. + +### Removed +- The old backup file. + +## NUMBA Version 0.48.0 + DPPY Version 0.3.0 (June 29, 2020) + +This release includes: +- Caching of dppy.kernels which will improve performance. +- Addition of support for Intel Advisor which will help in profiling applications. diff --git a/numba_dppy/CHANGE_LOG b/numba_dppy/CHANGE_LOG deleted file mode 100644 index 2a1fcdee40..0000000000 --- a/numba_dppy/CHANGE_LOG +++ /dev/null @@ -1,7 +0,0 @@ -NUMBA Version 0.48.0 + DPPY Version 0.3.0 (June 29, 2020) --------------------------------------------------------- - -This release includes: - -* Caching of dppy.kernels which will improve performance. -* Addition of support for Intel Advisor which will help in profiling applications. From c0a17bc98afc83dd2cf2448433cce10f56aae820 Mon Sep 17 00:00:00 2001 From: Reazul Hoque Date: Thu, 17 Dec 2020 03:09:55 -0600 Subject: [PATCH 32/32] Skip some tests on Gen12 (#120) Co-authored-by: reazul.hoque Co-authored-by: Sergey Pokhodenko --- numba_dppy/tests/skip_tests.py | 11 +++++++++++ numba_dppy/tests/test_numpy_math_functions.py | 6 +++++- .../tests/test_numpy_trigonomteric_functions.py | 2 ++ 3 files changed, 18 insertions(+), 1 deletion(-) create mode 100644 numba_dppy/tests/skip_tests.py diff --git a/numba_dppy/tests/skip_tests.py b/numba_dppy/tests/skip_tests.py new file mode 100644 index 0000000000..fa18d36181 --- /dev/null +++ b/numba_dppy/tests/skip_tests.py @@ -0,0 +1,11 @@ +import dpctl + +def is_gen12(device_type): + with dpctl.device_context(device_type): + q = dpctl.get_current_queue() + device = q.get_sycl_device() + name = device.get_device_name() + if "Gen12" in name: + return True + + return False diff --git a/numba_dppy/tests/test_numpy_math_functions.py b/numba_dppy/tests/test_numpy_math_functions.py index cf5174ac96..ef5dc235b8 100644 --- a/numba_dppy/tests/test_numpy_math_functions.py +++ b/numba_dppy/tests/test_numpy_math_functions.py @@ -3,7 +3,7 @@ from numba import njit import dpctl import unittest - +from . import skip_tests @unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system') class TestNumpy_math_functions(unittest.TestCase): @@ -179,6 +179,7 @@ def f(a): self.assertTrue(np.all(c == -input_arr)) + @unittest.skipIf(skip_tests.is_gen12("opencl:gpu"), "Gen12 not supported") def test_sign(self): @njit def f(a): @@ -221,6 +222,7 @@ def f(a): max_abs_err = c.sum() - d.sum() self.assertTrue(max_abs_err < 1e-5) + @unittest.skipIf(skip_tests.is_gen12("opencl:gpu"), "Gen12 not supported") def test_log(self): @njit def f(a): @@ -236,6 +238,7 @@ def f(a): max_abs_err = c.sum() - d.sum() self.assertTrue(max_abs_err < 1e-5) + @unittest.skipIf(skip_tests.is_gen12("opencl:gpu"), "Gen12 not supported") def test_log10(self): @njit def f(a): @@ -251,6 +254,7 @@ def f(a): max_abs_err = c.sum() - d.sum() self.assertTrue(max_abs_err < 1e-5) + @unittest.skipIf(skip_tests.is_gen12("opencl:gpu"), "Gen12 not supported") def test_expm1(self): @njit def f(a): diff --git a/numba_dppy/tests/test_numpy_trigonomteric_functions.py b/numba_dppy/tests/test_numpy_trigonomteric_functions.py index 7c0a95d323..812f3d060c 100644 --- a/numba_dppy/tests/test_numpy_trigonomteric_functions.py +++ b/numba_dppy/tests/test_numpy_trigonomteric_functions.py @@ -3,6 +3,7 @@ from numba import njit import dpctl import unittest +from . 
import skip_tests @unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system') @@ -155,6 +156,7 @@ def f(a): max_abs_err = c.sum() - d.sum() self.assertTrue(max_abs_err < 1e-5) + @unittest.skipIf(skip_tests.is_gen12("opencl:gpu"), "Gen12 not supported") def test_arccosh(self): @njit def f(a):