diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 0000000000..3123046333
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1 @@
+numba_dppy/_version.py export-subst
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000000..340ae2678b
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,27 @@
+*.pyc
+*.o
+*.so
+*.dylib
+*.pyd
+*.pdb
+*.egg-info
+*.sw[po]
+*.out
+*.ll
+.coverage
+.nfs*
+tags
+MANIFEST
+
+build/
+docs/_build/
+docs/gh-pages/
+dist/
+htmlcov/
+.idea/
+.vscode/
+.mypy_cache/
+.ipynb_checkpoints/
+__pycache__/
+
+docs/source/developer/autogen*
diff --git a/CHANGELOG.md b/CHANGELOG.md
new file mode 100644
index 0000000000..fa7e9350f5
--- /dev/null
+++ b/CHANGELOG.md
@@ -0,0 +1,38 @@
+# Changelog
+All notable changes to this project will be documented in this file.
+
+The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
+and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
+
+## [Unreleased]
+
+## [0.12.0] - 2020-12-17
+### Added
+- numba-dppy is now a standalone package. Added setup.py and a conda recipe.
+- Offload diagnostics.
+- Controllable fallback.
+- Flags to generate debug symbols.
+- Implementation of `np.linalg.eig`, `np.ndarray.sum`, `np.ndarray.max`, `np.ndarray.min`, `np.ndarray.mean`.
+- Two new rewrite passes that convert NumPy calls into pseudo `numba_dppy` call sites to allow target-specific
+  overloads of NumPy functions. The rewrite passes are a temporary fix until Numba gains support for target-specific overloads.
+- Updated to dpCtl 0.5.* and dpNP 0.4.*.
+
+### Changed
+- The `dpnp` interface now uses Numba's `@overload` functionality as opposed to the previous `@lower_builtin` method.
+- Renamed `DPPL` to `DPPY`.
+- Cleaned up test code.
+- `DPPLTestCase` replaced with `unittest.TestCase`.
+- All tests and examples use `with device_context`.
+- Config environment variables now start with `NUMBA_DPPY_`
+  (e.g. NUMBA_DPPY_SAVE_IR_FILES and NUMBA_DPPY_SPIRV_VAL).
+- Removed the nested folder `dppl` in `tests`.
+- No dependency on `cffi`.
+
+### Removed
+- The old backup file.
+
+## NUMBA Version 0.48.0 + DPPY Version 0.3.0 (June 29, 2020)
+
+This release includes:
+- Caching of dppy.kernels, which improves performance.
+- Support for Intel Advisor, which helps in profiling applications.
diff --git a/DEBUGGING.md b/DEBUGGING.md
new file mode 100644
index 0000000000..6199d0f431
--- /dev/null
+++ b/DEBUGGING.md
@@ -0,0 +1,31 @@
+## Debugging with GDB
+
+Setting the debug environment variable `NUMBA_DPPY_DEBUG` (e.g. `export NUMBA_DPPY_DEBUG=True`) enables
+the emission of debug info to the LLVM and SPIR-V IR.
+To disable debugging, unset the variable (e.g. `export NUMBA_DPPY_DEBUG=`).
+Currently, the following debug functionality is available:
+- Source location (filename and line number).
+- Setting breakpoints by line number.
+- Stepping over breakpoints.
+
+### Requirements
+
+Intel® Distribution for GDB must be installed on the system.
+Documentation for this debugger can be found in the
+[Intel® Distribution for GDB documentation](https://software.intel.com/content/www/us/en/develop/tools/oneapi/components/distribution-for-gdb.html).
+
+### Example debug usage
+
+```bash
+$ export NUMBA_DPPY_DEBUG=True
+$ gdb-oneapi -q python
+(gdb) break numba_dppy/examples/sum.py:14 # Assumes the kernel is in file sum.py, at line 14
+(gdb) run sum.py
+```
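+
+For the session above, a kernel script of roughly the following shape is assumed (a minimal sketch only:
+the actual numba_dppy/examples/sum.py may differ, the line you break on must match your file, and the
+device selector string follows dpctl's `device_context` convention):
+
+```python
+import numpy as np
+import numba_dppy as dppy
+import dpctl
+
+
+@dppy.kernel
+def data_parallel_sum(a, b, c):
+    i = dppy.get_global_id(0)  # a breakpoint here stops once per work item
+    c[i] = a[i] + b[i]
+
+
+a = np.ones(1024, dtype=np.float32)
+b = np.ones(1024, dtype=np.float32)
+c = np.zeros_like(a)
+
+with dpctl.device_context("opencl:gpu"):
+    data_parallel_sum[len(a), dppy.DEFAULT_LOCAL_SIZE](a, b, c)
+```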
+
+### Limitations
+
+Currently, Numba-dppy provides only initial support for debugging GPU kernels.
+The following functionality is **not supported**:
+- Printing kernel local variables (e.g. ```info locals```).
+- Stepping over several off-loaded functions.
diff --git a/HowTo.rst b/HowTo.rst
index 03927c0ea7..7689bc52bf 100644
--- a/HowTo.rst
+++ b/HowTo.rst
@@ -7,7 +7,7 @@ are listed below with the help of sample code snippets. In this release we have
 the implementation of the OAK approach described in MS138 in section 4.3.2. The
 new decorator is described below.
 
-To access the features driver module have to be imported from numba_dppy.dppl_driver
+To access these features, the driver module has to be imported from numba_dppy.dppy_driver
 
 New Decorator
 =============
@@ -61,7 +61,7 @@ Primitive types are passed by value to the kernel, currently supported are int,
 Math Kernels
 ============
 
-This release has support for math kernels. See numba_dppy/tests/dppl/test_math_functions.py
+This release has support for math kernels. See numba_dppy/tests/dppy/test_math_functions.py
 for more details.
 
@@ -170,6 +170,6 @@ Testing
 
 All examples can be found in numba_dppy/examples/
 
-All tests can be found in numba_dppy/tests/dppl and can be triggered by the following command:
+All tests can be found in numba_dppy/tests/dppy and can be triggered by the following command:
 
 ``python -m numba.runtests numba_dppy.tests``
diff --git a/MANIFEST.in b/MANIFEST.in
index e9635a5f83..74d44bdc67 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -3,3 +3,5 @@ include README.md setup.py LICENSE
 recursive-include numba_dppy *.cl
 
+include versioneer.py
+include numba_dppy/_version.py
diff --git a/README.md b/README.md
index 255f9085df..9d1969fe3f 100644
--- a/README.md
+++ b/README.md
@@ -20,11 +20,12 @@ https://intelpython.github.io/dpnp/
 ## Dependencies
 
 * numba >=0.51 (IntelPython/numba)
-* dpCtl >=0.3.8
-* dpNP >=0.3 (optional)
+* dpCtl 0.5.*
+* dpNP 0.4.* (optional)
 * llvm-spirv (SPIRV generation from LLVM IR)
 * llvmdev (LLVM IR generation)
 * spirv-tools
+* scipy (for testing)
 
 ## dpPy
 
@@ -43,7 +44,7 @@ See folder `numba_dppy/tests`.
 
 Run tests:
 
 ```bash
-python -m numba.runtests numba_dppy.tests
+python -m unittest numba_dppy.tests
 ```
 
 ## Examples
@@ -57,9 +58,17 @@ python numba_dppy/examples/sum.py
 
 ## How Tos
 
-Refer the HowTo.rst guide for an overview of the programming semantics,
+Refer to the [HowTo.rst](HowTo.rst) guide for an overview of the programming semantics,
 examples, supported functionalities, and known issues.
 
+## Debugging
+
+Please follow the instructions in [DEBUGGING.md](DEBUGGING.md).
+
 ## Reporting issues
 
 Please use https://github.com/IntelPython/numba-dppy/issues to report issues and bugs.
+
+## Features
+
+Read [INDEX.md](docs/INDEX.md) for a guide to additional features.
diff --git a/conda-recipe/meta.yaml b/conda-recipe/meta.yaml
index 920f79cfe7..5e5b61a25c 100644
--- a/conda-recipe/meta.yaml
+++ b/conda-recipe/meta.yaml
@@ -17,17 +17,21 @@ requirements:
     - python
     - setuptools
    - cython
-    - llvm-spirv
     - numba
-    - dpctl
-    - dpnp
+    - dpctl 0.5.*
+    - dpnp 0.4.* # [linux]
   run:
     - python
     - numba >=0.51
-    - dpctl
+    - dpctl 0.5.*
     - spirv-tools
     - llvm-spirv
-    - dpnp
+    - llvmdev
+    - dpnp 0.4.* # [linux]
+
+test:
+  requires:
+    - scipy # [linux]
 
 about:
   home: https://github.com/IntelPython/numba-dppy
diff --git a/conda-recipe/run_test.bat b/conda-recipe/run_test.bat
index 17e8140115..031bc6e69a 100644
--- a/conda-recipe/run_test.bat
+++ b/conda-recipe/run_test.bat
@@ -1,3 +1,8 @@
+REM For activating OpenCL CPU
+call "%ONEAPI_ROOT%\compiler\latest\env\vars.bat"
+
+@echo on
+
 python -m numba.runtests -b -v -m -- numba_dppy.tests
 
 IF %ERRORLEVEL% NEQ 0 exit /B 1
diff --git a/conda-recipe/run_test.sh b/conda-recipe/run_test.sh
index 27b1d4722f..8a30af0c51 100644
--- a/conda-recipe/run_test.sh
+++ b/conda-recipe/run_test.sh
@@ -1,6 +1,12 @@
 #!/bin/bash
 
-set -ex
+set -e
+
+# For activating OpenCL CPU
+source ${ONEAPI_ROOT}/compiler/latest/env/vars.sh
+source ${ONEAPI_ROOT}/tbb/latest/env/vars.sh
+
+set -x
 
 python -m numba.runtests -b -v -m -- numba_dppy.tests
diff --git a/docs/INDEX.md b/docs/INDEX.md
new file mode 100644
index 0000000000..2c8b990c6c
--- /dev/null
+++ b/docs/INDEX.md
@@ -0,0 +1,18 @@
+# numba-dppy
+
+Below is the functionality implemented in numba-dppy, with detailed descriptions of some of the features.
+
+## Offload Diagnostics
+
+Setting the debug environment variable `NUMBA_DPPY_OFFLOAD_DIAGNOSTICS`
+(e.g. `export NUMBA_DPPY_OFFLOAD_DIAGNOSTICS=1`) enables parallel and offload diagnostics information.
+
+If set to an integer value between 1 and 4 (inclusive), diagnostic information about parallel transforms undertaken by Numba is written to STDOUT. The higher the value, the more detailed the information produced.
+The "Auto-offloading" section of the output names the device to which each parfor or kernel was offloaded.
+
+## Controllable Fallback
+
+By default, if a section of code cannot be offloaded to the GPU, numba-dppy automatically executes it on the CPU and prints a warning. This behavior applies only to njit functions and to the auto-offloading of numpy functions, array expressions, and prange loops.
+
+Setting the debug environment variable `NUMBA_DPPY_FALLBACK_ON_CPU` to 0
+(e.g. `export NUMBA_DPPY_FALLBACK_ON_CPU=0`) disables the automatic fallback to the CPU, and an error is raised instead. This makes it possible to find out at an early stage which parts of the code do not run on the GPU, instead of waiting for the program to finish executing on the CPU when that is not wanted.
diff --git a/numba_dppy/CHANGE_LOG b/numba_dppy/CHANGE_LOG
deleted file mode 100644
index e3cb06522c..0000000000
--- a/numba_dppy/CHANGE_LOG
+++ /dev/null
@@ -1,7 +0,0 @@
-NUMBA Version 0.48.0 + DPPL Version 0.3.0 (June 29, 2020)
----------------------------------------------------------
-
-This release includes:
-
-* Caching of dppl.kernels which will improve performance.
-* Addition of support for Intel Advisor which will help in profiling applications.
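The diagnostics and fallback switches documented in docs/INDEX.md above are plain environment variables, read by numba_dppy/config.py (added later in this patch) at import time, so they can also be set from the driving script. A minimal sketch of exercising both follows — hypothetical code, assuming a dpctl GPU queue is available and that the array expression below is picked up for auto-offload under `parallel=True`:

```python
# Sketch: enable offload diagnostics and disable CPU fallback.
# The variables must be set before numba/numba_dppy are imported,
# because numba_dppy.config reads the environment at import time.
import os

os.environ["NUMBA_DPPY_OFFLOAD_DIAGNOSTICS"] = "4"  # most verbose level (1-4)
os.environ["NUMBA_DPPY_FALLBACK_ON_CPU"] = "0"      # raise an error instead of falling back

import dpctl
import numpy as np
from numba import njit


@njit(parallel=True)
def add(a, b):
    return a + b  # array expression, a candidate for auto-offloading


a = np.ones(1024)
b = np.ones(1024)
with dpctl.device_context("opencl:gpu"):
    add(a, b)  # the "Auto-offloading" report names the device used
```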
diff --git a/numba_dppy/__init__.py b/numba_dppy/__init__.py
index 7d52138691..ac4e898889 100644
--- a/numba_dppy/__init__.py
+++ b/numba_dppy/__init__.py
@@ -4,9 +4,9 @@
 Extensions to Numba for Intel GPUs introduce two new features into Numba:
 
-    a. A new backend that has a new decorator called @dppl.kernel that
+    a. A new backend that has a new decorator called @dppy.kernel that
        exposes an explicit kernel programming interface similar to the
-       existing Numba GPU code-generation backends. The @dppl.kernel
+       existing Numba GPU code-generation backends. The @dppy.kernel
        decorator currently implements a subset of OpenCL’s API through
        Numba’s intrinsic functions.
 
@@ -20,48 +20,48 @@ Explicit Kernel Programming with new Decorators:
 
-@dppl.kernel
+@dppy.kernel
 
-    The @dppl.kernel decorator can be used with or without extra arguments.
+    The @dppy.kernel decorator can be used with or without extra arguments.
     Optionally, users can pass the signature of the arguments to the
     decorator. When a signature is provided to the DK decorator the version
     of the OpenCL kernel generated gets specialized for that type signature.
 
    ---------------------------------------------------------------------------
-    @dppl.kernel
+    @dppy.kernel
     def data_parallel_sum(a, b, c):
-        i = dppl.get_global_id(0)
+        i = dppy.get_global_id(0)
         c[i] = a[i] + b[i]
    ---------------------------------------------------------------------------
 
     To invoke the above function users will need to provide a
     global size (OpenCL) which is the size of a (same as b and c) and a
-    local size (dppl.DEFAULT_LOCAL_SIZE if user don't want to specify).
+    local size (dppy.DEFAULT_LOCAL_SIZE if the user doesn't want to specify one).
     Example shown below:
 
    ---------------------------------------------------------------------------
-    data_parallel_sum[len(a), dppl.DEFAULT_LOCAL_SIZE](dA, dB, dC)
+    data_parallel_sum[len(a), dppy.DEFAULT_LOCAL_SIZE](dA, dB, dC)
    ---------------------------------------------------------------------------
 
-@dppl.func
+@dppy.func
 
-    The @dppl.func decorator is the other decorator provided in the explicit
+    The @dppy.func decorator is the other decorator provided in the explicit
     kernel programming model. This decorator allows users to write “device”
     functions that can be invoked from inside DK functions but cannot be
     invoked from the host. The decorator also supports type specialization as with the
-    DK decorator. Functions decorated with @dppl.func will also be JIT compiled
-    and inlined into the OpenCL Program containing the @dppl.kernel function
-    calling it. A @dppl.func will not be launched as an OpenCL kernel.
+    DK decorator. Functions decorated with @dppy.func will also be JIT compiled
+    and inlined into the OpenCL Program containing the @dppy.kernel function
+    calling it. A @dppy.func will not be launched as an OpenCL kernel.
 
    ---------------------------------------------------------------------------
-    @dppl.func
+    @dppy.func
     def bar(a):
         return a*a
 
-    @dppl.kernel
+    @dppy.kernel
     def foo(in, out):
-        i = dppl.get_global_id(0)
+        i = dppy.get_global_id(0)
         out[i] = bar(in[i])
    ---------------------------------------------------------------------------
 
@@ -71,13 +71,13 @@ def foo(in, out):
 
 The following table has the list of intrinsic functions that can be
 directly used inside a DK function. All the functions are equivalent to the
 similarly named OpenCL function.
Wherever there is an implementation difference - between the Numba-PyDPPL version and the OpenCL version, the difference is + between the Numba-DPPY version and the OpenCL version, the difference is explained in table. Note that these functions cannot be used anywhere else outside of a DK function in a Numba application. Readers are referred to the OpenCL API specs to review the functionality of each function. +----------------------+----------------------------+----------------------+ - | Numba-DPPL intrinsic | Equivalent OpenCL function | Notes | + | Numba-DPPY intrinsic | Equivalent OpenCL function | Notes | +----------------------+----------------------------+----------------------+ | get_global_id | get_global_id | | +----------------------+----------------------------+----------------------+ @@ -121,7 +121,7 @@ def foo(in, out): |print |print(varargs) |The print function is a | | | |subset of the OpenCL | | | |printf function. The | - | | |Numba-DPPL version of | + | | |Numba-DPPY version of | | | |print supports only int, | | | |string, and float | | | |arguments. | @@ -160,16 +160,16 @@ def foo(in, out): -Complete Example using @dppl.kernel: +Complete Example using @dppy.kernel: --------------------------------------------------------------------------- import numpy as np - import numba_dppy, numba_dppy as dppl + import numba_dppy, numba_dppy as dppy import dpctl - @dppl.kernel + @dppy.kernel def data_parallel_sum(a, b, c): - i = dppl.get_global_id(0) + i = dppy.get_global_id(0) c[i] = a[i] + b[i] def driver(device_env, a, b, c, global_size): @@ -181,7 +181,7 @@ def driver(device_env, a, b, c, global_size): print("before : ", dA._ndarray) print("before : ", dB._ndarray) print("before : ", dC._ndarray) - data_parallel_sum[global_size, dppl.DEFAULT_LOCAL_SIZE](dA, dB, dC) + data_parallel_sum[global_size, dppy.DEFAULT_LOCAL_SIZE](dA, dB, dC) device_env.copy_array_from_device(dC) print("after : ", dC._ndarray) @@ -503,17 +503,20 @@ def main(): from __future__ import print_function, absolute_import, division -from numba import config import numba.testing -from numba.dppl_config import * -if dppl_present: +from .config import dppy_present +if dppy_present: from .device_init import * else: - raise ImportError("Importing dppl failed") + raise ImportError("Importing numba-dppy failed") def test(*args, **kwargs): - if not dppl_present and not is_available(): - dppl_error() + if not dppy_present and not is_available(): + dppy_error() return numba.testing.test("numba_dppy.tests", *args, **kwargs) + +from ._version import get_versions +__version__ = get_versions()['version'] +del get_versions diff --git a/numba_dppy/_version.py b/numba_dppy/_version.py new file mode 100644 index 0000000000..165dbf4d17 --- /dev/null +++ b/numba_dppy/_version.py @@ -0,0 +1,525 @@ + +# This file helps to compute a version number in source trees obtained from +# git-archive tarball (such as those provided by githubs download-from-tag +# feature). Distribution tarballs (built by setup.py sdist) and build +# directories (produced by setup.py build) will contain a much shorter file +# that just contains the computed version number. + +# This file is released into the public domain. 
Generated by +# versioneer-0.19 (https://github.com/python-versioneer/python-versioneer) + +"""Git implementation of _version.py.""" + +import errno +import os +import re +import subprocess +import sys + + +def get_keywords(): + """Get the keywords needed to look up the version information.""" + # these strings will be replaced by git during git-archive. + # setup.py/versioneer.py will grep for the variable names, so they must + # each be defined on a line of their own. _version.py will just call + # get_keywords(). + git_refnames = "$Format:%d$" + git_full = "$Format:%H$" + git_date = "$Format:%ci$" + keywords = {"refnames": git_refnames, "full": git_full, "date": git_date} + return keywords + + +class VersioneerConfig: + """Container for Versioneer configuration parameters.""" + + +def get_config(): + """Create, populate and return the VersioneerConfig() object.""" + # these strings are filled in when 'setup.py versioneer' creates + # _version.py + cfg = VersioneerConfig() + cfg.VCS = "git" + cfg.style = "pep440" + cfg.tag_prefix = "None" + cfg.parentdir_prefix = "None" + cfg.versionfile_source = "numba_dppy/_version.py" + cfg.verbose = False + return cfg + + +class NotThisMethod(Exception): + """Exception raised if a method is not valid for the current scenario.""" + + +LONG_VERSION_PY = {} +HANDLERS = {} + + +def register_vcs_handler(vcs, method): # decorator + """Create decorator to mark a method as the handler of a VCS.""" + def decorate(f): + """Store f in HANDLERS[vcs][method].""" + if vcs not in HANDLERS: + HANDLERS[vcs] = {} + HANDLERS[vcs][method] = f + return f + return decorate + + +def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, + env=None): + """Call the given command(s).""" + assert isinstance(commands, list) + p = None + for c in commands: + try: + dispcmd = str([c] + args) + # remember shell=False, so use git.cmd on windows, not just git + p = subprocess.Popen([c] + args, cwd=cwd, env=env, + stdout=subprocess.PIPE, + stderr=(subprocess.PIPE if hide_stderr + else None)) + break + except EnvironmentError: + e = sys.exc_info()[1] + if e.errno == errno.ENOENT: + continue + if verbose: + print("unable to run %s" % dispcmd) + print(e) + return None, None + else: + if verbose: + print("unable to find command, tried %s" % (commands,)) + return None, None + stdout = p.communicate()[0].strip().decode() + if p.returncode != 0: + if verbose: + print("unable to run %s (error)" % dispcmd) + print("stdout was %s" % stdout) + return None, p.returncode + return stdout, p.returncode + + +def versions_from_parentdir(parentdir_prefix, root, verbose): + """Try to determine the version from the parent directory name. + + Source tarballs conventionally unpack into a directory that includes both + the project name and a version string. 
We will also support searching up + two directory levels for an appropriately named parent directory + """ + rootdirs = [] + + for i in range(3): + dirname = os.path.basename(root) + if dirname.startswith(parentdir_prefix): + return {"version": dirname[len(parentdir_prefix):], + "full-revisionid": None, + "dirty": False, "error": None, "date": None} + else: + rootdirs.append(root) + root = os.path.dirname(root) # up a level + + if verbose: + print("Tried directories %s but none started with prefix %s" % + (str(rootdirs), parentdir_prefix)) + raise NotThisMethod("rootdir doesn't start with parentdir_prefix") + + +@register_vcs_handler("git", "get_keywords") +def git_get_keywords(versionfile_abs): + """Extract version information from the given file.""" + # the code embedded in _version.py can just fetch the value of these + # keywords. When used from setup.py, we don't want to import _version.py, + # so we do it with a regexp instead. This function is not used from + # _version.py. + keywords = {} + try: + f = open(versionfile_abs, "r") + for line in f.readlines(): + if line.strip().startswith("git_refnames ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["refnames"] = mo.group(1) + if line.strip().startswith("git_full ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["full"] = mo.group(1) + if line.strip().startswith("git_date ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["date"] = mo.group(1) + f.close() + except EnvironmentError: + pass + return keywords + + +@register_vcs_handler("git", "keywords") +def git_versions_from_keywords(keywords, tag_prefix, verbose): + """Get version information from git keywords.""" + if not keywords: + raise NotThisMethod("no keywords at all, weird") + date = keywords.get("date") + if date is not None: + # Use only the last line. Previous lines may contain GPG signature + # information. + date = date.splitlines()[-1] + + # git-2.2.0 added "%cI", which expands to an ISO-8601 -compliant + # datestamp. However we prefer "%ci" (which expands to an "ISO-8601 + # -like" string, which we must then edit to make compliant), because + # it's been around since git-1.5.3, and it's too difficult to + # discover which version we're using, or to work around using an + # older one. + date = date.strip().replace(" ", "T", 1).replace(" ", "", 1) + refnames = keywords["refnames"].strip() + if refnames.startswith("$Format"): + if verbose: + print("keywords are unexpanded, not using") + raise NotThisMethod("unexpanded keywords, not a git-archive tarball") + refs = set([r.strip() for r in refnames.strip("()").split(",")]) + # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of + # just "foo-1.0". If we see a "tag: " prefix, prefer those. + TAG = "tag: " + tags = set([r[len(TAG):] for r in refs if r.startswith(TAG)]) + if not tags: + # Either we're using git < 1.8.3, or there really are no tags. We use + # a heuristic: assume all version tags have a digit. The old git %d + # expansion behaves like git log --decorate=short and strips out the + # refs/heads/ and refs/tags/ prefixes that would let us distinguish + # between branches and tags. By ignoring refnames without digits, we + # filter out many common branch names like "release" and + # "stabilization", as well as "HEAD" and "master". 
+ tags = set([r for r in refs if re.search(r'\d', r)]) + if verbose: + print("discarding '%s', no digits" % ",".join(refs - tags)) + if verbose: + print("likely tags: %s" % ",".join(sorted(tags))) + for ref in sorted(tags): + # sorting will prefer e.g. "2.0" over "2.0rc1" + if ref.startswith(tag_prefix): + r = ref[len(tag_prefix):] + if verbose: + print("picking %s" % r) + return {"version": r, + "full-revisionid": keywords["full"].strip(), + "dirty": False, "error": None, + "date": date} + # no suitable tags, so version is "0+unknown", but full hex is still there + if verbose: + print("no suitable tags, using unknown + full revision id") + return {"version": "0+unknown", + "full-revisionid": keywords["full"].strip(), + "dirty": False, "error": "no suitable tags", "date": None} + + +@register_vcs_handler("git", "pieces_from_vcs") +def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): + """Get version from 'git describe' in the root of the source tree. + + This only gets called if the git-archive 'subst' keywords were *not* + expanded, and _version.py hasn't already been rewritten with a short + version string, meaning we're inside a checked out source tree. + """ + GITS = ["git"] + if sys.platform == "win32": + GITS = ["git.cmd", "git.exe"] + + out, rc = run_command(GITS, ["rev-parse", "--git-dir"], cwd=root, + hide_stderr=True) + if rc != 0: + if verbose: + print("Directory %s not under git control" % root) + raise NotThisMethod("'git rev-parse --git-dir' returned error") + + # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] + # if there isn't one, this yields HEX[-dirty] (no NUM) + describe_out, rc = run_command(GITS, ["describe", "--tags", "--dirty", + "--always", "--long", + "--match", "%s*" % tag_prefix], + cwd=root) + # --long was added in git-1.5.5 + if describe_out is None: + raise NotThisMethod("'git describe' failed") + describe_out = describe_out.strip() + full_out, rc = run_command(GITS, ["rev-parse", "HEAD"], cwd=root) + if full_out is None: + raise NotThisMethod("'git rev-parse' failed") + full_out = full_out.strip() + + pieces = {} + pieces["long"] = full_out + pieces["short"] = full_out[:7] # maybe improved later + pieces["error"] = None + + # parse describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty] + # TAG might have hyphens. + git_describe = describe_out + + # look for -dirty suffix + dirty = git_describe.endswith("-dirty") + pieces["dirty"] = dirty + if dirty: + git_describe = git_describe[:git_describe.rindex("-dirty")] + + # now we have TAG-NUM-gHEX or HEX + + if "-" in git_describe: + # TAG-NUM-gHEX + mo = re.search(r'^(.+)-(\d+)-g([0-9a-f]+)$', git_describe) + if not mo: + # unparseable. Maybe git-describe is misbehaving? 
+ pieces["error"] = ("unable to parse git-describe output: '%s'" + % describe_out) + return pieces + + # tag + full_tag = mo.group(1) + if not full_tag.startswith(tag_prefix): + if verbose: + fmt = "tag '%s' doesn't start with prefix '%s'" + print(fmt % (full_tag, tag_prefix)) + pieces["error"] = ("tag '%s' doesn't start with prefix '%s'" + % (full_tag, tag_prefix)) + return pieces + pieces["closest-tag"] = full_tag[len(tag_prefix):] + + # distance: number of commits since tag + pieces["distance"] = int(mo.group(2)) + + # commit: short hex revision ID + pieces["short"] = mo.group(3) + + else: + # HEX: no tags + pieces["closest-tag"] = None + count_out, rc = run_command(GITS, ["rev-list", "HEAD", "--count"], + cwd=root) + pieces["distance"] = int(count_out) # total number of commits + + # commit date: see ISO-8601 comment in git_versions_from_keywords() + date = run_command(GITS, ["show", "-s", "--format=%ci", "HEAD"], + cwd=root)[0].strip() + # Use only the last line. Previous lines may contain GPG signature + # information. + date = date.splitlines()[-1] + pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1) + + return pieces + + +def plus_or_dot(pieces): + """Return a + if we don't already have one, else return a .""" + if "+" in pieces.get("closest-tag", ""): + return "." + return "+" + + +def render_pep440(pieces): + """Build up version string, with post-release "local version identifier". + + Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you + get a tagged build and then dirty it, you'll get TAG+0.gHEX.dirty + + Exceptions: + 1: no tags. git_describe was just HEX. 0+untagged.DISTANCE.gHEX[.dirty] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += plus_or_dot(pieces) + rendered += "%d.g%s" % (pieces["distance"], pieces["short"]) + if pieces["dirty"]: + rendered += ".dirty" + else: + # exception #1 + rendered = "0+untagged.%d.g%s" % (pieces["distance"], + pieces["short"]) + if pieces["dirty"]: + rendered += ".dirty" + return rendered + + +def render_pep440_pre(pieces): + """TAG[.post0.devDISTANCE] -- No -dirty. + + Exceptions: + 1: no tags. 0.post0.devDISTANCE + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"]: + rendered += ".post0.dev%d" % pieces["distance"] + else: + # exception #1 + rendered = "0.post0.dev%d" % pieces["distance"] + return rendered + + +def render_pep440_post(pieces): + """TAG[.postDISTANCE[.dev0]+gHEX] . + + The ".dev0" means dirty. Note that .dev0 sorts backwards + (a dirty tree will appear "older" than the corresponding clean one), + but you shouldn't be releasing software with -dirty anyways. + + Exceptions: + 1: no tags. 0.postDISTANCE[.dev0] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += ".post%d" % pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + rendered += plus_or_dot(pieces) + rendered += "g%s" % pieces["short"] + else: + # exception #1 + rendered = "0.post%d" % pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + rendered += "+g%s" % pieces["short"] + return rendered + + +def render_pep440_old(pieces): + """TAG[.postDISTANCE[.dev0]] . + + The ".dev0" means dirty. + + Exceptions: + 1: no tags. 
0.postDISTANCE[.dev0] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += ".post%d" % pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + else: + # exception #1 + rendered = "0.post%d" % pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + return rendered + + +def render_git_describe(pieces): + """TAG[-DISTANCE-gHEX][-dirty]. + + Like 'git describe --tags --dirty --always'. + + Exceptions: + 1: no tags. HEX[-dirty] (note: no 'g' prefix) + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"]: + rendered += "-%d-g%s" % (pieces["distance"], pieces["short"]) + else: + # exception #1 + rendered = pieces["short"] + if pieces["dirty"]: + rendered += "-dirty" + return rendered + + +def render_git_describe_long(pieces): + """TAG-DISTANCE-gHEX[-dirty]. + + Like 'git describe --tags --dirty --always -long'. + The distance/hash is unconditional. + + Exceptions: + 1: no tags. HEX[-dirty] (note: no 'g' prefix) + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + rendered += "-%d-g%s" % (pieces["distance"], pieces["short"]) + else: + # exception #1 + rendered = pieces["short"] + if pieces["dirty"]: + rendered += "-dirty" + return rendered + + +def render(pieces, style): + """Render the given version pieces into the requested style.""" + if pieces["error"]: + return {"version": "unknown", + "full-revisionid": pieces.get("long"), + "dirty": None, + "error": pieces["error"], + "date": None} + + if not style or style == "default": + style = "pep440" # the default + + if style == "pep440": + rendered = render_pep440(pieces) + elif style == "pep440-pre": + rendered = render_pep440_pre(pieces) + elif style == "pep440-post": + rendered = render_pep440_post(pieces) + elif style == "pep440-old": + rendered = render_pep440_old(pieces) + elif style == "git-describe": + rendered = render_git_describe(pieces) + elif style == "git-describe-long": + rendered = render_git_describe_long(pieces) + else: + raise ValueError("unknown style '%s'" % style) + + return {"version": rendered, "full-revisionid": pieces["long"], + "dirty": pieces["dirty"], "error": None, + "date": pieces.get("date")} + + +def get_versions(): + """Get version information or return default if unable to do so.""" + # I am in _version.py, which lives at ROOT/VERSIONFILE_SOURCE. If we have + # __file__, we can work backwards from there to the root. Some + # py2exe/bbfreeze/non-CPython implementations don't do __file__, in which + # case we can only use expanded keywords. + + cfg = get_config() + verbose = cfg.verbose + + try: + return git_versions_from_keywords(get_keywords(), cfg.tag_prefix, + verbose) + except NotThisMethod: + pass + + try: + root = os.path.realpath(__file__) + # versionfile_source is the relative path from the top of the source + # tree (where the .git directory might live) to this file. Invert + # this to find the root from __file__. 
+ for i in cfg.versionfile_source.split('/'): + root = os.path.dirname(root) + except NameError: + return {"version": "0+unknown", "full-revisionid": None, + "dirty": None, + "error": "unable to find root of source tree", + "date": None} + + try: + pieces = git_pieces_from_vcs(cfg.tag_prefix, root, verbose) + return render(pieces, cfg.style) + except NotThisMethod: + pass + + try: + if cfg.parentdir_prefix: + return versions_from_parentdir(cfg.parentdir_prefix, root, verbose) + except NotThisMethod: + pass + + return {"version": "0+unknown", "full-revisionid": None, + "dirty": None, + "error": "unable to compute version", "date": None} diff --git a/numba_dppy/compiler.py b/numba_dppy/compiler.py index 7f0f7c8411..37b9e25e9f 100644 --- a/numba_dppy/compiler.py +++ b/numba_dppy/compiler.py @@ -2,7 +2,7 @@ import copy from collections import namedtuple -from .dppl_passbuilder import DPPLPassBuilder +from .dppy_passbuilder import DPPYPassBuilder from numba.core.typing.templates import ConcreteTemplate from numba.core import types, compiler, ir from numba.core.typing.templates import AbstractTemplate @@ -11,26 +11,31 @@ from inspect import signature import dpctl -import dpctl._memory as dpctl_mem +import dpctl.memory as dpctl_mem +import dpctl.program as dpctl_prog import numpy as np from . import spirv_generator import os from numba.core.compiler import DefaultPassBuilder, CompilerBase +from numba_dppy.dppy_parfor_diagnostics import ExtendedParforDiagnostics + + +DEBUG = os.environ.get('NUMBA_DPPY_DEBUG', None) +_NUMBA_DPPY_READ_ONLY = "read_only" +_NUMBA_DPPY_WRITE_ONLY = "write_only" +_NUMBA_DPPY_READ_WRITE = "read_write" -DEBUG=os.environ.get('NUMBA_DPPL_DEBUG', None) -_NUMBA_DPPL_READ_ONLY = "read_only" -_NUMBA_DPPL_WRITE_ONLY = "write_only" -_NUMBA_DPPL_READ_WRITE = "read_write" def _raise_no_device_found_error(): error_message = ("No OpenCL device specified. " "Usage : jit_fn[device, globalsize, localsize](...)") raise ValueError(error_message) + def _raise_invalid_kernel_enqueue_args(): - error_message = ("Incorrect number of arguments for enquing dppl.kernel. " + error_message = ("Incorrect number of arguments for enquing dppy.kernel. " "Usage: device_env, global size, local size. " "The local size argument is optional.") raise ValueError(error_message) @@ -51,15 +56,17 @@ def get_ordered_arg_access_types(pyfunc, access_types): return ordered_arg_access_types -class DPPLCompiler(CompilerBase): - """ DPPL Compiler """ +class DPPYCompiler(CompilerBase): + """ DPPY Compiler """ def define_pipelines(self): # this maintains the objmode fallback behaviour pms = [] + self.state.parfor_diagnostics = ExtendedParforDiagnostics() + self.state.metadata['parfor_diagnostics'] = self.state.parfor_diagnostics if not self.state.flags.force_pyobject: - #print("Numba-DPPL [INFO]: Using Numba-DPPL pipeline") - pms.append(DPPLPassBuilder.define_nopython_pipeline(self.state)) + #print("Numba-DPPY [INFO]: Using Numba-DPPY pipeline") + pms.append(DPPYPassBuilder.define_nopython_pipeline(self.state)) if self.state.status.can_fallback or self.state.flags.force_pyobject: pms.append( DefaultPassBuilder.define_objectmode_pipeline(self.state) @@ -71,15 +78,17 @@ def define_pipelines(self): return pms -def compile_with_dppl(pyfunc, return_type, args, debug): +def compile_with_dppy(pyfunc, return_type, args, debug): # First compilation will trigger the initialization of the OpenCL backend. 
- from .descriptor import dppl_target + from .descriptor import dppy_target + + typingctx = dppy_target.typing_context + targetctx = dppy_target.target_context - typingctx = dppl_target.typing_context - targetctx = dppl_target.target_context - # TODO handle debug flag flags = compiler.Flags() # Do not compile (generate native code), just lower (to LLVM) + if debug: + flags.set('debuginfo') flags.set('no_compile') flags.set('no_cpython_wrapper') flags.unset('nrt') @@ -93,7 +102,7 @@ def compile_with_dppl(pyfunc, return_type, args, debug): return_type=return_type, flags=flags, locals={}, - pipeline_class=DPPLCompiler) + pipeline_class=DPPYCompiler) elif isinstance(pyfunc, ir.FunctionIR): cres = compiler.compile_ir(typingctx=typingctx, targetctx=targetctx, @@ -102,7 +111,7 @@ def compile_with_dppl(pyfunc, return_type, args, debug): return_type=return_type, flags=flags, locals={}, - pipeline_class=DPPLCompiler) + pipeline_class=DPPYCompiler) else: assert(0) # Linking depending libraries @@ -116,11 +125,12 @@ def compile_with_dppl(pyfunc, return_type, args, debug): def compile_kernel(sycl_queue, pyfunc, args, access_types, debug=False): if DEBUG: print("compile_kernel", args) + debug = True if not sycl_queue: # This will be get_current_queue sycl_queue = dpctl.get_current_queue() - cres = compile_with_dppl(pyfunc, None, args, debug=debug) + cres = compile_with_dppy(pyfunc, None, args, debug=debug) func = cres.library.get_function(cres.fndesc.llvm_func_name) kernel = cres.target_context.prepare_ocl_kernel(func, cres.signature.args) # The kernel objet should have a reference to the target context it is compiled for. @@ -128,7 +138,7 @@ def compile_kernel(sycl_queue, pyfunc, args, access_types, debug=False): # depending on the target context. For example, we want to link our kernel object # with implementation containing atomic operations only when atomic operations # are being used in the kernel. 
- oclkern = DPPLKernel(context=cres.target_context, + oclkern = DPPYKernel(context=cres.target_context, sycl_queue=sycl_queue, llvm_module=kernel.module, name=kernel.name, @@ -146,7 +156,7 @@ def compile_kernel_parfor(sycl_queue, func_ir, args, args_with_addrspaces, if isinstance(a, types.npytypes.Array): print("addrspace:", a.addrspace) - cres = compile_with_dppl(func_ir, None, args_with_addrspaces, + cres = compile_with_dppy(func_ir, None, args_with_addrspaces, debug=debug) func = cres.library.get_function(cres.fndesc.llvm_func_name) @@ -159,7 +169,7 @@ def compile_kernel_parfor(sycl_queue, func_ir, args, args_with_addrspaces, kernel = cres.target_context.prepare_ocl_kernel(func, cres.signature.args) #kernel = cres.target_context.prepare_ocl_kernel(func, args_with_addrspaces) - oclkern = DPPLKernel(context=cres.target_context, + oclkern = DPPYKernel(context=cres.target_context, sycl_queue=sycl_queue, llvm_module=kernel.module, name=kernel.name, @@ -168,44 +178,44 @@ def compile_kernel_parfor(sycl_queue, func_ir, args, args_with_addrspaces, return oclkern -def compile_dppl_func(pyfunc, return_type, args, debug=False): - cres = compile_with_dppl(pyfunc, return_type, args, debug=debug) +def compile_dppy_func(pyfunc, return_type, args, debug=False): + cres = compile_with_dppy(pyfunc, return_type, args, debug=debug) func = cres.library.get_function(cres.fndesc.llvm_func_name) cres.target_context.mark_ocl_device(func) - devfn = DPPLFunction(cres) + devfn = DPPYFunction(cres) - class dppl_function_template(ConcreteTemplate): + class dppy_function_template(ConcreteTemplate): key = devfn cases = [cres.signature] - cres.typing_context.insert_user_function(devfn, dppl_function_template) + cres.typing_context.insert_user_function(devfn, dppy_function_template) libs = [cres.library] cres.target_context.insert_user_function(devfn, cres.fndesc, libs) return devfn -# Compile dppl function template -def compile_dppl_func_template(pyfunc): - """Compile a DPPLFunctionTemplate +# Compile dppy function template +def compile_dppy_func_template(pyfunc): + """Compile a DPPYFunctionTemplate """ - from .descriptor import dppl_target + from .descriptor import dppy_target - dft = DPPLFunctionTemplate(pyfunc) + dft = DPPYFunctionTemplate(pyfunc) - class dppl_function_template(AbstractTemplate): + class dppy_function_template(AbstractTemplate): key = dft def generic(self, args, kws): assert not kws return dft.compile(args) - typingctx = dppl_target.typing_context - typingctx.insert_user_function(dft, dppl_function_template) + typingctx = dppy_target.typing_context + typingctx.insert_user_function(dft, dppy_function_template) return dft -class DPPLFunctionTemplate(object): - """Unmaterialized dppl function +class DPPYFunctionTemplate(object): + """Unmaterialized dppy function """ def __init__(self, pyfunc, debug=False): self.py_func = pyfunc @@ -220,7 +230,7 @@ def compile(self, args): this object. 
""" if args not in self._compileinfos: - cres = compile_with_dppl(self.py_func, None, args, debug=self.debug) + cres = compile_with_dppy(self.py_func, None, args, debug=self.debug) func = cres.library.get_function(cres.fndesc.llvm_func_name) cres.target_context.mark_ocl_device(func) first_definition = not self._compileinfos @@ -240,7 +250,7 @@ def compile(self, args): return cres.signature -class DPPLFunction(object): +class DPPYFunction(object): def __init__(self, cres): self.cres = cres @@ -282,7 +292,7 @@ def _ensure_valid_work_group_size(val, work_item_grid): return list(val[::-1]) # reversing due to sycl and opencl interop kernel range mismatch semantic -class DPPLKernelBase(object): +class DPPYKernelBase(object): """Define interface for configurable kernels """ @@ -293,9 +303,9 @@ def __init__(self): # list of supported access types, stored in dict for fast lookup self.valid_access_types = { - _NUMBA_DPPL_READ_ONLY: _NUMBA_DPPL_READ_ONLY, - _NUMBA_DPPL_WRITE_ONLY: _NUMBA_DPPL_WRITE_ONLY, - _NUMBA_DPPL_READ_WRITE: _NUMBA_DPPL_READ_WRITE} + _NUMBA_DPPY_READ_ONLY: _NUMBA_DPPY_READ_ONLY, + _NUMBA_DPPY_WRITE_ONLY: _NUMBA_DPPY_WRITE_ONLY, + _NUMBA_DPPY_READ_WRITE: _NUMBA_DPPY_READ_WRITE} def copy(self): return copy.copy(self) @@ -331,14 +341,14 @@ def __getitem__(self, args): return self.configure(sycl_queue, gs, ls) -class DPPLKernel(DPPLKernelBase): +class DPPYKernel(DPPYKernelBase): """ A OCL kernel object """ def __init__(self, context, sycl_queue, llvm_module, name, argtypes, ordered_arg_access_types=None): - super(DPPLKernel, self).__init__() + super(DPPYKernel, self).__init__() self._llvm_module = llvm_module self.assembly = self.binary = llvm_module.__str__() self.entry_name = name @@ -355,7 +365,7 @@ def __init__(self, context, sycl_queue, llvm_module, name, argtypes, self.spirv_bc = spirv_generator.llvm_to_spirv(self.context, self.binary) # create a program - self.program = dpctl.create_program_from_spirv(self.sycl_queue, self.spirv_bc) + self.program = dpctl_prog.create_program_from_spirv(self.sycl_queue, self.spirv_bc) # create a kernel self.kernel = self.program.get_sycl_kernel(self.entry_name) @@ -385,7 +395,7 @@ def _pack_argument(self, ty, val, sycl_queue, device_arr, access_type): """ if (device_arr and (access_type not in self.valid_access_types or access_type in self.valid_access_types and - self.valid_access_types[access_type] != _NUMBA_DPPL_READ_ONLY)): + self.valid_access_types[access_type] != _NUMBA_DPPY_READ_ONLY)): # we get the date back to host if have created a # device_array or if access_type of this device_array # is not of type read_only and read_write @@ -402,15 +412,15 @@ def _unpack_device_array_argument(self, val, kernelargs): # parent kernelargs.append(ctypes.c_size_t(0)) - kernelargs.append(ctypes.c_long(val.size)) - kernelargs.append(ctypes.c_long(val.dtype.itemsize)) + kernelargs.append(ctypes.c_longlong(val.size)) + kernelargs.append(ctypes.c_longlong(val.dtype.itemsize)) kernelargs.append(val.base) for ax in range(val.ndim): - kernelargs.append(ctypes.c_long(val.shape[ax])) + kernelargs.append(ctypes.c_longlong(val.shape[ax])) for ax in range(val.ndim): - kernelargs.append(ctypes.c_long(val.strides[ax])) + kernelargs.append(ctypes.c_longlong(val.strides[ax])) def _unpack_argument(self, ty, val, sycl_queue, retr, kernelargs, @@ -422,7 +432,7 @@ def _unpack_argument(self, ty, val, sycl_queue, retr, kernelargs, device_arrs.append(None) if isinstance(ty, types.Array): - if isinstance(val.base, dpctl_mem.Memory): + if hasattr(val.base, 
"__sycl_usm_array_interface__"): self._unpack_device_array_argument(val, kernelargs) else: default_behavior = self.check_for_invalid_access_type(access_type) @@ -431,24 +441,24 @@ def _unpack_argument(self, ty, val, sycl_queue, retr, kernelargs, usm_ndarr = np.ndarray(val.shape, buffer=usm_buf, dtype=val.dtype) if (default_behavior or - self.valid_access_types[access_type] == _NUMBA_DPPL_READ_ONLY or - self.valid_access_types[access_type] == _NUMBA_DPPL_READ_WRITE): + self.valid_access_types[access_type] == _NUMBA_DPPY_READ_ONLY or + self.valid_access_types[access_type] == _NUMBA_DPPY_READ_WRITE): np.copyto(usm_ndarr, val) device_arrs[-1] = (usm_buf, usm_ndarr, val) self._unpack_device_array_argument(usm_ndarr, kernelargs) elif ty == types.int64: - cval = ctypes.c_long(val) + cval = ctypes.c_longlong(val) kernelargs.append(cval) elif ty == types.uint64: - cval = ctypes.c_long(val) + cval = ctypes.c_ulonglong(val) kernelargs.append(cval) elif ty == types.int32: cval = ctypes.c_int(val) kernelargs.append(cval) elif ty == types.uint32: - cval = ctypes.c_int(val) + cval = ctypes.c_uint(val) kernelargs.append(cval) elif ty == types.float64: cval = ctypes.c_double(val) @@ -486,18 +496,18 @@ def check_for_invalid_access_type(self, access_type): return False -class JitDPPLKernel(DPPLKernelBase): +class JitDPPYKernel(DPPYKernelBase): def __init__(self, func, access_types): - super(JitDPPLKernel, self).__init__() + super(JitDPPYKernel, self).__init__() self.py_func = func self.definitions = {} self.access_types = access_types - from .descriptor import dppl_target + from .descriptor import dppy_target - self.typingctx = dppl_target.typing_context + self.typingctx = dppy_target.typing_context def __call__(self, *args, **kwargs): assert not kwargs, "Keyword Arguments are not supported" diff --git a/numba_dppy/config.py b/numba_dppy/config.py new file mode 100644 index 0000000000..84df7913c3 --- /dev/null +++ b/numba_dppy/config.py @@ -0,0 +1,41 @@ +import os + + +try: + import dpctl + + dppy_present = dpctl.has_sycl_platforms() and dpctl.has_gpu_queues() +except: + dppy_present = False + + +def _readenv(name, ctor, default): + """Original version from numba\core\config.py + class _EnvReloader(): + ... + def process_environ(): + def _readenv(): ... 
+    """
+    value = os.environ.get(name)
+    if value is None:
+        return default() if callable(default) else default
+    try:
+        return ctor(value)
+    except Exception:
+        warnings.warn(
+            "environ %s defined but failed to parse '%s'" % (name, value),
+            RuntimeWarning,
+        )
+        return default
+
+
+# Save intermediate files being generated by DPPY
+SAVE_IR_FILES = _readenv("NUMBA_DPPY_SAVE_IR_FILES", int, 0)
+
+# Switch to turn SPIR-V validation on/off
+SPIRV_VAL = _readenv("NUMBA_DPPY_SPIRV_VAL", int, 0)
+
+# Dump offload diagnostics
+OFFLOAD_DIAGNOSTICS = _readenv("NUMBA_DPPY_OFFLOAD_DIAGNOSTICS", int, 0)
+
+FALLBACK_ON_CPU = _readenv("NUMBA_DPPY_FALLBACK_ON_CPU", int, 1)
diff --git a/numba_dppy/decorators.py b/numba_dppy/decorators.py
index a8b6bbba36..641d924134 100644
--- a/numba_dppy/decorators.py
+++ b/numba_dppy/decorators.py
@@ -1,11 +1,11 @@
 from __future__ import print_function, absolute_import, division
 from numba.core import sigutils, types
-from .compiler import (compile_kernel, JitDPPLKernel, compile_dppl_func_template,
-                       compile_dppl_func, get_ordered_arg_access_types)
+from .compiler import (compile_kernel, JitDPPYKernel, compile_dppy_func_template,
+                       compile_dppy_func, get_ordered_arg_access_types)
 
 
 def kernel(signature=None, access_types=None, debug=False):
-    """JIT compile a python function conforming using the DPPL backend.
+    """JIT compile a python function using the DPPY backend.
 
     A kernel is equvalent to an OpenCL kernel function, and has the
     same restrictions as definined by SPIR_KERNEL calling convention.
@@ -22,14 +22,14 @@ def autojit(debug=False, access_types=None):
     def _kernel_autojit(pyfunc):
         ordered_arg_access_types = get_ordered_arg_access_types(pyfunc, access_types)
-        return JitDPPLKernel(pyfunc, ordered_arg_access_types)
+        return JitDPPYKernel(pyfunc, ordered_arg_access_types)
     return _kernel_autojit
 
 
 def _kernel_jit(signature, debug, access_types):
     argtypes, restype = sigutils.normalize_signature(signature)
     if restype is not None and restype != types.void:
-        msg = ("DPPL kernel must have void return type but got {restype}")
+        msg = ("DPPY kernel must have void return type but got {restype}")
         raise TypeError(msg.format(restype=restype))
 
     def _wrapped(pyfunc):
@@ -54,9 +54,9 @@ def _func_jit(signature):
     argtypes, restype = sigutils.normalize_signature(signature)
 
     def _wrapped(pyfunc):
-        return compile_dppl_func(pyfunc, restype, argtypes)
+        return compile_dppy_func(pyfunc, restype, argtypes)
 
     return _wrapped
 
 
 def _func_autojit(pyfunc):
-    return compile_dppl_func_template(pyfunc)
+    return compile_dppy_func_template(pyfunc)
diff --git a/numba_dppy/descriptor.py b/numba_dppy/descriptor.py
index c0a24868c2..c8e6a58ec7 100644
--- a/numba_dppy/descriptor.py
+++ b/numba_dppy/descriptor.py
@@ -3,41 +3,41 @@ from numba.core.options import TargetOptions
 from numba.core import dispatcher, utils, typing
 
-from .target import DPPLTargetContext, DPPLTypingContext
+from .target import DPPYTargetContext, DPPYTypingContext
 
 from numba.core.cpu import CPUTargetOptions
 
 
-class DPPLTarget(TargetDescriptor):
+class DPPYTarget(TargetDescriptor):
     options = CPUTargetOptions
-    #typingctx = DPPLTypingContext()
-    #targetctx = DPPLTargetContext(typingctx)
+    #typingctx = DPPYTypingContext()
+    #targetctx = DPPYTargetContext(typingctx)
 
     @utils.cached_property
     def _toplevel_target_context(self):
         # Lazily-initialized top-level target context, for all threads
-        return DPPLTargetContext(self.typing_context)
+        return
DPPYTargetContext(self.typing_context) @utils.cached_property def _toplevel_typing_context(self): # Lazily-initialized top-level typing context, for all threads - return DPPLTypingContext() + return DPPYTypingContext() @property def target_context(self): """ - The target context for DPPL targets. + The target context for DPPY targets. """ return self._toplevel_target_context @property def typing_context(self): """ - The typing context for DPPL targets. + The typing context for DPPY targets. """ return self._toplevel_typing_context -# The global DPPL target -dppl_target = DPPLTarget() +# The global DPPY target +dppy_target = DPPYTarget() diff --git a/numba_dppy/device_init.py b/numba_dppy/device_init.py index c4506014a8..efec55ba83 100644 --- a/numba_dppy/device_init.py +++ b/numba_dppy/device_init.py @@ -18,6 +18,14 @@ CLK_GLOBAL_MEM_FENCE, ) +""" +We are importing dpnp stub module to make Numba recognize the +module when we rename Numpy functions. +""" +from .dpnp_glue.stubs import ( + dpnp +) + DEFAULT_LOCAL_SIZE = [] from . import initialize @@ -35,9 +43,4 @@ def is_available(): return dpctl.has_gpu_queues() -#def ocl_error(): -# """Returns None or an exception if the OpenCL driver fails to initialize. -# """ -# return driver.driver.initialization_error - initialize.initialize_all() diff --git a/numba_dppy/dispatcher.py b/numba_dppy/dispatcher.py index a4c32ec7ec..d00a597875 100644 --- a/numba_dppy/dispatcher.py +++ b/numba_dppy/dispatcher.py @@ -4,17 +4,17 @@ #from numba.targets.descriptors import TargetDescriptor #from numba.targets.options import TargetOptions -#import numba_dppy, numba_dppy as dppl +#import numba_dppy, numba_dppy as dppy from numba_dppy import kernel, autojit -from .descriptor import dppl_target +from .descriptor import dppy_target #from numba.npyufunc.deviceufunc import (UFuncMechanism, GenerializedUFunc, # GUFuncCallSteps) from .. import dispatcher, utils, typing -from .compiler import DPPLCompiler +from .compiler import DPPYCompiler -class DPPLDispatcher(dispatcher.Dispatcher): - targetdescr = dppl_target +class DPPYDispatcher(dispatcher.Dispatcher): + targetdescr = dppy_target def __init__(self, py_func, locals={}, targetoptions={}): @@ -58,7 +58,7 @@ def __getitem__(self, *args): def __getattr__(self, key): return getattr(self.compiled, key) -class DPPLUFuncDispatcher(object): +class DPPYUFuncDispatcher(object): """ Invoke the OpenCL ufunc specialization for the given inputs. """ @@ -86,7 +86,7 @@ def __call__(self, *args, **kws): depending on the input arguments. Type must match the input arguments. 
""" - return DPPLUFuncMechanism.call(self.functions, args, kws) + return DPPYUFuncMechanism.call(self.functions, args, kws) def reduce(self, arg, stream=0): assert len(list(self.functions.keys())[0]) == 2, "must be a binary " \ @@ -142,7 +142,7 @@ def __reduce(self, mem, gpu_mems, stream): return left -class _DPPLGUFuncCallSteps(GUFuncCallSteps): +class _DPPYGUFuncCallSteps(GUFuncCallSteps): __slots__ = [ '_stream', ] @@ -167,10 +167,10 @@ def launch_kernel(self, kernel, nelem, args): kernel.forall(nelem, queue=self._stream)(*args) -class DPPLGenerializedUFunc(GenerializedUFunc): +class DPPYGenerializedUFunc(GenerializedUFunc): @property def _call_steps(self): - return _DPPLGUFuncCallSteps + return _DPPYGUFuncCallSteps def _broadcast_scalar_input(self, ary, shape): return devicearray.DeviceNDArray(shape=shape, @@ -188,7 +188,7 @@ def _broadcast_add_axis(self, ary, newshape): gpu_data=ary.gpu_data) -class DPPLUFuncMechanism(UFuncMechanism): +class DPPYUFuncMechanism(UFuncMechanism): """ Provide OpenCL specialization """ diff --git a/numba_dppy/dpctl_functions.py b/numba_dppy/dpctl_functions.py new file mode 100644 index 0000000000..67bc358185 --- /dev/null +++ b/numba_dppy/dpctl_functions.py @@ -0,0 +1,30 @@ +from numba import types +from numba.core.typing import signature + + +class _DPCTL_FUNCTIONS: + @classmethod + def dpctl_get_current_queue(cls): + ret_type = types.voidptr + sig = signature(ret_type) + return types.ExternalFunction("DPCTLQueueMgr_GetCurrentQueue", sig) + + @classmethod + def dpctl_malloc_shared(cls): + ret_type = types.voidptr + sig = signature(ret_type, types.int64, types.voidptr) + return types.ExternalFunction("DPCTLmalloc_shared", sig) + + @classmethod + def dpctl_queue_memcpy(cls): + ret_type = types.void + sig = signature( + ret_type, types.voidptr, types.voidptr, types.voidptr, types.int64 + ) + return types.ExternalFunction("DPCTLQueue_Memcpy", sig) + + @classmethod + def dpctl_free_with_queue(cls): + ret_type = types.void + sig = signature(ret_type, types.voidptr, types.voidptr) + return types.ExternalFunction("DPCTLfree_with_queue", sig) diff --git a/numba_dppy/dpnp_glue/__init__.py b/numba_dppy/dpnp_glue/__init__.py new file mode 100644 index 0000000000..17d6b5ad6a --- /dev/null +++ b/numba_dppy/dpnp_glue/__init__.py @@ -0,0 +1,6 @@ +def ensure_dpnp(name): + try: + # import dpnp + from . 
import dpnp_fptr_interface as dpnp_glue + except ImportError: + raise ImportError("dpNP is needed to call np.%s" % name) diff --git a/numba_dppy/dpnp_glue/dpnp_fptr_interface.pyx b/numba_dppy/dpnp_glue/dpnp_fptr_interface.pyx index 8eba8bf74c..a63d4fdafa 100644 --- a/numba_dppy/dpnp_glue/dpnp_fptr_interface.pyx +++ b/numba_dppy/dpnp_glue/dpnp_fptr_interface.pyx @@ -8,6 +8,7 @@ cdef extern from "backend_iface_fptr.hpp" namespace "DPNPFuncName": # need this cdef enum DPNPFuncName "DPNPFuncName": DPNP_FN_ABSOLUTE DPNP_FN_ADD + DPNP_FN_ARANGE DPNP_FN_ARCCOS DPNP_FN_ARCCOSH DPNP_FN_ARCSIN @@ -18,40 +19,77 @@ cdef extern from "backend_iface_fptr.hpp" namespace "DPNPFuncName": # need this DPNP_FN_ARGMAX DPNP_FN_ARGMIN DPNP_FN_ARGSORT + DPNP_FN_BITWISE_AND + DPNP_FN_BITWISE_OR + DPNP_FN_BITWISE_XOR DPNP_FN_CBRT DPNP_FN_CEIL + DPNP_FN_CHOLESKY + DPNP_FN_COPYSIGN + DPNP_FN_CORRELATE DPNP_FN_COS DPNP_FN_COSH DPNP_FN_COV DPNP_FN_DEGREES + DPNP_FN_DET DPNP_FN_DIVIDE DPNP_FN_DOT DPNP_FN_EIG + DPNP_FN_EIGVALS DPNP_FN_EXP DPNP_FN_EXP2 DPNP_FN_EXPM1 DPNP_FN_FABS + DPNP_FN_FFT_FFT DPNP_FN_FLOOR + DPNP_FN_FLOOR_DIVIDE DPNP_FN_FMOD - DPNP_FN_GAUSSIAN DPNP_FN_HYPOT + DPNP_FN_INVERT + DPNP_FN_LEFT_SHIFT DPNP_FN_LOG DPNP_FN_LOG10 DPNP_FN_LOG1P DPNP_FN_LOG2 DPNP_FN_MATMUL + DPNP_FN_MATRIX_RANK DPNP_FN_MAX DPNP_FN_MAXIMUM DPNP_FN_MEAN DPNP_FN_MEDIAN DPNP_FN_MIN DPNP_FN_MINIMUM + DPNP_FN_MODF DPNP_FN_MULTIPLY DPNP_FN_POWER DPNP_FN_PROD - DPNP_FN_UNIFORM DPNP_FN_RADIANS + DPNP_FN_REMAINDER DPNP_FN_RECIP + DPNP_FN_RIGHT_SHIFT + DPNP_FN_RNG_BETA + DPNP_FN_RNG_BINOMIAL + DPNP_FN_RNG_CHISQUARE + DPNP_FN_RNG_EXPONENTIAL + DPNP_FN_RNG_GAMMA + DPNP_FN_RNG_GAUSSIAN + DPNP_FN_RNG_GEOMETRIC + DPNP_FN_RNG_GUMBEL + DPNP_FN_RNG_HYPERGEOMETRIC + DPNP_FN_RNG_LAPLACE + DPNP_FN_RNG_LOGNORMAL + DPNP_FN_RNG_MULTINOMIAL + DPNP_FN_RNG_MULTIVARIATE_NORMAL + DPNP_FN_RNG_NEGATIVE_BINOMIAL + DPNP_FN_RNG_NORMAL + DPNP_FN_RNG_POISSON + DPNP_FN_RNG_RAYLEIGH + DPNP_FN_RNG_STANDARD_CAUCHY + DPNP_FN_RNG_STANDARD_EXPONENTIAL + DPNP_FN_RNG_STANDARD_GAMMA + DPNP_FN_RNG_STANDARD_NORMAL + DPNP_FN_RNG_UNIFORM + DPNP_FN_RNG_WEIBULL DPNP_FN_SIGN DPNP_FN_SIN DPNP_FN_SINH @@ -109,6 +147,8 @@ cdef DPNPFuncName get_DPNPFuncName_from_str(name): return DPNPFuncName.DPNP_FN_ARGSORT elif name == "dpnp_cov": return DPNPFuncName.DPNP_FN_COV + elif name == "dpnp_eig": + return DPNPFuncName.DPNP_FN_EIG else: return DPNPFuncName.DPNP_FN_DOT diff --git a/numba_dppy/dpnp_glue/dpnp_linalgimpl.py b/numba_dppy/dpnp_glue/dpnp_linalgimpl.py new file mode 100644 index 0000000000..9146299b05 --- /dev/null +++ b/numba_dppy/dpnp_glue/dpnp_linalgimpl.py @@ -0,0 +1,274 @@ +import numba_dppy.dpnp_glue.dpnpimpl as dpnp_ext +from numba import types +from numba.core.typing import signature +from . 
import stubs +import numba_dppy.dpnp_glue as dpnp_lowering +from numba.core.extending import overload, register_jitable +import numpy as np +from numba_dppy.dpctl_functions import _DPCTL_FUNCTIONS + +@overload(stubs.dpnp.eig) +def dpnp_eig_impl(a): + name = "eig" + dpnp_lowering.ensure_dpnp(name) + dpctl_functions = dpnp_ext._DPCTL_FUNCTIONS() + + ret_type = types.void + """ + dpnp source: + https://github.com/IntelPython/dpnp/blob/0.4.0/dpnp/backend/custom_kernels.cpp#L180 + + Function declaration: + void dpnp_eig_c(const void* array_in, void* result1, void* result2, size_t size) + + """ + sig = signature( + ret_type, types.voidptr, types.voidptr, types.voidptr, types.intp + ) + dpnp_eig = dpnp_ext.dpnp_func("dpnp_"+name, [a.dtype.name, "NONE"], sig) + + get_sycl_queue = dpctl_functions.dpctl_get_current_queue() + allocate_usm_shared = dpctl_functions.dpctl_malloc_shared() + copy_usm = dpctl_functions.dpctl_queue_memcpy() + free_usm = dpctl_functions.dpctl_free_with_queue() + + res_dtype = np.float64 + if a.dtype == types.float32: + res_dtype = np.float32 + + def dpnp_eig_impl(a): + n = a.shape[-1] + if a.shape[-2] != n: + msg = "Last 2 dimensions of the array must be square." + raise ValueError(msg) + + dpnp_ext._check_finite_matrix(a) + + wr = np.empty(n, dtype=res_dtype) + vr = np.empty((n, n), dtype=res_dtype) + + if n == 0: + return (wr, vr) + + sycl_queue = get_sycl_queue() + a_usm = allocate_usm_shared(a.size * a.itemsize, sycl_queue) + copy_usm(sycl_queue, a_usm, a.ctypes, a.size * a.itemsize) + + wr_usm = allocate_usm_shared(wr.size * wr.itemsize, sycl_queue) + vr_usm = allocate_usm_shared(vr.size * vr.itemsize, sycl_queue) + + dpnp_eig(a_usm, wr_usm, vr_usm, n) + + copy_usm(sycl_queue, wr.ctypes, wr_usm, wr.size * wr.itemsize) + copy_usm(sycl_queue, vr.ctypes, vr_usm, vr.size * vr.itemsize) + + free_usm(a_usm, sycl_queue) + free_usm(wr_usm, sycl_queue) + free_usm(vr_usm, sycl_queue) + + dpnp_ext._dummy_liveness_func([wr.size, vr.size]) + + return (wr, vr) + + return dpnp_eig_impl + + +@overload(stubs.dpnp.matmul) +@overload(stubs.dpnp.dot) +def dpnp_dot_impl(a, b): + dpnp_lowering.ensure_dpnp("dot") + dpctl_functions = dpnp_ext._DPCTL_FUNCTIONS() + + ret_type = types.void + """ + dpnp source: + https://github.com/IntelPython/dpnp/blob/0.4.0/dpnp/backend/custom_kernels.cpp#L42 + https://github.com/IntelPython/dpnp/blob/0.4.0/dpnp/backend/custom_kernels.cpp#L118 + + Function declaration: + void dpnp_matmul_c(void* array1_in, void* array2_in, void* result1, size_t size_m, + size_t size_n, size_t size_k) + void dpnp_dot_c(void* array1_in, void* array2_in, void* result1, size_t size) + + """ + sig = signature( + ret_type, types.voidptr, types.voidptr, types.voidptr, + types.intp, types.intp, types.intp) + + get_sycl_queue = dpctl_functions.dpctl_get_current_queue() + allocate_usm_shared = dpctl_functions.dpctl_malloc_shared() + copy_usm = dpctl_functions.dpctl_queue_memcpy() + free_usm = dpctl_functions.dpctl_free_with_queue() + + res_dtype = np.float64 + if a.dtype == types.int32 and b.dtype == types.int32: + res_dtype = np.int32 + elif a.dtype == types.int32 and b.dtype == types.int64: + res_dtype = np.int64 + elif a.dtype == types.int32 and b.dtype == types.float32: + res_dtype = np.float64 + elif a.dtype == types.int32 and b.dtype == types.float64: + res_dtype = np.float64 + elif a.dtype == types.int64 and b.dtype == types.int32: + res_dtype = np.int64 + elif a.dtype == types.int64 and b.dtype == types.int64: + res_dtype = np.int64 + elif a.dtype == 
types.int64 and b.dtype == types.float32: + res_dtype = np.float64 + elif a.dtype == types.int64 and b.dtype == types.float64: + res_dtype = np.float64 + elif a.dtype == types.float32 and b.dtype == types.int32: + res_dtype = np.float64 + elif a.dtype == types.float32 and b.dtype == types.int64: + res_dtype = np.float64 + elif a.dtype == types.float32 and b.dtype == types.float32: + res_dtype = np.float32 + elif a.dtype == types.float32 and b.dtype == types.float64: + res_dtype = np.float64 + elif a.dtype == types.float64 and b.dtype == types.int32: + res_dtype = np.float64 + elif a.dtype == types.float64 and b.dtype == types.int64: + res_dtype = np.float64 + elif a.dtype == types.float64 and b.dtype == types.float32: + res_dtype = np.float64 + elif a.dtype == types.float64 and b.dtype == types.float64: + res_dtype = np.float64 + + ndims = [a.ndim, b.ndim] + if ndims == [2, 2]: + dpnp_func = dpnp_ext.dpnp_func("dpnp_matmul", [a.dtype.name, "NONE"], sig) + def dot_2_mm(a, b): + sycl_queue = get_sycl_queue() + + m, k = a.shape + _k, n = b.shape + + if _k != k: + raise ValueError("Incompatible array sizes for np.dot(a, b)") + + a_usm = allocate_usm_shared(a.size * a.itemsize, sycl_queue) + copy_usm(sycl_queue, a_usm, a.ctypes, a.size * a.itemsize) + + b_usm = allocate_usm_shared(b.size * b.itemsize, sycl_queue) + copy_usm(sycl_queue, b_usm, b.ctypes, b.size * b.itemsize) + + out = np.empty((m, n), dtype=res_dtype) + out_usm = allocate_usm_shared(out.size * out.itemsize, sycl_queue) + + dpnp_func(a_usm, b_usm, out_usm, m, n, k) + + copy_usm(sycl_queue, out.ctypes, out_usm, out.size * out.itemsize) + + free_usm(a_usm, sycl_queue) + free_usm(b_usm, sycl_queue) + free_usm(out_usm, sycl_queue) + + dpnp_ext._dummy_liveness_func([a.size, b.size, out.size]) + + return out + + return dot_2_mm + elif ndims == [2, 1]: + dpnp_func = dpnp_ext.dpnp_func("dpnp_matmul", [a.dtype.name, "NONE"], sig) + def dot_2_mv(a, b): + sycl_queue = get_sycl_queue() + + m, k = a.shape + _n, = b.shape + n = 1 + + if _n != k: + raise ValueError("Incompatible array sizes for np.dot(a, b)") + + a_usm = allocate_usm_shared(a.size * a.itemsize, sycl_queue) + copy_usm(sycl_queue, a_usm, a.ctypes, a.size * a.itemsize) + + b_usm = allocate_usm_shared(b.size * b.itemsize, sycl_queue) + copy_usm(sycl_queue, b_usm, b.ctypes, b.size * b.itemsize) + + out = np.empty((m, ), dtype=res_dtype) + out_usm = allocate_usm_shared(out.size * out.itemsize, sycl_queue) + + dpnp_func(a_usm, b_usm, out_usm, m, n, k) + + copy_usm(sycl_queue, out.ctypes, out_usm, out.size * out.itemsize) + + free_usm(a_usm, sycl_queue) + free_usm(b_usm, sycl_queue) + free_usm(out_usm, sycl_queue) + + dpnp_ext._dummy_liveness_func([a.size, b.size, out.size]) + + return out + + return dot_2_mv + elif ndims == [1, 2]: + dpnp_func = dpnp_ext.dpnp_func("dpnp_matmul", [a.dtype.name, "NONE"], sig) + def dot_2_vm(a, b): + sycl_queue = get_sycl_queue() + + m, = a.shape + k, n = b.shape + + if m != k: + raise ValueError("Incompatible array sizes for np.dot(a, b)") + + a_usm = allocate_usm_shared(a.size * a.itemsize, sycl_queue) + copy_usm(sycl_queue, a_usm, a.ctypes, a.size * a.itemsize) + + b_usm = allocate_usm_shared(b.size * b.itemsize, sycl_queue) + copy_usm(sycl_queue, b_usm, b.ctypes, b.size * b.itemsize) + + out = np.empty((n, ), dtype=res_dtype) + out_usm = allocate_usm_shared(out.size * out.itemsize, sycl_queue) + + dpnp_func(a_usm, b_usm, out_usm, m, n, k) + + copy_usm(sycl_queue, out.ctypes, out_usm, out.size * out.itemsize) + + free_usm(a_usm, sycl_queue) + 
free_usm(b_usm, sycl_queue) + free_usm(out_usm, sycl_queue) + + dpnp_ext._dummy_liveness_func([a.size, b.size, out.size]) + + return out + + return dot_2_vm + elif ndims == [1, 1]: + sig = signature(ret_type, types.voidptr, types.voidptr, types.voidptr, + types.intp) + dpnp_func = dpnp_ext.dpnp_func("dpnp_dot", [a.dtype.name, "NONE"], sig) + def dot_2_vv(a, b): + sycl_queue = get_sycl_queue() + + m, = a.shape + n, = b.shape + + if m != n: + raise ValueError("Incompatible array sizes for np.dot(a, b)") + + a_usm = allocate_usm_shared(a.size * a.itemsize, sycl_queue) + copy_usm(sycl_queue, a_usm, a.ctypes, a.size * a.itemsize) + + b_usm = allocate_usm_shared(b.size * b.itemsize, sycl_queue) + copy_usm(sycl_queue, b_usm, b.ctypes, b.size * b.itemsize) + + out = np.empty(1, dtype=res_dtype) + out_usm = allocate_usm_shared(out.size * out.itemsize, sycl_queue) + + dpnp_func(a_usm, b_usm, out_usm, m) + + copy_usm(sycl_queue, out.ctypes, out_usm, out.size * out.itemsize) + + free_usm(a_usm, sycl_queue) + free_usm(b_usm, sycl_queue) + free_usm(out_usm, sycl_queue) + + dpnp_ext._dummy_liveness_func([a.size, b.size, out.size]) + + return out[0] + + return dot_2_vv + else: + assert 0 diff --git a/numba_dppy/dpnp_glue/dpnp_sort_search_countimpl.py b/numba_dppy/dpnp_glue/dpnp_sort_search_countimpl.py new file mode 100644 index 0000000000..8ec200059b --- /dev/null +++ b/numba_dppy/dpnp_glue/dpnp_sort_search_countimpl.py @@ -0,0 +1,157 @@ +import numba_dppy.dpnp_glue.dpnpimpl as dpnp_ext +from numba.core import types, cgutils +from numba.core.typing import signature +from . import stubs +import numba_dppy.dpnp_glue as dpnp_lowering +from numba.core.extending import overload, register_jitable +import numpy as np + + +@overload(stubs.dpnp.argmax) +def dpnp_argmax_impl(a): + name = "argmax" + dpnp_lowering.ensure_dpnp(name) + dpctl_functions = dpnp_ext._DPCTL_FUNCTIONS() + + ret_type = types.void + """ + dpnp source: + https://github.com/IntelPython/dpnp/blob/0.4.0/dpnp/backend/custom_kernels_searching.cpp#L36 + + Function declaration: + void custom_argmax_c(void* array1_in, void* result1, size_t size) + """ + sig = signature(ret_type, types.voidptr, types.voidptr, types.intp) + dpnp_func = dpnp_ext.dpnp_func("dpnp_"+name, [a.dtype.name, np.dtype(np.int64).name], sig) + + get_sycl_queue = dpctl_functions.dpctl_get_current_queue() + allocate_usm_shared = dpctl_functions.dpctl_malloc_shared() + copy_usm = dpctl_functions.dpctl_queue_memcpy() + free_usm = dpctl_functions.dpctl_free_with_queue() + + res_dtype = np.int64 + + def dpnp_impl(a): + if a.size == 0: + raise ValueError("Passed Empty array") + + sycl_queue = get_sycl_queue() + + a_usm = allocate_usm_shared(a.size * a.itemsize, sycl_queue) + copy_usm(sycl_queue, a_usm, a.ctypes, a.size * a.itemsize) + + out = np.empty(1, dtype=res_dtype) + out_usm = allocate_usm_shared(out.itemsize, sycl_queue) + + dpnp_func(a_usm, out_usm, a.size) + + copy_usm(sycl_queue, out.ctypes, out_usm, out.size * out.itemsize) + + free_usm(a_usm, sycl_queue) + free_usm(out_usm, sycl_queue) + + dpnp_ext._dummy_liveness_func([a.size, out.size]) + + return out[0] + + return dpnp_impl + + +@overload(stubs.dpnp.argmin) +def dpnp_argmin_impl(a): + name = "argmin" + dpnp_lowering.ensure_dpnp(name) + dpctl_functions = dpnp_ext._DPCTL_FUNCTIONS() + + ret_type = types.void + """ + dpnp source: + https://github.com/IntelPython/dpnp/blob/0.4.0/dpnp/backend/custom_kernels_searching.cpp#L56 + + Function declaration: + void custom_argmin_c(void* array1_in, void* result1, size_t 
size) + """ + sig = signature(ret_type, types.voidptr, types.voidptr, types.intp) + dpnp_func = dpnp_ext.dpnp_func("dpnp_"+name, [a.dtype.name, np.dtype(np.int64).name], sig) + + get_sycl_queue = dpctl_functions.dpctl_get_current_queue() + allocate_usm_shared = dpctl_functions.dpctl_malloc_shared() + copy_usm = dpctl_functions.dpctl_queue_memcpy() + free_usm = dpctl_functions.dpctl_free_with_queue() + + res_dtype = np.int64 + + def dpnp_impl(a): + if a.size == 0: + raise ValueError("Passed Empty array") + + sycl_queue = get_sycl_queue() + + a_usm = allocate_usm_shared(a.size * a.itemsize, sycl_queue) + copy_usm(sycl_queue, a_usm, a.ctypes, a.size * a.itemsize) + + out = np.empty(1, dtype=res_dtype) + out_usm = allocate_usm_shared(out.itemsize, sycl_queue) + + dpnp_func(a_usm, out_usm, a.size) + + copy_usm(sycl_queue, out.ctypes, out_usm, out.size * out.itemsize) + + free_usm(a_usm, sycl_queue) + free_usm(out_usm, sycl_queue) + + dpnp_ext._dummy_liveness_func([a.size, out.size]) + + return out[0] + + return dpnp_impl + + +@overload(stubs.dpnp.argsort) +def dpnp_argsort_impl(a): + name = "argsort" + dpnp_lowering.ensure_dpnp(name) + dpctl_functions = dpnp_ext._DPCTL_FUNCTIONS() + + ret_type = types.void + """ + dpnp source: + https://github.com/IntelPython/dpnp/blob/0.4.0/dpnp/backend/custom_kernels_sorting.cpp + + Function declaration: + void custom_argsort_c(void* array1_in, void* result1, size_t size) + """ + sig = signature(ret_type, types.voidptr, types.voidptr, types.intp) + dpnp_func = dpnp_ext.dpnp_func("dpnp_"+name, [a.dtype.name, "NONE"], sig) + + get_sycl_queue = dpctl_functions.dpctl_get_current_queue() + allocate_usm_shared = dpctl_functions.dpctl_malloc_shared() + copy_usm = dpctl_functions.dpctl_queue_memcpy() + free_usm = dpctl_functions.dpctl_free_with_queue() + + res_dtype = np.int64 + + def dpnp_impl(a): + if a.size == 0: + raise ValueError("Passed Empty array") + + sycl_queue = get_sycl_queue() + + a_usm = allocate_usm_shared(a.size * a.itemsize, sycl_queue) + copy_usm(sycl_queue, a_usm, a.ctypes, a.size * a.itemsize) + + out = np.arange(a.size, dtype=res_dtype) + out_usm = allocate_usm_shared(out.size * out.itemsize, sycl_queue) + + dpnp_func(a_usm, out_usm, a.size) + + copy_usm(sycl_queue, out.ctypes, out_usm, out.size * out.itemsize) + + free_usm(a_usm, sycl_queue) + free_usm(out_usm, sycl_queue) + + dpnp_ext._dummy_liveness_func([a.size, out.size]) + + return out + + return dpnp_impl diff --git a/numba_dppy/dpnp_glue/dpnp_statisticsimpl.py b/numba_dppy/dpnp_glue/dpnp_statisticsimpl.py new file mode 100644 index 0000000000..cae9507902 --- /dev/null +++ b/numba_dppy/dpnp_glue/dpnp_statisticsimpl.py @@ -0,0 +1,312 @@ +import numba_dppy.dpnp_glue.dpnpimpl as dpnp_ext +from numba.core import types, cgutils +from numba.core.typing import signature +from . 
import stubs +import numba_dppy.dpnp_glue as dpnp_lowering +from numba.core.extending import overload, register_jitable +import numpy as np + + +@overload(stubs.dpnp.max) +@overload(stubs.dpnp.amax) +def dpnp_amax_impl(a): + name = "max" + dpnp_lowering.ensure_dpnp(name) + dpctl_functions = dpnp_ext._DPCTL_FUNCTIONS() + + ret_type = types.void + """ + dpnp source: + https://github.com/IntelPython/dpnp/blob/0.4.0/dpnp/backend/custom_kernels_statistics.cpp#L129 + + Function declaration: + void custom_max_c(void* array1_in, void* result1, const size_t* shape, + size_t ndim, const size_t* axis, size_t naxis) + + We are using void * in case of size_t * as Numba currently does not have + any type to represent size_t *. Since, both the types are pointers, + if the compiler allows there should not be any mismatch in the size of + the container to hold different types of pointer. + """ + sig = signature(ret_type, types.voidptr, types.voidptr, + types.voidptr, types.intp, + types.voidptr, types.intp) + dpnp_func = dpnp_ext.dpnp_func("dpnp_"+name, [a.dtype.name, "NONE"], sig) + + get_sycl_queue = dpctl_functions.dpctl_get_current_queue() + allocate_usm_shared = dpctl_functions.dpctl_malloc_shared() + copy_usm = dpctl_functions.dpctl_queue_memcpy() + free_usm = dpctl_functions.dpctl_free_with_queue() + + def dpnp_impl(a): + if a.size == 0: + raise ValueError("Passed Empty array") + + sycl_queue = get_sycl_queue() + + a_usm = allocate_usm_shared(a.size * a.itemsize, sycl_queue) + copy_usm(sycl_queue, a_usm, a.ctypes, a.size * a.itemsize) + + out_usm = allocate_usm_shared(a.itemsize, sycl_queue) + + dpnp_func(a_usm, out_usm, a.shapeptr, a.ndim, a.shapeptr, a.ndim) + + out = np.empty(1, dtype=a.dtype) + copy_usm(sycl_queue, out.ctypes, out_usm, out.size * out.itemsize) + + free_usm(a_usm, sycl_queue) + free_usm(out_usm, sycl_queue) + + dpnp_ext._dummy_liveness_func([out.size]) + + return out[0] + + return dpnp_impl + + +@overload(stubs.dpnp.min) +@overload(stubs.dpnp.amin) +def dpnp_amin_impl(a): + name = "min" + dpnp_lowering.ensure_dpnp(name) + dpctl_functions = dpnp_ext._DPCTL_FUNCTIONS() + + ret_type = types.void + """ + dpnp source: + https://github.com/IntelPython/dpnp/blob/0.4.0/dpnp/backend/custom_kernels_statistics.cpp#L247 + + Function declaration: + void custom_min_c(void* array1_in, void* result1, const size_t* shape, + size_t ndim, const size_t* axis, size_t naxis) + + We are using void * in case of size_t * as Numba currently does not have + any type to represent size_t *. Since, both the types are pointers, + if the compiler allows there should not be any mismatch in the size of + the container to hold different types of pointer. 
+ """ + sig = signature(ret_type, types.voidptr, types.voidptr, + types.voidptr, types.intp, + types.voidptr, types.intp) + dpnp_func = dpnp_ext.dpnp_func("dpnp_"+name, [a.dtype.name, "NONE"], sig) + + get_sycl_queue = dpctl_functions.dpctl_get_current_queue() + allocate_usm_shared = dpctl_functions.dpctl_malloc_shared() + copy_usm = dpctl_functions.dpctl_queue_memcpy() + free_usm = dpctl_functions.dpctl_free_with_queue() + + def dpnp_impl(a): + if a.size == 0: + raise ValueError("Passed Empty array") + + sycl_queue = get_sycl_queue() + + a_usm = allocate_usm_shared(a.size * a.itemsize, sycl_queue) + copy_usm(sycl_queue, a_usm, a.ctypes, a.size * a.itemsize) + + out_usm = allocate_usm_shared(a.itemsize, sycl_queue) + + dpnp_func(a_usm, out_usm, a.shapeptr, a.ndim, a.shapeptr, 0) + + out = np.empty(1, dtype=a.dtype) + copy_usm(sycl_queue, out.ctypes, out_usm, out.size * out.itemsize) + + free_usm(a_usm, sycl_queue) + free_usm(out_usm, sycl_queue) + + dpnp_ext._dummy_liveness_func([out.size]) + + return out[0] + + return dpnp_impl + + +@overload(stubs.dpnp.mean) +def dpnp_mean_impl(a): + name = "mean" + dpnp_lowering.ensure_dpnp(name) + dpctl_functions = dpnp_ext._DPCTL_FUNCTIONS() + + ret_type = types.void + """ + dpnp source: + https://github.com/IntelPython/dpnp/blob/0.4.0/dpnp/backend/custom_kernels_statistics.cpp#L169 + + Function declaration: + void custom_mean_c(void* array1_in, void* result1, const size_t* shape, + size_t ndim, const size_t* axis, size_t naxis) + + We are using void * in case of size_t * as Numba currently does not have + any type to represent size_t *. Since, both the types are pointers, + if the compiler allows there should not be any mismatch in the size of + the container to hold different types of pointer. + """ + sig = signature(ret_type, types.voidptr, types.voidptr, + types.voidptr, types.intp, + types.voidptr, types.intp) + dpnp_func = dpnp_ext.dpnp_func("dpnp_"+name, [a.dtype.name, "NONE"], sig) + + get_sycl_queue = dpctl_functions.dpctl_get_current_queue() + allocate_usm_shared = dpctl_functions.dpctl_malloc_shared() + copy_usm = dpctl_functions.dpctl_queue_memcpy() + free_usm = dpctl_functions.dpctl_free_with_queue() + + res_dtype = np.float64 + if a.dtype == types.float32: + res_dtype = np.float32 + + def dpnp_impl(a): + if a.size == 0: + raise ValueError("Passed Empty array") + + sycl_queue = get_sycl_queue() + + a_usm = allocate_usm_shared(a.size * a.itemsize, sycl_queue) + copy_usm(sycl_queue, a_usm, a.ctypes, a.size * a.itemsize) + + out = np.empty(1, dtype=res_dtype) + out_usm = allocate_usm_shared(out.itemsize, sycl_queue) + + dpnp_func(a_usm, out_usm, a.shapeptr, a.ndim, a.shapeptr, a.ndim) + + copy_usm(sycl_queue, out.ctypes, out_usm, out.size * out.itemsize) + + free_usm(a_usm, sycl_queue) + free_usm(out_usm, sycl_queue) + + dpnp_ext._dummy_liveness_func([a.size, out.size]) + return out[0] + + return dpnp_impl + + +@overload(stubs.dpnp.median) +def dpnp_median_impl(a): + name = "median" + dpnp_lowering.ensure_dpnp(name) + dpctl_functions = dpnp_ext._DPCTL_FUNCTIONS() + + ret_type = types.void + """ + dpnp source: + https://github.com/IntelPython/dpnp/blob/0.4.0/dpnp/backend/custom_kernels_statistics.cpp#L213 + + Function declaration: + void custom_median_c(void* array1_in, void* result1, const size_t* shape, + size_t ndim, const size_t* axis, size_t naxis) + + We are using void * in case of size_t * as Numba currently does not have + any type to represent size_t *. 
Since, both the types are pointers, + if the compiler allows there should not be any mismatch in the size of + the container to hold different types of pointer. + """ + sig = signature(ret_type, types.voidptr, types.voidptr, + types.voidptr, types.intp, + types.voidptr, types.intp) + dpnp_func = dpnp_ext.dpnp_func("dpnp_"+name, [a.dtype.name, "NONE"], sig) + + get_sycl_queue = dpctl_functions.dpctl_get_current_queue() + allocate_usm_shared = dpctl_functions.dpctl_malloc_shared() + copy_usm = dpctl_functions.dpctl_queue_memcpy() + free_usm = dpctl_functions.dpctl_free_with_queue() + + res_dtype = np.float64 + if a.dtype == types.float32: + res_dtype = np.float32 + + def dpnp_impl(a): + if a.size == 0: + raise ValueError("Passed Empty array") + + sycl_queue = get_sycl_queue() + + a_usm = allocate_usm_shared(a.size * a.itemsize, sycl_queue) + copy_usm(sycl_queue, a_usm, a.ctypes, a.size * a.itemsize) + + out = np.empty(1, dtype=res_dtype) + out_usm = allocate_usm_shared(out.itemsize, sycl_queue) + + dpnp_func(a_usm, out_usm, a.shapeptr, a.ndim, a.shapeptr, a.ndim) + + copy_usm(sycl_queue, out.ctypes, out_usm, out.size * out.itemsize) + + free_usm(a_usm, sycl_queue) + free_usm(out_usm, sycl_queue) + + dpnp_ext._dummy_liveness_func([a.size, out.size]) + + return out[0] + + return dpnp_impl + + +@overload(stubs.dpnp.cov) +def dpnp_cov_impl(a): + name = "cov" + dpnp_lowering.ensure_dpnp(name) + dpctl_functions = dpnp_ext._DPCTL_FUNCTIONS() + + ret_type = types.void + """ + dpnp source: + https://github.com/IntelPython/dpnp/blob/0.4.0/dpnp/backend/custom_kernels_statistics.cpp#L51 + + Function declaration: + void custom_cov_c(void* array1_in, void* result1, size_t nrows, size_t ncols) + """ + sig = signature(ret_type, types.voidptr, types.voidptr, + types.intp, types.intp) + dpnp_func = dpnp_ext.dpnp_func("dpnp_"+name, [a.dtype.name, "NONE"], sig) + + get_sycl_queue = dpctl_functions.dpctl_get_current_queue() + allocate_usm_shared = dpctl_functions.dpctl_malloc_shared() + copy_usm = dpctl_functions.dpctl_queue_memcpy() + free_usm = dpctl_functions.dpctl_free_with_queue() + + res_dtype = np.float64 + copy_input_to_double = True + if a.dtype == types.float64: + copy_input_to_double = False + + + def dpnp_impl(a): + if a.size == 0: + raise ValueError("Passed Empty array") + + sycl_queue = get_sycl_queue() + + """ We have to pass a array in double precision to DpNp """ + if copy_input_to_double: + a_copy_in_double = a.astype(np.float64) + else: + a_copy_in_double = a + a_usm = allocate_usm_shared(a_copy_in_double.size * a_copy_in_double.itemsize, sycl_queue) + copy_usm(sycl_queue, a_usm, a_copy_in_double.ctypes, + a_copy_in_double.size * a_copy_in_double.itemsize) + + if a.ndim == 2: + rows = a.shape[0] + cols = a.shape[1] + out = np.empty((rows, rows), dtype=res_dtype) + elif a.ndim == 1: + rows = 1 + cols = a.shape[0] + out = np.empty(rows, dtype=res_dtype) + + out_usm = allocate_usm_shared(out.size * out.itemsize, sycl_queue) + + dpnp_func(a_usm, out_usm, rows, cols) + + copy_usm(sycl_queue, out.ctypes, out_usm, out.size * out.itemsize) + + free_usm(a_usm, sycl_queue) + free_usm(out_usm, sycl_queue) + + dpnp_ext._dummy_liveness_func([a_copy_in_double.size, a.size, out.size]) + + if a.ndim == 2: + return out + elif a.ndim == 1: + return out[0] + + return dpnp_impl diff --git a/numba_dppy/dpnp_glue/dpnp_transcendentalsimpl.py b/numba_dppy/dpnp_glue/dpnp_transcendentalsimpl.py new file mode 100644 index 0000000000..f7ba425206 --- /dev/null +++ b/numba_dppy/dpnp_glue/dpnp_transcendentalsimpl.py 
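Every overload in the two files above, and in the transcendental implementations that follow, repeats one protocol: allocate USM shared memory on the current SYCL queue, copy the NumPy buffer in, invoke the dpNP kernel, copy the result back, and free. A condensed sketch of that round trip, using the same dpctl wrapper names obtained from `_DPCTL_FUNCTIONS`; `dpnp_kernel` is a stand-in for the resolved `dpnp_func(...)` pointer:

```python
# Condensed sketch of the USM round trip shared by the overloads above.
def usm_round_trip(a, out, dpnp_kernel, get_sycl_queue,
                   allocate_usm_shared, copy_usm, free_usm):
    sycl_queue = get_sycl_queue()
    a_usm = allocate_usm_shared(a.size * a.itemsize, sycl_queue)
    copy_usm(sycl_queue, a_usm, a.ctypes, a.size * a.itemsize)          # host -> USM
    out_usm = allocate_usm_shared(out.size * out.itemsize, sycl_queue)
    dpnp_kernel(a_usm, out_usm, a.size)                                 # device kernel
    copy_usm(sycl_queue, out.ctypes, out_usm, out.size * out.itemsize)  # USM -> host
    free_usm(a_usm, sycl_queue)
    free_usm(out_usm, sycl_queue)
    return out
```

The `_dummy_liveness_func` call at the end of each real implementation keeps the NumPy buffers alive until after the copy-back, which dead-code elimination would otherwise be free to drop.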
@@ -0,0 +1,102 @@ +import numba_dppy.dpnp_glue.dpnpimpl as dpnp_ext +from numba import types +from numba.core.typing import signature +from . import stubs +import numba_dppy.dpnp_glue as dpnp_lowering +from numba.core.extending import overload, register_jitable +import numpy as np + + +@overload(stubs.dpnp.sum) +def dpnp_sum_impl(a): + name = "sum" + dpnp_lowering.ensure_dpnp(name) + dpctl_functions = dpnp_ext._DPCTL_FUNCTIONS() + + ret_type = types.void + """ + dpnp source: + https://github.com/IntelPython/dpnp/blob/0.4.0/dpnp/backend/custom_kernels_reduction.cpp#L39 + + Function declaration: + void custom_sum_c(void* array1_in, void* result1, size_t size) + + """ + sig = signature(ret_type, types.voidptr, types.voidptr, types.intp) + dpnp_func = dpnp_ext.dpnp_func("dpnp_"+name, [a.dtype.name, "NONE"], sig) + + get_sycl_queue = dpctl_functions.dpctl_get_current_queue() + allocate_usm_shared = dpctl_functions.dpctl_malloc_shared() + copy_usm = dpctl_functions.dpctl_queue_memcpy() + free_usm = dpctl_functions.dpctl_free_with_queue() + + def dpnp_impl(a): + if a.size == 0: + raise ValueError("Passed Empty array") + + sycl_queue = get_sycl_queue() + a_usm = allocate_usm_shared(a.size * a.itemsize, sycl_queue) + copy_usm(sycl_queue, a_usm, a.ctypes, a.size * a.itemsize) + + out_usm = allocate_usm_shared(a.itemsize, sycl_queue) + + dpnp_func(a_usm, out_usm, a.size) + + out = np.empty(1, dtype=a.dtype) + copy_usm(sycl_queue, out.ctypes, out_usm, out.size * out.itemsize) + + free_usm(a_usm, sycl_queue) + free_usm(out_usm, sycl_queue) + + dpnp_ext._dummy_liveness_func([out.size]) + + return out[0] + + return dpnp_impl + + +@overload(stubs.dpnp.prod) +def dpnp_prod_impl(a): + name = "prod" + dpnp_lowering.ensure_dpnp(name) + dpctl_functions = dpnp_ext._DPCTL_FUNCTIONS() + + ret_type = types.void + """ + dpnp source: + https://github.com/IntelPython/dpnp/blob/0.4.0/dpnp/backend/custom_kernels_reduction.cpp#L83 + + Function declaration: + void custom_prod_c(void* array1_in, void* result1, size_t size) + """ + sig = signature(ret_type, types.voidptr, types.voidptr, types.intp) + dpnp_func = dpnp_ext.dpnp_func("dpnp_"+name, [a.dtype.name, "NONE"], sig) + + get_sycl_queue = dpctl_functions.dpctl_get_current_queue() + allocate_usm_shared = dpctl_functions.dpctl_malloc_shared() + copy_usm = dpctl_functions.dpctl_queue_memcpy() + free_usm = dpctl_functions.dpctl_free_with_queue() + + def dpnp_impl(a): + if a.size == 0: + raise ValueError("Passed Empty array") + + sycl_queue = get_sycl_queue() + a_usm = allocate_usm_shared(a.size * a.itemsize, sycl_queue) + copy_usm(sycl_queue, a_usm, a.ctypes, a.size * a.itemsize) + + out_usm = allocate_usm_shared(a.itemsize, sycl_queue) + + dpnp_func(a_usm, out_usm, a.size) + + out = np.empty(1, dtype=a.dtype) + copy_usm(sycl_queue, out.ctypes, out_usm, out.size * out.itemsize) + + free_usm(a_usm, sycl_queue) + free_usm(out_usm, sycl_queue) + + dpnp_ext._dummy_liveness_func([out.size]) + + return out[0] + + return dpnp_impl diff --git a/numba_dppy/dpnp_glue/dpnpdecl.py b/numba_dppy/dpnp_glue/dpnpdecl.py new file mode 100644 index 0000000000..ce1f7d3583 --- /dev/null +++ b/numba_dppy/dpnp_glue/dpnpdecl.py @@ -0,0 +1,23 @@ +from numba.core.typing.templates import (AttributeTemplate, infer_getattr) +import numba_dppy +from numba import types +from numba.core.types.misc import RawPointer + +@infer_getattr +class DppyDpnpTemplate(AttributeTemplate): + key = types.Module(numba_dppy) + + def resolve_dpnp(self, mod): + return types.Module(numba_dppy.dpnp) + +""" 
+This adds a shapeptr attribute to Numba type representing np.ndarray. +This allows us to get the raw pointer to the structure where the shape +of an ndarray is stored from an overloaded implementation +""" +@infer_getattr +class ArrayAttribute(AttributeTemplate): + key = types.Array + + def resolve_shapeptr(self, ary): + return types.voidptr diff --git a/numba_dppy/dpnp_glue/dpnpimpl.py b/numba_dppy/dpnp_glue/dpnpimpl.py new file mode 100644 index 0000000000..fa429f923f --- /dev/null +++ b/numba_dppy/dpnp_glue/dpnpimpl.py @@ -0,0 +1,47 @@ +from numba.core.imputils import lower_builtin +from numba.core import types +from numba.core.extending import register_jitable +import numpy as np +from llvmlite import ir +from numba.core.imputils import lower_getattr +from numba_dppy.dpctl_functions import _DPCTL_FUNCTIONS + +ll_void_p = ir.IntType(8).as_pointer() + +def get_dpnp_fptr(fn_name, type_names): + from . import dpnp_fptr_interface as dpnp_glue + + f_ptr = dpnp_glue.get_dpnp_fn_ptr(fn_name, type_names) + return f_ptr + +@register_jitable +def _check_finite_matrix(a): + for v in np.nditer(a): + if not np.isfinite(v.item()): + raise np.linalg.LinAlgError("Array must not contain infs or NaNs.") + +@register_jitable +def _dummy_liveness_func(a): + """pass a list of variables to be preserved through dead code elimination""" + return a[0] + +def dpnp_func(fn_name, type_names, sig): + f_ptr = get_dpnp_fptr(fn_name, type_names) + + def get_pointer(obj): + return f_ptr + + return types.ExternalFunctionPointer(sig, get_pointer=get_pointer) + +""" +This function retrieves the pointer to the structure where the shape +of an ndarray is stored. We cast it to void * to make it easier to +pass around. +""" +@lower_getattr(types.Array, "shapeptr") +def array_shape(context, builder, typ, value): + shape_ptr = builder.gep(value.operands[0], + [context.get_constant(types.int32, 0), + context.get_constant(types.int32, 5)]) + + return builder.bitcast(shape_ptr, ll_void_p) diff --git a/numba_dppy/dpnp_glue/stubs.py b/numba_dppy/dpnp_glue/stubs.py new file mode 100644 index 0000000000..2fdd6ecbe3 --- /dev/null +++ b/numba_dppy/dpnp_glue/stubs.py @@ -0,0 +1,51 @@ +from numba_dppy.ocl.stubs import Stub + +class dpnp(Stub): + """dpnp namespace + """ + _description_ = '' + + class sum(Stub): + pass + + class eig(Stub): + pass + + class prod(Stub): + pass + + class max(Stub): + pass + + class amax(Stub): + pass + + class min(Stub): + pass + + class amin(Stub): + pass + + class mean(Stub): + pass + + class median(Stub): + pass + + class argmax(Stub): + pass + + class argmin(Stub): + pass + + class argsort(Stub): + pass + + class cov(Stub): + pass + + class dot(Stub): + pass + + class matmul(Stub): + pass diff --git a/numba_dppy/dppl_host_fn_call_gen.py b/numba_dppy/dppy_host_fn_call_gen.py similarity index 91% rename from numba_dppy/dppl_host_fn_call_gen.py rename to numba_dppy/dppy_host_fn_call_gen.py index 10a4820906..2808ddf90d 100644 --- a/numba_dppy/dppl_host_fn_call_gen.py +++ b/numba_dppy/dppy_host_fn_call_gen.py @@ -9,7 +9,7 @@ from numba.core.ir_utils import legalize_names -class DPPLHostFunctionCallsGenerator(object): +class DPPYHostFunctionCallsGenerator(object): def __init__(self, lowerer, cres, num_inputs): self.lowerer = lowerer self.context = self.lowerer.context @@ -52,7 +52,7 @@ def _init_llvm_types_and_constants(self): self.byte_ptr_t = lc.Type.pointer(self.byte_t) self.byte_ptr_ptr_t = lc.Type.pointer(self.byte_ptr_t) self.intp_t = self.context.get_value_type(types.intp) - self.long_t = 
self.context.get_value_type(types.int64) + self.int64_t = self.context.get_value_type(types.int64) self.int32_t = self.context.get_value_type(types.int32) self.int32_ptr_t = lc.Type.pointer(self.int32_t) self.uintp_t = self.context.get_value_type(types.uintp) @@ -70,31 +70,31 @@ def _init_llvm_types_and_constants(self): def _declare_functions(self): get_queue_fnty = lc.Type.function(self.void_ptr_t, ()) self.get_queue = self.builder.module.get_or_insert_function(get_queue_fnty, - name="DPPLQueueMgr_GetCurrentQueue") + name="DPCTLQueueMgr_GetCurrentQueue") submit_range_fnty = lc.Type.function(self.void_ptr_t, [self.void_ptr_t, self.void_ptr_t, self.void_ptr_ptr_t, self.int32_ptr_t, self.intp_t, self.intp_ptr_t, self.intp_t, self.void_ptr_t, self.intp_t]) self.submit_range = self.builder.module.get_or_insert_function(submit_range_fnty, - name="DPPLQueue_SubmitRange") + name="DPCTLQueue_SubmitRange") queue_memcpy_fnty = lc.Type.function(lir.VoidType(), [self.void_ptr_t, self.void_ptr_t, self.void_ptr_t, self.intp_t]) self.queue_memcpy = self.builder.module.get_or_insert_function(queue_memcpy_fnty, - name="DPPLQueue_Memcpy") + name="DPCTLQueue_Memcpy") queue_wait_fnty = lc.Type.function(lir.VoidType(), [self.void_ptr_t]) self.queue_wait = self.builder.module.get_or_insert_function(queue_wait_fnty, - name="DPPLQueue_Wait") + name="DPCTLQueue_Wait") usm_shared_fnty = lc.Type.function(self.void_ptr_t, [self.intp_t, self.void_ptr_t]) self.usm_shared = self.builder.module.get_or_insert_function(usm_shared_fnty, - name="DPPLmalloc_shared") + name="DPCTLmalloc_shared") usm_free_fnty = lc.Type.function(lir.VoidType(), [self.void_ptr_t, self.void_ptr_t]) self.usm_free = self.builder.module.get_or_insert_function(usm_free_fnty, - name="DPPLfree_with_queue") + name="DPCTLfree_with_queue") def allocate_kenrel_arg_array(self, num_kernel_args): self.sycl_queue_val = cgutils.alloca_once(self.builder, self.void_ptr_t) @@ -113,23 +113,26 @@ def allocate_kenrel_arg_array(self, num_kernel_args): def resolve_and_return_dpctl_type(self, ty): + """This function looks up the dpctl defined enum values from DPCTLKernelArgType. 
+ """ + val = None if ty == types.int32 or isinstance(ty, types.scalars.IntegerLiteral): - val = self.context.get_constant(types.int32, 4) + val = self.context.get_constant(types.int32, 9) # DPCTL_LONG_LONG elif ty == types.uint32: - val = self.context.get_constant(types.int32, 5) + val = self.context.get_constant(types.int32, 10) # DPCTL_UNSIGNED_LONG_LONG elif ty == types.boolean: - val = self.context.get_constant(types.int32, 5) + val = self.context.get_constant(types.int32, 5) # DPCTL_UNSIGNED_INT elif ty == types.int64: - val = self.context.get_constant(types.int32, 7) + val = self.context.get_constant(types.int32, 9) # DPCTL_LONG_LONG elif ty == types.uint64: - val = self.context.get_constant(types.int32, 8) + val = self.context.get_constant(types.int32, 11) # DPCTL_SIZE_T elif ty == types.float32: - val = self.context.get_constant(types.int32, 12) + val = self.context.get_constant(types.int32, 12) # DPCTL_FLOAT elif ty == types.float64: - val = self.context.get_constant(types.int32, 13) + val = self.context.get_constant(types.int32, 13) # DPCTL_DOUBLE elif ty == types.voidptr: - val = self.context.get_constant(types.int32, 15) + val = self.context.get_constant(types.int32, 15) # DPCTL_VOID_PTR else: raise NotImplementedError @@ -151,12 +154,12 @@ def process_kernel_arg(self, var, llvm_arg, arg_type, gu_sig, val_type, index, m if llvm_arg is None: raise NotImplementedError(arg_type, var) - storage = cgutils.alloca_once(self.builder, self.long_t) + storage = cgutils.alloca_once(self.builder, self.int64_t) self.builder.store(self.context.get_constant(types.int64, 0), storage) ty = self.resolve_and_return_dpctl_type(types.int64) self.form_kernel_arg_and_arg_ty(self.builder.bitcast(storage, self.void_ptr_t), ty) - storage = cgutils.alloca_once(self.builder, self.long_t) + storage = cgutils.alloca_once(self.builder, self.int64_t) self.builder.store(self.context.get_constant(types.int64, 0), storage) ty = self.resolve_and_return_dpctl_type(types.int64) self.form_kernel_arg_and_arg_ty(self.builder.bitcast(storage, self.void_ptr_t), ty) diff --git a/numba_dppy/dppl_lowerer.py b/numba_dppy/dppy_lowerer.py similarity index 95% rename from numba_dppy/dppl_lowerer.py rename to numba_dppy/dppy_lowerer.py index 51fb072551..3040362592 100644 --- a/numba_dppy/dppl_lowerer.py +++ b/numba_dppy/dppy_lowerer.py @@ -12,7 +12,7 @@ from numba.core import (compiler, ir, types, sigutils, lowering, funcdesc, config) from numba.parfors import parfor -import numba_dppy, numba_dppy as dppl +import numba_dppy, numba_dppy as dppy from numba.core.ir_utils import (add_offset_to_labels, replace_var_names, remove_dels, @@ -38,9 +38,9 @@ from numba.core.errors import NumbaParallelSafetyWarning, NumbaPerformanceWarning from .dufunc_inliner import dufunc_inliner -from . import dppl_host_fn_call_gen as dppl_call_gen +from . import dppy_host_fn_call_gen as dppy_call_gen import dpctl -from numba_dppy.target import DPPLTargetContext +from numba_dppy.target import DPPYTargetContext def _print_block(block): @@ -72,7 +72,7 @@ def _schedule_loop(parfor_dim, legal_loop_indices, loop_ranges, param_dict): for eachdim in range(global_id_dim): gufunc_txt += (" " + legal_loop_indices[eachdim] + " = " - + "dppl.get_global_id(" + str(eachdim) + ")\n") + + "dppy.get_global_id(" + str(eachdim) + ")\n") for eachdim in range(global_id_dim, for_loop_dim): @@ -444,7 +444,7 @@ def print_arg_with_addrspaces(args): print("gufunc_txt = ", type(gufunc_txt), "\n", gufunc_txt) sys.stdout.flush() # Force gufunc outline into existence. 
- globls = {"np": np, "numba": numba, "dppl": dppl} + globls = {"np": np, "numba": numba, "dppy": dppy} locls = {} exec(gufunc_txt, globls, locls) gufunc_func = locls[gufunc_name] @@ -520,6 +520,8 @@ def print_arg_with_addrspaces(args): diagnostics.hoist_info[parfor.id] = {'hoisted': hoisted, 'not_hoisted': not_hoisted} + lowerer.metadata['parfor_diagnostics'].extra_info[str(parfor.id)] = str(dpctl.get_current_queue().get_sycl_device().get_device_name()) + if config.DEBUG_ARRAY_OPT: print("After hoisting") _print_body(loop_body) @@ -740,7 +742,7 @@ def _lower_parfor_gufunc(lowerer, parfor): parfor.races, typemap) - generate_dppl_host_wrapper( + generate_dppy_host_wrapper( lowerer, func, gu_signature, @@ -828,10 +830,10 @@ def bump_alpha(c, class_map): return (gu_sin, gu_sout) -# Keep all the dppl kernels and programs created alive indefinitely. +# Keep all the dppy kernels and programs created alive indefinitely. keep_alive_kernels = [] -def generate_dppl_host_wrapper(lowerer, +def generate_dppy_host_wrapper(lowerer, cres, gu_signature, outer_sig, @@ -852,7 +854,7 @@ def generate_dppl_host_wrapper(lowerer, num_dim = len(loop_ranges) if config.DEBUG_ARRAY_OPT: - print("generate_dppl_host_wrapper") + print("generate_dppy_host_wrapper") print("args = ", expr_args) print("outer_sig = ", outer_sig.args, outer_sig.return_type, outer_sig.recvr, outer_sig.pysig) @@ -868,8 +870,8 @@ def generate_dppl_host_wrapper(lowerer, # print("cres.fndesc", cres.fndesc, type(cres.fndesc)) - # get dppl_cpu_portion_lowerer object - dppl_cpu_lowerer = dppl_call_gen.DPPLHostFunctionCallsGenerator( + # get dppy_cpu_portion_lowerer object + dppy_cpu_lowerer = dppy_call_gen.DPPYHostFunctionCallsGenerator( lowerer, cres, num_inputs) # Compute number of args ------------------------------------------------ @@ -886,7 +888,7 @@ def generate_dppl_host_wrapper(lowerer, # now that we know the total number of kernel args, lets allocate # a kernel_arg array - dppl_cpu_lowerer.allocate_kenrel_arg_array(num_expanded_args) + dppy_cpu_lowerer.allocate_kenrel_arg_array(num_expanded_args) ninouts = len(expr_args) @@ -931,7 +933,7 @@ def val_type_or_none(context, lowerer, x): "\n\tval_type:", val_type, type(val_type), "\n\tindex:", index) - dppl_cpu_lowerer.process_kernel_arg(var, llvm_arg, arg_type, gu_sig, + dppy_cpu_lowerer.process_kernel_arg(var, llvm_arg, arg_type, gu_sig, val_type, index, modified_arrays) # ----------------------------------------------------------------------- @@ -951,7 +953,7 @@ def load_range(v): step = load_range(step) loop_ranges[i] = (start, stop, step) - dppl_cpu_lowerer.enqueue_kernel_and_read_back(loop_ranges) + dppy_cpu_lowerer.enqueue_kernel_and_read_back(loop_ranges) from numba.core.lowering import Lower @@ -975,18 +977,17 @@ def relatively_deep_copy(obj, memo): from numba.core.types.functions import Function, Dispatcher from numba.core.bytecode import FunctionIdentity from numba.core.typing.templates import Signature - from numba_dppy.compiler import DPPLFunctionTemplate + from numba_dppy.compiler import DPPYFunctionTemplate from numba.core.compiler import CompileResult from numba.np.ufunc.dufunc import DUFunc from ctypes import _CFuncPtr - from cffi.api import FFI from types import ModuleType from numba.core.types.abstract import Type # objects which shouldn't or can't be copied and it's ok not to copy it. 
- if isinstance(obj, (FunctionIdentity, _DispatcherBase, Function, Type, Dispatcher, ModuleType, - Signature, DPPLFunctionTemplate, CompileResult, - DUFunc, _CFuncPtr, FFI, + if isinstance(obj, (FunctionIdentity, _DispatcherBase, Function, Type, + Dispatcher, ModuleType, Signature, + DPPYFunctionTemplate, CompileResult, DUFunc, _CFuncPtr, type, str, bool, type(None))): return obj @@ -1133,7 +1134,7 @@ def get_slots_members(obj): return cpy -class DPPLLower(Lower): +class DPPYLower(Lower): def __init__(self, context, library, fndesc, func_ir, metadata=None): Lower.__init__(self, context, library, fndesc, func_ir, metadata) memo = {} @@ -1142,7 +1143,7 @@ def __init__(self, context, library, fndesc, func_ir, metadata=None): func_ir_cpu = relatively_deep_copy(func_ir, memo) - cpu_context = context.cpu_context if isinstance(context, DPPLTargetContext) else context + cpu_context = context.cpu_context if isinstance(context, DPPYTargetContext) else context self.gpu_lower = Lower(context, library, fndesc, func_ir, metadata) self.cpu_lower = Lower(cpu_context, library, fndesc_cpu, func_ir_cpu, metadata) @@ -1152,11 +1153,11 @@ def lower(self): # 1. Start lowering of parent function # 2. Try to lower parfor on GPU # 2.a. enter lower_parfor_rollback and prepare function to lower on GPU - insert get_global_id. - # 2.a.a. starting lower parfor body - enter this point (DPPLLower.lower()) second time. + # 2.a.a. starting lower parfor body - enter this point (DPPYLower.lower()) second time. # 2.a.b. If lowering on GPU failed - try on CPU. # 2.a.d. Since get_global_id is NOT supported with CPU context - fail and throw exception # 2.b. in lower_parfor_rollback catch exception and restore parfor body and other state to its initial form - # 2.c. in lower_parfor_rollback throw expeption to catch it here (DPPLLower.lower()) + # 2.c. in lower_parfor_rollback throw exception to catch it here (DPPYLower.lower()) # 3. Catch exception and start parfor lowering with CPU context. # WARNING: this approach only works in case no device specific modifications were added to @@ -1166,13 +1167,18 @@ def lower(self): try: lowering.lower_extensions[parfor.Parfor].append(lower_parfor_rollback) self.gpu_lower.lower() + # if lowering does not crash and parfor_diagnostics is empty, then this is a kernel + if not self.gpu_lower.metadata['parfor_diagnostics'].extra_info: + str_name = str(dpctl.get_current_queue().get_sycl_device().get_device_name()) + self.gpu_lower.metadata['parfor_diagnostics'].extra_info["kernel"] = str_name self.base_lower = self.gpu_lower lowering.lower_extensions[parfor.Parfor].pop() except Exception as e: if numba_dppy.compiler.DEBUG: - print("Failed to lower parfor on DPPL-device. Due to:\n", e) + print("Failed to lower parfor on DPPY-device. 
Due to:\n", e) lowering.lower_extensions[parfor.Parfor].pop() - if (lowering.lower_extensions[parfor.Parfor][-1] == numba.parfors.parfor_lowering._lower_parfor_parallel): + if ((lowering.lower_extensions[parfor.Parfor][-1] == numba.parfors.parfor_lowering._lower_parfor_parallel) and + numba_dppy.config.FALLBACK_ON_CPU == 1): self.cpu_lower.lower() self.base_lower = self.cpu_lower else: @@ -1196,13 +1202,13 @@ def lower_parfor_rollback(lowerer, parfor): try: _lower_parfor_gufunc(lowerer, parfor) if numba_dppy.compiler.DEBUG: - msg = "Parfor lowered on DPPL-device" + msg = "Parfor lowered on DPPY-device" print(msg, parfor.loc) except Exception as e: - msg = "Failed to lower parfor on DPPL-device.\nTo see details set environment variable NUMBA_DPPL_DEBUG=1" + msg = "Failed to lower parfor on DPPY-device.\nTo see details set environment variable NUMBA_DPPY_DEBUG=1" warnings.warn(NumbaPerformanceWarning(msg, parfor.loc)) raise e -def dppl_lower_array_expr(lowerer, expr): +def dppy_lower_array_expr(lowerer, expr): raise NotImplementedError(expr) diff --git a/numba_dppy/dppl_offload_dispatcher.py b/numba_dppy/dppy_offload_dispatcher.py similarity index 67% rename from numba_dppy/dppl_offload_dispatcher.py rename to numba_dppy/dppy_offload_dispatcher.py index 49a599589e..0c5fe10f5e 100644 --- a/numba_dppy/dppl_offload_dispatcher.py +++ b/numba_dppy/dppy_offload_dispatcher.py @@ -1,23 +1,23 @@ from numba.core import dispatcher, compiler from numba.core.registry import cpu_target, dispatcher_registry -import numba.dppl_config as dppl_config +import numba_dppy.config as dppy_config -class DpplOffloadDispatcher(dispatcher.Dispatcher): +class DppyOffloadDispatcher(dispatcher.Dispatcher): targetdescr = cpu_target def __init__(self, py_func, locals={}, targetoptions={}, impl_kind='direct', pipeline_class=compiler.Compiler): - if dppl_config.dppl_present: - from numba_dppy.compiler import DPPLCompiler + if dppy_config.dppy_present: + from numba_dppy.compiler import DPPYCompiler targetoptions['parallel'] = True dispatcher.Dispatcher.__init__(self, py_func, locals=locals, - targetoptions=targetoptions, impl_kind=impl_kind, pipeline_class=DPPLCompiler) + targetoptions=targetoptions, impl_kind=impl_kind, pipeline_class=DPPYCompiler) else: print("---------------------------------------------------------------------") - print("WARNING : DPPL pipeline ignored. Ensure OpenCL drivers are installed.") + print("WARNING : DPPY pipeline ignored. 
Ensure OpenCL drivers are installed.") print("---------------------------------------------------------------------") dispatcher.Dispatcher.__init__(self, py_func, locals=locals, targetoptions=targetoptions, impl_kind=impl_kind, pipeline_class=pipeline_class) -dispatcher_registry['__dppl_offload_gpu__'] = DpplOffloadDispatcher -dispatcher_registry['__dppl_offload_cpu__'] = DpplOffloadDispatcher +dispatcher_registry['__dppy_offload_gpu__'] = DppyOffloadDispatcher +dispatcher_registry['__dppy_offload_cpu__'] = DppyOffloadDispatcher diff --git a/numba_dppy/dppy_parfor_diagnostics.py b/numba_dppy/dppy_parfor_diagnostics.py new file mode 100644 index 0000000000..50e19a1cb1 --- /dev/null +++ b/numba_dppy/dppy_parfor_diagnostics.py @@ -0,0 +1,106 @@ +from numba.parfors.parfor import ParforDiagnostics, _termwidth, print_wrapped + + +class ExtendedParforDiagnostics(ParforDiagnostics): + def __init__(self): + ParforDiagnostics.__init__(self) + self.extra_info = {} + + def dump(self, level=1): + if level == 0: + level = 1 + super().dump(level) + + if self.extra_info: + parfors_simple = self.get_parfors_simple(False) + all_lines = self.get_all_lines(parfors_simple) + print(' Auto-offloading '.center(_termwidth,'-')) + self.print_auto_offloading(all_lines) + if 'kernel' in self.extra_info.keys(): + print_wrapped("Device - '%s'" % self.extra_info['kernel']) + print(_termwidth * '-') + + def print_auto_offloading(self, lines): + sword = '+--' + fac = len(sword) + + summary = dict() + # region : {fused, serialized} + + def print_nest(fadj_, nadj_, theroot, reported, region_id): + def print_g(fadj_, nadj_, nroot, depth): + for k in nadj_[nroot]: + msg = fac * depth * ' ' + '%s%s %s' % (sword, k, '(serial') + if nadj_[k] == []: + fused = [] + if fadj_[k] != [] and k not in reported: + fused = sorted(self.reachable_nodes(fadj_, k)) + msg += ", fused with loop(s): " + msg += ', '.join([str(x) for x in fused]) + msg += ')' + reported.append(k) + print_wrapped(msg) + summary[region_id]['fused'] += len(fused) + else: + print_wrapped(msg + ')') + print_g(fadj_, nadj_, k, depth + 1) + summary[region_id]['serialized'] += 1 + + if nadj_[theroot] != []: + print_wrapped("Parallel region %s:" % region_id) + print_wrapped('%s%s %s' % (sword, theroot, '(parallel)')) + summary[region_id] = {'root': theroot, 'fused': 0, 'serialized': 0} + print_g(fadj_, nadj_, theroot, 1) + print("\n") + region_id = region_id + 1 + return region_id + + def print_fuse(ty, pf_id, adj, depth, region_id): + print_wrapped("Parallel region %s:" % region_id) + msg = fac * depth * ' ' + '%s%s %s' % (sword, pf_id, '(parallel') + fused = [] + if adj[pf_id] != []: + fused = sorted(self.reachable_nodes(adj, pf_id)) + msg += ", fused with loop(s): " + msg += ', '.join([str(x) for x in fused]) + + summary[region_id] = {'root': pf_id, 'fused': len(fused), 'serialized': 0} + msg += ')' + print_wrapped(msg) + extra_info = self.extra_info.get(str(region_id)) + if extra_info: + print_wrapped("Device - '%s'" % extra_info) + region_id = region_id + 1 + return region_id + + # Walk the parfors by src line and print optimised structure + region_id = 0 + reported = [] + for line, info in sorted(lines.items()): + opt_ty, pf_id, adj = info + if opt_ty == 'fuse': + if pf_id not in reported: + region_id = print_fuse('f', pf_id, adj, 0, region_id) + elif opt_ty == 'nest': + region_id = print_nest(fadj, nadj, pf_id, reported, region_id) + else: + assert 0 + + # print the summary of the fuse/serialize rewrite + if summary: + for k, v in sorted(summary.items()): + msg = 
('\n \nParallel region %s (loop #%s) had %s ' + 'loop(s) fused') + root = v['root'] + fused = v['fused'] + serialized = v['serialized'] + if serialized != 0: + msg += (' and %s loop(s) ' + 'serialized as part of the larger ' + 'parallel loop (#%s).') + print_wrapped(msg % (k, root, fused, serialized, root)) + else: + msg += '.' + print_wrapped(msg % (k, root, fused)) + else: + print_wrapped("Parallel structure is already optimal.") diff --git a/numba_dppy/dppl_passbuilder.py b/numba_dppy/dppy_passbuilder.py similarity index 75% rename from numba_dppy/dppl_passbuilder.py rename to numba_dppy/dppy_passbuilder.py index 0ddaea6d0b..994351d509 100644 --- a/numba_dppy/dppl_passbuilder.py +++ b/numba_dppy/dppy_passbuilder.py @@ -17,19 +17,21 @@ DumpParforDiagnostics, IRLegalization, InlineOverloads, PreLowerStripPhis) -from .dppl_passes import ( - DPPLConstantSizeStaticLocalMemoryPass, - DPPLPreParforPass, - DPPLParforPass, +from .dppy_passes import ( + DPPYConstantSizeStaticLocalMemoryPass, + DPPYPreParforPass, + DPPYParforPass, SpirvFriendlyLowering, - DPPLAddNumpyOverloadPass, - DPPLAddNumpyRemoveOverloadPass, - DPPLNoPythonBackend + DPPYNoPythonBackend, + DPPYDumpParforDiagnostics ) -class DPPLPassBuilder(object): +from .rename_numpy_functions_pass import (DPPYRewriteOverloadedNumPyFunctions, + DPPYRewriteNdarrayFunctions) + +class DPPYPassBuilder(object): """ - This is the DPPL pass builder to run Intel GPU/CPU specific + This is the DPPY pass builder to run Intel GPU/CPU specific code-generation and optimization passes. This pass builder does not offer objectmode and interpreted passes. """ @@ -44,14 +46,15 @@ def default_numba_nopython_pipeline(state, pm): pm.add_pass(IRProcessing, "processing IR") pm.add_pass(WithLifting, "Handle with contexts") - # this pass adds required logic to overload default implementation of - # Numpy functions - pm.add_pass(DPPLAddNumpyOverloadPass, "dppl add typing template for Numpy functions") + # this pass rewrites name of NumPy functions we intend to overload + pm.add_pass(DPPYRewriteOverloadedNumPyFunctions, + "Rewrite name of Numpy functions to overload already overloaded function", + ) # Add pass to ensure when users are allocating static # constant memory the size is a constant and can not # come from a closure variable - pm.add_pass(DPPLConstantSizeStaticLocalMemoryPass, "dppl constant size for static local memory") + pm.add_pass(DPPYConstantSizeStaticLocalMemoryPass, "dppy constant size for static local memory") # pre typing if not state.flags.no_rewrites: @@ -81,6 +84,10 @@ def default_numba_nopython_pipeline(state, pm): pm.add_pass(NopythonTypeInference, "nopython frontend") pm.add_pass(AnnotateTypes, "annotate types") + pm.add_pass(DPPYRewriteNdarrayFunctions, + "Rewrite ndarray functions to dppy supported functions", + ) + # strip phis pm.add_pass(PreLowerStripPhis, "remove phis nodes") @@ -88,26 +95,25 @@ def default_numba_nopython_pipeline(state, pm): pm.add_pass(InlineOverloads, "inline overloaded functions") - @staticmethod - def define_nopython_pipeline(state, name='dppl_nopython'): + def define_nopython_pipeline(state, name='dppy_nopython'): """Returns an nopython mode pipeline based PassManager """ pm = PassManager(name) - DPPLPassBuilder.default_numba_nopython_pipeline(state, pm) + DPPYPassBuilder.default_numba_nopython_pipeline(state, pm) # Intel GPU/CPU specific optimizations - pm.add_pass(DPPLPreParforPass, "Preprocessing for parfors") + pm.add_pass(DPPYPreParforPass, "Preprocessing for parfors") if not state.flags.no_rewrites: 
pm.add_pass(NopythonRewrites, "nopython rewrites") - pm.add_pass(DPPLParforPass, "convert to parfors") + pm.add_pass(DPPYParforPass, "convert to parfors") # legalise pm.add_pass(IRLegalization, "ensure IR is legal prior to lowering") # lower pm.add_pass(SpirvFriendlyLowering, "SPIRV-friendly lowering pass") - pm.add_pass(DPPLNoPythonBackend, "nopython mode backend") - pm.add_pass(DPPLAddNumpyRemoveOverloadPass, "dppl remove typing template for Numpy functions") + pm.add_pass(DPPYNoPythonBackend, "nopython mode backend") + pm.add_pass(DPPYDumpParforDiagnostics, "dump parfor diagnostics") pm.finalize() return pm diff --git a/numba_dppy/dppl_passes.py b/numba_dppy/dppy_passes.py similarity index 70% rename from numba_dppy/dppl_passes.py rename to numba_dppy/dppy_passes.py index f9e2633c3c..be9423230b 100644 --- a/numba_dppy/dppl_passes.py +++ b/numba_dppy/dppy_passes.py @@ -3,6 +3,7 @@ import warnings import numpy as np +import numba from numba.core import ir import weakref from collections import namedtuple, deque @@ -22,130 +23,20 @@ from numba.core.errors import (LoweringError, new_error_context, TypingError, LiteralTypingError) -from numba.core.compiler_machinery import FunctionPass, LoweringPass, register_pass +from numba.core.compiler_machinery import FunctionPass, LoweringPass, register_pass, AnalysisPass -from .dppl_lowerer import DPPLLower +from .dppy_lowerer import DPPYLower +from numba_dppy import config as dppy_config from numba.parfors.parfor import PreParforPass as _parfor_PreParforPass, replace_functions_map from numba.parfors.parfor import ParforPass as _parfor_ParforPass from numba.parfors.parfor import Parfor -def dpnp_available(): - try: - # import dpnp - from numba_dppy.dpnp_glue import dpnp_fptr_interface as dpnp_glue - return True - except: - return False - - -@register_pass(mutates_CFG=False, analysis_only=True) -class DPPLAddNumpyOverloadPass(FunctionPass): - _name = "dppl_add_numpy_overload_pass" - - def __init__(self): - FunctionPass.__init__(self) - - def run_pass(self, state): - if dpnp_available(): - typingctx = state.typingctx - from numba.core.typing.templates import builtin_registry as reg, infer_global - from numba.core.typing.templates import (AbstractTemplate, CallableTemplate, signature) - from numba.core.typing.npydecl import MatMulTyperMixin - - @infer_global(np.cov) - class NPCov(AbstractTemplate): - def generic(self, args, kws): - assert not kws - if args[0].ndim > 2: - return - - nb_dtype = types.float64 - return_type = types.Array(dtype=nb_dtype, ndim=args[0].ndim, layout='C') - return signature(return_type, *args) - - @infer_global(np.matmul, typing_key="np.matmul") - class matmul(MatMulTyperMixin, AbstractTemplate): - key = np.matmul - func_name = "np.matmul()" - - def generic(self, args, kws): - assert not kws - restype = self.matmul_typer(*args) - if restype is not None: - return signature(restype, *args) - - @infer_global(np.median) - class NPMedian(AbstractTemplate): - def generic(self, args, kws): - assert not kws - - retty = args[0].dtype - return signature(retty, *args) - - @infer_global(np.mean) - #@infer_global("array.mean") - class NPMean(AbstractTemplate): - def generic(self, args, kws): - assert not kws - - if args[0].dtype == types.float32: - retty = types.float32 - else: - retty = types.float64 - return signature(retty, *args) - - - prev_cov = None - prev_median = None - prev_mean = None - for idx, g in enumerate(reg.globals): - if g[0] == np.cov: - if not prev_cov: - prev_cov = g[1] - else: - prev_cov.templates = g[1].templates - - if 
g[0] == np.median: - if not prev_median: - prev_median = g[1] - else: - prev_median.templates = g[1].templates - - if g[0] == np.mean: - if not prev_mean: - prev_mean = g[1] - else: - prev_mean.templates = g[1].templates - - typingctx.refresh() - return True - -@register_pass(mutates_CFG=False, analysis_only=True) -class DPPLAddNumpyRemoveOverloadPass(FunctionPass): - _name = "dppl_remove_numpy_overload_pass" - - def __init__(self): - FunctionPass.__init__(self) - - def run_pass(self, state): - if dpnp_available(): - typingctx = state.typingctx - targetctx = state.targetctx - - from importlib import reload - from numba.np import npyimpl, arrayobj, arraymath - reload(npyimpl) - reload(arrayobj) - reload(arraymath) - targetctx.refresh() - - return True @register_pass(mutates_CFG=True, analysis_only=False) -class DPPLConstantSizeStaticLocalMemoryPass(FunctionPass): +class DPPYConstantSizeStaticLocalMemoryPass(FunctionPass): - _name = "dppl_constant_size_static_local_memory_pass" + _name = "dppy_constant_size_static_local_memory_pass" def __init__(self): FunctionPass.__init__(self) @@ -218,9 +109,9 @@ def run_pass(self, state): @register_pass(mutates_CFG=True, analysis_only=False) -class DPPLPreParforPass(FunctionPass): +class DPPYPreParforPass(FunctionPass): - _name = "dppl_pre_parfor_pass" + _name = "dppy_pre_parfor_pass" def __init__(self): FunctionPass.__init__(self) @@ -262,9 +153,9 @@ def run_pass(self, state): @register_pass(mutates_CFG=True, analysis_only=False) -class DPPLParforPass(FunctionPass): +class DPPYParforPass(FunctionPass): - _name = "dppl_parfor_pass" + _name = "dppy_parfor_pass" def __init__(self): FunctionPass.__init__(self) @@ -339,21 +230,8 @@ def run_pass(self, state): # be later serialized. state.library.enable_object_caching() - targetctx = state.targetctx - # This should not happen here, after we have the notion of context in Numba - # we should have specialized dispatcher for dppl context and that dispatcher - # should be a cpu dispatcher that will overload the lowering functions for - # linalg for dppl.cpu_dispatcher and the dppl.gpu_dipatcher should be the - # current target context we have to launch kernels. - # This is broken as this essentially adds the new lowering in a list which - # means it does not get replaced with the new lowering_buitins - - if dpnp_available(): - from . import experimental_numpy_lowering_overload - targetctx.refresh() - library = state.library interp = state.func_ir # why is it called this?! 
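The block deleted above patched typing templates directly into Numba's builtin registry; the replacement design elsewhere in this patch instead registers plain `@overload` implementations against stub functions and rewrites NumPy call sites to those stubs. A minimal, self-contained sketch of that pattern with a hypothetical `median_stub`; the real overloads build a dpNP function pointer and do a USM round trip rather than this pure-NumPy placeholder body:

```python
import numpy as np
from numba import njit
from numba.core.extending import overload


def median_stub(a):
    """Hypothetical stand-in for numba_dppy.dpnp_glue.stubs.dpnp.median."""
    raise NotImplementedError("only callable from nopython mode")


@overload(median_stub)
def median_stub_impl(a):
    # The implementation is selected at typing time; this placeholder
    # body is what the dpNP dispatch replaces in the real overloads.
    def impl(a):
        return np.sort(a)[a.size // 2]
    return impl


@njit
def use_stub(a):
    return median_stub(a)


print(use_stub(np.array([3.0, 1.0, 2.0])))  # -> 2.0
```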
typemap = state.typemap @@ -373,7 +251,7 @@ def run_pass(self, state): noalias=flags.noalias) with targetctx.push_code_library(library): - lower = DPPLLower(targetctx, library, fndesc, interp, + lower = DPPYLower(targetctx, library, fndesc, interp, metadata=metadata) lower.lower() if not flags.no_cpython_wrapper: @@ -400,7 +278,7 @@ def run_pass(self, state): @register_pass(mutates_CFG=True, analysis_only=False) -class DPPLNoPythonBackend(FunctionPass): +class DPPYNoPythonBackend(FunctionPass): _name = "nopython_backend" @@ -437,3 +315,21 @@ def run_pass(self, state): remove_dels(state.func_ir.blocks) return True + + +@register_pass(mutates_CFG=False, analysis_only=True) +class DPPYDumpParforDiagnostics(AnalysisPass): + + _name = "dump_parfor_diagnostics" + + def __init__(self): + AnalysisPass.__init__(self) + + def run_pass(self, state): + # if state.flags.auto_parallel.enabled: //add in condition flag for kernels + if dppy_config.OFFLOAD_DIAGNOSTICS: + if state.parfor_diagnostics is not None: + state.parfor_diagnostics.dump(config.PARALLEL_DIAGNOSTICS) + else: + raise RuntimeError("Diagnostics failed.") + return True diff --git a/numba_dppy/examples/dppl_func.py b/numba_dppy/examples/dppy_func.py similarity index 81% rename from numba_dppy/examples/dppl_func.py rename to numba_dppy/examples/dppy_func.py index ec86681457..353ba48995 100644 --- a/numba_dppy/examples/dppl_func.py +++ b/numba_dppy/examples/dppy_func.py @@ -1,26 +1,26 @@ import sys import numpy as np -import numba_dppy, numba_dppy as dppl +import numba_dppy, numba_dppy as dppy import math import dpctl -@dppl.func +@dppy.func def g(a): return a + 1 -@dppl.kernel +@dppy.kernel def f(a, b): - i = dppl.get_global_id(0) + i = dppy.get_global_id(0) b[i] = g(a[i]) def driver(a, b, N): print(b) print("--------") - f[N, dppl.DEFAULT_LOCAL_SIZE](a, b) + f[N, dppy.DEFAULT_LOCAL_SIZE](a, b) print(b) diff --git a/numba_dppy/examples/dppl_with_context.py b/numba_dppy/examples/dppy_with_context.py similarity index 94% rename from numba_dppy/examples/dppl_with_context.py rename to numba_dppy/examples/dppy_with_context.py index c830e81ec6..6df025f5ca 100644 --- a/numba_dppy/examples/dppl_with_context.py +++ b/numba_dppy/examples/dppy_with_context.py @@ -1,6 +1,6 @@ import numpy as np from numba import njit, prange -import numba_dppy, numba_dppy as dppl +import numba_dppy, numba_dppy as dppy import dpctl @njit diff --git a/numba_dppy/examples/matmul.py b/numba_dppy/examples/matmul.py index 35bef5be8a..b97ac49ca1 100644 --- a/numba_dppy/examples/matmul.py +++ b/numba_dppy/examples/matmul.py @@ -4,14 +4,14 @@ import sys import numpy as np -import numba_dppy, numba_dppy as dppl +import numba_dppy, numba_dppy as dppy import dpctl -@dppl.kernel -def dppl_gemm(a, b, c): - i = dppl.get_global_id(0) - j = dppl.get_global_id(1) +@dppy.kernel +def dppy_gemm(a, b, c): + i = dppy.get_global_id(0) + j = dppy.get_global_id(1) if i >= c.shape[0] or j >= c.shape[1]: return c[i,j] = 0 @@ -30,7 +30,7 @@ def dppl_gemm(a, b, c): def driver(a, b, c): # Invoke the kernel - dppl_gemm[griddim,blockdim](a, b, c) + dppy_gemm[griddim,blockdim](a, b, c) def main(): diff --git a/numba_dppy/examples/pa_examples/test1-2d.py b/numba_dppy/examples/pa_examples/test1-2d.py index 7985216aba..df3849b30d 100644 --- a/numba_dppy/examples/pa_examples/test1-2d.py +++ b/numba_dppy/examples/pa_examples/test1-2d.py @@ -1,23 +1,29 @@ from numba import njit, gdb import numpy as np +import dpctl -@njit(parallel={'offload':True}) + +@njit def f1(a, b): c = a + b return c + N = 1000 
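The `DPPYDumpParforDiagnostics` pass above and the CPU fallback in `DPPYLower` are both gated on `numba_dppy.config`. A shell sketch for exercising them; the two `NUMBA_DPPY_*` spellings are assumptions inferred from the config attribute names and the `NUMBA_DPPY_` prefix convention, while `NUMBA_PARALLEL_DIAGNOSTICS` is Numba's existing knob behind `config.PARALLEL_DIAGNOSTICS`:

```bash
export NUMBA_DPPY_OFFLOAD_DIAGNOSTICS=1   # assumed name: lets the dump pass fire
export NUMBA_DPPY_FALLBACK_ON_CPU=1       # assumed name: permits CPU fallback
export NUMBA_PARALLEL_DIAGNOSTICS=2       # verbosity of the diagnostics dump
python numba_dppy/examples/pa_examples/test1.py
```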
print("N", N) -a = np.ones((N,N), dtype=np.float32) -b = np.ones((N,N), dtype=np.float32) +a = np.ones((N, N), dtype=np.float32) +b = np.ones((N, N), dtype=np.float32) print("a:", a, hex(a.ctypes.data)) print("b:", b, hex(b.ctypes.data)) -c = f1(a,b) + +with dpctl.device_context("opencl:gpu:0"): + c = f1(a, b) + print("BIG RESULT c:", c, hex(c.ctypes.data)) for i in range(N): for j in range(N): - if c[i,j] != 2.0: + if c[i, j] != 2.0: print("First index not equal to 2.0 was", i, j) break diff --git a/numba_dppy/examples/pa_examples/test1-3d.py b/numba_dppy/examples/pa_examples/test1-3d.py index 1304c0762a..a69aa0cbc5 100644 --- a/numba_dppy/examples/pa_examples/test1-3d.py +++ b/numba_dppy/examples/pa_examples/test1-3d.py @@ -1,24 +1,30 @@ from numba import njit, gdb import numpy as np +import dpctl -@njit(parallel={'offload':True}) + +@njit def f1(a, b): c = a + b return c + N = 10 print("N", N) -a = np.ones((N,N,N), dtype=np.float32) -b = np.ones((N,N,N), dtype=np.float32) +a = np.ones((N, N, N), dtype=np.float32) +b = np.ones((N, N, N), dtype=np.float32) print("a:", a, hex(a.ctypes.data)) print("b:", b, hex(b.ctypes.data)) -c = f1(a,b) + +with dpctl.device_context("opencl:gpu:0"): + c = f1(a, b) + print("BIG RESULT c:", c, hex(c.ctypes.data)) for i in range(N): for j in range(N): for k in range(N): - if c[i,j,k] != 2.0: + if c[i, j, k] != 2.0: print("First index not equal to 2.0 was", i, j, k) break diff --git a/numba_dppy/examples/pa_examples/test1-4d.py b/numba_dppy/examples/pa_examples/test1-4d.py index bb52da28de..2647d0e66e 100644 --- a/numba_dppy/examples/pa_examples/test1-4d.py +++ b/numba_dppy/examples/pa_examples/test1-4d.py @@ -1,25 +1,31 @@ from numba import njit, gdb import numpy as np +import dpctl -@njit(parallel={'offload':True}) + +@njit def f1(a, b): c = a + b return c + N = 10 print("N", N) -a = np.ones((N,N,N,N), dtype=np.float32) -b = np.ones((N,N,N,N), dtype=np.float32) +a = np.ones((N, N, N, N), dtype=np.float32) +b = np.ones((N, N, N, N), dtype=np.float32) print("a:", a, hex(a.ctypes.data)) print("b:", b, hex(b.ctypes.data)) -c = f1(a,b) + +with dpctl.device_context("opencl:gpu:0"): + c = f1(a, b) + print("BIG RESULT c:", c, hex(c.ctypes.data)) for i in range(N): for j in range(N): for k in range(N): for l in range(N): - if c[i,j,k,l] != 2.0: + if c[i, j, k, l] != 2.0: print("First index not equal to 2.0 was", i, j, k, l) break diff --git a/numba_dppy/examples/pa_examples/test1-5d.py b/numba_dppy/examples/pa_examples/test1-5d.py index e795dbe602..893fe3b6a6 100644 --- a/numba_dppy/examples/pa_examples/test1-5d.py +++ b/numba_dppy/examples/pa_examples/test1-5d.py @@ -1,26 +1,32 @@ from numba import njit, gdb import numpy as np +import dpctl -@njit(parallel={'offload':True}) + +@njit def f1(a, b): c = a + b return c + N = 5 print("N", N) -a = np.ones((N,N,N,N,N), dtype=np.float32) -b = np.ones((N,N,N,N,N), dtype=np.float32) +a = np.ones((N, N, N, N, N), dtype=np.float32) +b = np.ones((N, N, N, N, N), dtype=np.float32) print("a:", a, hex(a.ctypes.data)) print("b:", b, hex(b.ctypes.data)) -c = f1(a,b) + +with dpctl.device_context("opencl:gpu:0"): + c = f1(a, b) + print("BIG RESULT c:", c, hex(c.ctypes.data)) for i in range(N): for j in range(N): for k in range(N): for l in range(N): for m in range(N): - if c[i,j,k,l,m] != 2.0: + if c[i, j, k, l, m] != 2.0: print("First index not equal to 2.0 was", i, j, k, l, m) break diff --git a/numba_dppy/examples/pa_examples/test1.py b/numba_dppy/examples/pa_examples/test1.py index 1620654cf8..01209b3309 100644 --- 
a/numba_dppy/examples/pa_examples/test1.py +++ b/numba_dppy/examples/pa_examples/test1.py @@ -1,8 +1,9 @@ from numba import njit import numpy as np +import dpctl -@njit(parallel={'offload':True}) +@njit def f1(a, b): c = a + b return c @@ -19,7 +20,10 @@ def main(): print("a:", a, hex(a.ctypes.data)) print("b:", b, hex(b.ctypes.data)) - c = f1(a,b) + + with dpctl.device_context("opencl:gpu:0"): + c = f1(a, b) + print("RESULT c:", c, hex(c.ctypes.data)) for i in range(N): if c[i] != 2.0: diff --git a/numba_dppy/examples/pairwise_distance.py b/numba_dppy/examples/pairwise_distance.py index cc5c232c92..b72c41ba9c 100644 --- a/numba_dppy/examples/pairwise_distance.py +++ b/numba_dppy/examples/pairwise_distance.py @@ -6,7 +6,7 @@ import argparse import timeit -import numba_dppy, numba_dppy as dppl +import numba_dppy, numba_dppy as dppy import dpctl import dpctl._memory as dpctl_mem @@ -28,9 +28,9 @@ D = np.empty((args.n, args.n)) -@dppl.kernel +@dppy.kernel def pairwise_distance(X, D, xshape0, xshape1): - idx = dppl.get_global_id(0) + idx = dppy.get_global_id(0) #for i in range(xshape0): for j in range(X.shape[0]): diff --git a/numba_dppy/examples/sum-hybrid.py b/numba_dppy/examples/sum-hybrid.py index 418976f53a..e66c51ae2c 100644 --- a/numba_dppy/examples/sum-hybrid.py +++ b/numba_dppy/examples/sum-hybrid.py @@ -4,13 +4,13 @@ import sys import numpy as np -import numba_dppy, numba_dppy as dppl +import numba_dppy, numba_dppy as dppy import dpctl -@dppl.kernel +@dppy.kernel def data_parallel_sum(a, b, c): - i = dppl.get_global_id(0) + i = dppy.get_global_id(0) c[i] = a[i] + b[i] @@ -27,7 +27,7 @@ def main(): c = np.ones_like(a) print("before A: ", a) print("before B: ", b) - data_parallel_sum[global_size, dppl.DEFAULT_LOCAL_SIZE](a, b, c) + data_parallel_sum[global_size, dppy.DEFAULT_LOCAL_SIZE](a, b, c) print("after C: ", c) else: print("CPU device not found") @@ -40,7 +40,7 @@ def main(): c = np.ones_like(a) print("before A: ", a) print("before B: ", b) - data_parallel_sum[global_size, dppl.DEFAULT_LOCAL_SIZE](a, b, c) + data_parallel_sum[global_size, dppy.DEFAULT_LOCAL_SIZE](a, b, c) print("after C: ", c) else: print("GPU device not found") diff --git a/numba_dppy/examples/sum.py b/numba_dppy/examples/sum.py index f97b8243cb..fdc1623fa7 100644 --- a/numba_dppy/examples/sum.py +++ b/numba_dppy/examples/sum.py @@ -4,13 +4,13 @@ import sys import numpy as np -import numba_dppy, numba_dppy as dppl +import numba_dppy, numba_dppy as dppy import dpctl -@dppl.kernel +@dppy.kernel def data_parallel_sum(a, b, c): - i = dppl.get_global_id(0) + i = dppy.get_global_id(0) c[i] = a[i] + b[i] @@ -18,7 +18,7 @@ def driver(a, b, c, global_size): print("before : ", a) print("before : ", b) print("before : ", c) - data_parallel_sum[global_size, dppl.DEFAULT_LOCAL_SIZE](a, b, c) + data_parallel_sum[global_size, dppy.DEFAULT_LOCAL_SIZE](a, b, c) print("after : ", c) diff --git a/numba_dppy/examples/sum2D.py b/numba_dppy/examples/sum2D.py index 00be613d2b..90959c8bdf 100644 --- a/numba_dppy/examples/sum2D.py +++ b/numba_dppy/examples/sum2D.py @@ -4,21 +4,21 @@ import sys import numpy as np -import numba_dppy, numba_dppy as dppl +import numba_dppy, numba_dppy as dppy import dpctl -@dppl.kernel +@dppy.kernel def data_parallel_sum(a, b, c): - i = dppl.get_global_id(0) - j = dppl.get_global_id(1) + i = dppy.get_global_id(0) + j = dppy.get_global_id(1) c[i,j] = a[i,j] + b[i,j] def driver(a, b, c, global_size): print("before A: ", a) print("before B: ", b) - data_parallel_sum[global_size, dppl.DEFAULT_LOCAL_SIZE](a, b, 
c) + data_parallel_sum[global_size, dppy.DEFAULT_LOCAL_SIZE](a, b, c) print("after C : ", c) diff --git a/numba_dppy/examples/sum_ndarray.py b/numba_dppy/examples/sum_ndarray.py index 6486be0275..2aea8e080a 100644 --- a/numba_dppy/examples/sum_ndarray.py +++ b/numba_dppy/examples/sum_ndarray.py @@ -4,13 +4,13 @@ import sys import numpy as np -import numba_dppy, numba_dppy as dppl +import numba_dppy, numba_dppy as dppy import dpctl -@dppl.kernel(access_types={"read_only": ['a', 'b'], "write_only": ['c'], "read_write": []}) +@dppy.kernel(access_types={"read_only": ['a', 'b'], "write_only": ['c'], "read_write": []}) def data_parallel_sum(a, b, c): - i = dppl.get_global_id(0) + i = dppy.get_global_id(0) c[i] = a[i] + b[i] diff --git a/numba_dppy/examples/sum_reduction.py b/numba_dppy/examples/sum_reduction.py index 3e00f95631..367fa37952 100644 --- a/numba_dppy/examples/sum_reduction.py +++ b/numba_dppy/examples/sum_reduction.py @@ -4,13 +4,13 @@ import math import time -import numba_dppy, numba_dppy as dppl +import numba_dppy, numba_dppy as dppy import dpctl -@dppl.kernel +@dppy.kernel def reduction_kernel(A, R, stride): - i = dppl.get_global_id(0) + i = dppy.get_global_id(0) # sum two elements R[i] = A[i] + A[i+stride] # store the sum to be used in the next iteration @@ -34,7 +34,7 @@ def test_sum_reduction(): while (total > 1): # call kernel global_size = total // 2 - reduction_kernel[global_size, dppl.DEFAULT_LOCAL_SIZE](A, R, global_size) + reduction_kernel[global_size, dppy.DEFAULT_LOCAL_SIZE](A, R, global_size) total = total // 2 else: diff --git a/numba_dppy/examples/sum_reduction_ocl.py b/numba_dppy/examples/sum_reduction_ocl.py index e2605a7bbc..8d8e0411aa 100644 --- a/numba_dppy/examples/sum_reduction_ocl.py +++ b/numba_dppy/examples/sum_reduction_ocl.py @@ -1,20 +1,20 @@ import sys import numpy as np from numba import int32 -import numba_dppy, numba_dppy as dppl +import numba_dppy, numba_dppy as dppy import math import dpctl def sum_reduction_device_plus_host(): - @dppl.kernel + @dppy.kernel def sum_reduction_kernel(inp, partial_sums): - local_id = dppl.get_local_id(0) - global_id = dppl.get_global_id(0) - group_size = dppl.get_local_size(0) - group_id = dppl.get_group_id(0) + local_id = dppy.get_local_id(0) + global_id = dppy.get_global_id(0) + group_size = dppy.get_local_size(0) + group_id = dppy.get_group_id(0) - local_sums = dppl.local.static_alloc(64, int32) + local_sums = dppy.local.static_alloc(64, int32) # Copy from global to local memory local_sums[local_id] = inp[global_id] @@ -23,7 +23,7 @@ def sum_reduction_kernel(inp, partial_sums): stride = group_size // 2 while (stride > 0): # Waiting for each 2x2 addition into given workgroup - dppl.barrier(dppl.CLK_LOCAL_MEM_FENCE) + dppy.barrier(dppy.CLK_LOCAL_MEM_FENCE) # Add elements 2 by 2 between local_id and local_id + stride if (local_id < stride): diff --git a/numba_dppy/examples/sum_reduction_recursive_ocl.py b/numba_dppy/examples/sum_reduction_recursive_ocl.py index 11f5023a3b..c5dd6daa47 100644 --- a/numba_dppy/examples/sum_reduction_recursive_ocl.py +++ b/numba_dppy/examples/sum_reduction_recursive_ocl.py @@ -1,7 +1,7 @@ import sys import numpy as np from numba import int32 -import numba_dppy, numba_dppy as dppl +import numba_dppy, numba_dppy as dppy import math import dpctl @@ -11,15 +11,15 @@ def recursive_reduction(size, group_size, Dinp, Dpartial_sums): - @dppl.kernel + @dppy.kernel def sum_reduction_kernel(inp, input_size, partial_sums): - local_id = dppl.get_local_id(0) - global_id = dppl.get_global_id(0) - group_size =
dppl.get_local_size(0) - group_id = dppl.get_group_id(0) + local_id = dppy.get_local_id(0) + global_id = dppy.get_global_id(0) + group_size = dppy.get_local_size(0) + group_id = dppy.get_group_id(0) - local_sums = dppl.local.static_alloc(64, int32) + local_sums = dppy.local.static_alloc(64, int32) local_sums[local_id] = 0 @@ -30,7 +30,7 @@ def sum_reduction_kernel(inp, input_size, stride = group_size // 2 while (stride > 0): # Waiting for each 2x2 addition into given workgroup - dppl.barrier(dppl.CLK_LOCAL_MEM_FENCE) + dppy.barrier(dppy.CLK_LOCAL_MEM_FENCE) # Add elements 2 by 2 between local_id and local_id + stride if (local_id < stride): diff --git a/numba_dppy/experimental_numpy_lowering_overload.py b/numba_dppy/experimental_numpy_lowering_overload.py deleted file mode 100644 index 2123e6667d..0000000000 --- a/numba_dppy/experimental_numpy_lowering_overload.py +++ /dev/null @@ -1,723 +0,0 @@ -import numpy as np -from numba.core import types, cgutils -from numba.core.imputils import (lower_builtin) -from numba.core.typing import signature -from numba.np.arrayobj import make_array, _empty_nd_impl, array_copy -from numba.core import itanium_mangler -from llvmlite import ir -import llvmlite.llvmpy.core as lc -import contextlib - -from numba import int32, int64, uint32, uint64, float32, float64 - - -@contextlib.contextmanager -def make_contiguous(context, builder, sig, args): - """ - Ensure that all array arguments are contiguous, if necessary by - copying them. - A new (sig, args) tuple is yielded. - """ - newtys = [] - newargs = [] - copies = [] - for ty, val in zip(sig.args, args): - if not isinstance(ty, types.Array) or ty.layout in 'CF': - newty, newval = ty, val - else: - newty = ty.copy(layout='C') - copysig = signature(newty, ty) - newval = array_copy(context, builder, copysig, (val,)) - copies.append((newty, newval)) - newtys.append(newty) - newargs.append(newval) - yield signature(sig.return_type, *newtys), tuple(newargs) - for ty, val in copies: - context.nrt.decref(builder, ty, val) - -def check_c_int(context, builder, n): - """ - Check whether *n* fits in a C `int`. 
- """ - _maxint = 2**31 - 1 - - def impl(n): - if n > _maxint: - raise OverflowError("array size too large to fit in C int") - - context.compile_internal(builder, impl, - signature(types.none, types.intp), (n,)) - - -ll_char = ir.IntType(8) -ll_char_p = ll_char.as_pointer() -ll_void = ir.VoidType() -ll_void_p = ll_char_p -ll_intc = ir.IntType(32) -ll_intc_p = ll_intc.as_pointer() -intp_t = cgutils.intp_t -ll_intp_t = ir.IntType(64) -ll_intp_p = intp_t.as_pointer() - - -def ensure_dpnp(name): - try: - # import dpnp - from .dpnp_glue import dpnp_fptr_interface as dpnp_glue - except ImportError: - raise ImportError("dpNP is needed to call np.%s" % name) - -def get_total_size_of_array(context, builder, aty, ary): - total_size = cgutils.alloca_once(builder, ll_intp_t) - builder.store(builder.sext(builder.mul(ary.nitems, - context.get_constant(types.intp, context.get_abi_sizeof(context.get_value_type(aty)))), ll_intp_t), total_size) - return builder.load(total_size) - -def get_sycl_queue(context, builder): - void_ptr_t = context.get_value_type(types.voidptr) - get_queue_fnty = lc.Type.function(void_ptr_t, ()) - get_queue = builder.module.get_or_insert_function(get_queue_fnty, - name="DPPLQueueMgr_GetCurrentQueue") - sycl_queue_val = cgutils.alloca_once(builder, void_ptr_t) - builder.store(builder.call(get_queue, []), sycl_queue_val) - - return sycl_queue_val - -def allocate_usm(context, builder, size, sycl_queue): - void_ptr_t = context.get_value_type(types.voidptr) - usm_shared_fnty = lc.Type.function(void_ptr_t, [ll_intp_t, void_ptr_t]) - usm_shared = builder.module.get_or_insert_function(usm_shared_fnty, - name="DPPLmalloc_shared") - - buffer_ptr = cgutils.alloca_once(builder, void_ptr_t) - args = [size, builder.load(sycl_queue)] - builder.store(builder.call(usm_shared, args), buffer_ptr) - - return builder.load(buffer_ptr) - -def copy_usm(context, builder, src, dst, size, sycl_queue): - void_ptr_t = context.get_value_type(types.voidptr) - queue_memcpy_fnty = lc.Type.function(ir.VoidType(), [void_ptr_t, void_ptr_t, void_ptr_t, - ll_intp_t]) - queue_memcpy = builder.module.get_or_insert_function(queue_memcpy_fnty, - name="DPPLQueue_Memcpy") - args = [builder.load(sycl_queue), - builder.bitcast(dst, void_ptr_t), - builder.bitcast(src, void_ptr_t), - size] - builder.call(queue_memcpy, args) - - -def free_usm(context, builder, usm_buf, sycl_queue): - void_ptr_t = context.get_value_type(types.voidptr) - - usm_free_fnty = lc.Type.function(ir.VoidType(), [void_ptr_t, void_ptr_t]) - usm_free = builder.module.get_or_insert_function(usm_free_fnty, - name="DPPLfree_with_queue") - - builder.call(usm_free, [usm_buf, builder.load(sycl_queue)]) - - -def call_dpnp(context, builder, fn_name, type_names, params, param_tys, ret_ty): - from .dpnp_glue import dpnp_fptr_interface as dpnp_glue - f_ptr = dpnp_glue.get_dpnp_fn_ptr(fn_name, type_names) - - fnty = ir.FunctionType(ret_ty, param_tys) - addr_constant = context.get_constant(int64, f_ptr) - fn_ptr = builder.inttoptr(addr_constant, fnty.as_pointer()) - - res = builder.call(fn_ptr, params) - - -def dot_2_vv(context, builder, sig, args, conjugate=False): - """ - np.dot(vector, vector) - np.vdot(vector, vector) - """ - def check_args(a, b): - m, = a.shape - n, = b.shape - if m != n: - print("SIZES ", m, n) - raise ValueError("incompatible array sizes for np.dot(a, b) " - "(vector * vector)") - - context.compile_internal(builder, check_args, - signature(types.none, *sig.args), args) - - - sycl_queue = get_sycl_queue(context, builder) - aty, bty = sig.args - a = 
make_array(aty)(context, builder, args[0]) - b = make_array(bty)(context, builder, args[1]) - size, = cgutils.unpack_tuple(builder, a.shape) - - check_c_int(context, builder, size) - - total_size_a = get_total_size_of_array(context, builder, aty.dtype, a) - a_usm = allocate_usm(context, builder, total_size_a, sycl_queue) - copy_usm(context, builder, a.data, a_usm, total_size_a, sycl_queue) - - total_size_b = get_total_size_of_array(context, builder, bty.dtype, b) - b_usm = allocate_usm(context, builder, total_size_b, sycl_queue) - copy_usm(context, builder, b.data, b_usm, total_size_b, sycl_queue) - - out = cgutils.alloca_once(builder, context.get_value_type(sig.return_type)) - builder.store(context.get_constant(sig.return_type, 0), out) - out_usm = allocate_usm(context, builder, - context.get_constant(types.intp, context.get_abi_sizeof(context.get_value_type(sig.return_type))), sycl_queue) - - # arguments are : a->void*, b->void*, result->void*, size->int64 - param_tys = [ll_void_p, ll_void_p, ll_void_p, ir.IntType(64)] - params = (a_usm, b_usm, out_usm, size) - - type_names = [] - type_names.append(aty.dtype.name) - type_names.append(sig.return_type.name) - - call_dpnp(context, builder, "dpnp_dot", type_names, params, param_tys, ll_void) - - copy_usm(context, builder, out_usm, out, - context.get_constant(types.intp, context.get_abi_sizeof(context.get_value_type(sig.return_type))), sycl_queue) - - free_usm(context, builder, a_usm, sycl_queue) - free_usm(context, builder, out_usm, sycl_queue) - - return builder.load(out) - - -def dot_2_mv(context, builder, sig, args): - """ - np.dot(matrix, matrix) - """ - def make_res(a, b): - m, n = a.shape - _n, = b.shape - if _n != n: - raise ValueError("incompatible array sizes for np.dot(a, b)") - return np.empty((m, ), a.dtype) - - sycl_queue = get_sycl_queue(context, builder) - - aty, bty = sig.args - a = make_array(aty)(context, builder, args[0]) - b = make_array(bty)(context, builder, args[1]) - m, k = cgutils.unpack_tuple(builder, a.shape) - _n, = cgutils.unpack_tuple(builder, b.shape) - n = context.get_constant(types.int64, 1) - - total_size_a = get_total_size_of_array(context, builder, aty.dtype, a) - a_usm = allocate_usm(context, builder, total_size_a, sycl_queue) - copy_usm(context, builder, a.data, a_usm, total_size_a, sycl_queue) - - total_size_b = get_total_size_of_array(context, builder, bty.dtype, b) - b_usm = allocate_usm(context, builder, total_size_b, sycl_queue) - copy_usm(context, builder, b.data, b_usm, total_size_b, sycl_queue) - - out = context.compile_internal(builder, make_res, - signature(sig.return_type, *sig.args), args) - - outary = make_array(sig.return_type)(context, builder, out) - - total_size_out = get_total_size_of_array(context, builder, sig.return_type.dtype, outary) - out_usm = allocate_usm(context, builder, total_size_out, sycl_queue) - - # arguments are : a->void*, b->void*, result->void*, m->int64, n->int64, k->int64 - param_tys = [ll_void_p, ll_void_p, ll_void_p, ir.IntType(64), ir.IntType(64), ir.IntType(64)] - params = (a_usm, b_usm, out_usm, m, n, k) - - type_names = [] - type_names.append(aty.dtype.name) - type_names.append(sig.return_type.name) - - call_dpnp(context, builder, "dpnp_matmul", type_names, params, param_tys, ll_void) - - copy_usm(context, builder, out_usm, outary.data, total_size_out, sycl_queue) - - free_usm(context, builder, a_usm, sycl_queue) - free_usm(context, builder, b_usm, sycl_queue) - free_usm(context, builder, out_usm, sycl_queue) - return out - - -def dot_2_vm(context, builder, 
sig, args): - """ - np.dot(vector, matrix) - """ - def make_res(a, b): - m, = a.shape - _m, n = b.shape - if m != _m: - raise ValueError("incompatible array sizes for np.dot(a, b)") - return np.empty((n, ), a.dtype) - - sycl_queue = get_sycl_queue(context, builder) - aty, bty = sig.args - a = make_array(aty)(context, builder, args[0]) - b = make_array(bty)(context, builder, args[1]) - m, = cgutils.unpack_tuple(builder, a.shape) - k, n = cgutils.unpack_tuple(builder, b.shape) - - total_size_a = get_total_size_of_array(context, builder, aty.dtype, a) - a_usm = allocate_usm(context, builder, total_size_a, sycl_queue) - copy_usm(context, builder, a.data, a_usm, total_size_a, sycl_queue) - - total_size_b = get_total_size_of_array(context, builder, bty.dtype, b) - b_usm = allocate_usm(context, builder, total_size_b, sycl_queue) - copy_usm(context, builder, b.data, b_usm, total_size_b, sycl_queue) - - out = context.compile_internal(builder, make_res, - signature(sig.return_type, *sig.args), args) - - outary = make_array(sig.return_type)(context, builder, out) - - total_size_out = get_total_size_of_array(context, builder, sig.return_type.dtype, outary) - out_usm = allocate_usm(context, builder, total_size_out, sycl_queue) - - - # arguments are : a->void*, b->void*, result->void*, m->int64, n->int64, k->int64 - param_tys = [ll_void_p, ll_void_p, ll_void_p, ir.IntType(64), ir.IntType(64), ir.IntType(64)] - params = (a_usm, b_usm, out_usm, m, n, k) - - type_names = [] - type_names.append(aty.dtype.name) - type_names.append(sig.return_type.name) - - call_dpnp(context, builder, "dpnp_matmul", type_names, params, param_tys, ll_void) - - copy_usm(context, builder, out_usm, outary.data, total_size_out, sycl_queue) - - free_usm(context, builder, a_usm, sycl_queue) - free_usm(context, builder, b_usm, sycl_queue) - free_usm(context, builder, out_usm, sycl_queue) - return out - - -def dot_2_mm(context, builder, sig, args): - """ - np.dot(matrix, matrix), np.matmul(matrix, matrix) - """ - def make_res(a, b): - m, k = a.shape - _k, n = b.shape - if _k != k: - raise ValueError("incompatible array sizes for np.dot(a, b)") - return np.empty((m, n), a.dtype) - - sycl_queue = get_sycl_queue(context, builder) - aty, bty = sig.args - a = make_array(aty)(context, builder, args[0]) - b = make_array(bty)(context, builder, args[1]) - m, k = cgutils.unpack_tuple(builder, a.shape) - _k, n = cgutils.unpack_tuple(builder, b.shape) - - total_size_a = get_total_size_of_array(context, builder, aty.dtype, a) - a_usm = allocate_usm(context, builder, total_size_a, sycl_queue) - copy_usm(context, builder, a.data, a_usm, total_size_a, sycl_queue) - - total_size_b = get_total_size_of_array(context, builder, bty.dtype, b) - b_usm = allocate_usm(context, builder, total_size_b, sycl_queue) - copy_usm(context, builder, b.data, b_usm, total_size_b, sycl_queue) - - - out = context.compile_internal(builder, make_res, - signature(sig.return_type, *sig.args), args) - - outary = make_array(sig.return_type)(context, builder, out) - total_size_out = get_total_size_of_array(context, builder, sig.return_type.dtype, outary) - out_usm = allocate_usm(context, builder, total_size_out, sycl_queue) - - - # arguments are : a->void*, b->void*, result->void*, m->int64, n->int64, k->int64 - param_tys = [ll_void_p, ll_void_p, ll_void_p, ir.IntType(64), ir.IntType(64), ir.IntType(64)] - params = (a_usm, - b_usm, - out_usm, - m, n, k) - - type_names = [] - type_names.append(aty.dtype.name) - type_names.append(sig.return_type.name) - - call_dpnp(context, 
builder, "dpnp_matmul", type_names, params, param_tys, ll_void) - - copy_usm(context, builder, out_usm, outary.data, total_size_out, sycl_queue) - - free_usm(context, builder, a_usm, sycl_queue) - free_usm(context, builder, b_usm, sycl_queue) - free_usm(context, builder, out_usm, sycl_queue) - return out - - -@lower_builtin(np.dot, types.Array, types.Array) -def dot_dppl(context, builder, sig, args): - """ - np.dot(a, b) - a @ b - """ - - ensure_dpnp("dot") - - with make_contiguous(context, builder, sig, args) as (sig, args): - ndims = [x.ndim for x in sig.args[:2]] - if ndims == [2, 2]: - return dot_2_mm(context, builder, sig, args) - elif ndims == [2, 1]: - return dot_2_mv(context, builder, sig, args) - elif ndims == [1, 2]: - return dot_2_vm(context, builder, sig, args) - elif ndims == [1, 1]: - return dot_2_vv(context, builder, sig, args) - else: - assert 0 - raise ImportError("scipy 0.16+ is required for linear algebra") - - -@lower_builtin("np.matmul", types.Array, types.Array) -def matmul_dppl(context, builder, sig, args): - """ - np.matmul(matrix, matrix) - """ - ensure_dpnp("matmul") - with make_contiguous(context, builder, sig, args) as (sig, args): - ndims = [x.ndim for x in sig.args[:2]] - if ndims != [2, 2]: - raise ValueError("array dimension has to be 2 for np.matmul(a, b)") - - return dot_2_mm(context, builder, sig, args) - - -def common_sum_prod_impl(context, builder, sig, args, fn_type): - def array_size_checker(arry): - if arry.size == 0: - raise ValueError("Passed Empty array") - - context.compile_internal(builder, array_size_checker, - signature(types.none, *sig.args), args) - - sycl_queue = get_sycl_queue(context, builder) - - aty = sig.args[0] - a = make_array(aty)(context, builder, args[0]) - size = a.nitems - - total_size_a = get_total_size_of_array(context, builder, aty.dtype, a) - a_usm = allocate_usm(context, builder, total_size_a, sycl_queue) - copy_usm(context, builder, a.data, a_usm, total_size_a, sycl_queue) - - out = cgutils.alloca_once(builder, context.get_value_type(sig.return_type)) - builder.store(context.get_constant(sig.return_type, 0), out) - out_usm = allocate_usm(context, builder, - context.get_constant(types.intp, context.get_abi_sizeof(context.get_value_type(aty.dtype))), sycl_queue) - - # arguments are : a ->void*, result->void*, size->int64 - param_tys = [ll_void_p, ll_void_p, ir.IntType(64)] - params = (a_usm, out_usm, size) - - type_names = [] - type_names.append(aty.dtype.name) - type_names.append("NONE") - - call_dpnp(context, builder, fn_type, type_names, params, param_tys, ll_void) - - copy_usm(context, builder, out_usm, out, - context.get_constant(types.intp, context.get_abi_sizeof(context.get_value_type(aty.dtype))), sycl_queue) - - free_usm(context, builder, a_usm, sycl_queue) - free_usm(context, builder, out_usm, sycl_queue) - - return builder.load(out) - - - -@lower_builtin(np.sum, types.Array) -def array_sum(context, builder, sig, args): - ensure_dpnp("sum") - return common_sum_prod_impl(context, builder, sig, args, "dpnp_sum") - - -@lower_builtin(np.prod, types.Array) -def array_prod(context, builder, sig, args): - ensure_dpnp("prod") - - return common_sum_prod_impl(context, builder, sig, args, "dpnp_prod") - - -def common_max_min_impl(context, builder, sig, args, fn_type): - def array_size_checker(arry): - if arry.size == 0: - raise ValueError("Passed Empty array") - - context.compile_internal(builder, array_size_checker, - signature(types.none, *sig.args), args) - - sycl_queue = get_sycl_queue(context, builder) - - aty = 
sig.args[0] - a = make_array(aty)(context, builder, args[0]) - a_shape = builder.gep(args[0].operands[0], [context.get_constant(types.int32, 0), context.get_constant(types.int32, 5)]) - a_ndim = context.get_constant(types.intp, aty.ndim) - - total_size_a = get_total_size_of_array(context, builder, aty.dtype, a) - a_usm = allocate_usm(context, builder, total_size_a, sycl_queue) - copy_usm(context, builder, a.data, a_usm, total_size_a, sycl_queue) - - out = cgutils.alloca_once(builder, context.get_value_type(sig.return_type)) - builder.store(context.get_constant(sig.return_type, 0), out) - out_usm = allocate_usm(context, builder, - context.get_constant(types.intp, context.get_abi_sizeof(context.get_value_type(sig.return_type))), sycl_queue) - - # arguments are : a ->void*, result->void* - param_tys = [ll_void_p, ll_void_p, ll_intp_p, ir.IntType(64), ll_intp_p, ir.IntType(64)] - params = (a_usm, out_usm, builder.bitcast(a_shape, ll_intp_p), a_ndim, - builder.bitcast(a_shape, ll_intp_p), a_ndim) - - type_names = [] - type_names.append(aty.dtype.name) - if fn_type == "dpnp_mean": - type_names.append(aty.dtype.name) - else: - type_names.append(sig.return_type.name) - - call_dpnp(context, builder, fn_type, type_names, params, param_tys, ll_void) - - copy_usm(context, builder, out_usm, out, - context.get_constant(types.intp, context.get_abi_sizeof(context.get_value_type(sig.return_type))), sycl_queue) - - free_usm(context, builder, a_usm, sycl_queue) - free_usm(context, builder, out_usm, sycl_queue) - - return builder.load(out) - - -@lower_builtin(np.max, types.Array) -@lower_builtin("array.max", types.Array) -def array_max(context, builder, sig, args): - ensure_dpnp("max") - - return common_max_min_impl(context, builder, sig, args, "dpnp_max") - -@lower_builtin(np.min, types.Array) -@lower_builtin("array.min", types.Array) -def array_min(context, builder, sig, args): - ensure_dpnp("min") - - return common_max_min_impl(context, builder, sig, args, "dpnp_min") - -@lower_builtin(np.mean, types.Array) -@lower_builtin("array.mean", types.Array) -def array_mean(context, builder, sig, args): - ensure_dpnp("mean") - - return common_max_min_impl(context, builder, sig, args, "dpnp_mean") - -@lower_builtin(np.median, types.Array) -def array_median(context, builder, sig, args): - ensure_dpnp("median") - - return common_max_min_impl(context, builder, sig, args, "dpnp_median") - - -def common_argmax_argmin_impl(context, builder, sig, args, fn_type): - def array_size_checker(arry): - if arry.size == 0: - raise ValueError("Passed Empty array") - - context.compile_internal(builder, array_size_checker, - signature(types.none, *sig.args), args) - - sycl_queue = get_sycl_queue(context, builder) - - aty = sig.args[0] - a = make_array(aty)(context, builder, args[0]) - size = a.nitems - - total_size_a = get_total_size_of_array(context, builder, aty.dtype, a) - a_usm = allocate_usm(context, builder, total_size_a, sycl_queue) - copy_usm(context, builder, a.data, a_usm, total_size_a, sycl_queue) - - out = cgutils.alloca_once(builder, context.get_value_type(sig.return_type)) - builder.store(context.get_constant(sig.return_type, 0), out) - out_usm = allocate_usm(context, builder, - context.get_constant(types.intp, context.get_abi_sizeof(context.get_value_type(sig.return_type))), sycl_queue) - - # arguments are : a ->void*, result->void*, size->int64 - param_tys = [ll_void_p, ll_void_p, ir.IntType(64)] - params = (a_usm, out_usm, size) - - type_names = [] - type_names.append(aty.dtype.name) - 
type_names.append(sig.return_type.name) - - call_dpnp(context, builder, fn_type, type_names, params, param_tys, ll_void) - - copy_usm(context, builder, out_usm, out, - context.get_constant(types.intp, context.get_abi_sizeof(context.get_value_type(sig.return_type))), sycl_queue) - - free_usm(context, builder, a_usm, sycl_queue) - free_usm(context, builder, out_usm, sycl_queue) - - return builder.load(out) - - - -@lower_builtin(np.argmax, types.Array) -def array_argmax(context, builder, sig, args): - ensure_dpnp("argmax") - - return common_argmax_argmin_impl(context, builder, sig, args, "dpnp_argmax") - - -@lower_builtin(np.argmin, types.Array) -def array_argmin(context, builder, sig, args): - ensure_dpnp("argmin") - - return common_argmax_argmin_impl(context, builder, sig, args, "dpnp_argmin") - - -@lower_builtin(np.argsort, types.Array, types.StringLiteral) -def array_argsort(context, builder, sig, args): - ensure_dpnp("argsort") - - def make_res(A): - return np.arange(A.size) - - def array_dim_checker(arry): - if arry.ndim > 1: - raise ValueError("Argsort is only supported for 1D array") - - context.compile_internal(builder, array_dim_checker, - signature(types.none, *sig.args[:1]), args[:1]) - - sycl_queue = get_sycl_queue(context, builder) - aty = sig.args[0] - a = make_array(aty)(context, builder, args[0]) - size, = cgutils.unpack_tuple(builder, a.shape) - - total_size_a = get_total_size_of_array(context, builder, aty.dtype, a) - a_usm = allocate_usm(context, builder, total_size_a, sycl_queue) - copy_usm(context, builder, a.data, a_usm, total_size_a, sycl_queue) - - out = context.compile_internal(builder, make_res, - signature(sig.return_type, *sig.args[:1]), args[:1]) - outary = make_array(sig.return_type)(context, builder, out) - - total_size_out = get_total_size_of_array(context, builder, sig.return_type.dtype, outary) - out_usm = allocate_usm(context, builder, total_size_out, sycl_queue) - - - # arguments are : a ->void*, result->void*, size->int64 - param_tys = [ll_void_p, ll_void_p, ir.IntType(64)] - params = (a_usm, out_usm, size) - - type_names = [] - for argty in sig.args[:1]: - type_names.append(argty.dtype.name) - type_names.append(sig.return_type.name) - - call_dpnp(context, builder, "dpnp_argsort", type_names, params, param_tys, ll_void) - - copy_usm(context, builder, out_usm, outary.data, total_size_out, sycl_queue) - - free_usm(context, builder, a_usm, sycl_queue) - free_usm(context, builder, out_usm, sycl_queue) - - return out - - -@lower_builtin(np.cov, types.Array) -def array_cov(context, builder, sig, args): - ensure_dpnp("cov") - def make_1D_res(size): - return np.empty(1, dtype=np.float64) - - def make_2D_res(size): - return np.empty((size, size), dtype=np.float64) - - sycl_queue = get_sycl_queue(context, builder) - aty = sig.args[0] - aty = sig.args[0] - a = make_array(aty)(context, builder, args[0]) - - total_size_a = get_total_size_of_array(context, builder, aty.dtype, a) - a_usm = allocate_usm(context, builder, total_size_a, sycl_queue) - copy_usm(context, builder, a.data, a_usm, total_size_a, sycl_queue) - - if aty.ndim == 2: - m, n = cgutils.unpack_tuple(builder, a.shape) - out = context.compile_internal(builder, make_2D_res, - signature(sig.return_type, types.int64), (m,)) - elif aty.ndim == 1: - m, = cgutils.unpack_tuple(builder, a.shape) - out = context.compile_internal(builder, make_1D_res, - signature(sig.return_type, types.int64), (m,)) - else: - #TODO: Throw error, cov is supported for only 1D and 2D array - pass - - outary = 
make_array(sig.return_type)(context, builder, out) - - total_size_out = get_total_size_of_array(context, builder, sig.return_type.dtype, outary) - out_usm = allocate_usm(context, builder, total_size_out, sycl_queue) - - nrows = cgutils.alloca_once(builder, context.get_value_type(types.int64)) - ncols = cgutils.alloca_once(builder, context.get_value_type(types.int64)) - - if aty.ndim == 2: - builder.store(m, nrows) - builder.store(n, ncols) - - elif aty.ndim == 1: - builder.store(context.get_constant(types.int64, 1), nrows) - builder.store(m, ncols) - - - # arguments are : a ->void*, result->void*, nrows->int64, ncols->int64 - param_tys = [ll_void_p, ll_void_p, ir.IntType(64), ir.IntType(64)] - params = (a_usm, out_usm, builder.load(nrows), builder.load(ncols)) - - type_names = [] - type_names.append(aty.dtype.name) - type_names.append("NONE") - - - call_dpnp(context, builder, "dpnp_cov", type_names, params, param_tys, ll_void) - - copy_usm(context, builder, out_usm, outary.data, total_size_out, sycl_queue) - - free_usm(context, builder, a_usm, sycl_queue) - free_usm(context, builder, out_usm, sycl_queue) - - return out - - -''' -@lower_builtin(np.linalg.eig, types.Array) -def array_cov(context, builder, sig, args): - pass - -@lower_builtin("np.random.sample") -def random_impl(context, builder, sig, args): - - def make_res(shape): - return np.empty(shape, dtype=np.float64) - - import pdb - pdb.set_trace() - out = context.compile_internal(builder, make_res, - signature(sig.return_type, *sig.args), args) - - outary = make_array(sig.return_type)(context, builder, out) - - # arguments are : result->void*, size->int64 - param_tys = [ll_void_p, ll_intp_p] - params = (builder.bitcast(outary.data, ll_void_p), ) - - - type_names = [] - for argty in sig.args[:1]: - type_names.append(argty.dtype.name.encode('utf-8')) - type_names.append(sig.return_type.name.encode('utf-8')) - - call_dpnp(context, builder, b"dpnp_cov", type_names, params, param_tys, ll_void) -''' diff --git a/numba_dppy/initialize.py b/numba_dppy/initialize.py index c8ba56220a..2a2c70f796 100644 --- a/numba_dppy/initialize.py +++ b/numba_dppy/initialize.py @@ -5,8 +5,8 @@ def init_jit(): - from numba_dppy.dispatcher import DPPLDispatcher - return DPPLDispatcher + from numba_dppy.dispatcher import DPPYDispatcher + return DPPYDispatcher def initialize_all(): from numba.core.registry import dispatcher_registry @@ -17,12 +17,12 @@ def initialize_all(): import platform as plt platform = plt.system() if platform == 'Windows': - paths = glob.glob(os.path.join(os.path.dirname(dpctl.__file__), '*DPPLSyclInterface.dll')) + paths = glob.glob(os.path.join(os.path.dirname(dpctl.__file__), '*DPCTLSyclInterface.dll')) else: - paths = glob.glob(os.path.join(os.path.dirname(dpctl.__file__), '*DPPLSyclInterface*')) + paths = glob.glob(os.path.join(os.path.dirname(dpctl.__file__), '*DPCTLSyclInterface*')) if len(paths) == 1: - ll.load_library_permanently(find_library(paths[0])) + ll.load_library_permanently(paths[0]) else: raise ImportError diff --git a/numba_dppy/ocl/atomics/atomic_ops.cl b/numba_dppy/ocl/atomics/atomic_ops.cl index ad581716de..56228d8bf5 100644 --- a/numba_dppy/ocl/atomics/atomic_ops.cl +++ b/numba_dppy/ocl/atomics/atomic_ops.cl @@ -5,7 +5,7 @@ #ifdef cl_khr_int64_base_atomics #pragma OPENCL EXTENSION cl_khr_int64_base_atomics: enable - long numba_dppl_atomic_add_i64_local(volatile __generic long *p, long val) { + long numba_dppy_atomic_add_i64_local(volatile __generic long *p, long val) { long found = *p; long expected; do { @@ -15,7 
+15,7 @@ return found; } - long numba_dppl_atomic_add_i64_global(volatile __generic long *p, long val) { + long numba_dppy_atomic_add_i64_global(volatile __generic long *p, long val) { long found = *p; long expected; do { @@ -25,7 +25,7 @@ return found; } - long numba_dppl_atomic_sub_i64_local(volatile __generic long *p, long val) { + long numba_dppy_atomic_sub_i64_local(volatile __generic long *p, long val) { long found = *p; long expected; do { @@ -35,7 +35,7 @@ return found; } - long numba_dppl_atomic_sub_i64_global(volatile __generic long *p, long val) { + long numba_dppy_atomic_sub_i64_global(volatile __generic long *p, long val) { long found = *p; long expected; do { @@ -48,7 +48,7 @@ #ifdef cl_khr_fp64 #pragma OPENCL EXTENSION cl_khr_fp64: enable - double numba_dppl_atomic_cmpxchg_f64_local(volatile __generic double *p, double cmp, double val) { + double numba_dppy_atomic_cmpxchg_f64_local(volatile __generic double *p, double cmp, double val) { union { ulong u64; double f64; @@ -60,7 +60,7 @@ return old_union.f64; } - double numba_dppl_atomic_cmpxchg_f64_global(volatile __generic double *p, double cmp, double val) { + double numba_dppy_atomic_cmpxchg_f64_global(volatile __generic double *p, double cmp, double val) { union { ulong u64; double f64; @@ -72,50 +72,50 @@ return old_union.f64; } - double numba_dppl_atomic_add_f64_local(volatile __generic double *p, double val) { + double numba_dppy_atomic_add_f64_local(volatile __generic double *p, double val) { double found = *p; double expected; do { expected = found; - found = numba_dppl_atomic_cmpxchg_f64_local(p, expected, expected + val); + found = numba_dppy_atomic_cmpxchg_f64_local(p, expected, expected + val); } while (found != expected); return found; } - double numba_dppl_atomic_add_f64_global(volatile __generic double *p, double val) { + double numba_dppy_atomic_add_f64_global(volatile __generic double *p, double val) { double found = *p; double expected; do { expected = found; - found = numba_dppl_atomic_cmpxchg_f64_global(p, expected, expected + val); + found = numba_dppy_atomic_cmpxchg_f64_global(p, expected, expected + val); } while (found != expected); return found; } - double numba_dppl_atomic_sub_f64_local(volatile __generic double *p, double val) { + double numba_dppy_atomic_sub_f64_local(volatile __generic double *p, double val) { double found = *p; double expected; do { expected = found; - found = numba_dppl_atomic_cmpxchg_f64_local(p, expected, expected - val); + found = numba_dppy_atomic_cmpxchg_f64_local(p, expected, expected - val); } while (found != expected); return found; } - double numba_dppl_atomic_sub_f64_global(volatile __generic double *p, double val) { + double numba_dppy_atomic_sub_f64_global(volatile __generic double *p, double val) { double found = *p; double expected; do { expected = found; - found = numba_dppl_atomic_cmpxchg_f64_global(p, expected, expected - val); + found = numba_dppy_atomic_cmpxchg_f64_global(p, expected, expected - val); } while (found != expected); return found; } #endif #endif -float numba_dppl_atomic_cmpxchg_f32_local(volatile __generic float *p, float cmp, float val) { +float numba_dppy_atomic_cmpxchg_f32_local(volatile __generic float *p, float cmp, float val) { union { unsigned int u32; float f32; @@ -127,7 +127,7 @@ float numba_dppl_atomic_cmpxchg_f32_local(volatile __generic float *p, float cmp return old_union.f32; } -float numba_dppl_atomic_cmpxchg_f32_global(volatile __generic float *p, float cmp, float val) { +float numba_dppy_atomic_cmpxchg_f32_global(volatile 
__generic float *p, float cmp, float val) { union { unsigned int u32; float f32; @@ -139,47 +139,47 @@ float numba_dppl_atomic_cmpxchg_f32_global(volatile __generic float *p, float cm return old_union.f32; } -float numba_dppl_atomic_add_f32_local(volatile __generic float *p, float val) { +float numba_dppy_atomic_add_f32_local(volatile __generic float *p, float val) { float found = *p; float expected; do { expected = found; - found = numba_dppl_atomic_cmpxchg_f32_local(p, expected, expected + val); + found = numba_dppy_atomic_cmpxchg_f32_local(p, expected, expected + val); } while (found != expected); return found; } -float numba_dppl_atomic_add_f32_global(volatile __generic float *p, float val) { +float numba_dppy_atomic_add_f32_global(volatile __generic float *p, float val) { float found = *p; float expected; do { expected = found; - found = numba_dppl_atomic_cmpxchg_f32_global(p, expected, expected + val); + found = numba_dppy_atomic_cmpxchg_f32_global(p, expected, expected + val); } while (found != expected); return found; } -float numba_dppl_atomic_sub_f32_local(volatile __generic float *p, float val) { +float numba_dppy_atomic_sub_f32_local(volatile __generic float *p, float val) { float found = *p; float expected; do { expected = found; - found = numba_dppl_atomic_cmpxchg_f32_local(p, expected, expected - val); + found = numba_dppy_atomic_cmpxchg_f32_local(p, expected, expected - val); } while (found != expected); return found; } -float numba_dppl_atomic_sub_f32_global(volatile __generic float *p, float val) { +float numba_dppy_atomic_sub_f32_global(volatile __generic float *p, float val) { float found = *p; float expected; do { expected = found; - found = numba_dppl_atomic_cmpxchg_f32_global(p, expected, expected - val); + found = numba_dppy_atomic_cmpxchg_f32_global(p, expected, expected - val); } while (found != expected); return found; } -int numba_dppl_atomic_add_i32_local(volatile __generic int *p, int val) { +int numba_dppy_atomic_add_i32_local(volatile __generic int *p, int val) { int found = *p; int expected; do { @@ -189,7 +189,7 @@ int numba_dppl_atomic_add_i32_local(volatile __generic int *p, int val) { return found; } -int numba_dppl_atomic_add_i32_global(volatile __generic int *p, int val) { +int numba_dppy_atomic_add_i32_global(volatile __generic int *p, int val) { int found = *p; int expected; do { @@ -199,7 +199,7 @@ int numba_dppl_atomic_add_i32_global(volatile __generic int *p, int val) { return found; } -int numba_dppl_atomic_sub_i32_local(volatile __generic int *p, int val) { +int numba_dppy_atomic_sub_i32_local(volatile __generic int *p, int val) { int found = *p; int expected; do { @@ -209,7 +209,7 @@ int numba_dppl_atomic_sub_i32_local(volatile __generic int *p, int val) { return found; } -int numba_dppl_atomic_sub_i32_global(volatile __generic int *p, int val) { +int numba_dppy_atomic_sub_i32_global(volatile __generic int *p, int val) { int found = *p; int expected; do { diff --git a/numba_dppy/ocl/ocldecl.py b/numba_dppy/ocl/ocldecl.py index 1af90a6884..adf14a1815 100644 --- a/numba_dppy/ocl/ocldecl.py +++ b/numba_dppy/ocl/ocldecl.py @@ -4,7 +4,7 @@ from numba.core.typing.templates import (AttributeTemplate, ConcreteTemplate, AbstractTemplate, MacroTemplate, signature, Registry) -import numba_dppy, numba_dppy as dppl +import numba_dppy, numba_dppy as dppy registry = Registry() intrinsic = registry.register @@ -15,71 +15,71 @@ @intrinsic class Ocl_get_global_id(ConcreteTemplate): - key = dppl.get_global_id + key = dppy.get_global_id cases = 
[signature(types.intp, types.uint32)] @intrinsic class Ocl_get_local_id(ConcreteTemplate): - key = dppl.get_local_id + key = dppy.get_local_id cases = [signature(types.intp, types.uint32)] @intrinsic class Ocl_get_group_id(ConcreteTemplate): - key = dppl.get_group_id + key = dppy.get_group_id cases = [signature(types.intp, types.uint32)] @intrinsic class Ocl_get_num_groups(ConcreteTemplate): - key = dppl.get_num_groups + key = dppy.get_num_groups cases = [signature(types.intp, types.uint32)] @intrinsic class Ocl_get_work_dim(ConcreteTemplate): - key = dppl.get_work_dim + key = dppy.get_work_dim cases = [signature(types.uint32)] @intrinsic class Ocl_get_global_size(ConcreteTemplate): - key = dppl.get_global_size + key = dppy.get_global_size cases = [signature(types.intp, types.uint32)] @intrinsic class Ocl_get_local_size(ConcreteTemplate): - key = dppl.get_local_size + key = dppy.get_local_size cases = [signature(types.intp, types.uint32)] @intrinsic class Ocl_barrier(ConcreteTemplate): - key = dppl.barrier + key = dppy.barrier cases = [signature(types.void, types.uint32), signature(types.void)] @intrinsic class Ocl_mem_fence(ConcreteTemplate): - key = dppl.mem_fence + key = dppy.mem_fence cases = [signature(types.void, types.uint32)] @intrinsic class Ocl_sub_group_barrier(ConcreteTemplate): - key = dppl.sub_group_barrier + key = dppy.sub_group_barrier cases = [signature(types.void)] -# dppl.atomic submodule ------------------------------------------------------- +# dppy.atomic submodule ------------------------------------------------------- @intrinsic class Ocl_atomic_add(AbstractTemplate): - key = dppl.atomic.add + key = dppy.atomic.add def generic(self, args, kws): assert not kws @@ -92,7 +92,7 @@ def generic(self, args, kws): @intrinsic class Ocl_atomic_sub(AbstractTemplate): - key = dppl.atomic.sub + key = dppy.atomic.sub def generic(self, args, kws): assert not kws @@ -106,7 +106,7 @@ def generic(self, args, kws): @intrinsic_attr class OclAtomicTemplate(AttributeTemplate): - key = types.Module(dppl.atomic) + key = types.Module(dppy.atomic) def resolve_add(self, mod): return types.Function(Ocl_atomic_add) @@ -115,15 +115,15 @@ def resolve_sub(self, mod): return types.Function(Ocl_atomic_sub) -# dppl.local submodule ------------------------------------------------------- +# dppy.local submodule ------------------------------------------------------- class Ocl_local_alloc(MacroTemplate): - key = dppl.local.static_alloc + key = dppy.local.static_alloc @intrinsic_attr class OclLocalTemplate(AttributeTemplate): - key = types.Module(dppl.local) + key = types.Module(dppy.local) def resolve_static_alloc(self, mod): return types.Macro(Ocl_local_alloc) @@ -133,7 +133,7 @@ def resolve_static_alloc(self, mod): @intrinsic_attr class OclModuleTemplate(AttributeTemplate): - key = types.Module(dppl) + key = types.Module(dppy) def resolve_get_global_id(self, mod): return types.Function(Ocl_get_global_id) @@ -166,11 +166,11 @@ def resolve_sub_group_barrier(self, mod): return types.Function(Ocl_sub_group_barrier) def resolve_atomic(self, mod): - return types.Module(dppl.atomic) + return types.Module(dppy.atomic) def resolve_local(self, mod): - return types.Module(dppl.local) + return types.Module(dppy.local) # intrinsic -#intrinsic_global(dppl, types.Module(dppl)) +#intrinsic_global(dppy, types.Module(dppy)) diff --git a/numba_dppy/ocl/oclimpl.py b/numba_dppy/ocl/oclimpl.py index b92dca7bae..26f8482799 100644 --- a/numba_dppy/ocl/oclimpl.py +++ b/numba_dppy/ocl/oclimpl.py @@ -169,9 +169,9 @@ def 
insert_and_call_atomic_fn(context, builder, sig, fn_type, ll_val = ir.IntType(32) ll_p = ll_val.as_pointer() if fn_type == "add": - name = "numba_dppl_atomic_add_i32" + name = "numba_dppy_atomic_add_i32" elif fn_type == "sub": - name = "numba_dppl_atomic_sub_i32" + name = "numba_dppy_atomic_sub_i32" else: raise TypeError("Operation type is not supported %s" % (fn_type)) @@ -182,9 +182,9 @@ def insert_and_call_atomic_fn(context, builder, sig, fn_type, ll_val = ir.IntType(64) ll_p = ll_val.as_pointer() if fn_type == "add": - name = "numba_dppl_atomic_add_i64" + name = "numba_dppy_atomic_add_i64" elif fn_type == "sub": - name = "numba_dppl_atomic_sub_i64" + name = "numba_dppy_atomic_sub_i64" else: raise TypeError("Operation type is not supported %s" % (fn_type)) @@ -195,9 +195,9 @@ def insert_and_call_atomic_fn(context, builder, sig, fn_type, ll_val = ir.FloatType() ll_p = ll_val.as_pointer() if fn_type == "add": - name = "numba_dppl_atomic_add_f32" + name = "numba_dppy_atomic_add_f32" elif fn_type == "sub": - name = "numba_dppl_atomic_sub_f32" + name = "numba_dppy_atomic_sub_f32" else: raise TypeError("Operation type is not supported %s" % (fn_type)) @@ -208,9 +208,9 @@ def insert_and_call_atomic_fn(context, builder, sig, fn_type, ll_val = ir.DoubleType() ll_p = ll_val.as_pointer() if fn_type == "add": - name = "numba_dppl_atomic_add_f64" + name = "numba_dppy_atomic_add_f64" elif fn_type == "sub": - name = "numba_dppl_atomic_sub_f64" + name = "numba_dppy_atomic_sub_f64" else: raise TypeError("Operation type is not supported %s" % (fn_type)) @@ -331,11 +331,11 @@ def atomic_sub_tuple(context, builder, sig, args): raise ImportError("Atomic support is not present, can not perform atomic_add") -@lower('dppl.lmem.alloc', types.UniTuple, types.Any) -def dppl_lmem_alloc_array(context, builder, sig, args): +@lower('dppy.lmem.alloc', types.UniTuple, types.Any) +def dppy_lmem_alloc_array(context, builder, sig, args): shape, dtype = args return _generic_array(context, builder, shape=shape, dtype=dtype, - symbol_name='_dppl_lmem', + symbol_name='_dppy_lmem', addrspace=target.SPIR_LOCAL_ADDRSPACE) diff --git a/numba_dppy/ocl/stubs.py b/numba_dppy/ocl/stubs.py index 2ec95fa9c8..190b685955 100644 --- a/numba_dppy/ocl/stubs.py +++ b/numba_dppy/ocl/stubs.py @@ -83,9 +83,9 @@ def sub_group_barrier(): class Stub(object): """A stub object to represent special objects which is meaningless - outside the context of DPPL compilation context. + outside the context of DPPY compilation context. """ - _description_ = '' + _description_ = '' __slots__ = () # don't allocate __dict__ def __new__(cls): @@ -100,7 +100,7 @@ def __repr__(self): def local_alloc(shape, dtype): shape = _legalize_shape(shape) ndim = len(shape) - fname = "dppl.lmem.alloc" + fname = "dppy.lmem.alloc" restype = types.Array(dtype, ndim, 'C', addrspace=SPIR_LOCAL_ADDRSPACE) sig = typing.signature(restype, types.UniTuple(types.intp, ndim), types.Any) return ir.Intrinsic(fname, sig, args=(shape, dtype)) diff --git a/numba_dppy/parfor_loop_invariant_hoisting.py.bkp b/numba_dppy/parfor_loop_invariant_hoisting.py.bkp deleted file mode 100644 index fb37a1c97b..0000000000 --- a/numba_dppy/parfor_loop_invariant_hoisting.py.bkp +++ /dev/null @@ -1,213 +0,0 @@ -from __future__ import print_function, division, absolute_import - -def add_to_def_once_sets(a_def, def_once, def_more): - '''If the variable is already defined more than once, do nothing. 
- Else if defined exactly once previously then transition this - variable to the defined more than once set (remove it from - def_once set and add to def_more set). - Else this must be the first time we've seen this variable defined - so add to def_once set. - ''' - if a_def in def_more: - pass - elif a_def in def_once: - def_more.add(a_def) - def_once.remove(a_def) - else: - def_once.add(a_def) - -def compute_def_once_block(block, def_once, def_more, getattr_taken, typemap, module_assigns): - '''Effect changes to the set of variables defined once or more than once - for a single block. - block - the block to process - def_once - set of variable names known to be defined exactly once - def_more - set of variable names known to be defined more than once - getattr_taken - dict mapping variable name to tuple of object and attribute taken - module_assigns - dict mapping variable name to the Global that they came from - ''' - # The only "defs" occur in assignments, so find such instructions. - assignments = block.find_insts(ir.Assign) - # For each assignment... - for one_assign in assignments: - # Get the LHS/target of the assignment. - a_def = one_assign.target.name - # Add variable to def sets. - add_to_def_once_sets(a_def, def_once, def_more) - - rhs = one_assign.value - if isinstance(rhs, ir.Global): - # Remember assignments of the form "a = Global(...)" - # Is this a module? - if isinstance(rhs.value, pytypes.ModuleType): - module_assigns[a_def] = rhs.value.__name__ - if isinstance(rhs, ir.Expr) and rhs.op == 'getattr' and rhs.value.name in def_once: - # Remember assignments of the form "a = b.c" - getattr_taken[a_def] = (rhs.value.name, rhs.attr) - if isinstance(rhs, ir.Expr) and rhs.op == 'call' and rhs.func.name in getattr_taken: - # If "a" is being called then lookup the getattr definition of "a" - # as above, getting the module variable "b" (base_obj) - # and the attribute "c" (base_attr). - base_obj, base_attr = getattr_taken[rhs.func.name] - if base_obj in module_assigns: - # If we know the definition of the module variable then get the module - # name from module_assigns. - base_mod_name = module_assigns[base_obj] - if not is_const_call(base_mod_name, base_attr): - # Calling a method on an object could modify the object and is thus - # like a def of that object. We call is_const_call to see if this module/attribute - # combination is known to not modify the module state. If we don't know that - # the combination is safe then we have to assume there could be a modification to - # the module and thus add the module variable as defined more than once. - add_to_def_once_sets(base_obj, def_once, def_more) - else: - # Assume the worst and say that base_obj could be modified by the call. - add_to_def_once_sets(base_obj, def_once, def_more) - if isinstance(rhs, ir.Expr) and rhs.op == 'call': - # If a mutable object is passed to a function, then it may be changed and - # therefore can't be hoisted. - # For each argument to the function... - for argvar in rhs.args: - # Get the argument's type. - if isinstance(argvar, ir.Var): - argvar = argvar.name - avtype = typemap[argvar] - # If that type doesn't have a mutable attribute or it does and it's set to - # not mutable then this usage is safe for hoisting. - if getattr(avtype, 'mutable', False): - # Here we have a mutable variable passed to a function so add this variable - # to the def lists. 
- add_to_def_once_sets(argvar, def_once, def_more) - -def compute_def_once_internal(loop_body, def_once, def_more, getattr_taken, typemap, module_assigns): - '''Compute the set of variables defined exactly once in the given set of blocks - and use the given sets for storing which variables are defined once, more than - once and which have had a getattr call on them. - ''' - # For each block... - for label, block in loop_body.items(): - # Scan this block and effect changes to def_once, def_more, and getattr_taken - # based on the instructions in that block. - compute_def_once_block(block, def_once, def_more, getattr_taken, typemap, module_assigns) - # Have to recursively process parfors manually here. - for inst in block.body: - if isinstance(inst, parfor.Parfor): - # Recursively compute for the parfor's init block. - compute_def_once_block(inst.init_block, def_once, def_more, getattr_taken, typemap, module_assigns) - # Recursively compute for the parfor's loop body. - compute_def_once_internal(inst.loop_body, def_once, def_more, getattr_taken, typemap, module_assigns) - -def compute_def_once(loop_body, typemap): - '''Compute the set of variables defined exactly once in the given set of blocks. - ''' - def_once = set() # set to hold variables defined exactly once - def_more = set() # set to hold variables defined more than once - getattr_taken = {} - module_assigns = {} - compute_def_once_internal(loop_body, def_once, def_more, getattr_taken, typemap, module_assigns) - return def_once - -def find_vars(var, varset): - assert isinstance(var, ir.Var) - varset.add(var.name) - return var - -def _hoist_internal(inst, dep_on_param, call_table, hoisted, not_hoisted, - typemap, stored_arrays): - if inst.target.name in stored_arrays: - not_hoisted.append((inst, "stored array")) - if config.DEBUG_ARRAY_OPT >= 1: - print("Instruction", inst, " could not be hoisted because the created array is stored.") - return False - - uses = set() - visit_vars_inner(inst.value, find_vars, uses) - diff = uses.difference(dep_on_param) - if config.DEBUG_ARRAY_OPT >= 1: - print("_hoist_internal:", inst, "uses:", uses, "diff:", diff) - if len(diff) == 0 and is_pure(inst.value, None, call_table): - if config.DEBUG_ARRAY_OPT >= 1: - print("Will hoist instruction", inst, typemap[inst.target.name]) - hoisted.append(inst) - if not isinstance(typemap[inst.target.name], types.npytypes.Array): - dep_on_param += [inst.target.name] - return True - else: - if len(diff) > 0: - not_hoisted.append((inst, "dependency")) - if config.DEBUG_ARRAY_OPT >= 1: - print("Instruction", inst, " could not be hoisted because of a dependency.") - else: - not_hoisted.append((inst, "not pure")) - if config.DEBUG_ARRAY_OPT >= 1: - print("Instruction", inst, " could not be hoisted because it isn't pure.") - return False - -def find_setitems_block(setitems, itemsset, block, typemap): - for inst in block.body: - if isinstance(inst, ir.StaticSetItem) or isinstance(inst, ir.SetItem): - setitems.add(inst.target.name) - # If we store a non-mutable object into an array then that is safe to hoist. - # If the stored object is mutable and you hoist then multiple entries in the - # outer array could reference the same object and changing one index would then - # change other indices. 
- if getattr(typemap[inst.value.name], "mutable", False): - itemsset.add(inst.value.name) - elif isinstance(inst, parfor.Parfor): - find_setitems_block(setitems, itemsset, inst.init_block, typemap) - find_setitems_body(setitems, itemsset, inst.loop_body, typemap) - -def find_setitems_body(setitems, itemsset, loop_body, typemap): - """ - Find the arrays that are written into (goes into setitems) and the - mutable objects (mostly arrays) that are written into other arrays - (goes into itemsset). - """ - for label, block in loop_body.items(): - find_setitems_block(setitems, itemsset, block, typemap) - -def hoist(parfor_params, loop_body, typemap, wrapped_blocks): - dep_on_param = copy.copy(parfor_params) - hoisted = [] - not_hoisted = [] - - # Compute the set of variable defined exactly once in the loop body. - def_once = compute_def_once(loop_body, typemap) - (call_table, reverse_call_table) = get_call_table(wrapped_blocks) - - setitems = set() - itemsset = set() - find_setitems_body(setitems, itemsset, loop_body, typemap) - dep_on_param = list(set(dep_on_param).difference(setitems)) - if config.DEBUG_ARRAY_OPT >= 1: - print("hoist - def_once:", def_once, "setitems:", - setitems, "itemsset:", itemsset, "dep_on_param:", - dep_on_param, "parfor_params:", parfor_params) - - for label, block in loop_body.items(): - new_block = [] - for inst in block.body: - if isinstance(inst, ir.Assign) and inst.target.name in def_once: - if _hoist_internal(inst, dep_on_param, call_table, - hoisted, not_hoisted, typemap, itemsset): - # don't add this instruction to the block since it is - # hoisted - continue - elif isinstance(inst, parfor.Parfor): - new_init_block = [] - if config.DEBUG_ARRAY_OPT >= 1: - print("parfor") - inst.dump() - for ib_inst in inst.init_block.body: - if (isinstance(ib_inst, ir.Assign) and - ib_inst.target.name in def_once): - if _hoist_internal(ib_inst, dep_on_param, call_table, - hoisted, not_hoisted, typemap, itemsset): - # don't add this instuction to the block since it is hoisted - continue - new_init_block.append(ib_inst) - inst.init_block.body = new_init_block - - new_block.append(inst) - block.body = new_block - return hoisted, not_hoisted - diff --git a/numba_dppy/printimpl.py b/numba_dppy/printimpl.py index 74319b1bdd..e5c9d4f793 100644 --- a/numba_dppy/printimpl.py +++ b/numba_dppy/printimpl.py @@ -79,8 +79,8 @@ def print_varargs(context, builder, sig, args): va_arg.extend(values) va_arg = tuple(va_arg) - dppl_print = declare_print(builder.module) + dppy_print = declare_print(builder.module) - builder.call(dppl_print, va_arg) + builder.call(dppy_print, va_arg) return context.get_dummy_value() diff --git a/numba_dppy/rename_numpy_functions_pass.py b/numba_dppy/rename_numpy_functions_pass.py new file mode 100644 index 0000000000..c1d58ce036 --- /dev/null +++ b/numba_dppy/rename_numpy_functions_pass.py @@ -0,0 +1,249 @@ +from numba.core import ir +from numba.core.compiler_machinery import FunctionPass, register_pass +from numba.core.ir_utils import ( + find_topo_order, + mk_unique_var, + remove_dead, + simplify_CFG, +) +import numba_dppy +from numba.core import types + +rewrite_function_name_map = {"sum": (["np"], "sum"), + "eig": (["linalg"], "eig"), + "prod": (["np"], "prod"), + "max": (["np"], "max"), + "amax": (["np"], "amax"), + "min": (["np"], "min"), + "amin": (["np"], "amin"), + "mean": (["np"], "mean"), + "median": (["np"], "median"), + "argmax": (["np"], "argmax"), + "argmin": (["np"], "argmin"), + "argsort": (["np"], "argsort"), + "cov": (["np"], "cov"), + "dot": 
(["np"], "dot"), + "matmul": (["np"], "matmul")} + + +class RewriteNumPyOverloadedFunctions(object): + def __init__(self, state, rewrite_function_name_map=rewrite_function_name_map): + self.state = state + self.function_name_map = rewrite_function_name_map + + def run(self): + """ + This function rewrites the name of NumPy functions that exist in self.function_name_map + e.g np.sum(a) would produce the following: + + np.sum() --> numba_dppy.dpnp.sum() + + --------------------------------------------------------------------------------------- + Numba IR Before Rewrite: + --------------------------------------------------------------------------------------- + + $2load_global.0 = global(np: ) ['$2load_global.0'] + $4load_method.1 = getattr(value=$2load_global.0, attr=sum) ['$2load_global.0', '$4load_method.1'] + $8call_method.3 = call $4load_method.1(a, func=$4load_method.1, args=[Var(a, test_rewrite.py:7)], + kws=(), vararg=None) ['$4load_method.1', '$8call_method.3', 'a'] + + --------------------------------------------------------------------------------------- + Numba IR After Rewrite: + --------------------------------------------------------------------------------------- + + $dppy_replaced_var.0 = global(numba_dppy: ) ['$dppy_replaced_var.0'] + $dpnp_var.1 = getattr(value=$dppy_replaced_var.0, attr=dpnp) ['$dpnp_var.1', '$dppy_replaced_var.0'] + $4load_method.1 = getattr(value=$dpnp_var.1, attr=sum) ['$4load_method.1', '$dpnp_var.1'] + $8call_method.3 = call $4load_method.1(a, func=$4load_method.1, args=[Var(a, test_rewrite.py:7)], + kws=(), vararg=None) ['$4load_method.1', '$8call_method.3', 'a'] + + --------------------------------------------------------------------------------------- + """ + func_ir = self.state.func_ir + blocks = func_ir.blocks + topo_order = find_topo_order(blocks) + + for label in topo_order: + block = blocks[label] + saved_arr_arg = {} + new_body = [] + for stmt in block.body: + if isinstance(stmt, ir.Assign) and isinstance(stmt.value, ir.Expr): + lhs = stmt.target.name + rhs = stmt.value + # replace np.FOO with name from self.function_name_map["FOO"] + # e.g. np.sum will be replaced with numba_dppy.dpnp.sum + if rhs.op == "getattr" and rhs.attr in self.function_name_map: + module_node = block.find_variable_assignment( + rhs.value.name + ).value + if ( + isinstance(module_node, ir.Global) + and module_node.name in self.function_name_map[rhs.attr][0] + ) or ( + isinstance(module_node, ir.Expr) + and module_node.attr in self.function_name_map[rhs.attr][0] + ): + rhs = stmt.value + rhs.attr = self.function_name_map[rhs.attr][1] + + global_module = rhs.value + saved_arr_arg[lhs] = global_module + + scope = global_module.scope + loc = global_module.loc + + g_dppy_var = ir.Var( + scope, mk_unique_var("$2load_global"), loc + ) + # We are trying to rename np.function_name/np.linalg.function_name with + # numba_dppy.dpnp.function_name. + # Hence, we need to have a global variable representing module numba_dppy. + # Next, we add attribute dpnp to global module numba_dppy to + # represent numba_dppy.dpnp. 
+                            g_dppy = ir.Global("numba_dppy", numba_dppy, loc)
+                            g_dppy_assign = ir.Assign(g_dppy, g_dppy_var, loc)
+
+                            dpnp_var = ir.Var(scope, mk_unique_var("$4load_attr"), loc)
+                            getattr_dpnp = ir.Expr.getattr(g_dppy_var, "dpnp", loc)
+                            dpnp_assign = ir.Assign(getattr_dpnp, dpnp_var, loc)
+
+                            rhs.value = dpnp_var
+                            new_body.append(g_dppy_assign)
+                            new_body.append(dpnp_assign)
+                            func_ir._definitions[dpnp_var.name] = [getattr_dpnp]
+                            func_ir._definitions[g_dppy_var.name] = [g_dppy]
+
+                new_body.append(stmt)
+            block.body = new_body
+
+
+@register_pass(mutates_CFG=True, analysis_only=False)
+class DPPYRewriteOverloadedNumPyFunctions(FunctionPass):
+    _name = "dppy_rewrite_overloaded_functions_pass"
+
+    def __init__(self):
+        FunctionPass.__init__(self)
+
+        import numba_dppy.dpnp_glue.dpnpdecl
+        import numba_dppy.dpnp_glue.dpnpimpl
+        import numba_dppy.dpnp_glue.dpnp_linalgimpl
+        import numba_dppy.dpnp_glue.dpnp_transcendentalsimpl
+        import numba_dppy.dpnp_glue.dpnp_statisticsimpl
+        import numba_dppy.dpnp_glue.dpnp_sort_search_countimpl
+
+    def run_pass(self, state):
+        rewrite_function_name_pass = RewriteNumPyOverloadedFunctions(
+            state, rewrite_function_name_map
+        )
+
+        rewrite_function_name_pass.run()
+
+        remove_dead(state.func_ir.blocks, state.func_ir.arg_names, state.func_ir)
+        state.func_ir.blocks = simplify_CFG(state.func_ir.blocks)
+
+        return True
+
+
+def get_dpnp_func_typ(func):
+    from numba.core.typing.templates import builtin_registry
+    for (k, v) in builtin_registry.globals:
+        if k == func:
+            return v
+    raise RuntimeError("type for func ", func, " not found")
+
+
+class RewriteNdarrayFunctions(object):
+    def __init__(self, state, rewrite_function_name_map=rewrite_function_name_map):
+        self.state = state
+        self.function_name_map = rewrite_function_name_map
+        self.typemap = state.type_annotation.typemap
+        self.calltypes = state.type_annotation.calltypes
+
+    def run(self):
+        typingctx = self.state.typingctx
+
+        # save array arg to call
+        # call_varname -> array
+        func_ir = self.state.func_ir
+        blocks = func_ir.blocks
+        saved_arr_arg = {}
+        topo_order = find_topo_order(blocks)
+
+        for label in topo_order:
+            block = blocks[label]
+            new_body = []
+            for stmt in block.body:
+                if isinstance(stmt, ir.Assign) and isinstance(stmt.value, ir.Expr):
+                    lhs = stmt.target.name
+                    rhs = stmt.value
+                    # replace A.func with np.func, and save A in saved_arr_arg
+                    if (rhs.op == 'getattr' and rhs.attr in self.function_name_map
+                            and isinstance(
+                                self.typemap[rhs.value.name], types.npytypes.Array)):
+                        rhs = stmt.value
+                        arr = rhs.value
+                        saved_arr_arg[lhs] = arr
+                        scope = arr.scope
+                        loc = arr.loc
+
+                        g_dppy_var = ir.Var(scope, mk_unique_var("$load_global"), loc)
+                        self.typemap[g_dppy_var.name] = types.misc.Module(numba_dppy)
+                        g_dppy = ir.Global("numba_dppy", numba_dppy, loc)
+                        g_dppy_assign = ir.Assign(g_dppy, g_dppy_var, loc)
+
+                        dpnp_var = ir.Var(scope, mk_unique_var("$load_attr"), loc)
+                        self.typemap[dpnp_var.name] = types.misc.Module(numba_dppy.dpnp)
+                        getattr_dpnp = ir.Expr.getattr(g_dppy_var, "dpnp", loc)
+                        dpnp_assign = ir.Assign(getattr_dpnp, dpnp_var, loc)
+
+                        rhs.value = dpnp_var
+                        new_body.append(g_dppy_assign)
+                        new_body.append(dpnp_assign)
+
+                        func_ir._definitions[g_dppy_var.name] = [g_dppy]
+                        func_ir._definitions[dpnp_var.name] = [getattr_dpnp]
+
+                        # update func var type
+                        func = getattr(numba_dppy.dpnp, rhs.attr)
+                        func_typ = get_dpnp_func_typ(func)
+
+                        self.typemap.pop(lhs)
+                        self.typemap[lhs] = func_typ
+
+                    if rhs.op == 'call' and rhs.func.name in saved_arr_arg:
+                        # add array as first arg
+
arr = saved_arr_arg[rhs.func.name] + # update call type signature to include array arg + old_sig = self.calltypes.pop(rhs) + # argsort requires kws for typing so sig.args can't be used + # reusing sig.args since some types become Const in sig + argtyps = old_sig.args[:len(rhs.args)] + kwtyps = {name: self.typemap[v.name] for name, v in rhs.kws} + self.calltypes[rhs] = self.typemap[rhs.func.name].get_call_type( + typingctx, [self.typemap[arr.name]] + list(argtyps), kwtyps) + rhs.args = [arr] + rhs.args + + new_body.append(stmt) + block.body = new_body + return + + +@register_pass(mutates_CFG=True, analysis_only=False) +class DPPYRewriteNdarrayFunctions(FunctionPass): + _name = "dppy_rewrite_ndarray_functions_pass" + + def __init__(self): + FunctionPass.__init__(self) + + def run_pass(self, state): + rewrite_ndarray_function_name_pass = RewriteNdarrayFunctions( + state, rewrite_function_name_map + ) + + rewrite_ndarray_function_name_pass.run() + + remove_dead(state.func_ir.blocks, state.func_ir.arg_names, state.func_ir) + state.func_ir.blocks = simplify_CFG(state.func_ir.blocks) + + return True diff --git a/numba_dppy/spirv_generator.py b/numba_dppy/spirv_generator.py index cee4672ded..5bac98e014 100644 --- a/numba_dppy/spirv_generator.py +++ b/numba_dppy/spirv_generator.py @@ -7,6 +7,7 @@ import tempfile from numba import config +from numba_dppy import config as dppy_config from numba_dppy.target import LINK_ATOMIC @@ -61,7 +62,7 @@ def generate(self, ipath, opath): # b) hoist all allocas to the enty block of the module check_call(["opt","-O1","-o",ipath+'.bc',ipath]) check_call(["llvm-spirv","-o",opath,ipath+'.bc']) - if config.SAVE_DPPL_IR_FILES == 0: + if dppy_config.SAVE_IR_FILES == 0: os.unlink(ipath + '.bc') def link(self, opath, binaries): @@ -84,12 +85,12 @@ def __init__(self, context): def __del__(self): # Remove all temporary files for afile in self._tempfiles: - if config.SAVE_DPPL_IR_FILES != 0: + if dppy_config.SAVE_IR_FILES != 0: print(afile) else: os.unlink(afile) # Remove directory - if config.SAVE_DPPL_IR_FILES == 0: + if dppy_config.SAVE_IR_FILES == 0: os.rmdir(self._tmpdir) def _create_temp_file(self, name, mode='wb'): @@ -136,7 +137,7 @@ def finalize(self): self._cmd.link(spirv_path, binary_paths) # Validate the SPIR-V code - if config.SPIRV_VAL == 1: + if dppy_config.SPIRV_VAL == 1: try: self._cmd.validate(ipath=spirv_path) except CalledProcessError: diff --git a/numba_dppy/target.py b/numba_dppy/target.py index aac4efcd4b..147b229e77 100644 --- a/numba_dppy/target.py +++ b/numba_dppy/target.py @@ -24,7 +24,7 @@ # Typing -class DPPLTypingContext(typing.BaseContext): +class DPPYTypingContext(typing.BaseContext): def load_additional_registries(self): # Declarations for OpenCL API functions and OpenCL Math functions from .ocl import ocldecl, mathdecl @@ -91,7 +91,7 @@ def _replace_numpy_ufunc_with_opencl_supported_functions(): ufunc_db[ufunc][sig] = lower_ocl_impl[(name, sig_mapper[sig])] -class DPPLTargetContext(BaseContext): +class DPPYTargetContext(BaseContext): implement_powi_as_math_call = True generic_addrspace = SPIR_GENERIC_ADDRSPACE @@ -153,7 +153,7 @@ def load_additional_registries(self): @cached_property def call_conv(self): - return DPPLCallConv(self) + return DPPYCallConv(self) def codegen(self): return self._internal_codegen @@ -169,7 +169,7 @@ def repl(m): qualified = name + '.' 
+ '.'.join(str(a) for a in argtypes) mangled = VALID_CHARS.sub(repl, qualified) - return 'dppl_py_devfn_' + mangled + return 'dppy_py_devfn_' + mangled def prepare_ocl_kernel(self, func, argtypes): module = func.module @@ -208,8 +208,8 @@ def sub_gen_with_global(lty): llargtys = changed = () wrapperfnty = lc.Type.function(lc.Type.void(), llargtys) - wrapper_module = self.create_module("dppl.kernel.wrapper") - wrappername = 'dpplPy_{name}'.format(name=func.name) + wrapper_module = self.create_module("dppy.kernel.wrapper") + wrappername = 'dppyPy_{name}'.format(name=func.name) argtys = list(arginfo.argument_types) fnty = lc.Type.function(lc.Type.int(), @@ -239,7 +239,7 @@ def sub_gen_with_global(lty): argtypes, callargs) builder.ret_void() - set_dppl_kernel(wrapper) + set_dppy_kernel(wrapper) #print(str(wrapper_module)) # Link @@ -254,10 +254,14 @@ def sub_gen_with_global(lty): def declare_function(self, module, fndesc): fnty = self.call_conv.get_function_type(fndesc.restype, fndesc.argtypes) fn = module.get_or_insert_function(fnty, name=fndesc.mangled_name) - fn.attributes.add('alwaysinline') - ret = super(DPPLTargetContext, self).declare_function(module, fndesc) + + if not self.enable_debuginfo: + fn.attributes.add('alwaysinline') + + ret = super(DPPYTargetContext, self).declare_function(module, fndesc) + # XXX: Refactor fndesc instead of this special case - if fndesc.llvm_func_name.startswith('dppl_py_devfn'): + if fndesc.llvm_func_name.startswith('dppy_py_devfn'): ret.calling_convention = CC_SPIR_FUNC return ret @@ -305,7 +309,7 @@ def addrspacecast(self, builder, src, addrspace): return builder.addrspacecast(src, ptras) -def set_dppl_kernel(fn): +def set_dppy_kernel(fn): """ Ensure `fn` is usable as a SPIR kernel. - Fix calling convention @@ -332,11 +336,11 @@ def set_dppl_kernel(fn): make_constant = lambda x: lc.Constant.int(lc.Type.int(), x) spir_version_constant = [make_constant(x) for x in SPIR_VERSION] - spir_version = mod.get_or_insert_named_metadata("dppl.spir.version") + spir_version = mod.get_or_insert_named_metadata("dppy.spir.version") if not spir_version.operands: spir_version.add(lc.MetaData.get(mod, spir_version_constant)) - ocl_version = mod.get_or_insert_named_metadata("dppl.ocl.version") + ocl_version = mod.get_or_insert_named_metadata("dppy.ocl.version") if not ocl_version.operands: ocl_version.add(lc.MetaData.get(mod, spir_version_constant)) @@ -414,7 +418,7 @@ def gen_arg_base_type(fn): return lc.MetaData.get(mod, [name] + consts) -class DPPLCallConv(MinimalCallConv): +class DPPYCallConv(MinimalCallConv): def call_function(self, builder, callee, resty, argtys, args, env=None): """ Call the Numba-compiled *callee*. 
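The `target.py` changes above are pure renames (`DPPL*` classes to `DPPY*`, the `dppl_py_devfn_` mangling prefix to `dppy_py_devfn_`, `dppl.kernel.wrapper` to `dppy.kernel.wrapper`) plus one behavioral tweak: `alwaysinline` is no longer forced when debug info is requested. The user-facing kernel API is unchanged apart from the spelling. A minimal sketch of that API, distilled from the renamed tests later in this patch (the kernel name and array sizes are illustrative, and an OpenCL GPU queue is assumed to be available):

```python
import numpy as np
import dpctl
import numba_dppy as dppy


@dppy.kernel
def data_parallel_sum(a, b, c):
    # One work-item per element; dppy.get_global_id replaces dppl.get_global_id.
    i = dppy.get_global_id(0)
    c[i] = a[i] + b[i]


a = np.ones(10, dtype=np.float32)
b = np.ones(10, dtype=np.float32)
c = np.zeros_like(a)

# Kernels are launched inside a dpctl device context; DEFAULT_LOCAL_SIZE
# lets the runtime choose the work-group size.
with dpctl.device_context("opencl:gpu"):
    data_parallel_sum[a.size, dppy.DEFAULT_LOCAL_SIZE](a, b, c)
```

Under the hood, `DPPYTargetContext.mangler` now emits `dppy_py_devfn_`-prefixed names for device functions, which is the prefix the `declare_function` special case above keys on when assigning the SPIR calling convention.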
diff --git a/numba_dppy/target_dispatcher.py b/numba_dppy/target_dispatcher.py index 40b9d589d9..dde38eb75b 100644 --- a/numba_dppy/target_dispatcher.py +++ b/numba_dppy/target_dispatcher.py @@ -8,9 +8,9 @@ class TargetDispatcher(serialize.ReduceMixin, metaclass=dispatcher.DispatcherMeta): __numba__ = 'py_func' - target_offload_gpu = '__dppl_offload_gpu__' - target_offload_cpu = '__dppl_offload_cpu__' - target_dppl = 'dppy' + target_offload_gpu = '__dppy_offload_gpu__' + target_offload_cpu = '__dppy_offload_cpu__' + target_dppy = 'dppy' def __init__(self, py_func, wrapper, target, parallel_options, compiled=None): @@ -53,7 +53,7 @@ def get_compiled(self, target=None): return self.__compiled[disp] def __is_with_context_target(self, target): - return target is None or target == TargetDispatcher.target_dppl + return target is None or target == TargetDispatcher.target_dppy def get_current_disp(self): target = self.__target @@ -66,7 +66,7 @@ def get_current_disp(self): if parallel is False or (isinstance(parallel, dict) and parallel.get('offload') is False): raise UnsupportedError(f"Can't use 'with' context with parallel option '{parallel}'") - from numba_dppy import dppl_offload_dispatcher + from numba_dppy import dppy_offload_dispatcher if target is None: if dpctl.get_current_device_type() == dpctl.device_type.gpu: @@ -75,7 +75,7 @@ def get_current_disp(self): return registry.dispatcher_registry[TargetDispatcher.target_offload_cpu] else: if dpctl.is_in_device_context(): - raise UnsupportedError('Unknown dppl device type') + raise UnsupportedError('Unknown dppy device type') if offload: if dpctl.has_gpu_queues(): return registry.dispatcher_registry[TargetDispatcher.target_offload_gpu] diff --git a/numba_dppy/testing.py b/numba_dppy/testing.py index 11090ebedc..e6ff1e3ab3 100644 --- a/numba_dppy/testing.py +++ b/numba_dppy/testing.py @@ -1,5 +1,3 @@ -from __future__ import print_function, absolute_import, division - import contextlib import sys @@ -7,42 +5,39 @@ import unittest from numba.tests.support import ( captured_stdout, - SerialMixin, redirect_c_stdout, ) -class DPPLTestCase(SerialMixin, unittest.TestCase): - def setUp(self): - #init() - #TODO - pass - def tearDown(self): - #reset() - #TODO - pass - -class DPPLTextCapture(object): - def __init__(self, stream): - self._stream = stream - - def getvalue(self): - return self._stream.read() - -class PythonTextCapture(object): - def __init__(self, stream): - self._stream = stream - - def getvalue(self): - return self._stream.getvalue() @contextlib.contextmanager -def captured_dppl_stdout(): +def captured_dppy_stdout(): """ - Return a minimal stream-like object capturing the text output of dppl + Return a minimal stream-like object capturing the text output of dppy """ # Prevent accidentally capturing previously output text sys.stdout.flush() - import numba_dppy, numba_dppy as dppl + import numba_dppy, numba_dppy as dppy with redirect_c_stdout() as stream: - yield DPPLTextCapture(stream) + yield DPPYTextCapture(stream) + + +def _id(obj): + return obj + + +def expectedFailureIf(condition): + """ + Expected failure for a test if the condition is true. 
+ """ + if condition: + return unittest.expectedFailure + return _id + + +def ensure_dpnp(): + try: + from numba_dppy.dpnp_glue import dpnp_fptr_interface as dpnp_glue + return True + except: + return False diff --git a/numba_dppy/tests/__init__.py b/numba_dppy/tests/__init__.py index c9e582dac3..939c95c567 100644 --- a/numba_dppy/tests/__init__.py +++ b/numba_dppy/tests/__init__.py @@ -2,17 +2,18 @@ from numba.testing import load_testsuite from os.path import dirname, join +import numba_dppy +import numba_dppy.config as dppy_config -import numba.dppl_config as dppl_config +# from numba_dppy.tests.dppy import * def load_tests(loader, tests, pattern): suite = SerialSuite() - this_dir = dirname(__file__) - if dppl_config.dppl_present: - suite.addTests(load_testsuite(loader, join(this_dir, 'dppl'))) + if dppy_config.dppy_present: + suite.addTests(load_testsuite(loader, dirname(__file__))) else: - print("skipped DPPL tests") + print("skipped DPPY tests") return suite diff --git a/numba_dppy/tests/dppl/__init__.py b/numba_dppy/tests/dppl/__init__.py deleted file mode 100644 index cff5a36cc2..0000000000 --- a/numba_dppy/tests/dppl/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -from numba.testing import SerialSuite -from numba.testing import load_testsuite -import os - -def load_tests(loader, tests, pattern): - return SerialSuite(load_testsuite(loader, os.path.dirname(__file__))) diff --git a/numba_dppy/tests/dppl/test_dpnp_functions.py b/numba_dppy/tests/dppl/test_dpnp_functions.py deleted file mode 100644 index bbffb30c3f..0000000000 --- a/numba_dppy/tests/dppl/test_dpnp_functions.py +++ /dev/null @@ -1,218 +0,0 @@ -#! /usr/bin/env python -from __future__ import print_function -from timeit import default_timer as time - -import sys -import numpy as np -from numba import njit -import numba_dppy, numba_dppy as dppl -from numba_dppy.testing import unittest -from numba_dppy.testing import DPPLTestCase - - -def test_for_different_datatypes(fn, test_fn, dims, arg_count, tys, np_all=False, matrix=None): - if arg_count == 1: - for ty in tys: - if matrix and matrix[0]: - a = np.array(np.random.random(dims[0] * dims[1]), dtype=ty).reshape(dims[0], dims[1]) - else: - a = np.array(np.random.random(dims[0]), dtype=ty) - c = fn(a) - d = test_fn(a) - if np_all: - max_abs_err = np.all(c - d) - else: - max_abs_err = c - d - if not (max_abs_err < 1e-4): - return False - - elif arg_count == 2: - for ty in tys: - if matrix and matrix[0]: - a = np.array(np.random.random(dims[0] * dims[1]), dtype=ty).reshape(dims[0], dims[1]) - else: - a = np.array(np.random.random(dims[0] * dims[1]), dtype=ty) - if matrix and matrix[1]: - b = np.array(np.random.random(dims[2] * dims[3]), dtype=ty).reshape(dims[2], dims[3]) - else: - b = np.array(np.random.random(dims[2] * dims[3]), dtype=ty) - - c = fn(a, b) - d = test_fn(a, b) - if np_all: - max_abs_err = np.sum(c - d) - else: - max_abs_err = c - d - if not (max_abs_err < 1e-4): - return False - - return True - -def test_for_dimensions(fn, test_fn, dims, tys, np_all=False): - total_size = 1 - for d in dims: - total_size *= d - - for ty in tys: - a = np.array(np.random.random(total_size), dtype=ty).reshape(dims) - c = fn(a) - d = test_fn(a) - if np_all: - max_abs_err = np.all(c - d) - else: - max_abs_err = c - d - if not (max_abs_err < 1e-4): - return False - - return True - -def ensure_dpnp(): - try: - # import dpnp - from numba_dppy.dpnp_glue import dpnp_fptr_interface as dpnp_glue - return True - except: - return False - - -@unittest.skipUnless(ensure_dpnp(), 'test only when dpNP is 
available') -class Testdpnp_functions(DPPLTestCase): - N = 10 - - a = np.array(np.random.random(N), dtype=np.float32) - b = np.array(np.random.random(N), dtype=np.float32) - tys = [np.int32, np.uint32, np.int64, np.uint64, np.float, np.double] - - def test_sum(self): - @njit(parallel={'offload':True}) - def f(a): - c = np.sum(a) - return c - - self.assertTrue(test_for_different_datatypes(f, np.sum, [10], 1, self.tys)) - self.assertTrue(test_for_dimensions(f, np.sum, [10, 2], self.tys)) - self.assertTrue(test_for_dimensions(f, np.sum, [10, 2, 3], self.tys)) - - def test_prod(self): - @njit(parallel={'offload':True}) - def f(a): - c = np.prod(a) - return c - - self.assertTrue(test_for_different_datatypes(f, np.prod, [10], 1, self.tys)) - self.assertTrue(test_for_dimensions(f, np.prod, [10, 2], self.tys)) - self.assertTrue(test_for_dimensions(f, np.prod, [10, 2, 3], self.tys)) - - def test_argmax(self): - @njit(parallel={'offload':True}) - def f(a): - c = np.argmax(a) - return c - - self.assertTrue(test_for_different_datatypes(f, np.argmax, [10], 1, self.tys)) - self.assertTrue(test_for_dimensions(f, np.argmax, [10, 2], self.tys)) - self.assertTrue(test_for_dimensions(f, np.argmax, [10, 2, 3], self.tys)) - - def test_max(self): - @njit(parallel={'offload':True}) - def f(a): - c = np.max(a) - return c - - self.assertTrue(test_for_different_datatypes(f, np.max, [10], 1, self.tys)) - self.assertTrue(test_for_dimensions(f, np.max, [10, 2], self.tys)) - self.assertTrue(test_for_dimensions(f, np.max, [10, 2, 3], self.tys)) - - def test_argmin(self): - @njit(parallel={'offload':True}) - def f(a): - c = np.argmin(a) - return c - - self.assertTrue(test_for_different_datatypes(f, np.argmin, [10], 1, self.tys)) - self.assertTrue(test_for_dimensions(f, np.argmin, [10, 2], self.tys)) - self.assertTrue(test_for_dimensions(f, np.argmin, [10, 2, 3], self.tys)) - - def test_min(self): - @njit(parallel={'offload':True}) - def f(a): - c = np.min(a) - return c - - self.assertTrue(test_for_different_datatypes(f, np.min, [10], 1, self.tys)) - self.assertTrue(test_for_dimensions(f, np.min, [10, 2], self.tys)) - self.assertTrue(test_for_dimensions(f, np.min, [10, 2, 3], self.tys)) - - def test_argsort(self): - @njit(parallel={'offload':True}) - def f(a): - c = np.argsort(a) - return c - - self.assertTrue(test_for_different_datatypes(f, np.argmin, [10], 1, self.tys, np_all=True)) - - def test_median(self): - @njit(parallel={'offload':True}) - def f(a): - c = np.median(a) - return c - - self.assertTrue(test_for_different_datatypes(f, np.median, [10], 1, self.tys)) - self.assertTrue(test_for_dimensions(f, np.median, [10, 2], self.tys)) - self.assertTrue(test_for_dimensions(f, np.median, [10, 2, 3], self.tys)) - - def test_mean(self): - @njit(parallel={'offload':True}) - def f(a): - c = np.mean(a) - return c - - self.assertTrue(test_for_different_datatypes(f, np.mean, [10], 1, self.tys)) - self.assertTrue(test_for_dimensions(f, np.mean, [10, 2], self.tys)) - self.assertTrue(test_for_dimensions(f, np.mean, [10, 2, 3], self.tys)) - - def test_matmul(self): - @njit(parallel={'offload':True}) - def f(a, b): - c = np.matmul(a, b) - return c - - self.assertTrue(test_for_different_datatypes(f, np.matmul, [10, 5, 5, 10], 2, [np.float, np.double], np_all=True, matrix=[True, True])) - - def test_dot(self): - @njit(parallel={'offload':True}) - def f(a, b): - c = np.dot(a, b) - return c - - self.assertTrue(test_for_different_datatypes(f, np.dot, [10, 1, 10, 1], 2, [np.float, np.double])) - 
self.assertTrue(test_for_different_datatypes(f, np.dot, [10, 1, 10, 2], 2, [np.float, np.double], matrix=[False, True], np_all=True)) - self.assertTrue(test_for_different_datatypes(f, np.dot, [2, 10, 10, 1], 2, [np.float, np.double], matrix=[True, False], np_all=True)) - self.assertTrue(test_for_different_datatypes(f, np.dot, [10, 2, 2, 10], 2, [np.float, np.double], matrix=[True, True], np_all=True)) - - - def test_cov(self): - @njit(parallel={'offload':True}) - def f(a): - c = np.cov(a) - return c - - self.assertTrue(test_for_different_datatypes(f, np.cov, [10, 7], 1, self.tys, matrix=[True], np_all=True)) - - def test_dpnp_interacting_with_parfor(self): - @njit(parallel={'offload':True}) - def f(a, b): - c = np.sum(a) - e = np.add(b, a) - #d = a + 1 - return 0 - - result = f(self.a, self.b) - #np_result = np.add((self.a + np.sum(self.a)), self.b) - - #max_abs_err = result.sum() - np_result.sum() - #self.assertTrue(max_abs_err < 1e-4) - - -if __name__ == '__main__': - unittest.main() diff --git a/numba_dppy/tests/dppl/test_dppl_fallback.py b/numba_dppy/tests/dppl/test_dppl_fallback.py deleted file mode 100644 index adb7ae868b..0000000000 --- a/numba_dppy/tests/dppl/test_dppl_fallback.py +++ /dev/null @@ -1,56 +0,0 @@ -from __future__ import print_function, division, absolute_import - -import numpy as np - -import numba -import numba_dppy, numba_dppy as dppl -from numba_dppy.testing import unittest -from numba_dppy.testing import DPPLTestCase -from numba.tests.support import captured_stderr -import dpctl -import sys -import io - - -@unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system') -class TestDPPLFallback(DPPLTestCase): - def test_dppl_fallback_inner_call(self): - @numba.jit - def fill_value(i): - return i - - def inner_call_fallback(): - x = 10 - a = np.empty(shape=x, dtype=np.float32) - - for i in numba.prange(x): - a[i] = fill_value(i) - - return a - - with captured_stderr() as msg: - dppl = numba.njit(parallel={'offload':True})(inner_call_fallback) - dppl_result = dppl() - - ref_result = inner_call_fallback() - - np.testing.assert_array_equal(dppl_result, ref_result) - self.assertTrue('Failed to lower parfor on DPPL-device' in msg.getvalue()) - - def test_dppl_fallback_reductions(self): - def reduction(a): - return np.amax(a) - - a = np.ones(10) - with captured_stderr() as msg: - dppl = numba.njit(parallel={'offload':True})(reduction) - dppl_result = dppl(a) - - ref_result = reduction(a) - - np.testing.assert_array_equal(dppl_result, ref_result) - self.assertTrue('Failed to lower parfor on DPPL-device' in msg.getvalue()) - - -if __name__ == '__main__': - unittest.main() diff --git a/numba_dppy/tests/dppl/test_parfor_lower_message.py b/numba_dppy/tests/dppl/test_parfor_lower_message.py deleted file mode 100644 index 728d46ddf3..0000000000 --- a/numba_dppy/tests/dppl/test_parfor_lower_message.py +++ /dev/null @@ -1,35 +0,0 @@ -import numpy as np -import numba -from numba import njit, prange -import numba_dppy, numba_dppy as dppl -from numba_dppy.testing import unittest, DPPLTestCase -from numba.tests.support import captured_stdout -import dpctl.ocldrv as ocldrv - - -def prange_example(): - n = 10 - a = np.ones((n), dtype=np.float64) - b = np.ones((n), dtype=np.float64) - c = np.ones((n), dtype=np.float64) - for i in prange(n//2): - a[i] = b[i] + c[i] - - return a - - -@unittest.skipUnless(ocldrv.has_gpu_device, 'test only on GPU system') -class TestParforMessage(DPPLTestCase): - def test_parfor_message(self): - numba_dppy.compiler.DEBUG = 1 - jitted = 
njit(parallel={'offload':True})(prange_example) - - with captured_stdout() as got: - jitted() - - numba_dppy.compiler.DEBUG = 0 - self.assertTrue('Parfor lowered on DPPL-device' in got.getvalue()) - - -if __name__ == '__main__': - unittest.main() diff --git a/numba_dppy/tests/skip_tests.py b/numba_dppy/tests/skip_tests.py new file mode 100644 index 0000000000..fa18d36181 --- /dev/null +++ b/numba_dppy/tests/skip_tests.py @@ -0,0 +1,11 @@ +import dpctl + +def is_gen12(device_type): + with dpctl.device_context(device_type): + q = dpctl.get_current_queue() + device = q.get_sycl_device() + name = device.get_device_name() + if "Gen12" in name: + return True + + return False diff --git a/numba_dppy/tests/dppl/test_arg_accessor.py b/numba_dppy/tests/test_arg_accessor.py similarity index 78% rename from numba_dppy/tests/dppl/test_arg_accessor.py rename to numba_dppy/tests/test_arg_accessor.py index ecc5d839bb..494f269c59 100644 --- a/numba_dppy/tests/dppl/test_arg_accessor.py +++ b/numba_dppy/tests/test_arg_accessor.py @@ -1,26 +1,23 @@ -from __future__ import print_function, division, absolute_import - import numpy as np -import numba_dppy, numba_dppy as dppl -from numba_dppy.testing import unittest -from numba_dppy.testing import DPPLTestCase +import numba_dppy, numba_dppy as dppy +import unittest import dpctl -@dppl.kernel(access_types={"read_only": ['a', 'b'], "write_only": ['c'], "read_write": []}) +@dppy.kernel(access_types={"read_only": ['a', 'b'], "write_only": ['c'], "read_write": []}) def sum_with_accessor(a, b, c): - i = dppl.get_global_id(0) + i = dppy.get_global_id(0) c[i] = a[i] + b[i] -@dppl.kernel +@dppy.kernel def sum_without_accessor(a, b, c): - i = dppl.get_global_id(0) + i = dppy.get_global_id(0) c[i] = a[i] + b[i] def call_kernel(global_size, local_size, A, B, C, func): - func[global_size, dppl.DEFAULT_LOCAL_SIZE](A, B, C) + func[global_size, dppy.DEFAULT_LOCAL_SIZE](A, B, C) global_size = 10 @@ -33,7 +30,7 @@ def call_kernel(global_size, local_size, @unittest.skipUnless(dpctl.has_cpu_queues(), 'test only on CPU system') -class TestDPPLArgAccessorCPU(DPPLTestCase): +class TestDPPYArgAccessorCPU(unittest.TestCase): def test_arg_with_accessor(self): C = np.ones_like(A) with dpctl.device_context("opencl:cpu") as cpu_queue: @@ -50,7 +47,7 @@ def test_arg_without_accessor(self): @unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system') -class TestDPPLArgAccessorGPU(DPPLTestCase): +class TestDPPYArgAccessorGPU(unittest.TestCase): def test_arg_with_accessor(self): C = np.ones_like(A) with dpctl.device_context("opencl:gpu") as gpu_queue: diff --git a/numba_dppy/tests/dppl/test_arg_types.py b/numba_dppy/tests/test_arg_types.py similarity index 78% rename from numba_dppy/tests/dppl/test_arg_types.py rename to numba_dppy/tests/test_arg_types.py index fc2eae105d..ed55e12e16 100644 --- a/numba_dppy/tests/dppl/test_arg_types.py +++ b/numba_dppy/tests/test_arg_types.py @@ -1,20 +1,17 @@ -from __future__ import print_function, division, absolute_import - import numpy as np -import numba_dppy, numba_dppy as dppl -from numba_dppy.testing import unittest -from numba_dppy.testing import DPPLTestCase +import numba_dppy, numba_dppy as dppy +import unittest import dpctl -@dppl.kernel +@dppy.kernel def mul_kernel(A, B, test): - i = dppl.get_global_id(0) + i = dppy.get_global_id(0) B[i] = A[i] * test def call_mul_device_kernel(global_size, A, B, test): - mul_kernel[global_size, dppl.DEFAULT_LOCAL_SIZE](A, B, test) + mul_kernel[global_size, dppy.DEFAULT_LOCAL_SIZE](A, B, test) global_size 
= 10 @@ -24,7 +21,7 @@ def call_mul_device_kernel(global_size, A, B, test): @unittest.skipUnless(dpctl.has_cpu_queues(), 'test only on CPU system') -class TestDPPLArrayArgCPU(DPPLTestCase): +class TestDPPYArrayArgCPU(unittest.TestCase): def test_integer_arg(self): x = np.int32(2) with dpctl.device_context("opencl:cpu") as cpu_queue: @@ -42,7 +39,7 @@ def test_float_arg(self): self.assertTrue(np.all(A * x == B)) def test_bool_arg(self): - @dppl.kernel + @dppy.kernel def check_bool_kernel(A, test): if test: A[0] = 111 @@ -52,14 +49,14 @@ def check_bool_kernel(A, test): A = np.array([0], dtype='float64') with dpctl.device_context("opencl:cpu") as cpu_queue: - check_bool_kernel[global_size, dppl.DEFAULT_LOCAL_SIZE](A, True) + check_bool_kernel[global_size, dppy.DEFAULT_LOCAL_SIZE](A, True) self.assertTrue(A[0] == 111) - check_bool_kernel[global_size, dppl.DEFAULT_LOCAL_SIZE](A, False) + check_bool_kernel[global_size, dppy.DEFAULT_LOCAL_SIZE](A, False) self.assertTrue(A[0] == 222) @unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system') -class TestDPPLArrayArgGPU(DPPLTestCase): +class TestDPPYArrayArgGPU(unittest.TestCase): def test_integer_arg(self): x = np.int32(2) with dpctl.device_context("opencl:gpu") as gpu_queue: @@ -77,7 +74,7 @@ def test_float_arg(self): self.assertTrue(np.all(A * x == B)) def test_bool_arg(self): - @dppl.kernel + @dppy.kernel def check_bool_kernel(A, test): if test: A[0] = 111 @@ -87,9 +84,9 @@ def check_bool_kernel(A, test): A = np.array([0], dtype='float64') with dpctl.device_context("opencl:gpu") as gpu_queue: - check_bool_kernel[global_size, dppl.DEFAULT_LOCAL_SIZE](A, True) + check_bool_kernel[global_size, dppy.DEFAULT_LOCAL_SIZE](A, True) self.assertTrue(A[0] == 111) - check_bool_kernel[global_size, dppl.DEFAULT_LOCAL_SIZE](A, False) + check_bool_kernel[global_size, dppy.DEFAULT_LOCAL_SIZE](A, False) self.assertTrue(A[0] == 222) if __name__ == '__main__': diff --git a/numba_dppy/tests/dppl/test_atomic_op.py b/numba_dppy/tests/test_atomic_op.py similarity index 55% rename from numba_dppy/tests/dppl/test_atomic_op.py rename to numba_dppy/tests/test_atomic_op.py index 9825c707d1..27a810ba08 100644 --- a/numba_dppy/tests/dppl/test_atomic_op.py +++ b/numba_dppy/tests/test_atomic_op.py @@ -1,108 +1,105 @@ -from __future__ import print_function, division, absolute_import - import numpy as np import numba -import numba_dppy, numba_dppy as dppl -from numba_dppy.testing import unittest -from numba_dppy.testing import DPPLTestCase +import numba_dppy, numba_dppy as dppy +import unittest import dpctl def atomic_add_int32(ary): - tid = dppl.get_local_id(0) - lm = dppl.local.static_alloc(32, numba.uint32) + tid = dppy.get_local_id(0) + lm = dppy.local.static_alloc(32, numba.uint32) lm[tid] = 0 - dppl.barrier(dppl.CLK_GLOBAL_MEM_FENCE) + dppy.barrier(dppy.CLK_GLOBAL_MEM_FENCE) bin = ary[tid] % 32 - dppl.atomic.add(lm, bin, 1) - dppl.barrier(dppl.CLK_GLOBAL_MEM_FENCE) + dppy.atomic.add(lm, bin, 1) + dppy.barrier(dppy.CLK_GLOBAL_MEM_FENCE) ary[tid] = lm[tid] def atomic_sub_int32(ary): - tid = dppl.get_local_id(0) - lm = dppl.local.static_alloc(32, numba.uint32) + tid = dppy.get_local_id(0) + lm = dppy.local.static_alloc(32, numba.uint32) lm[tid] = 0 - dppl.barrier(dppl.CLK_GLOBAL_MEM_FENCE) + dppy.barrier(dppy.CLK_GLOBAL_MEM_FENCE) bin = ary[tid] % 32 - dppl.atomic.sub(lm, bin, 1) - dppl.barrier(dppl.CLK_GLOBAL_MEM_FENCE) + dppy.atomic.sub(lm, bin, 1) + dppy.barrier(dppy.CLK_GLOBAL_MEM_FENCE) ary[tid] = lm[tid] def atomic_add_float32(ary): - lm = 
dppl.local.static_alloc(1, numba.float32) + lm = dppy.local.static_alloc(1, numba.float32) lm[0] = ary[0] - dppl.barrier(dppl.CLK_GLOBAL_MEM_FENCE) - dppl.atomic.add(lm, 0, 1) - dppl.barrier(dppl.CLK_GLOBAL_MEM_FENCE) + dppy.barrier(dppy.CLK_GLOBAL_MEM_FENCE) + dppy.atomic.add(lm, 0, 1) + dppy.barrier(dppy.CLK_GLOBAL_MEM_FENCE) ary[0] = lm[0] def atomic_sub_float32(ary): - lm = dppl.local.static_alloc(1, numba.float32) + lm = dppy.local.static_alloc(1, numba.float32) lm[0] = ary[0] - dppl.barrier(dppl.CLK_GLOBAL_MEM_FENCE) - dppl.atomic.sub(lm, 0, 1) - dppl.barrier(dppl.CLK_GLOBAL_MEM_FENCE) + dppy.barrier(dppy.CLK_GLOBAL_MEM_FENCE) + dppy.atomic.sub(lm, 0, 1) + dppy.barrier(dppy.CLK_GLOBAL_MEM_FENCE) ary[0] = lm[0] def atomic_add_int64(ary): - lm = dppl.local.static_alloc(1, numba.int64) + lm = dppy.local.static_alloc(1, numba.int64) lm[0] = ary[0] - dppl.barrier(dppl.CLK_GLOBAL_MEM_FENCE) - dppl.atomic.add(lm, 0, 1) - dppl.barrier(dppl.CLK_GLOBAL_MEM_FENCE) + dppy.barrier(dppy.CLK_GLOBAL_MEM_FENCE) + dppy.atomic.add(lm, 0, 1) + dppy.barrier(dppy.CLK_GLOBAL_MEM_FENCE) ary[0] = lm[0] def atomic_sub_int64(ary): - lm = dppl.local.static_alloc(1, numba.int64) + lm = dppy.local.static_alloc(1, numba.int64) lm[0] = ary[0] - dppl.barrier(dppl.CLK_GLOBAL_MEM_FENCE) - dppl.atomic.sub(lm, 0, 1) - dppl.barrier(dppl.CLK_GLOBAL_MEM_FENCE) + dppy.barrier(dppy.CLK_GLOBAL_MEM_FENCE) + dppy.atomic.sub(lm, 0, 1) + dppy.barrier(dppy.CLK_GLOBAL_MEM_FENCE) ary[0] = lm[0] def atomic_add_float64(ary): - lm = dppl.local.static_alloc(1, numba.float64) + lm = dppy.local.static_alloc(1, numba.float64) lm[0] = ary[0] - dppl.barrier(dppl.CLK_GLOBAL_MEM_FENCE) - dppl.atomic.add(lm, 0, 1) - dppl.barrier(dppl.CLK_GLOBAL_MEM_FENCE) + dppy.barrier(dppy.CLK_GLOBAL_MEM_FENCE) + dppy.atomic.add(lm, 0, 1) + dppy.barrier(dppy.CLK_GLOBAL_MEM_FENCE) ary[0] = lm[0] def atomic_sub_float64(ary): - lm = dppl.local.static_alloc(1, numba.float64) + lm = dppy.local.static_alloc(1, numba.float64) lm[0] = ary[0] - dppl.barrier(dppl.CLK_GLOBAL_MEM_FENCE) - dppl.atomic.sub(lm, 0, 1) - dppl.barrier(dppl.CLK_GLOBAL_MEM_FENCE) + dppy.barrier(dppy.CLK_GLOBAL_MEM_FENCE) + dppy.atomic.sub(lm, 0, 1) + dppy.barrier(dppy.CLK_GLOBAL_MEM_FENCE) ary[0] = lm[0] def atomic_add2(ary): - tx = dppl.get_local_id(0) - ty = dppl.get_local_id(1) - lm = dppl.local.static_alloc((4, 8), numba.uint32) + tx = dppy.get_local_id(0) + ty = dppy.get_local_id(1) + lm = dppy.local.static_alloc((4, 8), numba.uint32) lm[tx, ty] = ary[tx, ty] - dppl.barrier(dppl.CLK_GLOBAL_MEM_FENCE) - dppl.atomic.add(lm, (tx, ty), 1) - dppl.barrier(dppl.CLK_GLOBAL_MEM_FENCE) + dppy.barrier(dppy.CLK_GLOBAL_MEM_FENCE) + dppy.atomic.add(lm, (tx, ty), 1) + dppy.barrier(dppy.CLK_GLOBAL_MEM_FENCE) ary[tx, ty] = lm[tx, ty] def atomic_add3(ary): - tx = dppl.get_local_id(0) - ty = dppl.get_local_id(1) - lm = dppl.local.static_alloc((4, 8), numba.uint32) + tx = dppy.get_local_id(0) + ty = dppy.get_local_id(1) + lm = dppy.local.static_alloc((4, 8), numba.uint32) lm[tx, ty] = ary[tx, ty] - dppl.barrier(dppl.CLK_GLOBAL_MEM_FENCE) - dppl.atomic.add(lm, (tx, numba.uint64(ty)), 1) - dppl.barrier(dppl.CLK_GLOBAL_MEM_FENCE) + dppy.barrier(dppy.CLK_GLOBAL_MEM_FENCE) + dppy.atomic.add(lm, (tx, numba.uint64(ty)), 1) + dppy.barrier(dppy.CLK_GLOBAL_MEM_FENCE) ary[tx, ty] = lm[tx, ty] @@ -118,18 +115,18 @@ def call_fn_for_datatypes(fn, result, input, global_size): # continue #if dtype == np.int64 and not device_env.device_support_int64_atomics(): # continue - fn[global_size, dppl.DEFAULT_LOCAL_SIZE](a) + 
fn[global_size, dppy.DEFAULT_LOCAL_SIZE](a) assert(a[0] == result) @unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system') @unittest.skipUnless(numba_dppy.ocl.atomic_support_present(), 'test only when atomic support is present') -class TestAtomicOp(DPPLTestCase): +class TestAtomicOp(unittest.TestCase): def test_atomic_add_global(self): - @dppl.kernel + @dppy.kernel def atomic_add(B): - dppl.atomic.add(B, 0, 1) + dppy.atomic.add(B, 0, 1) N = 100 B = np.array([0]) @@ -138,9 +135,9 @@ def atomic_add(B): def test_atomic_sub_global(self): - @dppl.kernel + @dppy.kernel def atomic_sub(B): - dppl.atomic.sub(B, 0, 1) + dppy.atomic.sub(B, 0, 1) N = 100 B = np.array([100]) @@ -152,10 +149,10 @@ def test_atomic_add_local_int32(self): ary = np.random.randint(0, 32, size=32).astype(np.uint32) orig = ary.copy() - #dppl_atomic_add = dppl.kernel('void(uint32[:])')(atomic_add_int32) - dppl_atomic_add = dppl.kernel(atomic_add_int32) + #dppy_atomic_add = dppy.kernel('void(uint32[:])')(atomic_add_int32) + dppy_atomic_add = dppy.kernel(atomic_add_int32) with dpctl.device_context("opencl:gpu") as gpu_queue: - dppl_atomic_add[32, dppl.DEFAULT_LOCAL_SIZE](ary) + dppy_atomic_add[32, dppy.DEFAULT_LOCAL_SIZE](ary) gold = np.zeros(32, dtype=np.uint32) for i in range(orig.size): @@ -168,10 +165,10 @@ def test_atomic_sub_local_int32(self): ary = np.random.randint(0, 32, size=32).astype(np.uint32) orig = ary.copy() - #dppl_atomic_sub = dppl.kernel('void(uint32[:])')(atomic_sub_int32) - dppl_atomic_sub = dppl.kernel(atomic_sub_int32) + #dppy_atomic_sub = dppy.kernel('void(uint32[:])')(atomic_sub_int32) + dppy_atomic_sub = dppy.kernel(atomic_sub_int32) with dpctl.device_context("opencl:gpu") as gpu_queue: - dppl_atomic_sub[32, dppl.DEFAULT_LOCAL_SIZE](ary) + dppy_atomic_sub[32, dppy.DEFAULT_LOCAL_SIZE](ary) gold = np.zeros(32, dtype=np.uint32) for i in range(orig.size): @@ -183,10 +180,10 @@ def test_atomic_sub_local_int32(self): def test_atomic_add_local_float32(self): ary = np.array([0], dtype=np.float32) - #dppl_atomic_add = dppl.kernel('void(float32[:])')(atomic_add_float32) - dppl_atomic_add = dppl.kernel(atomic_add_float32) + #dppy_atomic_add = dppy.kernel('void(float32[:])')(atomic_add_float32) + dppy_atomic_add = dppy.kernel(atomic_add_float32) with dpctl.device_context("opencl:gpu") as gpu_queue: - dppl_atomic_add[32, dppl.DEFAULT_LOCAL_SIZE](ary) + dppy_atomic_add[32, dppy.DEFAULT_LOCAL_SIZE](ary) self.assertTrue(ary[0] == 32) @@ -194,11 +191,11 @@ def test_atomic_add_local_float32(self): def test_atomic_sub_local_float32(self): ary = np.array([32], dtype=np.float32) - #dppl_atomic_sub = dppl.kernel('void(float32[:])')(atomic_sub_float32) - dppl_atomic_sub = dppl.kernel(atomic_sub_float32) + #dppy_atomic_sub = dppy.kernel('void(float32[:])')(atomic_sub_float32) + dppy_atomic_sub = dppy.kernel(atomic_sub_float32) with dpctl.device_context("opencl:gpu") as gpu_queue: - dppl_atomic_sub[32, dppl.DEFAULT_LOCAL_SIZE](ary) + dppy_atomic_sub[32, dppy.DEFAULT_LOCAL_SIZE](ary) self.assertTrue(ary[0] == 0) @@ -206,12 +203,12 @@ def test_atomic_sub_local_float32(self): def test_atomic_add_local_int64(self): ary = np.array([0], dtype=np.int64) - #dppl_atomic_add = dppl.kernel('void(int64[:])')(atomic_add_int64) - dppl_atomic_add = dppl.kernel(atomic_add_int64) + #dppy_atomic_add = dppy.kernel('void(int64[:])')(atomic_add_int64) + dppy_atomic_add = dppy.kernel(atomic_add_int64) with dpctl.device_context("opencl:gpu") as gpu_queue: # TODO: dpctl needs to expose this functions #if 
device_env.device_support_int64_atomics(): - dppl_atomic_add[32, dppl.DEFAULT_LOCAL_SIZE](ary) + dppy_atomic_add[32, dppy.DEFAULT_LOCAL_SIZE](ary) self.assertTrue(ary[0] == 32) #else: # return @@ -220,12 +217,12 @@ def test_atomic_add_local_int64(self): def test_atomic_sub_local_int64(self): ary = np.array([32], dtype=np.int64) - #fn = dppl.kernel('void(int64[:])')(atomic_sub_int64) - fn = dppl.kernel(atomic_sub_int64) + #fn = dppy.kernel('void(int64[:])')(atomic_sub_int64) + fn = dppy.kernel(atomic_sub_int64) with dpctl.device_context("opencl:gpu") as gpu_queue: # TODO: dpctl needs to expose this functions #if device_env.device_support_int64_atomics(): - fn[32, dppl.DEFAULT_LOCAL_SIZE](ary) + fn[32, dppy.DEFAULT_LOCAL_SIZE](ary) self.assertTrue(ary[0] == 0) #else: # return @@ -234,12 +231,12 @@ def test_atomic_sub_local_int64(self): def test_atomic_add_local_float64(self): ary = np.array([0], dtype=np.double) - #fn = dppl.kernel('void(float64[:])')(atomic_add_float64) - fn = dppl.kernel(atomic_add_float64) + #fn = dppy.kernel('void(float64[:])')(atomic_add_float64) + fn = dppy.kernel(atomic_add_float64) with dpctl.device_context("opencl:gpu") as gpu_queue: # TODO: dpctl needs to expose this functions #if device_env.device_support_float64_atomics(): - fn[32, dppl.DEFAULT_LOCAL_SIZE](ary) + fn[32, dppy.DEFAULT_LOCAL_SIZE](ary) self.assertTrue(ary[0] == 32) #else: # return @@ -248,12 +245,12 @@ def test_atomic_add_local_float64(self): def test_atomic_sub_local_float64(self): ary = np.array([32], dtype=np.double) - #fn = dppl.kernel('void(float64[:])')(atomic_sub_int64) - fn = dppl.kernel(atomic_sub_int64) + #fn = dppy.kernel('void(float64[:])')(atomic_sub_int64) + fn = dppy.kernel(atomic_sub_int64) with dpctl.device_context("opencl:gpu") as gpu_queue: # TODO: dpctl needs to expose this functions #if device_env.device_support_float64_atomics(): - fn[32, dppl.DEFAULT_LOCAL_SIZE](ary) + fn[32, dppy.DEFAULT_LOCAL_SIZE](ary) self.assertTrue(ary[0] == 0) #else: # return @@ -262,20 +259,20 @@ def test_atomic_sub_local_float64(self): def test_atomic_add2(self): ary = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8) orig = ary.copy() - #dppl_atomic_add2 = dppl.kernel('void(uint32[:,:])')(atomic_add2) - dppl_atomic_add2 = dppl.kernel(atomic_add2) + #dppy_atomic_add2 = dppy.kernel('void(uint32[:,:])')(atomic_add2) + dppy_atomic_add2 = dppy.kernel(atomic_add2) with dpctl.device_context("opencl:gpu") as gpu_queue: - dppl_atomic_add2[(4, 8), dppl.DEFAULT_LOCAL_SIZE](ary) + dppy_atomic_add2[(4, 8), dppy.DEFAULT_LOCAL_SIZE](ary) self.assertTrue(np.all(ary == orig + 1)) def test_atomic_add3(self): ary = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8) orig = ary.copy() - #dppl_atomic_add3 = dppl.kernel('void(uint32[:,:])')(atomic_add3) - dppl_atomic_add3 = dppl.kernel(atomic_add3) + #dppy_atomic_add3 = dppy.kernel('void(uint32[:,:])')(atomic_add3) + dppy_atomic_add3 = dppy.kernel(atomic_add3) with dpctl.device_context("opencl:gpu") as gpu_queue: - dppl_atomic_add3[(4, 8), dppl.DEFAULT_LOCAL_SIZE](ary) + dppy_atomic_add3[(4, 8), dppy.DEFAULT_LOCAL_SIZE](ary) self.assertTrue(np.all(ary == orig + 1)) diff --git a/numba_dppy/tests/dppl/test_barrier.py b/numba_dppy/tests/test_barrier.py similarity index 65% rename from numba_dppy/tests/dppl/test_barrier.py rename to numba_dppy/tests/test_barrier.py index aeff16dd40..7cedc18f13 100644 --- a/numba_dppy/tests/dppl/test_barrier.py +++ b/numba_dppy/tests/test_barrier.py @@ -1,23 +1,20 @@ -from __future__ import print_function, 
division, absolute_import - import numpy as np -from numba_dppy.testing import unittest -from numba_dppy.testing import DPPLTestCase +import unittest from numba import float32 -import numba_dppy, numba_dppy as dppl +import numba_dppy, numba_dppy as dppy import dpctl @unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system') class TestBarrier(unittest.TestCase): def test_proper_lowering(self): - #@dppl.kernel("void(float32[::1])") - @dppl.kernel + #@dppy.kernel("void(float32[::1])") + @dppy.kernel def twice(A): - i = dppl.get_global_id(0) + i = dppy.get_global_id(0) d = A[i] - dppl.barrier(dppl.CLK_LOCAL_MEM_FENCE) # local mem fence + dppy.barrier(dppy.CLK_LOCAL_MEM_FENCE) # local mem fence A[i] = d * 2 N = 256 @@ -31,13 +28,13 @@ def twice(A): np.testing.assert_allclose(orig * 2, arr) def test_no_arg_barrier_support(self): - #@dppl.kernel("void(float32[::1])") - @dppl.kernel + #@dppy.kernel("void(float32[::1])") + @dppy.kernel def twice(A): - i = dppl.get_global_id(0) + i = dppy.get_global_id(0) d = A[i] # no argument defaults to global mem fence - dppl.barrier() + dppy.barrier() A[i] = d * 2 N = 256 @@ -45,7 +42,7 @@ def twice(A): orig = arr.copy() with dpctl.device_context("opencl:gpu") as gpu_queue: - twice[N, dppl.DEFAULT_LOCAL_SIZE](arr) + twice[N, dppy.DEFAULT_LOCAL_SIZE](arr) # The computation is correct? np.testing.assert_allclose(orig * 2, arr) @@ -54,16 +51,16 @@ def twice(A): def test_local_memory(self): blocksize = 10 - #@dppl.kernel("void(float32[::1])") - @dppl.kernel + #@dppy.kernel("void(float32[::1])") + @dppy.kernel def reverse_array(A): - lm = dppl.local.static_alloc(shape=10, dtype=float32) - i = dppl.get_global_id(0) + lm = dppy.local.static_alloc(shape=10, dtype=float32) + i = dppy.get_global_id(0) # preload lm[i] = A[i] # barrier local or global will both work as we only have one work group - dppl.barrier(dppl.CLK_LOCAL_MEM_FENCE) # local mem fence + dppy.barrier(dppy.CLK_LOCAL_MEM_FENCE) # local mem fence # write A[i] += lm[blocksize - 1 - i] @@ -71,7 +68,7 @@ def reverse_array(A): orig = arr.copy() with dpctl.device_context("opencl:gpu") as gpu_queue: - reverse_array[blocksize, dppl.DEFAULT_LOCAL_SIZE](arr) + reverse_array[blocksize, dppy.DEFAULT_LOCAL_SIZE](arr) expected = orig[::-1] + orig np.testing.assert_allclose(expected, arr) diff --git a/numba_dppy/tests/dppl/test_black_scholes.py b/numba_dppy/tests/test_black_scholes.py similarity index 89% rename from numba_dppy/tests/dppl/test_black_scholes.py rename to numba_dppy/tests/test_black_scholes.py index 3d9581bb54..7baecbeda5 100644 --- a/numba_dppy/tests/dppl/test_black_scholes.py +++ b/numba_dppy/tests/test_black_scholes.py @@ -1,12 +1,9 @@ -from __future__ import print_function, division, absolute_import - import numpy as np import math import time -import numba_dppy, numba_dppy as dppl -from numba_dppy.testing import unittest -from numba_dppy.testing import DPPLTestCase +import numba_dppy, numba_dppy as dppy +import unittest import dpctl @@ -49,7 +46,7 @@ def randfloat(rand_var, low, high): @unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system') -class TestDPPLBlackScholes(DPPLTestCase): +class TestDPPYBlackScholes(unittest.TestCase): def test_black_scholes(self): OPT_N = 400 iterations = 2 @@ -70,9 +67,9 @@ def test_black_scholes(self): optionStrike, optionYears, RISKFREE, VOLATILITY) - @dppl.kernel - def black_scholes_dppl(callResult, putResult, S, X, T, R, V): - i = dppl.get_global_id(0) + @dppy.kernel + def black_scholes_dppy(callResult, putResult, S, X, T, R, V): + i = 
dppy.get_global_id(0) if i >= S.shape[0]: return sqrtT = math.sqrt(T[i]) @@ -103,7 +100,7 @@ def black_scholes_dppl(callResult, putResult, S, X, T, R, V): with dpctl.device_context("opencl:gpu") as gpu_queue: time1 = time.time() for i in range(iterations): - black_scholes_dppl[blockdim, griddim]( + black_scholes_dppy[blockdim, griddim]( callResultNumbapro, putResultNumbapro, stockPrice, optionStrike, optionYears, RISKFREE, VOLATILITY) diff --git a/numba_dppy/tests/dppl/test_caching.py b/numba_dppy/tests/test_caching.py similarity index 57% rename from numba_dppy/tests/dppl/test_caching.py rename to numba_dppy/tests/test_caching.py index 6a6a7967a5..268401ce98 100644 --- a/numba_dppy/tests/dppl/test_caching.py +++ b/numba_dppy/tests/test_caching.py @@ -1,20 +1,16 @@ -from __future__ import print_function -from timeit import default_timer as time - import sys import numpy as np -import numba_dppy, numba_dppy as dppl +import numba_dppy, numba_dppy as dppy import dpctl -from numba_dppy.testing import unittest -from numba_dppy.testing import DPPLTestCase +import unittest def data_parallel_sum(a, b, c): - i = dppl.get_global_id(0) + i = dppy.get_global_id(0) c[i] = a[i] + b[i] -class TestCaching(DPPLTestCase): +class TestCaching(unittest.TestCase): def test_caching_kernel(self): global_size = 10 N = global_size @@ -25,11 +21,11 @@ def test_caching_kernel(self): with dpctl.device_context("opencl:gpu") as gpu_queue: - func = dppl.kernel(data_parallel_sum) - caching_kernel = func[global_size, dppl.DEFAULT_LOCAL_SIZE].specialize(a, b, c) + func = dppy.kernel(data_parallel_sum) + caching_kernel = func[global_size, dppy.DEFAULT_LOCAL_SIZE].specialize(a, b, c) for i in range(10): - cached_kernel = func[global_size, dppl.DEFAULT_LOCAL_SIZE].specialize(a, b, c) + cached_kernel = func[global_size, dppy.DEFAULT_LOCAL_SIZE].specialize(a, b, c) self.assertIs(caching_kernel, cached_kernel) diff --git a/numba_dppy/tests/test_controllable_fallback.py b/numba_dppy/tests/test_controllable_fallback.py new file mode 100644 index 0000000000..357f0b5e20 --- /dev/null +++ b/numba_dppy/tests/test_controllable_fallback.py @@ -0,0 +1,71 @@ +import numpy as np + +import numba +import numba_dppy +from numba_dppy.testing import unittest +from numba.tests.support import captured_stderr +import dpctl + + +@unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system') +class TestDPPYFallback(unittest.TestCase): + def test_dppy_fallback_true(self): + @numba.jit + def fill_value(i): + return i + + def inner_call_fallback(): + x = 10 + a = np.empty(shape=x, dtype=np.float32) + + for i in numba.prange(x): + a[i] = fill_value(i) + + return a + + numba_dppy.compiler.DEBUG = 1 + with captured_stderr() as msg_fallback_true: + with dpctl.device_context("opencl:gpu") as gpu_queue: + dppy = numba.njit(parallel=True)(inner_call_fallback) + dppy_fallback_true = dppy() + + ref_result = inner_call_fallback() + numba_dppy.compiler.DEBUG = 0 + + np.testing.assert_array_equal(dppy_fallback_true, ref_result) + self.assertTrue('Failed to lower parfor on DPPY-device' in msg_fallback_true.getvalue()) + + @unittest.expectedFailure + def test_dppy_fallback_false(self): + @numba.jit + def fill_value(i): + return i + + def inner_call_fallback(): + x = 10 + a = np.empty(shape=x, dtype=np.float32) + + for i in numba.prange(x): + a[i] = fill_value(i) + + return a + + try: + numba_dppy.compiler.DEBUG = 1 + numba_dppy.config.FALLBACK_ON_CPU = 0 + with captured_stderr() as msg_fallback_true: + with dpctl.device_context("opencl:gpu") as gpu_queue: + 
dppy = numba.njit(parallel=True)(inner_call_fallback) + dppy_fallback_false = dppy() + + finally: + ref_result = inner_call_fallback() + numba_dppy.config.FALLBACK_ON_CPU = 1 + numba_dppy.compiler.DEBUG = 0 + + not np.testing.assert_array_equal(dppy_fallback_false, ref_result) + not self.assertTrue('Failed to lower parfor on DPPY-device' in msg_fallback_true.getvalue()) + + +if __name__ == '__main__': + unittest.main() diff --git a/numba_dppy/tests/dppl/test_device_array_args.py b/numba_dppy/tests/test_device_array_args.py similarity index 64% rename from numba_dppy/tests/dppl/test_device_array_args.py rename to numba_dppy/tests/test_device_array_args.py index 024e3723a9..eb47cd28bc 100644 --- a/numba_dppy/tests/dppl/test_device_array_args.py +++ b/numba_dppy/tests/test_device_array_args.py @@ -1,17 +1,13 @@ #! /usr/bin/env python -from __future__ import print_function -from timeit import default_timer as time - import sys import numpy as np -import numba_dppy, numba_dppy as dppl +import numba_dppy, numba_dppy as dppy import dpctl -from numba_dppy.testing import unittest -from numba_dppy.testing import DPPLTestCase +import unittest -@dppl.kernel +@dppy.kernel def data_parallel_sum(a, b, c): - i = dppl.get_global_id(0) + i = dppy.get_global_id(0) c[i] = a[i] + b[i] @@ -24,23 +20,23 @@ def data_parallel_sum(a, b, c): @unittest.skipUnless(dpctl.has_cpu_queues(), 'test only on CPU system') -class TestDPPLDeviceArrayArgsGPU(DPPLTestCase): +class TestDPPYDeviceArrayArgsGPU(unittest.TestCase): def test_device_array_args_cpu(self): c = np.ones_like(a) with dpctl.device_context("opencl:cpu") as cpu_queue: - data_parallel_sum[global_size, dppl.DEFAULT_LOCAL_SIZE](a, b, c) + data_parallel_sum[global_size, dppy.DEFAULT_LOCAL_SIZE](a, b, c) self.assertTrue(np.all(c == d)) @unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system') -class TestDPPLDeviceArrayArgsCPU(DPPLTestCase): +class TestDPPYDeviceArrayArgsCPU(unittest.TestCase): def test_device_array_args_gpu(self): c = np.ones_like(a) with dpctl.device_context("opencl:gpu") as gpu_queue: - data_parallel_sum[global_size, dppl.DEFAULT_LOCAL_SIZE](a, b, c) + data_parallel_sum[global_size, dppy.DEFAULT_LOCAL_SIZE](a, b, c) self.assertTrue(np.all(c == d)) diff --git a/numba_dppy/tests/dppl/test_dpctl_api.py b/numba_dppy/tests/test_dpctl_api.py similarity index 71% rename from numba_dppy/tests/dppl/test_dpctl_api.py rename to numba_dppy/tests/test_dpctl_api.py index bb72a35cf2..59ddd16f65 100644 --- a/numba_dppy/tests/dppl/test_dpctl_api.py +++ b/numba_dppy/tests/test_dpctl_api.py @@ -1,14 +1,9 @@ -from __future__ import print_function, division, absolute_import - -import numpy as np - -from numba_dppy.testing import unittest -from numba_dppy.testing import DPPLTestCase +import unittest import dpctl @unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system') -class TestDPCTLAPI(DPPLTestCase): +class TestDPCTLAPI(unittest.TestCase): def test_dpctl_api(self): with dpctl.device_context("opencl:gpu") as gpu_queue: dpctl.dump() diff --git a/numba_dppy/tests/test_dpnp_functions.py b/numba_dppy/tests/test_dpnp_functions.py new file mode 100644 index 0000000000..166937c275 --- /dev/null +++ b/numba_dppy/tests/test_dpnp_functions.py @@ -0,0 +1,440 @@ +#! 
/usr/bin/env python +from timeit import default_timer as time + +import sys +import numpy as np +from numba import njit +import numba_dppy +import numba_dppy as dppy +import dpctl +import unittest +from numba_dppy.testing import ensure_dpnp + + +# Helper: run fn under a GPU device context for each dtype in tys and compare its result against the NumPy reference test_fn. +def test_for_different_datatypes(fn, test_fn, dims, arg_count, tys, np_all=False, matrix=None): + if arg_count == 1: + for ty in tys: + if matrix and matrix[0]: + a = np.array(np.random.random( + dims[0] * dims[1]), dtype=ty).reshape(dims[0], dims[1]) + else: + a = np.array(np.random.random(dims[0]), dtype=ty) + + with dpctl.device_context("opencl:gpu"): + c = fn(a) + + d = test_fn(a) + if np_all: + max_abs_err = np.all(c - d) + else: + max_abs_err = c - d + if not (max_abs_err < 1e-4): + return False + + elif arg_count == 2: + for ty in tys: + if matrix and matrix[0]: + a = np.array(np.random.random( + dims[0] * dims[1]), dtype=ty).reshape(dims[0], dims[1]) + else: + a = np.array(np.random.random(dims[0] * dims[1]), dtype=ty) + if matrix and matrix[1]: + b = np.array(np.random.random( + dims[2] * dims[3]), dtype=ty).reshape(dims[2], dims[3]) + else: + b = np.array(np.random.random(dims[2] * dims[3]), dtype=ty) + + with dpctl.device_context("opencl:gpu"): + c = fn(a, b) + + d = test_fn(a, b) + if np_all: + max_abs_err = np.sum(c - d) + else: + max_abs_err = c - d + if not (max_abs_err < 1e-4): + return False + + return True + + +def test_for_dimensions(fn, test_fn, dims, tys, np_all=False): + total_size = 1 + for d in dims: + total_size *= d + + for ty in tys: + a = np.array(np.random.random(total_size), dtype=ty).reshape(dims) + + with dpctl.device_context("opencl:gpu"): + c = fn(a) + + d = test_fn(a) + if np_all: + max_abs_err = np.all(c - d) + else: + max_abs_err = c - d + if not (max_abs_err < 1e-4): + return False + + return True + + +# From https://github.com/IntelPython/dpnp/blob/master/tests/test_linalg.py +def vvsort(val, vec, size): + for i in range(size): + imax = i + for j in range(i + 1, size): + if np.abs(val[imax]) < np.abs(val[j]): + imax = j + + temp = val[i] + val[i] = val[imax] + val[imax] = temp + + for k in range(size): + temp = vec[k, i] + vec[k, i] = vec[k, imax] + vec[k, imax] = temp + + +@unittest.skipUnless(ensure_dpnp(), 'test only when dpNP is available') +class Testdpnp_linalg_functions(unittest.TestCase): + tys = [np.int32, np.uint32, np.int64, np.uint64, np.float, np.double] + def test_eig(self): + @njit + def f(a): + return np.linalg.eig(a) + + size = 3 + for ty in self.tys: + a = np.arange(size * size, dtype=ty).reshape((size, size)) + symm_a = np.tril(a) + np.tril(a, -1).T + np.diag(np.full((size,), size * size, dtype=ty)) + + with dpctl.device_context("opencl:gpu"): + got_val, got_vec = f(symm_a) + + np_val, np_vec = np.linalg.eig(symm_a) + + # sort val/vec by abs value + vvsort(got_val, got_vec, size) + vvsort(np_val, np_vec, size) + + + # NumPy may flip the sign of eigenvectors + for i in range(np_vec.shape[1]): + if np_vec[0, i] * got_vec[0, i] < 0: + np_vec[:, i] = -np_vec[:, i] + + self.assertTrue(np.allclose(got_val, np_val)) + self.assertTrue(np.allclose(got_vec, np_vec)) + + +@unittest.skipUnless(ensure_dpnp(), 'test only when dpNP is available') +class Testdpnp_ndarray_functions(unittest.TestCase): + tys = [np.int32, np.uint32, np.int64, np.uint64, np.float, np.double] + def test_ndarray_sum(self): + @njit + def f(a): + return a.sum() + + size = 3 + for ty in self.tys: + a = np.arange(size * size, dtype=ty).reshape((size, size)) + + with dpctl.device_context("opencl:gpu"): + got = f(a) + expected
= a.sum() + + self.assertTrue(expected == got) + + def test_ndarray_prod(self): + @njit + def f(a): + return a.prod() + + size = 3 + for ty in self.tys: + a = np.arange(1, (size * size) + 1, dtype=ty).reshape((size, size)) + + with dpctl.device_context("opencl:gpu"): + got = f(a) + expected = a.prod() + + self.assertTrue(expected == got) + + def test_ndarray_max(self): + @njit + def f(a): + return a.max() + + size = 3 + for ty in self.tys: + a = np.arange(1, (size * size) + 1, dtype=ty).reshape((size, size)) + + with dpctl.device_context("opencl:gpu"): + got = f(a) + expected = a.max() + + self.assertTrue(expected == got) + + def test_ndarray_min(self): + @njit + def f(a): + return a.min() + + size = 3 + for ty in self.tys: + a = np.arange(1, (size * size) + 1, dtype=ty).reshape((size, size)) + + with dpctl.device_context("opencl:gpu"): + got = f(a) + expected = a.min() + + self.assertTrue(expected == got) + + def test_ndarray_mean(self): + @njit + def f(a): + return a.mean() + + size = 3 + for ty in self.tys: + a = np.arange(1, (size * size) + 1, dtype=ty).reshape((size, size)) + + with dpctl.device_context("opencl:gpu"): + got = f(a) + expected = a.mean() + + self.assertTrue(expected == got) + + def test_ndarray_argmax(self): + @njit + def f(a): + return a.argmax() + + size = 3 + for ty in self.tys: + a = np.arange(1, (size * size) + 1, dtype=ty).reshape((size, size)) + + with dpctl.device_context("opencl:gpu"): + got = f(a) + expected = a.argmax() + + self.assertTrue(expected == got) + + + def test_ndarray_argmin(self): + @njit + def f(a): + return a.argmin() + + size = 3 + for ty in self.tys: + a = np.arange(1, (size * size) + 1, dtype=ty).reshape((size, size)) + + with dpctl.device_context("opencl:gpu"): + got = f(a) + expected = a.argmin() + + self.assertTrue(expected == got) + + def test_ndarray_argsort(self): + @njit + def f(a): + return a.argsort() + + size = 3 + for ty in self.tys: + a = np.arange(1, (size * size) + 1, dtype=ty) + + with dpctl.device_context("opencl:gpu"): + got = f(a) + expected = a.argsort() + + self.assertTrue(np.array_equal(expected, got)) + + +@unittest.skipUnless(ensure_dpnp() and dpctl.has_gpu_queues(), 'test only when dpNP and GPU is available') +class Testdpnp_functions(unittest.TestCase): + N = 10 + + a = np.array(np.random.random(N), dtype=np.float32) + b = np.array(np.random.random(N), dtype=np.float32) + tys = [np.int32, np.uint32, np.int64, np.uint64, np.float, np.double] + + def test_sum(self): + @njit + def f(a): + c = np.sum(a) + return c + + self.assertTrue(test_for_different_datatypes( + f, np.sum, [10], 1, self.tys)) + self.assertTrue(test_for_dimensions(f, np.sum, [10, 2], self.tys)) + self.assertTrue(test_for_dimensions(f, np.sum, [10, 2, 3], self.tys)) + + def test_prod(self): + @njit + def f(a): + c = np.prod(a) + return c + + self.assertTrue(test_for_different_datatypes( + f, np.prod, [10], 1, self.tys)) + self.assertTrue(test_for_dimensions(f, np.prod, [10, 2], self.tys)) + self.assertTrue(test_for_dimensions(f, np.prod, [10, 2, 3], self.tys)) + + def test_argmax(self): + @njit + def f(a): + c = np.argmax(a) + return c + + self.assertTrue(test_for_different_datatypes( + f, np.argmax, [10], 1, self.tys)) + self.assertTrue(test_for_dimensions(f, np.argmax, [10, 2], self.tys)) + self.assertTrue(test_for_dimensions( + f, np.argmax, [10, 2, 3], self.tys)) + + def test_max(self): + @njit + def f(a): + c = np.max(a) + return c + + self.assertTrue(test_for_different_datatypes( + f, np.max, [10], 1, self.tys)) + 
self.assertTrue(test_for_dimensions(f, np.max, [10, 2], self.tys)) + self.assertTrue(test_for_dimensions(f, np.max, [10, 2, 3], self.tys)) + + def test_amax(self): + @njit + def f(a): + c = np.amax(a) + return c + + self.assertTrue(test_for_different_datatypes( + f, np.amax, [10], 1, self.tys)) + self.assertTrue(test_for_dimensions(f, np.amax, [10, 2], self.tys)) + self.assertTrue(test_for_dimensions(f, np.amax, [10, 2, 3], self.tys)) + + + def test_argmin(self): + @njit + def f(a): + c = np.argmin(a) + return c + + self.assertTrue(test_for_different_datatypes( + f, np.argmin, [10], 1, self.tys)) + self.assertTrue(test_for_dimensions(f, np.argmin, [10, 2], self.tys)) + self.assertTrue(test_for_dimensions( + f, np.argmin, [10, 2, 3], self.tys)) + + def test_min(self): + @njit + def f(a): + c = np.min(a) + return c + + self.assertTrue(test_for_different_datatypes( + f, np.min, [10], 1, self.tys)) + self.assertTrue(test_for_dimensions(f, np.min, [10, 2], self.tys)) + self.assertTrue(test_for_dimensions(f, np.min, [10, 2, 3], self.tys)) + + def test_amin(self): + @njit + def f(a): + c = np.amin(a) + return c + + self.assertTrue(test_for_different_datatypes( + f, np.min, [10], 1, self.tys)) + self.assertTrue(test_for_dimensions(f, np.min, [10, 2], self.tys)) + self.assertTrue(test_for_dimensions(f, np.min, [10, 2, 3], self.tys)) + + def test_argsort(self): + @njit + def f(a): + c = np.argsort(a) + return c + + self.assertTrue(test_for_different_datatypes( + f, np.argsort, [10], 1, self.tys, np_all=True)) + + def test_median(self): + @njit + def f(a): + c = np.median(a) + return c + + self.assertTrue(test_for_different_datatypes( + f, np.median, [10], 1, self.tys)) + self.assertTrue(test_for_dimensions(f, np.median, [10, 2], self.tys)) + self.assertTrue(test_for_dimensions( + f, np.median, [10, 2, 3], self.tys)) + + def test_mean(self): + @njit + def f(a): + c = np.mean(a) + return c + + self.assertTrue(test_for_different_datatypes( + f, np.mean, [10], 1, self.tys)) + self.assertTrue(test_for_dimensions(f, np.mean, [10, 2], self.tys)) + self.assertTrue(test_for_dimensions(f, np.mean, [10, 2, 3], self.tys)) + + def test_matmul(self): + @njit + def f(a, b): + c = np.matmul(a, b) + return c + + self.assertTrue(test_for_different_datatypes(f, np.matmul, [10, 5, 5, 10], 2, [ + np.float, np.double], np_all=True, matrix=[True, True])) + + def test_dot(self): + @njit + def f(a, b): + c = np.dot(a, b) + return c + + self.assertTrue(test_for_different_datatypes( + f, np.dot, [10, 1, 10, 1], 2, [np.float, np.double])) + self.assertTrue(test_for_different_datatypes(f, np.dot, [10, 1, 10, 2], 2, [ + np.float, np.double], matrix=[False, True], np_all=True)) + self.assertTrue(test_for_different_datatypes(f, np.dot, [2, 10, 10, 1], 2, [ + np.float, np.double], matrix=[True, False], np_all=True)) + self.assertTrue(test_for_different_datatypes(f, np.dot, [10, 2, 2, 10], 2, [ + np.float, np.double], matrix=[True, True], np_all=True)) + + def test_cov(self): + @njit + def f(a): + c = np.cov(a) + return c + + self.assertTrue(test_for_different_datatypes( + f, np.cov, [10, 7], 1, self.tys, matrix=[True], np_all=True)) + + def test_dpnp_interacting_with_parfor(self): + @njit + def f(a, b): + c = np.sum(a) + e = np.add(b, a) + #d = a + 1 + return 0 + + result = f(self.a, self.b) + #np_result = np.add((self.a + np.sum(self.a)), self.b) + + #max_abs_err = result.sum() - np_result.sum() + #self.assertTrue(max_abs_err < 1e-4) + + +if __name__ == '__main__': + unittest.main() diff --git 
a/numba_dppy/tests/test_dppy_fallback.py b/numba_dppy/tests/test_dppy_fallback.py new file mode 100644 index 0000000000..dd05bbdc84 --- /dev/null +++ b/numba_dppy/tests/test_dppy_fallback.py @@ -0,0 +1,55 @@ +import numpy as np + +import numba +import unittest +from numba.tests.support import captured_stderr +import dpctl + + +@unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system') +class TestDPPYFallback(unittest.TestCase): + def test_dppy_fallback_inner_call(self): + @numba.jit + def fill_value(i): + return i + + def inner_call_fallback(): + x = 10 + a = np.empty(shape=x, dtype=np.float32) + + for i in numba.prange(x): + a[i] = fill_value(i) + + return a + + with captured_stderr() as msg, dpctl.device_context("opencl:gpu"): + dppy = numba.njit(inner_call_fallback) + dppy_result = dppy() + + ref_result = inner_call_fallback() + + np.testing.assert_array_equal(dppy_result, ref_result) + self.assertTrue( + 'Failed to lower parfor on DPPY-device' in msg.getvalue()) + + def test_dppy_fallback_reductions(self): + def reduction(a): + b = 1 + for i in numba.prange(len(a)): + b += a[i] + return b + + a = np.ones(10) + with captured_stderr() as msg, dpctl.device_context("opencl:gpu"): + dppy = numba.njit(reduction) + dppy_result = dppy(a) + + ref_result = reduction(a) + + np.testing.assert_array_equal(dppy_result, ref_result) + self.assertTrue( + 'Failed to lower parfor on DPPY-device' in msg.getvalue()) + + +if __name__ == '__main__': + unittest.main() diff --git a/numba_dppy/tests/dppl/test_dppl_func.py b/numba_dppy/tests/test_dppy_func.py similarity index 53% rename from numba_dppy/tests/dppl/test_dppl_func.py rename to numba_dppy/tests/test_dppy_func.py index 0f64046082..729030e153 100644 --- a/numba_dppy/tests/dppl/test_dppl_func.py +++ b/numba_dppy/tests/test_dppy_func.py @@ -1,60 +1,57 @@ -from __future__ import print_function, division, absolute_import - import numpy as np -import numba_dppy, numba_dppy as dppl -from numba_dppy.testing import unittest -from numba_dppy.testing import DPPLTestCase +import numba_dppy, numba_dppy as dppy +import unittest import dpctl @unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system') -class TestDPPLFunc(DPPLTestCase): +class TestDPPYFunc(unittest.TestCase): N = 257 - def test_dppl_func_device_array(self): - @dppl.func + def test_dppy_func_device_array(self): + @dppy.func def g(a): return a + 1 - @dppl.kernel + @dppy.kernel def f(a, b): - i = dppl.get_global_id(0) + i = dppy.get_global_id(0) b[i] = g(a[i]) a = np.ones(self.N) b = np.ones(self.N) with dpctl.device_context("opencl:gpu") as gpu_queue: - f[self.N, dppl.DEFAULT_LOCAL_SIZE](a, b) + f[self.N, dppy.DEFAULT_LOCAL_SIZE](a, b) self.assertTrue(np.all(b == 2)) - def test_dppl_func_ndarray(self): - @dppl.func + def test_dppy_func_ndarray(self): + @dppy.func def g(a): return a + 1 - @dppl.kernel + @dppy.kernel def f(a, b): - i = dppl.get_global_id(0) + i = dppy.get_global_id(0) b[i] = g(a[i]) - @dppl.kernel + @dppy.kernel def h(a, b): - i = dppl.get_global_id(0) + i = dppy.get_global_id(0) b[i] = g(a[i]) + 1 a = np.ones(self.N) b = np.ones(self.N) with dpctl.device_context("opencl:gpu") as gpu_queue: - f[self.N, dppl.DEFAULT_LOCAL_SIZE](a, b) + f[self.N, dppy.DEFAULT_LOCAL_SIZE](a, b) self.assertTrue(np.all(b == 2)) - h[self.N, dppl.DEFAULT_LOCAL_SIZE](a, b) + h[self.N, dppy.DEFAULT_LOCAL_SIZE](a, b) self.assertTrue(np.all(b == 3)) diff --git a/numba_dppy/tests/dppl/test_math_functions.py b/numba_dppy/tests/test_math_functions.py similarity index 63% rename from 
numba_dppy/tests/dppl/test_math_functions.py rename to numba_dppy/tests/test_math_functions.py index 977fe85fef..6336c63759 100644 --- a/numba_dppy/tests/dppl/test_math_functions.py +++ b/numba_dppy/tests/test_math_functions.py @@ -1,48 +1,43 @@ #! /usr/bin/env python -from __future__ import print_function -from timeit import default_timer as time - -import sys import numpy as np -import numba_dppy, numba_dppy as dppl +import numba_dppy, numba_dppy as dppy import dpctl -from numba_dppy.testing import unittest -from numba_dppy.testing import DPPLTestCase +import unittest import math -@dppl.kernel -def dppl_fabs(a,b): - i = dppl.get_global_id(0) +@dppy.kernel +def dppy_fabs(a,b): + i = dppy.get_global_id(0) b[i] = math.fabs(a[i]) -@dppl.kernel -def dppl_exp(a,b): - i = dppl.get_global_id(0) +@dppy.kernel +def dppy_exp(a,b): + i = dppy.get_global_id(0) b[i] = math.exp(a[i]) -@dppl.kernel -def dppl_log(a,b): - i = dppl.get_global_id(0) +@dppy.kernel +def dppy_log(a,b): + i = dppy.get_global_id(0) b[i] = math.log(a[i]) -@dppl.kernel -def dppl_sqrt(a,b): - i = dppl.get_global_id(0) +@dppy.kernel +def dppy_sqrt(a,b): + i = dppy.get_global_id(0) b[i] = math.sqrt(a[i]) -@dppl.kernel -def dppl_sin(a,b): - i = dppl.get_global_id(0) +@dppy.kernel +def dppy_sin(a,b): + i = dppy.get_global_id(0) b[i] = math.sin(a[i]) -@dppl.kernel -def dppl_cos(a,b): - i = dppl.get_global_id(0) +@dppy.kernel +def dppy_cos(a,b): + i = dppy.get_global_id(0) b[i] = math.cos(a[i]) -@dppl.kernel -def dppl_tan(a,b): - i = dppl.get_global_id(0) +@dppy.kernel +def dppy_tan(a,b): + i = dppy.get_global_id(0) b[i] = math.tan(a[i]) global_size = 10 @@ -53,7 +48,7 @@ def dppl_tan(a,b): def driver(a, jitfunc): b = np.ones_like(a) # Device buffers - jitfunc[global_size, dppl.DEFAULT_LOCAL_SIZE](a, b) + jitfunc[global_size, dppy.DEFAULT_LOCAL_SIZE](a, b) return b @@ -73,67 +68,67 @@ def test_driver(input_arr, device_ty, jitfunc): @unittest.skipUnless(dpctl.has_cpu_queues(), 'test only on CPU system') -class TestDPPLMathFunctionsCPU(DPPLTestCase): +class TestDPPYMathFunctionsCPU(unittest.TestCase): def test_fabs_cpu(self): - b_actual = test_driver(a, "CPU", dppl_fabs) + b_actual = test_driver(a, "CPU", dppy_fabs) b_expected = np.fabs(a) self.assertTrue(np.all(b_actual == b_expected)) def test_sin_cpu(self): - b_actual = test_driver(a, "CPU", dppl_sin) + b_actual = test_driver(a, "CPU", dppy_sin) b_expected = np.sin(a) self.assertTrue(np.allclose(b_actual,b_expected)) def test_cos_cpu(self): - b_actual = test_driver(a, "CPU", dppl_cos) + b_actual = test_driver(a, "CPU", dppy_cos) b_expected = np.cos(a) self.assertTrue(np.allclose(b_actual,b_expected)) def test_exp_cpu(self): - b_actual = test_driver(a, "CPU", dppl_exp) + b_actual = test_driver(a, "CPU", dppy_exp) b_expected = np.exp(a) self.assertTrue(np.allclose(b_actual,b_expected)) def test_sqrt_cpu(self): - b_actual = test_driver(a, "CPU", dppl_sqrt) + b_actual = test_driver(a, "CPU", dppy_sqrt) b_expected = np.sqrt(a) self.assertTrue(np.allclose(b_actual,b_expected)) def test_log_cpu(self): - b_actual = test_driver(a, "CPU", dppl_log) + b_actual = test_driver(a, "CPU", dppy_log) b_expected = np.log(a) self.assertTrue(np.allclose(b_actual,b_expected)) @unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system') -class TestDPPLMathFunctionsGPU(DPPLTestCase): +class TestDPPYMathFunctionsGPU(unittest.TestCase): def test_fabs_gpu(self): - b_actual = test_driver(a, "GPU", dppl_fabs) + b_actual = test_driver(a, "GPU", dppy_fabs) b_expected = np.fabs(a) 
self.assertTrue(np.all(b_actual == b_expected)) def test_sin_gpu(self): - b_actual = test_driver(a, "GPU", dppl_sin) + b_actual = test_driver(a, "GPU", dppy_sin) b_expected = np.sin(a) self.assertTrue(np.allclose(b_actual,b_expected)) def test_cos_gpu(self): - b_actual = test_driver(a, "GPU", dppl_cos) + b_actual = test_driver(a, "GPU", dppy_cos) b_expected = np.cos(a) self.assertTrue(np.allclose(b_actual,b_expected)) def test_exp_gpu(self): - b_actual = test_driver(a, "GPU", dppl_exp) + b_actual = test_driver(a, "GPU", dppy_exp) b_expected = np.exp(a) self.assertTrue(np.allclose(b_actual,b_expected)) def test_sqrt_gpu(self): - b_actual = test_driver(a, "GPU", dppl_sqrt) + b_actual = test_driver(a, "GPU", dppy_sqrt) b_expected = np.sqrt(a) self.assertTrue(np.allclose(b_actual,b_expected)) def test_log_gpu(self): - b_actual = test_driver(a, "GPU", dppl_log) + b_actual = test_driver(a, "GPU", dppy_log) b_expected = np.log(a) self.assertTrue(np.allclose(b_actual,b_expected)) diff --git a/numba_dppy/tests/dppl/test_numpy_bit_twiddling_functions.py b/numba_dppy/tests/test_numpy_bit_twiddling_functions.py similarity index 54% rename from numba_dppy/tests/dppl/test_numpy_bit_twiddling_functions.py rename to numba_dppy/tests/test_numpy_bit_twiddling_functions.py index 5e3cd9ba24..21a8fc8444 100644 --- a/numba_dppy/tests/dppl/test_numpy_bit_twiddling_functions.py +++ b/numba_dppy/tests/test_numpy_bit_twiddling_functions.py @@ -1,108 +1,112 @@ #! /usr/bin/env python -from __future__ import print_function -from timeit import default_timer as time - -import sys import numpy as np from numba import njit -import numba_dppy, numba_dppy as dppl -from numba_dppy.testing import unittest -from numba_dppy.testing import DPPLTestCase +import dpctl +import unittest -class TestNumpy_bit_twiddling_functions(DPPLTestCase): +@unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system') +class TestNumpy_bit_twiddling_functions(unittest.TestCase): def test_bitwise_and(self): - @njit(parallel={'offload':True}) + @njit def f(a, b): c = np.bitwise_and(a, b) return c - a = np.array([2,5,255]) - b = np.array([3,14,16]) + a = np.array([2, 5, 255]) + b = np.array([3, 14, 16]) + + with dpctl.device_context("opencl:gpu"): + c = f(a, b) - c = f(a, b) d = np.bitwise_and(a, b) self.assertTrue(np.all(c == d)) - def test_bitwise_or(self): - @njit(parallel={'offload':True}) + @njit def f(a, b): c = np.bitwise_or(a, b) return c - a = np.array([2,5,255]) - b = np.array([4,4,4]) + a = np.array([2, 5, 255]) + b = np.array([4, 4, 4]) + + with dpctl.device_context("opencl:gpu"): + c = f(a, b) - c = f(a, b) d = np.bitwise_or(a, b) self.assertTrue(np.all(c == d)) - def test_bitwise_xor(self): - @njit(parallel={'offload':True}) + @njit def f(a, b): c = np.bitwise_xor(a, b) return c - a = np.array([2,5,255]) - b = np.array([4,4,4]) + a = np.array([2, 5, 255]) + b = np.array([4, 4, 4]) + + with dpctl.device_context("opencl:gpu"): + c = f(a, b) - c = f(a, b) d = np.bitwise_xor(a, b) self.assertTrue(np.all(c == d)) - def test_bitwise_not(self): - @njit(parallel={'offload':True}) + @njit def f(a): c = np.bitwise_not(a) return c - a = np.array([2,5,255]) + a = np.array([2, 5, 255]) + + with dpctl.device_context("opencl:gpu"): + c = f(a) - c = f(a) d = np.bitwise_not(a) self.assertTrue(np.all(c == d)) - def test_invert(self): - @njit(parallel={'offload':True}) + @njit def f(a): c = np.invert(a) return c - a = np.array([2,5,255]) + a = np.array([2, 5, 255]) + + with dpctl.device_context("opencl:gpu"): + c = f(a) - c = f(a) d = 
np.invert(a) self.assertTrue(np.all(c == d)) - def test_left_shift(self): - @njit(parallel={'offload':True}) + @njit def f(a, b): c = np.left_shift(a, b) return c - a = np.array([2,3,4]) - b = np.array([1,2,3]) + a = np.array([2, 3, 4]) + b = np.array([1, 2, 3]) + + with dpctl.device_context("opencl:gpu"): + c = f(a, b) - c = f(a, b) d = np.left_shift(a, b) self.assertTrue(np.all(c == d)) - def test_right_shift(self): - @njit(parallel={'offload':True}) + @njit def f(a, b): c = np.right_shift(a, b) return c - a = np.array([2,3,4]) - b = np.array([1,2,3]) + a = np.array([2, 3, 4]) + b = np.array([1, 2, 3]) + + with dpctl.device_context("opencl:gpu"): + c = f(a, b) - c = f(a, b) d = np.right_shift(a, b) self.assertTrue(np.all(c == d)) diff --git a/numba_dppy/tests/dppl/test_numpy_comparison_functions.py b/numba_dppy/tests/test_numpy_comparison_functions.py similarity index 60% rename from numba_dppy/tests/dppl/test_numpy_comparison_functions.py rename to numba_dppy/tests/test_numpy_comparison_functions.py index 0bd7dcbb69..9d56e94374 100644 --- a/numba_dppy/tests/dppl/test_numpy_comparison_functions.py +++ b/numba_dppy/tests/test_numpy_comparison_functions.py @@ -1,85 +1,89 @@ #! /usr/bin/env python -from __future__ import print_function -from timeit import default_timer as time - -import sys import numpy as np from numba import njit -import numba_dppy, numba_dppy as dppl -from numba_dppy.testing import unittest -from numba_dppy.testing import DPPLTestCase +import dpctl +import unittest + + +@unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system') +class TestNumpy_comparison_functions(unittest.TestCase): + a = np.array([4, 5, 6]) + b = np.array([2, 6, 6]) -class TestNumpy_comparison_functions(DPPLTestCase): - a = np.array([4,5,6]) - b = np.array([2,6,6]) def test_greater(self): - @njit(parallel={'offload':True}) + @njit def f(a, b): c = np.greater(a, b) return c - c = f(self.a, self.b) + with dpctl.device_context("opencl:gpu"): + c = f(self.a, self.b) + d = np.greater(self.a, self.b) self.assertTrue(np.all(c == d)) - def test_greater_equal(self): - @njit(parallel={'offload':True}) + @njit def f(a, b): c = np.greater_equal(a, b) return c - c = f(self.a, self.b) + with dpctl.device_context("opencl:gpu"): + c = f(self.a, self.b) + d = np.greater_equal(self.a, self.b) self.assertTrue(np.all(c == d)) - def test_less(self): - @njit(parallel={'offload':True}) + @njit def f(a, b): c = np.less(a, b) return c - c = f(self.a, self.b) + with dpctl.device_context("opencl:gpu"): + c = f(self.a, self.b) + d = np.less(self.a, self.b) self.assertTrue(np.all(c == d)) - def test_less_equal(self): - @njit(parallel={'offload':True}) + @njit def f(a, b): c = np.less_equal(a, b) return c - c = f(self.a, self.b) + with dpctl.device_context("opencl:gpu"): + c = f(self.a, self.b) + d = np.less_equal(self.a, self.b) self.assertTrue(np.all(c == d)) - def test_not_equal(self): - @njit(parallel={'offload':True}) + @njit def f(a, b): c = np.not_equal(a, b) return c - c = f(self.a, self.b) + with dpctl.device_context("opencl:gpu"): + c = f(self.a, self.b) + d = np.not_equal(self.a, self.b) self.assertTrue(np.all(c == d)) - def test_equal(self): - @njit(parallel={'offload':True}) + @njit def f(a, b): c = np.equal(a, b) return c - c = f(self.a, self.b) + with dpctl.device_context("opencl:gpu"): + c = f(self.a, self.b) + d = np.equal(self.a, self.b) self.assertTrue(np.all(c == d)) - def test_logical_and(self): - @njit(parallel={'offload':True}) + @njit def f(a, b): c = np.logical_and(a, b) return c @@ -87,13 
+91,14 @@ def f(a, b): a = np.array([True, True, False]) b = np.array([True, False, False]) - c = f(a, b) + with dpctl.device_context("opencl:gpu"): + c = f(a, b) + d = np.logical_and(a, b) self.assertTrue(np.all(c == d)) - def test_logical_or(self): - @njit(parallel={'offload':True}) + @njit def f(a, b): c = np.logical_or(a, b) return c @@ -101,13 +106,14 @@ def f(a, b): a = np.array([True, True, False]) b = np.array([True, False, False]) - c = f(a, b) + with dpctl.device_context("opencl:gpu"): + c = f(a, b) + d = np.logical_or(a, b) self.assertTrue(np.all(c == d)) - def test_logical_xor(self): - @njit(parallel={'offload':True}) + @njit def f(a, b): c = np.logical_xor(a, b) return c @@ -115,76 +121,83 @@ def f(a, b): a = np.array([True, True, False]) b = np.array([True, False, False]) - c = f(a, b) + with dpctl.device_context("opencl:gpu"): + c = f(a, b) + d = np.logical_xor(a, b) self.assertTrue(np.all(c == d)) - def test_logical_not(self): - @njit(parallel={'offload':True}) + @njit def f(a): c = np.logical_not(a) return c a = np.array([True, True, False]) - c = f(a) + with dpctl.device_context("opencl:gpu"): + c = f(a) + d = np.logical_not(a) self.assertTrue(np.all(c == d)) - def test_maximum(self): - @njit(parallel={'offload':True}) + @njit def f(a, b): c = np.maximum(a, b) return c - a = np.array([5,6,7,np.nan], dtype=np.float32) - b = np.array([5,7,6,100], dtype=np.float32) + a = np.array([5, 6, 7, np.nan], dtype=np.float32) + b = np.array([5, 7, 6, 100], dtype=np.float32) + + with dpctl.device_context("opencl:gpu"): + c = f(a, b) - c = f(a, b) d = np.maximum(a, b) np.testing.assert_equal(c, d) - def test_minimum(self): - @njit(parallel={'offload':True}) + @njit def f(a, b): c = np.minimum(a, b) return c - a = np.array([5,6,7,np.nan], dtype=np.float32) - b = np.array([5,7,6,100], dtype=np.float32) + a = np.array([5, 6, 7, np.nan], dtype=np.float32) + b = np.array([5, 7, 6, 100], dtype=np.float32) + + with dpctl.device_context("opencl:gpu"): + c = f(a, b) - c = f(a, b) d = np.minimum(a, b) np.testing.assert_equal(c, d) - def test_fmax(self): - @njit(parallel={'offload':True}) + @njit def f(a, b): c = np.fmax(a, b) return c - a = np.array([5,6,7,np.nan], dtype=np.float32) - b = np.array([5,7,6,100], dtype=np.float32) + a = np.array([5, 6, 7, np.nan], dtype=np.float32) + b = np.array([5, 7, 6, 100], dtype=np.float32) + + with dpctl.device_context("opencl:gpu"): + c = f(a, b) - c = f(a, b) d = np.fmax(a, b) np.testing.assert_equal(c, d) - def test_fmin(self): - @njit(parallel={'offload':True}) + @njit def f(a, b): c = np.fmin(a, b) return c - a = np.array([5,6,7,np.nan], dtype=np.float32) - b = np.array([5,7,6,100], dtype=np.float32) + a = np.array([5, 6, 7, np.nan], dtype=np.float32) + b = np.array([5, 7, 6, 100], dtype=np.float32) + + with dpctl.device_context("opencl:gpu"): + c = f(a, b) - c = f(a, b) d = np.fmin(a, b) np.testing.assert_equal(c, d) diff --git a/numba_dppy/tests/dppl/test_numpy_floating_functions.py b/numba_dppy/tests/test_numpy_floating_functions.py similarity index 63% rename from numba_dppy/tests/dppl/test_numpy_floating_functions.py rename to numba_dppy/tests/test_numpy_floating_functions.py index 62b76b1ade..8df7e2b5d4 100644 --- a/numba_dppy/tests/dppl/test_numpy_floating_functions.py +++ b/numba_dppy/tests/test_numpy_floating_functions.py @@ -1,95 +1,98 @@ -from __future__ import print_function -from timeit import default_timer as time - -import sys import numpy as np from numba import njit -import numba_dppy, numba_dppy as dppl -from numba_dppy.testing import 
unittest -from numba_dppy.testing import DPPLTestCase +import dpctl +import unittest -class TestNumpy_floating_functions(DPPLTestCase): +@unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system') +class TestNumpy_floating_functions(unittest.TestCase): def test_isfinite(self): - @njit(parallel={'offload':True}) + @njit def f(a): c = np.isfinite(a) return c - test_arr = [np.log(-1.),1.,np.log(0)] + test_arr = [np.log(-1.), 1., np.log(0)] input_arr = np.asarray(test_arr, dtype=np.float32) - c = f(input_arr) + with dpctl.device_context("opencl:gpu"): + c = f(input_arr) + d = np.isfinite(input_arr) self.assertTrue(np.all(c == d)) - def test_isinf(self): - @njit(parallel={'offload':True}) + @njit def f(a): c = np.isinf(a) return c - test_arr = [np.log(-1.),1.,np.log(0)] + test_arr = [np.log(-1.), 1., np.log(0)] input_arr = np.asarray(test_arr, dtype=np.float32) - c = f(input_arr) + with dpctl.device_context("opencl:gpu"): + c = f(input_arr) + d = np.isinf(input_arr) self.assertTrue(np.all(c == d)) def test_isnan(self): - @njit(parallel={'offload':True}) + @njit def f(a): c = np.isnan(a) return c - test_arr = [np.log(-1.),1.,np.log(0)] + test_arr = [np.log(-1.), 1., np.log(0)] input_arr = np.asarray(test_arr, dtype=np.float32) - c = f(input_arr) + with dpctl.device_context("opencl:gpu"): + c = f(input_arr) + d = np.isnan(input_arr) self.assertTrue(np.all(c == d)) - def test_floor(self): - @njit(parallel={'offload':True}) + @njit def f(a): c = np.floor(a) return c input_arr = np.array([-1.7, -1.5, -0.2, 0.2, 1.5, 1.7, 2.0]) - c = f(input_arr) + with dpctl.device_context("opencl:gpu"): + c = f(input_arr) + d = np.floor(input_arr) self.assertTrue(np.all(c == d)) - def test_ceil(self): - @njit(parallel={'offload':True}) + @njit def f(a): c = np.ceil(a) return c input_arr = np.array([-1.7, -1.5, -0.2, 0.2, 1.5, 1.7, 2.0]) - c = f(input_arr) + with dpctl.device_context("opencl:gpu"): + c = f(input_arr) + d = np.ceil(input_arr) self.assertTrue(np.all(c == d)) - def test_trunc(self): - @njit(parallel={'offload':True}) + @njit def f(a): c = np.trunc(a) return c input_arr = np.array([-1.7, -1.5, -0.2, 0.2, 1.5, 1.7, 2.0]) - c = f(input_arr) + with dpctl.device_context("opencl:gpu"): + c = f(input_arr) + d = np.trunc(input_arr) self.assertTrue(np.all(c == d)) - if __name__ == '__main__': unittest.main() diff --git a/numba_dppy/tests/dppl/test_numpy_math_functions.py b/numba_dppy/tests/test_numpy_math_functions.py similarity index 65% rename from numba_dppy/tests/dppl/test_numpy_math_functions.py rename to numba_dppy/tests/test_numpy_math_functions.py index ddbb568ede..ef5dc235b8 100644 --- a/numba_dppy/tests/dppl/test_numpy_math_functions.py +++ b/numba_dppy/tests/test_numpy_math_functions.py @@ -1,83 +1,91 @@ #! /usr/bin/env python -from __future__ import print_function -from timeit import default_timer as time - -import sys import numpy as np from numba import njit -import numba_dppy, numba_dppy as dppl -from numba_dppy.testing import unittest -from numba_dppy.testing import DPPLTestCase - +import dpctl +import unittest +from . 
import skip_tests -class TestNumpy_math_functions(DPPLTestCase): +@unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system') +class TestNumpy_math_functions(unittest.TestCase): N = 10 a = np.array(np.random.random(N), dtype=np.float32) b = np.array(np.random.random(N), dtype=np.float32) def test_add(self): - @njit(parallel={'offload':True}) + @njit def f(a, b): c = np.add(a, b) return c - c = f(self.a, self.b) + with dpctl.device_context("opencl:gpu"): + c = f(self.a, self.b) + d = self.a + self.b self.assertTrue(np.all(c == d)) def test_subtract(self): - @njit(parallel={'offload':True}) + @njit def f(a, b): c = np.subtract(a, b) return c - c = f(self.a, self.b) + with dpctl.device_context("opencl:gpu"): + c = f(self.a, self.b) + d = self.a - self.b self.assertTrue(np.all(c == d)) def test_multiply(self): - @njit(parallel={'offload':True}) + @njit def f(a, b): c = np.multiply(a, b) return c - c = f(self.a, self.b) + with dpctl.device_context("opencl:gpu"): + c = f(self.a, self.b) + d = self.a * self.b self.assertTrue(np.all(c == d)) def test_divide(self): - @njit(parallel={'offload':True}) + @njit def f(a, b): c = np.divide(a, b) return c - c = f(self.a, self.b) + with dpctl.device_context("opencl:gpu"): + c = f(self.a, self.b) + d = self.a / self.b max_abs_err = c.sum() - d.sum() self.assertTrue(max_abs_err < 1e-2) def test_true_divide(self): - @njit(parallel={'offload':True}) + @njit def f(a, b): c = np.true_divide(a, b) return c - c = f(self.a, self.b) + with dpctl.device_context("opencl:gpu"): + c = f(self.a, self.b) + d = np.true_divide(self.a, self.b) max_abs_err = c.sum() - d.sum() self.assertTrue(max_abs_err < 1e-2) def test_negative(self): - @njit(parallel={'offload':True}) + @njit def f(a): c = np.negative(a) return c - c = f(self.a) + with dpctl.device_context("opencl:gpu"): + c = f(self.a) + self.assertTrue(np.all(c == -self.a)) def test_power(self): - @njit(parallel={'offload':True}) + @njit def f(a, b): c = np.power(a, b) return c @@ -85,11 +93,13 @@ def f(a, b): input_arr = np.random.randint(self.N, size=(self.N)) exp = np.full((self.N), 2, dtype=np.int) - c = f(input_arr, exp) + with dpctl.device_context("opencl:gpu"): + c = f(input_arr, exp) + self.assertTrue(np.all(c == input_arr * input_arr)) def test_remainder(self): - @njit(parallel={'offload':True}) + @njit def f(a, b): c = np.remainder(a, b) return c @@ -97,11 +107,13 @@ def f(a, b): input_arr = np.full((self.N), 3, dtype=np.int) divisor = np.full((self.N), 2, dtype=np.int) - c = f(input_arr, divisor) + with dpctl.device_context("opencl:gpu"): + c = f(input_arr, divisor) + self.assertTrue(np.all(c == 1)) def test_mod(self): - @njit(parallel={'offload':True}) + @njit def f(a, b): c = np.mod(a, b) return c @@ -109,11 +121,13 @@ def f(a, b): input_arr = np.full((self.N), 3, dtype=np.int) divisor = np.full((self.N), 2, dtype=np.int) - c = f(input_arr, divisor) + with dpctl.device_context("opencl:gpu"): + c = f(input_arr, divisor) + self.assertTrue(np.all(c == 1)) def test_fmod(self): - @njit(parallel={'offload':True}) + @njit def f(a, b): c = np.fmod(a, b) return c @@ -121,173 +135,206 @@ def f(a, b): input_arr = np.full((self.N), 3, dtype=np.float32) divisor = np.full((self.N), 2, dtype=np.int) - c = f(input_arr, divisor) + with dpctl.device_context("opencl:gpu"): + c = f(input_arr, divisor) + self.assertTrue(np.all(c == 1.)) def test_abs(self): - @njit(parallel={'offload':True}) + @njit def f(a): c = np.abs(a) return c input_arr = 5 * np.random.random_sample(self.N) - 5 - c = f(input_arr) + with 
dpctl.device_context("opencl:gpu"): + c = f(input_arr) + self.assertTrue(np.all(c == -input_arr)) def test_absolute(self): - @njit(parallel={'offload':True}) + @njit def f(a): c = np.absolute(a) return c input_arr = 5 * np.random.random_sample(self.N) - 5 - c = f(input_arr) - self.assertTrue(np.all(c == -input_arr)) + with dpctl.device_context("opencl:gpu"): + c = f(input_arr) + self.assertTrue(np.all(c == -input_arr)) def test_fabs(self): - @njit(parallel={'offload':True}) + @njit def f(a): c = np.fabs(a) return c input_arr = 5 * np.random.random_sample(self.N) - 5 - c = f(input_arr) - self.assertTrue(np.all(c == -input_arr)) + with dpctl.device_context("opencl:gpu"): + c = f(input_arr) + self.assertTrue(np.all(c == -input_arr)) + @unittest.skipIf(skip_tests.is_gen12("opencl:gpu"), "Gen12 not supported") def test_sign(self): - @njit(parallel={'offload':True}) + @njit def f(a): c = np.sign(a) return c input_arr = 5 * np.random.random_sample(self.N) - 5 - c = f(input_arr) + with dpctl.device_context("opencl:gpu"): + c = f(input_arr) + self.assertTrue(np.all(c == -1.)) def test_conj(self): - @njit(parallel={'offload':True}) + @njit def f(a): c = np.conj(a) return c input_arr = np.eye(self.N) + 1j * np.eye(self.N) - c = f(input_arr) + with dpctl.device_context("opencl:gpu"): + c = f(input_arr) + d = np.conj(input_arr) self.assertTrue(np.all(c == d)) def test_exp(self): - @njit(parallel={'offload':True}) + @njit def f(a): c = np.exp(a) return c input_arr = np.random.randint(self.N, size=(self.N)) - c = f(input_arr) - d = np.exp(input_arr) - self.assertTrue(np.all(c == d)) + with dpctl.device_context("opencl:gpu"): + c = f(input_arr) + d = np.exp(input_arr) + max_abs_err = c.sum() - d.sum() + self.assertTrue(max_abs_err < 1e-5) + + @unittest.skipIf(skip_tests.is_gen12("opencl:gpu"), "Gen12 not supported") def test_log(self): - @njit(parallel={'offload':True}) + @njit def f(a): c = np.log(a) return c input_arr = np.random.randint(1, self.N, size=(self.N)) - c = f(input_arr) + + with dpctl.device_context("opencl:gpu"): + c = f(input_arr) + d = np.log(input_arr) max_abs_err = c.sum() - d.sum() self.assertTrue(max_abs_err < 1e-5) - + @unittest.skipIf(skip_tests.is_gen12("opencl:gpu"), "Gen12 not supported") def test_log10(self): - @njit(parallel={'offload':True}) + @njit def f(a): c = np.log10(a) return c input_arr = np.random.randint(1, self.N, size=(self.N)) - c = f(input_arr) + + with dpctl.device_context("opencl:gpu"): + c = f(input_arr) + d = np.log10(input_arr) max_abs_err = c.sum() - d.sum() self.assertTrue(max_abs_err < 1e-5) - + @unittest.skipIf(skip_tests.is_gen12("opencl:gpu"), "Gen12 not supported") def test_expm1(self): - @njit(parallel={'offload':True}) + @njit def f(a): c = np.expm1(a) return c input_arr = np.random.randint(1, self.N, size=(self.N)) - c = f(input_arr) + + with dpctl.device_context("opencl:gpu"): + c = f(input_arr) + d = np.expm1(input_arr) max_abs_err = c.sum() - d.sum() self.assertTrue(max_abs_err < 1e-5) - def test_log1p(self): - @njit(parallel={'offload':True}) + @njit def f(a): c = np.log1p(a) return c input_arr = np.random.randint(1, self.N, size=(self.N)) - c = f(input_arr) + + with dpctl.device_context("opencl:gpu"): + c = f(input_arr) + d = np.log1p(input_arr) max_abs_err = c.sum() - d.sum() self.assertTrue(max_abs_err < 1e-5) def test_sqrt(self): - @njit(parallel={'offload':True}) + @njit def f(a): c = np.sqrt(a) return c - c = f(self.a) + with dpctl.device_context("opencl:gpu"): + c = f(self.a) + d = np.sqrt(self.a) max_abs_err = c.sum() - d.sum() 
self.assertTrue(max_abs_err < 1e-5) - def test_square(self): - @njit(parallel={'offload':True}) + @njit def f(a): c = np.square(a) return c input_arr = np.random.randint(self.N, size=(self.N)) - c = f(input_arr) + with dpctl.device_context("opencl:gpu"): + c = f(input_arr) + self.assertTrue(np.all(c == input_arr * input_arr)) def test_reciprocal(self): - @njit(parallel={'offload':True}) + @njit def f(a): c = np.reciprocal(a) return c - input_arr = 5 * np.random.random_sample(self.N) + 5 + input_arr = 5 * np.random.random_sample(self.N) + 5 + + with dpctl.device_context("opencl:gpu"): + c = f(input_arr) - c = f(input_arr) self.assertTrue(np.all(c == 1/input_arr)) def test_conjugate(self): - @njit(parallel={'offload':True}) + @njit def f(a): c = np.conjugate(a) return c input_arr = np.eye(self.N) + 1j * np.eye(self.N) - c = f(input_arr) + with dpctl.device_context("opencl:gpu"): + c = f(input_arr) + d = np.conj(input_arr) self.assertTrue(np.all(c == d)) diff --git a/numba_dppy/tests/dppl/test_numpy_trigonomteric_functions.py b/numba_dppy/tests/test_numpy_trigonomteric_functions.py similarity index 68% rename from numba_dppy/tests/dppl/test_numpy_trigonomteric_functions.py rename to numba_dppy/tests/test_numpy_trigonomteric_functions.py index 8f61f941c9..812f3d060c 100644 --- a/numba_dppy/tests/dppl/test_numpy_trigonomteric_functions.py +++ b/numba_dppy/tests/test_numpy_trigonomteric_functions.py @@ -1,218 +1,238 @@ #! /usr/bin/env python -from __future__ import print_function -from timeit import default_timer as time - -import sys import numpy as np from numba import njit -import numba_dppy, numba_dppy as dppl -from numba_dppy.testing import unittest -from numba_dppy.testing import DPPLTestCase +import dpctl +import unittest +from . import skip_tests + +@unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system') +class TestNumpy_math_functions(unittest.TestCase): -class TestNumpy_math_functions(DPPLTestCase): N = 10 a = np.array(np.random.random(N), dtype=np.float32) b = np.array(np.random.random(N), dtype=np.float32) def test_sin(self): - @njit(parallel={'offload':True}) + @njit def f(a): c = np.sin(a) return c - c = f(self.a) + with dpctl.device_context("opencl:gpu"): + c = f(self.a) + d = np.sin(self.a) max_abs_err = c.sum() - d.sum() self.assertTrue(max_abs_err < 1e-5) - def test_cos(self): - @njit(parallel={'offload':True}) + @njit def f(a): c = np.cos(a) return c - c = f(self.a) + with dpctl.device_context("opencl:gpu"): + c = f(self.a) + d = np.cos(self.a) max_abs_err = c.sum() - d.sum() self.assertTrue(max_abs_err < 1e-5) - def test_tan(self): - @njit(parallel={'offload':True}) + @njit def f(a): c = np.tan(a) return c - c = f(self.a) + with dpctl.device_context("opencl:gpu"): + c = f(self.a) + d = np.tan(self.a) max_abs_err = c.sum() - d.sum() self.assertTrue(max_abs_err < 1e-5) - def test_arcsin(self): - @njit(parallel={'offload':True}) + @njit def f(a): c = np.arcsin(a) return c - c = f(self.a) + with dpctl.device_context("opencl:gpu"): + c = f(self.a) + d = np.arcsin(self.a) max_abs_err = c.sum() - d.sum() self.assertTrue(max_abs_err < 1e-5) - def test_arccos(self): - @njit(parallel={'offload':True}) + @njit def f(a): c = np.arccos(a) return c - c = f(self.a) + with dpctl.device_context("opencl:gpu"): + c = f(self.a) + d = np.arccos(self.a) max_abs_err = c.sum() - d.sum() self.assertTrue(max_abs_err < 1e-5) - def test_arctan(self): - @njit(parallel={'offload':True}) + @njit def f(a): c = np.arctan(a) return c - c = f(self.a) + with dpctl.device_context("opencl:gpu"): + 
c = f(self.a) + d = np.arctan(self.a) max_abs_err = c.sum() - d.sum() self.assertTrue(max_abs_err < 1e-5) - def test_arctan2(self): - @njit(parallel={'offload':True}) + @njit def f(a, b): c = np.arctan2(a, b) return c - c = f(self.a, self.b) + with dpctl.device_context("opencl:gpu"): + c = f(self.a, self.b) + d = np.arctan2(self.a, self.b) max_abs_err = c.sum() - d.sum() self.assertTrue(max_abs_err < 1e-5) - def test_sinh(self): - @njit(parallel={'offload':True}) + @njit def f(a): c = np.sinh(a) return c - c = f(self.a) + with dpctl.device_context("opencl:gpu"): + c = f(self.a) + d = np.sinh(self.a) max_abs_err = c.sum() - d.sum() self.assertTrue(max_abs_err < 1e-5) - def test_cosh(self): - @njit(parallel={'offload':True}) + @njit def f(a): c = np.cosh(a) return c - c = f(self.a) + with dpctl.device_context("opencl:gpu"): + c = f(self.a) + d = np.cosh(self.a) max_abs_err = c.sum() - d.sum() self.assertTrue(max_abs_err < 1e-5) - def test_tanh(self): - @njit(parallel={'offload':True}) + @njit def f(a): c = np.tanh(a) return c - c = f(self.a) + with dpctl.device_context("opencl:gpu"): + c = f(self.a) + d = np.tanh(self.a) max_abs_err = c.sum() - d.sum() self.assertTrue(max_abs_err < 1e-5) - def test_arcsinh(self): - @njit(parallel={'offload':True}) + @njit def f(a): c = np.arcsinh(a) return c - c = f(self.a) + with dpctl.device_context("opencl:gpu"): + c = f(self.a) + d = np.arcsinh(self.a) max_abs_err = c.sum() - d.sum() self.assertTrue(max_abs_err < 1e-5) - + @unittest.skipIf(skip_tests.is_gen12("opencl:gpu"), "Gen12 not supported") def test_arccosh(self): - @njit(parallel={'offload':True}) + @njit def f(a): c = np.arccosh(a) return c input_arr = np.random.randint(1, self.N, size=(self.N)) - c = f(input_arr) + + with dpctl.device_context("opencl:gpu"): + c = f(input_arr) + d = np.arccosh(input_arr) max_abs_err = c.sum() - d.sum() self.assertTrue(max_abs_err < 1e-5) - def test_arctanh(self): - @njit(parallel={'offload':True}) + @njit def f(a): c = np.arctanh(a) return c - c = f(self.a) + with dpctl.device_context("opencl:gpu"): + c = f(self.a) + d = np.arctanh(self.a) max_abs_err = c.sum() - d.sum() self.assertTrue(max_abs_err < 1e-5) - def test_deg2rad(self): - @njit(parallel={'offload':True}) + @njit def f(a): c = np.deg2rad(a) return c - c = f(self.a) + with dpctl.device_context("opencl:gpu"): + c = f(self.a) + d = np.deg2rad(self.a) max_abs_err = c.sum() - d.sum() self.assertTrue(max_abs_err < 1e-5) - def test_rad2deg(self): - @njit(parallel={'offload':True}) + @njit def f(a): c = np.rad2deg(a) return c - c = f(self.a) + with dpctl.device_context("opencl:gpu"): + c = f(self.a) + d = np.rad2deg(self.a) max_abs_err = c.sum() - d.sum() self.assertTrue(max_abs_err < 1e-2) def test_degrees(self): - @njit(parallel={'offload':True}) + @njit def f(a): c = np.degrees(a) return c - c = f(self.a) + with dpctl.device_context("opencl:gpu"): + c = f(self.a) + d = np.degrees(self.a) max_abs_err = c.sum() - d.sum() self.assertTrue(max_abs_err < 1e-2) def test_radians(self): - @njit(parallel={'offload':True}) + @njit def f(a): c = np.radians(a) return c - c = f(self.a) + with dpctl.device_context("opencl:gpu"): + c = f(self.a) + d = np.radians(self.a) max_abs_err = c.sum() - d.sum() self.assertTrue(max_abs_err < 1e-5) diff --git a/numba_dppy/tests/test_offload_diagnostics.py b/numba_dppy/tests/test_offload_diagnostics.py new file mode 100644 index 0000000000..6b41252fc6 --- /dev/null +++ b/numba_dppy/tests/test_offload_diagnostics.py @@ -0,0 +1,60 @@ +import numpy as np +import numba +from numba import 
njit, prange +import numba_dppy, numba_dppy as dppy +from numba_dppy import config as dppy_config +from numba_dppy.testing import unittest +from numba.tests.support import captured_stdout +import dpctl + + +@unittest.skipUnless(dpctl.has_gpu_queues(), "test only on GPU system") +class TestOffloadDiagnostics(unittest.TestCase): + def test_parfor(self): + def prange_func(): + n = 10 + a = np.ones((n), dtype=np.float64) + b = np.ones((n), dtype=np.float64) + c = np.ones((n), dtype=np.float64) + for i in prange(n//2): + a[i] = b[i] + c[i] + + return a + + with dpctl.device_context("opencl:gpu"): + dppy_config.OFFLOAD_DIAGNOSTICS = 1 + jitted = njit(parallel=True)(prange_func) + + with captured_stdout() as got: + jitted() + + dppy_config.OFFLOAD_DIAGNOSTICS = 0 + self.assertTrue("Auto-offloading" in got.getvalue()) + self.assertTrue("Device -" in got.getvalue()) + + def test_kernel(self): + @dppy.kernel + def parallel_sum(a, b, c): + i = dppy.get_global_id(0) + c[i] = a[i] + b[i] + + global_size = 10 + N = global_size + + a = np.array(np.random.random(N), dtype=np.float32) + b = np.array(np.random.random(N), dtype=np.float32) + c = np.ones_like(a) + + with dpctl.device_context("opencl:gpu"): + dppy_config.OFFLOAD_DIAGNOSTICS = 1 + + with captured_stdout() as got: + parallel_sum[global_size, dppy.DEFAULT_LOCAL_SIZE](a, b, c) + + dppy_config.OFFLOAD_DIAGNOSTICS = 0 + self.assertTrue("Auto-offloading" in got.getvalue()) + self.assertTrue("Device -" in got.getvalue()) + + +if __name__ == '__main__': + unittest.main() diff --git a/numba_dppy/tests/test_parfor_lower_message.py b/numba_dppy/tests/test_parfor_lower_message.py new file mode 100644 index 0000000000..17f1456bb6 --- /dev/null +++ b/numba_dppy/tests/test_parfor_lower_message.py @@ -0,0 +1,37 @@ +import numpy as np +import numba +from numba import njit, prange +import numba_dppy +import numba_dppy as dppy +import unittest +from numba.tests.support import captured_stdout +import dpctl + + +def prange_example(): + n = 10 + a = np.ones((n), dtype=np.float64) + b = np.ones((n), dtype=np.float64) + c = np.ones((n), dtype=np.float64) + for i in prange(n//2): + a[i] = b[i] + c[i] + + return a + + +@unittest.skipUnless(dpctl.has_gpu_queues(), "test only on GPU system") +class TestParforMessage(unittest.TestCase): + def test_parfor_message(self): + with dpctl.device_context("opencl:gpu") as gpu_queue: + numba_dppy.compiler.DEBUG = 1 + jitted = njit(prange_example) + + with captured_stdout() as got: + jitted() + + numba_dppy.compiler.DEBUG = 0 + self.assertTrue("Parfor lowered on DPPY-device" in got.getvalue()) + + +if __name__ == '__main__': + unittest.main() diff --git a/numba_dppy/tests/dppl/test_prange.py b/numba_dppy/tests/test_prange.py similarity index 70% rename from numba_dppy/tests/dppl/test_prange.py rename to numba_dppy/tests/test_prange.py index f1ceb3b2ce..eda9ccebbc 100644 --- a/numba_dppy/tests/dppl/test_prange.py +++ b/numba_dppy/tests/test_prange.py @@ -1,20 +1,19 @@ #! 
/usr/bin/env python -from __future__ import print_function -from timeit import default_timer as time - import sys import numpy as np import numba +import dpctl from numba import njit, prange -import numba_dppy, numba_dppy as dppl -from numba_dppy.testing import unittest -from numba_dppy.testing import DPPLTestCase +import numba_dppy +import unittest +from numba_dppy.testing import expectedFailureIf from numba.tests.support import captured_stdout -class TestPrange(DPPLTestCase): +@unittest.skipUnless(dpctl.has_gpu_queues(), "test only on GPU system") +class TestPrange(unittest.TestCase): def test_one_prange(self): - @njit(parallel={'offload':True}) + @njit def f(a, b): for i in prange(4): b[i, 0] = a[i, 0] * 10 @@ -24,14 +23,14 @@ def f(a, b): a = np.ones((m, n)) b = np.ones((m, n)) - f(a, b) + with dpctl.device_context("opencl:gpu"): + f(a, b) for i in range(4): self.assertTrue(b[i, 0] == a[i, 0] * 10) - def test_nested_prange(self): - @njit(parallel={'offload':True}) + @njit def f(a, b): # dimensions must be provided as scalar m, n = a.shape @@ -44,12 +43,13 @@ def f(a, b): a = np.ones((m, n)) b = np.ones((m, n)) - f(a, b) - self.assertTrue(np.all(b == 10)) + with dpctl.device_context("opencl:gpu"): + f(a, b) + self.assertTrue(np.all(b == 10)) def test_multiple_prange(self): - @njit(parallel={'offload':True}) + @njit def f(a, b): # dimensions must be provided as scalar m, n = a.shape @@ -58,7 +58,6 @@ def f(a, b): for j in prange(n): b[i, j] = a[i, j] * val - for i in prange(m): for j in prange(n): a[i, j] = a[i, j] * 10 @@ -68,13 +67,14 @@ def f(a, b): a = np.ones((m, n)) b = np.ones((m, n)) - f(a, b) + with dpctl.device_context("opencl:gpu"): + f(a, b) + self.assertTrue(np.all(b == 10)) self.assertTrue(np.all(a == 10)) - def test_three_prange(self): - @njit(parallel={'offload':True}) + @njit def f(a, b): # dimensions must be provided as scalar m, n, o = a.shape @@ -91,10 +91,12 @@ def f(a, b): a = np.ones((m, n, o)) b = np.ones((m, n, o)) - f(a, b) - self.assertTrue(np.all(b == 12)) + with dpctl.device_context("opencl:gpu"): + f(a, b) + self.assertTrue(np.all(b == 12)) + @unittest.skip('numba-dppy issue 110') def test_two_consequent_prange(self): def prange_example(): n = 10 @@ -109,19 +111,21 @@ def prange_example(): old_debug = numba_dppy.compiler.DEBUG numba_dppy.compiler.DEBUG = 1 - jitted = njit(parallel={'offload':True})(prange_example) - with captured_stdout() as got: + jitted = njit(prange_example) + + with captured_stdout() as stdout, dpctl.device_context("opencl:gpu"): jitted_res = jitted() res = prange_example() numba_dppy.compiler.DEBUG = old_debug - self.assertEqual(got.getvalue().count('Parfor lowered on DPPL-device'), 2) - self.assertEqual(got.getvalue().count('Failed to lower parfor on DPPL-device'), 0) + self.assertEqual(stdout.getvalue().count( + 'Parfor lowered on DPPY-device'), 2, stdout.getvalue()) + self.assertEqual(stdout.getvalue().count( + 'Failed to lower parfor on DPPY-device'), 0, stdout.getvalue()) np.testing.assert_equal(res, jitted_res) - @unittest.skip('NRT required but not enabled') def test_2d_arrays(self): def prange_example(): @@ -137,16 +141,19 @@ def prange_example(): old_debug = numba_dppy.compiler.DEBUG numba_dppy.compiler.DEBUG = 1 - jitted = njit(parallel={'offload':True})(prange_example) - with captured_stdout() as got: + jitted = njit(prange_example) + + with captured_stdout() as stdout, dpctl.device_context("opencl:gpu"): jitted_res = jitted() res = prange_example() numba_dppy.compiler.DEBUG = old_debug - 
self.assertEqual(got.getvalue().count('Parfor lowered on DPPL-device'), 2) - self.assertEqual(got.getvalue().count('Failed to lower parfor on DPPL-device'), 0) + self.assertEqual(stdout.getvalue().count( + 'Parfor lowered on DPPY-device'), 2, stdout.getvalue()) + self.assertEqual(stdout.getvalue().count( + 'Failed to lower parfor on DPPY-device'), 0, stdout.getvalue()) np.testing.assert_equal(res, jitted_res) diff --git a/numba_dppy/tests/dppl/test_print.py b/numba_dppy/tests/test_print.py similarity index 55% rename from numba_dppy/tests/dppl/test_print.py rename to numba_dppy/tests/test_print.py index ca1e47978a..af19658048 100644 --- a/numba_dppy/tests/dppl/test_print.py +++ b/numba_dppy/tests/test_print.py @@ -1,28 +1,23 @@ #! /usr/bin/env python -from __future__ import print_function -from timeit import default_timer as time - -import sys import numpy as np from numba import njit, prange -import numba_dppy, numba_dppy as dppl -from numba_dppy.testing import unittest -from numba_dppy.testing import DPPLTestCase +import numba_dppy, numba_dppy as dppy +import unittest import dpctl @unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system') -class TestPrint(DPPLTestCase): - def test_print_dppl_kernel(self): - @dppl.func +class TestPrint(unittest.TestCase): + def test_print_dppy_kernel(self): + @dppy.func def g(a): print("value of a:", a) return a + 1 - @dppl.kernel + @dppy.kernel def f(a, b): - i = dppl.get_global_id(0) + i = dppy.get_global_id(0) b[i] = g(a[i]) print("value of b at:", i, "is", b[i]) @@ -32,7 +27,7 @@ def f(a, b): b = np.ones(N) with dpctl.device_context("opencl:gpu") as gpu_queue: - f[N, dppl.DEFAULT_LOCAL_SIZE](a, b) + f[N, dppy.DEFAULT_LOCAL_SIZE](a, b) if __name__ == '__main__': diff --git a/numba_dppy/tests/test_rename_numpy_function_pass.py b/numba_dppy/tests/test_rename_numpy_function_pass.py new file mode 100644 index 0000000000..cfeff09b8d --- /dev/null +++ b/numba_dppy/tests/test_rename_numpy_function_pass.py @@ -0,0 +1,110 @@ +#! 
/usr/bin/env python +import unittest +import numpy as np +import numba +from numba import njit, typeof +import numba_dppy, numba_dppy as dppy +from numba_dppy.testing import ensure_dpnp + + +from numba.core import (compiler, typing, cpu) +from numba_dppy.rename_numpy_functions_pass import (DPPYRewriteOverloadedNumPyFunctions, + DPPYRewriteNdarrayFunctions) +from numba.core.typed_passes import (NopythonTypeInference, AnnotateTypes) + + +class MyPipeline(object): + def __init__(self, test_ir, args): + self.state = compiler.StateDict() + self.state.typingctx = typing.Context() + self.state.targetctx = cpu.CPUContext(self.state.typingctx) + self.state.func_ir = test_ir + self.state.func_id = test_ir.func_id + self.state.args = args + self.state.return_type = None + self.state.locals = dict() + self.state.status = None + self.state.lifted = dict() + self.state.lifted_from = None + + self.state.typingctx.refresh() + self.state.targetctx.refresh() + + +def check_equivalent(expected_ir, got_ir): + expected_block_body = expected_ir.blocks[0].body + got_block_body = got_ir.blocks[0].body + + if len(expected_block_body) != len(got_block_body): + return False + + for i in range(len(expected_block_body)): + expected_stmt = expected_block_body[i] + got_stmt = got_block_body[i] + if type(expected_stmt) != type(got_stmt): + return False + else: + if isinstance(expected_stmt, numba.core.ir.Assign): + if isinstance(expected_stmt.value, numba.core.ir.Global): + if (expected_stmt.value.name != got_stmt.value.name and + expected_stmt.value.name != "numba_dppy"): + return False + elif isinstance(expected_stmt.value, numba.core.ir.Expr): + # should get "dpnp" and "sum" as attr + if expected_stmt.value.op == "getattr": + if expected_stmt.value.attr != got_stmt.value.attr: + return False + return True + + +class TestRenameNumpyFunctionsPass(unittest.TestCase): + def test_rename_numpy(self): + def expected(a): + return numba_dppy.dpnp.sum(a) + + def got(a): + return np.sum(a) + + expected_ir = compiler.run_frontend(expected) + got_ir = compiler.run_frontend(got) + + pipeline = MyPipeline(got_ir, None) + + rewrite_numpy_functions_pass = DPPYRewriteOverloadedNumPyFunctions() + rewrite_numpy_functions_pass.run_pass(pipeline.state) + + self.assertTrue(check_equivalent(expected_ir, pipeline.state.func_ir)) + + +@unittest.skipUnless(ensure_dpnp(), 'test only when dpNP is available') +class TestRenameNdarrayFunctionsPass(unittest.TestCase): + def test_rename_ndarray(self): + def expected(a): + return numba_dppy.dpnp.sum(a) + + def got(a): + return a.sum() + + expected_ir = compiler.run_frontend(expected) + got_ir = compiler.run_frontend(got) + + a = np.arange(10) + args = [a] + argtypes = [typeof(x) for x in args] + + pipeline = MyPipeline(got_ir, argtypes) + + tyinfer_pass = NopythonTypeInference() + tyinfer_pass.run_pass(pipeline.state) + + annotate_ty_pass = AnnotateTypes() + annotate_ty_pass.run_pass(pipeline.state) + + rewrite_ndarray_functions_pass = DPPYRewriteNdarrayFunctions() + rewrite_ndarray_functions_pass.run_pass(pipeline.state) + + self.assertTrue(check_equivalent(expected_ir, pipeline.state.func_ir)) + + +if __name__ == "__main__": + unittest.main() diff --git a/numba_dppy/tests/dppl/test_sum_reduction.py b/numba_dppy/tests/test_sum_reduction.py similarity index 75% rename from numba_dppy/tests/dppl/test_sum_reduction.py rename to numba_dppy/tests/test_sum_reduction.py index 3095497a66..37ca38a12a 100644 --- a/numba_dppy/tests/dppl/test_sum_reduction.py +++ b/numba_dppy/tests/test_sum_reduction.py @@ -1,17 
+1,12 @@ -from __future__ import print_function, division, absolute_import - import numpy as np import math -import time - -import numba_dppy, numba_dppy as dppl -from numba_dppy.testing import unittest -from numba_dppy.testing import DPPLTestCase +import numba_dppy, numba_dppy as dppy +import unittest import dpctl -@dppl.kernel +@dppy.kernel def reduction_kernel(A, R, stride): - i = dppl.get_global_id(0) + i = dppy.get_global_id(0) # sum two elements R[i] = A[i] + A[i+stride] # store the sum to be used in next iteration @@ -19,7 +14,7 @@ def reduction_kernel(A, R, stride): @unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system') -class TestDPPLSumReduction(DPPLTestCase): +class TestDPPYSumReduction(unittest.TestCase): def test_sum_reduction(self): # This test will only work for the even case N = 1024 @@ -36,7 +31,7 @@ def test_sum_reduction(self): while (total > 1): # call kernel global_size = total // 2 - reduction_kernel[global_size, dppl.DEFAULT_LOCAL_SIZE](A, R, global_size) + reduction_kernel[global_size, dppy.DEFAULT_LOCAL_SIZE](A, R, global_size) total = total // 2 result = A_copy.sum() diff --git a/numba_dppy/tests/dppl/test_vectorize.py b/numba_dppy/tests/test_vectorize.py similarity index 64% rename from numba_dppy/tests/dppl/test_vectorize.py rename to numba_dppy/tests/test_vectorize.py index 12dc7b5ed3..5b3a41629c 100644 --- a/numba_dppy/tests/dppl/test_vectorize.py +++ b/numba_dppy/tests/test_vectorize.py @@ -1,25 +1,21 @@ #! /usr/bin/env python -from __future__ import print_function -from timeit import default_timer as time - -import sys import numpy as np from numba import njit, vectorize -import numba_dppy, numba_dppy as dppl -from numba_dppy.testing import unittest -from numba_dppy.testing import DPPLTestCase +import dpctl +import unittest -class TestVectorize(DPPLTestCase): +@unittest.skipUnless(dpctl.has_gpu_queues(), "test only on GPU system") +class TestVectorize(unittest.TestCase): def test_vectorize(self): @vectorize(nopython=True) def axy(a, x, y): return a * x + y - @njit(parallel={'offload':True}) + @njit def f(a0, a1): - return np.cos(axy(a0, np.sin(a1) - 1., 1.)
) + return np.cos(axy(a0, np.sin(a1) - 1., 1.)) def f_np(a0, a1): sin_res = np.sin(a1) @@ -28,11 +24,12 @@ def f_np(a0, a1): res.append(axy(a0[i], sin_res[i] - 1., 1.)) return np.cos(np.array(res)) - A = np.random.random(10) B = np.random.random(10) - expected = f(A, B) + with dpctl.device_context("opencl:gpu"): + expected = f(A, B) + actual = f_np(A, B) max_abs_err = expected.sum() - actual.sum() diff --git a/numba_dppy/tests/dppl/test_with_context.py b/numba_dppy/tests/test_with_context.py similarity index 88% rename from numba_dppy/tests/dppl/test_with_context.py rename to numba_dppy/tests/test_with_context.py index 4e34c939cb..693c155ab2 100644 --- a/numba_dppy/tests/dppl/test_with_context.py +++ b/numba_dppy/tests/test_with_context.py @@ -1,17 +1,16 @@ -import numba import numpy as np from numba import njit -import numba_dppy, numba_dppy as dppl +import numba_dppy +import unittest from numba.core import errors from numba.tests.support import captured_stdout -from numba_dppy.testing import DPPLTestCase, unittest import dpctl -class TestWithDPPLContext(DPPLTestCase): +class TestWithDPPYContext(unittest.TestCase): @unittest.skipIf(not dpctl.has_gpu_queues(), "No GPU platforms available") - def test_with_dppl_context_gpu(self): + def test_with_dppy_context_gpu(self): @njit def nested_func(a, b): @@ -34,10 +33,10 @@ def func(b): func(expected) np.testing.assert_array_equal(expected, got_gpu) - self.assertTrue('Parfor lowered on DPPL-device' in got_gpu_message.getvalue()) + self.assertTrue('Parfor lowered on DPPY-device' in got_gpu_message.getvalue()) @unittest.skipIf(not dpctl.has_cpu_queues(), "No CPU platforms available") - def test_with_dppl_context_cpu(self): + def test_with_dppy_context_cpu(self): @njit def nested_func(a, b): @@ -60,11 +59,11 @@ def func(b): func(expected) np.testing.assert_array_equal(expected, got_cpu) - self.assertTrue('Parfor lowered on DPPL-device' in got_cpu_message.getvalue()) + self.assertTrue('Parfor lowered on DPPY-device' in got_cpu_message.getvalue()) @unittest.skipIf(not dpctl.has_gpu_queues(), "No GPU platforms available") - def test_with_dppl_context_target(self): + def test_with_dppy_context_target(self): @njit(target='cpu') def nested_func_target(a, b): diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000000..a853949500 --- /dev/null +++ b/setup.cfg @@ -0,0 +1,13 @@ + +# See the docstring in versioneer.py for instructions. Note that you must +# re-run 'versioneer.py setup' after changing this section, and commit the +# resulting files. 
+ +[versioneer] +VCS = git +style = pep440 +versionfile_source = numba_dppy/_version.py +versionfile_build = numba_dppy/_version.py +tag_prefix = +#parentdir_prefix = + diff --git a/setup.py b/setup.py index 2a7efb7205..37ad0bfc68 100644 --- a/setup.py +++ b/setup.py @@ -2,6 +2,8 @@ from setuptools import Extension, find_packages, setup from Cython.Build import cythonize +import versioneer + def get_ext_modules(): ext_modules = [] @@ -41,7 +43,7 @@ def get_ext_modules(): metadata = dict( name="numba-dppy", - version="0.0.1", + version=versioneer.get_version(), description="Numba extension for Intel CPU and GPU backend", url="https://github.com/IntelPython/numba-dppy", packages=packages, @@ -61,6 +63,7 @@ def get_ext_modules(): "Programming Language :: Python :: Implementation :: CPython", "Topic :: Software Development :: Compilers", ], + cmdclass=versioneer.get_cmdclass(), ) setup(**metadata) diff --git a/versioneer.py b/versioneer.py new file mode 100644 index 0000000000..1040c21892 --- /dev/null +++ b/versioneer.py @@ -0,0 +1,1855 @@ + +# Version: 0.19 + +"""The Versioneer - like a rocketeer, but for versions. + +The Versioneer +============== + +* like a rocketeer, but for versions! +* https://github.com/python-versioneer/python-versioneer +* Brian Warner +* License: Public Domain +* Compatible with: Python 3.6, 3.7, 3.8, 3.9 and pypy3 +* [![Latest Version][pypi-image]][pypi-url] +* [![Build Status][travis-image]][travis-url] + +This is a tool for managing a recorded version number in distutils-based +python projects. The goal is to remove the tedious and error-prone "update +the embedded version string" step from your release process. Making a new +release should be as easy as recording a new tag in your version-control +system, and maybe making new tarballs. + + +## Quick Install + +* `pip install versioneer` to somewhere in your $PATH +* add a `[versioneer]` section to your setup.cfg (see [Install](INSTALL.md)) +* run `versioneer install` in your source tree, commit the results +* Verify version information with `python setup.py version` + +## Version Identifiers + +Source trees come from a variety of places: + +* a version-control system checkout (mostly used by developers) +* a nightly tarball, produced by build automation +* a snapshot tarball, produced by a web-based VCS browser, like github's + "tarball from tag" feature +* a release tarball, produced by "setup.py sdist", distributed through PyPI + +Within each source tree, the version identifier (either a string or a number, +this tool is format-agnostic) can come from a variety of places: + +* ask the VCS tool itself, e.g. "git describe" (for checkouts), which knows + about recent "tags" and an absolute revision-id +* the name of the directory into which the tarball was unpacked +* an expanded VCS keyword ($Id$, etc) +* a `_version.py` created by some earlier build step + +For released software, the version identifier is closely related to a VCS +tag. Some projects use tag names that include more than just the version +string (e.g. "myproject-1.2" instead of just "1.2"), in which case the tool +needs to strip the tag prefix to extract the version identifier. For +unreleased software (between tags), the version identifier should provide +enough information to help developers recreate the same tree, while also +giving them an idea of roughly how old the tree is (after version 1.2, before +version 1.3). 
Many VCS systems can report a description that captures this, +for example `git describe --tags --dirty --always` reports things like +"0.7-1-g574ab98-dirty" to indicate that the checkout is one revision past the +0.7 tag, has a unique revision id of "574ab98", and is "dirty" (it has +uncommitted changes). + +The version identifier is used for multiple purposes: + +* to allow the module to self-identify its version: `myproject.__version__` +* to choose a name and prefix for a 'setup.py sdist' tarball + +## Theory of Operation + +Versioneer works by adding a special `_version.py` file into your source +tree, where your `__init__.py` can import it. This `_version.py` knows how to +dynamically ask the VCS tool for version information at import time. + +`_version.py` also contains `$Revision$` markers, and the installation +process marks `_version.py` to have this marker rewritten with a tag name +during the `git archive` command. As a result, generated tarballs will +contain enough information to get the proper version. + +To allow `setup.py` to compute a version too, a `versioneer.py` is added to +the top level of your source tree, next to `setup.py` and the `setup.cfg` +that configures it. This overrides several distutils/setuptools commands to +compute the version when invoked, and changes `setup.py build` and `setup.py +sdist` to replace `_version.py` with a small static file that contains just +the generated version data. + +## Installation + +See [INSTALL.md](./INSTALL.md) for detailed installation instructions. + +## Version-String Flavors + +Code which uses Versioneer can learn about its version string at runtime by +importing `_version` from your main `__init__.py` file and running the +`get_versions()` function. From the "outside" (e.g. in `setup.py`), you can +import the top-level `versioneer.py` and run `get_versions()`. + +Both functions return a dictionary with different flavors of version +information: + +* `['version']`: A condensed version string, rendered using the selected + style. This is the most commonly used value for the project's version + string. The default "pep440" style yields strings like `0.11`, + `0.11+2.g1076c97`, or `0.11+2.g1076c97.dirty`. See the "Styles" section + below for alternative styles. + +* `['full-revisionid']`: detailed revision identifier. For Git, this is the + full SHA1 commit id, e.g. "1076c978a8d3cfc70f408fe5974aa6c092c949ac". + +* `['date']`: Date and time of the latest `HEAD` commit. For Git, it is the + commit date in ISO 8601 format. This will be None if the date is not + available. + +* `['dirty']`: a boolean, True if the tree has uncommitted changes. Note that + this is only accurate if run in a VCS checkout, otherwise it is likely to + be False or None + +* `['error']`: if the version string could not be computed, this will be set + to a string describing the problem, otherwise it will be None. It may be + useful to throw an exception in setup.py if this is set, to avoid e.g. + creating tarballs with a version string of "unknown". + +Some variants are more useful than others. Including `full-revisionid` in a +bug report should allow developers to reconstruct the exact code being tested +(or indicate the presence of local changes that should be shared with the +developers). `version` is suitable for display in an "about" box or a CLI +`--version` output: it can be easily compared against release notes and lists +of bugs fixed in various releases. 
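+
+As a quick illustration (hypothetical values; `get_versions()` and its keys
+are exactly the ones documented above), a `setup.py` could consume these
+flavors like so:
+
+    import versioneer
+
+    info = versioneer.get_versions()
+    print(info["version"])           # e.g. "0.11+2.g1076c97.dirty"
+    print(info["full-revisionid"])   # full SHA1 commit id, or None
+    if info["error"] is not None:
+        # fail loudly rather than release a tarball versioned "unknown"
+        raise SystemExit("version lookup failed: " + info["error"])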
+ +The installer adds the following text to your `__init__.py` to place a basic +version in `YOURPROJECT.__version__`: + + from ._version import get_versions + __version__ = get_versions()['version'] + del get_versions + +## Styles + +The setup.cfg `style=` configuration controls how the VCS information is +rendered into a version string. + +The default style, "pep440", produces a PEP440-compliant string, equal to the +un-prefixed tag name for actual releases, and containing an additional "local +version" section with more detail for in-between builds. For Git, this is +TAG[+DISTANCE.gHEX[.dirty]] , using information from `git describe --tags +--dirty --always`. For example "0.11+2.g1076c97.dirty" indicates that the +tree is like the "1076c97" commit but has uncommitted changes (".dirty"), and +that this commit is two revisions ("+2") beyond the "0.11" tag. For released +software (exactly equal to a known tag), the identifier will only contain the +stripped tag, e.g. "0.11". + +Other styles are available. See [details.md](details.md) in the Versioneer +source tree for descriptions. + +## Debugging + +Versioneer tries to avoid fatal errors: if something goes wrong, it will tend +to return a version of "0+unknown". To investigate the problem, run `setup.py +version`, which will run the version-lookup code in a verbose mode, and will +display the full contents of `get_versions()` (including the `error` string, +which may help identify what went wrong). + +## Known Limitations + +Some situations are known to cause problems for Versioneer. This section details the +most significant ones. More can be found on the GitHub +[issues page](https://github.com/python-versioneer/python-versioneer/issues). + +### Subprojects + +Versioneer has limited support for source trees in which `setup.py` is not in +the root directory (e.g. `setup.py` and `.git/` are *not* siblings). There are +two common reasons why `setup.py` might not be in the root: + +* Source trees which contain multiple subprojects, such as + [Buildbot](https://github.com/buildbot/buildbot), which contains both + "master" and "slave" subprojects, each with their own `setup.py`, + `setup.cfg`, and `tox.ini`. Projects like these produce multiple PyPI + distributions (and upload multiple independently-installable tarballs). +* Source trees whose main purpose is to contain a C library, but which also + provide bindings to Python (and perhaps other languages) in subdirectories. + +Versioneer will look for `.git` in parent directories, and most operations +should get the right version string. However `pip` and `setuptools` have bugs +and implementation details which frequently cause `pip install .` from a +subproject directory to fail to find a correct version string (so it usually +defaults to `0+unknown`). + +`pip install --editable .` should work correctly. `setup.py install` might +work too. + +Pip-8.1.1 is known to have this problem, but hopefully it will get fixed in +some later version. + +[Bug #38](https://github.com/python-versioneer/python-versioneer/issues/38) is tracking +this issue. The discussion in +[PR #61](https://github.com/python-versioneer/python-versioneer/pull/61) describes the +issue from the Versioneer side in more detail. +[pip PR#3176](https://github.com/pypa/pip/pull/3176) and +[pip PR#3615](https://github.com/pypa/pip/pull/3615) contain work to improve +pip to let Versioneer work correctly.
+ +Versioneer-0.16 and earlier only looked for a `.git` directory next to the +`setup.cfg`, so subprojects were completely unsupported with those releases. + +### Editable installs with setuptools <= 18.5 + +`setup.py develop` and `pip install --editable .` allow you to install a +project into a virtualenv once, then continue editing the source code (and +test) without re-installing after every change. + +"Entry-point scripts" (`setup(entry_points={"console_scripts": ..})`) are a +convenient way to specify executable scripts that should be installed along +with the python package. + +These both work as expected when using modern setuptools. When using +setuptools-18.5 or earlier, however, certain operations will cause +`pkg_resources.DistributionNotFound` errors when running the entrypoint +script, which must be resolved by re-installing the package. This happens +when the install happens with one version, then the egg_info data is +regenerated while a different version is checked out. Many setup.py commands +cause egg_info to be rebuilt (including `sdist`, `wheel`, and installing into +a different virtualenv), so this can be surprising. + +[Bug #83](https://github.com/python-versioneer/python-versioneer/issues/83) describes +this one, but upgrading to a newer version of setuptools should probably +resolve it. + + +## Updating Versioneer + +To upgrade your project to a new release of Versioneer, do the following: + +* install the new Versioneer (`pip install -U versioneer` or equivalent) +* edit `setup.cfg`, if necessary, to include any new configuration settings + indicated by the release notes. See [UPGRADING](./UPGRADING.md) for details. +* re-run `versioneer install` in your source tree, to replace + `SRC/_version.py` +* commit any changed files + +## Future Directions + +This tool is designed to be easily extended to other version-control +systems: all VCS-specific components are in separate directories like +src/git/ . The top-level `versioneer.py` script is assembled from these +components by running make-versioneer.py . In the future, make-versioneer.py +will take a VCS name as an argument, and will construct a version of +`versioneer.py` that is specific to the given VCS. It might also take the +configuration arguments that are currently provided manually during +installation by editing setup.py . Alternatively, it might go the other +direction and include code from all supported VCS systems, reducing the +number of intermediate scripts. + +## Similar projects + +* [setuptools_scm](https://github.com/pypa/setuptools_scm/) - a non-vendored build-time + dependency +* [miniver](https://github.com/jbweston/miniver) - a lightweight reimplementation of + versioneer + +## License + +To make Versioneer easier to embed, all its code is dedicated to the public +domain. The `_version.py` that it creates is also in the public domain. +Specifically, both are released under the Creative Commons "Public Domain +Dedication" license (CC0-1.0), as described in +https://creativecommons.org/publicdomain/zero/1.0/ .
+ +[pypi-image]: https://img.shields.io/pypi/v/versioneer.svg +[pypi-url]: https://pypi.python.org/pypi/versioneer/ +[travis-image]: +https://img.shields.io/travis/com/python-versioneer/python-versioneer.svg +[travis-url]: https://travis-ci.com/github/python-versioneer/python-versioneer + +""" + +import configparser +import errno +import json +import os +import re +import subprocess +import sys + + +class VersioneerConfig: + """Container for Versioneer configuration parameters.""" + + +def get_root(): + """Get the project root directory. + + We require that all commands are run from the project root, i.e. the + directory that contains setup.py, setup.cfg, and versioneer.py . + """ + root = os.path.realpath(os.path.abspath(os.getcwd())) + setup_py = os.path.join(root, "setup.py") + versioneer_py = os.path.join(root, "versioneer.py") + if not (os.path.exists(setup_py) or os.path.exists(versioneer_py)): + # allow 'python path/to/setup.py COMMAND' + root = os.path.dirname(os.path.realpath(os.path.abspath(sys.argv[0]))) + setup_py = os.path.join(root, "setup.py") + versioneer_py = os.path.join(root, "versioneer.py") + if not (os.path.exists(setup_py) or os.path.exists(versioneer_py)): + err = ("Versioneer was unable to find the project root directory. " + "Versioneer requires setup.py to be executed from " + "its immediate directory (like 'python setup.py COMMAND'), " + "or in a way that lets it use sys.argv[0] to find the root " + "(like 'python path/to/setup.py COMMAND').") + raise VersioneerBadRootError(err) + try: + # Certain runtime workflows (setup.py install/develop in a setuptools + # tree) execute all dependencies in a single python process, so + # "versioneer" may be imported multiple times, and python's shared + # module-import table will cache the first one. So we can't use + # os.path.dirname(__file__), as that will find whichever + # versioneer.py was first imported, even in later projects. + me = os.path.realpath(os.path.abspath(__file__)) + me_dir = os.path.normcase(os.path.splitext(me)[0]) + vsr_dir = os.path.normcase(os.path.splitext(versioneer_py)[0]) + if me_dir != vsr_dir: + print("Warning: build in %s is using versioneer.py from %s" + % (os.path.dirname(me), versioneer_py)) + except NameError: + pass + return root + + +def get_config_from_root(root): + """Read the project setup.cfg file to determine Versioneer config.""" + # This might raise EnvironmentError (if setup.cfg is missing), or + # configparser.NoSectionError (if it lacks a [versioneer] section), or + # configparser.NoOptionError (if it lacks "VCS="). See the docstring at + # the top of versioneer.py for instructions on writing your setup.cfg .
+ setup_cfg = os.path.join(root, "setup.cfg") + parser = configparser.ConfigParser() + with open(setup_cfg, "r") as f: + parser.read_file(f) + VCS = parser.get("versioneer", "VCS") # mandatory + + def get(parser, name): + if parser.has_option("versioneer", name): + return parser.get("versioneer", name) + return None + cfg = VersioneerConfig() + cfg.VCS = VCS + cfg.style = get(parser, "style") or "" + cfg.versionfile_source = get(parser, "versionfile_source") + cfg.versionfile_build = get(parser, "versionfile_build") + cfg.tag_prefix = get(parser, "tag_prefix") + if cfg.tag_prefix in ("''", '""'): + cfg.tag_prefix = "" + cfg.parentdir_prefix = get(parser, "parentdir_prefix") + cfg.verbose = get(parser, "verbose") + return cfg + + +class NotThisMethod(Exception): + """Exception raised if a method is not valid for the current scenario.""" + + +# these dictionaries contain VCS-specific tools +LONG_VERSION_PY = {} +HANDLERS = {} + + +def register_vcs_handler(vcs, method): # decorator + """Create decorator to mark a method as the handler of a VCS.""" + def decorate(f): + """Store f in HANDLERS[vcs][method].""" + if vcs not in HANDLERS: + HANDLERS[vcs] = {} + HANDLERS[vcs][method] = f + return f + return decorate + + +def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, + env=None): + """Call the given command(s).""" + assert isinstance(commands, list) + p = None + for c in commands: + try: + dispcmd = str([c] + args) + # remember shell=False, so use git.cmd on windows, not just git + p = subprocess.Popen([c] + args, cwd=cwd, env=env, + stdout=subprocess.PIPE, + stderr=(subprocess.PIPE if hide_stderr + else None)) + break + except EnvironmentError: + e = sys.exc_info()[1] + if e.errno == errno.ENOENT: + continue + if verbose: + print("unable to run %s" % dispcmd) + print(e) + return None, None + else: + if verbose: + print("unable to find command, tried %s" % (commands,)) + return None, None + stdout = p.communicate()[0].strip().decode() + if p.returncode != 0: + if verbose: + print("unable to run %s (error)" % dispcmd) + print("stdout was %s" % stdout) + return None, p.returncode + return stdout, p.returncode + + +LONG_VERSION_PY['git'] = r''' +# This file helps to compute a version number in source trees obtained from +# git-archive tarball (such as those provided by githubs download-from-tag +# feature). Distribution tarballs (built by setup.py sdist) and build +# directories (produced by setup.py build) will contain a much shorter file +# that just contains the computed version number. + +# This file is released into the public domain. Generated by +# versioneer-0.19 (https://github.com/python-versioneer/python-versioneer) + +"""Git implementation of _version.py.""" + +import errno +import os +import re +import subprocess +import sys + + +def get_keywords(): + """Get the keywords needed to look up the version information.""" + # these strings will be replaced by git during git-archive. + # setup.py/versioneer.py will grep for the variable names, so they must + # each be defined on a line of their own. _version.py will just call + # get_keywords(). 
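+    # Illustration (hypothetical values): once 'git archive' has expanded
+    # the keywords, git_refnames might hold " (HEAD -> master, tag: 0.12.0)"
+    # and git_full a full 40-character hexadecimal commit id.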
+ git_refnames = "%(DOLLAR)sFormat:%%d%(DOLLAR)s" + git_full = "%(DOLLAR)sFormat:%%H%(DOLLAR)s" + git_date = "%(DOLLAR)sFormat:%%ci%(DOLLAR)s" + keywords = {"refnames": git_refnames, "full": git_full, "date": git_date} + return keywords + + +class VersioneerConfig: + """Container for Versioneer configuration parameters.""" + + +def get_config(): + """Create, populate and return the VersioneerConfig() object.""" + # these strings are filled in when 'setup.py versioneer' creates + # _version.py + cfg = VersioneerConfig() + cfg.VCS = "git" + cfg.style = "%(STYLE)s" + cfg.tag_prefix = "%(TAG_PREFIX)s" + cfg.parentdir_prefix = "%(PARENTDIR_PREFIX)s" + cfg.versionfile_source = "%(VERSIONFILE_SOURCE)s" + cfg.verbose = False + return cfg + + +class NotThisMethod(Exception): + """Exception raised if a method is not valid for the current scenario.""" + + +LONG_VERSION_PY = {} +HANDLERS = {} + + +def register_vcs_handler(vcs, method): # decorator + """Create decorator to mark a method as the handler of a VCS.""" + def decorate(f): + """Store f in HANDLERS[vcs][method].""" + if vcs not in HANDLERS: + HANDLERS[vcs] = {} + HANDLERS[vcs][method] = f + return f + return decorate + + +def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, + env=None): + """Call the given command(s).""" + assert isinstance(commands, list) + p = None + for c in commands: + try: + dispcmd = str([c] + args) + # remember shell=False, so use git.cmd on windows, not just git + p = subprocess.Popen([c] + args, cwd=cwd, env=env, + stdout=subprocess.PIPE, + stderr=(subprocess.PIPE if hide_stderr + else None)) + break + except EnvironmentError: + e = sys.exc_info()[1] + if e.errno == errno.ENOENT: + continue + if verbose: + print("unable to run %%s" %% dispcmd) + print(e) + return None, None + else: + if verbose: + print("unable to find command, tried %%s" %% (commands,)) + return None, None + stdout = p.communicate()[0].strip().decode() + if p.returncode != 0: + if verbose: + print("unable to run %%s (error)" %% dispcmd) + print("stdout was %%s" %% stdout) + return None, p.returncode + return stdout, p.returncode + + +def versions_from_parentdir(parentdir_prefix, root, verbose): + """Try to determine the version from the parent directory name. + + Source tarballs conventionally unpack into a directory that includes both + the project name and a version string. We will also support searching up + two directory levels for an appropriately named parent directory + """ + rootdirs = [] + + for i in range(3): + dirname = os.path.basename(root) + if dirname.startswith(parentdir_prefix): + return {"version": dirname[len(parentdir_prefix):], + "full-revisionid": None, + "dirty": False, "error": None, "date": None} + else: + rootdirs.append(root) + root = os.path.dirname(root) # up a level + + if verbose: + print("Tried directories %%s but none started with prefix %%s" %% + (str(rootdirs), parentdir_prefix)) + raise NotThisMethod("rootdir doesn't start with parentdir_prefix") + + +@register_vcs_handler("git", "get_keywords") +def git_get_keywords(versionfile_abs): + """Extract version information from the given file.""" + # the code embedded in _version.py can just fetch the value of these + # keywords. When used from setup.py, we don't want to import _version.py, + # so we do it with a regexp instead. This function is not used from + # _version.py. 
+ keywords = {} + try: + f = open(versionfile_abs, "r") + for line in f.readlines(): + if line.strip().startswith("git_refnames ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["refnames"] = mo.group(1) + if line.strip().startswith("git_full ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["full"] = mo.group(1) + if line.strip().startswith("git_date ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["date"] = mo.group(1) + f.close() + except EnvironmentError: + pass + return keywords + + +@register_vcs_handler("git", "keywords") +def git_versions_from_keywords(keywords, tag_prefix, verbose): + """Get version information from git keywords.""" + if not keywords: + raise NotThisMethod("no keywords at all, weird") + date = keywords.get("date") + if date is not None: + # Use only the last line. Previous lines may contain GPG signature + # information. + date = date.splitlines()[-1] + + # git-2.2.0 added "%%cI", which expands to an ISO-8601 -compliant + # datestamp. However we prefer "%%ci" (which expands to an "ISO-8601 + # -like" string, which we must then edit to make compliant), because + # it's been around since git-1.5.3, and it's too difficult to + # discover which version we're using, or to work around using an + # older one. + date = date.strip().replace(" ", "T", 1).replace(" ", "", 1) + refnames = keywords["refnames"].strip() + if refnames.startswith("$Format"): + if verbose: + print("keywords are unexpanded, not using") + raise NotThisMethod("unexpanded keywords, not a git-archive tarball") + refs = set([r.strip() for r in refnames.strip("()").split(",")]) + # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of + # just "foo-1.0". If we see a "tag: " prefix, prefer those. + TAG = "tag: " + tags = set([r[len(TAG):] for r in refs if r.startswith(TAG)]) + if not tags: + # Either we're using git < 1.8.3, or there really are no tags. We use + # a heuristic: assume all version tags have a digit. The old git %%d + # expansion behaves like git log --decorate=short and strips out the + # refs/heads/ and refs/tags/ prefixes that would let us distinguish + # between branches and tags. By ignoring refnames without digits, we + # filter out many common branch names like "release" and + # "stabilization", as well as "HEAD" and "master". + tags = set([r for r in refs if re.search(r'\d', r)]) + if verbose: + print("discarding '%%s', no digits" %% ",".join(refs - tags)) + if verbose: + print("likely tags: %%s" %% ",".join(sorted(tags))) + for ref in sorted(tags): + # sorting will prefer e.g. "2.0" over "2.0rc1" + if ref.startswith(tag_prefix): + r = ref[len(tag_prefix):] + if verbose: + print("picking %%s" %% r) + return {"version": r, + "full-revisionid": keywords["full"].strip(), + "dirty": False, "error": None, + "date": date} + # no suitable tags, so version is "0+unknown", but full hex is still there + if verbose: + print("no suitable tags, using unknown + full revision id") + return {"version": "0+unknown", + "full-revisionid": keywords["full"].strip(), + "dirty": False, "error": "no suitable tags", "date": None} + + +@register_vcs_handler("git", "pieces_from_vcs") +def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): + """Get version from 'git describe' in the root of the source tree. + + This only gets called if the git-archive 'subst' keywords were *not* + expanded, and _version.py hasn't already been rewritten with a short + version string, meaning we're inside a checked out source tree. 
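+
+    Illustration (hypothetical values): a describe output such as
+    "0.12.0-2-g1076c97-dirty" is parsed below into closest-tag "0.12.0",
+    distance 2, short "1076c97", and dirty True.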
+ """ + GITS = ["git"] + if sys.platform == "win32": + GITS = ["git.cmd", "git.exe"] + + out, rc = run_command(GITS, ["rev-parse", "--git-dir"], cwd=root, + hide_stderr=True) + if rc != 0: + if verbose: + print("Directory %%s not under git control" %% root) + raise NotThisMethod("'git rev-parse --git-dir' returned error") + + # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] + # if there isn't one, this yields HEX[-dirty] (no NUM) + describe_out, rc = run_command(GITS, ["describe", "--tags", "--dirty", + "--always", "--long", + "--match", "%%s*" %% tag_prefix], + cwd=root) + # --long was added in git-1.5.5 + if describe_out is None: + raise NotThisMethod("'git describe' failed") + describe_out = describe_out.strip() + full_out, rc = run_command(GITS, ["rev-parse", "HEAD"], cwd=root) + if full_out is None: + raise NotThisMethod("'git rev-parse' failed") + full_out = full_out.strip() + + pieces = {} + pieces["long"] = full_out + pieces["short"] = full_out[:7] # maybe improved later + pieces["error"] = None + + # parse describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty] + # TAG might have hyphens. + git_describe = describe_out + + # look for -dirty suffix + dirty = git_describe.endswith("-dirty") + pieces["dirty"] = dirty + if dirty: + git_describe = git_describe[:git_describe.rindex("-dirty")] + + # now we have TAG-NUM-gHEX or HEX + + if "-" in git_describe: + # TAG-NUM-gHEX + mo = re.search(r'^(.+)-(\d+)-g([0-9a-f]+)$', git_describe) + if not mo: + # unparseable. Maybe git-describe is misbehaving? + pieces["error"] = ("unable to parse git-describe output: '%%s'" + %% describe_out) + return pieces + + # tag + full_tag = mo.group(1) + if not full_tag.startswith(tag_prefix): + if verbose: + fmt = "tag '%%s' doesn't start with prefix '%%s'" + print(fmt %% (full_tag, tag_prefix)) + pieces["error"] = ("tag '%%s' doesn't start with prefix '%%s'" + %% (full_tag, tag_prefix)) + return pieces + pieces["closest-tag"] = full_tag[len(tag_prefix):] + + # distance: number of commits since tag + pieces["distance"] = int(mo.group(2)) + + # commit: short hex revision ID + pieces["short"] = mo.group(3) + + else: + # HEX: no tags + pieces["closest-tag"] = None + count_out, rc = run_command(GITS, ["rev-list", "HEAD", "--count"], + cwd=root) + pieces["distance"] = int(count_out) # total number of commits + + # commit date: see ISO-8601 comment in git_versions_from_keywords() + date = run_command(GITS, ["show", "-s", "--format=%%ci", "HEAD"], + cwd=root)[0].strip() + # Use only the last line. Previous lines may contain GPG signature + # information. + date = date.splitlines()[-1] + pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1) + + return pieces + + +def plus_or_dot(pieces): + """Return a + if we don't already have one, else return a .""" + if "+" in pieces.get("closest-tag", ""): + return "." + return "+" + + +def render_pep440(pieces): + """Build up version string, with post-release "local version identifier". + + Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you + get a tagged build and then dirty it, you'll get TAG+0.gHEX.dirty + + Exceptions: + 1: no tags. git_describe was just HEX. 
0+untagged.DISTANCE.gHEX[.dirty] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += plus_or_dot(pieces) + rendered += "%%d.g%%s" %% (pieces["distance"], pieces["short"]) + if pieces["dirty"]: + rendered += ".dirty" + else: + # exception #1 + rendered = "0+untagged.%%d.g%%s" %% (pieces["distance"], + pieces["short"]) + if pieces["dirty"]: + rendered += ".dirty" + return rendered + + +def render_pep440_pre(pieces): + """TAG[.post0.devDISTANCE] -- No -dirty. + + Exceptions: + 1: no tags. 0.post0.devDISTANCE + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"]: + rendered += ".post0.dev%%d" %% pieces["distance"] + else: + # exception #1 + rendered = "0.post0.dev%%d" %% pieces["distance"] + return rendered + + +def render_pep440_post(pieces): + """TAG[.postDISTANCE[.dev0]+gHEX] . + + The ".dev0" means dirty. Note that .dev0 sorts backwards + (a dirty tree will appear "older" than the corresponding clean one), + but you shouldn't be releasing software with -dirty anyways. + + Exceptions: + 1: no tags. 0.postDISTANCE[.dev0] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += ".post%%d" %% pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + rendered += plus_or_dot(pieces) + rendered += "g%%s" %% pieces["short"] + else: + # exception #1 + rendered = "0.post%%d" %% pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + rendered += "+g%%s" %% pieces["short"] + return rendered + + +def render_pep440_old(pieces): + """TAG[.postDISTANCE[.dev0]] . + + The ".dev0" means dirty. + + Exceptions: + 1: no tags. 0.postDISTANCE[.dev0] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += ".post%%d" %% pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + else: + # exception #1 + rendered = "0.post%%d" %% pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + return rendered + + +def render_git_describe(pieces): + """TAG[-DISTANCE-gHEX][-dirty]. + + Like 'git describe --tags --dirty --always'. + + Exceptions: + 1: no tags. HEX[-dirty] (note: no 'g' prefix) + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"]: + rendered += "-%%d-g%%s" %% (pieces["distance"], pieces["short"]) + else: + # exception #1 + rendered = pieces["short"] + if pieces["dirty"]: + rendered += "-dirty" + return rendered + + +def render_git_describe_long(pieces): + """TAG-DISTANCE-gHEX[-dirty]. + + Like 'git describe --tags --dirty --always -long'. + The distance/hash is unconditional. + + Exceptions: + 1: no tags. 
HEX[-dirty] (note: no 'g' prefix) + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + rendered += "-%%d-g%%s" %% (pieces["distance"], pieces["short"]) + else: + # exception #1 + rendered = pieces["short"] + if pieces["dirty"]: + rendered += "-dirty" + return rendered + + +def render(pieces, style): + """Render the given version pieces into the requested style.""" + if pieces["error"]: + return {"version": "unknown", + "full-revisionid": pieces.get("long"), + "dirty": None, + "error": pieces["error"], + "date": None} + + if not style or style == "default": + style = "pep440" # the default + + if style == "pep440": + rendered = render_pep440(pieces) + elif style == "pep440-pre": + rendered = render_pep440_pre(pieces) + elif style == "pep440-post": + rendered = render_pep440_post(pieces) + elif style == "pep440-old": + rendered = render_pep440_old(pieces) + elif style == "git-describe": + rendered = render_git_describe(pieces) + elif style == "git-describe-long": + rendered = render_git_describe_long(pieces) + else: + raise ValueError("unknown style '%%s'" %% style) + + return {"version": rendered, "full-revisionid": pieces["long"], + "dirty": pieces["dirty"], "error": None, + "date": pieces.get("date")} + + +def get_versions(): + """Get version information or return default if unable to do so.""" + # I am in _version.py, which lives at ROOT/VERSIONFILE_SOURCE. If we have + # __file__, we can work backwards from there to the root. Some + # py2exe/bbfreeze/non-CPython implementations don't do __file__, in which + # case we can only use expanded keywords. + + cfg = get_config() + verbose = cfg.verbose + + try: + return git_versions_from_keywords(get_keywords(), cfg.tag_prefix, + verbose) + except NotThisMethod: + pass + + try: + root = os.path.realpath(__file__) + # versionfile_source is the relative path from the top of the source + # tree (where the .git directory might live) to this file. Invert + # this to find the root from __file__. + for i in cfg.versionfile_source.split('/'): + root = os.path.dirname(root) + except NameError: + return {"version": "0+unknown", "full-revisionid": None, + "dirty": None, + "error": "unable to find root of source tree", + "date": None} + + try: + pieces = git_pieces_from_vcs(cfg.tag_prefix, root, verbose) + return render(pieces, cfg.style) + except NotThisMethod: + pass + + try: + if cfg.parentdir_prefix: + return versions_from_parentdir(cfg.parentdir_prefix, root, verbose) + except NotThisMethod: + pass + + return {"version": "0+unknown", "full-revisionid": None, + "dirty": None, + "error": "unable to compute version", "date": None} +''' + + +@register_vcs_handler("git", "get_keywords") +def git_get_keywords(versionfile_abs): + """Extract version information from the given file.""" + # the code embedded in _version.py can just fetch the value of these + # keywords. When used from setup.py, we don't want to import _version.py, + # so we do it with a regexp instead. This function is not used from + # _version.py. 
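+    # Illustration: an unexpanded line in _version.py looks like
+    #   git_refnames = "$Format:%d$"
+    # while in a 'git archive' tarball it might read (hypothetical tag)
+    #   git_refnames = " (tag: 0.12.0)"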
+ keywords = {} + try: + f = open(versionfile_abs, "r") + for line in f.readlines(): + if line.strip().startswith("git_refnames ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["refnames"] = mo.group(1) + if line.strip().startswith("git_full ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["full"] = mo.group(1) + if line.strip().startswith("git_date ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["date"] = mo.group(1) + f.close() + except EnvironmentError: + pass + return keywords + + +@register_vcs_handler("git", "keywords") +def git_versions_from_keywords(keywords, tag_prefix, verbose): + """Get version information from git keywords.""" + if not keywords: + raise NotThisMethod("no keywords at all, weird") + date = keywords.get("date") + if date is not None: + # Use only the last line. Previous lines may contain GPG signature + # information. + date = date.splitlines()[-1] + + # git-2.2.0 added "%cI", which expands to an ISO-8601 -compliant + # datestamp. However we prefer "%ci" (which expands to an "ISO-8601 + # -like" string, which we must then edit to make compliant), because + # it's been around since git-1.5.3, and it's too difficult to + # discover which version we're using, or to work around using an + # older one. + date = date.strip().replace(" ", "T", 1).replace(" ", "", 1) + refnames = keywords["refnames"].strip() + if refnames.startswith("$Format"): + if verbose: + print("keywords are unexpanded, not using") + raise NotThisMethod("unexpanded keywords, not a git-archive tarball") + refs = set([r.strip() for r in refnames.strip("()").split(",")]) + # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of + # just "foo-1.0". If we see a "tag: " prefix, prefer those. + TAG = "tag: " + tags = set([r[len(TAG):] for r in refs if r.startswith(TAG)]) + if not tags: + # Either we're using git < 1.8.3, or there really are no tags. We use + # a heuristic: assume all version tags have a digit. The old git %d + # expansion behaves like git log --decorate=short and strips out the + # refs/heads/ and refs/tags/ prefixes that would let us distinguish + # between branches and tags. By ignoring refnames without digits, we + # filter out many common branch names like "release" and + # "stabilization", as well as "HEAD" and "master". + tags = set([r for r in refs if re.search(r'\d', r)]) + if verbose: + print("discarding '%s', no digits" % ",".join(refs - tags)) + if verbose: + print("likely tags: %s" % ",".join(sorted(tags))) + for ref in sorted(tags): + # sorting will prefer e.g. "2.0" over "2.0rc1" + if ref.startswith(tag_prefix): + r = ref[len(tag_prefix):] + if verbose: + print("picking %s" % r) + return {"version": r, + "full-revisionid": keywords["full"].strip(), + "dirty": False, "error": None, + "date": date} + # no suitable tags, so version is "0+unknown", but full hex is still there + if verbose: + print("no suitable tags, using unknown + full revision id") + return {"version": "0+unknown", + "full-revisionid": keywords["full"].strip(), + "dirty": False, "error": "no suitable tags", "date": None} + + +@register_vcs_handler("git", "pieces_from_vcs") +def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): + """Get version from 'git describe' in the root of the source tree. + + This only gets called if the git-archive 'subst' keywords were *not* + expanded, and _version.py hasn't already been rewritten with a short + version string, meaning we're inside a checked out source tree. 
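+
+    Illustration (hypothetical values): a describe output such as
+    "0.12.0-2-g1076c97-dirty" is parsed below into closest-tag "0.12.0",
+    distance 2, short "1076c97", and dirty True.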
+ """ + GITS = ["git"] + if sys.platform == "win32": + GITS = ["git.cmd", "git.exe"] + + out, rc = run_command(GITS, ["rev-parse", "--git-dir"], cwd=root, + hide_stderr=True) + if rc != 0: + if verbose: + print("Directory %s not under git control" % root) + raise NotThisMethod("'git rev-parse --git-dir' returned error") + + # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] + # if there isn't one, this yields HEX[-dirty] (no NUM) + describe_out, rc = run_command(GITS, ["describe", "--tags", "--dirty", + "--always", "--long", + "--match", "%s*" % tag_prefix], + cwd=root) + # --long was added in git-1.5.5 + if describe_out is None: + raise NotThisMethod("'git describe' failed") + describe_out = describe_out.strip() + full_out, rc = run_command(GITS, ["rev-parse", "HEAD"], cwd=root) + if full_out is None: + raise NotThisMethod("'git rev-parse' failed") + full_out = full_out.strip() + + pieces = {} + pieces["long"] = full_out + pieces["short"] = full_out[:7] # maybe improved later + pieces["error"] = None + + # parse describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty] + # TAG might have hyphens. + git_describe = describe_out + + # look for -dirty suffix + dirty = git_describe.endswith("-dirty") + pieces["dirty"] = dirty + if dirty: + git_describe = git_describe[:git_describe.rindex("-dirty")] + + # now we have TAG-NUM-gHEX or HEX + + if "-" in git_describe: + # TAG-NUM-gHEX + mo = re.search(r'^(.+)-(\d+)-g([0-9a-f]+)$', git_describe) + if not mo: + # unparseable. Maybe git-describe is misbehaving? + pieces["error"] = ("unable to parse git-describe output: '%s'" + % describe_out) + return pieces + + # tag + full_tag = mo.group(1) + if not full_tag.startswith(tag_prefix): + if verbose: + fmt = "tag '%s' doesn't start with prefix '%s'" + print(fmt % (full_tag, tag_prefix)) + pieces["error"] = ("tag '%s' doesn't start with prefix '%s'" + % (full_tag, tag_prefix)) + return pieces + pieces["closest-tag"] = full_tag[len(tag_prefix):] + + # distance: number of commits since tag + pieces["distance"] = int(mo.group(2)) + + # commit: short hex revision ID + pieces["short"] = mo.group(3) + + else: + # HEX: no tags + pieces["closest-tag"] = None + count_out, rc = run_command(GITS, ["rev-list", "HEAD", "--count"], + cwd=root) + pieces["distance"] = int(count_out) # total number of commits + + # commit date: see ISO-8601 comment in git_versions_from_keywords() + date = run_command(GITS, ["show", "-s", "--format=%ci", "HEAD"], + cwd=root)[0].strip() + # Use only the last line. Previous lines may contain GPG signature + # information. + date = date.splitlines()[-1] + pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1) + + return pieces + + +def do_vcs_install(manifest_in, versionfile_source, ipy): + """Git-specific installation logic for Versioneer. + + For Git, this means creating/changing .gitattributes to mark _version.py + for export-subst keyword substitution. 
+ """ + GITS = ["git"] + if sys.platform == "win32": + GITS = ["git.cmd", "git.exe"] + files = [manifest_in, versionfile_source] + if ipy: + files.append(ipy) + try: + me = __file__ + if me.endswith(".pyc") or me.endswith(".pyo"): + me = os.path.splitext(me)[0] + ".py" + versioneer_file = os.path.relpath(me) + except NameError: + versioneer_file = "versioneer.py" + files.append(versioneer_file) + present = False + try: + f = open(".gitattributes", "r") + for line in f.readlines(): + if line.strip().startswith(versionfile_source): + if "export-subst" in line.strip().split()[1:]: + present = True + f.close() + except EnvironmentError: + pass + if not present: + f = open(".gitattributes", "a+") + f.write("%s export-subst\n" % versionfile_source) + f.close() + files.append(".gitattributes") + run_command(GITS, ["add", "--"] + files) + + +def versions_from_parentdir(parentdir_prefix, root, verbose): + """Try to determine the version from the parent directory name. + + Source tarballs conventionally unpack into a directory that includes both + the project name and a version string. We will also support searching up + two directory levels for an appropriately named parent directory + """ + rootdirs = [] + + for i in range(3): + dirname = os.path.basename(root) + if dirname.startswith(parentdir_prefix): + return {"version": dirname[len(parentdir_prefix):], + "full-revisionid": None, + "dirty": False, "error": None, "date": None} + else: + rootdirs.append(root) + root = os.path.dirname(root) # up a level + + if verbose: + print("Tried directories %s but none started with prefix %s" % + (str(rootdirs), parentdir_prefix)) + raise NotThisMethod("rootdir doesn't start with parentdir_prefix") + + +SHORT_VERSION_PY = """ +# This file was generated by 'versioneer.py' (0.19) from +# revision-control system data, or from the parent directory name of an +# unpacked source archive. Distribution tarballs contain a pre-generated copy +# of this file. + +import json + +version_json = ''' +%s +''' # END VERSION_JSON + + +def get_versions(): + return json.loads(version_json) +""" + + +def versions_from_file(filename): + """Try to determine the version from _version.py if present.""" + try: + with open(filename) as f: + contents = f.read() + except EnvironmentError: + raise NotThisMethod("unable to read _version.py") + mo = re.search(r"version_json = '''\n(.*)''' # END VERSION_JSON", + contents, re.M | re.S) + if not mo: + mo = re.search(r"version_json = '''\r\n(.*)''' # END VERSION_JSON", + contents, re.M | re.S) + if not mo: + raise NotThisMethod("no version_json in _version.py") + return json.loads(mo.group(1)) + + +def write_to_version_file(filename, versions): + """Write the given version number to the given _version.py file.""" + os.unlink(filename) + contents = json.dumps(versions, sort_keys=True, + indent=1, separators=(",", ": ")) + with open(filename, "w") as f: + f.write(SHORT_VERSION_PY % contents) + + print("set %s to '%s'" % (filename, versions["version"])) + + +def plus_or_dot(pieces): + """Return a + if we don't already have one, else return a .""" + if "+" in pieces.get("closest-tag", ""): + return "." + return "+" + + +def render_pep440(pieces): + """Build up version string, with post-release "local version identifier". + + Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you + get a tagged build and then dirty it, you'll get TAG+0.gHEX.dirty + + Exceptions: + 1: no tags. git_describe was just HEX. 
0+untagged.DISTANCE.gHEX[.dirty] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += plus_or_dot(pieces) + rendered += "%d.g%s" % (pieces["distance"], pieces["short"]) + if pieces["dirty"]: + rendered += ".dirty" + else: + # exception #1 + rendered = "0+untagged.%d.g%s" % (pieces["distance"], + pieces["short"]) + if pieces["dirty"]: + rendered += ".dirty" + return rendered + + +def render_pep440_pre(pieces): + """TAG[.post0.devDISTANCE] -- No -dirty. + + Exceptions: + 1: no tags. 0.post0.devDISTANCE + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"]: + rendered += ".post0.dev%d" % pieces["distance"] + else: + # exception #1 + rendered = "0.post0.dev%d" % pieces["distance"] + return rendered + + +def render_pep440_post(pieces): + """TAG[.postDISTANCE[.dev0]+gHEX] . + + The ".dev0" means dirty. Note that .dev0 sorts backwards + (a dirty tree will appear "older" than the corresponding clean one), + but you shouldn't be releasing software with -dirty anyways. + + Exceptions: + 1: no tags. 0.postDISTANCE[.dev0] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += ".post%d" % pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + rendered += plus_or_dot(pieces) + rendered += "g%s" % pieces["short"] + else: + # exception #1 + rendered = "0.post%d" % pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + rendered += "+g%s" % pieces["short"] + return rendered + + +def render_pep440_old(pieces): + """TAG[.postDISTANCE[.dev0]] . + + The ".dev0" means dirty. + + Exceptions: + 1: no tags. 0.postDISTANCE[.dev0] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += ".post%d" % pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + else: + # exception #1 + rendered = "0.post%d" % pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + return rendered + + +def render_git_describe(pieces): + """TAG[-DISTANCE-gHEX][-dirty]. + + Like 'git describe --tags --dirty --always'. + + Exceptions: + 1: no tags. HEX[-dirty] (note: no 'g' prefix) + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"]: + rendered += "-%d-g%s" % (pieces["distance"], pieces["short"]) + else: + # exception #1 + rendered = pieces["short"] + if pieces["dirty"]: + rendered += "-dirty" + return rendered + + +def render_git_describe_long(pieces): + """TAG-DISTANCE-gHEX[-dirty]. + + Like 'git describe --tags --dirty --always -long'. + The distance/hash is unconditional. + + Exceptions: + 1: no tags. 
HEX[-dirty] (note: no 'g' prefix) + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + rendered += "-%d-g%s" % (pieces["distance"], pieces["short"]) + else: + # exception #1 + rendered = pieces["short"] + if pieces["dirty"]: + rendered += "-dirty" + return rendered + + +def render(pieces, style): + """Render the given version pieces into the requested style.""" + if pieces["error"]: + return {"version": "unknown", + "full-revisionid": pieces.get("long"), + "dirty": None, + "error": pieces["error"], + "date": None} + + if not style or style == "default": + style = "pep440" # the default + + if style == "pep440": + rendered = render_pep440(pieces) + elif style == "pep440-pre": + rendered = render_pep440_pre(pieces) + elif style == "pep440-post": + rendered = render_pep440_post(pieces) + elif style == "pep440-old": + rendered = render_pep440_old(pieces) + elif style == "git-describe": + rendered = render_git_describe(pieces) + elif style == "git-describe-long": + rendered = render_git_describe_long(pieces) + else: + raise ValueError("unknown style '%s'" % style) + + return {"version": rendered, "full-revisionid": pieces["long"], + "dirty": pieces["dirty"], "error": None, + "date": pieces.get("date")} + + +class VersioneerBadRootError(Exception): + """The project root directory is unknown or missing key files.""" + + +def get_versions(verbose=False): + """Get the project version from whatever source is available. + + Returns dict with two keys: 'version' and 'full'. + """ + if "versioneer" in sys.modules: + # see the discussion in cmdclass.py:get_cmdclass() + del sys.modules["versioneer"] + + root = get_root() + cfg = get_config_from_root(root) + + assert cfg.VCS is not None, "please set [versioneer]VCS= in setup.cfg" + handlers = HANDLERS.get(cfg.VCS) + assert handlers, "unrecognized VCS '%s'" % cfg.VCS + verbose = verbose or cfg.verbose + assert cfg.versionfile_source is not None, \ + "please set versioneer.versionfile_source" + assert cfg.tag_prefix is not None, "please set versioneer.tag_prefix" + + versionfile_abs = os.path.join(root, cfg.versionfile_source) + + # extract version from first of: _version.py, VCS command (e.g. 'git + # describe'), parentdir. This is meant to work for developers using a + # source checkout, for users of a tarball created by 'setup.py sdist', + # and for users of a tarball/zipball created by 'git archive' or github's + # download-from-tag feature or the equivalent in other VCSes. 
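+    # Illustration (hypothetical values): in a git checkout two commits past
+    # tag "0.12.0" with uncommitted edits, the fallback chain below would
+    # typically return {"version": "0.12.0+2.g1076c97.dirty", ...} via the
+    # "pieces_from_vcs" handler and the default "pep440" render style.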
+ + get_keywords_f = handlers.get("get_keywords") + from_keywords_f = handlers.get("keywords") + if get_keywords_f and from_keywords_f: + try: + keywords = get_keywords_f(versionfile_abs) + ver = from_keywords_f(keywords, cfg.tag_prefix, verbose) + if verbose: + print("got version from expanded keyword %s" % ver) + return ver + except NotThisMethod: + pass + + try: + ver = versions_from_file(versionfile_abs) + if verbose: + print("got version from file %s %s" % (versionfile_abs, ver)) + return ver + except NotThisMethod: + pass + + from_vcs_f = handlers.get("pieces_from_vcs") + if from_vcs_f: + try: + pieces = from_vcs_f(cfg.tag_prefix, root, verbose) + ver = render(pieces, cfg.style) + if verbose: + print("got version from VCS %s" % ver) + return ver + except NotThisMethod: + pass + + try: + if cfg.parentdir_prefix: + ver = versions_from_parentdir(cfg.parentdir_prefix, root, verbose) + if verbose: + print("got version from parentdir %s" % ver) + return ver + except NotThisMethod: + pass + + if verbose: + print("unable to compute version") + + return {"version": "0+unknown", "full-revisionid": None, + "dirty": None, "error": "unable to compute version", + "date": None} + + +def get_version(): + """Get the short version string for this project.""" + return get_versions()["version"] + + +def get_cmdclass(cmdclass=None): + """Get the custom setuptools/distutils subclasses used by Versioneer. + + If the package uses a different cmdclass (e.g. one from numpy), it + should be provided as an argument. + """ + if "versioneer" in sys.modules: + del sys.modules["versioneer"] + # this fixes the "python setup.py develop" case (also 'install' and + # 'easy_install .'), in which subdependencies of the main project are + # built (using setup.py bdist_egg) in the same python process. Assume + # a main project A and a dependency B, which use different versions + # of Versioneer. A's setup.py imports A's Versioneer, leaving it in + # sys.modules by the time B's setup.py is executed, causing B to run + # with the wrong versioneer. Setuptools wraps the sub-dep builds in a + # sandbox that restores sys.modules to its pre-build state, so the + # parent is protected against the child's "import versioneer". By + # removing ourselves from sys.modules here, before the child build + # happens, we protect the child from the parent's versioneer too. + # Also see https://github.com/python-versioneer/python-versioneer/issues/52 + + cmds = {} if cmdclass is None else cmdclass.copy() + + # we add "version" to both distutils and setuptools + from distutils.core import Command + + class cmd_version(Command): + description = "report generated version string" + user_options = [] + boolean_options = [] + + def initialize_options(self): + pass + + def finalize_options(self): + pass + + def run(self): + vers = get_versions(verbose=True) + print("Version: %s" % vers["version"]) + print(" full-revisionid: %s" % vers.get("full-revisionid")) + print(" dirty: %s" % vers.get("dirty")) + print(" date: %s" % vers.get("date")) + if vers["error"]: + print(" error: %s" % vers["error"]) + cmds["version"] = cmd_version + + # we override "build_py" in both distutils and setuptools + # + # most invocation pathways end up running build_py: + # distutils/build -> build_py + # distutils/install -> distutils/build ->.. + # setuptools/bdist_wheel -> distutils/install ->.. + # setuptools/bdist_egg -> distutils/install_lib -> build_py + # setuptools/install -> bdist_egg ->.. + # setuptools/develop -> ?
+    # pip install:
+    #  copies source tree to a tempdir before running egg_info/etc
+    #  if .git isn't copied too, 'git describe' will fail
+    #  then does setup.py bdist_wheel, or sometimes setup.py install
+    # setup.py egg_info -> ?
+
+    # we override different "build_py" commands for both environments
+    if 'build_py' in cmds:
+        _build_py = cmds['build_py']
+    elif "setuptools" in sys.modules:
+        from setuptools.command.build_py import build_py as _build_py
+    else:
+        from distutils.command.build_py import build_py as _build_py
+
+    class cmd_build_py(_build_py):
+        def run(self):
+            root = get_root()
+            cfg = get_config_from_root(root)
+            versions = get_versions()
+            _build_py.run(self)
+            # now locate _version.py in the new build/ directory and replace
+            # it with an updated value
+            if cfg.versionfile_build:
+                target_versionfile = os.path.join(self.build_lib,
+                                                  cfg.versionfile_build)
+                print("UPDATING %s" % target_versionfile)
+                write_to_version_file(target_versionfile, versions)
+    cmds["build_py"] = cmd_build_py
+
+    if "setuptools" in sys.modules:
+        from setuptools.command.build_ext import build_ext as _build_ext
+    else:
+        from distutils.command.build_ext import build_ext as _build_ext
+
+    class cmd_build_ext(_build_ext):
+        def run(self):
+            root = get_root()
+            cfg = get_config_from_root(root)
+            versions = get_versions()
+            _build_ext.run(self)
+            if self.inplace:
+                # build_ext --inplace will only build extensions in
+                # build/lib<..> dir with no _version.py to write to.
+                # Since in-place builds already have a _version.py
+                # in the module dir, we do not need to write one.
+                return
+            # now locate _version.py in the new build/ directory and replace
+            # it with an updated value
+            target_versionfile = os.path.join(self.build_lib,
+                                              cfg.versionfile_source)
+            print("UPDATING %s" % target_versionfile)
+            write_to_version_file(target_versionfile, versions)
+    cmds["build_ext"] = cmd_build_ext
+
+    if "cx_Freeze" in sys.modules:  # cx_freeze enabled?
+        from cx_Freeze.dist import build_exe as _build_exe
+        # nczeczulin reports that py2exe won't like the pep440-style string
+        # as FILEVERSION, but it can be used for PRODUCTVERSION, e.g.
+        # setup(console=[{
+        #   "version": versioneer.get_version().split("+", 1)[0],  # FILEVERSION
+        #   "product_version": versioneer.get_version(),
+        #   ...
+
+        class cmd_build_exe(_build_exe):
+            def run(self):
+                root = get_root()
+                cfg = get_config_from_root(root)
+                versions = get_versions()
+                target_versionfile = cfg.versionfile_source
+                print("UPDATING %s" % target_versionfile)
+                write_to_version_file(target_versionfile, versions)
+
+                _build_exe.run(self)
+                os.unlink(target_versionfile)
+                with open(cfg.versionfile_source, "w") as f:
+                    LONG = LONG_VERSION_PY[cfg.VCS]
+                    f.write(LONG %
+                            {"DOLLAR": "$",
+                             "STYLE": cfg.style,
+                             "TAG_PREFIX": cfg.tag_prefix,
+                             "PARENTDIR_PREFIX": cfg.parentdir_prefix,
+                             "VERSIONFILE_SOURCE": cfg.versionfile_source,
+                             })
+        cmds["build_exe"] = cmd_build_exe
+        del cmds["build_py"]
+
+    if 'py2exe' in sys.modules:  # py2exe enabled?
+        from py2exe.distutils_buildexe import py2exe as _py2exe
+
+        class cmd_py2exe(_py2exe):
+            def run(self):
+                root = get_root()
+                cfg = get_config_from_root(root)
+                versions = get_versions()
+                target_versionfile = cfg.versionfile_source
+                print("UPDATING %s" % target_versionfile)
+                write_to_version_file(target_versionfile, versions)
+
+                _py2exe.run(self)
+                os.unlink(target_versionfile)
+                with open(cfg.versionfile_source, "w") as f:
+                    LONG = LONG_VERSION_PY[cfg.VCS]
+                    f.write(LONG %
+                            {"DOLLAR": "$",
+                             "STYLE": cfg.style,
+                             "TAG_PREFIX": cfg.tag_prefix,
+                             "PARENTDIR_PREFIX": cfg.parentdir_prefix,
+                             "VERSIONFILE_SOURCE": cfg.versionfile_source,
+                             })
+        cmds["py2exe"] = cmd_py2exe
+
+    # we override different "sdist" commands for both environments
+    if 'sdist' in cmds:
+        _sdist = cmds['sdist']
+    elif "setuptools" in sys.modules:
+        from setuptools.command.sdist import sdist as _sdist
+    else:
+        from distutils.command.sdist import sdist as _sdist
+
+    class cmd_sdist(_sdist):
+        def run(self):
+            versions = get_versions()
+            self._versioneer_generated_versions = versions
+            # unless we update this, the command will keep using the old
+            # version
+            self.distribution.metadata.version = versions["version"]
+            return _sdist.run(self)
+
+        def make_release_tree(self, base_dir, files):
+            root = get_root()
+            cfg = get_config_from_root(root)
+            _sdist.make_release_tree(self, base_dir, files)
+            # now locate _version.py in the new base_dir directory
+            # (remembering that it may be a hardlink) and replace it with an
+            # updated value
+            target_versionfile = os.path.join(base_dir, cfg.versionfile_source)
+            print("UPDATING %s" % target_versionfile)
+            write_to_version_file(target_versionfile,
+                                  self._versioneer_generated_versions)
+    cmds["sdist"] = cmd_sdist
+
+    return cmds
+
+
+CONFIG_ERROR = """
+setup.cfg is missing the necessary Versioneer configuration. You need
+a section like:
+
+ [versioneer]
+ VCS = git
+ style = pep440
+ versionfile_source = src/myproject/_version.py
+ versionfile_build = myproject/_version.py
+ tag_prefix =
+ parentdir_prefix = myproject-
+
+You will also need to edit your setup.py to use the results:
+
+ import versioneer
+ setup(version=versioneer.get_version(),
+       cmdclass=versioneer.get_cmdclass(), ...)
+
+Please read the docstring in ./versioneer.py for configuration instructions,
+edit setup.cfg, and re-run the installer or 'python versioneer.py setup'.
+"""
+
+SAMPLE_CONFIG = """
+# See the docstring in versioneer.py for instructions. Note that you must
+# re-run 'versioneer.py setup' after changing this section, and commit the
+# resulting files.
+
+[versioneer]
+#VCS = git
+#style = pep440
+#versionfile_source =
+#versionfile_build =
+#tag_prefix =
+#parentdir_prefix =
+
+"""
+
+INIT_PY_SNIPPET = """
+from ._version import get_versions
+__version__ = get_versions()['version']
+del get_versions
+"""
+
+
+def do_setup():
+    """Run the main VCS-independent setup function for installing Versioneer."""
+    root = get_root()
+    try:
+        cfg = get_config_from_root(root)
+    except (EnvironmentError, configparser.NoSectionError,
+            configparser.NoOptionError) as e:
+        if isinstance(e, (EnvironmentError, configparser.NoSectionError)):
+            print("Adding sample versioneer config to setup.cfg",
+                  file=sys.stderr)
+            with open(os.path.join(root, "setup.cfg"), "a") as f:
+                f.write(SAMPLE_CONFIG)
+        print(CONFIG_ERROR, file=sys.stderr)
+        return 1
+
+    print(" creating %s" % cfg.versionfile_source)
+    with open(cfg.versionfile_source, "w") as f:
+        LONG = LONG_VERSION_PY[cfg.VCS]
+        f.write(LONG % {"DOLLAR": "$",
+                        "STYLE": cfg.style,
+                        "TAG_PREFIX": cfg.tag_prefix,
+                        "PARENTDIR_PREFIX": cfg.parentdir_prefix,
+                        "VERSIONFILE_SOURCE": cfg.versionfile_source,
+                        })
+
+    ipy = os.path.join(os.path.dirname(cfg.versionfile_source),
+                       "__init__.py")
+    if os.path.exists(ipy):
+        try:
+            with open(ipy, "r") as f:
+                old = f.read()
+        except EnvironmentError:
+            old = ""
+        if INIT_PY_SNIPPET not in old:
+            print(" appending to %s" % ipy)
+            with open(ipy, "a") as f:
+                f.write(INIT_PY_SNIPPET)
+        else:
+            print(" %s unmodified" % ipy)
+    else:
+        print(" %s doesn't exist, ok" % ipy)
+        ipy = None
+
+    # Make sure both the top-level "versioneer.py" and versionfile_source
+    # (PKG/_version.py, used by runtime code) are in MANIFEST.in, so
+    # they'll be copied into source distributions. Pip won't be able to
+    # install the package without this.
+    manifest_in = os.path.join(root, "MANIFEST.in")
+    simple_includes = set()
+    try:
+        with open(manifest_in, "r") as f:
+            for line in f:
+                if line.startswith("include "):
+                    for include in line.split()[1:]:
+                        simple_includes.add(include)
+    except EnvironmentError:
+        pass
+    # That doesn't cover everything MANIFEST.in can do
+    # (http://docs.python.org/2/distutils/sourcedist.html#commands), so
+    # it might give some false negatives. Appending redundant 'include'
+    # lines is safe, though.
+    if "versioneer.py" not in simple_includes:
+        print(" appending 'versioneer.py' to MANIFEST.in")
+        with open(manifest_in, "a") as f:
+            f.write("include versioneer.py\n")
+    else:
+        print(" 'versioneer.py' already in MANIFEST.in")
+    if cfg.versionfile_source not in simple_includes:
+        print(" appending versionfile_source ('%s') to MANIFEST.in" %
+              cfg.versionfile_source)
+        with open(manifest_in, "a") as f:
+            f.write("include %s\n" % cfg.versionfile_source)
+    else:
+        print(" versionfile_source already in MANIFEST.in")
+
+    # Make VCS-specific changes. For git, this means creating/changing
+    # .gitattributes to mark _version.py for export-subst keyword
+    # substitution.
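+    # Illustrative note (an assumption, not upstream Versioneer code): for a
+    # config with versionfile_source = numba_dppy/_version.py, the call below
+    # leaves .gitattributes containing a line like
+    #
+    #     numba_dppy/_version.py export-subst
+    #
+    # so that 'git archive' tarballs ship a _version.py with its version
+    # keywords already expanded.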
+    do_vcs_install(manifest_in, cfg.versionfile_source, ipy)
+    return 0
+
+
+def scan_setup_py():
+    """Validate the contents of setup.py against Versioneer's expectations."""
+    found = set()
+    setters = False
+    errors = 0
+    with open("setup.py", "r") as f:
+        for line in f.readlines():
+            if "import versioneer" in line:
+                found.add("import")
+            if "versioneer.get_cmdclass()" in line:
+                found.add("cmdclass")
+            if "versioneer.get_version()" in line:
+                found.add("get_version")
+            if "versioneer.VCS" in line:
+                setters = True
+            if "versioneer.versionfile_source" in line:
+                setters = True
+    if len(found) != 3:
+        print("")
+        print("Your setup.py appears to be missing some important items")
+        print("(but I might be wrong). Please make sure it has something")
+        print("roughly like the following:")
+        print("")
+        print(" import versioneer")
+        print(" setup( version=versioneer.get_version(),")
+        print("        cmdclass=versioneer.get_cmdclass(), ...)")
+        print("")
+        errors += 1
+    if setters:
+        print("You should remove lines like 'versioneer.VCS = ' and")
+        print("'versioneer.versionfile_source = ' . This configuration")
+        print("now lives in setup.cfg, and should be removed from setup.py")
+        print("")
+        errors += 1
+    return errors
+
+
+if __name__ == "__main__":
+    cmd = sys.argv[1]
+    if cmd == "setup":
+        errors = do_setup()
+        errors += scan_setup_py()
+        if errors:
+            sys.exit(1)
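+
+# Illustrative one-time installation flow (a sketch based on the 'setup'
+# command handler above; the 'git add' file list is an assumption about this
+# repository's layout, not part of upstream Versioneer):
+#
+#     $ python versioneer.py setup
+#     $ git add .gitattributes MANIFEST.in setup.cfg numba_dppy/_version.py
+#     $ git commit -m "install versioneer"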