From 6401b6f7c0bde5bf608052a36c1fd3cb92ac6378 Mon Sep 17 00:00:00 2001 From: Xiyou Zhou Date: Wed, 24 Feb 2021 19:27:00 -0800 Subject: [PATCH 01/69] Fix legacy code on target host --- tests/python/unittest/test_target_target.py | 35 ++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/tests/python/unittest/test_target_target.py b/tests/python/unittest/test_target_target.py index 7b998bef34a5..b9117072c695 100644 --- a/tests/python/unittest/test_target_target.py +++ b/tests/python/unittest/test_target_target.py @@ -216,7 +216,30 @@ def test_target_host_warning(): attributes fails as expected. """ with pytest.raises(ValueError): - tgt = tvm.target.Target("cuda --host nvidia/jetson-nano", "llvm") + tvm.target.Target("cuda --host nvidia/jetson-nano", "llvm") + + +def test_target_host_merge_0(): + tgt = tvm.target.Target(tvm.target.Target("cuda --host nvidia/jetson-nano"), None) + assert tgt.kind.name == "cuda" + assert tgt.host.kind.name == "cuda" + assert tgt.host.attrs["arch"] == "sm_53" + assert tgt.host.attrs["shared_memory_per_block"] == 49152 + assert tgt.host.attrs["max_threads_per_block"] == 1024 + assert tgt.host.attrs["thread_warp_size"] == 32 + assert tgt.host.attrs["registers_per_block"] == 32768 + + +def test_target_host_merge_1(): + tgt = tvm.target.Target("cuda --host nvidia/jetson-nano") + tgt = tvm.target.Target(tgt, tgt.host) + assert tgt.kind.name == "cuda" + assert tgt.host.kind.name == "cuda" + assert tgt.host.attrs["arch"] == "sm_53" + assert tgt.host.attrs["shared_memory_per_block"] == 49152 + assert tgt.host.attrs["max_threads_per_block"] == 1024 + assert tgt.host.attrs["thread_warp_size"] == 32 + assert tgt.host.attrs["registers_per_block"] == 32768 if __name__ == "__main__": @@ -226,4 +249,14 @@ def test_target_host_warning(): test_target_config() test_config_map() test_composite_target() + test_target_tag_0() + test_target_tag_1() test_list_kinds() + test_target_host_tags() + test_target_host_tag_dict() + test_target_host_single_dict() + test_target_host_single_string() + test_target_host_single_string_with_tag() + test_target_host_warning() + test_target_host_merge_0() + test_target_host_merge_1() \ No newline at end of file From 0167a5f2fe2796f18b85bec57ff02f29534d1060 Mon Sep 17 00:00:00 2001 From: Xiyou Zhou Date: Thu, 25 Feb 2021 00:44:35 -0800 Subject: [PATCH 02/69] Modify legacy code for target host change --- python/tvm/autotvm/graph_tuner/base_graph_tuner.py | 2 ++ python/tvm/autotvm/task/relay_integration.py | 5 +++++ python/tvm/autotvm/task/task.py | 4 ++-- python/tvm/relay/build_module.py | 5 ++++- src/target/target.cc | 2 +- 5 files changed, 14 insertions(+), 4 deletions(-) diff --git a/python/tvm/autotvm/graph_tuner/base_graph_tuner.py b/python/tvm/autotvm/graph_tuner/base_graph_tuner.py index 741b05f4c453..c802a4289385 100644 --- a/python/tvm/autotvm/graph_tuner/base_graph_tuner.py +++ b/python/tvm/autotvm/graph_tuner/base_graph_tuner.py @@ -525,6 +525,8 @@ def _callback(_, inputs, results): continue records = [] + target = Target(target, target_host) + target_host = target.host task = autotvm.task.create( "layout_transform", args=args, target=self._target, target_host=target_host ) diff --git a/python/tvm/autotvm/task/relay_integration.py b/python/tvm/autotvm/task/relay_integration.py index fe88d1741d60..b3bc59309f27 100644 --- a/python/tvm/autotvm/task/relay_integration.py +++ b/python/tvm/autotvm/task/relay_integration.py @@ -25,6 +25,7 @@ import tvm from tvm.autotvm.task.dispatcher import DispatchContext, 
FallbackContext +from tvm.target import Target from .task import create from .topi_integration import TaskExtractEnv @@ -89,6 +90,8 @@ def extract_from_program(mod, params, target, target_host=None, ops=None): task: Array of autotvm.task.Task collected tasks """ + target = Target(target, target_host) + target_host = target.host return extract_from_multiple_program([mod], [params], target, target_host, ops) @@ -152,6 +155,8 @@ def extract_from_multiple_program(mods, params, target, target_host=None, ops=No tasks = [] for task_name, args in env.get_tasks(): try: + target = Target(target, target_host) + target_host = target.host tsk = create(task_name, args, target=target, target_host=target_host) tasks.append(tsk) except topi.InvalidShapeError: diff --git a/python/tvm/autotvm/task/task.py b/python/tvm/autotvm/task/task.py index 52f0996c800c..fb7265230594 100644 --- a/python/tvm/autotvm/task/task.py +++ b/python/tvm/autotvm/task/task.py @@ -458,8 +458,8 @@ def create(task_name, args, target, target_host=None): ret.config_space.code_hash = getattr(sch, "code_hash", None) ret.flop = ret.config_space.flop or compute_flop(sch) - ret.target = target - ret.target_host = target_host + ret.target = Target(target, target_host) + ret.target_host = target.host return ret diff --git a/python/tvm/relay/build_module.py b/python/tvm/relay/build_module.py index f05e105ed2a2..05f41dd8b97d 100644 --- a/python/tvm/relay/build_module.py +++ b/python/tvm/relay/build_module.py @@ -129,7 +129,8 @@ def build(self, mod, target=None, target_host=None, params=None): old_autotvm_silent = autotvm.GLOBAL_SCOPE.silent autotvm.GLOBAL_SCOPE.silent = use_auto_scheduler - self._build(mod, target, target_host) + target = Target(target, target_host) + self._build(mod, target, target.host) autotvm.GLOBAL_SCOPE.silent = old_autotvm_silent # Get artifacts @@ -252,6 +253,8 @@ def build(mod, target=None, target_host=None, params=None, mod_name="default"): target = _update_target(target) + target = Target(target, target_host) + target_host = target.host if isinstance(target_host, (str, Target)): target_host = Target(target_host) elif target_host: diff --git a/src/target/target.cc b/src/target/target.cc index b5ca4c38bbb9..833d131764a8 100644 --- a/src/target/target.cc +++ b/src/target/target.cc @@ -375,7 +375,7 @@ Target::Target(const Map& config) { Target::Target(Target target, Target host) { ObjectPtr n = make_object(*target.get()); - CHECK(!n->host.defined()) + CHECK(!n->host.defined() && target -> host != host) << "ValueError: Adding a host to a target whose host field has been defined"; // add target host into host field n->host = std::move(host); From 2a3c502150be02c18549d59abfd6b20748ca9149 Mon Sep 17 00:00:00 2001 From: Xiyou Zhou Date: Thu, 25 Feb 2021 01:11:00 -0800 Subject: [PATCH 03/69] Add tests and fix merge issue --- tests/python/unittest/test_target_target.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/tests/python/unittest/test_target_target.py b/tests/python/unittest/test_target_target.py index b9117072c695..30876d50f7dc 100644 --- a/tests/python/unittest/test_target_target.py +++ b/tests/python/unittest/test_target_target.py @@ -231,15 +231,15 @@ def test_target_host_merge_0(): def test_target_host_merge_1(): - tgt = tvm.target.Target("cuda --host nvidia/jetson-nano") + tgt = tvm.target.Target("cuda --host llvm") tgt = tvm.target.Target(tgt, tgt.host) assert tgt.kind.name == "cuda" - assert tgt.host.kind.name == "cuda" - assert tgt.host.attrs["arch"] == "sm_53" - assert 
tgt.host.attrs["shared_memory_per_block"] == 49152 - assert tgt.host.attrs["max_threads_per_block"] == 1024 - assert tgt.host.attrs["thread_warp_size"] == 32 - assert tgt.host.attrs["registers_per_block"] == 32768 + assert tgt.host.kind.name == "llvm" + + +def test_target_host_merge_2(): + with pytest.raises(ValueError): + tvm.target.Target(tvm.target.Target("cuda --host llvm"), tvm.target.Target("llvm")) if __name__ == "__main__": @@ -259,4 +259,5 @@ def test_target_host_merge_1(): test_target_host_single_string_with_tag() test_target_host_warning() test_target_host_merge_0() - test_target_host_merge_1() \ No newline at end of file + test_target_host_merge_1() + test_target_host_merge_2() \ No newline at end of file From 511ce56b5febf1fc62cbc0411e7ad2589d3b68b3 Mon Sep 17 00:00:00 2001 From: Xiyou Zhou Date: Thu, 25 Feb 2021 01:12:03 -0800 Subject: [PATCH 04/69] Add condition for same host --- src/target/target.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/target/target.cc b/src/target/target.cc index 833d131764a8..5f0e54adac69 100644 --- a/src/target/target.cc +++ b/src/target/target.cc @@ -375,7 +375,7 @@ Target::Target(const Map& config) { Target::Target(Target target, Target host) { ObjectPtr n = make_object(*target.get()); - CHECK(!n->host.defined() && target -> host != host) + CHECK((!n->host.defined()) || n->host == host) << "ValueError: Adding a host to a target whose host field has been defined"; // add target host into host field n->host = std::move(host); From 69601a7019a96c6c58eaf328281b1b8e9600482a Mon Sep 17 00:00:00 2001 From: Xiyou Zhou Date: Thu, 25 Feb 2021 17:10:15 -0800 Subject: [PATCH 05/69] Modify all files for new target host api compatibility --- python/tvm/auto_scheduler/relay_integration.py | 6 ++---- python/tvm/auto_scheduler/search_task.py | 11 +++++++---- python/tvm/autotvm/measure/measure_methods.py | 4 ++++ python/tvm/autotvm/task/relay_integration.py | 6 ++++-- python/tvm/autotvm/task/task.py | 11 +++++++---- python/tvm/contrib/peak.py | 15 ++++++++++++++- python/tvm/driver/build_module.py | 12 ++++++++++-- python/tvm/driver/tvmc/autotuner.py | 13 +++++++++++-- python/tvm/driver/tvmc/compiler.py | 3 ++- python/tvm/relay/backend/_backend.py | 2 ++ python/tvm/relay/backend/vm.py | 10 ++++++++++ python/tvm/relay/build_module.py | 7 ++++--- python/tvm/target/target.py | 2 +- tests/python/unittest/test_target_target.py | 8 +++++++- 14 files changed, 85 insertions(+), 25 deletions(-) diff --git a/python/tvm/auto_scheduler/relay_integration.py b/python/tvm/auto_scheduler/relay_integration.py index b39aba227a88..b78d31670228 100644 --- a/python/tvm/auto_scheduler/relay_integration.py +++ b/python/tvm/auto_scheduler/relay_integration.py @@ -108,10 +108,8 @@ def extract_tasks( """ # pylint: disable=import-outside-toplevel - if isinstance(target, str): - target = tvm.target.Target(target) - if isinstance(target_host, str): - target_host = tvm.target.Target(target_host) + target = tvm.target.Target(target, target_host) + target_host = target.host # Run the compiler to collect all TOPI calls during compilation. 
env = TracingEnvironment( diff --git a/python/tvm/auto_scheduler/search_task.py b/python/tvm/auto_scheduler/search_task.py index 175c2fa06c39..1c13adfc9894 100644 --- a/python/tvm/auto_scheduler/search_task.py +++ b/python/tvm/auto_scheduler/search_task.py @@ -228,6 +228,9 @@ def __init__( if isinstance(target_host, str): target_host = Target(target_host) + target = Target(target, target_host) + target_host = target.host + if layout_rewrite_option is None: layout_rewrite_option = LayoutRewriteOption.get_target_default(target) @@ -322,8 +325,8 @@ def __getstate__(self): return { "compute_dag": self.compute_dag, "workload_key": self.workload_key, - "target": self.target, - "target_host": self.target_host, + "target": Target(self.target, self.target_host), + "target_host": Target(self.target, self.target_host).host, "hardware_params": self.hardware_params, "layout_rewrite_option": self.layout_rewrite_option, } @@ -346,8 +349,8 @@ def __setstate__(self, state): _ffi_api.SearchTask, state["compute_dag"], state["workload_key"], - state["target"], - state["target_host"], + Target(state["target"], state["target_host"]), + Target(state["target"], state["target_host"]).host, state["hardware_params"], state["layout_rewrite_option"], ) diff --git a/python/tvm/autotvm/measure/measure_methods.py b/python/tvm/autotvm/measure/measure_methods.py index ffe4b97e33db..538bccd980e9 100644 --- a/python/tvm/autotvm/measure/measure_methods.py +++ b/python/tvm/autotvm/measure/measure_methods.py @@ -397,6 +397,10 @@ def set_task(self, task): def _build_func_common(measure_input, check_gpu=None, cuda_arch=None, build_option=None): """Common part for building a configuration""" target, task, config = measure_input + + target = tvm.target.Target(target, task.target_host) + task.target_host = target.host + with target: s, args = task.instantiate(config) diff --git a/python/tvm/autotvm/task/relay_integration.py b/python/tvm/autotvm/task/relay_integration.py index b3bc59309f27..dce15a23969a 100644 --- a/python/tvm/autotvm/task/relay_integration.py +++ b/python/tvm/autotvm/task/relay_integration.py @@ -151,12 +151,14 @@ def extract_from_multiple_program(mods, params, target, target_host=None, ops=No logger.disabled = old_state + # merge target and target host + target = Target(target, target_host) + target_host = target.host + # create tasks for target tasks = [] for task_name, args in env.get_tasks(): try: - target = Target(target, target_host) - target_host = target.host tsk = create(task_name, args, target=target, target_host=target_host) tasks.append(tsk) except topi.InvalidShapeError: diff --git a/python/tvm/autotvm/task/task.py b/python/tvm/autotvm/task/task.py index fb7265230594..a96ae7311fc9 100644 --- a/python/tvm/autotvm/task/task.py +++ b/python/tvm/autotvm/task/task.py @@ -175,6 +175,7 @@ def __getstate__(self): # and restore the function by name when unpickling it. 
import cloudpickle # pylint: disable=import-outside-toplevel + self.target = Target(self.target, self.target_host) return { "name": self.name, "args": self.args, @@ -182,7 +183,7 @@ def __getstate__(self): "config_space": self.config_space, "flop": self.flop, "target": self.target, - "target_host": self.target_host, + "target_host": self.target.host, "func": cloudpickle.dumps(self.func), } @@ -195,8 +196,8 @@ def __setstate__(self, state): self.config_space = state["config_space"] self.func = cloudpickle.loads(state["func"]) self.flop = state["flop"] - self.target = state["target"] - self.target_host = state["target_host"] + self.target = Target(state["target"], state["target_host"]) + self.target_host = self.target.host def __repr__(self): return "Task(func_name=%s, args=%s, kwargs=%s, workload=%s)" % ( @@ -448,6 +449,8 @@ def create(task_name, args, target, target_host=None): if isinstance(target, str): target = Target(target) + target = Target(target, target_host) + # init config space ret.config_space = ConfigSpace() @@ -458,7 +461,7 @@ def create(task_name, args, target, target_host=None): ret.config_space.code_hash = getattr(sch, "code_hash", None) ret.flop = ret.config_space.flop or compute_flop(sch) - ret.target = Target(target, target_host) + ret.target = target ret.target_host = target.host return ret diff --git a/python/tvm/contrib/peak.py b/python/tvm/contrib/peak.py index 62ee9fea400b..e0e020e4de95 100644 --- a/python/tvm/contrib/peak.py +++ b/python/tvm/contrib/peak.py @@ -106,6 +106,9 @@ def measure_bandwidth_sum( s[y].bind(yi, te.thread_axis("threadIdx.x")) s[y].unroll(k) + target = tvm.target.Target(target, target_host) + target_host = target.host + try: func = tvm.build(s, [x, y], target, target_host=target_host) @@ -153,6 +156,9 @@ def measure_bandwidth_all_types( """ max_threads = target.max_num_threads + target = tvm.target.Target(target, target_host) + target_host = target.host + result = [] for base_type in ["float"]: for bits in [32]: @@ -229,6 +235,9 @@ def measure_compute_mad( max_threads = target.max_num_threads + target = tvm.target.Target(target, target_host) + target_host = target.host + base_type = str(base_type) + str(bits) dtype = base_type if lanes == 1 else base_type + "x" + str(lanes) @@ -313,6 +322,9 @@ def measure_compute_all_types( result: list a list of (type_name, GFLOPS/GIOPS) pairs """ + target = tvm.target.Target(target, target_host) + target_host = target.host + result = [] for base_type in ["float", "int"]: for bits in [16, 32, 64]: @@ -357,7 +369,8 @@ def measure_peak_all(target, target_host, host, port): port: int """ - target = tvm.target.Target(target) + target = tvm.target.Target(target, target_host) + target_host = target.host remote = rpc.connect(host, port) n_times = 20 diff --git a/python/tvm/driver/build_module.py b/python/tvm/driver/build_module.py index 5eaecb422163..8954c5b83f16 100644 --- a/python/tvm/driver/build_module.py +++ b/python/tvm/driver/build_module.py @@ -231,8 +231,8 @@ def _build_for_device(input_mod, target, target_host): mdev : tvm.module A module that contains device code. 
""" - target = Target(target) - target_host = Target(target_host) + target = Target(target, target_host) + target_host = target.host device_type = ndarray.context(target.kind.name, 0).device_type mod_mixed = input_mod @@ -399,6 +399,9 @@ def build(inputs, args=None, target=None, target_host=None, name="default_functi if not isinstance(mod, tvm.IRModule): raise ValueError("inputs must be Schedule, IRModule," "or dict of str to IRModule.") + target = Target(target, target_host) + target_host = target.host + if not target_host: for tar, _ in target_input_mod.items(): tar = Target(tar) @@ -409,6 +412,9 @@ def build(inputs, args=None, target=None, target_host=None, name="default_functi if not target_host: target_host = "llvm" if tvm.runtime.enabled("llvm") else "stackvm" + target = Target(target, target_host) + target_host = target.host + mod_host_all = tvm.IRModule({}) device_modules = [] @@ -427,6 +433,8 @@ def build(inputs, args=None, target=None, target_host=None, name="default_functi if not isinstance(target_host, Target): target_host = Target(target_host) + target = Target(target, target_host) + target_host = target.host if ( target_host.attrs.get("runtime", tvm.runtime.String("c++")) == "c" and target_host.attrs.get("system-lib", 0).value == 1 diff --git a/python/tvm/driver/tvmc/autotuner.py b/python/tvm/driver/tvmc/autotuner.py index 187b7c5d2a31..22ee0f226a54 100644 --- a/python/tvm/driver/tvmc/autotuner.py +++ b/python/tvm/driver/tvmc/autotuner.py @@ -20,6 +20,7 @@ import os.path import logging import time +import tvm from urllib.parse import urlparse @@ -242,6 +243,8 @@ def drive_tune(args): ) target, extra_targets = common.target_from_cli(args.target) + target = tvm.target.Target(target, args.target_host) + target_host = target.host mod, params = frontends.load_model(args.FILE, args.model_format, shape_dict=args.input_shapes) for codegen_from_cli in extra_targets: @@ -298,7 +301,7 @@ def drive_tune(args): mod=mod, params=params, target=target, - target_host=args.target_host, + target_host=target_host, alter_layout=args.desired_layout, hardware_params=hardware_params, include_simple_tasks=args.include_simple_tasks, @@ -321,7 +324,7 @@ def drive_tune(args): mod=mod, params=params, target=target, - target_host=args.target_host, + target_host=target_host, alter_layout=args.desired_layout, ) @@ -365,6 +368,9 @@ def autotvm_get_tuning_tasks(mod, params, target, target_host=None, alter_layout if alter_layout: mod = common.convert_graph_layout(mod, alter_layout) + target = tvm.target.Target(target, target_host) + target_host = target.host + tasks = autotvm.task.extract_from_program( mod["main"], target=target, @@ -413,6 +419,9 @@ def autoscheduler_get_tuning_tasks( if alter_layout: mod = common.convert_graph_layout(mod, alter_layout) + target = tvm.target.Target(target, target_host) + target_host = target.host + # Extract the tasks tasks, task_weights = auto_scheduler.extract_tasks( mod["main"], diff --git a/python/tvm/driver/tvmc/compiler.py b/python/tvm/driver/tvmc/compiler.py index fc1805ee0ab4..0edd59460250 100644 --- a/python/tvm/driver/tvmc/compiler.py +++ b/python/tvm/driver/tvmc/compiler.py @@ -191,7 +191,8 @@ def compile_model( mod = common.convert_graph_layout(mod, alter_layout) tvm_target, extra_targets = common.target_from_cli(target) - target_host = tvm_target if not target_host else target_host + tvm_target = tvm.target.Target(tvm_target, tvm_target if not target_host else target_host) + target_host = tvm_target.host for codegen_from_cli in extra_targets: codegen = 
composite_target.get_codegen_by_target(codegen_from_cli["name"]) diff --git a/python/tvm/relay/backend/_backend.py b/python/tvm/relay/backend/_backend.py index 65b0c0ba87c7..821178a05c8c 100644 --- a/python/tvm/relay/backend/_backend.py +++ b/python/tvm/relay/backend/_backend.py @@ -80,6 +80,8 @@ def build(mod, target, target_host=None): """ if target_host == "": target_host = None + target = tvm.target.Target(target, target_host) + target_host = target.host return tvm.driver.build(mod, target=target, target_host=target_host) diff --git a/python/tvm/relay/backend/vm.py b/python/tvm/relay/backend/vm.py index 0f7875a9202e..b72491cf7e2c 100644 --- a/python/tvm/relay/backend/vm.py +++ b/python/tvm/relay/backend/vm.py @@ -65,6 +65,8 @@ def compile(mod, target=None, target_host=None, params=None): compiler = VMCompiler() if params: compiler.set_params(params) + target = Target(target, target_host) + target_host = target.host compiler.lower(mod, target, target_host) compiler.codegen() return compiler.get_exec() @@ -130,6 +132,10 @@ def lower(self, mod, target=None, target_host=None): """ target = self._update_target(target) target_host = self._update_target_host(target, target_host) + + target = Target(target, target_host) + target_host = target.host + tophub_context = self._tophub_context(target) with tophub_context: self._lower(mod, target, target_host) @@ -167,6 +173,10 @@ def optimize(self, mod, target=None, target_host=None, params=None): """ target = self._update_target(target) target_host = self._update_target_host(target, target_host) + + target = Target(target, target_host) + target_host = target.host + if params: self.set_params(params) return self._optimize(mod, target, target_host), self.get_params() diff --git a/python/tvm/relay/build_module.py b/python/tvm/relay/build_module.py index 05f41dd8b97d..26f90b54cd85 100644 --- a/python/tvm/relay/build_module.py +++ b/python/tvm/relay/build_module.py @@ -130,7 +130,8 @@ def build(self, mod, target=None, target_host=None, params=None): autotvm.GLOBAL_SCOPE.silent = use_auto_scheduler target = Target(target, target_host) - self._build(mod, target, target.host) + target_host = target.host + self._build(mod, target, target_host) autotvm.GLOBAL_SCOPE.silent = old_autotvm_silent # Get artifacts @@ -253,12 +254,12 @@ def build(mod, target=None, target_host=None, params=None, mod_name="default"): target = _update_target(target) - target = Target(target, target_host) - target_host = target.host if isinstance(target_host, (str, Target)): target_host = Target(target_host) elif target_host: raise ValueError("target host must be the type of str, " + "tvm.target.Target, or None") + target = Target(target, target_host) + target_host = target.host # If current dispatch context is fallback context (the default root context), # then load pre-tuned parameters from TopHub diff --git a/python/tvm/target/target.py b/python/tvm/target/target.py index 8c60260e640a..f6dcb675fc9a 100644 --- a/python/tvm/target/target.py +++ b/python/tvm/target/target.py @@ -95,7 +95,7 @@ def __init__(self, tag_or_str_or_dict, host_tag_or_str_or_dict=None): the possible values are same as tag_or_str_or_dict. 
""" if not isinstance(tag_or_str_or_dict, (dict, str, Target)): - raise ValueError("target has to be a string or dictionary.") + raise ValueError("target (host) has to be a string or dictionary.") if host_tag_or_str_or_dict is not None: self.__init_handle_by_constructor__( _ffi_api.Target, Target(tag_or_str_or_dict), Target(host_tag_or_str_or_dict) diff --git a/tests/python/unittest/test_target_target.py b/tests/python/unittest/test_target_target.py index 30876d50f7dc..05cdcf42f1ff 100644 --- a/tests/python/unittest/test_target_target.py +++ b/tests/python/unittest/test_target_target.py @@ -242,6 +242,11 @@ def test_target_host_merge_2(): tvm.target.Target(tvm.target.Target("cuda --host llvm"), tvm.target.Target("llvm")) +def test_target_host_merge_3(): + with pytest.raises(ValueError): + tvm.target.Target(tvm.target.Target("cuda --host llvm"), 12.34) + + if __name__ == "__main__": test_target_dispatch() test_target_string_parse() @@ -260,4 +265,5 @@ def test_target_host_merge_2(): test_target_host_warning() test_target_host_merge_0() test_target_host_merge_1() - test_target_host_merge_2() \ No newline at end of file + test_target_host_merge_2() + test_target_host_merge_3() \ No newline at end of file From 23187d87c1a40c7bcf0672eb4ff7b48bdd95fba5 Mon Sep 17 00:00:00 2001 From: Xiyou Zhou Date: Thu, 25 Feb 2021 23:05:33 -0800 Subject: [PATCH 06/69] Add newline --- tests/python/unittest/test_target_target.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/python/unittest/test_target_target.py b/tests/python/unittest/test_target_target.py index 05cdcf42f1ff..a19c9aacef4f 100644 --- a/tests/python/unittest/test_target_target.py +++ b/tests/python/unittest/test_target_target.py @@ -266,4 +266,4 @@ def test_target_host_merge_3(): test_target_host_merge_0() test_target_host_merge_1() test_target_host_merge_2() - test_target_host_merge_3() \ No newline at end of file + test_target_host_merge_3() From 85b27db3c455117c709e907fa35c2bc681a6200e Mon Sep 17 00:00:00 2001 From: Xiyou Zhou Date: Thu, 25 Feb 2021 23:46:11 -0800 Subject: [PATCH 07/69] Change import format --- python/tvm/driver/tvmc/autotuner.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/tvm/driver/tvmc/autotuner.py b/python/tvm/driver/tvmc/autotuner.py index 22ee0f226a54..5f78b5eb8fd1 100644 --- a/python/tvm/driver/tvmc/autotuner.py +++ b/python/tvm/driver/tvmc/autotuner.py @@ -20,10 +20,11 @@ import os.path import logging import time -import tvm from urllib.parse import urlparse +import tvm + from tvm import autotvm, auto_scheduler from tvm.autotvm.tuner import GATuner from tvm.autotvm.tuner import GridSearchTuner From 7e4eb0a9db0ed023018923f36f4ad629d2d42d04 Mon Sep 17 00:00:00 2001 From: Xiyou Zhou Date: Fri, 26 Feb 2021 10:17:36 -0800 Subject: [PATCH 08/69] Optimize test file --- tests/python/unittest/test_target_target.py | 23 +++------------------ 1 file changed, 3 insertions(+), 20 deletions(-) diff --git a/tests/python/unittest/test_target_target.py b/tests/python/unittest/test_target_target.py index a19c9aacef4f..49e2de889231 100644 --- a/tests/python/unittest/test_target_target.py +++ b/tests/python/unittest/test_target_target.py @@ -15,8 +15,9 @@ # specific language governing permissions and limitations # under the License. 
import json -import tvm +import sys import pytest +import tvm from tvm import te from tvm.target import cuda, rocm, mali, intel_graphics, arm_cpu, vta, bifrost, hexagon @@ -248,22 +249,4 @@ def test_target_host_merge_3(): if __name__ == "__main__": - test_target_dispatch() - test_target_string_parse() - test_target_create() - test_target_config() - test_config_map() - test_composite_target() - test_target_tag_0() - test_target_tag_1() - test_list_kinds() - test_target_host_tags() - test_target_host_tag_dict() - test_target_host_single_dict() - test_target_host_single_string() - test_target_host_single_string_with_tag() - test_target_host_warning() - test_target_host_merge_0() - test_target_host_merge_1() - test_target_host_merge_2() - test_target_host_merge_3() + sys.exit(pytest.main([__file__] + sys.argv[1:])) From 59457f625b2207290a70e9e40190015e4218ad67 Mon Sep 17 00:00:00 2001 From: Xiyou Zhou Date: Fri, 26 Feb 2021 10:30:30 -0800 Subject: [PATCH 09/69] Add match error info for unit tests --- tests/python/unittest/test_target_target.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/tests/python/unittest/test_target_target.py b/tests/python/unittest/test_target_target.py index 49e2de889231..c8fbcaaab096 100644 --- a/tests/python/unittest/test_target_target.py +++ b/tests/python/unittest/test_target_target.py @@ -216,7 +216,9 @@ def test_target_host_warning(): Confirm that constructing a target with invalid attributes fails as expected. """ - with pytest.raises(ValueError): + with pytest.raises( + ValueError, match="Adding a host to a target whose host field has been defined" + ): tvm.target.Target("cuda --host nvidia/jetson-nano", "llvm") @@ -239,12 +241,14 @@ def test_target_host_merge_1(): def test_target_host_merge_2(): - with pytest.raises(ValueError): + with pytest.raises( + ValueError, match="Adding a host to a target whose host field has been defined" + ): tvm.target.Target(tvm.target.Target("cuda --host llvm"), tvm.target.Target("llvm")) def test_target_host_merge_3(): - with pytest.raises(ValueError): + with pytest.raises(ValueError, match=r"target \(host\) has to be a string or dictionary."): tvm.target.Target(tvm.target.Target("cuda --host llvm"), 12.34) From b7e4c712bcc6d4243377b361a3f54685d229b63f Mon Sep 17 00:00:00 2001 From: Xiyou Zhou Date: Mon, 1 Mar 2021 16:22:13 -0800 Subject: [PATCH 10/69] Fix for heterogeneous targets --- python/tvm/relay/build_module.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/python/tvm/relay/build_module.py b/python/tvm/relay/build_module.py index 26f90b54cd85..2b6ef9393785 100644 --- a/python/tvm/relay/build_module.py +++ b/python/tvm/relay/build_module.py @@ -129,8 +129,11 @@ def build(self, mod, target=None, target_host=None, params=None): old_autotvm_silent = autotvm.GLOBAL_SCOPE.silent autotvm.GLOBAL_SCOPE.silent = use_auto_scheduler - target = Target(target, target_host) - target_host = target.host + # Assume the target host of all targets in heterogenous target are identical + for k, v in target.items(): + target[k] = Target(target[k], target_host) + target_host = target[k].host + self._build(mod, target, target_host) autotvm.GLOBAL_SCOPE.silent = old_autotvm_silent @@ -251,9 +254,9 @@ def build(mod, target=None, target_host=None, params=None, mod_name="default"): "instead of deprecated parameter mod (tvm.relay.function.Function)", DeprecationWarning, ) - + print(target) target = _update_target(target) - + print(target) if isinstance(target_host, (str, Target)): 
target_host = Target(target_host) elif target_host: From f5ccc507834a929efb79efa1265731b90ceee2f8 Mon Sep 17 00:00:00 2001 From: Xiyou Zhou Date: Mon, 1 Mar 2021 16:30:05 -0800 Subject: [PATCH 11/69] Fix format for dict iteration --- python/tvm/relay/build_module.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/python/tvm/relay/build_module.py b/python/tvm/relay/build_module.py index 2b6ef9393785..ec349ceb874d 100644 --- a/python/tvm/relay/build_module.py +++ b/python/tvm/relay/build_module.py @@ -130,7 +130,7 @@ def build(self, mod, target=None, target_host=None, params=None): autotvm.GLOBAL_SCOPE.silent = use_auto_scheduler # Assume the target host of all targets in heterogenous target are identical - for k, v in target.items(): + for k in target: target[k] = Target(target[k], target_host) target_host = target[k].host @@ -261,8 +261,9 @@ def build(mod, target=None, target_host=None, params=None, mod_name="default"): target_host = Target(target_host) elif target_host: raise ValueError("target host must be the type of str, " + "tvm.target.Target, or None") - target = Target(target, target_host) - target_host = target.host + for k in target: + target[k] = Target(target[k], target_host) + target_host = target[k].host # If current dispatch context is fallback context (the default root context), # then load pre-tuned parameters from TopHub From 11c77bafefff89d741414d66737dcf439c333ea5 Mon Sep 17 00:00:00 2001 From: Xiyou Zhou Date: Tue, 2 Mar 2021 15:55:32 -0800 Subject: [PATCH 12/69] Fix target host type error --- python/tvm/target/target.py | 6 ++++-- tests/python/unittest/test_target_target.py | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/python/tvm/target/target.py b/python/tvm/target/target.py index f6dcb675fc9a..3399ccd58144 100644 --- a/python/tvm/target/target.py +++ b/python/tvm/target/target.py @@ -94,9 +94,11 @@ def __init__(self, tag_or_str_or_dict, host_tag_or_str_or_dict=None): configuration options. When using a dictionary or json string to configure target, the possible values are same as tag_or_str_or_dict. 
""" - if not isinstance(tag_or_str_or_dict, (dict, str, Target)): - raise ValueError("target (host) has to be a string or dictionary.") + if tag_or_str_or_dict is None or not isinstance(tag_or_str_or_dict, (dict, str, Target)): + raise ValueError("target has to be a string or dictionary.") if host_tag_or_str_or_dict is not None: + if not isinstance(host_tag_or_str_or_dict, (dict, str, Target)): + raise ValueError("target host has to be a string or dictionary.") self.__init_handle_by_constructor__( _ffi_api.Target, Target(tag_or_str_or_dict), Target(host_tag_or_str_or_dict) ) diff --git a/tests/python/unittest/test_target_target.py b/tests/python/unittest/test_target_target.py index c8fbcaaab096..ad947e6503d9 100644 --- a/tests/python/unittest/test_target_target.py +++ b/tests/python/unittest/test_target_target.py @@ -248,7 +248,7 @@ def test_target_host_merge_2(): def test_target_host_merge_3(): - with pytest.raises(ValueError, match=r"target \(host\) has to be a string or dictionary."): + with pytest.raises(ValueError, match=r"target host has to be a string or dictionary."): tvm.target.Target(tvm.target.Target("cuda --host llvm"), 12.34) From 75434224b1f79b902ecc934a78a8519c2ba68b0a Mon Sep 17 00:00:00 2001 From: Xiyou Zhou Date: Tue, 2 Mar 2021 16:24:49 -0800 Subject: [PATCH 13/69] Skip one testcase for tvm infinite loop bug --- tests/python/unittest/test_target_target.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/python/unittest/test_target_target.py b/tests/python/unittest/test_target_target.py index ad947e6503d9..9947ccb4fdd5 100644 --- a/tests/python/unittest/test_target_target.py +++ b/tests/python/unittest/test_target_target.py @@ -18,8 +18,7 @@ import sys import pytest import tvm -from tvm import te -from tvm.target import cuda, rocm, mali, intel_graphics, arm_cpu, vta, bifrost, hexagon +from tvm.target import cuda, rocm, mali, intel_graphics, arm_cpu, vta, bifrost @tvm.target.generic_func @@ -247,8 +246,9 @@ def test_target_host_merge_2(): tvm.target.Target(tvm.target.Target("cuda --host llvm"), tvm.target.Target("llvm")) +@pytest.mark.skip(reason="Causing infinite loop for reason to be investigated") def test_target_host_merge_3(): - with pytest.raises(ValueError, match=r"target host has to be a string or dictionary."): + with pytest.raises(ValueError, match=r"target ddhost has to be a string or dictionary."): tvm.target.Target(tvm.target.Target("cuda --host llvm"), 12.34) From fbd597ab21b875c41c3d016f7853f4b195ceae35 Mon Sep 17 00:00:00 2001 From: Xiyou Zhou Date: Tue, 2 Mar 2021 18:16:48 -0800 Subject: [PATCH 14/69] Fixed bug for target map compatibility --- python/tvm/relay/backend/vm.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/python/tvm/relay/backend/vm.py b/python/tvm/relay/backend/vm.py index b72491cf7e2c..cec011ae2d8d 100644 --- a/python/tvm/relay/backend/vm.py +++ b/python/tvm/relay/backend/vm.py @@ -65,7 +65,7 @@ def compile(mod, target=None, target_host=None, params=None): compiler = VMCompiler() if params: compiler.set_params(params) - target = Target(target, target_host) + target = tvm.target.Target(target, target_host) target_host = target.host compiler.lower(mod, target, target_host) compiler.codegen() @@ -133,8 +133,9 @@ def lower(self, mod, target=None, target_host=None): target = self._update_target(target) target_host = self._update_target_host(target, target_host) - target = Target(target, target_host) - target_host = target.host + for k in target: + target[k] = tvm.target.Target(target[k], 
target_host) + target_host = target[k].host tophub_context = self._tophub_context(target) with tophub_context: From 4d11b7bef3470dae90d016be1fffeb43c117be2d Mon Sep 17 00:00:00 2001 From: Xiyou Zhou Date: Wed, 3 Mar 2021 01:19:30 -0800 Subject: [PATCH 15/69] Fix another TargetsMap issue --- python/tvm/relay/backend/vm.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/python/tvm/relay/backend/vm.py b/python/tvm/relay/backend/vm.py index cec011ae2d8d..84b35b49e70f 100644 --- a/python/tvm/relay/backend/vm.py +++ b/python/tvm/relay/backend/vm.py @@ -175,8 +175,9 @@ def optimize(self, mod, target=None, target_host=None, params=None): target = self._update_target(target) target_host = self._update_target_host(target, target_host) - target = Target(target, target_host) - target_host = target.host + for k in target: + target[k] = tvm.target.Target(target[k], target_host) + target_host = target[k].host if params: self.set_params(params) From 5a0f06bdafbf672bf101da66ec5aba3e13949ecb Mon Sep 17 00:00:00 2001 From: Xiyou Zhou Date: Wed, 3 Mar 2021 10:53:16 -0800 Subject: [PATCH 16/69] Fix typo and infinite loop error --- python/tvm/runtime/object.py | 2 +- tests/python/unittest/test_target_target.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/python/tvm/runtime/object.py b/python/tvm/runtime/object.py index 0c2abd296b42..e22dc187c3ca 100644 --- a/python/tvm/runtime/object.py +++ b/python/tvm/runtime/object.py @@ -44,7 +44,7 @@ def _new_object(cls): class Object(ObjectBase): """Base class for all tvm's runtime objects.""" - __slots__ = [] + __slots__ = ["handle"] def __repr__(self): return _ffi_node_api.AsRepr(self) diff --git a/tests/python/unittest/test_target_target.py b/tests/python/unittest/test_target_target.py index 9947ccb4fdd5..0173ea76b7c2 100644 --- a/tests/python/unittest/test_target_target.py +++ b/tests/python/unittest/test_target_target.py @@ -246,9 +246,9 @@ def test_target_host_merge_2(): tvm.target.Target(tvm.target.Target("cuda --host llvm"), tvm.target.Target("llvm")) -@pytest.mark.skip(reason="Causing infinite loop for reason to be investigated") +# @pytest.mark.skip(reason="Causing infinite loop for reason to be investigated") def test_target_host_merge_3(): - with pytest.raises(ValueError, match=r"target ddhost has to be a string or dictionary."): + with pytest.raises(ValueError, match=r"target host has to be a string or dictionary."): tvm.target.Target(tvm.target.Target("cuda --host llvm"), 12.34) From 0e01e13d1bf2cdaa53e76e33e9fe614f5269784e Mon Sep 17 00:00:00 2001 From: Xiyou Zhou Date: Wed, 3 Mar 2021 12:04:19 -0800 Subject: [PATCH 17/69] Temporary fix for handle issue --- python/tvm/runtime/object.py | 2 +- tests/python/unittest/test_target_target.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/python/tvm/runtime/object.py b/python/tvm/runtime/object.py index e22dc187c3ca..0c2abd296b42 100644 --- a/python/tvm/runtime/object.py +++ b/python/tvm/runtime/object.py @@ -44,7 +44,7 @@ def _new_object(cls): class Object(ObjectBase): """Base class for all tvm's runtime objects.""" - __slots__ = ["handle"] + __slots__ = [] def __repr__(self): return _ffi_node_api.AsRepr(self) diff --git a/tests/python/unittest/test_target_target.py b/tests/python/unittest/test_target_target.py index 0173ea76b7c2..7598b807f59c 100644 --- a/tests/python/unittest/test_target_target.py +++ b/tests/python/unittest/test_target_target.py @@ -246,7 +246,7 @@ def test_target_host_merge_2(): tvm.target.Target(tvm.target.Target("cuda 
--host llvm"), tvm.target.Target("llvm")) -# @pytest.mark.skip(reason="Causing infinite loop for reason to be investigated") +@pytest.mark.skip(reason="Causing infinite loop because of pytest and handle issue") def test_target_host_merge_3(): with pytest.raises(ValueError, match=r"target host has to be a string or dictionary."): tvm.target.Target(tvm.target.Target("cuda --host llvm"), 12.34) From 7db832732b9794b6c08629aace3376f64d6b4fdc Mon Sep 17 00:00:00 2001 From: Xiyou Zhou Date: Wed, 3 Mar 2021 16:12:17 -0800 Subject: [PATCH 18/69] Fix vm target --- python/tvm/relay/backend/vm.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/python/tvm/relay/backend/vm.py b/python/tvm/relay/backend/vm.py index 84b35b49e70f..2c7df949234c 100644 --- a/python/tvm/relay/backend/vm.py +++ b/python/tvm/relay/backend/vm.py @@ -65,8 +65,9 @@ def compile(mod, target=None, target_host=None, params=None): compiler = VMCompiler() if params: compiler.set_params(params) - target = tvm.target.Target(target, target_host) - target_host = target.host + for k in target: + target[k] = tvm.target.Target(target[k], target_host) + target_host = target[k].host compiler.lower(mod, target, target_host) compiler.codegen() return compiler.get_exec() From f21441074cde07a231849d552af5296e58ecfdaa Mon Sep 17 00:00:00 2001 From: Xiyou Zhou Date: Wed, 3 Mar 2021 16:34:13 -0800 Subject: [PATCH 19/69] Add condition support for str case --- python/tvm/relay/backend/vm.py | 30 +++++++++++++++++++++--------- python/tvm/relay/build_module.py | 21 +++++++++++++++------ 2 files changed, 36 insertions(+), 15 deletions(-) diff --git a/python/tvm/relay/backend/vm.py b/python/tvm/relay/backend/vm.py index 2c7df949234c..89571b82414c 100644 --- a/python/tvm/relay/backend/vm.py +++ b/python/tvm/relay/backend/vm.py @@ -65,9 +65,13 @@ def compile(mod, target=None, target_host=None, params=None): compiler = VMCompiler() if params: compiler.set_params(params) - for k in target: - target[k] = tvm.target.Target(target[k], target_host) - target_host = target[k].host + if isinstance(target, dict): + for k in target: + target[k] = tvm.target.Target(target[k], target_host) + target_host = target[k].host + else: + target = tvm.target.Target(target, target_host) + target_host = target.host compiler.lower(mod, target, target_host) compiler.codegen() return compiler.get_exec() @@ -134,9 +138,13 @@ def lower(self, mod, target=None, target_host=None): target = self._update_target(target) target_host = self._update_target_host(target, target_host) - for k in target: - target[k] = tvm.target.Target(target[k], target_host) - target_host = target[k].host + if isinstance(target, dict): + for k in target: + target[k] = tvm.target.Target(target[k], target_host) + target_host = target[k].host + else: + target = tvm.target.Target(target, target_host) + target_host = target.host tophub_context = self._tophub_context(target) with tophub_context: @@ -176,9 +184,13 @@ def optimize(self, mod, target=None, target_host=None, params=None): target = self._update_target(target) target_host = self._update_target_host(target, target_host) - for k in target: - target[k] = tvm.target.Target(target[k], target_host) - target_host = target[k].host + if isinstance(target, dict): + for k in target: + target[k] = tvm.target.Target(target[k], target_host) + target_host = target[k].host + else: + target = tvm.target.Target(target, target_host) + target_host = target.host if params: self.set_params(params) diff --git a/python/tvm/relay/build_module.py 
b/python/tvm/relay/build_module.py index ec349ceb874d..f7073626874d 100644 --- a/python/tvm/relay/build_module.py +++ b/python/tvm/relay/build_module.py @@ -130,9 +130,13 @@ def build(self, mod, target=None, target_host=None, params=None): autotvm.GLOBAL_SCOPE.silent = use_auto_scheduler # Assume the target host of all targets in heterogenous target are identical - for k in target: - target[k] = Target(target[k], target_host) - target_host = target[k].host + if isinstance(target, dict): + for k in target: + target[k] = Target(target[k], target_host) + target_host = target[k].host + else: + target = Target(target, target_host) + target_host = target.host self._build(mod, target, target_host) autotvm.GLOBAL_SCOPE.silent = old_autotvm_silent @@ -261,9 +265,14 @@ def build(mod, target=None, target_host=None, params=None, mod_name="default"): target_host = Target(target_host) elif target_host: raise ValueError("target host must be the type of str, " + "tvm.target.Target, or None") - for k in target: - target[k] = Target(target[k], target_host) - target_host = target[k].host + + if isinstance(target, dict): + for k in target: + target[k] = Target(target[k], target_host) + target_host = target[k].host + else: + target = Target(target, target_host) + target_host = target.host # If current dispatch context is fallback context (the default root context), # then load pre-tuned parameters from TopHub From 38c4ec05e47d1d89cc9226079b08fbc701916077 Mon Sep 17 00:00:00 2001 From: Xiyou Zhou Date: Thu, 4 Mar 2021 14:36:56 -0800 Subject: [PATCH 20/69] Add GetHost function and fix previous bugs --- include/tvm/target/target.h | 3 +++ src/auto_scheduler/feature.cc | 15 ++++++++++----- src/relay/backend/build_module.cc | 14 ++++++++++++++ src/target/target.cc | 5 +++++ 4 files changed, 32 insertions(+), 5 deletions(-) diff --git a/include/tvm/target/target.h b/include/tvm/target/target.h index 64bd251c0ded..420643f27f90 100644 --- a/include/tvm/target/target.h +++ b/include/tvm/target/target.h @@ -35,6 +35,7 @@ namespace tvm { class TargetInternal; +class Target; /*! * \brief Compilation target. @@ -60,6 +61,8 @@ class TargetNode : public Object { TVM_DLL const std::string& str() const; /*! \return Export target to JSON-like configuration */ TVM_DLL Map Export() const; + /*! 
\return The Optional typed target host of the TargetNode */
+  TVM_DLL Optional<Target> GetHost() const;

   void VisitAttrs(AttrVisitor* v) {
     v->Visit("kind", &kind);

diff --git a/src/auto_scheduler/feature.cc b/src/auto_scheduler/feature.cc
index cf516d8452e2..276941b901c1 100755
--- a/src/auto_scheduler/feature.cc
+++ b/src/auto_scheduler/feature.cc
@@ -1397,9 +1397,11 @@ void GetPerStoreFeaturesFromFile(const std::string& filename, int max_lines, int
   if (find_res == task_cache.end()) {
     // rebuild task
     Array<te::Tensor> tensors = (*workload_key_to_tensors)(workload_key);
-    task = SearchTask(ComputeDAG(tensors), workload_key, cur_inp->task->target,
-                      cur_inp->task->target_host, cur_inp->task->hardware_params,
-                      cur_inp->task->layout_rewrite_option);
+    Target target = cur_inp->task->target, target_host = cur_inp->task->target_host;
+    target = Target(target, target_host);
+    target_host = target->GetHost().value();
+    task = SearchTask(ComputeDAG(tensors), workload_key, target, target_host,
+                      cur_inp->task->hardware_params, cur_inp->task->layout_rewrite_option);
     task_id = task_cache.size();

     // compute min cost for each task
@@ -1466,8 +1468,11 @@ void GetPerStoreFeaturesFromMeasurePairs(const Array<MeasureInput>& inputs,
       // The measure input is incomplete, rebuild task for incomplete measure pairs read from file
       try {
         Array<te::Tensor> tensors = (*workload_key_to_tensors)(workload_key);
-        task = SearchTask(ComputeDAG(tensors), workload_key, inputs[i]->task->target,
-                          inputs[i]->task->target_host, inputs[i]->task->hardware_params,
+        Target target = inputs[i]->task->target, target_host = inputs[i]->task->target_host;
+        target = Target(target, target_host);
+        target_host = target->GetHost().value();
+        task = SearchTask(ComputeDAG(tensors), workload_key, target, target_host,
+                          inputs[i]->task->hardware_params,
                           inputs[i]->task->layout_rewrite_option);
       } catch (std::exception& e) {
         // Cannot build ComputeDAG from workload key, the task may have not been registered in
diff --git a/src/relay/backend/build_module.cc b/src/relay/backend/build_module.cc
index 08846925bede..b29ebd1f6779 100644
--- a/src/relay/backend/build_module.cc
+++ b/src/relay/backend/build_module.cc
@@ -235,8 +235,15 @@ class RelayBuildModule : public runtime::ModuleNode {
    * \param target_host Host target device
    */
   void Build(IRModule mod, const TargetsMap& targets, const tvm::Target& target_host) {
+    // Create protected variable targets_ from the ground up
     targets_ = targets;
     target_host_ = target_host;
+    for (const auto &iter : targets) {
+      // Construct a new target with the target host field if available
+      targets_.Set(iter.first, Target(iter.second, target_host_));
+      target_host_ = targets_[iter.first]->GetHost().value();
+    }
+
     BuildRelay(mod, params_);
     // Clear compile engine so that tuning schedules can be changed between runs. See issue #6096.
     CompileEngine::Global()->Clear();
@@ -481,6 +488,13 @@ class RelayBuildModule : public runtime::ModuleNode {
     const runtime::PackedFunc* pf = runtime::Registry::Get("codegen.LLVMModuleCreate");
     if (!target_host.defined()) target_host = (pf != nullptr) ? Target("llvm") : Target("stackvm");

+    // Update all the targets in the targets_ TargetsMap
+    for (const auto &iter : targets_) {
+      // Construct a new target with the target host field if available
+      targets_.Set(iter.first, Target(iter.second, target_host));
+      target_host = targets_[iter.first]->GetHost().value();
+    }
+
     // Generate a placeholder function that attaches linked params as its arguments.
if (target_host->GetAttr("link-params").value_or(Bool(false))) { CHECK(pf != nullptr) << "Unable to link-params with no target_host and no llvm codegen."; diff --git a/src/target/target.cc b/src/target/target.cc index 5f0e54adac69..ee02a201a992 100644 --- a/src/target/target.cc +++ b/src/target/target.cc @@ -414,6 +414,11 @@ Map TargetNode::Export() const { return result; } +Optional TargetNode::GetHost() const { + if (!this->host.defined()) return NullOpt; + return GetRef>(this->host.as()); +} + /*! \brief Entry to hold the Target context stack. */ struct TVMTargetThreadLocalEntry { /*! \brief The current target context */ From 8bacc8daf114b158784edc42090ee71a93ed6ddf Mon Sep 17 00:00:00 2001 From: Xiyou Zhou Date: Thu, 4 Mar 2021 15:15:04 -0800 Subject: [PATCH 21/69] Fix measure_record.cc --- src/auto_scheduler/measure_record.cc | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/auto_scheduler/measure_record.cc b/src/auto_scheduler/measure_record.cc index 1120f437b176..63ef3eb439f7 100644 --- a/src/auto_scheduler/measure_record.cc +++ b/src/auto_scheduler/measure_record.cc @@ -163,8 +163,9 @@ struct Handler<::tvm::auto_scheduler::SearchTaskNode> { writer->WriteArrayItem(std::string(data.workload_key)); writer->WriteArrayItem(data.target->str()); writer->WriteArrayItem(*data.hardware_params.get()); - if (data.target_host.defined()) { - writer->WriteArrayItem(data.target_host->str()); + ::tvm::Target target_host = ::tvm::Target(data.target, data.target_host)->GetHost().value(); + if (target_host.defined()) { + writer->WriteArrayItem(target_host->str()); } else { writer->WriteArrayItem(std::string("")); } @@ -193,7 +194,8 @@ struct Handler<::tvm::auto_scheduler::SearchTaskNode> { if (s) { reader->Read(&str_value); if (!str_value.empty()) { - data->target_host = ::tvm::Target(str_value); + data->target = ::tvm::Target(data->target, ::tvm::Target(str_value)); + data->target_host = data->target->GetHost().value(); } s = reader->NextArrayItem(); ICHECK(s); From 36153dd5dd6a4f7c51021f5fe284127cd5bb60ce Mon Sep 17 00:00:00 2001 From: Xiyou Zhou Date: Thu, 4 Mar 2021 15:34:13 -0800 Subject: [PATCH 22/69] Fix search_task.cc --- src/auto_scheduler/search_task.cc | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/auto_scheduler/search_task.cc b/src/auto_scheduler/search_task.cc index 0abee16fceab..29f04f383ee9 100755 --- a/src/auto_scheduler/search_task.cc +++ b/src/auto_scheduler/search_task.cc @@ -53,6 +53,7 @@ HardwareParams::HardwareParams(int num_cores, int vector_unit_bytes, int cache_l HardwareParams HardwareParamsNode::GetDefaultHardwareParams(const Target& target, const Target& target_host) { + // There is no use of target_host so no updates here in the function. 
const auto device_type = target->kind->device_type; if (device_type == kDLCPU) { return HardwareParams(tvm::runtime::threading::MaxConcurrency(), 64, 64, 0, 0, 0, 0, 0); @@ -115,6 +116,8 @@ HardwareParams HardwareParamsNode::GetDefaultHardwareParams(const Target& target SearchTask::SearchTask(ComputeDAG compute_dag, String workload_key, Target target, Target target_host, Optional hardware_params, LayoutRewriteOption layout_rewrite_option) { + target = Target(target, target_host); + target_host = target->GetHost().value(); auto node = make_object(); node->compute_dag = std::move(compute_dag); node->workload_key = std::move(workload_key); @@ -143,6 +146,8 @@ TVM_REGISTER_GLOBAL("auto_scheduler.SearchTask") .set_body_typed([](ComputeDAG compute_dag, String workload_key, Target target, Target target_host, Optional hardware_params, int layout_rewrite_option) { + target = Target(target, target_host); + target_host = target->GetHost().value(); return SearchTask(compute_dag, workload_key, target, target_host, hardware_params, LayoutRewriteOption(layout_rewrite_option)); }); From df1f6a104605e938285f7a758f3456a05979c914 Mon Sep 17 00:00:00 2001 From: Xiyou Zhou Date: Thu, 4 Mar 2021 16:10:57 -0800 Subject: [PATCH 23/69] Fix compiler.cc, memory_alloc.cc --- src/relay/backend/vm/compiler.cc | 19 +++++++++++++++++-- src/relay/transforms/memory_alloc.cc | 8 ++++++++ 2 files changed, 25 insertions(+), 2 deletions(-) diff --git a/src/relay/backend/vm/compiler.cc b/src/relay/backend/vm/compiler.cc index 0718191a2ff6..8aa773c6a1c6 100644 --- a/src/relay/backend/vm/compiler.cc +++ b/src/relay/backend/vm/compiler.cc @@ -255,6 +255,11 @@ class VMFunctionCompiler : ExprFunctor { context_(context), target_host_(target_host), expr_device_map_(std::move(expr_device_map)) { + for (auto& iter : targets) { + targets.Set(iter.first, Target(targets[iter.first], target_host)); + target_host = targets[iter.first]->GetHost().value(); + } + target_host_ = target_host; for (const auto& it : targets) { targets_[it.first->value] = it.second; } @@ -895,6 +900,10 @@ void VMCompiler::Lower(IRModule mod, const TargetsMap& targets, const tvm::Targe exec_ = make_object(); targets_ = targets; target_host_ = target_host; + for (auto& iter : targets_) { + targets_.Set(iter.first, Target(targets_[iter.first], target_host_)); + target_host_ = targets[iter.first]->GetHost().value(); + } // Run the optimizations necessary to target the VM. 
context_.module = OptimizeModule(mod, targets_, target_host_); @@ -996,8 +1005,14 @@ transform::Sequential MemoryOpt(tvm::Target host_target, TargetsMap targets) { return transform::Sequential(pass_seqs); } -IRModule VMCompiler::OptimizeModule(IRModule mod, const TargetsMap& targets, - const Target& target_host) { +IRModule VMCompiler::OptimizeModule(IRModule mod, const TargetsMap& targets_arg, + const Target& target_host_arg) { + TargetsMap targets = targets_arg; + Target target_host = target_host_arg; + for (auto& iter : targets) { + targets.Set(iter.first, Target(targets[iter.first], target_host)); + target_host = targets[iter.first]->GetHost().value(); + } if (params_.size()) { BaseFunc base_func = mod->Lookup("main"); ICHECK(base_func->IsInstance()) diff --git a/src/relay/transforms/memory_alloc.cc b/src/relay/transforms/memory_alloc.cc index b8c87909a025..3c9de20b84f6 100644 --- a/src/relay/transforms/memory_alloc.cc +++ b/src/relay/transforms/memory_alloc.cc @@ -415,6 +415,10 @@ class DialectRewriter : public ExprMutator { namespace transform { Pass ManifestAlloc(Target target_host, Map targets) { + for (auto& iter : targets) { + targets.Set(iter.first, Target(targets[iter.first], target_host)); + target_host = targets[iter.first]->GetHost().value(); + } return tvm::transform::CreateModulePass( [=](IRModule mod, const PassContext& pass_ctx) { DLOG(INFO) << "tvm::relay::transform::ManifestAlloc"; @@ -458,6 +462,10 @@ Pass ManifestAlloc(Target target_host, Map targets) { TVM_REGISTER_GLOBAL("relay.transform.ManifestAlloc") .set_body_typed([](Target target_host, Map targets) { + for (auto& iter : targets) { + targets.Set(iter.first, Target(targets[iter.first], target_host)); + target_host = targets[iter.first]->GetHost().value(); + } return ManifestAlloc(target_host, targets); }); From 4539cffca0d160795342b15a7ea725377e440fd5 Mon Sep 17 00:00:00 2001 From: Xiyou Zhou Date: Thu, 4 Mar 2021 16:58:30 -0800 Subject: [PATCH 24/69] Fix driver_api.cc --- src/driver/driver_api.cc | 57 +++++++++++++++++++++++++++------------- 1 file changed, 39 insertions(+), 18 deletions(-) diff --git a/src/driver/driver_api.cc b/src/driver/driver_api.cc index bbbb7e3f9eb5..dd90227cf46d 100644 --- a/src/driver/driver_api.cc +++ b/src/driver/driver_api.cc @@ -185,9 +185,10 @@ IRModule lower(te::Schedule sch, const Array& args, const std::strin return mod; } -std::pair SplitDevHostFuncs(IRModule mod_mixed, const Target& target, - const Target& target_host, +std::pair SplitDevHostFuncs(IRModule mod_mixed, const Target& target_arg, + const Target& target_host_arg, const transform::PassContext& pass_ctx) { + Target target = Target(target_arg, target_host_arg), target_host = target->GetHost().value(); Array mixed_pass_list = {BindTarget(target), tir::transform::VerifyMemory()}; @@ -253,31 +254,47 @@ std::pair SplitDevHostFuncs(IRModule mod_mixed, const Target } // Build for heterogeneous execution. 
-runtime::Module build(const Map& inputs, const Target& target_host) { +runtime::Module build(const Map& inputs_arg, const Target& target_host_arg) { auto pass_ctx = transform::PassContext::Current(); std::vector device_modules; - Target target_host_val = target_host; + Target target_host = target_host_arg; + Map updated_inputs; + + // Fetch previous defined target host in targets + for (const auto& it : inputs_arg) { + auto target = Target(it.first, target_host); + updated_inputs.Set(target, it.second); + target_host = target->GetHost().value(); + } + if (!target_host.defined()) { - for (const auto& it : inputs) { + for (const auto& it : updated_inputs) { if (it.first->kind->device_type == kDLCPU || it.first->kind->device_type == kDLMicroDev) { - target_host_val = it.first; + target_host = it.first; break; } } } - if (!target_host_val.defined()) { - target_host_val = DefaultTargetHost(target_host_val); + if (!target_host.defined()) { + target_host = DefaultTargetHost(target_host); + } + + // Update target host for all targets + for (const auto& it : inputs_arg) { + auto target = Target(it.first, target_host); + updated_inputs.Set(target, it.second); + target_host = target->GetHost().value(); } IRModule mhost_all = IRModule(Map()); ICHECK(mhost_all.defined()) << "The host module must be defined"; - for (const auto& it : inputs) { + for (const auto& it : updated_inputs) { if (it.second.defined()) { - auto pair = SplitDevHostFuncs(it.second, it.first, target_host_val, pass_ctx); + auto pair = SplitDevHostFuncs(it.second, it.first, target_host, pass_ctx); auto& mhost = pair.first; auto& mdevice = pair.second; @@ -293,7 +310,7 @@ runtime::Module build(const Map& inputs, const Target& target_ } } - runtime::Module mhost = codegen::Build(mhost_all, target_host_val); + runtime::Module mhost = codegen::Build(mhost_all, target_host); // Import all modules for (const auto& it : device_modules) { if (it.operator->()) { @@ -304,21 +321,25 @@ runtime::Module build(const Map& inputs, const Target& target_ } // Build for heterogeneous execution when target is a string. -runtime::Module build(const Map& inputs, const Target& target_host) { - Map updated_input; - for (const auto& it : inputs) { - auto target = Target(it.first); +runtime::Module build(const Map& inputs_arg, const Target& target_host_arg) { + Map updated_inputs; + Target target_host = target_host_arg; + for (const auto& it : inputs_arg) { + auto target = Target(Target(it.first), target_host); + target_host = target_host->GetHost().value(); Optional device = target->GetAttr("device"); if (device.defined() && device.value() == "vta") { target = Target("ext_dev"); } - updated_input.Set(target, it.second); + updated_inputs.Set(target, it.second); } - return build(updated_input, target_host); + return build(updated_inputs, target_host); } // Build for homogeneous execution. 
-runtime::Module build(const IRModule& funcs, const Target& target, const Target& target_host) { +runtime::Module build(const IRModule& funcs, const Target& target_arg, + const Target& target_host_arg) { + auto target = Target(target_arg, target_host_arg), target_host = target->GetHost().value(); Map inputs = {{target, funcs}}; return build(inputs, target_host); } From b3285255d26166e45f12f60fe846c2a4f29eb64d Mon Sep 17 00:00:00 2001 From: Xiyou Zhou Date: Thu, 4 Mar 2021 17:07:20 -0800 Subject: [PATCH 25/69] Fix format --- src/auto_scheduler/feature.cc | 6 +++--- src/relay/backend/build_module.cc | 12 ++++++------ src/relay/backend/vm/compiler.cc | 8 +++----- 3 files changed, 12 insertions(+), 14 deletions(-) diff --git a/src/auto_scheduler/feature.cc b/src/auto_scheduler/feature.cc index 276941b901c1..68fe490d8626 100755 --- a/src/auto_scheduler/feature.cc +++ b/src/auto_scheduler/feature.cc @@ -1471,9 +1471,9 @@ void GetPerStoreFeaturesFromMeasurePairs(const Array& inputs, Target target = inputs[i]->task->target, target_host = inputs[i]->task->target_host; target = Target(target, target_host); target_host = target->GetHost().value(); - task = SearchTask(ComputeDAG(tensors), workload_key, target, target_host, - inputs[i]->task->hardware_params, - inputs[i]->task->layout_rewrite_option); + task = + SearchTask(ComputeDAG(tensors), workload_key, target, target_host, + inputs[i]->task->hardware_params, inputs[i]->task->layout_rewrite_option); } catch (std::exception& e) { // Cannot build ComputeDAG from workload key, the task may have not been registered in // this search round diff --git a/src/relay/backend/build_module.cc b/src/relay/backend/build_module.cc index b29ebd1f6779..91ae0aa4c335 100644 --- a/src/relay/backend/build_module.cc +++ b/src/relay/backend/build_module.cc @@ -238,10 +238,10 @@ class RelayBuildModule : public runtime::ModuleNode { // Create protected variable targets_ from ground up targets_ = targets; target_host_ = target_host; - for (const auto &iter : targets) { + for (const auto& it : targets) { // Construct a new target with target host filed if available - targets_.Set(iter.first, Target(iter.second, target_host_)); - target_host_ = targets_[iter.first]->GetHost().value(); + targets_.Set(it.first, Target(it.second, target_host_)); + target_host_ = targets_[it.first]->GetHost().value(); } BuildRelay(mod, params_); @@ -489,10 +489,10 @@ class RelayBuildModule : public runtime::ModuleNode { if (!target_host.defined()) target_host = (pf != nullptr) ? Target("llvm") : Target("stackvm"); // Update all the targets in the _targets TargetsMap - for (const auto &iter : targets_) { + for (const auto& it : targets_) { // Construct a new target with target host filed if available - targets_.Set(iter.first, Target(iter.second, target_host)); - target_host = targets_[iter.first]->GetHost().value(); + targets_.Set(it.first, Target(it.second, target_host)); + target_host = targets_[it.first]->GetHost().value(); } // Generate a placeholder function that attaches linked params as its arguments. 
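Both files in this patch repeat one idiom: walk the targets map, stamp the running host onto each entry, and pick the merged host back up for the next iteration. A rough Python equivalent of that loop; normalize_targets is a hypothetical name, not an API:

    import tvm

    def normalize_targets(targets, host=None):
        # Hypothetical helper mirroring the C++ loops above and below.
        updated = {}
        for dev, tgt in targets.items():
            tgt = tvm.target.Target(tgt, host) if host else tvm.target.Target(tgt)
            host = tgt.host or host  # carry the merged host forward
            updated[dev] = tgt
        return updated, host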
diff --git a/src/relay/backend/vm/compiler.cc b/src/relay/backend/vm/compiler.cc index 8aa773c6a1c6..4e7c1e61b1d7 100644 --- a/src/relay/backend/vm/compiler.cc +++ b/src/relay/backend/vm/compiler.cc @@ -255,14 +255,12 @@ class VMFunctionCompiler : ExprFunctor { context_(context), target_host_(target_host), expr_device_map_(std::move(expr_device_map)) { - for (auto& iter : targets) { - targets.Set(iter.first, Target(targets[iter.first], target_host)); - target_host = targets[iter.first]->GetHost().value(); - } - target_host_ = target_host; for (const auto& it : targets) { + targets.Set(it.first, Target(targets[it.first], target_host)); + target_host = targets[it.first]->GetHost().value(); targets_[it.first->value] = it.second; } + target_host_ = target_host; } VMFunction Compile(const GlobalVar& var, const Function& func) { From ba427ecca14038da530e0fd5d4c762dbede9a20b Mon Sep 17 00:00:00 2001 From: Xiyou Zhou Date: Fri, 5 Mar 2021 00:38:41 -0800 Subject: [PATCH 26/69] Fix bugs and GetHost function usage --- src/auto_scheduler/feature.cc | 4 ++-- src/auto_scheduler/measure_record.cc | 5 +++-- src/auto_scheduler/search_task.cc | 4 ++-- src/driver/driver_api.cc | 12 ++++++------ src/relay/backend/build_module.cc | 4 ++-- src/relay/backend/vm/compiler.cc | 6 +++--- src/relay/transforms/memory_alloc.cc | 4 ++-- src/target/target.cc | 1 - 8 files changed, 20 insertions(+), 20 deletions(-) diff --git a/src/auto_scheduler/feature.cc b/src/auto_scheduler/feature.cc index 68fe490d8626..20d456672e24 100755 --- a/src/auto_scheduler/feature.cc +++ b/src/auto_scheduler/feature.cc @@ -1399,7 +1399,7 @@ void GetPerStoreFeaturesFromFile(const std::string& filename, int max_lines, int Array tensors = (*workload_key_to_tensors)(workload_key); Target target = cur_inp->task->target, target_host = cur_inp->task->target_host; target = Target(target, target_host); - target_host = target->GetHost().value(); + target_host = target->GetHost().value_or(Target()); task = SearchTask(ComputeDAG(tensors), workload_key, target, target_host, cur_inp->task->hardware_params, cur_inp->task->layout_rewrite_option); task_id = task_cache.size(); @@ -1470,7 +1470,7 @@ void GetPerStoreFeaturesFromMeasurePairs(const Array& inputs, Array tensors = (*workload_key_to_tensors)(workload_key); Target target = inputs[i]->task->target, target_host = inputs[i]->task->target_host; target = Target(target, target_host); - target_host = target->GetHost().value(); + target_host = target->GetHost().value_or(Target()); task = SearchTask(ComputeDAG(tensors), workload_key, target, target_host, inputs[i]->task->hardware_params, inputs[i]->task->layout_rewrite_option); diff --git a/src/auto_scheduler/measure_record.cc b/src/auto_scheduler/measure_record.cc index 63ef3eb439f7..2b10eac27426 100644 --- a/src/auto_scheduler/measure_record.cc +++ b/src/auto_scheduler/measure_record.cc @@ -163,7 +163,8 @@ struct Handler<::tvm::auto_scheduler::SearchTaskNode> { writer->WriteArrayItem(std::string(data.workload_key)); writer->WriteArrayItem(data.target->str()); writer->WriteArrayItem(*data.hardware_params.get()); - ::tvm::Target target_host = ::tvm::Target(data.target, data.target_host)->GetHost().value(); + ::tvm::Target target_host = ::tvm::Target(data.target, data.target_host)->GetHost() + .value_or(::tvm::Target()); if (target_host.defined()) { writer->WriteArrayItem(target_host->str()); } else { @@ -195,7 +196,7 @@ struct Handler<::tvm::auto_scheduler::SearchTaskNode> { reader->Read(&str_value); if (!str_value.empty()) { data->target = 
::tvm::Target(data->target, ::tvm::Target(str_value)); - data->target_host = data->target->GetHost().value(); + data->target_host = data->target->GetHost().value_or(::tvm::Target()); } s = reader->NextArrayItem(); ICHECK(s); diff --git a/src/auto_scheduler/search_task.cc b/src/auto_scheduler/search_task.cc index 29f04f383ee9..a5c7285f7160 100755 --- a/src/auto_scheduler/search_task.cc +++ b/src/auto_scheduler/search_task.cc @@ -117,7 +117,7 @@ SearchTask::SearchTask(ComputeDAG compute_dag, String workload_key, Target targe Target target_host, Optional hardware_params, LayoutRewriteOption layout_rewrite_option) { target = Target(target, target_host); - target_host = target->GetHost().value(); + target_host = target->GetHost().value_or(Target()); auto node = make_object(); node->compute_dag = std::move(compute_dag); node->workload_key = std::move(workload_key); @@ -147,7 +147,7 @@ TVM_REGISTER_GLOBAL("auto_scheduler.SearchTask") Target target_host, Optional hardware_params, int layout_rewrite_option) { target = Target(target, target_host); - target_host = target->GetHost().value(); + target_host = target->GetHost().value_or(Target()); return SearchTask(compute_dag, workload_key, target, target_host, hardware_params, LayoutRewriteOption(layout_rewrite_option)); }); diff --git a/src/driver/driver_api.cc b/src/driver/driver_api.cc index dd90227cf46d..da417a55619b 100644 --- a/src/driver/driver_api.cc +++ b/src/driver/driver_api.cc @@ -188,7 +188,8 @@ IRModule lower(te::Schedule sch, const Array& args, const std::strin std::pair SplitDevHostFuncs(IRModule mod_mixed, const Target& target_arg, const Target& target_host_arg, const transform::PassContext& pass_ctx) { - Target target = Target(target_arg, target_host_arg), target_host = target->GetHost().value(); + Target target = Target(target_arg, target_host_arg), + target_host = target->GetHost().value_or(Target()); Array mixed_pass_list = {BindTarget(target), tir::transform::VerifyMemory()}; @@ -264,8 +265,7 @@ runtime::Module build(const Map& inputs_arg, const Target& tar // Fetch previous defined target host in targets for (const auto& it : inputs_arg) { auto target = Target(it.first, target_host); - updated_inputs.Set(target, it.second); - target_host = target->GetHost().value(); + target_host = target->GetHost().value_or(Target()); } if (!target_host.defined()) { @@ -285,7 +285,6 @@ runtime::Module build(const Map& inputs_arg, const Target& tar for (const auto& it : inputs_arg) { auto target = Target(it.first, target_host); updated_inputs.Set(target, it.second); - target_host = target->GetHost().value(); } IRModule mhost_all = IRModule(Map()); @@ -326,7 +325,7 @@ runtime::Module build(const Map& inputs_arg, const Target& tar Target target_host = target_host_arg; for (const auto& it : inputs_arg) { auto target = Target(Target(it.first), target_host); - target_host = target_host->GetHost().value(); + target_host = target->GetHost().value_or(Target()); Optional device = target->GetAttr("device"); if (device.defined() && device.value() == "vta") { target = Target("ext_dev"); @@ -339,7 +338,8 @@ runtime::Module build(const Map& inputs_arg, const Target& tar // Build for homogeneous execution. 
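The switch from GetHost().value() to GetHost().value_or(Target()) in this patch is the substantive fix: a target without a host must yield an undefined Target instead of crashing. On the Python side the same guard is a plain None check; a small sketch, where the llvm fallback is this sketch's choice, not the patch's:

    import tvm

    tgt = tvm.target.Target("cuda")  # no host attached
    host = tgt.host                  # None here, the analogue of value_or(Target())
    if host is None:
        host = tvm.target.Target("llvm")  # fallback chosen only for illustration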
runtime::Module build(const IRModule& funcs, const Target& target_arg, const Target& target_host_arg) { - auto target = Target(target_arg, target_host_arg), target_host = target->GetHost().value(); + auto target = Target(target_arg, target_host_arg), + target_host = target->GetHost().value_or(Target()); Map inputs = {{target, funcs}}; return build(inputs, target_host); } diff --git a/src/relay/backend/build_module.cc b/src/relay/backend/build_module.cc index 91ae0aa4c335..e132cf54aefb 100644 --- a/src/relay/backend/build_module.cc +++ b/src/relay/backend/build_module.cc @@ -241,7 +241,7 @@ class RelayBuildModule : public runtime::ModuleNode { for (const auto& it : targets) { // Construct a new target with target host filed if available targets_.Set(it.first, Target(it.second, target_host_)); - target_host_ = targets_[it.first]->GetHost().value(); + target_host_ = targets_[it.first]->GetHost().value_or(Target()); } BuildRelay(mod, params_); @@ -492,7 +492,7 @@ class RelayBuildModule : public runtime::ModuleNode { for (const auto& it : targets_) { // Construct a new target with target host filed if available targets_.Set(it.first, Target(it.second, target_host)); - target_host = targets_[it.first]->GetHost().value(); + target_host = targets_[it.first]->GetHost().value_or(Target()); } // Generate a placeholder function that attaches linked params as its arguments. diff --git a/src/relay/backend/vm/compiler.cc b/src/relay/backend/vm/compiler.cc index 4e7c1e61b1d7..577a39da902a 100644 --- a/src/relay/backend/vm/compiler.cc +++ b/src/relay/backend/vm/compiler.cc @@ -257,7 +257,7 @@ class VMFunctionCompiler : ExprFunctor { expr_device_map_(std::move(expr_device_map)) { for (const auto& it : targets) { targets.Set(it.first, Target(targets[it.first], target_host)); - target_host = targets[it.first]->GetHost().value(); + target_host = targets[it.first]->GetHost().value_or(Target()); targets_[it.first->value] = it.second; } target_host_ = target_host; @@ -900,7 +900,7 @@ void VMCompiler::Lower(IRModule mod, const TargetsMap& targets, const tvm::Targe target_host_ = target_host; for (auto& iter : targets_) { targets_.Set(iter.first, Target(targets_[iter.first], target_host_)); - target_host_ = targets[iter.first]->GetHost().value(); + target_host_ = targets[iter.first]->GetHost().value_or(Target()); } // Run the optimizations necessary to target the VM. 
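In the VM compiler the targets map is keyed by device type, with one host shared by every entry. A tiny Python sketch of that shape; the integer keys follow the DLDevice convention (1 = CPU, 2 = CUDA), which is an assumption of this sketch rather than something the patch spells out:

    import tvm

    # Device-type code -> target, one shared host.
    host = tvm.target.Target("llvm")
    targets = {1: tvm.target.Target("llvm", host), 2: tvm.target.Target("cuda", host)}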
@@ -1009,7 +1009,7 @@ IRModule VMCompiler::OptimizeModule(IRModule mod, const TargetsMap& targets_arg, Target target_host = target_host_arg; for (auto& iter : targets) { targets.Set(iter.first, Target(targets[iter.first], target_host)); - target_host = targets[iter.first]->GetHost().value(); + target_host = targets[iter.first]->GetHost().value_or(Target()); } if (params_.size()) { BaseFunc base_func = mod->Lookup("main"); diff --git a/src/relay/transforms/memory_alloc.cc b/src/relay/transforms/memory_alloc.cc index 3c9de20b84f6..5bc48db7de69 100644 --- a/src/relay/transforms/memory_alloc.cc +++ b/src/relay/transforms/memory_alloc.cc @@ -417,7 +417,7 @@ namespace transform { Pass ManifestAlloc(Target target_host, Map targets) { for (auto& iter : targets) { targets.Set(iter.first, Target(targets[iter.first], target_host)); - target_host = targets[iter.first]->GetHost().value(); + target_host = targets[iter.first]->GetHost().value_or(Target()); } return tvm::transform::CreateModulePass( [=](IRModule mod, const PassContext& pass_ctx) { @@ -464,7 +464,7 @@ TVM_REGISTER_GLOBAL("relay.transform.ManifestAlloc") .set_body_typed([](Target target_host, Map targets) { for (auto& iter : targets) { targets.Set(iter.first, Target(targets[iter.first], target_host)); - target_host = targets[iter.first]->GetHost().value(); + target_host = targets[iter.first]->GetHost().value_or(Target()); } return ManifestAlloc(target_host, targets); }); diff --git a/src/target/target.cc b/src/target/target.cc index ee02a201a992..a8bb00b6f503 100644 --- a/src/target/target.cc +++ b/src/target/target.cc @@ -415,7 +415,6 @@ Map TargetNode::Export() const { } Optional TargetNode::GetHost() const { - if (!this->host.defined()) return NullOpt; return GetRef>(this->host.as()); } From 915e3d3ff459a8de44cbfa50ba00f8ce69aabf4c Mon Sep 17 00:00:00 2001 From: Xiyou Zhou Date: Fri, 5 Mar 2021 02:27:09 -0800 Subject: [PATCH 27/69] Fix clang format --- src/auto_scheduler/measure_record.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/auto_scheduler/measure_record.cc b/src/auto_scheduler/measure_record.cc index 2b10eac27426..109f5f51f0fc 100644 --- a/src/auto_scheduler/measure_record.cc +++ b/src/auto_scheduler/measure_record.cc @@ -163,8 +163,8 @@ struct Handler<::tvm::auto_scheduler::SearchTaskNode> { writer->WriteArrayItem(std::string(data.workload_key)); writer->WriteArrayItem(data.target->str()); writer->WriteArrayItem(*data.hardware_params.get()); - ::tvm::Target target_host = ::tvm::Target(data.target, data.target_host)->GetHost() - .value_or(::tvm::Target()); + ::tvm::Target target_host = + ::tvm::Target(data.target, data.target_host)->GetHost().value_or(::tvm::Target()); if (target_host.defined()) { writer->WriteArrayItem(target_host->str()); } else { From 1a9dcb550dfb5d098bf1340733ed53bc6d7c9b92 Mon Sep 17 00:00:00 2001 From: Xiyou Zhou Date: Fri, 5 Mar 2021 17:42:38 -0800 Subject: [PATCH 28/69] Fix bug --- src/driver/driver_api.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/driver/driver_api.cc b/src/driver/driver_api.cc index da417a55619b..3c640e37f190 100644 --- a/src/driver/driver_api.cc +++ b/src/driver/driver_api.cc @@ -255,7 +255,7 @@ std::pair SplitDevHostFuncs(IRModule mod_mixed, const Target } // Build for heterogeneous execution. 
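After this fix the heterogeneous build resolves the host in three steps: reuse any host already attached to an input target, otherwise fall back to a CPU target among the inputs, otherwise take the default. A condensed Python sketch of that order; the list and the kind-name test standing in for the kDLCPU check are illustrative:

    import tvm

    targets = [tvm.target.Target("cuda"), tvm.target.Target("llvm")]
    host = next((t.host for t in targets if t.host), None)                 # 1: attached host
    if host is None:
        host = next((t for t in targets if t.kind.name == "llvm"), None)   # 2: a CPU target
    if host is None:
        host = tvm.target.Target("llvm")                                   # 3: default host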
-runtime::Module build(const Map& inputs_arg, const Target& target_host_arg) { +runtime::Module build(const Map& inputs, const Target& target_host_arg) { auto pass_ctx = transform::PassContext::Current(); std::vector device_modules; @@ -263,13 +263,13 @@ runtime::Module build(const Map& inputs_arg, const Target& tar Map updated_inputs; // Fetch previous defined target host in targets - for (const auto& it : inputs_arg) { + for (const auto& it : inputs) { auto target = Target(it.first, target_host); target_host = target->GetHost().value_or(Target()); } if (!target_host.defined()) { - for (const auto& it : updated_inputs) { + for (const auto& it : inputs) { if (it.first->kind->device_type == kDLCPU || it.first->kind->device_type == kDLMicroDev) { target_host = it.first; break; @@ -282,7 +282,7 @@ runtime::Module build(const Map& inputs_arg, const Target& tar } // Update target host for all targets - for (const auto& it : inputs_arg) { + for (const auto& it : inputs) { auto target = Target(it.first, target_host); updated_inputs.Set(target, it.second); } From 606ec71d7aecc3f76f87c702b285ecc2c9c20e70 Mon Sep 17 00:00:00 2001 From: Xiyou Zhou Date: Sun, 7 Mar 2021 01:44:44 -0800 Subject: [PATCH 29/69] Modify python tests --- python/tvm/driver/build_module.py | 6 ++++++ tests/python/unittest/test_runtime_rpc.py | 2 +- tests/python/unittest/test_target_codegen_blob.py | 2 +- 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/python/tvm/driver/build_module.py b/python/tvm/driver/build_module.py index 8954c5b83f16..860572a45d0e 100644 --- a/python/tvm/driver/build_module.py +++ b/python/tvm/driver/build_module.py @@ -386,10 +386,12 @@ def build(inputs, args=None, target=None, target_host=None, name="default_functi f"but got {type(inputs)}." ) + flag_target_inputs = False if not isinstance(inputs, (dict, container.Map)): target = Target.current() if target is None else target target = target if target else "llvm" target_input_mod = {target: input_mod} + flag_target_inputs = True else: target_input_mod = inputs @@ -401,6 +403,8 @@ def build(inputs, args=None, target=None, target_host=None, name="default_functi target = Target(target, target_host) target_host = target.host + if flag_target_inputs: + target_input_mod = {target: input_mod} if not target_host: for tar, _ in target_input_mod.items(): @@ -414,6 +418,8 @@ def build(inputs, args=None, target=None, target_host=None, name="default_functi target = Target(target, target_host) target_host = target.host + if flag_target_inputs: + target_input_mod = {target: input_mod} mod_host_all = tvm.IRModule({}) diff --git a/tests/python/unittest/test_runtime_rpc.py b/tests/python/unittest/test_runtime_rpc.py index 11c109810fbb..a19aea14da2c 100644 --- a/tests/python/unittest/test_runtime_rpc.py +++ b/tests/python/unittest/test_runtime_rpc.py @@ -309,7 +309,7 @@ def check_remote_link_cl(remote): xo, xi = s[B].split(B.op.axis[0], factor=32) s[B].bind(xo, te.thread_axis("blockIdx.x")) s[B].bind(xi, te.thread_axis("threadIdx.x")) - f = tvm.build(s, [A, B], "opencl", target_host="llvm", name="myadd") + f = tvm.build(s, [A, B], "opencl --host=llvm", name="myadd") # Option 1: save modules separately and rely on remote compiler path_o = temp.relpath("myadd.o") path_cl = temp.relpath("myadd.cl") diff --git a/tests/python/unittest/test_target_codegen_blob.py b/tests/python/unittest/test_target_codegen_blob.py index dc42381cf82d..c17513acecaa 100644 --- a/tests/python/unittest/test_target_codegen_blob.py +++ b/tests/python/unittest/test_target_codegen_blob.py 
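The test updates in this patch assume the string spelling of a host is interchangeable with the two-argument constructor. A quick sketch of that equivalence, under the assumption that both forms parse to the same llvm host:

    import tvm

    a = tvm.target.Target("cuda --host=llvm")  # string form used in the updated tests
    b = tvm.target.Target("cuda", "llvm")      # constructor form
    assert a.host.kind.name == b.host.kind.name == "llvm"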
@@ -85,7 +85,7 @@ def test_cuda_lib(): from tvm.contrib import utils temp = utils.tempdir() - fn_add = tvm.build(s, [A, B], target="cuda", target_host="llvm", name="add") + fn_add = tvm.build(s, [A, B], target="cuda --host=llvm", name="add") path_lib = temp.relpath("deploy_lib.so") fn_add.export_library(path_lib) m = tvm.runtime.load_module(path_lib) From 71e01d0140b1b8e667ba55185c361815112ecdd0 Mon Sep 17 00:00:00 2001 From: Xiyou Zhou Date: Sun, 7 Mar 2021 02:04:27 -0800 Subject: [PATCH 30/69] Change python unit tests to new target api --- python/tvm/auto_scheduler/measure.py | 4 ++-- .../unittest/test_auto_scheduler_measure.py | 5 ++--- .../unittest/test_auto_scheduler_search_task.py | 16 ++++++++-------- tests/python/unittest/test_crt.py | 3 ++- .../unittest/test_target_codegen_device.py | 2 +- .../unittest/test_target_codegen_hexagon.py | 6 ++++-- ...st_tir_transform_instrument_bound_checkers.py | 6 +++--- 7 files changed, 22 insertions(+), 20 deletions(-) diff --git a/python/tvm/auto_scheduler/measure.py b/python/tvm/auto_scheduler/measure.py index 959a9c5da82a..197861be7b5f 100644 --- a/python/tvm/auto_scheduler/measure.py +++ b/python/tvm/auto_scheduler/measure.py @@ -223,8 +223,8 @@ def recover_measure_input(inp, rebuild_state=False): task = inp.task new_task = SearchTask( workload_key=task.workload_key, - target=task.target, - target_host=task.target_host, + target=tvm.target.Target(task.target, task.target_host), + target_host=None, hardware_params=task.hardware_params, layout_rewrite_option=task.layout_rewrite_option, task_inputs=list(task.task_input_names), diff --git a/tests/python/unittest/test_auto_scheduler_measure.py b/tests/python/unittest/test_auto_scheduler_measure.py index 116981028cc9..b5ee7495ed72 100644 --- a/tests/python/unittest/test_auto_scheduler_measure.py +++ b/tests/python/unittest/test_auto_scheduler_measure.py @@ -336,8 +336,7 @@ def test_measure_target_host(): task = auto_scheduler.SearchTask( func=matmul_auto_scheduler_test, args=(512, 512, 512), - target="llvm", - target_host="llvm -mtriple=aarch64-linux-gnu", + target=tvm.target.Target("llvm", "llvm -mtriple=aarch64-linux-gnu"), ) inp = auto_scheduler.measure.MeasureInput(task, task.compute_dag.init_state) @@ -353,7 +352,7 @@ def test_measure_target_host(): raw_inp = inputs[0] recovered_inp = auto_scheduler.measure.recover_measure_input(raw_inp) - assert str(recovered_inp.task.target_host) == str(inp.task.target_host) + assert str(recovered_inp.task.target.host) == str(inp.task.target.host) @tvm.testing.requires_llvm diff --git a/tests/python/unittest/test_auto_scheduler_search_task.py b/tests/python/unittest/test_auto_scheduler_search_task.py index 78e85dc213e0..cd47f1e468ff 100644 --- a/tests/python/unittest/test_auto_scheduler_search_task.py +++ b/tests/python/unittest/test_auto_scheduler_search_task.py @@ -70,7 +70,7 @@ def test_search_task_record(): # TODO(jcf94): Check the compute dag & hardware parameter assert task.workload_key == new_task.workload_key assert str(task.target) == str(new_task.target) - assert str(task.target_host) == str(new_task.target_host) + assert str(task.target.host) == str(new_task.target.host) assert task.layout_rewrite_option == new_task.layout_rewrite_option # Log with 1 task input @@ -86,7 +86,7 @@ def test_search_task_record(): new_task = auto_scheduler._ffi_api.DeserializeSearchTask(task_record) assert task.workload_key == new_task.workload_key assert str(task.target) == str(new_task.target) - assert str(task.target_host) == str(new_task.target_host) + assert 
str(task.target.host) == str(new_task.target.host) assert task.layout_rewrite_option == new_task.layout_rewrite_option assert len(new_task.task_input_names) == 1 assert new_task.task_input_names[0] == "test_input_0" @@ -107,7 +107,7 @@ def test_search_task_record(): new_task = auto_scheduler._ffi_api.DeserializeSearchTask(task_record) assert task.workload_key == new_task.workload_key assert str(task.target) == str(new_task.target) - assert str(task.target_host) == str(new_task.target_host) + assert str(task.target.host) == str(new_task.target.host) assert task.layout_rewrite_option == new_task.layout_rewrite_option assert len(new_task.task_input_names) == 2 assert new_task.task_input_names[0] == "test_input_0" @@ -118,7 +118,7 @@ def test_search_task_record(): new_task = auto_scheduler._ffi_api.DeserializeSearchTask(v5_log) assert task.workload_key == new_task.workload_key assert str(task.target) == str(new_task.target) - assert str(task.target_host) == str(new_task.target_host) + assert str(task.target.host) == str(new_task.target.host) assert task.layout_rewrite_option == new_task.layout_rewrite_option assert len(new_task.task_input_names) == 0 @@ -139,7 +139,7 @@ def test_recover_measure_input_with_task_input(): new_task = measure_log[0].task assert task.workload_key == new_task.workload_key assert str(task.target) == str(new_task.target) - assert str(task.target_host) == str(new_task.target_host) + assert str(task.target.host) == str(new_task.target.host) assert task.layout_rewrite_option == new_task.layout_rewrite_option # Log with 1 task input @@ -160,7 +160,7 @@ def test_recover_measure_input_with_task_input(): new_task = measure_log[0].task assert task.workload_key == new_task.workload_key assert str(task.target) == str(new_task.target) - assert str(task.target_host) == str(new_task.target_host) + assert str(task.target.host) == str(new_task.target.host) assert task.layout_rewrite_option == new_task.layout_rewrite_option assert len(new_task.task_input_names) == 1 assert new_task.task_input_names[0] == "test_input_0" @@ -184,7 +184,7 @@ def test_recover_measure_input_with_task_input(): new_task = measure_log[0].task assert task.workload_key == new_task.workload_key assert str(task.target) == str(new_task.target) - assert str(task.target_host) == str(new_task.target_host) + assert str(task.target.host) == str(new_task.target.host) assert task.layout_rewrite_option == new_task.layout_rewrite_option assert len(new_task.task_input_names) == 2 assert new_task.task_input_names[0] == "test_input_0" @@ -196,7 +196,7 @@ def test_recover_measure_input_with_task_input(): new_task = measure_log[0].task assert task.workload_key == new_task.workload_key assert str(task.target) == str(new_task.target) - assert str(task.target_host) == str(new_task.target_host) + assert str(task.target.host) == str(new_task.target.host) assert task.layout_rewrite_option == new_task.layout_rewrite_option assert len(new_task.task_input_names) == 0 diff --git a/tests/python/unittest/test_crt.py b/tests/python/unittest/test_crt.py index 1bd24c931b72..def396dad72c 100644 --- a/tests/python/unittest/test_crt.py +++ b/tests/python/unittest/test_crt.py @@ -32,6 +32,7 @@ import tvm import tvm.relay import tvm.testing +from tvm.target import Target from tvm.topi.utils import get_const_tuple from tvm.topi.testing import conv2d_nchw_python @@ -44,7 +45,7 @@ def _make_sess_from_op(workspace, op_name, sched, arg_bufs): with tvm.transform.PassContext(opt_level=3, config={"tir.disable_vectorize": True}): - mod = tvm.build(sched, 
arg_bufs, TARGET, target_host=TARGET, name=op_name) + mod = tvm.build(sched, arg_bufs, Target(TARGET, TARGET), name=op_name) return _make_session(workspace, mod) diff --git a/tests/python/unittest/test_target_codegen_device.py b/tests/python/unittest/test_target_codegen_device.py index 3b764c6709a8..42ddfc516f7e 100644 --- a/tests/python/unittest/test_target_codegen_device.py +++ b/tests/python/unittest/test_target_codegen_device.py @@ -71,7 +71,7 @@ def check_target(device, host="stackvm"): if not tvm.testing.device_enabled(device) or not tvm.testing.device_enabled(host): return ctx = tvm.context(device, 0) - mhost = tvm.driver.build(s, [A, B, D], target=device, target_host=host) + mhost = tvm.driver.build(s, [A, B, D], target=tvm.target.Target(device, host)) f = mhost.entry_func # launch the kernel. n = 1027 diff --git a/tests/python/unittest/test_target_codegen_hexagon.py b/tests/python/unittest/test_target_codegen_hexagon.py index b74d487f3fa7..6ffb2f4741e8 100644 --- a/tests/python/unittest/test_target_codegen_hexagon.py +++ b/tests/python/unittest/test_target_codegen_hexagon.py @@ -53,7 +53,9 @@ def check_add(offload): m = tvm.build(s, [C, A, B], target=target, name="offload_add") hexm = m.imported_modules[0] else: - hexm = tvm.build(s, [C, A, B], target=target, target_host=target, name="native_add") + hexm = tvm.build( + s, [C, A, B], target=tvm.target.Target(target, target), name="native_add" + ) asm = hexm.get_source("s") vadds = re.findall(r"v[0-9]+.b = vadd\(v[0-9]+.b,v[0-9]+.b\)", asm) @@ -71,7 +73,7 @@ def test_llvm_target_features(): A = tvm.te.placeholder((128,), dtype="uint8", name="A") C = tvm.te.compute((128,), lambda i: A[i] + 1, name="C") s = tvm.te.create_schedule(C.op) - m = tvm.build(s, [C, A], target=target, target_host=target, name="add_one") + m = tvm.build(s, [C, A], target=tvm.target.Target(target, target), name="add_one") llvm_ir = m.get_source("ll") # Make sure we find +hvx-length128b in "attributes". 
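The conversions above and below all reduce to the same recipe: build a single Target that carries the host and hand it to tvm.build where target_host used to go. A self-contained sketch of the converted call on a trivial schedule:

    import tvm
    from tvm import te

    n = 1024
    A = te.placeholder((n,), name="A")
    B = te.compute((n,), lambda i: A[i] + 1.0, name="B")
    s = te.create_schedule(B.op)
    # One Target object now carries both device and host.
    f = tvm.build(s, [A, B], tvm.target.Target("llvm", "llvm"), name="add_one")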
fs = re.findall(r"attributes.*\+hvx-length128b", llvm_ir) diff --git a/tests/python/unittest/test_tir_transform_instrument_bound_checkers.py b/tests/python/unittest/test_tir_transform_instrument_bound_checkers.py index 187013e132a4..bee8bfb60764 100644 --- a/tests/python/unittest/test_tir_transform_instrument_bound_checkers.py +++ b/tests/python/unittest/test_tir_transform_instrument_bound_checkers.py @@ -39,7 +39,7 @@ def test_out_of_bounds_llvm(index_a, index_b): tgt_host = "llvm" stmt = tvm.lower(s, [A, B, C], simple_mode=True) print(stmt) - fadd = tvm.build(s, [A, B, C], tgt, target_host=tgt_host, name="myadd") + fadd = tvm.build(s, [A, B, C], tvm.target.Target(tgt, tgt_host), name="myadd") ctx = tvm.context(tgt, 0) a = tvm.nd.array(np.random.uniform(size=1024).astype(A.dtype), ctx) b = tvm.nd.array(np.random.uniform(size=1024).astype(B.dtype), ctx) @@ -57,7 +57,7 @@ def test_in_bounds_llvm(): tgt = "llvm" tgt_host = "llvm" stmt = tvm.lower(s, [A, B, C], simple_mode=True) - fadd = tvm.build(s, [A, B, C], tgt, target_host=tgt_host, name="myadd") + fadd = tvm.build(s, [A, B, C], tvm.target.Target(tgt, tgt_host), name="myadd") ctx = tvm.context(tgt, 0) a = tvm.nd.array(np.random.uniform(size=1024).astype(A.dtype), ctx) b = tvm.nd.array(np.random.uniform(size=1024).astype(B.dtype), ctx) @@ -79,7 +79,7 @@ def test_out_of_bounds_vectorize_llvm(nn, index_a, index_b): tgt = "llvm" tgt_host = "llvm" stmt = tvm.lower(s, [a, b, c], simple_mode=True) - f = tvm.build(s, [a, b, c], tgt, target_host=tgt_host, name="myaddvec") + f = tvm.build(s, [a, b, c], tgt, tvm.target.Target(tgt, tgt_host), name="myaddvec") ctx = tvm.cpu(0) n = nn a = tvm.nd.array(np.random.uniform(size=(n)).astype(a.dtype), ctx) From 95539d9a4762501c287bb924326e48b4b8916e26 Mon Sep 17 00:00:00 2001 From: Xiyou Zhou Date: Mon, 8 Mar 2021 09:48:09 -0800 Subject: [PATCH 31/69] Fi test_runtime_heterogeneous.py --- tests/python/unittest/test_runtime_heterogeneous.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tests/python/unittest/test_runtime_heterogeneous.py b/tests/python/unittest/test_runtime_heterogeneous.py index 161f944ea7bb..6cb531517e1b 100644 --- a/tests/python/unittest/test_runtime_heterogeneous.py +++ b/tests/python/unittest/test_runtime_heterogeneous.py @@ -170,7 +170,9 @@ def check_device(device, target_device): ) target_flist = {target_device: lower_add, target_host: lower_sub} - mhost = tvm.build(target_flist, target_host=target_host) + target = tvm.target.Target.current() if target is None else target + target = target if target else "llvm" + mhost = tvm.build(target_flist, target=tvm.target.Target(target, target_host)) ctx = [host_ctx, device_ctx] mod = graph_runtime.create(graph, mhost, ctx) params = {} @@ -399,7 +401,9 @@ def check_device(device, target_device): lower_add0.update(lower_add1) target_flist = {target_device: lower_add0, target_host: lower_sub} - mhost = tvm.build(target_flist, target_host=target_host) + target = tvm.target.Target.current() if target is None else target + target = target if target else "llvm" + mhost = tvm.build(target_flist, target=tvm.target.Target(target, target_host)) ctx = [host_ctx, device_ctx] params = {} params["A"] = tensor_a = np.random.uniform(size=shape).astype(tensor_a.dtype) From 858d901e3374815ecbd4e4449e8ba65b9b745a67 Mon Sep 17 00:00:00 2001 From: Xiyou Zhou Date: Mon, 8 Mar 2021 10:01:52 -0800 Subject: [PATCH 32/69] Modify tutorials & remove extra print --- python/tvm/relay/build_module.py | 2 -- 
tutorials/auto_scheduler/tune_network_mali.py | 10 +++++++--- tutorials/autotvm/tune_relay_mobile_gpu.py | 5 ++--- tutorials/frontend/deploy_model_on_android.py | 2 +- tutorials/frontend/from_darknet.py | 2 +- tutorials/frontend/from_pytorch.py | 2 +- tutorials/frontend/from_tensorflow.py | 2 +- tutorials/get_started/cross_compilation_and_rpc.py | 2 +- tutorials/get_started/tensor_expr_get_started.py | 2 +- 9 files changed, 15 insertions(+), 14 deletions(-) diff --git a/python/tvm/relay/build_module.py b/python/tvm/relay/build_module.py index f10f479b6cd7..94c250f2090d 100644 --- a/python/tvm/relay/build_module.py +++ b/python/tvm/relay/build_module.py @@ -272,9 +272,7 @@ def build(mod, target=None, target_host=None, params=None, mod_name="default"): "instead of deprecated parameter mod (tvm.relay.function.Function)", DeprecationWarning, ) - print(target) target = _update_target(target) - print(target) if isinstance(target_host, (str, Target)): target_host = Target(target_host) elif target_host: diff --git a/tutorials/auto_scheduler/tune_network_mali.py b/tutorials/auto_scheduler/tune_network_mali.py index ca1067b27c80..329a632085f8 100644 --- a/tutorials/auto_scheduler/tune_network_mali.py +++ b/tutorials/auto_scheduler/tune_network_mali.py @@ -170,7 +170,9 @@ def get_network(name, batch_size, layout="NHWC", dtype="float32"): # Extract tasks from the network print("Extract tasks...") mod, params, input_shape, output_shape = get_network(network, batch_size, layout, dtype=dtype) -tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params, target, target_host) +tasks, task_weights = auto_scheduler.extract_tasks( + mod["main"], params, tvm.target.Target(target, target_host) +) for idx, task in enumerate(tasks): print("========== Task %d (workload key: %s) ==========" % (idx, task.workload_key)) @@ -198,7 +200,9 @@ def get_network(name, batch_size, layout="NHWC", dtype="float32"): # # .. 
code-block:: python # -# tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params, target, target_host, hardware_params) +# tasks, task_weights = auto_scheduler.extract_tasks( +# mod["main"], params, tvm.target.Target(target, target_host), hardware_params = hardware_params +# ) # ################################################################# @@ -240,7 +244,7 @@ def tune_and_evaluate(): with tvm.transform.PassContext( opt_level=3, config={"relay.backend.use_auto_scheduler": True} ): - lib = relay.build(mod, target=target, target_host=target_host, params=params) + lib = relay.build(mod, target=tvm.target.Target(target, target_host), params=params) # Create graph runtime print("=============== Request Remote ===============") diff --git a/tutorials/autotvm/tune_relay_mobile_gpu.py b/tutorials/autotvm/tune_relay_mobile_gpu.py index 859ac583236e..614a6a44d42b 100644 --- a/tutorials/autotvm/tune_relay_mobile_gpu.py +++ b/tutorials/autotvm/tune_relay_mobile_gpu.py @@ -316,8 +316,7 @@ def tune_and_evaluate(tuning_opt): mod, params, input_shape, _ = get_network(network, batch_size=1) tasks = autotvm.task.extract_from_program( mod["main"], - target=target, - target_host=target_host, + target=tvm.target.Target(target, target_host), params=params, ops=(relay.op.get("nn.conv2d"),), ) @@ -331,7 +330,7 @@ def tune_and_evaluate(tuning_opt): print("Compile...") with tvm.transform.PassContext(opt_level=3): lib = relay.build_module.build( - mod, target=target, params=params, target_host=target_host + mod, target=tvm.target.Target(target, target_host), params=params ) # export library tmp = tempdir() diff --git a/tutorials/frontend/deploy_model_on_android.py b/tutorials/frontend/deploy_model_on_android.py index ff7ef44a7acb..0608c1839da3 100644 --- a/tutorials/frontend/deploy_model_on_android.py +++ b/tutorials/frontend/deploy_model_on_android.py @@ -275,7 +275,7 @@ def transform_image(image): mod, params = relay.frontend.from_keras(keras_mobilenet_v2, shape_dict) with tvm.transform.PassContext(opt_level=3): - lib = relay.build(mod, target=target, target_host=target_host, params=params) + lib = relay.build(mod, target=tvm.target.Target(target, target_host), params=params) # After `relay.build`, you will get three return values: graph, # library and the new parameter, since we do some optimization that will diff --git a/tutorials/frontend/from_darknet.py b/tutorials/frontend/from_darknet.py index fc770799f51a..e5e0efb4ba38 100644 --- a/tutorials/frontend/from_darknet.py +++ b/tutorials/frontend/from_darknet.py @@ -101,7 +101,7 @@ shape = {"data": data.shape} print("Compiling the model...") with tvm.transform.PassContext(opt_level=3): - lib = relay.build(mod, target=target, target_host=target_host, params=params) + lib = relay.build(mod, target=tvm.target.Target(target, target_host), params=params) [neth, netw] = shape["data"][2:] # Current image shape is 608x608 ###################################################################### diff --git a/tutorials/frontend/from_pytorch.py b/tutorials/frontend/from_pytorch.py index b5bcdf6792f9..4af9febe6b13 100644 --- a/tutorials/frontend/from_pytorch.py +++ b/tutorials/frontend/from_pytorch.py @@ -104,7 +104,7 @@ target_host = "llvm" ctx = tvm.cpu(0) with tvm.transform.PassContext(opt_level=3): - lib = relay.build(mod, target=target, target_host=target_host, params=params) + lib = relay.build(mod, target=tvm.target.Target(target, target_host), params=params) ###################################################################### # Execute the portable 
graph on TVM diff --git a/tutorials/frontend/from_tensorflow.py b/tutorials/frontend/from_tensorflow.py index 5cdc39588616..61014e4da261 100644 --- a/tutorials/frontend/from_tensorflow.py +++ b/tutorials/frontend/from_tensorflow.py @@ -145,7 +145,7 @@ # lib: target library which can be deployed on target with TVM runtime. with tvm.transform.PassContext(opt_level=3): - lib = relay.build(mod, target=target, target_host=target_host, params=params) + lib = relay.build(mod, target=tvm.target.Target(target, target_host), params=params) ###################################################################### # Execute the portable graph on TVM diff --git a/tutorials/get_started/cross_compilation_and_rpc.py b/tutorials/get_started/cross_compilation_and_rpc.py index 2386e7bdd135..455743c1a400 100644 --- a/tutorials/get_started/cross_compilation_and_rpc.py +++ b/tutorials/get_started/cross_compilation_and_rpc.py @@ -234,7 +234,7 @@ def run_opencl(): xo, xi = s[B].split(B.op.axis[0], factor=32) s[B].bind(xo, te.thread_axis("blockIdx.x")) s[B].bind(xi, te.thread_axis("threadIdx.x")) - func = tvm.build(s, [A, B], "opencl", target_host=target_host) + func = tvm.build(s, [A, B], tvm.target.Target("opencl", target_host)) remote = rpc.connect(opencl_device_host, opencl_device_port) diff --git a/tutorials/get_started/tensor_expr_get_started.py b/tutorials/get_started/tensor_expr_get_started.py index 7f1bb6a3d1e5..4ccce2dfbcbe 100644 --- a/tutorials/get_started/tensor_expr_get_started.py +++ b/tutorials/get_started/tensor_expr_get_started.py @@ -138,7 +138,7 @@ # function. fadd is the generated host wrapper function, it contains # a reference to the generated device function internally. # -fadd = tvm.build(s, [A, B, C], tgt, target_host=tgt_host, name="myadd") +fadd = tvm.build(s, [A, B, C], tvm.target.Target(tgt, tgt_host), name="myadd") ###################################################################### # Run the Function From d99b560dfee95eba1bfceab1ea0273b87cd709da Mon Sep 17 00:00:00 2001 From: Xiyou Zhou Date: Mon, 8 Mar 2021 12:50:47 -0800 Subject: [PATCH 33/69] Update more tests to new api --- tests/micro/qemu/test_zephyr.py | 2 +- tests/python/contrib/test_cudnn.py | 8 ++++---- tests/python/contrib/test_miopen.py | 4 ++-- tests/python/driver/tvmc/test_compiler.py | 3 +-- tests/python/frontend/tensorflow/test_forward.py | 4 +++- tests/python/integration/test_reduce.py | 4 +++- tests/python/integration/test_tuning.py | 7 +++---- 7 files changed, 17 insertions(+), 15 deletions(-) diff --git a/tests/micro/qemu/test_zephyr.py b/tests/micro/qemu/test_zephyr.py index 4c8bd5f5dae8..470c29f23030 100644 --- a/tests/micro/qemu/test_zephyr.py +++ b/tests/micro/qemu/test_zephyr.py @@ -46,7 +46,7 @@ def _make_sess_from_op(model, zephyr_board, west_cmd, op_name, sched, arg_bufs): target = tvm.target.target.micro(model) with tvm.transform.PassContext(opt_level=3, config={"tir.disable_vectorize": True}): - mod = tvm.build(sched, arg_bufs, target, target_host=target, name=op_name) + mod = tvm.build(sched, arg_bufs, tvm.target.Target(target, target), name=op_name) return _make_session(model, target, zephyr_board, west_cmd, mod) diff --git a/tests/python/contrib/test_cudnn.py b/tests/python/contrib/test_cudnn.py index 514f529b4692..5e7533637177 100644 --- a/tests/python/contrib/test_cudnn.py +++ b/tests/python/contrib/test_cudnn.py @@ -72,7 +72,7 @@ def verify_conv2d(data_dtype, conv_dtype, tensor_format=0, groups=1): # validation ctx = tvm.gpu(0) - f = tvm.build(s, [X, W, Y], "cuda", target_host="llvm", 
name="conv2d") + f = tvm.build(s, [X, W, Y], "cuda --host=llvm", name="conv2d") x_np = np.random.uniform(-1, 1, xshape).astype(data_dtype) w_np = np.random.uniform(-1, 1, wshape).astype(data_dtype) y_np = np.zeros(yshape).astype(data_dtype) @@ -150,7 +150,7 @@ def verify_conv3d(data_dtype, conv_dtype, tensor_format=0, groups=1): # validation ctx = tvm.gpu(0) - f = tvm.build(s, [X, W, Y], "cuda", target_host="llvm", name="conv3d") + f = tvm.build(s, [X, W, Y], "cuda --host=llvm", name="conv3d") x_np = np.random.uniform(-1, 1, xshape).astype(data_dtype) w_np = np.random.uniform(-1, 1, wshape).astype(data_dtype) y_np = np.zeros(yshape).astype(data_dtype) @@ -182,7 +182,7 @@ def verify_softmax(shape, axis, dtype="float32"): b_np = tvm.topi.testing.softmax_python(a_np) a = tvm.nd.array(a_np, ctx) b = tvm.nd.array(b_np, ctx) - f = tvm.build(s, [A, B], "cuda", target_host="llvm", name="softmax") + f = tvm.build(s, [A, B], "cuda --host=llvm", name="softmax") f(a, b) tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-3) @@ -199,7 +199,7 @@ def verify_softmax_4d(shape, dtype="float32"): b_np = b_np.reshape(n, h, w, c).transpose(0, 3, 1, 2) a = tvm.nd.array(a_np, ctx) b = tvm.nd.array(b_np, ctx) - f = tvm.build(s, [A, B], "cuda", target_host="llvm", name="softmax") + f = tvm.build(s, [A, B], "cuda --host=llvm", name="softmax") f(a, b) tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-3) diff --git a/tests/python/contrib/test_miopen.py b/tests/python/contrib/test_miopen.py index 317736725d08..7a7c10fc7f86 100644 --- a/tests/python/contrib/test_miopen.py +++ b/tests/python/contrib/test_miopen.py @@ -53,7 +53,7 @@ def test_conv2d(): def verify(): ctx = tvm.rocm(0) - f = tvm.build(s, [X, W, Y], "rocm", target_host="llvm", name="conv2d") + f = tvm.build(s, [X, W, Y], "rocm --host=llvm", name="conv2d") x = tvm.nd.array(np.random.uniform(-1, 1, xshape).astype(np.float32), ctx) w = tvm.nd.array(np.random.uniform(-1, 1, wshape).astype(np.float32), ctx) y = tvm.nd.array(np.random.uniform(-1, 1, yshape).astype(np.float32), ctx) @@ -63,7 +63,7 @@ def verify(): X, W, (stride_h, stride_w), (pad_h, pad_w), (dilation_h, dilation_w) ) s_ref = te.create_schedule(Y_ref.op) - f_ref = tvm.build(s_ref, [X, W, Y_ref], "rocm", target_host="llvm") + f_ref = tvm.build(s_ref, [X, W, Y_ref], "rocm --host=llvm") y_ref = tvm.nd.array(np.random.uniform(-1, 1, yshape).astype(np.float32), ctx) f_ref(x, w, y_ref) print("Max abs diff:", np.max(np.abs(y.asnumpy() - y_ref.asnumpy()))) diff --git a/tests/python/driver/tvmc/test_compiler.py b/tests/python/driver/tvmc/test_compiler.py index ae859298facd..6c2125262e0e 100644 --- a/tests/python/driver/tvmc/test_compiler.py +++ b/tests/python/driver/tvmc/test_compiler.py @@ -178,8 +178,7 @@ def test_compile_opencl(tflite_mobilenet_v1_0_25_128): graph, lib, params, dumps = tvmc.compiler.compile_model( tflite_mobilenet_v1_0_25_128, - target="opencl", - target_host="llvm", + target="opencl --host=llvm", alter_layout="NCHW", ) diff --git a/tests/python/frontend/tensorflow/test_forward.py b/tests/python/frontend/tensorflow/test_forward.py index 81aeb5ef886c..7ace735789de 100644 --- a/tests/python/frontend/tensorflow/test_forward.py +++ b/tests/python/frontend/tensorflow/test_forward.py @@ -164,7 +164,9 @@ def run_tvm_graph( return vmobj_to_list(result) else: with tvm.transform.PassContext(opt_level=opt_level, disabled_pass=disabled_pass): - graph, lib, params = relay.build(mod, target, target_host, params) + graph, lib, params = relay.build( + mod, tvm.target.Target(target, target_host), 
params=params
+            )
             from tvm.contrib import graph_runtime

             m = graph_runtime.create(graph, lib, ctx)
diff --git a/tests/python/integration/test_reduce.py b/tests/python/integration/test_reduce.py
index e978b83aabd6..21c9acb08eb6 100644
--- a/tests/python/integration/test_reduce.py
+++ b/tests/python/integration/test_reduce.py
@@ -45,7 +45,9 @@ def check_device(device, host="llvm"):
         if not tvm.testing.device_enabled(device):
             print("skip because %s is not enabled.." % device)
             return
-        freduce = tvm.build(s, args=[A, B], target=device, target_host=host, name="myreduce")
+        freduce = tvm.build(
+            s, args=[A, B], target=tvm.target.Target(device, host), name="myreduce"
+        )
         # launch the kernel.
         n = 1028
         m = 129
diff --git a/tests/python/integration/test_tuning.py b/tests/python/integration/test_tuning.py
index 813352c52096..b7b7298f6e30 100644
--- a/tests/python/integration/test_tuning.py
+++ b/tests/python/integration/test_tuning.py
@@ -131,12 +131,11 @@ def teardown_module():


 def get_sample_task(target=tvm.target.cuda(), target_host=None):
     """return a sample task for testing"""
+    target = tvm.target.Target(target, target_host)
+    target_host = target.host
     task = autotvm.task.create(
-        "testing/conv2d_no_batching",
-        args=(1, 7, 7, 512, 512, 3, 3),
-        target=target,
-        target_host=target_host,
+        "testing/conv2d_no_batching", args=(1, 7, 7, 512, 512, 3, 3), target=target
     )
     return task, target

From 62ec2d38da70b9e6c86033490cd9e5684f00c4ac Mon Sep 17 00:00:00 2001
From: Xiyou Zhou
Date: Mon, 8 Mar 2021 13:07:05 -0800
Subject: [PATCH 34/69] Refine the tutorial target usage

---
 tutorials/auto_scheduler/tune_network_mali.py      | 11 ++++------
 tutorials/autotvm/tune_relay_mobile_gpu.py         | 11 +++-------
 tutorials/frontend/deploy_model_on_android.py      | 14 +++++---------
 tutorials/frontend/from_darknet.py                 |  5 ++---
 tutorials/frontend/from_pytorch.py                 |  5 ++---
 tutorials/frontend/from_tensorflow.py              |  8 +++-----
 tutorials/get_started/cross_compilation_and_rpc.py |  4 ++--
 7 files changed, 21 insertions(+), 37 deletions(-)

diff --git a/tutorials/auto_scheduler/tune_network_mali.py b/tutorials/auto_scheduler/tune_network_mali.py
index 329a632085f8..2f3936b7feeb 100644
--- a/tutorials/auto_scheduler/tune_network_mali.py
+++ b/tutorials/auto_scheduler/tune_network_mali.py
@@ -139,8 +139,7 @@ def get_network(name, batch_size, layout="NHWC", dtype="float32"):
 use_ndk = True
 # Path to cross compiler
 os.environ["TVM_NDK_CC"] = "/usr/bin/aarch64-linux-gnu-g++"
-target_host = tvm.target.Target("llvm -mtriple=aarch64-linux-gnu")
-target = tvm.target.Target("opencl -device=mali")
+target = tvm.target.Target("opencl -device=mali", "llvm -mtriple=aarch64-linux-gnu")
 dtype = "float32"
 log_file = "%s-%s-B%d-%s.json" % (network, layout, batch_size, target.kind.name)
@@ -170,7 +169,7 @@ def get_network(name, batch_size, layout="NHWC", dtype="float32"):
 # Extract tasks from the network
 print("Extract tasks...")
 mod, params, input_shape, output_shape = get_network(network, batch_size, layout, dtype=dtype)
-tasks, task_weights = auto_scheduler.extract_tasks(
-    mod["main"], params, tvm.target.Target(target, target_host)
-)
+tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params, target)

 for idx, task in enumerate(tasks):
     print("========== Task %d  (workload key: %s) ==========" % (idx, task.workload_key))
@@ -198,7 +197,7 @@ def get_network(name, batch_size, layout="NHWC", dtype="float32"):
 #
 # .. 
code-block:: python # # tasks, task_weights = auto_scheduler.extract_tasks( -# mod["main"], params, tvm.target.Target(target, target_host), hardware_params = hardware_params +# mod["main"], params, target, hardware_params = hardware_params # ) # @@ -244,7 +241,7 @@ def tune_and_evaluate(): with tvm.transform.PassContext( opt_level=3, config={"relay.backend.use_auto_scheduler": True} ): - lib = relay.build(mod, target=tvm.target.Target(target, target_host), params=params) + lib = relay.build(mod, target, params=params) # Create graph runtime print("=============== Request Remote ===============") diff --git a/tutorials/autotvm/tune_relay_mobile_gpu.py b/tutorials/autotvm/tune_relay_mobile_gpu.py index 614a6a44d42b..7fb8dec8ce17 100644 --- a/tutorials/autotvm/tune_relay_mobile_gpu.py +++ b/tutorials/autotvm/tune_relay_mobile_gpu.py @@ -201,12 +201,9 @@ def get_network(name, batch_size): # set :code:`use_android` to True if you use android phone. #### DEVICE CONFIG #### - -target = tvm.target.Target("opencl -device=mali") - # Replace "aarch64-linux-gnu" with the correct target of your board. # This target host is used for cross compilation. You can query it by :code:`gcc -v` on your device. -target_host = "llvm -mtriple=aarch64-linux-gnu" +target = tvm.target.Target("opencl -device=mali", "llvm -mtriple=aarch64-linux-gnu") # Also replace this with the device key in your tracker device_key = "rk3399" @@ -316,7 +313,7 @@ def tune_and_evaluate(tuning_opt): mod, params, input_shape, _ = get_network(network, batch_size=1) tasks = autotvm.task.extract_from_program( mod["main"], - target=tvm.target.Target(target, target_host), + target, params=params, ops=(relay.op.get("nn.conv2d"),), ) @@ -329,9 +326,7 @@ def tune_and_evaluate(tuning_opt): with autotvm.apply_history_best(log_file): print("Compile...") with tvm.transform.PassContext(opt_level=3): - lib = relay.build_module.build( - mod, target=tvm.target.Target(target, target_host), params=params - ) + lib = relay.build_module.build(mod, target, params=params) # export library tmp = tempdir() if use_android: diff --git a/tutorials/frontend/deploy_model_on_android.py b/tutorials/frontend/deploy_model_on_android.py index 0608c1839da3..aa49d3a73db3 100644 --- a/tutorials/frontend/deploy_model_on_android.py +++ b/tutorials/frontend/deploy_model_on_android.py @@ -257,25 +257,21 @@ def transform_image(image): # Change target configuration. # Run `adb shell cat /proc/cpuinfo` to find the arch. 
arch = "arm64" -target = "llvm -mtriple=%s-linux-android" % arch -target_host = None +target = tvm.target.Target("llvm -mtriple=%s-linux-android" % arch) if local_demo: - target_host = None - target = "llvm" + target = tvm.target.Target("llvm") elif test_target == "opencl": - target_host = target - target = "opencl" + target = tvm.target.Target("opencl", target) elif test_target == "vulkan": - target_host = target - target = "vulkan" + target = tvm.target.Target("vulkan", target) input_name = "input_1" shape_dict = {input_name: x.shape} mod, params = relay.frontend.from_keras(keras_mobilenet_v2, shape_dict) with tvm.transform.PassContext(opt_level=3): - lib = relay.build(mod, target=tvm.target.Target(target, target_host), params=params) + lib = relay.build(mod, target, params=params) # After `relay.build`, you will get three return values: graph, # library and the new parameter, since we do some optimization that will diff --git a/tutorials/frontend/from_darknet.py b/tutorials/frontend/from_darknet.py index e5e0efb4ba38..3dace3e05697 100644 --- a/tutorials/frontend/from_darknet.py +++ b/tutorials/frontend/from_darknet.py @@ -94,14 +94,13 @@ # Import the graph to Relay # ------------------------- # compile the model -target = "llvm" -target_host = "llvm" +target = tvm.target.Target("llvm --host=llvm") ctx = tvm.cpu(0) data = np.empty([batch_size, net.c, net.h, net.w], dtype) shape = {"data": data.shape} print("Compiling the model...") with tvm.transform.PassContext(opt_level=3): - lib = relay.build(mod, target=tvm.target.Target(target, target_host), params=params) + lib = relay.build(mod, target, params=params) [neth, netw] = shape["data"][2:] # Current image shape is 608x608 ###################################################################### diff --git a/tutorials/frontend/from_pytorch.py b/tutorials/frontend/from_pytorch.py index 4af9febe6b13..891593557005 100644 --- a/tutorials/frontend/from_pytorch.py +++ b/tutorials/frontend/from_pytorch.py @@ -100,11 +100,10 @@ # Relay Build # ----------- # Compile the graph to llvm target with given input specification. -target = "llvm" -target_host = "llvm" +target = tvm.target.Target("llvm --host=llvm") ctx = tvm.cpu(0) with tvm.transform.PassContext(opt_level=3): - lib = relay.build(mod, target=tvm.target.Target(target, target_host), params=params) + lib = relay.build(mod, target, params=params) ###################################################################### # Execute the portable graph on TVM diff --git a/tutorials/frontend/from_tensorflow.py b/tutorials/frontend/from_tensorflow.py index 61014e4da261..42ad8824c371 100644 --- a/tutorials/frontend/from_tensorflow.py +++ b/tutorials/frontend/from_tensorflow.py @@ -70,12 +70,10 @@ # Target settings # Use these commented settings to build for cuda. -# target = 'cuda' -# target_host = 'llvm' +# target = tvm.target.Target('cuda --host=llvm') # layout = "NCHW" # ctx = tvm.gpu(0) -target = "llvm" -target_host = "llvm" +target = tvm.target.Target("llvm --host=llvm") layout = None ctx = tvm.cpu(0) @@ -145,7 +143,7 @@ # lib: target library which can be deployed on target with TVM runtime. 
with tvm.transform.PassContext(opt_level=3): - lib = relay.build(mod, target=tvm.target.Target(target, target_host), params=params) + lib = relay.build(mod, target, params=params) ###################################################################### # Execute the portable graph on TVM diff --git a/tutorials/get_started/cross_compilation_and_rpc.py b/tutorials/get_started/cross_compilation_and_rpc.py index 455743c1a400..69cbc188a325 100644 --- a/tutorials/get_started/cross_compilation_and_rpc.py +++ b/tutorials/get_started/cross_compilation_and_rpc.py @@ -225,16 +225,16 @@ def run_opencl(): # NOTE: This is the setting for my rk3399 board. You need to modify # them according to your environment. - target_host = "llvm -mtriple=aarch64-linux-gnu" opencl_device_host = "10.77.1.145" opencl_device_port = 9090 + target = tvm.target.Target("opencl", "llvm -mtriple=aarch64-linux-gnu") # create schedule for the above "add one" compute declaration s = te.create_schedule(B.op) xo, xi = s[B].split(B.op.axis[0], factor=32) s[B].bind(xo, te.thread_axis("blockIdx.x")) s[B].bind(xi, te.thread_axis("threadIdx.x")) - func = tvm.build(s, [A, B], tvm.target.Target("opencl", target_host)) + func = tvm.build(s, [A, B], target) remote = rpc.connect(opencl_device_host, opencl_device_port) From 6916758dcef789ce367329b0d157d80b4905c67e Mon Sep 17 00:00:00 2001 From: Xiyou Zhou Date: Mon, 8 Mar 2021 13:21:36 -0800 Subject: [PATCH 35/69] change argument name for Target constructor function --- python/tvm/autotvm/task/task.py | 3 ++- python/tvm/target/target.py | 28 +++++++++++++--------------- 2 files changed, 15 insertions(+), 16 deletions(-) diff --git a/python/tvm/autotvm/task/task.py b/python/tvm/autotvm/task/task.py index a96ae7311fc9..349128f342f2 100644 --- a/python/tvm/autotvm/task/task.py +++ b/python/tvm/autotvm/task/task.py @@ -450,6 +450,7 @@ def create(task_name, args, target, target_host=None): target = Target(target) target = Target(target, target_host) + target_host = target.host # init config space ret.config_space = ConfigSpace() @@ -462,7 +463,7 @@ def create(task_name, args, target, target_host=None): ret.flop = ret.config_space.flop or compute_flop(sch) ret.target = target - ret.target_host = target.host + ret.target_host = target_host return ret diff --git a/python/tvm/target/target.py b/python/tvm/target/target.py index 3399ccd58144..4ab0e86e4b29 100644 --- a/python/tvm/target/target.py +++ b/python/tvm/target/target.py @@ -46,7 +46,7 @@ class Target(Object): - :py:func:`tvm.target.intel_graphics` create Intel Graphics target """ - def __init__(self, tag_or_str_or_dict, host_tag_or_str_or_dict=None): + def __init__(self, target, host=None): """Construct a TVM target object from 1) Raw target string 2) Target config dict @@ -54,7 +54,7 @@ def __init__(self, tag_or_str_or_dict, host_tag_or_str_or_dict=None): Parameters ---------- - tag_or_str_or_dict : Union[str, Dict[str, Any]] + target : Union[str, Dict[str, Any]] Can be one of a literal target string, a json string describing a configuration, or a dictionary of configuration options. When using a dictionary or json string to configure target, the @@ -87,23 +87,21 @@ def __init__(self, tag_or_str_or_dict, host_tag_or_str_or_dict=None): An llvm setting that is one of 'hard' or 'soft' indicating whether to use hardware or software floating-point operations. host : Union[str, Dict[str, Any]] (optional) - Description for target host. Can be recursive. Similar to tag_or_str_or_dict. 
-        host_tag_or_str_or_dict : Optional[Union[str, Dict[str, Any]]]
-            Similar to tag_or_str_or_dict but for target host. Can be one of a literal
-            target host string, a json string describing a configuration, or a dictionary of
-            configuration options. When using a dictionary or json string to configure target,
-            the possible values are same as tag_or_str_or_dict.
+            Description for target host. Can be recursive. Similar to target. Can be one of a
+            literal target host string, a json string describing a configuration, or a
+            dictionary of configuration options. When using a dictionary or json string to
+            configure the target host, the possible values are the same as for target.
         """
-        if tag_or_str_or_dict is None or not isinstance(tag_or_str_or_dict, (dict, str, Target)):
+        if target is None or not isinstance(target, (dict, str, Target)):
             raise ValueError("target has to be a string or dictionary.")
-        if host_tag_or_str_or_dict is not None:
-            if not isinstance(host_tag_or_str_or_dict, (dict, str, Target)):
+        if host is not None:
+            if not isinstance(host, (dict, str, Target)):
                 raise ValueError("target host has to be a string or dictionary.")
-            self.__init_handle_by_constructor__(
-                _ffi_api.Target, Target(tag_or_str_or_dict), Target(host_tag_or_str_or_dict)
-            )
+            self.__init_handle_by_constructor__(_ffi_api.Target, Target(target), Target(host))
         else:
-            self.__init_handle_by_constructor__(_ffi_api.Target, tag_or_str_or_dict)
+            self.__init_handle_by_constructor__(_ffi_api.Target, target)

     def __enter__(self):
         _ffi_api.TargetEnterScope(self)

From a762d7d7420d40fdfd6841e494669d3296a63896 Mon Sep 17 00:00:00 2001
From: Xiyou Zhou
Date: Mon, 8 Mar 2021 17:15:17 -0800
Subject: [PATCH 36/69] Fix target export function

---
 src/target/target.cc | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/target/target.cc b/src/target/target.cc
index a8bb00b6f503..cc559444ba27 100644
--- a/src/target/target.cc
+++ b/src/target/target.cc
@@ -408,6 +408,8 @@ Map TargetNode::Export() const {
       {"tag", this->tag},
       {"keys", this->keys},
   };
+  if (this->host.defined())
+    result.Set("host", this->GetHost().value_or(Target())->Export());
   for (const auto& kv : attrs) {
     result.Set(kv.first, kv.second);
   }

From b01f6cc3749e01ab0260a8fd80ed713b5f9db24d Mon Sep 17 00:00:00 2001
From: Xiyou Zhou
Date: Mon, 8 Mar 2021 18:01:58 -0800
Subject: [PATCH 37/69] Fix and validate all tutorial usage

---
 tutorials/auto_scheduler/tune_network_mali.py      |  2 +-
 tutorials/autotvm/tune_relay_mobile_gpu.py         |  6 ++---
 tutorials/frontend/deploy_model_on_android.py      |  6 ++---
 tutorials/frontend/from_darknet.py                 |  4 +--
 tutorials/frontend/from_pytorch.py                 |  4 +--
 tutorials/frontend/from_tensorflow.py              |  4 +--
 .../get_started/cross_compilation_and_rpc.py       |  4 +--
 .../get_started/tensor_expr_get_started.py         | 27 +++++++++----------
 8 files changed, 28 insertions(+), 29 deletions(-)

diff --git a/tutorials/auto_scheduler/tune_network_mali.py b/tutorials/auto_scheduler/tune_network_mali.py
index 2f3936b7feeb..0af429b994a7 100644
--- a/tutorials/auto_scheduler/tune_network_mali.py
+++ b/tutorials/auto_scheduler/tune_network_mali.py
@@ -139,7 +139,7 @@ def get_network(name, batch_size, layout="NHWC", dtype="float32"):
 use_ndk = True
 # Path to cross compiler
 os.environ["TVM_NDK_CC"] = "/usr/bin/aarch64-linux-gnu-g++"
-target = tvm.target.Target("opencl -device=mali", "llvm -mtriple=aarch64-linux-gnu")
+target = tvm.target.Target("opencl -device=mali", host="llvm -mtriple=aarch64-linux-gnu")
 dtype = 
"float32" log_file = "%s-%s-B%d-%s.json" % (network, layout, batch_size, target.kind.name) diff --git a/tutorials/autotvm/tune_relay_mobile_gpu.py b/tutorials/autotvm/tune_relay_mobile_gpu.py index 7fb8dec8ce17..8ba47c4eaabf 100644 --- a/tutorials/autotvm/tune_relay_mobile_gpu.py +++ b/tutorials/autotvm/tune_relay_mobile_gpu.py @@ -203,7 +203,7 @@ def get_network(name, batch_size): #### DEVICE CONFIG #### # Replace "aarch64-linux-gnu" with the correct target of your board. # This target host is used for cross compilation. You can query it by :code:`gcc -v` on your device. -target = tvm.target.Target("opencl -device=mali", "llvm -mtriple=aarch64-linux-gnu") +target = tvm.target.Target("opencl -device=mali", host="llvm -mtriple=aarch64-linux-gnu") # Also replace this with the device key in your tracker device_key = "rk3399" @@ -313,7 +313,7 @@ def tune_and_evaluate(tuning_opt): mod, params, input_shape, _ = get_network(network, batch_size=1) tasks = autotvm.task.extract_from_program( mod["main"], - target, + target=target, params=params, ops=(relay.op.get("nn.conv2d"),), ) @@ -326,7 +326,7 @@ def tune_and_evaluate(tuning_opt): with autotvm.apply_history_best(log_file): print("Compile...") with tvm.transform.PassContext(opt_level=3): - lib = relay.build_module.build(mod, target, params=params) + lib = relay.build_module.build(mod, target=target, params=params) # export library tmp = tempdir() if use_android: diff --git a/tutorials/frontend/deploy_model_on_android.py b/tutorials/frontend/deploy_model_on_android.py index aa49d3a73db3..aa9726b6e27a 100644 --- a/tutorials/frontend/deploy_model_on_android.py +++ b/tutorials/frontend/deploy_model_on_android.py @@ -262,16 +262,16 @@ def transform_image(image): if local_demo: target = tvm.target.Target("llvm") elif test_target == "opencl": - target = tvm.target.Target("opencl", target) + target = tvm.target.Target("opencl", host=target) elif test_target == "vulkan": - target = tvm.target.Target("vulkan", target) + target = tvm.target.Target("vulkan", host=target) input_name = "input_1" shape_dict = {input_name: x.shape} mod, params = relay.frontend.from_keras(keras_mobilenet_v2, shape_dict) with tvm.transform.PassContext(opt_level=3): - lib = relay.build(mod, target, params=params) + lib = relay.build(mod, target=target, params=params) # After `relay.build`, you will get three return values: graph, # library and the new parameter, since we do some optimization that will diff --git a/tutorials/frontend/from_darknet.py b/tutorials/frontend/from_darknet.py index 3dace3e05697..76205b526e85 100644 --- a/tutorials/frontend/from_darknet.py +++ b/tutorials/frontend/from_darknet.py @@ -94,13 +94,13 @@ # Import the graph to Relay # ------------------------- # compile the model -target = tvm.target.Target("llvm --host=llvm") +target = tvm.target.Target("llvm", host="llvm") ctx = tvm.cpu(0) data = np.empty([batch_size, net.c, net.h, net.w], dtype) shape = {"data": data.shape} print("Compiling the model...") with tvm.transform.PassContext(opt_level=3): - lib = relay.build(mod, target, params=params) + lib = relay.build(mod, target=target, params=params) [neth, netw] = shape["data"][2:] # Current image shape is 608x608 ###################################################################### diff --git a/tutorials/frontend/from_pytorch.py b/tutorials/frontend/from_pytorch.py index 891593557005..069626407e63 100644 --- a/tutorials/frontend/from_pytorch.py +++ b/tutorials/frontend/from_pytorch.py @@ -100,10 +100,10 @@ # Relay Build # ----------- # Compile the graph to 
llvm target with given input specification. -target = tvm.target.Target("llvm --host=llvm") +target = tvm.target.Target("llvm", host="llvm") ctx = tvm.cpu(0) with tvm.transform.PassContext(opt_level=3): - lib = relay.build(mod, target, params=params) + lib = relay.build(mod, target=target, params=params) ###################################################################### # Execute the portable graph on TVM diff --git a/tutorials/frontend/from_tensorflow.py b/tutorials/frontend/from_tensorflow.py index 42ad8824c371..3d01c6b0e407 100644 --- a/tutorials/frontend/from_tensorflow.py +++ b/tutorials/frontend/from_tensorflow.py @@ -70,10 +70,10 @@ # Target settings # Use these commented settings to build for cuda. -# target = tvm.target.Target('cuda --host=llvm') +# target = tvm.target.Target("cuda", host="llvm") # layout = "NCHW" # ctx = tvm.gpu(0) -target = tvm.target.Target("llvm --host=llvm") +target = tvm.target.Target("llvm", host="llvm") layout = None ctx = tvm.cpu(0) diff --git a/tutorials/get_started/cross_compilation_and_rpc.py b/tutorials/get_started/cross_compilation_and_rpc.py index 69cbc188a325..cc5b9cc5bc2c 100644 --- a/tutorials/get_started/cross_compilation_and_rpc.py +++ b/tutorials/get_started/cross_compilation_and_rpc.py @@ -227,14 +227,14 @@ def run_opencl(): # them according to your environment. opencl_device_host = "10.77.1.145" opencl_device_port = 9090 - target = tvm.target.Target("opencl", "llvm -mtriple=aarch64-linux-gnu") + target = tvm.target.Target("opencl", host="llvm -mtriple=aarch64-linux-gnu") # create schedule for the above "add one" compute declaration s = te.create_schedule(B.op) xo, xi = s[B].split(B.op.axis[0], factor=32) s[B].bind(xo, te.thread_axis("blockIdx.x")) s[B].bind(xi, te.thread_axis("threadIdx.x")) - func = tvm.build(s, [A, B], target) + func = tvm.build(s, [A, B], target=target) remote = rpc.connect(opencl_device_host, opencl_device_port) diff --git a/tutorials/get_started/tensor_expr_get_started.py b/tutorials/get_started/tensor_expr_get_started.py index 4ccce2dfbcbe..ac6d547dd1a3 100644 --- a/tutorials/get_started/tensor_expr_get_started.py +++ b/tutorials/get_started/tensor_expr_get_started.py @@ -36,9 +36,8 @@ # Global declarations of environment. -tgt_host = "llvm" -# Change it to respective GPU if gpu is enabled Ex: cuda, opencl, rocm -tgt = "cuda" +# Change target to respective GPU if gpu is enabled Ex: cuda, opencl, rocm +tgt = tvm.target.Target(target="cuda", host="llvm") ###################################################################### # Vector Add Example @@ -117,7 +116,7 @@ # compute grid. These are GPU specific constructs that allow us # to generate code that runs on GPU. # -if tgt == "cuda" or tgt == "rocm" or tgt.startswith("opencl"): +if tgt.kind.name == "cuda" or tgt.kind.name == "rocm" or tgt.kind.name.startswith("opencl"): s[C].bind(bx, te.thread_axis("blockIdx.x")) s[C].bind(tx, te.thread_axis("threadIdx.x")) @@ -138,7 +137,7 @@ # function. fadd is the generated host wrapper function, it contains # a reference to the generated device function internally. # -fadd = tvm.build(s, [A, B, C], tvm.target.Target(tgt, tgt_host), name="myadd") +fadd = tvm.build(s, [A, B, C], target=tgt, name="myadd") ###################################################################### # Run the Function @@ -154,7 +153,7 @@ # - fadd runs the actual computation. 
# - asnumpy() copies the GPU array back to the CPU and we can use this to verify correctness # -ctx = tvm.context(tgt, 0) +ctx = tvm.context(str(tgt), 0) n = 1024 a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx) @@ -172,7 +171,7 @@ # # The following code fetches the device module and prints the content code. # -if tgt == "cuda" or tgt == "rocm" or tgt.startswith("opencl"): +if tgt.kind.name == "cuda" or tgt.kind.name == "rocm" or tgt.kind.name.startswith("opencl"): dev_module = fadd.imported_modules[0] print("-----GPU code-----") print(dev_module.get_source()) @@ -214,11 +213,11 @@ temp = utils.tempdir() fadd.save(temp.relpath("myadd.o")) -if tgt == "cuda": +if tgt.kind.name == "cuda": fadd.imported_modules[0].save(temp.relpath("myadd.ptx")) -if tgt == "rocm": +if tgt.kind.name == "rocm": fadd.imported_modules[0].save(temp.relpath("myadd.hsaco")) -if tgt.startswith("opencl"): +if tgt.kind.name.startswith("opencl"): fadd.imported_modules[0].save(temp.relpath("myadd.cl")) cc.create_shared(temp.relpath("myadd.so"), [temp.relpath("myadd.o")]) print(temp.listdir()) @@ -240,15 +239,15 @@ # re-links them together. We can verify that the newly loaded function works. # fadd1 = tvm.runtime.load_module(temp.relpath("myadd.so")) -if tgt == "cuda": +if tgt.kind.name == "cuda": fadd1_dev = tvm.runtime.load_module(temp.relpath("myadd.ptx")) fadd1.import_module(fadd1_dev) -if tgt == "rocm": +if tgt.kind.name == "rocm": fadd1_dev = tvm.runtime.load_module(temp.relpath("myadd.hsaco")) fadd1.import_module(fadd1_dev) -if tgt.startswith("opencl"): +if tgt.kind.name.startswith("opencl"): fadd1_dev = tvm.runtime.load_module(temp.relpath("myadd.cl")) fadd1.import_module(fadd1_dev) @@ -290,7 +289,7 @@ # The following code blocks generate OpenCL code, creates array on an OpenCL # device, and verifies the correctness of the code. 
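Taken together, the tutorial fixes above all make the same change: the host is attached to the device target through the new host= keyword instead of being carried as a separate positional string, and string comparisons on the target give way to tgt.kind.name checks. A minimal sketch of the resulting API (assuming an LLVM-enabled build; the Mali/aarch64 strings are just the ones used in these tutorials):

    import tvm

    # One Target object now carries both the device target and its host.
    tgt = tvm.target.Target("opencl -device=mali", host="llvm -mtriple=aarch64-linux-gnu")
    assert tgt.kind.name == "opencl"
    assert tgt.host.kind.name == "llvm"
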
# -if tgt.startswith("opencl"): +if tgt.kind.name.startswith("opencl"): fadd_cl = tvm.build(s, [A, B, C], tgt, name="myadd") print("------opencl code------") print(fadd_cl.imported_modules[0].get_source()) From b480bee0075cd73320802907e3eccc25755d1b14 Mon Sep 17 00:00:00 2001 From: Xiyou Zhou Date: Mon, 8 Mar 2021 18:07:01 -0800 Subject: [PATCH 38/69] Remove unused argument --- python/tvm/auto_scheduler/measure.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/tvm/auto_scheduler/measure.py b/python/tvm/auto_scheduler/measure.py index 197861be7b5f..180aef29766d 100644 --- a/python/tvm/auto_scheduler/measure.py +++ b/python/tvm/auto_scheduler/measure.py @@ -224,7 +224,6 @@ def recover_measure_input(inp, rebuild_state=False): new_task = SearchTask( workload_key=task.workload_key, target=tvm.target.Target(task.target, task.target_host), - target_host=None, hardware_params=task.hardware_params, layout_rewrite_option=task.layout_rewrite_option, task_inputs=list(task.task_input_names), From c17a18e801c5a10abf0b99892833abad5e218dae Mon Sep 17 00:00:00 2001 From: Xiyou Zhou Date: Mon, 8 Mar 2021 22:07:49 -0800 Subject: [PATCH 39/69] Fix format --- src/target/target.cc | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/target/target.cc b/src/target/target.cc index cc559444ba27..7054e6a22bc0 100644 --- a/src/target/target.cc +++ b/src/target/target.cc @@ -408,8 +408,7 @@ Map TargetNode::Export() const { {"tag", this->tag}, {"keys", this->keys}, }; - if (this->host.defined()) - result.Set("host", this->GetHost().value_or(Target())->Export()); + if (this->host.defined()) result.Set("host", this->GetHost().value_or(Target())->Export()); for (const auto& kv : attrs) { result.Set(kv.first, kv.second); } From a64efd65da4a23aaeb6a21e4f7afba720c2084e1 Mon Sep 17 00:00:00 2001 From: Xiyou Zhou Date: Tue, 9 Mar 2021 15:30:46 -0800 Subject: [PATCH 40/69] Fix bug in driver/build_module.py for heterogeneous target --- tests/python/unittest/test_runtime_heterogeneous.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/python/unittest/test_runtime_heterogeneous.py b/tests/python/unittest/test_runtime_heterogeneous.py index 6cb531517e1b..2317248adf4c 100644 --- a/tests/python/unittest/test_runtime_heterogeneous.py +++ b/tests/python/unittest/test_runtime_heterogeneous.py @@ -172,7 +172,8 @@ def check_device(device, target_device): target_flist = {target_device: lower_add, target_host: lower_sub} target = tvm.target.Target.current() if target is None else target target = target if target else "llvm" - mhost = tvm.build(target_flist, target=tvm.target.Target(target, target_host)) + target = tvm.target.Target(target, target_host) + mhost = tvm.build(target_flist, target=target) ctx = [host_ctx, device_ctx] mod = graph_runtime.create(graph, mhost, ctx) params = {} @@ -403,7 +404,8 @@ def check_device(device, target_device): target_flist = {target_device: lower_add0, target_host: lower_sub} target = tvm.target.Target.current() if target is None else target target = target if target else "llvm" - mhost = tvm.build(target_flist, target=tvm.target.Target(target, target_host)) + target = tvm.target.Target(target, target_host) + mhost = tvm.build(target_flist, target=target) ctx = [host_ctx, device_ctx] params = {} params["A"] = tensor_a = np.random.uniform(size=shape).astype(tensor_a.dtype) From fa982a9c484ed93d8e633f29c613ea7ccac8a171 Mon Sep 17 00:00:00 2001 From: Xiyou Zhou Date: Tue, 9 Mar 2021 15:31:26 -0800 Subject: [PATCH 41/69] Fix bug in 
driver/build_module.py for heterogeneous target more

---
 python/tvm/driver/build_module.py   | 25 ++++++++++++-------------
 tests/python/contrib/test_dlpack.py |  4 +++-
 2 files changed, 15 insertions(+), 14 deletions(-)

diff --git a/python/tvm/driver/build_module.py b/python/tvm/driver/build_module.py
index 860572a45d0e..0a2e7d05db0e 100644
--- a/python/tvm/driver/build_module.py
+++ b/python/tvm/driver/build_module.py
@@ -386,12 +386,10 @@ def build(inputs, args=None, target=None, target_host=None, name="default_functi
             f"but got {type(inputs)}."
         )

-    flag_target_inputs = False
     if not isinstance(inputs, (dict, container.Map)):
         target = Target.current() if target is None else target
         target = target if target else "llvm"
         target_input_mod = {target: input_mod}
-        flag_target_inputs = True
     else:
         target_input_mod = inputs

@@ -401,13 +399,12 @@ def build(inputs, args=None, target=None, target_host=None, name="default_functi
         if not isinstance(mod, tvm.IRModule):
             raise ValueError("inputs must be Schedule, IRModule," "or dict of str to IRModule.")

-    target = Target(target, target_host)
-    target_host = target.host
-    if flag_target_inputs:
-        target_input_mod = {target: input_mod}
+    for tar, mod in target_input_mod.items():
+        if isinstance(tar, (str, Target)):
+            target_host = Target(tar, target_host).host

     if not target_host:
-        for tar, _ in target_input_mod.items():
+        for tar, mod in target_input_mod.items():
             tar = Target(tar)
             device_type = ndarray.context(tar.kind.name, 0).device_type
             if device_type == ndarray.cpu(0).device_type:
@@ -416,10 +413,14 @@ def build(inputs, args=None, target=None, target_host=None, name="default_functi
     if not target_host:
         target_host = "llvm" if tvm.runtime.enabled("llvm") else "stackvm"

-    target = Target(target, target_host)
-    target_host = target.host
-    if flag_target_inputs:
-        target_input_mod = {target: input_mod}
+    new_input_mod = {}
+    for tar, mod in target_input_mod.items():
+        if isinstance(tar, (str, Target)):
+            new_tar = Target(target=tar, host=target_host)
+            new_input_mod[new_tar] = mod
+        else:
+            new_input_mod[tar] = mod
+    target_input_mod = new_input_mod

     mod_host_all = tvm.IRModule({})

@@ -439,8 +440,6 @@ def build(inputs, args=None, target=None, target_host=None, name="default_functi
     if not isinstance(target_host, Target):
         target_host = Target(target_host)

-    target = Target(target, target_host)
-    target_host = target.host
     if (
         target_host.attrs.get("runtime", tvm.runtime.String("c++")) == "c"
         and target_host.attrs.get("system-lib", 0).value == 1
diff --git a/tests/python/contrib/test_dlpack.py b/tests/python/contrib/test_dlpack.py
index 6ff2529f7570..8bf9069b78cf 100644
--- a/tests/python/contrib/test_dlpack.py
+++ b/tests/python/contrib/test_dlpack.py
@@ -49,7 +49,9 @@ def test():
     k = te.reduce_axis((0, n), name="k")
     ZZ = te.compute((n, n), lambda i, j: te.sum(XX[i, k] * YY[k, j], axis=k))
     s = te.create_schedule(ZZ.op)
-    f = tvm.build(s, [XX, YY, ZZ], target_host="llvm", name="f")
+    # No need to specify target_host if it's llvm
+    # Otherwise you will need to specify the target and target_host
+    f = tvm.build(s, [XX, YY, ZZ], name="f")
     f_pytorch = to_pytorch_func(f)
     zz2 = torch.empty(137, 137)

From 33c405779f60671cc10b801660f25e933fce1043 Mon Sep 17 00:00:00 2001
From: Xiyou Zhou
Date: Tue, 9 Mar 2021 22:00:56 -0800
Subject: [PATCH 42/69] Fix target host type error

---
 python/tvm/driver/build_module.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/python/tvm/driver/build_module.py b/python/tvm/driver/build_module.py
index 0a2e7d05db0e..c8493180e023 100644
---
a/python/tvm/driver/build_module.py +++ b/python/tvm/driver/build_module.py @@ -417,6 +417,7 @@ def build(inputs, args=None, target=None, target_host=None, name="default_functi for tar, mod in target_input_mod.items(): if isinstance(tar, (str, Target)): new_tar = Target(target=tar, host=target_host) + target_host = new_tar.host new_input_mod[new_tar] = mod else: new_input_mod[tar] = mod From 75d0f4420cb1d1d37bdd9560b4cc3bff5ad4d2f7 Mon Sep 17 00:00:00 2001 From: Xiyou Zhou Date: Wed, 10 Mar 2021 11:19:31 -0800 Subject: [PATCH 43/69] Fix cudnn target host bug --- python/tvm/driver/build_module.py | 8 +++++++- tests/python/contrib/test_cudnn.py | 6 +++--- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/python/tvm/driver/build_module.py b/python/tvm/driver/build_module.py index c8493180e023..3c2a7209b85b 100644 --- a/python/tvm/driver/build_module.py +++ b/python/tvm/driver/build_module.py @@ -399,9 +399,15 @@ def build(inputs, args=None, target=None, target_host=None, name="default_functi if not isinstance(mod, tvm.IRModule): raise ValueError("inputs must be Schedule, IRModule," "or dict of str to IRModule.") + new_input_mod = {} for tar, mod in target_input_mod.items(): if isinstance(tar, (str, Target)): - target_host = Target(tar, target_host).host + new_tar = Target(target=tar, host=target_host) + target_host = new_tar.host + new_input_mod[new_tar] = mod + else: + new_input_mod[tar] = mod + target_input_mod = new_input_mod if not target_host: for tar, mod in target_input_mod.items(): diff --git a/tests/python/contrib/test_cudnn.py b/tests/python/contrib/test_cudnn.py index 5e7533637177..a4425ab6fd90 100644 --- a/tests/python/contrib/test_cudnn.py +++ b/tests/python/contrib/test_cudnn.py @@ -150,7 +150,7 @@ def verify_conv3d(data_dtype, conv_dtype, tensor_format=0, groups=1): # validation ctx = tvm.gpu(0) - f = tvm.build(s, [X, W, Y], "cuda --host=llvm", name="conv3d") + f = tvm.build(s, [X, W, Y], target="cuda --host=llvm", name="conv3d") x_np = np.random.uniform(-1, 1, xshape).astype(data_dtype) w_np = np.random.uniform(-1, 1, wshape).astype(data_dtype) y_np = np.zeros(yshape).astype(data_dtype) @@ -182,7 +182,7 @@ def verify_softmax(shape, axis, dtype="float32"): b_np = tvm.topi.testing.softmax_python(a_np) a = tvm.nd.array(a_np, ctx) b = tvm.nd.array(b_np, ctx) - f = tvm.build(s, [A, B], "cuda --host=llvm", name="softmax") + f = tvm.build(s, [A, B], target="cuda --host=llvm", name="softmax") f(a, b) tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-3) @@ -199,7 +199,7 @@ def verify_softmax_4d(shape, dtype="float32"): b_np = b_np.reshape(n, h, w, c).transpose(0, 3, 1, 2) a = tvm.nd.array(a_np, ctx) b = tvm.nd.array(b_np, ctx) - f = tvm.build(s, [A, B], "cuda --host=llvm", name="softmax") + f = tvm.build(s, [A, B], target="cuda --host=llvm", name="softmax") f(a, b) tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-3) From 47bcc4c3725b2760562e4d16dc131de38b7dec07 Mon Sep 17 00:00:00 2001 From: Xiyou Zhou Date: Fri, 12 Mar 2021 16:01:10 -0800 Subject: [PATCH 44/69] Fix according to reviews, add helper function in python --- python/tvm/driver/build_module.py | 21 +++------------- python/tvm/driver/tvmc/autotuner.py | 6 ++--- python/tvm/relay/backend/vm.py | 24 ++++--------------- python/tvm/relay/build_module.py | 17 +++---------- python/tvm/target/target.py | 17 +++++++++++++ src/auto_scheduler/feature.cc | 3 ++- src/driver/driver_api.cc | 4 ++-- src/target/target.cc | 6 +++-- tests/micro/qemu/test_zephyr.py | 3 ++- .../get_started/tensor_expr_get_started.py | 2 +- 
10 files changed, 41 insertions(+), 62 deletions(-) diff --git a/python/tvm/driver/build_module.py b/python/tvm/driver/build_module.py index 3c2a7209b85b..eecde8b4a2d4 100644 --- a/python/tvm/driver/build_module.py +++ b/python/tvm/driver/build_module.py @@ -30,6 +30,7 @@ from tvm.te import tensor from tvm.te import schedule from tvm.target import Target +from tvm.target.target import refresh_multi_hosts def get_binds(args, compact=False, binds=None): @@ -399,15 +400,7 @@ def build(inputs, args=None, target=None, target_host=None, name="default_functi if not isinstance(mod, tvm.IRModule): raise ValueError("inputs must be Schedule, IRModule," "or dict of str to IRModule.") - new_input_mod = {} - for tar, mod in target_input_mod.items(): - if isinstance(tar, (str, Target)): - new_tar = Target(target=tar, host=target_host) - target_host = new_tar.host - new_input_mod[new_tar] = mod - else: - new_input_mod[tar] = mod - target_input_mod = new_input_mod + target_input_mod, target_host = refresh_multi_hosts(target_input_mod, target_host) if not target_host: for tar, mod in target_input_mod.items(): @@ -419,15 +412,7 @@ def build(inputs, args=None, target=None, target_host=None, name="default_functi if not target_host: target_host = "llvm" if tvm.runtime.enabled("llvm") else "stackvm" - new_input_mod = {} - for tar, mod in target_input_mod.items(): - if isinstance(tar, (str, Target)): - new_tar = Target(target=tar, host=target_host) - target_host = new_tar.host - new_input_mod[new_tar] = mod - else: - new_input_mod[tar] = mod - target_input_mod = new_input_mod + target_input_mod, target_host = refresh_multi_hosts(target_input_mod, target_host) mod_host_all = tvm.IRModule({}) diff --git a/python/tvm/driver/tvmc/autotuner.py b/python/tvm/driver/tvmc/autotuner.py index 5f78b5eb8fd1..6c8028fcc01f 100644 --- a/python/tvm/driver/tvmc/autotuner.py +++ b/python/tvm/driver/tvmc/autotuner.py @@ -23,13 +23,12 @@ from urllib.parse import urlparse -import tvm - from tvm import autotvm, auto_scheduler from tvm.autotvm.tuner import GATuner from tvm.autotvm.tuner import GridSearchTuner from tvm.autotvm.tuner import RandomTuner from tvm.autotvm.tuner import XGBTuner +from tvm.target.target import refresh_host from . import common, composite_target, frontends from .common import TVMCException @@ -420,8 +419,7 @@ def autoscheduler_get_tuning_tasks( if alter_layout: mod = common.convert_graph_layout(mod, alter_layout) - target = tvm.target.Target(target, target_host) - target_host = target.host + target, target_host = refresh_host(target, target_host) # Extract the tasks tasks, task_weights = auto_scheduler.extract_tasks( diff --git a/python/tvm/relay/backend/vm.py b/python/tvm/relay/backend/vm.py index 89571b82414c..2d8a39abb3bd 100644 --- a/python/tvm/relay/backend/vm.py +++ b/python/tvm/relay/backend/vm.py @@ -28,6 +28,7 @@ from tvm import autotvm from tvm.relay import expr as _expr from tvm.relay.backend.interpreter import Executor +from tvm.target.target import refresh_host, refresh_multi_hosts from . 
import _vm @@ -65,13 +66,7 @@ def compile(mod, target=None, target_host=None, params=None): compiler = VMCompiler() if params: compiler.set_params(params) - if isinstance(target, dict): - for k in target: - target[k] = tvm.target.Target(target[k], target_host) - target_host = target[k].host - else: - target = tvm.target.Target(target, target_host) - target_host = target.host + target, target_host = refresh_multi_hosts(target, target_host) compiler.lower(mod, target, target_host) compiler.codegen() return compiler.get_exec() @@ -139,12 +134,9 @@ def lower(self, mod, target=None, target_host=None): target_host = self._update_target_host(target, target_host) if isinstance(target, dict): - for k in target: - target[k] = tvm.target.Target(target[k], target_host) - target_host = target[k].host + target, target_host = refresh_multi_hosts(target, target_host) else: - target = tvm.target.Target(target, target_host) - target_host = target.host + target, target_host = refresh_host(target, target_host) tophub_context = self._tophub_context(target) with tophub_context: @@ -184,13 +176,7 @@ def optimize(self, mod, target=None, target_host=None, params=None): target = self._update_target(target) target_host = self._update_target_host(target, target_host) - if isinstance(target, dict): - for k in target: - target[k] = tvm.target.Target(target[k], target_host) - target_host = target[k].host - else: - target = tvm.target.Target(target, target_host) - target_host = target.host + target, target_host = refresh_multi_hosts(target, target_host) if params: self.set_params(params) diff --git a/python/tvm/relay/build_module.py b/python/tvm/relay/build_module.py index 383989d92a5f..a7ec2bfc691f 100644 --- a/python/tvm/relay/build_module.py +++ b/python/tvm/relay/build_module.py @@ -25,6 +25,7 @@ from tvm.ir.transform import PassContext from tvm.tir import expr as tvm_expr +from tvm.target.target import refresh_multi_hosts from .. 
import nd as _nd, autotvm, register_func from ..target import Target from ..contrib import graph_runtime as _graph_rt @@ -130,13 +131,7 @@ def build(self, mod, target=None, target_host=None, params=None): autotvm.GLOBAL_SCOPE.silent = use_auto_scheduler # Assume the target host of all targets in heterogenous target are identical - if isinstance(target, dict): - for k in target: - target[k] = Target(target[k], target_host) - target_host = target[k].host - else: - target = Target(target, target_host) - target_host = target.host + target, target_host = refresh_multi_hosts(target, target_host) self._build(mod, target, target_host) autotvm.GLOBAL_SCOPE.silent = old_autotvm_silent @@ -278,13 +273,7 @@ def build(ir_mod, target=None, target_host=None, params=None, mod_name="default" elif target_host: raise ValueError("target host must be the type of str, " + "tvm.target.Target, or None") - if isinstance(target, dict): - for k in target: - target[k] = Target(target[k], target_host) - target_host = target[k].host - else: - target = Target(target, target_host) - target_host = target.host + target, target_host = refresh_multi_hosts(target, target_host) # If current dispatch context is fallback context (the default root context), # then load pre-tuned parameters from TopHub diff --git a/python/tvm/target/target.py b/python/tvm/target/target.py index 4ab0e86e4b29..d07aa62668d8 100644 --- a/python/tvm/target/target.py +++ b/python/tvm/target/target.py @@ -494,3 +494,20 @@ def _load_config_dict(config_dict_str): if not isinstance(key, str): return None return config + + +def refresh_host(target, host=None): + target = Target(target, host) + host = target.host + return target, host + + +def refresh_multi_hosts(target, host=None): + if not isinstance(target, dict): + return refresh_host(target, host) + new_target = {} + for tgt, mod in target.items(): + if isinstance(tgt, (dict, str, Target)): + tgt, host = refresh_host(tgt, host) + new_target[tgt] = mod + return new_target, host diff --git a/src/auto_scheduler/feature.cc b/src/auto_scheduler/feature.cc index 11295966ac6c..bdc1e9580f89 100755 --- a/src/auto_scheduler/feature.cc +++ b/src/auto_scheduler/feature.cc @@ -1397,7 +1397,8 @@ void GetPerStoreFeaturesFromFile(const std::string& filename, int max_lines, int if (find_res == task_cache.end()) { // rebuild task Array tensors = (*workload_key_to_tensors)(workload_key); - Target target = cur_inp->task->target, target_host = cur_inp->task->target_host; + Target target = cur_inp->task->target; + Target target_host = cur_inp->task->target_host; target = Target(target, target_host); target_host = target->GetHost().value_or(Target()); task = SearchTask(ComputeDAG(tensors), workload_key, target, target_host, diff --git a/src/driver/driver_api.cc b/src/driver/driver_api.cc index 3c640e37f190..5819d5efcd52 100644 --- a/src/driver/driver_api.cc +++ b/src/driver/driver_api.cc @@ -338,8 +338,8 @@ runtime::Module build(const Map& inputs_arg, const Target& tar // Build for homogeneous execution. 
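The Python helpers added to target.py above are easiest to read through their intended call sites. A hedged sketch of both entry points (the dict layout mirrors the target-to-module map used by tvm.build; the None values stand in for real IRModules):

    from tvm.target import Target
    from tvm.target.target import refresh_host, refresh_multi_hosts

    # Single target: returns the merged target plus the host read back from it.
    target, host = refresh_host("cuda", "llvm")
    assert target.host.kind.name == "llvm"

    # Dict form: every str/Target key is merged with the propagated host;
    # the mapped values pass through untouched.
    targets, host = refresh_multi_hosts({"cuda": None, "llvm": None}, "llvm")
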
runtime::Module build(const IRModule& funcs, const Target& target_arg, const Target& target_host_arg) { - auto target = Target(target_arg, target_host_arg), - target_host = target->GetHost().value_or(Target()); + auto target = Target(target_arg, target_host_arg); + auto target_host = target->GetHost().value_or(Target()); Map inputs = {{target, funcs}}; return build(inputs, target_host); } diff --git a/src/target/target.cc b/src/target/target.cc index 7054e6a22bc0..319d72fc1bd1 100644 --- a/src/target/target.cc +++ b/src/target/target.cc @@ -375,7 +375,7 @@ Target::Target(const Map& config) { Target::Target(Target target, Target host) { ObjectPtr n = make_object(*target.get()); - CHECK((!n->host.defined()) || n->host == host) + CHECK(!n->host.defined() || n->host == host) << "ValueError: Adding a host to a target whose host field has been defined"; // add target host into host field n->host = std::move(host); @@ -408,7 +408,9 @@ Map TargetNode::Export() const { {"tag", this->tag}, {"keys", this->keys}, }; - if (this->host.defined()) result.Set("host", this->GetHost().value_or(Target())->Export()); + if (this->host.defined()) { + result.Set("host", this->GetHost().value_or(Target())->Export()); + } for (const auto& kv : attrs) { result.Set(kv.first, kv.second); } diff --git a/tests/micro/qemu/test_zephyr.py b/tests/micro/qemu/test_zephyr.py index 470c29f23030..51d5f990e710 100644 --- a/tests/micro/qemu/test_zephyr.py +++ b/tests/micro/qemu/test_zephyr.py @@ -45,8 +45,9 @@ def _make_sess_from_op(model, zephyr_board, west_cmd, op_name, sched, arg_bufs): target = tvm.target.target.micro(model) + target = tvm.target.Target(target=target, host=target) with tvm.transform.PassContext(opt_level=3, config={"tir.disable_vectorize": True}): - mod = tvm.build(sched, arg_bufs, tvm.target.Target(target, target), name=op_name) + mod = tvm.build(sched, arg_bufs, target=target, name=op_name) return _make_session(model, target, zephyr_board, west_cmd, mod) diff --git a/tutorials/get_started/tensor_expr_get_started.py b/tutorials/get_started/tensor_expr_get_started.py index ac6d547dd1a3..83c328560b4d 100644 --- a/tutorials/get_started/tensor_expr_get_started.py +++ b/tutorials/get_started/tensor_expr_get_started.py @@ -153,7 +153,7 @@ # - fadd runs the actual computation. 
# - asnumpy() copies the GPU array back to the CPU and we can use this to verify correctness # -ctx = tvm.context(str(tgt), 0) +ctx = tvm.context(tgt.kind.name, 0) n = 1024 a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx) From 5d8201ef30759ec4a455d4e6f85a9e7d1e3054af Mon Sep 17 00:00:00 2001 From: Xiyou Zhou Date: Mon, 15 Mar 2021 18:19:00 -0700 Subject: [PATCH 45/69] Refactor code as helper function --- include/tvm/target/target.h | 6 ++++++ src/auto_scheduler/feature.cc | 6 ++---- src/auto_scheduler/measure_record.cc | 8 ++++---- src/auto_scheduler/search_task.cc | 6 ++---- src/driver/driver_api.cc | 16 ++++++++-------- src/relay/backend/build_module.cc | 15 +++------------ src/relay/backend/vm/compiler.cc | 13 +++---------- src/relay/transforms/memory_alloc.cc | 10 ++-------- src/target/target.cc | 18 ++++++++++++++++++ 9 files changed, 48 insertions(+), 50 deletions(-) diff --git a/include/tvm/target/target.h b/include/tvm/target/target.h index 420643f27f90..f5218e035d41 100644 --- a/include/tvm/target/target.h +++ b/include/tvm/target/target.h @@ -27,6 +27,7 @@ #include #include #include +#include #include #include @@ -171,5 +172,10 @@ class Target : public ObjectRef { TVM_DLL void ExitWithScope(); }; +using TargetsMap = Map; + +TVM_DLL void RefreshHost(Target*, Target*); +TVM_DLL void RefreshHost(TargetsMap*, Target*); + } // namespace tvm #endif // TVM_TARGET_TARGET_H_ diff --git a/src/auto_scheduler/feature.cc b/src/auto_scheduler/feature.cc index bdc1e9580f89..0baa0bcf2e9a 100755 --- a/src/auto_scheduler/feature.cc +++ b/src/auto_scheduler/feature.cc @@ -1399,8 +1399,7 @@ void GetPerStoreFeaturesFromFile(const std::string& filename, int max_lines, int Array tensors = (*workload_key_to_tensors)(workload_key); Target target = cur_inp->task->target; Target target_host = cur_inp->task->target_host; - target = Target(target, target_host); - target_host = target->GetHost().value_or(Target()); + RefreshHost(&target, &target_host); task = SearchTask(ComputeDAG(tensors), workload_key, target, target_host, cur_inp->task->hardware_params, cur_inp->task->layout_rewrite_option, cur_inp->task->task_input_names); @@ -1471,8 +1470,7 @@ void GetPerStoreFeaturesFromMeasurePairs(const Array& inputs, try { Array tensors = (*workload_key_to_tensors)(workload_key); Target target = inputs[i]->task->target, target_host = inputs[i]->task->target_host; - target = Target(target, target_host); - target_host = target->GetHost().value_or(Target()); + RefreshHost(&target, &target_host); task = SearchTask(ComputeDAG(tensors), workload_key, target, target_host, inputs[i]->task->hardware_params, inputs[i]->task->layout_rewrite_option, diff --git a/src/auto_scheduler/measure_record.cc b/src/auto_scheduler/measure_record.cc index 9faa6c3bce32..144f2098db57 100644 --- a/src/auto_scheduler/measure_record.cc +++ b/src/auto_scheduler/measure_record.cc @@ -163,8 +163,8 @@ struct Handler<::tvm::auto_scheduler::SearchTaskNode> { writer->WriteArrayItem(std::string(data.workload_key)); writer->WriteArrayItem(data.target->str()); writer->WriteArrayItem(*data.hardware_params.get()); - ::tvm::Target target_host = - ::tvm::Target(data.target, data.target_host)->GetHost().value_or(::tvm::Target()); + ::tvm::Target target = data.target, target_host = data.target_host; + ::tvm::RefreshHost(&target, &target_host); if (target_host.defined()) { writer->WriteArrayItem(target_host->str()); } else { @@ -201,8 +201,8 @@ struct Handler<::tvm::auto_scheduler::SearchTaskNode> { if (s) { reader->Read(&str_value); if 
(!str_value.empty()) { - data->target = ::tvm::Target(data->target, ::tvm::Target(str_value)); - data->target_host = data->target->GetHost().value_or(::tvm::Target()); + data->target_host = ::tvm::Target(str_value); + ::tvm::RefreshHost(&data->target, &data->target_host); } s = reader->NextArrayItem(); ICHECK(s); diff --git a/src/auto_scheduler/search_task.cc b/src/auto_scheduler/search_task.cc index 0e14f101ebae..18bcf46bb6ac 100755 --- a/src/auto_scheduler/search_task.cc +++ b/src/auto_scheduler/search_task.cc @@ -116,8 +116,7 @@ HardwareParams HardwareParamsNode::GetDefaultHardwareParams(const Target& target SearchTask::SearchTask(ComputeDAG compute_dag, String workload_key, Target target, Target target_host, Optional hardware_params, LayoutRewriteOption layout_rewrite_option, Array task_input_names) { - target = Target(target, target_host); - target_host = target->GetHost().value_or(Target()); + RefreshHost(&target, &target_host); auto node = make_object(); node->compute_dag = std::move(compute_dag); node->workload_key = std::move(workload_key); @@ -147,8 +146,7 @@ TVM_REGISTER_GLOBAL("auto_scheduler.SearchTask") .set_body_typed([](ComputeDAG compute_dag, String workload_key, Target target, Target target_host, Optional hardware_params, int layout_rewrite_option, Array task_input_names) { - target = Target(target, target_host); - target_host = target->GetHost().value_or(Target()); + RefreshHost(&target, &target_host); return SearchTask(compute_dag, workload_key, target, target_host, hardware_params, LayoutRewriteOption(layout_rewrite_option), task_input_names); }); diff --git a/src/driver/driver_api.cc b/src/driver/driver_api.cc index 5819d5efcd52..f4da52ed316c 100644 --- a/src/driver/driver_api.cc +++ b/src/driver/driver_api.cc @@ -188,8 +188,8 @@ IRModule lower(te::Schedule sch, const Array& args, const std::strin std::pair SplitDevHostFuncs(IRModule mod_mixed, const Target& target_arg, const Target& target_host_arg, const transform::PassContext& pass_ctx) { - Target target = Target(target_arg, target_host_arg), - target_host = target->GetHost().value_or(Target()); + Target target = target_arg, target_host = target_host_arg; + RefreshHost(&target, &target_host); Array mixed_pass_list = {BindTarget(target), tir::transform::VerifyMemory()}; @@ -264,8 +264,8 @@ runtime::Module build(const Map& inputs, const Target& target_ // Fetch previous defined target host in targets for (const auto& it : inputs) { - auto target = Target(it.first, target_host); - target_host = target->GetHost().value_or(Target()); + auto target = it.first; + RefreshHost(&target, &target_host); } if (!target_host.defined()) { @@ -324,8 +324,8 @@ runtime::Module build(const Map& inputs_arg, const Target& tar Map updated_inputs; Target target_host = target_host_arg; for (const auto& it : inputs_arg) { - auto target = Target(Target(it.first), target_host); - target_host = target->GetHost().value_or(Target()); + Target target = Target(it.first); + RefreshHost(&target, &target_host); Optional device = target->GetAttr("device"); if (device.defined() && device.value() == "vta") { target = Target("ext_dev"); @@ -338,8 +338,8 @@ runtime::Module build(const Map& inputs_arg, const Target& tar // Build for homogeneous execution. 
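The contract these C++ call sites now share via RefreshHost: whether the host arrives embedded in the target or as a separate argument, the merged result is the same. A CPU-only sketch of the user-visible behavior (assumes LLVM is enabled so it runs without a GPU):

    import tvm
    from tvm import te

    n = 1024
    A = te.placeholder((n,), name="A")
    B = te.compute((n,), lambda i: A[i] + 1.0, name="B")
    s = te.create_schedule(B.op)

    # Equivalent spellings after the refactor; the host is merged either way.
    f1 = tvm.build(s, [A, B], target="llvm")
    f2 = tvm.build(s, [A, B], target=tvm.target.Target("llvm", host="llvm"))
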
runtime::Module build(const IRModule& funcs, const Target& target_arg, const Target& target_host_arg) { - auto target = Target(target_arg, target_host_arg); - auto target_host = target->GetHost().value_or(Target()); + auto target = target_arg, target_host = target_host_arg; + RefreshHost(&target, &target_host); Map inputs = {{target, funcs}}; return build(inputs, target_host); } diff --git a/src/relay/backend/build_module.cc b/src/relay/backend/build_module.cc index e132cf54aefb..f586b86f1e5a 100644 --- a/src/relay/backend/build_module.cc +++ b/src/relay/backend/build_module.cc @@ -238,12 +238,7 @@ class RelayBuildModule : public runtime::ModuleNode { // Create protected variable targets_ from ground up targets_ = targets; target_host_ = target_host; - for (const auto& it : targets) { - // Construct a new target with target host filed if available - targets_.Set(it.first, Target(it.second, target_host_)); - target_host_ = targets_[it.first]->GetHost().value_or(Target()); - } - + RefreshHost(&targets_, &target_host_); BuildRelay(mod, params_); // Clear compile engine so that tuning schedules can be changed between runs. See issue #6096. CompileEngine::Global()->Clear(); @@ -488,12 +483,8 @@ class RelayBuildModule : public runtime::ModuleNode { const runtime::PackedFunc* pf = runtime::Registry::Get("codegen.LLVMModuleCreate"); if (!target_host.defined()) target_host = (pf != nullptr) ? Target("llvm") : Target("stackvm"); - // Update all the targets in the _targets TargetsMap - for (const auto& it : targets_) { - // Construct a new target with target host filed if available - targets_.Set(it.first, Target(it.second, target_host)); - target_host = targets_[it.first]->GetHost().value_or(Target()); - } + // Update all the targets in the targets_ TargetsMap + RefreshHost(&targets_, &target_host); // Generate a placeholder function that attaches linked params as its arguments. if (target_host->GetAttr("link-params").value_or(Bool(false))) { diff --git a/src/relay/backend/vm/compiler.cc b/src/relay/backend/vm/compiler.cc index b8a7424d4c66..e52efdfe2ee3 100644 --- a/src/relay/backend/vm/compiler.cc +++ b/src/relay/backend/vm/compiler.cc @@ -255,9 +255,8 @@ class VMFunctionCompiler : ExprFunctor { context_(context), target_host_(target_host), expr_device_map_(std::move(expr_device_map)) { + RefreshHost(&targets, &target_host); for (const auto& it : targets) { - targets.Set(it.first, Target(targets[it.first], target_host)); - target_host = targets[it.first]->GetHost().value_or(Target()); targets_[it.first->value] = it.second; } target_host_ = target_host; @@ -903,10 +902,7 @@ void VMCompiler::Lower(IRModule mod, const TargetsMap& targets, const tvm::Targe exec_ = make_object(); targets_ = targets; target_host_ = target_host; - for (auto& iter : targets_) { - targets_.Set(iter.first, Target(targets_[iter.first], target_host_)); - target_host_ = targets[iter.first]->GetHost().value_or(Target()); - } + RefreshHost(&targets_, &target_host_); // Run the optimizations necessary to target the VM. 
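At the Python layer the VM compiler keeps its existing signature; the merge now happens once inside Lower/OptimizeModule through RefreshHost. A sketch of the call (hedged: assumes a CUDA-enabled build, and the one-line module is only for illustration):

    import tvm
    from tvm import relay
    from tvm.relay.backend import vm

    mod = tvm.IRModule.from_expr(relay.add(relay.const(1.0), relay.const(2.0)))
    # target_host is folded into target internally; passing it stays supported.
    exe = vm.compile(mod, target="cuda", target_host="llvm")
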
context_.module = OptimizeModule(mod, targets_, target_host_); @@ -1012,10 +1008,7 @@ IRModule VMCompiler::OptimizeModule(IRModule mod, const TargetsMap& targets_arg, const Target& target_host_arg) { TargetsMap targets = targets_arg; Target target_host = target_host_arg; - for (auto& iter : targets) { - targets.Set(iter.first, Target(targets[iter.first], target_host)); - target_host = targets[iter.first]->GetHost().value_or(Target()); - } + RefreshHost(&targets, &target_host); if (params_.size()) { BaseFunc base_func = mod->Lookup("main"); ICHECK(base_func->IsInstance()) diff --git a/src/relay/transforms/memory_alloc.cc b/src/relay/transforms/memory_alloc.cc index 5bc48db7de69..0ac2fefa1a9c 100644 --- a/src/relay/transforms/memory_alloc.cc +++ b/src/relay/transforms/memory_alloc.cc @@ -415,10 +415,7 @@ class DialectRewriter : public ExprMutator { namespace transform { Pass ManifestAlloc(Target target_host, Map targets) { - for (auto& iter : targets) { - targets.Set(iter.first, Target(targets[iter.first], target_host)); - target_host = targets[iter.first]->GetHost().value_or(Target()); - } + RefreshHost(&targets, &target_host); return tvm::transform::CreateModulePass( [=](IRModule mod, const PassContext& pass_ctx) { DLOG(INFO) << "tvm::relay::transform::ManifestAlloc"; @@ -462,10 +459,7 @@ Pass ManifestAlloc(Target target_host, Map targets) { TVM_REGISTER_GLOBAL("relay.transform.ManifestAlloc") .set_body_typed([](Target target_host, Map targets) { - for (auto& iter : targets) { - targets.Set(iter.first, Target(targets[iter.first], target_host)); - target_host = targets[iter.first]->GetHost().value_or(Target()); - } + RefreshHost(&targets, &target_host); return ManifestAlloc(target_host, targets); }); diff --git a/src/target/target.cc b/src/target/target.cc index 319d72fc1bd1..2e1ad4a9c2cb 100644 --- a/src/target/target.cc +++ b/src/target/target.cc @@ -55,6 +55,24 @@ class TargetInternal { /********** Helper functions **********/ +void RefreshHost(Target* target, Target* target_host) { + *target = Target(*target, *target_host); + *target_host = (*target)->GetHost().value_or(Target()); +} + +using TargetsMap = Map; + + +void RefreshHost(TargetsMap* targets, Target* target_host) { + TargetsMap new_targets; + for (auto& it : *targets) { + auto target = it.second; + RefreshHost(&target, target_host); + new_targets.Set(it.first, target); + } + *targets = new_targets; +} + static std::vector DeduplicateKeys(const std::vector& keys) { std::vector new_keys; for (size_t i = 0; i < keys.size(); ++i) { From c9e1c9ba5f164ff66138130caa921419d5f2c348 Mon Sep 17 00:00:00 2001 From: Xiyou Zhou Date: Mon, 15 Mar 2021 18:52:22 -0700 Subject: [PATCH 46/69] Expand helper function --- include/tvm/target/target.h | 4 +++- src/driver/driver_api.cc | 16 +++++----------- src/target/target.cc | 13 ++++++++++--- 3 files changed, 18 insertions(+), 15 deletions(-) diff --git a/include/tvm/target/target.h b/include/tvm/target/target.h index f5218e035d41..2469f0477a99 100644 --- a/include/tvm/target/target.h +++ b/include/tvm/target/target.h @@ -24,10 +24,11 @@ #ifndef TVM_TARGET_TARGET_H_ #define TVM_TARGET_TARGET_H_ +#include +#include #include #include #include -#include #include #include @@ -176,6 +177,7 @@ using TargetsMap = Map; TVM_DLL void RefreshHost(Target*, Target*); TVM_DLL void RefreshHost(TargetsMap*, Target*); +TVM_DLL void RefreshHost(Map*, Target*); } // namespace tvm #endif // TVM_TARGET_TARGET_H_ diff --git a/src/driver/driver_api.cc b/src/driver/driver_api.cc index f4da52ed316c..0a3a41d0a3a5 100644 
--- a/src/driver/driver_api.cc +++ b/src/driver/driver_api.cc @@ -255,18 +255,15 @@ std::pair SplitDevHostFuncs(IRModule mod_mixed, const Target } // Build for heterogeneous execution. -runtime::Module build(const Map& inputs, const Target& target_host_arg) { +runtime::Module build(const Map& inputs_arg, const Target& target_host_arg) { auto pass_ctx = transform::PassContext::Current(); std::vector device_modules; + Map inputs = inputs_arg; Target target_host = target_host_arg; - Map updated_inputs; // Fetch previous defined target host in targets - for (const auto& it : inputs) { - auto target = it.first; - RefreshHost(&target, &target_host); - } + RefreshHost(&inputs, &target_host); if (!target_host.defined()) { for (const auto& it : inputs) { @@ -282,16 +279,13 @@ runtime::Module build(const Map& inputs, const Target& target_ } // Update target host for all targets - for (const auto& it : inputs) { - auto target = Target(it.first, target_host); - updated_inputs.Set(target, it.second); - } + RefreshHost(&inputs, &target_host); IRModule mhost_all = IRModule(Map()); ICHECK(mhost_all.defined()) << "The host module must be defined"; - for (const auto& it : updated_inputs) { + for (const auto& it : inputs) { if (it.second.defined()) { auto pair = SplitDevHostFuncs(it.second, it.first, target_host, pass_ctx); auto& mhost = pair.first; diff --git a/src/target/target.cc b/src/target/target.cc index 2e1ad4a9c2cb..a3f9e9a3fe66 100644 --- a/src/target/target.cc +++ b/src/target/target.cc @@ -60,9 +60,6 @@ void RefreshHost(Target* target, Target* target_host) { *target_host = (*target)->GetHost().value_or(Target()); } -using TargetsMap = Map; - - void RefreshHost(TargetsMap* targets, Target* target_host) { TargetsMap new_targets; for (auto& it : *targets) { @@ -73,6 +70,16 @@ void RefreshHost(TargetsMap* targets, Target* target_host) { *targets = new_targets; } +void RefreshHost(Map* targets, Target* target_host) { + Map new_targets; + for (auto& it : *targets) { + auto target = it.first; + RefreshHost(&target, target_host); + new_targets.Set(target, it.second); + } + *targets = new_targets; +} + static std::vector DeduplicateKeys(const std::vector& keys) { std::vector new_keys; for (size_t i = 0; i < keys.size(); ++i) { From ec664ee4bdbd26d9044bd70ae905d532b9562e1a Mon Sep 17 00:00:00 2001 From: Xiyou Zhou Date: Mon, 15 Mar 2021 23:45:29 -0700 Subject: [PATCH 47/69] Fix bug add and update python helper function --- python/tvm/auto_scheduler/measure.py | 3 +- .../tvm/auto_scheduler/relay_integration.py | 4 +- python/tvm/auto_scheduler/search_task.py | 13 +++--- python/tvm/driver/build_module.py | 6 +-- python/tvm/relay/backend/vm.py | 11 ++--- python/tvm/relay/build_module.py | 6 +-- python/tvm/target/target.py | 42 ++++++++++++------- 7 files changed, 46 insertions(+), 39 deletions(-) diff --git a/python/tvm/auto_scheduler/measure.py b/python/tvm/auto_scheduler/measure.py index 180aef29766d..dd7a0cf99aa4 100644 --- a/python/tvm/auto_scheduler/measure.py +++ b/python/tvm/auto_scheduler/measure.py @@ -221,9 +221,10 @@ def recover_measure_input(inp, rebuild_state=False): from .search_task import SearchTask # lazily import to avoid recursive dependency task = inp.task + task.target, task.target_host = refresh_host(task.target, task.target_host) new_task = SearchTask( workload_key=task.workload_key, - target=tvm.target.Target(task.target, task.target_host), + target=task.target, hardware_params=task.hardware_params, layout_rewrite_option=task.layout_rewrite_option, 
task_inputs=list(task.task_input_names), diff --git a/python/tvm/auto_scheduler/relay_integration.py b/python/tvm/auto_scheduler/relay_integration.py index ccf797746884..49cabd3443db 100644 --- a/python/tvm/auto_scheduler/relay_integration.py +++ b/python/tvm/auto_scheduler/relay_integration.py @@ -29,6 +29,7 @@ from tvm import autotvm, transform from tvm.ir.transform import PassContext from tvm.runtime import convert_to_object +from tvm.target.target import refresh_host from tvm.te.tensor import ComputeOp, PlaceholderOp, Tensor from tvm.tir import Reduce from tvm.tir import expr as _expr @@ -108,8 +109,7 @@ def extract_tasks( """ # pylint: disable=import-outside-toplevel - target = tvm.target.Target(target, target_host) - target_host = target.host + target, target_host = refresh_host(target, target_host) # Run the compiler to collect all TOPI calls during compilation. env = TracingEnvironment( diff --git a/python/tvm/auto_scheduler/search_task.py b/python/tvm/auto_scheduler/search_task.py index 52e74ffe5460..758dae077e61 100644 --- a/python/tvm/auto_scheduler/search_task.py +++ b/python/tvm/auto_scheduler/search_task.py @@ -28,6 +28,7 @@ from tvm.driver.build_module import build from tvm.target import Target +from tvm.target.target import refresh_host from .measure import LocalBuilder, LocalRunner from .measure_record import load_best_record from .workload_registry import make_workload_key @@ -393,13 +394,8 @@ def __init__( compute_dag = ComputeDAG(workload_key) assert target is not None, "Must specify a target." - if isinstance(target, str): - target = Target(target) - if isinstance(target_host, str): - target_host = Target(target_host) - target = Target(target, target_host) - target_host = target.host + target, target_host = refresh_host(target, target_host) if layout_rewrite_option is None: layout_rewrite_option = LayoutRewriteOption.get_target_default(target) @@ -509,11 +505,12 @@ def print_best(self, log_file, print_mode="schedule"): raise ValueError("Invalid print_mode: %s" % print_mode) def __getstate__(self): + self.target, self.target_host = refresh_host(self.target, self.target_host) return { "compute_dag": self.compute_dag, "workload_key": self.workload_key, - "target": Target(self.target, self.target_host), - "target_host": Target(self.target, self.target_host).host, + "target": self.target, + "target_host": self.target_host, "hardware_params": self.hardware_params, "layout_rewrite_option": self.layout_rewrite_option, "task_input_names": self.task_input_names, diff --git a/python/tvm/driver/build_module.py b/python/tvm/driver/build_module.py index eecde8b4a2d4..26ead83d820a 100644 --- a/python/tvm/driver/build_module.py +++ b/python/tvm/driver/build_module.py @@ -30,7 +30,7 @@ from tvm.te import tensor from tvm.te import schedule from tvm.target import Target -from tvm.target.target import refresh_multi_hosts +from tvm.target.target import refresh_host def get_binds(args, compact=False, binds=None): @@ -400,7 +400,7 @@ def build(inputs, args=None, target=None, target_host=None, name="default_functi if not isinstance(mod, tvm.IRModule): raise ValueError("inputs must be Schedule, IRModule," "or dict of str to IRModule.") - target_input_mod, target_host = refresh_multi_hosts(target_input_mod, target_host) + target_input_mod, target_host = refresh_host(target_input_mod, target_host) if not target_host: for tar, mod in target_input_mod.items(): @@ -412,7 +412,7 @@ def build(inputs, args=None, target=None, target_host=None, name="default_functi if not target_host: target_host = 
"llvm" if tvm.runtime.enabled("llvm") else "stackvm" - target_input_mod, target_host = refresh_multi_hosts(target_input_mod, target_host) + target_input_mod, target_host = refresh_host(target_input_mod, target_host) mod_host_all = tvm.IRModule({}) diff --git a/python/tvm/relay/backend/vm.py b/python/tvm/relay/backend/vm.py index 2d8a39abb3bd..8abd9b32e69e 100644 --- a/python/tvm/relay/backend/vm.py +++ b/python/tvm/relay/backend/vm.py @@ -28,7 +28,7 @@ from tvm import autotvm from tvm.relay import expr as _expr from tvm.relay.backend.interpreter import Executor -from tvm.target.target import refresh_host, refresh_multi_hosts +from tvm.target.target import refresh_host from . import _vm @@ -66,7 +66,7 @@ def compile(mod, target=None, target_host=None, params=None): compiler = VMCompiler() if params: compiler.set_params(params) - target, target_host = refresh_multi_hosts(target, target_host) + target, target_host = refresh_host(target, target_host, target_is_key=False) compiler.lower(mod, target, target_host) compiler.codegen() return compiler.get_exec() @@ -133,10 +133,7 @@ def lower(self, mod, target=None, target_host=None): target = self._update_target(target) target_host = self._update_target_host(target, target_host) - if isinstance(target, dict): - target, target_host = refresh_multi_hosts(target, target_host) - else: - target, target_host = refresh_host(target, target_host) + target, target_host = refresh_host(target, target_host) tophub_context = self._tophub_context(target) with tophub_context: @@ -176,7 +173,7 @@ def optimize(self, mod, target=None, target_host=None, params=None): target = self._update_target(target) target_host = self._update_target_host(target, target_host) - target, target_host = refresh_multi_hosts(target, target_host) + target, target_host = refresh_host(target, target_host) if params: self.set_params(params) diff --git a/python/tvm/relay/build_module.py b/python/tvm/relay/build_module.py index a7ec2bfc691f..62d4c812517a 100644 --- a/python/tvm/relay/build_module.py +++ b/python/tvm/relay/build_module.py @@ -25,7 +25,7 @@ from tvm.ir.transform import PassContext from tvm.tir import expr as tvm_expr -from tvm.target.target import refresh_multi_hosts +from tvm.target.target import refresh_host from .. 
import nd as _nd, autotvm, register_func
 from ..target import Target
 from ..contrib import graph_runtime as _graph_rt
@@ -131,7 +131,7 @@ def build(self, mod, target=None, target_host=None, params=None):
         autotvm.GLOBAL_SCOPE.silent = use_auto_scheduler

         # Assume the target hosts of all targets in a heterogeneous target are identical
-        target, target_host = refresh_multi_hosts(target, target_host)
+        target, target_host = refresh_host(target, target_host)
         self._build(mod, target, target_host)

         autotvm.GLOBAL_SCOPE.silent = old_autotvm_silent
@@ -273,7 +273,7 @@ def build(ir_mod, target=None, target_host=None, params=None, mod_name="default"
     elif target_host:
         raise ValueError("target host must be the type of str, " + "tvm.target.Target, or None")

-    target, target_host = refresh_multi_hosts(target, target_host)
+    target, target_host = refresh_host(target, target_host)

     # If current dispatch context is fallback context (the default root context),
     # then load pre-tuned parameters from TopHub
diff --git a/python/tvm/target/target.py b/python/tvm/target/target.py
index d07aa62668d8..3c9cd2a71320 100644
--- a/python/tvm/target/target.py
+++ b/python/tvm/target/target.py
@@ -496,18 +496,30 @@ def _load_config_dict(config_dict_str):
     return config


-def refresh_host(target, host=None):
-    target = Target(target, host)
-    host = target.host
-    return target, host
-
-
-def refresh_multi_hosts(target, host=None):
-    if not isinstance(target, dict):
-        return refresh_host(target, host)
-    new_target = {}
-    for tgt, mod in target.items():
-        if isinstance(tgt, (dict, str, Target)):
-            tgt, host = refresh_host(tgt, host)
-        new_target[tgt] = mod
-    return new_target, host
+def refresh_host(target, host=None, target_is_key=True):
+    """Helper function to return a target and target host after updating each other.
+
+    Parameters
+    ----------
+    target : Union[str, Dict[str, Any], Target]
+        The target or heterogeneous target
+    host : Union[str, Dict[str, Any], Target, None]
+        The target host
+    target_is_key : bool
+        When target is a dict, whether the Target is the dict key (otherwise the value)
+    """
+    try:
+        target = Target(target, host)
+        host = target.host
+        return target, host
+    except (TypeError, ValueError):
+        new_target = {}
+        for tgt, mod in target.items():
+            if not target_is_key:
+                tgt, mod = mod, tgt
+            if isinstance(tgt, (dict, str, Target)):
+                tgt, host = refresh_host(tgt, host)
+            if not target_is_key:
+                tgt, mod = mod, tgt
+            new_target[tgt] = mod
+        return new_target, host

From 983108cafe1ba82780db84b640576559a4398bd5 Mon Sep 17 00:00:00 2001
From: Xiyou Zhou
Date: Tue, 16 Mar 2021 00:35:48 -0700
Subject: [PATCH 48/69] Update target hosts

---
 python/tvm/auto_scheduler/measure.py          |  6 +++---
 .../tvm/auto_scheduler/relay_integration.py   |  2 +-
 python/tvm/auto_scheduler/search_task.py      |  5 +++--
 .../autotvm/graph_tuner/base_graph_tuner.py   |  8 +++-----
 .../graph_tuner/utils/traverse_graph.py       |  2 +-
 python/tvm/autotvm/measure/measure_methods.py |  4 ++--
 python/tvm/autotvm/task/relay_integration.py  | 15 +++++++-------
 python/tvm/autotvm/task/task.py               | 11 +++++-----
 python/tvm/contrib/peak.py                    | 20 ++++++++-----------
 python/tvm/driver/build_module.py             |  3 +--
 python/tvm/driver/tvmc/autotuner.py           | 11 +++-------
 python/tvm/driver/tvmc/compiler.py            | 15 ++++++--------
 python/tvm/exec/measure_peak.py               |  2 ++
 python/tvm/relay/backend/_backend.py          |  4 ++--
 python/tvm/relay/backend/vm.py                |  2 +-
 python/tvm/relay/build_module.py              |  4 ++--
 16 files changed, 50 insertions(+), 64 deletions(-)

diff --git a/python/tvm/auto_scheduler/measure.py b/python/tvm/auto_scheduler/measure.py
index dd7a0cf99aa4..f6738a0fcc0c 100644
--- a/python/tvm/auto_scheduler/measure.py
+++ b/python/tvm/auto_scheduler/measure.py
@@ -44,6 +44,7 @@
 from tvm.ir import transform
 from tvm.autotvm.measure.measure_methods import set_cuda_target_arch
 from tvm.contrib import tar, ndk
+from tvm.target.target import refresh_host

 from . import _ffi_api
 from .loop_state import StateObject
@@ -621,10 +622,9 @@ def _timed_func(inp_serialized, build_func, verbose):
     filename = os.path.join(dirname, "tmp_func." + build_func.output_format)
     try:
+        task.target, task.target_host = refresh_host(task.target, task.target_host)
         with transform.PassContext():
-            func = build_module.build(
-                sch, args, target=task.target, target_host=task.target_host
-            )
+            func = build_module.build(sch, args, target=task.target)
         func.export_library(filename, build_func)
     # pylint: disable=broad-except
     except Exception:
diff --git a/python/tvm/auto_scheduler/relay_integration.py b/python/tvm/auto_scheduler/relay_integration.py
index 49cabd3443db..6c1766c18d36 100644
--- a/python/tvm/auto_scheduler/relay_integration.py
+++ b/python/tvm/auto_scheduler/relay_integration.py
@@ -125,12 +125,12 @@ def extract_tasks(
     # create search tasks
     tasks = []
     weights = []
+    target, target_host = refresh_host(target, target_host)
     for wkl_key, weight in env.wkl_key_to_weight.items():
         tasks.append(
             SearchTask(
                 workload_key=wkl_key,
                 target=target,
-                target_host=target_host,
                 hardware_params=hardware_params,
                 # When auto scheduler is used in end to end network, try to apply layout rewrite
                 # to improve the overall performance
diff --git a/python/tvm/auto_scheduler/search_task.py b/python/tvm/auto_scheduler/search_task.py
index 758dae077e61..f3e876156394 100644
--- a/python/tvm/auto_scheduler/search_task.py
+++ b/python/tvm/auto_scheduler/search_task.py
@@ -530,12 +530,13 @@ def __setstate__(self, state):
         if workload[0] not in WORKLOAD_FUNC_REGISTRY:
             register_workload_tensors(state["workload_key"], state["compute_dag"].tensors)

+        state["target"], state["target_host"] = refresh_host(state["target"], state["target_host"])
         self.__init_handle_by_constructor__(
             _ffi_api.SearchTask,
             state["compute_dag"],
             state["workload_key"],
-            Target(state["target"], state["target_host"]),
-            Target(state["target"], state["target_host"]).host,
+            state["target"],
+            state["target"].host,
             state["hardware_params"],
             state["layout_rewrite_option"],
             state["task_input_names"],
diff --git a/python/tvm/autotvm/graph_tuner/base_graph_tuner.py b/python/tvm/autotvm/graph_tuner/base_graph_tuner.py
index c802a4289385..7c55ec641021 100644
--- a/python/tvm/autotvm/graph_tuner/base_graph_tuner.py
+++ b/python/tvm/autotvm/graph_tuner/base_graph_tuner.py
@@ -28,6 +28,7 @@
 from tvm.autotvm.task import get_config
 from tvm.autotvm.record import encode, load_from_file
 from tvm.autotvm.measure import MeasureResult, MeasureInput
+from tvm.target.target import refresh_host

 from ...target import Target
 from .utils import (
@@ -525,11 +526,8 @@ def _callback(_, inputs, results):
                 continue

             records = []
-            target = Target(target, target_host)
-            target_host = target.host
-            task = autotvm.task.create(
-                "layout_transform", args=args, target=self._target, target_host=target_host
-            )
+            self._target, target_host = refresh_host(self._target, target_host)
+            task = autotvm.task.create("layout_transform", args=args, target=self._target)
             tuner = autotvm.tuner.GridSearchTuner(task)
             tuner.tune(n_trial=1, measure_option=measure_option, callbacks=[_log_to_list(records)])
             if not isinstance(records[0][1].costs[0], float):
diff --git a/python/tvm/autotvm/graph_tuner/utils/traverse_graph.py b/python/tvm/autotvm/graph_tuner/utils/traverse_graph.py
index 6e29474ab399..a8e580998367 100644
--- a/python/tvm/autotvm/graph_tuner/utils/traverse_graph.py
+++ b/python/tvm/autotvm/graph_tuner/utils/traverse_graph.py
@@ -63,7 +63,7 @@ def expr2graph(expr, target_ops, node_dict, node_list):
         for node_entry in node_list:
             if node_entry["op"] in target_ops:
                 task_name, args = env.task_collection[task_pos]
-                task = autotvm.task.create(task_name, args,
target="llvm", target_host=None) + task = autotvm.task.create(task_name, args, target="llvm") node_entry["workloads"] = [task.workload] node_entry["topi_op"] = [task_name] task_pos += 1 diff --git a/python/tvm/autotvm/measure/measure_methods.py b/python/tvm/autotvm/measure/measure_methods.py index 7ac5cf73d254..28dd98191c12 100644 --- a/python/tvm/autotvm/measure/measure_methods.py +++ b/python/tvm/autotvm/measure/measure_methods.py @@ -40,6 +40,7 @@ from tvm.error import TVMError from tvm.driver import build from tvm.contrib import nvcc, ndk, tar +from tvm.target.target import refresh_host from ..utils import get_const_tuple from ..env import AutotvmGlobalScope @@ -419,8 +420,7 @@ def _build_func_common(measure_input, check_gpu=None, cuda_arch=None, build_opti """Common part for building a configuration""" target, task, config = measure_input - target = tvm.target.Target(target, task.target_host) - task.target_host = target.host + target, task.target_host = refresh_host(target, task.target_host) with target: s, args = task.instantiate(config) diff --git a/python/tvm/autotvm/task/relay_integration.py b/python/tvm/autotvm/task/relay_integration.py index dce15a23969a..46deecd6bd92 100644 --- a/python/tvm/autotvm/task/relay_integration.py +++ b/python/tvm/autotvm/task/relay_integration.py @@ -26,6 +26,7 @@ import tvm from tvm.autotvm.task.dispatcher import DispatchContext, FallbackContext from tvm.target import Target +from tvm.target.target import refresh_host from .task import create from .topi_integration import TaskExtractEnv @@ -90,9 +91,8 @@ def extract_from_program(mod, params, target, target_host=None, ops=None): task: Array of autotvm.task.Task collected tasks """ - target = Target(target, target_host) - target_host = target.host - return extract_from_multiple_program([mod], [params], target, target_host, ops) + target, target_host = refresh_host(target, target_host) + return extract_from_multiple_program([mod], [params], target, ops) def extract_from_multiple_program(mods, params, target, target_host=None, ops=None): @@ -125,6 +125,9 @@ def extract_from_multiple_program(mods, params, target, target_host=None, ops=No env = TaskExtractEnv.get() + # merge target and target host + target, target_host = refresh_host(target, target_host) + # run compiler to collect all TOPI calls during compilation env.reset(ops) with env: @@ -151,15 +154,11 @@ def extract_from_multiple_program(mods, params, target, target_host=None, ops=No logger.disabled = old_state - # merge target and target host - target = Target(target, target_host) - target_host = target.host - # create tasks for target tasks = [] for task_name, args in env.get_tasks(): try: - tsk = create(task_name, args, target=target, target_host=target_host) + tsk = create(task_name, args, target=target) tasks.append(tsk) except topi.InvalidShapeError: logger.warning("Invalid shape during AutoTVM task creation") diff --git a/python/tvm/autotvm/task/task.py b/python/tvm/autotvm/task/task.py index 349128f342f2..ab0cd9c15820 100644 --- a/python/tvm/autotvm/task/task.py +++ b/python/tvm/autotvm/task/task.py @@ -27,6 +27,7 @@ from tvm.ir import container from tvm.target import Target from tvm.te import placeholder, tensor +from tvm.target.target import refresh_host from tvm.tir import expr @@ -175,7 +176,7 @@ def __getstate__(self): # and restore the function by name when unpickling it. 
import cloudpickle # pylint: disable=import-outside-toplevel - self.target = Target(self.target, self.target_host) + self.target, self.target_host = refresh_host(self.target, self.target_host) return { "name": self.name, "args": self.args, @@ -196,8 +197,7 @@ def __setstate__(self, state): self.config_space = state["config_space"] self.func = cloudpickle.loads(state["func"]) self.flop = state["flop"] - self.target = Target(state["target"], state["target_host"]) - self.target_host = self.target.host + self.target, self.target_host = refresh_host(state["target"], state["target_host"]) def __repr__(self): return "Task(func_name=%s, args=%s, kwargs=%s, workload=%s)" % ( @@ -449,8 +449,7 @@ def create(task_name, args, target, target_host=None): if isinstance(target, str): target = Target(target) - target = Target(target, target_host) - target_host = target.host + target, target_host = refresh_host(target, target_host) # init config space ret.config_space = ConfigSpace() @@ -463,7 +462,7 @@ def create(task_name, args, target, target_host=None): ret.flop = ret.config_space.flop or compute_flop(sch) ret.target = target - ret.target_host = target_host + ret.target_host = target.host return ret diff --git a/python/tvm/contrib/peak.py b/python/tvm/contrib/peak.py index e0e020e4de95..b6a1dc59d385 100644 --- a/python/tvm/contrib/peak.py +++ b/python/tvm/contrib/peak.py @@ -20,6 +20,7 @@ import logging import tvm from tvm import te +from tvm.target.target import refresh_host from . import utils from .. import rpc @@ -106,11 +107,10 @@ def measure_bandwidth_sum( s[y].bind(yi, te.thread_axis("threadIdx.x")) s[y].unroll(k) - target = tvm.target.Target(target, target_host) - target_host = target.host + target, target_host = refresh_host(target, target_host) try: - func = tvm.build(s, [x, y], target, target_host=target_host) + func = tvm.build(s, [x, y], target) x = tvm.nd.empty((n,), dtype=dtype, ctx=ctx) y = tvm.nd.empty((n // m,), dtype=dtype, ctx=ctx) @@ -156,8 +156,7 @@ def measure_bandwidth_all_types( """ max_threads = target.max_num_threads - target = tvm.target.Target(target, target_host) - target_host = target.host + target, target_host = refresh_host(target, target_host) result = [] for base_type in ["float"]: @@ -235,8 +234,7 @@ def measure_compute_mad( max_threads = target.max_num_threads - target = tvm.target.Target(target, target_host) - target_host = target.host + target, target_host = refresh_host(target, target_host) base_type = str(base_type) + str(bits) dtype = base_type if lanes == 1 else base_type + "x" + str(lanes) @@ -281,7 +279,7 @@ def mad_func(x, y): s = te.create_schedule(y.op) try: - func = tvm.build(s, [y], target, target_host=target_host) + func = tvm.build(s, [y], target) func = _convert_to_remote(func, remote) time_f = func.time_evaluator(func.entry_name, ctx, number=n_times) y = tvm.nd.empty((n,), dtype=dtype, ctx=ctx) @@ -322,8 +320,7 @@ def measure_compute_all_types( result: list a list of (type_name, GFLOPS/GIOPS) pairs """ - target = tvm.target.Target(target, target_host) - target_host = target.host + target, target_host = refresh_host(target, target_host) result = [] for base_type in ["float", "int"]: @@ -369,8 +366,7 @@ def measure_peak_all(target, target_host, host, port): port: int """ - target = tvm.target.Target(target, target_host) - target_host = target.host + target, target_host = refresh_host(target, target_host) remote = rpc.connect(host, port) n_times = 20 diff --git a/python/tvm/driver/build_module.py b/python/tvm/driver/build_module.py index 
26ead83d820a..bf5a58d29cc5 100644 --- a/python/tvm/driver/build_module.py +++ b/python/tvm/driver/build_module.py @@ -232,8 +232,7 @@ def _build_for_device(input_mod, target, target_host): mdev : tvm.module A module that contains device code. """ - target = Target(target, target_host) - target_host = target.host + target, target_host = refresh_host(target, target_host) device_type = ndarray.context(target.kind.name, 0).device_type mod_mixed = input_mod diff --git a/python/tvm/driver/tvmc/autotuner.py b/python/tvm/driver/tvmc/autotuner.py index 6c8028fcc01f..d46c101d47e5 100644 --- a/python/tvm/driver/tvmc/autotuner.py +++ b/python/tvm/driver/tvmc/autotuner.py @@ -243,8 +243,8 @@ def drive_tune(args): ) target, extra_targets = common.target_from_cli(args.target) - target = tvm.target.Target(target, args.target_host) - target_host = target.host + target_host = args.target_host + target, target_host = refresh_host(target, target_host) mod, params = frontends.load_model(args.FILE, args.model_format, shape_dict=args.input_shapes) for codegen_from_cli in extra_targets: @@ -301,7 +301,6 @@ def drive_tune(args): mod=mod, params=params, target=target, - target_host=target_host, alter_layout=args.desired_layout, hardware_params=hardware_params, include_simple_tasks=args.include_simple_tasks, @@ -324,7 +323,6 @@ def drive_tune(args): mod=mod, params=params, target=target, - target_host=target_host, alter_layout=args.desired_layout, ) @@ -368,13 +366,11 @@ def autotvm_get_tuning_tasks(mod, params, target, target_host=None, alter_layout if alter_layout: mod = common.convert_graph_layout(mod, alter_layout) - target = tvm.target.Target(target, target_host) - target_host = target.host + target, target_host = refresh_host(target, target_host) tasks = autotvm.task.extract_from_program( mod["main"], target=target, - target_host=target_host, params=params, ) @@ -426,7 +422,6 @@ def autoscheduler_get_tuning_tasks( mod["main"], params, target=target, - target_host=target_host, hardware_params=hardware_params, include_simple_tasks=include_simple_tasks, ) diff --git a/python/tvm/driver/tvmc/compiler.py b/python/tvm/driver/tvmc/compiler.py index 2992f108ce12..55e4d2956fc8 100644 --- a/python/tvm/driver/tvmc/compiler.py +++ b/python/tvm/driver/tvmc/compiler.py @@ -27,6 +27,7 @@ from tvm import relay, runtime from tvm.contrib import cc from tvm.contrib import utils +from tvm.target.target import refresh_host from . 
import common, composite_target, frontends from .main import register_parser @@ -191,8 +192,8 @@ def compile_model( mod = common.convert_graph_layout(mod, alter_layout) tvm_target, extra_targets = common.target_from_cli(target) - tvm_target = tvm.target.Target(tvm_target, tvm_target if not target_host else target_host) - target_host = tvm_target.host + target_host = tvm_target if not target_host else target_host + tvm_target, target_host = refresh_host(tvm_target, target_host) for codegen_from_cli in extra_targets: codegen = composite_target.get_codegen_by_target(codegen_from_cli["name"]) @@ -215,20 +216,16 @@ def compile_model( config["relay.backend.use_auto_scheduler"] = True with tvm.transform.PassContext(opt_level=3, config=config): logger.debug("building relay graph with autoscheduler") - graph_module = relay.build( - mod, target=target, params=params, target_host=target_host - ) + graph_module = relay.build(mod, target=target, params=params) else: with autotvm.apply_history_best(tuning_records): with tvm.transform.PassContext(opt_level=3, config=config): logger.debug("building relay graph with tuning records") - graph_module = relay.build( - mod, tvm_target, params=params, target_host=target_host - ) + graph_module = relay.build(mod, tvm_target, params=params) else: with tvm.transform.PassContext(opt_level=3, config=config): logger.debug("building relay graph (no tuning records provided)") - graph_module = relay.build(mod, tvm_target, params=params, target_host=target_host) + graph_module = relay.build(mod, tvm_target, params=params) # Generate output dump files with sources dump_code = dump_code or [] diff --git a/python/tvm/exec/measure_peak.py b/python/tvm/exec/measure_peak.py index 3b502a96d09c..52d173d48d42 100644 --- a/python/tvm/exec/measure_peak.py +++ b/python/tvm/exec/measure_peak.py @@ -25,6 +25,7 @@ import argparse import logging +from tvm.target.target import refresh_host from ..contrib.peak import measure_peak_all @@ -43,6 +44,7 @@ def main(): args = parser.parse_args() logging.basicConfig(level=logging.INFO) + args.target, args.target_host = refresh_host(args.target, args.target_host) measure_peak_all(args.target, args.target_host, args.rpc_host, args.rpc_port) diff --git a/python/tvm/relay/backend/_backend.py b/python/tvm/relay/backend/_backend.py index 821178a05c8c..378cd7395e18 100644 --- a/python/tvm/relay/backend/_backend.py +++ b/python/tvm/relay/backend/_backend.py @@ -17,6 +17,7 @@ """The interface of expr function exposed from C++.""" import tvm._ffi import tvm.driver +from tvm.target.target import refresh_host @tvm._ffi.register_func("relay.backend.lower") @@ -80,8 +81,7 @@ def build(mod, target, target_host=None): """ if target_host == "": target_host = None - target = tvm.target.Target(target, target_host) - target_host = target.host + target, target_host = refresh_host(target, target_host) return tvm.driver.build(mod, target=target, target_host=target_host) diff --git a/python/tvm/relay/backend/vm.py b/python/tvm/relay/backend/vm.py index 8abd9b32e69e..a18c4b345c7f 100644 --- a/python/tvm/relay/backend/vm.py +++ b/python/tvm/relay/backend/vm.py @@ -67,7 +67,7 @@ def compile(mod, target=None, target_host=None, params=None): if params: compiler.set_params(params) target, target_host = refresh_host(target, target_host, target_is_key=False) - compiler.lower(mod, target, target_host) + compiler.lower(mod, target) compiler.codegen() return compiler.get_exec() diff --git a/python/tvm/relay/build_module.py b/python/tvm/relay/build_module.py index 
62d4c812517a..64b9c67355ea 100644 --- a/python/tvm/relay/build_module.py +++ b/python/tvm/relay/build_module.py @@ -209,7 +209,7 @@ def _build_module_no_factory(mod, target=None, target_host=None, params=None, mo This wrapper is suitable to be used from other programming languages as the runtime::Module can be freely passed between language boundaries. """ - return build(mod, target, target_host, params, mod_name).module + return build(mod, target, params=params, mod_name=mod_name).module def build(ir_mod, target=None, target_host=None, params=None, mod_name="default"): @@ -284,7 +284,7 @@ def build(ir_mod, target=None, target_host=None, params=None, mod_name="default" with tophub_context: bld_mod = BuildModule() - graph_json, runtime_mod, params = bld_mod.build(ir_mod, target, target_host, params) + graph_json, runtime_mod, params = bld_mod.build(mod=ir_mod, target=target, params=params) runtime_mod = _graph_runtime_factory.GraphRuntimeFactoryModule( ir_mod, target, graph_json, runtime_mod, mod_name, params ) From ddfdeb2ff43e5e68fe302ed06fa9a8c072b91971 Mon Sep 17 00:00:00 2001 From: Xiyou Zhou Date: Tue, 16 Mar 2021 01:09:10 -0700 Subject: [PATCH 49/69] Fix format & refresh function --- python/tvm/auto_scheduler/search_task.py | 1 - python/tvm/autotvm/task/relay_integration.py | 1 - python/tvm/relay/build_module.py | 2 +- python/tvm/target/target.py | 12 ++++++------ 4 files changed, 7 insertions(+), 9 deletions(-) diff --git a/python/tvm/auto_scheduler/search_task.py b/python/tvm/auto_scheduler/search_task.py index f3e876156394..deb434fa72fe 100644 --- a/python/tvm/auto_scheduler/search_task.py +++ b/python/tvm/auto_scheduler/search_task.py @@ -27,7 +27,6 @@ from tvm.runtime import Object, ndarray from tvm.driver.build_module import build -from tvm.target import Target from tvm.target.target import refresh_host from .measure import LocalBuilder, LocalRunner from .measure_record import load_best_record diff --git a/python/tvm/autotvm/task/relay_integration.py b/python/tvm/autotvm/task/relay_integration.py index 46deecd6bd92..ba594c42ac1a 100644 --- a/python/tvm/autotvm/task/relay_integration.py +++ b/python/tvm/autotvm/task/relay_integration.py @@ -25,7 +25,6 @@ import tvm from tvm.autotvm.task.dispatcher import DispatchContext, FallbackContext -from tvm.target import Target from tvm.target.target import refresh_host from .task import create from .topi_integration import TaskExtractEnv diff --git a/python/tvm/relay/build_module.py b/python/tvm/relay/build_module.py index 64b9c67355ea..53976188835e 100644 --- a/python/tvm/relay/build_module.py +++ b/python/tvm/relay/build_module.py @@ -209,7 +209,7 @@ def _build_module_no_factory(mod, target=None, target_host=None, params=None, mo This wrapper is suitable to be used from other programming languages as the runtime::Module can be freely passed between language boundaries. 
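    A sketch of the intended call shape, with an illustrative target string
    and export path (neither is part of this change):

        rt_mod = _build_module_no_factory(mod, target="llvm", params=params)
        rt_mod.export_library("/tmp/net.so")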
""" - return build(mod, target, params=params, mod_name=mod_name).module + return build(mod, target, target_host, params=params, mod_name=mod_name).module def build(ir_mod, target=None, target_host=None, params=None, mod_name="default"): diff --git a/python/tvm/target/target.py b/python/tvm/target/target.py index 3c9cd2a71320..c7ed91aefd5f 100644 --- a/python/tvm/target/target.py +++ b/python/tvm/target/target.py @@ -508,11 +508,7 @@ def refresh_host(target, host=None, target_is_key=True): target_is_key : Bool When the type of target is dict, whether Target is the key (Otherwise the value) """ - try: - target = Target(target, host) - host = target.host - return target, host - except (TypeError, ValueError): + if isinstance(target, dict): new_target = {} for tgt, mod in target.items(): if not target_is_key: @@ -522,4 +518,8 @@ def refresh_host(target, host=None, target_is_key=True): if not target_is_key: tgt, mod = mod, tgt new_target[tgt] = mod - return new_target, host + target = new_target + else: + target = Target(target, host) + host = target.host + return target, host From cb206ec78b51b54c39667608ad95466504d4427f Mon Sep 17 00:00:00 2001 From: Xiyou Zhou Date: Tue, 16 Mar 2021 01:51:55 -0700 Subject: [PATCH 50/69] Fix unit test bug --- python/tvm/autotvm/task/relay_integration.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/tvm/autotvm/task/relay_integration.py b/python/tvm/autotvm/task/relay_integration.py index ba594c42ac1a..4a2e36f72718 100644 --- a/python/tvm/autotvm/task/relay_integration.py +++ b/python/tvm/autotvm/task/relay_integration.py @@ -91,7 +91,7 @@ def extract_from_program(mod, params, target, target_host=None, ops=None): collected tasks """ target, target_host = refresh_host(target, target_host) - return extract_from_multiple_program([mod], [params], target, ops) + return extract_from_multiple_program([mod], [params], target, ops=ops) def extract_from_multiple_program(mods, params, target, target_host=None, ops=None): From ae4ca6855c12b252eeef5cd07b09c6941453db65 Mon Sep 17 00:00:00 2001 From: Xiyou Zhou Date: Tue, 16 Mar 2021 11:42:35 -0700 Subject: [PATCH 51/69] Fix bug in refreshing host --- python/tvm/auto_scheduler/relay_integration.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/tvm/auto_scheduler/relay_integration.py b/python/tvm/auto_scheduler/relay_integration.py index 6c1766c18d36..81a7befbd799 100644 --- a/python/tvm/auto_scheduler/relay_integration.py +++ b/python/tvm/auto_scheduler/relay_integration.py @@ -125,7 +125,7 @@ def extract_tasks( # create search tasks tasks = [] weights = [] - target = refresh_host(target, target_host) + target, target_host = refresh_host(target, target_host) for wkl_key, weight in env.wkl_key_to_weight.items(): tasks.append( SearchTask( From 26a86471158c3f7fba149f42e53bddc47c083142 Mon Sep 17 00:00:00 2001 From: Xiyou Zhou Date: Tue, 16 Mar 2021 15:25:08 -0700 Subject: [PATCH 52/69] Fix bug --- python/tvm/relay/backend/vm.py | 4 ++-- python/tvm/relay/build_module.py | 5 ++--- python/tvm/target/target.py | 2 +- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/python/tvm/relay/backend/vm.py b/python/tvm/relay/backend/vm.py index a18c4b345c7f..2f220ea50830 100644 --- a/python/tvm/relay/backend/vm.py +++ b/python/tvm/relay/backend/vm.py @@ -133,7 +133,7 @@ def lower(self, mod, target=None, target_host=None): target = self._update_target(target) target_host = self._update_target_host(target, target_host) - target, target_host = refresh_host(target, target_host) + 
target, target_host = refresh_host(target, target_host, target_is_key=False) tophub_context = self._tophub_context(target) with tophub_context: @@ -173,7 +173,7 @@ def optimize(self, mod, target=None, target_host=None, params=None): target = self._update_target(target) target_host = self._update_target_host(target, target_host) - target, target_host = refresh_host(target, target_host) + target, target_host = refresh_host(target, target_host, target_is_key=False) if params: self.set_params(params) diff --git a/python/tvm/relay/build_module.py b/python/tvm/relay/build_module.py index 53976188835e..8badcdaeacaa 100644 --- a/python/tvm/relay/build_module.py +++ b/python/tvm/relay/build_module.py @@ -131,8 +131,7 @@ def build(self, mod, target=None, target_host=None, params=None): autotvm.GLOBAL_SCOPE.silent = use_auto_scheduler # Assume the target host of all targets in heterogenous target are identical - target, target_host = refresh_host(target, target_host) - + target, target_host = refresh_host(target, target_host, target_is_key=False) self._build(mod, target, target_host) autotvm.GLOBAL_SCOPE.silent = old_autotvm_silent @@ -273,7 +272,7 @@ def build(ir_mod, target=None, target_host=None, params=None, mod_name="default" elif target_host: raise ValueError("target host must be the type of str, " + "tvm.target.Target, or None") - target, target_host = refresh_host(target, target_host) + target, target_host = refresh_host(target, target_host, target_is_key=False) # If current dispatch context is fallback context (the default root context), # then load pre-tuned parameters from TopHub diff --git a/python/tvm/target/target.py b/python/tvm/target/target.py index c7ed91aefd5f..2ee964c00f9f 100644 --- a/python/tvm/target/target.py +++ b/python/tvm/target/target.py @@ -508,7 +508,7 @@ def refresh_host(target, host=None, target_is_key=True): target_is_key : Bool When the type of target is dict, whether Target is the key (Otherwise the value) """ - if isinstance(target, dict): + if isinstance(target, dict) and "kind" not in target: new_target = {} for tgt, mod in target.items(): if not target_is_key: From 83f290b544bbde283da40faf143beb27b278db9e Mon Sep 17 00:00:00 2001 From: Xiyou Zhou Date: Tue, 16 Mar 2021 16:30:06 -0700 Subject: [PATCH 53/69] Add SetHost function --- include/tvm/target/target.h | 2 ++ python/tvm/target/target.py | 4 ++++ src/target/target.cc | 8 ++++++++ tests/python/unittest/test_target_target.py | 11 +++++++++++ 4 files changed, 25 insertions(+) diff --git a/include/tvm/target/target.h b/include/tvm/target/target.h index 2469f0477a99..8398f9bd409c 100644 --- a/include/tvm/target/target.h +++ b/include/tvm/target/target.h @@ -65,6 +65,8 @@ class TargetNode : public Object { TVM_DLL Map Export() const; /*! \return The Optional typed target host of the TargetNode */ TVM_DLL Optional GetHost() const; + /*! \return Set target host of the TargetNode */ + TVM_DLL void SetHost(Target); void VisitAttrs(AttrVisitor* v) { v->Visit("kind", &kind); diff --git a/python/tvm/target/target.py b/python/tvm/target/target.py index 2ee964c00f9f..142cf625703f 100644 --- a/python/tvm/target/target.py +++ b/python/tvm/target/target.py @@ -113,6 +113,10 @@ def __exit__(self, ptype, value, trace): def export(self): return _ffi_api.TargetExport(self) + def set_host(self, host=None): + if host is not None: + _ffi_api.SetHost(self, Target(host)) + @staticmethod def current(allow_none=True): """Returns the current target. 
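A minimal sketch of how the new set_host hook is meant to be driven from Python,
mirroring the unit test added at the end of this patch (the target strings are
illustrative):

    import tvm

    tgt = tvm.target.Target("cuda")
    tgt.set_host(tvm.target.Target("llvm"))
    assert tgt.kind.name == "cuda"
    assert tgt.host.kind.name == "llvm"

Since set_host treats host=None as a no-op, callers can forward an optional host
unconditionally.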
diff --git a/src/target/target.cc b/src/target/target.cc index a3f9e9a3fe66..a458acafacdd 100644 --- a/src/target/target.cc +++ b/src/target/target.cc @@ -51,6 +51,9 @@ class TargetInternal { static ObjectPtr FromRawString(const String& target_str); static ObjectPtr FromConfig(std::unordered_map config); static void ConstructorDispatcher(TVMArgs args, TVMRetValue* rv); + static void SetHost(Target target, Target host) { + static_cast(target.data_.get())->SetHost(host); + } }; /********** Helper functions **********/ @@ -446,6 +449,10 @@ Optional TargetNode::GetHost() const { return GetRef>(this->host.as()); } +void TargetNode::SetHost(Target host) { + this->host = host; +} + /*! \brief Entry to hold the Target context stack. */ struct TVMTargetThreadLocalEntry { /*! \brief The current target context */ @@ -680,6 +687,7 @@ TVM_REGISTER_GLOBAL("target.TargetEnterScope").set_body_typed(TargetInternal::En TVM_REGISTER_GLOBAL("target.TargetExitScope").set_body_typed(TargetInternal::ExitScope); TVM_REGISTER_GLOBAL("target.TargetCurrent").set_body_typed(Target::Current); TVM_REGISTER_GLOBAL("target.TargetExport").set_body_typed(TargetInternal::Export); +TVM_REGISTER_GLOBAL("target.SetHost").set_body_typed(TargetInternal::SetHost); TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) .set_dispatch([](const ObjectRef& obj, ReprPrinter* p) { diff --git a/tests/python/unittest/test_target_target.py b/tests/python/unittest/test_target_target.py index 7598b807f59c..2b9edd60ba19 100644 --- a/tests/python/unittest/test_target_target.py +++ b/tests/python/unittest/test_target_target.py @@ -252,5 +252,16 @@ def test_target_host_merge_3(): tvm.target.Target(tvm.target.Target("cuda --host llvm"), 12.34) +def test_target_set_host(): + tgt = tvm.target.Target("cuda") + llvm = tvm.target.Target("llvm") + tgt.set_host(llvm) + assert tgt.kind.name == "cuda" + assert tgt.host.kind.name == "llvm" + cuda_host = tvm.target.Target("cuda") + tgt.set_host(cuda_host) + assert tgt.host.kind.name == "cuda" + + if __name__ == "__main__": sys.exit(pytest.main([__file__] + sys.argv[1:])) From 47b072c03cbf423dcb686ba8eb6a2e2bb1ea1685 Mon Sep 17 00:00:00 2001 From: Xiyou Zhou Date: Tue, 16 Mar 2021 16:54:34 -0700 Subject: [PATCH 54/69] Update export function --- src/target/target.cc | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/target/target.cc b/src/target/target.cc index a458acafacdd..6950bfa33905 100644 --- a/src/target/target.cc +++ b/src/target/target.cc @@ -439,9 +439,10 @@ Map TargetNode::Export() const { if (this->host.defined()) { result.Set("host", this->GetHost().value_or(Target())->Export()); } - for (const auto& kv : attrs) { - result.Set(kv.first, kv.second); - } + for (const auto& kv : attrs) + if (kv.first != "host") { + result.Set(kv.first, kv.second); + } return result; } From bef6fbb9a5754f8169883b9cab3fd40626a46dd2 Mon Sep 17 00:00:00 2001 From: Xiyou Zhou Date: Tue, 16 Mar 2021 18:41:20 -0700 Subject: [PATCH 55/69] Fix format --- src/target/target.cc | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/target/target.cc b/src/target/target.cc index 6950bfa33905..016f3acb4fbf 100644 --- a/src/target/target.cc +++ b/src/target/target.cc @@ -450,9 +450,7 @@ Optional TargetNode::GetHost() const { return GetRef>(this->host.as()); } -void TargetNode::SetHost(Target host) { - this->host = host; -} +void TargetNode::SetHost(Target host) { this->host = host; } /*! \brief Entry to hold the Target context stack. 
 */
 struct TVMTargetThreadLocalEntry {

From 6771f2d67b5fbaa7c5755bbc68fff5d81692023f Mon Sep 17 00:00:00 2001
From: Xiyou Zhou
Date: Wed, 17 Mar 2021 02:10:31 -0700
Subject: [PATCH 56/69] Fix export bug in target

---
 src/target/target.cc | 21 ++++++++++-----------
 1 file changed, 10 insertions(+), 11 deletions(-)

diff --git a/src/target/target.cc b/src/target/target.cc
index 016f3acb4fbf..cbe6598b4b0b 100644
--- a/src/target/target.cc
+++ b/src/target/target.cc
@@ -439,10 +439,9 @@ Map TargetNode::Export() const {
   if (this->host.defined()) {
     result.Set("host", this->GetHost().value_or(Target())->Export());
   }
-  for (const auto& kv : attrs)
-    if (kv.first != "host") {
-      result.Set(kv.first, kv.second);
-    }
+  for (const auto& kv : attrs) {
+    result.Set(kv.first, kv.second);
+  }
   return result;
 }
 
@@ -645,6 +644,13 @@ ObjectPtr TargetInternal::FromConfig(std::unordered_map
     target->keys = DeduplicateKeys(keys);
     config.erase(kKeys);
   }
+  // parse host
+  if (config.count(kHost)) {
+    target->host = PackedFunc(ConstructorDispatcher)(config[kHost]).AsObjectRef();
+    config.erase(kHost);
+  } else {
+    target->host = NullOpt;
+  }
   // parse attrs
   std::unordered_map attrs;
   for (const auto& cfg_kv : config) {
@@ -657,13 +663,6 @@ ObjectPtr TargetInternal::FromConfig(std::unordered_map
     }
   }
-  // parse host
-  if (config.count(kHost)) {
-    target->host = PackedFunc(ConstructorDispatcher)(config[kHost]).AsObjectRef();
-    config.erase(kHost);
-  } else {
-    target->host = NullOpt;
-  }
   // set default attribute values if they do not exist
   for (const auto& kv : target->kind->key2default_) {
     if (!attrs.count(kv.first)) {

From 4442fbadc43c0220ec5a111f6d440f445f1f746f Mon Sep 17 00:00:00 2001
From: Xiyou Zhou
Date: Wed, 17 Mar 2021 10:45:52 -0700
Subject: [PATCH 57/69] Fix bug on host referencing

---
 tests/python/unittest/test_target_target.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/python/unittest/test_target_target.py b/tests/python/unittest/test_target_target.py
index 2b9edd60ba19..ac152447bc4a 100644
--- a/tests/python/unittest/test_target_target.py
+++ b/tests/python/unittest/test_target_target.py
@@ -121,7 +121,7 @@ def test_config_map():
 def test_composite_target():
     tgt = tvm.target.Target("composite --host=llvm --devices=cuda,opencl")
     assert tgt.kind.name == "composite"
-    assert tgt.attrs["host"].kind.name == "llvm"
+    assert tgt.host.kind.name == "llvm"
     assert len(tgt.attrs["devices"]) == 2
     cuda_device, opencl_device = tgt.attrs["devices"]
     assert cuda_device.kind.name == "cuda"

From 542c927c45596d5ec38306c4638042bdc68269fb Mon Sep 17 00:00:00 2001
From: Xiyou Zhou
Date: Wed, 17 Mar 2021 15:19:51 -0700
Subject: [PATCH 58/69] Additional tests

---
 tests/python/unittest/test_target_target.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/tests/python/unittest/test_target_target.py b/tests/python/unittest/test_target_target.py
index ac152447bc4a..8a5aa5bd67a8 100644
--- a/tests/python/unittest/test_target_target.py
+++ b/tests/python/unittest/test_target_target.py
@@ -258,9 +258,14 @@ def test_target_set_host():
     tgt.set_host(llvm)
     assert tgt.kind.name == "cuda"
     assert tgt.host.kind.name == "llvm"
-    cuda_host = tvm.target.Target("cuda")
+    cuda_host = tvm.target.Target("nvidia/jetson-nano")
     tgt.set_host(cuda_host)
     assert tgt.host.kind.name == "cuda"
+    assert tgt.host.attrs["arch"] == "sm_53"
+    assert tgt.host.attrs["shared_memory_per_block"] == 49152
+    assert tgt.host.attrs["max_threads_per_block"] == 1024
+    assert tgt.host.attrs["thread_warp_size"] == 32
+    assert tgt.host.attrs["registers_per_block"] == 32768
 
 
 if __name__ == "__main__":
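At this point in the series, refresh_host is the single merge point for the legacy
(target, target_host) pair, including heterogeneous dicts. A sketch of the three
input shapes it accepts under the target.py definition above (the device key and
the empty IRModule are illustrative placeholders):

    import tvm
    from tvm.target.target import refresh_host

    # Plain pair: the host is folded into the target and also returned.
    tgt, host = refresh_host("cuda", "llvm")
    assert tgt.host.kind.name == "llvm" and host.kind.name == "llvm"

    # Dict keyed by Target (tvm.build style): every key gets the host attached.
    targets, host = refresh_host({"cuda": tvm.IRModule()}, "llvm")

    # Dict with Targets as values (relay style): flip with target_is_key=False.
    targets, host = refresh_host({"gpu": "cuda"}, "llvm", target_is_key=False)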
From 8a537b40b1a298479666b20f073835feee8ebb14 Mon Sep 17 00:00:00 2001 From: Xiyou Zhou Date: Thu, 18 Mar 2021 14:17:16 -0700 Subject: [PATCH 59/69] Address review issues --- include/tvm/target/target.h | 4 +--- python/tvm/auto_scheduler/measure.py | 2 +- python/tvm/auto_scheduler/relay_integration.py | 1 - .../autotvm/graph_tuner/base_graph_tuner.py | 3 ++- python/tvm/autotvm/measure/measure_methods.py | 1 - python/tvm/contrib/peak.py | 10 ++++------ python/tvm/driver/tvmc/autotuner.py | 8 ++++---- python/tvm/relay/backend/_backend.py | 5 ++--- python/tvm/relay/backend/vm.py | 8 +++----- python/tvm/relay/build_module.py | 8 ++++---- python/tvm/target/target.py | 14 +++++++------- src/auto_scheduler/measure_record.cc | 3 ++- src/relay/backend/build_module.cc | 18 +++++++++--------- src/target/target.cc | 6 +++--- 14 files changed, 42 insertions(+), 49 deletions(-) diff --git a/include/tvm/target/target.h b/include/tvm/target/target.h index 8398f9bd409c..94cc898ed56a 100644 --- a/include/tvm/target/target.h +++ b/include/tvm/target/target.h @@ -175,10 +175,8 @@ class Target : public ObjectRef { TVM_DLL void ExitWithScope(); }; -using TargetsMap = Map; - TVM_DLL void RefreshHost(Target*, Target*); -TVM_DLL void RefreshHost(TargetsMap*, Target*); +TVM_DLL void RefreshHost(Map*, Target*); TVM_DLL void RefreshHost(Map*, Target*); } // namespace tvm diff --git a/python/tvm/auto_scheduler/measure.py b/python/tvm/auto_scheduler/measure.py index f6738a0fcc0c..8da04c578bfe 100644 --- a/python/tvm/auto_scheduler/measure.py +++ b/python/tvm/auto_scheduler/measure.py @@ -603,6 +603,7 @@ def _timed_func(inp_serialized, build_func, verbose): tic = time.time() inp = MeasureInput.deserialize(inp_serialized) task = inp.task + task.target, task.target_host = refresh_host(task.target, task.target_host) error_no = MeasureErrorNo.NO_ERROR error_msg = None @@ -622,7 +623,6 @@ def _timed_func(inp_serialized, build_func, verbose): filename = os.path.join(dirname, "tmp_func." + build_func.output_format) try: - task.target, task.target_host = refresh_host(task.target, task.target_host) with transform.PassContext(): func = build_module.build(sch, args, target=task.target) func.export_library(filename, build_func) diff --git a/python/tvm/auto_scheduler/relay_integration.py b/python/tvm/auto_scheduler/relay_integration.py index 81a7befbd799..c47ca6e48c0b 100644 --- a/python/tvm/auto_scheduler/relay_integration.py +++ b/python/tvm/auto_scheduler/relay_integration.py @@ -125,7 +125,6 @@ def extract_tasks( # create search tasks tasks = [] weights = [] - target, target_host = refresh_host(target, target_host) for wkl_key, weight in env.wkl_key_to_weight.items(): tasks.append( SearchTask( diff --git a/python/tvm/autotvm/graph_tuner/base_graph_tuner.py b/python/tvm/autotvm/graph_tuner/base_graph_tuner.py index 7c55ec641021..e2239a329dd2 100644 --- a/python/tvm/autotvm/graph_tuner/base_graph_tuner.py +++ b/python/tvm/autotvm/graph_tuner/base_graph_tuner.py @@ -440,6 +440,8 @@ def benchmark_layout_transform( This might bring performance loss comparing to benchmarking layout transformation. 
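        A sketch of a typical call under these assumptions (the keyword values
        are illustrative):

            tuner.benchmark_layout_transform(
                min_exec_num=100, target_host="llvm", infer_layout=False
            )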
""" self._logger.info("Start to benchmark layout transformation...") + self._target, target_host = refresh_host(self._target, target_host) + if layout_records is None and infer_layout: raise RuntimeError("Requires some records to infer layout transformation time.") @@ -526,7 +528,6 @@ def _callback(_, inputs, results): continue records = [] - self._target, target_host = refresh_host(self._target, target_host) task = autotvm.task.create("layout_transform", args=args, target=self._target) tuner = autotvm.tuner.GridSearchTuner(task) tuner.tune(n_trial=1, measure_option=measure_option, callbacks=[_log_to_list(records)]) diff --git a/python/tvm/autotvm/measure/measure_methods.py b/python/tvm/autotvm/measure/measure_methods.py index 28dd98191c12..9eb99528d710 100644 --- a/python/tvm/autotvm/measure/measure_methods.py +++ b/python/tvm/autotvm/measure/measure_methods.py @@ -419,7 +419,6 @@ def set_task(self, task): def _build_func_common(measure_input, check_gpu=None, cuda_arch=None, build_option=None): """Common part for building a configuration""" target, task, config = measure_input - target, task.target_host = refresh_host(target, task.target_host) with target: diff --git a/python/tvm/contrib/peak.py b/python/tvm/contrib/peak.py index b6a1dc59d385..ea44a996ee69 100644 --- a/python/tvm/contrib/peak.py +++ b/python/tvm/contrib/peak.py @@ -87,6 +87,8 @@ def measure_bandwidth_sum( GBPS: float gigabyte per second """ + target, target_host = refresh_host(target, target_host) + n, m = total_item, item_per_thread n //= lanes @@ -107,8 +109,6 @@ def measure_bandwidth_sum( s[y].bind(yi, te.thread_axis("threadIdx.x")) s[y].unroll(k) - target, target_host = refresh_host(target, target_host) - try: func = tvm.build(s, [x, y], target) @@ -154,9 +154,8 @@ def measure_bandwidth_all_types( result: list a list of (type_name, GBPS) pairs """ - max_threads = target.max_num_threads - target, target_host = refresh_host(target, target_host) + max_threads = target.max_num_threads result = [] for base_type in ["float"]: @@ -226,6 +225,7 @@ def measure_compute_mad( GOPS: float giga operation per second """ + target, target_host = refresh_host(target, target_host) n = total_item @@ -234,8 +234,6 @@ def measure_compute_mad( max_threads = target.max_num_threads - target, target_host = refresh_host(target, target_host) - base_type = str(base_type) + str(bits) dtype = base_type if lanes == 1 else base_type + "x" + str(lanes) diff --git a/python/tvm/driver/tvmc/autotuner.py b/python/tvm/driver/tvmc/autotuner.py index d46c101d47e5..7423952c7264 100644 --- a/python/tvm/driver/tvmc/autotuner.py +++ b/python/tvm/driver/tvmc/autotuner.py @@ -363,11 +363,11 @@ def autotvm_get_tuning_tasks(mod, params, target, target_host=None, alter_layout tasks : list of autotvm.Tasks list of tasks to be tuned """ + target, target_host = refresh_host(target, target_host) + if alter_layout: mod = common.convert_graph_layout(mod, alter_layout) - target, target_host = refresh_host(target, target_host) - tasks = autotvm.task.extract_from_program( mod["main"], target=target, @@ -412,11 +412,11 @@ def autoscheduler_get_tuning_tasks( weights : List[int] the weight (i.e. 
the number of appearance) of extracted tasks """ + target, target_host = refresh_host(target, target_host) + if alter_layout: mod = common.convert_graph_layout(mod, alter_layout) - target, target_host = refresh_host(target, target_host) - # Extract the tasks tasks, task_weights = auto_scheduler.extract_tasks( mod["main"], diff --git a/python/tvm/relay/backend/_backend.py b/python/tvm/relay/backend/_backend.py index 378cd7395e18..30e3e3e7ad82 100644 --- a/python/tvm/relay/backend/_backend.py +++ b/python/tvm/relay/backend/_backend.py @@ -79,10 +79,9 @@ def build(mod, target, target_host=None): module : tvm.Module The runtime module. """ - if target_host == "": - target_host = None + target_host = None if target_host == "" else target_host target, target_host = refresh_host(target, target_host) - return tvm.driver.build(mod, target=target, target_host=target_host) + return tvm.driver.build(mod, target=target) @tvm._ffi.register_func("relay._tensor_value_repr") diff --git a/python/tvm/relay/backend/vm.py b/python/tvm/relay/backend/vm.py index 2f220ea50830..b3fdbe880c92 100644 --- a/python/tvm/relay/backend/vm.py +++ b/python/tvm/relay/backend/vm.py @@ -63,10 +63,10 @@ def compile(mod, target=None, target_host=None, params=None): exec : tvm.runtime.vm.Executable The VM executable that contains both library code and bytecode. """ + target, target_host = refresh_host(target, target_host, target_is_dict_key=False) compiler = VMCompiler() if params: compiler.set_params(params) - target, target_host = refresh_host(target, target_host, target_is_key=False) compiler.lower(mod, target) compiler.codegen() return compiler.get_exec() @@ -132,8 +132,7 @@ def lower(self, mod, target=None, target_host=None): """ target = self._update_target(target) target_host = self._update_target_host(target, target_host) - - target, target_host = refresh_host(target, target_host, target_is_key=False) + target, target_host = refresh_host(target, target_host, target_is_dict_key=False) tophub_context = self._tophub_context(target) with tophub_context: @@ -172,8 +171,7 @@ def optimize(self, mod, target=None, target_host=None, params=None): """ target = self._update_target(target) target_host = self._update_target_host(target, target_host) - - target, target_host = refresh_host(target, target_host, target_is_key=False) + target, target_host = refresh_host(target, target_host, target_is_dict_key=False) if params: self.set_params(params) diff --git a/python/tvm/relay/build_module.py b/python/tvm/relay/build_module.py index 8badcdaeacaa..72b09d9e3cec 100644 --- a/python/tvm/relay/build_module.py +++ b/python/tvm/relay/build_module.py @@ -115,6 +115,7 @@ def build(self, mod, target=None, target_host=None, params=None): The runtime factory for the TVM graph runtime. """ target = _update_target(target) + target, target_host = refresh_host(target, target_host, target_is_dict_key=False) # Setup the params. 
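        # (With the dict form, the merge above behaves like this sketch, where
        #  the device key is an illustrative placeholder:
        #      target, host = refresh_host({"gpu": "cuda"}, "llvm",
        #                                  target_is_dict_key=False)
        #  each value gets host "llvm" attached; the params are then set up below.)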
if params: @@ -130,8 +131,6 @@ def build(self, mod, target=None, target_host=None, params=None): old_autotvm_silent = autotvm.GLOBAL_SCOPE.silent autotvm.GLOBAL_SCOPE.silent = use_auto_scheduler - # Assume the target host of all targets in heterogenous target are identical - target, target_host = refresh_host(target, target_host, target_is_key=False) self._build(mod, target, target_host) autotvm.GLOBAL_SCOPE.silent = old_autotvm_silent @@ -208,7 +207,8 @@ def _build_module_no_factory(mod, target=None, target_host=None, params=None, mo This wrapper is suitable to be used from other programming languages as the runtime::Module can be freely passed between language boundaries. """ - return build(mod, target, target_host, params=params, mod_name=mod_name).module + target, target_host = refresh_host(target, target_host) + return build(mod, target, params=params, mod_name=mod_name).module def build(ir_mod, target=None, target_host=None, params=None, mod_name="default"): @@ -272,7 +272,7 @@ def build(ir_mod, target=None, target_host=None, params=None, mod_name="default" elif target_host: raise ValueError("target host must be the type of str, " + "tvm.target.Target, or None") - target, target_host = refresh_host(target, target_host, target_is_key=False) + target, target_host = refresh_host(target, target_host, target_is_dict_key=False) # If current dispatch context is fallback context (the default root context), # then load pre-tuned parameters from TopHub diff --git a/python/tvm/target/target.py b/python/tvm/target/target.py index 142cf625703f..5271d2c6696e 100644 --- a/python/tvm/target/target.py +++ b/python/tvm/target/target.py @@ -500,26 +500,26 @@ def _load_config_dict(config_dict_str): return config -def refresh_host(target, host=None, target_is_key=True): - """Helpfer function to return a target and target host after updating each other. +def refresh_host(target, host=None, target_is_dict_key=True): + """A helper function that merges a legacy "target, target_host" pair, then returns the merged target and its host field. 
Parameters ---------- - target : Union[str, Dict[str, Any], Target] + target : Union[str, Dict[str, Any], Target] The target or heterogeneous target - host : Union[str, Dict[str, Any], Target, None] + host : Union[str, Dict[str, Any], Target, None] The target host - target_is_key : Bool + target_is_dict_key : Bool When the type of target is dict, whether Target is the key (Otherwise the value) """ if isinstance(target, dict) and "kind" not in target: new_target = {} for tgt, mod in target.items(): - if not target_is_key: + if not target_is_dict_key: tgt, mod = mod, tgt if isinstance(tgt, (dict, str, Target)): tgt, host = refresh_host(tgt, host) - if not target_is_key: + if not target_is_dict_key: tgt, mod = mod, tgt new_target[tgt] = mod target = new_target diff --git a/src/auto_scheduler/measure_record.cc b/src/auto_scheduler/measure_record.cc index 144f2098db57..0a81baf62444 100644 --- a/src/auto_scheduler/measure_record.cc +++ b/src/auto_scheduler/measure_record.cc @@ -163,7 +163,8 @@ struct Handler<::tvm::auto_scheduler::SearchTaskNode> { writer->WriteArrayItem(std::string(data.workload_key)); writer->WriteArrayItem(data.target->str()); writer->WriteArrayItem(*data.hardware_params.get()); - ::tvm::Target target = data.target, target_host = data.target_host; + ::tvm::Target target = data.target; + ::tvm::Target target_host = data.target_host; ::tvm::RefreshHost(&target, &target_host); if (target_host.defined()) { writer->WriteArrayItem(target_host->str()); diff --git a/src/relay/backend/build_module.cc b/src/relay/backend/build_module.cc index f586b86f1e5a..68093eaca7d9 100644 --- a/src/relay/backend/build_module.cc +++ b/src/relay/backend/build_module.cc @@ -462,6 +462,15 @@ class RelayBuildModule : public runtime::ModuleNode { */ void BuildRelay(IRModule relay_module, const std::unordered_map& params) { + Target target_host = GetTargetHost(); + // If no target_host has been set, we choose a default one, which is + // llvm if "codegen.LLVMModuleCreate" is accessible. + const runtime::PackedFunc* pf = runtime::Registry::Get("codegen.LLVMModuleCreate"); + if (!target_host.defined()) target_host = (pf != nullptr) ? Target("llvm") : Target("stackvm"); + + // Update all the targets in the targets_ TargetsMap + RefreshHost(&targets_, &target_host); + // Relay IRModule -> IRModule optimizations. relay_module = Optimize(relay_module, targets_, params); // Get the updated function. @@ -477,15 +486,6 @@ class RelayBuildModule : public runtime::ModuleNode { auto lowered_funcs = graph_codegen_->GetIRModule(); - Target target_host = GetTargetHost(); - // If no target_host has been set, we choose a default one, which is - // llvm if "codegen.LLVMModuleCreate" is accessible. - const runtime::PackedFunc* pf = runtime::Registry::Get("codegen.LLVMModuleCreate"); - if (!target_host.defined()) target_host = (pf != nullptr) ? Target("llvm") : Target("stackvm"); - - // Update all the targets in the targets_ TargetsMap - RefreshHost(&targets_, &target_host); - // Generate a placeholder function that attaches linked params as its arguments. 
if (target_host->GetAttr("link-params").value_or(Bool(false))) { CHECK(pf != nullptr) << "Unable to link-params with no target_host and no llvm codegen."; diff --git a/src/target/target.cc b/src/target/target.cc index cbe6598b4b0b..8c9045b65f9e 100644 --- a/src/target/target.cc +++ b/src/target/target.cc @@ -63,8 +63,8 @@ void RefreshHost(Target* target, Target* target_host) { *target_host = (*target)->GetHost().value_or(Target()); } -void RefreshHost(TargetsMap* targets, Target* target_host) { - TargetsMap new_targets; +void RefreshHost(Map* targets, Target* target_host) { + Map new_targets; for (auto& it : *targets) { auto target = it.second; RefreshHost(&target, target_host); @@ -403,7 +403,7 @@ Target::Target(const Map& config) { Target::Target(Target target, Target host) { ObjectPtr n = make_object(*target.get()); - CHECK(!n->host.defined() || n->host == host) + CHECK(!n->host.defined() || n->host.same_as(host)) << "ValueError: Adding a host to a target whose host field has been defined"; // add target host into host field n->host = std::move(host); From 6f76c1d205a301b067215ba7b8b9498119085321 Mon Sep 17 00:00:00 2001 From: Xiyou Zhou Date: Thu, 18 Mar 2021 14:39:55 -0700 Subject: [PATCH 60/69] Fix format target.py --- python/tvm/target/target.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/tvm/target/target.py b/python/tvm/target/target.py index 5271d2c6696e..f589938470c7 100644 --- a/python/tvm/target/target.py +++ b/python/tvm/target/target.py @@ -501,7 +501,8 @@ def _load_config_dict(config_dict_str): def refresh_host(target, host=None, target_is_dict_key=True): - """A helper function that merges a legacy "target, target_host" pair, then returns the merged target and its host field. + """A helper function that merges a legacy "target, target_host" pair, then returns + the merged target and its host field. 
     Parameters
     ----------

From f46626f76afe2a3692cd57ae707788fca971718c Mon Sep 17 00:00:00 2001
From: Xiyou Zhou
Date: Mon, 29 Mar 2021 18:08:14 -0700
Subject: [PATCH 61/69] Fix issues and format

---
 include/tvm/target/target.h                   | 39 +++++++++--
 python/tvm/auto_scheduler/measure.py          | 11 +++-
 .../tvm/auto_scheduler/relay_integration.py   |  5 +-
 python/tvm/auto_scheduler/search_task.py      | 12 ++--
 .../autotvm/graph_tuner/base_graph_tuner.py   |  6 +-
 python/tvm/autotvm/measure/measure_methods.py |  4 +-
 python/tvm/autotvm/task/relay_integration.py  |  6 +-
 python/tvm/autotvm/task/task.py               | 12 ++--
 python/tvm/contrib/peak.py                    | 12 ++--
 python/tvm/driver/build_module.py             | 11 ++--
 python/tvm/driver/tvmc/autotuner.py           |  8 +--
 python/tvm/driver/tvmc/compiler.py            |  4 +-
 python/tvm/exec/measure_peak.py               |  6 +-
 python/tvm/relay/backend/_backend.py          |  4 +-
 python/tvm/relay/backend/vm.py                | 14 ++--
 python/tvm/relay/build_module.py              | 12 ++--
 python/tvm/target/target.py                   | 66 +++++++++---------
 src/auto_scheduler/feature.cc                 |  7 +-
 src/auto_scheduler/measure_record.cc          |  4 +-
 src/auto_scheduler/search_task.cc             |  4 +-
 src/driver/driver_api.cc                      | 10 +--
 src/relay/backend/build_module.cc             |  4 +-
 src/relay/backend/vm/compiler.cc              |  6 +-
 src/relay/transforms/memory_alloc.cc          |  4 +-
 src/target/target.cc                          | 27 ++++----
 .../unittest/test_auto_scheduler_measure.py   |  1 -
 tests/python/unittest/test_target_target.py   |  6 +-
 27 files changed, 182 insertions(+), 123 deletions(-)

diff --git a/include/tvm/target/target.h b/include/tvm/target/target.h
index 94cc898ed56a..c2e1e295323f 100644
--- a/include/tvm/target/target.h
+++ b/include/tvm/target/target.h
@@ -29,6 +29,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 
@@ -65,8 +66,6 @@ class TargetNode : public Object {
   TVM_DLL Map Export() const;
   /*! \return The Optional typed target host of the TargetNode */
   TVM_DLL Optional GetHost() const;
-  /*! \return Set target host of the TargetNode */
-  TVM_DLL void SetHost(Target);
 
   void VisitAttrs(AttrVisitor* v) {
     v->Visit("kind", &kind);
@@ -157,6 +156,13 @@ class Target : public ObjectRef {
   */
   TVM_DLL explicit Target(Target target, Target host);
   TVM_DEFINE_OBJECT_REF_METHODS(Target, ObjectRef, TargetNode);
+  /*!
+   * \brief Create a new Target object with the given target (without host) and target host.
+   * \param target The current Target typed object target, with or without host field.
+   * \param host The given Target typed object target host
+   * \return The new Target object with the given target and its host field set to the given host.
+   */
+  static Target WithHost(const Target&, const Target&);
 
  private:
   // enable with syntax.
@@ -174,10 +180,29 @@ class Target : public ObjectRef {
   */
   TVM_DLL void ExitWithScope();
 };
-
-TVM_DLL void RefreshHost(Target*, Target*);
-TVM_DLL void RefreshHost(Map*, Target*);
-TVM_DLL void RefreshHost(Map*, Target*);
-
+/*!
+ * \brief Check and update host field of the given legacy target and target host pair.
+ *  Note that this function is for legacy target API compatibility only and is not
+ *  recommended for other use.
+ * \param target The pointer to a Target typed object with host field to be updated
+ * \param host The pointer to a Target typed object for target host to be updated
+*/
+void CheckAndUpdateHostConsistency(Target*, Target*);
+/*!
+ * \brief Check and update host field of the given legacy heterogeneous targets and
+ *  target host. Note that this function is for legacy target API compatibility only
+ *  and is not recommended for other use.
+ * \param target The pointer to a Map object with values being Target objects
+ * \param host The Target typed object for target host to be updated
+*/
+void CheckAndUpdateHostConsistency(Map*, Target*);
+/*!
+ * \brief Check and update host field of the given legacy heterogeneous targets and
+ *  target host. Note that this function is for legacy target API compatibility only
+ *  and is not recommended for other use.
+ * \param target The pointer to a Map object with keys being Target objects
+ * \param host The Target typed object for target host to be updated
+*/
+void CheckAndUpdateHostConsistency(Map*, Target*);
 } // namespace tvm
 #endif // TVM_TARGET_TARGET_H_
diff --git a/python/tvm/auto_scheduler/measure.py b/python/tvm/auto_scheduler/measure.py
index 8da04c578bfe..322143b28594 100644
--- a/python/tvm/auto_scheduler/measure.py
+++ b/python/tvm/auto_scheduler/measure.py
@@ -44,7 +44,8 @@
 from tvm.ir import transform
 from tvm.autotvm.measure.measure_methods import set_cuda_target_arch
 from tvm.contrib import tar, ndk
-from tvm.target.target import refresh_host
+from tvm.target import Target
+
 from . import _ffi_api
 from .loop_state import StateObject
@@ -222,7 +223,9 @@ def recover_measure_input(inp, rebuild_state=False):
     from .search_task import SearchTask  # lazily import to avoid recursive dependency
 
     task = inp.task
-    task.target, task.target_host = refresh_host(task.target, task.target_host)
+    task.target, task.target_host = Target.check_and_update_host_consistency(
+        task.target, task.target_host
+    )
     new_task = SearchTask(
         workload_key=task.workload_key,
         target=task.target,
@@ -603,7 +606,9 @@ def _timed_func(inp_serialized, build_func, verbose):
     tic = time.time()
     inp = MeasureInput.deserialize(inp_serialized)
     task = inp.task
-    task.target, task.target_host = refresh_host(task.target, task.target_host)
+    task.target, task.target_host = Target.check_and_update_host_consistency(
+        task.target, task.target_host
+    )
 
     error_no = MeasureErrorNo.NO_ERROR
     error_msg = None
diff --git a/python/tvm/auto_scheduler/relay_integration.py b/python/tvm/auto_scheduler/relay_integration.py
index c47ca6e48c0b..6c5957e89de0 100644
--- a/python/tvm/auto_scheduler/relay_integration.py
+++ b/python/tvm/auto_scheduler/relay_integration.py
@@ -29,10 +29,11 @@
 from tvm import autotvm, transform
 from tvm.ir.transform import PassContext
 from tvm.runtime import convert_to_object
-from tvm.target.target import refresh_host
+
 from tvm.te.tensor import ComputeOp, PlaceholderOp, Tensor
 from tvm.tir import Reduce
 from tvm.tir import expr as _expr
+from tvm.target import Target
 
 from . import _ffi_api
 from .compute_dag import ComputeDAG, LayoutRewriteOption
@@ -109,7 +110,7 @@ def extract_tasks(
     """
     # pylint: disable=import-outside-toplevel
 
-    target, target_host = refresh_host(target, target_host)
+    target, target_host = Target.check_and_update_host_consistency(target, target_host)
 
     # Run the compiler to collect all TOPI calls during compilation.
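    # (check_and_update_host_consistency follows the old helper's contract, as
    #  every call site in this patch assumes; a sketch with illustrative strings:
    #      tgt, host = Target.check_and_update_host_consistency("cuda", "llvm")
    #      assert tgt.host.kind.name == "llvm" and host.kind.name == "llvm"
    #  the tracing environment below then records each TOPI call.)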
env = TracingEnvironment( diff --git a/python/tvm/auto_scheduler/search_task.py b/python/tvm/auto_scheduler/search_task.py index deb434fa72fe..dd7e89c69184 100644 --- a/python/tvm/auto_scheduler/search_task.py +++ b/python/tvm/auto_scheduler/search_task.py @@ -27,7 +27,7 @@ from tvm.runtime import Object, ndarray from tvm.driver.build_module import build -from tvm.target.target import refresh_host +from tvm.target import Target from .measure import LocalBuilder, LocalRunner from .measure_record import load_best_record from .workload_registry import make_workload_key @@ -394,7 +394,7 @@ def __init__( assert target is not None, "Must specify a target." - target, target_host = refresh_host(target, target_host) + target, target_host = Target.check_and_update_host_consistency(target, target_host) if layout_rewrite_option is None: layout_rewrite_option = LayoutRewriteOption.get_target_default(target) @@ -504,7 +504,9 @@ def print_best(self, log_file, print_mode="schedule"): raise ValueError("Invalid print_mode: %s" % print_mode) def __getstate__(self): - self.target, self.target_host = refresh_host(self.target, self.target_host) + self.target, self.target_host = Target.check_and_update_host_consistency( + self.target, self.target_host + ) return { "compute_dag": self.compute_dag, "workload_key": self.workload_key, @@ -529,7 +531,9 @@ def __setstate__(self, state): if workload[0] not in WORKLOAD_FUNC_REGISTRY: register_workload_tensors(state["workload_key"], state["compute_dag"].tensors) - state["target"], state["target_host"] = refresh_host(state["target"], state["target_host"]) + state["target"], state["target_host"] = Target.check_and_update_host_consistency( + state["target"], state["target_host"] + ) self.__init_handle_by_constructor__( _ffi_api.SearchTask, state["compute_dag"], diff --git a/python/tvm/autotvm/graph_tuner/base_graph_tuner.py b/python/tvm/autotvm/graph_tuner/base_graph_tuner.py index e2239a329dd2..6605ba733aac 100644 --- a/python/tvm/autotvm/graph_tuner/base_graph_tuner.py +++ b/python/tvm/autotvm/graph_tuner/base_graph_tuner.py @@ -28,7 +28,7 @@ from tvm.autotvm.task import get_config from tvm.autotvm.record import encode, load_from_file from tvm.autotvm.measure import MeasureResult, MeasureInput -from tvm.target.target import refresh_host +from tvm.target import Target from ...target import Target from .utils import ( @@ -440,7 +440,9 @@ def benchmark_layout_transform( This might bring performance loss comparing to benchmarking layout transformation. 
""" self._logger.info("Start to benchmark layout transformation...") - self._target, target_host = refresh_host(self._target, target_host) + self._target, target_host = Target.check_and_update_host_consistency( + self._target, target_host + ) if layout_records is None and infer_layout: raise RuntimeError("Requires some records to infer layout transformation time.") diff --git a/python/tvm/autotvm/measure/measure_methods.py b/python/tvm/autotvm/measure/measure_methods.py index 9eb99528d710..3042d93f01fd 100644 --- a/python/tvm/autotvm/measure/measure_methods.py +++ b/python/tvm/autotvm/measure/measure_methods.py @@ -40,7 +40,7 @@ from tvm.error import TVMError from tvm.driver import build from tvm.contrib import nvcc, ndk, tar -from tvm.target.target import refresh_host +from tvm.target import Target from ..utils import get_const_tuple from ..env import AutotvmGlobalScope @@ -419,7 +419,7 @@ def set_task(self, task): def _build_func_common(measure_input, check_gpu=None, cuda_arch=None, build_option=None): """Common part for building a configuration""" target, task, config = measure_input - target, task.target_host = refresh_host(target, task.target_host) + target, task.target_host = Target.check_and_update_host_consistency(target, task.target_host) with target: s, args = task.instantiate(config) diff --git a/python/tvm/autotvm/task/relay_integration.py b/python/tvm/autotvm/task/relay_integration.py index 4a2e36f72718..15145b635817 100644 --- a/python/tvm/autotvm/task/relay_integration.py +++ b/python/tvm/autotvm/task/relay_integration.py @@ -25,7 +25,7 @@ import tvm from tvm.autotvm.task.dispatcher import DispatchContext, FallbackContext -from tvm.target.target import refresh_host +from tvm.target import Target from .task import create from .topi_integration import TaskExtractEnv @@ -90,7 +90,7 @@ def extract_from_program(mod, params, target, target_host=None, ops=None): task: Array of autotvm.task.Task collected tasks """ - target, target_host = refresh_host(target, target_host) + target, target_host = Target.check_and_update_host_consistency(target, target_host) return extract_from_multiple_program([mod], [params], target, ops=ops) @@ -125,7 +125,7 @@ def extract_from_multiple_program(mods, params, target, target_host=None, ops=No env = TaskExtractEnv.get() # merge target and target host - target, target_host = refresh_host(target, target_host) + target, target_host = Target.check_and_update_host_consistency(target, target_host) # run compiler to collect all TOPI calls during compilation env.reset(ops) diff --git a/python/tvm/autotvm/task/task.py b/python/tvm/autotvm/task/task.py index ab0cd9c15820..720990dfc70f 100644 --- a/python/tvm/autotvm/task/task.py +++ b/python/tvm/autotvm/task/task.py @@ -27,7 +27,7 @@ from tvm.ir import container from tvm.target import Target from tvm.te import placeholder, tensor -from tvm.target.target import refresh_host +from tvm.target import Target from tvm.tir import expr @@ -176,7 +176,9 @@ def __getstate__(self): # and restore the function by name when unpickling it. 
import cloudpickle # pylint: disable=import-outside-toplevel - self.target, self.target_host = refresh_host(self.target, self.target_host) + self.target, self.target_host = Target.check_and_update_host_consistency( + self.target, self.target_host + ) return { "name": self.name, "args": self.args, @@ -197,7 +199,9 @@ def __setstate__(self, state): self.config_space = state["config_space"] self.func = cloudpickle.loads(state["func"]) self.flop = state["flop"] - self.target, self.target_host = refresh_host(state["target"], state["target_host"]) + self.target, self.target_host = Target.check_and_update_host_consistency( + state["target"], state["target_host"] + ) def __repr__(self): return "Task(func_name=%s, args=%s, kwargs=%s, workload=%s)" % ( @@ -449,7 +453,7 @@ def create(task_name, args, target, target_host=None): if isinstance(target, str): target = Target(target) - target, target_host = refresh_host(target, target_host) + target, target_host = Target.check_and_update_host_consistency(target, target_host) # init config space ret.config_space = ConfigSpace() diff --git a/python/tvm/contrib/peak.py b/python/tvm/contrib/peak.py index ea44a996ee69..0931fc737606 100644 --- a/python/tvm/contrib/peak.py +++ b/python/tvm/contrib/peak.py @@ -20,7 +20,7 @@ import logging import tvm from tvm import te -from tvm.target.target import refresh_host +from tvm.target import Target from . import utils from .. import rpc @@ -87,7 +87,7 @@ def measure_bandwidth_sum( GBPS: float gigabyte per second """ - target, target_host = refresh_host(target, target_host) + target, target_host = Target.check_and_update_host_consistency(target, target_host) n, m = total_item, item_per_thread n //= lanes @@ -154,7 +154,7 @@ def measure_bandwidth_all_types( result: list a list of (type_name, GBPS) pairs """ - target, target_host = refresh_host(target, target_host) + target, target_host = Target.check_and_update_host_consistency(target, target_host) max_threads = target.max_num_threads result = [] @@ -225,7 +225,7 @@ def measure_compute_mad( GOPS: float giga operation per second """ - target, target_host = refresh_host(target, target_host) + target, target_host = Target.check_and_update_host_consistency(target, target_host) n = total_item @@ -318,7 +318,7 @@ def measure_compute_all_types( result: list a list of (type_name, GFLOPS/GIOPS) pairs """ - target, target_host = refresh_host(target, target_host) + target, target_host = Target.check_and_update_host_consistency(target, target_host) result = [] for base_type in ["float", "int"]: @@ -364,7 +364,7 @@ def measure_peak_all(target, target_host, host, port): port: int """ - target, target_host = refresh_host(target, target_host) + target, target_host = Target.check_and_update_host_consistency(target, target_host) remote = rpc.connect(host, port) n_times = 20 diff --git a/python/tvm/driver/build_module.py b/python/tvm/driver/build_module.py index bf5a58d29cc5..e96d7a8023ba 100644 --- a/python/tvm/driver/build_module.py +++ b/python/tvm/driver/build_module.py @@ -30,7 +30,6 @@ from tvm.te import tensor from tvm.te import schedule from tvm.target import Target -from tvm.target.target import refresh_host def get_binds(args, compact=False, binds=None): @@ -232,7 +231,7 @@ def _build_for_device(input_mod, target, target_host): mdev : tvm.module A module that contains device code. 
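# Editor's note: a hedged sketch of the legacy call path these driver hunks
# keep working. tvm.build still accepts a separate target_host and merges the
# pair internally before host and device code are split; the schedule below
# is an arbitrary example.
import tvm
from tvm import te

n = te.var("n")
A = te.placeholder((n,), name="A")
B = te.compute((n,), lambda i: A[i] + 1.0, name="B")
s = te.create_schedule(B.op)
# Old-style (target, target_host) pair; equivalent to passing a target whose
# host field is already set.
mod = tvm.build(s, [A, B], target="llvm", target_host="llvm")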
""" - target, target_host = refresh_host(target, target_host) + target, target_host = Target.check_and_update_host_consistency(target, target_host) device_type = ndarray.context(target.kind.name, 0).device_type mod_mixed = input_mod @@ -399,7 +398,9 @@ def build(inputs, args=None, target=None, target_host=None, name="default_functi if not isinstance(mod, tvm.IRModule): raise ValueError("inputs must be Schedule, IRModule," "or dict of str to IRModule.") - target_input_mod, target_host = refresh_host(target_input_mod, target_host) + target_input_mod, target_host = Target.check_and_update_host_consistency( + target_input_mod, target_host + ) if not target_host: for tar, mod in target_input_mod.items(): @@ -411,7 +412,9 @@ def build(inputs, args=None, target=None, target_host=None, name="default_functi if not target_host: target_host = "llvm" if tvm.runtime.enabled("llvm") else "stackvm" - target_input_mod, target_host = refresh_host(target_input_mod, target_host) + target_input_mod, target_host = Target.check_and_update_host_consistency( + target_input_mod, target_host + ) mod_host_all = tvm.IRModule({}) diff --git a/python/tvm/driver/tvmc/autotuner.py b/python/tvm/driver/tvmc/autotuner.py index 7423952c7264..df3c6728596c 100644 --- a/python/tvm/driver/tvmc/autotuner.py +++ b/python/tvm/driver/tvmc/autotuner.py @@ -28,7 +28,7 @@ from tvm.autotvm.tuner import GridSearchTuner from tvm.autotvm.tuner import RandomTuner from tvm.autotvm.tuner import XGBTuner -from tvm.target.target import refresh_host +from tvm.target import Target from . import common, composite_target, frontends from .common import TVMCException @@ -244,7 +244,7 @@ def drive_tune(args): target, extra_targets = common.target_from_cli(args.target) target_host = args.target_host - target, target_host = refresh_host(target, target_host) + target, target_host = Target.check_and_update_host_consistency(target, target_host) mod, params = frontends.load_model(args.FILE, args.model_format, shape_dict=args.input_shapes) for codegen_from_cli in extra_targets: @@ -363,7 +363,7 @@ def autotvm_get_tuning_tasks(mod, params, target, target_host=None, alter_layout tasks : list of autotvm.Tasks list of tasks to be tuned """ - target, target_host = refresh_host(target, target_host) + target, target_host = Target.check_and_update_host_consistency(target, target_host) if alter_layout: mod = common.convert_graph_layout(mod, alter_layout) @@ -412,7 +412,7 @@ def autoscheduler_get_tuning_tasks( weights : List[int] the weight (i.e. the number of appearance) of extracted tasks """ - target, target_host = refresh_host(target, target_host) + target, target_host = Target.check_and_update_host_consistency(target, target_host) if alter_layout: mod = common.convert_graph_layout(mod, alter_layout) diff --git a/python/tvm/driver/tvmc/compiler.py b/python/tvm/driver/tvmc/compiler.py index 55e4d2956fc8..4eeddf0ac4be 100644 --- a/python/tvm/driver/tvmc/compiler.py +++ b/python/tvm/driver/tvmc/compiler.py @@ -27,7 +27,7 @@ from tvm import relay, runtime from tvm.contrib import cc from tvm.contrib import utils -from tvm.target.target import refresh_host +from tvm.target import Target from . 
import common, composite_target, frontends from .main import register_parser @@ -193,7 +193,7 @@ def compile_model( tvm_target, extra_targets = common.target_from_cli(target) target_host = tvm_target if not target_host else target_host - tvm_target, target_host = refresh_host(tvm_target, target_host) + tvm_target, target_host = Target.check_and_update_host_consistency(tvm_target, target_host) for codegen_from_cli in extra_targets: codegen = composite_target.get_codegen_by_target(codegen_from_cli["name"]) diff --git a/python/tvm/exec/measure_peak.py b/python/tvm/exec/measure_peak.py index 52d173d48d42..207e7875da16 100644 --- a/python/tvm/exec/measure_peak.py +++ b/python/tvm/exec/measure_peak.py @@ -25,7 +25,7 @@ import argparse import logging -from tvm.target.target import refresh_host +from tvm.target import Target from ..contrib.peak import measure_peak_all @@ -44,7 +44,9 @@ def main(): args = parser.parse_args() logging.basicConfig(level=logging.INFO) - args.target, args.target_host = refresh_host(args.target, args.target_host) + args.target, args.target_host = Target.check_and_update_host_consistency( + args.target, args.target_host + ) measure_peak_all(args.target, args.target_host, args.rpc_host, args.rpc_port) diff --git a/python/tvm/relay/backend/_backend.py b/python/tvm/relay/backend/_backend.py index 30e3e3e7ad82..1d6d0e0ffb8b 100644 --- a/python/tvm/relay/backend/_backend.py +++ b/python/tvm/relay/backend/_backend.py @@ -17,7 +17,7 @@ """The interface of expr function exposed from C++.""" import tvm._ffi import tvm.driver -from tvm.target.target import refresh_host +from tvm.target import Target @tvm._ffi.register_func("relay.backend.lower") @@ -80,7 +80,7 @@ def build(mod, target, target_host=None): The runtime module. """ target_host = None if target_host == "" else target_host - target, target_host = refresh_host(target, target_host) + target, target_host = Target.check_and_update_host_consistency(target, target_host) return tvm.driver.build(mod, target=target) diff --git a/python/tvm/relay/backend/vm.py b/python/tvm/relay/backend/vm.py index b3fdbe880c92..6d8961b876f1 100644 --- a/python/tvm/relay/backend/vm.py +++ b/python/tvm/relay/backend/vm.py @@ -28,7 +28,7 @@ from tvm import autotvm from tvm.relay import expr as _expr from tvm.relay.backend.interpreter import Executor -from tvm.target.target import refresh_host +from tvm.target import Target from . import _vm @@ -63,7 +63,9 @@ def compile(mod, target=None, target_host=None, params=None): exec : tvm.runtime.vm.Executable The VM executable that contains both library code and bytecode. 
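# Editor's note (illustrative, not part of the patch): the VM paths below pass
# target_is_dict_key=False because Relay VM targets are keyed by device type,
# i.e. the Target sits on the value side of the dict. A minimal sketch; the
# integer keys are example device types.
from tvm.target import Target

targets = {1: Target("llvm"), 2: Target("cuda")}
targets, host = Target.check_and_update_host_consistency(
    targets, "llvm", target_is_dict_key=False
)
# Every value-side Target now carries the merged host.
assert all(t.host.kind.name == "llvm" for t in targets.values())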
""" - target, target_host = refresh_host(target, target_host, target_is_dict_key=False) + target, target_host = Target.check_and_update_host_consistency( + target, target_host, target_is_dict_key=False + ) compiler = VMCompiler() if params: compiler.set_params(params) @@ -132,7 +134,9 @@ def lower(self, mod, target=None, target_host=None): """ target = self._update_target(target) target_host = self._update_target_host(target, target_host) - target, target_host = refresh_host(target, target_host, target_is_dict_key=False) + target, target_host = Target.check_and_update_host_consistency( + target, target_host, target_is_dict_key=False + ) tophub_context = self._tophub_context(target) with tophub_context: @@ -171,7 +175,9 @@ def optimize(self, mod, target=None, target_host=None, params=None): """ target = self._update_target(target) target_host = self._update_target_host(target, target_host) - target, target_host = refresh_host(target, target_host, target_is_dict_key=False) + target, target_host = Target.check_and_update_host_consistency( + target, target_host, target_is_dict_key=False + ) if params: self.set_params(params) diff --git a/python/tvm/relay/build_module.py b/python/tvm/relay/build_module.py index 72b09d9e3cec..5201ac3f4fa6 100644 --- a/python/tvm/relay/build_module.py +++ b/python/tvm/relay/build_module.py @@ -25,7 +25,7 @@ from tvm.ir.transform import PassContext from tvm.tir import expr as tvm_expr -from tvm.target.target import refresh_host +from tvm.target import Target from .. import nd as _nd, autotvm, register_func from ..target import Target from ..contrib import graph_runtime as _graph_rt @@ -115,7 +115,9 @@ def build(self, mod, target=None, target_host=None, params=None): The runtime factory for the TVM graph runtime. """ target = _update_target(target) - target, target_host = refresh_host(target, target_host, target_is_dict_key=False) + target, target_host = Target.check_and_update_host_consistency( + target, target_host, target_is_dict_key=False + ) # Setup the params. if params: @@ -207,7 +209,7 @@ def _build_module_no_factory(mod, target=None, target_host=None, params=None, mo This wrapper is suitable to be used from other programming languages as the runtime::Module can be freely passed between language boundaries. 
""" - target, target_host = refresh_host(target, target_host) + target, target_host = Target.check_and_update_host_consistency(target, target_host) return build(mod, target, params=params, mod_name=mod_name).module @@ -272,7 +274,9 @@ def build(ir_mod, target=None, target_host=None, params=None, mod_name="default" elif target_host: raise ValueError("target host must be the type of str, " + "tvm.target.Target, or None") - target, target_host = refresh_host(target, target_host, target_is_dict_key=False) + target, target_host = Target.check_and_update_host_consistency( + target, target_host, target_is_dict_key=False + ) # If current dispatch context is fallback context (the default root context), # then load pre-tuned parameters from TopHub diff --git a/python/tvm/target/target.py b/python/tvm/target/target.py index f589938470c7..80852cf60605 100644 --- a/python/tvm/target/target.py +++ b/python/tvm/target/target.py @@ -113,9 +113,8 @@ def __exit__(self, ptype, value, trace): def export(self): return _ffi_api.TargetExport(self) - def set_host(self, host=None): - if host is not None: - _ffi_api.SetHost(self, Target(host)) + def with_host(self, host=None): + return _ffi_api.WithHost(self, Target(host)) @staticmethod def current(allow_none=True): @@ -168,6 +167,37 @@ def list_kinds(): """Returns the list of available target names.""" return list(_ffi_api.ListTargetKinds()) + @staticmethod + def check_and_update_host_consistency(target, host=None, target_is_dict_key=True): + """A helper function that merges a legacy "target, target_host" pair, then returns + the merged target and its host field. The function is for legacy target and target + host pair only, and should not be used in the new target system. + + Parameters + ---------- + target : Union[str, Dict[str, Any], Target] + The target or heterogeneous target + host : Union[str, Dict[str, Any], Target, None] + The target host + target_is_dict_key : Bool + When the type of target is dict, whether Target is the key (Otherwise the value) + """ + if isinstance(target, dict) and "kind" not in target: + new_target = {} + for tgt, mod in target.items(): + if not target_is_dict_key: + tgt, mod = mod, tgt + if isinstance(tgt, (dict, str, Target)): + tgt, host = Target.check_and_update_host_consistency(tgt, host) + if not target_is_dict_key: + tgt, mod = mod, tgt + new_target[tgt] = mod + target = new_target + else: + target = Target(target, host) + host = target.host + return target, host + # TODO(@tvm-team): Deprecate the helper functions below. Encourage the usage of config dict instead. @@ -498,33 +528,3 @@ def _load_config_dict(config_dict_str): if not isinstance(key, str): return None return config - - -def refresh_host(target, host=None, target_is_dict_key=True): - """A helper function that merges a legacy "target, target_host" pair, then returns - the merged target and its host field. 
-
-    Parameters
-    ----------
-    target : Union[str, Dict[str, Any], Target]
-        The target or heterogeneous target
-    host : Union[str, Dict[str, Any], Target, None]
-        The target host
-    target_is_dict_key : Bool
-        When the type of target is dict, whether Target is the key (Otherwise the value)
-    """
-    if isinstance(target, dict) and "kind" not in target:
-        new_target = {}
-        for tgt, mod in target.items():
-            if not target_is_dict_key:
-                tgt, mod = mod, tgt
-            if isinstance(tgt, (dict, str, Target)):
-                tgt, host = refresh_host(tgt, host)
-            if not target_is_dict_key:
-                tgt, mod = mod, tgt
-            new_target[tgt] = mod
-        target = new_target
-    else:
-        target = Target(target, host)
-        host = target.host
-    return target, host
diff --git a/src/auto_scheduler/feature.cc b/src/auto_scheduler/feature.cc
index 0baa0bcf2e9a..cdfb71fe8fa0 100755
--- a/src/auto_scheduler/feature.cc
+++ b/src/auto_scheduler/feature.cc
@@ -1399,7 +1399,7 @@ void GetPerStoreFeaturesFromFile(const std::string& filename, int max_lines, int
       Array<te::Tensor> tensors = (*workload_key_to_tensors)(workload_key);
       Target target = cur_inp->task->target;
       Target target_host = cur_inp->task->target_host;
-      RefreshHost(&target, &target_host);
+      CheckAndUpdateHostConsistency(&target, &target_host);
       task = SearchTask(ComputeDAG(tensors), workload_key, target, target_host,
                         cur_inp->task->hardware_params, cur_inp->task->layout_rewrite_option,
                         cur_inp->task->task_input_names);
@@ -1469,8 +1469,9 @@ void GetPerStoreFeaturesFromMeasurePairs(const Array<MeasureInput>& inputs,
       // The measure input is incomplete, rebuild task for incomplete measure pairs read from file
       try {
         Array<te::Tensor> tensors = (*workload_key_to_tensors)(workload_key);
-        Target target = inputs[i]->task->target, target_host = inputs[i]->task->target_host;
-        RefreshHost(&target, &target_host);
+        Target target = inputs[i]->task->target;
+        Target target_host = inputs[i]->task->target_host;
+        CheckAndUpdateHostConsistency(&target, &target_host);
         task = SearchTask(ComputeDAG(tensors), workload_key, target, target_host,
                           inputs[i]->task->hardware_params,
                           inputs[i]->task->layout_rewrite_option,
diff --git a/src/auto_scheduler/measure_record.cc b/src/auto_scheduler/measure_record.cc
index 0a81baf62444..af37443d91e2 100644
--- a/src/auto_scheduler/measure_record.cc
+++ b/src/auto_scheduler/measure_record.cc
@@ -165,7 +165,7 @@ struct Handler<::tvm::auto_scheduler::SearchTaskNode> {
     writer->WriteArrayItem(*data.hardware_params.get());
     ::tvm::Target target = data.target;
     ::tvm::Target target_host = data.target_host;
-    ::tvm::RefreshHost(&target, &target_host);
+    ::tvm::CheckAndUpdateHostConsistency(&target, &target_host);
     if (target_host.defined()) {
       writer->WriteArrayItem(target_host->str());
     } else {
@@ -203,7 +203,7 @@ struct Handler<::tvm::auto_scheduler::SearchTaskNode> {
       reader->Read(&str_value);
       if (!str_value.empty()) {
         data->target_host = ::tvm::Target(str_value);
-        ::tvm::RefreshHost(&data->target, &data->target_host);
+        ::tvm::CheckAndUpdateHostConsistency(&data->target, &data->target_host);
       }
       s = reader->NextArrayItem();
       ICHECK(s);
diff --git a/src/auto_scheduler/search_task.cc b/src/auto_scheduler/search_task.cc
index 18bcf46bb6ac..46db045b663a 100755
--- a/src/auto_scheduler/search_task.cc
+++ b/src/auto_scheduler/search_task.cc
@@ -116,7 +116,7 @@ HardwareParams HardwareParamsNode::GetDefaultHardwareParams(const Target& target
 SearchTask::SearchTask(ComputeDAG compute_dag, String workload_key, Target target,
                        Target target_host, Optional<HardwareParams> hardware_params,
                        LayoutRewriteOption layout_rewrite_option, Array<String> task_input_names) {
-  RefreshHost(&target, &target_host);
+  CheckAndUpdateHostConsistency(&target, &target_host);
   auto node = make_object<SearchTaskNode>();
   node->compute_dag = std::move(compute_dag);
   node->workload_key = std::move(workload_key);
@@ -146,7 +146,7 @@ TVM_REGISTER_GLOBAL("auto_scheduler.SearchTask")
     .set_body_typed([](ComputeDAG compute_dag, String workload_key, Target target,
                        Target target_host, Optional<HardwareParams> hardware_params,
                        int layout_rewrite_option, Array<String> task_input_names) {
-      RefreshHost(&target, &target_host);
+      CheckAndUpdateHostConsistency(&target, &target_host);
       return SearchTask(compute_dag, workload_key, target, target_host, hardware_params,
                         LayoutRewriteOption(layout_rewrite_option), task_input_names);
     });
diff --git a/src/driver/driver_api.cc b/src/driver/driver_api.cc
index 0a3a41d0a3a5..f30cecbf7f05 100644
--- a/src/driver/driver_api.cc
+++ b/src/driver/driver_api.cc
@@ -189,7 +189,7 @@ std::pair<IRModule, IRModule> SplitDevHostFuncs(IRModule mod_mixed, const Target
                                                 const Target& target_host_arg,
                                                 const transform::PassContext& pass_ctx) {
   Target target = target_arg, target_host = target_host_arg;
-  RefreshHost(&target, &target_host);
+  CheckAndUpdateHostConsistency(&target, &target_host);
   Array<tvm::transform::Pass> mixed_pass_list = {BindTarget(target),
                                                  tir::transform::VerifyMemory()};
@@ -263,7 +263,7 @@ runtime::Module build(const Map<Target, IRModule>& inputs_arg, const Target& tar
   Target target_host = target_host_arg;
 
   // Fetch previous defined target host in targets
-  RefreshHost(&inputs, &target_host);
+  CheckAndUpdateHostConsistency(&inputs, &target_host);
 
   if (!target_host.defined()) {
     for (const auto& it : inputs) {
@@ -279,7 +279,7 @@ runtime::Module build(const Map<Target, IRModule>& inputs_arg, const Target& tar
   }
 
   // Update target host for all targets
-  RefreshHost(&inputs, &target_host);
+  CheckAndUpdateHostConsistency(&inputs, &target_host);
 
   IRModule mhost_all = IRModule(Map<GlobalVar, BaseFunc>());
@@ -319,7 +319,7 @@ runtime::Module build(const Map<String, IRModule>& inputs_arg, const Target& tar
   Target target_host = target_host_arg;
   for (const auto& it : inputs_arg) {
     Target target = Target(it.first);
-    RefreshHost(&target, &target_host);
+    CheckAndUpdateHostConsistency(&target, &target_host);
     Optional<String> device = target->GetAttr<String>("device");
     if (device.defined() && device.value() == "vta") {
       target = Target("ext_dev");
@@ -333,7 +333,7 @@ runtime::Module build(const Map<String, IRModule>& inputs_arg, const Target& tar
 runtime::Module build(const IRModule& funcs, const Target& target_arg,
                       const Target& target_host_arg) {
   auto target = target_arg, target_host = target_host_arg;
-  RefreshHost(&target, &target_host);
+  CheckAndUpdateHostConsistency(&target, &target_host);
   Map<Target, IRModule> inputs = {{target, funcs}};
   return build(inputs, target_host);
 }
diff --git a/src/relay/backend/build_module.cc b/src/relay/backend/build_module.cc
index 68093eaca7d9..54d8b0056859 100644
--- a/src/relay/backend/build_module.cc
+++ b/src/relay/backend/build_module.cc
@@ -238,7 +238,7 @@ class RelayBuildModule : public runtime::ModuleNode {
     // Create protected variable targets_ from ground up
     targets_ = targets;
     target_host_ = target_host;
-    RefreshHost(&targets_, &target_host_);
+    CheckAndUpdateHostConsistency(&targets_, &target_host_);
     BuildRelay(mod, params_);
     // Clear compile engine so that tuning schedules can be changed between runs. See issue #6096.
     CompileEngine::Global()->Clear();
@@ -469,7 +469,7 @@ class RelayBuildModule : public runtime::ModuleNode {
     if (!target_host.defined()) target_host = (pf != nullptr) ? Target("llvm") : Target("stackvm");
 
     // Update all the targets in the targets_ TargetsMap
-    RefreshHost(&targets_, &target_host);
+    CheckAndUpdateHostConsistency(&targets_, &target_host);
 
     // Relay IRModule -> IRModule optimizations.
     relay_module = Optimize(relay_module, targets_, params);
diff --git a/src/relay/backend/vm/compiler.cc b/src/relay/backend/vm/compiler.cc
index e52efdfe2ee3..af58a8a2747f 100644
--- a/src/relay/backend/vm/compiler.cc
+++ b/src/relay/backend/vm/compiler.cc
@@ -255,7 +255,7 @@ class VMFunctionCompiler : ExprFunctor<void(const Expr& expr)> {
         context_(context),
         target_host_(target_host),
         expr_device_map_(std::move(expr_device_map)) {
-    RefreshHost(&targets, &target_host);
+    CheckAndUpdateHostConsistency(&targets, &target_host);
     for (const auto& it : targets) {
       targets_[it.first->value] = it.second;
     }
@@ -902,7 +902,7 @@ void VMCompiler::Lower(IRModule mod, const TargetsMap& targets, const tvm::Targe
   exec_ = make_object<Executable>();
   targets_ = targets;
   target_host_ = target_host;
-  RefreshHost(&targets_, &target_host_);
+  CheckAndUpdateHostConsistency(&targets_, &target_host_);
 
   // Run the optimizations necessary to target the VM.
   context_.module = OptimizeModule(mod, targets_, target_host_);
@@ -1008,7 +1008,7 @@ IRModule VMCompiler::OptimizeModule(IRModule mod, const TargetsMap& targets_arg,
                                     const Target& target_host_arg) {
   TargetsMap targets = targets_arg;
   Target target_host = target_host_arg;
-  RefreshHost(&targets, &target_host);
+  CheckAndUpdateHostConsistency(&targets, &target_host);
   if (params_.size()) {
     BaseFunc base_func = mod->Lookup("main");
     ICHECK(base_func->IsInstance<FunctionNode>())
diff --git a/src/relay/transforms/memory_alloc.cc b/src/relay/transforms/memory_alloc.cc
index 0ac2fefa1a9c..dd1b1ecdc066 100644
--- a/src/relay/transforms/memory_alloc.cc
+++ b/src/relay/transforms/memory_alloc.cc
@@ -415,7 +415,7 @@ class DialectRewriter : public ExprMutator {
 namespace transform {
 
 Pass ManifestAlloc(Target target_host, Map<tvm::Integer, tvm::Target> targets) {
-  RefreshHost(&targets, &target_host);
+  CheckAndUpdateHostConsistency(&targets, &target_host);
   return tvm::transform::CreateModulePass(
       [=](IRModule mod, const PassContext& pass_ctx) {
         DLOG(INFO) << "tvm::relay::transform::ManifestAlloc";
@@ -459,7 +459,7 @@ Pass ManifestAlloc(Target target_host, Map<tvm::Integer, tvm::Target> targets) {
 
 TVM_REGISTER_GLOBAL("relay.transform.ManifestAlloc")
     .set_body_typed([](Target target_host, Map<tvm::Integer, tvm::Target> targets) {
-      RefreshHost(&targets, &target_host);
+      CheckAndUpdateHostConsistency(&targets, &target_host);
       return ManifestAlloc(target_host, targets);
     });
diff --git a/src/target/target.cc b/src/target/target.cc
index 8c9045b65f9e..9d0b01bf3202 100644
--- a/src/target/target.cc
+++ b/src/target/target.cc
@@ -51,33 +51,38 @@ class TargetInternal {
   static ObjectPtr<Object> FromRawString(const String& target_str);
   static ObjectPtr<Object> FromConfig(std::unordered_map<String, ObjectRef> config);
   static void ConstructorDispatcher(TVMArgs args, TVMRetValue* rv);
-  static void SetHost(Target target, Target host) {
-    static_cast<TargetNode*>(target.data_.get())->SetHost(host);
+  static Target WithHost(const Target& target, const Target& target_host) {
+    ObjectPtr<TargetNode> n = make_object<TargetNode>(*target.get());
+    n->host = target_host;
+    return (Target)n;
   }
 };
 
 /**********  Helper functions  **********/
+Target Target::WithHost(const Target& target, const Target& host) {
+  return TargetInternal::WithHost(target, host);
+}
 
-void RefreshHost(Target* target, Target* target_host) {
-  *target = Target(*target, *target_host);
-  *target_host = (*target)->GetHost().value_or(Target());
+void CheckAndUpdateHostConsistency(Target* target, Target* host) {
+  *target = Target(*target, *host);
+  *host = (*target)->GetHost().value_or(Target());
 }
 
-void RefreshHost(Map<Integer, Target>* targets, Target* target_host) {
+void CheckAndUpdateHostConsistency(Map<Integer, Target>* targets, Target* host) {
   Map<Integer, Target> new_targets;
   for (auto& it : *targets) {
     auto target = it.second;
-    RefreshHost(&target, target_host);
+    CheckAndUpdateHostConsistency(&target, host);
     new_targets.Set(it.first, target);
   }
   *targets = new_targets;
 }
 
-void RefreshHost(Map<Target, IRModule>* targets, Target* target_host) {
+void CheckAndUpdateHostConsistency(Map<Target, IRModule>* targets, Target* host) {
   Map<Target, IRModule> new_targets;
   for (auto& it : *targets) {
     auto target = it.first;
-    RefreshHost(&target, target_host);
+    CheckAndUpdateHostConsistency(&target, host);
     new_targets.Set(target, it.second);
   }
   *targets = new_targets;
@@ -449,8 +454,6 @@ Optional<Target> TargetNode::GetHost() const {
   return GetRef<Optional<Target>>(this->host.as<TargetNode>());
 }
 
-void TargetNode::SetHost(Target host) { this->host = host; }
-
 /*! \brief Entry to hold the Target context stack. */
 struct TVMTargetThreadLocalEntry {
   /*! \brief The current target context */
@@ -685,7 +688,7 @@ TVM_REGISTER_GLOBAL("target.TargetEnterScope").set_body_typed(TargetInternal::En
 TVM_REGISTER_GLOBAL("target.TargetExitScope").set_body_typed(TargetInternal::ExitScope);
 TVM_REGISTER_GLOBAL("target.TargetCurrent").set_body_typed(Target::Current);
 TVM_REGISTER_GLOBAL("target.TargetExport").set_body_typed(TargetInternal::Export);
-TVM_REGISTER_GLOBAL("target.SetHost").set_body_typed(TargetInternal::SetHost);
+TVM_REGISTER_GLOBAL("target.WithHost").set_body_typed(TargetInternal::WithHost);
 
 TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable)
     .set_dispatch<TargetNode>([](const ObjectRef& obj, ReprPrinter* p) {
diff --git a/tests/python/unittest/test_auto_scheduler_measure.py b/tests/python/unittest/test_auto_scheduler_measure.py
index b5ee7495ed72..1f141a2cfd00 100644
--- a/tests/python/unittest/test_auto_scheduler_measure.py
+++ b/tests/python/unittest/test_auto_scheduler_measure.py
@@ -26,7 +26,6 @@ import tempfile
 import tvm.testing
 import pickle
-
 from test_auto_scheduler_common import matmul_auto_scheduler_test
 from tvm.auto_scheduler import workload_registry
diff --git a/tests/python/unittest/test_target_target.py b/tests/python/unittest/test_target_target.py
index 8a5aa5bd67a8..2f885d39335b 100644
--- a/tests/python/unittest/test_target_target.py
+++ b/tests/python/unittest/test_target_target.py
@@ -252,14 +252,14 @@ def test_target_host_merge_3():
         tvm.target.Target(tvm.target.Target("cuda --host llvm"), 12.34)
 
 
-def test_target_set_host():
+def test_target_with_host():
     tgt = tvm.target.Target("cuda")
     llvm = tvm.target.Target("llvm")
-    tgt.set_host(llvm)
+    tgt = tgt.with_host(llvm)
     assert tgt.kind.name == "cuda"
     assert tgt.host.kind.name == "llvm"
     cuda_host = tvm.target.Target("nvidia/jetson-nano")
-    tgt.set_host(cuda_host)
+    tgt = tgt.with_host(cuda_host)
     assert tgt.host.kind.name == "cuda"
     assert tgt.host.attrs["arch"] == "sm_53"
     assert tgt.host.attrs["shared_memory_per_block"] == 49152

From 244cc40cb143af7ba920603ce0f557d21e6c7f12 Mon Sep 17 00:00:00 2001
From: Xiyou Zhou
Date: Mon, 29 Mar 2021 18:53:05 -0700
Subject: [PATCH 62/69] Add some 3rd party dependencies

---
 3rdparty/dlpack       | 2 +-
 3rdparty/dmlc-core    | 2 +-
 3rdparty/libbacktrace | 1 +
 3 files changed, 3 insertions(+), 2 deletions(-)
 create mode 160000 3rdparty/libbacktrace

diff --git a/3rdparty/dlpack b/3rdparty/dlpack
index 3ec04430e89a..a07f962d446b 160000
--- a/3rdparty/dlpack
+++ b/3rdparty/dlpack
@@ -1 +1 @@
-Subproject commit 3ec04430e89a6834e5a1b99471f415fa939bf642
+Subproject commit
a07f962d446b577adf4baef2b347a0f3a2a20617 diff --git a/3rdparty/dmlc-core b/3rdparty/dmlc-core index 6c401e242c59..21cc7de0dc9f 160000 --- a/3rdparty/dmlc-core +++ b/3rdparty/dmlc-core @@ -1 +1 @@ -Subproject commit 6c401e242c59a1f4c913918246591bb13fd714e7 +Subproject commit 21cc7de0dc9fd6acb796e1be6181fa8e6b6c8f41 diff --git a/3rdparty/libbacktrace b/3rdparty/libbacktrace new file mode 160000 index 000000000000..08f7c7e69f8e --- /dev/null +++ b/3rdparty/libbacktrace @@ -0,0 +1 @@ +Subproject commit 08f7c7e69f8ea61a0c4151359bc8023be8e9217b From fdfb93a8e7eb7be13bb3ac06aecadad46257d3c6 Mon Sep 17 00:00:00 2001 From: Xiyou Zhou Date: Mon, 29 Mar 2021 19:13:44 -0700 Subject: [PATCH 63/69] Merge main branch --- .gitmodules | 3 + CMakeLists.txt | 114 +++- CONTRIBUTORS.md | 1 + Jenkinsfile | 6 +- Makefile | 2 +- README.md | 2 +- .../Camera2BasicFragment.java | 44 +- .../app/src/main/jni/Application.mk | 4 +- .../app/src/main/jni/tvm_runtime.h | 41 +- apps/android_camera/models/prepare_model.py | 2 +- .../apache/tvm/android/demo/MainActivity.java | 38 +- .../app/src/main/jni/Application.mk | 2 +- .../app/src/main/jni/tvm_runtime.h | 6 +- .../app/src/main/jni/Application.mk | 2 +- .../app/src/main/jni/tvm_runtime.h | 42 +- apps/android_rpc/tests/android_rpc_test.py | 24 +- apps/benchmark/arm_cpu_imagenet_bench.py | 8 +- apps/benchmark/gpu_imagenet_bench.py | 8 +- apps/benchmark/mobile_gpu_imagenet_bench.py | 8 +- apps/bundle_deploy/Makefile | 18 +- apps/bundle_deploy/README.md | 8 +- apps/bundle_deploy/bundle.c | 46 +- apps/bundle_deploy/bundle.cc | 2 +- apps/bundle_deploy/bundle_static.c | 48 +- apps/bundle_deploy/demo.cc | 4 +- apps/bundle_deploy/demo_static.c | 8 +- apps/bundle_deploy/runtime.cc | 2 +- apps/bundle_deploy/test.cc | 4 +- apps/bundle_deploy/test_static.c | 8 +- apps/cpp_rpc/main.cc | 10 +- apps/cpp_rpc/rpc_env.cc | 35 +- apps/cpp_rpc/rpc_env.h | 2 +- apps/cpp_rpc/rpc_server.cc | 21 +- apps/cpp_rpc/rpc_server.h | 3 +- apps/dso_plugin_module/Makefile | 3 +- apps/extension/Makefile | 3 +- apps/extension/tests/test_ext.py | 12 +- apps/howto_deploy/cpp_deploy.cc | 16 +- apps/howto_deploy/tvm_runtime_pack.cc | 6 +- apps/ios_rpc/tests/ios_rpc_mobilenet.py | 12 +- apps/ios_rpc/tests/ios_rpc_test.py | 16 +- apps/ios_rpc/tvmrpc.xcodeproj/project.pbxproj | 6 + apps/ios_rpc/tvmrpc/TVMRuntime.h | 2 +- apps/ios_rpc/tvmrpc/TVMRuntime.mm | 20 +- apps/ios_rpc/tvmrpc/ViewController.mm | 4 +- apps/microtvm/README.md | 17 +- .../{reference-vm/zephyr => }/pyproject.toml | 10 +- apps/microtvm/reference-vm/base-box-tool.py | 28 +- .../reference-vm/zephyr/base-box/setup.sh | 15 +- .../zephyr/base-box/test-config.json | 14 +- .../reference-vm/zephyr/rebuild-tvm.sh | 2 +- apps/microtvm/zephyr/README.md | 19 + .../zephyr/demo_runtime}/CMakeLists.txt | 1 - apps/microtvm/zephyr/demo_runtime/README.md | 21 + .../boards/nrf5340dk_nrf5340_cpuapp.conf | 31 + .../demo_runtime/boards/nucleo_f746zg.conf | 30 + .../zephyr/demo_runtime/boards/qemu_x86.conf | 23 + .../zephyr/demo_runtime}/crt/crt_config.h | 2 +- .../microtvm/zephyr/demo_runtime}/prj.conf | 7 +- .../demo_runtime}/qemu-hack/qemu-system-i386 | 0 .../microtvm/zephyr/demo_runtime}/src/main.c | 226 +++--- .../topi_recipe/conv/depthwise_conv2d_test.py | 44 +- apps/topi_recipe/conv/test_conv2d_hwcn_map.py | 10 +- apps/topi_recipe/conv/test_conv_int8_arm.py | 12 +- apps/topi_recipe/conv/test_conv_int8_intel.py | 12 +- apps/topi_recipe/gemm/android_gemm_square.py | 14 +- apps/topi_recipe/gemm/cuda_gemm_square.py | 12 +- apps/topi_recipe/gemm/gemm_int8.py | 10 +- 
apps/topi_recipe/reduce/test_reduce_map.py | 4 +- apps/topi_recipe/rnn/lstm.py | 14 +- apps/topi_recipe/rnn/matexp.py | 10 +- apps/wasm-standalone/wasm-graph/src/types.rs | 4 +- cmake/config.cmake | 30 +- cmake/libs/Libbacktrace.cmake | 45 ++ cmake/modules/CUDA.cmake | 11 + cmake/modules/LibInfo.cmake | 6 +- cmake/modules/Logging.cmake | 46 ++ cmake/modules/StandaloneCrt.cmake | 6 +- cmake/modules/VTA.cmake | 3 + cmake/modules/contrib/ArmComputeLib.cmake | 22 +- cmake/modules/contrib/BNNS.cmake | 30 + cmake/modules/contrib/TensorRT.cmake | 2 +- conda/recipe/bld.bat | 2 +- conda/recipe/build.sh | 3 +- docker/Dockerfile.ci_gpu | 4 +- docker/Dockerfile.ci_qemu | 4 + docker/Dockerfile.demo_android | 2 +- docker/build.sh | 28 +- docker/install/install_tvm_cpu.sh | 2 +- .../ubuntu_install_ethosn_driver_stack.sh | 2 +- docker/install/ubuntu_install_zephyr.sh | 69 +- .../{graph_runtime.rst => graph_executor.rst} | 4 +- docs/api/python/index.rst | 2 +- docs/api/python/relay/backend.rst | 2 +- docs/conf.py | 8 +- docs/contribute/code_guide.rst | 2 +- docs/deploy/arm_compute_lib.rst | 14 +- docs/deploy/bnns.rst | 183 +++++ docs/deploy/hls.rst | 8 +- docs/deploy/index.rst | 1 + docs/deploy/tensorrt.rst | 4 +- docs/deploy/vitis_ai.rst | 16 +- docs/dev/codebase_walkthrough.rst | 8 +- docs/dev/debugger.rst | 14 +- docs/dev/index.rst | 4 +- docs/dev/microtvm_design.rst | 22 +- docs/dev/relay_bring_your_own_codegen.rst | 8 +- docs/dev/virtual_machine.rst | 10 +- docs/index.rst | 2 +- docs/install/from_source.rst | 2 +- docs/microtvm/index.rst | 2 +- golang/Makefile | 2 +- golang/sample/complex.go | 10 +- golang/sample/simple.go | 2 +- golang/src/array_test.go | 24 +- golang/src/{context.go => device.go} | 68 +- golang/src/function_test.go | 2 +- golang/src/ndarray.go | 18 +- golang/src/tvm_runtime_pack.cc | 4 +- golang/src/value.go | 4 +- include/tvm/arith/analyzer.h | 10 + include/tvm/arith/iter_affine_map.h | 5 +- include/tvm/ir/attrs.h | 4 +- include/tvm/ir/diagnostic.h | 9 + include/tvm/ir/error.h | 26 +- include/tvm/ir/type_relation.h | 2 +- include/tvm/relay/analysis.h | 10 +- include/tvm/relay/attrs/transform.h | 14 +- include/tvm/relay/interpreter.h | 4 +- include/tvm/relay/qnn/attrs.h | 12 + include/tvm/runtime/c_runtime_api.h | 49 +- include/tvm/runtime/container.h | 1 + include/tvm/runtime/crt/error_codes.h | 10 +- .../crt/{graph_runtime.h => graph_executor.h} | 56 +- ...ntime_module.h => graph_executor_module.h} | 14 +- include/tvm/runtime/crt/memory.h | 8 +- include/tvm/runtime/crt/platform.h | 8 +- include/tvm/runtime/data_type.h | 2 +- include/tvm/runtime/device_api.h | 131 ++-- include/tvm/runtime/logging.h | 539 +++++++++++++++ include/tvm/runtime/ndarray.h | 66 +- include/tvm/runtime/object.h | 22 +- include/tvm/runtime/packed_func.h | 46 +- include/tvm/runtime/profiling.h | 16 +- include/tvm/runtime/serializer.h | 16 +- include/tvm/runtime/vm/bytecode.h | 2 +- include/tvm/runtime/vm/memory_manager.h | 35 +- include/tvm/runtime/vm/vm.h | 16 +- include/tvm/support/logging.h | 158 ----- include/tvm/support/with.h | 2 +- include/tvm/tir/analysis.h | 37 + include/tvm/tir/stmt.h | 10 +- include/tvm/tir/stmt_functor.h | 9 + include/tvm/topi/detail/constant_utils.h | 11 +- jvm/README.md | 8 +- .../main/java/org/apache/tvm/ArgTypeCode.java | 2 +- .../tvm/{TVMContext.java => Device.java} | 68 +- .../src/main/java/org/apache/tvm/LibInfo.java | 2 +- .../src/main/java/org/apache/tvm/NDArray.java | 32 +- .../{GraphRuntime.java => GraphExecutor.java} | 18 +- .../org/apache/tvm/contrib/GraphModule.java 
| 20 +- .../java/org/apache/tvm/rpc/RPCSession.java | 86 +-- ...emoteContext.java => TVMRemoteDevice.java} | 6 +- .../test/java/org/apache/tvm/ModuleTest.java | 14 +- ...untimeTest.java => GraphExecutorTest.java} | 22 +- ...raph_runtime.py => test_graph_executor.py} | 2 +- jvm/native/src/main/native/jni_helper_func.h | 14 +- .../native/org_apache_tvm_native_c_api.cc | 2 +- licenses/LICENSE.libbacktrace.txt | 29 + python/setup.py | 2 +- python/tvm/__init__.py | 2 +- python/tvm/_ffi/_ctypes/packed_func.py | 10 +- python/tvm/_ffi/_ctypes/types.py | 16 +- python/tvm/_ffi/_cython/base.pxi | 10 +- python/tvm/_ffi/_cython/packed_func.pxi | 12 +- python/tvm/_ffi/base.py | 4 +- python/tvm/_ffi/runtime_ctypes.py | 16 +- python/tvm/arith/__init__.py | 2 +- python/tvm/arith/iter_affine_map.py | 30 +- python/tvm/auto_scheduler/dispatcher.py | 49 +- python/tvm/auto_scheduler/measure.py | 34 +- .../tvm/auto_scheduler/relay_integration.py | 18 +- .../graph_tuner/utils/traverse_graph.py | 3 +- python/tvm/autotvm/measure/measure_methods.py | 22 +- python/tvm/autotvm/task/relay_integration.py | 8 +- python/tvm/contrib/cc.py | 16 +- python/tvm/contrib/coreml_runtime.py | 12 +- .../tvm/contrib/cuda_graph/__init__.py | 6 - .../contrib/cuda_graph/cuda_graph_executor.py | 134 ++++ python/tvm/contrib/debugger/debug_executor.py | 239 +++++++ python/tvm/contrib/debugger/debug_runtime.py | 228 +------ python/tvm/contrib/graph_executor.py | 306 +++++++++ python/tvm/contrib/graph_runtime.py | 295 +------- python/tvm/contrib/nvcc.py | 12 + python/tvm/contrib/peak.py | 48 +- python/tvm/contrib/sparse.py | 16 +- python/tvm/contrib/target/coreml.py | 4 +- python/tvm/contrib/tflite_runtime.py | 16 +- python/tvm/driver/build_module.py | 4 +- python/tvm/driver/tvmc/__init__.py | 4 + python/tvm/driver/tvmc/common.py | 19 +- python/tvm/driver/tvmc/compiler.py | 2 +- python/tvm/driver/tvmc/composite_target.py | 5 + python/tvm/driver/tvmc/frontends.py | 27 +- python/tvm/driver/tvmc/runner.py | 20 +- python/tvm/micro/__init__.py | 4 +- python/tvm/micro/build.py | 2 +- python/tvm/micro/contrib/zephyr.py | 5 +- python/tvm/micro/model_library_format.py | 6 +- python/tvm/micro/session.py | 54 +- python/tvm/relay/analysis/analysis.py | 10 +- ...e_codegen.py => graph_executor_codegen.py} | 12 +- ...e_factory.py => graph_executor_factory.py} | 14 +- python/tvm/relay/backend/interpreter.py | 10 +- python/tvm/relay/backend/vm.py | 14 +- python/tvm/relay/build_module.py | 58 +- python/tvm/relay/frontend/common.py | 8 +- python/tvm/relay/frontend/onnx.py | 157 +++-- python/tvm/relay/frontend/pytorch.py | 129 ++-- python/tvm/relay/frontend/qnn_torch.py | 1 + python/tvm/relay/frontend/tensorflow.py | 89 ++- python/tvm/relay/frontend/tflite.py | 19 +- python/tvm/relay/op/_transform.py | 19 +- python/tvm/relay/op/annotation/annotation.py | 10 +- python/tvm/relay/op/contrib/__init__.py | 1 + .../tvm/relay/op/contrib/arm_compute_lib.py | 41 +- python/tvm/relay/op/contrib/bnns.py | 327 +++++++++ python/tvm/relay/op/contrib/tensorrt.py | 2 +- python/tvm/relay/op/memory/memory.py | 10 +- python/tvm/relay/op/nn/_nn.py | 5 + python/tvm/relay/op/nn/nn.py | 47 ++ python/tvm/relay/op/strategy/arm_cpu.py | 7 +- python/tvm/relay/op/strategy/cuda.py | 19 +- python/tvm/relay/op/strategy/generic.py | 52 +- python/tvm/relay/op/strategy/x86.py | 8 +- python/tvm/relay/op/tensor.py | 20 +- python/tvm/relay/op/transform.py | 72 +- python/tvm/relay/qnn/op/__init__.py | 2 +- python/tvm/relay/qnn/op/_qnn.py | 52 ++ python/tvm/relay/qnn/op/qnn.py | 72 +- 
python/tvm/relay/quantize/_calibrate.py | 8 +- python/tvm/relay/testing/__init__.py | 4 +- python/tvm/relay/testing/darknet.py | 2 +- python/tvm/relay/testing/init.py | 2 +- python/tvm/relay/transform/memory_plan.py | 31 +- python/tvm/relay/transform/transform.py | 26 +- python/tvm/rpc/client.py | 32 +- python/tvm/runtime/__init__.py | 4 +- python/tvm/runtime/module.py | 15 +- python/tvm/runtime/ndarray.py | 127 ++-- python/tvm/runtime/profiler_vm.py | 6 +- python/tvm/runtime/vm.py | 52 +- python/tvm/script/context_maintainer.py | 210 +++++- python/tvm/script/intrin.py | 20 +- python/tvm/script/node.py | 150 ++++ python/tvm/script/parser.py | 179 +++-- python/tvm/script/registry.py | 20 +- python/tvm/script/scope_handler.py | 473 +++++++++++-- python/tvm/script/special_stmt.py | 380 ++++++++++- python/tvm/script/utils.py | 95 ++- python/tvm/target/target.py | 20 +- python/tvm/testing.py | 33 +- python/tvm/tir/analysis/analysis.py | 23 + python/tvm/tir/buffer.py | 8 +- python/tvm/topi/__init__.py | 2 +- python/tvm/topi/arm_cpu/__init__.py | 1 + python/tvm/topi/arm_cpu/group_conv2d.py | 370 ++++++++++ python/tvm/topi/cuda/batch_matmul.py | 7 +- python/tvm/topi/cuda/dense.py | 11 +- python/tvm/topi/cuda/nms.py | 12 +- python/tvm/topi/cuda/scan.py | 196 +++++- python/tvm/topi/cuda/sort.py | 604 +++++++++++----- python/tvm/topi/cuda/unique.py | 15 +- python/tvm/topi/cumsum.py | 121 ---- python/tvm/topi/nn/__init__.py | 1 + python/tvm/topi/nn/qnn.py | 190 ++++++ python/tvm/topi/nn/sparse.py | 69 ++ python/tvm/topi/random/kernel.py | 8 +- python/tvm/topi/scan.py | 236 +++++++ python/tvm/topi/sort.py | 6 +- python/tvm/topi/testing/common.py | 8 +- python/tvm/topi/unique.py | 2 +- python/tvm/topi/vision/nms.py | 8 +- python/tvm/topi/x86/__init__.py | 1 + python/tvm/topi/x86/group_conv2d.py | 371 ++++++++++ rust/tvm-graph-rt/Cargo.toml | 2 +- rust/tvm-graph-rt/src/array.rs | 12 +- rust/tvm-graph-rt/src/graph.rs | 16 +- rust/tvm-rt/src/{context.rs => device.rs} | 14 +- rust/tvm-rt/src/lib.rs | 12 +- rust/tvm-rt/src/ndarray.rs | 68 +- rust/tvm-sys/src/array.rs | 4 +- rust/tvm-sys/src/{context.rs => device.rs} | 112 +-- rust/tvm-sys/src/lib.rs | 4 +- rust/tvm-sys/src/packed_func.rs | 8 +- rust/tvm-sys/src/value.rs | 12 +- rust/tvm/README.md | 2 +- rust/tvm/examples/resnet/src/build_resnet.py | 10 +- rust/tvm/examples/resnet/src/main.rs | 8 +- rust/tvm/src/ir/diagnostics/mod.rs | 2 +- rust/tvm/src/lib.rs | 6 +- rust/tvm/src/runtime/graph_rt.rs | 34 +- rust/tvm/tests/basics/src/main.rs | 12 +- rust/tvm/tests/callback/src/bin/array.rs | 2 +- src/arith/analyzer.cc | 7 + src/arith/iter_affine_map.cc | 511 +++++++++++--- src/arith/solve_linear_inequality.cc | 51 +- src/auto_scheduler/compute_dag.cc | 2 +- src/auto_scheduler/feature.cc | 2 +- .../search_policy/sketch_policy_rules.cc | 5 +- src/auto_scheduler/search_task.cc | 31 +- src/auto_scheduler/transform_step.cc | 2 +- src/contrib/tf_op/tvm_dso_op_kernels.cc | 10 +- src/ir/error.cc | 3 +- src/node/structural_hash.cc | 6 +- src/parser/parser.cc | 20 +- src/parser/span_check.h | 2 +- src/printer/relay_text_printer.cc | 2 +- src/printer/text_printer.h | 2 + src/printer/tir_text_printer.cc | 109 ++- src/printer/tvmscript_printer.cc | 232 ++++++- src/relay/analysis/annotated_region_set.cc | 5 +- src/relay/analysis/context_analysis.cc | 86 +-- src/relay/analysis/get_calibration_data.cc | 4 +- src/relay/analysis/kind_check.cc | 2 +- src/relay/analysis/type_solver.cc | 4 +- src/relay/analysis/util.cc | 2 +- src/relay/analysis/well_formed.cc | 2 +- 
src/relay/backend/build_module.cc | 6 +- src/relay/backend/compile_engine.cc | 4 +- .../contrib/arm_compute_lib/codegen.cc | 6 +- src/relay/backend/contrib/bnns/codegen.cc | 215 ++++++ .../backend/contrib/codegen_c/codegen_c.h | 2 + src/relay/backend/contrib/ethosn/codegen.cc | 30 +- .../contrib/ethosn/ethosn_api_version.h | 4 + src/relay/backend/contrib/tensorrt/codegen.cc | 12 +- ...e_codegen.cc => graph_executor_codegen.cc} | 26 +- src/relay/backend/graph_plan_memory.cc | 4 +- src/relay/backend/interpreter.cc | 42 +- src/relay/backend/vm/compiler.cc | 4 +- src/relay/backend/vm/compiler.h | 4 +- src/relay/backend/vm/inline_primitives.cc | 2 +- src/relay/backend/vm/lambda_lift.cc | 2 +- src/relay/backend/vm/removed_unused_funcs.cc | 2 +- src/relay/op/make_op.h | 2 + src/relay/op/memory/memory.cc | 6 +- src/relay/op/memory/memory.h | 2 +- src/relay/op/nn/convolution.h | 2 +- src/relay/op/nn/nn.cc | 1 + src/relay/op/nn/sparse.cc | 41 ++ src/relay/op/tensor/transform.cc | 46 +- src/relay/op/tensor/transform.h | 4 +- src/relay/op/type_relations.cc | 2 +- src/relay/qnn/op/concatenate.cc | 14 +- src/relay/qnn/op/dequantize.cc | 9 +- src/relay/qnn/op/quantize.cc | 13 +- src/relay/qnn/op/simulated_dequantize.cc | 80 +++ src/relay/qnn/op/simulated_quantize.cc | 82 +++ src/relay/transforms/first_order_gradient.cc | 309 +++++++++ src/relay/transforms/fold_constant.cc | 28 +- src/relay/transforms/fold_explicit_padding.cc | 2 +- src/relay/transforms/gradient.h | 54 ++ .../{gradient.cc => higher_order_gradient.cc} | 274 +------- src/relay/transforms/inline.cc | 2 +- src/relay/transforms/memory_alloc.cc | 55 +- src/relay/transforms/partial_eval.cc | 24 +- src/relay/transforms/partition_graph.cc | 2 +- src/relay/transforms/simplify_expr.cc | 96 ++- src/relay/transforms/to_a_normal_form.cc | 2 +- .../transforms/to_basic_block_normal_form.cc | 2 +- src/relay/transforms/type_infer.cc | 2 +- src/runtime/c_runtime_api.cc | 135 ++-- .../contrib/arm_compute_lib/acl_allocator.cc | 8 +- .../contrib/arm_compute_lib/acl_allocator.h | 8 +- .../contrib/arm_compute_lib/acl_runtime.cc | 8 +- src/runtime/contrib/bnns/bnns_json_runtime.cc | 573 ++++++++++++++++ src/runtime/contrib/bnns/bnns_wrp.h | 495 ++++++++++++++ src/runtime/contrib/cblas/cblas.cc | 2 +- src/runtime/contrib/cblas/mkl.cc | 2 +- src/runtime/contrib/cblas/mkldnn.cc | 2 +- src/runtime/contrib/coreml/coreml_runtime.mm | 4 +- src/runtime/contrib/cublas/cublas.cc | 2 +- src/runtime/contrib/cublas/cublas_utils.h | 2 +- src/runtime/contrib/cudnn/conv_forward.cc | 4 +- src/runtime/contrib/cudnn/cudnn_utils.cc | 4 +- src/runtime/contrib/cudnn/cudnn_utils.h | 4 +- .../contrib/edgetpu/edgetpu_runtime.cc | 8 +- src/runtime/contrib/edgetpu/edgetpu_runtime.h | 8 +- src/runtime/contrib/ethosn/ethosn_device.cc | 3 +- src/runtime/contrib/miopen/conv_forward.cc | 22 +- src/runtime/contrib/miopen/miopen_utils.cc | 4 +- src/runtime/contrib/miopen/miopen_utils.h | 4 +- src/runtime/contrib/mps/conv.mm | 20 +- src/runtime/contrib/mps/gemm.mm | 8 +- src/runtime/contrib/mps/mps_utils.h | 2 +- src/runtime/contrib/nnpack/convolution.cc | 18 +- src/runtime/contrib/nnpack/fully_connected.cc | 2 +- src/runtime/contrib/nnpack/nnpack_utils.h | 2 +- .../contrib/random/mt_random_engine.cc | 13 +- src/runtime/contrib/random/random.cc | 4 +- src/runtime/contrib/rocblas/rocblas.cc | 2 +- .../contrib/tensorrt/tensorrt_builder.cc | 4 +- .../contrib/tensorrt/tensorrt_logger.h | 2 +- .../contrib/tensorrt/tensorrt_runtime.cc | 18 +- src/runtime/contrib/tflite/tflite_runtime.cc | 10 +- 
src/runtime/contrib/tflite/tflite_runtime.h | 10 +- .../contrib/vitis_ai/vitis_ai_runtime.cc | 1 + src/runtime/cpu_device_api.cc | 27 +- src/runtime/crt/Makefile | 8 +- src/runtime/crt/common/crt_backend_api.c | 8 +- src/runtime/crt/common/crt_runtime_api.c | 56 +- src/runtime/crt/common/ndarray.c | 32 +- .../graph_executor.c} | 409 +++++------ .../load_json.c | 24 +- .../graph_executor_module.c} | 124 ++-- src/runtime/crt/host/main.cc | 18 +- .../tvm/runtime/crt/internal/common/ndarray.h | 4 +- .../graph_executor.h} | 70 +- .../load_json.h | 14 +- src/runtime/crt/memory/memory.c | 6 +- src/runtime/crt/utvm_rpc_server/rpc_server.cc | 6 +- src/runtime/cuda/cuda_device_api.cc | 99 ++- src/runtime/file_utils.cc | 2 +- src/runtime/graph/graph_runtime_factory.cc | 176 ----- .../cuda_graph/graph_runtime_cuda_graph.cc | 136 ++++ .../debug/graph_executor_debug.cc} | 58 +- .../graph_executor.cc} | 134 ++-- .../graph_executor.h} | 34 +- .../graph_executor/graph_executor_factory.cc | 210 ++++++ .../graph_executor_factory.h} | 56 +- src/runtime/hexagon/hexagon_device_api.cc | 74 +- src/runtime/hexagon/hexagon_module.cc | 36 +- src/runtime/hexagon/hexagon_module.h | 4 +- src/runtime/hexagon/sim/hexagon_device_sim.cc | 2 +- .../hexagon/target/hexagon_dsprpcapi.cc | 2 +- .../hexagon/target/hexagon_dsprpcapi.h | 2 +- src/runtime/hexagon/target/hexagon_stubapi.cc | 2 +- src/runtime/hexagon/target/hexagon_stubapi.h | 2 +- src/runtime/logging.cc | 176 +++++ src/runtime/metadata_module.cc | 2 +- src/runtime/metal/metal_common.h | 54 +- src/runtime/metal/metal_device_api.mm | 299 ++++---- src/runtime/metal/metal_module.mm | 90 +-- src/runtime/micro/micro_session.cc | 2 +- ...raph_runtime.cc => utvm_graph_executor.cc} | 32 +- ..._graph_runtime.h => utvm_graph_executor.h} | 28 +- src/runtime/micro/standalone/utvm_runtime.cc | 14 +- src/runtime/minrpc/minrpc_server.h | 44 +- src/runtime/minrpc/rpc_reference.h | 16 +- src/runtime/ndarray.cc | 61 +- src/runtime/object.cc | 4 +- src/runtime/opencl/aocl/aocl_common.h | 2 +- src/runtime/opencl/aocl/aocl_device_api.cc | 4 +- src/runtime/opencl/opencl_common.h | 44 +- src/runtime/opencl/opencl_device_api.cc | 47 +- src/runtime/opencl/opencl_module.cc | 4 +- src/runtime/opencl/sdaccel/sdaccel_common.h | 2 +- .../opencl/sdaccel/sdaccel_device_api.cc | 4 +- src/runtime/profiling.cc | 20 +- src/runtime/registry.cc | 2 +- src/runtime/rocm/rocm_device_api.cc | 86 +-- src/runtime/rpc/rpc_device_api.cc | 83 ++- src/runtime/rpc/rpc_endpoint.cc | 92 ++- src/runtime/rpc/rpc_endpoint.h | 4 +- src/runtime/rpc/rpc_local_session.cc | 20 +- src/runtime/rpc/rpc_local_session.h | 2 +- src/runtime/rpc/rpc_module.cc | 90 ++- src/runtime/rpc/rpc_session.cc | 12 +- src/runtime/rpc/rpc_session.h | 18 +- src/runtime/runtime_base.h | 6 +- src/runtime/stackvm/stackvm.cc | 8 +- src/runtime/thread_pool.cc | 2 +- src/runtime/threading_backend.cc | 2 +- src/runtime/vm/bytecode.cc | 2 +- src/runtime/vm/memory_manager.cc | 39 +- src/runtime/vm/naive_allocator.h | 10 +- src/runtime/vm/pooled_allocator.h | 12 +- src/runtime/vm/profiler/vm.cc | 9 +- src/runtime/vm/vm.cc | 82 +-- src/runtime/vulkan/vulkan.cc | 108 +-- src/runtime/vulkan/vulkan_common.h | 2 +- src/runtime/vulkan/vulkan_shader.h | 2 +- src/runtime/workspace_pool.cc | 42 +- src/runtime/workspace_pool.h | 8 +- src/support/base64.h | 2 +- src/support/ffi_testing.cc | 10 +- src/support/libinfo.cc | 18 +- src/support/parallel_for.cc | 2 +- src/support/pipe.h | 2 +- src/support/socket.h | 2 +- src/target/llvm/codegen_amdgpu.cc | 12 +- 
src/target/llvm/codegen_cpu.cc | 7 +- src/target/llvm/codegen_hexagon.cc | 6 +- src/target/llvm/codegen_params.cc | 2 +- src/target/llvm/llvm_common.cc | 2 +- src/target/metadata_module.cc | 2 +- src/target/source/codegen_c.cc | 4 +- src/target/source/source_module.cc | 2 +- src/target/spirv/ir_builder.cc | 2 + src/target/target.cc | 81 ++- src/target/target_kind.cc | 2 +- ...hedule_postproc_rewrite_for_tensor_core.cc | 4 +- .../analysis/block_access_region_detector.cc | 246 +++++++ src/tir/analysis/expr_complexity.cc | 53 ++ src/tir/ir/script/script_complete.cc | 136 ++++ src/tir/ir/stmt_functor.cc | 54 +- src/tir/transforms/lower_tvm_builtin.cc | 4 +- src/tir/transforms/make_packed_api.cc | 8 +- src/tir/transforms/storage_access.cc | 4 + tests/azure-pipelines/main.yml | 6 +- tests/cpp/build_module_test.cc | 8 +- tests/cpp/container_test.cc | 2 +- tests/cpp/contrib/bnns.cc | 307 +++++++++ tests/cpp/ir_functor_test.cc | 55 +- tests/cpp/packed_func_test.cc | 4 +- tests/cpp/parallel_for_test.cc | 2 +- tests/cpp/profiling.cc | 8 +- tests/cpp/relay_build_module_test.cc | 6 +- tests/crt/memory_test.cc | 8 +- tests/lint/check_file_type.py | 13 +- tests/micro/qemu/.gitignore | 2 - tests/micro/qemu/zephyr-runtime/.gitignore | 3 - tests/micro/test_runtime_micro_on_arm.py | 16 +- tests/micro/zephyr/README.md | 42 ++ tests/micro/{qemu => zephyr}/conftest.py | 3 + tests/micro/{qemu => zephyr}/test_zephyr.py | 91 ++- tests/micro/zephyr/testdata/digit-2.jpg | Bin 0 -> 572 bytes tests/micro/zephyr/testdata/digit-9.jpg | Bin 0 -> 535 bytes .../test_minimal_target_codegen_llvm.py | 14 +- .../test_runtime_ndarray.py | 8 +- .../test_runtime_packed_func.py | 12 +- .../test_arm_compute_lib/infrastructure.py | 4 +- tests/python/contrib/test_bnns/__init__.py | 17 + .../contrib/test_bnns/infrastructure.py | 330 +++++++++ tests/python/contrib/test_bnns/test_conv2d.py | 177 +++++ .../contrib/test_bnns/test_conv2d_patterns.py | 107 +++ tests/python/contrib/test_bnns/test_dense.py | 190 ++++++ tests/python/contrib/test_bnns/test_matmul.py | 113 +++ .../contrib/test_bnns/test_normalization.py | 201 ++++++ .../contrib/test_bnns/test_onnx_topologies.py | 140 ++++ .../python/contrib/test_bnns/test_pooling.py | 289 ++++++++ tests/python/contrib/test_cblas.py | 24 +- tests/python/contrib/test_coreml_codegen.py | 6 +- tests/python/contrib/test_coreml_runtime.py | 14 +- tests/python/contrib/test_cublas.py | 24 +- tests/python/contrib/test_cudnn.py | 28 +- tests/python/contrib/test_edgetpu_runtime.py | 6 +- .../contrib/test_ethosn/infrastructure.py | 6 +- .../contrib/test_ethosn/test_networks.py | 16 +- tests/python/contrib/test_gemm_acc16.py | 10 +- tests/python/contrib/test_gemm_acc32_vnni.py | 10 +- tests/python/contrib/test_miopen.py | 10 +- tests/python/contrib/test_mps.py | 16 +- tests/python/contrib/test_mxnet_bridge.py | 8 +- tests/python/contrib/test_nnpack.py | 28 +- tests/python/contrib/test_onnx.py | 4 +- tests/python/contrib/test_onnx_model.py | 4 +- tests/python/contrib/test_random.py | 20 +- tests/python/contrib/test_rocblas.py | 16 +- tests/python/contrib/test_sort.py | 26 +- tests/python/contrib/test_sparse.py | 24 +- tests/python/contrib/test_tensorrt.py | 42 +- tests/python/contrib/test_tflite_runtime.py | 1 - tests/python/contrib/test_thrust.py | 24 +- .../contrib/test_verilator/infrastructure.py | 4 +- .../contrib/test_vitis_ai/infrastructure.py | 9 +- .../test_vitis_ai_runtime_cpu_part.py | 2 +- tests/python/driver/tvmc/test_compiler.py | 20 +- tests/python/driver/tvmc/test_frontends.py | 40 +- 
tests/python/driver/tvmc/test_runner.py | 2 +- tests/python/driver/tvmc/test_tvmc_common.py | 15 + tests/python/frontend/caffe/test_forward.py | 6 +- tests/python/frontend/caffe2/test_forward.py | 10 +- tests/python/frontend/coreml/test_forward.py | 106 +-- tests/python/frontend/darknet/test_forward.py | 6 +- tests/python/frontend/keras/test_forward.py | 10 +- tests/python/frontend/mxnet/test_forward.py | 320 +++++---- .../frontend/mxnet/test_qnn_ops_utils.py | 6 +- tests/python/frontend/onnx/test_forward.py | 351 +++++++--- tests/python/frontend/pytorch/qnn_test.py | 2 +- tests/python/frontend/pytorch/test_forward.py | 49 +- tests/python/frontend/pytorch/test_lstm.py | 10 +- .../frontend/pytorch/test_object_detection.py | 4 +- .../frontend/tensorflow/test_bn_dynamic.py | 6 +- .../frontend/tensorflow/test_forward.py | 135 +++- tests/python/frontend/tflite/test_forward.py | 33 +- tests/python/integration/test_dot.py | 8 +- tests/python/integration/test_ewise.py | 58 +- tests/python/integration/test_ewise_fpga.py | 20 +- tests/python/integration/test_gemm.py | 12 +- tests/python/integration/test_reduce.py | 94 +-- tests/python/integration/test_scan.py | 6 +- tests/python/integration/test_tuning.py | 2 +- .../integration/test_winograd_nnpack.py | 10 +- .../test_quantization_accuracy.py | 6 +- .../python/relay/benchmarking/benchmark_vm.py | 30 +- .../relay/dyn/test_dynamic_op_level10.py | 18 +- .../relay/dyn/test_dynamic_op_level2.py | 8 +- .../relay/dyn/test_dynamic_op_level3.py | 6 +- .../relay/dyn/test_dynamic_op_level4.py | 4 +- .../relay/dyn/test_dynamic_op_level5.py | 4 +- .../relay/dyn/test_dynamic_op_level6.py | 4 +- tests/python/relay/test_adt.py | 4 +- tests/python/relay/test_any.py | 12 +- ..._auto_scheduler_layout_rewrite_networks.py | 6 +- .../relay/test_auto_scheduler_tuning.py | 6 +- .../relay/test_backend_compile_engine.py | 6 +- ...time.py => test_backend_graph_executor.py} | 12 +- .../python/relay/test_backend_interpreter.py | 8 +- tests/python/relay/test_cpp_build_module.py | 28 +- tests/python/relay/test_external_codegen.py | 75 +- tests/python/relay/test_ir_parser.py | 8 +- tests/python/relay/test_ir_text_printer.py | 5 - tests/python/relay/test_json_runtime.py | 20 +- tests/python/relay/test_op_fast_math.py | 8 +- tests/python/relay/test_op_grad_level1.py | 8 +- tests/python/relay/test_op_grad_level2.py | 12 +- tests/python/relay/test_op_grad_level3.py | 8 +- tests/python/relay/test_op_level1.py | 40 +- tests/python/relay/test_op_level10.py | 60 +- tests/python/relay/test_op_level2.py | 114 ++-- tests/python/relay/test_op_level3.py | 187 +++-- tests/python/relay/test_op_level4.py | 44 +- tests/python/relay/test_op_level5.py | 88 +-- tests/python/relay/test_op_level6.py | 21 +- tests/python/relay/test_op_qnn_add.py | 12 +- tests/python/relay/test_op_qnn_concatenate.py | 12 +- .../relay/test_op_qnn_conv2_transpose.py | 4 +- tests/python/relay/test_op_qnn_conv2d.py | 10 +- tests/python/relay/test_op_qnn_dense.py | 4 +- tests/python/relay/test_op_qnn_dequantize.py | 10 +- tests/python/relay/test_op_qnn_mul.py | 12 +- tests/python/relay/test_op_qnn_quantize.py | 10 +- tests/python/relay/test_op_qnn_requantize.py | 4 +- .../relay/test_op_qnn_simulated_dequantize.py | 177 +++++ .../relay/test_op_qnn_simulated_quantize.py | 185 +++++ tests/python/relay/test_op_qnn_subtract.py | 2 +- tests/python/relay/test_param_dict.py | 12 +- .../python/relay/test_pass_alter_op_layout.py | 6 +- .../python/relay/test_pass_annotate_target.py | 16 +- tests/python/relay/test_pass_annotation.py | 
174 ++--- tests/python/relay/test_pass_auto_quantize.py | 38 +- .../relay/test_pass_dynamic_to_static.py | 8 +- .../relay/test_pass_fold_explicit_padding.py | 4 +- tests/python/relay/test_pass_fuse_ops.py | 4 +- tests/python/relay/test_pass_legalize.py | 2 +- .../relay/test_pass_legalize_tensorcore.py | 2 +- tests/python/relay/test_pass_manager.py | 24 +- tests/python/relay/test_pass_partial_eval.py | 4 +- .../python/relay/test_pass_partition_graph.py | 22 +- tests/python/relay/test_pass_qnn_legalize.py | 2 +- tests/python/relay/test_pass_simplify_expr.py | 58 ++ .../relay/test_pass_to_a_normal_form.py | 8 +- .../test_pass_to_basic_block_normal_form.py | 8 +- .../relay/test_pass_to_graph_normal_form.py | 4 +- tests/python/relay/test_prng.py | 23 +- .../relay/test_simplify_fc_transpose.py | 6 +- .../python/relay/test_sparse_dense_convert.py | 6 +- tests/python/relay/test_tensor_array.py | 7 +- tests/python/relay/test_vm.py | 84 +-- tests/python/relay/test_vm_serialization.py | 18 +- tests/python/topi/python/test_fifo_buffer.py | 70 +- .../python/topi/python/test_topi_argwhere.py | 24 +- .../topi/python/test_topi_batch_matmul.py | 24 +- .../test_topi_batch_matmul_tensorcore.py | 8 +- .../python/test_topi_batch_to_space_nd.py | 18 +- .../topi/python/test_topi_bitserial_conv2d.py | 16 +- .../python/test_topi_bitserial_conv2d_rasp.py | 8 +- .../topi/python/test_topi_bitserial_dense.py | 8 +- tests/python/topi/python/test_topi_bnn.py | 12 +- .../python/topi/python/test_topi_broadcast.py | 114 ++-- tests/python/topi/python/test_topi_clip.py | 18 +- tests/python/topi/python/test_topi_conv1d.py | 20 +- .../python/test_topi_conv1d_transpose_ncw.py | 24 +- .../topi/python/test_topi_conv2d_NCHWc.py | 10 +- .../topi/python/test_topi_conv2d_hwcn.py | 36 +- .../test_topi_conv2d_hwnc_tensorcore.py | 26 +- .../topi/python/test_topi_conv2d_int8.py | 110 +-- .../topi/python/test_topi_conv2d_nchw.py | 40 +- .../topi/python/test_topi_conv2d_nhwc.py | 8 +- .../python/test_topi_conv2d_nhwc_pack_int8.py | 8 +- .../test_topi_conv2d_nhwc_tensorcore.py | 12 +- .../python/test_topi_conv2d_nhwc_winograd.py | 10 +- .../python/test_topi_conv2d_transpose_nchw.py | 36 +- .../topi/python/test_topi_conv2d_winograd.py | 20 +- .../topi/python/test_topi_conv3d_ncdhw.py | 26 +- .../topi/python/test_topi_conv3d_ndhwc.py | 22 +- .../test_topi_conv3d_ndhwc_tensorcore.py | 10 +- .../test_topi_conv3d_transpose_ncdhw.py | 24 +- .../topi/python/test_topi_conv3d_winograd.py | 10 +- .../topi/python/test_topi_correlation.py | 20 +- tests/python/topi/python/test_topi_cumsum.py | 79 --- .../python/test_topi_deformable_conv2d.py | 20 +- tests/python/topi/python/test_topi_dense.py | 26 +- .../topi/python/test_topi_dense_tensorcore.py | 10 +- .../topi/python/test_topi_depth_to_space.py | 10 +- .../topi/python/test_topi_depthwise_conv2d.py | 130 ++-- .../test_topi_depthwise_conv2d_back_input.py | 10 +- .../test_topi_depthwise_conv2d_back_weight.py | 10 +- tests/python/topi/python/test_topi_dilate.py | 6 +- tests/python/topi/python/test_topi_einsum.py | 6 +- .../topi/python/test_topi_group_conv2d.py | 90 +-- .../test_topi_group_conv2d_NCHWc_int8.py | 10 +- tests/python/topi/python/test_topi_image.py | 96 +-- tests/python/topi/python/test_topi_lrn.py | 6 +- tests/python/topi/python/test_topi_math.py | 86 +-- tests/python/topi/python/test_topi_matmul.py | 6 +- tests/python/topi/python/test_topi_pooling.py | 114 ++-- tests/python/topi/python/test_topi_prng.py | 44 +- tests/python/topi/python/test_topi_qnn.py | 161 +++++ 
tests/python/topi/python/test_topi_reduce.py | 16 +- tests/python/topi/python/test_topi_relu.py | 36 +- tests/python/topi/python/test_topi_reorg.py | 6 +- tests/python/topi/python/test_topi_scan.py | 144 ++++ tests/python/topi/python/test_topi_scatter.py | 6 +- tests/python/topi/python/test_topi_softmax.py | 26 +- tests/python/topi/python/test_topi_sort.py | 72 +- .../python/test_topi_space_to_batch_nd.py | 18 +- .../topi/python/test_topi_space_to_depth.py | 10 +- tests/python/topi/python/test_topi_sparse.py | 110 +-- tests/python/topi/python/test_topi_tensor.py | 62 +- .../python/topi/python/test_topi_transform.py | 646 +++++++++--------- tests/python/topi/python/test_topi_unique.py | 18 +- .../topi/python/test_topi_upsampling.py | 36 +- tests/python/topi/python/test_topi_vision.py | 190 +++--- .../unittest/test_arith_iter_affine_map.py | 125 +++- .../test_auto_scheduler_layout_rewrite.py | 24 +- .../unittest/test_auto_scheduler_measure.py | 37 +- .../test_auto_scheduler_search_policy.py | 2 +- .../test_autotvm_graph_tuner_utils.py | 10 + tests/python/unittest/test_crt.py | 34 +- tests/python/unittest/test_link_params.py | 22 +- .../test_micro_model_library_format.py | 4 +- .../python/unittest/test_runtime_container.py | 2 +- tests/python/unittest/test_runtime_graph.py | 40 +- .../unittest/test_runtime_graph_cuda_graph.py | 100 +++ .../unittest/test_runtime_graph_debug.py | 12 +- .../unittest/test_runtime_heterogeneous.py | 28 +- .../test_runtime_module_based_interface.py | 148 ++-- .../unittest/test_runtime_module_load.py | 24 +- tests/python/unittest/test_runtime_rpc.py | 38 +- .../unittest/test_runtime_vm_profiler.py | 4 +- .../unittest/test_target_codegen_blob.py | 16 +- .../unittest/test_target_codegen_bool.py | 16 +- .../unittest/test_target_codegen_c_host.py | 40 +- .../test_target_codegen_cross_llvm.py | 8 +- .../unittest/test_target_codegen_cuda.py | 150 ++-- .../unittest/test_target_codegen_device.py | 12 +- .../unittest/test_target_codegen_extern.py | 18 +- .../unittest/test_target_codegen_llvm.py | 72 +- .../unittest/test_target_codegen_opencl.py | 70 +- .../unittest/test_target_codegen_rocm.py | 36 +- .../unittest/test_target_codegen_spirv.py | 36 +- .../unittest/test_target_codegen_vulkan.py | 18 +- tests/python/unittest/test_te_autodiff.py | 2 +- .../python/unittest/test_te_hybrid_script.py | 6 +- ...hedule_postproc_rewrite_for_tensor_core.py | 20 +- .../unittest/test_te_schedule_tensor_core.py | 20 +- .../unittest/test_te_tensor_overload.py | 34 +- ...st_tir_analysis_get_block_access_region.py | 57 ++ tests/python/unittest/test_tir_buffer.py | 32 +- tests/python/unittest/test_tir_intrin.py | 28 +- tests/python/unittest/test_tir_ir_builder.py | 40 +- tests/python/unittest/test_tir_nodes.py | 13 +- .../unittest/test_tir_transform_hoist_if.py | 10 +- ...tir_transform_instrument_bound_checkers.py | 141 ++-- .../test_tir_transform_loop_partition.py | 8 +- .../test_tir_transform_lower_warp_memory.py | 26 +- .../unittest/test_tvmscript_complete.py | 174 +++++ .../unittest/test_tvmscript_error_report.py | 205 ++++++ .../unittest/test_tvmscript_roundtrip.py | 173 +++++ tests/scripts/task_build.sh | 2 +- tests/scripts/task_ci_python_setup.sh | 2 +- tests/scripts/task_ci_setup.sh | 2 +- tests/scripts/task_config_build_arm.sh | 3 +- tests/scripts/task_config_build_cpu.sh | 4 +- tests/scripts/task_config_build_gpu.sh | 6 +- tests/scripts/task_config_build_gpu_vulkan.sh | 4 +- tests/scripts/task_config_build_i386.sh | 3 +- tests/scripts/task_config_build_wasm.sh | 3 +- 
tests/scripts/task_java_unittest.sh | 2 +- tests/scripts/task_python_docs.sh | 5 +- tests/scripts/task_python_microtvm.sh | 2 +- tests/scripts/task_rust.sh | 4 +- .../auto_scheduler/tune_conv2d_layer_cuda.py | 12 +- tutorials/auto_scheduler/tune_network_arm.py | 10 +- tutorials/auto_scheduler/tune_network_cuda.py | 12 +- tutorials/auto_scheduler/tune_network_mali.py | 22 +- tutorials/auto_scheduler/tune_network_x86.py | 12 +- tutorials/auto_scheduler/tune_sparse_x86.py | 16 +- tutorials/autotvm/tune_conv2d_cuda.py | 10 +- tutorials/autotvm/tune_relay_arm.py | 8 +- tutorials/autotvm/tune_relay_cuda.py | 8 +- tutorials/autotvm/tune_relay_mobile_gpu.py | 8 +- tutorials/autotvm/tune_relay_x86.py | 8 +- tutorials/autotvm/tune_simple_template.py | 336 --------- tutorials/frontend/build_gcn.py | 8 +- tutorials/frontend/deploy_model_on_android.py | 16 +- tutorials/frontend/deploy_model_on_rasp.py | 6 +- .../deploy_object_detection_pytorch.py | 4 +- tutorials/frontend/deploy_prequantized.py | 6 +- .../frontend/deploy_prequantized_tflite.py | 8 +- tutorials/frontend/deploy_quantized.py | 4 +- tutorials/frontend/deploy_sparse.py | 22 +- tutorials/frontend/deploy_ssd_gluoncv.py | 14 +- tutorials/frontend/from_caffe2.py | 6 +- tutorials/frontend/from_coreml.py | 6 +- tutorials/frontend/from_darknet.py | 6 +- tutorials/frontend/from_keras.py | 4 +- tutorials/frontend/from_mxnet.py | 6 +- tutorials/frontend/from_pytorch.py | 6 +- tutorials/frontend/from_tensorflow.py | 8 +- tutorials/frontend/from_tflite.py | 2 +- tutorials/frontend/using_external_lib.py | 10 +- tutorials/get_started/README.txt | 4 +- tutorials/get_started/autotvm_matmul.py | 376 ++++++++++ .../get_started/cross_compilation_and_rpc.py | 14 +- tutorials/get_started/install.py | 49 ++ tutorials/get_started/introduction.py | 132 ++++ tutorials/get_started/relay_quick_start.py | 10 +- .../get_started/tensor_expr_get_started.py | 18 +- .../tune_matmul_x86.py | 148 ++-- .../get_started/tvmc_command_line_driver.py | 552 +++++++++------ tutorials/language/extern_op.py | 12 +- tutorials/language/reduction.py | 6 +- tutorials/language/scan.py | 6 +- tutorials/language/tensorize.py | 10 +- tutorials/micro/micro_tflite.py | 13 +- tutorials/optimize/opt_conv_cuda.py | 10 +- tutorials/optimize/opt_conv_tensorcore.py | 12 +- tutorials/optimize/opt_gemm.py | 34 +- .../optimize/opt_matmul_auto_tensorcore.py | 12 +- tutorials/topi/intro_topi.py | 8 +- vta/python/vta/testing/simulator.py | 26 +- vta/python/vta/top/graphpack.py | 10 +- vta/runtime/device_api.cc | 28 +- vta/scripts/tune_resnet.py | 8 +- .../python/integration/test_benchmark_gemm.py | 10 +- .../integration/test_benchmark_topi_conv2d.py | 12 +- .../test_benchmark_topi_conv2d_transpose.py | 10 +- .../integration/test_benchmark_topi_dense.py | 10 +- .../test_benchmark_topi_group_conv2d.py | 12 +- vta/tests/python/unittest/test_vta_insn.py | 44 +- vta/tutorials/autotvm/tune_relay_vta.py | 6 +- .../frontend/deploy_classification.py | 12 +- .../frontend/legacy/deploy_detection.py | 10 +- web/apps/node/example.js | 3 +- web/emcc/tvmjs_support.cc | 45 +- web/emcc/wasm_runtime.cc | 39 +- web/emcc/webgpu_runtime.cc | 57 +- web/src/ctypes.ts | 4 +- web/src/index.ts | 2 +- web/src/runtime.ts | 92 ++- web/tests/node/test_ndarray.js | 2 +- web/tests/node/test_packed_func.js | 6 + web/tests/python/webgpu_rpc_test.py | 6 +- web/tests/python/websock_rpc_test.py | 8 +- 857 files changed, 22900 insertions(+), 10083 deletions(-) rename apps/microtvm/{reference-vm/zephyr => }/pyproject.toml (95%) mode change 
100755 => 100644 apps/microtvm/reference-vm/base-box-tool.py create mode 100644 apps/microtvm/zephyr/README.md rename {tests/micro/qemu/zephyr-runtime => apps/microtvm/zephyr/demo_runtime}/CMakeLists.txt (99%) create mode 100644 apps/microtvm/zephyr/demo_runtime/README.md create mode 100644 apps/microtvm/zephyr/demo_runtime/boards/nrf5340dk_nrf5340_cpuapp.conf create mode 100644 apps/microtvm/zephyr/demo_runtime/boards/nucleo_f746zg.conf create mode 100644 apps/microtvm/zephyr/demo_runtime/boards/qemu_x86.conf rename {tests/micro/qemu/zephyr-runtime => apps/microtvm/zephyr/demo_runtime}/crt/crt_config.h (98%) rename {tests/micro/qemu/zephyr-runtime => apps/microtvm/zephyr/demo_runtime}/prj.conf (87%) rename {tests/micro/qemu/zephyr-runtime => apps/microtvm/zephyr/demo_runtime}/qemu-hack/qemu-system-i386 (100%) rename {tests/micro/qemu/zephyr-runtime => apps/microtvm/zephyr/demo_runtime}/src/main.c (55%) create mode 100644 cmake/libs/Libbacktrace.cmake create mode 100644 cmake/modules/Logging.cmake create mode 100644 cmake/modules/contrib/BNNS.cmake rename docs/api/python/{graph_runtime.rst => graph_executor.rst} (92%) create mode 100644 docs/deploy/bnns.rst rename golang/src/{context.go => device.go} (58%) rename include/tvm/runtime/crt/{graph_runtime.h => graph_executor.h} (60%) rename include/tvm/runtime/crt/{graph_runtime_module.h => graph_executor_module.h} (71%) create mode 100644 include/tvm/runtime/logging.h delete mode 100644 include/tvm/support/logging.h rename jvm/core/src/main/java/org/apache/tvm/{TVMContext.java => Device.java} (75%) rename jvm/core/src/main/java/org/apache/tvm/contrib/{GraphRuntime.java => GraphExecutor.java} (87%) rename jvm/core/src/main/java/org/apache/tvm/rpc/{TVMRemoteContext.java => TVMRemoteDevice.java} (86%) rename jvm/core/src/test/java/org/apache/tvm/contrib/{GraphRuntimeTest.java => GraphExecutorTest.java} (85%) rename jvm/core/src/test/scripts/{test_graph_runtime.py => test_graph_executor.py} (98%) create mode 100644 licenses/LICENSE.libbacktrace.txt rename tests/micro/qemu/zephyr-runtime/sample.yaml => python/tvm/contrib/cuda_graph/__init__.py (88%) create mode 100644 python/tvm/contrib/cuda_graph/cuda_graph_executor.py create mode 100644 python/tvm/contrib/debugger/debug_executor.py create mode 100644 python/tvm/contrib/graph_executor.py rename python/tvm/relay/backend/{graph_runtime_codegen.py => graph_executor_codegen.py} (89%) rename python/tvm/relay/backend/{graph_runtime_factory.py => graph_executor_factory.py} (88%) create mode 100644 python/tvm/relay/op/contrib/bnns.py create mode 100644 python/tvm/relay/qnn/op/_qnn.py create mode 100644 python/tvm/script/node.py create mode 100644 python/tvm/topi/arm_cpu/group_conv2d.py delete mode 100644 python/tvm/topi/cumsum.py create mode 100644 python/tvm/topi/nn/qnn.py create mode 100644 python/tvm/topi/scan.py create mode 100644 python/tvm/topi/x86/group_conv2d.py rename rust/tvm-rt/src/{context.rs => device.rs} (91%) rename rust/tvm-sys/src/{context.rs => device.rs} (71%) mode change 100755 => 100644 src/auto_scheduler/transform_step.cc create mode 100644 src/relay/backend/contrib/bnns/codegen.cc rename src/relay/backend/{graph_runtime_codegen.cc => graph_executor_codegen.cc} (96%) create mode 100644 src/relay/qnn/op/simulated_dequantize.cc create mode 100644 src/relay/qnn/op/simulated_quantize.cc create mode 100644 src/relay/transforms/first_order_gradient.cc create mode 100644 src/relay/transforms/gradient.h rename src/relay/transforms/{gradient.cc => higher_order_gradient.cc} (64%) create mode 
100644 src/runtime/contrib/bnns/bnns_json_runtime.cc create mode 100644 src/runtime/contrib/bnns/bnns_wrp.h rename src/runtime/crt/{graph_runtime/graph_runtime.c => graph_executor/graph_executor.c} (71%) rename src/runtime/crt/{graph_runtime => graph_executor}/load_json.c (95%) rename src/runtime/crt/{graph_runtime_module/graph_runtime_module.c => graph_executor_module/graph_executor_module.c} (50%) rename src/runtime/crt/include/tvm/runtime/crt/internal/{graph_runtime/graph_runtime.h => graph_executor/graph_executor.h} (56%) rename src/runtime/crt/include/tvm/runtime/crt/internal/{graph_runtime => graph_executor}/load_json.h (90%) delete mode 100644 src/runtime/graph/graph_runtime_factory.cc create mode 100644 src/runtime/graph_executor/cuda_graph/graph_runtime_cuda_graph.cc rename src/runtime/{graph/debug/graph_runtime_debug.cc => graph_executor/debug/graph_executor_debug.cc} (85%) rename src/runtime/{graph/graph_runtime.cc => graph_executor/graph_executor.cc} (80%) rename src/runtime/{graph/graph_runtime.h => graph_executor/graph_executor.h} (92%) create mode 100644 src/runtime/graph_executor/graph_executor_factory.cc rename src/runtime/{graph/graph_runtime_factory.h => graph_executor/graph_executor_factory.h} (63%) create mode 100644 src/runtime/logging.cc rename src/runtime/micro/standalone/{utvm_graph_runtime.cc => utvm_graph_executor.cc} (94%) rename src/runtime/micro/standalone/{utvm_graph_runtime.h => utvm_graph_executor.h} (87%) create mode 100644 src/tir/analysis/block_access_region_detector.cc create mode 100644 src/tir/analysis/expr_complexity.cc create mode 100644 src/tir/ir/script/script_complete.cc create mode 100644 tests/cpp/contrib/bnns.cc delete mode 100644 tests/micro/qemu/.gitignore delete mode 100644 tests/micro/qemu/zephyr-runtime/.gitignore create mode 100644 tests/micro/zephyr/README.md rename tests/micro/{qemu => zephyr}/conftest.py (94%) rename tests/micro/{qemu => zephyr}/test_zephyr.py (75%) create mode 100644 tests/micro/zephyr/testdata/digit-2.jpg create mode 100644 tests/micro/zephyr/testdata/digit-9.jpg create mode 100644 tests/python/contrib/test_bnns/__init__.py create mode 100644 tests/python/contrib/test_bnns/infrastructure.py create mode 100644 tests/python/contrib/test_bnns/test_conv2d.py create mode 100644 tests/python/contrib/test_bnns/test_conv2d_patterns.py create mode 100644 tests/python/contrib/test_bnns/test_dense.py create mode 100644 tests/python/contrib/test_bnns/test_matmul.py create mode 100644 tests/python/contrib/test_bnns/test_normalization.py create mode 100644 tests/python/contrib/test_bnns/test_onnx_topologies.py create mode 100644 tests/python/contrib/test_bnns/test_pooling.py rename tests/python/relay/{test_backend_graph_runtime.py => test_backend_graph_executor.py} (95%) create mode 100644 tests/python/relay/test_op_qnn_simulated_dequantize.py create mode 100644 tests/python/relay/test_op_qnn_simulated_quantize.py delete mode 100644 tests/python/topi/python/test_topi_cumsum.py create mode 100644 tests/python/topi/python/test_topi_qnn.py create mode 100644 tests/python/topi/python/test_topi_scan.py create mode 100644 tests/python/unittest/test_runtime_graph_cuda_graph.py create mode 100644 tests/python/unittest/test_tir_analysis_get_block_access_region.py create mode 100644 tests/python/unittest/test_tvmscript_complete.py delete mode 100644 tutorials/autotvm/tune_simple_template.py create mode 100644 tutorials/get_started/autotvm_matmul.py create mode 100644 tutorials/get_started/install.py create mode 100644 
tutorials/get_started/introduction.py
rename tutorials/{auto_scheduler => get_started}/tune_matmul_x86.py (52%)
diff --git a/.gitmodules b/.gitmodules
index a1367c97b2f5..6ef740e33153 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -10,3 +10,6 @@
[submodule "3rdparty/vta-hw"]
path = 3rdparty/vta-hw
url = https://github.com/apache/incubator-tvm-vta
+[submodule "3rdparty/libbacktrace"]
+ path = 3rdparty/libbacktrace
+ url = https://github.com/tlc-pack/libbacktrace.git
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 56170c693e3c..277fe4a9bfbc 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -34,8 +34,9 @@ tvm_option(USE_RPC "Build with RPC" ON)
tvm_option(USE_THREADS "Build with thread support" ON)
tvm_option(USE_LLVM "Build with LLVM, can be set to specific llvm-config path" OFF)
tvm_option(USE_STACKVM_RUNTIME "Include stackvm into the runtime" OFF)
-tvm_option(USE_GRAPH_RUNTIME "Build with tiny graph runtime" ON)
-tvm_option(USE_GRAPH_RUNTIME_DEBUG "Build with tiny graph runtime debug mode" OFF)
+tvm_option(USE_GRAPH_EXECUTOR "Build with tiny graph executor" ON)
+tvm_option(USE_GRAPH_EXECUTOR_CUDA_GRAPH "Build with tiny graph executor with CUDA Graph for GPUs" OFF)
+tvm_option(USE_PROFILER "Build profiler for the VM and graph executor" ON)
tvm_option(USE_OPENMP "Build with OpenMP thread pool implementation" OFF)
tvm_option(USE_RELAY_DEBUG "Building Relay in debug mode..." OFF)
tvm_option(USE_RTTI "Build with RTTI" ON)
@@ -47,6 +48,7 @@ tvm_option(USE_TF_TVMDSOOP "Build with TensorFlow TVMDSOOp" OFF)
tvm_option(USE_FALLBACK_STL_MAP "Use TVM's POD compatible Map" OFF)
tvm_option(USE_ETHOSN "Build with Arm Ethos-N" OFF)
tvm_option(INDEX_DEFAULT_I64 "Defaults the index datatype to int64" ON)
+tvm_option(USE_LIBBACKTRACE "Build libbacktrace to supply line numbers on stack traces" AUTO)
# 3rdparty libraries
tvm_option(DLPACK_PATH "Path to DLPACK" "3rdparty/dlpack/include")
@@ -74,9 +76,10 @@ tvm_option(USE_CPP_RPC "Build CPP RPC" OFF)
tvm_option(USE_TFLITE "Build with tflite support" OFF)
tvm_option(USE_TENSORFLOW_PATH "TensorFlow root path when use TFLite" none)
tvm_option(USE_COREML "Build with coreml support" OFF)
+tvm_option(USE_BNNS "Build with BNNS support" OFF)
tvm_option(USE_TARGET_ONNX "Build with ONNX Codegen support" OFF)
tvm_option(USE_ARM_COMPUTE_LIB "Build with Arm Compute Library" OFF)
-tvm_option(USE_ARM_COMPUTE_LIB_GRAPH_RUNTIME "Build with Arm Compute Library graph runtime" OFF)
+tvm_option(USE_ARM_COMPUTE_LIB_GRAPH_EXECUTOR "Build with Arm Compute Library graph executor" OFF)
tvm_option(USE_TENSORRT_CODEGEN "Build with TensorRT Codegen support" OFF)
tvm_option(USE_TENSORRT_RUNTIME "Build with TensorRT runtime" OFF)
tvm_option(USE_RUST_EXT "Build with Rust based compiler extensions, STATIC, DYNAMIC, or OFF" OFF)
@@ -136,6 +139,8 @@ if(MSVC)
add_compile_options(/wd4146)
# 'inline': used more than once
add_compile_options(/wd4141)
+ # unknown pragma
+ add_compile_options(/wd4068)
else(MSVC)
set(WARNING_FLAG -Wall)
if ("${CMAKE_BUILD_TYPE}" STREQUAL "Debug")
@@ -261,13 +266,6 @@ list(APPEND COMPILER_SRCS ${RELAY_BACKEND_SRCS})
list(APPEND COMPILER_SRCS ${RELAY_IR_SRCS})
list(APPEND COMPILER_SRCS ${RELAY_QNN_SRCS})
-
-if(USE_VM_PROFILER)
- message(STATUS "Build compiler with Relay VM profiler support...")
- file(GLOB BACKEND_VM_PROFILER_SRCS src/relay/backend/vm/profiler/*.cc)
- list(APPEND COMPILER_SRCS ${BACKEND_VM_PROFILER_SRCS})
-endif(USE_VM_PROFILER)
-
file(GLOB DATATYPE_SRCS src/target/datatype/*.cc)
list(APPEND COMPILER_SRCS ${DATATYPE_SRCS})
list(APPEND COMPILER_SRCS "src/target/datatype/myfloat/myfloat.cc")
@@ -309,25 +307,48 @@ else()
list(APPEND COMPILER_SRCS ${STACKVM_RUNTIME_SRCS})
endif(USE_STACKVM_RUNTIME)
-if(USE_GRAPH_RUNTIME)
- message(STATUS "Build with Graph runtime support...")
- file(GLOB RUNTIME_GRAPH_SRCS src/runtime/graph/*.cc)
- list(APPEND RUNTIME_SRCS ${RUNTIME_GRAPH_SRCS})
+# NOTE(areusch): USE_GRAPH_RUNTIME will be deleted in a future release
+if(USE_GRAPH_RUNTIME AND NOT DEFINED USE_GRAPH_EXECUTOR)
+ message(WARNING "USE_GRAPH_RUNTIME renamed to USE_GRAPH_EXECUTOR. Please update your config.cmake")
+ set(USE_GRAPH_EXECUTOR ${USE_GRAPH_RUNTIME})
+ unset(USE_GRAPH_RUNTIME CACHE)
+endif(USE_GRAPH_RUNTIME AND NOT DEFINED USE_GRAPH_EXECUTOR)
+
+# NOTE(areusch): USE_GRAPH_RUNTIME_DEBUG will be deleted in a future release
+if(USE_GRAPH_RUNTIME_DEBUG AND NOT DEFINED USE_GRAPH_EXECUTOR_DEBUG)
+ message(WARNING "USE_GRAPH_RUNTIME_DEBUG renamed to USE_GRAPH_EXECUTOR_DEBUG. Please update your config.cmake")
+ set(USE_GRAPH_EXECUTOR_DEBUG ${USE_GRAPH_RUNTIME_DEBUG})
+ unset(USE_GRAPH_RUNTIME_DEBUG CACHE)
+endif(USE_GRAPH_RUNTIME_DEBUG AND NOT DEFINED USE_GRAPH_EXECUTOR_DEBUG)
+
+if(USE_GRAPH_EXECUTOR)
+ message(STATUS "Build with Graph Executor support...")
+ file(GLOB RUNTIME_GRAPH_EXECUTOR_SRCS src/runtime/graph_executor/*.cc)
+ list(APPEND RUNTIME_SRCS ${RUNTIME_GRAPH_EXECUTOR_SRCS})
+
+endif(USE_GRAPH_EXECUTOR)
+
+# convert old options for profiler
+if(USE_GRAPH_EXECUTOR_DEBUG)
+ unset(USE_GRAPH_EXECUTOR_DEBUG CACHE)
+ set(USE_PROFILER ON)
+endif()
+if(USE_VM_PROFILER)
+ unset(USE_VM_PROFILER CACHE)
+ set(USE_PROFILER ON)
+endif()
+
+if(USE_PROFILER)
+ message(STATUS "Build with profiler...")
- if(USE_GRAPH_RUNTIME_DEBUG)
- message(STATUS "Build with Graph runtime debug support...")
- file(GLOB RUNTIME_GRAPH_DEBUG_SRCS src/runtime/graph/debug/*.cc)
- list(APPEND RUNTIME_SRCS ${RUNTIME_GRAPH_DEBUG_SRCS})
- set_source_files_properties(${RUNTIME_GRAPH_SRCS}
- PROPERTIES COMPILE_DEFINITIONS "TVM_GRAPH_RUNTIME_DEBUG")
- endif(USE_GRAPH_RUNTIME_DEBUG)
-endif(USE_GRAPH_RUNTIME)
+ file(GLOB RUNTIME_GRAPH_EXECUTOR_DEBUG_SRCS src/runtime/graph_executor/debug/*.cc)
+ list(APPEND RUNTIME_SRCS ${RUNTIME_GRAPH_EXECUTOR_DEBUG_SRCS})
+ set_source_files_properties(${RUNTIME_GRAPH_EXECUTOR_SRCS}
+ PROPERTIES COMPILE_DEFINITIONS "TVM_GRAPH_EXECUTOR_DEBUG")
-if(USE_VM_PROFILER)
- message(STATUS "Build with Relay VM profiler support...")
file(GLOB RUNTIME_VM_PROFILER_SRCS src/runtime/vm/profiler/*.cc)
list(APPEND RUNTIME_SRCS ${RUNTIME_VM_PROFILER_SRCS})
-endif(USE_VM_PROFILER)
+endif(USE_PROFILER)
# Module rules
include(cmake/modules/VTA.cmake)
@@ -354,6 +375,7 @@ include(cmake/modules/contrib/HybridDump.cmake)
include(cmake/modules/contrib/TFLite.cmake)
include(cmake/modules/contrib/TF_TVMDSOOP.cmake)
include(cmake/modules/contrib/CoreML.cmake)
+include(cmake/modules/contrib/BNNS.cmake)
include(cmake/modules/contrib/ONNX.cmake)
include(cmake/modules/contrib/ArmComputeLib.cmake)
include(cmake/modules/contrib/TensorRT.cmake)
@@ -383,6 +405,13 @@ add_library(tvm SHARED $<TARGET_OBJECTS:tvm_objs> $<TARGET_OBJECTS:tvm_runtime_objs>)
set_property(TARGET tvm_runtime APPEND PROPERTY LINK_OPTIONS "${TVM_VISIBILITY_FLAG}")
+target_compile_definitions(tvm_objs PUBLIC DMLC_USE_LOGGING_LIBRARY=<tvm/runtime/logging.h>)
+target_compile_definitions(tvm_runtime_objs PUBLIC DMLC_USE_LOGGING_LIBRARY=<tvm/runtime/logging.h>)
+target_compile_definitions(tvm PUBLIC DMLC_USE_LOGGING_LIBRARY=<tvm/runtime/logging.h>)
+target_compile_definitions(tvm_runtime PUBLIC DMLC_USE_LOGGING_LIBRARY=<tvm/runtime/logging.h>)
+
+# logging option for libbacktrace
+include(cmake/modules/Logging.cmake)
if(USE_MICRO)
# NOTE: cmake doesn't track dependencies at the file level across subdirectories. For the
@@ -398,9 +427,9 @@ endif()
if(USE_RELAY_DEBUG)
message(STATUS "Building Relay in debug mode...")
target_compile_definitions(tvm_objs PRIVATE "USE_RELAY_DEBUG")
- target_compile_definitions(tvm_objs PRIVATE "DMLC_LOG_DEBUG")
+ target_compile_definitions(tvm_objs PRIVATE "TVM_LOG_DEBUG")
target_compile_definitions(tvm_runtime_objs PRIVATE "USE_RELAY_DEBUG")
- target_compile_definitions(tvm_runtime_objs PRIVATE "DMLC_LOG_DEBUG")
+ target_compile_definitions(tvm_runtime_objs PRIVATE "TVM_LOG_DEBUG")
else()
target_compile_definitions(tvm_objs PRIVATE "NDEBUG")
target_compile_definitions(tvm_runtime_objs PRIVATE "NDEBUG")
@@ -471,6 +500,7 @@ if (HIDE_PRIVATE_SYMBOLS AND NOT ${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
# once minimum CMake version is bumped up to 3.13 or above.
target_link_libraries(tvm PRIVATE ${HIDE_SYMBOLS_LINKER_FLAGS})
target_link_libraries(tvm_runtime PRIVATE ${HIDE_SYMBOLS_LINKER_FLAGS})
+ target_compile_definitions(tvm_allvisible PUBLIC DMLC_USE_LOGGING_LIBRARY=<tvm/runtime/logging.h>)
endif()
# Tests
@@ -539,3 +569,33 @@ if(MSVC)
target_compile_definitions(tvm_objs PRIVATE -DTVM_EXPORTS)
target_compile_definitions(tvm_runtime_objs PRIVATE -DTVM_EXPORTS)
endif()
+
+set(TVM_IS_DEBUG_BUILD OFF)
+if(CMAKE_BUILD_TYPE STREQUAL "Debug" OR CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo" OR CMAKE_CXX_FLAGS MATCHES "-g")
+ set(TVM_IS_DEBUG_BUILD ON)
+endif()
+
+# Change relative paths in backtrace to absolute ones
+if(TVM_IS_DEBUG_BUILD)
+ set(FILE_PREFIX_MAP_FLAG "-ffile-prefix-map=..=${CMAKE_CURRENT_SOURCE_DIR}")
+ target_compile_options(tvm PRIVATE "${FILE_PREFIX_MAP_FLAG}")
+ CHECK_CXX_COMPILER_FLAG("${FILE_PREFIX_MAP_FLAG}" FILE_PREFIX_MAP_SUPPORTED)
+ if(FILE_PREFIX_MAP_SUPPORTED)
+ target_compile_options(tvm PRIVATE $<$<COMPILE_LANGUAGE:CXX>:${FILE_PREFIX_MAP_FLAG}>)
+ target_compile_options(tvm_objs PRIVATE $<$<COMPILE_LANGUAGE:CXX>:${FILE_PREFIX_MAP_FLAG}>)
+ target_compile_options(tvm_runtime PRIVATE $<$<COMPILE_LANGUAGE:CXX>:${FILE_PREFIX_MAP_FLAG}>)
+ target_compile_options(tvm_runtime_objs PRIVATE $<$<COMPILE_LANGUAGE:CXX>:${FILE_PREFIX_MAP_FLAG}>)
+ endif()
+endif()
+
+# Run dsymutil to generate debugging symbols for backtraces
+if(APPLE AND TVM_IS_DEBUG_BUILD)
+ find_program(DSYMUTIL dsymutil)
+ mark_as_advanced(DSYMUTIL)
+ add_custom_command(TARGET tvm
+ POST_BUILD
+ COMMAND ${DSYMUTIL} ARGS $<TARGET_FILE:tvm>
+ COMMENT "Running dsymutil"
+ VERBATIM
+ )
+endif()
diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index f72220d07f16..eb2af2151acc 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -60,6 +60,7 @@ We do encourage everyone to work anything they are interested in.
- [Thierry Moreau](https://github.com/tmoreau89) (PMC): @tmoreau89 - vta
- [Kazutaka Morita](https://github.com/kazum): @kazum - frontends, opencl
- [Krzysztof Parzyszek](https://github.com/kparzysz-quic): @kparzysz-quic - hexagon, llvm
+- [Andrew Reusch](https://github.com/areusch): @areusch - runtime, µTVM
- [Jared Roesch](https://github.com/jroesch) (PMC): @jroesch - relay
- [Siju Samuel](https://github.com/siju-samuel): @siju-samuel - frontends
- [Siva](https://github.com/srkreddy1238): @srkreddy1238 - frontends, golang
diff --git a/Jenkinsfile b/Jenkinsfile
index 506dcab4e306..f7fc6e4e2178 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -46,11 +46,11 @@
// NOTE: these lines are scanned by docker/dev_common.sh. Please update the regex as needed.
--> ci_lint = "tlcpack/ci-lint:v0.62" ci_gpu = "tlcpack/ci-gpu:v0.72" -ci_cpu = "tlcpack/ci-cpu:v0.72-t0" +ci_cpu = "tlcpack/ci-cpu:v0.73" ci_wasm = "tlcpack/ci-wasm:v0.70" ci_i386 = "tlcpack/ci-i386:v0.72-t0" -ci_qemu = "tlcpack/ci-qemu:v0.01" -ci_arm = "tlcpack/ci-arm:v0.02" +ci_qemu = "tlcpack/ci-qemu:v0.02" +ci_arm = "tlcpack/ci-arm:v0.03" // <--- End of regex-scanned config. // tvm libraries diff --git a/Makefile b/Makefile index 011dc5c31bf5..c143c69d0a2c 100644 --- a/Makefile +++ b/Makefile @@ -55,7 +55,7 @@ crttest: @mkdir -p build && cd build && cmake .. && $(MAKE) crttest # EMCC; Web related scripts -EMCC_FLAGS= -std=c++11 -DDMLC_LOG_STACK_TRACE=0\ +EMCC_FLAGS= -std=c++11\ -Oz -s RESERVED_FUNCTION_POINTERS=2 -s MAIN_MODULE=1 -s NO_EXIT_RUNTIME=1\ -s TOTAL_MEMORY=1073741824\ -s EXTRA_EXPORTED_RUNTIME_METHODS="['addFunction','cwrap','getValue','setValue']"\ diff --git a/README.md b/README.md index ac4ed62524b1..eec5bfd5797d 100644 --- a/README.md +++ b/README.md @@ -36,7 +36,7 @@ License Contribute to TVM ----------------- TVM adopts apache committer model, we aim to create an open source project that is maintained and owned by the community. -Checkout the [Contributor Guide](https://tvm.apache.org/docs/contribute/) +Check out the [Contributor Guide](https://tvm.apache.org/docs/contribute/). Acknowledgement --------------- diff --git a/apps/android_camera/app/src/main/java/org/apache/tvm/android/androidcamerademo/Camera2BasicFragment.java b/apps/android_camera/app/src/main/java/org/apache/tvm/android/androidcamerademo/Camera2BasicFragment.java index f598f8e698f8..8a5f54a3e399 100644 --- a/apps/android_camera/app/src/main/java/org/apache/tvm/android/androidcamerademo/Camera2BasicFragment.java +++ b/apps/android_camera/app/src/main/java/org/apache/tvm/android/androidcamerademo/Camera2BasicFragment.java @@ -57,7 +57,7 @@ import org.apache.tvm.Function; import org.apache.tvm.Module; import org.apache.tvm.NDArray; -import org.apache.tvm.TVMContext; +import org.apache.tvm.Device; import org.apache.tvm.TVMType; import org.apache.tvm.TVMValue; import org.json.JSONException; @@ -111,7 +111,7 @@ public class Camera2BasicFragment extends Fragment { private AppCompatTextView mInfoView; private ListView mModelView; private AssetManager assetManager; - private Module graphRuntimeModule; + private Module graphExecutorModule; private JSONObject labels; private ListenableFuture cameraProviderFuture; private PreviewView previewView; @@ -187,21 +187,21 @@ private String[] getModels() { private String[] inference(float[] chw) { NDArray inputNdArray = NDArray.empty(new long[]{1, IMG_CHANNEL, MODEL_INPUT_SIZE, MODEL_INPUT_SIZE}, new TVMType("float32")); inputNdArray.copyFrom(chw); - Function setInputFunc = graphRuntimeModule.getFunction("set_input"); + Function setInputFunc = graphExecutorModule.getFunction("set_input"); setInputFunc.pushArg(INPUT_NAME).pushArg(inputNdArray).invoke(); // release tvm local variables inputNdArray.release(); setInputFunc.release(); // get the function from the module(run it) - Function runFunc = graphRuntimeModule.getFunction("run"); + Function runFunc = graphExecutorModule.getFunction("run"); runFunc.invoke(); // release tvm local variables runFunc.release(); // get the function from the module(get output data) NDArray outputNdArray = NDArray.empty(new long[]{1, 1000}, new TVMType("float32")); - Function getOutputFunc = graphRuntimeModule.getFunction("get_output"); + Function getOutputFunc = graphExecutorModule.getFunction("get_output"); 
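The Java rename above has a direct Python counterpart in this series: tvm.contrib.graph_runtime becomes tvm.contrib.graph_executor, while the set_input/run/get_output flow is unchanged. A minimal sketch of the renamed Python API, assuming lib is the result of relay.build(mod, target="llvm", params=params):

```python
import numpy as np
import tvm
from tvm.contrib import graph_executor  # formerly tvm.contrib.graph_runtime

dev = tvm.device("llvm", 0)  # tvm.device replaces tvm.context
module = graph_executor.GraphModule(lib["default"](dev))

# Same set_input/run/get_output sequence as the Java code above.
module.set_input("data", np.zeros((1, 3, 224, 224), dtype="float32"))
module.run()
out = module.get_output(0).asnumpy()
```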
getOutputFunc.pushArg(OUTPUT_INDEX).pushArg(outputNdArray).invoke(); float[] output = outputNdArray.asFloatArray(); // release tvm local variables @@ -272,8 +272,8 @@ public void onActivityCreated(Bundle savedInstanceState) { @Override public void onDestroy() { // release tvm local variables - if (null != graphRuntimeModule) - graphRuntimeModule.release(); + if (null != graphExecutorModule) + graphExecutorModule.release(); super.onDestroy(); } @@ -516,7 +516,7 @@ private void setInputName(String modelName) { } /* - Load precompiled model on TVM graph runtime and init the system. + Load precompiled model on TVM graph executor and init the system. */ private class LoadModelAsyncTask extends AsyncTask { @@ -571,9 +571,9 @@ protected Integer doInBackground(Void... args) { return -1;//failure } - Log.i(TAG, "creating java tvm context..."); - // create java tvm context - TVMContext tvmCtx = EXE_GPU ? TVMContext.opencl() : TVMContext.cpu(); + Log.i(TAG, "creating java tvm device..."); + // create java tvm device + Device tvmDev = EXE_GPU ? Device.opencl() : Device.cpu(); Log.i(TAG, "loading compiled functions..."); Log.i(TAG, libCacheFilePath); @@ -581,26 +581,26 @@ protected Integer doInBackground(Void... args) { Module modelLib = Module.load(libCacheFilePath); - // get global function module for graph runtime - Log.i(TAG, "getting graph runtime create handle..."); + // get global function module for graph executor + Log.i(TAG, "getting graph executor create handle..."); - Function runtimeCreFun = Function.getFunction("tvm.graph_runtime.create"); - Log.i(TAG, "creating graph runtime..."); + Function runtimeCreFun = Function.getFunction("tvm.graph_executor.create"); + Log.i(TAG, "creating graph executor..."); - Log.i(TAG, "ctx type: " + tvmCtx.deviceType); - Log.i(TAG, "ctx id: " + tvmCtx.deviceId); + Log.i(TAG, "device type: " + tvmDev.deviceType); + Log.i(TAG, "device id: " + tvmDev.deviceId); TVMValue runtimeCreFunRes = runtimeCreFun.pushArg(modelGraph) .pushArg(modelLib) - .pushArg(tvmCtx.deviceType) - .pushArg(tvmCtx.deviceId) + .pushArg(tvmDev.deviceType) + .pushArg(tvmDev.deviceId) .invoke(); Log.i(TAG, "as module..."); - graphRuntimeModule = runtimeCreFunRes.asModule(); - Log.i(TAG, "getting graph runtime load params handle..."); + graphExecutorModule = runtimeCreFunRes.asModule(); + Log.i(TAG, "getting graph executor load params handle..."); // get the function from the module(load parameters) - Function loadParamFunc = graphRuntimeModule.getFunction("load_params"); + Function loadParamFunc = graphExecutorModule.getFunction("load_params"); Log.i(TAG, "loading params..."); loadParamFunc.pushArg(modelParams).invoke(); // release tvm local variables diff --git a/apps/android_camera/app/src/main/jni/Application.mk b/apps/android_camera/app/src/main/jni/Application.mk index 63a79458ef94..6ac3271f49ac 100644 --- a/apps/android_camera/app/src/main/jni/Application.mk +++ b/apps/android_camera/app/src/main/jni/Application.mk @@ -31,7 +31,7 @@ include $(config) APP_ABI ?= all APP_STL := c++_shared -APP_CPPFLAGS += -DDMLC_LOG_STACK_TRACE=0 -DTVM4J_ANDROID=1 -std=c++14 -Oz -frtti +APP_CPPFLAGS += -DTVM_LOG_STACK_TRACE=0 -DTVM4J_ANDROID=1 -std=c++14 -Oz -frtti ifeq ($(USE_OPENCL), 1) APP_CPPFLAGS += -DTVM_OPENCL_RUNTIME=1 endif @@ -43,4 +43,4 @@ endif ifeq ($(USE_SORT), 1) APP_CPPFLAGS += -DUSE_SORT=1 -endif \ No newline at end of file +endif diff --git a/apps/android_camera/app/src/main/jni/tvm_runtime.h b/apps/android_camera/app/src/main/jni/tvm_runtime.h index 5f3db04274a1..f3c7efd08b5c 100644 --- 
a/apps/android_camera/app/src/main/jni/tvm_runtime.h +++ b/apps/android_camera/app/src/main/jni/tvm_runtime.h @@ -25,24 +25,21 @@ #include -/* Enable custom logging - this will cause TVM to pass every log message - * through CustomLogMessage instead of LogMessage. By enabling this, we must - * implement dmlc::CustomLogMessage::Log. We use this to pass TVM log - * messages to Android logcat. +#define DMLC_USE_LOGGING_LIBRARY +#define TVM_USE_LIBBACKTRACE 0 +/* Enable custom logging - this will cause TVM to use a custom implementation + * of tvm::runtime::detail::LogMessage. We use this to pass TVM log messages to + * Android logcat. */ -#define DMLC_LOG_CUSTOMIZE 1 - -/* Ensure that fatal errors are passed to the logger before throwing - * in LogMessageFatal - */ -#define DMLC_LOG_BEFORE_THROW 1 +#define TVM_LOG_CUSTOMIZE 1 #include "../src/runtime/c_runtime_api.cc" #include "../src/runtime/cpu_device_api.cc" #include "../src/runtime/dso_library.cc" #include "../src/runtime/file_utils.cc" -#include "../src/runtime/graph/graph_runtime.cc" +#include "../src/runtime/graph_executor/graph_executor.cc" #include "../src/runtime/library_module.cc" +#include "../src/runtime/logging.cc" #include "../src/runtime/module.cc" #include "../src/runtime/ndarray.cc" #include "../src/runtime/object.cc" @@ -72,8 +69,20 @@ #include -void dmlc::CustomLogMessage::Log(const std::string& msg) { - // This is called for every message logged by TVM. - // We pass the message to logcat. - __android_log_write(ANDROID_LOG_DEBUG, "TVM_RUNTIME", msg.c_str()); -} \ No newline at end of file +namespace tvm { +namespace runtime { +namespace detail { +// Override logging mechanism +void LogFatalImpl(const std::string& file, int lineno, const std::string& message) { + std::string m = file + ":" + std::to_string(lineno) + ": " + message; + __android_log_write(ANDROID_LOG_DEBUG, "TVM_RUNTIME", m.c_str()); + throw InternalError(file, lineno, message); +} +void LogMessageImpl(const std::string& file, int lineno, const std::string& message) { + std::string m = file + ":" + std::to_string(lineno) + ": " + message; + __android_log_write(ANDROID_LOG_DEBUG, "TVM_RUNTIME", m.c_str()); +} + +} // namespace detail +} // namespace runtime +} // namespace tvm diff --git a/apps/android_camera/models/prepare_model.py b/apps/android_camera/models/prepare_model.py index f155d46c31a4..d767b2ef88fc 100644 --- a/apps/android_camera/models/prepare_model.py +++ b/apps/android_camera/models/prepare_model.py @@ -25,7 +25,7 @@ import tvm import tvm.relay as relay -from tvm.contrib import utils, ndk, graph_runtime as runtime +from tvm.contrib import utils, ndk, graph_executor as runtime from tvm.contrib.download import download_testdata, download target = "llvm -mtriple=arm64-linux-android" diff --git a/apps/android_deploy/app/src/main/java/org/apache/tvm/android/demo/MainActivity.java b/apps/android_deploy/app/src/main/java/org/apache/tvm/android/demo/MainActivity.java index 43a279b820d7..85cc7a277b4d 100644 --- a/apps/android_deploy/app/src/main/java/org/apache/tvm/android/demo/MainActivity.java +++ b/apps/android_deploy/app/src/main/java/org/apache/tvm/android/demo/MainActivity.java @@ -56,7 +56,7 @@ import org.apache.tvm.Function; import org.apache.tvm.Module; import org.apache.tvm.NDArray; -import org.apache.tvm.TVMContext; +import org.apache.tvm.Device; import org.apache.tvm.TVMValue; import org.apache.tvm.TVMType; @@ -90,7 +90,7 @@ public class MainActivity extends AppCompatActivity { private ImageView mImageView; private TextView mResultView; 
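Alongside the graph executor rename, this series renames TVMContext to Device in every binding, which is why the Java below switches from TVMContext.cpu() to Device.cpu(). A short sketch of the corresponding Python migration; the RPC half assumes an already-established session named remote and a loaded module f:

```python
import numpy as np
import tvm

# Local allocation: tvm.device replaces tvm.context (tvm.cpu, tvm.cl, etc.
# keep their names and now return Device handles).
dev = tvm.device("llvm", 0)
a = tvm.nd.array(np.ones(1024, dtype="float32"), dev)

# Over RPC the pattern is identical: remote.device replaces remote.context,
# and time_evaluator takes the device handle. remote and f are assumed here.
# dev = remote.cpu(0)
# timer = f.time_evaluator(f.entry_name, dev, number=10)
```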
private AssetManager assetManager; - private Module graphRuntimeModule; + private Module graphExecutorModule; private Vector labels = new Vector(); @Override @@ -119,7 +119,7 @@ public void onClick(View v) { } /* - Load precompiled model on TVM graph runtime and init the system. + Load precompiled model on TVM graph executor and init the system. */ private class LoadModleAsyncTask extends AsyncTask { ProgressDialog dialog = new ProgressDialog(MainActivity.this); @@ -177,23 +177,23 @@ protected Integer doInBackground(Void... args) { return -1;//failure } - // create java tvm context - TVMContext tvmCtx = EXE_GPU ? TVMContext.opencl() : TVMContext.cpu(); + // create java tvm device + Device tvmDev = EXE_GPU ? Device.opencl() : Device.cpu(); // tvm module for compiled functions Module modelLib = Module.load(libCacheFilePath); - // get global function module for graph runtime - Function runtimeCreFun = Function.getFunction("tvm.graph_runtime.create"); + // get global function module for graph executor + Function runtimeCreFun = Function.getFunction("tvm.graph_executor.create"); TVMValue runtimeCreFunRes = runtimeCreFun.pushArg(modelGraph) .pushArg(modelLib) - .pushArg(tvmCtx.deviceType) - .pushArg(tvmCtx.deviceId) + .pushArg(tvmDev.deviceType) + .pushArg(tvmDev.deviceId) .invoke(); - graphRuntimeModule = runtimeCreFunRes.asModule(); + graphExecutorModule = runtimeCreFunRes.asModule(); // get the function from the module(load parameters) - Function loadParamFunc = graphRuntimeModule.getFunction("load_params"); + Function loadParamFunc = graphExecutorModule.getFunction("load_params"); loadParamFunc.pushArg(modelParams).invoke(); // release tvm local variables @@ -224,14 +224,14 @@ protected void onPostExecute(Integer status) { } /* - Execute prediction for processed decode input bitmap image content on TVM graph runtime. + Execute prediction for processed decode input bitmap image content on TVM graph executor. */ private class ModelRunAsyncTask extends AsyncTask { ProgressDialog dialog = new ProgressDialog(MainActivity.this); @Override protected Integer doInBackground(Bitmap... bitmaps) { - if (null != graphRuntimeModule) { + if (null != graphExecutorModule) { int count = bitmaps.length; for (int i = 0 ; i < count ; i++) { long processingTimeMs = SystemClock.uptimeMillis(); @@ -283,7 +283,7 @@ protected Integer doInBackground(Bitmap... bitmaps) { Log.i(TAG, "set input data"); NDArray inputNdArray = NDArray.empty(new long[]{1, IMG_CHANNEL, MODEL_INPUT_SIZE, MODEL_INPUT_SIZE}, new TVMType("float32"));; inputNdArray.copyFrom(imgRgbTranValues); - Function setInputFunc = graphRuntimeModule.getFunction("set_input"); + Function setInputFunc = graphExecutorModule.getFunction("set_input"); setInputFunc.pushArg(INPUT_NAME).pushArg(inputNdArray).invoke(); // release tvm local variables inputNdArray.release(); @@ -291,7 +291,7 @@ protected Integer doInBackground(Bitmap... bitmaps) { // get the function from the module(run it) Log.i(TAG, "run function on target"); - Function runFunc = graphRuntimeModule.getFunction("run"); + Function runFunc = graphExecutorModule.getFunction("run"); runFunc.invoke(); // release tvm local variables runFunc.release(); @@ -299,7 +299,7 @@ protected Integer doInBackground(Bitmap... 
bitmaps) { // get the function from the module(get output data) Log.i(TAG, "get output data"); NDArray outputNdArray = NDArray.empty(new long[]{1, 1000}, new TVMType("float32")); - Function getOutputFunc = graphRuntimeModule.getFunction("get_output"); + Function getOutputFunc = graphExecutorModule.getFunction("get_output"); getOutputFunc.pushArg(OUTPUT_INDEX).pushArg(outputNdArray).invoke(); float[] output = outputNdArray.asFloatArray(); // release tvm local variables @@ -343,7 +343,7 @@ protected void onPostExecute(Integer status) { dialog.dismiss(); } if (status != 0) { - showDialog("Error", "Fail to predict image, GraphRuntime exception"); + showDialog("Error", "Fail to predict image, GraphExecutor exception"); } } } @@ -351,8 +351,8 @@ protected void onPostExecute(Integer status) { @Override protected void onDestroy() { // release tvm local variables - if (null != graphRuntimeModule) - graphRuntimeModule.release(); + if (null != graphExecutorModule) + graphExecutorModule.release(); super.onDestroy(); } diff --git a/apps/android_deploy/app/src/main/jni/Application.mk b/apps/android_deploy/app/src/main/jni/Application.mk index a50a40bf5cd1..220c6af3bc7e 100644 --- a/apps/android_deploy/app/src/main/jni/Application.mk +++ b/apps/android_deploy/app/src/main/jni/Application.mk @@ -27,7 +27,7 @@ include $(config) APP_STL := c++_static -APP_CPPFLAGS += -DDMLC_LOG_STACK_TRACE=0 -DTVM4J_ANDROID=1 -std=c++14 -Oz -frtti +APP_CPPFLAGS += -DTVM_LOG_STACK_TRACE=0 -DTVM4J_ANDROID=1 -std=c++14 -Oz -frtti ifeq ($(USE_OPENCL), 1) APP_CPPFLAGS += -DTVM_OPENCL_RUNTIME=1 endif diff --git a/apps/android_deploy/app/src/main/jni/tvm_runtime.h b/apps/android_deploy/app/src/main/jni/tvm_runtime.h index 362d278c38c4..725b5e1d3b7a 100644 --- a/apps/android_deploy/app/src/main/jni/tvm_runtime.h +++ b/apps/android_deploy/app/src/main/jni/tvm_runtime.h @@ -25,12 +25,16 @@ #include +#define DMLC_USE_LOGGING_LIBRARY +#define TVM_USE_LIBBACKTRACE 0 + #include "../src/runtime/c_runtime_api.cc" #include "../src/runtime/cpu_device_api.cc" #include "../src/runtime/dso_library.cc" #include "../src/runtime/file_utils.cc" -#include "../src/runtime/graph/graph_runtime.cc" +#include "../src/runtime/graph_executor/graph_executor.cc" #include "../src/runtime/library_module.cc" +#include "../src/runtime/logging.cc" #include "../src/runtime/module.cc" #include "../src/runtime/ndarray.cc" #include "../src/runtime/object.cc" diff --git a/apps/android_rpc/app/src/main/jni/Application.mk b/apps/android_rpc/app/src/main/jni/Application.mk index 5f885f1c6f14..e3078906ff55 100644 --- a/apps/android_rpc/app/src/main/jni/Application.mk +++ b/apps/android_rpc/app/src/main/jni/Application.mk @@ -31,7 +31,7 @@ include $(config) APP_ABI ?= armeabi-v7a arm64-v8a x86 x86_64 mips APP_STL := c++_shared -APP_CPPFLAGS += -DDMLC_LOG_STACK_TRACE=0 -DTVM4J_ANDROID=1 -std=c++14 -Oz -frtti +APP_CPPFLAGS += -DTVM_LOG_STACK_TRACE=0 -DTVM4J_ANDROID=1 -std=c++14 -Oz -frtti ifeq ($(USE_OPENCL), 1) APP_CPPFLAGS += -DTVM_OPENCL_RUNTIME=1 endif diff --git a/apps/android_rpc/app/src/main/jni/tvm_runtime.h b/apps/android_rpc/app/src/main/jni/tvm_runtime.h index 2005568c608c..5dcd823929ca 100644 --- a/apps/android_rpc/app/src/main/jni/tvm_runtime.h +++ b/apps/android_rpc/app/src/main/jni/tvm_runtime.h @@ -25,28 +25,26 @@ #include -/* Enable custom logging - this will cause TVM to pass every log message - * through CustomLogMessage instead of LogMessage. By enabling this, we must - * implement dmlc::CustomLogMessage::Log. 
We use this to pass TVM log - * messages to Android logcat. +#define DMLC_USE_LOGGING_LIBRARY +#define TVM_USE_LIBBACKTRACE 0 +/* Enable custom logging - this will cause TVM to use a custom implementation + * of tvm::runtime::detail::LogMessage. We use this to pass TVM log messages to + * Android logcat. */ -#define DMLC_LOG_CUSTOMIZE 1 - -/* Ensure that fatal errors are passed to the logger before throwing - * in LogMessageFatal - */ -#define DMLC_LOG_BEFORE_THROW 1 +#define TVM_LOG_CUSTOMIZE 1 #include "../src/runtime/c_runtime_api.cc" #include "../src/runtime/cpu_device_api.cc" #include "../src/runtime/dso_library.cc" #include "../src/runtime/file_utils.cc" -#include "../src/runtime/graph/graph_runtime.cc" -#include "../src/runtime/graph/graph_runtime_factory.cc" +#include "../src/runtime/graph_executor/graph_executor.cc" +#include "../src/runtime/graph_executor/graph_executor_factory.cc" #include "../src/runtime/library_module.cc" +#include "../src/runtime/logging.cc" #include "../src/runtime/module.cc" #include "../src/runtime/ndarray.cc" #include "../src/runtime/object.cc" +#include "../src/runtime/profiling.cc" #include "../src/runtime/registry.cc" #include "../src/runtime/rpc/rpc_channel.cc" #include "../src/runtime/rpc/rpc_endpoint.cc" @@ -80,8 +78,20 @@ #include -void dmlc::CustomLogMessage::Log(const std::string& msg) { - // This is called for every message logged by TVM. - // We pass the message to logcat. - __android_log_write(ANDROID_LOG_DEBUG, "TVM_RUNTIME", msg.c_str()); +namespace tvm { +namespace runtime { +namespace detail { +// Override logging mechanism +void LogFatalImpl(const std::string& file, int lineno, const std::string& message) { + std::string m = file + ":" + std::to_string(lineno) + ": " + message; + __android_log_write(ANDROID_LOG_DEBUG, "TVM_RUNTIME", m.c_str()); + throw InternalError(file, lineno, message); } +void LogMessageImpl(const std::string& file, int lineno, const std::string& message) { + std::string m = file + ":" + std::to_string(lineno) + ": " + message; + __android_log_write(ANDROID_LOG_DEBUG, "TVM_RUNTIME", m.c_str()); +} + +} // namespace detail +} // namespace runtime +} // namespace tvm diff --git a/apps/android_rpc/tests/android_rpc_test.py b/apps/android_rpc/tests/android_rpc_test.py index 9586bffeca0b..0c0f429b516f 100644 --- a/apps/android_rpc/tests/android_rpc_test.py +++ b/apps/android_rpc/tests/android_rpc_test.py @@ -68,12 +68,12 @@ def test_rpc_module(): # Execute the portable graph on cpu target print("Run CPU test ...") - ctx = remote.cpu(0) + dev = remote.cpu(0) remote.upload(path_dso_cpu) f2 = remote.load_module("cpu_lib.so") - a = tvm.nd.array(a_np, ctx) - b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx) - time_f = f2.time_evaluator(f2.entry_name, ctx, number=10) + a = tvm.nd.array(a_np, dev) + b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), dev) + time_f = f2.time_evaluator(f2.entry_name, dev, number=10) cost = time_f(a, b).mean print("%g secs/op\n" % cost) np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1) @@ -91,12 +91,12 @@ def test_rpc_module(): f.export_library(path_dso_cl, ndk.create_shared) print("Run GPU(OpenCL Flavor) test ...") - ctx = remote.cl(0) + dev = remote.cl(0) remote.upload(path_dso_cl) f1 = remote.load_module("dev_lib_cl.so") - a = tvm.nd.array(a_np, ctx) - b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx) - time_f = f1.time_evaluator(f1.entry_name, ctx, number=10) + a = tvm.nd.array(a_np, dev) + b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), dev) + time_f = f1.time_evaluator(f1.entry_name, 
dev, number=10) cost = time_f(a, b).mean print("%g secs/op\n" % cost) np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1) @@ -114,12 +114,12 @@ def test_rpc_module(): f.export_library(path_dso_vulkan, ndk.create_shared) print("Run GPU(Vulkan Flavor) test ...") - ctx = remote.vulkan(0) + dev = remote.vulkan(0) remote.upload(path_dso_vulkan) f1 = remote.load_module("dev_lib_vulkan.so") - a = tvm.nd.array(a_np, ctx) - b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx) - time_f = f1.time_evaluator(f1.entry_name, ctx, number=10) + a = tvm.nd.array(a_np, dev) + b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), dev) + time_f = f1.time_evaluator(f1.entry_name, dev, number=10) cost = time_f(a, b).mean print("%g secs/op\n" % cost) np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1) diff --git a/apps/benchmark/arm_cpu_imagenet_bench.py b/apps/benchmark/arm_cpu_imagenet_bench.py index e7233370e6d6..656735ec6c05 100644 --- a/apps/benchmark/arm_cpu_imagenet_bench.py +++ b/apps/benchmark/arm_cpu_imagenet_bench.py @@ -24,7 +24,7 @@ import tvm from tvm import te from tvm.contrib.utils import tempdir -import tvm.contrib.graph_runtime as runtime +import tvm.contrib.graph_executor as runtime from tvm import relay from util import get_network, print_progress @@ -54,17 +54,17 @@ def evaluate_network(network, target, target_host, repeat): # upload library and params print_progress("%-20s uploading..." % network) - ctx = remote.context(str(target), 0) + dev = remote.device(str(target), 0) remote.upload(tmp.relpath(filename)) rlib = remote.load_module(filename) - module = runtime.GraphModule(rlib["default"](ctx)) + module = runtime.GraphModule(rlib["default"](dev)) data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype)) module.set_input("data", data_tvm) # evaluate print_progress("%-20s evaluating..." 
% network) - ftimer = module.module.time_evaluator("run", ctx, number=1, repeat=repeat) + ftimer = module.module.time_evaluator("run", dev, number=1, repeat=repeat) prof_res = np.array(ftimer().results) * 1000 # multiply 1000 for converting to millisecond print( "%-20s %-19s (%s)" % (network, "%.2f ms" % np.mean(prof_res), "%.2f ms" % np.std(prof_res)) diff --git a/apps/benchmark/gpu_imagenet_bench.py b/apps/benchmark/gpu_imagenet_bench.py index b78476f98dc2..6407f766cb76 100644 --- a/apps/benchmark/gpu_imagenet_bench.py +++ b/apps/benchmark/gpu_imagenet_bench.py @@ -24,7 +24,7 @@ import tvm from tvm import te -import tvm.contrib.graph_runtime as runtime +import tvm.contrib.graph_executor as runtime from tvm import relay from util import get_network @@ -37,13 +37,13 @@ def benchmark(network, target): lib = relay.build(net, target=target, params=params) # create runtime - ctx = tvm.context(str(target), 0) - module = runtime.GraphModule(lib["default"](ctx)) + dev = tvm.device(str(target), 0) + module = runtime.GraphModule(lib["default"](dev)) data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype)) module.set_input("data", data_tvm) # evaluate - ftimer = module.module.time_evaluator("run", ctx, number=1, repeat=args.repeat) + ftimer = module.module.time_evaluator("run", dev, number=1, repeat=args.repeat) prof_res = np.array(ftimer().results) * 1000 # multiply 1000 for converting to millisecond print( "%-20s %-19s (%s)" % (network, "%.2f ms" % np.mean(prof_res), "%.2f ms" % np.std(prof_res)) diff --git a/apps/benchmark/mobile_gpu_imagenet_bench.py b/apps/benchmark/mobile_gpu_imagenet_bench.py index cf78c66141d0..4eff259875ca 100644 --- a/apps/benchmark/mobile_gpu_imagenet_bench.py +++ b/apps/benchmark/mobile_gpu_imagenet_bench.py @@ -24,7 +24,7 @@ import tvm from tvm import te from tvm.contrib.utils import tempdir -import tvm.contrib.graph_runtime as runtime +import tvm.contrib.graph_executor as runtime from tvm import relay from util import get_network, print_progress @@ -54,17 +54,17 @@ def evaluate_network(network, target, target_host, dtype, repeat): # upload library and params print_progress("%-20s uploading..." % network) - ctx = remote.context(str(target), 0) + dev = remote.device(str(target), 0) remote.upload(tmp.relpath(filename)) rlib = remote.load_module(filename) - module = runtime.GraphModule(rlib["default"](ctx)) + module = runtime.GraphModule(rlib["default"](dev)) data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype)) module.set_input("data", data_tvm) # evaluate print_progress("%-20s evaluating..." 
% network)
- ftimer = module.module.time_evaluator("run", ctx, number=1, repeat=repeat)
+ ftimer = module.module.time_evaluator("run", dev, number=1, repeat=repeat)
prof_res = np.array(ftimer().results) * 1000 # multiply 1000 for converting to millisecond
print(
"%-20s %-19s (%s)" % (network, "%.2f ms" % np.mean(prof_res), "%.2f ms" % np.std(prof_res))
diff --git a/apps/bundle_deploy/Makefile b/apps/bundle_deploy/Makefile
index 38d9d3456d55..b533030c4b82 100644
--- a/apps/bundle_deploy/Makefile
+++ b/apps/bundle_deploy/Makefile
@@ -32,12 +32,14 @@ PKG_CXXFLAGS = ${PKG_COMPILE_OPTS} -std=c++14 \
-I${TVM_ROOT}/include \
-I${DMLC_CORE}/include \
-I${TVM_ROOT}/3rdparty/dlpack/include \
- -Icrt_config
+ -Icrt_config \
+ -DDMLC_USE_LOGGING_LIBRARY=\<tvm/runtime/logging.h\>
PKG_CFLAGS = ${PKG_COMPILE_OPTS} \
-I${TVM_ROOT}/include \
-I${DMLC_CORE}/include \
-I${TVM_ROOT}/3rdparty/dlpack/include \
- -Icrt_config
+ -Icrt_config \
+ -DDMLC_USE_LOGGING_LIBRARY=\<tvm/runtime/logging.h\>
PKG_LDFLAGS = -pthread -lm
@@ -82,8 +84,8 @@ test_static: $(build_dir)/test_static $(build_dir)/test_data_c.bin $(build_dir)/
$(build_dir)/crt/libcommon.a: $(CRT_SRCS)
$(QUIET)cd $(CRT_ROOT) && make QUIET= BUILD_DIR=$(abspath $(build_dir))/crt CRT_CONFIG=$(abspath crt_config/crt_config.h) "EXTRA_CFLAGS=$(PKG_COMPILE_OPTS)" common
-$(build_dir)/crt/libgraph_runtime.a: $(CRT_SRCS)
- $(QUIET)cd $(CRT_ROOT) && make QUIET= BUILD_DIR=$(abspath $(build_dir))/crt CRT_CONFIG=$(abspath crt_config/crt_config.h) "EXTRA_CFLAGS=$(PKG_COMPILE_OPTS)" graph_runtime
+$(build_dir)/crt/libgraph_executor.a: $(CRT_SRCS)
+ $(QUIET)cd $(CRT_ROOT) && make QUIET= BUILD_DIR=$(abspath $(build_dir))/crt CRT_CONFIG=$(abspath crt_config/crt_config.h) "EXTRA_CFLAGS=$(PKG_COMPILE_OPTS)" graph_executor
$(build_dir)/crt/libmemory.a: $(CRT_SRCS)
$(QUIET)cd $(CRT_ROOT) && make QUIET= BUILD_DIR=$(abspath $(build_dir))/crt CRT_CONFIG=$(abspath crt_config/crt_config.h) "EXTRA_CFLAGS=$(PKG_COMPILE_OPTS)" memory
@@ -96,11 +98,11 @@ $(build_dir)/test_dynamic: test.cc ${build_dir}/test_graph_c.json ${build_dir}/t
$(QUIET)mkdir -p $(@D)
$(QUIET)g++ $(PKG_CXXFLAGS) -o $@ test.cc $(BACKTRACE_OBJS) $(BACKTRACE_LDFLAGS)
-$(build_dir)/demo_static: demo_static.c ${build_dir}/bundle_static.o $(MODEL_OBJ) ${build_dir}/crt/libmemory.a ${build_dir}/crt/libgraph_runtime.a
${build_dir}/crt/libcommon.a $(BACKTRACE_OBJS) +$(build_dir)/bundle_c.so: bundle.c $(MODEL_OBJ) ${build_dir}/crt/libmemory.a ${build_dir}/crt/libgraph_executor.a ${build_dir}/crt/libcommon.a $(BACKTRACE_OBJS) $(QUIET)mkdir -p $(@D) $(QUIET)gcc -shared $(PKG_CFLAGS) -fvisibility=hidden -o $@ $^ $(PKG_LDFLAGS) $(BACKTRACE_LDFLAGS) $(BACKTRACE_CFLAGS) @@ -146,7 +148,7 @@ $(build_dir)/test_bundle.so: bundle.cc runtime.cc $(build_dir)/test_model_cpp.o $(QUIET)mkdir -p $(@D) $(QUIET)g++ -shared $(PKG_CXXFLAGS) -fvisibility=hidden -o $@ $^ $(PKG_LDFLAGS) -$(build_dir)/test_bundle_c.so: bundle.c $(TEST_MODEL_OBJ) ${build_dir}/crt/libmemory.a ${build_dir}/crt/libgraph_runtime.a ${build_dir}/crt/libcommon.a $(BACKTRACE_OBJS) +$(build_dir)/test_bundle_c.so: bundle.c $(TEST_MODEL_OBJ) ${build_dir}/crt/libmemory.a ${build_dir}/crt/libgraph_executor.a ${build_dir}/crt/libcommon.a $(BACKTRACE_OBJS) $(QUIET)mkdir -p $(@D) $(QUIET)gcc -shared $(PKG_CFLAGS) -fvisibility=hidden -o $@ $^ $(PKG_LDFLAGS) $(BACKTRACE_LDFLAGS) $(BACKTRACE_CFLAGS) diff --git a/apps/bundle_deploy/README.md b/apps/bundle_deploy/README.md index a52d3a78f9c9..619a2d7d05cc 100644 --- a/apps/bundle_deploy/README.md +++ b/apps/bundle_deploy/README.md @@ -20,9 +20,9 @@ How to Bundle TVM Modules ========================= This folder contains an example on how to bundle a TVM module (with the required -interpreter runtime modules such as `runtime::GraphRuntime`, the graph JSON, and +interpreter runtime modules such as `runtime::GraphExecutor`, the graph JSON, and the params) into a single, self-contained shared object (`bundle.so`) which -exposes a C API wrapping the appropriate `runtime::GraphRuntime` instance. +exposes a C API wrapping the appropriate `runtime::GraphExecutor` instance. This is useful for cases where we'd like to avoid deploying the TVM runtime components to the target host in advance - instead, we simply deploy the bundled @@ -49,8 +49,8 @@ This will: - Build a `bundle.so` shared object containing the model specification and parameters - Build a `demo_dynamic` executable that `dlopen`'s `bundle.so` (or `bundle_c.so` in - terms of the MISRA-C runtime), instantiates the contained graph runtime, - and invokes the `GraphRuntime::Run` function on a cat image, then prints + terms of the MISRA-C runtime), instantiates the contained graph executor, + and invokes the `GraphExecutor::Run` function on a cat image, then prints the output results. Type the following command to run the sample code with static linking. 
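For orientation, a hypothetical ctypes driver for the bundled object: the tvm_runtime_* entry points mirror bundle.c below, but the artifact paths and the exact argument order of tvm_runtime_create (graph JSON, params blob, params size, device type, device id) are assumptions of this sketch:

```python
import ctypes

# Load the bundled object and declare the opaque handle returned by create.
bundle = ctypes.CDLL("./build/bundle_c.so")
bundle.tvm_runtime_create.restype = ctypes.c_void_p

graph_json = open("build/graph_c.json", "rb").read()
params = open("build/params_c.bin", "rb").read()

kDLCPU = 1  # DLDeviceType value for a CPU device
handle = ctypes.c_void_p(
    bundle.tvm_runtime_create(graph_json, params,
                              ctypes.c_uint64(len(params)), kDLCPU, 0)
)

# tvm_runtime_set_input/get_output take DLTensor*; building a DLTensor with
# a ctypes.Structure is elided here for brevity.
bundle.tvm_runtime_run(handle)
bundle.tvm_runtime_destroy(handle)
```

The static demo (demo_static.c) links the same entry points directly rather than loading them with dlopen.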
diff --git a/apps/bundle_deploy/bundle.c b/apps/bundle_deploy/bundle.c index 098ac994223e..9083f7b5f48b 100644 --- a/apps/bundle_deploy/bundle.c +++ b/apps/bundle_deploy/bundle.c @@ -22,7 +22,7 @@ #include #include #include -#include +#include #include #include @@ -59,9 +59,9 @@ TVM_DLL void* tvm_runtime_create(const char* json_data, const char* params_data, params.data = params_data; params.size = params_size; - TVMContext ctx; - ctx.device_type = (DLDeviceType)device_type; - ctx.device_id = device_id; + DLDevice dev; + dev.device_type = (DLDeviceType)device_type; + dev.device_id = device_id; // declare pointers TVM_CCALL(MemoryManagerCreate(&g_memory_manager, g_crt_memory, sizeof(g_crt_memory), @@ -75,30 +75,30 @@ TVM_DLL void* tvm_runtime_create(const char* json_data, const char* params_data, TVMModuleHandle mod_syslib = TVMArgs_AsModuleHandle(&pf.ret_value, 0); // run modules - TVMGraphRuntime* graph_runtime = NULL; - TVM_CCALL(TVMGraphRuntime_Create(json_data, mod_syslib, &ctx, &graph_runtime)); - TVM_CCALL(TVMGraphRuntime_LoadParams(graph_runtime, params.data, params.size)); + TVMGraphExecutor* graph_executor = NULL; + TVM_CCALL(TVMGraphExecutor_Create(json_data, mod_syslib, &dev, &graph_executor)); + TVM_CCALL(TVMGraphExecutor_LoadParams(graph_executor, params.data, params.size)); - return graph_runtime; + return graph_executor; } -TVM_DLL void tvm_runtime_destroy(void* runtime) { - TVMGraphRuntime_Release((TVMGraphRuntime**)&runtime); +TVM_DLL void tvm_runtime_destroy(void* executor) { + TVMGraphExecutor_Release((TVMGraphExecutor**)&executor); } -TVM_DLL void tvm_runtime_set_input(void* runtime, const char* name, DLTensor* tensor) { - TVMGraphRuntime* graph_runtime = (TVMGraphRuntime*)runtime; - TVMGraphRuntime_SetInput(graph_runtime, name, tensor); +TVM_DLL void tvm_runtime_set_input(void* executor, const char* name, DLTensor* tensor) { + TVMGraphExecutor* graph_executor = (TVMGraphExecutor*)executor; + TVMGraphExecutor_SetInput(graph_executor, name, tensor); } -TVM_DLL void tvm_runtime_run(void* runtime) { - TVMGraphRuntime* graph_runtime = (TVMGraphRuntime*)runtime; - TVMGraphRuntime_Run(graph_runtime); +TVM_DLL void tvm_runtime_run(void* executor) { + TVMGraphExecutor* graph_executor = (TVMGraphExecutor*)executor; + TVMGraphExecutor_Run(graph_executor); } -TVM_DLL void tvm_runtime_get_output(void* runtime, int32_t index, DLTensor* tensor) { - TVMGraphRuntime* graph_runtime = (TVMGraphRuntime*)runtime; - TVMGraphRuntime_GetOutput(graph_runtime, index, tensor); +TVM_DLL void tvm_runtime_get_output(void* executor, int32_t index, DLTensor* tensor) { + TVMGraphExecutor* graph_executor = (TVMGraphExecutor*)executor; + TVMGraphExecutor_GetOutput(graph_executor, index, tensor); } void TVMLogf(const char* msg, ...) 
{ @@ -116,12 +116,12 @@ void __attribute__((noreturn)) TVMPlatformAbort(tvm_crt_error_t error_code) { exit(-1); } -tvm_crt_error_t TVMPlatformMemoryAllocate(size_t num_bytes, DLContext ctx, void** out_ptr) { - return g_memory_manager->Allocate(g_memory_manager, num_bytes, ctx, out_ptr); +tvm_crt_error_t TVMPlatformMemoryAllocate(size_t num_bytes, DLDevice dev, void** out_ptr) { + return g_memory_manager->Allocate(g_memory_manager, num_bytes, dev, out_ptr); } -tvm_crt_error_t TVMPlatformMemoryFree(void* ptr, DLContext ctx) { - return g_memory_manager->Free(g_memory_manager, ptr, ctx); +tvm_crt_error_t TVMPlatformMemoryFree(void* ptr, DLDevice dev) { + return g_memory_manager->Free(g_memory_manager, ptr, dev); } tvm_crt_error_t TVMPlatformTimerStart() { return kTvmErrorFunctionCallNotImplemented; } diff --git a/apps/bundle_deploy/bundle.cc b/apps/bundle_deploy/bundle.cc index e3cc7d1730ce..435d0e41f3db 100644 --- a/apps/bundle_deploy/bundle.cc +++ b/apps/bundle_deploy/bundle.cc @@ -35,7 +35,7 @@ TVM_BUNDLE_FUNCTION void* tvm_runtime_create(const char* build_graph_json, int device_type = kDLCPU; int device_id = 0; - tvm::runtime::Module mod = (*tvm::runtime::Registry::Get("tvm.graph_runtime.create"))( + tvm::runtime::Module mod = (*tvm::runtime::Registry::Get("tvm.graph_executor.create"))( json_data, mod_syslib, device_type, device_id); TVMByteArray params; params.data = reinterpret_cast<const char*>(&build_params_bin[0]); diff --git a/apps/bundle_deploy/bundle_static.c b/apps/bundle_deploy/bundle_static.c index c4b637c3fc7b..62e63d6b4fe2 100644 --- a/apps/bundle_deploy/bundle_static.c +++ b/apps/bundle_deploy/bundle_static.c @@ -21,7 +21,7 @@ #include #include #include -#include <tvm/runtime/crt/graph_runtime.h> +#include <tvm/runtime/crt/graph_executor.h> #include #include #include @@ -59,9 +59,9 @@ TVM_DLL void* tvm_runtime_create(const char* json_data, const char* params_data, params.data = params_data; params.size = params_size; - TVMContext ctx; - ctx.device_type = (DLDeviceType)device_type; - ctx.device_id = device_id; + DLDevice dev; + dev.device_type = (DLDeviceType)device_type; + dev.device_id = device_id; // get pointers TVM_CCALL(MemoryManagerCreate(&g_memory_manager, g_crt_memory, sizeof(g_crt_memory), @@ -75,31 +75,31 @@ TVM_DLL void* tvm_runtime_create(const char* json_data, const char* params_data, TVMModuleHandle mod_syslib = TVMArgs_AsModuleHandle(&pf.ret_value, 0); // run modules - TVMGraphRuntime* graph_runtime = NULL; - TVM_CCALL(TVMGraphRuntime_Create(json_data, mod_syslib, &ctx, &graph_runtime)); - TVM_CCALL(TVMGraphRuntime_LoadParams(graph_runtime, params.data, params.size)); + TVMGraphExecutor* graph_executor = NULL; + TVM_CCALL(TVMGraphExecutor_Create(json_data, mod_syslib, &dev, &graph_executor)); + TVM_CCALL(TVMGraphExecutor_LoadParams(graph_executor, params.data, params.size)); - return graph_runtime; + return graph_executor; } -TVM_DLL void tvm_runtime_destroy(void* runtime) { - TVMGraphRuntime* graph_runtime = (TVMGraphRuntime*)runtime; - TVMGraphRuntime_Release(&graph_runtime); +TVM_DLL void tvm_runtime_destroy(void* executor) { + TVMGraphExecutor* graph_executor = (TVMGraphExecutor*)executor; + TVMGraphExecutor_Release(&graph_executor); } -TVM_DLL void tvm_runtime_set_input(void* runtime, const char* name, DLTensor* tensor) { - TVMGraphRuntime* graph_runtime = (TVMGraphRuntime*)runtime; - TVMGraphRuntime_SetInput(graph_runtime, name, tensor); +TVM_DLL void tvm_runtime_set_input(void* executor, const char* name, DLTensor* tensor) { + TVMGraphExecutor* graph_executor = (TVMGraphExecutor*)executor; + TVMGraphExecutor_SetInput(graph_executor,
name, tensor); } -TVM_DLL void tvm_runtime_run(void* runtime) { - TVMGraphRuntime* graph_runtime = (TVMGraphRuntime*)runtime; - TVMGraphRuntime_Run(graph_runtime); +TVM_DLL void tvm_runtime_run(void* executor) { + TVMGraphExecutor* graph_executor = (TVMGraphExecutor*)executor; + TVMGraphExecutor_Run(graph_executor); } -TVM_DLL void tvm_runtime_get_output(void* runtime, int32_t index, DLTensor* tensor) { - TVMGraphRuntime* graph_runtime = (TVMGraphRuntime*)runtime; - TVMGraphRuntime_GetOutput(graph_runtime, index, tensor); +TVM_DLL void tvm_runtime_get_output(void* executor, int32_t index, DLTensor* tensor) { + TVMGraphExecutor* graph_executor = (TVMGraphExecutor*)executor; + TVMGraphExecutor_GetOutput(graph_executor, index, tensor); } void TVMLogf(const char* msg, ...) { @@ -117,12 +117,12 @@ void __attribute__((noreturn)) TVMPlatformAbort(tvm_crt_error_t error_code) { exit(-1); } -tvm_crt_error_t TVMPlatformMemoryAllocate(size_t num_bytes, DLContext ctx, void** out_ptr) { - return g_memory_manager->Allocate(g_memory_manager, num_bytes, ctx, out_ptr); +tvm_crt_error_t TVMPlatformMemoryAllocate(size_t num_bytes, DLDevice dev, void** out_ptr) { + return g_memory_manager->Allocate(g_memory_manager, num_bytes, dev, out_ptr); } -tvm_crt_error_t TVMPlatformMemoryFree(void* ptr, DLContext ctx) { - return g_memory_manager->Free(g_memory_manager, ptr, ctx); +tvm_crt_error_t TVMPlatformMemoryFree(void* ptr, DLDevice dev) { + return g_memory_manager->Free(g_memory_manager, ptr, dev); } tvm_crt_error_t TVMPlatformTimerStart() { return kTvmErrorFunctionCallNotImplemented; } diff --git a/apps/bundle_deploy/demo.cc b/apps/bundle_deploy/demo.cc index 98f29d283353..01107c70c353 100644 --- a/apps/bundle_deploy/demo.cc +++ b/apps/bundle_deploy/demo.cc @@ -119,7 +119,7 @@ int main(int argc, char** argv) { std::vector input_shape = {1, 3, 224, 224}; DLTensor input; input.data = input_storage; - input.ctx = DLContext{kDLCPU, 0}; + input.device = DLDevice{kDLCPU, 0}; input.ndim = 4; input.dtype = DLDataType{kDLFloat, 32, 1}; input.shape = input_shape.data(); @@ -138,7 +138,7 @@ int main(int argc, char** argv) { std::vector output_shape = {1, 1000}; DLTensor output; output.data = output_storage; - output.ctx = DLContext{kDLCPU, 0}; + output.device = DLDevice{kDLCPU, 0}; output.ndim = 2; output.dtype = DLDataType{kDLFloat, 32, 1}; output.shape = output_shape.data(); diff --git a/apps/bundle_deploy/demo_static.c b/apps/bundle_deploy/demo_static.c index b25ad90a2388..a2fec6937311 100644 --- a/apps/bundle_deploy/demo_static.c +++ b/apps/bundle_deploy/demo_static.c @@ -54,8 +54,8 @@ int main(int argc, char** argv) { DLTensor input; input.data = input_storage; - DLContext ctx = {kDLCPU, 0}; - input.ctx = ctx; + DLDevice dev = {kDLCPU, 0}; + input.device = dev; input.ndim = 4; DLDataType dtype = {kDLFloat, 32, 1}; input.dtype = dtype; @@ -73,8 +73,8 @@ int main(int argc, char** argv) { float output_storage[OUTPUT_LEN]; DLTensor output; output.data = output_storage; - DLContext out_ctx = {kDLCPU, 0}; - output.ctx = out_ctx; + DLDevice out_dev = {kDLCPU, 0}; + output.device = out_dev; output.ndim = 2; DLDataType out_dtype = {kDLFloat, 32, 1}; output.dtype = out_dtype; diff --git a/apps/bundle_deploy/runtime.cc b/apps/bundle_deploy/runtime.cc index 2f7e3848b4bf..7a2573b643f5 100644 --- a/apps/bundle_deploy/runtime.cc +++ b/apps/bundle_deploy/runtime.cc @@ -26,7 +26,7 @@ #include "../../src/runtime/container.cc" #include "../../src/runtime/cpu_device_api.cc" #include "../../src/runtime/file_utils.cc" -#include 
"../../src/runtime/graph/graph_runtime.cc" +#include "../../src/runtime/graph_executor/graph_executor.cc" #include "../../src/runtime/library_module.cc" #include "../../src/runtime/module.cc" #include "../../src/runtime/ndarray.cc" diff --git a/apps/bundle_deploy/test.cc b/apps/bundle_deploy/test.cc index c1a7f5d45377..270047611f57 100644 --- a/apps/bundle_deploy/test.cc +++ b/apps/bundle_deploy/test.cc @@ -114,7 +114,7 @@ int main(int argc, char** argv) { std::vector input_shape = {10, 5}; DLTensor input; input.data = input_storage; - input.ctx = DLContext{kDLCPU, 0}; + input.device = DLDevice{kDLCPU, 0}; input.ndim = 2; input.dtype = DLDataType{kDLFloat, 32, 1}; input.shape = input_shape.data(); @@ -133,7 +133,7 @@ int main(int argc, char** argv) { std::vector output_shape = {10, 5}; DLTensor output; output.data = output_storage; - output.ctx = DLContext{kDLCPU, 0}; + output.device = DLDevice{kDLCPU, 0}; output.ndim = 2; output.dtype = DLDataType{kDLFloat, 32, 1}; output.shape = output_shape.data(); diff --git a/apps/bundle_deploy/test_static.c b/apps/bundle_deploy/test_static.c index 11ca2c44952e..b9c980843ea1 100644 --- a/apps/bundle_deploy/test_static.c +++ b/apps/bundle_deploy/test_static.c @@ -66,8 +66,8 @@ int main(int argc, char** argv) { DLTensor input; input.data = input_storage; - DLContext ctx = {kDLCPU, 0}; - input.ctx = ctx; + DLDevice dev = {kDLCPU, 0}; + input.device = dev; input.ndim = 2; DLDataType dtype = {kDLFloat, 32, 1}; input.dtype = dtype; @@ -85,8 +85,8 @@ int main(int argc, char** argv) { float output_storage[10 * 5]; DLTensor output; output.data = output_storage; - DLContext out_ctx = {kDLCPU, 0}; - output.ctx = out_ctx; + DLDevice out_dev = {kDLCPU, 0}; + output.device = out_dev; output.ndim = 2; DLDataType out_dtype = {kDLFloat, 32, 1}; output.dtype = out_dtype; diff --git a/apps/cpp_rpc/main.cc b/apps/cpp_rpc/main.cc index e381dd2b261b..0663c378819e 100644 --- a/apps/cpp_rpc/main.cc +++ b/apps/cpp_rpc/main.cc @@ -55,6 +55,7 @@ static const string kUsage = "--tracker - The RPC tracker address in host:port format e.g. 10.1.1.2:9190 Default=\"\"\n" "--key - The key used to identify the device type in tracker. Default=\"\"\n" "--custom-addr - Custom IP Address to Report to RPC Tracker. Default=\"\"\n" + "--work-dir - Custom work directory. Default=\"\"\n" "--silent - Whether to run in silent mode. Default=False\n" "\n" " Example\n" @@ -70,6 +71,7 @@ static const string kUsage = * \arg tracker The address of RPC tracker in host:port format e.g. 10.77.1.234:9190 Default="" * \arg key The key used to identify the device type in tracker. Default="" * \arg custom_addr Custom IP Address to Report to RPC Tracker. Default="" + * \arg work_dir Custom work directory. Default="" * \arg silent Whether run in silent mode. Default=False */ struct RpcServerArgs { @@ -79,6 +81,7 @@ struct RpcServerArgs { string tracker; string key; string custom_addr; + string work_dir; bool silent = false; #if defined(WIN32) std::string mmap_path; @@ -96,6 +99,7 @@ void PrintArgs(const RpcServerArgs& args) { LOG(INFO) << "tracker = " << args.tracker; LOG(INFO) << "key = " << args.key; LOG(INFO) << "custom_addr = " << args.custom_addr; + LOG(INFO) << "work_dir = " << args.work_dir; LOG(INFO) << "silent = " << ((args.silent) ? 
("True") : ("False")); } @@ -238,6 +242,10 @@ void ParseCmdArgs(int argc, char* argv[], struct RpcServerArgs& args) { dmlc::InitLogging("--minloglevel=0"); } #endif + const string work_dir = GetCmdOption(argc, argv, "--work-dir="); + if (!work_dir.empty()) { + args.work_dir = work_dir; + } } /*! @@ -274,7 +282,7 @@ int RpcServer(int argc, char* argv[]) { #endif RPCServerCreate(args.host, args.port, args.port_end, args.tracker, args.key, args.custom_addr, - args.silent); + args.work_dir, args.silent); return 0; } diff --git a/apps/cpp_rpc/rpc_env.cc b/apps/cpp_rpc/rpc_env.cc index ea19cfa3979d..5f703e1dc2b0 100644 --- a/apps/cpp_rpc/rpc_env.cc +++ b/apps/cpp_rpc/rpc_env.cc @@ -39,7 +39,6 @@ int mkdir(const char* path, int /* ignored */) { return _mkdir(path); } #include #include #include - #include "../../src/support/utils.h" #include "rpc_env.h" @@ -85,25 +84,31 @@ void CleanDir(const std::string& dirname); */ std::string BuildSharedLibrary(std::string file_in); -RPCEnv::RPCEnv() { +RPCEnv::RPCEnv(const std::string& wd) { + if (wd != "") { + base_ = wd + "/.cache"; + mkdir(wd.c_str(), 0777); + mkdir(base_.c_str(), 0777); + } else { #if defined(ANDROID) || defined(__ANDROID__) - char cwd[PATH_MAX]; - auto cmdline = fopen("/proc/self/cmdline", "r"); - fread(cwd, 1, sizeof(cwd), cmdline); - fclose(cmdline); - base_ = "/data/data/" + std::string(cwd) + "/cache/rpc"; + char cwd[PATH_MAX]; + auto cmdline = fopen("/proc/self/cmdline", "r"); + fread(cwd, 1, sizeof(cwd), cmdline); + fclose(cmdline); + base_ = "/data/data/" + std::string(cwd) + "/cache/rpc"; #elif !defined(_WIN32) - char cwd[PATH_MAX]; - if (getcwd(cwd, sizeof(cwd))) { - base_ = std::string(cwd) + "/rpc"; - } else { - base_ = "./rpc"; - } + char cwd[PATH_MAX]; + if (getcwd(cwd, sizeof(cwd))) { + base_ = std::string(cwd) + "/rpc"; + } else { + base_ = "./rpc"; + } #else - base_ = "./rpc"; + base_ = "./rpc"; #endif + mkdir(base_.c_str(), 0777); + } - mkdir(base_.c_str(), 0777); TVM_REGISTER_GLOBAL("tvm.rpc.server.workpath").set_body([this](TVMArgs args, TVMRetValue* rv) { *rv = this->GetPath(args[0]); }); diff --git a/apps/cpp_rpc/rpc_env.h b/apps/cpp_rpc/rpc_env.h index 50ef3835e015..dbb0a62d2c5d 100644 --- a/apps/cpp_rpc/rpc_env.h +++ b/apps/cpp_rpc/rpc_env.h @@ -39,7 +39,7 @@ struct RPCEnv { /*! * \brief Constructor Init The RPC Environment initialize function */ - RPCEnv(); + RPCEnv(const std::string& word_dir = ""); /*! * \brief GetPath To get the workpath from packed function * \param name The file name diff --git a/apps/cpp_rpc/rpc_server.cc b/apps/cpp_rpc/rpc_server.cc index a4028ff61eca..52b5da965b4c 100644 --- a/apps/cpp_rpc/rpc_server.cc +++ b/apps/cpp_rpc/rpc_server.cc @@ -98,14 +98,15 @@ class RPCServer { * \brief Constructor. */ RPCServer(std::string host, int port, int port_end, std::string tracker_addr, std::string key, - std::string custom_addr) + std::string custom_addr, std::string work_dir) : host_(std::move(host)), port_(port), my_port_(0), port_end_(port_end), tracker_addr_(std::move(tracker_addr)), key_(std::move(key)), - custom_addr_(std::move(custom_addr)) {} + custom_addr_(std::move(custom_addr)), + work_dir_(std::move(work_dir)) {} /*! * \brief Destructor. 
@@ -174,7 +175,7 @@ class RPCServer { const pid_t worker_pid = fork(); if (worker_pid == 0) { // Worker process - ServerLoopProc(conn, addr); + ServerLoopProc(conn, addr, work_dir_); _exit(0); } @@ -201,7 +202,7 @@ class RPCServer { } else { auto pid = fork(); if (pid == 0) { - ServerLoopProc(conn, addr); + ServerLoopProc(conn, addr, work_dir_); exit(0); } // Wait for the result @@ -308,9 +309,10 @@ class RPCServer { * \param sock The socket information * \param addr The socket address information */ - static void ServerLoopProc(support::TCPSocket sock, support::SockAddr addr) { + static void ServerLoopProc(support::TCPSocket sock, support::SockAddr addr, + std::string work_dir) { // Server loop - const auto env = RPCEnv(); + const auto env = RPCEnv(work_dir); RPCServerLoop(int(sock.sockfd)); LOG(INFO) << "Finish serving " << addr.AsString(); env.CleanUp(); @@ -339,6 +341,7 @@ class RPCServer { std::string tracker_addr_; std::string key_; std::string custom_addr_; + std::string work_dir_; support::TCPSocket listen_sock_; support::TCPSocket tracker_sock_; }; @@ -370,19 +373,19 @@ void ServerLoopFromChild(SOCKET socket) { * silent mode. Default=True */ void RPCServerCreate(std::string host, int port, int port_end, std::string tracker_addr, - std::string key, std::string custom_addr, bool silent) { + std::string key, std::string custom_addr, std::string work_dir, bool silent) { if (silent) { // Only errors and fatal is logged dmlc::InitLogging("--minloglevel=2"); } // Start the rpc server RPCServer rpc(std::move(host), port, port_end, std::move(tracker_addr), std::move(key), - std::move(custom_addr)); + std::move(custom_addr), std::move(work_dir)); rpc.Start(); } TVM_REGISTER_GLOBAL("rpc.ServerCreate").set_body([](TVMArgs args, TVMRetValue* rv) { - RPCServerCreate(args[0], args[1], args[2], args[3], args[4], args[5], args[6]); + RPCServerCreate(args[0], args[1], args[2], args[3], args[4], args[5], args[6], args[7]); }); } // namespace runtime } // namespace tvm diff --git a/apps/cpp_rpc/rpc_server.h b/apps/cpp_rpc/rpc_server.h index 7a4bda5d65c4..e4565d095b2e 100644 --- a/apps/cpp_rpc/rpc_server.h +++ b/apps/cpp_rpc/rpc_server.h @@ -48,11 +48,12 @@ void ServerLoopFromChild(SOCKET socket); * \param tracker The address of RPC tracker in host:port format e.g. 10.77.1.234:9190 Default="" * \param key The key used to identify the device type in tracker. Default="" * \param custom_addr Custom IP Address to Report to RPC Tracker. Default="" + * \param work_dir Custom work directory. Default="" * \param silent Whether run in silent mode. 
Default=True */ void RPCServerCreate(std::string host = "", int port = 9090, int port_end = 9099, std::string tracker_addr = "", std::string key = "", - std::string custom_addr = "", bool silent = true); + std::string custom_addr = "", std::string work_dir = "", bool silent = true); } // namespace runtime } // namespace tvm #endif // TVM_APPS_CPP_RPC_SERVER_H_ diff --git a/apps/dso_plugin_module/Makefile b/apps/dso_plugin_module/Makefile index c2ce3306870a..438d9db223a8 100644 --- a/apps/dso_plugin_module/Makefile +++ b/apps/dso_plugin_module/Makefile @@ -19,7 +19,8 @@ TVM_ROOT=$(shell cd ../..; pwd) PKG_CFLAGS = -std=c++14 -O2 -fPIC\ -I${TVM_ROOT}/include\ -I${TVM_ROOT}/3rdparty/dmlc-core/include\ - -I${TVM_ROOT}/3rdparty/dlpack/include + -I${TVM_ROOT}/3rdparty/dlpack/include\ + -DDMLC_USE_LOGGING_LIBRARY=\ PKG_LDFLAGS =-L${TVM_ROOT}/build UNAME_S := $(shell uname -s) diff --git a/apps/extension/Makefile b/apps/extension/Makefile index 91d914aba63b..6eba941f7c98 100644 --- a/apps/extension/Makefile +++ b/apps/extension/Makefile @@ -20,7 +20,8 @@ TVM_ROOT=$(shell cd ../..; pwd) PKG_CFLAGS = -std=c++14 -O2 -fPIC\ -I${TVM_ROOT}/include\ -I${TVM_ROOT}/3rdparty/dmlc-core/include\ - -I${TVM_ROOT}/3rdparty/dlpack/include + -I${TVM_ROOT}/3rdparty/dlpack/include\ + -DDMLC_USE_LOGGING_LIBRARY=\ PKG_LDFLAGS =-L${TVM_ROOT}/build diff --git a/apps/extension/tests/test_ext.py b/apps/extension/tests/test_ext.py index c73e820c19ad..a01f97c349ca 100644 --- a/apps/extension/tests/test_ext.py +++ b/apps/extension/tests/test_ext.py @@ -40,10 +40,10 @@ def check_llvm(): if not tvm.testing.device_enabled("llvm"): return f = tvm.build(s, [A, B], "ext_dev", "llvm") - ctx = tvm.ext_dev(0) + dev = tvm.ext_dev(0) # launch the kernel. - a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx) - b = tvm.nd.array(np.zeros(n, dtype=B.dtype), ctx) + a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev) + b = tvm.nd.array(np.zeros(n, dtype=B.dtype), dev) f(a, b) tvm.testing.assert_allclose(b.asnumpy(), a.asnumpy() + 1) @@ -87,10 +87,10 @@ def check_llvm(): if not tvm.testing.device_enabled("llvm"): return f = tvm.build(s, [A, B], "llvm") - ctx = tvm.cpu(0) + dev = tvm.cpu(0) # launch the kernel. 
- a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx) - b = tvm.nd.array(np.zeros(n, dtype=B.dtype), ctx) + a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev) + b = tvm.nd.array(np.zeros(n, dtype=B.dtype), dev) f(a, b) tvm.testing.assert_allclose(b.asnumpy(), a.asnumpy() + 1) diff --git a/apps/howto_deploy/cpp_deploy.cc b/apps/howto_deploy/cpp_deploy.cc index 829241d31a6d..8500ddb5fabe 100644 --- a/apps/howto_deploy/cpp_deploy.cc +++ b/apps/howto_deploy/cpp_deploy.cc @@ -83,20 +83,20 @@ void DeploySingleOp() { Verify(mod_syslib, "addonesys"); } -void DeployGraphRuntime() { - LOG(INFO) << "Running graph runtime..."; +void DeployGraphExecutor() { + LOG(INFO) << "Running graph executor..."; // load in the library - DLContext ctx{kDLCPU, 0}; + DLDevice dev{kDLCPU, 0}; tvm::runtime::Module mod_factory = tvm::runtime::Module::LoadFromFile("lib/test_relay_add.so"); - // create the graph runtime module - tvm::runtime::Module gmod = mod_factory.GetFunction("default")(ctx); + // create the graph executor module + tvm::runtime::Module gmod = mod_factory.GetFunction("default")(dev); tvm::runtime::PackedFunc set_input = gmod.GetFunction("set_input"); tvm::runtime::PackedFunc get_output = gmod.GetFunction("get_output"); tvm::runtime::PackedFunc run = gmod.GetFunction("run"); // Use the C++ API - tvm::runtime::NDArray x = tvm::runtime::NDArray::Empty({2, 2}, DLDataType{kDLFloat, 32, 1}, ctx); - tvm::runtime::NDArray y = tvm::runtime::NDArray::Empty({2, 2}, DLDataType{kDLFloat, 32, 1}, ctx); + tvm::runtime::NDArray x = tvm::runtime::NDArray::Empty({2, 2}, DLDataType{kDLFloat, 32, 1}, dev); + tvm::runtime::NDArray y = tvm::runtime::NDArray::Empty({2, 2}, DLDataType{kDLFloat, 32, 1}, dev); for (int i = 0; i < 2; ++i) { for (int j = 0; j < 2; ++j) { @@ -119,6 +119,6 @@ void DeployGraphRuntime() { int main(void) { DeploySingleOp(); - DeployGraphRuntime(); + DeployGraphExecutor(); return 0; } diff --git a/apps/howto_deploy/tvm_runtime_pack.cc b/apps/howto_deploy/tvm_runtime_pack.cc index d6dd5876a994..c8778a380233 100644 --- a/apps/howto_deploy/tvm_runtime_pack.cc +++ b/apps/howto_deploy/tvm_runtime_pack.cc @@ -58,9 +58,9 @@ #include "../../src/runtime/dso_library.cc" #include "../../src/runtime/system_library.cc" -// Graph runtime -#include "../../src/runtime/graph/graph_runtime.cc" -#include "../../src/runtime/graph/graph_runtime_factory.cc" +// Graph executor +#include "../../src/runtime/graph_executor/graph_executor.cc" +#include "../../src/runtime/graph_executor/graph_executor_factory.cc" // Uncomment the following lines to enable RPC // #include "../../src/runtime/rpc/rpc_session.cc" diff --git a/apps/ios_rpc/tests/ios_rpc_mobilenet.py b/apps/ios_rpc/tests/ios_rpc_mobilenet.py index 90ac6bfb9218..ee6ab5fd8363 100644 --- a/apps/ios_rpc/tests/ios_rpc_mobilenet.py +++ b/apps/ios_rpc/tests/ios_rpc_mobilenet.py @@ -22,7 +22,7 @@ from tvm.relay import transform from tvm.relay.op.annotation import compiler_begin, compiler_end from tvm.relay.quantize.quantize import prerequisite_optimize -from tvm.contrib import utils, xcode, graph_runtime, coreml_runtime +from tvm.contrib import utils, xcode, graph_executor, coreml_runtime from tvm.contrib.target import coreml as _coreml import os @@ -116,20 +116,20 @@ def run(mod, target): remote = rpc.connect(proxy_host, proxy_port, key=key) if target == "metal": - ctx = remote.metal(0) + dev = remote.metal(0) else: - ctx = remote.cpu(0) + dev = remote.cpu(0) lib = remote.load_module("deploy.dylib") - m = 
graph_runtime.GraphModule(lib["default"](ctx)) + m = graph_executor.GraphModule(lib["default"](dev)) - m.set_input("data", tvm.nd.array(image, ctx)) + m.set_input("data", tvm.nd.array(image, dev)) m.run() tvm_output = m.get_output(0) top1 = np.argmax(tvm_output.asnumpy()[0]) print("TVM prediction top-1:", top1, synset[top1]) # evaluate - ftimer = m.module.time_evaluator("run", ctx, number=3, repeat=10) + ftimer = m.module.time_evaluator("run", dev, number=3, repeat=10) prof_res = np.array(ftimer().results) * 1000 print("%-19s (%s)" % ("%.2f ms" % np.mean(prof_res), "%.2f ms" % np.std(prof_res))) diff --git a/apps/ios_rpc/tests/ios_rpc_test.py b/apps/ios_rpc/tests/ios_rpc_test.py index a967c2f75e61..865cdb0b8e43 100644 --- a/apps/ios_rpc/tests/ios_rpc_test.py +++ b/apps/ios_rpc/tests/ios_rpc_test.py @@ -88,22 +88,22 @@ def test_rpc_module(): # connect to the proxy remote = rpc.connect(proxy_host, proxy_port, key=key) - ctx = remote.metal(0) + dev = remote.metal(0) f1 = remote.load_module("dev_lib.dylib") a_np = np.random.uniform(size=1024).astype(A.dtype) - a = tvm.nd.array(a_np, ctx) - b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx) - time_f = f1.time_evaluator(f1.entry_name, ctx, number=10) + a = tvm.nd.array(a_np, dev) + b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), dev) + time_f = f1.time_evaluator(f1.entry_name, dev, number=10) cost = time_f(a, b).mean print("%g secs/op" % cost) np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1) # CPU - ctx = remote.cpu(0) + dev = remote.cpu(0) f2 = remote.load_module("cpu_lib.dylib") a_np = np.random.uniform(size=1024).astype(A.dtype) - a = tvm.nd.array(a_np, ctx) - b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx) - time_f = f2.time_evaluator(f1.entry_name, ctx, number=10) + a = tvm.nd.array(a_np, dev) + b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), dev) + time_f = f2.time_evaluator(f1.entry_name, dev, number=10) cost = time_f(a, b).mean print("%g secs/op" % cost) np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1) diff --git a/apps/ios_rpc/tvmrpc.xcodeproj/project.pbxproj b/apps/ios_rpc/tvmrpc.xcodeproj/project.pbxproj index b33c892cf002..6f00c03a26d7 100644 --- a/apps/ios_rpc/tvmrpc.xcodeproj/project.pbxproj +++ b/apps/ios_rpc/tvmrpc.xcodeproj/project.pbxproj @@ -349,6 +349,8 @@ GCC_PREPROCESSOR_DEFINITIONS = ( "DEBUG=1", "$(inherited)", + "DMLC_USE_LOGGING_LIBRARY=", + "TVM_USE_LIBBACKTRACE=0", ); GCC_WARN_64_TO_32_BIT_CONVERSION = YES; GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; @@ -393,6 +395,10 @@ ENABLE_STRICT_OBJC_MSGSEND = YES; GCC_C_LANGUAGE_STANDARD = gnu99; GCC_NO_COMMON_BLOCKS = YES; + GCC_PREPROCESSOR_DEFINITIONS = ( + "DMLC_USE_LOGGING_LIBRARY=", + "TVM_USE_LIBBACKTRACE=0", + ); GCC_WARN_64_TO_32_BIT_CONVERSION = YES; GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; GCC_WARN_UNDECLARED_SELECTOR = YES; diff --git a/apps/ios_rpc/tvmrpc/TVMRuntime.h b/apps/ios_rpc/tvmrpc/TVMRuntime.h index f6a6dc64c53a..0d172fc3eaa1 100644 --- a/apps/ios_rpc/tvmrpc/TVMRuntime.h +++ b/apps/ios_rpc/tvmrpc/TVMRuntime.h @@ -22,7 +22,7 @@ */ #import // Customize logging mechanism, redirect to NSLOG -#define DMLC_LOG_CUSTOMIZE 1 +#define TVM_LOG_CUSTOMIZE 1 #define TVM_METAL_RUNTIME 1 #include diff --git a/apps/ios_rpc/tvmrpc/TVMRuntime.mm b/apps/ios_rpc/tvmrpc/TVMRuntime.mm index fbe4850e1b57..7ab9a4d2d219 100644 --- a/apps/ios_rpc/tvmrpc/TVMRuntime.mm +++ b/apps/ios_rpc/tvmrpc/TVMRuntime.mm @@ -45,17 +45,27 @@ #include "../../../src/runtime/rpc/rpc_server_env.cc" #include "../../../src/runtime/rpc/rpc_session.cc" #include 
"../../../src/runtime/rpc/rpc_socket_impl.cc" -// Graph runtime -#include "../../../src/runtime/graph/graph_runtime.cc" +// Graph executor +#include "../../../src/runtime/graph_executor/graph_executor.cc" // Metal #include "../../../src/runtime/metal/metal_device_api.mm" #include "../../../src/runtime/metal/metal_module.mm" // CoreML #include "../../../src/runtime/contrib/coreml/coreml_runtime.mm" -namespace dmlc { +namespace tvm { +namespace runtime { +namespace detail { // Override logging mechanism -void CustomLogMessage::Log(const std::string& msg) { NSLog(@"%s", msg.c_str()); } +void LogFatalImpl(const std::string& file, int lineno, const std::string& message) { + throw tvm::runtime::InternalError(file, lineno, message); +} + +void LogMessageImpl(const std::string& file, int lineno, const std::string& message) { + NSLog(@"%s:%d: %s", file.c_str(), lineno, message.c_str()); +} +} +} } // namespace dmlc namespace tvm { @@ -69,7 +79,7 @@ size_t Send(const void* data, size_t size) final { ssize_t nbytes = [stream_ write:reinterpret_cast(data) maxLength:size]; if (nbytes < 0) { NSLog(@"%@", [stream_ streamError].localizedDescription); - throw dmlc::Error("Stream error"); + throw tvm::Error("Stream error"); } return nbytes; } diff --git a/apps/ios_rpc/tvmrpc/ViewController.mm b/apps/ios_rpc/tvmrpc/ViewController.mm index 910c650aedc1..879ed2334a84 100644 --- a/apps/ios_rpc/tvmrpc/ViewController.mm +++ b/apps/ios_rpc/tvmrpc/ViewController.mm @@ -100,7 +100,7 @@ - (void)onReadAvailable { if (flag == 2) { [self onShutdownReceived]; } - } catch (const dmlc::Error& e) { + } catch (const tvm::Error& e) { [self close]; } } @@ -123,7 +123,7 @@ - (void)onWriteAvailable { if (flag == 2) { [self onShutdownReceived]; } - } catch (const dmlc::Error& e) { + } catch (const tvm::Error& e) { [self close]; } } diff --git a/apps/microtvm/README.md b/apps/microtvm/README.md index 97b844a4c01b..362bc407238e 100644 --- a/apps/microtvm/README.md +++ b/apps/microtvm/README.md @@ -15,14 +15,17 @@ -# microTVM Reference Virtual Machines +# microTVM +microTVM is the effort that allows TVM to build and execute models on bare-metal microcontrollers. -microTVM is the effort to allow TVM to build and execute models on bare-metal microcontrollers. -These Virtual Machines are used to reproduce results and bugs when using microTVM with real -physical hardware. Note that they are not used to run Continuous Integration regression tests-- -those are instead run by the QEMU container (they run against an emulator, rather than real -hardware). +The `pyproject.toml` file in this directory can be used to create a +[Poetry](https://python-poetry.org/) Python environment with all of the required +dependencies installed for running microTVM. To use it, run: + +``` +$ poetry lock && poetry install +$ poetry shell +``` -See the "microTVM Reference Virtual Machines" tutorial for information on how to use these. diff --git a/apps/microtvm/reference-vm/zephyr/pyproject.toml b/apps/microtvm/pyproject.toml similarity index 95% rename from apps/microtvm/reference-vm/zephyr/pyproject.toml rename to apps/microtvm/pyproject.toml index b4cfc544df58..8bfae0a157cd 100644 --- a/apps/microtvm/reference-vm/zephyr/pyproject.toml +++ b/apps/microtvm/pyproject.toml @@ -15,6 +15,9 @@ # specific language governing permissions and limitations # under the License. +# This `pyproject.toml` file is used to allow MicroTVM +# to run within a Poetry-managed environment. 
+ [tool.black] line-length = 100 target-version = ['py36'] @@ -47,12 +50,12 @@ exclude = ''' ) ''' [tool.poetry] -name = "tvm" +name = "microtvm" version = "0.1.0" description = "" -authors = ["Your Name "] +authors = [] packages = [ - { include = "tvm", from = "../../../../python" }, + { include = "tvm", from = "../../python" }, ] [tool.poetry.dependencies] @@ -67,7 +70,6 @@ typed_ast = "^1.4" pyyaml = "^5.4.1" pyserial = "^3.5" - # AutoTVM xgboost = {version = "^1.1", optional = true} diff --git a/apps/microtvm/reference-vm/base-box-tool.py b/apps/microtvm/reference-vm/base-box-tool.py old mode 100755 new mode 100644 index 0e82dc2e9c0e..dbf05f016f67 --- a/apps/microtvm/reference-vm/base-box-tool.py +++ b/apps/microtvm/reference-vm/base-box-tool.py @@ -42,6 +42,12 @@ "vmware_desktop", ) +# List of microTVM platforms for testing. +ALL_MICROTVM_PLATFORMS = ( + "stm32f746xx", + "nrf5340dk", +) + def parse_virtualbox_devices(): output = subprocess.check_output(["VBoxManage", "list", "usbhost"], encoding="utf-8") @@ -109,6 +115,7 @@ def attach_virtualbox(uuid, vid_hex=None, pid_hex=None, serial=None): if serial is not None: rule_args.extend(["--serialnumber", serial]) subprocess.check_call(rule_args) + # TODO(mehrdadh): skip usb attach if it's already attached subprocess.check_call(["VBoxManage", "controlvm", uuid, "usbattach", dev["UUID"]]) return @@ -308,13 +315,17 @@ def test_command(args): test_config_file = os.path.join(base_box_dir, "test-config.json") with open(test_config_file) as f: test_config = json.load(f) + + # select microTVM test platform + microtvm_test_platform = test_config[args.microtvm_platform] + for key, expected_type in REQUIRED_TEST_CONFIG_KEYS.items(): - assert key in test_config and isinstance( - test_config[key], expected_type + assert key in microtvm_test_platform and isinstance( + microtvm_test_platform[key], expected_type ), f"Expected key {key} of type {expected_type} in {test_config_file}: {test_config!r}" - test_config["vid_hex"] = test_config["vid_hex"].lower() - test_config["pid_hex"] = test_config["pid_hex"].lower() + microtvm_test_platform["vid_hex"] = microtvm_test_platform["vid_hex"].lower() + microtvm_test_platform["pid_hex"] = microtvm_test_platform["pid_hex"].lower() providers = args.provider provider_passed = {p: False for p in providers} @@ -331,7 +342,7 @@ def test_command(args): release_test_dir, user_box_dir, base_box_dir, provider_name ) do_run_release_test( - release_test_dir, provider_name, test_config, args.test_device_serial + release_test_dir, provider_name, microtvm_test_platform, args.test_device_serial ) provider_passed[provider_name] = True @@ -444,6 +455,13 @@ def parse_args(): ), ) + parser.add_argument( + "--microtvm-platform", + default="stm32f746xx", + choices=ALL_MICROTVM_PLATFORMS, + help="For use with 'test' command. 
MicroTVM platform used for testing.", ) + return parser.parse_args() diff --git a/apps/microtvm/reference-vm/zephyr/base-box/setup.sh b/apps/microtvm/reference-vm/zephyr/base-box/setup.sh index 52af947c3e89..7299ceae6ad8 100644 --- a/apps/microtvm/reference-vm/zephyr/base-box/setup.sh +++ b/apps/microtvm/reference-vm/zephyr/base-box/setup.sh @@ -59,17 +59,22 @@ sudo apt install -y llvm sudo apt install -y protobuf-compiler libprotoc-dev # nrfjprog +NRF_COMMANDLINE_TOOLS_FILE=nRFCommandLineToolsLinuxamd64.tar.gz +NRF_COMMANDLINE_TOOLS_URL=https://www.nordicsemi.com/-/media/Software-and-other-downloads/Desktop-software/nRF-command-line-tools/sw/Versions-10-x-x/10-12-1/nRFCommandLineTools10121Linuxamd64.tar.gz +NRF_COMMANDLINE_TOOLS_INSTALLER=nRF-Command-Line-Tools_10_12_1_Linux-amd64.deb +JLINK_LINUX_INSTALLER=JLink_Linux_V688a_x86_64.deb + cd ~ mkdir -p nrfjprog -wget --no-verbose -O nRFCommandLineTools1090Linuxamd64.tar.gz https://www.nordicsemi.com/-/media/Software-and-other-downloads/Desktop-software/nRF-command-line-tools/sw/Versions-10-x-x/10-9-0/nRFCommandLineTools1090Linuxamd64tar.gz +wget --no-verbose -O $NRF_COMMANDLINE_TOOLS_FILE $NRF_COMMANDLINE_TOOLS_URL cd nrfjprog -tar -xzvf ../nRFCommandLineTools1090Linuxamd64.tar.gz -sudo apt install -y ./JLink_Linux_V680a_x86_64.deb -sudo apt install -y ./nRF-Command-Line-Tools_10_9_0_Linux-amd64.deb +tar -xzvf "../${NRF_COMMANDLINE_TOOLS_FILE}" +sudo apt install -y "./${JLINK_LINUX_INSTALLER}" +sudo apt install -y "./${NRF_COMMANDLINE_TOOLS_INSTALLER}" source ~/.profile nrfjprog --help cd .. -rm -rf nrfjprog nRFCommandLineTools1090Linuxamd64.tar.gz +rm -rf nrfjprog "${NRF_COMMANDLINE_TOOLS_FILE}" # Zephyr pip3 install --user -U west diff --git a/apps/microtvm/reference-vm/zephyr/base-box/test-config.json b/apps/microtvm/reference-vm/zephyr/base-box/test-config.json index 78a6bd216e65..0d266797f04d 100644 --- a/apps/microtvm/reference-vm/zephyr/base-box/test-config.json +++ b/apps/microtvm/reference-vm/zephyr/base-box/test-config.json @@ -1,4 +1,12 @@ -{"vid_hex": "0483", - "pid_hex": "374b", - "test_cmd": ["pytest", "tests/micro/qemu/test_zephyr.py", "--microtvm-platforms=stm32f746xx"] +{ + "stm32f746xx": { + "vid_hex": "0483", + "pid_hex": "374b", + "test_cmd": ["pytest", "tests/micro/qemu/test_zephyr.py", "--microtvm-platforms=stm32f746xx"] + }, + "nrf5340dk": { + "vid_hex": "1366", + "pid_hex": "1055", + "test_cmd": ["pytest", "tests/micro/qemu/test_zephyr.py", "--microtvm-platforms=nrf5340dk"] + } } diff --git a/apps/microtvm/reference-vm/zephyr/rebuild-tvm.sh b/apps/microtvm/reference-vm/zephyr/rebuild-tvm.sh index 4672012e73f2..2eb55e385520 100755 --- a/apps/microtvm/reference-vm/zephyr/rebuild-tvm.sh +++ b/apps/microtvm/reference-vm/zephyr/rebuild-tvm.sh @@ -28,7 +28,7 @@ fi cp cmake/config.cmake "${BUILD_DIR}" cd "${BUILD_DIR}" sed -i 's/USE_MICRO OFF/USE_MICRO ON/' config.cmake -sed -i 's/USE_GRAPH_RUNTIME_DEBUG OFF/USE_GRAPH_RUNTIME_DEBUG ON/' config.cmake +sed -i 's/USE_GRAPH_EXECUTOR_DEBUG OFF/USE_GRAPH_EXECUTOR_DEBUG ON/' config.cmake sed -i 's/USE_LLVM OFF/USE_LLVM ON/' config.cmake cmake .. rm -rf standalone_crt host_standalone_crt # remove stale generated files diff --git a/apps/microtvm/zephyr/README.md b/apps/microtvm/zephyr/README.md new file mode 100644 index 000000000000..ad00393c0805 --- /dev/null +++ b/apps/microtvm/zephyr/README.md @@ -0,0 +1,19 @@ + + + + + + + + + + + + + + + + + +This directory contains code to interface microTVM with the [Zephyr RTOS](https://zephyrproject.org/).
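Interfacing microTVM with an RTOS in this way comes down to implementing a small set of `TVMPlatform*` hooks plus some I/O glue; `demo_runtime/src/main.c` later in this patch provides the full Zephyr versions (a static `k_mem_pool` allocator, `k_timer`-based timing, a UART transport). As a rough sketch of the contract only, with hook names taken from the patch but bodies simplified to hosted libc and not part of it:

```c
/* Sketch of the microTVM platform hooks, simplified to hosted libc.
 * The real Zephyr implementations appear in demo_runtime/src/main.c below. */
#include <stdlib.h>

#include <dlpack/dlpack.h>
#include <tvm/runtime/crt/error_codes.h>

/* Called on unrecoverable errors; the Zephyr version reboots the board. */
void TVMPlatformAbort(tvm_crt_error_t error) { exit(error); }

/* Backing store for runtime allocations; Zephyr uses a static memory pool. */
tvm_crt_error_t TVMPlatformMemoryAllocate(size_t num_bytes, DLDevice dev, void** out_ptr) {
  *out_ptr = malloc(num_bytes);
  return (*out_ptr == NULL) ? kTvmErrorPlatformNoMemory : kTvmErrorNoError;
}

tvm_crt_error_t TVMPlatformMemoryFree(void* ptr, DLDevice dev) {
  free(ptr);
  return kTvmErrorNoError;
}
```

Timer (`TVMPlatformTimerStart`/`TVMPlatformTimerStop`) and entropy (`TVMPlatformGenerateRandom`) hooks round out the set, as the main.c diff below shows.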
+ diff --git a/tests/micro/qemu/zephyr-runtime/CMakeLists.txt b/apps/microtvm/zephyr/demo_runtime/CMakeLists.txt similarity index 99% rename from tests/micro/qemu/zephyr-runtime/CMakeLists.txt rename to apps/microtvm/zephyr/demo_runtime/CMakeLists.txt index ce5605469fcb..a99d5edb07e6 100644 --- a/tests/micro/qemu/zephyr-runtime/CMakeLists.txt +++ b/apps/microtvm/zephyr/demo_runtime/CMakeLists.txt @@ -9,7 +9,6 @@ set(QEMU_PIPE "\${QEMU_PIPE}") # QEMU_PIPE is set by the calling TVM instance. find_package(Zephyr HINTS $ENV{ZEPHYR_BASE}) project(microtvm_zephyr_runtime) - set(CMAKE_VERBOSE_MAKEFILE ON) file(GLOB TVM_SOURCES ${CMAKE_SOURCE_DIR}/__tvm*.c) target_sources(app PRIVATE src/main.c ${TVM_SOURCES}) diff --git a/apps/microtvm/zephyr/demo_runtime/README.md b/apps/microtvm/zephyr/demo_runtime/README.md new file mode 100644 index 000000000000..eab3f3d241a1 --- /dev/null +++ b/apps/microtvm/zephyr/demo_runtime/README.md @@ -0,0 +1,21 @@ + + + + + + + + + + + + + + + + + +This directory contains a Zephyr-based "demo" runtime environment that +pulls together the microTVM runtime dependencies into a single application +that can communicate with a Python-based host program via the UART, using +TVM's RPC protocol. diff --git a/apps/microtvm/zephyr/demo_runtime/boards/nrf5340dk_nrf5340_cpuapp.conf b/apps/microtvm/zephyr/demo_runtime/boards/nrf5340dk_nrf5340_cpuapp.conf new file mode 100644 index 000000000000..149a69ea3b5b --- /dev/null +++ b/apps/microtvm/zephyr/demo_runtime/boards/nrf5340dk_nrf5340_cpuapp.conf @@ -0,0 +1,31 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +# This file is specific to the nRF5340 DK board. + +# For intrinsics used by generated optimized operators. +CONFIG_CMSIS_DSP=y + +# Required for Cortex-M33 devices. +CONFIG_MAIN_STACK_SIZE=1536 + +# For random number generation. +CONFIG_ENTROPY_GENERATOR=y +CONFIG_TEST_RANDOM_GENERATOR=y + +# For debugging. +CONFIG_LED=y diff --git a/apps/microtvm/zephyr/demo_runtime/boards/nucleo_f746zg.conf b/apps/microtvm/zephyr/demo_runtime/boards/nucleo_f746zg.conf new file mode 100644 index 000000000000..5931377d55ae --- /dev/null +++ b/apps/microtvm/zephyr/demo_runtime/boards/nucleo_f746zg.conf @@ -0,0 +1,30 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +# This file is specific to the STM32 Nucleo F746ZG board. + +# For intrinsics used by generated optimized operators. +CONFIG_CMSIS_DSP=y + +# Required for Cortex-M7 devices. +CONFIG_MAIN_STACK_SIZE=50 + +# For random number generation. +CONFIG_ENTROPY_GENERATOR=y + +# For debugging. +CONFIG_LED=y diff --git a/apps/microtvm/zephyr/demo_runtime/boards/qemu_x86.conf b/apps/microtvm/zephyr/demo_runtime/boards/qemu_x86.conf new file mode 100644 index 000000000000..e0e4ae2fb2d3 --- /dev/null +++ b/apps/microtvm/zephyr/demo_runtime/boards/qemu_x86.conf @@ -0,0 +1,23 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# This file is specific to the QEMU-emulated microTVM board. + +# For TVMPlatformGenerateRandom(). Remember, these values do not need to be truly random. +CONFIG_TEST_RANDOM_GENERATOR=y +CONFIG_TIMER_RANDOM_GENERATOR=y + diff --git a/tests/micro/qemu/zephyr-runtime/crt/crt_config.h b/apps/microtvm/zephyr/demo_runtime/crt/crt_config.h similarity index 98% rename from tests/micro/qemu/zephyr-runtime/crt/crt_config.h rename to apps/microtvm/zephyr/demo_runtime/crt/crt_config.h index a7f4f90b0538..f8fc7514a28d 100644 --- a/tests/micro/qemu/zephyr-runtime/crt/crt_config.h +++ b/apps/microtvm/zephyr/demo_runtime/crt/crt_config.h @@ -59,6 +59,6 @@ /*! \brief Number of pages on device. */ #define TVM_CRT_MAX_PAGES 300 -//#define TVM_CRT_FRAMER_ENABLE_LOGS +// #define TVM_CRT_FRAMER_ENABLE_LOGS #endif // TVM_RUNTIME_CRT_CONFIG_H_ diff --git a/tests/micro/qemu/zephyr-runtime/prj.conf b/apps/microtvm/zephyr/demo_runtime/prj.conf similarity index 87% rename from tests/micro/qemu/zephyr-runtime/prj.conf rename to apps/microtvm/zephyr/demo_runtime/prj.conf index 7be42b260bbb..bf2b330e35a6 100644 --- a/tests/micro/qemu/zephyr-runtime/prj.conf +++ b/apps/microtvm/zephyr/demo_runtime/prj.conf @@ -15,6 +15,10 @@ # specific language governing permissions and limitations # under the License. +# The settings in this file are generic for all boards, and are merged +# with the settings in the file boards/<BOARD>.conf by the Zephyr build +# process. + # For UART implementation in main(). CONFIG_RING_BUFFER=y CONFIG_UART_CONSOLE=n @@ -30,6 +34,3 @@ CONFIG_FPU=y # For TVMPlatformAbort(). CONFIG_REBOOT=y -# For TVMPlatformGenerateRandom(). Remember, these values do not need to be truly random.
-CONFIG_TEST_RANDOM_GENERATOR=y -CONFIG_TIMER_RANDOM_GENERATOR=y diff --git a/tests/micro/qemu/zephyr-runtime/qemu-hack/qemu-system-i386 b/apps/microtvm/zephyr/demo_runtime/qemu-hack/qemu-system-i386 similarity index 100% rename from tests/micro/qemu/zephyr-runtime/qemu-hack/qemu-system-i386 rename to apps/microtvm/zephyr/demo_runtime/qemu-hack/qemu-system-i386 diff --git a/tests/micro/qemu/zephyr-runtime/src/main.c b/apps/microtvm/zephyr/demo_runtime/src/main.c similarity index 55% rename from tests/micro/qemu/zephyr-runtime/src/main.c rename to apps/microtvm/zephyr/demo_runtime/src/main.c index e04fc20508b4..e2aa59af7ad9 100644 --- a/tests/micro/qemu/zephyr-runtime/src/main.c +++ b/apps/microtvm/zephyr/demo_runtime/src/main.c @@ -22,8 +22,16 @@ * SPDX-License-Identifier: Apache-2.0 */ +/* + * This is a sample Zephyr-based application that contains the logic + * needed to control a microTVM-based model via the UART. This is only + * intended to be a demonstration, since typically you will want to incorporate + * this logic into your own application. + */ + #include #include +#include #include #include #include @@ -41,65 +49,107 @@ #include "crt_config.h" -K_SEM_DEFINE(tx_sem, 0, 1); - static const struct device* tvm_uart; -int write_hook(int c) { - uart_poll_out(tvm_uart, c); - return 0; -} +#ifdef CONFIG_LED +#define LED0_NODE DT_ALIAS(led0) +#define LED0 DT_GPIO_LABEL(LED0_NODE, gpios) +#define LED0_PIN DT_GPIO_PIN(LED0_NODE, gpios) +#define LED0_FLAGS DT_GPIO_FLAGS(LED0_NODE, gpios) +static const struct device* led0_pin; +#endif // CONFIG_LED +static size_t g_num_bytes_requested = 0; +static size_t g_num_bytes_written = 0; + +// Called by TVM to write serial data to the UART. ssize_t write_serial(void* unused_context, const uint8_t* data, size_t size) { +#ifdef CONFIG_LED + gpio_pin_set(led0_pin, LED0_PIN, 1); +#endif + g_num_bytes_requested += size; + for (size_t i = 0; i < size; i++) { uart_poll_out(tvm_uart, data[i]); + g_num_bytes_written++; } +#ifdef CONFIG_LED + gpio_pin_set(led0_pin, LED0_PIN, 0); +#endif + return size; } +// This is invoked by Zephyr from an exception handler, which will be invoked +// if the device crashes. Here, we turn on the LED and spin. +void k_sys_fatal_error_handler(unsigned int reason, const z_arch_esf_t* esf) { +#ifdef CONFIG_LED + gpio_pin_set(led0_pin, LED0_PIN, 1); +#endif + for (;;) + ; +} + +// Called by TVM when a message needs to be formatted. size_t TVMPlatformFormatMessage(char* out_buf, size_t out_buf_size_bytes, const char* fmt, va_list args) { return vsnprintk(out_buf, out_buf_size_bytes, fmt, args); } +// Called by TVM when an internal invariant is violated, and execution cannot continue. void TVMPlatformAbort(tvm_crt_error_t error) { sys_reboot(SYS_REBOOT_COLD); +#ifdef CONFIG_LED + gpio_pin_set(led0_pin, LED0_PIN, 1); +#endif for (;;) ; } -K_MEM_POOL_DEFINE(tvm_memory_pool, 64, 1024, 120, 4); +// Called by TVM to generate random data. +tvm_crt_error_t TVMPlatformGenerateRandom(uint8_t* buffer, size_t num_bytes) { + uint32_t random; // one unit of random data. + + // Fill parts of `buffer` which are as large as `random`. + size_t num_full_blocks = num_bytes / sizeof(random); + for (int i = 0; i < num_full_blocks; ++i) { + random = sys_rand32_get(); + memcpy(&buffer[i * sizeof(random)], &random, sizeof(random)); + } + + // Fill any leftover tail which is smaller than `random`. 
+ size_t num_tail_bytes = num_bytes % sizeof(random); + if (num_tail_bytes > 0) { + random = sys_rand32_get(); + memcpy(&buffer[num_bytes - num_tail_bytes], &random, num_tail_bytes); + } + return kTvmErrorNoError; +} + +// Memory pool for use by TVMPlatformMemoryAllocate. +K_MEM_POOL_DEFINE(tvm_memory_pool, 64, 1024, 216, 4); -tvm_crt_error_t TVMPlatformMemoryAllocate(size_t num_bytes, DLContext ctx, void** out_ptr) { +// Called by TVM to allocate memory. +tvm_crt_error_t TVMPlatformMemoryAllocate(size_t num_bytes, DLDevice dev, void** out_ptr) { *out_ptr = k_mem_pool_malloc(&tvm_memory_pool, num_bytes); return (*out_ptr == NULL) ? kTvmErrorPlatformNoMemory : kTvmErrorNoError; } -tvm_crt_error_t TVMPlatformMemoryFree(void* ptr, DLContext ctx) { +// Called by TVM to deallocate memory. +tvm_crt_error_t TVMPlatformMemoryFree(void* ptr, DLDevice dev) { k_free(ptr); return kTvmErrorNoError; } -uint32_t g_utvm_start_time; - #define MILLIS_TIL_EXPIRY 200 #define TIME_TIL_EXPIRY (K_MSEC(MILLIS_TIL_EXPIRY)) K_TIMER_DEFINE(g_utvm_timer, /* expiry func */ NULL, /* stop func */ NULL); +uint32_t g_utvm_start_time; int g_utvm_timer_running = 0; -#ifdef CONFIG_LED -/* The devicetree node identifier for the "led0" alias. */ -#define LED0_NODE DT_ALIAS(led0) - -#define LED0 DT_GPIO_LABEL(LED0_NODE, gpios) -#define PIN DT_GPIO_PIN(LED0_NODE, gpios) -#define FLAGS DT_GPIO_FLAGS(LED0_NODE, gpios) - -static struct device* led_pin; -#endif // CONFIG_LED - +// Called to start system timer. tvm_crt_error_t TVMPlatformTimerStart() { if (g_utvm_timer_running) { TVMLogf("timer already running"); @@ -107,7 +157,7 @@ tvm_crt_error_t TVMPlatformTimerStart() { } #ifdef CONFIG_LED - gpio_pin_set(led_pin, PIN, 1); + gpio_pin_set(led0_pin, LED0_PIN, 1); #endif k_timer_start(&g_utvm_timer, TIME_TIL_EXPIRY, TIME_TIL_EXPIRY); g_utvm_start_time = k_cycle_get_32(); @@ -115,15 +165,16 @@ tvm_crt_error_t TVMPlatformTimerStart() { return kTvmErrorNoError; } +// Called to stop system timer. tvm_crt_error_t TVMPlatformTimerStop(double* elapsed_time_seconds) { if (!g_utvm_timer_running) { TVMLogf("timer not running"); - return kTvmErrorPlatformTimerBadState; + return kTvmErrorSystemErrorMask | 2; } uint32_t stop_time = k_cycle_get_32(); #ifdef CONFIG_LED - gpio_pin_set(led_pin, PIN, 0); + gpio_pin_set(led0_pin, LED0_PIN, 0); #endif // compute how long the work took @@ -135,7 +186,7 @@ tvm_crt_error_t TVMPlatformTimerStop(double* elapsed_time_seconds) { } uint32_t ns_spent = (uint32_t)k_cyc_to_ns_floor64(cycles_spent); - double hw_clock_elapsed_seconds = ns_spent / 1e9; + double hw_clock_res_us = ns_spent / 1000.0; // need to grab time remaining *before* stopping. when stopped, this function // always returns 0. 
@@ -144,7 +195,7 @@ tvm_crt_error_t TVMPlatformTimerStop(double* elapsed_time_seconds) { // check *after* stopping to prevent extra expiries on the happy path if (time_remaining_ms < 0) { TVMLogf("negative time remaining"); - return -1; + return kTvmErrorSystemErrorMask | 3; } uint32_t num_expiries = k_timer_status_get(&g_utvm_timer); uint32_t timer_res_ms = ((num_expiries * MILLIS_TIL_EXPIRY) + time_remaining_ms); @@ -153,113 +204,112 @@ tvm_crt_error_t TVMPlatformTimerStop(double* elapsed_time_seconds) { // if we approach the limits of the HW clock datatype (uint32_t), use the // coarse-grained timer result instead if (approx_num_cycles > (0.5 * (~((uint32_t)0)))) { - *elapsed_time_seconds = timer_res_ms / 1e3; + *elapsed_time_seconds = timer_res_ms / 1000.0; } else { - *elapsed_time_seconds = hw_clock_elapsed_seconds; + *elapsed_time_seconds = hw_clock_res_us / 1e6; } g_utvm_timer_running = 0; return kTvmErrorNoError; } -tvm_crt_error_t TVMPlatformGenerateRandom(uint8_t* buffer, size_t num_bytes) { - uint32_t random; // one unit of random data. - - // Fill parts of `buffer` which are as large as `random`. - size_t num_full_blocks = num_bytes / sizeof(random); - for (int i = 0; i < num_full_blocks; ++i) { - random = sys_rand32_get(); - memcpy(&buffer[i * sizeof(random)], &random, sizeof(random)); - } - - // Fill any leftover tail which is smaller than `random`. - size_t num_tail_bytes = num_bytes % sizeof(random); - if (num_tail_bytes > 0) { - random = sys_rand32_get(); - memcpy(&buffer[num_bytes - num_tail_bytes], &random, num_tail_bytes); - } - - return kTvmErrorNoError; -} - -#define RING_BUF_SIZE 512 -struct uart_rx_buf_t { - struct ring_buf buf; - uint32_t buffer[RING_BUF_SIZE]; -}; +// Ring buffer used to store data read from the UART on rx interrupt. +#define RING_BUF_SIZE_BYTES 4 * 1024 +RING_BUF_DECLARE(uart_rx_rbuf, RING_BUF_SIZE_BYTES); -struct uart_rx_buf_t uart_rx_buf; +// Small buffer used to read data from the UART into the ring buffer. +static uint8_t uart_data[32]; +// UART interrupt callback. void uart_irq_cb(const struct device* dev, void* user_data) { while (uart_irq_update(dev) && uart_irq_is_pending(dev)) { - struct uart_rx_buf_t* buf = (struct uart_rx_buf_t*)user_data; - if (uart_irq_rx_ready(dev) == 0) { - continue; - } - - uint8_t data[32]; - for (;;) { - int bytes_read = uart_fifo_read(dev, data, sizeof(data)); - if (bytes_read < 0) { - TVMPlatformAbort(0xbeef); - } else if (bytes_read == 0) { - break; + struct ring_buf* rbuf = (struct ring_buf*)user_data; + if (uart_irq_rx_ready(dev) != 0) { + for (;;) { + // Read a small chunk of data from the UART. + int bytes_read = uart_fifo_read(dev, uart_data, sizeof(uart_data)); + if (bytes_read < 0) { + TVMPlatformAbort((tvm_crt_error_t)0xbeef1); + } else if (bytes_read == 0) { + break; + } + // Write it into the ring buffer. + int bytes_written = ring_buf_put(rbuf, uart_data, bytes_read); + if (bytes_read != bytes_written) { + TVMPlatformAbort((tvm_crt_error_t)0xbeef2); + } + // CHECK_EQ(bytes_read, bytes_written, "bytes_read: %d; bytes_written: %d", bytes_read, + // bytes_written); } - int bytes_written = ring_buf_put(&buf->buf, data, bytes_read); - CHECK_EQ(bytes_read, bytes_written, "bytes_read: %d; bytes_written: %d", bytes_read, - bytes_written); } } } -void uart_rx_init(struct uart_rx_buf_t* buf, const struct device* dev) { - ring_buf_init(&buf->buf, RING_BUF_SIZE, buf->buffer); - uart_irq_callback_user_data_set(dev, uart_irq_cb, (void*)buf); +// Used to initialize the UART receiver. 
+void uart_rx_init(struct ring_buf* rbuf, const struct device* dev) { + uart_irq_callback_user_data_set(dev, uart_irq_cb, (void*)rbuf); uart_irq_rx_enable(dev); } -int uart_rx_buf_read(struct uart_rx_buf_t* buf, uint8_t* data, size_t data_size_bytes) { +// Used to read data from the UART. +int uart_rx_buf_read(struct ring_buf* rbuf, uint8_t* data, size_t data_size_bytes) { unsigned int key = irq_lock(); - int bytes_read = ring_buf_get(&buf->buf, data, data_size_bytes); + int bytes_read = ring_buf_get(rbuf, data, data_size_bytes); irq_unlock(key); return bytes_read; } +// Buffer used to read from the UART rx ring buffer and feed it to the UTvmRpcServerLoop. +static uint8_t main_rx_buf[RING_BUF_SIZE_BYTES]; + +// The main function of this application. extern void __stdout_hook_install(int (*hook)(int)); void main(void) { #ifdef CONFIG_LED - led_pin = device_get_binding(LED0); - if (led_pin == NULL) { + int ret; + led0_pin = device_get_binding(LED0); + if (led0_pin == NULL) { for (;;) ; } - int ret = gpio_pin_configure(led_pin, PIN, GPIO_OUTPUT_ACTIVE | FLAGS); + ret = gpio_pin_configure(led0_pin, LED0_PIN, GPIO_OUTPUT_ACTIVE | LED0_FLAGS); if (ret < 0) { - for (;;) - ; + TVMPlatformAbort((tvm_crt_error_t)0xbeef4); } - gpio_pin_set(led_pin, PIN, 0); + gpio_pin_set(led0_pin, LED0_PIN, 1); #endif - /* Claim console device */ + // Claim console device. tvm_uart = device_get_binding(DT_LABEL(DT_CHOSEN(zephyr_console))); - uart_rx_init(&uart_rx_buf, tvm_uart); - __stdout_hook_install(&write_hook); + uart_rx_init(&uart_rx_rbuf, tvm_uart); + // Initialize microTVM RPC server, which will receive commands from the UART and execute them. utvm_rpc_server_t server = UTvmRpcServerInit(write_serial, NULL); - TVMLogf("uTVM On-Device Runtime"); + TVMLogf("microTVM Zephyr runtime - running"); +#ifdef CONFIG_LED + gpio_pin_set(led0_pin, LED0_PIN, 0); +#endif + // The main application loop. We continuously read commands from the UART + // and dispatch them to UTvmRpcServerLoop(). while (true) { - uint8_t buf[256]; - int bytes_read = uart_rx_buf_read(&uart_rx_buf, buf, sizeof(buf)); + int bytes_read = uart_rx_buf_read(&uart_rx_rbuf, main_rx_buf, sizeof(main_rx_buf)); if (bytes_read > 0) { size_t bytes_remaining = bytes_read; - uint8_t* cursor = buf; + uint8_t* cursor = main_rx_buf; while (bytes_remaining > 0) { + // Pass the received bytes to the RPC server. 
tvm_crt_error_t err = UTvmRpcServerLoop(server, &cursor, &bytes_remaining); if (err != kTvmErrorNoError && err != kTvmErrorFramingShortPacket) { TVMPlatformAbort(err); } + if (g_num_bytes_written != 0 || g_num_bytes_requested != 0) { + if (g_num_bytes_written != g_num_bytes_requested) { + TVMPlatformAbort((tvm_crt_error_t)0xbeef5); + } + g_num_bytes_written = 0; + g_num_bytes_requested = 0; + } } } } diff --git a/apps/topi_recipe/conv/depthwise_conv2d_test.py b/apps/topi_recipe/conv/depthwise_conv2d_test.py index 94687edde5f9..e282e67af717 100644 --- a/apps/topi_recipe/conv/depthwise_conv2d_test.py +++ b/apps/topi_recipe/conv/depthwise_conv2d_test.py @@ -95,32 +95,32 @@ def check_device(device): if not tvm.runtime.enabled(device): print("Skip because %s is not enabled" % device) return - ctx = tvm.context(device, 0) + dev = tvm.device(device, 0) # Build the kernel f1 = tvm.build(s1, [Input, Filter, DepthwiseConv2d], device) f2 = tvm.build(s2, [Input, Filter, Scale, Shift, ScaleShift], device) f3 = tvm.build(s3, [Input, Filter, Scale, Shift, Relu], device) # Prepare data - input_tvm = tvm.nd.array(input_np, ctx) - filter_tvm = tvm.nd.array(filter_np, ctx) - scale_tvm = tvm.nd.array(scale_np, ctx) - shift_tvm = tvm.nd.array(shift_np, ctx) + input_tvm = tvm.nd.array(input_np, dev) + filter_tvm = tvm.nd.array(filter_np, dev) + scale_tvm = tvm.nd.array(scale_np, dev) + shift_tvm = tvm.nd.array(shift_np, dev) depthwise_conv2d_tvm = tvm.nd.array( - np.zeros(shape=get_const_tuple(DepthwiseConv2d.shape), dtype=DepthwiseConv2d.dtype), ctx + np.zeros(shape=get_const_tuple(DepthwiseConv2d.shape), dtype=DepthwiseConv2d.dtype), dev ) scale_shift_tvm = tvm.nd.array( - np.zeros(shape=get_const_tuple(ScaleShift.shape), dtype=ScaleShift.dtype), ctx + np.zeros(shape=get_const_tuple(ScaleShift.shape), dtype=ScaleShift.dtype), dev ) - relu_tvm = tvm.nd.array(np.zeros(shape=get_const_tuple(Relu.shape), dtype=Relu.dtype), ctx) + relu_tvm = tvm.nd.array(np.zeros(shape=get_const_tuple(Relu.shape), dtype=Relu.dtype), dev) # Measure time cost of kernel 1 (depthwise_conv2d) - timer_1 = f1.time_evaluator(f1.entry_name, ctx, number=1000) + timer_1 = f1.time_evaluator(f1.entry_name, dev, number=1000) tcost_1 = timer_1(input_tvm, filter_tvm, depthwise_conv2d_tvm).mean # Measure time cost of kernel 2 (depthwise_conv2d + scale_shift) - timer_2 = f2.time_evaluator(f2.entry_name, ctx, number=1000) + timer_2 = f2.time_evaluator(f2.entry_name, dev, number=1000) tcost_2 = timer_2(input_tvm, filter_tvm, scale_tvm, shift_tvm, scale_shift_tvm).mean # Measure time cost of kernel 3 (depthwise_conv2d + scale_shift + relu) - timer_3 = f3.time_evaluator(f3.entry_name, ctx, number=1000) + timer_3 = f3.time_evaluator(f3.entry_name, dev, number=1000) tcost_3 = timer_3(input_tvm, filter_tvm, scale_tvm, shift_tvm, relu_tvm).mean print("Input shape = " + str(get_const_tuple(Input.shape))) print("Filter shape = " + str(get_const_tuple(Filter.shape))) @@ -203,31 +203,31 @@ def check_device(device): if not tvm.runtime.enabled(device): print("Skip because %s is not enabled" % device) return - ctx = tvm.context(device, 0) + dev = tvm.device(device, 0) # Build the kernel f1 = tvm.build(s1, [Input, Filter, DepthwiseConv2d], device) f2 = tvm.build(s2, [Input, Filter, Scale, Shift, ScaleShift], device) f3 = tvm.build(s3, [Input, Filter, Scale, Shift, Relu], device) # Prepare data - input_tvm = tvm.nd.array(input_np, ctx) - filter_tvm = tvm.nd.array(filter_np, ctx) - scale_tvm = tvm.nd.array(scale_np, ctx) - shift_tvm = tvm.nd.array(shift_np, ctx) + 
input_tvm = tvm.nd.array(input_np, dev) + filter_tvm = tvm.nd.array(filter_np, dev) + scale_tvm = tvm.nd.array(scale_np, dev) + shift_tvm = tvm.nd.array(shift_np, dev) depthwise_conv2d_tvm = tvm.nd.array( - np.zeros(shape=get_const_tuple(DepthwiseConv2d.shape), dtype=DepthwiseConv2d.dtype), ctx + np.zeros(shape=get_const_tuple(DepthwiseConv2d.shape), dtype=DepthwiseConv2d.dtype), dev ) scale_shift_tvm = tvm.nd.array( - np.zeros(shape=get_const_tuple(ScaleShift.shape), dtype=ScaleShift.dtype), ctx + np.zeros(shape=get_const_tuple(ScaleShift.shape), dtype=ScaleShift.dtype), dev ) - relu_tvm = tvm.nd.array(np.zeros(shape=get_const_tuple(Relu.shape), dtype=Relu.dtype), ctx) + relu_tvm = tvm.nd.array(np.zeros(shape=get_const_tuple(Relu.shape), dtype=Relu.dtype), dev) # Measure time cost of kernel 1 (depthwise_conv2d) - timer_1 = f1.time_evaluator(f1.entry_name, ctx, number=1000) + timer_1 = f1.time_evaluator(f1.entry_name, dev, number=1000) tcost_1 = timer_1(input_tvm, filter_tvm, depthwise_conv2d_tvm).mean # Measure time cost of kernel 2 (depthwise_conv2d + scale_shift) - timer_2 = f2.time_evaluator(f2.entry_name, ctx, number=1000) + timer_2 = f2.time_evaluator(f2.entry_name, dev, number=1000) tcost_2 = timer_2(input_tvm, filter_tvm, scale_tvm, shift_tvm, scale_shift_tvm).mean # Measure time cost of kernel 3 (depthwise_conv2d + scale_shift + relu) - timer_3 = f3.time_evaluator(f3.entry_name, ctx, number=1000) + timer_3 = f3.time_evaluator(f3.entry_name, dev, number=1000) tcost_3 = timer_3(input_tvm, filter_tvm, scale_tvm, shift_tvm, relu_tvm).mean print("Input shape = " + str(get_const_tuple(Input.shape))) print("Filter shape = " + str(get_const_tuple(Filter.shape))) diff --git a/apps/topi_recipe/conv/test_conv2d_hwcn_map.py b/apps/topi_recipe/conv/test_conv2d_hwcn_map.py index d67bfdc8952e..a2394a7279c0 100644 --- a/apps/topi_recipe/conv/test_conv2d_hwcn_map.py +++ b/apps/topi_recipe/conv/test_conv2d_hwcn_map.py @@ -75,11 +75,11 @@ def check_device(device): if not tvm.runtime.enabled(device): print("Skip because %s is not enabled" % device) return - ctx = tvm.context(device, 0) - a = tvm.nd.array(a_np, ctx) - w = tvm.nd.array(w_np, ctx) - b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx) - c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx) + dev = tvm.device(device, 0) + a = tvm.nd.array(a_np, dev) + w = tvm.nd.array(w_np, dev) + b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), dev) + c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev) with tvm.transform.PassContext( config={ diff --git a/apps/topi_recipe/conv/test_conv_int8_arm.py b/apps/topi_recipe/conv/test_conv_int8_arm.py index 289e69ab9b5b..4e8262928568 100644 --- a/apps/topi_recipe/conv/test_conv_int8_arm.py +++ b/apps/topi_recipe/conv/test_conv_int8_arm.py @@ -61,7 +61,7 @@ TARGET_NAME = "llvm -device=arm_cpu -mtriple=aarch64-linux-gnu -mattr=+v8.2a,+dotprod" NUM_VEC_LANES = 16 -CTX = tvm.context(TARGET_NAME, 0) +DEV = tvm.device(TARGET_NAME, 0) def get_shape( @@ -136,16 +136,16 @@ def run_inference( # Create the numpy arrays to be used for executing conv models if data_dtype == "float32": - data_array = tvm.nd.array(np.random.rand(*data_shape).astype(dtype=data_dtype), CTX) - kernel_array = tvm.nd.array(np.random.rand(*kernel_shape).astype(dtype=kernel_dtype), CTX) + data_array = tvm.nd.array(np.random.rand(*data_shape).astype(dtype=data_dtype), DEV) + kernel_array = tvm.nd.array(np.random.rand(*kernel_shape).astype(dtype=kernel_dtype), DEV) else: 
data_array = tvm.nd.array(np.random.randint(100, size=data_shape).astype(data_dtype)) kernel_array = tvm.nd.array(np.random.randint(100, size=kernel_shape).astype(kernel_dtype)) # c_orig will be used for declaration ouptut # c_sch will be used for scheduled computation output - c_orig = tvm.nd.array(np.zeros(o_shape, dtype=out_dtype), CTX) - c_sch = tvm.nd.array(np.zeros(o_shape, dtype=out_dtype), CTX) + c_orig = tvm.nd.array(np.zeros(o_shape, dtype=out_dtype), DEV) + c_sch = tvm.nd.array(np.zeros(o_shape, dtype=out_dtype), DEV) with tvm.target.Target(TARGET_NAME): if out_dtype == "float32": @@ -190,7 +190,7 @@ def run_inference( else: assert np.allclose(c_orig.asnumpy(), c_sch.asnumpy()) - evaluator = func.time_evaluator(func.entry_name, CTX, number=1000) + evaluator = func.time_evaluator(func.entry_name, DEV, number=1000) LOGGER.debug(tvm.lower(sconv, [data, kernel], simple_mode=True)) return evaluator(data_array, kernel_array, c_sch).mean diff --git a/apps/topi_recipe/conv/test_conv_int8_intel.py b/apps/topi_recipe/conv/test_conv_int8_intel.py index 562812ab8d82..b46d80fff821 100644 --- a/apps/topi_recipe/conv/test_conv_int8_intel.py +++ b/apps/topi_recipe/conv/test_conv_int8_intel.py @@ -61,7 +61,7 @@ TARGET_NAME = "llvm -mcpu=skylake-avx512" NUM_VEC_LANES = 16 -CTX = tvm.context(TARGET_NAME, 0) +DEV = tvm.device(TARGET_NAME, 0) def get_shape( @@ -137,16 +137,16 @@ def run_inference( # Create the numpy arrays to be used for executing conv models if data_dtype == "float32": - data_array = tvm.nd.array(np.random.rand(*data_shape).astype(dtype=data_dtype), CTX) - kernel_array = tvm.nd.array(np.random.rand(*kernel_shape).astype(dtype=kernel_dtype), CTX) + data_array = tvm.nd.array(np.random.rand(*data_shape).astype(dtype=data_dtype), DEV) + kernel_array = tvm.nd.array(np.random.rand(*kernel_shape).astype(dtype=kernel_dtype), DEV) else: data_array = tvm.nd.array(np.random.randint(100, size=data_shape).astype(data_dtype)) kernel_array = tvm.nd.array(np.random.randint(100, size=kernel_shape).astype(kernel_dtype)) # c_orig will be used for declaration ouptut # c_sch will be used for scheduled computation output - c_orig = tvm.nd.array(np.zeros(o_shape, dtype=out_dtype), CTX) - c_sch = tvm.nd.array(np.zeros(o_shape, dtype=out_dtype), CTX) + c_orig = tvm.nd.array(np.zeros(o_shape, dtype=out_dtype), DEV) + c_sch = tvm.nd.array(np.zeros(o_shape, dtype=out_dtype), DEV) with tvm.target.Target(TARGET_NAME): conv = topi.nn.conv2d_NCHWc( @@ -176,7 +176,7 @@ def run_inference( else: assert np.allclose(c_orig.asnumpy(), c_sch.asnumpy()) - evaluator = func.time_evaluator(func.entry_name, CTX, number=1000) + evaluator = func.time_evaluator(func.entry_name, DEV, number=1000) LOGGER.debug(tvm.lower(sconv, [data, kernel], simple_mode=True)) return evaluator(data_array, kernel_array, c_sch).mean diff --git a/apps/topi_recipe/gemm/android_gemm_square.py b/apps/topi_recipe/gemm/android_gemm_square.py index 0e64dcd3844d..41370c677b38 100644 --- a/apps/topi_recipe/gemm/android_gemm_square.py +++ b/apps/topi_recipe/gemm/android_gemm_square.py @@ -40,14 +40,14 @@ def ngflops(N): dtype = "float32" -def evaluate(func, ctx, N, times): +def evaluate(func, dev, N, times): a_np = np.random.uniform(size=(N, N)).astype(dtype) b_np = np.random.uniform(size=(N, N)).astype(dtype) - a = tvm.nd.array(a_np, ctx) - b = tvm.nd.array(b_np, ctx) - c = tvm.nd.array(np.zeros((N, N), dtype=dtype), ctx) + a = tvm.nd.array(a_np, dev) + b = tvm.nd.array(b_np, dev) + c = tvm.nd.array(np.zeros((N, N), dtype=dtype), dev) - time_f = 
func.time_evaluator(func.entry_name, ctx, number=times) + time_f = func.time_evaluator(func.entry_name, dev, number=times) cost = time_f(a, b, c).mean gf = ngflops(N) / cost print("%g secs/op, %g GFLOPS" % (cost, gf)) @@ -127,11 +127,11 @@ def test_gemm_gpu(N, times, bn, num_block, num_thread): # connect to the proxy remote = rpc.connect(proxy_host, proxy_port, key=key) - ctx = remote.cl(0) + dev = remote.cl(0) remote.upload(path_dso) f = remote.load_module("gemm_gpu.so") - evaluate(f, ctx, N, times) + evaluate(f, dev, N, times) if __name__ == "__main__": diff --git a/apps/topi_recipe/gemm/cuda_gemm_square.py b/apps/topi_recipe/gemm/cuda_gemm_square.py index 0d548dc0b554..d84deea86e82 100644 --- a/apps/topi_recipe/gemm/cuda_gemm_square.py +++ b/apps/topi_recipe/gemm/cuda_gemm_square.py @@ -121,8 +121,8 @@ def test_gemm(): s[BB].double_buffer() # correctness def check_device(device): - ctx = tvm.context(device, 0) - if not ctx.exist: + dev = tvm.device(device, 0) + if not dev.exist: print("Skip because %s is not enabled" % device) return print("Device %s" % device) @@ -131,16 +131,16 @@ def check_device(device): n, m, l = nn, nn, nn a_np = np.random.uniform(size=(n, l)).astype(A.dtype) b_np = np.random.uniform(size=(m, l)).astype(B.dtype) - a = tvm.nd.array(a_np, ctx) - b = tvm.nd.array(b_np, ctx) - c = tvm.nd.array(np.zeros((n, m), dtype=C.dtype), ctx) + a = tvm.nd.array(a_np, dev) + b = tvm.nd.array(b_np, dev) + c = tvm.nd.array(np.zeros((n, m), dtype=C.dtype), dev) for i in range(2): f(a, b, c) tvm.testing.assert_allclose(c.asnumpy(), np.dot(b_np.T, a_np), rtol=1e-5) num_flops = 2 * nn * nn * nn num_runs = 10 - timer_f = f.time_evaluator(f.entry_name, ctx, number=num_runs) + timer_f = f.time_evaluator(f.entry_name, dev, number=num_runs) t = timer_f(a, b, c).mean GFLOPS = num_flops / (t * 1e3) / 1e6 print("average time cost of %d runs = %g ms, %g GFLOPS." % (num_runs, t * 1e3, GFLOPS)) diff --git a/apps/topi_recipe/gemm/gemm_int8.py b/apps/topi_recipe/gemm/gemm_int8.py index 0d0941d4217d..ff3fa81f20d3 100644 --- a/apps/topi_recipe/gemm/gemm_int8.py +++ b/apps/topi_recipe/gemm/gemm_int8.py @@ -160,14 +160,14 @@ def block_size_filter(entity): s, arg_bufs = gemm_int8(n, m, l) f = tvm.build(s, arg_bufs, "cuda", name="gemm_int8") - ctx = tvm.context("cuda", 0) + dev = tvm.device("cuda", 0) a_np = np.random.randint(size=(n, l), low=-128, high=127, dtype="int8") b_np = np.random.randint(size=(m, l), low=-128, high=127, dtype="int8") - a = tvm.nd.array(a_np, ctx) - b = tvm.nd.array(b_np, ctx) - c = tvm.nd.array(np.zeros((n, m), dtype="int32"), ctx) + a = tvm.nd.array(a_np, dev) + b = tvm.nd.array(b_np, dev) + c = tvm.nd.array(np.zeros((n, m), dtype="int32"), dev) f(a, b, c) tvm.testing.assert_allclose( @@ -176,7 +176,7 @@ def block_size_filter(entity): num_ops = 2 * l * m * n num_runs = 1000 - timer_f = f.time_evaluator(f.entry_name, ctx, number=num_runs) + timer_f = f.time_evaluator(f.entry_name, dev, number=num_runs) t = timer_f(a, b, c).mean GOPS = num_ops / (t * 1e3) / 1e6 print("average time cost of %d runs = %g ms, %g GOPS." 
% (num_runs, t * 1e3, GOPS)) diff --git a/apps/topi_recipe/reduce/test_reduce_map.py b/apps/topi_recipe/reduce/test_reduce_map.py index 00b1ca2676df..0a78e5bedb58 100644 --- a/apps/topi_recipe/reduce/test_reduce_map.py +++ b/apps/topi_recipe/reduce/test_reduce_map.py @@ -78,8 +78,8 @@ def test_reduce_map(in_shape, axis, keepdims, type="sum", test_id=0): else: raise NotImplementedError - data_tvm = tvm.nd.array(in_npy, ctx=tvm.gpu()) - out_tvm = tvm.nd.empty(shape=out_npy.shape, ctx=tvm.gpu()) + data_tvm = tvm.nd.array(in_npy, device=tvm.gpu()) + out_tvm = tvm.nd.empty(shape=out_npy.shape, device=tvm.gpu()) for _ in range(2): fcuda(data_tvm, out_tvm) diff --git a/apps/topi_recipe/rnn/lstm.py b/apps/topi_recipe/rnn/lstm.py index 701797e18dbf..e4b7fbade387 100644 --- a/apps/topi_recipe/rnn/lstm.py +++ b/apps/topi_recipe/rnn/lstm.py @@ -171,20 +171,20 @@ def lstm(): def check_device(target): num_step = n_num_step flstm = tvm.build(s, [Xi2h, Wh2h, scan_h, scan_c], target) - ctx = tvm.gpu(0) if target == "cuda" else tvm.cl(0) + dev = tvm.gpu(0) if target == "cuda" else tvm.cl(0) # launch the kernel. scan_h_np = np.zeros((num_step, batch_size, num_hidden)).astype("float32") scan_c_np = np.zeros((num_step, batch_size, num_hidden)).astype("float32") Xi2h_np = np.random.normal(size=(num_step, batch_size, 4, num_hidden)).astype("float32") Wh2h_np = np.random.normal(size=(4, num_hidden, num_hidden)).astype("float32") - scan_h_a = tvm.nd.array(scan_h_np, ctx) - scan_c_a = tvm.nd.array(scan_c_np, ctx) - Xi2h_a = tvm.nd.array(Xi2h_np, ctx) - Wh2h_a = tvm.nd.array(Wh2h_np, ctx) + scan_h_a = tvm.nd.array(scan_h_np, dev) + scan_c_a = tvm.nd.array(scan_c_np, dev) + Xi2h_a = tvm.nd.array(Xi2h_np, dev) + Wh2h_a = tvm.nd.array(Wh2h_np, dev) flstm(Xi2h_a, Wh2h_a, scan_h_a, scan_c_a) - ctx.sync() + dev.sync() # measure time cost of second step. - evaluator = flstm.time_evaluator(flstm.entry_name, ctx, 1, repeat=1000) + evaluator = flstm.time_evaluator(flstm.entry_name, dev, 1, repeat=1000) eval_result = evaluator(Xi2h_a, Wh2h_a, scan_h_a, scan_c_a) print("Time cost=%g" % eval_result.mean) diff --git a/apps/topi_recipe/rnn/matexp.py b/apps/topi_recipe/rnn/matexp.py index e2cea9b31a9d..ecf868cb5646 100644 --- a/apps/topi_recipe/rnn/matexp.py +++ b/apps/topi_recipe/rnn/matexp.py @@ -140,22 +140,22 @@ def check_device(target): } ): f = tvm.build(s, [s_scan, Whh], target) - ctx = tvm.gpu(0) if target == "cuda" else tvm.cl(0) + dev = tvm.gpu(0) if target == "cuda" else tvm.cl(0) # launch the kernel. res_np = np.zeros((n_num_step, n_batch_size, n_num_hidden)).astype("float32") Whh_np = np.zeros((n_num_hidden, n_num_hidden)).astype("float32") Whh_np[:] = 2.0 / n_num_hidden Whh_np[:, n_num_hidden // 2 :] = 0 - res_a = tvm.nd.array(res_np, ctx) - Whh_a = tvm.nd.array(Whh_np, ctx) + res_a = tvm.nd.array(res_np, dev) + Whh_a = tvm.nd.array(Whh_np, dev) # Skip first pass as it is compilation f(res_a, Whh_a) - ctx.sync() + dev.sync() # measure time cost of second step. 
tstart = time.time() f(res_a, Whh_a) - ctx.sync() + dev.sync() tgap = time.time() - tstart print("Time cost=%g" % tgap) # correctness diff --git a/apps/wasm-standalone/wasm-graph/src/types.rs b/apps/wasm-standalone/wasm-graph/src/types.rs index 9d4dff96d189..a3761a758cff 100644 --- a/apps/wasm-standalone/wasm-graph/src/types.rs +++ b/apps/wasm-standalone/wasm-graph/src/types.rs @@ -24,7 +24,7 @@ use std::{ }; pub use tvm_sys::ffi::DLTensor; use tvm_sys::ffi::{ - DLContext, DLDataType, DLDataTypeCode_kDLFloat, DLDataTypeCode_kDLInt, DLDeviceType_kDLCPU, + DLDevice, DLDataType, DLDataTypeCode_kDLFloat, DLDataTypeCode_kDLInt, DLDeviceType_kDLCPU, }; #[derive(Debug, PartialEq, Clone, Serialize, Deserialize)] @@ -114,7 +114,7 @@ impl Tensor { pub fn as_dltensor(&self) -> DLTensor { DLTensor { data: self.data.as_ptr() as *mut c_void, - ctx: DLContext { + device: DLDevice { device_type: DLDeviceType_kDLCPU, device_id: 0 as c_int, }, diff --git a/cmake/config.cmake b/cmake/config.cmake index 30c21f707c08..7b29df648ac7 100644 --- a/cmake/config.cmake +++ b/cmake/config.cmake @@ -96,14 +96,14 @@ set(USE_CPP_RPC OFF) # Whether embed stackvm into the runtime set(USE_STACKVM_RUNTIME OFF) -# Whether enable tiny embedded graph runtime. -set(USE_GRAPH_RUNTIME ON) +# Whether enable tiny embedded graph executor. +set(USE_GRAPH_EXECUTOR ON) -# Whether enable additional graph debug functions -set(USE_GRAPH_RUNTIME_DEBUG OFF) +# Whether enable tiny graph executor with CUDA Graph +set(USE_GRAPH_EXECUTOR_CUDA_GRAPH OFF) -# Whether enable additional vm profiler functions -set(USE_VM_PROFILER OFF) +# Whether to enable the profiler for the graph executor and vm +set(USE_PROFILER ON) # Whether enable uTVM standalone runtime set(USE_MICRO_STANDALONE_RUNTIME OFF) @@ -116,7 +116,7 @@ set(USE_MICRO_STANDALONE_RUNTIME OFF) # - OFF: disable llvm, note this will disable CPU codegen # which is needed for most cases # - /path/to/llvm-config: enable specific LLVM when multiple llvm-dev is available. -set(USE_LLVM ON) +set(USE_LLVM OFF) #--------------------------------------------- # Contrib libraries @@ -207,10 +207,10 @@ set(USE_DNNL_CODEGEN OFF) # # USE_ARM_COMPUTE_LIB - Support for compiling a relay graph offloading supported # operators to Arm Compute Library. OFF/ON -# USE_ARM_COMPUTE_LIB_GRAPH_RUNTIME - Run Arm Compute Library annotated functions via the ACL +# USE_ARM_COMPUTE_LIB_GRAPH_EXECUTOR - Run Arm Compute Library annotated functions via the ACL # runtime. OFF/ON/"path/to/ACL" set(USE_ARM_COMPUTE_LIB OFF) -set(USE_ARM_COMPUTE_LIB_GRAPH_RUNTIME OFF) +set(USE_ARM_COMPUTE_LIB_GRAPH_EXECUTOR OFF) # Whether to build with Arm Ethos-N support # Possible values: @@ -272,3 +272,15 @@ set(USE_HEXAGON_SDK /path/to/sdk) # Whether to use ONNX codegen set(USE_TARGET_ONNX OFF) + +# Whether enable BNNS runtime +set(USE_BNNS OFF) + +# Whether to use libbacktrace +# Libbacktrace provides line and column information on stack traces from errors. +# It is only supported on linux and macOS. +# Possible values: +# - AUTO: auto set according to system information and feasibility +# - ON: enable libbacktrace +# - OFF: disable libbacktrace +set(USE_LIBBACKTRACE AUTO) diff --git a/cmake/libs/Libbacktrace.cmake b/cmake/libs/Libbacktrace.cmake new file mode 100644 index 000000000000..742855358809 --- /dev/null +++ b/cmake/libs/Libbacktrace.cmake @@ -0,0 +1,45 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +include(ExternalProject) + +ExternalProject_Add(project_libbacktrace + PREFIX libbacktrace + SOURCE_DIR ${CMAKE_CURRENT_LIST_DIR}/../../3rdparty/libbacktrace + BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}/libbacktrace + CONFIGURE_COMMAND "${CMAKE_CURRENT_LIST_DIR}/../../3rdparty/libbacktrace/configure" + "--prefix=${CMAKE_CURRENT_BINARY_DIR}/libbacktrace" --with-pic + INSTALL_DIR "${CMAKE_CURRENT_BINARY_DIR}/libbacktrace" + BUILD_COMMAND make + INSTALL_COMMAND make install + BUILD_BYPRODUCTS "${CMAKE_CURRENT_BINARY_DIR}/libbacktrace/lib/libbacktrace.a" + "${CMAKE_CURRENT_BINARY_DIR}/libbacktrace/include/backtrace.h" + ) + +# Custom step to rebuild libbacktrace if any of the source files change +file(GLOB LIBBACKTRACE_SRCS "${CMAKE_CURRENT_LIST_DIR}/../../3rdparty/libbacktrace/*.c") +ExternalProject_Add_Step(project_libbacktrace checkout + DEPENDERS configure + DEPENDEES download + DEPENDS ${LIBBACKTRACE_SRCS} +) + +add_library(libbacktrace STATIC IMPORTED) +add_dependencies(libbacktrace project_libbacktrace) +set_property(TARGET libbacktrace + PROPERTY IMPORTED_LOCATION ${CMAKE_CURRENT_BINARY_DIR}/libbacktrace/lib/libbacktrace.a) +# create include directory so cmake doesn't complain +file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/libbacktrace/include) diff --git a/cmake/modules/CUDA.cmake b/cmake/modules/CUDA.cmake index 0ec2f1466bd1..1bdc5036f857 100644 --- a/cmake/modules/CUDA.cmake +++ b/cmake/modules/CUDA.cmake @@ -65,6 +65,17 @@ if(USE_CUDA) list(APPEND RUNTIME_SRCS ${CONTRIB_THRUST_SRC}) endif(USE_THRUST) + if(USE_GRAPH_EXECUTOR_CUDA_GRAPH) + if(NOT USE_GRAPH_EXECUTOR) + message(FATAL_ERROR "CUDA Graph is only supported by graph executor, please set USE_GRAPH_EXECUTOR=ON") + endif() + if(CUDAToolkit_VERSION_MAJOR LESS "10") + message(FATAL_ERROR "CUDA Graph requires CUDA 10 or above, got=" ${CUDAToolkit_VERSION}) + endif() + message(STATUS "Build with Graph executor with CUDA Graph support...") + file(GLOB RUNTIME_CUDA_GRAPH_SRCS src/runtime/graph_executor/cuda_graph/*.cc) + list(APPEND RUNTIME_SRCS ${RUNTIME_CUDA_GRAPH_SRCS}) + endif() else(USE_CUDA) list(APPEND COMPILER_SRCS src/target/opt/build_cuda_off.cc) endif(USE_CUDA) diff --git a/cmake/modules/LibInfo.cmake b/cmake/modules/LibInfo.cmake index 131dceeb345d..2a69d06970a8 100644 --- a/cmake/modules/LibInfo.cmake +++ b/cmake/modules/LibInfo.cmake @@ -42,8 +42,8 @@ function(add_lib_info src_file) TVM_INFO_USE_LLVM="${USE_LLVM}" TVM_INFO_LLVM_VERSION="${TVM_INFO_LLVM_VERSION}" TVM_INFO_USE_STACKVM_RUNTIME="${USE_STACKVM_RUNTIME}" - TVM_INFO_USE_GRAPH_RUNTIME="${USE_GRAPH_RUNTIME}" - TVM_INFO_USE_GRAPH_RUNTIME_DEBUG="${USE_GRAPH_RUNTIME_DEBUG}" + TVM_INFO_USE_GRAPH_EXECUTOR="${USE_GRAPH_EXECUTOR}" + TVM_INFO_USE_GRAPH_EXECUTOR_DEBUG="${USE_GRAPH_EXECUTOR_DEBUG}" TVM_INFO_USE_OPENMP="${USE_OPENMP}" TVM_INFO_USE_RELAY_DEBUG="${USE_RELAY_DEBUG}" 
TVM_INFO_USE_RTTI="${USE_RTTI}" @@ -73,7 +73,7 @@ function(add_lib_info src_file) TVM_INFO_USE_COREML="${USE_COREML}" TVM_INFO_USE_TARGET_ONNX="${USE_TARGET_ONNX}" TVM_INFO_USE_ARM_COMPUTE_LIB="${USE_ARM_COMPUTE_LIB}" - TVM_INFO_USE_ARM_COMPUTE_LIB_GRAPH_RUNTIME="${USE_ARM_COMPUTE_LIB_GRAPH_RUNTIME}" + TVM_INFO_USE_ARM_COMPUTE_LIB_GRAPH_EXECUTOR="${USE_ARM_COMPUTE_LIB_GRAPH_EXECUTOR}" TVM_INFO_INDEX_DEFAULT_I64="${INDEX_DEFAULT_I64}" TVM_CXX_COMPILER_PATH="${CMAKE_CXX_COMPILER}" ) diff --git a/cmake/modules/Logging.cmake b/cmake/modules/Logging.cmake new file mode 100644 index 000000000000..91c0fd07b676 --- /dev/null +++ b/cmake/modules/Logging.cmake @@ -0,0 +1,46 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# This script configures the logging module and dependency on libbacktrace + +if("${USE_LIBBACKTRACE}" STREQUAL "AUTO") + if(CMAKE_SYSTEM_NAME MATCHES "Linux") + set(USE_LIBBACKTRACE ON) + else() + set(USE_LIBBACKTRACE OFF) + endif() + message(STATUS "Autoset: USE_LIBBACKTRACE=" ${USE_LIBBACKTRACE} " in " ${CMAKE_SYSTEM_NAME}) +endif() + + +if(USE_LIBBACKTRACE) + message(STATUS "Building with libbacktrace...") + include(cmake/libs/Libbacktrace.cmake) + target_link_libraries(tvm PRIVATE libbacktrace) + target_link_libraries(tvm_runtime PRIVATE libbacktrace) + add_dependencies(tvm_runtime_objs libbacktrace) + # pre 3.12 versions of cmake cannot propagate include directories from imported targets so we set them manually + target_include_directories(tvm PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/libbacktrace/include") + target_include_directories(tvm_objs PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/libbacktrace/include") + target_include_directories(tvm_runtime PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/libbacktrace/include") + target_include_directories(tvm_runtime_objs PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/libbacktrace/include") + target_compile_definitions(tvm_objs PRIVATE TVM_USE_LIBBACKTRACE=1) + target_compile_definitions(tvm_runtime_objs PRIVATE TVM_USE_LIBBACKTRACE=1) +else() + target_compile_definitions(tvm_objs PRIVATE TVM_USE_LIBBACKTRACE=0) + target_compile_definitions(tvm_runtime_objs PRIVATE TVM_USE_LIBBACKTRACE=0) +endif() diff --git a/cmake/modules/StandaloneCrt.cmake b/cmake/modules/StandaloneCrt.cmake index dc1b3b2665f2..fe6baf81c3e5 100644 --- a/cmake/modules/StandaloneCrt.cmake +++ b/cmake/modules/StandaloneCrt.cmake @@ -43,8 +43,8 @@ if(USE_MICRO) "src/runtime/crt Makefile -> ." 
"src/runtime/crt/include *.h -> include" "src/runtime/crt/common *.c -> src/runtime/crt/common" - "src/runtime/crt/graph_runtime *.c -> src/runtime/crt/graph_runtime" - "src/runtime/crt/graph_runtime_module *.c -> src/runtime/crt/graph_runtime_module" + "src/runtime/crt/graph_executor *.c -> src/runtime/crt/graph_executor" + "src/runtime/crt/graph_executor_module *.c -> src/runtime/crt/graph_executor_module" "src/runtime/crt/host crt_config.h -> template/host" "src/runtime/crt/host *.cc -> template/host" "src/runtime/crt/memory *.c -> src/runtime/crt/memory" @@ -97,7 +97,7 @@ if(USE_MICRO) set(make_quiet ) endif(${VERBOSE}) - list(APPEND crt_libraries memory graph_runtime utvm_rpc_server utvm_rpc_common common) # NOTE: listed in link order. + list(APPEND crt_libraries memory graph_executor utvm_rpc_server utvm_rpc_common common) # NOTE: listed in link order. foreach(crt_lib_name IN LISTS crt_libraries) list(APPEND crt_library_paths "host_standalone_crt/lib${crt_lib_name}.a") endforeach() diff --git a/cmake/modules/VTA.cmake b/cmake/modules/VTA.cmake index 115216680fff..58b58d231d83 100644 --- a/cmake/modules/VTA.cmake +++ b/cmake/modules/VTA.cmake @@ -60,6 +60,7 @@ elseif(PYTHON) # Target lib: vta_fsim add_library(vta_fsim SHARED ${FSIM_RUNTIME_SRCS}) target_include_directories(vta_fsim SYSTEM PUBLIC ${VTA_HW_PATH}/include) + target_compile_definitions(vta_fsim PUBLIC DMLC_USE_LOGGING_LIBRARY=) foreach(__def ${VTA_DEFINITIONS}) string(SUBSTRING ${__def} 3 -1 __strip_def) target_compile_definitions(vta_fsim PUBLIC ${__strip_def}) @@ -81,6 +82,7 @@ elseif(PYTHON) # Target lib: vta_tsim add_library(vta_tsim SHARED ${TSIM_RUNTIME_SRCS}) target_include_directories(vta_tsim SYSTEM PUBLIC ${VTA_HW_PATH}/include) + target_compile_definitions(vta_tsim PUBLIC DMLC_USE_LOGGING_LIBRARY=) foreach(__def ${VTA_DEFINITIONS}) string(SUBSTRING ${__def} 3 -1 __strip_def) target_compile_definitions(vta_tsim PUBLIC ${__strip_def}) @@ -107,6 +109,7 @@ elseif(PYTHON) add_library(vta SHARED ${FPGA_RUNTIME_SRCS}) target_include_directories(vta PUBLIC vta/runtime) target_include_directories(vta PUBLIC ${VTA_HW_PATH}/include) + target_compile_definitions(vta PUBLIC DMLC_USE_LOGGING_LIBRARY=) foreach(__def ${VTA_DEFINITIONS}) string(SUBSTRING ${__def} 3 -1 __strip_def) target_compile_definitions(vta PUBLIC ${__strip_def}) diff --git a/cmake/modules/contrib/ArmComputeLib.cmake b/cmake/modules/contrib/ArmComputeLib.cmake index ba082505125b..54ce917dfb50 100644 --- a/cmake/modules/contrib/ArmComputeLib.cmake +++ b/cmake/modules/contrib/ArmComputeLib.cmake @@ -23,17 +23,25 @@ if(USE_ARM_COMPUTE_LIB) file(GLOB ACL_RELAY_CONTRIB_SRC src/relay/backend/contrib/arm_compute_lib/*.cc) file(GLOB ACL_RUNTIME_MODULE src/runtime/contrib/arm_compute_lib/acl_runtime.cc) list(APPEND COMPILER_SRCS ${ACL_RELAY_CONTRIB_SRC}) - if(NOT USE_ARM_COMPUTE_LIB_GRAPH_RUNTIME) + + if(NOT USE_ARM_COMPUTE_LIB_GRAPH_EXECUTOR) list(APPEND COMPILER_SRCS ${ACL_RUNTIME_MODULE}) endif() message(STATUS "Build with Arm Compute Library support...") endif() -if(USE_ARM_COMPUTE_LIB_GRAPH_RUNTIME) +if(USE_ARM_COMPUTE_LIB_GRAPH_RUNTIME AND NOT DEFINED USE_ARM_COMPUTE_LIB_GRAPH_EXECUTOR) + message(WARNING "USE_ARM_COMPUTE_LIB_GRAPH_RUNTIME renamed to USE_ARM_COMPUTE_LIB_GRAPH_EXECUTOR. 
" + "Please update your config.cmake") + set(USE_ARM_COMPUTE_LIB_GRAPH_EXECUTOR ${USE_ARM_COMPUTE_LIB_GRAPH_RUNTIME}) + unset(USE_ARM_COMPUTE_LIB_GRAPH_RUNTIME CACHE) +endif(USE_ARM_COMPUTE_LIB_GRAPH_RUNTIME AND NOT DEFINED USE_ARM_COMPUTE_LIB_GRAPH_EXECUTOR) + +if(USE_ARM_COMPUTE_LIB_GRAPH_EXECUTOR) set(ACL_PATH ${CMAKE_CURRENT_SOURCE_DIR}/acl) # Detect custom ACL path. - if (NOT USE_ARM_COMPUTE_LIB_GRAPH_RUNTIME STREQUAL "ON") - set(ACL_PATH ${USE_ARM_COMPUTE_LIB_GRAPH_RUNTIME}) + if (NOT USE_ARM_COMPUTE_LIB_GRAPH_EXECUTOR STREQUAL "ON") + set(ACL_PATH ${USE_ARM_COMPUTE_LIB_GRAPH_EXECUTOR}) endif() file(GLOB ACL_CONTRIB_SRC src/runtime/contrib/arm_compute_lib/*) @@ -60,11 +68,11 @@ if(USE_ARM_COMPUTE_LIB_GRAPH_RUNTIME) list(APPEND TVM_RUNTIME_LINKER_LIBS ${EXTERN_ACL_COMPUTE_CORE_LIB}) list(APPEND TVM_RUNTIME_LINKER_LIBS ${EXTERN_ACL_COMPUTE_GRAPH_LIB}) list(APPEND RUNTIME_SRCS ${ACL_CONTRIB_SRC}) - message(STATUS "Build with Arm Compute Library graph runtime support: " + message(STATUS "Build with Arm Compute Library graph executor support: " ${EXTERN_ACL_COMPUTE_LIB} ", \n" ${EXTERN_ACL_COMPUTE_CORE_LIB} ", \n" ${EXTERN_ACL_COMPUTE_GRAPH_LIB}) - # Set flag to detect ACL graph runtime support. - add_definitions(-DTVM_GRAPH_RUNTIME_ARM_COMPUTE_LIB) + # Set flag to detect ACL graph executor support. + add_definitions(-DTVM_GRAPH_EXECUTOR_ARM_COMPUTE_LIB) endif() diff --git a/cmake/modules/contrib/BNNS.cmake b/cmake/modules/contrib/BNNS.cmake new file mode 100644 index 000000000000..e14aa2857ebc --- /dev/null +++ b/cmake/modules/contrib/BNNS.cmake @@ -0,0 +1,30 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +if(USE_BNNS STREQUAL "ON") + add_definitions(-DUSE_JSON_RUNTIME=1) + file(GLOB BNNS_RELAY_CONTRIB_SRC src/relay/backend/contrib/bnns/*.cc) + list(APPEND COMPILER_SRCS ${BNNS_RELAY_CONTRIB_SRC}) + list(APPEND COMPILER_SRCS ${JSON_RELAY_CONTRIB_SRC}) + + list(APPEND TVM_RUNTIME_LINKER_LIBS "-framework Accelerate") + + file(GLOB BNNS_CONTRIB_SRC src/runtime/contrib/bnns/*.cc) + list(APPEND RUNTIME_SRCS ${BNNS_CONTRIB_SRC}) + message(STATUS "Build with BNNS JSON runtime: " ${EXTERN_LIBRARY_BNNS}) +endif() + diff --git a/cmake/modules/contrib/TensorRT.cmake b/cmake/modules/contrib/TensorRT.cmake index 0c7e43c0fcf8..218f0b2e20fe 100644 --- a/cmake/modules/contrib/TensorRT.cmake +++ b/cmake/modules/contrib/TensorRT.cmake @@ -55,5 +55,5 @@ if(USE_TENSORRT_RUNTIME) list(APPEND RUNTIME_SRCS ${RUNTIME_TENSORRT_SRCS}) # Set defines - add_definitions(-DTVM_GRAPH_RUNTIME_TENSORRT) + add_definitions(-DTVM_GRAPH_EXECUTOR_TENSORRT) endif() diff --git a/conda/recipe/bld.bat b/conda/recipe/bld.bat index 9fc0469febc6..e877b8fda1e1 100644 --- a/conda/recipe/bld.bat +++ b/conda/recipe/bld.bat @@ -28,7 +28,7 @@ cmake ^ -DUSE_CPP_RPC=ON ^ -DUSE_SORT=ON ^ -DUSE_RANDOM=ON ^ - -DUSE_GRAPH_RUNTIME_DEBUG=ON ^ + -DUSE_GRAPH_EXECUTOR_DEBUG=ON ^ -DINSTALL_DEV=ON ^ %SRC_DIR% diff --git a/conda/recipe/build.sh b/conda/recipe/build.sh index c9e76314da31..a94b9df72440 100755 --- a/conda/recipe/build.sh +++ b/conda/recipe/build.sh @@ -49,9 +49,10 @@ cmake -DCMAKE_INSTALL_PREFIX="${PREFIX}" \ -DUSE_CPP_RPC=OFF \ -DUSE_SORT=ON \ -DUSE_RANDOM=ON \ - -DUSE_GRAPH_RUNTIME_DEBUG=ON \ + -DUSE_GRAPH_EXECUTOR_DEBUG=ON \ -DUSE_LLVM=ON \ -DINSTALL_DEV=ON \ + -DUSE_LIBBACKTRACE=AUTO \ ${GPU_OPT} ${TOOLCHAIN_OPT} \ ${SRC_DIR} diff --git a/docker/Dockerfile.ci_gpu b/docker/Dockerfile.ci_gpu index ac76af6b0a1e..a44677f5ce56 100644 --- a/docker/Dockerfile.ci_gpu +++ b/docker/Dockerfile.ci_gpu @@ -107,8 +107,8 @@ ENV PATH=/usr/local/nvidia/bin:${PATH} ENV PATH=/usr/local/cuda/bin:${PATH} ENV CPLUS_INCLUDE_PATH=/usr/local/cuda/include:${CPLUS_INCLUDE_PATH} ENV C_INCLUDE_PATH=/usr/local/cuda/include:${C_INCLUDE_PATH} -ENV LIBRARY_PATH=/usr/local/cuda/lib64:/usr/local/cuda/compact:${LIBRARY_PATH} -ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64:/usr/local/cuda/compact:${LD_LIBRARY_PATH} +ENV LIBRARY_PATH=/usr/local/cuda/lib64:/usr/local/cuda/compat:${LIBRARY_PATH} +ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64:/usr/local/cuda/compat:${LD_LIBRARY_PATH} ENV LD_LIBRARY_PATH=/opt/rocm/lib:${LD_LIBRARY_PATH} ENV PATH=/node_modules/.bin:${PATH} diff --git a/docker/Dockerfile.ci_qemu b/docker/Dockerfile.ci_qemu index 9120f91cc69f..04434b7186b0 100644 --- a/docker/Dockerfile.ci_qemu +++ b/docker/Dockerfile.ci_qemu @@ -64,3 +64,7 @@ RUN bash /install/ubuntu_install_qemu.sh COPY install/ubuntu_install_zephyr.sh /install/ubuntu_install_zephyr.sh RUN bash /install/ubuntu_install_zephyr.sh ENV ZEPHYR_BASE=/opt/zephyrproject/zephyr + +# Install ONNX +COPY install/ubuntu_install_onnx.sh /install/ubuntu_install_onnx.sh +RUN bash /install/ubuntu_install_onnx.sh diff --git a/docker/Dockerfile.demo_android b/docker/Dockerfile.demo_android index 039439a937e9..f56f56728e70 100644 --- a/docker/Dockerfile.demo_android +++ b/docker/Dockerfile.demo_android @@ -61,7 +61,7 @@ RUN cd /usr && \ -DUSE_LLVM=llvm-config-8 \ -DUSE_RPC=ON \ -DUSE_SORT=ON \ - -DUSE_GRAPH_RUNTIME=ON \ + -DUSE_GRAPH_EXECUTOR=ON \ -DUSE_VULKAN=ON \ .. 
&& \
     make -j10
diff --git a/docker/build.sh b/docker/build.sh
index bd13937b2571..f966e22d035b 100755
--- a/docker/build.sh
+++ b/docker/build.sh
@@ -20,11 +20,15 @@
 #
 # Execute command within a docker container
 #
-# Usage: build.sh <CONTAINER_TYPE> [--dockerfile <DOCKERFILE_PATH>] [-it]
+# Usage: build.sh <CONTAINER_TYPE> [--tag <DOCKER_IMAGE_TAG>]
+#                 [--dockerfile <DOCKERFILE_PATH>] [-it]
 #                 [--net=host] [--cache-from <IMAGE_NAME>]
 #
-# CONTAINER_TYPE: Type of the docker container used the run the build: e.g.,
-#                 (cpu | gpu)
+# CONTAINER_TYPE: Type of the docker container used to run the build,
+#                 e.g. "ci_cpu", "ci_gpu"
+#
+# DOCKER_IMAGE_TAG: (Optional) Docker image tag to be built and used.
+#                   Defaults to 'latest', as it is the default Docker tag.
 #
 # DOCKERFILE_PATH: (Optional) Path to the Dockerfile used for docker build.  If
 #                  this optional value is not supplied (via the --dockerfile
@@ -45,6 +49,12 @@ shift 1
 DOCKERFILE_PATH="${SCRIPT_DIR}/Dockerfile.${CONTAINER_TYPE}"
 DOCKER_CONTEXT_PATH="${SCRIPT_DIR}"
 
+if [[ "$1" == "--tag" ]]; then
+    DOCKER_IMAGE_TAG="$2"
+    echo "Using custom Docker tag: ${DOCKER_IMAGE_TAG}"
+    shift 2
+fi
+
 if [[ "$1" == "--dockerfile" ]]; then
     DOCKERFILE_PATH="$2"
     DOCKER_CONTEXT_PATH=$(dirname "${DOCKERFILE_PATH}")
@@ -114,6 +124,7 @@ function upsearch () {
 # reasonable defaults if you run it outside of Jenkins.
 WORKSPACE="${WORKSPACE:-${SCRIPT_DIR}/../}"
 BUILD_TAG="${BUILD_TAG:-tvm}"
+DOCKER_IMAGE_TAG="${DOCKER_IMAGE_TAG:-latest}"
 
 # Determine the docker image name
 DOCKER_IMG_NAME="${BUILD_TAG}.${CONTAINER_TYPE}"
@@ -125,6 +136,9 @@ DOCKER_IMG_NAME=$(echo "${DOCKER_IMG_NAME}" | sed -e 's/=/_/g' -e 's/,/-/g')
 # Convert to all lower-case, as per requirement of Docker image names
 DOCKER_IMG_NAME=$(echo "${DOCKER_IMG_NAME}" | tr '[:upper:]' '[:lower:]')
 
+# Compose the full image spec with "name:tag" e.g. "tvm.ci_cpu:v0.03"
+DOCKER_IMG_SPEC="${DOCKER_IMG_NAME}:${DOCKER_IMAGE_TAG}"
+
 # Print arguments.
 echo "WORKSPACE: ${WORKSPACE}"
 echo "CI_DOCKER_EXTRA_PARAMS: ${CI_DOCKER_EXTRA_PARAMS[@]}"
@@ -132,12 +146,14 @@ echo "COMMAND: ${COMMAND[@]}"
 echo "CONTAINER_TYPE: ${CONTAINER_TYPE}"
 echo "BUILD_TAG: ${BUILD_TAG}"
 echo "DOCKER CONTAINER NAME: ${DOCKER_IMG_NAME}"
+echo "DOCKER_IMAGE_TAG: ${DOCKER_IMAGE_TAG}"
+echo "DOCKER_IMG_SPEC: ${DOCKER_IMG_SPEC}"
 echo ""
 
 # Build the docker container.
 echo "Building container (${DOCKER_IMG_NAME})..."
 
-docker build -t ${DOCKER_IMG_NAME} \
+docker build -t ${DOCKER_IMG_SPEC} \
     -f "${DOCKERFILE_PATH}" \
     ${CI_DOCKER_BUILD_EXTRA_PARAMS[@]} \
     "${DOCKER_CONTEXT_PATH}"
@@ -149,7 +165,7 @@ if [[ $? != "0" ]]; then
 fi
 
 # Run the command inside the container.
-echo "Running '${COMMAND[@]}' inside ${DOCKER_IMG_NAME}..."
+echo "Running '${COMMAND[@]}' inside ${DOCKER_IMG_SPEC}..."
# By default we cleanup - remove the container once it finish running (--rm) # and share the PID namespace (--pid=host) so the process inside does not have @@ -167,6 +183,6 @@ ${DOCKER_BINARY} run --rm --pid=host \ -e "CI_IMAGE_NAME=${DOCKER_IMAGE_NAME}" \ ${CUDA_ENV}\ ${CI_DOCKER_EXTRA_PARAMS[@]} \ - ${DOCKER_IMG_NAME} \ + ${DOCKER_IMG_SPEC} \ bash --login docker/with_the_same_user \ ${COMMAND[@]} diff --git a/docker/install/install_tvm_cpu.sh b/docker/install/install_tvm_cpu.sh index c3a15fa26b6d..48e6df3597db 100755 --- a/docker/install/install_tvm_cpu.sh +++ b/docker/install/install_tvm_cpu.sh @@ -27,7 +27,7 @@ cd /usr/tvm git checkout 4b13bf668edc7099b38d463e5db94ebc96c80470 echo set\(USE_LLVM llvm-config-8\) >> config.cmake -echo set\(USE_GRAPH_RUNTIME ON\) >> config.cmake +echo set\(USE_GRAPH_EXECUTOR ON\) >> config.cmake echo set\(USE_BLAS openblas\) >> config.cmake mkdir -p build cd build diff --git a/docker/install/ubuntu_install_ethosn_driver_stack.sh b/docker/install/ubuntu_install_ethosn_driver_stack.sh index 15b93bbdf901..e73d1c35e2d2 100755 --- a/docker/install/ubuntu_install_ethosn_driver_stack.sh +++ b/docker/install/ubuntu_install_ethosn_driver_stack.sh @@ -22,7 +22,7 @@ set -o pipefail repo_url="https://github.com/Arm-software/ethos-n-driver-stack" repo_dir="ethosn-driver" -repo_revision="20.08" +repo_revision="21.02" install_path="/opt/arm/$repo_dir" tmpdir=$(mktemp -d) diff --git a/docker/install/ubuntu_install_zephyr.sh b/docker/install/ubuntu_install_zephyr.sh index e10ff48212d1..1654ab9d1c09 100644 --- a/docker/install/ubuntu_install_zephyr.sh +++ b/docker/install/ubuntu_install_zephyr.sh @@ -33,73 +33,8 @@ sudo apt-get install -y --no-install-recommends \ python3-dev python3-pip python3-setuptools python3-tk python3-wheel python3-venv \ xz-utils file make gcc gcc-multilib g++-multilib apt-transport-https -cat </dev/null ------BEGIN PGP PUBLIC KEY BLOCK----- - -mQINBF0bjnMBEADVgQr04Lg258KpWi42rzGemFGkzHCx7SXDWVqHApx34HUxF63s -RnknCTt42Thqcv78CJ9WQYjjvT5+FZOlxA+0kwkeatFoKNeVvBkyYFgU6gxSuVQ+ -a1ZEw2IYdqRH+vUC1AKGY88KlrteTAqtqYsaGimiF5ry3y3bLBySyxLHfltCaENy -uKPJEHHvHxTZsZAD3iwVysNZkw2V/V4IS8wy8m9rq1U7OU40KMJ3EUan89DzD1qt -8sroEThsjE9IG6QMf1H9pvNIIz/QhwqSKQkGqt8obdf0W+EB4cef6ka98a+E6slc -Otw2AVB2B47ljnp5AyLwZPiYxeIXPZsO8cZbx1uBOkOZ1OkqHlk4tgJEqg+v6APO -cm625fk4iftsB+U/3MZvm4QH4Y5xfAFb3aDL2zkxN/EUCWW5tUn+Z+RaegGaojTE -N2laH91ncpeZh1M9GPvXGT/efDg3a/Nv9UNUtv9lhNn35VyVgBNaaYwNScq5+ApV -pG8b/j18x8mQR8kk7bXvOXjc/4NdCrY7QcIExA9DTWemLsDVeDM62lBvOKZGED4X -fgGehGGPtu862kf4vvCZKrrEeVkVBrTiOsxFMdHshnKqtQyyJQKXXVjl9//jhMGM -cZHJ5+D9O4JNE/aZC4h2F7hL0NpO0AVGJ0Ly5N7B07yMBZGGJaH4QXCoHwARAQAB -tEVLaXR3YXJlIEFwdCBBcmNoaXZlIEF1dG9tYXRpYyBTaWduaW5nIEtleSAoMjAy -MCkgPGRlYmlhbkBraXR3YXJlLmNvbT6JAlQEEwEKAD4WIQRtkDmVQkqDpI1C1T2o -5e86AmACaAUCXRuOcwIbAwUJBaOagAULCQgHAgYVCgkICwIEFgIDAQIeAQIXgAAK -CRCo5e86AmACaLJ9D/9ly840Ko3F0HgIAAxAeWE7BzQOD09BbnL/is6F0lquXd/W -fZXUbVhONv7Q3FK9IDwzKoYHmRrwo6IpDIsy7AqiHHkWWxCdpIzVWQfE7rFg4UWa -2bNXoFBGRImYmQHaG/02EJiNnTDnsYgN7y9zzAAvz63dnSsm6GOUp9pkIoxHnt9D -WxMlM05GgVRjSeNvi4OLuPE6jHhHvAGGrMS8g9oU6TtCj9WVNryFpROchdmTteS4 -P16FP4n5NczXjYXFch3S+cOfijHnsfuFzB4JanrZ+JlBd21BDfhO/VLFx8+Ljdj0 -axKpwa86oHc5ALnMHPnGM2EVN+NNS88PDBngvJEpRUkECpEy4cwZ3zjCJ0jMeiRv -cFf/FjZBFeqrAapwWNFjIH0El7dJq+XYDKuA0kakMDo6GZlfTNDRobGj9vR/HA0j -/a7VD9tfW2dLr61qsQwynn6S+9B4XY/fYwc4AyYCp+FNm4ONFFjQ6ytCgdLdBEWK -X+xCMifTqDx9nm/1u/95ZqwcayAqwhKDb01hQhSTlozybz8B4trfeHJdXYoH7/s4 -TLnt5R68bc2Fm0ikk4tndSTH4SUtnEeIv+nap5RkCmHI6URJ4P4kFT3C30Ooeafv 
-GOa18HYxhb/qnU2DvWXL1rnKoKB51p2nhrkjliDPSfMMIf6AgyZSZR4BpMoz47kC -DQRdG49RARAAyX+HK4Xh0RiiqPd0DDbgHV+8UvY1gihObyK/cqpRQzewSKEw/jwM -abwav3oqisI7IFp4FmupqhSi7uqB54eUF44LHeGZiUedZm5pAreX2ygQASr2It3g -kWr58J0ZOas6cRqUzga1mCL1eljfff9T9+1syIWiUWTjXDzEwsMgksHIn9ZGaxM0 -zvkbXfTCmlzmCbvBwokHRrw9cvmXZIKaGdvAEg/S9asmkRBeA/0GgX2Tlr1H6mv5 -0ZYF25t0n2IYiVuvXTOrz9OCuWxv0NQiweMFTi62sN6myjB4PC499ySTQkIhWVsf -2oa5+rvcCg6j3jpUFy4MoDA9cXl47/0ccpim+mwJo4uY4ysIsDq7mKqjN7honj45 -zosvs7yd2UXrKWKay+P1e1vdsSOVP8PSSJCJV8HvdKCRfcYYdlTq3PSeloVrWC/4 -PCKlnp16AzPzL+CBWtj5ruhAFTaoKveEjUnQD5IiKD4hvt9nnX6C9RT2yhKDHnoi -uup+nXOEn78UWxmoPJKu5wE1c5ZZhw81bYByEkLjHt5Bl+FS8CJN3G+56kVuBc9v -Kqa90EThcLr6bIEx3LU3mK3FBxBCh+7xEPYI4Cx/NGyrszirRkzIIM6wIxc983l5 -+BtdKn14b2yDhfw2wOBsxo4aLWhGzqJGQAxuUo4sdNbElT5mpwpDxJEAEQEAAYkE -cgQYAQoAJhYhBG2QOZVCSoOkjULVPajl7zoCYAJoBQJdG49RAhsCBQkFo5qAAkAJ -EKjl7zoCYAJowXQgBBkBCgAdFiEEWbJ5MHJjJForbwaGKR+f9v04V4MFAl0bj1EA -CgkQKR+f9v04V4Mz9Q/9E0KmNCJC95HfP46enwASVnBZ7ntlHvtqQgNVZ8r0W69v -qg+FdsKK2109lR3RvRe5TAwHi4ryFW6YazmvH4k7Bd1pGxAtc5VSuehgs8lPGObo -SKI8S9EH+v3G4IAm25vaRDtnVdDpcfn5A6RrSDyTTDjdhyTp9w/f62SfMryf/0fv -yg5HS5JQSHBJdxN6mnRDqM66Ey4plfFbt4yKJIPnj5xsa19wx72Zw8hED1O6FZAV -URQ8ffE521R9wzQAfX3746pdEQ+S21Ht0lEsNjmU/HDq0WeOBElIN6S09XQyL0zG -0HrOZkByI5683v+cp6clJKxnBX7hsR0+4AxQK0+eNJEtLwLPcwObBi2ACeUG49cA -ms+BaSAvjbyCW4M7ye42zdEFbWS9hfK6T5Ry85Pv6IxgpUHAX7kvtqHxguDobuUZ -4CmSdRyBCEAN7dgjWrqrtmq7cF3Kwz5kLwzB0AeQTArLoYlBSlcx/eT/jDLZdFjQ -Ol6uqVdv63BADNriYExz++g4A02LzAfk+C0J/7syKeEs5nonIFwTfrS7VJbcs7Cn -8HkuCPuH9u1nYSJV8U7xYNCbRK3JNBr20IlO+TXAuf7M3z5IuZjED7EtG0kMyl41 -vbBYCFbKMpEEjFAUUO5CsbyL4IoYJRptJij10RsDI9jRY+YfOQ+WxP4txPDv1Eei -eBAAs2PDWG7MvubB1wE3QcRUEQqvDbEIdvRfz9YIOXfGlaDfiuhBpcxsgsDG/IjQ -3c0PnJqpLpivfOMMyfynwPRW4ZiwIUSrOYJ6xhOt3zUzqf/GfIB39pCz3AI0EBxp -uicL4PJ4OeA0V3XT+IEcjbqBaVz5UCS/sVuYTykxwk8BPYaJOFlHtp4kEtn43kpL -kQHPMQCC1+skI85d0YG7Yn1w5qSqtwYJBPFU2OWpyLHtxL55S8dAWmvlkKmA1I6W -WyOPM/Y5WWdG8BUphXmv67wdeVdxp4s5V8oXKy3QQ0FA5Wt/z6l7Ei8tXcOIgDYw -nYgTgjOprZPXOY+L+6gED3YVWUvAJ6xhdYVsJazu3Ulwr4dwkHrBd1qXe7NGA3Ib -7VAkzkPzRtdPJ+OT/YX0vfh3a4VvYepoTAHIf0J6Uo2vcqBFA/Ztiby3bM4T4C30 -c5AqQkLDZ/2UbBW9Yu4f9oiw7/gDdNI7C8xHaQNLFzzRzhjnEpjwBhlpeballXoU -6ShFo6T0CzZ1N46iumJ5nTor40dY2EcX+dXxGCJ2ihifIeHrbx6fKFOB9VLV3VpW -SzLJTT9ARIgvqVg5lhTFiKRiZNp5MAu9NFw5wgyCJxUjASLOWshMwkhKHHe13AZD -2Hxmkp7Qwjg6kihr/j03NQIBhOK+M068Urew/dbndYwIzsI= -=0GnF ------END PGP PUBLIC KEY BLOCK----- - -EOF +wget --no-verbose https://apt.kitware.com/keys/kitware-archive-latest.asc +sudo apt-key add kitware-archive-latest.asc sudo apt-add-repository 'deb https://apt.kitware.com/ubuntu/ bionic main' sudo apt-get update diff --git a/docs/api/python/graph_runtime.rst b/docs/api/python/graph_executor.rst similarity index 92% rename from docs/api/python/graph_runtime.rst rename to docs/api/python/graph_executor.rst index d82c7ce00e2e..3f8811553ba4 100644 --- a/docs/api/python/graph_runtime.rst +++ b/docs/api/python/graph_executor.rst @@ -15,7 +15,7 @@ specific language governing permissions and limitations under the License. -tvm.contrib.graph_runtime +tvm.contrib.graph_executor ------------------------- -.. automodule:: tvm.contrib.graph_runtime +.. 
automodule:: tvm.contrib.graph_executor :members: diff --git a/docs/api/python/index.rst b/docs/api/python/index.rst index a6179684413d..76322a1acfe2 100644 --- a/docs/api/python/index.rst +++ b/docs/api/python/index.rst @@ -44,6 +44,6 @@ Python API rpc micro contrib - graph_runtime + graph_executor topi vta/index diff --git a/docs/api/python/relay/backend.rst b/docs/api/python/relay/backend.rst index c30f226e8437..ffe8a9a8ce79 100644 --- a/docs/api/python/relay/backend.rst +++ b/docs/api/python/relay/backend.rst @@ -26,7 +26,7 @@ tvm.relay.backend .. automodule:: tvm.relay.backend.compile_engine :members: -.. automodule:: tvm.relay.backend.graph_runtime_codegen +.. automodule:: tvm.relay.backend.graph_executor_codegen :members: .. automodule:: tvm.relay.backend.vm diff --git a/docs/conf.py b/docs/conf.py index ad838f767f80..85ce4a6e0663 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -210,10 +210,14 @@ # The unlisted files always appear after listed files. within_subsection_order = { "get_started": [ - "relay_quick_start.py", - "tensor_expr_get_started.py", + "introduction.py", + "install.py", "tvmc_command_line_driver.py", + "tensor_expr_get_started.py", + "autotvm_matmul.py", + "autoschedule_matmul.py", "cross_compilation_and_rpc.py", + "relay_quick_start.py", ], "frontend": [ "from_pytorch.py", diff --git a/docs/contribute/code_guide.rst b/docs/contribute/code_guide.rst index f6de6c158001..0ed2ce4ca9e1 100644 --- a/docs/contribute/code_guide.rst +++ b/docs/contribute/code_guide.rst @@ -91,7 +91,7 @@ If you want your test to run over a variety of targets, use the :py:func:`tvm.te .. code:: python @tvm.testing.parametrize_targets - def test_mytest(target, ctx): + def test_mytest(target, dev): ... will run ``test_mytest`` with ``target="llvm"``, ``target="cuda"``, and few others. This also ensures that your test is run on the correct hardware by the CI. If you only want to test against a couple targets use ``@tvm.testing.parametrize_targets("target_1", "target_2")``. If you want to test on a single target, use the associated decorator from :py:func:`tvm.testing`. For example, CUDA tests use the ``@tvm.testing.requires_cuda`` decorator. diff --git a/docs/deploy/arm_compute_lib.rst b/docs/deploy/arm_compute_lib.rst index 5d11241c1a34..4e43682a240a 100644 --- a/docs/deploy/arm_compute_lib.rst +++ b/docs/deploy/arm_compute_lib.rst @@ -52,7 +52,7 @@ We recommend two different ways to build and install ACL: mv ./linux--neon/* . -In both cases you will need to set USE_ARM_COMPUTE_LIB_GRAPH_RUNTIME to the path where the ACL package +In both cases you will need to set USE_ARM_COMPUTE_LIB_GRAPH_EXECUTOR to the path where the ACL package is located. Cmake will look in /path-to-acl/ along with /path-to-acl/lib and /path-to-acl/build for the required binaries. See the section below for more information on how to use these configuration options. @@ -64,15 +64,15 @@ because ACL cannot be used on an x86 machine. However, we still want to be able runtime module on an x86 machine. * USE_ARM_COMPUTE_LIB=ON/OFF - Enabling this flag will add support for compiling an ACL runtime module. -* USE_ARM_COMPUTE_LIB_GRAPH_RUNTIME=ON/OFF/path-to-acl - Enabling this flag will allow the graph runtime to +* USE_ARM_COMPUTE_LIB_GRAPH_EXECUTOR=ON/OFF/path-to-acl - Enabling this flag will allow the graph executor to compute the ACL offloaded functions. These flags can be used in different scenarios depending on your setup. 
For example, if you want to compile an ACL module on an x86 machine and then run the module on a remote Arm device via RPC, you will
-need to use USE_ARM_COMPUTE_LIB=ON on the x86 machine and USE_ARM_COMPUTE_LIB_GRAPH_RUNTIME=ON on the remote
+need to use USE_ARM_COMPUTE_LIB=ON on the x86 machine and USE_ARM_COMPUTE_LIB_GRAPH_EXECUTOR=ON on the remote
 AArch64 device.
 
-By default both options are set to OFF. Using USE_ARM_COMPUTE_LIB_GRAPH_RUNTIME=ON will mean that ACL
+By default both options are set to OFF. Using USE_ARM_COMPUTE_LIB_GRAPH_EXECUTOR=ON will mean that ACL
 binaries are searched for by cmake in the default locations
 (see https://cmake.org/cmake/help/v3.4/command/find_library.html). In addition to this,
 /path-to-tvm-project/acl/ will also be searched. It is likely that you will need to set your own path to
@@ -83,7 +83,7 @@ These flags should be set in your config.cmake file. For example:
 
 .. code:: cmake
 
     set(USE_ARM_COMPUTE_LIB ON)
-    set(USE_ARM_COMPUTE_LIB_GRAPH_RUNTIME /path/to/acl)
+    set(USE_ARM_COMPUTE_LIB_GRAPH_EXECUTOR /path/to/acl)
 
 
 Usage
@@ -148,9 +148,9 @@ https://tvm.apache.org/docs/tutorials/get_started/cross_compilation_and_rpc.html
 
 .. code:: python
 
-    ctx = tvm.cpu(0)
+    dev = tvm.cpu(0)
     loaded_lib = tvm.runtime.load_module('lib_acl.so')
-    gen_module = tvm.contrib.graph_runtime.GraphModule(loaded_lib['default'](ctx))
+    gen_module = tvm.contrib.graph_executor.GraphModule(loaded_lib['default'](dev))
     d_data = np.random.uniform(0, 1, data_shape).astype(data_type)
     map_inputs = {'data': d_data}
     gen_module.set_input(**map_inputs)
diff --git a/docs/deploy/bnns.rst b/docs/deploy/bnns.rst
new file mode 100644
index 000000000000..7b62fb15a617
--- /dev/null
+++ b/docs/deploy/bnns.rst
@@ -0,0 +1,183 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+   or more contributor license agreements.  See the NOTICE file
+   distributed with this work for additional information
+   regarding copyright ownership.  The ASF licenses this file
+   to you under the Apache License, Version 2.0 (the
+   "License"); you may not use this file except in compliance
+   with the License.  You may obtain a copy of the License at
+
+..   http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+   software distributed under the License is distributed on an
+   "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+   KIND, either express or implied.  See the License for the
+   specific language governing permissions and limitations
+   under the License.
+
+Relay BNNS Integration
+======================
+**Author**: `Egor Churaev `_
+
+Introduction
+------------
+
+The Apple BNNS library is a collection of functions that can be used to construct neural networks
+for inference (and training). It is supported on macOS, iOS, tvOS, and watchOS. BNNS provides
+primitives that execute on all CPUs supported by those platforms and that are optimized for high
+performance and low energy consumption. This integration offloads as many operators as possible
+from Relay to BNNS.
+
+The BNNS runtime is a part of the platform API and is available on all modern Apple operating
+systems. An application using BNNS does not depend on any additional external libraries.
+
+BNNS functions use private Apple hardware capabilities that are not otherwise exposed, such as
+the AMX CPU extension.
+
+This guide demonstrates how to build TVM with the BNNS codegen and runtime enabled. It also gives
+example code to compile and run models using the BNNS runtime.
+Finally, we document the supported operators.
+
+Building TVM with BNNS support
+------------------------------
+
+To turn on TVM BNNS codegen and the TVM BNNS runtime you only need to enable the USE_BNNS flag:
+
+* USE_BNNS=ON/OFF - This flag will enable compiling a network with offloading of subgraphs to BNNS
+  primitives and will link the tvm library to the BNNS runtime module.
+
+Enabling this flag will trigger a search for the default Accelerate framework in the current target
+SDK. The minimal required SDK versions are macOS 11.0, iOS 14.0, tvOS 14.0, and watchOS 7.0.
+
+Example setting in the config.cmake file:
+
+.. code:: cmake
+
+   set(USE_BNNS ON)
+
+BNNS partitioning of Relay graph
+--------------------------------
+
+Operations to be offloaded to BNNS execution must be annotated before the module is passed for
+compilation. All ops annotated by `partition_for_bnns` will be offloaded to BNNS execution; the
+rest of the ops will go through regular LLVM compilation and code generation.
+
+Important note: BNNS supports primitives only with constant weights. To satisfy this requirement we
+have to map constants to the related tensor abstractions in the Relay representation. To freeze
+tensors and operate with them as constants you may need to call the ONNX importer with the flag
+"freeze_params=True" or perform the binding manually. In general, Relay importers don't do that by
+default. For convenience, "partition_for_bnns" can do this for you if a params dictionary is passed
+as the argument.
+
+.. code:: python
+
+   from tvm.relay.op.contrib.bnns import partition_for_bnns
+   model = partition_for_bnns(model, params=params)
+
+
+Input data layout for operations to be offloaded to BNNS execution
+------------------------------------------------------------------
+
+BNNS kernels support only the planar format of input data, so the partitioner requires the NCHW
+input layout for conv2d inputs.
+
+To use the BNNS integration for models with an interleaved input layout, convert them before
+passing the module to `partition_for_bnns`. The layout conversion happens only for explicitly
+enumerated types of ops. Depending on the topology, there may be regular data reorders around
+conv2d between interleaved and planar layouts; these show up as performance penalties and affect
+execution time. It is recommended to analyze the whole topology and extend the list below to
+convert all intermediate tensors to the NCHW data layout.
+
+Example of an input layout change:
+
+.. code:: python
+
+   # For models with NHWC input layout
+   with tvm.transform.PassContext(opt_level=3):
+       mod = relay.transform.InferType()(mod)
+       mod = relay.transform.ConvertLayout({"nn.conv2d": ["NCHW", "default"],
+                                            "nn.bias_add": ["NCHW", "default"],
+                                            "nn.relu": ["NCHW"]})(mod)
+
+
+Example: Build and Deploy Mobilenet v2 1.0 with BNNS
+----------------------------------------------------
+
+Create a Relay graph from an MXNet Mobilenet v2 1.0 model.
+
+.. code:: python
+
+   import tvm
+   from tvm import relay
+   import mxnet
+   from mxnet.gluon.model_zoo.vision import get_model
+
+   dtype = "float32"
+   input_shape = (1, 3, 224, 224)
+   block = get_model('mobilenetv2_1.0', pretrained=True)
+   module, params = relay.frontend.from_mxnet(block, shape={'data': input_shape}, dtype=dtype)
+
+
+Mark up the parts of the graph to be offloaded to BNNS primitives. All ops which are supported by
+the BNNS integration will be handled by BNNS invocations; the rest of the ops will go through the
+regular TVM llvm compilation and code generation.
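+
+For a quick sanity check of what will be offloaded, print the module after partitioning. This is a
+minimal sketch, assuming ``model`` and ``params`` were produced by an importer as above; partitioned
+subgraphs become separate functions whose ``Compiler`` attribute is set to ``bnns``, while the ops
+left in ``main`` go down the regular LLVM path.
+
+.. code:: python
+
+   from tvm.relay.op.contrib.bnns import partition_for_bnns
+
+   model = partition_for_bnns(model, params=params)
+   # Functions annotated with Compiler="bnns" will be lowered by the BNNS
+   # codegen; everything remaining in "main" is compiled by LLVM.
+   print(model)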
+
+After that you need to compile the new module with a target corresponding to the required Apple platform.
+
+.. code:: python
+
+   from tvm.relay.op.contrib.bnns import partition_for_bnns
+
+   # target for macOS Big Sur 11.1:
+   target = "llvm -mtriple=x86_64-apple-darwin20.2.0"
+
+   model = partition_for_bnns(model, params=params)  # mark up operations to be offloaded to BNNS
+   with tvm.transform.PassContext(opt_level=3):
+       lib = relay.build(model, target=target, target_host=target, params=params)
+
+Export the module.
+
+.. code:: python
+
+   lib.export_library('compiled.dylib')
+
+
+Load the module and run inference on the target machine, with TVM built with ``USE_BNNS`` enabled.
+
+.. code:: python
+
+   import tvm
+   import numpy as np
+   from tvm.contrib import graph_executor
+
+   dev = tvm.cpu(0)
+   loaded_lib = tvm.runtime.load_module('compiled.dylib')
+   gen_module = tvm.contrib.graph_executor.GraphModule(loaded_lib['default'](dev))
+
+   dtype = "float32"
+   input_shape = (1, 3, 224, 224)
+   input_data = np.random.uniform(0, 1, input_shape).astype(dtype)
+   gen_module.run(data=input_data)
+
+
+
+Operator support
+----------------
+
++------------------------+------------------------------------------------------------------------------+
+| Relay Node             | Remarks                                                                      |
++========================+==============================================================================+
+| nn.conv2d              |                                                                              |
++------------------------+------------------------------------------------------------------------------+
+| nn.batch_norm          | Supported by BNNS integration only in nn.conv2d-batch_norm pattern          |
++------------------------+------------------------------------------------------------------------------+
+| nn.dense               |                                                                              |
++------------------------+------------------------------------------------------------------------------+
+| nn.batch_matmul        |                                                                              |
++------------------------+------------------------------------------------------------------------------+
+| nn.bias_add            | Supported by BNNS integration only as a bias part of nn.conv2d or nn.dense  |
+|                        | fusion                                                                       |
++------------------------+------------------------------------------------------------------------------+
+| add                    | Supported by BNNS integration only as a bias part of nn.conv2d or nn.dense  |
+|                        | fusion                                                                       |
++------------------------+------------------------------------------------------------------------------+
+| nn.relu                | Supported by BNNS integration only as a part of nn.conv2d or nn.dense fusion |
++------------------------+------------------------------------------------------------------------------+
+| nn.gelu                | Supported by BNNS integration only as a part of nn.conv2d or nn.dense fusion |
++------------------------+------------------------------------------------------------------------------+
diff --git a/docs/deploy/hls.rst b/docs/deploy/hls.rst
index a8faf6453a0f..3c735e829936 100644
--- a/docs/deploy/hls.rst
+++ b/docs/deploy/hls.rst
@@ -71,12 +71,12 @@ We use two python scripts for this tutorial.
fadd_dev = tvm.runtime.load_module("myadd.awsxclbin")
     fadd.import_module(fadd_dev)
 
-    ctx = tvm.context(tgt, 0)
+    dev = tvm.device(tgt, 0)
 
     n = 1024
-    a = tvm.nd.array(np.random.uniform(size=n).astype("float32"), ctx)
-    b = tvm.nd.array(np.random.uniform(size=n).astype("float32"), ctx)
-    c = tvm.nd.array(np.zeros(n, dtype="float32"), ctx)
+    a = tvm.nd.array(np.random.uniform(size=n).astype("float32"), dev)
+    b = tvm.nd.array(np.random.uniform(size=n).astype("float32"), dev)
+    c = tvm.nd.array(np.zeros(n, dtype="float32"), dev)
 
     fadd(a, b, c)
     tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy())
diff --git a/docs/deploy/index.rst b/docs/deploy/index.rst
index 2b37f734c3c3..3cbbb10bd74b 100644
--- a/docs/deploy/index.rst
+++ b/docs/deploy/index.rst
@@ -71,3 +71,4 @@ target device without relying on RPC. see the following resources on how to do s
    arm_compute_lib
    tensorrt
    vitis_ai
+   bnns
diff --git a/docs/deploy/tensorrt.rst b/docs/deploy/tensorrt.rst
index 27f11e9b5377..308db4933ae8 100644
--- a/docs/deploy/tensorrt.rst
+++ b/docs/deploy/tensorrt.rst
@@ -124,9 +124,9 @@ have to be built.
 
 .. code:: python
 
-    ctx = tvm.gpu(0)
+    dev = tvm.gpu(0)
     loaded_lib = tvm.runtime.load_module('compiled.so')
-    gen_module = tvm.contrib.graph_runtime.GraphModule(loaded_lib['default'](ctx))
+    gen_module = tvm.contrib.graph_executor.GraphModule(loaded_lib['default'](dev))
     input_data = np.random.uniform(0, 1, input_shape).astype(dtype)
     gen_module.run(data=input_data)
 
diff --git a/docs/deploy/vitis_ai.rst b/docs/deploy/vitis_ai.rst
index 7de8f58ce54f..1ce89ebed9c2 100755
--- a/docs/deploy/vitis_ai.rst
+++ b/docs/deploy/vitis_ai.rst
@@ -449,7 +449,7 @@ TVM.
 
    import tvm
    import tvm.relay as relay
    from tvm.contrib.target import vitis_ai
-   from tvm.contrib import utils, graph_runtime
+   from tvm.contrib import utils, graph_executor
    from tvm.relay.build_module import bind_params_by_name
    from tvm.relay.op.contrib.vitis_ai import annotation
 
@@ -490,7 +490,7 @@ will take a substantial amount of time.
 
 .. code:: python
 
-   module = graph_runtime.GraphModule(lib["default"](tvm.cpu()))
+   module = graph_executor.GraphModule(lib["default"](tvm.cpu()))
 
    # First N (default = 128) inputs are used for quantization calibration and will
    # be executed on the CPU
@@ -520,7 +520,7 @@ Load the module from compiled files and run inference
 
    # load the module into memory
    loaded_lib = tvm.runtime.load_module(lib_path)
-   module = graph_runtime.GraphModule(lib["default"](tvm.cpu()))
+   module = graph_executor.GraphModule(loaded_lib["default"](tvm.cpu()))
 
    module.set_input(name, data)
    module.run()
@@ -551,7 +551,7 @@ TVM.
 
    import tvm
    import tvm.relay as relay
    from tvm.contrib.target import vitis_ai
-   from tvm.contrib import utils, graph_runtime
+   from tvm.contrib import utils, graph_executor
    from tvm.relay.build_module import bind_params_by_name
    from tvm.relay.op.contrib.vitis_ai import annotation
 
@@ -631,7 +631,7 @@ quantization on the host machine. This makes use of TVM inference calls
 
 .. code:: python
 
-   module = graph_runtime.GraphModule(lib["default"](tvm.cpu()))
+   module = graph_executor.GraphModule(lib["default"](tvm.cpu()))
 
    # First N (default = 128) inputs are used for quantization calibration and will
    # be executed on the CPU
@@ -694,9 +694,9 @@ as root (execute ``su`` in terminal to log into root).
 
    import pyxir
    import tvm
-   from tvm.contrib import graph_runtime
+   from tvm.contrib import graph_executor
 
-   ctx = tvm.cpu()
+   dev = tvm.cpu()
 
    # input_name = ...
    # input_data = ...
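The load-and-run pattern above recurs across these deployment documents once graph_runtime becomes
graph_executor. As a reference point, a minimal self-contained sketch of that flow follows; the
library path and the input name/shape are placeholders, not part of the patch:

.. code:: python

   import numpy as np
   import tvm
   from tvm.contrib import graph_executor

   dev = tvm.cpu(0)
   lib = tvm.runtime.load_module("compiled.so")  # placeholder path
   module = graph_executor.GraphModule(lib["default"](dev))
   module.set_input("data", np.random.uniform(size=(1, 3, 224, 224)).astype("float32"))
   module.run()
   out = module.get_output(0).asnumpy()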
@@ -704,6 +704,6 @@ as root (execute ``su`` in terminal to log into root). # load the module into memory lib = tvm.runtime.load_module("tvm_dpu_arm.so") - module = graph_runtime.GraphModule(lib["default"](tvm.cpu())) + module = graph_executor.GraphModule(lib["default"](dev)) module.set_input(input_name, input_data) module.run() diff --git a/docs/dev/codebase_walkthrough.rst b/docs/dev/codebase_walkthrough.rst index 0a21bb8909e7..90c0670e402f 100644 --- a/docs/dev/codebase_walkthrough.rst +++ b/docs/dev/codebase_walkthrough.rst @@ -164,10 +164,10 @@ The returned module, which can be thought of as a combination of a compiled func :: - ctx = tvm.context(target, 0) - a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx) - b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx) - c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx) + dev = tvm.device(target, 0) + a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev) + b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), dev) + c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev) fadd(a, b, c) output = c.asnumpy() diff --git a/docs/dev/debugger.rst b/docs/dev/debugger.rst index 4c49e926a8df..f1bd004717b4 100644 --- a/docs/dev/debugger.rst +++ b/docs/dev/debugger.rst @@ -123,24 +123,24 @@ Example of loading the parameters How to use Debugger? *************************************** -1. In ``config.cmake`` set the ``USE_GRAPH_RUNTIME_DEBUG`` flag to ``ON`` +1. In ``config.cmake`` set the ``USE_GRAPH_EXECUTOR_DEBUG`` flag to ``ON`` :: # Whether enable additional graph debug functions - set(USE_GRAPH_RUNTIME_DEBUG ON) + set(USE_GRAPH_EXECUTOR_DEBUG ON) 2. Do 'make' tvm, so that it will make the ``libtvm_runtime.so`` 3. In frontend script file instead of - ``from tvm.contrib import graph_runtime`` import the - ``debug_runtime`` - ``from tvm.contrib.debugger import debug_runtime as graph_runtime`` + ``from tvm.contrib import graph_executor`` import the + ``debug_executor`` + ``from tvm.contrib.debugger import debug_executor as graph_executor`` :: - from tvm.contrib.debugger import debug_runtime as graph_runtime - m = graph_runtime.create(graph, lib, ctx, dump_root="/tmp/tvmdbg") + from tvm.contrib.debugger import debug_executor as graph_executor + m = graph_executor.create(graph, lib, dev, dump_root="/tmp/tvmdbg") # set inputs m.set_input('data', tvm.nd.array(data.astype(dtype))) m.set_input(**params) diff --git a/docs/dev/index.rst b/docs/dev/index.rst index a098df12f1c1..c297d32923fe 100644 --- a/docs/dev/index.rst +++ b/docs/dev/index.rst @@ -94,7 +94,7 @@ This process helps us to divide the original problem into two sub-problems: We use the low-level tir phase to compile and optimize each sub-functions. For specific targets, we may also directly go to the target translation phase and use external code generators. -There are a few different ways(in relay/backend) to handle the calls into the overall execution problem. For simple models with known shapes and no control flow, we can lower to a graph runtime that stores the execution structure in a graph. We also support a virtual machine backend for dynamic executions. Finally, we plan to support ahead of time compilation that compiles the high-level execution structure into the executable and generated primitive functions. All of these execution modes are encapsulated by a unified **runtime.Module** interface, which we will discuss in the latter part of the guide. +There are a few different ways(in relay/backend) to handle the calls into the overall execution problem. 
For simple models with known shapes and no control flow, we can lower to a graph executor that stores the execution structure in a graph. We also support a virtual machine backend for dynamic executions. Finally, we plan to support ahead of time compilation that compiles the high-level execution structure into the executable and generated primitive functions. All of these execution modes are encapsulated by a unified **runtime.Module** interface, which we will discuss in the latter part of the guide. **tir/transform** contains transformation passes for TIR level functions. Many tir passes serve the purpose of lowering. For example, there are passes to flatten multi-dimensional access to one-dimensional pointer access, to expand the intrinsics into target-specific ones, and to decorate the function entry to meet the runtime calling convention. Of course, there are also optimizations passes, such as access index simplification and dead code elimination. @@ -144,7 +144,7 @@ The main goal of TVM's runtime is to provide a minimal API for loading and execu import tvm # Example runtime execution program in python, with type annotated mod: tvm.runtime.Module = tvm.runtime.load_module("compiled_artifact.so") - arr: tvm.runtime.NDArray = tvm.nd.array([1, 2, 3], ctx=tvm.gpu(0)) + arr: tvm.runtime.NDArray = tvm.nd.array([1, 2, 3], device=tvm.gpu(0)) fun: tvm.runtime.PackedFunc = mod["addone"] fun(a) print(a.asnumpy()) diff --git a/docs/dev/microtvm_design.rst b/docs/dev/microtvm_design.rst index 2c3eeb2faea3..885ef2c8fc0d 100644 --- a/docs/dev/microtvm_design.rst +++ b/docs/dev/microtvm_design.rst @@ -68,7 +68,7 @@ The parts of this process are described below: #. **Deployment**. The project is built and the residual firmware binary is flashed onto the device. Model inference is driven either by TVM using an on-device RPC server, or on the device using the - on-device Graph Runtime. + on-device Graph Executor. Design Goals ============ @@ -189,14 +189,14 @@ The TVM compiler traditionally outputs three pieces: 2. A model execution graph, encoded as JSON; and 3. Simplified parameters. -To correctly execute the model, a Graph Runtime needs to reconstruct the graph in memory, load the +To correctly execute the model, a Graph Executor needs to reconstruct the graph in memory, load the parameters, and then invoke the operator implementations in the correct order. microTVM supports two ways to do this: -1. **Host-Driven**. The Graph Runtime can run on the host and carry out execution by issuing +1. **Host-Driven**. The Graph Executor can run on the host and carry out execution by issuing commands to the device using an RPC link with a UART-like transport. -2. **Standalone**. A C Graph Runtime is available to be compiled on-device, but it is not +2. **Standalone**. A C Graph Executor is available to be compiled on-device, but it is not particularly memory efficient. This way enables standalone execution without any attached host. Host-Driven is designed for experimenting with models on-device and, like AutoTVM, uses the RPC server to @@ -213,8 +213,8 @@ In Host-Driven execution, the firmware binary is the following: 4. The TVM RPC server. 5. (optional) Simplified Parameters. -This firmware image is flashed onto the device and a GraphRuntime instance is created on the host. -The GraphRuntime drives execution by sending RPC commands over a UART: +This firmware image is flashed onto the device and a GraphExecutor instance is created on the host. 
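In Python terms, host-driven execution follows roughly the generic RPC flow sketched below. This is a sketch only: the microTVM session setup and UART transport differ in detail, and the address, port, and file name are illustrative assumptions.

.. code:: python

    from tvm import rpc
    from tvm.contrib import graph_executor

    # Connect to an RPC server that fronts the device (illustrative endpoint).
    remote = rpc.connect("127.0.0.1", 9090)
    remote.upload("model.so")
    lib = remote.load_module("model.so")

    # The GraphExecutor lives on the host; every run() issues RPC commands.
    module = graph_executor.GraphModule(lib["default"](remote.cpu(0)))
    module.run()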
+The GraphExecutor drives execution by sending RPC commands over a UART: .. figure:: https://raw.githubusercontent.com/tvmai/web-data/main/images/dev/microtvm_host_driven.svg :align: center @@ -223,7 +223,7 @@ The GraphRuntime drives execution by sending RPC commands over a UART: Standalone Execution ^^^^^^^^^^^^^^^^^^^^ -In Standalone execution, the GraphRuntime is instantiated on device: +In Standalone execution, the GraphExecutor is instantiated on device: .. figure:: https://raw.githubusercontent.com/tvmai/web-data/main/images/dev/microtvm_standalone.svg :align: center @@ -248,7 +248,7 @@ When configuring for host-driven inference or AutoTVM, the remaining tasks are w When configuring for standalone deployment, the firmware needs to: 1. Instantiate the system library by calling the ``runtime.SystemLib`` PackedFunc. -2. Instantiate a GraphRuntime passing the system library module. +2. Instantiate a GraphExecutor passing the system library module. 3. Configure parameters and inputs as needed. 4. Run the model. @@ -267,7 +267,7 @@ For Host-driven model execution, firmware also needs: For Standalone model execution, firmware also needs: -4. The TVM C GraphRuntime library, supplied by TVM as a static library. +4. The TVM C GraphExecutor library, supplied by TVM as a static library. 5. The remaining compiler outputs (Simplified Parameters and Graph JSON). The Automated Build Flow @@ -323,11 +323,11 @@ Future Work Ahead-of-Time Runtime ---------------------- -A limitation of the Graph Runtime is the amount of memory overhead required in parsing the JSON. +A limitation of the Graph Executor is the amount of memory overhead required in parsing the JSON. The current implementation contributes significantly to the dynamic memory usage of microTVM, limiting its utility. An ahead-of-time runtime can avoid the need for any Graph JSON parsing and improve inference speed by generating C code to call the generated operator implementations directly -rather than relying on a data-driven approach with the Graph Runtime. +rather than relying on a data-driven approach with the Graph Executor. Memory Planning ---------------- diff --git a/docs/dev/relay_bring_your_own_codegen.rst b/docs/dev/relay_bring_your_own_codegen.rst index 3fcd3365c82f..b9f2337de2d4 100644 --- a/docs/dev/relay_bring_your_own_codegen.rst +++ b/docs/dev/relay_bring_your_own_codegen.rst @@ -757,10 +757,10 @@ Then, we implement ``ParseJson`` to parse a subgraph in ExampleJSON format and c entry.output = id; graph_[curr_subgraph].push_back(entry); // Note 2 } - DLContext ctx; - ctx.device_type = static_cast(1); - ctx.device_id = 0; - data_entry_[id] = NDArray::Empty(shape, DLDataType{kDLFloat, 32, 1}, ctx); // Note 3 + DLDevice dev; + dev.device_type = static_cast(1); + dev.device_id = 0; + data_entry_[id] = NDArray::Empty(shape, DLDataType{kDLFloat, 32, 1}, dev); // Note 3 } } diff --git a/docs/dev/virtual_machine.rst b/docs/dev/virtual_machine.rst index 9081d50b92ef..7826f68b71dd 100644 --- a/docs/dev/virtual_machine.rst +++ b/docs/dev/virtual_machine.rst @@ -32,9 +32,9 @@ There are further challenges in compiling dynamic code, such as dynamic scheduli fully dynamic tensor shapes, and control flow. The interpreter offers simple solutions for these, but none is sufficiently compelling or optimized. -The second execution mechanism is the existing graph runtime. In order to target Relay +The second execution mechanism is the existing graph executor. 
In order to target Relay programs to this, we compile a small subset of them to the old graph format and execute -them on the runtime. Graph runtime provides a fast execution experience but only for a very limited +them on the runtime. Graph executor provides a fast execution experience but only for a very limited subset of Relay programs. An alternative but not-standard approach is Relay's ahead-of-time compiler, @@ -64,7 +64,7 @@ micro-optimizations present in scalar VMs are dramatically less important. TVM has provided strong support for vision models, but we want to grow to support a wider variety of models. -The graph runtime is able to utilize the fully static nature of the input graphs to perform +The graph executor is able to utilize the fully static nature of the input graphs to perform aggressive optimization such as fully static allocation, and optimal memory reuse. When we introduce models which make use of control flow, recursion, dynamic shapes, and dynamic allocation, we must change how execution works. A virtual machine for Relay is a natural choice. @@ -354,7 +354,7 @@ Serialization Serializing and deserializing the executable generated by the Relay VM compiler is a must as we may want to save the model to the disk and perform inference later. Previously, Relay has produced -a serialized form in a json file for the graph runtime. However, the same format is not directly +a serialized form in a json file for the graph executor. However, the same format is not directly applicable to the VM as it emits bytecode instead of graph-style programs. Serialization of an executable essentially needs to handle both model specific (i.e. weights and kernels) and VM related (i.e. bytecode and global function names) data. @@ -376,7 +376,7 @@ components in a binary format that is organized with the following sections in o - Code section. The VM functions, including bytecode, are sitting in this section. The dispatching loop iterates through this section to fetch instructions for execution. -Hence, unlike the graph runtime artifact that contains weight (.params), graph json (.json), +Hence, unlike the graph executor artifact that contains weight (.params), graph json (.json), and compiled kernel library (.so), the serialized executable artifact is composed of the Relay object file (.ro) and the compiled kernel library (.so). diff --git a/docs/index.rst b/docs/index.rst index 3131be5381fc..323fb2a9d313 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -25,7 +25,7 @@ Get Started ----------- - Follow the :doc:`instructions ` to install TVM. -- Checkout the :doc:`tutorials `. +- Checkout the :doc:`Getting Started with TVM Tutorial `. For Developers -------------- diff --git a/docs/install/from_source.rst b/docs/install/from_source.rst index f6be4e31af90..f0ebad1d9edc 100644 --- a/docs/install/from_source.rst +++ b/docs/install/from_source.rst @@ -88,7 +88,7 @@ The configuration of TVM can be modified by `config.cmake`. - On macOS, for some versions of Xcode, you need to add ``-lc++abi`` in the LDFLAGS or you'll get link errors. - Change ``set(USE_CUDA OFF)`` to ``set(USE_CUDA ON)`` to enable CUDA backend. Do the same for other backends and libraries you want to build for (OpenCL, RCOM, METAL, VULKAN, ...). 
- - To help with debugging, ensure the embedded graph runtime and debugging functions are enabled with ``set(USE_GRAPH_RUNTIME ON)`` and ``set(USE_GRAPH_RUNTIME_DEBUG ON)``
+ - To help with debugging, ensure the embedded graph executor and debugging functions are enabled with ``set(USE_GRAPH_EXECUTOR ON)`` and ``set(USE_PROFILER ON)``
 
 - TVM requires LLVM for CPU codegen. We highly recommend building with LLVM support on.
 
diff --git a/docs/microtvm/index.rst b/docs/microtvm/index.rst
index 2371219af27f..a67b1547d229 100644
--- a/docs/microtvm/index.rst
+++ b/docs/microtvm/index.rst
@@ -43,7 +43,7 @@ demos run against QEMU and the following hardware:
 
 * `STM Nucleo-F746ZG `_
 * `STM STM32F746 Discovery `_
-* `nRF 5340 Preview Development Kit `_
+* `nRF 5340 Development Kit `_
 
 
 Getting Started with microTVM
diff --git a/golang/Makefile b/golang/Makefile
index 6fd77996e119..137e2a488e29 100644
--- a/golang/Makefile
+++ b/golang/Makefile
@@ -25,7 +25,7 @@ NATIVE_SRC = tvm_runtime_pack.cc
 GOPATH=$(CURDIR)/gopath
 GOPATHDIR=${GOPATH}/src/${TARGET}/
 CGO_CPPFLAGS="-I. -I${TVM_BASE}/ -I${TVM_BASE}/3rdparty/dmlc-core/include -I${TVM_BASE}/include -I${TVM_BASE}/3rdparty/dlpack/include/"
-CGO_CXXFLAGS="-std=c++14"
+CGO_CXXFLAGS="-std=c++14 -DDMLC_USE_LOGGING_LIBRARY=\<tvm/runtime/logging.h\>"
 CGO_CFLAGS="-I${TVM_BASE}"
 CGO_LDFLAGS="-ldl -lm"
 
diff --git a/golang/sample/complex.go b/golang/sample/complex.go
index bbe74dc85e09..911d0a7a28c1 100644
--- a/golang/sample/complex.go
+++ b/golang/sample/complex.go
@@ -70,13 +70,13 @@ func main() {
     }
     jsonStr := string(bytes)
 
-    // Load module on tvm runtime - call tvm.graph_runtime.create
-    funp, err := gotvm.GetGlobalFunction("tvm.graph_runtime.create")
+    // Load module on tvm runtime - call tvm.graph_executor.create
+    funp, err := gotvm.GetGlobalFunction("tvm.graph_executor.create")
     if err != nil {
         fmt.Print(err)
         return
     }
-    fmt.Printf("Calling tvm.graph_runtime.create\n")
+    fmt.Printf("Calling tvm.graph_executor.create\n")
     // Call function
     graphrt, err := funp.Invoke(jsonStr, modp, (int64)(gotvm.KDLCPU), (int64)(0))
     if err != nil {
@@ -84,7 +84,7 @@ func main() {
         return
     }
     graphmod := graphrt.AsModule()
-    fmt.Printf("Graph runtime Created\n")
+    fmt.Printf("Graph executor Created\n")
 
     // Array allocation attributes
     tshapeIn := []int64{1, 224, 224, 3}
@@ -105,7 +105,7 @@ func main() {
     }
     fmt.Printf("Input and Output Arrays allocated\n")
 
-    // Get module function from graph runtime : load_params
+    // Get module function from graph executor : load_params
     // Read params
     bytes, err = ioutil.ReadFile(modParams)
     if err != nil {
diff --git a/golang/sample/simple.go b/golang/sample/simple.go
index f9b29e9a1492..7bb503db4598 100644
--- a/golang/sample/simple.go
+++ b/golang/sample/simple.go
@@ -49,7 +49,7 @@ func main() {
 
     // Allocate Array for inputs and outputs.
 
-    // Allocation by explicit type and context.
+    // Allocation by explicit type and device.
     tshapeIn := []int64{4}
 
     inX, _ := gotvm.Empty(tshapeIn, "float32", gotvm.CPU(0))
diff --git a/golang/src/array_test.go b/golang/src/array_test.go
index 0dbc81412a36..a2636a8b0f20 100644
--- a/golang/src/array_test.go
+++ b/golang/src/array_test.go
@@ -113,8 +113,8 @@ func TestArrayShape(t *testing.T) {
     }
 }
 
-// Create an array and check created Context.
-func TestArrayCtx(t *testing.T) {
+// Create an array and check created Device.
+func TestArrayDevice(t *testing.T) { // TODO: Could some test cases for other targets arr, err := Empty([]int64{4}, CPU(0)) if err != nil { @@ -122,13 +122,13 @@ func TestArrayCtx(t *testing.T) { return } - ctx := arr.GetCtx() - if ctx.DeviceType != KDLCPU { - t.Errorf("Ctx DeviceType expected: %v Got :%v\n", KDLCPU, ctx.DeviceType) + dev := arr.GetDevice() + if dev.DeviceType != KDLCPU { + t.Errorf("Dev DeviceType expected: %v Got :%v\n", KDLCPU, dev.DeviceType) return } - if ctx.DeviceID != 0 { - t.Errorf("Ctx DeviceID expected: %v Got :%v\n", KDLCPU, ctx.DeviceID) + if dev.DeviceID != 0 { + t.Errorf("Dev DeviceID expected: %v Got :%v\n", KDLCPU, dev.DeviceID) return } @@ -138,13 +138,13 @@ func TestArrayCtx(t *testing.T) { return } - ctx = arr.GetCtx() - if ctx.DeviceType != KDLCPU { - t.Errorf("Ctx DeviceType expected: %v Got :%v\n", KDLCPU, ctx.DeviceType) + dev = arr.GetDevice() + if dev.DeviceType != KDLCPU { + t.Errorf("Dev DeviceType expected: %v Got :%v\n", KDLCPU, dev.DeviceType) return } - if ctx.DeviceID != 2 { - t.Errorf("Ctx DeviceID expected: %v Got :%v\n", KDLCPU, ctx.DeviceID) + if dev.DeviceID != 2 { + t.Errorf("Dev DeviceID expected: %v Got :%v\n", KDLCPU, dev.DeviceID) return } } diff --git a/golang/src/context.go b/golang/src/device.go similarity index 58% rename from golang/src/context.go rename to golang/src/device.go index cc0acbc88876..6569e44bf1ef 100644 --- a/golang/src/context.go +++ b/golang/src/device.go @@ -18,8 +18,8 @@ */ /*! - * \brief gotvm package source for TVMContext interface - * \file context.go + * \brief gotvm package source for Device interface + * \file device.go */ package gotvm @@ -50,58 +50,58 @@ var KOpenGL = int32(C.kOpenGL) // KExtDev is golang enum correspond to TVM device type kDLExtDev. var KExtDev = int32(C.kDLExtDev) -// Context dtype corresponding to TVMContext aka DLContext -type Context struct { +// Device dtype corresponding to Device aka DLDevice +type Device struct { DeviceType int32 DeviceID int32 } -// CPU returns the Context object for CPU target on given index -func CPU(index int32) Context { - return Context{KDLCPU, index} +// CPU returns the Device object for CPU target on given index +func CPU(index int32) Device { + return Device{KDLCPU, index} } -// GPU returns the Context object for GPU target on given index -func GPU(index int32) Context { - return Context{KDLGPU, index} +// GPU returns the Device object for GPU target on given index +func GPU(index int32) Device { + return Device{KDLGPU, index} } -// CPUPinned returns the Context object for CPUPinned target on given index -func CPUPinned(index int32) Context { - return Context{KDLCPUPinned, index} +// CPUPinned returns the Device object for CPUPinned target on given index +func CPUPinned(index int32) Device { + return Device{KDLCPUPinned, index} } -// OpenCL returns the Context object for OpenCL target on given index -func OpenCL(index int32) Context { - return Context{KDLOpenCL, index} +// OpenCL returns the Device object for OpenCL target on given index +func OpenCL(index int32) Device { + return Device{KDLOpenCL, index} } -// Metal returns the Context object for Metal target on given index -func Metal(index int32) Context { - return Context{KDLMetal, index} +// Metal returns the Device object for Metal target on given index +func Metal(index int32) Device { + return Device{KDLMetal, index} } -// VPI returns the Context object for VPI target on given index -func VPI(index int32) Context { - return Context{KDLVPI, index} +// VPI returns the Device object for VPI 
target on given index +func VPI(index int32) Device { + return Device{KDLVPI, index} } -// ROCM returns the Context object for ROCM target on given index -func ROCM(index int32) Context { - return Context{KDLROCM, index} +// ROCM returns the Device object for ROCM target on given index +func ROCM(index int32) Device { + return Device{KDLROCM, index} } -// SDAccel returns the Context object for SDAccel target on given index -func SDAccel(index int32) Context { - return Context{KDLSDAccel, index} +// SDAccel returns the Device object for SDAccel target on given index +func SDAccel(index int32) Device { + return Device{KDLSDAccel, index} } -// Vulkan returns the Context object for Vulkan target on given index -func Vulkan(index int32) Context { - return Context{KDLVulkan, index} +// Vulkan returns the Device object for Vulkan target on given index +func Vulkan(index int32) Device { + return Device{KDLVulkan, index} } -// OpenGL returns the Context object for OpenGL target on given index -func OpenGL(index int32) Context { - return Context{KOpenGL, index} +// OpenGL returns the Device object for OpenGL target on given index +func OpenGL(index int32) Device { + return Device{KOpenGL, index} } diff --git a/golang/src/function_test.go b/golang/src/function_test.go index 17b1c9a6e1c0..0830d16419a2 100644 --- a/golang/src/function_test.go +++ b/golang/src/function_test.go @@ -46,7 +46,7 @@ func TestFunctionGlobals(t *testing.T) { // Check GetFunction API func TestFunctionGlobalGet(t *testing.T) { - funp, err := GetGlobalFunction("tvm.graph_runtime.create") + funp, err := GetGlobalFunction("tvm.graph_executor.create") if err != nil { t.Error(err.Error()) return diff --git a/golang/src/ndarray.go b/golang/src/ndarray.go index e7471347405c..b1e71aef56bd 100644 --- a/golang/src/ndarray.go +++ b/golang/src/ndarray.go @@ -243,10 +243,10 @@ func (parray Array) GetDType() (retVal string) { return } -// GetCtx returns the number of dimentions in Array -func (parray Array) GetCtx() (retVal Context) { - ret := ((*C.DLTensor)(unsafe.Pointer(parray))).ctx - retVal = *(*Context)(unsafe.Pointer(&ret)) +// GetDevice returns the number of dimentions in Array +func (parray Array) GetDevice() (retVal Device) { + ret := ((*C.DLTensor)(unsafe.Pointer(parray))).device + retVal = *(*Device)(unsafe.Pointer(&ret)) return } @@ -289,12 +289,12 @@ func nativeTVMArrayAlloc(shape []int64, ndim int32, // // `args[0]` is string for data type. Default value is 'float32' // -// `args[1]` is Context. Default value is '{KDLCPU, 0}' +// `args[1]` is Device. Default value is '{KDLCPU, 0}' // // returns pointer to Array on successful execution and error if any. 
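 //
 // A short usage sketch (the shape, dtype and device below are illustrative):
 //
 //   arr, err := gotvm.Empty([]int64{4}, "float32", gotvm.CPU(0))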
func Empty(shape []int64, args ...interface{}) (parray *Array, err error) { typeName := "float32" - ctx := Context{KDLCPU, 0} + dev := Device{KDLCPU, 0} if len(shape) < 1 { err = fmt.Errorf("Invalid shape for Array creation: %v", len(shape)) @@ -305,8 +305,8 @@ func Empty(shape []int64, args ...interface{}) (parray *Array, err error) { switch val.(type) { case string: typeName = args[i].(string) - case Context: - ctx = args[i].(Context) + case Device: + dev = args[i].(Device) default: err = fmt.Errorf("Invalid Optional Argument Type: %T", val) return @@ -320,7 +320,7 @@ func Empty(shape []int64, args ...interface{}) (parray *Array, err error) { ndim := int32(len(shape)) newArray, err := nativeTVMArrayAlloc(shape, ndim, int32(tvmType.code), int32(tvmType.bits), int32(tvmType.lanes), - ctx.DeviceType, ctx.DeviceID) + dev.DeviceType, dev.DeviceID) if err != nil { return } diff --git a/golang/src/tvm_runtime_pack.cc b/golang/src/tvm_runtime_pack.cc index 7dd6dd5e94c5..430e046e39a8 100644 --- a/golang/src/tvm_runtime_pack.cc +++ b/golang/src/tvm_runtime_pack.cc @@ -42,8 +42,8 @@ #include "src/runtime/dso_library.cc" #include "src/runtime/system_library.cc" -// Graph runtime -#include "src/runtime/graph/graph_runtime.cc" +// Graph executor +#include "src/runtime/graph_executor/graph_executor.cc" // Uncomment the following lines to enable RPC // #include "../../src/runtime/rpc/rpc_session.cc" diff --git a/golang/src/value.go b/golang/src/value.go index a7db894374af..450cf4866ab0 100644 --- a/golang/src/value.go +++ b/golang/src/value.go @@ -39,8 +39,8 @@ var KHandle = int32(C.kTVMOpaqueHandle) var KNull = int32(C.kTVMNullptr) // KTVMType is golang type code for TVM kTVMDataType. var KTVMType = int32(C.kTVMDataType) -// KTVMContext is golang type code for TVM kTVMContext. -var KTVMContext = int32(C.kTVMContext) +// KDLDevice is golang type code for TVM kDLDevice. +var KDLDevice = int32(C.kDLDevice) // KArrayHandle is golang type code for TVM kTVMDLTensorHandle. var KArrayHandle = int32(C.kTVMDLTensorHandle) // KObjectHandle is golang type code for TVM kTVMObjectHandle. diff --git a/include/tvm/arith/analyzer.h b/include/tvm/arith/analyzer.h index cd20bdcf4d1a..adb037bfd050 100644 --- a/include/tvm/arith/analyzer.h +++ b/include/tvm/arith/analyzer.h @@ -458,6 +458,16 @@ class TVM_DLL Analyzer { * \note Analyzer will call into sub-analyzers to get the result. */ bool CanProveLess(const PrimExpr& expr, int64_t upper_bound); + /*! + * \brief Whether can we prove lhs == rhs. + * + * \param lhs The input lhs. + * \param rhs The input rhs. + * \return Whether we can prove lhs == rhs. + * + * \note Analyzer will call into sub-analyzers to get the result. + */ + bool CanProveEqual(const PrimExpr& lhs, const PrimExpr& rhs); /*! * \brief Whether can we prove condition. * diff --git a/include/tvm/arith/iter_affine_map.h b/include/tvm/arith/iter_affine_map.h index e2e081d2be89..f786c013443c 100644 --- a/include/tvm/arith/iter_affine_map.h +++ b/include/tvm/arith/iter_affine_map.h @@ -136,6 +136,7 @@ class IterMark : public ObjectRef { TVM_DLL IterMark(PrimExpr source, PrimExpr extent); TVM_DEFINE_OBJECT_REF_METHODS(IterMark, ObjectRef, IterMarkNode); + TVM_DEFINE_OBJECT_REF_COW_METHOD(IterMarkNode); }; /*! @@ -259,7 +260,6 @@ class IterSumExpr : public IterMapExpr { /*! 
* \brief Detect if indices can be written as - * * [y_0 + c_0, y_1 + c_1, ..., y_n + c_n] * * Here y = some-quasi-affine-iter-map(input_iters) @@ -272,12 +272,15 @@ class IterSumExpr : public IterMapExpr { * * \param indices The indices to detect pattern for. * \param input_iters Map from variable to iterator's range. + * \param predicate The predicate constraints on the input iterators + * \param require_bijective A boolean flag that indicates whether the mapping should be bijective. * \param analyzer Analyzer used to get context information. * * \return The detected pattern if a match exists, * otherwise return an empty array. */ Array DetectIterMap(const Array& indices, const Map& input_iters, + const PrimExpr& predicate, bool require_bijective, arith::Analyzer* analyzer); } // namespace arith diff --git a/include/tvm/ir/attrs.h b/include/tvm/ir/attrs.h index f05ab04c3305..da7bc12619bd 100644 --- a/include/tvm/ir/attrs.h +++ b/include/tvm/ir/attrs.h @@ -92,12 +92,12 @@ inline DataType NullValue() { } /*! \brief Error thrown during attribute checking. */ -struct AttrError : public dmlc::Error { +struct AttrError : public Error { /*! * \brief constructor * \param msg error message */ - explicit AttrError(std::string msg) : dmlc::Error("AttributeError:" + msg) {} + explicit AttrError(std::string msg) : Error("AttributeError:" + msg) {} }; /*! diff --git a/include/tvm/ir/diagnostic.h b/include/tvm/ir/diagnostic.h index 2053a295a3b8..41130a5be0aa 100644 --- a/include/tvm/ir/diagnostic.h +++ b/include/tvm/ir/diagnostic.h @@ -37,6 +37,15 @@ namespace tvm { using tvm::parser::SourceMap; using tvm::runtime::TypedPackedFunc; +/*! \brief The diagnostic level, controls the printing of the message. */ +enum class DiagnosticLevel : int { + kBug = 10, + kError = 20, + kWarning = 30, + kNote = 40, + kHelp = 50, +}; + class DiagnosticBuilder; /*! \brief A compiler diagnostic. */ diff --git a/include/tvm/ir/error.h b/include/tvm/ir/error.h index ac7b96a3bd59..6ff61781ac44 100644 --- a/include/tvm/ir/error.h +++ b/include/tvm/ir/error.h @@ -36,11 +36,11 @@ namespace tvm { /*! * \brief A wrapper around std::stringstream to build error. * - * Can be consumed by Error to construct an error. + * Can be consumed by CompileError to construct an error. * * \code * - * void ReportError(const Error& err); + * void ReportError(const CompileError& err); * * void Test(int number) { * // Use error reporter to construct an error. @@ -59,13 +59,13 @@ struct ErrorBuilder { private: std::stringstream stream_; - friend class Error; + friend class CompileError; }; /*! * \brief Custom Error class to be thrown during compilation. */ -class Error : public dmlc::Error { +class CompileError : public Error { public: /*! \brief Location of the error */ Span span; @@ -73,20 +73,20 @@ class Error : public dmlc::Error { * \brief construct error from message. * \param msg The message */ - explicit Error(const std::string& msg) : dmlc::Error(msg), span(nullptr) {} + explicit CompileError(const std::string& msg) : Error(msg), span(nullptr) {} /*! * \brief construct error from error builder. * \param err The error builder */ - Error(const ErrorBuilder& err) : dmlc::Error(err.stream_.str()), span(nullptr) {} // NOLINT(*) + CompileError(const ErrorBuilder& err) : Error(err.stream_.str()), span(nullptr) {} // NOLINT(*) /*! * \brief copy constructor. * \param other The other ereor. 
*/ - Error(const Error& other) : dmlc::Error(other.what()), span(other.span) {} // NOLINT(*) + CompileError(const CompileError& other) : Error(other.what()), span(other.span) {} // NOLINT(*) /*! * \brief default constructor. */ - Error() : dmlc::Error(""), span(nullptr) {} + CompileError() : Error(""), span(nullptr) {} }; /*! @@ -115,13 +115,13 @@ class ErrorReporter { ErrorReporter() : errors_(), node_to_error_() {} /*! - * \brief Report a tvm::Error. + * \brief Report a CompileError. * * This API is useful for reporting spanned errors. * * \param err The error to report. */ - void Report(const Error& err) { + void Report(const CompileError& err) { if (!err.span.defined()) { throw err; } @@ -143,7 +143,7 @@ class ErrorReporter { */ void ReportAt(const GlobalVar& global, const ObjectRef& node, std::stringstream& err) { std::string err_msg = err.str(); - this->ReportAt(global, node, Error(err_msg)); + this->ReportAt(global, node, CompileError(err_msg)); } /*! @@ -158,7 +158,7 @@ class ErrorReporter { * \param node The expression or type to report the error at. * \param err The error to report. */ - void ReportAt(const GlobalVar& global, const ObjectRef& node, const Error& err); + void ReportAt(const GlobalVar& global, const ObjectRef& node, const CompileError& err); /*! * \brief Render all reported errors and exit the program. @@ -176,7 +176,7 @@ class ErrorReporter { inline bool AnyErrors() { return errors_.size() != 0; } private: - std::vector errors_; + std::vector errors_; std::unordered_map, ObjectPtrHash, ObjectPtrEqual> node_to_error_; std::unordered_map node_to_gv_; }; diff --git a/include/tvm/ir/type_relation.h b/include/tvm/ir/type_relation.h index 462588006c9b..dd6861750a10 100644 --- a/include/tvm/ir/type_relation.h +++ b/include/tvm/ir/type_relation.h @@ -29,7 +29,7 @@ #include #include #include -#include +#include namespace tvm { diff --git a/include/tvm/relay/analysis.h b/include/tvm/relay/analysis.h index 5dd837038731..264f2609a4b6 100644 --- a/include/tvm/relay/analysis.h +++ b/include/tvm/relay/analysis.h @@ -29,7 +29,7 @@ #include #include #include -#include +#include #include #include @@ -272,12 +272,12 @@ TVM_DLL Map> GetCalibrateOutputMap(const IRModule& mod * \brief Analyze the device context of each IR node in a given relay module. * * \param mod The module for analysis. - * \param default_context The default context used by unassigned IR nodes. + * \param default_device The default device used by unassigned IR nodes. * - * \return The mapping between an IR node and its associated context. + * \return The mapping between an IR node and its associated device. */ -TVM_DLL std::unordered_map -ContextAnalysis(const IRModule& mod, const TVMContext& default_context); +TVM_DLL std::unordered_map +ContextAnalysis(const IRModule& mod, const Device& default_device); } // namespace relay } // namespace tvm diff --git a/include/tvm/relay/attrs/transform.h b/include/tvm/relay/attrs/transform.h index ff344f5e1a85..a5544c8a8799 100644 --- a/include/tvm/relay/attrs/transform.h +++ b/include/tvm/relay/attrs/transform.h @@ -438,17 +438,19 @@ struct MatrixSetDiagAttrs : public tvm::AttrsNode { } }; // struct MatrixSetDiagAttrs -/*! \brief Attributes used in cumsum operator */ -struct CumsumAttrs : public tvm::AttrsNode { +/*! 
\brief Attributes used in cumsum and cumprod operator */ +struct ScanopAttrs : public tvm::AttrsNode { Integer axis; DataType dtype; - Integer exclusive; - TVM_DECLARE_ATTRS(CumsumAttrs, "relay.attrs.CumsumAttrs") { - TVM_ATTR_FIELD(axis).describe("The axis to sum over").set_default(NullValue()); + Bool exclusive = Bool(false); + TVM_DECLARE_ATTRS(ScanopAttrs, "relay.attrs.ScanopAttrs") { + TVM_ATTR_FIELD(axis).describe("The axis to operate over").set_default(NullValue()); TVM_ATTR_FIELD(dtype).describe("Output data type").set_default(NullValue()); + + // Default is 0 which is "false" TVM_ATTR_FIELD(exclusive) .describe("The first element is not included") - .set_default(NullValue()); + .set_default(Bool(false)); } }; diff --git a/include/tvm/relay/interpreter.h b/include/tvm/relay/interpreter.h index 8a41ab74658f..e3fd5ae77193 100644 --- a/include/tvm/relay/interpreter.h +++ b/include/tvm/relay/interpreter.h @@ -58,11 +58,11 @@ namespace relay { * Relay's semantics, but a readable and clear one. * * \param mod The function module. - * \param context The primary context that the interepreter runs on. + * \param device The primary device that the interepreter runs on. * \param target Compiler target flag to compile the functions on the context. * \return A function that takes in an expression and returns a value. */ -runtime::TypedPackedFunc CreateInterpreter(IRModule mod, DLContext context, +runtime::TypedPackedFunc CreateInterpreter(IRModule mod, Device device, Target target); /*! \brief The container type of Closures used by the interpreter. */ diff --git a/include/tvm/relay/qnn/attrs.h b/include/tvm/relay/qnn/attrs.h index c5213fe07471..f0280a90c604 100644 --- a/include/tvm/relay/qnn/attrs.h +++ b/include/tvm/relay/qnn/attrs.h @@ -75,6 +75,18 @@ struct QuantizeAttrs : public tvm::AttrsNode { } }; +struct SimulatedQuantizeAttrs : public tvm::AttrsNode { + int axis; + + TVM_DECLARE_ATTRS(SimulatedQuantizeAttrs, "relay.attrs.SimulatedQuantizeAttrs") { + TVM_ATTR_FIELD(axis) + .describe( + "The output channel axis for channel wise quantization. Default value is -1," + "which corresponds to the last axis.") + .set_default(-1); + } +}; + /*! \brief Attribute for dequantize operator */ struct DequantizeAttrs : public tvm::AttrsNode { int axis; diff --git a/include/tvm/runtime/c_runtime_api.h b/include/tvm/runtime/c_runtime_api.h index 59316a0bace0..44dba4d9c463 100644 --- a/include/tvm/runtime/c_runtime_api.h +++ b/include/tvm/runtime/c_runtime_api.h @@ -108,7 +108,7 @@ typedef enum { kTVMOpaqueHandle = 3U, kTVMNullptr = 4U, kTVMDataType = 5U, - kTVMContext = 6U, + kDLDevice = 6U, kTVMDLTensorHandle = 7U, kTVMObjectHandle = 8U, kTVMModuleHandle = 9U, @@ -129,11 +129,6 @@ typedef enum { kTVMExtEnd = 128U, } TVMArgTypeCode; -/*! - * \brief The Device information, abstract away common device types. - */ -typedef DLContext TVMContext; - /*! \brief the array handle */ typedef DLTensor* TVMArrayHandle; @@ -147,7 +142,7 @@ typedef union { void* v_handle; const char* v_str; DLDataType v_type; - TVMContext v_ctx; + DLDevice v_device; } TVMValue; /*! @@ -382,8 +377,8 @@ TVM_DLL int TVMFuncRemoveGlobal(const char* name); * \param dtype_code The type code of the dtype * \param dtype_bits The number of bits of dtype * \param dtype_lanes The number of lanes in the dtype. - * \param device_type The device type of context - * \param device_id The device id of context. + * \param device_type The device type. + * \param device_id The device id. * \param out The output handle. 
* \return 0 when success, -1 when failure happens */ @@ -451,9 +446,9 @@ TVM_DLL void TVMDLManagedTensorCallDeleter(DLManagedTensor* dltensor); /*! * \brief Create a new runtime stream. * - * \param device_type The device type of context - * \param device_id The device id of context - * \param out The new stream handle + * \param device_type The device type. + * \param device_id The device id. + * \param out The new stream handle. * \return 0 when success, -1 when failure happens */ TVM_DLL int TVMStreamCreate(int device_type, int device_id, TVMStreamHandle* out); @@ -461,9 +456,9 @@ TVM_DLL int TVMStreamCreate(int device_type, int device_id, TVMStreamHandle* out /*! * \brief Free a created stream handle. * - * \param device_type The device type of context - * \param device_id The device id of context - * \param stream The stream to be freed + * \param device_type The device type. + * \param device_id The device id. + * \param stream The stream to be freed. * \return 0 when success, -1 when failure happens */ TVM_DLL int TVMStreamFree(int device_type, int device_id, TVMStreamHandle stream); @@ -474,8 +469,8 @@ TVM_DLL int TVMStreamFree(int device_type, int device_id, TVMStreamHandle stream * will use the setted stream handle. * The specific type of stream is runtime device dependent. * - * \param device_type The device type of context - * \param device_id The device id of context. + * \param device_type The device type. + * \param device_id The device id. * \param handle The stream handle. * \return 0 when success, -1 when failure happens */ @@ -484,8 +479,8 @@ TVM_DLL int TVMSetStream(int device_type, int device_id, TVMStreamHandle handle) /*! * \brief Wait until all computations on stream completes. * - * \param device_type The device type of context - * \param device_id The device id of context. + * \param device_type The device type. + * \param device_id The device id. * \param stream The stream to be synchronized. * \return 0 when success, -1 when failure happens */ @@ -494,8 +489,8 @@ TVM_DLL int TVMSynchronize(int device_type, int device_id, TVMStreamHandle strea /*! * \brief Synchronize two streams of execution. * - * \param device_type The device type of context - * \param device_id The device id of context + * \param device_type The device type. + * \param device_id The device id. * \param src The source stream to synchronize. * \param dst The destination stream to synchronize. * \return 0 when success, -1 when failure happens @@ -548,7 +543,7 @@ TVM_DLL int TVMByteArrayFree(TVMByteArray* arr); /*! * \brief Allocate a data space on device. - * \param ctx The device context to perform operation. + * \param dev The device to perform operation. * \param nbytes The number of bytes in memory. * \param alignment The alignment of the memory. * \param type_hint The type of elements. Only needed by certain backends such @@ -556,14 +551,14 @@ TVM_DLL int TVMByteArrayFree(TVMByteArray* arr); * \param out_data The allocated device pointer. * \return 0 when success, -1 when failure happens */ -TVM_DLL int TVMDeviceAllocDataSpace(DLContext ctx, size_t nbytes, size_t alignment, +TVM_DLL int TVMDeviceAllocDataSpace(DLDevice dev, size_t nbytes, size_t alignment, DLDataType type_hint, void** out_data); /*! * \brief Allocate a data space on device with special memory scope. * \note The memory could use a special multi-dimensional memory layout. * That is why we pass shape and dtype instead of raw number of bytes. - * \param ctx The device context to perform operation. 
+ * \param dev The device to perform operation. * \param ndim The number of dimension of the tensor. * \param shape The shape of the tensor. * \param dtype The type of elements. @@ -572,17 +567,17 @@ TVM_DLL int TVMDeviceAllocDataSpace(DLContext ctx, size_t nbytes, size_t alignme * \param out_data The allocated device pointer. * \return 0 when success, -1 when failure happens */ -TVM_DLL int TVMDeviceAllocDataSpaceWithScope(DLContext ctx, int ndim, const int64_t* shape, +TVM_DLL int TVMDeviceAllocDataSpaceWithScope(DLDevice dev, int ndim, const int64_t* shape, DLDataType dtype, const char* mem_scope, void** out_data); /*! * \brief Free a data space on device. - * \param ctx The device context to perform operation. + * \param dev The device to perform operation. * \param ptr The data space. * \return 0 when success, -1 when failure happens */ -TVM_DLL int TVMDeviceFreeDataSpace(TVMContext ctx, void* ptr); +TVM_DLL int TVMDeviceFreeDataSpace(DLDevice dev, void* ptr); /*! * \brief Copy data from one place to another. diff --git a/include/tvm/runtime/container.h b/include/tvm/runtime/container.h index 336fef21ab88..362582f4dab9 100644 --- a/include/tvm/runtime/container.h +++ b/include/tvm/runtime/container.h @@ -30,6 +30,7 @@ #include #include +#include #include #include diff --git a/include/tvm/runtime/crt/error_codes.h b/include/tvm/runtime/crt/error_codes.h index 75e49e63e094..4cbfb0aab8e2 100644 --- a/include/tvm/runtime/crt/error_codes.h +++ b/include/tvm/runtime/crt/error_codes.h @@ -42,7 +42,7 @@ typedef enum { kTvmErrorCategorySession = 4, kTvmErrorCategoryPlatform = 5, kTvmErrorCategoryGenerated = 6, - kTvmErrorCategoryGraphRuntime = 7, + kTvmErrorCategoryGraphExecutor = 7, kTvmErrorCategoryFunctionCall = 8, kTvmErrorCategoryTimeEvaluator = 9, } tvm_crt_error_category_t; @@ -83,10 +83,10 @@ typedef enum { // Common error codes returned from generated functions. kTvmErrorGeneratedInvalidStorageId = DEFINE_TVM_CRT_ERROR(kTvmErrorCategoryGenerated, 0), - // Graph runtime - kTvmErrorGraphModuleAlreadyCreated = DEFINE_TVM_CRT_ERROR(kTvmErrorCategoryGraphRuntime, 0), - kTvmErrorGraphModuleBadContext = DEFINE_TVM_CRT_ERROR(kTvmErrorCategoryGraphRuntime, 1), - kTvmErrorGraphModuleNoSuchInput = DEFINE_TVM_CRT_ERROR(kTvmErrorCategoryGraphRuntime, 2), + // Graph executor + kTvmErrorGraphModuleAlreadyCreated = DEFINE_TVM_CRT_ERROR(kTvmErrorCategoryGraphExecutor, 0), + kTvmErrorGraphModuleBadContext = DEFINE_TVM_CRT_ERROR(kTvmErrorCategoryGraphExecutor, 1), + kTvmErrorGraphModuleNoSuchInput = DEFINE_TVM_CRT_ERROR(kTvmErrorCategoryGraphExecutor, 2), // Function Calls - common problems encountered calling functions. kTvmErrorFunctionCallNumArguments = DEFINE_TVM_CRT_ERROR(kTvmErrorCategoryFunctionCall, 0), diff --git a/include/tvm/runtime/crt/graph_runtime.h b/include/tvm/runtime/crt/graph_executor.h similarity index 60% rename from include/tvm/runtime/crt/graph_runtime.h rename to include/tvm/runtime/crt/graph_executor.h index 82d7ac4e7d18..eb68ff56d230 100644 --- a/include/tvm/runtime/crt/graph_runtime.h +++ b/include/tvm/runtime/crt/graph_executor.h @@ -18,11 +18,11 @@ */ /*! - * \file graph_runtime.h - * \brief Tiny graph runtime that can run graph containing only tvm PackedFunc. + * \file graph_executor.h + * \brief Tiny graph executor that can run graph containing only tvm PackedFunc. 
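+ *
+ * A minimal usage sketch built from the functions declared below (error
+ * handling omitted; the JSON string, module handle and tensors are assumed
+ * to exist, and the input name "data" is illustrative):
+ *
+ *   TVMGraphExecutor* executor = NULL;
+ *   DLDevice dev = {kDLCPU, 0};
+ *   TVMGraphExecutor_Create(sym_json, module_handle, &dev, &executor);
+ *   TVMGraphExecutor_SetInput(executor, "data", &input_tensor);
+ *   TVMGraphExecutor_Run(executor);
+ *   TVMGraphExecutor_GetOutput(executor, 0, &output_tensor);
+ *   TVMGraphExecutor_Release(&executor);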
*/ -#ifndef TVM_RUNTIME_CRT_GRAPH_RUNTIME_H_ -#define TVM_RUNTIME_CRT_GRAPH_RUNTIME_H_ +#ifndef TVM_RUNTIME_CRT_GRAPH_EXECUTOR_H_ +#define TVM_RUNTIME_CRT_GRAPH_EXECUTOR_H_ #ifdef __cplusplus extern "C" { @@ -43,7 +43,7 @@ typedef struct TVMOpParam { } TVMOpParam; // Graph attribute -typedef struct TVMGraphRuntimeGraphAttr { +typedef struct TVMGraphExecutorGraphAttr { uint32_t storage_num_not_alloctaed; uint32_t* storage_id; uint32_t* device_index; @@ -52,79 +52,79 @@ typedef struct TVMGraphRuntimeGraphAttr { int64_t* shape; uint32_t* ndim; uint32_t shape_count; -} TVMGraphRuntimeGraphAttr; +} TVMGraphExecutorGraphAttr; -typedef struct TVMGraphRuntime TVMGraphRuntime; +typedef struct TVMGraphExecutor TVMGraphExecutor; // public functions /*! - * \brief Allocate a new GraphRuntime with TVMPlatformMemoryAllocate and initialize it. + * \brief Allocate a new GraphExecutor with TVMPlatformMemoryAllocate and initialize it. * * \param sym_json JSON-encoded graph. * \param module_handle TVM Module that exposes the functions to call. - * \param ctxs runtime execution context. - * \param runtime Pointer which receives a pointer to the newly-created instance. + * \param devices runtime execution device. + * \param executor Pointer which receives a pointer to the newly-created instance. * \return 0 if successful. */ -int TVMGraphRuntime_Create(const char* sym_json, TVMModuleHandle module_handle, - const TVMContext* ctxs, TVMGraphRuntime** runtime); +int TVMGraphExecutor_Create(const char* sym_json, TVMModuleHandle module_handle, + const DLDevice* devices, TVMGraphExecutor** executor); -int TVMGraphRuntime_GetInputIndex(TVMGraphRuntime* runtime, const char* name); +int TVMGraphExecutor_GetInputIndex(TVMGraphExecutor* executor, const char* name); /*! * \brief get number of input tensors allocated. * \return integer number of tensors available to use. */ -int TVMGraphRuntime_GetNumInputs(); +int TVMGraphExecutor_GetNumInputs(); /*! * \brief set input to the graph based on name. - * \param runtime The graph runtime. + * \param executor The graph executor. * \param name The name of the input. * \param data_in The input data. */ -void TVMGraphRuntime_SetInput(TVMGraphRuntime* runtime, const char* name, DLTensor* data_in); +void TVMGraphExecutor_SetInput(TVMGraphExecutor* executor, const char* name, DLTensor* data_in); /*! * \brief get number of output tensors allocated. * \return integer number of output tensors allocated. */ -int TVMGraphRuntime_GetNumOutputs(); +int TVMGraphExecutor_GetNumOutputs(); /*! * \brief Return NDArray for given output index. - * \param runtime The graph runtime. + * \param executor The graph executor. * \param index The output index. * \param out The DLTensor corresponding to given output node index. * \return The result of this function execution. */ -int TVMGraphRuntime_GetOutput(TVMGraphRuntime* runtime, const int32_t index, DLTensor* out); +int TVMGraphExecutor_GetOutput(TVMGraphExecutor* executor, const int32_t index, DLTensor* out); /*! * \brief Load parameters from parameter blob. - * \param runtime The graph runtime. + * \param executor The graph executor. * \param param_blob A binary blob of parameter. * \param param_size The parameter size. * \return The result of this function execution. */ -int TVMGraphRuntime_LoadParams(TVMGraphRuntime* runtime, const char* param_blob, - const uint32_t param_size); +int TVMGraphExecutor_LoadParams(TVMGraphExecutor* executor, const char* param_blob, + const uint32_t param_size); /*! * \brief Execute the graph. 
- * \param runtime The graph runtime. + * \param executor The graph executor. */ -void TVMGraphRuntime_Run(TVMGraphRuntime* runtime); +void TVMGraphExecutor_Run(TVMGraphExecutor* executor); /*! - * \brief Release memory associated with the graph runtime. - * \param runtime Pointer to graph runtime. + * \brief Release memory associated with the graph executor. + * \param executor Pointer to graph executor. * \return 0 if successful */ -int TVMGraphRuntime_Release(TVMGraphRuntime** runtime); +int TVMGraphExecutor_Release(TVMGraphExecutor** executor); #ifdef __cplusplus } // extern "C" #endif -#endif // TVM_RUNTIME_CRT_GRAPH_RUNTIME_H_ +#endif // TVM_RUNTIME_CRT_GRAPH_EXECUTOR_H_ diff --git a/include/tvm/runtime/crt/graph_runtime_module.h b/include/tvm/runtime/crt/graph_executor_module.h similarity index 71% rename from include/tvm/runtime/crt/graph_runtime_module.h rename to include/tvm/runtime/crt/graph_executor_module.h index 04e9184c8b8d..10a879e9ba30 100644 --- a/include/tvm/runtime/crt/graph_runtime_module.h +++ b/include/tvm/runtime/crt/graph_executor_module.h @@ -18,11 +18,11 @@ */ /*! - * \file graph_runtime.h - * \brief Tiny graph runtime that can run graph containing only tvm PackedFunc. + * \file graph_executor.h + * \brief Tiny graph executor that can run graph containing only tvm PackedFunc. */ -#ifndef TVM_RUNTIME_CRT_GRAPH_RUNTIME_MODULE_H_ -#define TVM_RUNTIME_CRT_GRAPH_RUNTIME_MODULE_H_ +#ifndef TVM_RUNTIME_CRT_GRAPH_EXECUTOR_MODULE_H_ +#define TVM_RUNTIME_CRT_GRAPH_EXECUTOR_MODULE_H_ #ifdef __cplusplus extern "C" { @@ -31,12 +31,12 @@ extern "C" { #include /*! - * \brief Register the "tvm.graph_runtime.create" constructor PackedFunc. + * \brief Register the "tvm.graph_executor.create" constructor PackedFunc. */ -tvm_crt_error_t TVMGraphRuntimeModule_Register(); +tvm_crt_error_t TVMGraphExecutorModule_Register(); #ifdef __cplusplus } // extern "C" #endif -#endif // TVM_RUNTIME_CRT_GRAPH_RUNTIME_MODULE_H_ +#endif // TVM_RUNTIME_CRT_GRAPH_EXECUTOR_MODULE_H_ diff --git a/include/tvm/runtime/crt/memory.h b/include/tvm/runtime/crt/memory.h index 5b937fd7d4f4..c830116528e0 100644 --- a/include/tvm/runtime/crt/memory.h +++ b/include/tvm/runtime/crt/memory.h @@ -42,11 +42,11 @@ struct MemoryManagerInterface { * \brief Allocate a chunk of memory. * \param interface Pointer to this structure. * \param num_bytes Number of bytes requested. - * \param ctx Execution context that will be used with the allocated memory. Must be {kDLCPU, 0}. + * \param dev Execution device that will be used with the allocated memory. Must be {kDLCPU, 0}. * \param out_ptr A pointer to which is written a pointer to the newly-allocated memory. * \return kTvmErrorNoError if successful; a descriptive error code otherwise. */ - tvm_crt_error_t (*Allocate)(MemoryManagerInterface* interface, size_t num_bytes, DLContext ctx, + tvm_crt_error_t (*Allocate)(MemoryManagerInterface* interface, size_t num_bytes, DLDevice dev, void** out_ptr); /*! @@ -54,10 +54,10 @@ struct MemoryManagerInterface { * * \param interface Pointer to this structure. * \param ptr A pointer returned from TVMPlatformMemoryAllocate which should be free'd. - * \param ctx Execution context passed to TVMPlatformMemoryAllocate. Fixed to {kDLCPU, 0}. + * \param dev Execution device passed to TVMPlatformMemoryAllocate. Fixed to {kDLCPU, 0}. * \return kTvmErrorNoError if successful; a descriptive error code otherwise. 
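 *
 * A minimal sketch pairing this with Allocate above (assuming the interface
 * instance has already been obtained from the CRT memory manager; the 64-byte
 * size is purely illustrative):
 *
 *   void* ptr = NULL;
 *   DLDevice dev = {kDLCPU, 0};
 *   interface->Allocate(interface, 64, dev, &ptr);
 *   interface->Free(interface, ptr, dev);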
*/ - tvm_crt_error_t (*Free)(MemoryManagerInterface* interface, void* ptr, DLContext ctx); + tvm_crt_error_t (*Free)(MemoryManagerInterface* interface, void* ptr, DLDevice dev); /*! \brief Used in testing; the number of allocated objects. */ int vleak_size; diff --git a/include/tvm/runtime/crt/platform.h b/include/tvm/runtime/crt/platform.h index d1226e388f73..c774aaeaa0db 100644 --- a/include/tvm/runtime/crt/platform.h +++ b/include/tvm/runtime/crt/platform.h @@ -64,20 +64,20 @@ size_t TVMPlatformFormatMessage(char* out_buf, size_t out_buf_size_bytes, const * and the caller is not obligated to call TVMPlatformMemoryFree in order to avoid a memory leak. * * \param num_bytes Number of bytes requested. - * \param ctx Execution context that will be used with the allocated memory. Fixed to {kDLCPU, 0}. + * \param dev Execution device that will be used with the allocated memory. Fixed to {kDLCPU, 0}. * \param out_ptr A pointer to which is written a pointer to the newly-allocated memory. * \return kTvmErrorNoError if successful; a descriptive error code otherwise. */ -tvm_crt_error_t TVMPlatformMemoryAllocate(size_t num_bytes, DLContext ctx, void** out_ptr); +tvm_crt_error_t TVMPlatformMemoryAllocate(size_t num_bytes, DLDevice dev, void** out_ptr); /*! * \brief Free memory used by TVM. * * \param ptr A pointer returned from TVMPlatformMemoryAllocate which should be free'd. - * \param ctx Execution context passed to TVMPlatformMemoryAllocate. Fixed to {kDLCPU, 0}. + * \param dev Execution device passed to TVMPlatformMemoryAllocate. Fixed to {kDLCPU, 0}. * \return kTvmErrorNoError if successful; a descriptive error code otherwise. */ -tvm_crt_error_t TVMPlatformMemoryFree(void* ptr, DLContext ctx); +tvm_crt_error_t TVMPlatformMemoryFree(void* ptr, DLDevice dev); /*! \brief Start a device timer. * diff --git a/include/tvm/runtime/data_type.h b/include/tvm/runtime/data_type.h index 7d914ce6bff9..b4fdcbff58b4 100644 --- a/include/tvm/runtime/data_type.h +++ b/include/tvm/runtime/data_type.h @@ -25,7 +25,7 @@ #define TVM_RUNTIME_DATA_TYPE_H_ #include -#include +#include #include #include diff --git a/include/tvm/runtime/device_api.h b/include/tvm/runtime/device_api.h index 1276663a2bc3..3caea1041f83 100644 --- a/include/tvm/runtime/device_api.h +++ b/include/tvm/runtime/device_api.h @@ -25,6 +25,7 @@ #define TVM_RUNTIME_DEVICE_API_H_ #include +#include #include #include @@ -67,46 +68,46 @@ class TVM_DLL DeviceAPI { /*! \brief virtual destructor */ virtual ~DeviceAPI() {} /*! - * \brief Set the environment device id to ctx - * \param ctx The context to be set. + * \brief Set the environment device id to device + * \param dev The device to be set. */ - virtual void SetDevice(TVMContext ctx) = 0; + virtual void SetDevice(Device dev) = 0; /*! * \brief Get attribute of specified device. - * \param ctx The device context + * \param dev The device device * \param kind The result kind * \param rv The return value. * \sa DeviceAttrKind */ - virtual void GetAttr(TVMContext ctx, DeviceAttrKind kind, TVMRetValue* rv) = 0; + virtual void GetAttr(Device dev, DeviceAttrKind kind, TVMRetValue* rv) = 0; /*! * \brief Allocate a data space on device. - * \param ctx The device context to perform operation. + * \param dev The device device to perform operation. * \param nbytes The number of bytes in memory. * \param alignment The alignment of the memory. * \param type_hint The type of elements. Only needed by certain backends such * as OpenGL, as nbytes & alignment are sufficient for most backends. 
* \return The allocated device pointer. */ - virtual void* AllocDataSpace(TVMContext ctx, size_t nbytes, size_t alignment, + virtual void* AllocDataSpace(Device dev, size_t nbytes, size_t alignment, DLDataType type_hint) = 0; /*! * \brief Allocate a data space on device with memory scope support. - * \param ctx The device context to perform operation. + * \param dev The device device to perform operation. * \param ndim The number of dimension of allocated tensor. * \param shape The shape of allocated tensor. * \param dtype The type of elements. * \param mem_scope The memory scope of allocated tensor. * \return The allocated device pointer. */ - virtual void* AllocDataSpace(TVMContext ctx, int ndim, const int64_t* shape, DLDataType dtype, + virtual void* AllocDataSpace(Device dev, int ndim, const int64_t* shape, DLDataType dtype, Optional mem_scope = NullOpt); /*! * \brief Free a data space on device. - * \param ctx The device context to perform operation. + * \param dev The device device to perform operation. * \param ptr The data space. */ - virtual void FreeDataSpace(TVMContext ctx, void* ptr) = 0; + virtual void FreeDataSpace(Device dev, void* ptr) = 0; /*! * \brief copy data from one place to another * \note This API is designed to support special memory with shape dependent layout. @@ -119,44 +120,43 @@ class TVM_DLL DeviceAPI { /*! * \brief Create a new stream of execution. * - * \param ctx The context of allocation. + * \param dev The device of allocation. */ - virtual TVMStreamHandle CreateStream(TVMContext ctx); + virtual TVMStreamHandle CreateStream(Device dev); /*! * \brief Free a stream of execution * - * \param ctx The context of the stream + * \param dev The device of the stream * \param stream The pointer to be freed. */ - virtual void FreeStream(TVMContext ctx, TVMStreamHandle stream); + virtual void FreeStream(Device dev, TVMStreamHandle stream); /*! * \brief Synchronize the stream - * \param ctx The context to perform operation. + * \param dev The device to perform operation. * \param stream The stream to be sync. */ - virtual void StreamSync(TVMContext ctx, TVMStreamHandle stream) = 0; + virtual void StreamSync(Device dev, TVMStreamHandle stream) = 0; /*! * \brief Set the stream - * \param ctx The context to set stream. + * \param dev The device to set stream. * \param stream The stream to be set. */ - virtual void SetStream(TVMContext ctx, TVMStreamHandle stream) {} + virtual void SetStream(Device dev, TVMStreamHandle stream) {} /*! * \brief Synchronize 2 streams of execution. * * An event is created in event_src stream that the second then * stream waits on. Neither event_src or event_dst need to be of - * the same device ID as the context, but they must be of the same + * the same device ID as the device, but they must be of the same * device type. * - * \param ctx The context of the streams. + * \param dev The device of the streams. * \param event_src The source stream to synchronize. * \param event_dst The destination stream to synchronize. */ - virtual void SyncStreamFromTo(TVMContext ctx, TVMStreamHandle event_src, - TVMStreamHandle event_dst); + virtual void SyncStreamFromTo(Device dev, TVMStreamHandle event_src, TVMStreamHandle event_dst); /*! * \brief Allocate temporal workspace for backend execution. * @@ -168,34 +168,34 @@ class TVM_DLL DeviceAPI { * - Repeative pattern of same allocations over different runs. * - Workspace should not overlap between different threads(i.e. be threadlocal) * - * \param ctx The context of allocation. 
@@ -168,34 +168,34 @@ class TVM_DLL DeviceAPI {
 * - Repeative pattern of same allocations over different runs.
 * - Workspace should not overlap between different threads(i.e. be threadlocal)
 *
- * \param ctx The context of allocation.
+ * \param dev The device of allocation.
 * \param nbytes The size to be allocated.
 * \param type_hint The type of elements. Only needed by certain backends such
 * as OpenGL, as nbytes is sufficient for most backends.
 */
- virtual void* AllocWorkspace(TVMContext ctx, size_t nbytes, DLDataType type_hint = {});
+ virtual void* AllocWorkspace(Device dev, size_t nbytes, DLDataType type_hint = {});
 /*!
 * \brief Free temporal workspace in backend execution.
 *
- * \param ctx The context of allocation.
+ * \param dev The device of allocation.
 * \param ptr The pointer to be freed.
 */
- virtual void FreeWorkspace(TVMContext ctx, void* ptr);
+ virtual void FreeWorkspace(Device dev, void* ptr);

 /*!
- * \brief Get device API based on context.
- * \param ctx The context
+ * \brief Get device API based on device.
+ * \param dev The device
 * \param allow_missing Whether allow missing
 * \return The corresponding device API.
 */
- static DeviceAPI* Get(TVMContext ctx, bool allow_missing = false);
+ static DeviceAPI* Get(Device dev, bool allow_missing = false);

 /*!
- * \brief Whether a certian device type requires set device context
+ * \brief Whether a certain device type requires a SetDevice call
 * before launching the kernel function.
 * \param device_type The device type.
 */
- static bool NeedSetDeviceContext(int device_type) {
+ static bool NeedSetDevice(int device_type) {
 return device_type != kDLCPU && device_type != kDLMicroDev;
 }

@@ -207,14 +207,14 @@
 * \param to The target array.
 * \param to_offset The byte offset in the to.
 * \param num_bytes The size of the memory in bytes
- * \param ctx_from The source context
- * \param ctx_to The target context
+ * \param dev_from The source device
+ * \param dev_to The target device
 * \param type_hint The type of elements, only neded by certain backends.
 * can be useful for cross device endian converison.
 * \param stream Optional stream object.
 */
 virtual void CopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset,
- size_t num_bytes, TVMContext ctx_from, TVMContext ctx_to,
+ size_t num_bytes, Device dev_from, Device dev_to,
 DLDataType type_hint, TVMStreamHandle stream);
 };

@@ -263,56 +263,57 @@ inline const char* DeviceName(int type) {
 }

 /*!
- * \brief Return true if a TVMContext is owned by an RPC session.
+ * \brief Return true if a Device is owned by an RPC session.
 */
-inline bool IsRPCSessionContext(TVMContext ctx) { return (ctx.device_type / kRPCSessMask) > 0; }
+inline bool IsRPCSessionDevice(Device dev) { return (dev.device_type / kRPCSessMask) > 0; }

 /*!
- * \brief Return the RPCSessTable index of the RPC Session that owns this context.
+ * \brief Return the RPCSessTable index of the RPC Session that owns this device.
 * \return the table index.
 */
-inline int GetRPCSessionIndex(TVMContext ctx) {
- ICHECK(IsRPCSessionContext(ctx)) << "GetRPCSessionIndex: ctx has no RPC session";
- return ctx.device_type / kRPCSessMask - 1;
+inline int GetRPCSessionIndex(Device dev) {
+ ICHECK(IsRPCSessionDevice(dev)) << "GetRPCSessionIndex: dev has no RPC session";
+ return dev.device_type / kRPCSessMask - 1;
 }

 /*!
- * \brief Remove the RPC session mask from a TVMContext.
- * RPC clients typically do this when encoding a TVMContext for transmission to an RPC remote.
- * On the wire, RPCContext are expected to be valid on the server without interpretation.
- * \param ctx A TVMContext with non-zero RPC Session mask, valid on the RPC client.
- * \return A TVMContext without any RPC Session mask, valid on the RPC server.
+ * \brief Remove the RPC session mask from a Device.
+ * RPC clients typically do this when encoding a Device for transmission to an RPC remote.
+ * On the wire, Devices are expected to be valid on the server without interpretation.
+ * \param dev A Device with non-zero RPC Session mask, valid on the RPC client.
+ * \return A Device without any RPC Session mask, valid on the RPC server.
 */
-inline TVMContext RemoveRPCSessionMask(TVMContext ctx) {
- ctx.device_type = static_cast<DLDeviceType>(ctx.device_type % kRPCSessMask);
- return ctx;
+inline Device RemoveRPCSessionMask(Device dev) {
+ dev.device_type = static_cast<DLDeviceType>(dev.device_type % kRPCSessMask);
+ return dev;
 }

-inline std::ostream& operator<<(std::ostream& os, DLContext ctx);
+inline std::ostream& operator<<(std::ostream& os, DLDevice dev);

 /*!
- * \brief Add a RPC session mask to a TVMContext.
- * RPC clients typically do this when decoding a TVMContext received from a RPC remote.
- * \param ctx A TVMContext without any RPC Session mask, valid on the RPC server.
+ * \brief Add an RPC session mask to a Device.
+ * RPC clients typically do this when decoding a Device received from an RPC remote.
+ * \param dev A Device without any RPC Session mask, valid on the RPC server.
 * \param session_table_index Numeric index of the RPC session in the session table.
- * \return A TVMContext with RPC session mask added, valid on the RPC client.
+ * \return A Device with RPC session mask added, valid on the RPC client.
 */
-inline TVMContext AddRPCSessionMask(TVMContext ctx, int session_table_index) {
- CHECK(!IsRPCSessionContext(ctx))
- << "AddRPCSessionMask: ctx already non-zero RPCSessionIndex: " << ctx;
- ctx.device_type =
- static_cast<DLDeviceType>(ctx.device_type | (kRPCSessMask * (session_table_index + 1)));
- return ctx;
+inline Device AddRPCSessionMask(Device dev, int session_table_index) {
+ CHECK(!IsRPCSessionDevice(dev)) << "AddRPCSessionMask: dev already non-zero RPCSessionIndex: "
+ << dev;
+ dev.device_type =
+ static_cast<DLDeviceType>(dev.device_type | (kRPCSessMask * (session_table_index + 1)));
+ return dev;
 }

-inline std::ostream& operator<<(std::ostream& os, DLContext ctx) { // NOLINT(*)
- if (IsRPCSessionContext(ctx)) {
- os << "remote[" << GetRPCSessionIndex(ctx) << "]-";
- ctx = RemoveRPCSessionMask(ctx);
+inline std::ostream& operator<<(std::ostream& os, DLDevice dev) { // NOLINT(*)
+ if (IsRPCSessionDevice(dev)) {
+ os << "remote[" << GetRPCSessionIndex(dev) << "]-";
+ dev = RemoveRPCSessionMask(dev);
 }
- os << runtime::DeviceName(static_cast<int>(ctx.device_type)) << "(" << ctx.device_id << ")";
+ os << runtime::DeviceName(static_cast<int>(dev.device_type)) << "(" << dev.device_id << ")";
 return os;
 }

 } // namespace runtime
 } // namespace tvm
+
 #endif // TVM_RUNTIME_DEVICE_API_H_
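The RPC helpers compose exactly as their TVMContext predecessors did; a round-trip sketch (editor's illustration, with an assumed session index of 3):

    Device dev{kDLGPU, 0};                         // device as seen on the RPC server
    Device client = AddRPCSessionMask(dev, 3);     // client-side view, tagged with session 3
    std::cout << client << "\n";                   // prints "remote[3]-gpu(0)" via operator<<
    Device server = RemoveRPCSessionMask(client);  // back to the server-side view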
diff --git a/include/tvm/runtime/logging.h b/include/tvm/runtime/logging.h
new file mode 100644
index 000000000000..8a69c2ad7749
--- /dev/null
+++ b/include/tvm/runtime/logging.h
@@ -0,0 +1,539 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file tvm/runtime/logging.h
+ * \brief logging utilities
+ *
+ * We define our own CHECK and LOG macros to replace those from dmlc-core.
+ * These macros are then injected into dmlc-core via the
+ * DMLC_USE_LOGGING_LIBRARY define. dmlc-core will #include this file wherever
+ * it needs logging.
+ */
+#ifndef TVM_RUNTIME_LOGGING_H_
+#define TVM_RUNTIME_LOGGING_H_
+
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+/*!
+ * \brief Macro helper to force a function not to be inlined.
+ * It is only used in places that we know not inlining is good,
+ * e.g. some logging functions.
+ */
+#if defined(_MSC_VER)
+#define TVM_NO_INLINE __declspec(noinline)
+#else
+#define TVM_NO_INLINE __attribute__((noinline))
+#endif
+
+/*!
+ * \brief Macro helper to force a function to be inlined.
+ * It is only used in places that we know inline is important,
+ * e.g. some template expansion cases.
+ */
+#ifdef _MSC_VER
+#define TVM_ALWAYS_INLINE __forceinline
+#else
+#define TVM_ALWAYS_INLINE inline __attribute__((always_inline))
+#endif
+
+/*!
+ * \brief Macro helper for exception throwing.
+ */
+#define TVM_THROW_EXCEPTION noexcept(false)
+
+/*!
+ * \brief Whether or not enable backtrace logging during a
+ * fatal error.
+ *
+ * \note TVM won't depend on LIBBACKTRACE or other exec_info
+ * library when this option is disabled.
+ */
+#ifndef TVM_LOG_STACK_TRACE
+#define TVM_LOG_STACK_TRACE 1
+#endif
+
+/*!
+ * \brief Whether or not use libbacktrace library
+ * for getting backtrace information
+ */
+#ifndef TVM_USE_LIBBACKTRACE
+#define TVM_USE_LIBBACKTRACE 0
+#endif
+
+/*!
+ * \brief Whether or not customize the logging output.
+ * If log customize is enabled, the user must implement
+ * tvm::runtime::detail::LogFatalImpl and tvm::runtime::detail::LogMessageImpl.
+ */
+#ifndef TVM_LOG_CUSTOMIZE
+#define TVM_LOG_CUSTOMIZE 0
+#endif
+
+// a technique that enables overriding macro names on the number of parameters. This is used
+// to define other macros below
+#define GET_MACRO(_1, _2, _3, _4, _5, NAME, ...) NAME
+
+/*!
+ * \brief COND_X calls COND_X_N where N is the number of parameters passed to COND_X.
+ * X can be any of CHECK_GE, CHECK_EQ, CHECK, or LOG. COND_X (but not COND_X_N)
+ * are supposed to be used outside this file.
+ * The first parameter of COND_X (and therefore, COND_X_N), which we call 'quit_on_assert',
+ * is a boolean. The rest of the parameters of COND_X is the same as the parameters of X.
+ * quit_on_assert determines the overall behavior of COND_X. If it's true, COND_X
+ * quits the program on assertion failure. If it's false, then it moves on and somehow reports
+ * the assertion failure back to the macro caller in an appropriate manner (e.g, 'return false'
+ * in a function, or 'continue' or 'break' in a loop)
+ * The default behavior when quit_on_assertion is false, is to 'return false'. If this is not
+ * desirable, the macro caller can pass one more last parameter to COND_X to tell COND_X what
+ * to do when quit_on_assertion is false and the assertion fails.
+ *
+ * Rationale: These macros were designed to implement functions that have two behaviors
+ * in a concise way. Those behaviors are quitting on assertion failures, or trying to
+ * move on from assertion failures. Note that these macros hide lots of control flow in them,
+ * and therefore make the logic of the whole code slightly harder to understand. However,
+ * in pieces of code that use these macros frequently, it will significantly shorten the
+ * amount of code needed to be read, and we won't need to clutter the main logic of the
+ * function by repetitive control flow structure. The first problem
+ * mentioned will be improved over time as the developer gets used to the macro.
+ *
+ * Here is an example of how to use it
+ * \code
+ * bool f(..., bool quit_on_assertion) {
+ * int a = 0, b = 0;
+ * ...
+ * a = ...
+ * b = ...
+ * // if quit_on_assertion is true, if a==b, continue, otherwise quit.
+ * // if quit_on_assertion is false, if a==b, continue, otherwise 'return false' (default
+ * behaviour) COND_CHECK_EQ(quit_on_assertion, a, b) << "some error message when quitting"
+ * ...
+ * for (int i = 0; i < N; i++) {
+ * a = ...
+ * b = ...
+ * // if quit_on_assertion is true, if a==b, continue, otherwise quit.
+ * // if quit_on_assertion is false, if a==b, continue, otherwise 'break' (non-default
+ * // behaviour, therefore, has to be explicitly specified)
+ * COND_CHECK_EQ(quit_on_assertion, a, b, break) << "some error message when quitting"
+ * }
+ * }
+ * \endcode
+ */
+#define COND_CHECK_GE(...) \
+ GET_MACRO(__VA_ARGS__, COND_CHECK_GE_5, COND_CHECK_GE_4, COND_CHECK_GE_3)(__VA_ARGS__)
+#define COND_CHECK_EQ(...) \
+ GET_MACRO(__VA_ARGS__, COND_CHECK_EQ_5, COND_CHECK_EQ_4, COND_CHECK_EQ_3)(__VA_ARGS__)
+#define COND_CHECK(...) \
+ GET_MACRO(__VA_ARGS__, COND_CHECK_5, COND_CHECK_4, COND_CHECK_3, COND_CHECK_2)(__VA_ARGS__)
+#define COND_LOG(...) \
+ GET_MACRO(__VA_ARGS__, COND_LOG_5, COND_LOG_4, COND_LOG_3, COND_LOG_2)(__VA_ARGS__)
+
+// Not supposed to be used by users directly.
+#define COND_CHECK_OP(quit_on_assert, x, y, what, op) \
+ if (!quit_on_assert) { \
+ if (!((x)op(y))) what; \
+ } else /* NOLINT(*) */ \
+ CHECK_##op(x, y)
+
+#define COND_CHECK_EQ_4(quit_on_assert, x, y, what) COND_CHECK_OP(quit_on_assert, x, y, what, ==)
+#define COND_CHECK_GE_4(quit_on_assert, x, y, what) COND_CHECK_OP(quit_on_assert, x, y, what, >=)
+
+#define COND_CHECK_3(quit_on_assert, x, what) \
+ if (!quit_on_assert) { \
+ if (!(x)) what; \
+ } else /* NOLINT(*) */ \
+ CHECK(x)
+
+#define COND_LOG_3(quit_on_assert, x, what) \
+ if (!quit_on_assert) { \
+ what; \
+ } else /* NOLINT(*) */ \
+ LOG(x)
+
+#define COND_CHECK_EQ_3(quit_on_assert, x, y) COND_CHECK_EQ_4(quit_on_assert, x, y, return false)
+#define COND_CHECK_GE_3(quit_on_assert, x, y) COND_CHECK_GE_4(quit_on_assert, x, y, return false)
+#define COND_CHECK_2(quit_on_assert, x) COND_CHECK_3(quit_on_assert, x, return false)
+#define COND_LOG_2(quit_on_assert, x) COND_LOG_3(quit_on_assert, x, return false)
+
+namespace tvm {
+namespace runtime {
+
+/*!
+ * \brief Generate a backtrace when called.
+ * \return A multiline string of the backtrace. There will be either one or two lines per frame.
+ */
+TVM_DLL std::string Backtrace();
+
+/*! \brief Base error type for TVM. Wraps a string message. */
+class Error : public ::dmlc::Error { // for backwards compatibility
+ public:
+ /*!
+ * \brief Construct an error.
+ * \param s The message to be displayed with the error.
+ */
+ explicit Error(const std::string& s) : ::dmlc::Error(s) {}
+};
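A failed CHECK in this scheme surfaces as the InternalError defined just below, which derives from Error and hence from dmlc::Error, so existing handlers keep working; a brief sketch (editor's illustration, not part of the patch):

    try {
      CHECK_EQ(2 + 2, 5) << "arithmetic is broken";
    } catch (const tvm::runtime::Error& e) {
      std::cerr << e.what();  // "[HH:MM:SS] file.cc:123: Check failed: ..."
    }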
+
+/*!
+ * \brief Error type for errors from CHECK, ICHECK, and LOG(FATAL). This error
+ * contains a backtrace of where it occurred.
+ */
+class InternalError : public Error {
+ public:
+ /*! \brief Construct an error. Not recommended to use directly. Instead use LOG(FATAL).
+ *
+ * \param file The file where the error occurred.
+ * \param lineno The line number where the error occurred.
+ * \param message The error message to display.
+ * \param time The time at which the error occurred. This should be in local time.
+ * \param backtrace Backtrace from when the error occurred.
+ */
+ InternalError(std::string file, int lineno, std::string message,
+ std::time_t time = std::time(nullptr), std::string backtrace = Backtrace())
+ : Error(""),
+ file_(file),
+ lineno_(lineno),
+ message_(message),
+ time_(time),
+ backtrace_(backtrace) {
+ std::ostringstream s;
+ // XXX: Do not change this format, otherwise all error handling in python will break (because it
+ // parses the message to reconstruct the error type).
+ // TODO(tkonolige): Convert errors to Objects, so we can avoid the mess of formatting/parsing
+ // error messages correctly.
+ s << "[" << std::put_time(std::localtime(&time), "%H:%M:%S") << "] " << file << ":" << lineno
+ << ": " << message << std::endl;
+ if (backtrace.size() > 0) {
+ s << backtrace << std::endl;
+ }
+ full_message_ = s.str();
+ }
+ /*! \return The file in which the error occurred. */
+ const std::string& file() const { return file_; }
+ /*! \return The message associated with this error. */
+ const std::string& message() const { return message_; }
+ /*! \return Formatted error message including file, line number, backtrace, and message. */
+ const std::string& full_message() const { return full_message_; }
+ /*! \return The backtrace from where this error occurred. */
+ const std::string& backtrace() const { return backtrace_; }
+ /*! \return The time at which this error occurred. */
+ const std::time_t& time() const { return time_; }
+ /*! \return The line number at which this error occurred. */
+ int lineno() const { return lineno_; }
+ virtual const char* what() const noexcept { return full_message_.c_str(); }
+
+ private:
+ std::string file_;
+ int lineno_;
+ std::string message_;
+ std::time_t time_;
+ std::string backtrace_;
+ std::string full_message_; // holds the full error string
+};
+
+/*! \brief Internal implementation */
+namespace detail {
+// Provide support for customized logging.
+#if TVM_LOG_CUSTOMIZE
+/*!
+ * \brief Custom implementations of LogFatal.
+ *
+ * \sa TVM_LOG_CUSTOMIZE
+ */
+TVM_DLL void LogFatalImpl(const std::string& file, int lineno, const std::string& message);
+
+/*!
+ * \brief Custom implementations of LogMessage.
+ *
+ * \sa TVM_LOG_CUSTOMIZE
+ */
+TVM_DLL void LogMessageImpl(const std::string& file, int lineno, const std::string& message);
+
+/*!
+ * \brief Class to accumulate an error message and throw it. Do not use
+ * directly, instead use LOG(FATAL).
+ */
+class LogFatal {
+ public:
+ LogFatal(const std::string& file, int lineno) : file_(file), lineno_(lineno) {}
+ ~LogFatal() TVM_THROW_EXCEPTION { LogFatalImpl(file_, lineno_, stream_.str()); }
+ std::ostringstream& stream() { return stream_; }
+
+ private:
+ std::ostringstream stream_;
+ std::string file_;
+ int lineno_;
+};
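When building with TVM_LOG_CUSTOMIZE=1 an embedder must supply the two hooks declared above; a minimal sketch of what such definitions might look like (only the signatures come from the header, the bodies are illustrative):

    namespace tvm {
    namespace runtime {
    namespace detail {
    void LogFatalImpl(const std::string& file, int lineno, const std::string& message) {
      throw InternalError(file, lineno, message);  // or route into a host error system
    }
    void LogMessageImpl(const std::string& file, int lineno, const std::string& message) {
      std::cerr << file << ":" << lineno << ": " << message << std::endl;
    }
    }  // namespace detail
    }  // namespace runtime
    }  // namespace tvm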
+
+/*!
+ * \brief Class to accumulate a log message. Do not use directly, instead use
+ * LOG(INFO), LOG(WARNING), LOG(ERROR).
+ */
+class LogMessage {
+ public:
+ LogMessage(const std::string& file, int lineno) : file_(file), lineno_(lineno) {}
+ ~LogMessage() { LogMessageImpl(file_, lineno_, stream_.str()); }
+ std::ostringstream& stream() { return stream_; }
+
+ private:
+ std::string file_;
+ int lineno_;
+ std::ostringstream stream_;
+};
+
+#else
+
+/*!
+ * \brief Class to accumulate an error message and throw it. Do not use
+ * directly, instead use LOG(FATAL).
+ * \note The `LogFatal` class is designed to be an empty class to reduce stack size usage.
+ * To play this trick, we use the thread-local storage to store its internal data.
+ */
+class LogFatal {
+ public:
+ TVM_NO_INLINE LogFatal(const char* file, int lineno) { GetEntry().Init(file, lineno); }
+#ifdef _MSC_VER
+#pragma warning(push)
+#pragma warning(disable : 4722)
+#endif
+ ~LogFatal() TVM_THROW_EXCEPTION { GetEntry().Finalize(); }
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
+ std::ostringstream& stream() { return GetEntry().stream_; }
+
+ private:
+ struct Entry {
+ void Init(const char* file, int lineno) {
+ this->stream_.str("");
+ this->file_ = file;
+ this->lineno_ = lineno;
+ }
+ TVM_NO_INLINE dmlc::Error Finalize() { throw InternalError(file_, lineno_, stream_.str()); }
+ std::ostringstream stream_;
+ std::string file_;
+ int lineno_;
+ };
+
+ TVM_DLL TVM_NO_INLINE static Entry& GetEntry();
+};
+
+/*!
+ * \brief Class to accumulate a log message. Do not use directly, instead use
+ * LOG(INFO), LOG(WARNING), LOG(ERROR).
+ */
+class LogMessage {
+ public:
+ LogMessage(const std::string& file, int lineno) {
+ std::time_t t = std::time(nullptr);
+ stream_ << "[" << std::put_time(std::localtime(&t), "%H:%M:%S") << "] " << file << ":" << lineno
+ << ": ";
+ }
+ TVM_NO_INLINE ~LogMessage() { std::cerr << stream_.str() << std::endl; }
+ std::ostringstream& stream() { return stream_; }
+
+ private:
+ std::ostringstream stream_;
+};
+#endif
+
+// Below is from dmlc-core
+// This class is used to explicitly ignore values in the conditional
+// logging macros. This avoids compiler warnings like "value computed
+// is not used" and "statement has no effect".
+class LogMessageVoidify {
+ public:
+ LogMessageVoidify() {}
+ // This has to be an operator with a precedence lower than << but
+ // higher than "?:". See its usage.
+ void operator&(std::ostream&) {}
+};
+
+// Also from dmlc-core
+inline bool DebugLoggingEnabled() {
+ static int state = 0;
+ if (state == 0) {
+ if (auto var = std::getenv("TVM_LOG_DEBUG")) {
+ if (std::string(var) == "1") {
+ state = 1;
+ } else {
+ state = -1;
+ }
+ } else {
+ // by default hide debug logging.
+ state = -1;
+ }
+ }
+ return state == 1;
+}
+
+constexpr const char* kTVM_INTERNAL_ERROR_MESSAGE =
+ "\n"
+ "---------------------------------------------------------------\n"
+ "An internal invariant was violated during the execution of TVM.\n"
+ "Please read TVM's error reporting guidelines.\n"
+ "More details can be found here: https://discuss.tvm.ai/t/error-reporting/7793.\n"
+ "---------------------------------------------------------------\n";
+
+template <typename X, typename Y>
+std::unique_ptr<std::string> LogCheckFormat(const X& x, const Y& y) {
+ std::ostringstream os;
+ os << " (" << x << " vs. " << y << ") "; // CHECK_XX(x, y) requires x and y can be serialized to
+ // string. Use CHECK(x OP y) otherwise.
+ // no std::make_unique until c++14
+ return std::unique_ptr<std::string>(new std::string(os.str()));
+}
+
+// Inline _Pragma in macros does not work reliably on old versions of MSVC and
+// GCC.
We wrap all comparisons in a function so that we can use #pragma to +// silence bad comparison warnings. +#define TVM_CHECK_FUNC(name, op) \ + template \ + TVM_ALWAYS_INLINE std::unique_ptr LogCheck##name(const X& x, const Y& y) { \ + if (x op y) return nullptr; \ + return LogCheckFormat(x, y); \ + } \ + TVM_ALWAYS_INLINE std::unique_ptr LogCheck##name(int x, int y) { \ + return LogCheck##name(x, y); \ + } + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wsign-compare" +TVM_CHECK_FUNC(_LT, <) +TVM_CHECK_FUNC(_GT, >) +TVM_CHECK_FUNC(_LE, <=) +TVM_CHECK_FUNC(_GE, >=) +TVM_CHECK_FUNC(_EQ, ==) +TVM_CHECK_FUNC(_NE, !=) +#pragma GCC diagnostic pop +} // namespace detail + +#define LOG(level) LOG_##level +#define LOG_FATAL ::tvm::runtime::detail::LogFatal(__FILE__, __LINE__).stream() +#define LOG_INFO ::tvm::runtime::detail::LogMessage(__FILE__, __LINE__).stream() +#define LOG_ERROR (::tvm::runtime::detail::LogMessage(__FILE__, __LINE__).stream() << "Error: ") +#define LOG_WARNING (::tvm::runtime::detail::LogMessage(__FILE__, __LINE__).stream() << "Warning: ") + +#define TVM_CHECK_BINARY_OP(name, op, x, y) \ + if (auto __tvm__log__err = ::tvm::runtime::detail::LogCheck##name(x, y)) \ + ::tvm::runtime::detail::LogFatal(__FILE__, __LINE__).stream() \ + << "Check failed: " << #x " " #op " " #y << *__tvm__log__err << ": " + +#define CHECK(x) \ + if (!(x)) \ + ::tvm::runtime::detail::LogFatal(__FILE__, __LINE__).stream() \ + << "Check failed: (" #x << ") is false: " + +#define CHECK_LT(x, y) TVM_CHECK_BINARY_OP(_LT, <, x, y) +#define CHECK_GT(x, y) TVM_CHECK_BINARY_OP(_GT, >, x, y) +#define CHECK_LE(x, y) TVM_CHECK_BINARY_OP(_LE, <=, x, y) +#define CHECK_GE(x, y) TVM_CHECK_BINARY_OP(_GE, >=, x, y) +#define CHECK_EQ(x, y) TVM_CHECK_BINARY_OP(_EQ, ==, x, y) +#define CHECK_NE(x, y) TVM_CHECK_BINARY_OP(_NE, !=, x, y) +#define CHECK_NOTNULL(x) \ + ((x) == nullptr ? ::tvm::runtime::detail::LogFatal(__FILE__, __LINE__).stream() \ + << "Check not null: " #x << ' ', \ + (x) : (x)) // NOLINT(*) + +#define LOG_IF(severity, condition) \ + !(condition) ? (void)0 : ::tvm::runtime::detail::LogMessageVoidify() & LOG(severity) + +#if TVM_LOG_DEBUG + +#define LOG_DFATAL LOG_FATAL +#define DFATAL FATAL +#define DLOG(severity) LOG_IF(severity, ::tvm::runtime::detail::DebugLoggingEnabled()) +#define DLOG_IF(severity, condition) \ + LOG_IF(severity, ::tvm::runtime::detail::DebugLoggingEnabled() && (condition)) + +#else + +#define LOG_DFATAL LOG_ERROR +#define DFATAL ERROR +#define DLOG(severity) true ? (void)0 : ::tvm::runtime::detail::LogMessageVoidify() & LOG(severity) +#define DLOG_IF(severity, condition) \ + (true || !(condition)) ? 
(void)0 : ::tvm::runtime::detail::LogMessageVoidify() & LOG(severity) + +#endif + +#if TVM_LOG_DEBUG +#define DCHECK(x) \ + while (false) CHECK(x) +#define DCHECK_LT(x, y) \ + while (false) CHECK((x) < (y)) +#define DCHECK_GT(x, y) \ + while (false) CHECK((x) > (y)) +#define DCHECK_LE(x, y) \ + while (false) CHECK((x) <= (y)) +#define DCHECK_GE(x, y) \ + while (false) CHECK((x) >= (y)) +#define DCHECK_EQ(x, y) \ + while (false) CHECK((x) == (y)) +#define DCHECK_NE(x, y) \ + while (false) CHECK((x) != (y)) +#else +#define DCHECK(x) CHECK(x) +#define DCHECK_LT(x, y) CHECK((x) < (y)) +#define DCHECK_GT(x, y) CHECK((x) > (y)) +#define DCHECK_LE(x, y) CHECK((x) <= (y)) +#define DCHECK_GE(x, y) CHECK((x) >= (y)) +#define DCHECK_EQ(x, y) CHECK((x) == (y)) +#define DCHECK_NE(x, y) CHECK((x) != (y)) +#endif + +#define TVM_ICHECK_INDENT " " + +#define ICHECK_BINARY_OP(name, op, x, y) \ + if (auto __tvm__log__err = ::tvm::runtime::detail::LogCheck##name(x, y)) \ + ::tvm::runtime::detail::LogFatal(__FILE__, __LINE__).stream() \ + << ::tvm::runtime::detail::kTVM_INTERNAL_ERROR_MESSAGE << std::endl \ + << TVM_ICHECK_INDENT << "Check failed: " << #x " " #op " " #y << *__tvm__log__err << ": " + +#define ICHECK(x) \ + if (!(x)) \ + ::tvm::runtime::detail::LogFatal(__FILE__, __LINE__).stream() \ + << ::tvm::runtime::detail::kTVM_INTERNAL_ERROR_MESSAGE << TVM_ICHECK_INDENT \ + << "Check failed: (" #x << ") is false: " + +#define ICHECK_LT(x, y) ICHECK_BINARY_OP(_LT, <, x, y) +#define ICHECK_GT(x, y) ICHECK_BINARY_OP(_GT, >, x, y) +#define ICHECK_LE(x, y) ICHECK_BINARY_OP(_LE, <=, x, y) +#define ICHECK_GE(x, y) ICHECK_BINARY_OP(_GE, >=, x, y) +#define ICHECK_EQ(x, y) ICHECK_BINARY_OP(_EQ, ==, x, y) +#define ICHECK_NE(x, y) ICHECK_BINARY_OP(_NE, !=, x, y) +#define ICHECK_NOTNULL(x) \ + ((x) == nullptr ? ::tvm::runtime::detail::LogFatal(__FILE__, __LINE__).stream() \ + << ::tvm::runtime::detail::kTVM_INTERNAL_ERROR_MESSAGE \ + << TVM_ICHECK_INDENT << "Check not null: " #x << ' ', \ + (x) : (x)) // NOLINT(*) + +} // namespace runtime +// Re-export error types +using runtime::Error; +using runtime::InternalError; +} // namespace tvm +#endif // TVM_RUNTIME_LOGGING_H_ diff --git a/include/tvm/runtime/ndarray.h b/include/tvm/runtime/ndarray.h index a884b5c6838f..ada9b74503bc 100644 --- a/include/tvm/runtime/ndarray.h +++ b/include/tvm/runtime/ndarray.h @@ -31,12 +31,15 @@ #include #include +#include #include #include namespace tvm { namespace runtime { +typedef DLDevice Device; + /*! * \brief Managed NDArray. * The array is backed by reference counted blocks. @@ -101,11 +104,11 @@ class NDArray : public ObjectRef { */ TVM_DLL void CopyToBytes(void* data, size_t nbytes) const; /*! - * \brief Copy the data to another context. - * \param ctx The target context. - * \return The array under another context. + * \brief Copy the data to another device. + * \param dev The target device. + * \return The array under another device. */ - inline NDArray CopyTo(const DLContext& ctx) const; + inline NDArray CopyTo(const Device& dev) const; /*! * \brief Load NDArray from stream * \param stream The input data stream @@ -134,11 +137,11 @@ class NDArray : public ObjectRef { * \brief Create an empty NDArray. * \param shape The shape of the new array. * \param dtype The data type of the new array. - * \param ctx The context of the array. + * \param dev The device of the array. * \param mem_scope The memory scope of the array. 
* \return The created Array */ - TVM_DLL static NDArray Empty(std::vector shape, DLDataType dtype, DLContext ctx, + TVM_DLL static NDArray Empty(std::vector shape, DLDataType dtype, Device dev, Optional mem_scope = NullOpt); /*! * \brief Create a NDArray backed by a dlpack tensor. @@ -256,7 +259,7 @@ class NDArray::Container : public Object, public NDArray::ContainerBase { dl_tensor.byte_offset = 0; } - Container(void* data, std::vector shape, DLDataType dtype, DLContext ctx) { + Container(void* data, std::vector shape, DLDataType dtype, Device dev) { // Initialize the type index. type_index_ = Container::RuntimeTypeIndex(); dl_tensor.data = data; @@ -266,7 +269,7 @@ class NDArray::Container : public Object, public NDArray::ContainerBase { dl_tensor.dtype = dtype; dl_tensor.strides = nullptr; dl_tensor.byte_offset = 0; - dl_tensor.ctx = ctx; + dl_tensor.device = dev; } /*! * \brief Set the deleter field. @@ -349,11 +352,11 @@ inline void NDArray::CopyTo(const NDArray& other) const { CopyFromTo(&(get_mutable()->dl_tensor), &(other.get_mutable()->dl_tensor)); } -inline NDArray NDArray::CopyTo(const DLContext& ctx) const { +inline NDArray NDArray::CopyTo(const Device& dev) const { ICHECK(data_ != nullptr); const DLTensor* dptr = operator->(); NDArray ret = - Empty(std::vector(dptr->shape, dptr->shape + dptr->ndim), dptr->dtype, ctx); + Empty(std::vector(dptr->shape, dptr->shape + dptr->ndim), dptr->dtype, dev); this->CopyTo(ret); return ret; } @@ -401,12 +404,12 @@ inline bool SaveDLTensor(dmlc::Stream* strm, const DLTensor* tensor) { // This is used to prevent case when another user loads the parameters // back on machine that do not have GPU or related context. // - // We can always do array.CopyTo(target_ctx) to get a corresponding + // We can always do array.CopyTo(target_dev) to get a corresponding // array in the target context. 
- DLContext cpu_ctx;
- cpu_ctx.device_type = kDLCPU;
- cpu_ctx.device_id = 0;
- strm->Write(cpu_ctx);
+ Device cpu_dev;
+ cpu_dev.device_type = kDLCPU;
+ cpu_dev.device_id = 0;
+ strm->Write(cpu_dev);
 strm->Write(tensor->ndim);
 strm->Write(tensor->dtype);
 int ndim = tensor->ndim;
@@ -419,8 +422,8 @@ inline bool SaveDLTensor(dmlc::Stream* strm, const DLTensor* tensor) {
 int64_t data_byte_size = type_bytes * num_elems;
 strm->Write(data_byte_size);

- if (DMLC_IO_NO_ENDIAN_SWAP && tensor->ctx.device_type == kDLCPU && tensor->strides == nullptr &&
- tensor->byte_offset == 0) {
+ if (DMLC_IO_NO_ENDIAN_SWAP && tensor->device.device_type == kDLCPU &&
+ tensor->strides == nullptr && tensor->byte_offset == 0) {
 // quick path
 strm->Write(tensor->data, data_byte_size);
 } else {
@@ -444,18 +447,18 @@ inline bool NDArray::Load(dmlc::Stream* strm) {
 ICHECK(strm->Read(&header)) << "Invalid DLTensor file format";
 ICHECK(strm->Read(&reserved)) << "Invalid DLTensor file format";
 ICHECK(header == kTVMNDArrayMagic) << "Invalid DLTensor file format";
- DLContext ctx;
+ Device dev;
 int ndim;
 DLDataType dtype;
- ICHECK(strm->Read(&ctx)) << "Invalid DLTensor file format";
+ ICHECK(strm->Read(&dev)) << "Invalid DLTensor file format";
 ICHECK(strm->Read(&ndim)) << "Invalid DLTensor file format";
 ICHECK(strm->Read(&dtype)) << "Invalid DLTensor file format";
- ICHECK_EQ(ctx.device_type, kDLCPU) << "Invalid DLTensor context: can only save as CPU tensor";
+ ICHECK_EQ(dev.device_type, kDLCPU) << "Invalid DLTensor device: can only save as CPU tensor";
 std::vector shape(ndim);
 if (ndim != 0) {
 ICHECK(strm->ReadArray(&shape[0], ndim)) << "Invalid DLTensor file format";
 }
- NDArray ret = NDArray::Empty(shape, dtype, ctx);
+ NDArray ret = NDArray::Empty(shape, dtype, dev);
 int64_t num_elems = 1;
 int elem_bytes = (ret->dtype.bits + 7) / 8;
 for (int i = 0; i < ret->ndim; ++i) {
@@ -477,5 +480,26 @@ inline bool NDArray::Load(dmlc::Stream* strm) {
 }

 } // namespace runtime
+
+// alias Device
+using tvm::runtime::Device;
+
 } // namespace tvm
+
+namespace std {
+template <>
+struct hash<tvm::runtime::Device> {
+ std::size_t operator()(const tvm::runtime::Device& dev) const {
+ return ((dev.device_id << 8) | dev.device_type);
+ }
+};
+
+template <>
+struct equal_to<tvm::runtime::Device> {
+ bool operator()(const tvm::runtime::Device& lhs, const tvm::runtime::Device& rhs) const {
+ return (lhs.device_type == rhs.device_type && lhs.device_id == rhs.device_id);
+ }
+};
+} // namespace std
+
 #endif // TVM_RUNTIME_NDARRAY_H_
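With the std::hash and std::equal_to specializations above, Device can key standard containers directly; a small sketch (editor's illustration, not part of the patch):

    std::unordered_map<tvm::Device, std::string> names;
    names[{kDLCPU, 0}] = "host";
    names[{kDLGPU, 0}] = "accelerator";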
diff --git a/include/tvm/runtime/object.h b/include/tvm/runtime/object.h
index 70ab7688c450..048fc1d5af54 100644
--- a/include/tvm/runtime/object.h
+++ b/include/tvm/runtime/object.h
@@ -24,7 +24,7 @@
 #define TVM_RUNTIME_OBJECT_H_

 #include
-#include
+#include

 #include
 #include
@@ -186,7 +186,7 @@ class TVM_DLL Object {
 template
 inline bool IsInstance() const;
 /*!
- * \return Weather the cell has only one reference
+ * \return Whether the cell has only one reference
 * \note We use stl style naming to be consistent with known API in shared_ptr.
 */
 inline bool unique() const;
@@ -337,7 +337,7 @@ inline RelayRefType GetRef(const ObjectType* ptr);
 /*!
 * \brief Downcast a base reference type to a more specific type.
 *
- * \param ref The inptut reference
+ * \param ref The input reference
 * \return The corresponding SubRef.
 * \tparam SubRef The target specific reference type.
 * \tparam BaseRef the current reference type.
@@ -416,7 +416,7 @@ class ObjectPtr {
 return *get();
 }
 /*!
- * \brief copy assignmemt
+ * \brief copy assignment
 * \param other The value to be assigned.
 * \return reference to self.
 */
@@ -427,7 +427,7 @@ class ObjectPtr {
 return *this;
 }
 /*!
- * \brief move assignmemt
+ * \brief move assignment
 * \param other The value to be assigned.
 * \return reference to self.
 */
@@ -632,7 +632,7 @@ struct ObjectPtrEqual {
 };

 /*!
- * \brief helper macro to declare a base object type that can be inheritated.
+ * \brief helper macro to declare a base object type that can be inherited.
 * \param TypeName The name of the current type.
 * \param ParentType The name of the ParentType
 */
@@ -648,10 +648,10 @@ struct ObjectPtrEqual {
 return _GetOrAllocRuntimeTypeIndex(); \
 } \
 static uint32_t _GetOrAllocRuntimeTypeIndex() { \
- static uint32_t tidx = Object::GetOrAllocRuntimeTypeIndex( \
+ static uint32_t tindex = Object::GetOrAllocRuntimeTypeIndex( \
 TypeName::_type_key, TypeName::_type_index, ParentType::_GetOrAllocRuntimeTypeIndex(), \
 TypeName::_type_child_slots, TypeName::_type_child_slots_can_overflow); \
- return tidx; \
+ return tindex; \
 }

 /*!
@@ -664,7 +664,7 @@ struct ObjectPtrEqual {
 static const constexpr int _type_child_slots = 0; \
 TVM_DECLARE_BASE_OBJECT_INFO(TypeName, ParentType)

-/*! \brief helper macro to supress unused warning */
+/*! \brief helper macro to suppress unused warning */
 #if defined(__GNUC__)
 #define TVM_ATTRIBUTE_UNUSED __attribute__((unused))
 #else
@@ -686,7 +686,7 @@ struct ObjectPtrEqual {
 TVM_STR_CONCAT(TVM_OBJECT_REG_VAR_DEF, __COUNTER__) = TypeName::_GetOrAllocRuntimeTypeIndex()

 /*
- * \brief Define the default copy/move constructor and assign opeator
+ * \brief Define the default copy/move constructor and assign operator
 * \param TypeName The class typename.
 */
 #define TVM_DEFINE_DEFAULT_COPY_MOVE_AND_ASSIGN(TypeName) \
@@ -827,7 +827,7 @@ inline bool Object::IsInstance() const {
 if (!TargetType::_type_child_slots_can_overflow) return false;
 // Invariance: parent index is always smaller than the child.
 if (self->type_index_ < TargetType::RuntimeTypeIndex()) return false;
- // The rare slower-path, check type hierachy.
+ // The rare slower-path, check type hierarchy.
 return self->DerivedFrom(TargetType::RuntimeTypeIndex());
 }
 } else {
diff --git a/include/tvm/runtime/packed_func.h b/include/tvm/runtime/packed_func.h
index 751a435c734a..58bd2859c10a 100644
--- a/include/tvm/runtime/packed_func.h
+++ b/include/tvm/runtime/packed_func.h
@@ -24,10 +24,10 @@
 #ifndef TVM_RUNTIME_PACKED_FUNC_H_
 #define TVM_RUNTIME_PACKED_FUNC_H_

-#include
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -46,14 +46,6 @@
 #define TVM_RUNTIME_HEADER_ONLY 0
 #endif

-// Always inline macro only use in template
-// expansion cases where we know inline is important.
-#ifdef _MSC_VER
-#define TVM_ALWAYS_INLINE __forceinline
-#else
-#define TVM_ALWAYS_INLINE inline __attribute__((always_inline))
-#endif
-
 namespace tvm {
 namespace runtime {

@@ -547,9 +539,9 @@ class TVMPODValue_ {
 TVM_CHECK_TYPE_CODE(type_code_, kTVMModuleHandle);
 return Module(ObjectPtr(static_cast(value_.v_handle)));
 }
- operator TVMContext() const {
- TVM_CHECK_TYPE_CODE(type_code_, kTVMContext);
- return value_.v_ctx;
+ operator Device() const {
+ TVM_CHECK_TYPE_CODE(type_code_, kDLDevice);
+ return value_.v_device;
 }
 int type_code() const { return type_code_; }
 /*!
@@ -606,7 +598,7 @@ class TVMArgValue : public TVMPODValue_ { using TVMPODValue_::operator void*; using TVMPODValue_::operator DLTensor*; using TVMPODValue_::operator NDArray; - using TVMPODValue_::operator TVMContext; + using TVMPODValue_::operator Device; using TVMPODValue_::operator Module; using TVMPODValue_::AsObjectRef; using TVMPODValue_::IsObjectRef; @@ -666,7 +658,7 @@ class TVMMovableArgValue_ : public TVMPODValue_ { using TVMPODValue_::operator void*; using TVMPODValue_::operator DLTensor*; using TVMPODValue_::operator NDArray; - using TVMPODValue_::operator TVMContext; + using TVMPODValue_::operator Device; using TVMPODValue_::operator Module; // reuse conversion rule from ArgValue. operator std::string() const { return AsArgValue().operator std::string(); } @@ -743,7 +735,7 @@ class TVMRetValue : public TVMPODValue_ { /*! \brief default constructor */ TVMRetValue() {} /*! - * \brief move constructor from anoter return value. + * \brief move constructor from another return value. * \param other The other return value. */ TVMRetValue(TVMRetValue&& other) : TVMPODValue_(other.value_, other.type_code_) { @@ -760,7 +752,7 @@ class TVMRetValue : public TVMPODValue_ { using TVMPODValue_::operator bool; using TVMPODValue_::operator void*; using TVMPODValue_::operator DLTensor*; - using TVMPODValue_::operator TVMContext; + using TVMPODValue_::operator Device; using TVMPODValue_::operator NDArray; using TVMPODValue_::operator Module; using TVMPODValue_::AsObjectRef; @@ -827,9 +819,9 @@ class TVMRetValue : public TVMPODValue_ { value_.v_int64 = value; return *this; } - TVMRetValue& operator=(TVMContext value) { - this->SwitchToPOD(kTVMContext); - value_.v_ctx = value; + TVMRetValue& operator=(DLDevice value) { + this->SwitchToPOD(kDLDevice); + value_.v_device = value; return *this; } TVMRetValue& operator=(DLDataType t) { @@ -1086,7 +1078,7 @@ struct PackedFuncValueConverter { Function(::tvm::runtime::TVMArgs(args, type_code, num_args), &rv); \ rv.MoveToCHost(out_value, out_type_code); \ return 0; \ - } catch (const ::std::runtime_error& _except_) { \ + } catch (const ::std::exception& _except_) { \ TVMAPISetLastError(_except_.what()); \ return -1; \ } \ @@ -1119,7 +1111,7 @@ struct PackedFuncValueConverter { * }); * * // The following code will cause compilation error. 
- * // Because the same Function and ExortName + * // Because the same Function and ExportName * // TVM_DLL_EXPORT_TYPED_FUNC(AddOne_, AddOne_); * * // The following code is OK, assuming the macro @@ -1140,7 +1132,7 @@ struct PackedFuncValueConverter { f, ::tvm::runtime::TVMArgs(args, type_code, num_args), &rv); \ rv.MoveToCHost(out_value, out_type_code); \ return 0; \ - } catch (const ::std::runtime_error& _except_) { \ + } catch (const ::std::exception& _except_) { \ TVMAPISetLastError(_except_.what()); \ return -1; \ } \ @@ -1180,8 +1172,8 @@ inline const char* ArgTypeCode2Str(int type_code) { return "ArrayHandle"; case kTVMDataType: return "DLDataType"; - case kTVMContext: - return "TVMContext"; + case kDLDevice: + return "DLDevice"; case kTVMPackedFuncHandle: return "FunctionHandle"; case kTVMModuleHandle: @@ -1295,9 +1287,9 @@ class TVMArgsSetter { values_[i].v_handle = value; type_codes_[i] = kTVMDLTensorHandle; } - TVM_ALWAYS_INLINE void operator()(size_t i, TVMContext value) const { - values_[i].v_ctx = value; - type_codes_[i] = kTVMContext; + TVM_ALWAYS_INLINE void operator()(size_t i, Device value) const { + values_[i].v_device = value; + type_codes_[i] = kDLDevice; } TVM_ALWAYS_INLINE void operator()(size_t i, DLDataType value) const { values_[i].v_type = value; diff --git a/include/tvm/runtime/profiling.h b/include/tvm/runtime/profiling.h index 45b60ea18acc..fa16e7820b35 100644 --- a/include/tvm/runtime/profiling.h +++ b/include/tvm/runtime/profiling.h @@ -80,7 +80,7 @@ class Timer : public ObjectRef { public: /*! * \brief Get a device specific timer. - * \param ctx The device context to time. + * \param dev The device to time. * \return A `Timer` that has already been started. * * Use this function to time runtime of arbitrary regions of code on a specific @@ -95,7 +95,7 @@ class Timer : public ObjectRef { * * Example usage: * \code{.cpp} - * Timer t = Timer::Start(TVMContext::cpu()); + * Timer t = Timer::Start(Device::cpu()); * my_long_running_function(); * t->Stop(); * ... // some more computation @@ -104,7 +104,7 @@ class Timer : public ObjectRef { * * To add a new device-specific timer, register a new function * "profiler.timer.my_device" (where `my_device` is the `DeviceName` of your - * device). This function should accept a `TVMContext` and return a new `Timer` + * device). This function should accept a `Device` and return a new `Timer` * that has already been started. * * For example, this is how the CPU timer is implemented: @@ -125,24 +125,24 @@ class Timer : public ObjectRef { * }; * TVM_REGISTER_OBJECT_TYPE(CPUTimerNode); * - * TVM_REGISTER_GLOBAL("profiling.timer.cpu").set_body_typed([](TVMContext ctx) { + * TVM_REGISTER_GLOBAL("profiling.timer.cpu").set_body_typed([](Device dev) { * return Timer(make_object()); * }); * \endcode */ - static TVM_DLL Timer Start(TVMContext ctx); + static TVM_DLL Timer Start(Device dev); TVM_DEFINE_MUTABLE_OBJECT_REF_METHODS(Timer, ObjectRef, TimerNode); }; /*! - * \brief Default timer if one does not exist for the context. - * \param ctx The context to time on. + * \brief Default timer if one does not exist for the device. + * \param dev The device to time on. * * Note that this timer performs synchronization between the device and CPU, * which can lead to overhead in the reported results. 
*/ -Timer DefaultTimer(TVMContext ctx); +Timer DefaultTimer(Device dev); } // namespace runtime } // namespace tvm diff --git a/include/tvm/runtime/serializer.h b/include/tvm/runtime/serializer.h index f40c87ee07ec..b35cad368832 100644 --- a/include/tvm/runtime/serializer.h +++ b/include/tvm/runtime/serializer.h @@ -20,7 +20,7 @@ /*! * \file tvm/runtime/serializer.h * \brief Serializer extension to support TVM data types - * Include this file to enable serialization of DLDataType, DLContext + * Include this file to enable serialization of DLDataType, DLDevice */ #ifndef TVM_RUNTIME_SERIALIZER_H_ #define TVM_RUNTIME_SERIALIZER_H_ @@ -49,17 +49,17 @@ struct Handler { }; template <> -struct Handler { - inline static void Write(Stream* strm, const DLContext& ctx) { - int32_t device_type = static_cast(ctx.device_type); +struct Handler { + inline static void Write(Stream* strm, const DLDevice& dev) { + int32_t device_type = static_cast(dev.device_type); Handler::Write(strm, device_type); - Handler::Write(strm, ctx.device_id); + Handler::Write(strm, dev.device_id); } - inline static bool Read(Stream* strm, DLContext* ctx) { + inline static bool Read(Stream* strm, DLDevice* dev) { int32_t device_type = 0; if (!Handler::Read(strm, &(device_type))) return false; - ctx->device_type = static_cast(device_type); - if (!Handler::Read(strm, &(ctx->device_id))) return false; + dev->device_type = static_cast(device_type); + if (!Handler::Read(strm, &(dev->device_id))) return false; return true; } }; diff --git a/include/tvm/runtime/vm/bytecode.h b/include/tvm/runtime/vm/bytecode.h index e858c4458054..72a557fa93b1 100644 --- a/include/tvm/runtime/vm/bytecode.h +++ b/include/tvm/runtime/vm/bytecode.h @@ -25,7 +25,7 @@ #define TVM_RUNTIME_VM_BYTECODE_H_ #include -#include +#include #include #include diff --git a/include/tvm/runtime/vm/memory_manager.h b/include/tvm/runtime/vm/memory_manager.h index c983cb066ec9..fb2354bca4ec 100644 --- a/include/tvm/runtime/vm/memory_manager.h +++ b/include/tvm/runtime/vm/memory_manager.h @@ -34,23 +34,6 @@ #include #include -namespace std { -template <> -struct hash { - std::size_t operator()(const TVMContext& ctx) const { - return ((ctx.device_id << 8) | ctx.device_type); - } -}; - -template <> -struct equal_to { - bool operator()(const TVMContext& lhs, const TVMContext& rhs) const { - return (lhs.device_type == rhs.device_type && lhs.device_id == rhs.device_id); - } -}; - -} // namespace std - namespace tvm { namespace runtime { namespace vm { @@ -61,7 +44,7 @@ struct Buffer { /*! \brief The size of the block. */ size_t size{0}; /*! \brief The context of the allocated buffers. */ - TVMContext ctx; + Device device; }; enum AllocatorType { @@ -76,10 +59,10 @@ class Allocator { /*! \brief Allocate an empty NDArray using from the allocator. * \param shape The shape of the NDArray. * \param dtype The datatype of the NDArray. - * \param ctx The context where the array is allocated. + * \param dev The device where the array is allocated. * \return The empty NDArray. */ - NDArray Empty(std::vector shape, DLDataType dtype, DLContext ctx); + NDArray Empty(std::vector shape, DLDataType dtype, Device dev); /*! \brief Return the allocator type. */ inline AllocatorType type() const { return type_; } /*! \brief Allocate a buffer given a size, alignment and type. @@ -107,24 +90,24 @@ class MemoryManager { static MemoryManager* Global(); /*! * \brief Get or create an allocator given the context and allocator type. 
- * \param ctx The TVM context + * \param dev The TVM device * \param type The allocator type * \return The memory allocator. */ - static Allocator* GetOrCreateAllocator(TVMContext ctx, AllocatorType type); + static Allocator* GetOrCreateAllocator(Device dev, AllocatorType type); /*! * \brief Get an allocator given the context. - * \param ctx The TVM context + * \param dev The TVM device * \return The memory allocator. */ - static Allocator* GetAllocator(TVMContext ctx); + static Allocator* GetAllocator(Device dev); private: MemoryManager() {} private: std::mutex mu_; - std::unordered_map> allocators_; + std::unordered_map> allocators_; }; /*! \brief An object representing a storage allocation. */ @@ -140,7 +123,7 @@ class StorageObj : public Object { static void Deleter(Object* ptr); ~StorageObj() { - auto alloc = MemoryManager::Global()->GetAllocator(buffer.ctx); + auto alloc = MemoryManager::Global()->GetAllocator(buffer.device); alloc->Free(buffer); } diff --git a/include/tvm/runtime/vm/vm.h b/include/tvm/runtime/vm/vm.h index e9f51de611b6..15de1df98a78 100644 --- a/include/tvm/runtime/vm/vm.h +++ b/include/tvm/runtime/vm/vm.h @@ -239,17 +239,17 @@ class VirtualMachine : public runtime::ModuleNode { Index output_size, const std::vector& args); /*! - * \brief Initialize the virtual machine for a set of contexts. - * \param contexts The set of TVM contexts. - * \param alloc_types The allocator types for each context. + * \brief Initialize the virtual machine for a set of devices. + * \param devices The set of TVM devices. + * \param alloc_types The allocator types for each device. */ - void Init(const std::vector& contexts, const std::vector& alloc_types); + void Init(const std::vector& devices, const std::vector& alloc_types); /*! \brief Run VM dispatch loop. */ void RunLoop(); - /*! \brief Get context from the context list based on a given device type. */ - TVMContext GetContext(Index device_type) const; + /*! \brief Get device from the device list based on a given device type. */ + Device GetDevice(Index device_type) const; /*! * \brief Invoke a global setting up the VM state to execute. @@ -275,8 +275,8 @@ class VirtualMachine : public runtime::ModuleNode { const Executable* exec_; /*! \brief The function name to inputs mapping. */ std::unordered_map> inputs_; - /*! \brief The set of TVM contexts the VM is currently executing on. */ - std::vector ctxs_; + /*! \brief The set of TVM devices the VM is currently executing on. */ + std::vector devices_; /*! \brief The cached memory allocators. */ std::vector allocators_; /*! diff --git a/include/tvm/support/logging.h b/include/tvm/support/logging.h deleted file mode 100644 index ced1902a1bd1..000000000000 --- a/include/tvm/support/logging.h +++ /dev/null @@ -1,158 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. 
See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! - * \file tvm/support/logging.h - * \brief logging utilities on top of dmlc-core - */ -#ifndef TVM_SUPPORT_LOGGING_H_ -#define TVM_SUPPORT_LOGGING_H_ - -#include - -// a technique that enables overriding macro names on the number of parameters. This is used -// to define other macros below -#define GET_MACRO(_1, _2, _3, _4, _5, NAME, ...) NAME - -/*! - * \brief COND_X calls COND_X_N where N is the number of parameters passed to COND_X - * X can be any of CHECK_GE, CHECK_EQ, CHECK, or LOG (defined dmlc-core/include/dmlc/logging.h.) - * COND_X (but not COND_X_N) are supposed to be used outside this file. - * The first parameter of COND_X (and therefore, COND_X_N), which we call 'quit_on_assert', - * is a boolean. The rest of the parameters of COND_X is the same as the parameters of X. - * quit_on_assert determines the overall behaviour of COND_X. If it's true COND_X - * quits the program on assertion failure. If it's false, then it moves on and somehow reports - * the assertion failure back to the macro caller in an appropriate manner (e.g, 'return false' - * in a function, or 'continue' or 'break' in a loop) - * The default behavior when quit_on_assertion is false, is to 'return false'. If this is not - * desirable, the macro caller can pass one more last parameter to COND_X to tell COND_X what - * to do when when quit_on_assertion is false and the assertion fails. - * - * Rationale: These macros were designed to implement functions that have two behaviours - * in a concise way. Those behaviours are quitting on assertion failures, or trying to - * move on from assertion failures. Note that these macros hide lots of control flow in them, - * and therefore, makes the logic of the whole code slightly harder to understand. However, - * in pieces of code that use these macros frequently, it will significantly shorten the - * amount of code needed to be read, and we won't need to clutter the main logic of the - * function by repetitive control flow structure. The first problem - * mentioned will be improved over time as the developer gets used to the macro. - * - * Here is an example of how to use it - * \code - * bool f(..., bool quit_on_assertion) { - * int a = 0, b = 0; - * ... - * a = ... - * b = ... - * // if quit_on_assertion is true, if a==b, continue, otherwise quit. - * // if quit_on_assertion is false, if a==b, continue, otherwise 'return false' (default - * behaviour) COND_CHECK_EQ(quit_on_assertion, a, b) << "some error message when quiting" - * ... - * for (int i = 0; i < N; i++) { - * a = ... - * b = ... - * // if quit_on_assertion is true, if a==b, continue, otherwise quit. - * // if quit_on_assertion is false, if a==b, continue, otherwise 'break' (non-default - * // behaviour, therefore, has to be explicitly specified) - * COND_CHECK_EQ(quit_on_assertion, a, b, break) << "some error message when quiting" - * } - * } - * \endcode - */ -#define COND_CHECK_GE(...) \ - GET_MACRO(__VA_ARGS__, COND_CHECK_GE_5, COND_CHECK_GE_4, COND_CHECK_GE_3)(__VA_ARGS__) -#define COND_CHECK_EQ(...) \ - GET_MACRO(__VA_ARGS__, COND_CHECK_EQ_5, COND_CHECK_EQ_4, COND_CHECK_EQ_3)(__VA_ARGS__) -#define COND_CHECK(...) \ - GET_MACRO(__VA_ARGS__, COND_CHECK_5, COND_CHECK_4, COND_CHECK_3, COND_CHECK_2)(__VA_ARGS__) -#define COND_LOG(...) \ - GET_MACRO(__VA_ARGS__, COND_LOG_5, COND_LOG_4, COND_LOG_3, COND_LOG_2)(__VA_ARGS__) - -// Not supposed to be used by users directly. 
-#define COND_CHECK_OP(quit_on_assert, x, y, what, op) \ - if (!quit_on_assert) { \ - if (!((x)op(y))) what; \ - } else /* NOLINT(*) */ \ - CHECK_##op(x, y) - -#define COND_CHECK_EQ_4(quit_on_assert, x, y, what) COND_CHECK_OP(quit_on_assert, x, y, what, ==) -#define COND_CHECK_GE_4(quit_on_assert, x, y, what) COND_CHECK_OP(quit_on_assert, x, y, what, >=) - -#define COND_CHECK_3(quit_on_assert, x, what) \ - if (!quit_on_assert) { \ - if (!(x)) what; \ - } else /* NOLINT(*) */ \ - CHECK(x) - -#define COND_LOG_3(quit_on_assert, x, what) \ - if (!quit_on_assert) { \ - what; \ - } else /* NOLINT(*) */ \ - LOG(x) - -#define COND_CHECK_EQ_3(quit_on_assert, x, y) COND_CHECK_EQ_4(quit_on_assert, x, y, return false) -#define COND_CHECK_GE_3(quit_on_assert, x, y) COND_CHECK_GE_4(quit_on_assert, x, y, return false) -#define COND_CHECK_2(quit_on_assert, x) COND_CHECK_3(quit_on_assert, x, return false) -#define COND_LOG_2(quit_on_assert, x) COND_LOG_3(quit_on_assert, x, return false) - -namespace tvm { - -constexpr const char* kTVM_INTERNAL_ERROR_MESSAGE = - "\n---------------------------------------------------------------\n" - "An internal invariant was violated during the execution of TVM.\n" - "Please read TVM's error reporting guidelines.\n" - "More details can be found here: https://discuss.tvm.ai/t/error-reporting/7793.\n" - "---------------------------------------------------------------\n"; - -#define ICHECK_INDENT " " - -#define ICHECK_BINARY_OP(name, op, x, y) \ - if (dmlc::LogCheckError _check_err = dmlc::LogCheck##name(x, y)) \ - dmlc::LogMessageFatal(__FILE__, __LINE__).stream() \ - << tvm::kTVM_INTERNAL_ERROR_MESSAGE << std::endl \ - << ICHECK_INDENT << "Check failed: " << #x " " #op " " #y << *(_check_err.str) << ": " - -#define ICHECK(x) \ - if (!(x)) \ - dmlc::LogMessageFatal(__FILE__, __LINE__).stream() \ - << tvm::kTVM_INTERNAL_ERROR_MESSAGE << ICHECK_INDENT << "Check failed: " #x << " == false: " - -#define ICHECK_LT(x, y) ICHECK_BINARY_OP(_LT, <, x, y) -#define ICHECK_GT(x, y) ICHECK_BINARY_OP(_GT, >, x, y) -#define ICHECK_LE(x, y) ICHECK_BINARY_OP(_LE, <=, x, y) -#define ICHECK_GE(x, y) ICHECK_BINARY_OP(_GE, >=, x, y) -#define ICHECK_EQ(x, y) ICHECK_BINARY_OP(_EQ, ==, x, y) -#define ICHECK_NE(x, y) ICHECK_BINARY_OP(_NE, !=, x, y) -#define ICHECK_NOTNULL(x) \ - ((x) == nullptr ? dmlc::LogMessageFatal(__FILE__, __LINE__).stream() \ - << tvm::kTVM_INTERNAL_ERROR_MESSAGE << ICHECK_INDENT \ - << "Check not null: " #x << ' ', \ - (x) : (x)) // NOLINT(*) - -/*! \brief The diagnostic level, controls the printing of the message. */ -enum class DiagnosticLevel : int { - kBug = 10, - kError = 20, - kWarning = 30, - kNote = 40, - kHelp = 50, -}; - -} // namespace tvm -#endif // TVM_SUPPORT_LOGGING_H_ diff --git a/include/tvm/support/with.h b/include/tvm/support/with.h index 90c82c4f3a06..d4547a304e8f 100644 --- a/include/tvm/support/with.h +++ b/include/tvm/support/with.h @@ -25,7 +25,7 @@ #ifndef TVM_SUPPORT_WITH_H_ #define TVM_SUPPORT_WITH_H_ -#include +#include #include diff --git a/include/tvm/tir/analysis.h b/include/tvm/tir/analysis.h index e5b2c2b6957c..250a84e782a2 100644 --- a/include/tvm/tir/analysis.h +++ b/include/tvm/tir/analysis.h @@ -56,6 +56,22 @@ struct ExprDeepEqual { TVM_DLL bool operator()(const PrimExpr& lhs, const PrimExpr& rhs) const; }; +/*! 
+ * \brief Visit the PrimFuncs in the IRModule
+ * \tparam FLambda The type of the PrimFunc visitor
+ * \param mod The IRModule to be visited
+ * \param fvisit The visitor to the PrimFuncs in the IRModule
+ */
+template <class FLambda>
+inline void VisitPrimFuncs(const IRModule& mod, FLambda fvisit) {
+ for (const auto& kv : mod->functions) {
+ const BaseFunc& base_func = kv.second;
+ if (const auto* prim_func = base_func.as<PrimFuncNode>()) {
+ fvisit(prim_func);
+ }
+ }
+}
+
 /*!
 * \brief Find undefined vars in the statement.
 * \param stmt The function to be checked.
@@ -141,6 +157,27 @@ TVM_DLL bool VerifyMemory(const PrimFunc& func);
 */
 TVM_DLL bool VerifyGPUCode(const PrimFunc& func, Map constraints);

+/*!
+ * \brief Auto detect the block read/write region according to body stmt.
+ * It will detect the read/write region as an array in order of appearance in AST.
+ * \param block The block to be detected
+ * \param buffer_var_map The outside buffers which may be accessed by the block.
+ * It is a map from buffer var to the buffer.
+ * \return Array of access regions.
+ * There are three arrays of BufferRegion:
+ * - first: read regions
+ * - second: write regions
+ * - third: opaque regions
+ */
+Array<Array<BufferRegion>> GetBlockAccessRegion(const Block& block,
+ const Map<Var, Buffer>& buffer_var_map);
+
+/*!
+ * \brief Calculate the expression complexity based on the number of symbols it contains.
+ * \param expr The expr to be calculated.
+ */
+TVM_DLL size_t CalculateExprComplexity(const PrimExpr& expr);
+
 // Pass variants of verification analysis
 // directly throws RuntimeError when verification fails.
 namespace transform {
diff --git a/include/tvm/tir/stmt.h b/include/tvm/tir/stmt.h
index ac660bfb7461..84c27498740a 100644
--- a/include/tvm/tir/stmt.h
+++ b/include/tvm/tir/stmt.h
@@ -1231,10 +1231,10 @@ constexpr const char* storage_scope = "storage_scope";
 constexpr const char* storage_alignment = "storage_alignment";
 /*! \brief Mark storage scope of realization */
 constexpr const char* realize_scope = "realize_scope";
-/*! \brief The allocation context for global malloc in host. */
-constexpr const char* device_context_id = "device_context_id";
+/*! \brief The allocation device for global malloc in host. */
+constexpr const char* device_id = "device_id";
 /*! \brief The device type. */
-constexpr const char* device_context_type = "device_context_type";
+constexpr const char* device_type = "device_type";
 /*! \brief Mark of loop scope */
 constexpr const char* loop_scope = "loop_scope";
 /*! \brief Mark of reduce scope */
@@ -1312,6 +1312,10 @@ constexpr const char* fragment_shape = "fragment_shape";
 */
 constexpr const char* fragment_layout = "fragment_layout";

+/*!
+ * \brief Mark that the kernel is hand threaded and doesn't need syncs inserted
+ */
+constexpr const char* hand_threaded = "hand_threaded";
 /*!
 * \brief Check if attr_key is a pragma key extension
 * \param attr_key The attr key to be compared
diff --git a/include/tvm/tir/stmt_functor.h b/include/tvm/tir/stmt_functor.h
index d6303ae266e1..c1c618f0c22f 100644
--- a/include/tvm/tir/stmt_functor.h
+++ b/include/tvm/tir/stmt_functor.h
@@ -386,6 +386,15 @@ inline T Substitute(T input, const std::unordered_map&
 return Substitute(std::move(input), vmap);
 }

+/*!
+ * \brief Recursively visit the IR in pre-DFS order and apply fvisit at each node.
+ * \param stmt_or_expr The IR to be visited.
+ * \param fvisit The visitor function to be applied. If fvisit returns false, it won't visit
+ * the children of the node.
+ */
+TVM_DLL void PreOrderVisit(const ObjectRef& stmt_or_expr,
+ const std::function<bool(const ObjectRef&)>& fvisit);
 } // namespace tir
 } // namespace tvm
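A usage sketch of the two new visitors (editor's illustration; `mod` and `stmt` are assumed to be an existing IRModule and statement):

    // Walk every PrimFunc in the module.
    tvm::tir::VisitPrimFuncs(mod, [](const tvm::tir::PrimFuncNode* f) {
      LOG(INFO) << "found a PrimFunc";
    });
    // Pre-order traversal; returning false prunes the subtree under `node`.
    tvm::tir::PreOrderVisit(stmt, [](const tvm::runtime::ObjectRef& node) { return true; });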
If fvisit returns false, it won't visit the + * children of the node + */ +TVM_DLL void PreOrderVisit(const ObjectRef& stmt_or_expr, + const std::function& fvisit); } // namespace tir } // namespace tvm diff --git a/include/tvm/topi/detail/constant_utils.h b/include/tvm/topi/detail/constant_utils.h index 92ff3a4e3804..95e68f5f6d61 100644 --- a/include/tvm/topi/detail/constant_utils.h +++ b/include/tvm/topi/detail/constant_utils.h @@ -119,12 +119,11 @@ inline std::vector GetConstInt64Values(Array exprs, } /*! - * \brief Check weather the two expressions are equal or not, if not simplify the expressions and - * check again \note This is stronger equality check than tvm::tir::Equal - * - * \param lhs First expreesion - * \param rhs Second expreesion - * + * \brief Check whether the two expressions are equal or not, if not simplify the expressions and + * check again + * \note This is stronger equality check than tvm::tir::Equal + * \param lhs First expression + * \param rhs Second expression * \return result True if both expressions are equal, else false */ inline bool EqualCheck(PrimExpr lhs, PrimExpr rhs) { diff --git a/jvm/README.md b/jvm/README.md index e23c632fb04a..6e71adf65683 100644 --- a/jvm/README.md +++ b/jvm/README.md @@ -125,7 +125,7 @@ The following code snippet demonstrate how to load generated shared library (add ```java import org.apache.tvm.Module; import org.apache.tvm.NDArray; -import org.apache.tvm.TVMContext; +import org.apache.tvm.Device; import java.io.File; import java.util.Arrays; @@ -135,12 +135,12 @@ public class LoadAddFunc { String loadingDir = args[0]; Module fadd = Module.load(loadingDir + File.separator + "add_cpu.so"); - TVMContext ctx = TVMContext.cpu(); + Device dev = Device.cpu(); long[] shape = new long[]{2}; - NDArray arr = NDArray.empty(shape, ctx); + NDArray arr = NDArray.empty(shape, dev); arr.copyFrom(new float[]{3f, 4f}); - NDArray res = NDArray.empty(shape, ctx); + NDArray res = NDArray.empty(shape, dev); fadd.entryFunc().pushArg(arr).pushArg(arr).pushArg(res).invoke(); System.out.println(Arrays.toString(res.asFloatArray())); diff --git a/jvm/core/src/main/java/org/apache/tvm/ArgTypeCode.java b/jvm/core/src/main/java/org/apache/tvm/ArgTypeCode.java index b3b3da56e72f..ed6d0f1a0e12 100644 --- a/jvm/core/src/main/java/org/apache/tvm/ArgTypeCode.java +++ b/jvm/core/src/main/java/org/apache/tvm/ArgTypeCode.java @@ -20,7 +20,7 @@ // Type code used in API calls public enum ArgTypeCode { INT(0), UINT(1), FLOAT(2), HANDLE(3), NULL(4), TVM_TYPE(5), - TVM_CONTEXT(6), ARRAY_HANDLE(7), NODE_HANDLE(8), MODULE_HANDLE(9), + DLDEVICE(6), ARRAY_HANDLE(7), NODE_HANDLE(8), MODULE_HANDLE(9), FUNC_HANDLE(10), STR(11), BYTES(12), NDARRAY_CONTAINER(13); public final int id; diff --git a/jvm/core/src/main/java/org/apache/tvm/TVMContext.java b/jvm/core/src/main/java/org/apache/tvm/Device.java similarity index 75% rename from jvm/core/src/main/java/org/apache/tvm/TVMContext.java rename to jvm/core/src/main/java/org/apache/tvm/Device.java index 76375636a6ca..3447c692b5ef 100644 --- a/jvm/core/src/main/java/org/apache/tvm/TVMContext.java +++ b/jvm/core/src/main/java/org/apache/tvm/Device.java @@ -22,7 +22,7 @@ import java.util.HashMap; import java.util.Map; -public class TVMContext { +public class Device { private static final Map MASK2STR = new HashMap(); private static final Map STR2MASK = new HashMap(); @@ -49,103 +49,103 @@ public class TVMContext { /** * Construct a CPU device. 
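   * For example, {@code Device dev = Device.cpu(0);} yields device type 1, id 0 (illustrative).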
* @param devId The device id - * @return The created context + * @return The created device */ - public static TVMContext cpu(int devId) { - return new TVMContext(1, devId); + public static Device cpu(int devId) { + return new Device(1, devId); } - public static TVMContext cpu() { + public static Device cpu() { return cpu(0); } /** * Construct a GPU device. * @param devId The device id - * @return The created context + * @return The created device */ - public static TVMContext gpu(int devId) { - return new TVMContext(2, devId); + public static Device gpu(int devId) { + return new Device(2, devId); } - public static TVMContext gpu() { + public static Device gpu() { return gpu(0); } /** * Construct a OpenCL device. * @param devId The device id - * @return The created context + * @return The created device */ - public static TVMContext opencl(int devId) { - return new TVMContext(4, devId); + public static Device opencl(int devId) { + return new Device(4, devId); } - public static TVMContext opencl() { + public static Device opencl() { return opencl(0); } /** * Construct a Vulkan device. * @param devId The device id - * @return The created context + * @return The created device */ - public static TVMContext vulkan(int devId) { - return new TVMContext(7, devId); + public static Device vulkan(int devId) { + return new Device(7, devId); } - public static TVMContext vulkan() { + public static Device vulkan() { return vulkan(0); } /** * Construct a metal device. * @param devId The device id - * @return The created context + * @return The created device */ - public static TVMContext metal(int devId) { - return new TVMContext(8, devId); + public static Device metal(int devId) { + return new Device(8, devId); } - public static TVMContext metal() { + public static Device metal() { return metal(0); } /** * Construct a VPI simulated device. * @param devId The device id - * @return The created context + * @return The created device */ - public static TVMContext vpi(int devId) { - return new TVMContext(9, devId); + public static Device vpi(int devId) { + return new Device(9, devId); } - public static TVMContext vpi() { + public static Device vpi() { return vpi(0); } /** * Construct a Hexagon device. * @param devId The device id - * @return The created context + * @return The created device */ - public static TVMContext hexagon(int devId) { - return new TVMContext(14, devId); + public static Device hexagon(int devId) { + return new Device(14, devId); } - public static TVMContext hexagon() { + public static Device hexagon() { return hexagon(0); } public final int deviceType; public final int deviceId; - public TVMContext(int deviceType, int deviceId) { + public Device(int deviceType, int deviceId) { this.deviceType = deviceType; this.deviceId = deviceId; } - public TVMContext(String deviceType, int deviceId) { + public Device(String deviceType, int deviceId) { this(STR2MASK.get(deviceType), deviceId); } @@ -180,7 +180,7 @@ public long warpSize() { } /** - * Synchronize until jobs finished at the context. + * Synchronize until jobs finished at the device. 
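+   * A typical (illustrative) pattern: launch work on the device, then call
+   * {@code dev.sync()} to block until everything queued has completed.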
*/ public void sync() { Base.checkCall(Base._LIB.tvmSynchronize(deviceType, deviceId)); @@ -191,8 +191,8 @@ public void sync() { } @Override public boolean equals(Object other) { - if (other != null && other instanceof TVMContext) { - TVMContext obj = (TVMContext) other; + if (other != null && other instanceof Device) { + Device obj = (Device) other; return deviceId == obj.deviceId && deviceType == obj.deviceType; } return false; diff --git a/jvm/core/src/main/java/org/apache/tvm/LibInfo.java b/jvm/core/src/main/java/org/apache/tvm/LibInfo.java index 4c5e0a66bf87..62b8c901bd71 100644 --- a/jvm/core/src/main/java/org/apache/tvm/LibInfo.java +++ b/jvm/core/src/main/java/org/apache/tvm/LibInfo.java @@ -71,6 +71,6 @@ native int tvmArrayAlloc(long[] shape, int dtypeCode, int dtypeBits, int dtypeLa native int tvmArrayCopyToJArray(long from, byte[] to); - // TVMContext + // Device native int tvmSynchronize(int deviceType, int deviceId); } diff --git a/jvm/core/src/main/java/org/apache/tvm/NDArray.java b/jvm/core/src/main/java/org/apache/tvm/NDArray.java index 4e7386b123f8..a301d23dfbfa 100644 --- a/jvm/core/src/main/java/org/apache/tvm/NDArray.java +++ b/jvm/core/src/main/java/org/apache/tvm/NDArray.java @@ -27,12 +27,12 @@ */ public class NDArray extends NDArrayBase { private final TVMType dtype; - private final TVMContext context; + private final Device device; - NDArray(long handle, boolean isView, TVMType dtype, TVMContext ctx) { + NDArray(long handle, boolean isView, TVMType dtype, Device dev) { super(handle, isView); this.dtype = dtype; - this.context = ctx; + this.device = dev; } @Override protected void finalize() throws Throwable { @@ -364,26 +364,26 @@ private byte[][] groupInternalBytes() { } /** - * Get the context of current array. - * @return the context. + * Get the device of current array. + * @return the device. */ - public TVMContext ctx() { - return context; + public Device device() { + return device; } /** * Create an empty array given shape, type and device. * @param shape The shape of the array. * @param dtype The data type of the array. - * @param ctx The context of the array. + * @param dev The device of the array. * @return The array tvm supported. */ - public static NDArray empty(long[] shape, TVMType dtype, TVMContext ctx) { + public static NDArray empty(long[] shape, TVMType dtype, Device dev) { Base.RefLong refHandle = new Base.RefLong(); Base.checkCall(Base._LIB.tvmArrayAlloc( shape, dtype.typeCode, dtype.bits, dtype.lanes, - ctx.deviceType, ctx.deviceId, refHandle)); - return new NDArray(refHandle.value, false, dtype, ctx); + dev.deviceType, dev.deviceId, refHandle)); + return new NDArray(refHandle.value, false, dtype, dev); } /** @@ -393,7 +393,7 @@ public static NDArray empty(long[] shape, TVMType dtype, TVMContext ctx) { * @return The array tvm supported. */ public static NDArray empty(long[] shape, TVMType dtype) { - return empty(shape, dtype, new TVMContext(1, 0)); + return empty(shape, dtype, new Device(1, 0)); } /** @@ -402,17 +402,17 @@ public static NDArray empty(long[] shape, TVMType dtype) { * @return The array tvm supported. */ public static NDArray empty(long[] shape) { - return empty(shape, new TVMType("float32", 1), new TVMContext(1, 0)); + return empty(shape, new TVMType("float32", 1), new Device(1, 0)); } /** * Create an empty float32 array given shape and device. * @param shape The shape of the array. - * @param ctx The context of the array. + * @param dev The device of the array. * @return The array tvm supported. 
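   * (Illustrative: {@code NDArray.empty(new long[]{2}, Device.cpu())} allocates a
   * two-element float32 array on the CPU.)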
*/ - public static NDArray empty(long[] shape, TVMContext ctx) { - return empty(shape, new TVMType("float32", 1), ctx); + public static NDArray empty(long[] shape, Device dev) { + return empty(shape, new TVMType("float32", 1), dev); } private static ByteBuffer wrapBytes(byte[] bytes) { diff --git a/jvm/core/src/main/java/org/apache/tvm/contrib/GraphRuntime.java b/jvm/core/src/main/java/org/apache/tvm/contrib/GraphExecutor.java similarity index 87% rename from jvm/core/src/main/java/org/apache/tvm/contrib/GraphRuntime.java rename to jvm/core/src/main/java/org/apache/tvm/contrib/GraphExecutor.java index 61ff966eaf38..30b2fb1acafb 100644 --- a/jvm/core/src/main/java/org/apache/tvm/contrib/GraphRuntime.java +++ b/jvm/core/src/main/java/org/apache/tvm/contrib/GraphExecutor.java @@ -17,37 +17,37 @@ package org.apache.tvm.contrib; +import org.apache.tvm.Device; import org.apache.tvm.Function; import org.apache.tvm.Module; -import org.apache.tvm.TVMContext; import org.apache.tvm.TVMValue; import org.apache.tvm.rpc.RPC; import org.apache.tvm.rpc.RPCSession; -import org.apache.tvm.rpc.TVMRemoteContext; +import org.apache.tvm.rpc.TVMRemoteDevice; import java.lang.reflect.Field; import java.lang.reflect.InvocationTargetException; import java.lang.reflect.Method; -public class GraphRuntime { +public class GraphExecutor { /** * Create a runtime executor module given a graph and module. * @param graphJson The graph deployed in json format output by compiler. * @param libmod The module of the corresponding function. - * @param ctx The local or remote context to deploy the module. + * @param dev The local or remote device to deploy the module. * @return Runtime graph module that can be used to execute the graph. */ - public static GraphModule create(String graphJson, Module libmod, TVMContext ctx) { - Function fcreate = Function.getFunction("tvm.graph_runtime.create"); + public static GraphModule create(String graphJson, Module libmod, Device dev) { + Function fcreate = Function.getFunction("tvm.graph_executor.create"); if (fcreate == null) { - throw new RuntimeException("Cannot find global function tvm.graph_runtime.create." + throw new RuntimeException("Cannot find global function tvm.graph_executor.create." + "Did you compile tvm_runtime with correct version?"); } Module graphModule = fcreate.pushArg(graphJson) - .pushArg(libmod).pushArg(ctx.deviceType).pushArg(ctx.deviceId) + .pushArg(libmod).pushArg(dev.deviceType).pushArg(dev.deviceId) .invoke().asModule(); - return new GraphModule(graphModule, ctx); + return new GraphModule(graphModule, dev); } private static Object reflectionGetField(Object obj, String fieldName) { diff --git a/jvm/core/src/main/java/org/apache/tvm/contrib/GraphModule.java b/jvm/core/src/main/java/org/apache/tvm/contrib/GraphModule.java index 64f089fb4eb1..a7a03d52740e 100644 --- a/jvm/core/src/main/java/org/apache/tvm/contrib/GraphModule.java +++ b/jvm/core/src/main/java/org/apache/tvm/contrib/GraphModule.java @@ -19,10 +19,10 @@ package org.apache.tvm.contrib; +import org.apache.tvm.Device; import org.apache.tvm.Function; import org.apache.tvm.Module; import org.apache.tvm.NDArray; -import org.apache.tvm.TVMContext; /** * Wrapper runtime module. 
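 * A hypothetical end-to-end use:
 * {@code GraphModule g = GraphExecutor.create(json, lib, Device.cpu());
 * g.setInput("x", arr).run();}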
@@ -32,7 +32,7 @@ */ public class GraphModule { private Module module; - private TVMContext ctx; + private Device device; private Function fsetInput; private Function frun; @@ -41,9 +41,9 @@ public class GraphModule { private Function fdebugGetOutput; private Function floadParams; - GraphModule(Module module, TVMContext ctx) { + GraphModule(Module module, Device dev) { this.module = module; - this.ctx = ctx; + this.device = dev; fsetInput = module.getFunction("set_input"); frun = module.getFunction("run"); fgetInput = module.getFunction("get_input"); @@ -82,8 +82,8 @@ public void release() { */ public GraphModule setInput(String key, NDArray value) { NDArray input = value; - if (!value.ctx().equals(ctx)) { - input = NDArray.empty(value.shape(), ctx); + if (!value.device().equals(device)) { + input = NDArray.empty(value.shape(), device); value.copyTo(input); } fsetInput.pushArg(key).pushArg(input).invoke(); @@ -98,8 +98,8 @@ public GraphModule setInput(String key, NDArray value) { */ public GraphModule setInput(int key, NDArray value) { NDArray input = value; - if (!value.ctx().equals(ctx)) { - input = NDArray.empty(value.shape(), ctx); + if (!value.device().equals(device)) { + input = NDArray.empty(value.shape(), device); value.copyTo(input); } fsetInput.pushArg(key).pushArg(input).invoke(); @@ -147,7 +147,7 @@ public NDArray debugGetOutput(String node, NDArray out) { if (fdebugGetOutput != null) { fdebugGetOutput.pushArg(node).pushArg(out).invoke(); } else { - throw new RuntimeException("Please compile runtime with USE_GRAPH_RUNTIME_DEBUG = 0"); + throw new RuntimeException("Please compile runtime with USE_GRAPH_EXECUTOR_DEBUG = 0"); } return out; } @@ -162,7 +162,7 @@ public NDArray debugGetOutput(int node, NDArray out) { if (fdebugGetOutput != null) { fdebugGetOutput.pushArg(node).pushArg(out).invoke(); } else { - throw new RuntimeException("Please compile runtime with USE_GRAPH_RUNTIME_DEBUG = 0"); + throw new RuntimeException("Please compile runtime with USE_GRAPH_EXECUTOR_DEBUG = 0"); } return out; } diff --git a/jvm/core/src/main/java/org/apache/tvm/rpc/RPCSession.java b/jvm/core/src/main/java/org/apache/tvm/rpc/RPCSession.java index b9f621473cf4..1d3f38627926 100644 --- a/jvm/core/src/main/java/org/apache/tvm/rpc/RPCSession.java +++ b/jvm/core/src/main/java/org/apache/tvm/rpc/RPCSession.java @@ -17,9 +17,9 @@ package org.apache.tvm.rpc; +import org.apache.tvm.Device; import org.apache.tvm.Function; import org.apache.tvm.Module; -import org.apache.tvm.TVMContext; import java.io.File; import java.io.FileInputStream; @@ -52,111 +52,111 @@ public Function getFunction(String name) { } /** - * Construct a remote context. + * Construct a remote device. * @param devType device type. * @param devId device id. - * @return The corresponding encoded remote context. + * @return The corresponding encoded remote device. */ - public TVMContext context(String devType, int devId) { - TVMContext ctx = new TVMContext(devType, devId); + public Device device(String devType, int devId) { + Device dev = new Device(devType, devId); int encode = (tblIndex + 1) * RPC.RPC_SESS_MASK; - return new TVMRemoteContext(ctx.deviceType + encode, devId, this); + return new TVMRemoteDevice(dev.deviceType + encode, devId, this); } /** - * Construct a remote context. + * Construct a remote device. * @param devType device type. - * @return The corresponding encoded remote context. + * @return The corresponding encoded remote device. 
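+   * (e.g. a hypothetical {@code session.device("cpu")} returns a Device whose
+   * type is offset by RPC_SESS_MASK to mark it as remote)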
*/ - public TVMContext context(String devType) { - return context(devType, 0); + public Device device(String devType) { + return device(devType, 0); } /** - * Construct a remote context. + * Construct a remote device. * @param devType device type. * @param devId device id. - * @return The corresponding encoded remote context. + * @return The corresponding encoded remote device. */ - public TVMContext context(int devType, int devId) { + public Device device(int devType, int devId) { int encode = (tblIndex + 1) * RPC.RPC_SESS_MASK; - return new TVMRemoteContext(devType + encode, devId, this); + return new TVMRemoteDevice(devType + encode, devId, this); } /** - * Construct a remote context. + * Construct a remote device. * @param devType device type. - * @return The corresponding encoded remote context. + * @return The corresponding encoded remote device. */ - public TVMContext context(int devType) { - return context(devType, 0); + public Device device(int devType) { + return device(devType, 0); } /** * Construct remote CPU device. * @param devId device id. - * @return Remote CPU context. + * @return Remote CPU device. */ - public TVMContext cpu(int devId) { - return context(1, devId); + public Device cpu(int devId) { + return device(1, devId); } /** * Construct remote CPU device. - * @return Remote CPU context. + * @return Remote CPU device. */ - public TVMContext cpu() { + public Device cpu() { return cpu(0); } /** * Construct remote GPU device. * @param devId device id. - * @return Remote GPU context. + * @return Remote GPU device. */ - public TVMContext gpu(int devId) { - return context(2, devId); + public Device gpu(int devId) { + return device(2, devId); } /** * Construct remote GPU device. - * @return Remote GPU context. + * @return Remote GPU device. */ - public TVMContext gpu() { + public Device gpu() { return gpu(0); } /** * Construct remote OpenCL device. * @param devId device id. - * @return Remote OpenCL context. + * @return Remote OpenCL device. */ - public TVMContext cl(int devId) { - return context(4, devId); + public Device cl(int devId) { + return device(4, devId); } /** * Construct remote OpenCL device. - * @return Remote OpenCL context. + * @return Remote OpenCL device. */ - public TVMContext cl() { + public Device cl() { return cl(0); } /** * Construct remote OpenCL device. * @param devId device id. - * @return Remote OpenCL context. + * @return Remote OpenCL device. */ - public TVMContext vulkan(int devId) { - return context(7, devId); + public Device vulkan(int devId) { + return device(7, devId); } /** * Construct remote OpenCL device. - * @return Remote OpenCL context. + * @return Remote OpenCL device. */ - public TVMContext vulkan() { + public Device vulkan() { return vulkan(0); } @@ -164,17 +164,17 @@ public TVMContext vulkan() { /** * Construct remote Metal device. * @param devId device id. - * @return Remote metal context. + * @return Remote metal device. */ - public TVMContext metal(int devId) { - return context(8, devId); + public Device metal(int devId) { + return device(8, devId); } /** * Construct remote Metal device. - * @return Remote metal context. + * @return Remote metal device. 
*/ - public TVMContext metal() { + public Device metal() { return metal(0); } diff --git a/jvm/core/src/main/java/org/apache/tvm/rpc/TVMRemoteContext.java b/jvm/core/src/main/java/org/apache/tvm/rpc/TVMRemoteDevice.java similarity index 86% rename from jvm/core/src/main/java/org/apache/tvm/rpc/TVMRemoteContext.java rename to jvm/core/src/main/java/org/apache/tvm/rpc/TVMRemoteDevice.java index fad14eceb16b..6f70fa9a29f6 100644 --- a/jvm/core/src/main/java/org/apache/tvm/rpc/TVMRemoteContext.java +++ b/jvm/core/src/main/java/org/apache/tvm/rpc/TVMRemoteDevice.java @@ -17,13 +17,13 @@ package org.apache.tvm.rpc; -import org.apache.tvm.TVMContext; +import org.apache.tvm.Device; // always related to RPCSession. Cannot construct by users. -public class TVMRemoteContext extends TVMContext { +public class TVMRemoteDevice extends Device { public final RPCSession rpcSession; - TVMRemoteContext(int deviceType, int deviceId, RPCSession rpcSession) { + TVMRemoteDevice(int deviceType, int deviceId, RPCSession rpcSession) { super(deviceType, deviceId); this.rpcSession = rpcSession; } diff --git a/jvm/core/src/test/java/org/apache/tvm/ModuleTest.java b/jvm/core/src/test/java/org/apache/tvm/ModuleTest.java index d675ee4b9202..666cbac6afee 100644 --- a/jvm/core/src/test/java/org/apache/tvm/ModuleTest.java +++ b/jvm/core/src/test/java/org/apache/tvm/ModuleTest.java @@ -40,13 +40,13 @@ public static void beforeClass() { public void test_load_add_func_cpu() { Module fadd = Module.load(loadingDir + File.separator + "add_cpu.so"); - TVMContext ctx = new TVMContext("cpu", 0); + Device dev = new Device("cpu", 0); long[] shape = new long[]{2}; - NDArray arr = NDArray.empty(shape, ctx); + NDArray arr = NDArray.empty(shape, dev); arr.copyFrom(new float[]{3f, 4f}); - NDArray res = NDArray.empty(shape, ctx); + NDArray res = NDArray.empty(shape, dev); fadd.entryFunc().pushArg(arr).pushArg(arr).pushArg(res).invoke(); assertArrayEquals(new float[]{6f, 8f}, res.asFloatArray(), 1e-3f); @@ -64,8 +64,8 @@ public void test_load_add_func_cpu() { public void test_load_add_func_gpu() { final Random RND = new Random(0); - TVMContext ctx = new TVMContext("gpu", 0); - if (!ctx.exist()) { + Device dev = new Device("gpu", 0); + if (!dev.exist()) { logger.warn("GPU does not exist. 
Skip the test."); return; } @@ -76,7 +76,7 @@ public void test_load_add_func_gpu() { final int dim = 100; long[] shape = new long[]{dim}; - NDArray arr = NDArray.empty(shape, ctx); + NDArray arr = NDArray.empty(shape, dev); float[] data = new float[dim]; float[] dataX2 = new float[dim]; @@ -86,7 +86,7 @@ public void test_load_add_func_gpu() { } arr.copyFrom(data); - NDArray res = NDArray.empty(shape, ctx); + NDArray res = NDArray.empty(shape, dev); fadd.entryFunc().pushArg(arr).pushArg(arr).pushArg(res).invoke(); assertArrayEquals(dataX2, res.asFloatArray(), 1e-3f); diff --git a/jvm/core/src/test/java/org/apache/tvm/contrib/GraphRuntimeTest.java b/jvm/core/src/test/java/org/apache/tvm/contrib/GraphExecutorTest.java similarity index 85% rename from jvm/core/src/test/java/org/apache/tvm/contrib/GraphRuntimeTest.java rename to jvm/core/src/test/java/org/apache/tvm/contrib/GraphExecutorTest.java index 82096c1aa5dc..0a5fa9a67e3a 100644 --- a/jvm/core/src/test/java/org/apache/tvm/contrib/GraphRuntimeTest.java +++ b/jvm/core/src/test/java/org/apache/tvm/contrib/GraphExecutorTest.java @@ -19,7 +19,7 @@ import org.apache.tvm.Module; import org.apache.tvm.NDArray; -import org.apache.tvm.TVMContext; +import org.apache.tvm.Device; import org.apache.tvm.TestUtils; import org.apache.tvm.rpc.Client; import org.apache.tvm.rpc.RPCSession; @@ -35,8 +35,8 @@ import static org.junit.Assert.assertArrayEquals; -public class GraphRuntimeTest { - private final Logger logger = LoggerFactory.getLogger(GraphRuntime.class); +public class GraphExecutorTest { + private final Logger logger = LoggerFactory.getLogger(GraphExecutor.class); private static String loadingDir; @BeforeClass @@ -51,14 +51,14 @@ public void test_add_one_local() throws IOException { loadingDir + File.separator + "graph_addone.json")) .useDelimiter("\\Z").next(); - TVMContext ctx = TVMContext.cpu(); - GraphModule graph = GraphRuntime.create(graphJson, libmod, ctx); + Device dev = Device.cpu(); + GraphModule graph = GraphExecutor.create(graphJson, libmod, dev); long[] shape = new long[]{4}; - NDArray arr = NDArray.empty(shape, ctx); + NDArray arr = NDArray.empty(shape, dev); arr.copyFrom(new float[]{1f, 2f, 3f, 4f}); - NDArray out = NDArray.empty(shape, ctx); + NDArray out = NDArray.empty(shape, dev); graph.setInput("x", arr).run(); graph.getOutput(0, out); @@ -87,18 +87,18 @@ public void test_add_one_remote() throws IOException { try { server = TestUtils.startServer(port); RPCSession remote = Client.connect("localhost", port.value); - TVMContext ctx = remote.cpu(); + Device dev = remote.cpu(); remote.upload(new File(libPath)); Module mlib = remote.loadModule("graph_addone_lib.so"); - GraphModule graph = GraphRuntime.create(graphJson, mlib, ctx); + GraphModule graph = GraphExecutor.create(graphJson, mlib, dev); long[] shape = new long[]{4}; - NDArray arr = NDArray.empty(shape, ctx); + NDArray arr = NDArray.empty(shape, dev); arr.copyFrom(new float[]{1f, 2f, 3f, 4f}); - NDArray out = NDArray.empty(shape, ctx); + NDArray out = NDArray.empty(shape, dev); graph.setInput("x", arr).run(); graph.getOutput(0, out); diff --git a/jvm/core/src/test/scripts/test_graph_runtime.py b/jvm/core/src/test/scripts/test_graph_executor.py similarity index 98% rename from jvm/core/src/test/scripts/test_graph_runtime.py rename to jvm/core/src/test/scripts/test_graph_executor.py index 07a19fe50c1b..676b008205ca 100644 --- a/jvm/core/src/test/scripts/test_graph_runtime.py +++ b/jvm/core/src/test/scripts/test_graph_executor.py @@ -19,7 +19,7 @@ import tvm from tvm import te 
import json -from tvm.contrib import graph_runtime +from tvm.contrib import graph_executor def dump_graph_lib(target_dir): diff --git a/jvm/native/src/main/native/jni_helper_func.h b/jvm/native/src/main/native/jni_helper_func.h index 0f202004f99d..82165e9e04b1 100644 --- a/jvm/native/src/main/native/jni_helper_func.h +++ b/jvm/native/src/main/native/jni_helper_func.h @@ -175,13 +175,13 @@ void fromJavaDType(JNIEnv* env, jobject jdtype, DLDataType* dtype) { env->DeleteLocalRef(tvmTypeClass); } -void fromJavaContext(JNIEnv* env, jobject jctx, TVMContext* ctx) { - jclass tvmContextClass = env->FindClass("org/apache/tvm/TVMContext"); - ctx->device_type = static_cast( - env->GetIntField(jctx, env->GetFieldID(tvmContextClass, "deviceType", "I"))); - ctx->device_id = - static_cast(env->GetIntField(jctx, env->GetFieldID(tvmContextClass, "deviceId", "I"))); - env->DeleteLocalRef(tvmContextClass); +void fromJavaDevice(JNIEnv* env, jobject jdev, DLDevice* dev) { + jclass deviceClass = env->FindClass("org/apache/tvm/Device"); + dev->device_type = static_cast( + env->GetIntField(jdev, env->GetFieldID(deviceClass, "deviceType", "I"))); + dev->device_id = + static_cast(env->GetIntField(jdev, env->GetFieldID(deviceClass, "deviceId", "I"))); + env->DeleteLocalRef(deviceClass); } jobject tvmRetValueToJava(JNIEnv* env, TVMValue value, int tcode) { diff --git a/jvm/native/src/main/native/org_apache_tvm_native_c_api.cc b/jvm/native/src/main/native/org_apache_tvm_native_c_api.cc index e3ea4b9c3766..ada714c2ec48 100644 --- a/jvm/native/src/main/native/org_apache_tvm_native_c_api.cc +++ b/jvm/native/src/main/native/org_apache_tvm_native_c_api.cc @@ -453,7 +453,7 @@ JNIEXPORT jint JNICALL Java_org_apache_tvm_LibInfo_tvmArrayCopyToJArray(JNIEnv* return ret; } -// Context +// Device JNIEXPORT jint JNICALL Java_org_apache_tvm_LibInfo_tvmSynchronize(JNIEnv* env, jint deviceType, jint deviceId) { return TVMSynchronize(static_cast(deviceType), static_cast(deviceId), NULL); diff --git a/licenses/LICENSE.libbacktrace.txt b/licenses/LICENSE.libbacktrace.txt new file mode 100644 index 000000000000..097d2774e5df --- /dev/null +++ b/licenses/LICENSE.libbacktrace.txt @@ -0,0 +1,29 @@ +# Copyright (C) 2012-2016 Free Software Foundation, Inc. + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: + +# (1) Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. + +# (2) Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. + +# (3) The name of the author may not be used to +# endorse or promote products derived from this software without +# specific prior written permission. + +# THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR +# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, +# INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, +# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING +# IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. diff --git a/python/setup.py b/python/setup.py index e02369e97777..b47e5b14f6a7 100644 --- a/python/setup.py +++ b/python/setup.py @@ -94,7 +94,7 @@ def config_cython(): subdir = "_cy2" ret = [] path = "tvm/_ffi/_cython" - extra_compile_args = ["-std=c++14"] + extra_compile_args = ["-std=c++14", "-DDMLC_USE_LOGGING_LIBRARY="] if os.name == "nt": library_dirs = ["tvm", "../build/Release", "../build"] libraries = ["tvm"] diff --git a/python/tvm/__init__.py b/python/tvm/__init__.py index 7a5f553ccdd5..4643062ea8e8 100644 --- a/python/tvm/__init__.py +++ b/python/tvm/__init__.py @@ -30,7 +30,7 @@ # top-level alias # tvm.runtime from .runtime.object import Object -from .runtime.ndarray import context, cpu, gpu, opencl, cl, vulkan, metal, mtl +from .runtime.ndarray import device, cpu, gpu, opencl, cl, vulkan, metal, mtl from .runtime.ndarray import vpi, rocm, ext_dev, micro_dev, hexagon from .runtime import ndarray as nd diff --git a/python/tvm/_ffi/_ctypes/packed_func.py b/python/tvm/_ffi/_ctypes/packed_func.py index fd82b263e2dd..6cfa3e5c286a 100644 --- a/python/tvm/_ffi/_ctypes/packed_func.py +++ b/python/tvm/_ffi/_ctypes/packed_func.py @@ -23,12 +23,12 @@ from ..base import _LIB, get_last_ffi_error, py2cerror, check_call from ..base import c_str, string_types -from ..runtime_ctypes import DataType, TVMByteArray, TVMContext, ObjectRValueRef +from ..runtime_ctypes import DataType, TVMByteArray, Device, ObjectRValueRef from . import ndarray as _nd from .ndarray import NDArrayBase, _make_array from .types import TVMValue, ArgTypeCode from .types import TVMPackedCFunc, TVMCFuncFinalizer -from .types import RETURN_SWITCH, C_TO_PY_ARG_SWITCH, _wrap_arg_func, _ctx_to_int64 +from .types import RETURN_SWITCH, C_TO_PY_ARG_SWITCH, _wrap_arg_func, _device_to_int64 from .object import ObjectBase, PyNativeObject, _set_class_object from . import object as _object @@ -141,9 +141,9 @@ def _make_tvm_args(args, temp_args): elif isinstance(arg, DataType): values[i].v_str = c_str(str(arg)) type_codes[i] = ArgTypeCode.STR - elif isinstance(arg, TVMContext): - values[i].v_int64 = _ctx_to_int64(arg) - type_codes[i] = ArgTypeCode.TVM_CONTEXT + elif isinstance(arg, Device): + values[i].v_int64 = _device_to_int64(arg) + type_codes[i] = ArgTypeCode.DLDEVICE elif isinstance(arg, (bytearray, bytes)): # from_buffer only taeks in bytearray. 
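            # (so a hypothetical f(b"bytes") argument is first copied into a bytearray below)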
if isinstance(arg, bytes): diff --git a/python/tvm/_ffi/_ctypes/types.py b/python/tvm/_ffi/_ctypes/types.py index 4b6d66957b25..38d3cd72b55d 100644 --- a/python/tvm/_ffi/_ctypes/types.py +++ b/python/tvm/_ffi/_ctypes/types.py @@ -19,7 +19,7 @@ import ctypes import struct from ..base import py_str, check_call, _LIB -from ..runtime_ctypes import TVMByteArray, ArgTypeCode, TVMContext +from ..runtime_ctypes import TVMByteArray, ArgTypeCode, Device class TVMValue(ctypes.Union): @@ -68,13 +68,13 @@ def _return_bytes(x): return res -def _return_context(value): - """return TVMContext""" +def _return_device(value): + """return Device""" # use bit unpacking from int64 view # We use this to get around ctypes issue on Union of Structure data = struct.pack("=q", value.v_int64) arr = struct.unpack("=ii", data) - return TVMContext(arr[0], arr[1]) + return Device(arr[0], arr[1]) def _wrap_arg_func(return_f, type_code): @@ -86,9 +86,9 @@ def _wrap_func(x): return _wrap_func -def _ctx_to_int64(ctx): +def _device_to_int64(dev): """Pack context into int64 in native endian""" - data = struct.pack("=ii", ctx.device_type, ctx.device_id) + data = struct.pack("=ii", dev.device_type, dev.device_id) return struct.unpack("=q", data)[0] @@ -99,7 +99,7 @@ def _ctx_to_int64(ctx): ArgTypeCode.NULL: lambda x: None, ArgTypeCode.STR: lambda x: py_str(x.v_str), ArgTypeCode.BYTES: _return_bytes, - ArgTypeCode.TVM_CONTEXT: _return_context, + ArgTypeCode.DLDEVICE: _return_device, } C_TO_PY_ARG_SWITCH = { @@ -109,5 +109,5 @@ def _ctx_to_int64(ctx): ArgTypeCode.NULL: lambda x: None, ArgTypeCode.STR: lambda x: py_str(x.v_str), ArgTypeCode.BYTES: _return_bytes, - ArgTypeCode.TVM_CONTEXT: _return_context, + ArgTypeCode.DLDEVICE: _return_device, } diff --git a/python/tvm/_ffi/_cython/base.pxi b/python/tvm/_ffi/_cython/base.pxi index 8c9e413813b9..bf4d6b933a4f 100644 --- a/python/tvm/_ffi/_cython/base.pxi +++ b/python/tvm/_ffi/_cython/base.pxi @@ -29,7 +29,7 @@ cdef enum TVMArgTypeCode: kTVMOpaqueHandle = 3 kTVMNullptr = 4 kTVMDataType = 5 - kTVMContext = 6 + kDLDevice = 6 kTVMDLTensorHandle = 7 kTVMObjectHandle = 8 kTVMModuleHandle = 9 @@ -46,13 +46,13 @@ cdef extern from "tvm/runtime/c_runtime_api.h": uint8_t bits uint16_t lanes - ctypedef struct DLContext: + ctypedef struct DLDevice: int device_type int device_id ctypedef struct DLTensor: void* data - DLContext ctx + DLDevice device int ndim DLDataType dtype int64_t* shape @@ -70,7 +70,7 @@ cdef extern from "tvm/runtime/c_runtime_api.h": void* v_handle const char* v_str DLDataType v_type - DLContext v_ctx + DLDevice v_device ctypedef int64_t tvm_index_t ctypedef DLTensor* DLTensorHandle @@ -118,7 +118,7 @@ cdef extern from "tvm/runtime/c_runtime_api.h": int TVMArrayAlloc(tvm_index_t* shape, tvm_index_t ndim, DLDataType dtype, - DLContext ctx, + DLDevice dev, DLTensorHandle* out) int TVMArrayFree(DLTensorHandle handle) int TVMArrayCopyFromTo(DLTensorHandle src, diff --git a/python/tvm/_ffi/_cython/packed_func.pxi b/python/tvm/_ffi/_cython/packed_func.pxi index 00585659ab76..30b879de80ed 100644 --- a/python/tvm/_ffi/_cython/packed_func.pxi +++ b/python/tvm/_ffi/_cython/packed_func.pxi @@ -20,7 +20,7 @@ import traceback from cpython cimport Py_INCREF, Py_DECREF from numbers import Number, Integral from ..base import string_types, py2cerror -from ..runtime_ctypes import DataType, TVMContext, TVMByteArray, ObjectRValueRef +from ..runtime_ctypes import DataType, Device, TVMByteArray, ObjectRValueRef cdef void tvm_callback_finalize(void* fhandle) with gil: @@ -139,10 +139,10 @@ cdef 
inline int make_arg(object arg, value[0].v_str = tstr tcode[0] = kTVMStr temp_args.append(tstr) - elif isinstance(arg, TVMContext): - value[0].v_ctx = (( + elif isinstance(arg, Device): + value[0].v_device = (( ctypes.addressof(arg)))[0] - tcode[0] = kTVMContext + tcode[0] = kDLDevice elif isinstance(arg, (bytes, bytearray)): # from_buffer only taeks in bytearray. if isinstance(arg, bytes): @@ -220,8 +220,8 @@ cdef inline object make_ret(TVMValue value, int tcode): return make_ret_bytes(value.v_handle) elif tcode == kTVMOpaqueHandle: return ctypes_handle(value.v_handle) - elif tcode == kTVMContext: - return TVMContext(value.v_ctx.device_type, value.v_ctx.device_id) + elif tcode == kDLDevice: + return Device(value.v_device.device_type, value.v_device.device_id) elif tcode == kTVMModuleHandle: return _CLASS_MODULE(ctypes_handle(value.v_handle)) elif tcode == kTVMPackedFuncHandle: diff --git a/python/tvm/_ffi/base.py b/python/tvm/_ffi/base.py index 397090618ade..0496195fd73f 100644 --- a/python/tvm/_ffi/base.py +++ b/python/tvm/_ffi/base.py @@ -253,7 +253,9 @@ def c2pyerror(err_msg): message = [] for line in arr: if trace_mode: - if line.startswith(" "): + if line.startswith(" "): + stack_trace[-1] += "\n" + line + elif line.startswith(" "): stack_trace.append(line) else: trace_mode = False diff --git a/python/tvm/_ffi/runtime_ctypes.py b/python/tvm/_ffi/runtime_ctypes.py index 3a874ebb1208..59dc652aeb0b 100644 --- a/python/tvm/_ffi/runtime_ctypes.py +++ b/python/tvm/_ffi/runtime_ctypes.py @@ -33,7 +33,7 @@ class ArgTypeCode(object): HANDLE = 3 NULL = 4 TVM_TYPE = 5 - TVM_CONTEXT = 6 + DLDEVICE = 6 DLTENSOR_HANDLE = 7 OBJECT_HANDLE = 8 MODULE_HANDLE = 9 @@ -149,8 +149,8 @@ def __ne__(self, other): RPC_SESS_MASK = 128 -class TVMContext(ctypes.Structure): - """TVM context strucure.""" +class Device(ctypes.Structure): + """TVM device strucure.""" _fields_ = [("device_type", ctypes.c_int), ("device_id", ctypes.c_int)] MASK2STR = { @@ -192,7 +192,7 @@ class TVMContext(ctypes.Structure): } def __init__(self, device_type, device_id): - super(TVMContext, self).__init__() + super(Device, self).__init__() self.device_type = int(device_type) self.device_id = device_id @@ -268,7 +268,7 @@ def sync(self): def __eq__(self, other): return ( - isinstance(other, TVMContext) + isinstance(other, Device) and self.device_id == other.device_id and self.device_type == other.device_type ) @@ -283,8 +283,8 @@ def __repr__(self): if self.device_type >= RPC_SESS_MASK: tbl_id = self.device_type / RPC_SESS_MASK - 1 dev_type = self.device_type % RPC_SESS_MASK - return "remote[%d]:%s(%d)" % (tbl_id, TVMContext.MASK2STR[dev_type], self.device_id) - return "%s(%d)" % (TVMContext.MASK2STR[self.device_type], self.device_id) + return "remote[%d]:%s(%d)" % (tbl_id, Device.MASK2STR[dev_type], self.device_id) + return "%s(%d)" % (Device.MASK2STR[self.device_type], self.device_id) class TVMArray(ctypes.Structure): @@ -292,7 +292,7 @@ class TVMArray(ctypes.Structure): _fields_ = [ ("data", ctypes.c_void_p), - ("ctx", TVMContext), + ("device", Device), ("ndim", ctypes.c_int), ("dtype", DataType), ("shape", ctypes.POINTER(tvm_shape_index_t)), diff --git a/python/tvm/arith/__init__.py b/python/tvm/arith/__init__.py index 77ec869a171e..05843ede9284 100644 --- a/python/tvm/arith/__init__.py +++ b/python/tvm/arith/__init__.py @@ -22,4 +22,4 @@ from .pattern import detect_linear_equation, detect_clip_bound from .int_solver import solve_linear_equations, solve_linear_inequalities from .iter_affine_map import IterMapExpr, IterMark, 
IterSplitExpr, IterSumExpr -from .iter_affine_map import detect_iter_map +from .iter_affine_map import detect_iter_map, normalize_iter_map_to_expr diff --git a/python/tvm/arith/iter_affine_map.py b/python/tvm/arith/iter_affine_map.py index 123d9b85480a..5aa817bd7a24 100644 --- a/python/tvm/arith/iter_affine_map.py +++ b/python/tvm/arith/iter_affine_map.py @@ -88,21 +88,43 @@ def __init__(self, args, base): self.__init_handle_by_constructor__(_ffi_api.IterSumExpr, args, base) -def detect_iter_map(indices, input_iters): - """Detect if indices can be written mapped iters from input_iters. +def detect_iter_map(indices, input_iters, predicate=True, require_bijective=False): + """Detect if indices can be written as mapped iters from input iters Parameters ---------- indices : List[PrimExpr] - The input indices. + The input indices input_iters : Map[Var, Range] The domain of each input iterators. + predicate : PrimExpr + The predicate constraints on the input iterators + + require_bijective : bool + A boolean flag that indicates whether the mapping should be bijective + Returns ------- results : List[IterSumExpr] The iter map matching result. Empty array if no match can be found. """ - return _ffi_api.DetectIterMap(indices, input_iters) + return _ffi_api.DetectIterMap(indices, input_iters, predicate, require_bijective) + + +def normalize_iter_map_to_expr(expr): + """Given an IterMapExpr, transform it to normal PrimExpr + + Parameters + ---------- + expr : IterMapExpr + the input IterMapExpr + + Returns + ------- + result : PrimExpr + the corresponding normal PrimExpr + """ + return _ffi_api.NormalizeIterMapToExpr(expr) diff --git a/python/tvm/auto_scheduler/dispatcher.py b/python/tvm/auto_scheduler/dispatcher.py index 6a25960fe7b7..c843dcfccdf0 100644 --- a/python/tvm/auto_scheduler/dispatcher.py +++ b/python/tvm/auto_scheduler/dispatcher.py @@ -50,7 +50,7 @@ class DispatchContext(object): def __init__(self): self._old_ctx = DispatchContext.current - def query(self, target, workload_key, has_complex_op, dag): + def query(self, target, workload_key, has_complex_op, dag, func_name): """ Query the context to get the specific config for a workload. If cannot find the result inside this context, this function will query it @@ -66,15 +66,17 @@ def query(self, target, workload_key, has_complex_op, dag): Whether this workload has at least one complex op. dag: ComputeDAG The ComputeDAG of the workload. + func_name: str + The function name of this workload. Returns ------- state : StateObject The state that stores schedule configuration for the workload """ - ret = self._query_inside(target, workload_key) + ret = self._query_inside(target, workload_key, func_name) if ret is None: - ret = self._old_ctx.query(target, workload_key, has_complex_op, dag) + ret = self._old_ctx.query(target, workload_key, has_complex_op, dag, func_name) return ret def update(self, target, workload_key, state): @@ -92,7 +94,7 @@ def update(self, target, workload_key, state): """ raise NotImplementedError() - def _query_inside(self, target, workload_key): + def _query_inside(self, target, workload_key, func_name): """ Query the context to get the specific config for a workload. This function only query config inside this context. @@ -103,6 +105,8 @@ def _query_inside(self, target, workload_key): The current target workload_key : str The current workload_key. + func_name: str + The function name of this workload. 
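+            (shown in the fallback warning when no tuned schedule is found)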
Returns ------- @@ -241,7 +245,7 @@ def load(self, records, n_lines=None): logger.debug("Finish loading %d records", counter) - def _query_inside(self, target, workload_key): + def _query_inside(self, target, workload_key, func_name): if target is None: raise RuntimeError( "Need a target context to find the history best. " @@ -343,18 +347,20 @@ def __init__( records, n_lines=None, include_compatible=True ) - def query(self, target, workload_key, has_complex_op, dag): + def query(self, target, workload_key, has_complex_op, dag, func_name): if has_complex_op or self.sample_simple_workloads: - ret = self._query_inside(target, workload_key) + ret = self._query_inside(target, workload_key, func_name) else: - ret = super(ApplyHistoryBestOrSample, self)._query_inside(target, workload_key) + ret = super(ApplyHistoryBestOrSample, self)._query_inside( + target, workload_key, func_name + ) if ret is None: - ret = self._old_ctx.query(target, workload_key, has_complex_op, dag) + ret = self._old_ctx.query(target, workload_key, has_complex_op, dag, func_name) return ret - def _query_inside(self, target, workload_key): - ret = super(ApplyHistoryBestOrSample, self)._query_inside(target, workload_key) + def _query_inside(self, target, workload_key, func_name): + ret = super(ApplyHistoryBestOrSample, self)._query_inside(target, workload_key, func_name) if ret is not None: return ret @@ -386,7 +392,9 @@ def _query_inside(self, target, workload_key): # Load the sampled records and query again. self.load(log_file) - ret = super(ApplyHistoryBestOrSample, self)._query_inside(target, workload_key) + ret = super(ApplyHistoryBestOrSample, self)._query_inside( + target, workload_key, func_name + ) del measure_ctx return ret @@ -411,18 +419,19 @@ def __init__(self): # a set to prevent print duplicated message self.messages = set() - def query(self, target, workload_key, has_complex_op, dag): + def query(self, target, workload_key, has_complex_op, dag, func_name): key = (str(target), workload_key) if key in self.memory: return self.memory[key] if self.verbose == 2 or (has_complex_op and self.verbose == 1): msg = ( - "-----------------------------------\n" - "Cannot find tuned schedules for target=%s, workload_key=%s. " - "A fallback TOPI schedule is used, " - "which may bring great performance regression or even compilation failure. " - "Compute DAG info:\n%s" % (target, workload_key, dag) + f"-----------------------------------\n" + f"{func_name}\n" + f"Cannot find tuned schedules for target={target}, workload_key={workload_key}. " + f"A fallback TOPI schedule is used, " + f"which may bring great performance regression or even compilation failure. 
" + f"Compute DAG info:\n{dag}" ) if msg not in self.messages: self.messages.add(msg) @@ -434,8 +443,8 @@ def query(self, target, workload_key, has_complex_op, dag): self.memory[key] = state return state - def _query_inside(self, target, workload_key): - _ = target = workload_key + def _query_inside(self, target, workload_key, func_name): + _ = target = workload_key = func_name raise RuntimeError("This function should never be called") def update(self, target, workload_key, state): diff --git a/python/tvm/auto_scheduler/measure.py b/python/tvm/auto_scheduler/measure.py index 322143b28594..3031fce146ad 100644 --- a/python/tvm/auto_scheduler/measure.py +++ b/python/tvm/auto_scheduler/measure.py @@ -548,9 +548,9 @@ def __init__( from tvm.rpc.tracker import Tracker from tvm.rpc.server import Server - ctx = tvm.context("cuda", 0) - if ctx.exist: - cuda_arch = "sm_" + "".join(ctx.compute_version.split(".")) + dev = tvm.device("cuda", 0) + if dev.exist: + cuda_arch = "sm_" + "".join(dev.compute_version.split(".")) set_cuda_target_arch(cuda_arch) host = "0.0.0.0" self.tracker = Tracker(host, port=9000, port_end=10000, silent=True) @@ -840,7 +840,7 @@ def _timed_eval_func( error_msg = None try: func = module.load_module(build_res.filename) - ctx = ndarray.context(str(inp.task.target), 0) + dev = ndarray.device(str(inp.task.target), 0) # Limitation: # We can not get PackFunction directly in the remote mode as it is wrapped # under the std::function. We could lift the restriction later once we fold @@ -849,7 +849,7 @@ def _timed_eval_func( f_prepare = "cache_flush_cpu_non_first_arg" if enable_cpu_cache_flush else "" time_f = func.time_evaluator( func.entry_name, - ctx, + dev, number=number, repeat=repeat, min_repeat_ms=min_repeat_ms, @@ -873,7 +873,11 @@ def _timed_eval_func( if arg in tensor_input_map: tensor_name = tensor_input_map[arg] if tensor_name in task_input_names: - args.append(get_task_input_buffer(inp.task.workload_key, tensor_name)) + args.append( + ndarray.array( + get_task_input_buffer(inp.task.workload_key, tensor_name), dev + ) + ) task_inputs_count += 1 else: raise ValueError( @@ -881,14 +885,14 @@ def _timed_eval_func( + "should provide with `SearchTask(..., task_inputs={...})`" ) else: - empty_array = ndarray.empty(get_const_tuple(arg.shape), arg.dtype, ctx) + empty_array = ndarray.empty(get_const_tuple(arg.shape), arg.dtype, dev) random_fill(empty_array) args.append(empty_array) if task_inputs_count != len(task_input_names): logger.warning( "task_inputs not fully matched, check if there's any unexpected error" ) - ctx.sync() + dev.sync() costs = time_f(*args).results # pylint: disable=broad-except except Exception: @@ -1049,7 +1053,7 @@ def _timed_rpc_run( remote = request_remote(key, host, port, priority, timeout) remote.upload(build_res.filename) func = remote.load_module(os.path.split(build_res.filename)[1]) - ctx = remote.context(str(inp.task.target), 0) + dev = remote.device(str(inp.task.target), 0) # Limitation: # We can not get PackFunction directly in the remote mode as it is wrapped # under the std::function. 
We could lift the restriction later once we fold @@ -1058,7 +1062,7 @@ def _timed_rpc_run( f_prepare = "cache_flush_cpu_non_first_arg" if enable_cpu_cache_flush else "" time_f = func.time_evaluator( func.entry_name, - ctx, + dev, number=number, repeat=repeat, min_repeat_ms=min_repeat_ms, @@ -1084,7 +1088,11 @@ def _timed_rpc_run( if arg in tensor_input_map: tensor_name = tensor_input_map[arg] if tensor_name in task_input_names: - args.append(get_task_input_buffer(inp.task.workload_key, tensor_name)) + args.append( + ndarray.array( + get_task_input_buffer(inp.task.workload_key, tensor_name), dev + ) + ) task_inputs_count += 1 else: raise ValueError( @@ -1092,14 +1100,14 @@ def _timed_rpc_run( + "should provide with `SearchTask(..., task_inputs={...})`" ) else: - empty_array = ndarray.empty(get_const_tuple(arg.shape), arg.dtype, ctx) + empty_array = ndarray.empty(get_const_tuple(arg.shape), arg.dtype, dev) random_fill(empty_array) args.append(empty_array) if task_inputs_count != len(task_input_names): logger.warning( "task_inputs not fully matched, check if there's any unexpected error" ) - ctx.sync() + dev.sync() costs = time_f(*args).results # clean up remote files diff --git a/python/tvm/auto_scheduler/relay_integration.py b/python/tvm/auto_scheduler/relay_integration.py index 6c5957e89de0..5bd910802926 100644 --- a/python/tvm/auto_scheduler/relay_integration.py +++ b/python/tvm/auto_scheduler/relay_integration.py @@ -49,7 +49,7 @@ def call_all_topi_funcs(mod, params, target): """Call all TOPI compute to extract auto_scheduler tasks in a Relay program""" # pylint: disable=import-outside-toplevel from tvm import relay - from tvm.relay.backend import graph_runtime_codegen + from tvm.relay.backend import graph_executor_codegen # Turn off AutoTVM config not found warnings old_autotvm_silent = autotvm.GLOBAL_SCOPE.silent @@ -65,11 +65,11 @@ def call_all_topi_funcs(mod, params, target): ): try: opt_mod, _ = relay.optimize(mod, target, params) - grc = graph_runtime_codegen.GraphRuntimeCodegen(None, target) + grc = graph_executor_codegen.GraphExecutorCodegen(None, target) grc.codegen(opt_mod["main"]) except tvm.TVMError: print( - "Get errors with GraphRuntimeCodegen for task extraction. " + "Get errors with GraphExecutorCodegen for task extraction. " "Fallback to VMCompiler." ) compiler = relay.vm.VMCompiler() @@ -116,12 +116,17 @@ def extract_tasks( env = TracingEnvironment( TracingMode.EXTRACT_TASK if include_simple_tasks else TracingMode.EXTRACT_COMPLEX_TASK_ONLY ) + + dispatch_ctx = DispatchContext.current + old_verbose = dispatch_ctx.verbose + dispatch_ctx.verbose = 0 with env: # Wrap build call in a new thread to avoid the conflict # between python's multiprocessing and tvm's thread pool build_thread = threading.Thread(target=call_all_topi_funcs, args=(mod, params, target)) build_thread.start() build_thread.join() + dispatch_ctx.verbose = old_verbose # create search tasks tasks = [] @@ -249,7 +254,7 @@ def traverse(t): @tvm._ffi.register_func("auto_scheduler.relay_integration.auto_schedule_topi_compute") -def auto_schedule_topi(outs): +def auto_schedule_topi(func_name, outs): """Use auto-scheduler to schedule any topi compute function. Note: This is used internally for relay integration. Do @@ -257,6 +262,9 @@ def auto_schedule_topi(outs): Parameters ---------- + func_name: str + The name of the function being scheduled. 
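+        (e.g. a fused Relay function name such as the hypothetical "fused_nn_conv2d_add")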
+ outs: List[Tensor] The output tensors of topi compute functions @@ -282,7 +290,7 @@ def auto_schedule_topi(outs): target = tvm.target.Target.current() dispatch_ctx = DispatchContext.current - state = dispatch_ctx.query(target, key, has_complex_op, dag) + state = dispatch_ctx.query(target, key, has_complex_op, dag, func_name) schedule = None env = TracingEnvironment.current diff --git a/python/tvm/autotvm/graph_tuner/utils/traverse_graph.py b/python/tvm/autotvm/graph_tuner/utils/traverse_graph.py index a8e580998367..f61d34284e01 100644 --- a/python/tvm/autotvm/graph_tuner/utils/traverse_graph.py +++ b/python/tvm/autotvm/graph_tuner/utils/traverse_graph.py @@ -211,7 +211,8 @@ def get_direct_ancestor(node_list, visited_dict, target_ops, node_idx, input_nam else: tmp = get_direct_ancestor(node_list, visited_dict, target_ops, item_idx[0], input_names) for tmp_item in tmp: - node_direct_ancestor.append(tmp_item) + if tmp_item not in node_direct_ancestor: + node_direct_ancestor.append(tmp_item) visited_dict[node_idx] = node_direct_ancestor return node_direct_ancestor diff --git a/python/tvm/autotvm/measure/measure_methods.py b/python/tvm/autotvm/measure/measure_methods.py index 3042d93f01fd..aaa08ce91af9 100644 --- a/python/tvm/autotvm/measure/measure_methods.py +++ b/python/tvm/autotvm/measure/measure_methods.py @@ -262,18 +262,18 @@ def get_build_kwargs(self): or "vulkan" in self.task.target.keys ): remote = request_remote(self.key, self.host, self.port) - ctx = remote.context(str(self.task.target), 0) - max_dims = ctx.max_thread_dimensions + dev = remote.device(str(self.task.target), 0) + max_dims = dev.max_thread_dimensions kwargs["check_gpu"] = { - "max_shared_memory_per_block": ctx.max_shared_memory_per_block, - "max_threads_per_block": ctx.max_threads_per_block, + "max_shared_memory_per_block": dev.max_shared_memory_per_block, + "max_threads_per_block": dev.max_threads_per_block, "max_thread_x": max_dims[0], "max_thread_y": max_dims[1], "max_thread_z": max_dims[2], } if "cuda" in self.task.target.keys: - kwargs["cuda_arch"] = "sm_" + "".join(ctx.compute_version.split(".")) + kwargs["cuda_arch"] = "sm_" + "".join(dev.compute_version.split(".")) if self.task.target.device_name == "micro_dev": kwargs.setdefault("build_option", {})["tir.disable_vectorize"] = True @@ -558,7 +558,7 @@ def run_through_rpc( try: # upload built module with module_loader(remote_kwargs, build_result) as (remote, mod): - ctx = remote.context(str(measure_input.target), 0) + dev = remote.device(str(measure_input.target), 0) # Limitation: # We can not get PackFunction directly in the remote mode as it is wrapped @@ -568,7 +568,7 @@ def run_through_rpc( f_prepare = "cache_flush_cpu_non_first_arg" if enable_cpu_cache_flush else "" time_f = mod.time_evaluator( mod.entry_name, - ctx, + dev, number=number, repeat=repeat, min_repeat_ms=min_repeat_ms, @@ -581,12 +581,12 @@ def run_through_rpc( raise AttributeError( "Please make sure USE_RANDOM is ON in the config.cmake " "on the remote devices" ) - args = [nd.array(np.zeros(x[0], dtype=x[1]), ctx=ctx) for x in build_result.arg_info] + args = [nd.array(np.zeros(x[0], dtype=x[1]), device=dev) for x in build_result.arg_info] if "scatter" not in measure_input.task.name: # the index tensor of scatter op cannot be randomly initialized for arg in args: random_fill(arg) - ctx.sync() + dev.sync() costs = time_f(*args).results @@ -701,8 +701,8 @@ def check_remote(target, device_key, host=None, port=None, priority=100, timeout def _check(): remote = request_remote(device_key, host, 
port, priority) - ctx = remote.context(str(target)) - while not ctx.exist: # wait until we get an available device + dev = remote.device(str(target)) + while not dev.exist: # wait until we get an available device pass t = threading.Thread( diff --git a/python/tvm/autotvm/task/relay_integration.py b/python/tvm/autotvm/task/relay_integration.py index 15145b635817..0e7a08c9f808 100644 --- a/python/tvm/autotvm/task/relay_integration.py +++ b/python/tvm/autotvm/task/relay_integration.py @@ -37,14 +37,14 @@ def _lower(mod, target, params): """Helper to lower VTA properly.""" # pylint: disable=import-outside-toplevel from tvm import relay - from tvm.relay.backend import graph_runtime_codegen + from tvm.relay.backend import graph_executor_codegen if hasattr(target, "device_name") and target.device_name == "vta": import vta with vta.build_config(opt_level=3, disabled_pass={"AlterOpLayout"}): mod, _ = relay.optimize(mod, target, params) - grc = graph_runtime_codegen.GraphRuntimeCodegen(None, target) + grc = graph_executor_codegen.GraphExecutorCodegen(None, target) grc.codegen(mod["main"]) return @@ -54,11 +54,11 @@ def _lower(mod, target, params): # TODO: Currently VM compiler is likely to stack overflow for large models. try: opt_mod, _ = relay.optimize(mod, target, params) - grc = graph_runtime_codegen.GraphRuntimeCodegen(None, target) + grc = graph_executor_codegen.GraphExecutorCodegen(None, target) grc.codegen(opt_mod["main"]) except tvm.TVMError as e: print( - "Get errors with GraphRuntimeCodegen for task extraction. " + "Get errors with GraphExecutorCodegen for task extraction. " "Fallback to VMCompiler. Error details:\n%s" % str(e) ) compiler = relay.vm.VMCompiler() diff --git a/python/tvm/contrib/cc.py b/python/tvm/contrib/cc.py index 59a1d11216ee..f48ae395fbcd 100644 --- a/python/tvm/contrib/cc.py +++ b/python/tvm/contrib/cc.py @@ -192,12 +192,16 @@ def _fcompile(outputs, objects, options=None): def _linux_compile(output, objects, options, compile_cmd="g++", compile_shared=False): cmd = [compile_cmd] - if compile_shared or output.endswith(".so") or output.endswith(".dylib"): - cmd += ["-shared", "-fPIC"] - if sys.platform == "darwin": - cmd += ["-undefined", "dynamic_lookup"] - elif output.endswith(".obj"): - cmd += ["-c"] + if compile_cmd != "nvcc": + if compile_shared or output.endswith(".so") or output.endswith(".dylib"): + cmd += ["-shared", "-fPIC"] + if sys.platform == "darwin": + cmd += ["-undefined", "dynamic_lookup"] + elif output.endswith(".obj"): + cmd += ["-c"] + else: + if compile_shared or output.endswith(".so") or output.endswith(".dylib"): + cmd += ["--shared"] cmd += ["-o", output] if isinstance(objects, str): cmd += [objects] diff --git a/python/tvm/contrib/coreml_runtime.py b/python/tvm/contrib/coreml_runtime.py index 4ef3593cf44b..b2555572ed42 100644 --- a/python/tvm/contrib/coreml_runtime.py +++ b/python/tvm/contrib/coreml_runtime.py @@ -19,7 +19,7 @@ from ..rpc import base as rpc_base -def create(symbol, compiled_model_path, ctx): +def create(symbol, compiled_model_path, device): """Create a runtime executor module given a coreml model and context. Parameters ---------- @@ -27,19 +27,19 @@ def create(symbol, compiled_model_path, ctx): The symbol that represents the Core ML model. compiled_model_path : str The path of the compiled model to be deployed. - ctx : TVMContext - The context to deploy the module. It can be local or remote when there - is only one TVMContext. + device : Device + The device to deploy the module. 
It can be local or remote when there + is only one Device. Returns ------- coreml_runtime : CoreMLModule Runtime coreml module that can be used to execute the coreml model. """ - device_type = ctx.device_type + device_type = device.device_type runtime_func = "tvm.coreml_runtime.create" if device_type >= rpc_base.RPC_SESS_MASK: - fcreate = ctx._rpc_sess.get_function(runtime_func) + fcreate = device._rpc_sess.get_function(runtime_func) else: fcreate = tvm._ffi.get_global_func(runtime_func) diff --git a/tests/micro/qemu/zephyr-runtime/sample.yaml b/python/tvm/contrib/cuda_graph/__init__.py similarity index 88% rename from tests/micro/qemu/zephyr-runtime/sample.yaml rename to python/tvm/contrib/cuda_graph/__init__.py index 88616b4acc40..13a83393a912 100644 --- a/tests/micro/qemu/zephyr-runtime/sample.yaml +++ b/python/tvm/contrib/cuda_graph/__init__.py @@ -14,9 +14,3 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. - -sample: - description: uTVM RPC Server unit test - name: utvm rpc server -common: - tags: introduction diff --git a/python/tvm/contrib/cuda_graph/cuda_graph_executor.py b/python/tvm/contrib/cuda_graph/cuda_graph_executor.py new file mode 100644 index 000000000000..d047316eb564 --- /dev/null +++ b/python/tvm/contrib/cuda_graph/cuda_graph_executor.py @@ -0,0 +1,134 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Graph executor with CUDA Graph""" +import tvm._ffi + +from tvm._ffi.base import string_types +from tvm.contrib import graph_executor + + +def create(graph_json_str, libmod, device): + """Create a runtime executor module given a graph and module. + + Parameters + ---------- + graph_json_str : str + The graph to be deployed in json format output by json graph. + The graph can contain operator(tvm_op) that points to the name + of PackedFunc in the libmod. + + libmod : tvm.runtime.Module + The module of the corresponding function + + device : Device + The device to deploy the module, only supports CUDA GPU + + Returns + ------- + graph_module : GraphModuleCudaGraph + CUDA graph executor module that can be used to execute the graph. + + Note + ---- + See also :py:class:`tvm.contrib.cuda_graph.cuda_graph_executor.GraphModuleCudaGraph` + for examples to directly construct a GraphModuleCudaGraph from an exported + relay compiled library. 
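+
+    Examples
+    --------
+    A minimal usage sketch, assuming TVM was built with
+    USE_GRAPH_EXECUTOR_CUGRAPH and that ``lib`` is a relay-compiled library
+    for a CUDA target with ``graph_json_str``/``data`` available
+    (illustrative names, not defined in this module):
+
+    .. code-block:: python
+
+        import tvm
+        from tvm.contrib.cuda_graph import cuda_graph_executor
+
+        dev = tvm.gpu(0)
+        gmod = cuda_graph_executor.create(graph_json_str, lib, dev)
+        gmod.set_input("x", data)
+        gmod.run()  # the first call captures and instantiates the CUDA graph
+        out = gmod.get_output(0)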
+ """ + assert isinstance(graph_json_str, string_types) + try: + dev, num_rpc_dev, device_type_id = graph_executor.get_device(libmod, device) + if num_rpc_dev == len(dev): + fcreate = dev[0]._rpc_sess.get_function("tvm.graph_executor_cuda_graph.create") + else: + fcreate = tvm._ffi.get_global_func("tvm.graph_executor_cuda_graph.create") + except ValueError: + raise ValueError( + "To enable CUDA graph support (experimental), please set " + "'(USE_GRAPH_EXECUTOR_CUGRAPH ON)' in config.cmake and rebuild TVM" + ) + + return GraphModuleCudaGraph(fcreate(graph_json_str, libmod, *device_type_id)) + + +class GraphModuleCudaGraph(graph_executor.GraphModule): + """CUDA graph executor module. + + This is a CUDA graph executor wrapper over the TVM runtime. + Runtime interfaces are wrapped with CUDA graph functionalities. + + Parameters + ---------- + module : Module + The internal tvm module that holds the actual graph functions. + """ + + def __init__(self, module): + self._start_capture = module["start_capture"] + self._end_capture = module["end_capture"] + self._run_cuda_graph = module["run_cuda_graph"] + self._cuda_graph_captured = False + graph_executor.GraphModule.__init__(self, module) + + def capture_cuda_graph(self): + """Capture a CUDA graph for tvm_op graph + + This should be called before run_cuda_graph() to capture and + instantiate a CUDA graph instance. + """ + self._run() # call cuModuleLoadData before cudaStream API + self._start_capture() + self._run() + self._end_capture() + self._cuda_graph_captured = True + + def run_cuda_graph(self): + """Run the CUDA graph for tvm_op graph + + Run the captured CUDA graph instance instead of the + for-loop kernel launch of default graph executor + """ + self._run_cuda_graph() + + def run(self, **input_dict): + """A run wrapper for graph capture / launch, user can just + change default graph executor to cuda graph executor, and + the first call will capture a cuda graph for future launch + + Parameters + ---------- + input_dict: dict of str to NDArray + List of input values to be feed to + """ + if input_dict: + self.set_input(**input_dict) + if not self._cuda_graph_captured: + self.capture_cuda_graph() + else: + self._run_cuda_graph() + + def debug_get_output(self, node, out): + """Run graph up to node and get the output to out + + Parameters + ---------- + node : int / str + The node index or name + + out : NDArray + The output array container + """ + raise NotImplementedError("Please use debugger.debug_executor as graph_executor instead.") diff --git a/python/tvm/contrib/debugger/debug_executor.py b/python/tvm/contrib/debugger/debug_executor.py new file mode 100644 index 000000000000..b27ae6533e38 --- /dev/null +++ b/python/tvm/contrib/debugger/debug_executor.py @@ -0,0 +1,239 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. +"""Graph debug runtime executes TVM debug packed functions.""" + +import os +import tempfile +import shutil +import tvm._ffi + +from tvm._ffi.base import string_types +from tvm.contrib import graph_executor +from tvm.runtime.ndarray import array +from . import debug_result + +_DUMP_ROOT_PREFIX = "tvmdbg_" +_DUMP_PATH_PREFIX = "_tvmdbg_" + + +def create(graph_json_str, libmod, device, dump_root=None): + """Create a runtime executor module given a graph and module. + + Parameters + ---------- + graph_json_str : str + The graph to be deployed in json format output by graph compiler. + The graph can contain operator(tvm_op) that points to the name + of PackedFunc in the libmod. + + libmod : tvm.Module + The module of the corresponding function. + + device : Device + The device to deploy the module, can be local or remote. + + dump_root : str + To select which folder the outputs should be kept. + None will make a temp folder in /tmp/tvmdbg and does the dumping + Returns + ------- + graph_module : GraphModuleDebug + Debug Runtime graph module that can be used to execute the graph. + """ + assert isinstance(graph_json_str, string_types) + + try: + dev, num_rpc_dev, device_type_id = graph_executor.get_device(libmod, device) + if num_rpc_dev == len(dev): + fcreate = dev[0]._rpc_sess.get_function("tvm.graph_executor_debug.create") + else: + fcreate = tvm._ffi.get_global_func("tvm.graph_executor_debug.create") + except ValueError: + raise ValueError( + "Please set '(USE_GRAPH_EXECUTOR_DEBUG ON)' in " + "config.cmake and rebuild TVM to enable debug mode" + ) + func_obj = fcreate(graph_json_str, libmod, *device_type_id) + return GraphModuleDebug(func_obj, dev, graph_json_str, dump_root) + + +class GraphModuleDebug(graph_executor.GraphModule): + """Graph debug runtime module. + + This is a debug wrapper over the TVM runtime. + Runtime interfaces are wrapped with debug functionalities. + Manage the debug framework to format the debug data and + trigger the user interfaces. + + Parameters + ---------- + module : Module + The internal tvm module that holds the actual graph functions. + + device : Device + The device that this module is under. + + graph_json_str : str or graph class + Content of graph json file in string format + + dump_root : str + To select which folder the outputs should be kept. + None will make a temp folder in /tmp/tvmdbg and does the dumping + """ + + def __init__(self, module, device, graph_json_str, dump_root): + self._dump_root = dump_root + self._dump_path = None + self._get_output_by_layer = module["get_output_by_layer"] + self._run_individual = module["run_individual"] + graph_executor.GraphModule.__init__(self, module) + self._create_debug_env(graph_json_str, device) + + def _format_device(self, device): + return str(device[0]).upper().replace("(", ":").replace(")", "") + + def _ensure_dir(self, directory): + """Create a directory if not exists + + Parameters + ---------- + + directory : str + File path to create + """ + if not os.path.exists(directory): + os.makedirs(directory, 0o700) + + def _get_dump_path(self, device): + """Make the graph and tensor dump folder and return the path. + + Parameters + ---------- + device : Device + The device that this module is under. + + Returns + ------- + path : str + Directory path where the graph and node outputs will be stored. 
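+
+        A sketch of the resulting layout, assuming the formatted device
+        string is "CPU:0" and the dump root is "/tmp/tvmdbg_xyz"
+        (illustrative values):
+
+        .. code-block:: python
+
+            import os
+
+            folder_name = "_tvmdbg_" + "device_" + "CPU:0".replace(":", "_")
+            path = os.path.join("/tmp/tvmdbg_xyz", folder_name)
+            # path == "/tmp/tvmdbg_xyz/_tvmdbg_device_CPU_0"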
+ """ + # save to file + folder_name = _DUMP_PATH_PREFIX + "device_" + folder_name = folder_name + device.replace(":", "_") + path = os.path.join(self._dump_root, folder_name) + self._ensure_dir(path) + return path + + def _remove_dump_root(self): + if os.path.isdir(self._dump_root): + shutil.rmtree(self._dump_root) + + def _create_debug_env(self, graph_json, device): + """Create UI wrapper framework to handle multiple UI frontends for tvmdbg + + Parameters + ---------- + graph_json : json format + json formatted NNVM graph contain list of each node's name, shape and type. + + nodes_list : list + List of all the nodes presented in the graph + + device : Device + The device that this module is under. + """ + # make the dump folder if not given + if not self._dump_root: + self._dump_root = tempfile.mkdtemp(prefix=_DUMP_ROOT_PREFIX) + + # format the device + device = self._format_device(device) + + # updates the dumping directories + self._dump_path = self._get_dump_path(device) + + # init the debug dumping environment + self.debug_datum = debug_result.DebugResult(graph_json, self._dump_path) + + def _run_debug(self): + """Execute the node specified with index will be executed. + Each debug output will be copied to the buffer + Time consumed for each execution will be set as debug output. + + """ + self.debug_datum._time_list = [[float(t)] for t in self.run_individual(10, 1, 1)] + for i, node in enumerate(self.debug_datum.get_graph_nodes()): + num_outputs = self.debug_datum.get_graph_node_output_num(node) + for j in range(num_outputs): + out_tensor = self._get_output_by_layer(i, j) + out_tensor = array(out_tensor) + self.debug_datum._output_tensor_list.append(out_tensor) + + def debug_get_output(self, node, out=None): + """Run graph up to node and get the output to out + + Parameters + ---------- + node : int / str + The node index or name + + out : NDArray + The output array container + """ + if isinstance(node, str): + output_tensors = self.debug_datum.get_output_tensors() + try: + out = output_tensors[node] + except KeyError: + node_list = output_tensors.keys() + raise RuntimeError( + "Node " + node + " not found, available nodes are: " + str(node_list) + "." + ) + elif isinstance(node, int): + output_tensors = self.debug_datum._output_tensor_list + out = output_tensors[node] + else: + raise RuntimeError("Require node index or name only.") + return out + + def run(self, **input_dict): + """Run forward execution of the graph with debug + + Parameters + ---------- + input_dict : dict of str to NDArray + List of input values to be feed to + """ + if input_dict: + self.set_input(**input_dict) + + # Step 1. Execute the graph + self._run_debug() + # Step 2. Dump the output tensors to the dump folder + self.debug_datum.dump_output_tensor() + # Step 3. Dump the Chrome trace to the dump folder + self.debug_datum.dump_chrome_trace() + # Step 4. Display the collected information + self.debug_datum.display_debug_result() + + def run_individual(self, number, repeat=1, min_repeat_ms=0): + ret = self._run_individual(number, repeat, min_repeat_ms) + return ret.strip(",").split(",") if ret else [] + + def exit(self): + """Exits the dump folder and all its contents""" + self._remove_dump_root() diff --git a/python/tvm/contrib/debugger/debug_runtime.py b/python/tvm/contrib/debugger/debug_runtime.py index 289ac4c467e0..ebd903b47570 100644 --- a/python/tvm/contrib/debugger/debug_runtime.py +++ b/python/tvm/contrib/debugger/debug_runtime.py @@ -14,226 +14,16 @@ # KIND, either express or implied. 
See the License for the # specific language governing permissions and limitations # under the License. -"""Graph debug runtime executes TVM debug packed functions.""" +"""Deprecated Python API for DebugExecutor.""" -import os -import tempfile -import shutil -import tvm._ffi +import warnings -from tvm._ffi.base import string_types -from tvm.contrib import graph_runtime -from tvm.runtime.ndarray import array -from . import debug_result +from . import debug_executor -_DUMP_ROOT_PREFIX = "tvmdbg_" -_DUMP_PATH_PREFIX = "_tvmdbg_" - -def create(graph_json_str, libmod, ctx, dump_root=None): - """Create a runtime executor module given a graph and module. - - Parameters - ---------- - graph_json_str : str - The graph to be deployed in json format output by graph compiler. - The graph can contain operator(tvm_op) that points to the name - of PackedFunc in the libmod. - - libmod : tvm.Module - The module of the corresponding function. - - ctx : TVMContext - The context to deploy the module, can be local or remote. - - dump_root : str - To select which folder the outputs should be kept. - None will make a temp folder in /tmp/tvmdbg and does the dumping - Returns - ------- - graph_module : GraphModuleDebug - Debug Runtime graph module that can be used to execute the graph. - """ - assert isinstance(graph_json_str, string_types) - - try: - ctx, num_rpc_ctx, device_type_id = graph_runtime.get_device_ctx(libmod, ctx) - if num_rpc_ctx == len(ctx): - fcreate = ctx[0]._rpc_sess.get_function("tvm.graph_runtime_debug.create") - else: - fcreate = tvm._ffi.get_global_func("tvm.graph_runtime_debug.create") - except ValueError: - raise ValueError( - "Please set '(USE_GRAPH_RUNTIME_DEBUG ON)' in " - "config.cmake and rebuild TVM to enable debug mode" - ) - func_obj = fcreate(graph_json_str, libmod, *device_type_id) - return GraphModuleDebug(func_obj, ctx, graph_json_str, dump_root) - - -class GraphModuleDebug(graph_runtime.GraphModule): - """Graph debug runtime module. - - This is a debug wrapper over the TVM runtime. - Runtime interfaces are wrapped with debug functionalities. - Manage the debug framework to format the debug data and - trigger the user interfaces. - - Parameters - ---------- - module : Module - The internal tvm module that holds the actual graph functions. - - ctx : TVMContext - The context this module is under. - - graph_json_str : str or graph class - Content of graph json file in string format - - dump_root : str - To select which folder the outputs should be kept. - None will make a temp folder in /tmp/tvmdbg and does the dumping - """ - - def __init__(self, module, ctx, graph_json_str, dump_root): - self._dump_root = dump_root - self._dump_path = None - self._get_output_by_layer = module["get_output_by_layer"] - self._run_individual = module["run_individual"] - graph_runtime.GraphModule.__init__(self, module) - self._create_debug_env(graph_json_str, ctx) - - def _format_context(self, ctx): - return str(ctx[0]).upper().replace("(", ":").replace(")", "") - - def _ensure_dir(self, directory): - """Create a directory if not exists - - Parameters - ---------- - - directory : str - File path to create - """ - if not os.path.exists(directory): - os.makedirs(directory, 0o700) - - def _get_dump_path(self, ctx): - """Make the graph and tensor dump folder and return the path. - - Parameters - ---------- - ctx : TVMContext - The context this module is under. - - Returns - ------- - path : str - Directory path where the graph and node outputs will be stored. 
- """ - # save to file - folder_name = _DUMP_PATH_PREFIX + "ctx_" - folder_name = folder_name + ctx.replace(":", "_") - path = os.path.join(self._dump_root, folder_name) - self._ensure_dir(path) - return path - - def _remove_dump_root(self): - if os.path.isdir(self._dump_root): - shutil.rmtree(self._dump_root) - - def _create_debug_env(self, graph_json, ctx): - """Create UI wrapper framework to handle multiple UI frontends for tvmdbg - - Parameters - ---------- - graph_json : json format - json formatted NNVM graph contain list of each node's name, shape and type. - - nodes_list : list - List of all the nodes presented in the graph - - ctx : TVMContext - The context this module is under. - """ - # make the dump folder if not given - if not self._dump_root: - self._dump_root = tempfile.mkdtemp(prefix=_DUMP_ROOT_PREFIX) - - # format the context - ctx = self._format_context(ctx) - - # updates the dumping directories - self._dump_path = self._get_dump_path(ctx) - - # init the debug dumping environment - self.debug_datum = debug_result.DebugResult(graph_json, self._dump_path) - - def _run_debug(self): - """Execute the node specified with index will be executed. - Each debug output will be copied to the buffer - Time consumed for each execution will be set as debug output. - - """ - self.debug_datum._time_list = [[float(t)] for t in self.run_individual(10, 1, 1)] - for i, node in enumerate(self.debug_datum.get_graph_nodes()): - num_outputs = self.debug_datum.get_graph_node_output_num(node) - for j in range(num_outputs): - out_tensor = self._get_output_by_layer(i, j) - out_tensor = array(out_tensor) - self.debug_datum._output_tensor_list.append(out_tensor) - - def debug_get_output(self, node, out=None): - """Run graph up to node and get the output to out - - Parameters - ---------- - node : int / str - The node index or name - - out : NDArray - The output array container - """ - if isinstance(node, str): - output_tensors = self.debug_datum.get_output_tensors() - try: - out = output_tensors[node] - except KeyError: - node_list = output_tensors.keys() - raise RuntimeError( - "Node " + node + " not found, available nodes are: " + str(node_list) + "." - ) - elif isinstance(node, int): - output_tensors = self.debug_datum._output_tensor_list - out = output_tensors[node] - else: - raise RuntimeError("Require node index or name only.") - return out - - def run(self, **input_dict): - """Run forward execution of the graph with debug - - Parameters - ---------- - input_dict : dict of str to NDArray - List of input values to be feed to - """ - if input_dict: - self.set_input(**input_dict) - - # Step 1. Execute the graph - self._run_debug() - # Step 2. Dump the output tensors to the dump folder - self.debug_datum.dump_output_tensor() - # Step 3. Dump the Chrome trace to the dump folder - self.debug_datum.dump_chrome_trace() - # Step 4. 
Display the collected information
-        self.debug_datum.display_debug_result()
-
-    def run_individual(self, number, repeat=1, min_repeat_ms=0):
-        ret = self._run_individual(number, repeat, min_repeat_ms)
-        return ret.strip(",").split(",") if ret else []
-
-    def exit(self):
-        """Exits the dump folder and all its contents"""
-        self._remove_dump_root()
+def create(*args, **kwargs):
+    warnings.warn(
+        "This function has been moved to tvm.contrib.debugger.debug_executor and will be removed "
+        "in the next TVM release"
+    )
+    return debug_executor.create(*args, **kwargs)
diff --git a/python/tvm/contrib/graph_executor.py b/python/tvm/contrib/graph_executor.py
new file mode 100644
index 000000000000..a4bc85905f5e
--- /dev/null
+++ b/python/tvm/contrib/graph_executor.py
@@ -0,0 +1,306 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Minimum graph executor that executes graph containing TVM PackedFunc."""
+import numpy as np
+import tvm._ffi
+
+from tvm.rpc import _ffi_api as _rpc_ffi_api
+from tvm.rpc import base as rpc_base
+from tvm._ffi.base import string_types
+from tvm._ffi.runtime_ctypes import Device
+
+
+def create(graph_json_str, libmod, device):
+    """Create a runtime executor module given a graph and module.
+
+    Parameters
+    ----------
+    graph_json_str : str
+        The graph to be deployed in json format output by json graph.
+        The graph can contain operator(tvm_op) that points to the name
+        of PackedFunc in the libmod.
+
+    libmod : tvm.runtime.Module
+        The module of the corresponding function
+
+    device : Device or list of Device
+        The device to deploy the module. It can be local or remote when there
+        is only one Device. Otherwise, the first device in the list will
+        be used for this purpose. All devices should be given for
+        heterogeneous execution.
+
+    Returns
+    -------
+    graph_module : GraphModule
+        Runtime graph module that can be used to execute the graph.
+
+    Note
+    ----
+    See also :py:class:`tvm.contrib.graph_executor.GraphModule`
+    for examples to directly construct a GraphModule from an exported
+    relay compiled library.
+    """
+    assert isinstance(graph_json_str, string_types)
+
+    dev, num_rpc_dev, device_type_id = get_device(libmod, device)
+
+    if num_rpc_dev == len(dev):
+        fcreate = dev[0]._rpc_sess.get_function("tvm.graph_executor.create")
+    else:
+        fcreate = tvm._ffi.get_global_func("tvm.graph_executor.create")
+
+    return GraphModule(fcreate(graph_json_str, libmod, *device_type_id))
+
+
+def get_device(libmod, device):
+    """Parse and validate all the device(s).
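+
+    For example, a single local CPU device parses to the following (a sketch;
+    ``libmod`` is any non-RPC module, and device type 1 is the CPU device
+    type):
+
+    .. code-block:: python
+
+        import tvm
+
+        dev, num_rpc_dev, device_type_id = get_device(libmod, tvm.cpu(0))
+        # dev == [cpu(0)], num_rpc_dev == 0, device_type_id == [1, 0]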
+ + Parameters + ---------- + libmod : tvm.runtime.Module + The module of the corresponding function + + device : Device or list of Device + + Returns + ------- + device : list of Device + num_rpc_dev : Number of rpc devices + device_type_id : List of device type and device id + """ + + if isinstance(device, Device): + device = [device] + elif not isinstance(device, (list, tuple)): + raise ValueError("dev has to be the type of Device or a list of Device") + for cur_dev in device: + if not isinstance(cur_dev, Device): + raise ValueError("dev has to be the type of Device or a list of Device") + + # device_type_id[0], device_type_id[1] are used as the primary/fallback + # device type and id. All other ones are used as device for + # heterogeneous execution. + num_rpc_dev = 0 + device_type_id = [] + for cur_dev in device: + device_type = cur_dev.device_type + if device_type >= rpc_base.RPC_SESS_MASK: + assert libmod.type_key == "rpc" + assert _rpc_ffi_api.SessTableIndex(libmod) == cur_dev._rpc_sess._tbl_index + num_rpc_dev += 1 + device_type = cur_dev.device_type % rpc_base.RPC_SESS_MASK + device_type_id.append(device_type) + device_type_id.append(cur_dev.device_id) + + if 0 < num_rpc_dev < len(device): + raise ValueError("Either all or none of the devices should be rpc.") + return device, num_rpc_dev, device_type_id + + +class GraphModule(object): + """Wrapper runtime module. + + This is a thin wrapper of the underlying TVM module. + you can also directly call set_input, run, and get_output + of underlying module functions + + Parameters + ---------- + module : tvm.runtime.Module + The internal tvm module that holds the actual graph functions. + + Attributes + ---------- + module : tvm.runtime.Module + The internal tvm module that holds the actual graph functions. + + Examples + -------- + + .. code-block:: python + + import tvm + from tvm import relay + from tvm.contrib import graph_executor + + # build the library using graph executor + lib = relay.build(...) + lib.export_library("compiled_lib.so") + # load it back as a runtime + lib: tvm.runtime.Module = tvm.runtime.load_module("compiled_lib.so") + # Call the library factory function for default and create + # a new runtime.Module, wrap with graph module. + gmod = graph_executor.GraphModule(lib["default"](dev)) + # use the graph module. + gmod.set_input("x", data) + gmod.run() + """ + + def __init__(self, module): + self.module = module + self._set_input = module["set_input"] + self._run = module["run"] + self._get_output = module["get_output"] + self._get_input = module["get_input"] + self._get_num_outputs = module["get_num_outputs"] + self._get_num_inputs = module["get_num_inputs"] + self._load_params = module["load_params"] + self._share_params = module["share_params"] + + def set_input(self, key=None, value=None, **params): + """Set inputs to the module via kwargs + + Parameters + ---------- + key : int or str + The input key + + value : the input value. + The input key + + params : dict of str to NDArray + Additional arguments + """ + if key is not None: + v = self._get_input(key) + if v is None: + raise RuntimeError("Could not find '%s' in graph's inputs" % key) + v.copyfrom(value) + + if params: + # upload big arrays first to avoid memory issue in rpc mode + keys = list(params.keys()) + keys.sort(key=lambda x: -np.prod(params[x].shape)) + for k in keys: + # TODO(zhiics) Skip the weights for submodule in a better way. 
+ # We should use MetadataModule for initialization and remove + # params from set_input + val = self._get_input(k) + if val: + self._get_input(k).copyfrom(params[k]) + + def run(self, **input_dict): + """Run forward execution of the graph + + Parameters + ---------- + input_dict: dict of str to NDArray + List of input values to be feed to + """ + if input_dict: + self.set_input(**input_dict) + self._run() + + def get_num_outputs(self): + """Get the number of outputs from the graph + + Returns + ------- + count : int + The number of outputs. + """ + return self._get_num_outputs() + + def get_num_inputs(self): + """Get the number of inputs to the graph + + Returns + ------- + count : int + The number of inputs. + """ + return self._get_num_inputs() + + def get_input(self, index, out=None): + """Get index-th input to out + + Parameters + ---------- + index : int + The input index + + out : NDArray + The output array container + """ + if out: + self._get_input(index).copyto(out) + return out + + return self._get_input(index) + + def get_output(self, index, out=None): + """Get index-th output to out + + Parameters + ---------- + index : int + The output index + + out : NDArray + The output array container + """ + if out: + self._get_output(index, out) + return out + + return self._get_output(index) + + def debug_get_output(self, node, out): + """Run graph up to node and get the output to out + + Parameters + ---------- + node : int / str + The node index or name + + out : NDArray + The output array container + """ + raise NotImplementedError("Please use debugger.debug_executor as graph_executor instead.") + + def load_params(self, params_bytes): + """Load parameters from serialized byte array of parameter dict. + + Parameters + ---------- + params_bytes : bytearray + The serialized parameter dict. + """ + self._load_params(bytearray(params_bytes)) + + def share_params(self, other, params_bytes): + """Share parameters from pre-existing GraphExecutor instance. + + Parameters + ---------- + other: GraphExecutor + The parent GraphExecutor from which this instance should share + it's parameters. + params_bytes : bytearray + The serialized parameter dict (used only for the parameter names). + """ + self._share_params(other.module, bytearray(params_bytes)) + + def __getitem__(self, key): + """Get internal module function + + Parameters + ---------- + key : str + The key to the module. + """ + return self.module[key] diff --git a/python/tvm/contrib/graph_runtime.py b/python/tvm/contrib/graph_runtime.py index 59db716e917c..f8ecfdd70a5b 100644 --- a/python/tvm/contrib/graph_runtime.py +++ b/python/tvm/contrib/graph_runtime.py @@ -14,293 +14,16 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -"""Minimum graph runtime that executes graph containing TVM PackedFunc.""" -import numpy as np -import tvm._ffi +"""Deprecated Python API for GraphExecutor.""" -from tvm.rpc import _ffi_api as _rpc_ffi_api -from tvm.rpc import base as rpc_base -from tvm._ffi.base import string_types -from tvm._ffi.runtime_ctypes import TVMContext +import warnings +from . import graph_executor -def create(graph_json_str, libmod, ctx): - """Create a runtime executor module given a graph and module. - Parameters - ---------- - graph_json_str : str - The graph to be deployed in json format output by json graph. - The graph can contain operator(tvm_op) that points to the name - of PackedFunc in the libmod. 
- - libmod : tvm.runtime.Module - The module of the corresponding function - - ctx : TVMContext or list of TVMContext - The context to deploy the module. It can be local or remote when there - is only one TVMContext. Otherwise, the first context in the list will - be used as this purpose. All context should be given for heterogeneous - execution. - - Returns - ------- - graph_module : GraphModule - Runtime graph module that can be used to execute the graph. - - Note - ---- - See also :py:class:`tvm.contrib.graph_runtime.GraphModule` - for examples to directly construct a GraphModule from an exported - relay compiled library. - """ - assert isinstance(graph_json_str, string_types) - - ctx, num_rpc_ctx, device_type_id = get_device_ctx(libmod, ctx) - - if num_rpc_ctx == len(ctx): - fcreate = ctx[0]._rpc_sess.get_function("tvm.graph_runtime.create") - else: - fcreate = tvm._ffi.get_global_func("tvm.graph_runtime.create") - - return GraphModule(fcreate(graph_json_str, libmod, *device_type_id)) - - -def get_device_ctx(libmod, ctx): - """Parse and validate all the device context(s). - - Parameters - ---------- - libmod : tvm.runtime.Module - The module of the corresponding function - - ctx : TVMContext or list of TVMContext - - Returns - ------- - ctx : list of TVMContext - num_rpc_ctx : Number of rpc contexts - device_type_id : List of device type and device id - """ - - if isinstance(ctx, TVMContext): - ctx = [ctx] - elif not isinstance(ctx, (list, tuple)): - raise ValueError("ctx has to be the type of TVMContext or a list of " "TVMContext") - for cur_ctx in ctx: - if not isinstance(cur_ctx, TVMContext): - raise ValueError("ctx has to be the type of TVMContext or a list " "of TVMContext") - - # device_type_id[0], device_type_id[1] are used as the primary/fallback - # context type and id. All other ones are used as device context for - # heterogeneous execution. - num_rpc_ctx = 0 - device_type_id = [] - for cur_ctx in ctx: - device_type = cur_ctx.device_type - if device_type >= rpc_base.RPC_SESS_MASK: - assert libmod.type_key == "rpc" - assert _rpc_ffi_api.SessTableIndex(libmod) == cur_ctx._rpc_sess._tbl_index - num_rpc_ctx += 1 - device_type = cur_ctx.device_type % rpc_base.RPC_SESS_MASK - device_type_id.append(device_type) - device_type_id.append(cur_ctx.device_id) - - if 0 < num_rpc_ctx < len(ctx): - raise ValueError("Either all or none of the contexts should be rpc.") - return ctx, num_rpc_ctx, device_type_id - - -class GraphModule(object): - """Wrapper runtime module. - - This is a thin wrapper of the underlying TVM module. - you can also directly call set_input, run, and get_output - of underlying module functions - - Parameters - ---------- - module : tvm.runtime.Module - The internal tvm module that holds the actual graph functions. - - Attributes - ---------- - module : tvm.runtime.Module - The internal tvm module that holds the actual graph functions. - - Examples - -------- - - .. code-block:: python - - import tvm - from tvm import relay - from tvm.contrib import graph_runtime - - # build the library using graph runtime - lib = relay.build(...) - lib.export_library("compiled_lib.so") - # load it back as a runtime - lib: tvm.runtime.Module = tvm.runtime.load_module("compiled_lib.so") - # Call the library factory function for default and create - # a new runtime.Module, wrap with graph module. - gmod = graph_runtime.GraphModule(lib["default"](ctx)) - # use the graph module. 
- gmod.set_input("x", data) - gmod.run() - """ - - def __init__(self, module): - self.module = module - self._set_input = module["set_input"] - self._run = module["run"] - self._get_output = module["get_output"] - self._get_input = module["get_input"] - self._get_num_outputs = module["get_num_outputs"] - self._get_num_inputs = module["get_num_inputs"] - self._load_params = module["load_params"] - self._share_params = module["share_params"] - - def set_input(self, key=None, value=None, **params): - """Set inputs to the module via kwargs - - Parameters - ---------- - key : int or str - The input key - - value : the input value. - The input key - - params : dict of str to NDArray - Additional arguments - """ - if key is not None: - v = self._get_input(key) - if v is None: - raise RuntimeError("Could not find '%s' in graph's inputs" % key) - v.copyfrom(value) - - if params: - # upload big arrays first to avoid memory issue in rpc mode - keys = list(params.keys()) - keys.sort(key=lambda x: -np.prod(params[x].shape)) - for k in keys: - # TODO(zhiics) Skip the weights for submodule in a better way. - # We should use MetadataModule for initialization and remove - # params from set_input - val = self._get_input(k) - if val: - self._get_input(k).copyfrom(params[k]) - - def run(self, **input_dict): - """Run forward execution of the graph - - Parameters - ---------- - input_dict: dict of str to NDArray - List of input values to be feed to - """ - if input_dict: - self.set_input(**input_dict) - self._run() - - def get_num_outputs(self): - """Get the number of outputs from the graph - - Returns - ------- - count : int - The number of outputs. - """ - return self._get_num_outputs() - - def get_num_inputs(self): - """Get the number of inputs to the graph - - Returns - ------- - count : int - The number of inputs. - """ - return self._get_num_inputs() - - def get_input(self, index, out=None): - """Get index-th input to out - - Parameters - ---------- - index : int - The input index - - out : NDArray - The output array container - """ - if out: - self._get_input(index).copyto(out) - return out - - return self._get_input(index) - - def get_output(self, index, out=None): - """Get index-th output to out - - Parameters - ---------- - index : int - The output index - - out : NDArray - The output array container - """ - if out: - self._get_output(index, out) - return out - - return self._get_output(index) - - def debug_get_output(self, node, out): - """Run graph up to node and get the output to out - - Parameters - ---------- - node : int / str - The node index or name - - out : NDArray - The output array container - """ - raise NotImplementedError("Please use debugger.debug_runtime as graph_runtime instead.") - - def load_params(self, params_bytes): - """Load parameters from serialized byte array of parameter dict. - - Parameters - ---------- - params_bytes : bytearray - The serialized parameter dict. - """ - self._load_params(bytearray(params_bytes)) - - def share_params(self, other, params_bytes): - """Share parameters from pre-existing GraphRuntime instance. - - Parameters - ---------- - other: GraphRuntime - The parent GraphRuntime from which this instance should share - it's parameters. - params_bytes : bytearray - The serialized parameter dict (used only for the parameter names). - """ - self._share_params(other.module, bytearray(params_bytes)) - - def __getitem__(self, key): - """Get internal module function - - Parameters - ---------- - key : str - The key to the module. 
- """ - return self.module[key] +def create(*args, **kwargs): + warnings.warn( + "This function has been moved to tvm.contrib.graph_executor and will be removed " + "in the next TVM release" + ) + return graph_executor.create(*args, **kwargs) diff --git a/python/tvm/contrib/nvcc.py b/python/tvm/contrib/nvcc.py index 7e49f55e8d32..99844f799d7a 100644 --- a/python/tvm/contrib/nvcc.py +++ b/python/tvm/contrib/nvcc.py @@ -349,6 +349,18 @@ def have_tensorcore(compute_version=None, target=None): return False +def have_cudagraph(): + """Either CUDA Graph support is provided""" + try: + cuda_path = find_cuda_path() + cuda_ver = get_cuda_version(cuda_path) + if cuda_ver < 10.0: + return False + return True + except RuntimeError: + return False + + def have_bf16(compute_version): """Either bf16 support is provided in the compute capability or not diff --git a/python/tvm/contrib/peak.py b/python/tvm/contrib/peak.py index 0931fc737606..ea9795254a09 100644 --- a/python/tvm/contrib/peak.py +++ b/python/tvm/contrib/peak.py @@ -46,7 +46,7 @@ def measure_bandwidth_sum( target, target_host, remote, - ctx, + dev, n_times, ): """measure memory bandwidth of gpu by product reduction for a given type @@ -75,8 +75,8 @@ def measure_bandwidth_sum( the target and option of the compilation. target_host : str or :any:`tvm.target.Target` host compilation target - ctx: TVMcontext - the context of array + dev: Device + the device of array remote: tvm.rpc.RPCSession remote rpc session n_times: int @@ -112,11 +112,11 @@ def measure_bandwidth_sum( try: func = tvm.build(s, [x, y], target) - x = tvm.nd.empty((n,), dtype=dtype, ctx=ctx) - y = tvm.nd.empty((n // m,), dtype=dtype, ctx=ctx) + x = tvm.nd.empty((n,), dtype=dtype, device=dev) + y = tvm.nd.empty((n // m,), dtype=dtype, device=dev) func = _convert_to_remote(func, remote) - time_f = func.time_evaluator(func.entry_name, ctx, number=n_times) + time_f = func.time_evaluator(func.entry_name, dev, number=n_times) time = time_f(x, y).mean except tvm._ffi.base.TVMError: # build error (occur when device does not support half) @@ -126,7 +126,7 @@ def measure_bandwidth_sum( def measure_bandwidth_all_types( - total_item, item_per_thread, n_times, target, target_host, remote, ctx, verbose=True + total_item, item_per_thread, n_times, target, target_host, remote, dev, verbose=True ): """measure memory bandwidth for all types @@ -144,8 +144,8 @@ def measure_bandwidth_all_types( host compilation target remote: tvm.rpc.RPCSession remote rpc session - ctx: TVMcontext - the context of array + dev: Device + the device of array verbose: bool whether outputs immediate result @@ -174,7 +174,7 @@ def measure_bandwidth_all_types( target, target_host, remote, - ctx, + dev, n_times, ) max_speed = max(max_speed, speed) @@ -186,7 +186,7 @@ def measure_bandwidth_all_types( def measure_compute_mad( - total_item, item_per_thread, base_type, bits, lanes, target, target_host, remote, ctx, n_times + total_item, item_per_thread, base_type, bits, lanes, target, target_host, remote, dev, n_times ): """measure peak compute speed by computing mad for a type @@ -215,8 +215,8 @@ def measure_compute_mad( host compilation target remote: tvm.rpc.RPCSession if it is not None, use remote rpc session - ctx: TVMcontext - the context of array + dev: Device + the device of array n_times: int number of runs for taking mean @@ -279,8 +279,8 @@ def mad_func(x, y): try: func = tvm.build(s, [y], target) func = _convert_to_remote(func, remote) - time_f = func.time_evaluator(func.entry_name, ctx, number=n_times) - y = 
tvm.nd.empty((n,), dtype=dtype, ctx=ctx) + time_f = func.time_evaluator(func.entry_name, dev, number=n_times) + y = tvm.nd.empty((n,), dtype=dtype, device=dev) time = time_f(y).mean except tvm._ffi.base.TVMError: # build error (occur when device does not support half) @@ -290,7 +290,7 @@ def mad_func(x, y): def measure_compute_all_types( - total_item, item_per_thread, n_times, target, target_host, remote, ctx, verbose=True + total_item, item_per_thread, n_times, target, target_host, remote, dev, verbose=True ): """measure peak flops for all types @@ -308,8 +308,8 @@ def measure_compute_all_types( host compilation target remote: tvm.rpc.RPCSession remote rpc session - ctx: TVMcontext - the context of array + dev: Device + the device of array verbose: bool whether outputs immediate result @@ -338,7 +338,7 @@ def measure_compute_all_types( target, target_host, remote, - ctx, + dev, n_times, ) max_speed = max(max_speed, speed) @@ -375,20 +375,20 @@ def measure_peak_all(target, target_host, host, port): compute_item_per_thread = 4096 if str(target).startswith("opencl"): - ctx = remote.cl() + dev = remote.cl() elif str(target).startswith("cuda"): - ctx = remote.gpu() + dev = remote.gpu() elif str(target).startswith("metal"): - ctx = remote.metal() + dev = remote.metal() else: raise RuntimeError("Unsupported target") logging.info("========== measure memory bandwidth ==========") measure_bandwidth_all_types( - bandwidth_total_item, bandwidth_item_per_thread, n_times, target, target_host, remote, ctx + bandwidth_total_item, bandwidth_item_per_thread, n_times, target, target_host, remote, dev ) logging.info("========== measure peak compute ==========") measure_compute_all_types( - compute_total_item, compute_item_per_thread, n_times, target, target_host, remote, ctx + compute_total_item, compute_item_per_thread, n_times, target, target_host, remote, dev ) diff --git a/python/tvm/contrib/sparse.py b/python/tvm/contrib/sparse.py index c1263c43b476..bee9b835a98a 100644 --- a/python/tvm/contrib/sparse.py +++ b/python/tvm/contrib/sparse.py @@ -30,7 +30,7 @@ class CSRNDArray(object): """Sparse tensor object in CSR format.""" - def __init__(self, arg1, ctx=None, shape=None): + def __init__(self, arg1, device=None, shape=None): """Construct a sparse matrix in CSR format. Parameters @@ -39,8 +39,8 @@ def __init__(self, arg1, ctx=None, shape=None): The corresponding a dense numpy array, or a tuple for constructing a sparse matrix directly. - ctx: tvmContext - The corresponding context. + device: Device + The corresponding device. 
        shape : tuple of int
            The shape of the array
@@ -53,14 +53,14 @@
                 source_array = arg1
             ridx, cidx = _np.nonzero(source_array)
             data = source_array[ridx, cidx]
-            self.data = _nd.array(data, ctx)
+            self.data = _nd.array(data, device)
             indices = _np.nonzero(source_array)[1].astype(itype)
-            self.indices = _nd.array(indices, ctx)
+            self.indices = _nd.array(indices, device)
             indptr = [0] + _np.apply_along_axis(
                 _np.count_nonzero, axis=1, arr=source_array
             ).tolist()
             indptr = _np.cumsum(_np.array(indptr, itype)).astype(itype)
-            self.indptr = _nd.array(indptr, ctx)
+            self.indptr = _nd.array(indptr, device)
             self.shape = source_array.shape
         else:
             raise RuntimeError(
@@ -89,11 +89,11 @@ def asnumpy(self):
     return full


-def array(source_array, ctx=None, shape=None, stype="csr"):
+def array(source_array, device=None, shape=None, stype="csr"):
     """Construct a sparse NDArray from numpy.ndarray"""
     ret = None
     if stype == "csr":
-        ret = CSRNDArray(source_array, shape=shape, ctx=ctx)
+        ret = CSRNDArray(source_array, shape=shape, device=device)
     else:
         raise NotImplementedError("stype=%s is not supported yet." % (stype,))
     return ret
diff --git a/python/tvm/contrib/target/coreml.py b/python/tvm/contrib/target/coreml.py
index 0f4bb6675611..18a53bdffd86 100644
--- a/python/tvm/contrib/target/coreml.py
+++ b/python/tvm/contrib/target/coreml.py
@@ -244,5 +244,5 @@ def coreml_compiler(func):
         shutil.rmtree(mlmodelc_path)
     builder.compile(model_dir)

-    ctx = tvm.cpu(0)
-    return coreml_runtime.create(name, mlmodelc_path, ctx).module
+    dev = tvm.cpu(0)
+    return coreml_runtime.create(name, mlmodelc_path, dev).module
diff --git a/python/tvm/contrib/tflite_runtime.py b/python/tvm/contrib/tflite_runtime.py
index 3b0e268e2a44..1558e36d51af 100644
--- a/python/tvm/contrib/tflite_runtime.py
+++ b/python/tvm/contrib/tflite_runtime.py
@@ -19,15 +19,15 @@
 from ..rpc import base as rpc_base


-def create(tflite_model_bytes, ctx, runtime_target="cpu"):
-    """Create a runtime executor module given a tflite model and context.
+def create(tflite_model_bytes, device, runtime_target="cpu"):
+    """Create a runtime executor module given a tflite model and device.
     Parameters
     ----------
     tflite_model_bytes : bytes
         The tflite model to be deployed in bytes string format.
-    ctx : TVMContext
-        The context to deploy the module. It can be local or remote when there
-        is only one TVMContext.
+    device : Device
+        The device to deploy the module. It can be local or remote when there
+        is only one Device.
     runtime_target: str
         Execution target of TFLite runtime: either `cpu` or `edge_tpu`.
     Returns
     -------
     tflite_runtime : TFLiteModule
         Runtime tflite module that can be used to execute the tflite model.
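
     A minimal usage sketch, assuming ``tflite_model_buf`` holds the raw
     bytes of a .tflite file and ``input_data`` is a numpy array
     (illustrative names):

     .. code-block:: python

         import tvm
         from tvm.contrib import tflite_runtime

         runtime = tflite_runtime.create(tflite_model_buf, tvm.cpu(0))
         runtime.set_input(0, tvm.nd.array(input_data))
         runtime.invoke()
         out = runtime.get_output(0)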
""" - device_type = ctx.device_type + device_type = device.device_type if runtime_target == "edge_tpu": runtime_func = "tvm.edgetpu_runtime.create" @@ -43,11 +43,11 @@ def create(tflite_model_bytes, ctx, runtime_target="cpu"): runtime_func = "tvm.tflite_runtime.create" if device_type >= rpc_base.RPC_SESS_MASK: - fcreate = ctx._rpc_sess.get_function(runtime_func) + fcreate = device._rpc_sess.get_function(runtime_func) else: fcreate = tvm._ffi.get_global_func(runtime_func) - return TFLiteModule(fcreate(bytearray(tflite_model_bytes), ctx)) + return TFLiteModule(fcreate(bytearray(tflite_model_bytes), device)) class TFLiteModule(object): diff --git a/python/tvm/driver/build_module.py b/python/tvm/driver/build_module.py index e96d7a8023ba..684dba263648 100644 --- a/python/tvm/driver/build_module.py +++ b/python/tvm/driver/build_module.py @@ -232,7 +232,7 @@ def _build_for_device(input_mod, target, target_host): A module that contains device code. """ target, target_host = Target.check_and_update_host_consistency(target, target_host) - device_type = ndarray.context(target.kind.name, 0).device_type + device_type = ndarray.device(target.kind.name, 0).device_type mod_mixed = input_mod mod_mixed = tvm.tir.transform.Apply(lambda f: f.with_attr("target", target))(mod_mixed) @@ -405,7 +405,7 @@ def build(inputs, args=None, target=None, target_host=None, name="default_functi if not target_host: for tar, mod in target_input_mod.items(): tar = Target(tar) - device_type = ndarray.context(tar.kind.name, 0).device_type + device_type = ndarray.device(tar.kind.name, 0).device_type if device_type == ndarray.cpu(0).device_type: target_host = tar break diff --git a/python/tvm/driver/tvmc/__init__.py b/python/tvm/driver/tvmc/__init__.py index d96a725877eb..d9c15792349a 100644 --- a/python/tvm/driver/tvmc/__init__.py +++ b/python/tvm/driver/tvmc/__init__.py @@ -14,6 +14,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +# pylint: disable=redefined-builtin """ TVMC - TVM driver command-line interface """ @@ -21,3 +22,6 @@ from . import autotuner from . import compiler from . import runner +from .frontends import load_model as load +from .compiler import compile_model as compile +from .runner import run_module as run diff --git a/python/tvm/driver/tvmc/common.py b/python/tvm/driver/tvmc/common.py index 71bf42ae1e5c..864c3a9bddb4 100644 --- a/python/tvm/driver/tvmc/common.py +++ b/python/tvm/driver/tvmc/common.py @@ -131,9 +131,19 @@ def tokenize_target(target): a list of parsed tokens extracted from the target string """ + # Regex to tokenize the "--target" value. It is split into five parts + # to match with: + # 1. target and option names e.g. llvm, -mattr=, -mcpu= + # 2. option values, all together, without quotes e.g. -mattr=+foo,+opt + # 3. option values, when single quotes are used e.g. -mattr='+foo, +opt' + # 4. option values, when double quotes are used e.g. -mattr="+foo ,+opt" + # 5. commas that separate different targets e.g. "my-target, llvm" target_pattern = ( r"(\-{0,2}[\w\-]+\=?" 
- r"(?:[\w\+\-]+(?:,[\w\+\-])*|[\'][\w\+\-,\s]+[\']|[\"][\w\+\-,\s]+[\"])*|,)" + r"(?:[\w\+\-\.]+(?:,[\w\+\-\.])*" + r"|[\'][\w\+\-,\s\.]+[\']" + r"|[\"][\w\+\-,\s\.]+[\"])*" + r"|,)" ) return re.findall(target_pattern, target) @@ -223,6 +233,11 @@ def parse_target(target): else: opt = opt[1:] if opt.startswith("-") else opt opt_name, opt_value = opt.split("=", maxsplit=1) + + # remove quotes from the value: quotes are only parsed if they match, + # so it is safe to assume that if the string starts with quote, it ends + # with quote. + opt_value = opt_value[1:-1] if opt_value[0] in ('"', "'") else opt_value except ValueError: raise ValueError(f"Error when parsing '{opt}'") @@ -265,7 +280,7 @@ def target_from_cli(target): """ extra_targets = [] - if os.path.exists(target): + if os.path.isfile(target): with open(target) as target_file: logger.debug("target input is a path: %s", target) target = "".join(target_file.readlines()) diff --git a/python/tvm/driver/tvmc/compiler.py b/python/tvm/driver/tvmc/compiler.py index 4eeddf0ac4be..b8e88532efec 100644 --- a/python/tvm/driver/tvmc/compiler.py +++ b/python/tvm/driver/tvmc/compiler.py @@ -144,7 +144,7 @@ def compile_model( This function takes a union of the arguments of both frontends.load_model and compiler.compile_relay. The resulting TVM module can be executed using - the graph runtime. + the graph executor. Parameters ---------- diff --git a/python/tvm/driver/tvmc/composite_target.py b/python/tvm/driver/tvmc/composite_target.py index 0a2592685646..886160ad000c 100644 --- a/python/tvm/driver/tvmc/composite_target.py +++ b/python/tvm/driver/tvmc/composite_target.py @@ -21,6 +21,7 @@ from tvm.relay.op.contrib.arm_compute_lib import partition_for_arm_compute_lib from tvm.relay.op.contrib.ethosn import partition_for_ethosn +from tvm.relay.op.contrib.bnns import partition_for_bnns from .common import TVMCException @@ -40,6 +41,10 @@ "config_key": "relay.ext.ethos-n.options", "pass_pipeline": partition_for_ethosn, }, + "bnns": { + "config_key": None, + "pass_pipeline": partition_for_bnns, + }, } diff --git a/python/tvm/driver/tvmc/frontends.py b/python/tvm/driver/tvmc/frontends.py index 16e6c8eb966e..0488223c782f 100644 --- a/python/tvm/driver/tvmc/frontends.py +++ b/python/tvm/driver/tvmc/frontends.py @@ -54,7 +54,7 @@ def suffixes(): """File suffixes (extensions) used by this frontend""" @abstractmethod - def load(self, path, shape_dict=None): + def load(self, path, shape_dict=None, **kwargs): """Load a model from a given path. 
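
        The extra keyword arguments are forwarded to the underlying relay
        frontend, so the valid keys depend on the model format. For example
        (a sketch; ``layout`` is an option of the Keras importer):

        .. code-block:: python

            mod, params = frontend.load(
                "model.h5",
                shape_dict={"input_1": (1, 224, 224, 3)},
                layout="NHWC",
            )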
Parameters @@ -101,7 +101,7 @@ def name(): def suffixes(): return ["h5"] - def load(self, path, shape_dict=None): + def load(self, path, shape_dict=None, **kwargs): # pylint: disable=C0103 tf, keras = import_keras() @@ -130,7 +130,8 @@ def load(self, path, shape_dict=None): input_shapes = {name: x.shape for (name, x) in zip(model.input_names, inputs)} if shape_dict is not None: input_shapes.update(shape_dict) - return relay.frontend.from_keras(model, input_shapes, layout="NHWC") + kwargs.setdefault("layout", "NHWC") + return relay.frontend.from_keras(model, input_shapes, **kwargs) def is_sequential_p(self, model): _, keras = import_keras() @@ -158,14 +159,14 @@ def name(): def suffixes(): return ["onnx"] - def load(self, path, shape_dict=None): + def load(self, path, shape_dict=None, **kwargs): # pylint: disable=C0415 import onnx # pylint: disable=E1101 model = onnx.load(path) - return relay.frontend.from_onnx(model, shape=shape_dict) + return relay.frontend.from_onnx(model, shape=shape_dict, **kwargs) class TensorflowFrontend(Frontend): @@ -179,7 +180,7 @@ def name(): def suffixes(): return ["pb"] - def load(self, path, shape_dict=None): + def load(self, path, shape_dict=None, **kwargs): # pylint: disable=C0415 import tensorflow as tf import tvm.relay.testing.tf as tf_testing @@ -192,7 +193,7 @@ def load(self, path, shape_dict=None): graph_def = tf_testing.ProcessGraphDefParam(graph_def) logger.debug("parse TensorFlow model and convert into Relay computation graph") - return relay.frontend.from_tensorflow(graph_def, shape=shape_dict) + return relay.frontend.from_tensorflow(graph_def, shape=shape_dict, **kwargs) class TFLiteFrontend(Frontend): @@ -206,7 +207,7 @@ def name(): def suffixes(): return ["tflite"] - def load(self, path, shape_dict=None): + def load(self, path, shape_dict=None, **kwargs): # pylint: disable=C0415 import tflite.Model as model @@ -229,7 +230,7 @@ def load(self, path, shape_dict=None): raise TVMCException("input file not tflite version 3") logger.debug("parse TFLite model and convert into Relay computation graph") - mod, params = relay.frontend.from_tflite(tflite_model, shape_dict=shape_dict) + mod, params = relay.frontend.from_tflite(tflite_model, shape_dict=shape_dict, **kwargs) return mod, params @@ -245,7 +246,7 @@ def suffixes(): # Torch Script is a zip file, but can be named pth return ["pth", "zip"] - def load(self, path, shape_dict=None): + def load(self, path, shape_dict=None, **kwargs): # pylint: disable=C0415 import torch @@ -259,7 +260,7 @@ def load(self, path, shape_dict=None): input_shapes = list(shape_dict.items()) logger.debug("parse Torch model and convert into Relay computation graph") - return relay.frontend.from_pytorch(traced_model, input_shapes) + return relay.frontend.from_pytorch(traced_model, input_shapes, **kwargs) ALL_FRONTENDS = [ @@ -339,7 +340,7 @@ def guess_frontend(path): raise TVMCException("failed to infer the model format. Please specify --model-format") -def load_model(path, model_format=None, shape_dict=None): +def load_model(path, model_format=None, shape_dict=None, **kwargs): """Load a model from a supported framework and convert it into an equivalent relay representation. 
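
A usage sketch of the forwarded keyword arguments through the tvmc frontends
(illustrative model and option; ``freeze_params`` is an option of the relay
ONNX importer):

    from tvm.driver.tvmc import frontends

    mod, params = frontends.load_model(
        "resnet50.onnx",
        shape_dict={"data": (1, 3, 224, 224)},
        freeze_params=True,
    )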
@@ -367,6 +368,6 @@ def load_model(path, model_format=None, shape_dict=None): else: frontend = guess_frontend(path) - mod, params = frontend.load(path, shape_dict) + mod, params = frontend.load(path, shape_dict, **kwargs) return mod, params diff --git a/python/tvm/driver/tvmc/runner.py b/python/tvm/driver/tvmc/runner.py index 1d23ccfb0c00..b4c4e75aa37a 100644 --- a/python/tvm/driver/tvmc/runner.py +++ b/python/tvm/driver/tvmc/runner.py @@ -26,8 +26,8 @@ import numpy as np from tvm import rpc from tvm.autotvm.measure import request_remote -from tvm.contrib import graph_runtime as runtime -from tvm.contrib.debugger import debug_runtime +from tvm.contrib import graph_executor as runtime +from tvm.contrib.debugger import debug_executor from tvm.relay import load_param_dict from . import common @@ -77,7 +77,7 @@ def add_run_parser(subparsers): "--profile", action="store_true", help="generate profiling data from the runtime execution. " - "Using --profile requires the Graph Runtime Debug enabled on TVM. " + "Using --profile requires the Graph Executor Debug enabled on TVM. " "Profiling may also have an impact on inference time, " "making it take longer to be generated.", ) @@ -296,7 +296,7 @@ def run_module( repeat=1, profile=False, ): - """Run a compiled graph runtime module locally or remotely with + """Run a compiled graph executor module locally or remotely with optional input values. If input tensors are not specified explicitly, they can be filled @@ -361,19 +361,19 @@ def run_module( # TODO expand to other supported devices, as listed in tvm.rpc.client (@leandron) logger.debug("device is %s", device) if device == "gpu": - ctx = session.gpu() + dev = session.gpu() elif device == "cl": - ctx = session.cl() + dev = session.cl() else: assert device == "cpu" - ctx = session.cpu() + dev = session.cpu() if profile: logger.debug("creating runtime with profiling enabled") - module = debug_runtime.create(graph, lib, ctx, dump_root="./prof") + module = debug_executor.create(graph, lib, dev, dump_root="./prof") else: logger.debug("creating runtime with profiling disabled") - module = runtime.create(graph, lib, ctx) + module = runtime.create(graph, lib, dev) logger.debug("load params into the runtime module") module.load_params(params) @@ -390,7 +390,7 @@ def run_module( module.run() # create the module time evaluator (returns a function) - timer = module.module.time_evaluator("run", ctx, 1, repeat=repeat) + timer = module.module.time_evaluator("run", dev, 1, repeat=repeat) # call the evaluator function to invoke the module and save execution times prof_result = timer() # collect a list of execution times from the profiling results diff --git a/python/tvm/micro/__init__.py b/python/tvm/micro/__init__.py index ade63f2da9e4..a70cb96d9b13 100644 --- a/python/tvm/micro/__init__.py +++ b/python/tvm/micro/__init__.py @@ -25,8 +25,8 @@ from .micro_binary import MicroBinary from .model_library_format import export_model_library_format, UnsupportedInModelLibraryFormatError from .session import ( - create_local_graph_runtime, - create_local_debug_runtime, + create_local_graph_executor, + create_local_debug_executor, Session, SessionTerminatedError, ) diff --git a/python/tvm/micro/build.py b/python/tvm/micro/build.py index 3837d423f8bd..d95f14f0349e 100644 --- a/python/tvm/micro/build.py +++ b/python/tvm/micro/build.py @@ -118,7 +118,7 @@ def get_runtime_libs() -> str: RUNTIME_SRC_REGEX = re.compile(r"^.*\.cc?$", re.IGNORECASE) -_COMMON_CFLAGS = ["-Wall", "-Werror"] +_COMMON_CFLAGS = ["-Wall", "-Werror", 
"-DDMLC_USE_LOGGING_LIBRARY="] def _build_default_compiler_options(standalone_crt_dir: typing.Optional[str] = None) -> str: diff --git a/python/tvm/micro/contrib/zephyr.py b/python/tvm/micro/contrib/zephyr.py index cd9c23cd2f9d..104d955835a1 100644 --- a/python/tvm/micro/contrib/zephyr.py +++ b/python/tvm/micro/contrib/zephyr.py @@ -650,10 +650,10 @@ def popen_kwargs(self): env = dict(os.environ) env["ZEPHYR_BASE"] = self._zephyr_base - return dict( + args = dict( args=self._west_cmd + [ - "debug", + "attach", "--skip-rebuild", "--build-dir", self._build_dir, @@ -662,3 +662,4 @@ def popen_kwargs(self): ], env=env, ) + return args diff --git a/python/tvm/micro/model_library_format.py b/python/tvm/micro/model_library_format.py index 4ce80be647c1..6768e03f4473 100644 --- a/python/tvm/micro/model_library_format.py +++ b/python/tvm/micro/model_library_format.py @@ -24,7 +24,7 @@ import tarfile from ..contrib import utils -from ..relay.backend import graph_runtime_factory +from ..relay.backend import graph_executor_factory from ..relay import param_dict @@ -117,7 +117,7 @@ def _build_memory_map(graph_json): return memory_map -def export_model_library_format(mod: graph_runtime_factory.GraphRuntimeFactoryModule, file_name): +def export_model_library_format(mod: graph_executor_factory.GraphExecutorFactoryModule, file_name): """Export the build artifact in Model Library Format. This function creates a .tar archive containing the build artifacts in a standardized @@ -126,7 +126,7 @@ def export_model_library_format(mod: graph_runtime_factory.GraphRuntimeFactoryMo Parameters ---------- - mod : tvm.relay.backend.graph_runtime_factory.GraphRuntimeFactoryModule + mod : tvm.relay.backend.graph_executor_factory.GraphExecutorFactoryModule The return value of tvm.relay.build, which will be exported into Model Library Format. file_name : str Path to the .tar archive to generate. diff --git a/python/tvm/micro/session.py b/python/tvm/micro/session.py index 717b6e480671..78bf03379939 100644 --- a/python/tvm/micro/session.py +++ b/python/tvm/micro/session.py @@ -22,8 +22,8 @@ from ..error import register_error from .._ffi import get_global_func -from ..contrib import graph_runtime -from ..contrib.debugger import debug_runtime +from ..contrib import graph_executor +from ..contrib.debugger import debug_executor from ..rpc import RPCSession from .transport import IoTimeoutError from .transport import TransportLogger @@ -92,7 +92,7 @@ def __init__( self.timeout_override = timeout_override self._rpc = None - self._graph_runtime = None + self._graph_executor = None def get_system_lib(self): return self._rpc.get_function("runtime.SystemLib")() @@ -143,7 +143,7 @@ def __enter__(self): int(timeouts.session_established_timeout_sec * 1e6), ) ) - self.context = self._rpc.cpu(0) + self.device = self._rpc.cpu(0) return self except: @@ -155,7 +155,7 @@ def __exit__(self, exc_type, exc_value, exc_traceback): self.transport.__exit__(exc_type, exc_value, exc_traceback) -def lookup_remote_linked_param(mod, storage_id, template_tensor, ctx): +def lookup_remote_linked_param(mod, storage_id, template_tensor, device): """Lookup a parameter that has been pre-linked into a remote (i.e. over RPC) Module. This function signature matches the signature built by @@ -170,8 +170,8 @@ def lookup_remote_linked_param(mod, storage_id, template_tensor, ctx): A DLTensor containing metadata that should be filled-in to the returned NDArray. This function should mostly not inspect this, and just pass it along to NDArrayFromRemoteOpaqueHandle. 
- ctx : TVMContext - The remote CPU context to be used with the returned NDArray. + device : Device + The remote CPU device to be used with the returned NDArray. Returns ------- @@ -188,12 +188,12 @@ def lookup_remote_linked_param(mod, storage_id, template_tensor, ctx): return None return get_global_func("tvm.rpc.NDArrayFromRemoteOpaqueHandle")( - mod, remote_data, template_tensor, ctx, None + mod, remote_data, template_tensor, device, None ) -def create_local_graph_runtime(graph_json_str, mod, ctx): - """Create a local graph runtime driving execution on the remote CPU context given. +def create_local_graph_executor(graph_json_str, mod, device): + """Create a local graph executor driving execution on the remote CPU device given. Parameters ---------- @@ -203,23 +203,23 @@ def create_local_graph_runtime(graph_json_str, mod, ctx): mod : tvm.runtime.Module The remote module containing functions in graph_json_str. - ctx : tvm.Context - The remote CPU execution context. + device : tvm.runtime.Device + The remote CPU execution device. Returns ------- - tvm.contrib.GraphRuntime : - A local graph runtime instance that executes on the remote device. + tvm.contrib.GraphExecutor : + A local graph executor instance that executes on the remote device. """ - device_type_id = [ctx.device_type, ctx.device_id] - fcreate = get_global_func("tvm.graph_runtime.create") - return graph_runtime.GraphModule( + device_type_id = [device.device_type, device.device_id] + fcreate = get_global_func("tvm.graph_executor.create") + return graph_executor.GraphModule( fcreate(graph_json_str, mod, lookup_remote_linked_param, *device_type_id) ) -def create_local_debug_runtime(graph_json_str, mod, ctx, dump_root=None): - """Create a local debug runtime driving execution on the remote CPU context given. +def create_local_debug_executor(graph_json_str, mod, device, dump_root=None): + """Create a local debug runtime driving execution on the remote CPU device given. Parameters ---------- @@ -229,22 +229,22 @@ def create_local_debug_runtime(graph_json_str, mod, ctx, dump_root=None): mod : tvm.runtime.Module The remote module containing functions in graph_json_str. - ctx : tvm.Context - The remote CPU execution context. + device : tvm.runtime.Device + The remote CPU execution device. dump_root : Optional[str] If given, passed as dump_root= to GraphModuleDebug. Returns ------- - tvm.contrib.GraphRuntime : - A local graph runtime instance that executes on the remote device. + tvm.contrib.GraphExecutor : + A local graph executor instance that executes on the remote device. """ - device_type_id = [ctx.device_type, ctx.device_id] - fcreate = get_global_func("tvm.graph_runtime_debug.create") - return debug_runtime.GraphModuleDebug( + device_type_id = [device.device_type, device.device_id] + fcreate = get_global_func("tvm.graph_executor_debug.create") + return debug_executor.GraphModuleDebug( fcreate(graph_json_str, mod, lookup_remote_linked_param, *device_type_id), - [ctx], + [device], graph_json_str, dump_root=dump_root, ) diff --git a/python/tvm/relay/analysis/analysis.py b/python/tvm/relay/analysis/analysis.py index 48e9ce0643a9..661d7523ad77 100644 --- a/python/tvm/relay/analysis/analysis.py +++ b/python/tvm/relay/analysis/analysis.py @@ -28,7 +28,7 @@ from .feature import Feature -def context_analysis(mod, default_context): +def context_analysis(mod, default_device): """Analyze the device context information of each IR node in a Relay program. 
@@ -37,10 +37,10 @@ def context_analysis(mod, default_context): mod : tvm.IRModule The input module. - default_context : tvm.runtime.TVMContext + default_device : tvm.runtime.Device The default context allocated to an IR node. """ - return _ffi_api.ContextAnalysis(mod, default_context) + return _ffi_api.ContextAnalysis(mod, default_device) def post_order_visit(expr, fvisit): @@ -405,7 +405,7 @@ def search_fc_transpose(expr): def get_calibration_data(mod, data): """Get the calibration data of a given relay graph - This pass uses the graph runtime to get the calibration data of a module, which + This pass uses the graph executor to get the calibration data of a module, which includes the input and output values of each function. The returned data uses the GlobalVar of each function as a key. Users can further access the inputs and outputs by using `inputs` or `outputs` as the key. @@ -433,7 +433,7 @@ def get_calibration_data(mod, data): mod = _ffi_api.get_calibrate_module(mod) mod = transform.Inline()(mod) - ref_ex = build_module.create_executor("graph", mod=mod, ctx=cpu(0)) + ref_ex = build_module.create_executor("graph", mod=mod, device=cpu(0)) ref_res = ref_ex.evaluate()(**data) calib_data = {} diff --git a/python/tvm/relay/backend/graph_runtime_codegen.py b/python/tvm/relay/backend/graph_executor_codegen.py similarity index 89% rename from python/tvm/relay/backend/graph_runtime_codegen.py rename to python/tvm/relay/backend/graph_executor_codegen.py index 81ab4cb4de25..f24bf2c2b55b 100644 --- a/python/tvm/relay/backend/graph_runtime_codegen.py +++ b/python/tvm/relay/backend/graph_executor_codegen.py @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. """ -A compiler from a Relay expression to TVM's graph runtime. +A compiler from a Relay expression to TVM's graph executor. The compiler is built from a few pieces. @@ -29,9 +29,9 @@ graph langauge is composed of Node, NodeRef, InputNode, OpNode. This "little language" represents programs in TVM's graph format. -To connect to the graph runtime, we use a printer that converts our graph format +To connect to the graph executor, we use a printer that converts our graph format into TVM's JSON format. The resulting string can be loaded by -contrib.graph_runtime or any other TVM runtime compatible systems. +contrib.graph_executor or any other TVM runtime compatible systems. 
""" from tvm.runtime.ndarray import empty from tvm.relay import _build_module @@ -39,11 +39,11 @@ from tvm.tir import expr as _expr -class GraphRuntimeCodegen(object): +class GraphExecutorCodegen(object): """The compiler from Relay to the TVM runtime system.""" def __init__(self, mod, target): - self._mod = _build_module._GraphRuntimeCodegen() + self._mod = _build_module._GraphExecutorCodegen() self._init = self._mod["init"] self._codegen = self._mod["codegen"] self._get_graph_json = self._mod["get_graph_json"] @@ -87,7 +87,7 @@ def codegen(self, func): params = {} for key in param_names: arr = self._get_param_by_name(key) - param = empty(arr.shape, dtype=arr.dtype, ctx=arr.ctx) + param = empty(arr.shape, dtype=arr.dtype, device=arr.device) arr.copyto(param) params[key] = param return graph_json, lowered_func, params diff --git a/python/tvm/relay/backend/graph_runtime_factory.py b/python/tvm/relay/backend/graph_executor_factory.py similarity index 88% rename from python/tvm/relay/backend/graph_runtime_factory.py rename to python/tvm/relay/backend/graph_executor_factory.py index e92ae710ca0b..d6959d22e5c8 100644 --- a/python/tvm/relay/backend/graph_runtime_factory.py +++ b/python/tvm/relay/backend/graph_executor_factory.py @@ -14,16 +14,16 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -"""Graph runtime factory.""" +"""Graph executor factory.""" import warnings from ..._ffi.base import string_types from ..._ffi.registry import get_global_func from ...runtime import ndarray -class GraphRuntimeFactoryModule: - """Graph runtime factory module. - This is a module of graph runtime factory +class GraphExecutorFactoryModule: + """Graph executor factory module. + This is a module of graph executor factory Parameters ---------- @@ -43,7 +43,7 @@ class GraphRuntimeFactoryModule: def __init__(self, ir_mod, target, graph_json_str, libmod, libmod_name, params): assert isinstance(graph_json_str, string_types) - fcreate = get_global_func("tvm.graph_runtime_factory.create") + fcreate = get_global_func("tvm.graph_executor_factory.create") args = [] for k, v in params.items(): args.append(k) @@ -77,9 +77,9 @@ def __getitem__(self, item): def __iter__(self): warnings.warn( - "legacy graph runtime behavior of producing json / lib / params will be " + "legacy graph executor behavior of producing json / lib / params will be " "removed in the next release." - " Please see documents of tvm.contrib.graph_runtime.GraphModule for the " + " Please see documents of tvm.contrib.graph_executor.GraphModule for the " " new recommended usage.", DeprecationWarning, 2, diff --git a/python/tvm/relay/backend/interpreter.py b/python/tvm/relay/backend/interpreter.py index ba09094afca1..b62fca86668d 100644 --- a/python/tvm/relay/backend/interpreter.py +++ b/python/tvm/relay/backend/interpreter.py @@ -191,16 +191,16 @@ class Interpreter(Executor): mod : tvm.IRModule The module to support the execution. - ctx : tvmContext - The runtime context to run the code on. + device : Device + The runtime device to run the code on. target : tvm.Target The target option to build the function. 
""" - def __init__(self, mod, ctx, target): + def __init__(self, mod, device, target): self.mod = mod - self.ctx = ctx + self.device = device self.target = target def optimize(self): @@ -253,7 +253,7 @@ def _interp_wrapper(*args, **kwargs): mod = self.optimize() opt_expr = Call(mod["main"], relay_args) - _intrp = _backend.CreateInterpreter(mod, self.ctx, self.target) + _intrp = _backend.CreateInterpreter(mod, self.device, self.target) return _intrp(opt_expr) return _interp_wrapper diff --git a/python/tvm/relay/backend/vm.py b/python/tvm/relay/backend/vm.py index 6d8961b876f1..a86b29250052 100644 --- a/python/tvm/relay/backend/vm.py +++ b/python/tvm/relay/backend/vm.py @@ -200,11 +200,11 @@ def _update_target(self, target): raise ValueError("Target is not set in env or passed as argument.") tgts = {} if isinstance(target, (str, tvm.target.Target)): - dev_type = tvm.tir.IntImm("int32", tvm.nd.context(str(target)).device_type) + dev_type = tvm.tir.IntImm("int32", tvm.nd.device(str(target)).device_type) tgts[dev_type] = tvm.target.Target(target) elif isinstance(target, dict): for dev, tgt in target.items(): - dev_type = tvm.tir.IntImm("int32", tvm.nd.context(dev).device_type) + dev_type = tvm.tir.IntImm("int32", tvm.nd.device(dev).device_type) tgts[dev_type] = tvm.target.Target(tgt) else: raise TypeError( @@ -253,21 +253,21 @@ class VMExecutor(Executor): mod : :py:class:`~tvm.IRModule` The module to support the execution. - ctx : :py:class:`~tvmContext` - The runtime context to run the code on. + device : :py:class:`~tvm.runtime.Device` + The runtime device to run the code on. target : :py:class:`Target` The target option to build the function. """ - def __init__(self, mod, ctx, target): + def __init__(self, mod, device, target): if mod is None: raise RuntimeError("Must provide module to get VM executor.") self.mod = mod - self.ctx = ctx + self.device = device self.target = target self.executable = compile(mod, target) - self.vm = vm_rt.VirtualMachine(self.executable, ctx) + self.vm = vm_rt.VirtualMachine(self.executable, device) def _make_executor(self, expr=None): main = self.mod["main"] diff --git a/python/tvm/relay/build_module.py b/python/tvm/relay/build_module.py index 5201ac3f4fa6..174e89bf69a4 100644 --- a/python/tvm/relay/build_module.py +++ b/python/tvm/relay/build_module.py @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. """ -Construct the necessary state for the TVM graph runtime +Construct the necessary state for the TVM graph executor from a Relay expression. """ import warnings @@ -28,13 +28,13 @@ from tvm.target import Target from .. import nd as _nd, autotvm, register_func from ..target import Target -from ..contrib import graph_runtime as _graph_rt +from ..contrib import graph_executor as _graph_rt from . import _build_module from . import ty as _ty from . import expr as _expr from . 
import function as _function from .transform import InferType -from .backend import graph_runtime_factory as _graph_runtime_factory +from .backend import graph_executor_factory as _graph_executor_factory from .backend import interpreter as _interpreter from .backend.vm import VMExecutor @@ -46,11 +46,11 @@ def _update_target(target): tgts = {} if isinstance(target, (str, Target)): - dev_type = tvm_expr.IntImm("int32", _nd.context(str(target)).device_type) + dev_type = tvm_expr.IntImm("int32", _nd.device(str(target)).device_type) tgts[dev_type] = Target(target) elif isinstance(target, dict): for dev, tgt in target.items(): - dev_type = tvm_expr.IntImm("int32", _nd.context(dev).device_type) + dev_type = tvm_expr.IntImm("int32", _nd.device(dev).device_type) tgts[dev_type] = Target(tgt) else: raise TypeError( @@ -71,7 +71,7 @@ def _convert_param_map(params): class BuildModule(object): - """Build an IR module to run on TVM graph runtime. This class is used + """Build an IR module to run on TVM graph executor. This class is used to expose the `RelayBuildModule` APIs implemented in C++. """ @@ -111,8 +111,8 @@ def build(self, mod, target=None, target_host=None, params=None): Returns ------- - factory_module : tvm.relay.backend.graph_runtime_factory.GraphRuntimeFactoryModule - The runtime factory for the TVM graph runtime. + factory_module : tvm.relay.backend.graph_executor_factory.GraphExecutorFactoryModule + The runtime factory for the TVM graph executor. """ target = _update_target(target) target, target_host = Target.check_and_update_host_consistency( @@ -216,7 +216,7 @@ def _build_module_no_factory(mod, target=None, target_host=None, params=None, mo def build(ir_mod, target=None, target_host=None, params=None, mod_name="default"): # fmt: off # pylint: disable=line-too-long - """Helper function that builds a Relay function to run on TVM graph runtime. + """Helper function that builds a Relay function to run on TVM graph executor. Parameters ---------- @@ -246,7 +246,7 @@ def build(ir_mod, target=None, target_host=None, params=None, mod_name="default" Returns ------- graph_json : str - The json string that can be accepted by graph runtime. + The json string that can be accepted by graph executor. mod : tvm.Module The module containing necessary libraries. @@ -288,10 +288,10 @@ def build(ir_mod, target=None, target_host=None, params=None, mod_name="default" with tophub_context: bld_mod = BuildModule() graph_json, runtime_mod, params = bld_mod.build(mod=ir_mod, target=target, params=params) - runtime_mod = _graph_runtime_factory.GraphRuntimeFactoryModule( + executor_factory = _graph_executor_factory.GraphExecutorFactoryModule( ir_mod, target, graph_json, runtime_mod, mod_name, params ) - return runtime_mod + return executor_factory def optimize(mod, target=None, params=None): @@ -380,17 +380,17 @@ class GraphExecutor(_interpreter.Executor): mod : :py:class:`~tvm.IRModule` The module to support the execution. - ctx : :py:class:`TVMContext` - The runtime context to run the code on. + device : :py:class:`Device` + The runtime device to run the code on. target : :py:class:`Target` The target option to build the function. 
""" - def __init__(self, mod, ctx, target): + def __init__(self, mod, device, target): assert mod is not None self.mod = mod - self.ctx = ctx + self.device = device self.target = target def _make_executor(self, expr=None): @@ -399,9 +399,11 @@ def _make_executor(self, expr=None): self.mod = InferType()(self.mod) ret_type = self.mod["main"].checked_type.ret_type if _ty.is_dynamic(ret_type): - raise ValueError("Graph Runtime only supports static graphs, got output type", ret_type) + raise ValueError( + "Graph Executor only supports static graphs, got output type", ret_type + ) mod = build(self.mod, target=self.target) - gmodule = _graph_rt.GraphModule(mod["default"](self.ctx)) + gmodule = _graph_rt.GraphModule(mod["default"](self.device)) def _unflatten(flat_iter, cur_type): if isinstance(cur_type, _ty.TensorType): @@ -430,7 +432,7 @@ def _graph_wrapper(*args, **kwargs): return _graph_wrapper -def create_executor(kind="debug", mod=None, ctx=None, target="llvm"): +def create_executor(kind="debug", mod=None, device=None, target="llvm"): """Factory function to create an executor. Example @@ -451,14 +453,14 @@ def create_executor(kind="debug", mod=None, ctx=None, target="llvm"): ---------- kind : str The type of executor. Avaliable options are `debug` for the - interpreter, `graph` for the graph runtime, and `vm` for the virtual + interpreter, `graph` for the graph executor, and `vm` for the virtual machine. mod : :py:class:`~tvm.IRModule` The Relay module containing collection of functions - ctx : :py:class:`tvmContext` - The context to execute the code. + device : :py:class:`Device` + The device to execute the code. target : :py:class:`tvm.Target` The corresponding context @@ -469,17 +471,17 @@ def create_executor(kind="debug", mod=None, ctx=None, target="llvm"): """ if mod is None: mod = IRModule() - if ctx is not None: - assert ctx.device_type == _nd.context(str(target), 0).device_type + if device is not None: + assert device.device_type == _nd.device(str(target), 0).device_type else: - ctx = _nd.context(str(target), 0) + device = _nd.device(str(target), 0) if isinstance(target, str): target = Target(target) if kind == "debug": - return _interpreter.Interpreter(mod, ctx, target) + return _interpreter.Interpreter(mod, device, target) if kind == "graph": - return GraphExecutor(mod, ctx, target) + return GraphExecutor(mod, device, target) if kind == "vm": - return VMExecutor(mod, ctx, target) + return VMExecutor(mod, device, target) raise RuntimeError("unknown execution strategy: {0}".format(kind)) diff --git a/python/tvm/relay/frontend/common.py b/python/tvm/relay/frontend/common.py index 2db420a40992..c2546205c571 100644 --- a/python/tvm/relay/frontend/common.py +++ b/python/tvm/relay/frontend/common.py @@ -530,13 +530,13 @@ def infer_value(input_val, params, mod=None): try: # TODO(kevinthesun): Use VM for all cases. 
# pylint: disable=import-outside-toplevel - from tvm.contrib import graph_runtime + from tvm.contrib import graph_executor func = _function.Function(analysis.free_vars(input_val), input_val) with tvm.transform.PassContext(opt_level=0): lib = tvm.relay.build(func, target="llvm", params=params) - ctx = tvm.cpu(0) - m = graph_runtime.GraphModule(lib["default"](ctx)) + dev = tvm.cpu(0) + m = graph_executor.GraphModule(lib["default"](dev)) m.run() return m.get_output(0) except Exception: @@ -544,7 +544,7 @@ def infer_value(input_val, params, mod=None): mod["main"] = _function.Function(analysis.free_vars(input_val), input_val) else: mod = IRModule.from_expr(input_val) - exc = tvm.relay.create_executor("debug", mod=mod, ctx=tvm.cpu(), target="llvm") + exc = tvm.relay.create_executor("debug", mod=mod, device=tvm.cpu(), target="llvm") inputs = [] for param in mod["main"].params: inputs.append(params[param.name_hint]) diff --git a/python/tvm/relay/frontend/onnx.py b/python/tvm/relay/frontend/onnx.py index 860753d6cd0b..01c22d0f9fab 100644 --- a/python/tvm/relay/frontend/onnx.py +++ b/python/tvm/relay/frontend/onnx.py @@ -103,10 +103,11 @@ def get_numpy(tensor_proto): def get_type(elem_type): """Converts onnx integer datatype to numpy datatype""" try: - from onnx import TensorProto + from onnx.mapping import TENSOR_TYPE_TO_NP_TYPE except ImportError as e: raise ImportError("Unable to import onnx which is required {}".format(e)) - return TensorProto.DataType.Name(elem_type).lower() + + return str(TENSOR_TYPE_TO_NP_TYPE[elem_type]) def get_info(info_proto): @@ -157,7 +158,7 @@ def revert_caffe2_pad(pads): return pads -def get_pad_pair(input1d, kernel1d, stride1d): +def get_pad_pair(input1d, kernel1d, stride1d, mode): """infer pad size""" if input1d % stride1d == 0: pad = max(kernel1d - stride1d, 0) @@ -165,6 +166,8 @@ def get_pad_pair(input1d, kernel1d, stride1d): pad = max(kernel1d - (input1d % stride1d), 0) pad_before = pad // 2 pad_after = pad - pad_before + if "LOWER" in mode: + return [pad_after, pad_before] return [pad_before, pad_after] @@ -280,9 +283,9 @@ def _impl_v1(cls, inputs, attr, params): pad_tuple = [] for axis in range(len(input_shape) - 2): axis_shape = input_shape[2 + axis] - stride = attr["strides"][axis] + stride = attr.get("strides", [1] * ndim)[axis] kernel = attr["kernel_shape"][axis] - pad = get_pad_pair(axis_shape, kernel, stride) + pad = get_pad_pair(axis_shape, kernel, stride, attr["auto_pad"]) pad_tuple.append(pad) pad_tuple = tuple([val for pair in zip(*pad_tuple) for val in pair]) attr["pads"] = pad_tuple @@ -394,7 +397,7 @@ def autopad(data, strides, kernel_shape, dilations, ndim, pad_type="constant", d # pad N and C with zeros pad = _op.concatenate([_op.const(np.zeros([2, 2], dtype="int64"), dtype="int64"), pad], axis=0) - return _op.nn.pad(data, pad, _op.const(0.0), pad_type) + return _op.nn.pad(data, fold_constant(pad), _op.const(0.0), pad_type) class Conv(OnnxOpConverter): @@ -444,9 +447,15 @@ class ConvTranspose(OnnxOpConverter): @classmethod def _impl_v1(cls, inputs, attr, params): # get number of channels - channels = infer_channels(inputs[1], True) + out_type = infer_type(inputs[1]) + out_shapes = [get_const_tuple(out_type.checked_type.shape)] + channels = out_shapes[0][1] attr["channels"] = channels - groups = attr.pop("group") + groups = attr.get("group", 1) + + if "kernel_shape" not in attr: + attr["kernel_shape"] = out_shapes[0][2:] + attr["groups"] = groups # infer pads for auto_pad data = inputs[0] @@ -528,13 +537,11 @@ def _impl_v1(cls, inputs, attr, 
params): if not transB: inputs[1] = _op.transpose(inputs[1], axes=(1, 0)) inputs[0] = _op.nn.batch_flatten(inputs[0]) - if alpha != 1.0: inputs[0] *= _expr.const(alpha) out = _op.nn.dense(inputs[0], inputs[1], units=channels) - if len(inputs) == 3: - return _op.nn.bias_add(out, _expr.const(beta) * inputs[2]) + out = out + _expr.const(beta) * inputs[2] return out @@ -618,7 +625,7 @@ def _impl_v1(cls, inputs, attr, params): # Note: attr['fmod'] determines whether the operator should behave like np.fmod or np.mod. # attr['fmod'] == 0 will behave as np.mod and attr['fmod'] == 1 will force fmod treatment. # The relay equivalent of np.fmod is relay.mod and np.mod is relay.floor_mod - if attr["fmod"] == 0: + if attr.get("fmod", 0) == 0: op_name = "floor_mod" else: op_name = "mod" @@ -802,7 +809,6 @@ def _impl_v11(cls, inputs, attr, params): pad_width_expr = fold_constant(_op.transpose(_op.reshape(pads, (2, -1)))) pad_mode = attr.get("mode", b"constant").decode("utf-8") - if not pad_mode in ["constant", "edge", "reflect"]: raise tvm.error.OpAttributeInvalid( "Value " + pad_mode + ' in attribute "mode" is invalid for operator Pad.' @@ -849,12 +855,18 @@ class Flatten(OnnxOpConverter): @classmethod def _impl_v1(cls, inputs, attr, params): axis = attr.get("axis", 1) + ishape = _op.shape_of(inputs[0]) + ndim = infer_shape(ishape)[0] + if axis < 0: + axis = axis + ndim + if axis == 1: out = _op.nn.batch_flatten(inputs[0]) else: - newshape = [0] * (axis + 1) - newshape[axis] = -1 - out = _op.reshape(inputs[0], list(newshape)) + pre_shape = _op.prod(_op.strided_slice(ishape, [0], [axis], [1]), keepdims=True) + post_shape = _op.prod(_op.strided_slice(ishape, [axis], [ndim], [1]), keepdims=True) + newshape = _op.concatenate([pre_shape, post_shape], axis=0) + out = _op.reshape(inputs[0], newshape) return out @@ -1036,7 +1048,7 @@ def _impl_v9(cls, inputs, attr, params): # in 3d case, we use the purely static op if dims == 5: - if isinstance(scales, _expr.Call): + if isinstance(scales, _expr.Expr): scale_h = _op.take(scales, _op.const(3)) scale_w = _op.take(scales, _op.const(4)) scale_d = _op.take(scales, _op.const(1)) @@ -1052,7 +1064,7 @@ def _impl_v9(cls, inputs, attr, params): ) # in 2d case, use dynamic op else: - if isinstance(scales, _expr.Call): + if isinstance(scales, _expr.Expr): scale_h = _op.take(scales, _op.const(3)) scale_w = _op.take(scales, _op.const(4)) else: @@ -1247,7 +1259,13 @@ class Gather(OnnxOpConverter): @classmethod def _impl_v1(cls, inputs, attr, params): axis = attr.get("axis", 0) - return AttrCvt("take", extras={"axis": axis})(inputs, {}) + data = inputs[0] + indices = inputs[1] + ind_dtype = infer_type(indices).checked_type.dtype + # Normalize the indices to a positive range + s = _op.take(_op.shape_of(data, dtype=ind_dtype), _op.const(axis)) + indices = _op.where(indices < _op.const(0, ind_dtype), indices + s, indices) + return _op.take(data, indices, axis) class GatherElements(OnnxOpConverter): @@ -1258,6 +1276,10 @@ def _impl_v1(cls, inputs, attr, params): data = inputs[0] indices = inputs[1] axis = attr.get("axis", 0) + ind_dtype = infer_type(indices).checked_type.dtype + # Normalize the indices to a positive range + s = _op.take(_op.shape_of(data, dtype=ind_dtype), _op.const(axis)) + indices = _op.where(indices < _op.const(0, ind_dtype), indices + s, indices) return _op.gather(data, axis, indices) @@ -1318,8 +1340,8 @@ class Maximum(OnnxOpConverter): @classmethod def _impl_v1(cls, inputs, attr, params): - if not isinstance(inputs, (list, onnx_input)) or len(inputs) < 2: - 
raise ValueError("Expect minimum 2 inputs") + if len(inputs) == 1: + return inputs[0] _max = inputs[0] for i in range(1, len(inputs)): _max = AttrCvt("maximum")([_max, inputs[i]], {}) @@ -1331,8 +1353,8 @@ class Minimum(OnnxOpConverter): @classmethod def _impl_v1(cls, inputs, attr, params): - if not isinstance(inputs, (list, onnx_input)) or len(inputs) < 2: - raise ValueError("Expect minimum 2 inputs") + if len(inputs) == 1: + return inputs[0] _min = inputs[0] for i in range(1, len(inputs)): _min = AttrCvt("minimum")([_min, inputs[i]], {}) @@ -1344,8 +1366,8 @@ class Mean(OnnxOpConverter): @classmethod def _impl_v1(cls, inputs, attr, params): - if not isinstance(inputs, (list, onnx_input)) or len(inputs) < 2: - raise ValueError("Expect minimum 2 inputs") + if len(inputs) == 1: + return inputs[0] # avoid overflow concat = _op.concatenate([_op.expand_dims(x, axis=0) for x in inputs], axis=0) return _op.mean(concat, axis=0, keepdims=False) @@ -1485,6 +1507,8 @@ class ArgMax(OnnxOpConverter): @classmethod def _impl_v1(cls, inputs, attr, params): + if "select_last_index" in attr: + raise NotImplementedError("select_last_index not supported in ArgMax") axis = attr.get("axis", 0) keepdims = attr.get("keepdims", True) attr = {"axis": axis, "keepdims": keepdims} @@ -1496,6 +1520,8 @@ class ArgMin(OnnxOpConverter): @classmethod def _impl_v1(cls, inputs, attr, params): + if "select_last_index" in attr: + raise NotImplementedError("select_last_index not supported in ArgMin") axis = attr.get("axis", 0) keepdims = attr.get("keepdims", True) attr = {"axis": axis, "keepdims": keepdims} @@ -1510,7 +1536,35 @@ def _impl_v1(cls, inputs, attr, params): # set default value when axis is not set in the model if "axis" not in attr: attr["axis"] = 1 - return AttrCvt("softmax", transforms={"axis": ("axis", 1)})(inputs, attr, params) + axis = attr["axis"] + ndim = len(infer_shape(inputs[0])) + if axis < 0: + axis += ndim + axes = list(range(axis, ndim)) + x = inputs[0] + m = _op.max(x, axes, keepdims=True) + e = _op.exp(x - m) + return e / _op.sum(e, axes, keepdims=True) + + +class LogSoftmax(OnnxOpConverter): + """Operator converter for Softmax.""" + + @classmethod + def _impl_v1(cls, inputs, attr, params): + # set default value when axis is not set in the model + if "axis" not in attr: + attr["axis"] = 1 + axis = attr["axis"] + ndim = len(infer_shape(inputs[0])) + if axis < 0: + axis += ndim + axes = list(range(axis, ndim)) + x = inputs[0] + m = _op.max(x, axes, keepdims=True) + e = _op.exp(x - m) + s = _op.sum(e, axes, keepdims=True) + return x - m - _op.log(s) class OneHot(OnnxOpConverter): @@ -1520,14 +1574,24 @@ class OneHot(OnnxOpConverter): def _impl_v9(cls, inputs, attr, params): # Extract relay one_hot inputs. indices, depth, values = inputs + ndim = len(infer_shape(indices)) # Split onnx on off values into two separate expressions. off_value, on_value = _op.take(values, _op.const(0)), _op.take(values, _op.const(1)) # Extract the datatype of the output from on_value. 
dtype = infer_type(on_value).checked_type.dtype + ind_dtype = infer_type(indices).checked_type.dtype + # Normalize the indices to a positive range + indices = _op.where( + indices < _op.const(0, ind_dtype), indices + _op.cast(depth, ind_dtype), indices + ) # set default value when axis is not set in the model if "axis" not in attr: attr["axis"] = -1 - return _op.one_hot(indices, on_value, off_value, depth, int(attr["axis"]), dtype=dtype) + axis = attr["axis"] + if axis < 0: + axis += ndim + 1 + + return _op.one_hot(indices, on_value, off_value, depth, axis, dtype=dtype) class ConstantOfShape(OnnxOpConverter): @@ -1552,7 +1616,7 @@ class Constant(OnnxOpConverter): @classmethod def _impl_v9(cls, inputs, attr, params): if "value" not in attr: - raise "No Value in Constant" + raise tvm.errors.OpAttributeRequired("no value in Constant") np_value = get_numpy(attr.pop("value")) dtype = np_value.dtype.name value = _expr.const(np_value, dtype) @@ -2042,7 +2106,7 @@ def _impl_v1(cls, inputs, attr, params): largest = attr.get("largest", 1) if largest == 0: - raise ValueError("TVM only supports finding TopK largest elements") + raise NotImplementedError("TVM only supports finding TopK largest elements") return _op.topk(inputs[0], inputs[1], axis=axis, dtype="int64") @@ -2087,7 +2151,7 @@ def _impl_v1(cls, inputs, attr, params): batch_indices = inputs[2] mode = attr.get("mode", b"avg") if mode not in (b"avg", b"max"): - raise ValueError("RoiAlign in Relay only uses avg and max modes") + raise NotImplementedError("RoiAlign in Relay only uses avg and max modes") output_height = attr.get("output_height", 1) output_width = attr.get("output_width", 1) @@ -2128,7 +2192,8 @@ def _impl_v11(cls, inputs, attr, params): result = inputs[0] for i, op in enumerate([_op.tensor.maximum, _op.tensor.minimum]): if i < len(inputs) - 1: - result = op(result, inputs[i + 1]) + if inputs[i + 1] is not None: + result = op(result, inputs[i + 1]) return result @@ -2393,9 +2458,10 @@ def _impl_v10(cls, inputs, attr, params): dtype = infer_type(boxes).checked_type.dtype if "center_point_box" in attr: - assert ( - attr["center_point_box"] == 0 - ), "Only support center_point_box = 0 in onnx importer right now" + if attr["center_point_box"] != 0: + raise NotImplementedError( + "Only support center_point_box = 0 in ONNX NonMaxSuprresion" + ) if iou_threshold is None: iou_threshold = _expr.const(0.0, dtype="float32") @@ -2453,7 +2519,7 @@ def _first_cond( nms_size_out, ): # Loop over classes, end when i == C - return _op.min(_op.less(i, C)) + return _op.take(_op.less(i, C), _expr.const(0)) def _first_body( i, @@ -2561,7 +2627,7 @@ def _first_body( def _inner_cond(i, j, C, onnx_out, nms_size, out): # inner loop over number of classes - return _op.min(_op.less(j, C)) + return _op.take(_op.less(j, C), _expr.const(0)) def _inner_body(i, j, C, onnx_out, nms_size, out): # slice to get current batch and class for valid box indicator @@ -2591,7 +2657,7 @@ def _inner_body(i, j, C, onnx_out, nms_size, out): def _outer_cond(i, B, C, onnx_out, nms_size_out, out): # Outer loop is over batch size - return _op.min(_op.less(i, B)) + return _op.take(_op.less(i, B), _expr.const(0)) def _outer_body(i, B, C, onnx_out, nms_size_out, out): # Outer loop just calls inner loop @@ -2629,10 +2695,10 @@ def _outer_body(i, B, C, onnx_out, nms_size_out, out): # Call the second loop, rework outputs into correct form init_count = _op.const(np.array([0]).astype("int64"), dtype="int64") - init_out = _op.const(np.array([]).reshape([0, 3]).astype("int64"), 
dtype="int64") + init_out = _op.const(np.array([1, 1, 1]).reshape([1, 3]).astype("int64"), dtype="int64") loop_vals = outer_loop(init_count, B, C, onnx_output, nms_size_output, init_out) - - return _expr.TupleGetItem(loop_vals, 5) + loop_out = _expr.TupleGetItem(loop_vals, 5) + return _op.strided_slice(loop_out, [1, 0], shape_of(loop_out), [1, 1]) # compatible operators that do NOT require any conversion. @@ -2718,7 +2784,7 @@ def _get_convert_map(opset): "Softplus": Softplus.get_converter(opset), # softmax default axis is different in onnx "Softmax": Softmax.get_converter(opset), - "LogSoftmax": AttrCvt("log_softmax", {"axis": ("axis", 1)}), + "LogSoftmax": LogSoftmax.get_converter(opset), "OneHot": OneHot.get_converter(opset), # 'Hardmax' "Softsign": Softsign.get_converter(opset), @@ -2914,7 +2980,7 @@ def from_onnx(self, graph, opset, get_output_expr=False): else: self._num_input += 1 if i_name in self._shape: - i_shape = self._shape[i_name] + i_shape = self._shape.pop(i_name) else: if "?" in str(i_shape): warning_msg = ( @@ -2929,6 +2995,11 @@ def from_onnx(self, graph, opset, get_output_expr=False): dtype = d_type self._nodes[i_name] = new_var(i_name, shape=i_shape, dtype=dtype) self._inputs[i_name] = self._nodes[i_name] + assert ( + len(self._shape) == 0 + ), "User specified the shape for inputs that weren't found in the graph: " + str( + self._shape + ) # get list of unsupported ops convert_map = _get_convert_map(opset) unsupported_ops = set() @@ -2953,6 +3024,8 @@ def from_onnx(self, graph, opset, get_output_expr=False): for i in node.input: if i != "": inputs[i] = self._nodes[self._renames.get(i, i)] + else: + inputs[i] = None i_name = self._parse_value_proto(node) node_output = self._fix_outputs(op_name, node.output) attr["tvm_custom"] = {} diff --git a/python/tvm/relay/frontend/pytorch.py b/python/tvm/relay/frontend/pytorch.py index c709e2b4e7bd..cb9ea6a043f4 100644 --- a/python/tvm/relay/frontend/pytorch.py +++ b/python/tvm/relay/frontend/pytorch.py @@ -1094,8 +1094,7 @@ def instance_norm(self, inputs, input_types): data, gamma, beta, axis=1, epsilon=epsilon, center=center, scale=scale ) - @staticmethod - def get_dims(data): + def get_dims(self, data): import torch if isinstance(data, _expr.Expr): @@ -1354,47 +1353,54 @@ def softplus(self, inputs, input_types): beta = _expr.const(float(inputs[1]), dtype=dtype) return _op.log(_op.exp(inputs[0] * beta) + _expr.const(1.0, dtype=dtype)) / beta - def avg_pool2d(self, inputs, input_types): - data = inputs[0] - - pool_size = self.convert_const_list(inputs[1]) - strides = self.convert_const_list(inputs[2] if inputs[2] else pool_size) - padding = inputs[3] - ceil_mode = int(inputs[4]) - count_include_pad = int(inputs[5]) - - def func(x): - return _op.nn.avg_pool2d( - x, - pool_size=pool_size, - strides=strides, - padding=padding, - ceil_mode=ceil_mode, - count_include_pad=count_include_pad, - ) + def make_avg_pool(self, dim): + def avg_pool(inputs, input_types): + data = inputs[0] - if self.is_quantized_tensor(data): - return qnn_torch.apply_with_upcast(data, func) + pool_size = self.convert_const_list(inputs[1]) + strides = self.convert_const_list(inputs[2] if inputs[2] else pool_size) + padding = inputs[3] + ceil_mode = int(inputs[4]) + count_include_pad = int(inputs[5]) - return func(data) + def func(x): + if dim == 1: + return _op.nn.avg_pool1d( + x, + pool_size=pool_size, + strides=strides, + padding=padding, + ceil_mode=ceil_mode, + count_include_pad=count_include_pad, + ) + elif dim == 2: + return _op.nn.avg_pool2d( + x, + 
pool_size=pool_size, + strides=strides, + padding=padding, + ceil_mode=ceil_mode, + count_include_pad=count_include_pad, + ) + elif dim == 3: + return _op.nn.avg_pool3d( + x, + pool_size=pool_size, + strides=strides, + padding=padding, + ceil_mode=ceil_mode, + count_include_pad=count_include_pad, + ) + else: + msg = "Average Pooling dimension should be between 1 and 3" + raise RuntimeError(msg) - def avg_pool3d(self, inputs, input_types): - data = inputs[0] + if self.is_quantized_tensor(data): + return qnn_torch.apply_with_upcast(data, func) - pool_size = inputs[1] - strides = inputs[2] if inputs[2] else pool_size - padding = inputs[3] - ceil_mode = int(inputs[4]) - count_include_pad = int(inputs[5]) + return func(data) - return _op.nn.avg_pool3d( - data, - pool_size=pool_size, - strides=strides, - padding=padding, - ceil_mode=ceil_mode, - count_include_pad=count_include_pad, - ) + return avg_pool def linear(self, inputs, input_types): # https://pytorch.org/docs/stable/nn.functional.html#linear @@ -1575,15 +1581,31 @@ def matmul(self, inputs, input_types): # When performing a batch matmul, we need to properly handle N-dim shapes. if len(a_shape) > 2 or len(b_shape) > 2: - # Convert a and b into 3 dimensional tensors. - a = _op.reshape(inputs_0, [-1, a_shape[-2], a_shape[-1]]) - b = _op.reshape(inputs_1, [-1, b_shape[-2], b_shape[-1]]) + # Convert a into a 3 dimensional tensors. + need_reshape_output = False + if len(a_shape) != 3: + a = _op.reshape(inputs_0, [-1, a_shape[-2], a_shape[-1]]) + need_reshape_output = True + else: + a = inputs_0 + # Transpose matrix dimensions of b. - b = _op.transpose(b, [0, 2, 1]) + trans_axes = list(range(len(b_shape))) + trans_axes[-2], trans_axes[-1] = trans_axes[-1], trans_axes[-2] + b = _op.transpose(inputs_1, trans_axes) + + # Convert b into a 3 dimensional tensor. Note that the last two dimensions + # are transposed. + if len(b_shape) != 3: + b = _op.reshape(b, [-1, b_shape[-1], b_shape[-2]]) + # Perform a batch matmul. output = _op.nn.batch_matmul(a, b) + # Reshape output to original dimensions. - return _op.reshape(output, [*a_shape[:-2], a_shape[-2], b_shape[-1]]) + if need_reshape_output: + return _op.reshape(output, [*a_shape[:-2], a_shape[-2], b_shape[-1]]) + return output # Otherwise a simple dense op will get the job done. 
if len(b_shape) == 1: @@ -1673,8 +1695,20 @@ def pad(inputs, input_types): def clamp(self, inputs, input_types): data = inputs[0] - amin = inputs[1] if inputs[1] else np.finfo(np.float32).min - amax = inputs[2] if inputs[2] else np.finfo(np.float32).max + + def get_v(v, default_v): + if isinstance(v, _expr.Constant): + return float(v.data.asnumpy()) + if isinstance(v, _expr.Expr): + infer_v, success = try_infer_value(v, lambda ret: float(ret)) + if success: + return infer_v + if v is not None: + return v + return default_v + + amin = get_v(inputs[1], np.finfo(np.float32).min) + amax = get_v(inputs[2], np.finfo(np.float32).max) return _op.clip(data, amin, amax) def to(self, inputs, input_types): @@ -2323,8 +2357,9 @@ def create_convert_map(self): "aten::log_softmax": self.log_softmax, "aten::sigmoid": self.sigmoid, "aten::softplus": self.softplus, - "aten::avg_pool2d": self.avg_pool2d, - "aten::avg_pool3d": self.avg_pool3d, + "aten::avg_pool1d": self.make_avg_pool(1), + "aten::avg_pool2d": self.make_avg_pool(2), + "aten::avg_pool3d": self.make_avg_pool(3), "aten::linear": self.linear, "aten::dropout": self.dropout, "aten::dropout_": self.dropout, diff --git a/python/tvm/relay/frontend/qnn_torch.py b/python/tvm/relay/frontend/qnn_torch.py index 2b85a1f3a1be..2dd84b650bd2 100644 --- a/python/tvm/relay/frontend/qnn_torch.py +++ b/python/tvm/relay/frontend/qnn_torch.py @@ -353,6 +353,7 @@ def add_input_quant_params_to_op_inputs(graph): "quantized::mul": 2, "aten::dequantize": 1, "aten::mean": 1, + "aten::upsample_nearest2d": 1, "aten::upsample_bilinear2d": 1, "aten::relu_": 1, "aten::relu": 1, diff --git a/python/tvm/relay/frontend/tensorflow.py b/python/tvm/relay/frontend/tensorflow.py index c79c495b0360..1946223a50a4 100644 --- a/python/tvm/relay/frontend/tensorflow.py +++ b/python/tvm/relay/frontend/tensorflow.py @@ -44,6 +44,17 @@ __all__ = ["from_tensorflow"] +def check_symbolic_shape(shape): + return not all([isinstance(dim, (int, tvm.tir.IntImm)) for dim in shape]) + + +def list_shape_of(tensor, ndim): + shape_tensor = _op.shape_of(tensor) + return [ + _op.strided_slice(shape_tensor, begin=[i], end=[i + 1], strides=[1]) for i in range(ndim) + ] + + def _get_pad_pair(input1d, kernel1d, stride1d): if input1d % stride1d == 0: pad = max(kernel1d - stride1d, 0) @@ -1022,13 +1033,31 @@ def _impl(inputs, attr, params, mod): input_y = inputs[1] orig_shape_x = _infer_shape(input_x, mod) orig_shape_y = _infer_shape(input_y, mod) + ndim = len(orig_shape_x) + + is_static = not check_symbolic_shape(orig_shape_x) + + if ndim > 3 and not is_static: + shape_of_x = list_shape_of(inputs[0], ndim) + shape_of_y = list_shape_of(inputs[1], ndim) # reshape n-dimensional batch matmul into 3d - if len(orig_shape_x) > 3: + if ndim > 3: outer_dims = [orig_shape_x[i] for i in range(0, len(orig_shape_x) - 2)] - num_outer_elts = np.prod(outer_dims) - new_shape_x = (num_outer_elts, orig_shape_x[-2], orig_shape_x[-1]) - new_shape_y = (num_outer_elts, orig_shape_y[-2], orig_shape_y[-1]) + if is_static: + num_outer_elts = np.prod(outer_dims) + new_shape_x = (num_outer_elts, orig_shape_x[-2], orig_shape_x[-1]) + new_shape_y = (num_outer_elts, orig_shape_y[-2], orig_shape_y[-1]) + else: # handle dynamic shape (dyn.reshape op) + # new shape = [prod(shape[:-2]), -2, -1] + new_shape_x = [_op.const(1), shape_of_x[-2], shape_of_x[-1]] + new_shape_y = [_op.const(1), shape_of_y[-2], shape_of_y[-1]] + for i in range(ndim - 2): + new_shape_x[0] *= shape_of_x[i] + new_shape_y[0] *= shape_of_y[i] + new_shape_x = 
_op.concatenate(_op.Tuple(new_shape_x), axis=0) + new_shape_y = _op.concatenate(_op.Tuple(new_shape_y), axis=0) + input_x = _op.reshape(input_x, newshape=new_shape_x) input_y = _op.reshape(input_y, newshape=new_shape_y) @@ -1039,12 +1068,19 @@ def _impl(inputs, attr, params, mod): ret = get_relay_op("batch_matmul")(input_x, input_y) # reshape result back to n-dimensional - if len(orig_shape_x) > 3: - final_shape = list(orig_shape_x) - final_shape[-2] = orig_shape_x[-1] if adj_x else orig_shape_x[-2] - final_shape[-1] = orig_shape_y[-2] if adj_y else orig_shape_y[-1] - ret = _op.reshape(ret, newshape=final_shape) + if ndim > 3: + if is_static: + final_shape = list(orig_shape_x) + final_shape[-2] = orig_shape_x[-1] if adj_x else orig_shape_x[-2] + final_shape[-1] = orig_shape_y[-2] if adj_y else orig_shape_y[-1] + else: + # calculate the resulting shape = [shape[:-2], 0, 0] + final_shape = list(shape_of_x) + final_shape[-2] = shape_of_x[-1] if adj_x else shape_of_x[-2] + final_shape[-1] = shape_of_y[-2] if adj_y else shape_of_y[-1] + final_shape = _op.concatenate(_op.Tuple(final_shape), axis=0) + ret = _op.reshape(ret, newshape=final_shape) return ret return _impl @@ -1286,6 +1322,40 @@ def _impl(inputs, attr, params, mod): return _impl +def _sparse_tensor_dense_add(): + # Sparse utility from scipy + from scipy.sparse import csr_matrix + + def _impl(inputs, attr, params, mod): + assert ( + len(inputs) == 4 + ), "There should be 4 input tensors [sparse_indices, sparse_values, sparse_shape, dense]." + + indices_tensor = _infer_value(inputs[0], params, mod).asnumpy() + values_tensor = _infer_value(inputs[1], params, mod).asnumpy() + dense_shape_tensor = _infer_value(inputs[2], params, mod).asnumpy() + + data = inputs[3] + + rows = [x[0] for x in indices_tensor] + cols = [x[1] for x in indices_tensor] + + # Create scipy sparse Tensor(CSR) + weight_sp = csr_matrix( + (values_tensor, (rows, cols)), shape=tuple(dense_shape_tensor.tolist()) + ) + + weight_data = _expr.const(weight_sp.data, weight_sp.data.dtype) + weight_indptrs = _expr.const(weight_sp.indptr, weight_sp.indptr.dtype) + weight_indices = _expr.const(weight_sp.indices, weight_sp.indices.dtype) + + ret = _op.nn.sparse_add(data, [weight_data, weight_indices, weight_indptrs]) + + return ret + + return _impl + + def _identity(): def _impl(inputs, attr, params, mod): return inputs[0] @@ -2787,6 +2857,7 @@ def _impl(inputs, attr, params, mod): "SparseSegmentSqrtNWithNumSegments": _sparse_segment_sum_sqrtn_with_num_segments(), "SparseSegmentMean": _sparse_segment_mean(), "SparseSegmentMeanWithNumSegments": _sparse_segment_mean_with_num_segments(), + "SparseTensorDenseAdd": _sparse_tensor_dense_add(), "Split": _split(False), "SplitV": _split(True), "Sqrt": AttrCvt("sqrt"), diff --git a/python/tvm/relay/frontend/tflite.py b/python/tvm/relay/frontend/tflite.py index 1b593ad8dea3..a5c9a586e275 100644 --- a/python/tvm/relay/frontend/tflite.py +++ b/python/tvm/relay/frontend/tflite.py @@ -2336,11 +2336,18 @@ def convert_cast(self, op): input_tensor = input_tensors[0] in_expr = self.get_expr(input_tensor.tensor_idx) - assert op.BuiltinOptionsType() == BuiltinOptions.CastOptions - op_options = op.BuiltinOptions() - cast_options = CastOptions() - cast_options.Init(op_options.Bytes, op_options.Pos) - cast_dtype = cast_options.OutDataType() + # MLIR-based converter outputs no BuiltinOptions for Cast operator. In this + # case the output type can be derived from the Cast operator output tensor. 
+ # When TOCO converter is used there will be "normal" BuiltinOptions.CastOptions + # with output type. + if op.BuiltinOptions() is not None: + assert op.BuiltinOptionsType() == BuiltinOptions.CastOptions + op_options = op.BuiltinOptions() + cast_options = CastOptions() + cast_options.Init(op_options.Bytes, op_options.Pos) + cast_dtype = cast_options.OutDataType() + else: + cast_dtype = self.get_output_tensors(op)[0].tensor.Type() out = _op.cast(in_expr, self.get_tensor_type_str(cast_dtype)) @@ -3093,7 +3100,7 @@ def convert_detection_postprocess(self, op): valid_count = ret[0] # keep only the top 'max_detections' rows ret = _op.strided_slice( - ret[1], [0, 0, 0], [batch_size, custom_options["max_detections"], anchor_boxes] + ret[1], [0, 0, 0], [batch_size, custom_options["max_detections"], 6] ) # the output needs some reshaping to match tflite ret = _op.split(ret, 6, axis=2) diff --git a/python/tvm/relay/op/_transform.py b/python/tvm/relay/op/_transform.py index e90263d794bc..16262833d1bf 100644 --- a/python/tvm/relay/op/_transform.py +++ b/python/tvm/relay/op/_transform.py @@ -19,16 +19,17 @@ # pylint: disable=too-many-local-variables, too-many-arguments, no-else-return from __future__ import absolute_import + import tvm -from tvm import te -from tvm.te.hybrid import script +from tvm import te, topi from tvm.runtime import convert -from tvm import topi +from tvm.te.hybrid import script from tvm.topi.utils import get_const_int, get_const_tuple + from . import op as _reg from . import strategy -from .op import OpPattern from ._tensor import elemwise_shape_func +from .op import OpPattern _reg.register_broadcast_schedule("broadcast_to") _reg.register_broadcast_schedule("broadcast_to_like") @@ -159,6 +160,16 @@ def compute_cumsum(attrs, inputs, output_type): _reg.register_strategy("cumsum", strategy.cumsum_strategy) _reg.register_shape_func("cumsum", False, elemwise_shape_func) +# cumprod +@_reg.register_compute("cumprod") +def compute_cumprod(attrs, inputs, output_type): + """Compute definition of cumprod""" + return [topi.cumprod(inputs[0], attrs.axis, attrs.dtype, attrs.exclusive)] + + +_reg.register_strategy("cumprod", strategy.cumprod_strategy) +_reg.register_shape_func("cumprod", False, elemwise_shape_func) + @_reg.register_compute("unique") def compute_unique(attrs, inputs, output_type): diff --git a/python/tvm/relay/op/annotation/annotation.py b/python/tvm/relay/op/annotation/annotation.py index 0ab1a0ba9d68..5c75688b909d 100644 --- a/python/tvm/relay/op/annotation/annotation.py +++ b/python/tvm/relay/op/annotation/annotation.py @@ -16,7 +16,7 @@ # under the License. """Annotation operations.""" from tvm.runtime import ndarray as _nd -from tvm.runtime import TVMContext as _TVMContext +from tvm.runtime import Device as _Device from . import _make from .. import op as reg @@ -30,7 +30,7 @@ def on_device(data, device): data : tvm.relay.Expr The expression to be annotated. - device : Union[:py:class:`TVMContext`, str] + device : Union[:py:class:`Device`, str] The device type to annotate. Returns @@ -38,13 +38,13 @@ def on_device(data, device): result : tvm.relay.Expr The annotated expression. 
""" - if isinstance(device, _TVMContext): + if isinstance(device, _Device): device = device.device_type elif isinstance(device, str): - device = _nd.context(device).device_type + device = _nd.device(device).device_type else: raise ValueError( - "device is expected to be the type of TVMContext or " + "device is expected to be the type of Device or " "str, but received %s" % (type(device)) ) return _make.on_device(data, device) diff --git a/python/tvm/relay/op/contrib/__init__.py b/python/tvm/relay/op/contrib/__init__.py index 49abf36134b4..30c2db0ddf0b 100644 --- a/python/tvm/relay/op/contrib/__init__.py +++ b/python/tvm/relay/op/contrib/__init__.py @@ -20,6 +20,7 @@ from .arm_compute_lib import * from .dnnl import * +from .bnns import * from .coreml import * from .ethosn import * from .tensorrt import * diff --git a/python/tvm/relay/op/contrib/arm_compute_lib.py b/python/tvm/relay/op/contrib/arm_compute_lib.py index 139f25fef4fd..17fdbf941e08 100644 --- a/python/tvm/relay/op/contrib/arm_compute_lib.py +++ b/python/tvm/relay/op/contrib/arm_compute_lib.py @@ -18,11 +18,11 @@ """Arm Compute Library supported operators.""" import tvm +from tvm import relay from tvm._ffi import register_func from tvm.relay.expr import const from tvm.relay import transform from tvm.relay.build_module import bind_params_by_name -from tvm.relay.testing.temp_op_attr import TempOpAttr from ...dataflow_pattern import wildcard, is_op, is_constant, is_expr from .register import register_pattern_table @@ -30,7 +30,7 @@ def is_arm_compute_runtime_enabled(): - """Check if the ACL graph runtime is present. + """Check if the ACL graph executor is present. Returns ------- @@ -111,9 +111,9 @@ def convert_conv(attrs, inputs, tinfos, desired_layouts): return convert_conv - with TempOpAttr( + with OpAttrContext( "nn.conv2d", "FTVMConvertOpLayout", convert_layout_conv2d(tvm.relay.nn.conv2d) - ), TempOpAttr( + ), OpAttrContext( "qnn.conv2d", "FTVMConvertOpLayout", convert_layout_conv2d(tvm.relay.qnn.op.conv2d) ): seq = tvm.transform.Sequential( @@ -481,3 +481,36 @@ def qnn_add(expr): return False return True + + +class OpAttrContext(object): + """ Temporarily changes the attr of an op. """ + + def __init__(self, op_name, attr_key, attr_value): + """Saves the required info for RAII pattern usage. + + Parameters + ---------- + op_name : str + The op name. + + attr_key : str + The attribute name. + + attr_value : object + The attribute value. + """ + self.op = relay.op.get(op_name) + self.attr_key = attr_key + self.attr_value = attr_value + + def __enter__(self): + self.older_attr = self.op.get_attr(self.attr_key) + self.op.reset_attr(self.attr_key) + self.op.set_attr(self.attr_key, self.attr_value) + return self + + def __exit__(self, ptype, value, trace): + self.op.reset_attr(self.attr_key) + if self.older_attr: + self.op.set_attr(self.attr_key, self.older_attr) diff --git a/python/tvm/relay/op/contrib/bnns.py b/python/tvm/relay/op/contrib/bnns.py new file mode 100644 index 000000000000..2ace502e6528 --- /dev/null +++ b/python/tvm/relay/op/contrib/bnns.py @@ -0,0 +1,327 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=invalid-name, unused-argument
+"""BNNS library supported operators.
+BNNS is part of the Accelerate framework on macOS/iOS platforms. Apple provides
+several APIs to handle tensor processing, in particular:
+ * BNNS (Basic Neural Network Subroutines)
+ * vDSP (1D and 2D tensor processing)
+"""
+import math
+import tvm.ir
+
+from tvm.relay import transform
+from tvm.relay.expr import const
+from tvm.relay.build_module import bind_params_by_name
+
+from .register import register_pattern_table, get_pattern_table
+from ...dataflow_pattern import wildcard, is_op, is_expr
+
+
+def partition_for_bnns(mod, params=None):
+    """Partition the graph greedily, offloading supported
+    operators to BNNS.
+
+    Parameters
+    ----------
+    mod : Module
+        The module to run passes on.
+    params : Optional[Dict[str, NDArray]]
+        Constant input parameters.
+
+    Returns
+    -------
+    ret : annotated and partitioned module.
+    """
+    if params:
+        mod["main"] = bind_params_by_name(mod["main"], params)
+
+    seq = tvm.transform.Sequential(
+        [
+            transform.InferType(),
+            transform.FoldConstant(),
+            transform.FoldScaleAxis(),
+            transform.DynamicToStatic(),
+            transform.AlterOpLayout(),
+            # TODO(apeskov): workaround. An AlterOpLayout call may transform constant
+            # shapes, and expand_dims ops can appear after constants, which breaks
+            # BNNS fusing. So we have to call FoldConstant right before the BNNS
+            # composite passes.
+            transform.FoldConstant(),
+            transform.MergeComposite(get_pattern_table("bnns")),
+            transform.AnnotateTarget("bnns"),
+            # If you don't need per-layer performance statistics you can
+            # uncomment the next line
+            # transform.MergeCompilerRegions(),
+            transform.PartitionGraph(),
+        ]
+    )
+
+    return seq(mod)
+
+
+def _register_external_op_helper(op_name, supported=True):
+    """The helper function to indicate that a given operator can be supported
+    by BNNS.
+
+    Parameters
+    ----------
+    op_name : str
+        The name of the supported operator that will be registered.
+
+    Returns
+    -------
+    f : callable
+        A function that returns if the operator is supported by BNNS.
+ """ + + @tvm.ir.register_op_attr(op_name, "target.bnns") + def _func_wrapper(expr): + return supported + + return _func_wrapper + + +_register_external_op_helper("nn.batch_matmul") + + +@tvm.ir.register_op_attr("nn.max_pool2d", "target.bnns") +def max_pool2d_check(expr): + """Check if the nn.max_pool2d can be executed in BNNS""" + attrs, args = expr.attrs, expr.args + data_typ = args[0].checked_type + rank = len(data_typ.shape) + if rank < 3 or rank > 4 or data_typ.dtype != "float32": + return False + if attrs.layout != "NCHW": + return False + return True + + +@tvm.ir.register_op_attr("nn.avg_pool2d", "target.bnns") +def avg_pool2d_check(expr): + """Check if the nn.avg_pool2d can be executed in BNNS""" + attrs, args = expr.attrs, expr.args + data_typ = args[0].checked_type + rank = len(data_typ.shape) + if rank < 3 or rank > 4 or data_typ.dtype != "float32": + return False + if attrs.layout != "NCHW": + return False + return True + + +@tvm.ir.register_op_attr("nn.global_max_pool2d", "target.bnns") +def global_max_pool2d_check(expr): + """Check if the nn.global_max_pool2d can be executed in BNNS""" + attrs, args = expr.attrs, expr.args + data_typ = args[0].checked_type + rank = len(data_typ.shape) + if rank < 3 or rank > 4 or data_typ.dtype != "float32": + return False + if attrs.layout != "NCHW": + return False + return True + + +@tvm.ir.register_op_attr("nn.global_avg_pool2d", "target.bnns") +def global_avg_pool2d_check(expr): + """Check if the nn.global_avg_pool2d can be executed in BNNS""" + attrs, args = expr.attrs, expr.args + data_typ = args[0].checked_type + rank = len(data_typ.shape) + if rank < 3 or rank > 4 or data_typ.dtype != "float32": + return False + if attrs.layout != "NCHW": + return False + return True + + +def dtype_is_supported(dtype): + """Check if data type is supported by BNNS backend""" + return dtype in ("", "float32") + + +@tvm.ir.register_op_attr("nn.conv2d", "target.bnns") +def conv2d_check(expr): + """Check if the conv2d can be executed in BNNS""" + attrs, args = expr.attrs, expr.args + data_typ = args[0].checked_type + if len(data_typ.shape) != 4 or data_typ.dtype != "float32": + return False + if not isinstance(args[1], tvm.relay.expr.Constant): + return False + kernel_typ = args[1].checked_type + if len(kernel_typ.shape) != 4 or kernel_typ.dtype != "float32": + return False + if attrs.data_layout != "NCHW": + return False + if not dtype_is_supported(attrs.out_dtype): + return False + return True + + +def bias_check(expr): + """Check is bias added through the correct dimension""" + attrs, args = expr.attrs, expr.args + if not isinstance(args[1], tvm.relay.expr.Constant): + return False + if expr.op.name == "nn.bias_add": + return attrs.axis == 1 + if expr.op.name == "add": + b_shape = args[1].checked_type.shape + if len(b_shape) == 4: + return bool(b_shape[0] == 1 and b_shape[2] == 1 and b_shape[3] == 1) + if len(b_shape) == 3: + return bool(b_shape[1] == 1 and b_shape[2] == 1) + + return False + + +@tvm.ir.register_op_attr("nn.dense", "target.bnns") +def dense(expr): + """Check if the dense can be used in BNNS.""" + attrs, args = expr.attrs, expr.args + data_typ = args[0].checked_type + if data_typ.dtype != "float32": + return False + if not isinstance(args[1], tvm.relay.expr.Constant): + return False + kernel_typ = args[1].checked_type + if len(kernel_typ.shape) != 2 or kernel_typ.dtype != "float32": + return False + if attrs.out_dtype != "float32" and attrs.out_dtype != "": + return False + return True + + +def make_conv_pattern(with_bias=True, 
activation="none"): + """Make pattern for bnns.conv2d primitive""" + data = wildcard() + weight = wildcard() + bias = wildcard() + pat = is_op("nn.conv2d")(data, weight) + if with_bias: + pat = is_op("add")(pat, bias) | is_op("nn.bias_add")(pat, bias) + if activation == "relu": + pat = is_op("nn.relu")(pat) + elif activation == "sigmoid": + pat = is_op("sigmoid")(pat) + return pat + + +def check_conv(extract): + """Check conv pattern is supported by BNNS.""" + bias_is_ok = True + call = extract + while call.op.name != "nn.conv2d": + if call.op.name in ("nn.bias_add", "add"): + bias_is_ok &= bias_check(call) + call = call.args[0] + return conv2d_check(call) and bias_is_ok + + +def make_dense_bias_pattern(): + """Make pattern for bnns.dense primitive""" + data = wildcard() + weight = wildcard() + bias = wildcard() + d = is_op("nn.dense")(data, weight) + return is_op("add")(d, bias) + + +def make_dense_bias_gelu_pattern(): + """Make pattern for bnns.dense primitive with fused bias and gelu activation""" + dense_bias = make_dense_bias_pattern() + const1 = is_expr(const(0.044715)) + const2 = is_expr(const(math.sqrt(2 / math.pi))) + + gelu = is_op("power")(dense_bias, is_expr(const(3, dtype="float32"))) + gelu = is_op("multiply")(gelu, const1) + gelu = is_op("add")(gelu, dense_bias) + gelu = is_op("multiply")(gelu, const2) + gelu = is_op("tanh")(gelu) + gelu = is_op("add")(gelu, is_expr(const(1, dtype="float32"))) + gelu = is_op("multiply")(gelu, is_expr(const(0.5))) + gelu = is_op("multiply")(gelu, dense_bias) + return gelu + + +def check_dense(extract): + """Check dense pattern is supported by BNNS.""" + call = extract + while call.op.name != "nn.dense": + call = call.args[0] + return dense(call) + + +@tvm.ir.register_op_attr("nn.instance_norm", "target.bnns") +def instance_norm_check(expr): + """Check if the nn.instance_norm can be executed in BNNS""" + attrs, args = expr.attrs, expr.args + data_typ = args[0].checked_type + rank = len(data_typ.shape) + if rank < 3 or rank > 4 or data_typ.dtype != "float32": + return False + if not isinstance(args[1], tvm.relay.expr.Constant) or not isinstance( + args[2], tvm.relay.expr.Constant + ): + return False + if attrs.axis == 0 and rank == 3 or attrs.axis == 1 and rank == 4: + return True + return False + + +@register_pattern_table("bnns") +def pattern_table(): + """Get BNNS specific fusing patterns collection""" + conv2d_bias_pat = ( + "bnns.conv2d_bias", + make_conv_pattern(with_bias=True), + check_conv, + ) + conv2d_bias_relu_pat = ( + "bnns.conv2d_bias_relu", + make_conv_pattern(with_bias=True, activation="relu"), + check_conv, + ) + conv2d_relu_pat = ( + "bnns.conv2d_relu", + make_conv_pattern(with_bias=False, activation="relu"), + check_conv, + ) + conv2d_bias_sigmoid_pat = ( + "bnns.conv2d_bias_sigmoid", + make_conv_pattern(with_bias=True, activation="sigmoid"), + check_conv, + ) + conv2d_sigmoid_pat = ( + "bnns.conv2d_sigmoid", + make_conv_pattern(with_bias=False, activation="sigmoid"), + check_conv, + ) + dense_bias_gelu = ("bnns.dense_bias_gelu", make_dense_bias_gelu_pattern(), check_dense) + dense_bias = ("bnns.dense_bias", make_dense_bias_pattern(), check_dense) + bnns_patterns = [ + conv2d_bias_relu_pat, + conv2d_relu_pat, + conv2d_bias_sigmoid_pat, + conv2d_sigmoid_pat, + conv2d_bias_pat, + dense_bias_gelu, + dense_bias, + ] + return bnns_patterns diff --git a/python/tvm/relay/op/contrib/tensorrt.py b/python/tvm/relay/op/contrib/tensorrt.py index afdea9712342..a36b66c8f0dd 100644 --- a/python/tvm/relay/op/contrib/tensorrt.py +++ 
b/python/tvm/relay/op/contrib/tensorrt.py
@@ -29,7 +29,7 @@


 def is_tensorrt_runtime_enabled():
-    """Check if the TensorRT graph runtime is present.
+    """Check if the TensorRT graph executor is present.

     Returns
     -------
     ret: bool
diff --git a/python/tvm/relay/op/memory/memory.py b/python/tvm/relay/op/memory/memory.py
index 57aa7e423147..9dae23d5b65e 100644
--- a/python/tvm/relay/op/memory/memory.py
+++ b/python/tvm/relay/op/memory/memory.py
@@ -47,7 +47,7 @@ def alloc_tensor(storage, offset, shape, dtype="float32", assert_shape=None):
     return _make.alloc_tensor(storage, offset, shape, dtype, assert_shape)


-def alloc_storage(size, alignment, ctx, dtype_hint="float32"):
+def alloc_storage(size, alignment, device, dtype_hint="float32"):
     """Allocate a piece of tensor storage.

     Parameters
@@ -56,15 +56,17 @@
         The size of the allocation.
     alignment : tvm.relay.Expr
         The alignment of the allocation.
-    dtype : str
-        The dtype_hint of the allocation.
+    device : tvm.runtime.Device
+        The device of the allocation.
+    dtype_hint : str
+        The dtype hint of the allocation.

     Returns
     -------
     result : tvm.relay.Expr
         The alloc_storage expression.
     """
-    return _make.alloc_storage(size, alignment, ctx, dtype_hint)
+    return _make.alloc_storage(size, alignment, device, dtype_hint)


 def flatten_tuple_type(ty):
diff --git a/python/tvm/relay/op/nn/_nn.py b/python/tvm/relay/op/nn/_nn.py
index 6ae86c0786e5..af64873ee904 100644
--- a/python/tvm/relay/op/nn/_nn.py
+++ b/python/tvm/relay/op/nn/_nn.py
@@ -142,6 +142,11 @@ def alter_op_layout_sparse_dense(attrs, inputs, tinfos, out_type):
     return topi.nn.sparse_dense_alter_layout(attrs, inputs, tinfos, out_type)


+# sparse_add
+reg.register_strategy("nn.sparse_add", strategy.sparse_add_strategy)
+reg.register_pattern("nn.sparse_add", reg.OpPattern.OPAQUE)
+
+
 @reg.register_compute("nn.internal.sparse_dense_padded")
 def compute_sparse_dense_padded(attrs, inputs, out_type):
     """Compute definition of sparse_dense_padded"""
diff --git a/python/tvm/relay/op/nn/nn.py b/python/tvm/relay/op/nn/nn.py
index 5135ac74de25..a1147fec4d7e 100644
--- a/python/tvm/relay/op/nn/nn.py
+++ b/python/tvm/relay/op/nn/nn.py
@@ -2148,6 +2148,53 @@ def sparse_transpose(x):
     return expr.TupleWrapper(_make.sparse_transpose(x[0], x[1], x[2]), 3)


+# pylint: disable=no-else-return,inconsistent-return-statements
+def sparse_add(dense_mat, sparse_mat):
+    r"""
+    Computes the matrix addition of `dense_mat` and `sparse_mat`, where `dense_mat` is
+    a dense matrix and `sparse_mat` is a sparse (CSR) namedtuple with
+    fields `data`, `indices`, and `indptr`.
+
+    .. math::
+
+        \mbox{sparse\_add}(D, S)[m, n] = \mbox{as\_dense}(S)[m, n] + D[m, n]
+
+    where `as_dense` returns the dense equivalent of the sparse matrix S,
+    which is then added elementwise to the dense matrix D (the `dense_mat` argument).
+
+    Parameters
+    ----------
+    dense_mat : tvm.relay.Expr
+        The input dense matrix for the matrix addition.
+
+    sparse_mat : Union[namedtuple, Tuple[ndarray, ndarray, ndarray]]
+        The input sparse matrix (CSR) for the matrix addition.
+
+    Returns
+    -------
+    result : tvm.relay.Expr
+        The computed result.
+
+    Examples
+    --------
+    .. code-block:: python
+
+        dense_data = [[3., 4., 4.],
+                      [4., 2., 5.]]
+        sparse_data = [4., 8.]
+        sparse_indices = [0, 2]
+        sparse_indptr = [0, 1, 2]
+
+        output = relay.nn.sparse_add(
+            dense_data, (sparse_data, sparse_indices, sparse_indptr)
+        )
+
+        output = [[ 7., 4., 4. ]
+                  [ 4., 2., 13.
]] + """ + if hasattr(sparse_mat, "indices"): + return _make.sparse_add(dense_mat, sparse_mat.data, sparse_mat.indices, sparse_mat.indptr) + else: + return _make.sparse_add(dense_mat, sparse_mat[0], sparse_mat[1], sparse_mat[2]) + + def contrib_conv2d_winograd_without_weight_transform( data, weight, diff --git a/python/tvm/relay/op/strategy/arm_cpu.py b/python/tvm/relay/op/strategy/arm_cpu.py index 985124e305ee..005eae68b8b7 100644 --- a/python/tvm/relay/op/strategy/arm_cpu.py +++ b/python/tvm/relay/op/strategy/arm_cpu.py @@ -207,11 +207,10 @@ def conv2d_strategy_arm_cpu(attrs, inputs, out_type, target): else: # group_conv2d if layout == "NCHW": assert kernel_layout == "OIHW" - logger.warning("group_conv2d with layout NCHW is not optimized for arm cpu.") strategy.add_implementation( - wrap_compute_conv2d(topi.nn.group_conv2d_nchw, has_groups=True), - wrap_topi_schedule(topi.generic.schedule_group_conv2d_nchw), - name="group_conv2d_nchw.generic", + wrap_compute_conv2d(topi.arm_cpu.group_conv2d_nchw, has_groups=True), + wrap_topi_schedule(topi.arm_cpu.schedule_group_conv2d_nchw), + name="group_conv2d_nchw.arm_cpu", ) elif layout == "NHWC": assert kernel_layout == "HWIO" diff --git a/python/tvm/relay/op/strategy/cuda.py b/python/tvm/relay/op/strategy/cuda.py index e0d0f165219e..1a6742526607 100644 --- a/python/tvm/relay/op/strategy/cuda.py +++ b/python/tvm/relay/op/strategy/cuda.py @@ -18,11 +18,12 @@ # pylint: disable=invalid-name,unused-argument,wildcard-import,unused-wildcard-import from tvm import topi from tvm.auto_scheduler import is_auto_scheduler_enabled -from tvm.te import SpecializedCondition from tvm.contrib import nvcc from tvm.contrib.thrust import can_use_thrust -from .generic import * +from tvm.te import SpecializedCondition + from .. import op as _op +from .generic import * @schedule_injective.register(["cuda", "gpu"]) @@ -1017,13 +1018,25 @@ def cumsum_strategy_cuda(attrs, inputs, out_type, target): """cumsum cuda strategy""" strategy = _op.OpStrategy() strategy.add_implementation( - wrap_compute_cumsum(topi.cuda.cumsum), + wrap_compute_scanop(topi.cuda.cumsum), wrap_topi_schedule(topi.cuda.schedule_scan), name="cumsum.cuda", ) return strategy +@cumprod_strategy.register(["cuda", "gpu"]) +def cumprod_strategy_cuda(attrs, inputs, out_type, target): + """cumprod cuda strategy""" + strategy = _op.OpStrategy() + strategy.add_implementation( + wrap_compute_scanop(topi.cuda.cumprod), + wrap_topi_schedule(topi.cuda.schedule_scan), + name="cumprod.cuda", + ) + return strategy + + @unique_strategy.register(["cuda", "gpu"]) def unique_strategy_cuda(attrs, inputs, out_type, target): """unique cuda strategy""" diff --git a/python/tvm/relay/op/strategy/generic.py b/python/tvm/relay/op/strategy/generic.py index be86ea9d9184..322a3607904f 100644 --- a/python/tvm/relay/op/strategy/generic.py +++ b/python/tvm/relay/op/strategy/generic.py @@ -17,11 +17,12 @@ """Definition of generic operator strategy.""" # pylint: disable=invalid-name,unused-argument import logging - import re -from tvm import topi, _ffi, te, ir -from tvm.topi.utils import get_const_int, get_const_float, get_const_tuple, get_float_tuple + +from tvm import _ffi, ir, te, topi from tvm.target import generic_func, override_native_generic_func +from tvm.topi.utils import get_const_float, get_const_int, get_const_tuple, get_float_tuple + from .. 
import op as _op logger = logging.getLogger("strategy") @@ -799,6 +800,29 @@ def sparse_dense_padded_strategy(attrs, inputs, out_type, target): raise NotImplementedError("sparse_dense_padded is only implemented for cuda") +# sparse_add +def wrap_compute_sparse_add(topi_compute): + """wrap sparse add topi compute""" + + def _compute_sparse_add(attrs, inputs, out_type): + return [topi_compute(inputs[0], inputs[1], inputs[2], inputs[3])] + + return _compute_sparse_add + + +@override_native_generic_func("sparse_add_strategy") +def sparse_add_strategy(attrs, inputs, out_type, target): + """sparse add generic strategy""" + logger.warning("sparse add is not optimized for this platform.") + strategy = _op.OpStrategy() + strategy.add_implementation( + wrap_compute_sparse_add(topi.nn.sparse_add), + wrap_topi_schedule(topi.generic.schedule_extern), + name="sparse_add.generic", + ) + return strategy + + # sparse_transpose @generic_func def schedule_sparse_transpose(attrs, outs, target): @@ -1440,13 +1464,13 @@ def threefry_split_strategy(attrs, inputs, out_type, target): return strategy -def wrap_compute_cumsum(topi_compute): - """Wrap cumsum topi compute""" +def wrap_compute_scanop(topi_compute): + """Wrap scanop style topi compute""" - def _compute_cumsum(attrs, inputs, _): + def _compute_scanop(attrs, inputs, _): return [topi_compute(inputs[0], attrs.axis, attrs.dtype, attrs.exclusive)] - return _compute_cumsum + return _compute_scanop @override_native_generic_func("cumsum_strategy") @@ -1454,13 +1478,25 @@ def cumsum_strategy(attrs, inputs, out_type, target): """cumsum generic strategy""" strategy = _op.OpStrategy() strategy.add_implementation( - wrap_compute_cumsum(topi.cumsum), + wrap_compute_scanop(topi.cumsum), wrap_topi_schedule(topi.generic.schedule_extern), name="cumsum.generic", ) return strategy +@override_native_generic_func("cumprod_strategy") +def cumprod_strategy(attrs, inputs, out_type, target): + """cumprod generic strategy""" + strategy = _op.OpStrategy() + strategy.add_implementation( + wrap_compute_scanop(topi.cumprod), + wrap_topi_schedule(topi.generic.schedule_extern), + name="cumprod.generic", + ) + return strategy + + def wrap_compute_unique(topi_compute): """Wrap unique topi compute""" diff --git a/python/tvm/relay/op/strategy/x86.py b/python/tvm/relay/op/strategy/x86.py index 1f37a4f8e98c..60bd92ef63d1 100644 --- a/python/tvm/relay/op/strategy/x86.py +++ b/python/tvm/relay/op/strategy/x86.py @@ -205,12 +205,10 @@ def conv2d_strategy_cpu(attrs, inputs, out_type, target): else: # group_conv2d if layout == "NCHW": assert kernel_layout == "OIHW" - if not is_auto_scheduler_enabled(): - logger.warning("group_conv2d is not optimized for x86 with autotvm.") strategy.add_implementation( - wrap_compute_conv2d(topi.nn.group_conv2d_nchw, has_groups=True), - wrap_topi_schedule(topi.generic.schedule_group_conv2d_nchw), - name="group_conv2d_nchw.generic", + wrap_compute_conv2d(topi.x86.group_conv2d_nchw, has_groups=True), + wrap_topi_schedule(topi.x86.schedule_group_conv2d_nchw), + name="group_conv2d_nchw.x86", ) elif layout == "NHWC": assert kernel_layout == "HWIO" diff --git a/python/tvm/relay/op/tensor.py b/python/tvm/relay/op/tensor.py index 5b011043f588..6b9ac30d3a3a 100644 --- a/python/tvm/relay/op/tensor.py +++ b/python/tvm/relay/op/tensor.py @@ -17,7 +17,7 @@ """Basic tensor operations.""" # pylint: disable=redefined-builtin, unused-argument from tvm.runtime import ndarray as _nd -from tvm.runtime import TVMContext as _TVMContext +from tvm.runtime import Device as _Device from 
tvm.te.hybrid import script
 from . import _make


@@ -1160,7 +1160,7 @@ def copy_shape_func(attrs, inputs, _):
 def device_copy(data, src_dev, dst_dev):
     """Copy data from the source device to the destination device. This
-    operator helps data transferring between difference contexts for
+    operator helps data transfer between different devices for
     heterogeneous execution.

     Parameters
@@ -1168,10 +1168,10 @@
     data : tvm.relay.Expr
         The tensor to be copied.

-    src_dev : Union[:py:class:`TVMContext`, str]
+    src_dev : Union[:py:class:`Device`, str]
         The source device where the data is copied from.

-    dst_dev : Union[:py:class:`TVMContext`, str]
+    dst_dev : Union[:py:class:`Device`, str]
         The destination device where the data is copied to.

     Returns
@@ -1179,23 +1179,23 @@
     result : tvm.relay.Expr
         The copied result.
     """
-    if isinstance(src_dev, _TVMContext):
+    if isinstance(src_dev, _Device):
         src_dev = src_dev.device_type
     elif isinstance(src_dev, str):
-        src_dev = _nd.context(src_dev).device_type
+        src_dev = _nd.device(src_dev).device_type
     else:
         raise ValueError(
-            "src_dev is expected to be the type of TVMContext or "
+            "src_dev is expected to be the type of Device or "
             "str, but received %s" % (type(src_dev))
         )
-    if isinstance(dst_dev, _TVMContext):
+    if isinstance(dst_dev, _Device):
         dst_dev = dst_dev.device_type
     elif isinstance(dst_dev, str):
-        dst_dev = _nd.context(dst_dev).device_type
+        dst_dev = _nd.device(dst_dev).device_type
     else:
         raise ValueError(
-            "dst_dev is expected to be the type of TVMContext or "
+            "dst_dev is expected to be the type of Device or "
             "str, but received %s" % (type(dst_dev))
         )
     return _make.device_copy(data, src_dev, dst_dev)
diff --git a/python/tvm/relay/op/transform.py b/python/tvm/relay/op/transform.py
index 4129b610cb7c..f94a00db2fb1 100644
--- a/python/tvm/relay/op/transform.py
+++ b/python/tvm/relay/op/transform.py
@@ -18,11 +18,11 @@
 # pylint: disable=import-outside-toplevel
 """Transform operators."""

+from ...tir import expr as _expr
+from ..expr import Constant, Expr, Tuple, TupleWrapper, const
 from . import _make
 from .dyn import _make as _dyn_make
 from .tensor import shape_of
-from ..expr import TupleWrapper, const, Constant, Expr, Tuple
-from ...tir import expr as _expr


 def cast(data, dtype):
@@ -905,10 +905,13 @@ def strided_slice(data, begin, end, strides=None, slice_mode="end"):
         end = const(list(end))
     if isinstance(strides, (tuple, list)):
         strides = const(list(strides))
-    normalized_begin = _make.where(
+    begin = _make.where(
         begin < cast_like(const(0), begin), begin + cast_like(shape_of(data), begin), begin
     )
-    return _dyn_make.strided_slice(data, normalized_begin, end, strides, slice_mode)
+    begin = _make.where(
+        begin >= cast_like(shape_of(data), begin), cast_like(shape_of(data), begin), begin
+    )
+    return _dyn_make.strided_slice(data, begin, end, strides, slice_mode)
     return _make.strided_slice(data, begin, end, strides, slice_mode)
@@ -1536,9 +1539,9 @@
         Type of the returned array and of the accumulator in which the elements
         are summed. If dtype is not specified, it defaults to the dtype of data.

-    exclusive : int, optional
-        If set to 1 will return exclusive sum in which the first element is not
-        included. In other terms, if set to 1, the j-th output element would be
+    exclusive : bool, optional
+        If true, will return the exclusive sum, in which the first element is not
+        included.
In other terms, if true, the j-th output element would be the sum of the first (j-1) elements. Otherwise, it would be the sum of the first j elements. @@ -1574,6 +1577,61 @@ def cumsum(data, axis=None, dtype=None, exclusive=None): return _make.cumsum(data, axis, dtype, exclusive) +def cumprod(data, axis=None, dtype=None, exclusive=None): + """Numpy style cumprod op. Return the cumulative inclusive product of the elements along + a given axis. + + Parameters + ---------- + data : relay.Expr + The input data to the operator. + + axis : int, optional + Axis along which the cumulative product is computed. The default (None) is to compute + the cumprod over the flattened array. + + dtype : string, optional + Type of the returned array and of the accumulator in which the elements are multiplied. + If dtype is not specified, it defaults to the dtype of data. + + exclusive : bool, optional + If true will return exclusive product in which the first element is not + included. In other terms, if true, the j-th output element would be + the product of the first (j-1) elements. Otherwise, it would be the product of + the first j elements. The product of zero elements will be 1. + + Returns + ------- + result : relay.Expr + The result has the same size as data, and the same shape as data if axis is not None. + If axis is None, the result is a 1-d array. + + Examples + -------- + .. code-block:: python + a = [[1,2,3], [4,5,6]] + + cumprod(a) # if axis is not provided, cumprod is done over the flattened input. + -> [ 1, 2, 6, 24, 120, 720] + + cumprod(a, dtype="float32") + -> [ 1., 2., 6., 24., 120., 720.] + + cumprod(a, axis=0) # multiply over rows for each of the 3 columns + -> [[1, 2, 3], + [4, 10, 18]] + + cumprod(a, axis=1) + -> [[ 1, 2, 6], + [ 4, 20, 120]] + + a = [1, 1, 1, 0, 1, 1, 0] # a is a boolean array + cumprod(a, dtype=int32) # dtype should be provided to get the expected results + -> [1, 1, 1, 0, 0, 0, 0] + """ + return _make.cumprod(data, axis, dtype, exclusive) + + def unique(data, is_sorted=True, return_counts=False): """ Find the unique elements of a 1-D tensor. Please note `output` and `counts` are all padded to diff --git a/python/tvm/relay/qnn/op/__init__.py b/python/tvm/relay/qnn/op/__init__.py index 6d66e12eeafc..848409360a9d 100644 --- a/python/tvm/relay/qnn/op/__init__.py +++ b/python/tvm/relay/qnn/op/__init__.py @@ -19,4 +19,4 @@ from __future__ import absolute_import as _abs from .qnn import * from .op import register_qnn_legalize -from . import legalizations, layout_conversions +from . import _qnn, legalizations, layout_conversions diff --git a/python/tvm/relay/qnn/op/_qnn.py b/python/tvm/relay/qnn/op/_qnn.py new file mode 100644 index 000000000000..a059c293a0f8 --- /dev/null +++ b/python/tvm/relay/qnn/op/_qnn.py @@ -0,0 +1,52 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name, unused-argument, len-as-condition +"""QNN operator feature registration""" + +from tvm import topi + +from ...op.op import register_compute +from ...op.op import register_injective_schedule +from ...op.op import register_pattern, OpPattern + + +@register_compute("qnn.simulated_quantize") +def simulated_quantize_compute(attrs, inputs, output_type): + assert len(inputs) == 4 + return [ + topi.nn.simulated_quantize( + inputs[0], inputs[1], inputs[2], inputs[3], axis=attrs.get_int("axis") + ) + ] + + +register_injective_schedule("qnn.simulated_quantize") +register_pattern("qnn.simulated_quantize", OpPattern.ELEMWISE) + + +@register_compute("qnn.simulated_dequantize") +def simulated_dequantize_compute(attrs, inputs, output_type): + assert len(inputs) == 4 + return [ + topi.nn.simulated_dequantize( + inputs[0], inputs[1], inputs[2], inputs[3], axis=attrs.get_int("axis") + ) + ] + + +register_injective_schedule("qnn.simulated_dequantize") +register_pattern("qnn.simulated_dequantize", OpPattern.ELEMWISE) diff --git a/python/tvm/relay/qnn/op/qnn.py b/python/tvm/relay/qnn/op/qnn.py index a5892f331f06..f02f8227e14a 100644 --- a/python/tvm/relay/qnn/op/qnn.py +++ b/python/tvm/relay/qnn/op/qnn.py @@ -18,8 +18,10 @@ """QNN dialect operators.""" from __future__ import absolute_import as _abs +from tvm import relay from tvm.relay.expr import Tuple, TupleWrapper from tvm.relay.op.nn.utils import get_pad_tuple2d +from tvm.topi.nn.qnn import SQNN_DTYPE_TO_CODE from . import _make from ... import op as reg from ...op import OpPattern @@ -118,6 +120,40 @@ def quantize(data, output_scale, output_zero_point, axis=-1, out_dtype="int8"): return _make.quantize(data, output_scale, output_zero_point, axis, out_dtype) +def simulated_quantize(data, output_scale, output_zero_point, axis=-1, out_dtype="int8"): + r"""Simulated Quantize op + Mimics the quantize op but has more flexibility in valid inputs and always + outputs the same type as the input. This can be useful for + calibrating or training a quantized network. + + Parameters + ---------- + data : tvm.relay.Expr + The input tensor to be quantized. Can be of type float32. + output_zero_point : tvm.relay.Expr + The output zero_point. + output_scale : tvm.relay.Expr + The output scale. + axis : int + The channel axis for quantization. Default value is -1 which corresponds to the last axis. + out_dtype : string or tvm.relay.Expr + A string or tensor indicating which datatype to quantize to. + + Returns + ------- + result : tvm.relay.Expr + The computed result. + """ + # Convert string dtype to a constant if needed. + if isinstance(out_dtype, str): + type_code = SQNN_DTYPE_TO_CODE[out_dtype] + out_dtype = relay.const(type_code, dtype="int32") + # Wrap reshapes around qnn parameter tensors to guarantee shape compatibility. + output_scale = relay.op.reshape(output_scale, [-1]) + output_zero_point = relay.op.reshape(output_zero_point, [-1]) + return _make.simulated_quantize(data, out_dtype, output_scale, output_zero_point, axis) + + def dequantize(data, input_scale, input_zero_point, axis=-1): r"""Dequantize op This operator takes quantized int8 and unit8 as input and produces @@ -127,7 +163,7 @@ def dequantize(data, input_scale, input_zero_point, axis=-1): Parameters ---------- data : tvm.relay.Expr - The input tensor to be dequantized. Can be of type [int8, uint8]. + The input tensor to be dequantized. 
Can be of type [int8, uint8, int32]. input_zero_point : tvm.relay.Expr The input zero_point. input_scale : tvm.relay.Expr @@ -143,6 +179,40 @@ def dequantize(data, input_scale, input_zero_point, axis=-1): return _make.dequantize(data, input_scale, input_zero_point, axis) +def simulated_dequantize(data, input_scale, input_zero_point, axis=-1, in_dtype="int8"): + r"""Simulated Dequantize op + Mimics the dequantize op but has more flexibility in valid inputs and always + outputs the same type as the input. This can be useful for calibrating or + training a quantized network. + + Parameters + ---------- + data : tvm.relay.Expr + The input tensor to be dequantized. + input_zero_point : tvm.relay.Expr + The input zero_point. + input_scale : tvm.relay.Expr + The input scale. + axis : int + The channel axis for quantization. Default value is -1 which corresponds to the last axis. + in_dtype : string or tvm.relay.Expr + A string or tensor indicating which datatype to dequantize from. + + Returns + ------- + result : tvm.relay.Expr + The computed result. + """ + # Convert string dtype to a constant if needed. + if isinstance(in_dtype, str): + type_code = SQNN_DTYPE_TO_CODE[in_dtype] + in_dtype = relay.const(type_code, dtype="int32") + # Wrap reshapes around qnn parameter tensors to guarantee shape compatibility. + input_scale = relay.op.reshape(input_scale, [-1]) + input_zero_point = relay.op.reshape(input_zero_point, [-1]) + return _make.simulated_dequantize(data, in_dtype, input_scale, input_zero_point, axis) + + def concatenate(data, input_scales, input_zero_points, output_scale, output_zero_point, axis): """Concatenate the quantized input tensors along the given axis. diff --git a/python/tvm/relay/quantize/_calibrate.py b/python/tvm/relay/quantize/_calibrate.py index 8461daa0eb5a..a906a98dccd4 100644 --- a/python/tvm/relay/quantize/_calibrate.py +++ b/python/tvm/relay/quantize/_calibrate.py @@ -29,7 +29,7 @@ from .. import expr as _expr from .. import analysis as _analysis from .. import build_module as _build_module -from ...contrib import graph_runtime +from ...contrib import graph_executor from .kl_divergence import _find_scale_by_kl @@ -39,14 +39,14 @@ def _get_profile_runtime(mod): if tvm.target.Target.current(): target = tvm.target.Target.current() - ctx = tvm.context(target.kind.name) + dev = tvm.device(target.kind.name) else: target = "llvm" - ctx = tvm.context(target) + dev = tvm.device(target) with tvm.transform.PassContext(opt_level=3): lib = _build_module.build(func, target=target) - runtime = graph_runtime.GraphModule(lib["default"](ctx)) + runtime = graph_executor.GraphModule(lib["default"](dev)) return runtime diff --git a/python/tvm/relay/testing/__init__.py b/python/tvm/relay/testing/__init__.py index f0c79bed1218..e889e9078a84 100644 --- a/python/tvm/relay/testing/__init__.py +++ b/python/tvm/relay/testing/__init__.py @@ -133,8 +133,8 @@ def check_grad( if test_inputs is None: test_inputs = inputs - for target, ctx in enabled_targets(): - intrp = relay.create_executor(ctx=ctx, target=target) + for target, dev in enabled_targets(): + intrp = relay.create_executor(device=dev, target=target) # Get analytic gradients. 
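
As a rough illustration of the two simulated QNN ops introduced above (shapes and quantization parameters here are hypothetical; both ops keep float32 in and out, which is what makes them usable during calibration or training):

.. code-block:: python

    import numpy as np
    from tvm import relay

    x = relay.var("x", shape=(2, 4), dtype="float32")
    scale = relay.const(np.array([0.5], dtype="float32"))
    zero_point = relay.const(np.array([0], dtype="int32"))

    # Round-trip through a simulated int8; the dtype strings are converted
    # internally to SQNN type codes.
    q = relay.qnn.op.simulated_quantize(x, scale, zero_point, out_dtype="int8")
    y = relay.qnn.op.simulated_dequantize(q, scale, zero_point, in_dtype="int8")
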
_, grads = intrp.evaluate(bwd_func)(*inputs) diff --git a/python/tvm/relay/testing/darknet.py b/python/tvm/relay/testing/darknet.py index c0468b7ef692..e1345043c6bb 100644 --- a/python/tvm/relay/testing/darknet.py +++ b/python/tvm/relay/testing/darknet.py @@ -31,7 +31,7 @@ def convert_image(image): """Convert the image with numpy.""" imagex = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) - imagex = np.array(image) + imagex = np.array(imagex) imagex = imagex.transpose((2, 0, 1)) imagex = np.divide(imagex, 255.0) imagex = np.flip(imagex, 0) diff --git a/python/tvm/relay/testing/init.py b/python/tvm/relay/testing/init.py index 1d4d8d9e311c..f275712c77d1 100644 --- a/python/tvm/relay/testing/init.py +++ b/python/tvm/relay/testing/init.py @@ -176,5 +176,5 @@ def create_workload(net, initializer=None, seed=0): continue init_value = np.zeros(v.concrete_shape).astype(v.dtype) initializer(k, init_value) - params[k] = tvm.nd.array(init_value, ctx=tvm.cpu(0)) + params[k] = tvm.nd.array(init_value, device=tvm.cpu(0)) return mod, params diff --git a/python/tvm/relay/transform/memory_plan.py b/python/tvm/relay/transform/memory_plan.py index 7c7685d58f95..c97053f976aa 100644 --- a/python/tvm/relay/transform/memory_plan.py +++ b/python/tvm/relay/transform/memory_plan.py @@ -26,7 +26,7 @@ from .. import op, expr from ..function import Function from ... import register_func, ir, cpu -from ..._ffi.runtime_ctypes import TVMContext +from ..._ffi.runtime_ctypes import Device from ... import IRModule from .. import transform from . import function_pass @@ -54,7 +54,7 @@ class Region: size: expr.Expr alignment: Optional[expr.Expr] dtype: Optional[str] - ctx: TVMContext + device: Device offsets: Dict[expr.Var, Tuple[expr.Expr, expr.Expr]] @staticmethod @@ -69,7 +69,7 @@ def grow( old_storage: expr.Var, size: expr.Expr, alignment: expr.Expr, - ctx: TVMContext, + dev: Device, dtype: str, ) -> None: """Grow the region by a given allocation as well as track the old storage @@ -87,13 +87,14 @@ def grow( else: self.alignment = alignment - if self.ctx: + if self.device: assert ( - self.ctx.device_type == ctx.device_type and self.ctx.device_id == ctx.device_id - ), "must have matching context" + self.device.device_type == dev.device_type + and self.device.device_id == dev.device_id + ), "must have matching device" else: - assert ctx - self.ctx = ctx + assert dev + self.device = dev new_size = ( (size + self.alignment - expr.const(1, "int64")) / self.alignment * self.alignment @@ -116,8 +117,8 @@ def to_expr(self, body: expr.Expr) -> expr.Expr: all offset computations. """ - if self.ctx is None: - self.ctx = cpu(0) + if self.device is None: + self.device = cpu(0) # Generate bindings for each and every size computation # we must do this to maintain ANF. @@ -128,7 +129,7 @@ def to_expr(self, body: expr.Expr) -> expr.Expr: bindings.append((total_size, self.size)) # Allocate the entire region with a single call. - alloc = op.memory.alloc_storage(total_size, self.alignment, self.ctx, self.dtype) + alloc = op.memory.alloc_storage(total_size, self.alignment, self.device, self.dtype) bindings.append((self.var, alloc)) # Generate variables which contain all of the offset math. 
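
As a point of reference, a small sketch of the renamed `alloc_storage` call that the planner emits above; the size and alignment constants are made up:

.. code-block:: python

    import tvm
    from tvm.relay import expr, op

    size = expr.const(1024, dtype="int64")
    alignment = expr.const(64, dtype="int64")
    # The third argument is now a tvm.runtime.Device rather than a "ctx".
    storage = op.memory.alloc_storage(size, alignment, tvm.cpu(0), dtype_hint="float32")
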
@@ -279,21 +280,21 @@ def process_alloc_storage(self, dynamic_regions, lhs, call):
         """Process alloc_storage"""
         size, alignment = call.args
         dtype = call.attrs.dtype
-        ctx = TVMContext(call.attrs.device_type, call.attrs.device_id)
+        dev = Device(call.attrs.device_type, call.attrs.device_id)

         if not isinstance(size, expr.Constant):
             self.enter_scope()
             dynamic_regions.append(lhs)
         else:
             # A new scope is created when entering a new region with different
-            # device context.
+            # device type.
             region = self.current_region(dtype)
-            if region.ctx and region.ctx.device_type != ctx.device_type:
+            if region.device and region.device.device_type != dev.device_type:
                 self.enter_scope()
                 dynamic_regions.append(lhs)
                 region = self.current_region(dtype)

-        region.grow(lhs, size, alignment, ctx, dtype)
+        region.grow(lhs, size, alignment, dev, dtype)
         return lhs, region.var

     def process_alloc_tensor(self, lhs, call):
diff --git a/python/tvm/relay/transform/transform.py b/python/tvm/relay/transform/transform.py
index b61f209505d8..5b0e480f5f28 100644
--- a/python/tvm/relay/transform/transform.py
+++ b/python/tvm/relay/transform/transform.py
@@ -800,12 +800,36 @@ def gradient(expr, mod=None, mode="higher_order"):
         The transformed expression.
     """
     if mode == "first_order":
-        return _ffi_api.first_order_gradient(expr, mod)
+        warnings.warn(
+            "using transform.gradient for first-order AD is deprecated, please use the "
+            "FirstOrderGradient module pass",
+            DeprecationWarning,
+        )
+        if mod is not None:
+            raise RuntimeError(
+                "to run first-order AD on a module, please use the FirstOrderGradient module pass."
+            )
+        return FirstOrderGradient()(tvm.IRModule.from_expr(expr))["main"]
     if mode == "higher_order":
         return _ffi_api.gradient(expr, mod)
     raise Exception("unknown mode")


+def FirstOrderGradient():
+    """
+    Transforms all global functions in the module to return the original result, paired with the
+    gradients of the inputs. This pass transforms each global function independently and does not
+    support interprocedural AD. Additionally, this pass does not support any control flow or
+    references, and should only be used on pure data-flow graphs.
+
+    Returns
+    -------
+    ret : tvm.transform.Pass
+        The registered FirstOrderGradient pass.
+    """
+    return _ffi_api.FirstOrderGradient()
+
+
 def Defunctionalization(func, mod):
     """
     Performs defunctionalization on func,
diff --git a/python/tvm/rpc/client.py b/python/tvm/rpc/client.py
index a50f3b856800..7196f209712e 100644
--- a/python/tvm/rpc/client.py
+++ b/python/tvm/rpc/client.py
@@ -72,8 +72,8 @@ def get_function(self, name):
         """
         return self._sess.get_function(name)

-    def context(self, dev_type, dev_id=0):
-        """Construct a remote context.
+    def device(self, dev_type, dev_id=0):
+        """Construct a remote device.

         Parameters
         ----------
@@ -83,14 +83,14 @@

         Returns
         -------
-        ctx: TVMContext
-            The corresponding encoded remote context.
+        dev: Device
+            The corresponding encoded remote device.
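
A brief sketch of the renamed session method; the connection details are hypothetical. The returned device folds the session index into `device_type`, which is what "encoded" means here:

.. code-block:: python

    import tvm.rpc

    remote = tvm.rpc.connect("127.0.0.1", 9090)  # hypothetical address
    dev = remote.device("cpu", 0)                # replaces remote.context("cpu", 0)
    # The per-device helpers route through the same call:
    assert remote.cpu(0).device_type == dev.device_type
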
""" - ctx = nd.context(dev_type, dev_id) + dev = nd.device(dev_type, dev_id) encode = (self._tbl_index + 1) * base.RPC_SESS_MASK - ctx.device_type += encode - ctx._rpc_sess = self - return ctx + dev.device_type += encode + dev._rpc_sess = self + return dev def upload(self, data, target=None): """Upload file to remote runtime temp folder @@ -199,35 +199,35 @@ def download_linked_module(self, path): def cpu(self, dev_id=0): """Construct CPU device.""" - return self.context(1, dev_id) + return self.device(1, dev_id) def gpu(self, dev_id=0): """Construct GPU device.""" - return self.context(2, dev_id) + return self.device(2, dev_id) def cl(self, dev_id=0): """Construct OpenCL device.""" - return self.context(4, dev_id) + return self.device(4, dev_id) def vulkan(self, dev_id=0): """Construct Vulkan device.""" - return self.context(7, dev_id) + return self.device(7, dev_id) def metal(self, dev_id=0): """Construct Metal device.""" - return self.context(8, dev_id) + return self.device(8, dev_id) def ext_dev(self, dev_id=0): """Construct extension device.""" - return self.context(12, dev_id) + return self.device(12, dev_id) def hexagon(self, dev_id=0): """Construct Hexagon device.""" - return self.context(14, dev_id) + return self.device(14, dev_id) def webgpu(self, dev_id=0): """Construct WebGPU device.""" - return self.context(15, dev_id) + return self.device(15, dev_id) class LocalSession(RPCSession): diff --git a/python/tvm/runtime/__init__.py b/python/tvm/runtime/__init__.py index 7d58af70afe1..fe2ea2d593b5 100644 --- a/python/tvm/runtime/__init__.py +++ b/python/tvm/runtime/__init__.py @@ -20,12 +20,12 @@ from .packed_func import PackedFunc from .object import Object from .object_generic import ObjectGeneric, ObjectTypes -from .ndarray import NDArray, DataType, DataTypeCode, TVMContext +from .ndarray import NDArray, DataType, DataTypeCode, Device from .module import Module # function exposures from .object_generic import convert_to_object, convert, const -from .ndarray import context, cpu, gpu, opencl, cl, vulkan, metal, mtl +from .ndarray import device, cpu, gpu, opencl, cl, vulkan, metal, mtl from .ndarray import vpi, rocm, ext_dev, micro_dev from .module import load_module, enabled, system_lib from .container import String diff --git a/python/tvm/runtime/module.py b/python/tvm/runtime/module.py index 53576a60f32f..5165ae0854fa 100644 --- a/python/tvm/runtime/module.py +++ b/python/tvm/runtime/module.py @@ -168,7 +168,7 @@ def save(self, file_name, fmt=""): """ _ffi_api.ModuleSaveToFile(self, file_name, fmt) - def time_evaluator(self, func_name, ctx, number=10, repeat=1, min_repeat_ms=0, f_preproc=""): + def time_evaluator(self, func_name, dev, number=10, repeat=1, min_repeat_ms=0, f_preproc=""): """Get an evaluator that measures time cost of running function. Parameters @@ -176,8 +176,8 @@ def time_evaluator(self, func_name, ctx, number=10, repeat=1, min_repeat_ms=0, f func_name: str The name of the function in the module. - ctx: TVMContext - The context we should run this function on. + dev: Device + The device we should run this function on. number: int The number of times to run this function for taking average. 
@@ -215,8 +215,8 @@ def time_evaluator(self, func_name, ctx, number=10, repeat=1, min_repeat_ms=0, f feval = _ffi_api.RPCTimeEvaluator( self, func_name, - ctx.device_type, - ctx.device_id, + dev.device_type, + dev.device_id, number, repeat, min_repeat_ms, @@ -339,6 +339,9 @@ def export_library(self, file_name, fcompile=None, addons=None, workspace_dir=No else: assert module.type_key == "c" object_format = "c" + if "cc" in kwargs: + if kwargs["cc"] == "nvcc": + object_format = "cu" has_c_module = True path_obj = os.path.join(workspace_dir, f"lib{index}.{object_format}") module.save(path_obj) @@ -451,7 +454,7 @@ def load_module(path, fmt=""): files = [tar_temp.relpath(x) for x in tar_temp.listdir()] _cc.create_shared(path + ".so", files, cc=cc) path += ".so" - # TODO(weberlo): we should probably use a more distinctive suffix for uTVM object files + # TODO(weberlo): we should probably use a more distinctive suffix for microTVM object files elif path.endswith(".obj"): fmt = "micro_dev" # Redirect to the load API diff --git a/python/tvm/runtime/ndarray.py b/python/tvm/runtime/ndarray.py index 5c60515e3448..980f70d8dc53 100644 --- a/python/tvm/runtime/ndarray.py +++ b/python/tvm/runtime/ndarray.py @@ -14,14 +14,14 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -# pylint: disable=invalid-name, unused-import +# pylint: disable=invalid-name, unused-import, redefined-outer-name """Runtime NDArray API""" import ctypes import numpy as np import tvm._ffi from tvm._ffi.base import _LIB, check_call, c_array, string_types, _FFI_MODE -from tvm._ffi.runtime_ctypes import DataType, TVMContext, TVMArray, TVMArrayHandle +from tvm._ffi.runtime_ctypes import DataType, Device, TVMArray, TVMArrayHandle from tvm._ffi.runtime_ctypes import DataTypeCode, tvm_shape_index_t from . import _ffi_api @@ -58,14 +58,9 @@ def dtype(self): return str(self.handle.contents.dtype) @property - def ctx(self): - """context of this array""" - return self.handle.contents.ctx - - @property - def context(self): - """context of this array""" - return self.ctx + def device(self): + """Device of this array""" + return self.handle.contents.device def __hash__(self): return ctypes.cast(self.handle, ctypes.c_void_p).value @@ -158,7 +153,7 @@ def copyfrom(self, source_array): return self def __repr__(self): - res = "\n".format(self.shape, self.context) + res = "\n".format(self.shape, self.device) res += self.asnumpy().__repr__() return res @@ -196,14 +191,14 @@ def copyto(self, target): """ if isinstance(target, NDArrayBase): return self._copyto(target) - if isinstance(target, TVMContext): + if isinstance(target, Device): res = empty(self.shape, self.dtype, target) return self._copyto(res) raise ValueError("Unsupported target type %s" % str(type(target))) -def context(dev_type, dev_id=0): - """Construct a TVM context with given device type and id. +def device(dev_type, dev_id=0): + """Construct a TVM device with given device type and id. Parameters ---------- @@ -215,29 +210,29 @@ def context(dev_type, dev_id=0): Returns ------- - ctx: tvm.runtime.TVMContext - The corresponding context. + dev: tvm.runtime.Device + The corresponding device. Examples -------- - Context can be used to create reflection of context by + Device can be used to create reflection of device by string representation of the device type. .. 
code-block:: python - assert tvm.context("cpu", 1) == tvm.cpu(1) - assert tvm.context("gpu", 0) == tvm.gpu(0) - assert tvm.context("cuda", 0) == tvm.gpu(0) + assert tvm.device("cpu", 1) == tvm.cpu(1) + assert tvm.device("gpu", 0) == tvm.gpu(0) + assert tvm.device("cuda", 0) == tvm.gpu(0) """ if isinstance(dev_type, string_types): if "-device=micro_dev" in dev_type: - dev_type = TVMContext.STR2MASK["micro_dev"] + dev_type = Device.STR2MASK["micro_dev"] else: dev_type = dev_type.split()[0] - if dev_type not in TVMContext.STR2MASK: + if dev_type not in Device.STR2MASK: raise ValueError("Unknown device type %s" % dev_type) - dev_type = TVMContext.STR2MASK[dev_type] - return TVMContext(dev_type, dev_id) + dev_type = Device.STR2MASK[dev_type] + return Device(dev_type, dev_id) def numpyasarray(np_data): @@ -252,11 +247,11 @@ def numpyasarray(np_data): arr.dtype = DataType(np.dtype(data.dtype).name) arr.ndim = data.ndim # CPU device - arr.ctx = context(1, 0) + arr.device = device(1, 0) return arr, shape -def empty(shape, dtype="float32", ctx=context(1, 0), mem_scope=None): +def empty(shape, dtype="float32", device=device(1, 0), mem_scope=None): """Create an empty array given shape and device Parameters @@ -267,8 +262,8 @@ def empty(shape, dtype="float32", ctx=context(1, 0), mem_scope=None): dtype : type or str The data type of the array. - ctx : TVMContext - The context of the array. + device : Device + The device of the array. mem_scope : Optional[str] The memory scope of the array. @@ -289,7 +284,7 @@ def empty(shape, dtype="float32", ctx=context(1, 0), mem_scope=None): shape_ptr = ctypes.cast(ptr, ctypes.c_void_p) ndim = len(shape_imm) dtype = DataType(dtype) - arr = _ffi_api.TVMArrayAllocWithScope(shape_ptr, ndim, dtype, ctx, mem_scope) + arr = _ffi_api.TVMArrayAllocWithScope(shape_ptr, ndim, dtype, device, mem_scope) return arr @@ -322,10 +317,10 @@ def cpu(dev_id=0): Returns ------- - ctx : TVMContext - The created context + dev : Device + The created device """ - return TVMContext(1, dev_id) + return Device(1, dev_id) def gpu(dev_id=0): @@ -338,10 +333,10 @@ def gpu(dev_id=0): Returns ------- - ctx : TVMContext - The created context + dev : Device + The created device """ - return TVMContext(2, dev_id) + return Device(2, dev_id) def rocm(dev_id=0): @@ -354,10 +349,10 @@ def rocm(dev_id=0): Returns ------- - ctx : TVMContext - The created context + dev : Device + The created device """ - return TVMContext(10, dev_id) + return Device(10, dev_id) def opencl(dev_id=0): @@ -370,10 +365,10 @@ def opencl(dev_id=0): Returns ------- - ctx : TVMContext - The created context + dev : Device + The created device """ - return TVMContext(4, dev_id) + return Device(4, dev_id) def metal(dev_id=0): @@ -386,10 +381,10 @@ def metal(dev_id=0): Returns ------- - ctx : TVMContext - The created context + dev : Device + The created device """ - return TVMContext(8, dev_id) + return Device(8, dev_id) def vpi(dev_id=0): @@ -402,10 +397,10 @@ def vpi(dev_id=0): Returns ------- - ctx : TVMContext - The created context + dev : Device + The created device """ - return TVMContext(9, dev_id) + return Device(9, dev_id) def vulkan(dev_id=0): @@ -418,10 +413,10 @@ def vulkan(dev_id=0): Returns ------- - ctx : TVMContext - The created context + dev : Device + The created device """ - return TVMContext(7, dev_id) + return Device(7, dev_id) def ext_dev(dev_id=0): @@ -434,15 +429,15 @@ def ext_dev(dev_id=0): Returns ------- - ctx : TVMContext - The created context + dev : Device + The created device Note ---- This API is reserved 
for quick testing of new device by plugin device API as ext_dev. """ - return TVMContext(12, dev_id) + return Device(12, dev_id) def micro_dev(dev_id=0): @@ -455,10 +450,10 @@ def micro_dev(dev_id=0): Returns ------- - ctx : TVMContext - The created context + dev : Device + The created device """ - return TVMContext(13, dev_id) + return Device(13, dev_id) def hexagon(dev_id=0): @@ -471,10 +466,10 @@ def hexagon(dev_id=0): Returns ------- - ctx : TVMContext - The created context + dev : Device + The created device """ - return TVMContext(14, dev_id) + return Device(14, dev_id) def webgpu(dev_id=0): @@ -487,17 +482,17 @@ def webgpu(dev_id=0): Returns ------- - ctx : TVMContext - The created context + dev : Device + The created device """ - return TVMContext(15, dev_id) + return Device(15, dev_id) cl = opencl mtl = metal -def array(arr, ctx=cpu(0)): +def array(arr, device=cpu(0)): """Create an array from source arr. Parameters @@ -505,8 +500,8 @@ def array(arr, ctx=cpu(0)): arr : numpy.ndarray The array to be copied from - ctx : TVMContext, optional - The device context to create the array + device : Device, optional + The device device to create the array Returns ------- @@ -515,7 +510,7 @@ def array(arr, ctx=cpu(0)): """ if not isinstance(arr, (np.ndarray, NDArray)): arr = np.array(arr) - return empty(arr.shape, arr.dtype, ctx).copyfrom(arr) + return empty(arr.shape, arr.dtype, device).copyfrom(arr) # Register back to FFI diff --git a/python/tvm/runtime/profiler_vm.py b/python/tvm/runtime/profiler_vm.py index 5df10e55f848..e70ef320289d 100644 --- a/python/tvm/runtime/profiler_vm.py +++ b/python/tvm/runtime/profiler_vm.py @@ -32,15 +32,15 @@ def enabled(): class VirtualMachineProfiler(vm.VirtualMachine): """Relay profile VM runtime.""" - def __init__(self, exe, ctx, memory_cfg=None): - super(VirtualMachineProfiler, self).__init__(exe, ctx, memory_cfg) + def __init__(self, exe, device, memory_cfg=None): + super(VirtualMachineProfiler, self).__init__(exe, device, memory_cfg) self.module = _ffi_api._VirtualMachineDebug(exe.module) self._init = self.module["init"] self._invoke = self.module["invoke"] self._get_stat = self.module["get_stat"] self._set_input = self.module["set_input"] self._reset = self.module["reset"] - self._setup_ctx(ctx, memory_cfg) + self._setup_device(device, memory_cfg) def get_stat(self, sort_by_time=True): """Get the statistics of executed ops. diff --git a/python/tvm/runtime/vm.py b/python/tvm/runtime/vm.py index d641e52d7184..a503da53c465 100644 --- a/python/tvm/runtime/vm.py +++ b/python/tvm/runtime/vm.py @@ -33,7 +33,7 @@ def _convert(arg, cargs): if isinstance(arg, Object): cargs.append(arg) elif isinstance(arg, np.ndarray): - nd_arr = tvm.nd.array(arg, ctx=tvm.cpu(0)) + nd_arr = tvm.nd.array(arg, device=tvm.cpu(0)) cargs.append(nd_arr) elif isinstance(arg, tvm.runtime.NDArray): cargs.append(arg) @@ -44,7 +44,7 @@ def _convert(arg, cargs): cargs.append(container.tuple_object(field_args)) elif isinstance(arg, (_base.numeric_types, bool)): dtype = "int32" if isinstance(arg, (int, bool)) else "float32" - value = tvm.nd.array(np.array(arg, dtype=dtype), ctx=tvm.cpu(0)) + value = tvm.nd.array(np.array(arg, dtype=dtype), device=tvm.cpu(0)) cargs.append(value) else: raise TypeError("Unsupported type: %s" % (type(arg))) @@ -115,7 +115,7 @@ def save(self): f = relay.Function([x], x + x) mod = tvm.IRModule({"main": f}) # create a Relay VM. 
- ctx = tvm.cpu() + dev = tvm.cpu() target = "llvm" executable = relay.vm.compile(mod, target) code, lib = executable.save() @@ -131,7 +131,7 @@ def save(self): des_exec = tvm.runtime.vm.Executable.load_exec(loaded_code, loaded_lib) # execute the deserialized executable. x_data = np.random.rand(10, 10).astype('float32') - des_vm = tvm.runtime.vm.VirtualMachine(des_exec, ctx) + des_vm = tvm.runtime.vm.VirtualMachine(des_exec, dev) res = des_vm.run(x_data) print(res.asnumpy()) """ @@ -283,14 +283,14 @@ class VirtualMachine(object): exe : Executable The VM executable. - ctx : tvm.runtime.TVMContext or List[tvm.runtime.TVMContext] - The context to deploy the module + device : tvm.runtime.Device or List[tvm.runtime.Device] + The device to deploy the module - memory_cfg : str or Dict[tvm.runtime.TVMContext, str], optional + memory_cfg : str or Dict[tvm.runtime.Device, str], optional Config the type of memory allocator. The allocator type can be ["naive", - "pooled"]. If memory_cfg is None, all contexts will use pooled allocator - by default. If memory_cfg is string, all contexts will use the specified - allocator type. If memory_cfg is a dict, each context uses the allocator + "pooled"]. If memory_cfg is None, all devices will use pooled allocator + by default. If memory_cfg is string, all devices will use the specified + allocator type. If memory_cfg is a dict, each device uses the allocator type specified in the dict, or pooled allocator if not specified in the dict. """ @@ -298,7 +298,7 @@ class VirtualMachine(object): NAIVE_ALLOCATOR = 1 POOLED_ALLOCATOR = 2 - def __init__(self, exe, ctx, memory_cfg=None): + def __init__(self, exe, device, memory_cfg=None): if not isinstance(exe, Executable): raise TypeError( "exe is expected to be the type of Executable, " @@ -309,22 +309,22 @@ def __init__(self, exe, ctx, memory_cfg=None): self._init = self.module["init"] self._invoke = self.module["invoke"] self._set_input = self.module["set_input"] - self._setup_ctx(ctx, memory_cfg) + self._setup_device(device, memory_cfg) - def _setup_ctx(self, ctx, memory_cfg): - """Init context and allocators.""" - ctxs = ctx - if not isinstance(ctx, (list, tuple)): - if not isinstance(ctx, tvm.runtime.TVMContext): + def _setup_device(self, dev, memory_cfg): + """Init devices and allocators.""" + devs = dev + if not isinstance(dev, (list, tuple)): + if not isinstance(dev, tvm.runtime.Device): raise TypeError( - "ctx is expected to be TVMContext or \ - List[TVMContext]" + "dev is expected to be Device or \ + List[Device]" ) - ctxs = [ctx] + devs = [dev] # CPU is required for executing shape functions - if not any(c.device_type == tvm.cpu().device_type for c in ctxs): - ctxs.append(tvm.cpu()) + if not any(c.device_type == tvm.cpu().device_type for c in devs): + devs.append(tvm.cpu()) default_alloc_type = VirtualMachine.POOLED_ALLOCATOR if memory_cfg is None: @@ -340,10 +340,10 @@ def _setup_ctx(self, ctx, memory_cfg): + "but received {}".format(type(memory_cfg)) ) init_args = [] - for context in ctxs: - init_args.append(context.device_type) - init_args.append(context.device_id) - alloc_type = memory_cfg[context] if context in memory_cfg else default_alloc_type + for device in devs: + init_args.append(device.device_type) + init_args.append(device.device_id) + alloc_type = memory_cfg[device] if device in memory_cfg else default_alloc_type init_args.append(alloc_type) self._init(*init_args) diff --git a/python/tvm/script/context_maintainer.py b/python/tvm/script/context_maintainer.py index 955266c4a3e0..ae3e9d885f1a 100644 
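
A small sketch of constructing the VM with the renamed device arguments; `exe` stands for an executable produced by `relay.vm.compile` and `x_data` for an input array, both hypothetical:

.. code-block:: python

    import tvm
    from tvm.runtime import vm

    dev = tvm.device("cuda", 0)  # assumes a CUDA-enabled build
    # A CPU device is appended automatically if absent, for shape functions.
    machine = vm.VirtualMachine(exe, [dev], memory_cfg="naive")
    res = machine.run(x_data)
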
--- a/python/tvm/script/context_maintainer.py
+++ b/python/tvm/script/context_maintainer.py
@@ -16,59 +16,217 @@
 # under the License.
 """TVM Script Context Maintainer for TIR"""

-from tvm.te import schedule
+from typing import List, Mapping, Union, Optional, Dict, Callable
+import synr
+
+
+import tvm
+from tvm.ir import Span
+from tvm.tir import Var, Buffer, PrimExpr, Stmt, MatchBufferRegion
+from tvm.runtime import Object
+from .node import BufferSlice
+
+
+class BlockInfo:
+    """Information for block and block_realize signature
+
+    Examples
+    --------
+    .. code-block:: python
+
+        @tvm.script.tir
+        def example_func(a: ty.handle, b: ty.handle, c: ty.handle) -> None:
+            A = tir.match_buffer(a, (16, 16), "float32")
+            B = tir.match_buffer(b, (16, 16), "float32")
+            C = tir.match_buffer(c, (16, 16), "float32")
+
+            for i, j, k in tir.grid(16, 16, 16):
+                with tir.block([16, 16, tir.reduce_axis(16)], "matmul") as [vi, vj, vk]:
+                    tir.bind(vi, i)
+                    tir.bind(vj, j)
+                    tir.bind(vk, k)  # iter_bindings = {vi: i, vj: j, vk: k}
+
+                    tir.where(True)  # predicate of the block_realize
+
+                    tir.reads(A[0:16, 0:16], B[0:16, 0:16])  # reads region of the block
+                    tir.writes(C[0:16, 0:16])  # writes region of the block
+                    tir.block_attr({"attr_key": "attr_value"})  # block annotations
+
+                    # alloc_buffers inside the block
+                    CC = tir.alloc_buffer((1, 1), dtype="float32")
+
+                    # match_buffers of the block,
+                    # which bind a sub-region of a source buffer to a new buffer
+                    D = tir.match_buffer_region(C[vi, vj])
+
+                    # init part of the block, executed when all reduce axes are at their
+                    # initial value
+                    with tir.init():
+                        C[vi, vj] = tir.float32(0)
+
+                    # block body
+                    CC[0, 0] = A[vi, vk] * B[vj, vk]
+                    D[0, 0] += CC[0, 0]  # The same as C[vi, vj] += CC[0, 0]
+    """

+    alloc_buffers: List[Buffer] = []
+    """List[Buffer]: list of tir.alloc_buffer statements in the block signature"""
+    match_buffers: List[MatchBufferRegion] = []
+    """List[MatchBufferRegion]: list of tir.match_buffer_region statements in the block signature"""
+    iter_bindings: Mapping[Var, PrimExpr] = {}
+    """Mapping[Var, PrimExpr]: map of block iter var to its values"""
+    reads: Optional[List[BufferSlice]] = None
+    """Optional[List[BufferSlice]]:
+    list of tir.reads statements in the block signature, None for not-visited"""
+    writes: Optional[List[BufferSlice]] = None
+    """Optional[List[BufferSlice]]:
+    list of tir.writes statements in the block signature, None for not-visited"""
+    annotations: Optional[Mapping[str, Object]] = None
+    """Optional[Mapping[str, Object]]:
+    list of tir.block_attr statements in the block signature, None for not-visited"""
+    predicate: Optional[PrimExpr] = None
+    """Optional[PrimExpr]: block realize predicate, None for not-visited"""
+    init: Optional[Stmt] = None
+    """Optional[Stmt]: init part of the block, None for not-visited"""
+
+    def __init__(self):
+        self.alloc_buffers = []
+        self.match_buffers = []
+        self.iter_bindings = {}
+        self.reads = None
+        self.writes = None
+        self.annotations = None
+        self.predicate = None
+        self.init = None


 class ContextMaintainer:
-    """Maintain all the necessary context info"""
+    """Maintain all the necessary context info
+
+    Parameters
+    ----------
+    _report_error : Callable[[str, Union[Span, synr.ast.Span]], None]
+        The report error function handle
+    """
+
+    # scope context
+    node_stack: List[List[synr.ast.Node]] = []
+    """List[List[synr.ast.Node]]: The AST nodes inside the current scope"""
+    block_info_stack: List[BlockInfo] = []
+    """List[BlockInfo]: The block info for the current block scope"""
+
loop_stack: List[List[Var]] = [] + """List[List[Var]]: List of loop vars inside the current block scope""" + symbols: List[Dict[str, Union[Var, Buffer]]] = [] + """List[Dict[str, Union[Var, Buffer]]]: Symbol map from name to object for the current scope""" - def __init__(self, parser): + # function context + func_params: List[Var] = [] + """List[Var]: The function parameters""" + func_buffer_map: Mapping[Var, Buffer] = {} + """Mapping[Var, Buffer]: The function buffer map""" + func_dict_attr: Mapping[str, Object] = {} + """Mapping[str, Object]: The function attrs""" + func_var_env_dict: Mapping[Var, str] = {} + """Mapping[Var, str]: The map from var to env thread""" + + # parser and analyzer + analyzer: tvm.arith.Analyzer = tvm.arith.Analyzer() + """tvm.arith.Analyzer: The analyzer for simplifying""" + _report_error: Callable[[str, Union[Span, synr.ast.Span]], None] + """Callable[[str, Union[Span, synr.ast.Span]], None]: The report error function handle""" + + def __init__(self, _report_error: Callable[[str, Union[Span, synr.ast.Span]], None]): # scope context - self.node_stack = [] # AST nodes of scopes - self.symbols = [] # symbols of scopes + self.node_stack = [] + self.block_info_stack = [] + self.loop_stack = [] + self.symbols = [] # function context - self.func_params = [] # parameter list of function - self.func_buffer_map = {} # buffer_map of function - self.func_dict_attr = {} # func_attr of function - self.func_var_env_dict = {} # map from var to env_name - # parser - self.parser = parser - - def pop_scope(self): - """Pop the inner most scope""" - self.symbols.pop() - self.node_stack.pop() + self.func_params = [] + self.func_buffer_map = {} + self.func_dict_attr = {} + self.func_var_env_dict = {} + # parser and analyzer + self._report_error = _report_error + self.analyzer = tvm.arith.Analyzer() + + def enter_scope(self, nodes: Optional[List[synr.ast.Node]] = None): + """Creates a new scope - def new_scope(self, nodes=None): - """Creating a new scope""" + Note + ---- + This function is used for normal scopes that do not involve + a `with block` scope. Use `enter_block_scope` + for block scope cases. + + Parameters + ---------- + nodes : Optional[List[synr.ast.Node]] + The synr AST nodes in new scope + """ if nodes is None: nodes = [] self.node_stack.append(list(reversed(nodes))) self.symbols.append(dict()) - def update_symbol(self, name, symbol): + def enter_block_scope(self, nodes: Optional[List[synr.ast.Node]] = None): + """Creates a new block scope, the function will call `enter_scope` implicitly + Besides the behaviors of `enter_scope`, it will update loop_stack and block_info_stack + to maintain block info. + + Note + ---- + This function should be used to handle a block scope, + aka the blocks that involve a `with block` scope. 
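+
+ For instance (an illustrative sketch rather than actual parser code, with
+ `maintainer` and `parse_body` as assumed names), a handler for a
+ `with tir.block()` node is expected to pair the calls like this:
+
+ .. code-block:: python
+
+ maintainer.enter_block_scope(nodes=node.body.stmts)
+ body = parse_body(node) # visit the statements inside the block
+ block_info = maintainer.current_block_scope() # reads/writes/init gathered so far
+ maintainer.exit_block_scope() # pops loop_stack and block_info_stack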
+ + Parameters + ---------- + nodes : Optional[List[synr.ast.Node]] + The synr AST nodes in new scope + """ + self.enter_scope(nodes) + # Create a new loop stack for the new block + self.loop_stack.append([]) + # Create a new BlockInfo for the new block + self.block_info_stack.append(BlockInfo()) + + def exit_scope(self): + """Pop the inner most scope""" + self.symbols.pop() + self.node_stack.pop() + + def exit_block_scope(self): + """Pop the inner most block scope, the function will call `exit_scope` implicitly""" + self.exit_scope() + # Pop loop stack + self.loop_stack.pop() + # Pop block_info + self.block_info_stack.pop() + + def update_symbol(self, name: str, symbol: Union[Buffer, Var], node: synr.ast.Node): """Append a symbol into current scope""" - if isinstance(symbol, schedule.Buffer): + if isinstance(symbol, Buffer): if name in self.symbols[0]: - self.parser.report_error("Duplicate Buffer name") + self.report_error("Duplicate Buffer name: " + symbol.name, node.span) self.symbols[0][name] = symbol else: self.symbols[-1][name] = symbol - def remove_symbol(self, name): + def remove_symbol(self, name: str): """Remove a symbol""" for symbols in reversed(self.symbols): if name in symbols: symbols.pop(name) return - raise RuntimeError("Internal error of tvm script parser: no symbol named" + name) + raise RuntimeError("Internal error of tvm script parser: no symbol named " + name) - def lookup_symbol(self, name): + def lookup_symbol(self, name: str) -> Optional[Union[Buffer, Var]]: """Look up symbol by name""" for symbols in reversed(self.symbols): if name in symbols: return symbols[name] return None - def report_error(self, message, span): - self.parser.report_error(message, span) + def report_error(self, message: str, span: Union[Span, synr.ast.Span]): + self._report_error(message, span) + + def current_block_scope(self) -> BlockInfo: + return self.block_info_stack[-1] diff --git a/python/tvm/script/intrin.py b/python/tvm/script/intrin.py index 053cd4a45846..48f50a2da442 100644 --- a/python/tvm/script/intrin.py +++ b/python/tvm/script/intrin.py @@ -16,9 +16,11 @@ # under the License. """TVM Script Parser Intrinsic Classes""" # pylint: disable=redefined-builtin, relative-beyond-top-level +from typing import List, Any + import tvm.tir from .registry import register -from .utils import get_param_list, from_synr_span +from .utils import get_param_list, tvm_span_from_synr class Intrin: @@ -29,8 +31,8 @@ def __init__(self, intrin, stmt=False): def signature(self): return "tir." 
+ self.intrin.__name__, get_param_list(self.intrin) - def handle(self, arg_list, span): - return self.intrin(*arg_list, span=from_synr_span(span)) + def handle(self, arg_list: List[Any], span: tvm.ir.Span): + return self.intrin(*arg_list, span=tvm_span_from_synr(span)) @register @@ -98,6 +100,16 @@ def float64(imm, span): return tvm.tir.Cast("float64", imm, span) +@register +def min_value(dtype, span): + return tvm.tir.min_value(dtype, span) + + +@register +def max_value(dtype, span): + return tvm.tir.max_value(dtype, span) + + @register def floordiv(x, y, span): return tvm.tir.floordiv(x, y, span) @@ -145,7 +157,7 @@ def get_axis(begin, end, iter_type, span): block_var_dom = tvm.ir.Range.from_min_extent(begin, extent) iter_type_dict = {"data_par": 0, "reduce": 2, "scan": 3, "opaque": 4} - return tvm.tir.IterVar(block_var_dom, "bv", iter_type_dict[iter_type], span) + return tvm.tir.IterVar(block_var_dom, "bv", iter_type_dict[iter_type], span=span) @register diff --git a/python/tvm/script/node.py b/python/tvm/script/node.py new file mode 100644 index 000000000000..039eeb452ddb --- /dev/null +++ b/python/tvm/script/node.py @@ -0,0 +1,150 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=redefined-builtin +"""TVM Script nodes.""" + +from typing import Optional, Union, List, Callable +import synr + +from tvm.runtime import ObjectGeneric +from tvm.tir import PrimExpr, Buffer, BufferLoad +from tvm.ir import Span + + +class Slice: + """A helper class to present slice information for BufferSlice + + Parameters + ---------- + start : Union[PrimExpr, int] + The start index. + + stop : Optional[Union[PrimExpr, int]] + The stop index, None means the Slice is an element-wise index + + span : Optional[Span] + The location of the slice in the source. + """ + + start: Union[PrimExpr, int] + stop: Optional[Union[PrimExpr, int]] + span: Optional[Span] + + def __init__( + self, + start: Union[PrimExpr, int], + stop: Optional[Union[PrimExpr, int]] = None, + span: Optional[Span] = None, + ): + self.start = start + self.stop = stop + self.span = span + + +class BufferSlice(ObjectGeneric): + """A generic object for representing general buffer access. Following cases are supported: + - element wise access buffer[i, j], which can be converted to BufferLoad if necessary + - slice access buffer[i: i + 1, j : j + 2] + - union of element and slice buffer[i, j: j + 2] + + This node is used in TVMScript to parse BufferLoad, BufferRegion and Realize + + Parameters + ---------- + buffer : Buffer + The buffer. + + indices : List[Union[Slice, PrimExpr, int]] + The access indexes can be slice, PrimExpr or int. 
+ + report_error: Callable[[str, Union[Span, synr.ast.Span]], None] + The error report func + + span : Optional[Span] + The location of the buffer access in the source. + """ + + buffer: Buffer + slices: List[Slice] + report_error: Callable[[str, Union[Span, synr.ast.Span]], None] + span: Optional[Span] + + def __init__( + self, + buffer: Buffer, + indices: List[Union[Slice, PrimExpr, int]], + report_error: Callable[[str, Union[Span, synr.ast.Span]], None], + span: Optional[Span] = None, + ): + def check_index(index: Union[int, PrimExpr]): + """ Check input index is non-negative integer or PrimExpr""" + if isinstance(index, int): + if index < 0: + report_error("Negative index is not allowed during buffer access", span) + elif isinstance(index, PrimExpr): + if index.dtype != "int32": + report_error( + "index expected an int32 type PrimExpr but got " + str(index.dtype), + index.span, + ) + else: + report_error( + "Unsupported index type, expected int or tvm.tir.PrimExpr, but got " + + str(type(index)), + span, + ) + + slices: List[Slice] = [] + for index in indices: + if isinstance(index, Slice): + check_index(index.start) + check_index(index.stop) + slices.append(index) + elif isinstance(index, (PrimExpr, int)): + check_index(index) + slices.append(Slice(index)) + else: + report_error( + "Unsupported index type for BufferSlice, " + + "expected int, tvm.tir.PrimExpr, tvm.tir.Slice, but got " + + str(type(index)), + span, + ) + + self.buffer = buffer + self.slices = slices + self.report_error = report_error + self.span = span + + def __str__(self): + regions: List[str] = [] + for s in self.slices: + if s.stop is None: + regions.append(str(s.start)) + else: + regions.append(str(s.start) + ": " + str(s.stop)) + + return self.buffer.name + "[" + ", ".join(regions) + "]" + + def asobject(self) -> BufferLoad: + """Convert object.""" + for s in self.slices: + if s.stop is not None: + self.report_error("BufferLoad only accepts elementwise access", self.span) + + indices = [s.start for s in self.slices] + return BufferLoad(self.buffer, indices, span=self.span) diff --git a/python/tvm/script/parser.py b/python/tvm/script/parser.py index 33b0bab0d7e7..8f6d338238b7 100644 --- a/python/tvm/script/parser.py +++ b/python/tvm/script/parser.py @@ -24,6 +24,7 @@ import json import operator import inspect +from typing import Union from synr import ast, Transformer, to_ast import tvm @@ -32,6 +33,7 @@ from tvm.ir import GlobalVar from . import context_maintainer, ty +from .context_maintainer import BlockInfo from .meta_unparser import MetaUnparser from .registry import Registry from .intrin import Intrin @@ -39,7 +41,8 @@ from .scope_handler import ScopeHandler, WithScopeHandler, ForScopeHandler from . 
import _ffi_api
from .diagnostics import TVMDiagnosticCtx
-from .utils import from_synr_span
+from .utils import tvm_span_from_synr, synr_span_from_tvm, call_with_error_reporting
+from .node import Slice, BufferSlice


class CallArgumentReader(object):
@@ -158,7 +161,7 @@ def __init__(self, base_lienno):

def init_function_parsing_env(self):
"""Initialize function parsing environment"""
- self.context = context_maintainer.ContextMaintainer(self) # scope emitter
+ self.context = context_maintainer.ContextMaintainer(self.report_error) # scope emitter

def init_meta(self, meta_dict):
if meta_dict is not None:
@@ -182,7 +185,7 @@ def transform(self, node):

return transform_res

- def report_error(self, message, span):
+ def report_error(self, message: str, span: Union[ast.Span, tvm.ir.Span]):
"""Report an error occurring at a location.

This just dispatches to synr's DiagnosticContext.

@@ -191,9 +194,11 @@
----------
message : str
Error message
- span : synr.ast.Span
+ span : Union[synr.ast.Span, tvm.ir.Span]
Location of the error
"""
+ if isinstance(span, tvm.ir.Span):
+ span = synr_span_from_tvm(span)
self.error(message, span)

def parse_body(self, parent):
@@ -221,7 +226,7 @@ def parse_body(self, parent):
)
else:
return (
- tvm.tir.SeqStmt(body, from_synr_span(ast.Span.union(spans)))
+ tvm.tir.SeqStmt(body, tvm_span_from_synr(ast.Span.union(spans)))
if len(body) > 1
else body[0]
)
@@ -270,6 +275,13 @@ def parse_arg_list(self, func, node_call):
internal_args.append(reader.get_kwarg(i + 1 + len(pos_only), arg_name, default=default))
if varargs is not None:
internal_args.extend(reader.get_varargs(len(pos_only) + len(kwargs) + 1))
+ elif len(args) + len(kw_args) > len(pos_only) + len(kwargs):
+ self.report_error(
+ "Arguments mismatched. "
+ + f"Expected {len(pos_only) + len(kwargs)} args but got "
+ + f"{len(args) + len(kw_args)}",
+ node_call.span,
+ )
return internal_args

def parse_type(self, type_node, parent):
@@ -401,25 +413,52 @@ def my_function(x: ty.handle): # 1. Argument types
"""
self.init_function_parsing_env()
- self.context.new_scope(nodes=node.body.stmts)
+ self.context.enter_scope(nodes=node.body.stmts)
# add parameters of function
for arg in node.params:
arg_var = tvm.te.var(arg.name, self.parse_type(arg.ty, arg))
- self.context.update_symbol(arg.name, arg_var)
+ self.context.update_symbol(arg.name, arg_var, node)
self.context.func_params.append(arg_var)

- # fetch the body and return a tir.PrimFunc
+ # New Scope : Implicit root block
+ # Each function contains an implicit root block in TensorIR,
+ # so here we need a block scope for it. Please note that `enter_block_scope`
+ # will not create a block directly but just stores some information.
+ # If the PrimFunc is not a TensorIR func (e.g. TE scheduled func or low-level func),
+ # the root block will not be added.
The logic to add root block is in `_ffi_api.Complete` + self.context.enter_block_scope(nodes=node.body.stmts) + + # fetch the body of root block + body = self.parse_body(node.body) + # Emit Scope : Implicit root block + root_info: BlockInfo = self.context.current_block_scope() + self.context.exit_block_scope() + + # return a tir.PrimFunc + dict_attr = self.context.func_dict_attr func = tvm.tir.PrimFunc( self.context.func_params, - self.parse_body(node.body), + body, ret_type=self.parse_type(node.ret_type, node), buffer_map=self.context.func_buffer_map, - attrs=tvm.ir.make_node("DictAttrs", **self.context.func_dict_attr), - span=from_synr_span(node.span), + attrs=tvm.ir.make_node("DictAttrs", **dict_attr) if dict_attr else None, + span=tvm_span_from_synr(node.span), + ) + + # Fix the PrimFunc + # 1. generate root block if necessary + # 2. generate surrounding loops for blocks if necessary + + func = call_with_error_reporting( + self.report_error, + node.span, + _ffi_api.Complete, + func, + root_info.alloc_buffers, ) - self.context.pop_scope() + self.context.exit_scope() return func def transform_Assign(self, node): @@ -470,12 +509,12 @@ def transform_Assign(self, node): var = tvm.te.var( node.lhs.id.name, self.parse_type(node.ty, node.lhs), - span=from_synr_span(node.lhs.span), + span=tvm_span_from_synr(node.lhs.span), ) - self.context.update_symbol(var.name, var) + self.context.update_symbol(var.name, var, node) body = self.parse_body(node) self.context.remove_symbol(var.name) - return tvm.tir.LetStmt(var, value, body, span=from_synr_span(node.span)) + return tvm.tir.LetStmt(var, value, body, span=tvm_span_from_synr(node.span)) self.report_error("Unsupported Assign stmt", node.span) @@ -484,28 +523,28 @@ def transform_SubscriptAssign(self, node): symbol = self.transform(node.params[0]) indexes = self.transform(node.params[1]) rhs = self.transform(node.params[2]) - rhs_span = from_synr_span(node.params[2].span) + rhs_span = tvm_span_from_synr(node.params[2].span) if isinstance(symbol, tvm.tir.Buffer): # BufferStore return tvm.tir.BufferStore( symbol, tvm.runtime.convert(rhs, span=rhs_span), indexes, - span=from_synr_span(node.span), + span=tvm_span_from_synr(node.span), ) else: if len(indexes) != 1: self.report_error( f"Store is only allowed with one index, but {len(indexes)} were provided.", - Span.union([x.span for x in indexes]), + tvm.ir.Span.union([x.span for x in indexes]), ) # Store return tvm.tir.Store( symbol, tvm.runtime.convert(rhs, span=rhs_span), indexes[0], - tvm.runtime.convert(True, span=from_synr_span(node.span)), - span=from_synr_span(node.span), + tvm.runtime.convert(True, span=tvm_span_from_synr(node.span)), + span=tvm_span_from_synr(node.span), ) def transform_Assert(self, node): @@ -520,7 +559,7 @@ def transform_Assert(self, node): message = self.transform(node.msg) body = self.parse_body(node) return tvm.tir.AssertStmt( - condition, tvm.runtime.convert(message), body, span=from_synr_span(node.span) + condition, tvm.runtime.convert(message), body, span=tvm_span_from_synr(node.span) ) def transform_For(self, node): @@ -529,7 +568,8 @@ def transform_For(self, node): For(expr target, expr iter, stmt* body, stmt* orelse, string? type_comment) By now 1 pattern of For is supported: 1. 
for scope handler - for name in tir.serial()/tir.parallel()/tir.vectorized()/tir.unroll() + for name in tir.serial()/tir.parallel()/tir.vectorized()/tir.unroll()/tir.range()/ + tir.grid()/tir.thread_binding() """ if not isinstance(node.rhs, ast.Call): @@ -543,14 +583,14 @@ def transform_For(self, node): old_lineno, old_col_offset = self.current_lineno, self.current_col_offset self.current_lineno = node.span.start_line self.current_col_offset = node.span.start_column - self.context.new_scope(nodes=node.body.stmts) + self.context.enter_scope(nodes=node.body.stmts) # for scope handler process the scope arg_list = self.parse_arg_list(func, node.rhs) func.enter_scope(node, self.context, arg_list, node.rhs.func_name.span) func.body = self.parse_body(node) res = func.exit_scope(node, self.context, arg_list, node.rhs.func_name.span) # exit the scope - self.context.pop_scope() + self.context.exit_scope() self.current_lineno, self.current_col_offset = old_lineno, old_col_offset return res @@ -561,9 +601,9 @@ def transform_With(self, node): withitem = (expr context_expr, expr? optional_vars) By now 2 patterns of With is supported: 1. with scope handler with symbol def - with tir.allocate() as targets: + with tir.block(*axes)/tir.allocate() as targets: 2. with scope handler without symbol def - with tir.let()/tir.Assert()/tir.attr()//tir.realize() + with tir.let()/tir.Assert()/tir.attr()/tir.realize() """ if not isinstance(node.rhs, ast.Call): @@ -582,14 +622,14 @@ def transform_With(self, node): old_lineno, old_col_offset = self.current_lineno, self.current_col_offset self.current_lineno = node.body.span.start_line self.current_col_offset = node.body.span.start_column - self.context.new_scope(nodes=node.body.stmts) + self.context.enter_block_scope(nodes=node.body.stmts) # with scope handler process the scope arg_list = self.parse_arg_list(func, node.rhs) func.enter_scope(node, self.context, arg_list, node.rhs.func_name.span) func.body = self.parse_body(node) res = func.exit_scope(node, self.context, arg_list, node.rhs.func_name.span) # exit the scope - self.context.pop_scope() + self.context.exit_block_scope() self.current_lineno, self.current_col_offset = old_lineno, old_col_offset return res @@ -601,19 +641,21 @@ def transform_If(self, node): condition = self.transform(node.condition) # then body - self.context.new_scope(nodes=node.true.stmts) + self.context.enter_scope(nodes=node.true.stmts) then_body = self.parse_body(node) - self.context.pop_scope() + self.context.exit_scope() # else body if len(node.false.stmts) > 0: - self.context.new_scope(nodes=node.false.stmts) + self.context.enter_scope(nodes=node.false.stmts) else_body = self.parse_body(node) - self.context.pop_scope() + self.context.exit_scope() else: else_body = None - return tvm.tir.IfThenElse(condition, then_body, else_body, span=from_synr_span(node.span)) + return tvm.tir.IfThenElse( + condition, then_body, else_body, span=tvm_span_from_synr(node.span) + ) def transform_Call(self, node): """Call visitor @@ -633,18 +675,26 @@ def transform_Call(self, node): lhs = self.transform(node.params[0]) rhs = self.transform(node.params[1]) return self._binop_maker[node.func_name.name]( - lhs, rhs, span=from_synr_span(node.span) + lhs, rhs, span=tvm_span_from_synr(node.span) ) if node.func_name.name in self._unaryop_maker: rhs = self.transform(node.params[0]) - return self._unaryop_maker[node.func_name.name](rhs, span=from_synr_span(node.span)) + return self._unaryop_maker[node.func_name.name]( + rhs, span=tvm_span_from_synr(node.span) + ) 
self.report_error(f"Unsupported operator {node.func_name.name}.", node.func_name.span) else: func = self.transform(node.func_name) if isinstance(func, Intrin) and not func.stmt: # pattern 1 arg_list = self.parse_arg_list(func, node) - return func.handle(arg_list, node.func_name.span) + return call_with_error_reporting( + self.report_error, + node.func_name.span, + func.handle, + arg_list, + node.func_name.span, + ) else: args = [self.transform(arg) for arg in node.params] kw_args = { @@ -653,7 +703,7 @@ def transform_Call(self, node): if isinstance(func, tvm.tir.op.Op): # pattern 2 return tvm.tir.Call( - kw_args["dtype"], func, args, span=from_synr_span(node.span) + kw_args["dtype"], func, args, span=tvm_span_from_synr(node.span) ) elif callable(func): # pattern 3 @@ -700,7 +750,13 @@ def f(): ) if isinstance(func, Intrin) and func.stmt: - return func.handle(arg_list, node.call.func_name.span) + return call_with_error_reporting( + self.report_error, + node.call.func_name.span, + func.handle, + arg_list, + node.call.func_name.span, + ) elif isinstance(func, WithScopeHandler) and func.concise_scope and not func.def_symbol: func.enter_scope(node, self.context, arg_list, node.call.func_name.span) func.body = self.parse_body(node) @@ -716,11 +772,7 @@ def transform_Slice(self, node): end = self.transform(node.end) if not (isinstance(node.step, ast.Constant) and node.step.value == 1): self.report_error("Only step size 1 is supported for slices.", node.step.span) - extent = end - start - if isinstance(extent, tvm.tir.PrimExpr): - ana = tvm.arith.Analyzer() - extent = ana.simplify(extent) - return tvm.ir.Range.from_min_extent(start, extent, span=from_synr_span(node.span)) + return Slice(start, end) def transform_Subscript(self, node): """Array access visitor. @@ -728,7 +780,7 @@ def transform_Subscript(self, node): By now only 2 types of Subscript are supported: 1. Buffer[index, index, ...], Buffer element access(BufferLoad & BufferStore) Var[index] Buffer element access() - 2. meta[type_key][index], Meta info access + 2. Buffer[start: stop, start: stop, ...], BufferRealize(realize(buffer[...])) """ symbol = self.transform(node.params[0]) @@ -736,19 +788,27 @@ def transform_Subscript(self, node): self.report_error(f"Variable {node.value.id} is not defined.", node.params[0].span) indexes = [self.transform(x) for x in node.params[1].values] - if isinstance(indexes[0], tvm.ir.Range): - return symbol, indexes - if isinstance(symbol, tvm.tir.expr.Var): - return tvm.tir.Load("float32", symbol, indexes, True, span=from_synr_span(node.span)) - if isinstance(symbol, tvm.tir.Buffer): - return tvm.tir.BufferLoad(symbol, indexes, span=from_synr_span(node.span)) - - self.report_error( - f"Cannot subscript from a {type(symbol).__name__}. Only variables and " - "buffers are supported.", - node.params[0].span, - ) + for index in indexes: + if not isinstance(index, (tvm.tir.PrimExpr, int)): + self.report_error( + "Buffer load indexes should be int or PrimExpr, but they are " + + type(index), + node.span, + ) + return tvm.tir.Load( + "float32", symbol, indexes, True, span=tvm_span_from_synr(node.span) + ) + elif isinstance(symbol, tvm.tir.Buffer): + return BufferSlice( + symbol, indexes, self.report_error, span=tvm_span_from_synr(node.span) + ) + else: + self.report_error( + f"Cannot subscript from a {type(symbol).__name__}. Only variables and " + "buffers are supported.", + node.params[0].span, + ) def transform_Attr(self, node): """Visitor for field access of the form `x.y`. 
@@ -756,7 +816,7 @@ def transform_Attr(self, node): This visitor is used to lookup function and symbol names. We have two cases to handle here: 1. If we have a statement of the form `tir.something`, then we lookup - `tir.somthing` in the `Registry`. If the function is not in the + `tir.something` in the `Registry`. If the function is not in the registry, then we try to find a `tvm.ir.op.Op` with the same name. 2. All other names `tvm.something` are lookup up in this current python namespace. @@ -875,7 +935,7 @@ def transform_Constant(self, node): Constant values include `None`, `"strings"`, `2` (integers), `4.2` (floats), and `true` (booleans). """ - return tvm.runtime.convert(node.value, span=from_synr_span(node.span)) + return tvm.runtime.convert(node.value, span=tvm_span_from_synr(node.span)) def transform_TypeConstant(self, node): """Constant value visitor for types. @@ -902,8 +962,7 @@ def from_source(src): ---------- src : [str, function, class] Pruned source of original script - func_lineno : Optional[int] - The line number of the first line of the script to be parsed + Returns ------- functions : PrimFunc or IRModule diff --git a/python/tvm/script/registry.py b/python/tvm/script/registry.py index 389570115935..245cc01051d5 100644 --- a/python/tvm/script/registry.py +++ b/python/tvm/script/registry.py @@ -16,7 +16,8 @@ # under the License. """TVM Script Parser Function Registry """ # pylint: disable=inconsistent-return-statements, relative-beyond-top-level, import-outside-toplevel -import inspect +import types +from typing import Union, Callable, Dict, Optional, Any class Registry(object): @@ -24,10 +25,10 @@ class Registry(object): All these maps are static """ - registrations = dict() + registrations: Dict[str, type] = dict() @staticmethod - def lookup(name): + def lookup(name: str) -> Optional[Any]: if name in Registry.registrations: # every time we create a new handler # since we may want to keep some local info inside it @@ -35,12 +36,14 @@ def lookup(name): return None -def register(inputs): +def register(inputs: Union[Callable, type]) -> type: """Register Intrin/ScopeHandler/SpecialStmt""" - if inspect.isfunction(inputs): + registration: type + if isinstance(inputs, types.FunctionType): + # is function from .intrin import Intrin - def create_new_intrin(func): + def create_new_intrin(func) -> type: class NewIntrin(Intrin): def __init__(self): super().__init__(func) @@ -48,11 +51,12 @@ def __init__(self): return NewIntrin registration = create_new_intrin(inputs) - elif inspect.isclass(inputs): + elif isinstance(inputs, type): + # is class registration = inputs else: raise ValueError() - key = registration().signature()[0] + key: str = registration().signature()[0] Registry.registrations[key] = registration return registration diff --git a/python/tvm/script/scope_handler.py b/python/tvm/script/scope_handler.py index 9449cbdc156c..c7d841abc36d 100644 --- a/python/tvm/script/scope_handler.py +++ b/python/tvm/script/scope_handler.py @@ -16,32 +16,59 @@ # under the License. 
"""TVM Script Parser Scope Handler Classes""" # pylint: disable=redefined-builtin, unused-argument, invalid-name, relative-beyond-top-level +from typing import Tuple, Any, Callable, Optional, List, Union, Mapping +import synr from synr import ast import tvm.tir -from .utils import get_param_list, from_synr_span +from tvm.runtime import Object +from tvm.ir import Span, Range +from tvm.tir import Stmt, PrimExpr, IterVar, Var, Buffer, BufferRegion, ForKind + +from .context_maintainer import ContextMaintainer +from .utils import ( + get_param_list, + tvm_span_from_synr, + buffer_slice_to_region, + call_with_error_reporting, +) from .registry import register +from .node import BufferSlice class ScopeHandler: """Base class for all scope handlers""" - def __init__(self, func): - self.func = func - self.body = None - self.node = None - self.context = None + def __init__(self, func: Callable): + self.func: Callable = func + self.body: Optional[Stmt] = None + self.node: Optional[synr.ast.Node] = None + self.context: Optional[ContextMaintainer] = None - def signature(self): + def signature(self) -> Tuple[str, Tuple[list, list, Any]]: return "tir." + self.func.__name__, get_param_list(self.func) - def enter_scope(self, node, context, arg_list, span): + def enter_scope( + self, + node: synr.ast.Node, + context: ContextMaintainer, + arg_list: List[Any], + span: synr.ast.Span, + ): pass - def exit_scope(self, node, context, arg_list, span): + def exit_scope( + self, + node: synr.ast.Node, + context: ContextMaintainer, + arg_list: List[Any], + span: synr.ast.Span, + ): self.node = node self.context = context - return self.func(*arg_list, span=from_synr_span(span)) + return call_with_error_reporting( + context.report_error, span, self.func, *arg_list, span=tvm_span_from_synr(span) + ) class WithScopeHandler(ScopeHandler): @@ -55,24 +82,29 @@ def __init__(self, func, concise_scope, def_symbol): @staticmethod def get_optional_var_names(node, context): """Get list of names from ast.With's optional_vars""" - assert isinstance(node, ast.With) - - var_names = None - if isinstance(node.items[0].optional_vars, ast.Name): - var_names = [node.items[0].optional_vars.id] - elif isinstance(node.items[0].optional_vars, (ast.List, ast.Tuple)): - for var in node.items[0].optional_vars.elts: - if not isinstance(var, ast.Name): - context.report_error("Invalid optional var definition") - var_names = [var.id for var in node.items[0].optional_vars.elts] + assert isinstance( + node, ast.With + ), f"WithScopeHandler expected ast.With but got {type(node)}" + + if isinstance(node.lhs, list): + for var in node.lhs: + if not isinstance(var, ast.Var): + context.report_error( + f"Invalid optional var definition, expected Var but got {type(var)}", + node.span, + ) + var_names = [var.id.name for var in node.lhs] else: - context.report_error("Invalid optional var definition") + context.report_error( + f"Invalid optional var definition, expected list of Var but got {type(node.lhs)}", + node.span, + ) return var_names @register class Allocate(WithScopeHandler): - """ With scope handler tir.alloc_with_scope(var, extents, dtype, scope, condition) """ + """ With scope handler tir.allocate(extents, dtype, scope, condition) """ def __init__(self): def allocate(extents, dtype, scope, condition=True, span=None): @@ -86,7 +118,13 @@ def allocate(extents, dtype, scope, condition=True, span=None): super().__init__(allocate, concise_scope=True, def_symbol=True) self.buffer_var = None - def enter_scope(self, node, context, arg_list, span): + def 
enter_scope( + self, + node: synr.ast.Node, + context: ContextMaintainer, + arg_list: List[Any], + span: synr.ast.Span, + ): # define buffer vars in symbol table if isinstance(node, ast.With): names = WithScopeHandler.get_optional_var_names(node, context) @@ -98,13 +136,13 @@ def enter_scope(self, node, context, arg_list, span): else: raise Exception("Internal Bug") - def setup_buffer_var(extents, dtype, scope, condition=True, span=None): + def setup_buffer_var(extents, dtype, scope, condition=True, span: Span = None): """Setup buffer var for a given type.""" buffer_ptr_type = tvm.ir.PointerType(tvm.ir.PrimType(dtype)) self.buffer_var = tvm.tir.Var(name, buffer_ptr_type, span) - setup_buffer_var(*arg_list, span=from_synr_span(node.lhs.id.span)) - context.update_symbol(name, self.buffer_var) + setup_buffer_var(*arg_list, span=tvm_span_from_synr(node.lhs.id.span)) + context.update_symbol(name, self.buffer_var, node) @register @@ -115,10 +153,10 @@ def __init__(self): def launch_thread(env_var, extent, span): extent = tvm.runtime.convert(extent, span=span) return tvm.tir.AttrStmt( - tvm.tir.IterVar( + IterVar( None, env_var, - getattr(tvm.tir.IterVar, "ThreadIndex"), + getattr(IterVar, "ThreadIndex"), self.context.func_var_env_dict[env_var], span=span, ), @@ -136,8 +174,19 @@ class Realize(WithScopeHandler): """ With scope handler tir.realize(buffer_bounds, scope, condition) """ def __init__(self): - def realize(buffer_bounds, scope, condition=True, span=None): - buffer, bounds = buffer_bounds + def realize( + buffer_slice: BufferSlice, scope: str, condition: bool = True, span: bool = None + ): + assert self.context, "call 'exit_scope' before 'enter_scope'" + buffer: Buffer = buffer_slice.buffer + bounds: List[Range] = [] + for s in buffer_slice.slices: + min: Union[PrimExpr, int] = s.start + extent: Union[PrimExpr, int] = 1 if s.stop is None else s.stop - s.start + if isinstance(extent, PrimExpr): + extent = self.context.analyzer.simplify(extent) + bounds.append(Range.from_min_extent(min, extent, span=s.span)) + scope = tvm.runtime.convert(scope, span=span) return tvm.tir.AttrStmt( buffer, @@ -185,92 +234,380 @@ def let(var, value, span): super().__init__(let, concise_scope=False, def_symbol=False) +@register +class Block(WithScopeHandler): + """ With scope handler tir.block(extents, name) as iter_vars""" + + def __init__(self): + def block(axes=None, name_hint: str = "", span: Optional[Span] = None): + assert ( + self.node and self.context and self.body + ), "call 'exit_scope' before 'enter_scope'" + block_info = self.context.block_info_stack[-1] + if axes is None: + axes = [] + if len(axes) != len(self.block_vars): + self.context.report_error( + "Inconsistent number of block vars, " + + f"there are {len(axes)} axes but {len(self.block_vars)} block vars. 
" + + "The number of block vars should match the number of axes.", + self.node.span, + ) + block_iters: List[IterVar] = [] + for i, axis in enumerate(axes): + axis = tvm.runtime.convert(axis) + if isinstance(axis, tvm.tir.PrimExpr): + block_var_dom = Range.from_min_extent(0, axis) + block_iters.append(IterVar(block_var_dom, self.block_vars[i], 0)) + elif isinstance(axis, Range): + block_iters.append(IterVar(axis, self.block_vars[i], 0)) + elif isinstance(axis, IterVar): + block_iters.append(IterVar(axis.dom, self.block_vars[i], axis.iter_type)) + else: + self.context.report_error( + "Invalid argument of tir.block(), " + + f"expected PrimExpr, Range or IterVar, but got {type(axis)}", + self.node.span, + ) + + # create block read/write regions + + reads: List[BufferRegion] = ( + [buffer_slice_to_region(read) for read in block_info.reads] + if block_info.reads + else [] + ) + writes: List[BufferRegion] = ( + [buffer_slice_to_region(write) for write in block_info.writes] + if block_info.writes + else [] + ) + inner = tvm.tir.Block( + block_iters, + reads, + writes, + name_hint, + self.body, + block_info.init, + block_info.alloc_buffers, + block_info.match_buffers, + block_info.annotations, + span, + ) + # create block var iter binding + values: List[PrimExpr] + if not block_info.iter_bindings: + values = self.context.loop_stack[-2].copy() + if len(values) == 0: + values = [tvm.tir.const(float("nan"), dtype="float32")] * len(block_iters) + elif len(values) != len(block_iters): + self.context.report_error( + "Number of block iter var and outer loop nesting mismatch, " + + f"{len(block_iters)} block iter vars but {len(values)} loops", + self.node.span, + ) + else: + for block_var in self.block_vars: + if block_var not in block_info.iter_bindings: + self.context.report_error( + "Missing block iter var binding for " + block_var.name, + self.node.span, + ) + values = [block_info.iter_bindings[block_var] for block_var in self.block_vars] + predicate = ( + tvm.tir.const(True, "bool") + if block_info.predicate is None + else block_info.predicate + ) + body = tvm.tir.BlockRealize(values, predicate, inner, span) + return body + + super().__init__(func=block, concise_scope=False, def_symbol=True) + self.block_vars = None + + def enter_scope( + self, + node: synr.ast.Node, + context: ContextMaintainer, + arg_list: List[Any], + span: synr.ast.Span, + ): + # define block vars + assert isinstance( + node, ast.With + ), f"BlockScopeHandler expected to work on ast.With but got {type(node)}" + + var_names = WithScopeHandler.get_optional_var_names(node, context) + self.block_vars = [tvm.te.var(name) for name in var_names] + for block_var in self.block_vars: + context.update_symbol(block_var.name, block_var, node) + + +@register +class InitBlock(WithScopeHandler): + """ With scope handler tir.init()""" + + def __init__(self): + def init(span: Span = None): + assert self.context, "call 'exit_scope' before 'enter_scope'" + if self.context.block_info_stack[-2].init is not None: + self.context.report_error("Duplicate init block declaration", span) + self.context.block_info_stack[-2].init = self.body + + super().__init__(func=init, concise_scope=False, def_symbol=True) + + class ForScopeHandler(ScopeHandler): """Base class for all for scope handlers""" def __init__(self, func): super().__init__(func) - self.loop_vars = None + self.loop_vars: Optional[List[Var]] = None - def enter_scope(self, node, context, arg_list, span): - assert isinstance(node, ast.For) + def enter_scope( + self, + node: synr.ast.Node, + context: 
ContextMaintainer,
+ arg_list: List[Any],
+ span: synr.ast.Span,
+ ):
+ assert isinstance(node, ast.For), f"ForScopeHandler expected ast.For but got {type(node)}"

loop_var_names = list()
spans = list()
if isinstance(node.lhs, ast.Var):
loop_var_names.append(node.lhs.id.name)
- spans.append(from_synr_span(node.lhs.id.span))
- elif isinstance(node.lhs, ast.Tuple):
- for elt in node.lhs.values:
+ spans.append(tvm_span_from_synr(node.lhs.id.span))
+ elif isinstance(node.lhs, list):
+ for elt in node.lhs:
if not isinstance(elt, ast.Var):
- context.report_error("Invalid loop var", elt.span)
+ context.report_error(
+ f"Invalid loop var. Expected a var, but got {type(elt)}", elt.span
+ )
loop_var_names.append(elt.id.name)
- spans.append(from_synr_span(elt.id.span))
+ spans.append(tvm_span_from_synr(elt.id.span))
else:
- context.report_error("Invalid loop var", node.lhs.span)
+ context.report_error(
+ f"Invalid loop var. Expected var or list of vars as lhs, but got {type(node.lhs)}",
+ span,
+ )

self.loop_vars = [
tvm.te.var(name, dtype="int32", span=span) for name, span in zip(loop_var_names, spans)
]
for loop_var in self.loop_vars:
- context.update_symbol(loop_var.name, loop_var)
+ context.update_symbol(loop_var.name, loop_var, node)
+ context.loop_stack[-1].append(loop_var)
+
+ def exit_scope(
+ self,
+ node: synr.ast.Node,
+ context: ContextMaintainer,
+ arg_list: List[Any],
+ span: synr.ast.Span,
+ ):
+ assert self.loop_vars, "call 'exit_scope' before 'enter_scope'"
+ for _ in self.loop_vars:
+ context.loop_stack[-1].pop()
+ return super().exit_scope(node, context, arg_list, span)
+
+ def create_loop(
+ self,
+ begin: PrimExpr,
+ end: PrimExpr,
+ kind: ForKind,
+ thread_binding: Optional[str] = None,
+ annotations: Optional[Mapping[str, Object]] = None,
+ span: Optional[Span] = None,
+ ) -> tvm.tir.For:
+ """
+ Helper function for creating For in TVM Script parser.
+
+ Parameters
+ ----------
+ begin : PrimExpr
+ The beginning value.
+
+ end : PrimExpr
+ The ending value.
+
+ kind : ForKind
+ The type of the for.
+
+ thread_binding: Optional[str]
+ The thread this loop binds to.
+
+ annotations : Optional[Mapping[str, Object]]
+ Additional annotation hints.
+
+ span : Optional[Span]
+ The location of this for in the source code.
+
+ Returns
+ -------
+ for : For
+ The constructed For.
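+
+ Example
+ -------
+ A minimal usage sketch (assuming the parser has already populated
+ `self.loop_vars` with a single loop var and `self.body` with the loop body):
+
+ .. code-block:: python
+
+ # emits For(loop_var, min=0, extent=128, kind=ForKind.SERIAL, body=self.body)
+ loop = self.create_loop(0, 128, ForKind.SERIAL)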
+ """ + assert ( + self.loop_vars and self.context and self.node + ), "call 'exit_scope' before 'enter_scope'" + if len(self.loop_vars) != 1: + self.context.report_error( + f"Expected exactly one loop var, but got {self.loop_vars}", self.node.span + ) + extent = end if begin == 0 else self.context.analyzer.simplify(end - begin) + annos: Mapping[str, Object] = {} + if annotations is not None: + annos = { + key: tvm.tir.StringImm(val) if isinstance(val, str) else val + for key, val in annotations.items() + } + return tvm.tir.For( + self.loop_vars[0], + begin, + extent, + kind, + self.body, + thread_binding=thread_binding, + annotations=annos, + span=span, + ) @register class Serial(ForScopeHandler): - """ For scope handler tir.serial(begin, end)""" + """ For scope handler tir.serial(begin, end, annotations)""" def __init__(self): - def serial(begin, end, span): - if len(self.loop_vars) != 1: - self.context.report_error("Expect exact 1 loop var", span) - ana = tvm.arith.Analyzer() - extent = end if begin == 0 else ana.simplify(end - begin) - return tvm.tir.For(self.loop_vars[0], begin, extent, 0, self.body, span=span) + def serial( + begin: PrimExpr, + end: PrimExpr, + annotations: Optional[Mapping[str, Object]] = None, + span: Optional[Span] = None, + ): + return self.create_loop(begin, end, ForKind.SERIAL, annotations=annotations, span=span) super().__init__(serial) @register class Parallel(ForScopeHandler): - """ For scope handler tir.parallel(begin, end)""" + """ For scope handler tir.parallel(begin, end, annotations)""" def __init__(self): - def parallel(begin, end, span): - if len(self.loop_vars) != 1: - self.context.report_error("Expect exact 1 loop var") - ana = tvm.arith.Analyzer() - extent = end if begin == 0 else ana.simplify(end - begin) - return tvm.tir.For(self.loop_vars[0], begin, extent, 1, self.body, span=span) + def parallel( + begin: PrimExpr, + end: PrimExpr, + annotations: Optional[Mapping[str, Object]] = None, + span: Optional[Span] = None, + ): + return self.create_loop( + begin, end, ForKind.PARALLEL, annotations=annotations, span=span + ) super().__init__(parallel) @register class Vectorized(ForScopeHandler): - """ For scope handler tir.vectorized(begin, end)""" + """ For scope handler tir.vectorized(begin, end, annotations)""" def __init__(self): - def vectorized(begin, end, span): - if len(self.loop_vars) != 1: - self.context.report_error("Expect exact 1 loop var") - ana = tvm.arith.Analyzer() - extent = end if begin == 0 else ana.simplify(end - begin) - return tvm.tir.For(self.loop_vars[0], begin, extent, 2, self.body, span=span) + def vectorized( + begin: PrimExpr, + end: PrimExpr, + annotations: Optional[Mapping[str, Object]] = None, + span: Optional[Span] = None, + ): + return self.create_loop( + begin, end, ForKind.VECTORIZED, annotations=annotations, span=span + ) super().__init__(vectorized) @register class Unroll(ForScopeHandler): - """ For scope handler tir.unroll(begin, end)""" + """ For scope handler tir.unroll(begin, end, annotations)""" def __init__(self): - def unroll(begin, end, span): - if len(self.loop_vars) != 1: - self.context.report_error("Expect exact 1 loop var") - ana = tvm.arith.Analyzer() - extent = end if begin == 0 else ana.simplify(end - begin) - return tvm.tir.For(self.loop_vars[0], begin, extent, 3, self.body, span=span) + def unroll( + begin: PrimExpr, + end: PrimExpr, + annotations: Optional[Mapping[str, Object]] = None, + span: Optional[Span] = None, + ): + return self.create_loop( + begin, end, ForKind.UNROLLED, 
annotations=annotations, span=span + ) super().__init__(unroll) + + +@register +class ThreadBinding(ForScopeHandler): + """ For scope handler tir.thread_binding(begin, end, thread, annotations)""" + + def __init__(self): + def thread_binding( + begin: PrimExpr, + end: PrimExpr, + thread: str, + annotations: Optional[Mapping[str, Object]] = None, + span: Optional[Span] = None, + ): + thread_iter_var = IterVar(None, None, IterVar.ThreadIndex, thread, span=span) + return self.create_loop( + begin, + end, + ForKind.THREAD_BINDING, + thread_binding=thread_iter_var, + annotations=annotations, + span=span, + ) + + super().__init__(thread_binding) + + +@register +class RangeHandler(ForScopeHandler): + """For scope handler range(begin, end, annotations) + Note that tir.range is totally the same as tir.serial + """ + + def __init__(self): + def for_range( + begin: PrimExpr, + end: PrimExpr, + annotations: Optional[Mapping[str, Object]] = None, + span: Optional[Span] = None, + ): + return self.create_loop(begin, end, ForKind.SERIAL, annotations=annotations, span=span) + + super().__init__(for_range) + + def signature(self): + return "range", get_param_list(self.func) + + +@register +class Grid(ForScopeHandler): + """ For scope handler tir.grid(extents)""" + + def __init__(self): + def grid(*extents: List[PrimExpr], span: Span): + assert ( + self.node and self.context and self.loop_vars + ), "call 'exit_scope' before 'enter_scope'" + if len(self.loop_vars) != len(extents): + self.context.report_error( + "Inconsistent number of loop vars and extents, " + + f"got {len(self.loop_vars)} vs {len(extents)}", + self.node.span, + ) + body = self.body + for loop_var, extent in zip(reversed(self.loop_vars), reversed(extents)): + body = tvm.tir.For(loop_var, 0, extent, ForKind.SERIAL, body, span=span) + return body + + super().__init__(grid) diff --git a/python/tvm/script/special_stmt.py b/python/tvm/script/special_stmt.py index 62ce1ea19d89..6aa1239e9d79 100644 --- a/python/tvm/script/special_stmt.py +++ b/python/tvm/script/special_stmt.py @@ -17,30 +17,81 @@ """TVM Script Parser Special Stmt Classes""" # pylint: disable=unused-argument, no-self-argument, inconsistent-return-statements # pylint: disable=relative-beyond-top-level +from typing import Callable, List, Optional, Tuple, Any, Mapping, Union + +import synr from synr import ast import tvm.tir +from tvm.runtime import Object from tvm import te -from .utils import get_param_list, from_synr_span +from tvm.ir import Span +from tvm.tir import IntImm +from .utils import ( + get_param_list, + tvm_span_from_synr, + buffer_slice_to_region, + call_with_error_reporting, +) from .registry import register +from .context_maintainer import ContextMaintainer +from .node import BufferSlice + + +def convert_to_int( + value: Union[IntImm, int], + arg_name: str, + report_error: Callable, + span: Union[Span, synr.ast.Span], +) -> int: + """convert a const int or TVM IntImm to Python int. + Reports an error when input cannot be converted to int. + + Parameters + ---------- + value : Union[tvm.tir.IntImm, int] + The input value to be converted. + arg_name : str + Function argument name for error reporting. 
+ report_error: Callable + The report error function handle + span : Union[synr.ast.Span, tvm.ir.Span] + Location of the error + """ + if isinstance(value, IntImm): + return value.value + if isinstance(value, int): + return value + report_error( + f"Expected int or IntImm for {arg_name}, but got {str(type(value))}", + span, + ) class SpecialStmt: """Base class for all Special Stmts""" - def __init__(self, func, def_symbol): - self.func = func - self.def_symbol = def_symbol - self.node = None - self.context = None + def __init__(self, func: Callable, def_symbol: bool): + self.func: Callable = func + self.def_symbol: bool = def_symbol + self.node: Optional[synr.ast.Node] = None + self.context: Optional[ContextMaintainer] = None - def signature(self): + def signature(self) -> Tuple[str, Tuple[list, list, Any]]: return "tir." + self.func.__name__, get_param_list(self.func) - def handle(self, node, context, arg_list, span): + def handle( + self, + node: ast.Node, + context: ContextMaintainer, + arg_list: List[Any], + span: synr.ast.Span, + ): self.node = node self.context = context - return self.func(*arg_list, span=from_synr_span(span)) + return call_with_error_reporting( + context.report_error, span, self.func, *arg_list, span=tvm_span_from_synr(span) + ) @register @@ -67,17 +118,20 @@ def match_buffer( buffer_type="default", span=None, ): - assert isinstance(self.node, ast.Assign) - + if not isinstance(self.node, ast.Assign): + self.context.report_error( + "match_buffer must be assigned to a buffer, e.g. A = match_buffer(...)", + self.node.span, + ) if param not in self.context.func_params: self.context.report_error( "Can not bind non-input param to buffer", self.node.rhs.params[0].span ) if strides is None: strides = [] - align = align.value if not isinstance(align, int) else align - offset_factor = ( - offset_factor.value if not isinstance(offset_factor, int) else offset_factor + align = convert_to_int(align, "align", self.context.report_error, self.node.span) + offset_factor = convert_to_int( + offset_factor, "offset_factor", self.context.report_error, self.node.span ) buffer = tvm.tir.decl_buffer( shape, @@ -93,7 +147,7 @@ def match_buffer( span=span, ) self.context.func_buffer_map[param] = buffer - self.context.update_symbol(self.node.lhs.id.name, buffer) + self.context.update_symbol(self.node.lhs.id.name, buffer, self.node) super().__init__(match_buffer, def_symbol=True) @@ -121,13 +175,17 @@ def buffer_decl( buffer_type="default", span=None, ): - assert isinstance(self.node, ast.Assign) + if not isinstance(self.node, ast.Assign): + self.context.report_error( + "buffer_decl must be assigned to a buffer, e.g. 
A = buffer_decl(...)", + self.node.span, + ) if strides is None: strides = [] - align = align.value if not isinstance(align, int) else align - offset_factor = ( - offset_factor.value if not isinstance(offset_factor, int) else offset_factor + align = convert_to_int(align, "align", self.context.report_error, self.node.span) + offset_factor = convert_to_int( + offset_factor, "offset_factor", self.context.report_error, self.node.span ) buffer = tvm.tir.decl_buffer( shape, @@ -142,21 +200,293 @@ def buffer_decl( buffer_type, span=span, ) - self.context.update_symbol(self.node.lhs.id.name, buffer) + self.context.update_symbol(self.node.lhs.id.name, buffer, self.node) return buffer super().__init__(buffer_decl, def_symbol=True) +@register +class AllocBuffer(SpecialStmt): + """Special function alloc_buffer(shape, dtype, data, strides, elem_offset, scope, align, + offset_factor, buffer_type) + + Example + ------- + .. code-block:: python + + A = tir.alloc_buffer((128, 128), dtype="float32") + """ + + def __init__(self): + def alloc_buffer( + shape, + dtype="float32", + data=None, + strides=None, + elem_offset=None, + scope="", + align=-1, + offset_factor=0, + buffer_type="default", + span=None, + ): + if not isinstance(self.node, ast.Assign): + self.context.report_error( + "alloc_buffer must be assigned to a buffer, e.g. A = alloc_buffer(...)", + self.node.span, + ) + + if strides is None: + strides = [] + align = convert_to_int(align, "align", self.context.report_error, self.node.span) + offset_factor = convert_to_int( + offset_factor, "offset_factor", self.context.report_error, self.node.span + ) + buffer = tvm.tir.decl_buffer( + shape, + dtype, + self.node.lhs.id.name, + data, + strides, + elem_offset, + scope, + align, + offset_factor, + buffer_type, + span=span, + ) + self.context.current_block_scope().alloc_buffers.append(buffer) + self.context.update_symbol(self.node.lhs.id.name, buffer, self.node) + + super().__init__(alloc_buffer, def_symbol=True) + + +@register +class BlockVarBind(SpecialStmt): + """Special function bind(block_iter, binding_value) + + Example + ------- + .. code-block:: python + + tir.bind(vx, i) + """ + + def __init__(self): + def bind(iter_var, values, span=None): + block_scope = self.context.current_block_scope() + if iter_var in block_scope.iter_bindings: + self.context.report_error("Duplicate iter_var bindings of " + str(iter_var), span) + block_scope.iter_bindings[iter_var] = values + + super().__init__(bind, def_symbol=False) + + +@register +class BlockReads(SpecialStmt): + """Special function reads([read_buffer_regions]) + + Example + ------- + .. code-block:: python + + tir.reads([A[vi: vi + 4, vk: vk + 4], B[vk: vk + 4, vj]]) + """ + + def __init__(self): + def reads(read_regions: Union[BufferSlice, List[BufferSlice]], span: Span = None): + assert self.context, "call 'exit_scope' before 'enter_scope'" + block_scope = self.context.current_block_scope() + if block_scope.reads is not None: + self.context.report_error( + "Duplicate write region declaration, " + + "previous one is " + + str(", ".join(str(x) for x in block_scope.reads)), + span, + ) + if isinstance(read_regions, BufferSlice): + read_regions = [read_regions] + if not isinstance(read_regions, list): + self.context.report_error( + "Incorrect input type. 
" + + f"Expected BufferSlice or List[BufferSlice], but got {type(read_regions)}", + span, + ) + block_scope.reads = read_regions + + super().__init__(reads, def_symbol=False) + + +@register +class BlockWrites(SpecialStmt): + """Special function writes([write_buffer_regions]) + + Example + ------- + .. code-block:: python + + tir.writes([C[vi: vi + 4, vj]) + """ + + def __init__(self): + def writes(write_region: Union[BufferSlice, List[BufferSlice]], span: Span = None): + assert self.context, "call 'exit_scope' before 'enter_scope'" + block_scope = self.context.current_block_scope() + if block_scope.writes is not None: + self.context.report_error( + "Duplicate write region declaration, " + + "previous one is " + + str(", ".join(str(x) for x in block_scope.writes)), + span, + ) + if isinstance(write_region, list): + pass + elif isinstance(write_region, BufferSlice): + write_region = [write_region] + else: + self.context.report_error( + "Incorrect input type. " + + f"Expected BufferSlice or List[BufferSlice], but got {type(write_region)}", + span, + ) + block_scope.writes = write_region + + super().__init__(writes, def_symbol=False) + + +@register +class BlockAttr(SpecialStmt): + """Special function block_attr({attr_key: attr_value}) + + Example + ------- + .. code-block:: python + + tir.block_attr({"double_buffer_scope": 1}) + """ + + def __init__(self): + def block_attr(attrs: Mapping[str, Object], span: Span = None): + assert self.context, "call 'exit_scope' before 'enter_scope'" + block_scope = self.context.current_block_scope() + if block_scope.annotations is not None: + self.context.report_error( + "Duplicate block annotations declaration, " + + "previous one is " + + str(block_scope.annotations), + span, + ) + attrs = { + key: tvm.tir.StringImm(val) if isinstance(val, str) else val + for key, val in attrs.items() + } + block_scope.annotations = attrs + + super().__init__(block_attr, def_symbol=False) + + +@register +class BlockPredicate(SpecialStmt): + """Special function where(predicate) + + Example + ------- + .. code-block:: python + + tir.where(i < 4) + """ + + def __init__(self): + def where(predicate, span=None): + assert self.context, "call 'exit_scope' before 'enter_scope'" + block_scope = self.context.current_block_scope() + if block_scope.predicate is not None: + self.context.report_error( + "Duplicate block predicate declaration, " + + "previous one is " + + str(block_scope.predicate), + span, + ) + + block_scope.predicate = predicate + + super().__init__(where, def_symbol=False) + + +@register +class BlockMatchBufferRegion(SpecialStmt): + """Special function match_buffer_region(source, strides, elem_offset, align, offset_factor) + + Example + ------- + .. code-block:: python + + B = tir.match_buffer_region(A[0: 4]) + """ + + def __init__(self): + def match_buffer_region( + source, + strides=None, + elem_offset=None, + align=-1, + offset_factor=0, + span=None, + ): + assert self.context, "call 'exit_scope' before 'enter_scope'" + if not isinstance(self.node, ast.Assign): + self.context.report_error( + "match_buffer_region must be assigned to a buffer, " + + "e.g. 
A = match_buffer_region(...)", + self.node.span, + ) + + if strides is None: + strides = [] + align = convert_to_int(align, "align", self.context.report_error, self.node.span) + offset_factor = convert_to_int( + offset_factor, "offset_factor", self.context.report_error, self.node.span + ) + + if not isinstance(source, BufferSlice): + self.context.report_error( + "match_buffer_region needs a buffer region as source", + span=span, + ) + buffer_region = buffer_slice_to_region(source) + shape = [r.extent for r in buffer_region.region] + buffer = tvm.tir.decl_buffer( + shape, + buffer_region.buffer.dtype, + self.node.lhs.id.name, + data=None, + strides=strides, + elem_offset=elem_offset, + scope=buffer_region.buffer.scope, + data_alignment=align, + offset_factor=offset_factor, + span=span, + ) + self.context.current_block_scope().match_buffers.append( + tvm.tir.MatchBufferRegion(buffer, buffer_region) + ) + self.context.update_symbol(self.node.lhs.id.name, buffer, self.node) + + super().__init__(match_buffer_region, def_symbol=True) + + @register class VarDef(SpecialStmt): """ Special function for defining a Var""" def __init__(self): def var(dtype, span): - assert isinstance(self.node, ast.Assign) + assert isinstance( + self.node, ast.Assign + ), f"VarDef expected ast.Assign but got {type(self.node)}" v = te.var(self.node.lhs.id.name, dtype, span=span) - self.context.update_symbol(v.name, v) + self.context.update_symbol(v.name, v, self.node) super().__init__(var, def_symbol=True) @@ -167,10 +497,12 @@ class EnvThread(SpecialStmt): def __init__(self): def env_thread(env_name, span): - assert isinstance(self.node, ast.Assign) + assert isinstance( + self.node, ast.Assign + ), f"EnvThread expected ast.Assign but got {type(self.node)}" v = te.var(self.node.lhs.id.name, span=span) self.context.func_var_env_dict[v] = env_name - self.context.update_symbol(v.name, v) + self.context.update_symbol(v.name, v, self.node) super().__init__(env_thread, def_symbol=True) diff --git a/python/tvm/script/utils.py b/python/tvm/script/utils.py index a6ba9d087aa6..f8a0f610d477 100644 --- a/python/tvm/script/utils.py +++ b/python/tvm/script/utils.py @@ -16,15 +16,32 @@ # under the License. 
"""Helper functions in TVM Script Parser""" +from typing import Callable, List, Any, Optional, Tuple, Union + import inspect -from ..ir import Span, SourceName +import synr + +from tvm.arith import Analyzer +from tvm.ir import Range, Span, SourceName +from tvm.tir import PrimExpr, BufferRegion +from tvm.error import DiagnosticError +from .node import BufferSlice -def get_param_list(func): +def get_param_list( + func: Callable, +) -> Tuple[List[str], List[Tuple[str, Tuple[Any, ...]]], Optional[str]]: """Get the parameter list from definition of function""" - full_arg_spec = inspect.getfullargspec(func) + full_arg_spec: inspect.FullArgSpec = inspect.getfullargspec(func) - args, defaults = full_arg_spec.args, full_arg_spec.defaults + args: List[str] + defaults: Optional[Tuple[Any, ...]] + kwonlyargs: List[str] + args, defaults, kwonlyargs = ( + full_arg_spec.args, + full_arg_spec.defaults, + full_arg_spec.kwonlyargs, + ) if defaults is None: defaults = tuple() @@ -33,14 +50,17 @@ def get_param_list(func): raise RuntimeError( "TVM Script register error : variable keyword argument is not supported now" ) - if not len(full_arg_spec.kwonlyargs) == 0: + + if len(kwonlyargs) == 1 and kwonlyargs[0] == "span": + pass + elif not len(kwonlyargs) == 0: raise RuntimeError("TVM Script register error : keyword only argument is not supported now") - pos_only = list() + pos_only: List[str] = list() for arg in args[: len(args) - len(defaults)]: if arg != "span": pos_only.append(arg) - kwargs = list() + kwargs: List[Tuple[str, Tuple[Any, ...]]] = list() for default, arg in zip(defaults, args[len(args) - len(defaults) :]): if arg != "span": kwargs.append((arg, default)) @@ -48,7 +68,37 @@ def get_param_list(func): return pos_only, kwargs, full_arg_spec.varargs -def from_synr_span(span): +def buffer_slice_to_region( + buffer_slice: BufferSlice, analyzer: Optional[Analyzer] = None +) -> BufferRegion: + """Construct BufferRegion from BufferSlice + + Parameters + ---------- + buffer_slice : BufferSlice + The input BufferSlice + + analyzer : Optional[tvm.arith.Analyzer] + The analyzer for simplifying. If not provided, the method will construct a new one + + Returns + ------- + buffer_region : BufferRegion + The constructed BufferRegion. + """ + region: List[Range] = [] + for s in buffer_slice.slices: + start: Union[PrimExpr, int] = s.start + extent: Union[PrimExpr, int] = 1 if s.stop is None else s.stop - s.start + if not analyzer: + analyzer = Analyzer() + if isinstance(extent, PrimExpr): + extent = analyzer.simplify(extent) + region.append(Range.from_min_extent(start, extent, span=s.span)) + return BufferRegion(buffer_slice.buffer, region) + + +def tvm_span_from_synr(span: synr.ast.Span) -> Span: """Convert a synr span to a TVM span""" return Span( SourceName(span.filename), @@ -57,3 +107,32 @@ def from_synr_span(span): span.start_column, span.end_column, ) + + +def synr_span_from_tvm(span: Span) -> synr.ast.Span: + """Convert a TVM span to a synr span""" + return synr.ast.Span( + span.source_name.name, + span.line, + span.column, + span.end_line, + span.end_column, + ) + + +def call_with_error_reporting( + report_error, + node_span, + func, + *args, + **kwargs, +): + """Call function with exception handling and report error using node_span""" + try: + return func(*args, **kwargs) + except DiagnosticError: + raise + except Exception as err: # pylint: disable=broad-except + # printing last non-empty row of error message. 
+ error_msg = list(filter(None, str(err).split("\n")))[-1] + report_error(error_msg, node_span) diff --git a/python/tvm/target/target.py b/python/tvm/target/target.py index 80852cf60605..dfbcd281e439 100644 --- a/python/tvm/target/target.py +++ b/python/tvm/target/target.py @@ -271,26 +271,28 @@ def intel_graphics(model="unknown", options=None): return Target(" ".join(["opencl"] + opts)) +MICRO_SUPPORTED_MODELS = { + "host": [], + "stm32f746xx": ["-mcpu=cortex-m7", "-march=armv7e-m"], + "nrf5340dk": ["-mcpu=cortex-m33"], +} + + def micro(model="unknown", options=None): """Returns a microTVM target. Parameters ---------- model : str - Canonically identifies the target device. This is typically a CPU or board level name (other - flags such as -mcpu identify the ISA). + Canonically identifies the target device. This is typically a device board level name. + The allowed values are MICRO_SUPPORTED_MODELS.keys(). options : str or list of str Additional options """ - trans_table = { - "host": [], - "stm32f746xx": ["-mcpu=cortex-m7", "-march=armv7e-m"], - "nrf5340dk": ["-mcpu=cortex-m33"], - } - if model not in trans_table: + if model not in MICRO_SUPPORTED_MODELS: raise ValueError(f"Model {model} not supported by tvm.target.micro.") opts = _merge_opts( - trans_table[model] + ["-runtime=c", "--system-lib", f"-model={model}"], + MICRO_SUPPORTED_MODELS[model] + ["-runtime=c", "--system-lib", f"-model={model}"], options, ) diff --git a/python/tvm/testing.py b/python/tvm/testing.py index d65ab23677b5..df0f2afc583f 100644 --- a/python/tvm/testing.py +++ b/python/tvm/testing.py @@ -375,7 +375,7 @@ def _get_targets(): if len(dev) == 0: continue target_kind = dev.split()[0] - if tvm.runtime.enabled(target_kind) and tvm.context(target_kind, 0).exist: + if tvm.runtime.enabled(target_kind) and tvm.device(target_kind, 0).exist: targets.add(dev) if len(targets) == 0: logging.warning( @@ -450,7 +450,7 @@ def enabled_targets(): targets: list A list of pairs of all enabled devices and the associated context """ - return [(tgt, tvm.context(tgt)) for tgt in _get_targets()] + return [(tgt, tvm.device(tgt)) for tgt in _get_targets()] def _compose(args, decs): @@ -514,6 +514,25 @@ def requires_cuda(*args): return _compose(args, _requires_cuda) +def requires_cudagraph(*args): + """Mark a test as requiring the CUDA Graph Feature + + This also marks the test as requiring cuda + + Parameters + ---------- + f : function + Function to mark + """ + _requires_cudagraph = [ + pytest.mark.skipif( + not nvcc.have_cudagraph(), reason="CUDA Graph is not supported in this environment" + ), + *requires_cuda(), + ] + return _compose(args, _requires_cudagraph) + + def requires_opencl(*args): """Mark a test as requiring the OpenCL runtime. @@ -684,7 +703,7 @@ def parametrize_targets(*args): Parameters ---------- f : function - Function to parametrize. Must be of the form `def test_xxxxxxxxx(target, ctx)`:, + Function to parametrize. Must be of the form `def test_xxxxxxxxx(target, device)`:, where `xxxxxxxxx` is any name. targets : list[str], optional Set of targets to run against. If not supplied, @@ -693,23 +712,23 @@ def parametrize_targets(*args): Example ------- >>> @tvm.testing.parametrize - >>> def test_mytest(target, ctx): + >>> def test_mytest(target, dev): >>> ... # do something Or >>> @tvm.testing.parametrize("llvm", "cuda") - >>> def test_mytest(target, ctx): + >>> def test_mytest(target, dev): >>> ... 
# do something """ def wrap(targets): def func(f): params = [ - pytest.param(target, tvm.context(target, 0), marks=_target_to_requirement(target)) + pytest.param(target, tvm.device(target, 0), marks=_target_to_requirement(target)) for target in targets ] - return pytest.mark.parametrize("target,ctx", params)(f) + return pytest.mark.parametrize("target,dev", params)(f) return func diff --git a/python/tvm/tir/analysis/analysis.py b/python/tvm/tir/analysis/analysis.py index 1a3eb4806677..829eb8bbdedb 100644 --- a/python/tvm/tir/analysis/analysis.py +++ b/python/tvm/tir/analysis/analysis.py @@ -106,3 +106,26 @@ def verify_gpu_code(func, constraints): The result of verification. """ return _ffi_api.verify_gpu_code(func, constraints) + + +def get_block_access_region(block, buffer_var_map): + """Detect which regions of tensors in this block are read or written to. + Regions are sorted by order of appearance in the AST. + + Parameters + ---------- + block: tvm.tir.Block + The block in which we are detecting read/write regions. + + buffer_var_map : Dict[Var, Buffer] + The outside buffers which may access the block. Mapping from buffer var to the buffer + + Returns + ------- + result : List[List[BufferRegion]] + Array of access regions. There are three arrays of BufferRegion: + - first: read regions + - second: write regions + - third: opaque regions + """ + return _ffi_api.get_block_access_region(block, buffer_var_map) diff --git a/python/tvm/tir/buffer.py b/python/tvm/tir/buffer.py index 95966a5050e1..d7067a5bdd94 100644 --- a/python/tvm/tir/buffer.py +++ b/python/tvm/tir/buffer.py @@ -217,10 +217,10 @@ def decl_buffer( Bb = tvm.tir.decl_buffer(B.shape, B.dtype, name="Bb", buffer_type="auto_broadcast") s = te.create_schedule(C.op) fadd = tvm.build(s, [A, B, C], target='llvm', name='bcast_add', binds={A:Ab, B:Bb}) - ctx = tvm.cpu(0) - a = tvm.nd.array(np.random.uniform(size=(2, 4, 3)).astype(A.dtype), ctx) - b = tvm.nd.array(np.random.uniform(size=(2, 1, 3)).astype(B.dtype), ctx) - c = tvm.nd.array(np.zeros((2, 4, 3), dtype=C.dtype), ctx) + dev = tvm.cpu(0) + a = tvm.nd.array(np.random.uniform(size=(2, 4, 3)).astype(A.dtype), dev) + b = tvm.nd.array(np.random.uniform(size=(2, 1, 3)).astype(B.dtype), dev) + c = tvm.nd.array(np.zeros((2, 4, 3), dtype=C.dtype), dev) fadd(a, b, c) tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy()) diff --git a/python/tvm/topi/__init__.py b/python/tvm/topi/__init__.py index c196b33cf880..ef2c5c1ea4b5 100644 --- a/python/tvm/topi/__init__.py +++ b/python/tvm/topi/__init__.py @@ -42,7 +42,7 @@ from .sparse_reshape import * from .scatter_add import * from .argwhere import * -from .cumsum import * +from .scan import * from .einsum import * from .unique import * from . import generic diff --git a/python/tvm/topi/arm_cpu/__init__.py b/python/tvm/topi/arm_cpu/__init__.py index e121fbc7ec6d..9e2057a7126f 100644 --- a/python/tvm/topi/arm_cpu/__init__.py +++ b/python/tvm/topi/arm_cpu/__init__.py @@ -26,3 +26,4 @@ from .bitserial_dense import * from .injective import * from . import cortex_m7 +from .group_conv2d import * diff --git a/python/tvm/topi/arm_cpu/group_conv2d.py b/python/tvm/topi/arm_cpu/group_conv2d.py new file mode 100644 index 000000000000..d852b9acef66 --- /dev/null +++ b/python/tvm/topi/arm_cpu/group_conv2d.py @@ -0,0 +1,370 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name,unused-variable,unused-argument,no-member +# pylint: disable=no-value-for-parameter,import-outside-toplevel +"""Grouped Spatial Pack Convolution (Group Conv2D) schedule on ARM""" + +import tvm +from tvm import autotvm +from tvm import te +from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity + +from ..utils import get_const_tuple +from ..nn.pad import pad +from .. import tag + +from ..nn.conv2d import _get_workload as _get_conv2d_workload + + +def group_conv2d_nchw(data, kernel, strides, padding, dilation, groups, out_dtype): + """Compute group_conv2d with NCHW layout""" + return group_conv2d_nchw_spatial_pack( + data, kernel, strides, padding, dilation, groups, out_dtype + ) + + +def schedule_group_conv2d_nchw(outs): + """Create schedule for group_conv2d with NCHW layout""" + return schedule_group_conv2d_nchwc(outs) + + +def _get_default_config(cfg, data, kernel, strides, padding, groups, out_dtype, layout="NCHW"): + """ + Get default schedule config for the workload + """ + static_data_shape = [] + for dim in get_const_tuple(data.shape): + if isinstance(dim, tvm.tir.Var): + static_data_shape.append(1) + else: + static_data_shape.append(dim) + data = te.placeholder(static_data_shape, dtype=data.dtype) + + wkl = _get_conv2d_workload(data, kernel, strides, padding, out_dtype, layout) + _fallback_schedule(cfg, wkl) + + +def _fallback_schedule(cfg, wkl): + simd_width = 4 # assume ARM SIMD width is 4 + pad_left, pad_right = wkl.padl, wkl.padr + stride_w = wkl.stride_w + out_width = (wkl.width + pad_left + pad_right - wkl.kernel_w) // stride_w + 1 + groups = wkl.groups + kernels_per_group = wkl.out_filter // groups + kernel_depth = wkl.in_filter // groups + + oc_bn = 1 + for bn in range(simd_width, 0, -1): + if kernels_per_group % bn == 0: + oc_bn = bn + break + if oc_bn > kernels_per_group: + oc_bn = kernels_per_group + + ic_bn = 1 + for bn in range(oc_bn, 0, -1): + if kernel_depth % bn == 0: + ic_bn = bn + break + if ic_bn > kernel_depth: + ic_bn = kernel_depth + + reg_n = 1 + for n in range(31, 0, -1): + if out_width % n == 0: + reg_n = n + break + + cfg["tile_ic"] = SplitEntity([wkl.in_filter // ic_bn, ic_bn]) + cfg["tile_oc"] = SplitEntity([wkl.out_filter // oc_bn, oc_bn]) + cfg["tile_ow"] = SplitEntity([out_width // reg_n, reg_n]) + cfg["unroll_kw"] = OtherOptionEntity(False) + + +@autotvm.register_topi_compute("group_conv2d_nchw.arm_cpu") +def group_conv2d_nchw_spatial_pack( + cfg, data, kernel, strides, padding, dilation, groups, out_dtype="float32" +): + """ + Compute group conv2d with NCHW layout, using GSPC algorithm.
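+ GSPC here is the Grouped Spatial Pack Convolution scheme described in: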
+ https://arxiv.org/abs/2006.09791 + """ + assert isinstance(dilation, int) or len(dilation) == 2 + if isinstance(dilation, int): + dilation_h, dilation_w = dilation, dilation + else: + dilation_h, dilation_w = dilation + + assert isinstance(padding, int) or len(padding) == 2 or len(padding) == 4 + if isinstance(padding, int): + pad_top, pad_left, pad_bottom, pad_right = padding, padding, padding, padding + elif len(padding) == 2: + hpad, wpad = padding + pad_top, pad_bottom = hpad, hpad + pad_left, pad_right = wpad, wpad + else: + pad_top, pad_left, pad_bottom, pad_right = padding + + hpad = pad_top + pad_bottom + wpad = pad_left + pad_right + + assert isinstance(strides, int) or len(strides) == 2 + if isinstance(strides, int): + stride_h, stride_w = strides, strides + else: + stride_h, stride_w = strides + + batch_size, in_channel, in_height, in_width = get_const_tuple(data.shape) + out_channel, kernel_depth, k_height, k_width = get_const_tuple(kernel.shape) + + pad_height = in_height + pad_top + pad_bottom + pad_width = in_width + pad_left + pad_right + + dilated_kernel_h = (k_height - 1) * dilation_h + 1 + dilated_kernel_w = (k_width - 1) * dilation_w + 1 + out_height = (in_height + pad_top + pad_bottom - dilated_kernel_h) // stride_h + 1 + out_width = (in_width + pad_left + pad_right - dilated_kernel_w) // stride_w + 1 + + kernels_per_group = out_channel // groups + + cfg.define_split("tile_ic", in_channel, num_outputs=2) + cfg.define_split("tile_oc", out_channel, num_outputs=2) + cfg.define_split("tile_ow", out_width, num_outputs=2, filter=lambda y: y.size[-1] <= 64) + cfg.define_knob("unroll_kw", [True, False]) + + # If no config was set, we can fallback to default config. + if cfg.is_fallback: + _get_default_config( + cfg, + te.placeholder((batch_size, in_channel, in_height, in_width), dtype=data.dtype), + te.placeholder( + (out_channel, in_channel // groups, k_height, k_width), dtype=kernel.dtype + ), + strides, + padding, + groups, + out_dtype, + ) + + oc_bn = cfg["tile_oc"].size[-1] + ic_bn = cfg["tile_ic"].size[-1] + + # pack data + DOPAD = hpad != 0 or wpad != 0 + if DOPAD: + data_pad = pad( + data, (0, 0, pad_top, pad_left), (0, 0, pad_bottom, pad_right), name="data_pad" + ) + else: + data_pad = data + + shape = (groups, batch_size, kernel_depth // ic_bn, pad_height, ic_bn, pad_width) + + data_vec = te.compute( + shape, + lambda g, n, C, h, c, w: data_pad[n, C * ic_bn + c + kernel_depth * g, h, w], + name="data_vec", + ) + + # pack kernel + shape = ( + groups, + kernels_per_group // oc_bn, + kernel_depth // ic_bn, + k_height, + k_width, + ic_bn, + oc_bn, + ) + + kernel_vec = te.compute( + shape, + lambda g, out_channel, in_channel, h, w, ci, co: kernel[ + (out_channel * oc_bn + co + g * kernels_per_group), in_channel * ic_bn + ci, h, w + ], + name="kernel_vec", + ) + + # convolution + oshape = (groups, batch_size, kernels_per_group // oc_bn, out_height, out_width, oc_bn) + unpack_shape = (batch_size, out_channel, out_height, out_width) + + ic = te.reduce_axis((0, (kernel_depth)), name="ic") + kh = te.reduce_axis((0, k_height), name="kh") + kw = te.reduce_axis((0, k_width), name="kw") + + idxmod = tvm.tir.indexmod + idxdiv = tvm.tir.indexdiv + + conv = te.compute( + oshape, + lambda g, n, oc_chunk, oh, ow, oc_block: te.sum( + data_vec[ + g, + n, + idxdiv(ic, ic_bn), + oh * stride_h + kh * dilation_h, + idxmod(ic, ic_bn), + ow * stride_w + kw * dilation_w, + ].astype(out_dtype) + * kernel_vec[ + g, oc_chunk, idxdiv(ic, ic_bn), kh, kw, idxmod(ic, ic_bn), oc_block + 
].astype(out_dtype), + axis=[ic, kh, kw], + ), + name="conv", + ) + + unpack = te.compute( + unpack_shape, + lambda n, c, h, w: conv[ + idxdiv(c, kernels_per_group), + n, + idxmod(idxdiv(c, oc_bn), (kernels_per_group // oc_bn)), + h, + w, + idxmod(idxmod(c, oc_bn), kernels_per_group), + ].astype(out_dtype), + name="output_unpack", + tag="group_conv2d_nchw", + ) + + return unpack + + +@autotvm.register_topi_schedule("group_conv2d_nchw.arm_cpu") +def schedule_group_conv2d_nchwc(cfg, outs): + """Create schedule for tensors""" + s = te.create_schedule([x.op for x in outs]) + scheduled_ops = [] + + def traverse(op): + """Traverse operators from computation graph""" + # inline all one-to-one-mapping operators except the last stage (output) + if tag.is_broadcast(op.tag): + if op not in s.outputs: + s[op].compute_inline() + for tensor in op.input_tensors: + if isinstance(tensor.op, tvm.te.ComputeOp) and tensor.op not in scheduled_ops: + traverse(tensor.op) + + if "group_conv2d_nchw" in op.tag: + output = op.output(0) + + if "tile_ic" not in cfg: + return + conv_out = op.input_tensors[0] + kernel_vec = conv_out.op.input_tensors[1] + kernel = kernel_vec.op.input_tensors[0] + if isinstance(kernel.op, tvm.te.ComputeOp) and "dilate" in kernel.op.tag: + s[kernel].compute_inline() + data_vec = conv_out.op.input_tensors[0] + data = data_vec.op.input_tensors[0] + data_pad = None + if isinstance(data.op, tvm.te.ComputeOp) and "pad" in data.op.tag: + data_pad = data + data = data_pad.op.input_tensors[0] + + args = [s, cfg, data, data_pad, data_vec, kernel_vec, conv_out, output, outs[0]] + _schedule_gspc_nchw(*args) + + scheduled_ops.append(op) + + traverse(outs[0].op) + return s + + +def _schedule_gspc_nchw(s, cfg, data, data_pad, data_vec, kernel_vec, conv_out, output, last): + """Schedule GSPC""" + ic_bn, oc_bn, reg_n, unroll_kw = ( + cfg["tile_ic"].size[-1], + cfg["tile_oc"].size[-1], + cfg["tile_ow"].size[-1], + cfg["unroll_kw"].val, + ) + + _, W = data, kernel_vec + A0, A1 = data_pad, data_vec + + # schedule data + if ( + data_pad is not None + and isinstance(data_pad.op, tvm.te.ComputeOp) + and "pad" in data_pad.op.tag + ): + s[A0].compute_inline() + + groups, batch, ic_chunk, ih, ic_block, _ = s[A1].op.axis + + parallel_axis = s[A1].fuse(batch, ic_chunk, ih) + s[A1].parallel(parallel_axis) + + # schedule kernel pack + groups, oc_chunk, ic_chunk, oh, ow, ic_block, oc_block = s[W].op.axis + s[W].reorder(oc_chunk, oh, ic_chunk, ow, ic_block, oc_block) + + if oc_bn > 1: + s[W].vectorize(oc_block) + + parallel_axis = s[W].fuse(groups, oc_chunk, oh) + s[W].parallel(parallel_axis) + + # schedule conv + C, O0, O = conv_out, output, last + CC = s.cache_write(C, "global") + + _, _, oc_chunk, oh, ow, oc_block = s[C].op.axis + + ow_chunk, ow_block = s[C].split(ow, factor=reg_n) + + s[C].reorder(oc_chunk, oh, ow_chunk, ow_block, oc_block) + s[C].fuse(oc_chunk, oh) + s[C].vectorize(oc_block) + + groups, batch, oc_chunk, oh, ow, oc_block = s[CC].op.axis + + ic, kh, kw = s[CC].op.reduce_axis + ow_chunk, ow_block = s[CC].split(ow, factor=reg_n) + ic_chunk, ic_block = s[CC].split(ic, factor=ic_bn) + + if unroll_kw: + s[CC].reorder(oc_chunk, oh, ow_chunk, ic_chunk, kh, ic_block, kw, ow_block, oc_block) + s[CC].unroll(kw) + else: + s[CC].reorder(oc_chunk, oh, ow_chunk, ic_chunk, kh, kw, ic_block, ow_block, oc_block) + + parallel_axis = s[CC].fuse(groups, batch, oc_chunk, oh) + s[CC].parallel(parallel_axis) + + s[CC].vectorize(oc_block) + + s[CC].unroll(ow_block) + + if O0 != O: + s[O0].compute_inline() + + batch, oc, oh, 
ow = s[O].op.axis + ow_chunk, ow_block = s[O].split(ow, factor=reg_n) + oc_chunk, oc_block = s[O].split(oc, factor=oc_bn) + + s[O].reorder(batch, oc_chunk, oh, ow_chunk, ow_block, oc_block) + parallel_axis = s[O].fuse(oc_chunk, oh) + s[O].vectorize(oc_block) + s[O].parallel(parallel_axis) + return s diff --git a/python/tvm/topi/cuda/batch_matmul.py b/python/tvm/topi/cuda/batch_matmul.py index 006b866d6bad..04e484f526d2 100644 --- a/python/tvm/topi/cuda/batch_matmul.py +++ b/python/tvm/topi/cuda/batch_matmul.py @@ -159,9 +159,10 @@ def batch_matmul_cublas(cfg, x, y, out_shape=None): output : tvm.te.Tensor 3-D with shape [batch, M, N] """ - b, m, k = x.shape - b, n, k = y.shape - cfg.add_flop(b * m * k * n * 2) + b, m, k = get_const_tuple(x.shape) + b, n, k = get_const_tuple(y.shape) + if all([isinstance(s, int) for s in [b, m, n, k]]): + cfg.add_flop(b * m * k * n * 2) return cublas.batch_matmul(x, y, False, True) diff --git a/python/tvm/topi/cuda/dense.py b/python/tvm/topi/cuda/dense.py index ad4882ab09f2..8adc38b84b1b 100644 --- a/python/tvm/topi/cuda/dense.py +++ b/python/tvm/topi/cuda/dense.py @@ -17,7 +17,7 @@ # pylint: disable=invalid-name, unused-argument """Schedule for dense operator""" import logging -from tvm import te, tir +from tvm import te import tvm.autotvm as autotvm from tvm.autotvm.task.space import SplitEntity from tvm.contrib import cublas @@ -39,14 +39,11 @@ def dense_cublas(cfg, data, weight, bias=None, out_dtype=None): if out_dtype is None: out_dtype = data.dtype assert out_dtype == data.dtype, "Mixed precision not supported." - batch, in_dim = data.shape - out_dim, _ = weight.shape + batch, in_dim = get_const_tuple(data.shape) + out_dim, _ = get_const_tuple(weight.shape) matmul = cublas.matmul(data, weight, False, True) - if isinstance(batch, int): + if all(isinstance(d, int) for d in [batch, in_dim, out_dim]): cfg.add_flop(batch * in_dim * out_dim * 2) - elif isinstance(batch, tir.IntImm): - cfg.add_flop(batch.value * in_dim * out_dim * 2) - # if we get a te.Var, we cannot add flop counts if bias is not None: matmul = te.compute( (batch, out_dim), lambda i, j: matmul[i, j] + bias[j], tag=tag.BROADCAST diff --git a/python/tvm/topi/cuda/nms.py b/python/tvm/topi/cuda/nms.py index ccc2ec9d0c21..c83dae0d3b96 100644 --- a/python/tvm/topi/cuda/nms.py +++ b/python/tvm/topi/cuda/nms.py @@ -474,7 +474,9 @@ def calculate_overlap(out_tensor, box_a_idx, box_b_idx): box_indices[i * num_anchors + j] = -1 with ib.else_scope(): - with ib.if_scope(j < valid_count[i]): + # Need to copy all boxes if not using return_indices + bounds = valid_count[i] if return_indices else num_anchors + with ib.if_scope(j < bounds): src_offset = base_src_idx + j * box_data_length with ib.for_range(0, 4, kind="unroll") as k: @@ -869,10 +871,10 @@ def non_max_suppression( np_valid_count = np.array([4]) s = topi.generic.schedule_nms(out) f = tvm.build(s, [data, valid_count, out], "cuda") - ctx = tvm.gpu(0) - tvm_data = tvm.nd.array(np_data, ctx) - tvm_valid_count = tvm.nd.array(np_valid_count, ctx) - tvm_out = tvm.nd.array(np.zeros(dshape, dtype=data.dtype), ctx) + dev = tvm.gpu(0) + tvm_data = tvm.nd.array(np_data, dev) + tvm_valid_count = tvm.nd.array(np_valid_count, dev) + tvm_out = tvm.nd.array(np.zeros(dshape, dtype=data.dtype), dev) f(tvm_data, tvm_valid_count, tvm_out) """ data_buf = tvm.tir.decl_buffer(data.shape, data.dtype, "data_buf", data_alignment=8) diff --git a/python/tvm/topi/cuda/scan.py b/python/tvm/topi/cuda/scan.py index 84ab5dcf9756..3240ebcd515c 100644 --- 
a/python/tvm/topi/cuda/scan.py +++ b/python/tvm/topi/cuda/scan.py @@ -16,13 +16,16 @@ # under the License. # pylint: disable=invalid-name, too-many-locals, too-many-statements "Scan related operators" +from typing import Callable, Optional, Union + import tvm from tvm import te -from tvm.contrib.thrust import can_use_thrust, can_use_rocthrust -from ..transform import expand_dims, squeeze, transpose, reshape -from ..utils import ceil_div, swap, prod, get_const_int -from ..math import cast +from tvm.contrib.thrust import can_use_rocthrust, can_use_thrust + from .. import tag +from ..math import cast +from ..transform import expand_dims, reshape, squeeze, transpose +from ..utils import ceil_div, get_const_int, prod, swap from .injective import schedule_injective_from_existing @@ -32,7 +35,7 @@ def _get_thrust_func_name(tvmop): return tvmop_to_thrust_func_name[tvmop] -def exclusive_scan_ir(data, output, reduction=None, binop=tvm.tir.generic.add): +def exclusive_scan_ir(data, output, reduction=None, binop=tvm.tir.generic.add, identity_value=0): """Low level IR to do exclusive sum scan along rows of 2D input. Parameters @@ -50,6 +53,11 @@ def exclusive_scan_ir(data, output, reduction=None, binop=tvm.tir.generic.add): A binary associative op to use for scan. The function takes two TIR expressions and produce a new TIR expression. By default it uses tvm.tir.generic.add to compute prefix sum. + + identity_value: int or float + A value for the binary operation which provides the identity property. E.g. if * is + your operator and i is the identity_value then a * i = a for all a in the domain of + your operation. """ batch_size = prod(data.shape[:-1]) @@ -134,7 +142,7 @@ def exclusive_scan_ir(data, output, reduction=None, binop=tvm.tir.generic.add): with ib.if_scope(bx < batch_size): if reduction is not None: reduction[bx] = output[(bx + 1) * scan_axis_size - 1] - output[(bx + 1) * scan_axis_size - 1] = cast(0, out_dtype) + output[(bx + 1) * scan_axis_size - 1] = cast(identity_value, out_dtype) with ib.for_range(0, lim, dtype="int64") as l2_width: width = 2 << (lim - l2_width - 1) @@ -309,7 +317,12 @@ def scan_thrust( def exclusive_scan( - data, axis=-1, return_reduction=False, output_dtype=None, binop=tvm.tir.generic.add + data, + axis=-1, + return_reduction=False, + output_dtype=None, + binop=tvm.tir.generic.add, + identity_value=0, ): """Do exclusive scan on 1D or multidimensional input. @@ -335,6 +348,11 @@ def exclusive_scan( and produce a new TIR expression. By default it uses tvm.tir.generic.add to compute prefix sum. + identity_value: int or float + A value for the binary operation which provides the identity property. E.g. if * is + your operator and i is the identity_value then a * i = a for all a in the domain of + your operation. 
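+ For example, a sum scan uses identity_value=0 and a product scan uses identity_value=1.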
+ Returns ------- output : tvm.te.Tensor @@ -347,9 +365,15 @@ def exclusive_scan( def do_scan(data, output_dtype): target = tvm.target.Target.current() - if target and ( - can_use_thrust(target, "tvm.contrib.thrust.sum_scan") - or can_use_rocthrust(target, "tvm.contrib.thrust.sum_scan") + + # TODO: add support for a prod_scan + if ( + target + and binop == tvm.tir.generic.add + and ( + can_use_thrust(target, "tvm.contrib.thrust.sum_scan") + or can_use_rocthrust(target, "tvm.contrib.thrust.sum_scan") + ) ): return scan_thrust( data, output_dtype, exclusive=True, return_reduction=return_reduction, binop=binop @@ -366,7 +390,9 @@ def do_scan(data, output_dtype): output, reduction = te.extern( [data.shape, data.shape[:-1]], [data], - lambda ins, outs: exclusive_scan_ir(ins[0], outs[0], outs[1], binop=binop), + lambda ins, outs: exclusive_scan_ir( + ins[0], outs[0], outs[1], binop=binop, identity_value=identity_value + ), dtype=[data.dtype, output_dtype], in_buffers=[data_buf], name="exclusive_scan", @@ -376,7 +402,9 @@ def do_scan(data, output_dtype): output = te.extern( [data.shape], [data], - lambda ins, outs: exclusive_scan_ir(ins[0], outs[0], binop=binop), + lambda ins, outs: exclusive_scan_ir( + ins[0], outs[0], binop=binop, identity_value=identity_value + ), dtype=[output_dtype], in_buffers=[data_buf], out_buffers=[output_buf], @@ -423,7 +451,7 @@ def do_scan(data, output_dtype): return output -def inclusive_scan(data, axis=-1, output_dtype=None, binop=tvm.tir.generic.add): +def inclusive_scan(data, axis=-1, output_dtype=None, binop=tvm.tir.generic.add, identity_value=0): """Do inclusive scan on 1D or multidimensional input. Parameters @@ -442,12 +470,19 @@ def inclusive_scan(data, axis=-1, output_dtype=None, binop=tvm.tir.generic.add): and produce a new TIR expression. By default it uses tvm.tir.generic.add to compute prefix sum. + identity_value: int or float + A value for the binary operation which provides the identity property. E.g. if * is + your operator and i is the identity_value then a * i = a for all a in the domain of + your operation. + Returns ------- output : tvm.te.Tensor A N-D tensor of the same rank N as the input data. """ - ex_scan = exclusive_scan(data, axis, output_dtype=output_dtype, binop=binop) + ex_scan = exclusive_scan( + data, axis, output_dtype=output_dtype, binop=binop, identity_value=identity_value + ) if output_dtype is not None and data.dtype != output_dtype and output_dtype != "": data = cast(data, output_dtype) @@ -486,7 +521,74 @@ def traverse(op): return s -def cumsum(data, axis=None, dtype=None, exclusive=None): +def scanop( + data: tvm.te.Tensor, + binop: Callable[["tvm.Expr", "tvm.Expr"], "tvm.Expr"], + identity_value: Union[float, int], + axis: Optional[int] = None, + dtype: Optional[str] = None, + exclusive: Optional[bool] = None, +) -> tvm.te.Tensor: + """Cumulative binary operator (scan) with similar axis behavior as np.cumsum and np.cumprod. + + See cumprod and cumsum for an example of use. + + E.g. if * is your binary operator and the input tensor is [1, 2, 3, 4] the output may be + [1, 1 * 2, 1 * 2 * 3, 1 * 2 * 3 * 4] + + Parameters + ---------- + data : tvm.te.Tensor + The input data to the operator. + + binop: Callable (tvm.Expr, tvm.Expr) -> tvm.Expr + A binary operator which should be associative and commutative. E.g. if * is your + operator then a * (b * c) = (a * b) * c and a * b = b * a + + identity_value: int or float + A value for the binary operation which provides the identity property. E.g. 
if * is + your operator and i is the identity_value then a * i = a for all a in the domain of + your operation. + + axis : int, optional + Axis along which the operation is computed. The default (None) is to compute + the cumulative operation over the flattened array. + + dtype : string, optional + Type of the returned array and of the accumulator in which the elements are computed. + If dtype is not specified, it defaults to the dtype of data. + + exclusive : bool, optional + If true will return exclusive cumulative operation in which the first element is not + included. In other terms, if true, the j-th output element would be + the cumulative operation of the first (j-1) elements. Otherwise, it would be the + cumulative operation of the first j elements. + + Returns + ------- + result : tvm.te.Tensor + The result has the same size as data, and the same shape as data if axis is not None. + If axis is None, the result is a 1-d array. + """ + if axis is None: + axis = 0 + data = reshape(data, (prod(data.shape),)) + axis = get_const_int(axis) + if exclusive is not None and exclusive: + return exclusive_scan( + data, axis, output_dtype=dtype, binop=binop, identity_value=identity_value + ) + return inclusive_scan( + data, axis, output_dtype=dtype, binop=binop, identity_value=identity_value + ) + + +def cumsum( + data: tvm.te.Tensor, + axis: Optional[int] = None, + dtype: Optional[str] = None, + exclusive: Optional[bool] = None, +) -> tvm.te.Tensor: """Numpy style cumsum op. Return the cumulative sum of the elements along a given axis. Parameters ---------- data : tvm.te.Tensor The input data to the operator. @@ -502,9 +604,9 @@ def cumsum(data, axis=None, dtype=None, exclusive=None): Type of the returned array and of the accumulator in which the elements are summed. If dtype is not specified, it defaults to the dtype of data. - exclusive : int, optional - If set to 1 will return exclusive sum in which the first element is not - included. In other terms, if set to 1, the j-th output element would be + exclusive : bool, optional + If true will return exclusive sum in which the first element is not + included. In other terms, if true, the j-th output element would be the sum of the first (j-1) elements. Otherwise, it would be the sum of the first j elements. @@ -514,10 +616,54 @@ def cumsum(data, axis=None, dtype=None, exclusive=None): The result has the same size as data, and the same shape as data if axis is not None. If axis is None, the result is a 1-d array. """ - if axis is None: - axis = 0 - data = reshape(data, (prod(data.shape),)) - axis = get_const_int(axis) - if exclusive is not None and exclusive != 0: - return exclusive_scan(data, axis, output_dtype=dtype, binop=tvm.tir.generic.add) - return inclusive_scan(data, axis, output_dtype=dtype, binop=tvm.tir.generic.add) + return scanop( + data=data, + binop=tvm.tir.generic.add, + identity_value=0, + axis=axis, + dtype=dtype, + exclusive=exclusive, + ) + + +def cumprod( + data: tvm.te.Tensor, + axis: Optional[int] = None, + dtype: Optional[str] = None, + exclusive: Optional[bool] = None, +): + """Numpy style cumprod op. Return the cumulative product of the elements along a given axis. + + Parameters + ---------- + data : tvm.te.Tensor + The input data to the operator. + + axis : int, optional + Axis along which the cumulative product is computed. The default (None) is to compute + the cumulative product over the flattened array. + + dtype : string, optional + Type of the returned array and of the accumulator in which the elements are multiplied. + If dtype is not specified, it defaults to the dtype of data.
+ + exclusive : bool, optional + If True, will return exclusive product in which the first element is not + included. In other terms, if True, the j-th output element would be + the product of the first (j-1) elements. Otherwise, it would be the product of + the first j elements. + + Returns + ------- + result : tvm.te.Tensor + The result has the same size as data, and the same shape as data if axis is not None. + If axis is None, the result is a 1-d array. + """ + return scanop( + data=data, + binop=tvm.tir.generic.multiply, + identity_value=1, + axis=axis, + dtype=dtype, + exclusive=exclusive, + ) diff --git a/python/tvm/topi/cuda/sort.py b/python/tvm/topi/cuda/sort.py index ca832ef0ef36..5ebd3060a6bb 100644 --- a/python/tvm/topi/cuda/sort.py +++ b/python/tvm/topi/cuda/sort.py @@ -57,6 +57,20 @@ def traverse(op): return s +def _get_threads(ib, nthread_tx, nthread_bx, nthread_by, nthread_bz): + tx = te.thread_axis("threadIdx.x") + bx = te.thread_axis("blockIdx.x") + ib.scope_attr(tx, "thread_extent", nthread_tx) + ib.scope_attr(bx, "thread_extent", nthread_bx) + + by = te.thread_axis("blockIdx.y") + bz = te.thread_axis("blockIdx.z") + ib.scope_attr(by, "thread_extent", nthread_by) + ib.scope_attr(bz, "thread_extent", nthread_bz) + + return tx, bx, by, bz + + def _sort_init(ib, shape, axis, keys_in, keys_out, values_out=None, value_init_func=None): """Initialize the output buffers by copying from inputs""" axis_mul_before = 1 @@ -78,16 +92,8 @@ def _sort_init(ib, shape, axis, keys_in, keys_out, values_out=None, value_init_f # Copy the keys_in to initial output with ib.new_scope(): - tx = te.thread_axis("threadIdx.x") - bx = te.thread_axis("blockIdx.x") - ib.scope_attr(tx, "thread_extent", nthread_tx) - ib.scope_attr(bx, "thread_extent", nthread_bx) + tx, bx, by, bz = _get_threads(ib, nthread_tx, nthread_bx, nthread_by, nthread_bz) tid = bx * nthread_tx + tx - - by = te.thread_axis("blockIdx.y") - bz = te.thread_axis("blockIdx.z") - ib.scope_attr(by, "thread_extent", nthread_by) - ib.scope_attr(bz, "thread_extent", nthread_bz) idx = (by * shape[axis] + tid) * axis_mul_after + bz with ib.if_scope(tid < shape[axis]): keys_out[idx] = keys_in[idx] @@ -97,6 +103,100 @@ def _sort_init(ib, shape, axis, keys_in, keys_out, values_out=None, value_init_f return axis_mul_before, axis_mul_after +## TODO(mbrookhart): These are effective optimization hyperparameters +## Perhaps we can autotune?
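+## block_size is the chunk length sorted in shared memory by the odd-even stage; +## thread_work is (roughly) how many output elements each thread merges per mergepath step.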
+block_size = 128 +thread_work = 4 + + +def _odd_even_sort( + ib, + size, + axis_mul_before, + axis_mul_after, + is_ascend, + keys, + keys_swap, + values=None, + values_swap=None, +): + + nthread_tx = block_size // 2 + nthread_bx = ceil_div(size, block_size) + nthread_by = axis_mul_before + nthread_bz = axis_mul_after + with ib.new_scope(): + ib.scope_attr(tvm.tir.const(0), "hand_threaded", 0) + tx, bx, by, bz = _get_threads(ib, nthread_tx, nthread_bx, nthread_by, nthread_bz) + tid = 2 * tx + start = bx * block_size + + ## Create shared memory as syncable thread scratch space + tmp_keys_swap = ib.allocate( + keys_swap.dtype, + (block_size,), + name="temp_keys_swap", + scope="shared", + ) + if values_swap is not None: + tmp_values_swap = ib.allocate( + values_swap.dtype, + (block_size,), + name="temp_values_swap", + scope="shared", + ) + + ## Create thread local data for swapping + temp_keys = ib.allocate(keys_swap.dtype, (1,), name="temp_keys", scope="local") + if values_swap is not None: + temp_values = ib.allocate(values_swap.dtype, (1,), name="temp_values", scope="local") + + temp_cond1 = ib.allocate(keys_swap.dtype, (1,), name="temp_cond1", scope="local") + temp_cond2 = ib.allocate(keys_swap.dtype, (1,), name="temp_cond2", scope="local") + # Copy data to scratch space + base_idx = by * size * axis_mul_after + bz + with ib.for_range(0, 2) as n: + with ib.if_scope((tid + n + start) < size): + tmp_keys_swap[tid + n] = keys[base_idx + (tid + n + start) * axis_mul_after] + if values_swap is not None: + tmp_values_swap[tid + n] = values[base_idx + (tid + n + start) * axis_mul_after] + + ib.emit(tvm.tir.Call(None, "tir.tvm_storage_sync", tvm.runtime.convert(["shared"]))) + + idxm = tvm.tir.indexmod + # OddEvenTransposeSort + current_sort_num = tvm.tir.min(block_size, size - start) + with ib.for_range(0, current_sort_num) as k: + n = idxm(tid + k, 2) + with ib.if_scope(tid + n < current_sort_num - 1): + temp_cond1[0] = tmp_keys_swap[tid + n] + temp_cond2[0] = tmp_keys_swap[tid + n + 1] + if is_ascend: + cond = temp_cond1[0] > temp_cond2[0] + else: + cond = temp_cond1[0] < temp_cond2[0] + with ib.if_scope(cond): + temp_keys[0] = tmp_keys_swap[tid + n] + tmp_keys_swap[tid + n] = tmp_keys_swap[tid + n + 1] + tmp_keys_swap[tid + n + 1] = temp_keys[0] + if values_swap is not None: + temp_values[0] = tmp_values_swap[tid + n] + tmp_values_swap[tid + n] = tmp_values_swap[tid + n + 1] + tmp_values_swap[tid + n + 1] = temp_values[0] + ib.emit(tvm.tir.Call(None, "tir.tvm_storage_sync", tvm.runtime.convert(["shared"]))) + + ## Copy sorted data to output + with ib.for_range(0, 2) as n: + with ib.if_scope(tid + n + start < size): + keys[base_idx + (tid + n + start) * axis_mul_after] = tmp_keys_swap[tid + n] + keys_swap[base_idx + (tid + n + start) * axis_mul_after] = tmp_keys_swap[tid + n] + if values_swap is not None: + values[base_idx + (tid + n + start) * axis_mul_after] = tmp_values_swap[tid + n] + values_swap[base_idx + (tid + n + start) * axis_mul_after] = tmp_values_swap[ + tid + n + ] + + def _sort_common( ib, size, @@ -110,22 +210,22 @@ def _sort_common( ): """Either sort only values or sort values by keys.""" - ## we are looping over the array doing mergesort from the bottom up. - ## The outer loop runs on the host and launches a cuda kernel for each iteration - ## of the algorithm. - ## The basic idea is that at iteration 0, each thread does sort on 2 elements. - ## On iteration 1, each thread merges 2 sorted arrays of 2 elements, - ## to deal with 4 total elements. 
- ## On iteration 2, each thread merges 2 sorted arrays of 4 elements, - ## to deal with 8 total elements. On iteration 3, each thread deals with 16 elements, etc - ## On the final iteration of the algorithm, one thread will merge two sorted lists - ## to sort the entire array + ## This function performs a multi-level mergesort + ## For blocks of length <= block_size, it does odd-even transpose sort + ## in GPU shared memory + ## For intermediate block sizes (>block_size, < max_threads * thread_work) + ## it uses the mergepath algorithm https://arxiv.org/abs/1406.2628 + ## to merge blocks in parallel + ## At some point, the size of the blocks to be merged is too big for max_threads + ## and we switch to using a dual-level mergepath where the outer mergepath + ## finds the start/end locations of the inner mergepath so that we can split + ## the merge into more blocks max_threads = int(tvm.target.Target.current(allow_none=False).max_num_threads) + nthread_by = axis_mul_before * axis_mul_after + nthread_bz = 1 nthread_tx = max_threads - nthread_bx = ceil_div(size, max_threads) - nthread_by = axis_mul_before - nthread_bz = axis_mul_after + nthread_bx = ceil_div(size, nthread_tx) def compare(a, b): """ @@ -137,91 +237,234 @@ def compare(a, b): out = b <= a return out - def bottom_up_merge(source, dest, source_idx, dest_idx, start, middle, end, even): - """ - Merge the two sections of the array assigned to this thread - """ - # pylint: disable=arguments-out-of-order - # initialize iterators + # Sort the lower levels of the merge using odd-even sort, it's fast for small inputs + lower_lim = tvm.tir.generic.cast( + tvm.tir.ceil(tvm.tir.log2(tvm.tir.generic.cast(block_size, "float64"))), "int64" + ) + + _odd_even_sort( + ib, + size, + axis_mul_before * axis_mul_after, + 1, + is_ascend, + keys, + keys_swap, + values, + values_swap, + ) + + upper_lim = tvm.tir.generic.cast( + tvm.tir.ceil(tvm.tir.log2(tvm.tir.generic.cast(size, "float64"))), "int64" + ) + + def get_merge_begin(source, base_idx, aCount, bCount, aStart, bStart, diag, step_count): + first = ib.allocate("int64", (1,), name="first", scope="local") + mid = ib.allocate("int64", (1,), name="mid", scope="local") + last = ib.allocate("int64", (1,), name="last", scope="local") + first[0] = tvm.te.max(0, diag - bCount) + last[0] = tvm.te.min(diag, aCount) + with ib.while_loop(first[0] < last[0]): + mid[0] = (first[0] + last[0]) >> 1 + a = source[base_idx + (aStart + mid[0])] + b = source[base_idx + (bStart + diag - 1 - mid[0])] + with ib.if_scope(compare(a, b)): + first[0] = mid[0] + 1 + with ib.else_scope(): + last[0] = mid[0] + return first[0], last[0] + + def serial_merge( + source, + dest, + source_idx, + dest_idx, + base_idx, + aCount, + bCount, + aStart, + bStart, + kStart, + diag, + step_count, + first, + last, + ): i = ib.allocate("int64", (1,), name="i", scope="local") j = ib.allocate("int64", (1,), name="j", scope="local") - i[0] = start - j[0] = middle - # set up indexes - base_idx = by * size * axis_mul_after + bz - # iterate over the output loop - with ib.for_range(0, end - start) as k: - i_idx = base_idx + i[0] * axis_mul_after - j_idx = base_idx + j[0] * axis_mul_after - k_idx = base_idx + (k + start) * axis_mul_after - - def swap_values(source, dest, source_idx, dest_idx): - def assign_i(): - """assign i value to current output""" - dest[k_idx] = source[i_idx] - if values is not None: - dest_idx[k_idx] = source_idx[i_idx] - i[0] += 1 - - def assign_j(): - """assign j value to current output""" - dest[k_idx] = source[j_idx] - if values is not
None: - dest_idx[k_idx] = source_idx[j_idx] - j[0] += 1 - - ## if both of the iterators are in range - with ib.if_scope(tvm.tir.all(i[0] < middle, j[0] < end)): - # compare them and insert whichever is next into the output - with ib.if_scope(compare(source[i_idx], source[j_idx])): - assign_i() - with ib.else_scope(): - assign_j() - # otherwise, simply copy the remainder of the valid iterator to the output - with ib.else_scope(): - with ib.if_scope(i[0] < middle): - assign_i() - with ib.else_scope(): - assign_j() + i[0] = aStart + first + j[0] = bStart + diag - last + with ib.for_range(0, tvm.te.min(aCount + bCount - diag, step_count)) as count: + i_idx = base_idx + i[0] + j_idx = base_idx + j[0] + k_idx = base_idx + (kStart + diag + count) + + def assign_i(): + """assign i value to current output""" + dest[k_idx] = source[i_idx] + if values is not None: + dest_idx[k_idx] = source_idx[i_idx] + i[0] += 1 - # Switch which input is the source and which is the destination each iteration - with ib.if_scope(even): - swap_values(source, dest, source_idx, dest_idx) + def assign_j(): + """assign j value to current output""" + dest[k_idx] = source[j_idx] + if values is not None: + dest_idx[k_idx] = source_idx[j_idx] + j[0] += 1 + + ## if both of the iterators are in range + with ib.if_scope(tvm.tir.all(i[0] < aStart + aCount, j[0] < bStart + bCount)): + # compare them and insert whichever is next into the output + with ib.if_scope(compare(source[i_idx], source[j_idx])): + assign_i() + with ib.else_scope(): + assign_j() + # otherwise, simply copy the remainder of the valid iterator to the output with ib.else_scope(): - swap_values(dest, source, dest_idx, source_idx) - - def mergesort(source, dest, source_idx, dest_idx, size, width, even): - # calculate the start, mid, and end points of this section - start = width * tid - - with ib.if_scope(start < size): - middle = cast(tvm.te.min(start + tvm.tir.indexdiv(width, 2), size), "int64") - end = cast(tvm.te.min(start + width, size), "int64") - # merge the start->middle and middle->end arrays - bottom_up_merge(source, dest, source_idx, dest_idx, start, middle, end, even) + with ib.if_scope(i[0] < aStart + aCount): + assign_i() + with ib.else_scope(): + assign_j() - lim = tvm.tir.generic.cast( - tvm.tir.ceil(tvm.tir.log2(tvm.tir.generic.cast(size, "float64"))), "int64" - ) - with ib.for_range(0, lim, dtype="int64") as l2_width: - width = 2 << l2_width + with ib.for_range(0, upper_lim - lower_lim, dtype="int64") as l2_width: + width = 2 << (l2_width + lower_lim) # Define and launch the cuda kernel with ib.new_scope(): - tx = te.thread_axis("threadIdx.x") - bx = te.thread_axis("blockIdx.x") - ib.scope_attr(tx, "thread_extent", nthread_tx) - # Reduce the number of blocks as the work per thread grows - ib.scope_attr( - bx, - "thread_extent", - tvm.tir.generic.cast(ceil_div(size, width * max_threads), "int32"), - ) - tid = bx * nthread_tx + tx - - by = te.thread_axis("blockIdx.y") - bz = te.thread_axis("blockIdx.z") - ib.scope_attr(by, "thread_extent", nthread_by) - ib.scope_attr(bz, "thread_extent", nthread_bz) + target = tvm.target.Target.current() + if "vulkan" in str(target): + # Vulkan can't handle dynamic nthread, so we thread slightly differently + # for vulkan. 
We don't do this generally because it causes a 15% perf + # regression on other platforms + ntx = max_threads + nbx = tvm.tir.generic.cast(ceil_div(width, max_threads * thread_work), "int32") + nbz = tvm.tir.generic.cast(ceil_div(size, width), "int32") + tx, bx, by, bz = _get_threads(ib, ntx, nbx, nthread_by, nbz) + else: + ntx = tvm.tir.generic.cast(tvm.te.min(max_threads, width), "int32") + nbx = tvm.tir.generic.cast(ceil_div(width, max_threads * thread_work), "int32") + nbz = tvm.tir.generic.cast(ceil_div(size, width), "int32") + tx, bx, by, bz = _get_threads(ib, ntx, nbx, nthread_by, nbz) + + def mergepath( + source, + dest, + source_idx, + dest_idx, + aCount, + bCount, + aStart, + bStart, + kStart, + step_count, + even, + ): + # pylint: disable=arguments-out-of-order + def merge(source, dest, source_idx, dest_idx): + diag = tx * step_count + first, last = get_merge_begin( + source, + by * size, + aCount, + bCount, + aStart, + bStart, + diag, + step_count, + ) + # iterate over the output loop + serial_merge( + source, + dest, + source_idx, + dest_idx, + by * size, + aCount, + bCount, + aStart, + bStart, + kStart, + diag, + step_count, + first, + last, + ) + + with ib.if_scope(even): + merge(source, dest, source_idx, dest_idx) + with ib.else_scope(): + merge(dest, source, dest_idx, source_idx) + + def mergesort(source, dest, source_idx, dest_idx, size, width, even): + # calculate the start, mid, and end points of this section + start = width * bz + middle = cast(tvm.te.min(start + tvm.tir.indexdiv(width, 2), size), "int64") + end = cast(tvm.te.min(start + width, size), "int64") + with ib.if_scope(start < size): + with ib.if_scope(nbx == 1): + ## merge the start->middle and middle->end arrays + aCount = middle - start + bCount = end - middle + mergepath( + source, + dest, + source_idx, + dest_idx, + aCount, + bCount, + start, + middle, + start, + ceil_div(width, ntx), + even, + ) + with ib.else_scope(): + step_count = max_threads * thread_work + diag = bx * step_count + + def do_merge(first, last): + aStart = start + first + bStart = middle + diag - last + aCount = tvm.te.min(middle - aStart, step_count) + bCount = tvm.te.min(end - bStart, step_count) + mergepath( + source, + dest, + source_idx, + dest_idx, + aCount, + bCount, + aStart, + bStart, + start + diag, + thread_work, + even, + ) + + with ib.if_scope(even): + first, last = get_merge_begin( + source, + by * size, + middle - start, + end - middle, + start, + middle, + diag, + step_count, + ) + do_merge(first, last) + with ib.else_scope(): + first, last = get_merge_begin( + dest, + by * size, + middle - start, + end - middle, + start, + middle, + diag, + step_count, + ) + do_merge(first, last) # Call the kernel mergesort( @@ -233,29 +476,23 @@ def mergesort(source, dest, source_idx, dest_idx, size, width, even): width, tvm.tir.indexmod(l2_width, 2) == 0, ) - + nthread_by = axis_mul_before + nthread_bz = axis_mul_after + nthread_tx = max_threads + nthread_bx = ceil_div(size, nthread_tx) ## if the final sorted data ended up in the swap, copy it to the real output - with ib.if_scope(tvm.tir.indexmod(lim, 2) == 1): + with ib.if_scope( + tvm.tir.all(upper_lim > lower_lim, tvm.tir.indexmod(upper_lim - lower_lim, 2) == 1) + ): with ib.new_scope(): - tx = te.thread_axis("threadIdx.x") - bx = te.thread_axis("blockIdx.x") - ib.scope_attr(tx, "thread_extent", nthread_tx) - ib.scope_attr(bx, "thread_extent", nthread_bx) + tx, bx, by, bz = _get_threads(ib, nthread_tx, nthread_bx, nthread_by, nthread_bz) tid = bx * nthread_tx + tx - - by = 
te.thread_axis("blockIdx.y") - bz = te.thread_axis("blockIdx.z") - ib.scope_attr(by, "thread_extent", nthread_by) - ib.scope_attr(bz, "thread_extent", nthread_bz) - idx = (by * size + tid) * axis_mul_after + bz + idx = (by * axis_mul_after + bz) * size + tid with ib.if_scope(tid < size): - idx = (by * size + tid) * axis_mul_after + bz keys[idx] = keys_swap[idx] if values is not None: values[idx] = values_swap[idx] - return ib.get() - def sort_ir( data, values_out, values_out_swap, axis, is_ascend, indices_out=None, indices_out_swap=None @@ -301,27 +538,30 @@ def sort_ir( assert indices_out_swap is not None indices_out_swap = ib.buffer_ptr(indices_out_swap) - axis_mul_before, axis_mul_after = _sort_init( - ib, - shape, - axis, - data, - values_out, - indices_out, - value_init_func=lambda _, tid: tvm.tir.generic.cast(tid, indices_out.dtype), - ) + with ib.if_scope(shape[axis] > 0): + axis_mul_before, axis_mul_after = _sort_init( + ib, + shape, + axis, + data, + values_out, + indices_out, + value_init_func=lambda _, tid: tvm.tir.generic.cast(tid, indices_out.dtype), + ) + + _sort_common( + ib, + shape[axis], + axis_mul_before, + axis_mul_after, + is_ascend, + values_out, + values_out_swap, + values=indices_out, + values_swap=indices_out_swap, + ) - return _sort_common( - ib, - shape[axis], - axis_mul_before, - axis_mul_after, - is_ascend, - values_out, - values_out_swap, - values=indices_out, - values_swap=indices_out_swap, - ) + return ib.get() def sort_by_key_ir( @@ -376,27 +616,29 @@ def sort_by_key_ir( values_out = ib.buffer_ptr(values_out) values_out_swap = ib.buffer_ptr(values_out_swap) - axis_mul_before, axis_mul_after = _sort_init( - ib, - shape, - axis, - keys_in, - keys_out, - values_out, - value_init_func=lambda idx, _: values_in[idx], - ) - - return _sort_common( - ib, - shape[axis], - axis_mul_before, - axis_mul_after, - is_ascend, - keys_out, - keys_out_swap, - values=values_out, - values_swap=values_out_swap, - ) + with ib.if_scope(shape[axis] > 0): + axis_mul_before, axis_mul_after = _sort_init( + ib, + shape, + axis, + keys_in, + keys_out, + values_out, + value_init_func=lambda idx, _: values_in[idx], + ) + + _sort_common( + ib, + shape[axis], + axis_mul_before, + axis_mul_after, + is_ascend, + keys_out, + keys_out_swap, + values=values_out, + values_swap=values_out_swap, + ) + return ib.get() def sort(data, axis=-1, is_ascend=1): @@ -419,16 +661,29 @@ def sort(data, axis=-1, is_ascend=1): out : tvm.te.Tensor The output of this function. """ + ndim = len(data.shape) + axis = ndim + axis if axis < 0 else axis + if axis != ndim - 1: + # Prepare for sorting along axis -1. + axes = swap(list(range(ndim)), axis) + data = transpose(data, axes) + value_buf = tvm.tir.decl_buffer(data.shape, data.dtype, "value_buf", data_alignment=8) value_buf_swap = tvm.tir.decl_buffer(data.shape, data.dtype, "value_buf_swap", data_alignment=8) + out = te.extern( [data.shape, data.shape], [data], - lambda ins, outs: sort_ir(ins[0], outs[0], outs[1], axis, is_ascend), + lambda ins, outs: sort_ir(ins[0], outs[0], outs[1], -1, is_ascend), out_buffers=[value_buf, value_buf_swap], name="sort_gpu", tag="sort_gpu", )[0] + + if axis != ndim - 1: + axes = swap(list(range(ndim)), axis) + out = transpose(out, axes) + return out @@ -507,10 +762,18 @@ def argsort(data, axis=-1, is_ascend=1, dtype="float32"): out : tvm.te.Tensor The output of this function. """ + ndim = len(data.shape) + axis = ndim + axis if axis < 0 else axis + if axis != ndim - 1: + # Prepare for sorting along axis -1. 
+ axes = swap(list(range(ndim)), axis) + data = transpose(data, axes) + value_buf = tvm.tir.decl_buffer(data.shape, data.dtype, "value_buf", data_alignment=8) value_swap_buf = tvm.tir.decl_buffer(data.shape, data.dtype, "value_swap_buf", data_alignment=8) indices_buf = tvm.tir.decl_buffer(data.shape, dtype, "out_buf", data_alignment=8) indices_swap_buf = tvm.tir.decl_buffer(data.shape, dtype, "out_swap_buf", data_alignment=8) + out = te.extern( [data.shape, data.shape, data.shape, data.shape], [data], @@ -518,7 +781,7 @@ def argsort(data, axis=-1, is_ascend=1, dtype="float32"): ins[0], outs[0], outs[2], - axis, + -1, is_ascend, indices_out=outs[1], indices_out_swap=outs[3], @@ -527,6 +790,11 @@ def argsort(data, axis=-1, is_ascend=1, dtype="float32"): name="argsort_gpu", tag="argsort_gpu", )[1] + + if axis != ndim - 1: + axes = swap(list(range(ndim)), axis) + out = transpose(out, axes) + return out @@ -625,21 +893,30 @@ def topk(data, k=1, axis=-1, ret_type="both", is_ascend=False, dtype="int64"): ndim = len(data.shape) axis = axis + ndim if axis < 0 else axis assert 0 <= axis < ndim + dshape = data.shape + if axis != ndim - 1: + axes = swap(list(range(ndim)), axis) + data = transpose(data, axes) + values_buf = tvm.tir.decl_buffer(data.shape, data.dtype, "values_buf", data_alignment=8) values_swap_buf = tvm.tir.decl_buffer( data.shape, data.dtype, "values_swap_buf", data_alignment=8 ) indices_buf = tvm.tir.decl_buffer(data.shape, dtype, "indices_buf", data_alignment=8) indices_swap_buf = tvm.tir.decl_buffer(data.shape, dtype, "indies_swap_buf", data_alignment=8) + if ret_type == "values": output = te.extern( [data.shape, data.shape], [data], - lambda ins, outs: sort_ir(ins[0], outs[0], outs[1], axis, is_ascend), + lambda ins, outs: sort_ir(ins[0], outs[0], outs[1], -1, is_ascend), out_buffers=[values_buf, values_swap_buf], name="topk_gpu", tag="topk_gpu", )[0] + if axis != ndim - 1: + axes = swap(list(range(ndim)), axis) + output = transpose(output, axes) else: output = te.extern( [data.shape, data.shape, data.shape, data.shape], @@ -648,7 +925,7 @@ def topk(data, k=1, axis=-1, ret_type="both", is_ascend=False, dtype="int64"): ins[0], outs[0], outs[2], - axis, + -1, is_ascend, indices_out=outs[1], indices_out_swap=outs[3], @@ -657,6 +934,11 @@ def topk(data, k=1, axis=-1, ret_type="both", is_ascend=False, dtype="int64"): name="topk_gpu", tag="topk_gpu", )[0:2] + if axis != ndim - 1: + axes = swap(list(range(ndim)), axis) + output[0] = transpose(output[0], axes) + output[1] = transpose(output[1], axes) + if isinstance(k, int) and k < 1: if ret_type == "indices": return output[1] @@ -668,7 +950,7 @@ def topk(data, k=1, axis=-1, ret_type="both", is_ascend=False, dtype="int64"): if i == axis: end.append(k if isinstance(k, int) else tvm.te.size_var("dim")) else: - end.append(data.shape[i]) + end.append(dshape[i]) if ret_type == "both": values_out, indices_out = output values_out = strided_slice(values_out, beg, end, strides) diff --git a/python/tvm/topi/cuda/unique.py b/python/tvm/topi/cuda/unique.py index 02a5cf3bc592..2bca3c447c4c 100644 --- a/python/tvm/topi/cuda/unique.py +++ b/python/tvm/topi/cuda/unique.py @@ -24,6 +24,15 @@ from ..utils import ceil_div +def _get_max_threads(batch_size): + target = tvm.target.Target.current() + max_threads = tvm.target.Target.current(allow_none=False).max_num_threads + if "vulkan" in str(target) and not isinstance(batch_size, tvm.tir.IntImm): + # SPIR-V does not support dynamic thread group size + return max_threads + return tir.min(batch_size, 
max_threads) + + def _calc_adjacent_diff_ir(data, output, binop=tir.Sub): """Low level IR to calculate adjacent difference in an 1-D array. @@ -46,7 +55,7 @@ def _calc_adjacent_diff_ir(data, output, binop=tir.Sub): data_ptr = ib.buffer_ptr(data) output_ptr = ib.buffer_ptr(output) batch_size = data.shape[0] - max_threads = tir.min(batch_size, tvm.target.Target.current(allow_none=False).max_num_threads) + max_threads = _get_max_threads(batch_size) with ib.new_scope(): nthread_tx = max_threads nthread_bx = ceil_div(batch_size, max_threads) @@ -157,7 +166,7 @@ def _calc_unique_ir( unique_seq_indices_ptr = ib.buffer_ptr(indices) batch_size = data.shape[0] - max_threads = tir.min(batch_size, tvm.target.Target.current(allow_none=False).max_num_threads) + max_threads = _get_max_threads(batch_size) # if need to return counts if isinstance(counts, tir.Buffer): @@ -238,7 +247,7 @@ def _calc_first_occurence_ir(argsorted_indices, inc_scan, first_occurence): inc_scan_ptr = ib.buffer_ptr(inc_scan) first_occurence_ptr = ib.buffer_ptr(first_occurence) batch_size = argsorted_indices.shape[0] - max_threads = tir.min(batch_size, tvm.target.Target.current(allow_none=False).max_num_threads) + max_threads = _get_max_threads(batch_size) with ib.new_scope(): nthread_tx = max_threads nthread_bx = ceil_div(batch_size, max_threads) diff --git a/python/tvm/topi/cumsum.py b/python/tvm/topi/cumsum.py deleted file mode 100644 index 2013a352874d..000000000000 --- a/python/tvm/topi/cumsum.py +++ /dev/null @@ -1,121 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# pylint: disable=invalid-name -"""Cumsum operator""" -from ..tir import decl_buffer, ir_builder -from ..te import extern -from .utils import prod, get_const_int -from .math import cast - - -def cumsum(data, axis=None, dtype=None, exclusive=None): - """Numpy style cumsum op. Return the cumulative sum of the elements along a given axis. - - Parameters - ---------- - data : tvm.te.Tensor - The input data to the operator. - - axis : int, optional - Axis along which the cumulative sum is computed. The default (None) is to compute - the cumsum over the flattened array. - - dtype : string, optional - Type of the returned array and of the accumulator in which the elements are summed. - If dtype is not specified, it defaults to the dtype of data. - - exclusive : int, optional - If set to 1 will return exclusive sum in which the first element is not - included. In other terms, if set to 1, the j-th output element would be - the sum of the first (j-1) elements. Otherwise, it would be the sum of - the first j elements. - - Returns - ------- - result : tvm.te.Tensor - The result has the same size as data, and the same shape as data if axis is not None. - If axis is None, the result is a 1-d array. 
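The exclusive option described above is easiest to pin down on a concrete input. A quick NumPy reference (NumPy has no exclusive mode, so it is built by shifting the inclusive result and prepending zero):

    import numpy as np

    data = np.array([1, 2, 3, 4])
    inclusive = np.cumsum(data)                        # [1, 3, 6, 10]
    exclusive = np.concatenate(([0], inclusive[:-1]))  # [0, 1, 3, 6]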
- """ - if dtype is None or dtype == "": - dtype = data.dtype - - def maybe_cast(x): - if dtype != data.dtype: - return cast(x, dtype) - return x - - axis_mul_before = 1 - axis_mul_after = 1 - - if axis is None: - axis = 0 - cumsum_axis_len = prod(data.shape) - shape = (cumsum_axis_len,) - else: - if not isinstance(axis, int): - axis = get_const_int(axis) - - shape = data.shape - cumsum_axis_len = shape[axis] - - if axis < 0: - axis = len(shape) + axis - - for i, value in enumerate(shape, 0): - if i < axis: - axis_mul_before *= value - elif i > axis: - axis_mul_after *= value - - if exclusive is None: - exclusive = 0 - - def gen_ir(data_buf, out_buf): - ib = ir_builder.create() - data_buf = ib.buffer_ptr(data_buf) - out_buf = ib.buffer_ptr(out_buf) - - with ib.for_range(0, axis_mul_before * axis_mul_after, "fused", kind="parallel") as fused: - i = fused // axis_mul_after - j = fused % axis_mul_after - base_idx = i * cumsum_axis_len * axis_mul_after + j - if exclusive == 0: - out_buf[base_idx] = maybe_cast(data_buf[base_idx]) - else: - out_buf[base_idx] = cast(0, dtype) - with ib.for_range(0, cumsum_axis_len - 1, "_k") as _k: - k = _k + 1 - cur_idx = base_idx + k * axis_mul_after - prev_idx = base_idx + (k - 1) * axis_mul_after - if exclusive == 0: - out_buf[cur_idx] = out_buf[prev_idx] + maybe_cast(data_buf[cur_idx]) - else: - out_buf[cur_idx] = out_buf[prev_idx] + maybe_cast(data_buf[prev_idx]) - - return ib.get() - - out_buf = decl_buffer(shape, dtype, "out_buf") - - return extern( - [shape], - [data], - lambda ins, outs: gen_ir(ins[0], outs[0]), - dtype=dtype, - out_buffers=[out_buf], - name="cumsum_generic", - tag="cumsum_generic", - ) diff --git a/python/tvm/topi/nn/__init__.py b/python/tvm/topi/nn/__init__.py index 2ebbd1d67bd1..94a5b30c9b76 100644 --- a/python/tvm/topi/nn/__init__.py +++ b/python/tvm/topi/nn/__init__.py @@ -36,6 +36,7 @@ from .conv2d_transpose import * from .conv1d_transpose import * from .bnn import * +from .qnn import * from .upsampling import * from .local_response_norm import * from .bitserial_conv2d import * diff --git a/python/tvm/topi/nn/qnn.py b/python/tvm/topi/nn/qnn.py new file mode 100644 index 000000000000..caed28580037 --- /dev/null +++ b/python/tvm/topi/nn/qnn.py @@ -0,0 +1,190 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+"""Quantized Neural Network (QNN) Operators""" +import tvm +from tvm import te, tir, topi + +SQNN_DISABLE = 0 +SQNN_INT8 = 1 +SQNN_UINT8 = 2 +SQNN_INT32 = 3 + +SQNN_DTYPE_TO_CODE = { + "disable": SQNN_DISABLE, + "int8": SQNN_INT8, + "uint8": SQNN_UINT8, + "int32": SQNN_INT32, +} + +SQNN_CODE_TO_DTYPE = {v: k for k, v in SQNN_DTYPE_TO_CODE.items()} + + +@tvm.te.tag_scope(tag=topi.tag.ELEMWISE) +def simulated_quantize(data, out_dtype, output_scale=None, output_zero_point=None, axis=-1): + """Simulated QNN quantize operator that mimics QNN outputs without changing datatype. + The benefit of this operator over true QNN quantize is that this operator allows dynamic + datatype selection and can operate on both per-channel and scalar scales and zero points while + QNN quantize requires both of these to be fixed at compile time. + + Parameters + ---------- + data: tvm.te.Tensor + An N-D input tensor to the operator. + + out_dtype: tvm.te.Tensor + A scalar variable that indicates which datatype to simulate quantization with. Use + SQNN_DTYPE_TO_CODE to convert a dtype string into the corresponding variable + value. + + output_scale: tvm.te.Tensor, optional + A scalar tensor representing the scale to use when quantizing to integer datatypes. + When it contains more than a single value, N must match the number of channels in data. + + output_zero_point: tvm.te.Tensor, optional + A 1-D tensor representing the zero point to use when quantizing to integer datatypes. + When it contains more than a single value, N must match the number of channels in data. + + axis: int, optional + The channel axis for quantization. Default value is -1 which corresponds to the last axis. + + """ + # When disabled, just pass through the input values. + def _compute_pass_through(value, *indices): + return value[indices] + + # Simulate quantization for arbitrary integer datatypes. The computation for all datatypes is: + # Q_output = clip((round(input_tensor/output_scale) + output_zero_point), + # out_dtype::min, + # out_dtype::max) + def _compute_intn(dtype, value, *indices): + assert output_scale is not None and output_zero_point is not None + const_min = tvm.tir.min_value(dtype) + const_max = tvm.tir.max_value(dtype) + # Use indexmod to handle both scalar and per-channel QNN parameters. + scale_idx = tir.indexmod(indices[axis], topi.shape(output_scale)[0]) + zp_idx = tir.indexmod(indices[axis], topi.shape(output_zero_point)[0]) + return te.max( + te.min( + te.round(value[indices] / output_scale[scale_idx]) + output_zero_point[zp_idx], + const_max, + ), + const_min, + ) + + # Use an if chain to dynamically return the proper quantization based on the input datatype. + # This allows the op to compile once but apply different quantization approaches + # using a variable datatype input. 
+ def _dispatch_sim_quantize(value): + pass_through_value = te.compute( + data.shape, lambda *indices: _compute_pass_through(value, *indices) + ) + int8_value = te.compute( + data.shape, + lambda *indices: tir.if_then_else( + out_dtype.equal(SQNN_DTYPE_TO_CODE["int8"]), + _compute_intn("int8", value, *indices), + pass_through_value[indices], + ), + ) + uint8_value = te.compute( + data.shape, + lambda *indices: tir.if_then_else( + out_dtype.equal(SQNN_DTYPE_TO_CODE["uint8"]), + _compute_intn("uint8", value, *indices), + int8_value[indices], + ), + ) + int32_value = te.compute( + data.shape, + lambda *indices: tir.if_then_else( + out_dtype.equal(SQNN_DTYPE_TO_CODE["int32"]), + _compute_intn("int32", value, *indices), + uint8_value[indices], + ), + ) + + return int32_value + + return te.compute(data.shape, lambda *indices: _dispatch_sim_quantize(data)[indices]) + + +@tvm.te.tag_scope(tag=topi.tag.ELEMWISE) +def simulated_dequantize(data, in_dtype, input_scale=None, input_zero_point=None, axis=-1): + """Simulated QNN dequantize operator that mimics QNN outputs without changing datatype. + The benefit of this operator over true QNN dequantize is that this operator allows dynamic + datatype selection and can operate on both per-channel and scalar scales and zero points while + QNN dequantize requires both of these to be fixed at compile time. + + Parameters + ---------- + data: tvm.te.Tensor + An N-D input tensor to the operator. + + in_dtype: tvm.te.Tensor + A scalar variable that indicates which datatype to simulate dequantization with. Use + SQNN_DTYPE_TO_CODE to convert a dtype string into the corresponding variable + value. + + input_scale: tvm.te.Tensor, optional + A scalar tensor representing the scale to use when dequantizing from integer datatypes. + When it contains more than a single value, N must match the number of channels in data. + + input_zero_point: tvm.te.Tensor, optional + A 1-D tensor representing the zero point to use when dequantizing from integer datatypes. + When it contains more than a single value, N must match the number of channels in data. + + axis: int, optional + The channel axis for quantization. Default value is -1 which corresponds to the last axis. + + """ + # When disabled simply return the input tensor. + def _compute_pass_through(value, *indices): + return value[indices] + + # Simulate dequantization for arbitrary integer datatypes. The computation for all datatypes is: + # DQ_output = (input - zero_point) * scale + def _compute_intn(value, *indices): + assert input_scale is not None and input_zero_point is not None + # Use indexmod to handle both scalar and per-channel QNN parameters. + scale_idx = tir.indexmod(indices[axis], topi.shape(input_scale)[0]) + zp_idx = tir.indexmod(indices[axis], topi.shape(input_zero_point)[0]) + return (value[indices] - input_zero_point[zp_idx]) * input_scale[scale_idx] + + # Use an if chain to dynamically return the proper dequantization based on the input datatype. + # This allows the op to compile once but apply different quantization approaches + # using a variable datatype input. 
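The dequantize path above is the inverse affine map, (q - zero_point) * scale. Continuing the numbers from the quantize sketch earlier:

    import numpy as np

    q = np.array([-12.0, 3.0, 7.0, 127.0])
    scale, zero_point = 0.1, 3
    x_hat = (q - zero_point) * scale
    # x_hat == [-1.5, 0.0, 0.4, 12.4]; the last entry shows the loss from clipping.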
+ def _dispatch_sim_dequantize(value): + pass_through_value = te.compute( + data.shape, lambda *indices: _compute_pass_through(value, *indices) + ) + intn_condition = tvm.te.any( + in_dtype.equal(SQNN_DTYPE_TO_CODE["int8"]), + in_dtype.equal(SQNN_DTYPE_TO_CODE["uint8"]), + in_dtype.equal(SQNN_DTYPE_TO_CODE["int32"]), + ) + intn_value = te.compute( + data.shape, + lambda *indices: tir.if_then_else( + intn_condition, + _compute_intn(value, *indices), + pass_through_value[indices], + ), + ) + + return intn_value + + return te.compute(data.shape, lambda *indices: _dispatch_sim_dequantize(data)[indices]) diff --git a/python/tvm/topi/nn/sparse.py b/python/tvm/topi/nn/sparse.py index 1bf18df09da3..756110624aa1 100644 --- a/python/tvm/topi/nn/sparse.py +++ b/python/tvm/topi/nn/sparse.py @@ -468,3 +468,72 @@ def _traverse(t): sparse_input_map[sparse_indptr] = sparse_prefix + "W_indptr" return sparse_input_map + + +def sparse_add(dense_data, sparse_data, sparse_indices, sparse_indptr): + """ + Computes sparse-dense addition + + Parameters + ---------- + dense_data : tvm.te.Tensor + 2-D with shape [M, N] + + sparse_data : tvm.te.Tensor + 1-D with shape [nnz] (CSR) + + sparse_indices : tvm.te.Tensor + 1-D with shape [nnz] (CSR) + + sparse_indptr : tvm.te.Tensor + 1-D with shape [M + 1] (CSR) + + Returns + ------- + output : tvm.te.Tensor + 2-D with shape [M, N] + """ + # TODO(ANSHUMAN87): support BSR format too + assert len(sparse_data.shape) == 1, "only CSR format is supported" + return _sparse_add_csr(dense_data, sparse_data, sparse_indices, sparse_indptr) + + +def _sparse_add_csr(dense_data_inp, sparse_data_inp, sparse_indices_inp, sparse_indptr_inp): + oshape = get_const_tuple(dense_data_inp.shape) + + def _csr_add_ir(dense_data, sparse_data, sparse_indices, sparse_indptr, out_data): + irb = tvm.tir.ir_builder.create() + dense_data_ptr = irb.buffer_ptr(dense_data) + sparse_data_ptr = irb.buffer_ptr(sparse_data) + sparse_indices_ptr = irb.buffer_ptr(sparse_indices) + sparse_indptr_ptr = irb.buffer_ptr(sparse_indptr) + + out_data_ptr = irb.buffer_ptr(out_data) + + with irb.for_range(0, oshape[0], kind="vectorize", name="row") as row: + with irb.for_range(0, oshape[1], kind="parallel", name="col") as col: + out_data_ptr[row, col] = dense_data_ptr[row, col] + + with irb.for_range(0, oshape[0], kind="parallel", name="row") as row: + offset = sparse_indptr_ptr[row] + diff = sparse_indptr_ptr[row + 1] - sparse_indptr_ptr[row] + with irb.for_range(0, diff, kind="serial", name="idx") as idx: + real_idx = offset + idx + col = sparse_indices_ptr[real_idx] + out_data_ptr[row, col] = sparse_data_ptr[real_idx] + out_data_ptr[row, col] + + return irb.get() + + return te.extern( + shape=oshape, + inputs=[dense_data_inp, sparse_data_inp, sparse_indices_inp, sparse_indptr_inp], + fcompute=lambda ins, outs: _csr_add_ir(ins[0], ins[1], ins[2], ins[3], outs[0]), + tag="sparse_add_csr", + dtype=[ + dense_data_inp.dtype, + sparse_data_inp.dtype, + sparse_indices_inp.dtype, + sparse_indptr_inp.dtype, + ], + name="sparse_add_csr_output", + ) diff --git a/python/tvm/topi/random/kernel.py b/python/tvm/topi/random/kernel.py index 728cd682fa42..b6c0b3fa5930 100644 --- a/python/tvm/topi/random/kernel.py +++ b/python/tvm/topi/random/kernel.py @@ -141,7 +141,7 @@ def mix(a, b, rotation): return [x, y] # temporary buffer for holding the results of _PERMUTATIONS - tmp = irb.allocate(out_buf.dtype, out_shape, name="tmp", scope="global") + tmp = irb.allocate(out_buf.dtype, out_shape * nwords, name="tmp", scope="global") tmp_offset = 
0 # Initialize entire key. It is composed of the original key with one @@ -430,14 +430,14 @@ def gen_ir(gen_ptr, out_left_ptr, out_right_ptr): ) -def threefry_test_wrapping(target, ctx): +def threefry_test_wrapping(target, device): """Test that unsigned arithmetic wraps on overflow. Parameters ---------- target : tvm.target.Target Target to run against - ctx : tvm.runtime.TVMContext + device : tvm.runtime.Device Context to run the test on Returns @@ -463,6 +463,6 @@ def gen_ir(out_ptr): [out.shape], [], lambda ins, outs: gen_ir(outs[0]), dtype="uint64", out_buffers=[out] ) s = tvm.te.create_schedule([f.op]) - out_ary = tvm.nd.array(np.ones((1,), "uint64"), ctx) + out_ary = tvm.nd.array(np.ones((1,), "uint64"), device) tvm.build(s, [f], target=target)(out_ary) return out_ary.asnumpy()[0] == 0 diff --git a/python/tvm/topi/scan.py b/python/tvm/topi/scan.py new file mode 100644 index 000000000000..f5796730f762 --- /dev/null +++ b/python/tvm/topi/scan.py @@ -0,0 +1,236 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name +"""Scan (cumulative binary) operators""" +from typing import Callable, Optional + +import tvm + +from ..te import extern +from ..tir import decl_buffer, generic, ir_builder +from .math import cast +from .utils import get_const_int, prod + + +def scanop( + data: tvm.te.Tensor, + binop: Callable[["tvm.Expr", "tvm.Expr"], "tvm.Expr"], + identity_value: "tvm.Expr", + op_name: str, + axis: Optional[int] = None, + dtype: Optional[str] = None, + exclusive: Optional[bool] = None, +) -> tvm.te.Tensor: + """Cumulative binary operator (scan) with similar axis behavior as np.cumsum and np.cumprod. + + See cumprod and cumsum for an example of use. + + E.g. if * is your binary operator and the input tensor is [1, 2, 3, 4] the output may be + [1, 1 * 2, 1 * 2 * 3, 1 * 2 * 3 * 4] + + Parameters + ---------- + data : tvm.te.Tensor + The input data to the operator. + + binop: Callable (tvm.Expr, tvm.Expr) -> tvm.Expr + A binary operator which should be associative and commutative. E.g. if * is your + operator then a * (b * c) = (a * b) * c and a * b = b * a + + identity_value: tvm.Expr + A value for the binary operation which provides the identity property. E.g. if * is + your operator and i is the identity_value then a * i = a for all a in the domain of + your operation. + + axis : int, optional + Axis along which the operation is computed. The default (None) is to compute + the cumulative operation over the flattened array. + + dtype : string, optional + Type of the returned array and of the accumulator in which the elements are computed. + If dtype is not specified, it defaults to the dtype of data. + + exclusive : bool, optional + If True will return exclusive cumulative operation in which the first element is not + included. 
In other terms, if True, the j-th output element would be + the cumulative operation of the first (j-1) elements. Otherwise, it would be the + cumulative operation of the first j elements. The cumulative operation of zero elements + is assumed to be the identity_value. + + Returns + ------- + result : tvm.te.Tensor + The result has the same size as data, and the same shape as data if axis is not None. + If axis is None, the result is a 1-d array. + """ + if dtype is None or dtype == "": + dtype = data.dtype + + if exclusive is None: + exclusive = False + + def maybe_cast(x): + if dtype != data.dtype: + return cast(x, dtype) + return x + + axis_mul_before = 1 + axis_mul_after = 1 + + if axis is None: + axis = 0 + cumsum_axis_len = prod(data.shape) + shape = (cumsum_axis_len,) + else: + if not isinstance(axis, int): + axis = get_const_int(axis) + + shape = data.shape + cumsum_axis_len = shape[axis] + + if axis < 0: + axis = len(shape) + axis + + for i, value in enumerate(shape, 0): + if i < axis: + axis_mul_before *= value + elif i > axis: + axis_mul_after *= value + + def gen_ir(data_buf, out_buf): + ib = ir_builder.create() + data_buf = ib.buffer_ptr(data_buf) + out_buf = ib.buffer_ptr(out_buf) + + with ib.for_range(0, axis_mul_before * axis_mul_after, "fused", kind="parallel") as fused: + i = fused // axis_mul_after + j = fused % axis_mul_after + base_idx = i * cumsum_axis_len * axis_mul_after + j + if exclusive: + out_buf[base_idx] = cast(identity_value, dtype) + else: + out_buf[base_idx] = maybe_cast(data_buf[base_idx]) + with ib.for_range(0, cumsum_axis_len - 1, "_k") as _k: + k = _k + 1 + cur_idx = base_idx + k * axis_mul_after + prev_idx = base_idx + (k - 1) * axis_mul_after + if exclusive: + out_buf[cur_idx] = binop(out_buf[prev_idx], maybe_cast(data_buf[prev_idx])) + else: + out_buf[cur_idx] = binop(out_buf[prev_idx], maybe_cast(data_buf[cur_idx])) + + return ib.get() + + out_buf = decl_buffer(shape, dtype, "out_buf") + + return extern( + [shape], + [data], + lambda ins, outs: gen_ir(ins[0], outs[0]), + dtype=dtype, + out_buffers=[out_buf], + name=op_name, + tag=op_name, + ) + + +def cumsum( + data: tvm.te.Tensor, + axis: Optional[int] = None, + dtype: Optional[int] = None, + exclusive: Optional[bool] = None, +) -> tvm.te.Tensor: + """Numpy style cumsum op. Return the cumulative sum of the elements along a given axis. + + Parameters + ---------- + data : tvm.te.Tensor + The input data to the operator. + + axis : int, optional + Axis along which the cumulative sum is computed. The default (None) is to compute + the cumsum over the flattened array. + + dtype : string, optional + Type of the returned array and of the accumulator in which the elements are summed. + If dtype is not specified, it defaults to the dtype of data. + + exclusive : bool, optional + If True, will return exclusive sum in which the first element is not + included. In other terms, if True, the j-th output element would be + the sum of the first (j-1) elements. Otherwise, it would be the sum of + the first j elements. + + Returns + ------- + result : tvm.te.Tensor + The result has the same size as data, and the same shape as data if axis is not None. + If axis is None, the result is a 1-d array. 
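`cumsum` above (and `cumprod` below) are thin wrappers over `scanop`. A reference implementation of the shared scan semantics in plain NumPy, written here for illustration and covering only the one-dimensional case:

    import numpy as np

    def scan(data, binop, identity, exclusive=False):
        out = np.empty_like(data)
        acc = identity
        for i, v in enumerate(data):
            if exclusive:
                out[i] = acc           # emit the running value *before* folding in v
                acc = binop(acc, v)
            else:
                acc = binop(acc, v)    # fold v in first, then emit
                out[i] = acc
        return out

    a = np.array([1, 2, 3, 4])
    assert np.array_equal(scan(a, np.add, 0), np.cumsum(a))                       # [1, 3, 6, 10]
    assert np.array_equal(scan(a, np.multiply, 1, exclusive=True), [1, 1, 2, 6])  # exclusive cumprod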
+ """ + return scanop( + data=data, + binop=generic.add, + identity_value=0, + op_name="cumsum_generic", + axis=axis, + dtype=dtype, + exclusive=exclusive, + ) + + +def cumprod( + data: tvm.te.Tensor, + axis: Optional[int] = None, + dtype: Optional[int] = None, + exclusive: Optional[bool] = None, +) -> tvm.te.Tensor: + """Numpy style cumprod op. Return the cumulative product of the elements along a given axis. + + Parameters + ---------- + data : tvm.te.Tensor + The input data to the operator. + + axis : int, optional + Axis along which the cumulative product is computed. The default (None) is to compute + the cumproduct over the flattened array. + + dtype : string, optional + Type of the returned array and of the accumulator in which the elements are multiplied. + If dtype is not specified, it defaults to the dtype of data. + + exclusive : bool, optional + If True, will return exclusive product in which the first element is not + included. In other terms, if True, the j-th output element would be + the product of the first (j-1) elements. Otherwise, it would be the product of + the first j elements. + + Returns + ------- + result : tvm.te.Tensor + The result has the same size as data, and the same shape as data if axis is not None. + If axis is None, the result is a 1-d array. + """ + return scanop( + data=data, + binop=generic.multiply, + identity_value=1, + op_name="cumprod_generic", + axis=axis, + dtype=dtype, + exclusive=exclusive, + ) diff --git a/python/tvm/topi/sort.py b/python/tvm/topi/sort.py index 8964e363b06f..5b8e33413d65 100644 --- a/python/tvm/topi/sort.py +++ b/python/tvm/topi/sort.py @@ -104,9 +104,9 @@ def argsort(data, valid_count=None, axis=-1, is_ascend=1, dtype="float32"): np_data = np.random.uniform(dshape) s = topi.generic.schedule_argsort(out) f = tvm.build(s, [data, out], "llvm") - ctx = tvm.cpu() - tvm_data = tvm.nd.array(np_data, ctx) - tvm_out = tvm.nd.array(np.zeros(dshape, dtype=data.dtype), ctx) + dev = tvm.cpu() + tvm_data = tvm.nd.array(np_data, dev) + tvm_out = tvm.nd.array(np.zeros(dshape, dtype=data.dtype), dev) f(tvm_data, tvm_out) """ data_buf = tvm.tir.decl_buffer(data.shape, data.dtype, "data_buf", data_alignment=8) diff --git a/python/tvm/topi/testing/common.py b/python/tvm/topi/testing/common.py index e4e5e811ab18..69ffc1482ba1 100644 --- a/python/tvm/topi/testing/common.py +++ b/python/tvm/topi/testing/common.py @@ -81,7 +81,7 @@ def get_conv2d_nchw_implement(target): return dispatch(target, _conv2d_nchw_implement) -def compare_numpy_tvm(inputs, output, target, ctx, compute, schedule): +def compare_numpy_tvm(inputs, output, target, device, compute, schedule): """Compare a numpy inputs and output of a function to the results of the TVM version. Parameters @@ -92,7 +92,7 @@ def compare_numpy_tvm(inputs, output, target, ctx, compute, schedule): Verified correct function output. target : tvm.target.Target Target to run on. - ctx : tvm.TVMContext + device : tvm.runtime.Device Context to run on. compute : callable Topi compute function to test against. @@ -100,11 +100,11 @@ def compare_numpy_tvm(inputs, output, target, ctx, compute, schedule): Topi scheduling function to test against. 
""" te_inputs = [tvm.te.placeholder(shape=i.shape, dtype=str(i.dtype)) for i in inputs] - te_out = tvm.nd.array(np.zeros(output.shape).astype(output.dtype), ctx=ctx) + te_out = tvm.nd.array(np.zeros(output.shape).astype(output.dtype), device=device) with tvm.target.Target(target): out = compute(*te_inputs) s = schedule([out]) func = tvm.build(s, te_inputs + [out]) - arys = [tvm.nd.array(x, ctx=ctx) for x in inputs] + arys = [tvm.nd.array(x, device=device) for x in inputs] func(*(arys + [te_out])) assert_allclose(te_out.asnumpy(), output, atol=1e-4, rtol=1e-4) diff --git a/python/tvm/topi/unique.py b/python/tvm/topi/unique.py index b4f27b38f65f..e7256551d7b6 100644 --- a/python/tvm/topi/unique.py +++ b/python/tvm/topi/unique.py @@ -18,7 +18,7 @@ """Unique operator""" from tvm import te, tir from ..te import hybrid -from .cumsum import cumsum +from .scan import cumsum from .sort import sort, argsort diff --git a/python/tvm/topi/vision/nms.py b/python/tvm/topi/vision/nms.py index cbf136a5552c..8be62a73c09e 100644 --- a/python/tvm/topi/vision/nms.py +++ b/python/tvm/topi/vision/nms.py @@ -545,10 +545,10 @@ def non_max_suppression( np_valid_count = np.array([4]) s = topi.generic.schedule_nms(out) f = tvm.build(s, [data, valid_count, out], "llvm") - ctx = tvm.cpu() - tvm_data = tvm.nd.array(np_data, ctx) - tvm_valid_count = tvm.nd.array(np_valid_count, ctx) - tvm_out = tvm.nd.array(np.zeros(dshape, dtype=data.dtype), ctx) + dev = tvm.cpu() + tvm_data = tvm.nd.array(np_data, dev) + tvm_valid_count = tvm.nd.array(np_valid_count, dev) + tvm_out = tvm.nd.array(np.zeros(dshape, dtype=data.dtype), dev) f(tvm_data, tvm_valid_count, tvm_out) """ batch_size = data.shape[0] diff --git a/python/tvm/topi/x86/__init__.py b/python/tvm/topi/x86/__init__.py index bb6a7cdd4122..d1bd58dd4831 100644 --- a/python/tvm/topi/x86/__init__.py +++ b/python/tvm/topi/x86/__init__.py @@ -41,3 +41,4 @@ from .conv2d_alter_op import * from .dense_alter_op import * from .scatter import * +from .group_conv2d import * diff --git a/python/tvm/topi/x86/group_conv2d.py b/python/tvm/topi/x86/group_conv2d.py new file mode 100644 index 000000000000..0501c5534cf2 --- /dev/null +++ b/python/tvm/topi/x86/group_conv2d.py @@ -0,0 +1,371 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name,unused-variable,unused-argument,no-member +# pylint: disable=no-value-for-parameter,import-outside-toplevel +"""Grouped Spatial Pack Convolution (Group Conv2D) schedule on x86""" + +import tvm +from tvm import autotvm +from tvm import te +from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity + +from .utils import get_fp32_len +from ..utils import get_const_tuple +from ..nn.pad import pad +from .. 
import tag
+
+from ..nn.conv2d import _get_workload as _get_conv2d_workload
+
+
+def group_conv2d_nchw(data, kernel, strides, padding, dilation, groups, out_dtype):
+    """Compute group_conv2d with NCHW layout"""
+    return group_conv2d_nchw_spatial_pack(
+        data, kernel, strides, padding, dilation, groups, out_dtype
+    )
+
+
+def schedule_group_conv2d_nchw(outs):
+    """Create schedule for group_conv2d with NCHW layout"""
+    return schedule_group_conv2d_nchwc(outs)
+
+
+def _get_default_config(cfg, data, kernel, strides, padding, groups, out_dtype, layout="NCHW"):
+    """
+    Get default schedule config for the workload
+    """
+    static_data_shape = []
+    for dim in get_const_tuple(data.shape):
+        if isinstance(dim, tvm.tir.Var):
+            static_data_shape.append(1)
+        else:
+            static_data_shape.append(dim)
+    data = te.placeholder(static_data_shape, dtype=data.dtype)
+
+    wkl = _get_conv2d_workload(data, kernel, strides, padding, out_dtype, layout)
+    _fallback_schedule(cfg, wkl)
+
+
+def _fallback_schedule(cfg, wkl):
+    simd_width = get_fp32_len()
+    pad_left, pad_right = wkl.padl, wkl.padr
+    stride_w = wkl.stride_w
+    out_width = (wkl.width + pad_left + pad_right - wkl.kernel_w) // stride_w + 1
+    groups = wkl.groups
+    kernels_per_group = wkl.out_filter // groups
+    kernel_depth = wkl.in_filter // groups
+
+    oc_bn = 1
+    for bn in range(simd_width, 0, -1):
+        if kernels_per_group % bn == 0:
+            oc_bn = bn
+            break
+    if oc_bn > kernels_per_group:
+        oc_bn = kernels_per_group
+
+    ic_bn = 1
+    for bn in range(oc_bn, 0, -1):
+        if kernel_depth % bn == 0:
+            ic_bn = bn
+            break
+    if ic_bn > kernel_depth:
+        ic_bn = kernel_depth
+
+    reg_n = 1
+    for n in range(31, 0, -1):
+        if out_width % n == 0:
+            reg_n = n
+            break
+
+    cfg["tile_ic"] = SplitEntity([wkl.in_filter // ic_bn, ic_bn])
+    cfg["tile_oc"] = SplitEntity([wkl.out_filter // oc_bn, oc_bn])
+    cfg["tile_ow"] = SplitEntity([out_width // reg_n, reg_n])
+    cfg["unroll_kw"] = OtherOptionEntity(False)
+
+
+@autotvm.register_topi_compute("group_conv2d_nchw.x86")
+def group_conv2d_nchw_spatial_pack(
+    cfg, data, kernel, strides, padding, dilation, groups, out_dtype="float32"
+):
+    """
+    Compute group conv2d with NCHW layout, using GSPC algorithm.

+ https://arxiv.org/abs/2006.09791 + """ + assert isinstance(dilation, int) or len(dilation) == 2 + if isinstance(dilation, int): + dilation_h, dilation_w = dilation, dilation + else: + dilation_h, dilation_w = dilation + + assert isinstance(padding, int) or len(padding) == 2 or len(padding) == 4 + if isinstance(padding, int): + pad_top, pad_left, pad_bottom, pad_right = padding, padding, padding, padding + elif len(padding) == 2: + hpad, wpad = padding + pad_top, pad_bottom = hpad, hpad + pad_left, pad_right = wpad, wpad + else: + pad_top, pad_left, pad_bottom, pad_right = padding + + hpad = pad_top + pad_bottom + wpad = pad_left + pad_right + + assert isinstance(strides, int) or len(strides) == 2 + if isinstance(strides, int): + stride_h, stride_w = strides, strides + else: + stride_h, stride_w = strides + + batch_size, in_channel, in_height, in_width = get_const_tuple(data.shape) + out_channel, kernel_depth, k_height, k_width = get_const_tuple(kernel.shape) + + pad_height = in_height + pad_top + pad_bottom + pad_width = in_width + pad_left + pad_right + + dilated_kernel_h = (k_height - 1) * dilation_h + 1 + dilated_kernel_w = (k_width - 1) * dilation_w + 1 + out_height = (in_height + pad_top + pad_bottom - dilated_kernel_h) // stride_h + 1 + out_width = (in_width + pad_left + pad_right - dilated_kernel_w) // stride_w + 1 + + kernels_per_group = out_channel // groups + + cfg.define_split("tile_ic", in_channel, num_outputs=2) + cfg.define_split("tile_oc", out_channel, num_outputs=2) + cfg.define_split("tile_ow", out_width, num_outputs=2, filter=lambda y: y.size[-1] <= 64) + cfg.define_knob("unroll_kw", [True, False]) + + # If no config was set, we can fallback to default config. + if cfg.is_fallback: + _get_default_config( + cfg, + te.placeholder((batch_size, in_channel, in_height, in_width), dtype=data.dtype), + te.placeholder( + (out_channel, in_channel // groups, k_height, k_width), dtype=kernel.dtype + ), + strides, + padding, + groups, + out_dtype, + ) + + oc_bn = cfg["tile_oc"].size[-1] + ic_bn = cfg["tile_ic"].size[-1] + + # pack data + DOPAD = hpad != 0 or wpad != 0 + if DOPAD: + data_pad = pad( + data, (0, 0, pad_top, pad_left), (0, 0, pad_bottom, pad_right), name="data_pad" + ) + else: + data_pad = data + + shape = (groups, batch_size, kernel_depth // ic_bn, pad_height, ic_bn, pad_width) + + data_vec = te.compute( + shape, + lambda g, n, C, h, c, w: data_pad[n, C * ic_bn + c + kernel_depth * g, h, w], + name="data_vec", + ) + + # pack kernel + shape = ( + groups, + kernels_per_group // oc_bn, + kernel_depth // ic_bn, + k_height, + k_width, + ic_bn, + oc_bn, + ) + + kernel_vec = te.compute( + shape, + lambda g, out_channel, in_channel, h, w, ci, co: kernel[ + (out_channel * oc_bn + co + g * kernels_per_group), in_channel * ic_bn + ci, h, w + ], + name="kernel_vec", + ) + + # convolution + oshape = (groups, batch_size, kernels_per_group // oc_bn, out_height, out_width, oc_bn) + unpack_shape = (batch_size, out_channel, out_height, out_width) + + ic = te.reduce_axis((0, (kernel_depth)), name="ic") + kh = te.reduce_axis((0, k_height), name="kh") + kw = te.reduce_axis((0, k_width), name="kw") + + idxmod = tvm.tir.indexmod + idxdiv = tvm.tir.indexdiv + conv = te.compute( + oshape, + lambda g, n, oc_chunk, oh, ow, oc_block: te.sum( + data_vec[ + g, + n, + idxdiv(ic, ic_bn), + oh * stride_h + kh * dilation_h, + idxmod(ic, ic_bn), + ow * stride_w + kw * dilation_w, + ].astype(out_dtype) + * kernel_vec[ + g, oc_chunk, idxdiv(ic, ic_bn), kh, kw, idxmod(ic, ic_bn), oc_block + 
].astype(out_dtype), + axis=[ic, kh, kw], + ), + name="conv", + ) + + unpack = te.compute( + unpack_shape, + lambda n, c, h, w: conv[ + idxdiv(c, kernels_per_group), + n, + idxmod(idxdiv(c, oc_bn), (kernels_per_group // oc_bn)), + h, + w, + idxmod(idxmod(c, oc_bn), kernels_per_group), + ].astype(out_dtype), + name="output_unpack", + tag="group_conv2d_nchw", + ) + + return unpack + + +@autotvm.register_topi_schedule("group_conv2d_nchw.x86") +def schedule_group_conv2d_nchwc(cfg, outs): + """Create schedule for tensors""" + s = te.create_schedule([x.op for x in outs]) + scheduled_ops = [] + + def traverse(op): + """Traverse operators from computation graph""" + # inline all one-to-one-mapping operators except the last stage (output) + if tag.is_broadcast(op.tag): + if op not in s.outputs: + s[op].compute_inline() + for tensor in op.input_tensors: + if isinstance(tensor.op, tvm.te.ComputeOp) and tensor.op not in scheduled_ops: + traverse(tensor.op) + + if "group_conv2d_nchw" in op.tag: + output = op.output(0) + + if "tile_ic" not in cfg: + return + conv_out = op.input_tensors[0] + kernel_vec = conv_out.op.input_tensors[1] + kernel = kernel_vec.op.input_tensors[0] + if isinstance(kernel.op, tvm.te.ComputeOp) and "dilate" in kernel.op.tag: + s[kernel].compute_inline() + data_vec = conv_out.op.input_tensors[0] + data = data_vec.op.input_tensors[0] + data_pad = None + if isinstance(data.op, tvm.te.ComputeOp) and "pad" in data.op.tag: + data_pad = data + data = data_pad.op.input_tensors[0] + + args = [s, cfg, data, data_pad, data_vec, kernel_vec, conv_out, output, outs[0]] + _schedule_gspc_nchw(*args) + + scheduled_ops.append(op) + + traverse(outs[0].op) + return s + + +def _schedule_gspc_nchw(s, cfg, data, data_pad, data_vec, kernel_vec, conv_out, output, last): + """Schedule GSPC""" + ic_bn, oc_bn, reg_n, unroll_kw = ( + cfg["tile_ic"].size[-1], + cfg["tile_oc"].size[-1], + cfg["tile_ow"].size[-1], + cfg["unroll_kw"].val, + ) + + _, W = data, kernel_vec + A0, A1 = data_pad, data_vec + + # schedule data + if ( + data_pad is not None + and isinstance(data_pad.op, tvm.te.ComputeOp) + and "pad" in data_pad.op.tag + ): + s[A0].compute_inline() + + groups, batch, ic_chunk, ih, ic_block, _ = s[A1].op.axis + + parallel_axis = s[A1].fuse(batch, ic_chunk, ih) + s[A1].parallel(parallel_axis) + + # schedule kernel pack + groups, oc_chunk, ic_chunk, oh, ow, ic_block, oc_block = s[W].op.axis + s[W].reorder(oc_chunk, oh, ic_chunk, ow, ic_block, oc_block) + + if oc_bn > 1: + s[W].vectorize(oc_block) + + parallel_axis = s[W].fuse(groups, oc_chunk, oh) + s[W].parallel(parallel_axis) + + # schedule conv + C, O0, O = conv_out, output, last + CC = s.cache_write(C, "global") + + _, _, oc_chunk, oh, ow, oc_block = s[C].op.axis + + ow_chunk, ow_block = s[C].split(ow, factor=reg_n) + + s[C].reorder(oc_chunk, oh, ow_chunk, ow_block, oc_block) + s[C].fuse(oc_chunk, oh) + s[C].vectorize(oc_block) + + groups, batch, oc_chunk, oh, ow, oc_block = s[CC].op.axis + + ic, kh, kw = s[CC].op.reduce_axis + ow_chunk, ow_block = s[CC].split(ow, factor=reg_n) + ic_chunk, ic_block = s[CC].split(ic, factor=ic_bn) + + if unroll_kw: + s[CC].reorder(oc_chunk, oh, ow_chunk, ic_chunk, kh, ic_block, kw, ow_block, oc_block) + s[CC].unroll(kw) + else: + s[CC].reorder(oc_chunk, oh, ow_chunk, ic_chunk, kh, kw, ic_block, ow_block, oc_block) + + parallel_axis = s[CC].fuse(groups, batch, oc_chunk, oh) + + s[CC].parallel(parallel_axis) + + s[CC].vectorize(oc_block) + + s[CC].unroll(ow_block) + + if O0 != O: + s[O0].compute_inline() + + batch, oc, oh, 
ow = s[O].op.axis + ow_chunk, ow_block = s[O].split(ow, factor=reg_n) + oc_chunk, oc_block = s[O].split(oc, factor=oc_bn) + + s[O].reorder(batch, oc_chunk, oh, ow_chunk, ow_block, oc_block) + parallel_axis = s[O].fuse(oc_chunk, oh) + s[O].vectorize(oc_block) + s[O].parallel(parallel_axis) + return s diff --git a/rust/tvm-graph-rt/Cargo.toml b/rust/tvm-graph-rt/Cargo.toml index 13837f62695d..5c492393a75e 100644 --- a/rust/tvm-graph-rt/Cargo.toml +++ b/rust/tvm-graph-rt/Cargo.toml @@ -19,7 +19,7 @@ name = "tvm-graph-rt" version = "0.1.0" license = "Apache-2.0" -description = "A static graph runtime for TVM." +description = "A static graph executor for TVM." repository = "https://github.com/apache/tvm" readme = "README.md" keywords = ["tvm"] diff --git a/rust/tvm-graph-rt/src/array.rs b/rust/tvm-graph-rt/src/array.rs index deacf11bec04..8ae716a3266f 100644 --- a/rust/tvm-graph-rt/src/array.rs +++ b/rust/tvm-graph-rt/src/array.rs @@ -20,7 +20,7 @@ use std::{convert::TryFrom, mem, os::raw::c_void, ptr, slice}; use ndarray; -use tvm_sys::{ffi::DLTensor, Context, DataType}; +use tvm_sys::{ffi::DLTensor, DataType, Device}; use crate::allocator::Allocation; use crate::errors::ArrayError; @@ -151,7 +151,7 @@ impl<'d, 's, T> From<&'d [T]> for Storage<'s> { pub struct Tensor<'a> { /// The bytes which contain the data this `Tensor` represents. pub(crate) data: Storage<'a>, - pub(crate) ctx: Context, + pub(crate) device: Device, pub(crate) dtype: DataType, pub(crate) shape: Vec, // ^ not usize because `typedef int64_t tvm_index_t` in c_runtime_api.h @@ -243,7 +243,7 @@ impl<'a> Tensor<'a> { pub fn to_owned(&self) -> Tensor<'static> { let t = Tensor { data: self.data.to_owned(), - ctx: self.ctx, + device: self.device, dtype: self.dtype, size: self.size, shape: self.shape.clone(), @@ -262,7 +262,7 @@ impl<'a> Tensor<'a> { Tensor { data: storage, - ctx: Context::default(), + device: Device::default(), dtype: dtype_fn(8 * type_width, 1), size: arr.len(), shape: arr.shape().iter().map(|&v| v as i64).collect(), @@ -275,7 +275,7 @@ impl<'a> Tensor<'a> { assert!(!flatten || self.is_contiguous()); DLTensor { data: unsafe { self.data.as_mut_ptr().offset(self.byte_offset) } as *mut c_void, - ctx: self.ctx.into(), + device: self.device.into(), ndim: if flatten { 1 } else { self.shape.len() } as i32, dtype: self.dtype.into(), shape: if flatten { @@ -356,7 +356,7 @@ impl<'a> From for Tensor<'a> { )); Self { data: storage, - ctx: Context::default(), + device: Device::default(), dtype, size, shape, diff --git a/rust/tvm-graph-rt/src/graph.rs b/rust/tvm-graph-rt/src/graph.rs index 83fe37ea7970..de2e7dddff5c 100644 --- a/rust/tvm-graph-rt/src/graph.rs +++ b/rust/tvm-graph-rt/src/graph.rs @@ -33,13 +33,13 @@ use serde_json; use tvm_sys::ffi::{DLDataTypeCode_kDLFloat, DLDataTypeCode_kDLInt, DLDataTypeCode_kDLUInt}; -use tvm_sys::{ffi::DLTensor, ArgValue, Context, DataType, DeviceType}; +use tvm_sys::{ffi::DLTensor, ArgValue, DataType, Device, DeviceType}; use crate::{errors::*, Module, Storage, Tensor}; // @see `kTVMNDArrayMagic` in `ndarray.h` const _NDARRAY_MAGIC: u64 = 0xDD5E_40F0_96B4_A13F; -// @see `kTVMNDArrayListMagic` in `graph_runtime.h` +// @see `kTVMNDArrayListMagic` in `graph_executor.h` const _NDARRAY_LIST_MAGIC: u64 = 0xF7E5_8D4F_0504_9CB7; /// A TVM computation graph. 
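These Rust changes track the same rename the Python files in this series already made: a TVMContext is now a Device everywhere. For reference, the post-rename Python idiom that the `Device::default()` calls above mirror:

    import numpy as np
    import tvm

    dev = tvm.cpu(0)  # a Device handle; this object used to be called a "context"
    arr = tvm.nd.array(np.zeros((2, 2), dtype="float32"), dev)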
@@ -240,7 +240,7 @@ impl<'m, 't> GraphExecutor<'m, 't> { let storage = storages[storage_id].view(); Tensor { data: mem::replace(&mut storages[storage_id], storage), - ctx: Context::default(), + device: Device::default(), dtype, size: shape.iter().product::() as usize, shape, @@ -418,14 +418,14 @@ named! { ) } -// Parses a Context +// Parses a Device named! { - tvm_ctx<&[u8], Context>, + tvm_device<&[u8], Device>, do_parse!( device_type: le_u32 >> device_id: le_i32 >> ( - Context { + Device { device_type: DeviceType::from(device_type), device_id: device_id as usize, } @@ -449,7 +449,7 @@ named! { do_parse!( take!(8) >> le_u64 >> - ctx: tvm_ctx >> + device: tvm_device >> ndim: le_u32 >> dtype: data_type >> shape: count!(map!(le_i64, |sz| sz as i64), ndim as usize) >> @@ -458,7 +458,7 @@ named! { ( Tensor { data: Storage::from(data), - ctx: ctx, + device: device, dtype: dtype, size: shape.iter().product::() as usize, shape: shape, diff --git a/rust/tvm-rt/src/context.rs b/rust/tvm-rt/src/device.rs similarity index 91% rename from rust/tvm-rt/src/context.rs rename to rust/tvm-rt/src/device.rs index b0fea33c6c61..b1cb58cd54cf 100644 --- a/rust/tvm-rt/src/context.rs +++ b/rust/tvm-rt/src/device.rs @@ -24,10 +24,10 @@ use crate::errors::Error; use tvm_sys::ffi; -pub use tvm_sys::context::*; +pub use tvm_sys::device::*; -trait ContextExt { - /// Checks whether the context exists or not. +trait DeviceExt { + /// Checks whether the device exists or not. fn exist(&self) -> bool; fn sync(&self) -> Result<(), Error>; fn max_threads_per_block(&self) -> isize; @@ -57,7 +57,7 @@ crate::external! { fn get_device_attr(device_type: i32, device_id: i32, device_kind: i32) -> i32; } -impl ContextExt for Context { +impl DeviceExt for Device { fn exist(&self) -> bool { let exists = get_device_attr(self.device_type as i32, self.device_id as i32, 0) .expect("should not fail"); @@ -65,7 +65,7 @@ impl ContextExt for Context { exists != 0 } - /// Synchronize the context stream. + /// Synchronize the device stream. fn sync(&self) -> Result<(), Error> { check_call!(ffi::TVMSynchronize( self.device_type as i32, @@ -91,7 +91,7 @@ mod tests { #[test] fn sync() { - let ctx = Context::cpu(0); - assert!(ctx.sync().is_ok()) + let dev = Device::cpu(0); + assert!(dev.sync().is_ok()) } } diff --git a/rust/tvm-rt/src/lib.rs b/rust/tvm-rt/src/lib.rs index 5f9ab1617378..c43264da9e5b 100644 --- a/rust/tvm-rt/src/lib.rs +++ b/rust/tvm-rt/src/lib.rs @@ -38,7 +38,7 @@ use std::{ }; pub use crate::{ - context::{Context, DeviceType}, + device::{Device, DeviceType}, errors::*, function::Function, module::Module, @@ -92,7 +92,7 @@ pub(crate) fn set_last_error(err: &E) { } pub mod array; -pub mod context; +pub mod device; pub mod errors; pub mod function; pub mod map; @@ -111,7 +111,7 @@ pub fn version() -> &'static str { #[cfg(test)] mod tests { use super::*; - use crate::{ByteArray, Context, DataType}; + use crate::{ByteArray, DataType, Device}; use std::{convert::TryInto, str::FromStr}; #[test] @@ -148,9 +148,9 @@ mod tests { } #[test] - fn ctx() { - let c = Context::from_str("gpu").unwrap(); - let tvm: Context = RetValue::from(c).try_into().unwrap(); + fn device() { + let c = Device::from_str("gpu").unwrap(); + let tvm: Device = RetValue::from(c).try_into().unwrap(); assert_eq!(tvm, c); } } diff --git a/rust/tvm-rt/src/ndarray.rs b/rust/tvm-rt/src/ndarray.rs index 07f783f0ef43..4c48ce50b4f3 100644 --- a/rust/tvm-rt/src/ndarray.rs +++ b/rust/tvm-rt/src/ndarray.rs @@ -20,16 +20,16 @@ //! 
This module implements the [`NDArray`] type for working with *TVM tensors* or
//! converting from a Rust ndarray to TVM `NDArray`.
//!
-//! One can create an empty NDArray given the shape, context and dtype using [`empty`].
+//! One can create an empty NDArray given the shape, device and dtype using [`empty`].
//! To create an NDArray from a mutable buffer in cpu use [`copy_from_buffer`].
-//! To copy an NDArray to different context use [`copy_to_ctx`].
+//! To copy an NDArray to a different device use [`copy_to_device`].
//!
//! Given a [`Rust's dynamic ndarray`], one can convert it to TVM NDArray as follows:
//!
//! # Example
//!
//! ```
-//! # use tvm_rt::{NDArray, Context, DataType};
+//! # use tvm_rt::{NDArray, DataType, Device};
//! # use ndarray::{Array, ArrayD};
//! # use std::str::FromStr;
//! use std::convert::TryFrom;
@@ -37,7 +37,7 @@
//! let a = Array::from_shape_vec((2, 2), vec![1f32, 2., 3., 4.])
//!     .unwrap()
//!     .into_dyn(); // Rust's ndarray
-//! let nd = NDArray::from_rust_ndarray(&a, Context::cpu(0), DataType::from_str("float32").unwrap()).unwrap();
+//! let nd = NDArray::from_rust_ndarray(&a, Device::cpu(0), DataType::from_str("float32").unwrap()).unwrap();
//! assert_eq!(nd.shape(), &[2, 2]);
//! let rnd: ArrayD<f32> = ArrayD::try_from(&nd).unwrap();
//! assert!(rnd.all_close(&a, 1e-8f32));
//! ```
//!
//! [`Rust's dynamic ndarray`]:https://docs.rs/ndarray/0.12.1/ndarray/
//! [`copy_from_buffer`]:struct.NDArray.html#method.copy_from_buffer
-//! [`copy_to_ctx`]:struct.NDArray.html#method.copy_to_ctx
+//! [`copy_to_device`]:struct.NDArray.html#method.copy_to_device

use std::ffi::c_void;
use std::{borrow::Cow, convert::TryInto};
@@ -54,7 +54,7 @@ use std::{convert::TryFrom, mem, os::raw::c_int, ptr, slice, str::FromStr};
use mem::size_of;
use tvm_macros::Object;
use tvm_sys::ffi::DLTensor;
-use tvm_sys::{ffi, ByteArray, Context, DataType};
+use tvm_sys::{ffi, ByteArray, DataType, Device};

use ndarray::{Array, ArrayD};
use num_traits::Num;
@@ -189,9 +189,9 @@ impl NDArray {
         self.len() * self.dtype().itemsize()
     }

-    /// Returns the context which the NDArray was defined.
-    pub fn ctx(&self) -> Context {
-        self.as_dltensor().ctx.into()
+    /// Returns the device on which the NDArray was defined.
+    pub fn device(&self) -> Device {
+        self.as_dltensor().device.into()
     }

     /// Returns the type of the entries of the NDArray.
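The doc examples above allocate with `empty` and fill with `copy_from_buffer`; the long-standing Python equivalents are `tvm.nd.empty` and `NDArray.copyfrom`, shown here for comparison:

    import numpy as np
    import tvm

    dev = tvm.cpu(0)
    nd = tvm.nd.empty((4,), "int32", dev)  # allocate uninitialized storage on a device
    nd.copyfrom(np.array([1, 2, 3, 4], dtype="int32"))
    assert nd.asnumpy().tolist() == [1, 2, 3, 4]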
@@ -239,12 +239,12 @@ impl NDArray { /// ## Example /// /// ``` - /// # use tvm_rt::{Context, DataType, NDArray}; + /// # use tvm_rt::{Device, DataType, NDArray}; /// # use std::str::FromStr; /// let mut shape = [4]; /// let mut data = vec![1i32, 2, 3, 4]; - /// let ctx = Context::cpu(0); - /// let mut ndarray = NDArray::empty(&mut shape, ctx, DataType::from_str("int32").unwrap()); + /// let dev = Device::cpu(0); + /// let mut ndarray = NDArray::empty(&mut shape, dev, DataType::from_str("int32").unwrap()); /// ndarray.copy_from_buffer(&mut data); /// assert_eq!(ndarray.shape(), shape); /// assert_eq!(ndarray.to_vec::().unwrap(), data); @@ -272,12 +272,12 @@ impl NDArray { /// ## Example /// /// ``` - /// # use tvm_rt::{Context, DataType, NDArray}; + /// # use tvm_rt::{Device, DataType, NDArray}; /// # use std::str::FromStr; /// let shape = &mut [2]; /// let mut data = vec![1f32, 2.0]; - /// let ctx = Context::cpu(0); - /// let mut ndarray = NDArray::empty(shape, ctx, DataType::from_str("int32").unwrap()); + /// let dev = Device::cpu(0); + /// let mut ndarray = NDArray::empty(shape, dev, DataType::from_str("int32").unwrap()); /// ndarray.copy_from_buffer(&mut data); /// ``` /// @@ -332,8 +332,8 @@ impl NDArray { Ok(target) } - /// Copies the NDArray to a target context. - pub fn copy_to_ctx(&self, target: &Context) -> Result { + /// Copies the NDArray to a target device. + pub fn copy_to_device(&self, target: &Device) -> Result { let tmp = NDArray::empty(self.shape(), *target, self.dtype()); let copy = self.copy_to_ndarray(tmp)?; Ok(copy) @@ -342,17 +342,17 @@ impl NDArray { /// Converts a Rust's ndarray to TVM NDArray. pub fn from_rust_ndarray( input_nd: &ArrayD, - ctx: Context, + dev: Device, dtype: DataType, ) -> Result { let shape: Vec = input_nd.shape().iter().map(|&x| x as i64).collect(); - let mut nd = NDArray::empty(&shape, ctx, dtype); + let mut nd = NDArray::empty(&shape, dev, dtype); nd.fill_from_iter(input_nd.iter().copied()); Ok(nd) } - /// Allocates and creates an empty NDArray given the shape, context and dtype. - pub fn empty(shape: &[i64], ctx: Context, dtype: DataType) -> NDArray { + /// Allocates and creates an empty NDArray given the shape, device and dtype. 
+ pub fn empty(shape: &[i64], dev: Device, dtype: DataType) -> NDArray { let mut handle = ptr::null_mut() as ffi::TVMArrayHandle; let dtype: tvm_sys::ffi::DLDataType = dtype.into(); check_call!(ffi::TVMArrayAlloc( @@ -361,8 +361,8 @@ impl NDArray { i32::from(dtype.code) as c_int, i32::from(dtype.bits) as c_int, i32::from(dtype.lanes) as c_int, - ctx.device_type as c_int, - ctx.device_id as c_int, + dev.device_type as c_int, + dev.device_id as c_int, &mut handle as *mut _, )); let ptr = NDArrayContainer::from_raw(handle) @@ -441,9 +441,9 @@ mod tests { #[test] fn basics() { let shape = &[1, 2, 3]; - let ctx = Context::cpu(0); + let dev = Device::cpu(0); println!("before empty"); - let ndarray = NDArray::empty(shape, ctx, DataType::from_str("int32").unwrap()); + let ndarray = NDArray::empty(shape, dev, DataType::from_str("int32").unwrap()); println!("after empty"); assert_eq!(ndarray.shape(), shape); assert_eq!(ndarray.len(), shape.iter().product::() as usize); @@ -456,8 +456,8 @@ mod tests { fn copy() { let shape = &[4]; let data = vec![1i32, 2, 3, 4]; - let ctx = Context::cpu(0); - let mut ndarray = NDArray::empty(shape, ctx, DataType::int(32, 1)).zeroed(); + let dev = Device::cpu(0); + let mut ndarray = NDArray::empty(shape, dev, DataType::int(32, 1)).zeroed(); assert_eq!(ndarray.to_vec::().unwrap(), vec![0, 0, 0, 0]); ndarray.copy_from_buffer(&data); assert_eq!(ndarray.shape(), shape); @@ -466,11 +466,7 @@ mod tests { assert!(ndarray.is_contiguous()); assert_eq!(ndarray.byte_offset(), 0); let shape = vec![4]; - let e = NDArray::empty( - &shape, - Context::cpu(0), - DataType::from_str("int32").unwrap(), - ); + let e = NDArray::empty(&shape, Device::cpu(0), DataType::from_str("int32").unwrap()); let nd = ndarray.copy_to_ndarray(e); assert!(nd.is_ok()); assert_eq!(nd.unwrap().to_vec::().unwrap(), data); @@ -482,10 +478,10 @@ mod tests { fn copy_wrong_dtype() { let shape = vec![4]; let mut data = vec![1f32, 2., 3., 4.]; - let ctx = Context::cpu(0); - let mut nd_float = NDArray::empty(&shape, ctx, DataType::from_str("float32").unwrap()); + let dev = Device::cpu(0); + let mut nd_float = NDArray::empty(&shape, dev, DataType::from_str("float32").unwrap()); nd_float.copy_from_buffer(&mut data); - let empty_int = NDArray::empty(&shape, ctx, DataType::from_str("int32").unwrap()); + let empty_int = NDArray::empty(&shape, dev, DataType::from_str("int32").unwrap()); nd_float.copy_to_ndarray(empty_int).unwrap(); } @@ -495,7 +491,7 @@ mod tests { .unwrap() .into_dyn(); let nd = - NDArray::from_rust_ndarray(&a, Context::cpu(0), DataType::from_str("float32").unwrap()) + NDArray::from_rust_ndarray(&a, Device::cpu(0), DataType::from_str("float32").unwrap()) .unwrap(); assert_eq!(nd.shape(), &[2, 2]); let rnd: ArrayD = ArrayD::try_from(&nd).unwrap(); diff --git a/rust/tvm-sys/src/array.rs b/rust/tvm-sys/src/array.rs index 5d09d8670eda..92208303e89c 100644 --- a/rust/tvm-sys/src/array.rs +++ b/rust/tvm-sys/src/array.rs @@ -23,7 +23,7 @@ use std::{ }; use crate::ffi::{ - DLContext, DLDataType, DLDataTypeCode_kDLFloat, DLDataTypeCode_kDLInt, DLDataTypeCode_kDLUInt, + DLDataType, DLDataTypeCode_kDLFloat, DLDataTypeCode_kDLInt, DLDataTypeCode_kDLUInt, DLDevice, DLDeviceType_kDLCPU, DLTensor, }; @@ -35,7 +35,7 @@ macro_rules! 
impl_dltensor_from_ndarray { fn from(arr: &'a mut ndarray::Array<$type, D>) -> Self { DLTensor { data: arr.as_mut_ptr() as *mut c_void, - ctx: DLContext { + device: DLDevice { device_type: DLDeviceType_kDLCPU, device_id: 0, }, diff --git a/rust/tvm-sys/src/context.rs b/rust/tvm-sys/src/device.rs similarity index 71% rename from rust/tvm-sys/src/context.rs rename to rust/tvm-sys/src/device.rs index a5165fccf0aa..910cc5973408 100644 --- a/rust/tvm-sys/src/context.rs +++ b/rust/tvm-sys/src/device.rs @@ -17,25 +17,25 @@ * under the License. */ -//! Provides [`Context`] and related device queries. +//! Provides [`Device`] and related device queries. //! -//! Create a new context for device type and device id. +//! Create a new device for device type and device id. //! //! # Example //! //! ``` -//! # use tvm_sys::{DeviceType, Context}; +//! # use tvm_sys::{DeviceType, Device}; //! let cpu = DeviceType::from("cpu"); -//! let ctx = Context::new(cpu , 0); -//! let cpu0 = Context::cpu(0); -//! assert_eq!(ctx, cpu0); +//! let dev = Device::new(cpu , 0); +//! let cpu0 = Device::cpu(0); +//! assert_eq!(dev, cpu0); //! ``` //! //! Or from a supported device name. //! //! ``` -//! use tvm_sys::Context; -//! let cpu0 = Context::from("cpu"); +//! use tvm_sys::Device; +//! let cpu0 = Device::from("cpu"); //! println!("{}", cpu0); //! ``` @@ -141,30 +141,30 @@ impl<'a> From<&DeviceType> for ArgValue<'a> { } #[derive(Debug, Clone, Copy, Hash, PartialEq, Eq)] -pub struct Context { +pub struct Device { pub device_type: DeviceType, pub device_id: usize, } -impl Context { - pub fn new(device_type: DeviceType, device_id: usize) -> Context { - Context { +impl Device { + pub fn new(device_type: DeviceType, device_id: usize) -> Device { + Device { device_type, device_id, } } } -impl<'a> From<&'a Context> for DLContext { - fn from(ctx: &'a Context) -> Self { +impl<'a> From<&'a Device> for DLDevice { + fn from(dev: &'a Device) -> Self { Self { - device_type: ctx.device_type.into(), - device_id: ctx.device_id as i32, + device_type: dev.device_type.into(), + device_id: dev.device_id as i32, } } } -impl Default for Context { +impl Default for Device { fn default() -> Self { Self { device_type: DLDeviceType_kDLCPU.into(), @@ -177,10 +177,10 @@ impl Default for Context { #[error("unsupported device: {0}")] pub struct UnsupportedDeviceError(String); -macro_rules! impl_tvm_context { +macro_rules! impl_tvm_device { ( $( $dev_type:ident : [ $( $dev_name:ident ),+ ] ),+ ) => { - /// Creates a Context from a string (e.g., "cpu", "gpu", "ext_dev") - impl FromStr for Context { + /// Creates a Device from a string (e.g., "cpu", "gpu", "ext_dev") + impl FromStr for Device { type Err = UnsupportedDeviceError; fn from_str(type_str: &str) -> Result { Ok(Self { @@ -193,7 +193,7 @@ macro_rules! impl_tvm_context { } } - impl Context { + impl Device { $( $( pub fn $dev_name(device_id: usize) -> Self { @@ -208,7 +208,7 @@ macro_rules! 
impl_tvm_context { }; } -impl_tvm_context!( +impl_tvm_device!( DLDeviceType_kDLCPU: [cpu, llvm, stackvm], DLDeviceType_kDLGPU: [gpu, cuda, nvptx], DLDeviceType_kDLOpenCL: [cl], @@ -218,59 +218,59 @@ impl_tvm_context!( DLDeviceType_kDLExtDev: [ext_dev] ); -impl<'a> From<&'a str> for Context { +impl<'a> From<&'a str> for Device { fn from(target: &str) -> Self { - Context::new(DeviceType::from(target), 0) + Device::new(DeviceType::from(target), 0) } } -impl From for Context { - fn from(ctx: ffi::DLContext) -> Self { - Context { - device_type: DeviceType::from(ctx.device_type), - device_id: ctx.device_id as usize, +impl From for Device { + fn from(dev: ffi::DLDevice) -> Self { + Device { + device_type: DeviceType::from(dev.device_type), + device_id: dev.device_id as usize, } } } -impl From for ffi::DLContext { - fn from(ctx: Context) -> Self { - ffi::DLContext { - device_type: ctx.device_type.into(), - device_id: ctx.device_id as i32, +impl From for ffi::DLDevice { + fn from(dev: Device) -> Self { + ffi::DLDevice { + device_type: dev.device_type.into(), + device_id: dev.device_id as i32, } } } -impl Display for Context { +impl Display for Device { fn fmt(&self, f: &mut Formatter) -> fmt::Result { write!(f, "{}({})", self.device_type, self.device_id) } } -impl<'a> From<&'a Context> for ArgValue<'a> { - fn from(ctx: &'a Context) -> Self { - DLContext::from(ctx).into() +impl<'a> From<&'a Device> for ArgValue<'a> { + fn from(dev: &'a Device) -> Self { + DLDevice::from(dev).into() } } -impl<'a> From for ArgValue<'a> { - fn from(ctx: Context) -> Self { - DLContext::from(ctx).into() +impl<'a> From for ArgValue<'a> { + fn from(dev: Device) -> Self { + DLDevice::from(dev).into() } } -impl From for RetValue { - fn from(ret_value: Context) -> RetValue { - RetValue::Context(ret_value.into()) +impl From for RetValue { + fn from(ret_value: Device) -> RetValue { + RetValue::Device(ret_value.into()) } } -impl TryFrom for Context { +impl TryFrom for Device { type Error = anyhow::Error; - fn try_from(ret_value: RetValue) -> anyhow::Result { + fn try_from(ret_value: RetValue) -> anyhow::Result { match ret_value { - RetValue::Context(dt) => Ok(dt.into()), + RetValue::Device(dt) => Ok(dt.into()), // TODO(@jroesch): improve _ => Err(anyhow::anyhow!("unable to convert datatype from ...")), } @@ -282,15 +282,15 @@ mod tests { use super::*; #[test] - fn context() { - let ctx = Context::cpu(0); - println!("ctx: {}", ctx); - let default_ctx = Context::new(DeviceType::CPU, 0); - assert_eq!(ctx.clone(), default_ctx); - assert_ne!(ctx, Context::gpu(0)); + fn device() { + let dev = Device::cpu(0); + println!("device: {}", dev); + let default_dev = Device::new(DeviceType::CPU, 0); + assert_eq!(dev.clone(), default_dev); + assert_ne!(dev, Device::gpu(0)); - let str_ctx = Context::new(DeviceType::GPU, 0); - assert_eq!(str_ctx.clone(), str_ctx); - assert_ne!(str_ctx, Context::new(DeviceType::CPU, 0)); + let str_dev = Device::new(DeviceType::GPU, 0); + assert_eq!(str_dev.clone(), str_dev); + assert_ne!(str_dev, Device::new(DeviceType::CPU, 0)); } } diff --git a/rust/tvm-sys/src/lib.rs b/rust/tvm-sys/src/lib.rs index 231569ba682e..8ed6f37f5f48 100644 --- a/rust/tvm-sys/src/lib.rs +++ b/rust/tvm-sys/src/lib.rs @@ -45,16 +45,16 @@ pub mod ffi { pub mod array; pub mod byte_array; -pub mod context; pub mod datatype; +pub mod device; pub mod errors; #[macro_use] pub mod packed_func; pub mod value; pub use byte_array::ByteArray; -pub use context::{Context, DeviceType}; pub use datatype::DataType; +pub use device::{Device, DeviceType}; 
pub use errors::*; pub use packed_func::{ArgValue, RetValue}; diff --git a/rust/tvm-sys/src/packed_func.rs b/rust/tvm-sys/src/packed_func.rs index 7b8d5296d641..6f43b786780a 100644 --- a/rust/tvm-sys/src/packed_func.rs +++ b/rust/tvm-sys/src/packed_func.rs @@ -76,7 +76,7 @@ macro_rules! TVMPODValue { Null, DataType(DLDataType), String(*mut c_char), - Context(TVMContext), + Device(DLDevice), Handle(*mut c_void), ArrayHandle(TVMArrayHandle), ObjectHandle(*mut c_void), @@ -97,7 +97,7 @@ macro_rules! TVMPODValue { DLDataTypeCode_kDLFloat => Float($value.v_float64), TVMArgTypeCode_kTVMNullptr => Null, TVMArgTypeCode_kTVMDataType => DataType($value.v_type), - TVMArgTypeCode_kTVMContext => Context($value.v_ctx), + TVMArgTypeCode_kDLDevice => Device($value.v_device), TVMArgTypeCode_kTVMOpaqueHandle => Handle($value.v_handle), TVMArgTypeCode_kTVMDLTensorHandle => ArrayHandle($value.v_handle as TVMArrayHandle), TVMArgTypeCode_kTVMObjectHandle => ObjectHandle($value.v_handle), @@ -119,7 +119,7 @@ macro_rules! TVMPODValue { Float(val) => (TVMValue { v_float64: *val }, DLDataTypeCode_kDLFloat), Null => (TVMValue{ v_int64: 0 },TVMArgTypeCode_kTVMNullptr), DataType(val) => (TVMValue { v_type: *val }, TVMArgTypeCode_kTVMDataType), - Context(val) => (TVMValue { v_ctx: val.clone() }, TVMArgTypeCode_kTVMContext), + Device(val) => (TVMValue { v_device: val.clone() }, TVMArgTypeCode_kDLDevice), String(val) => { ( TVMValue { v_handle: *val as *mut c_void }, @@ -264,7 +264,7 @@ impl_pod_value!(Int, i64, [i8, i16, i32, i64, isize]); impl_pod_value!(UInt, i64, [u8, u16, u32, u64, usize]); impl_pod_value!(Float, f64, [f32, f64]); impl_pod_value!(DataType, DLDataType, [DLDataType]); -impl_pod_value!(Context, TVMContext, [TVMContext]); +impl_pod_value!(Device, DLDevice, [DLDevice]); impl<'a> From<&'a str> for ArgValue<'a> { fn from(s: &'a str) -> Self { diff --git a/rust/tvm-sys/src/value.rs b/rust/tvm-sys/src/value.rs index a9ad5f523fde..f939d5177806 100644 --- a/rust/tvm-sys/src/value.rs +++ b/rust/tvm-sys/src/value.rs @@ -47,16 +47,16 @@ macro_rules! impl_pod_tvm_value { impl_pod_tvm_value!(v_int64, i64, i8, u8, i16, u16, i32, u32, i64, u64, isize, usize); impl_pod_tvm_value!(v_float64, f64, f32, f64); impl_pod_tvm_value!(v_type, DLDataType); -impl_pod_tvm_value!(v_ctx, TVMContext); +impl_pod_tvm_value!(v_device, DLDevice); #[derive(Debug, Error)] #[error("unsupported device: {0}")] pub struct UnsupportedDeviceError(String); -macro_rules! impl_tvm_context { +macro_rules! impl_tvm_device { ( $( $dev_type:ident : [ $( $dev_name:ident ),+ ] ),+ ) => { - /// Creates a TVMContext from a string (e.g., "cpu", "gpu", "ext_dev") - impl FromStr for TVMContext { + /// Creates a DLDevice from a string (e.g., "cpu", "gpu", "ext_dev") + impl FromStr for DLDevice { type Err = UnsupportedDeviceError; fn from_str(type_str: &str) -> Result { Ok(Self { @@ -69,7 +69,7 @@ macro_rules! impl_tvm_context { } } - impl TVMContext { + impl DLDevice { $( $( pub fn $dev_name(device_id: usize) -> Self { @@ -84,7 +84,7 @@ macro_rules! impl_tvm_context { }; } -impl_tvm_context!( +impl_tvm_device!( DLDeviceType_kDLCPU: [cpu, llvm, stackvm], DLDeviceType_kDLGPU: [gpu, cuda, nvptx], DLDeviceType_kDLOpenCL: [cl], diff --git a/rust/tvm/README.md b/rust/tvm/README.md index 75fabe7d9a1b..b518f93195b7 100644 --- a/rust/tvm/README.md +++ b/rust/tvm/README.md @@ -37,7 +37,7 @@ The Rust bindings are composed of a few crates: - The [tvm_rt](https://tvm.apache.org/docs/api/rust/tvm_rt/index.html) crate which exposes Rust bindings to the TVM runtime APIs. 
- The [tvm_sys] crate which provides raw bindings and linkage to the TVM C++ library. -- The [tvm_graph_rt] crate which implements a version of the TVM graph runtime in Rust vs. C++. +- The [tvm_graph_rt] crate which implements a version of the TVM graph executor in Rust vs. C++. These crates have been recently refactored and reflect a much different philosophy than previous bindings, as well as much increased support for more of the TVM API including diff --git a/rust/tvm/examples/resnet/src/build_resnet.py b/rust/tvm/examples/resnet/src/build_resnet.py index fdacb5bb1fca..13c499b54deb 100644 --- a/rust/tvm/examples/resnet/src/build_resnet.py +++ b/rust/tvm/examples/resnet/src/build_resnet.py @@ -29,7 +29,7 @@ from tvm import te from tvm import relay, runtime from tvm.relay import testing -from tvm.contrib import graph_runtime, cc +from tvm.contrib import graph_executor, cc from PIL import Image from tvm.contrib.download import download_testdata from mxnet.gluon.model_zoo.vision import get_model @@ -49,7 +49,7 @@ default=3, help="level of optimization. 0 is unoptimized and 3 is the highest level", ) -aa("--target", type=str, default="llvm", help="target context for compilation") +aa("--target", type=str, default="llvm", help="target for compilation") aa("--image-shape", type=str, default="3,224,224", help="input image dimensions") aa("--image-name", type=str, default="cat.png", help="name of input image to download") args = parser.parse_args() @@ -140,8 +140,8 @@ def test_build(build_dir): lib = tvm.runtime.load_module(osp.join(build_dir, "deploy_lib.so")) params = bytearray(open(osp.join(build_dir, "deploy_param.params"), "rb").read()) input_data = get_cat_image() - ctx = tvm.cpu() - module = graph_runtime.create(graph, lib, ctx) + dev = tvm.cpu() + module = graph_executor.create(graph, lib, dev) module.load_params(params) module.run(data=input_data) out = module.get_output(0).asnumpy() @@ -151,7 +151,7 @@ def test_build(build_dir): if __name__ == "__main__": - logger.info("Compiling the model to graph runtime.") + logger.info("Compiling the model to graph executor.") build(build_dir) logger.info("Testing the model's predication on test data.") test_build(build_dir) diff --git a/rust/tvm/examples/resnet/src/main.rs b/rust/tvm/examples/resnet/src/main.rs index f24c358ab52a..7f5fcd458c26 100644 --- a/rust/tvm/examples/resnet/src/main.rs +++ b/rust/tvm/examples/resnet/src/main.rs @@ -31,7 +31,7 @@ use tvm::runtime::graph_rt::GraphRt; use tvm::*; fn main() -> anyhow::Result<()> { - let ctx = Context::cpu(0); + let dev = Device::cpu(0); println!("{}", concat!(env!("CARGO_MANIFEST_DIR"), "/cat.png")); let img = image::open(concat!(env!("CARGO_MANIFEST_DIR"), "/cat.png")) @@ -61,7 +61,7 @@ fn main() -> anyhow::Result<()> { // make arr shape as [1, 3, 224, 224] acceptable to resnet let arr = arr.insert_axis(Axis(0)); // create input tensor from rust's ndarray - let input = NDArray::from_rust_ndarray(&arr, Context::cpu(0), DataType::float(32, 1))?; + let input = NDArray::from_rust_ndarray(&arr, Device::cpu(0), DataType::float(32, 1))?; println!( "input shape is {:?}, len: {}, size: {}", input.shape(), @@ -78,7 +78,7 @@ fn main() -> anyhow::Result<()> { "/deploy_lib.so" )))?; - let mut graph_rt = GraphRt::create_from_parts(&graph, lib, ctx)?; + let mut graph_rt = GraphRt::create_from_parts(&graph, lib, dev)?; // parse parameters and convert to TVMByteArray let params: Vec = fs::read(concat!(env!("CARGO_MANIFEST_DIR"), "/deploy_param.params"))?; @@ -91,7 +91,7 @@ fn main() -> anyhow::Result<()> { 
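The `graph_runtime` to `graph_executor` rename applies to the whole Python flow used by `build_resnet.py`. Condensed from the hunk above into one load-and-run sketch; the file names and the `data` input name follow the example, not the API:

    import numpy as np
    import tvm
    from tvm.contrib import graph_executor

    lib = tvm.runtime.load_module("deploy_lib.so")
    graph = open("deploy_graph.json").read()
    params = bytearray(open("deploy_param.params", "rb").read())

    dev = tvm.cpu()
    module = graph_executor.create(graph, lib, dev)
    module.load_params(params)
    module.run(data=np.zeros((1, 3, 224, 224), dtype="float32"))
    out = module.get_output(0).asnumpy()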
// prepare to get the output let output_shape = &[1, 1000]; - let output = NDArray::empty(output_shape, Context::cpu(0), DataType::float(32, 1)); + let output = NDArray::empty(output_shape, Device::cpu(0), DataType::float(32, 1)); graph_rt.get_output_into(0, output.clone())?; // flatten the output as Vec diff --git a/rust/tvm/src/ir/diagnostics/mod.rs b/rust/tvm/src/ir/diagnostics/mod.rs index 182ffd4d9081..91e221131216 100644 --- a/rust/tvm/src/ir/diagnostics/mod.rs +++ b/rust/tvm/src/ir/diagnostics/mod.rs @@ -51,7 +51,7 @@ external! { fn diagnostic_context_render(ctx: DiagnosticContext) -> (); #[name("diagnostics.DiagnosticRendererRender")] - fn diagnositc_renderer_render(renderer: DiagnosticRenderer,ctx: DiagnosticContext) -> (); + fn diagnositc_renderer_render(renderer: DiagnosticRenderer, ctx: DiagnosticContext) -> (); #[name("diagnostics.ClearRenderer")] fn clear_renderer() -> (); diff --git a/rust/tvm/src/lib.rs b/rust/tvm/src/lib.rs index caae07775d21..81abe338bd1b 100644 --- a/rust/tvm/src/lib.rs +++ b/rust/tvm/src/lib.rs @@ -24,7 +24,7 @@ //! One particular use case is that given optimized deep learning model artifacts, //! (compiled with TVM) which include a shared library //! `lib.so`, `graph.json` and a byte-array `param.params`, one can load them -//! in Rust idiomatically to create a TVM Graph Runtime and +//! in Rust idiomatically to create a TVM Graph Executor and //! run the model for some inputs and get the //! desired predictions *all in Rust*. //! @@ -32,9 +32,9 @@ pub use crate::{errors::*, function::Function, module::Module, ndarray::NDArray}; -pub use tvm_rt::{Context, DataType, DeviceType}; +pub use tvm_rt::{DataType, Device, DeviceType}; -pub use tvm_rt::context; +pub use tvm_rt::device; pub use tvm_rt::errors; pub use tvm_rt::function; pub use tvm_rt::module; diff --git a/rust/tvm/src/runtime/graph_rt.rs b/rust/tvm/src/runtime/graph_rt.rs index fcc41aca560f..421a00386cf5 100644 --- a/rust/tvm/src/runtime/graph_rt.rs +++ b/rust/tvm/src/runtime/graph_rt.rs @@ -20,44 +20,44 @@ use std::convert::TryInto; use crate::runtime::Function; -use crate::{runtime::function::Result, runtime::ByteArray, Context, Module, NDArray}; +use crate::{runtime::function::Result, runtime::ByteArray, Device, Module, NDArray}; -/// An instance of the C++ graph runtime. +/// An instance of the C++ graph executor. /// /// An efficient and light weight runtime for static deep learning models. pub struct GraphRt { - /// The backing graph runtime module which exposes a set of packed functions + /// The backing graph executor module which exposes a set of packed functions /// which can be invoked by a client. /// - /// In the graph runtime module, it exposes create, load_params, set_input, get_output, and run. + /// In the graph executor module, it exposes create, load_params, set_input, get_output, and run. module: Module, } impl GraphRt { - /// Create a graph runtime directly from a runtime module. - pub fn from_module(module: Module, ctx: Context) -> Result { - let default: Box Result> = + /// Create a graph executor directly from a runtime module. + pub fn from_module(module: Module, dev: Device) -> Result { + let default: Box Result> = module.get_function("default", false)?.into(); Ok(Self { - module: default(ctx)?, + module: default(dev)?, }) } - /// Create a graph runtime from the deprecated graph, lib, ctx triple. 
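`GraphRt::from_module` above relies on the factory convention: a compiled library exposes a packed function named `default` that takes a device and returns a ready executor module. A sketch of the same pattern from Python, assuming the library was built with the graph-executor factory (the path and wrapper names are illustrative):

    import tvm
    from tvm.contrib import graph_executor

    lib = tvm.runtime.load_module("deploy_lib.so")
    dev = tvm.cpu(0)
    # lib["default"] plays the role of the boxed Fn(Device) -> Result<Module>
    # that from_module extracts.
    module = graph_executor.GraphModule(lib["default"](dev))
    module.run()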
- pub fn create_from_parts(graph: &str, lib: Module, ctx: Context) -> Result { - let runtime_create_fn = Function::get("tvm.graph_runtime.create").unwrap(); + /// Create a graph executor from the deprecated graph, lib, dev triple. + pub fn create_from_parts(graph: &str, lib: Module, dev: Device) -> Result { + let runtime_create_fn = Function::get("tvm.graph_executor.create").unwrap(); let runtime_create_fn_ret = runtime_create_fn.invoke(vec![ graph.into(), lib.into(), - (&ctx.device_type).into(), + (&dev.device_type).into(), // NOTE you must pass the device id in as i32 because that's what TVM expects - (ctx.device_id as i32).into(), + (dev.device_id as i32).into(), ]); - let graph_runtime_module: Module = runtime_create_fn_ret?.try_into()?; + let graph_executor_module: Module = runtime_create_fn_ret?.try_into()?; Ok(Self { - module: graph_runtime_module, + module: graph_executor_module, }) } @@ -92,13 +92,13 @@ impl GraphRt { Ok(()) } - /// Extract the ith output from the graph runtime and returns it. + /// Extract the ith output from the graph executor and returns it. pub fn get_output(&mut self, i: i64) -> Result { let get_output_fn = self.module.get_function("get_output", false)?; get_output_fn.invoke(vec![i.into()])?.try_into() } - /// Extract the ith output from the graph runtime and write the results into output. + /// Extract the ith output from the graph executor and write the results into output. pub fn get_output_into(&mut self, i: i64, output: NDArray) -> Result<()> { let get_output_fn = self.module.get_function("get_output", false)?; get_output_fn.invoke(vec![i.into(), output.into()])?; diff --git a/rust/tvm/tests/basics/src/main.rs b/rust/tvm/tests/basics/src/main.rs index 450ab48dc1b2..d1cfbcf78dc7 100644 --- a/rust/tvm/tests/basics/src/main.rs +++ b/rust/tvm/tests/basics/src/main.rs @@ -25,18 +25,18 @@ fn main() { let shape = &mut [2]; let mut data = vec![3f32, 4.0]; - let (ctx, ctx_name) = if cfg!(feature = "cpu") { - (Context::cpu(0), "cpu") + let (dev, dev_name) = if cfg!(feature = "cpu") { + (Device::cpu(0), "cpu") } else { - (Context::gpu(0), "gpu") + (Device::gpu(0), "gpu") }; let dtype = DataType::from_str("float32").unwrap(); - let mut arr = NDArray::empty(shape, ctx, dtype); + let mut arr = NDArray::empty(shape, dev, dtype); arr.copy_from_buffer(data.as_mut_slice()); - let ret = NDArray::empty(shape, ctx, dtype); + let ret = NDArray::empty(shape, dev, dtype); let mut fadd = Module::load(&concat!(env!("OUT_DIR"), "/test_add.so")).unwrap(); - if !fadd.enabled(ctx_name) { + if !fadd.enabled(dev_name) { return; } diff --git a/rust/tvm/tests/callback/src/bin/array.rs b/rust/tvm/tests/callback/src/bin/array.rs index 2f1848ec6471..81ee426d3967 100644 --- a/rust/tvm/tests/callback/src/bin/array.rs +++ b/rust/tvm/tests/callback/src/bin/array.rs @@ -47,7 +47,7 @@ fn main() { let shape = &[2]; let data = vec![3.0, 4.0]; - let mut arr = NDArray::empty(shape, Context::cpu(0), DataType::float(32, 1)); + let mut arr = NDArray::empty(shape, Device::cpu(0), DataType::float(32, 1)); arr.copy_from_buffer(data.as_slice()); register_untyped(sum, "sum", true).unwrap(); diff --git a/src/arith/analyzer.cc b/src/arith/analyzer.cc index 9737b53703fd..08e32f576299 100644 --- a/src/arith/analyzer.cc +++ b/src/arith/analyzer.cc @@ -100,6 +100,13 @@ bool Analyzer::CanProveLess(const PrimExpr& expr, int64_t upper_bound) { return false; } +bool Analyzer::CanProveEqual(const PrimExpr& lhs, const PrimExpr& rhs) { + const auto* clhs = lhs.as(); + const auto* crhs = rhs.as(); + if (clhs && crhs) 
return clhs->value == crhs->value; + return CanProve(lhs - rhs == 0); +} + bool Analyzer::CanProve(const PrimExpr& expr) { if (const auto* ptr = expr.as()) { return ptr->value != 0; diff --git a/src/arith/iter_affine_map.cc b/src/arith/iter_affine_map.cc index 7efdd03fa11e..a49478a43635 100644 --- a/src/arith/iter_affine_map.cc +++ b/src/arith/iter_affine_map.cc @@ -29,6 +29,7 @@ #include "../support/utils.h" #include "const_fold.h" +#include "pattern_match.h" namespace tvm { namespace arith { @@ -123,11 +124,9 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) }); /*! - * \brief Collector that collects - * the outgoing split reference of each IterMark. + * \brief Collector that collects the outgoing split reference of each IterMark. * - * These out-going splits can then be used to - * check if the iterators are independent. + * These out-going splits can then be used to check if the iterators are independent. */ class IterMarkSplitCollector { public: @@ -161,8 +160,7 @@ class IterMarkSplitCollector { } }; -// Rewriter to rewrite PrimExpr to IterMapExpr -// when possible +/*! \brief Rewriter to rewrite PrimExpr to IterMapExpr when possible */ class IterMapRewriter : public ExprMutator { public: using Parent = ExprMutator; @@ -170,16 +168,19 @@ class IterMapRewriter : public ExprMutator { explicit IterMapRewriter(Analyzer* analyzer, const Map& input_iters) : analyzer_(analyzer) { for (auto kv : input_iters) { - const auto& vrng = kv.second; - if (is_zero(vrng->min)) { - IterMark mark(kv.first, vrng->extent); - var_map_[kv.first] = IterSplitExpr(mark); + const Var& var = kv.first; + const Range& vrng = kv.second; + if (is_one(vrng->extent)) { + var_map_[var] = IterSumExpr({}, vrng->min); + } else if (is_zero(vrng->min)) { + IterMark mark(var, vrng->extent); + var_map_[var] = IterSplitExpr(mark); input_marks_.push_back(mark); } else { - IterMark mark(kv.first - vrng->min, vrng->extent); - auto sum_expr = ToIterSumExpr(IterSplitExpr(mark)); + IterMark mark(var - vrng->min, vrng->extent); + IterSumExpr sum_expr = ToIterSumExpr(IterSplitExpr(mark)); sum_expr.CopyOnWrite()->base = vrng->min; - var_map_[kv.first] = sum_expr; + var_map_[var] = sum_expr; input_marks_.push_back(mark); } } @@ -187,33 +188,88 @@ class IterMapRewriter : public ExprMutator { size_t unresolved_count() const { return unresolved_count_; } - IterSumExpr Rewrite(PrimExpr expr) { + IterSumExpr Rewrite(const PrimExpr& expr) { return NormalizeToIterWithOffset(ToIterSumExpr(DirectMutate(expr))); } - bool CheckBijective(const Array& indices) { - // This function checks two conditions: - // - C0: Each iter mark should be fully covered by non-overlapping splits. - // - C1: All of the input iterators are used. - // - // Example: given x in [0, 8) y in [0, 6) - // - indices = [x, x+1, y] won't pass because x and x+1 contribute - // two splits that overlaps with each other. - // - indices = [x / 4, x % 4, y] will pass because x / 4 and x % 4 - // contribute two non-overlapping splits that covers x. - // - indices = [x / 4, x % 4] won't pass because y is not used. - // + IterSumExpr RewriteIterConstraint(const PrimExpr& expr, + const PrimExpr& predicate_induced_extent) { + return NormalizeToIterOnBoundExpr(ToIterSumExpr(DirectMutate(expr)), predicate_induced_extent); + } + + /*! + * \brief If require_bijective is true, this function checks two conditions: + * - C0: Each iter mark should be fully covered by non-overlapping splits. + * - C1: All of the input iterators are used. 
+   *      Example: given x in [0, 8), y in [0, 6)
+   *      - bindings = [x, x + 1, y] won't pass because x and x+1 contribute
+   *        two splits that overlap with each other.
+   *      - bindings = [x / 4, x % 4, y] will pass because x / 4 and x % 4
+   *        contribute two non-overlapping splits that cover x.
+   *      - bindings = [x / 4, x % 4] won't pass because y is not used.
+   *
+   *    If require_bijective is false, this function checks one condition:
+   *    - C0: Each iter mark has a chance to be fully covered by non-overlapping splits.
+   *      Example: given x in [0, 8), y in [0, 6)
+   *      - bindings = [x / 4] will pass because x / 4 can be one split of x
+   *      - bindings = [x / 4, x % 4] will pass because x / 4 and x % 4
+   *        contribute two non-overlapping splits that cover x.
+   *      - bindings = [x / 3] will not pass because x / 3 cannot be one split of x
+   * \return whether the bindings are valid
+   */
+  bool CheckMapping(const Array<IterSumExpr>& bindings, bool require_bijective) {
     IterMarkSplitCollector collector;
     // We can check that for each iter mark:
-    // All the splits that refers to the itermark covers its extent.
+    // All the splits that refer to the iter_mark cover its extent.
     // The splits do not overlap with each other.
-    collector.Collect(indices);
+    collector.Collect(bindings);
     for (const IterMark& mark : collector.visited_) {
-      if (TryNormalizeSplits(mark, collector.mark2splits_[mark]).empty()) return false;
+      if (TryNormalizeSplits(mark, collector.mark2splits_[mark], require_bijective).empty())
+        return false;
     }
-    // all input marks must be visited
-    for (const auto& mark : input_marks_) {
-      if (collector.visited_.count(mark) == 0) return false;
+    if (require_bijective) {
+      // all input marks must be visited
+      for (const IterMark& mark : input_marks_) {
+        if (collector.visited_.count(mark) == 0) return false;
+      }
+    }
+    return true;
+  }
+
+  /*!
+   * \brief Check the validity of iterator constraints.
+   *        The flattened forms of two different iterator constraints
+   *        either 1) follow an inclusion relation or 2) have no intersection.
+   *
+   *        For example, x = i0*30 + i1*15 + i2*3 + i3,
+   *        1) [i0*2 + i1 < 3, i2*3 + i3 < 5] is valid, since {i0, i1} \intersect {i2, i3} = empty set.
+   *        2) [i0*2 + i1 < 3, i1*5 + i2 < 5] is not valid,
+   *           since {i0, i1} \intersect {i1, i2} = {i1}, i0 \in {i0, i1}, i0 \notin {i1, i2}
+   * \return whether the predicates are valid.
+   */
+  bool CheckConstraints() const {
+    // constrained_iters_flattened_ is ordered from shorter to longer,
+    // since we visit the predicates in order of size
+    for (size_t i = 0; i < constrained_iters_flattened_.size(); ++i) {
+      for (size_t j = i + 1; j < constrained_iters_flattened_.size(); ++j) {
+        // state: 0(start), -1(no intersection), 1(inclusion)
+        int state = 0;
+        for (const IterSplitExpr& arg1 : constrained_iters_flattened_[i]->args) {
+          bool found = false;
+          for (const IterSplitExpr& arg2 : constrained_iters_flattened_[j]->args) {
+            if (IterSplitEqual(arg1, arg2)) {
+              found = true;
+              break;
+            }
+          }
+          // Check that the relation is either inclusion or no intersection, but not both
+          if (state == 0) {
+            state = found ? 1 : -1;
+          } else if ((state == -1 && found) || (state == 1 && !found)) {
+            return false;
+          }
+        }
+      }
     }
     return true;
   }
@@ -243,25 +299,30 @@ class IterMapRewriter : public ExprMutator {
     size_t operator()(const IterSumExpr& value) const {
       // for now only hash on source index.
size_t hash = value->args.size(); - for (const auto& arg : value->args) { + for (const IterSplitExpr& arg : value->args) { hash = support::HashCombine(hash, std::hash()(arg->source.get())); } return hash; } }; + static bool IterSplitEqual(const IterSplitExpr& lhs, const IterSplitExpr& rhs, + bool check_scale = true) { + tir::ExprDeepEqual equal; + if (!lhs->source.same_as(rhs->source)) return false; + if (!equal(lhs->lower_factor, rhs->lower_factor)) return false; + if (check_scale && !equal(lhs->scale, rhs->scale)) return false; + if (!equal(lhs->extent, rhs->extent)) return false; + return true; + } + struct IterSumEqual { bool operator()(const IterSumExpr& lhs, const IterSumExpr& rhs) const { tir::ExprDeepEqual equal; if (lhs->args.size() != rhs->args.size()) return false; if (!equal(lhs->base, rhs->base)) return false; for (size_t i = 0; i < lhs->args.size(); ++i) { - auto lvalue = lhs->args[i]; - auto rvalue = rhs->args[i]; - if (!lvalue->source.same_as(rvalue->source)) return false; - if (!equal(lvalue->lower_factor, rvalue->lower_factor)) return false; - if (!equal(lvalue->scale, rvalue->scale)) return false; - if (!equal(lvalue->extent, rvalue->extent)) return false; + if (!IterSplitEqual(lhs->args[i], rhs->args[i])) return false; } return true; } @@ -275,19 +336,64 @@ class IterMapRewriter : public ExprMutator { std::unordered_map var_map_; // input iter marks std::vector input_marks_; - // The canonical map for sum - std::unordered_map sum_fuse_map_; + // The map for sum that maps flattened form to IterMark with normal form and extent + // Example: expr = i*9 + j*2 + k, i in [0, 4) j in [0, 5) k in [0, 2) + // predicate: j*2 + k < 9 + // Then, flattened form = IterSum(IterSplit(i, scale=9), + // IterSplit(j, scale=2), + // IterSplit(k, scale=1)) + // normal form = IterSum(IterSplit(i, scale=9), + // IterSplit(IterMark(IterSum(IterSplit(j, scale=2), + // IterSplit(k, scale=1)), + // extent=9) + // scale=1)) + std::unordered_map sum_fuse_map_; + // The map for sum that maps normal form to flattened form + std::unordered_map flattened_map_; + // The flattened forms of constrained iters + std::vector constrained_iters_flattened_; + + /*! + * \brief Look for a split in splits that is not used such that its lower_factor is smallest. + * Note that here we use division to compare lower_factor. + * \param splits the split array to search in. + * \param used the input used array. + * \param expected_lower_factor the skipped lower factor. + * \return the index of the expected split, split.size() if not found. + */ + size_t SearchSkipLowerFactor(const std::vector& splits, + const std::vector& used, + const PrimExpr& expected_lower_factor) { + size_t res = splits.size(); + for (size_t i = 0; i < splits.size(); ++i) { + if (used[i]) continue; + if (!used[i] && !CanProveDivisible(splits[i]->lower_factor, expected_lower_factor)) { + // all the remaining unused splits should have their lower factor divisible + return splits.size(); + } + if (res == splits.size() || + CanProveDivisible(splits[res]->lower_factor, splits[i]->lower_factor)) { + // note down the split with smaller lower factor + res = i; + } + } + return res; + } /*! - * \brief Verify that splits fully covers mark in a non-overlapping fashion. - * If verification passes, return splits from outermost to inner most order. - * If not, return an empty array + * \brief If bijective is required, verify that splits fully covers mark in a non-overlapping + * fashion, If not, verify that splits are valid and compatible for the mark. 
+ * If verification passes, return splits from outermost to innermost order. + * If not, return an empty array. * \param mark The iterator of interest. * \param splits The splits to be verified. + * \param require_bijective A boolean flag that indicates whether the bindings should be + * bijective. * \return The normalized splits. */ Array TryNormalizeSplits(const IterMark& mark, - const std::vector& splits) { + const std::vector& splits, + bool require_bijective) { std::vector used(splits.size(), false); std::vector iters; PrimExpr expected_lower_factor = make_const(mark->source->dtype, 1); @@ -296,31 +402,83 @@ class IterMapRewriter : public ExprMutator { size_t j = 0; for (; j < splits.size(); ++j) { if (used[j]) continue; - if (!used[j] && CanProveEqual(splits[j]->lower_factor, expected_lower_factor)) break; + if (!used[j] && analyzer_->CanProveEqual(splits[j]->lower_factor, expected_lower_factor)) + break; } if (j == splits.size()) { - return Array(); + // we do not allow incomplete split if the bindings should be bijective + if (require_bijective) return Array(); + // look for the next split skipping this lower factor + // For example, y \in [0, 24) has 3 splits [y / 6, (y / 2) % 6, y % 2] + // It is valid to only have [y / 6, y % 2] if bijective is not required + // We can skip (y / 2) % 6 + j = SearchSkipLowerFactor(splits, used, expected_lower_factor); + // split not found + if (j == splits.size()) return Array(); } used[j] = true; iters.push_back(splits[j]); - expected_lower_factor *= splits[j]->extent; + expected_lower_factor = splits[j]->lower_factor * splits[j]->extent; + } + // Case 1. bijective is required. + // We check the extent we calculate is consistent with the extent of the mark + // Case 2. bijective is not required. + // We check the extent we calculate is a factor of the extent of the mark + // For example, y \in [0, 24) [(y / 2) % 6, y % 2] is valid, but y \in [0, 25) is not. + if ((require_bijective && !analyzer_->CanProveEqual(expected_lower_factor, mark->extent)) || + (!require_bijective && !CanProveDivisible(mark->extent, expected_lower_factor))) { + return Array(); } - if (!CanProveEqual(expected_lower_factor, mark->extent)) return Array(); return Array(iters.rbegin(), iters.rend()); } + /*! + * \brief Normalize the left hand side of iter constraint(expr < predicate_induced_extent) + * \param expr The left hand side of iter constraint. + * \param predicate_induced_extent Extent from iter constraint. + * \return The Normalized expression. 
+ */ + IterSumExpr NormalizeToIterOnBoundExpr(IterSumExpr expr, + const PrimExpr& predicate_induced_extent) { + // We are normalizing the left hand side of iter constraint(iter < predicate_induced_extent) + Optional opt = TryFuseIters(expr); + // scale should be 1 + if (opt.defined() && is_one(opt.value()->scale)) { + IterSumExpr sum = Downcast(opt.value()->source->source); + // get the flattened form + auto it = flattened_map_.find(sum); + ICHECK(it != flattened_map_.end()); + IterSumExpr flattened_form = it->second; + // get the mark + auto it_mark = sum_fuse_map_.find(flattened_form); + ICHECK(it_mark != sum_fuse_map_.end()); + IterMark mark = it_mark->second; + mark.CopyOnWrite()->extent = min(predicate_induced_extent, mark->extent); + // update the bound of the lhs based on predicate_induced_extent + sum_fuse_map_[flattened_form] = mark; + // we need to note down the flattened form of constrained iterators + // to check the validity of constraints, see also CheckConstraints() + constrained_iters_flattened_.push_back(flattened_form); + expr.CopyOnWrite()->args = Array({opt.value()}); + return expr; + } + ++unresolved_count_; + return expr; + } + /*! * \brief Normalize expr to an iterator + offset. * \param expr The input expression. * \return The Normalized expression. */ IterSumExpr NormalizeToIterWithOffset(IterSumExpr expr) { + // We are normalizing a regular iter if (expr->args.size() <= 1) return expr; PrimExpr base = expr->base; expr.CopyOnWrite()->base = make_zero(expr->dtype); - auto opt = TryFuseIters(expr); + Optional opt = TryFuseIters(expr); expr.CopyOnWrite()->base = base; - if (opt) { + if (opt.defined()) { expr.CopyOnWrite()->args = Array({opt.value()}); return expr; } else { @@ -329,13 +487,6 @@ class IterMapRewriter : public ExprMutator { } } - bool CanProveEqual(PrimExpr lhs, PrimExpr rhs) { - const auto* clhs = lhs.as(); - const auto* crhs = rhs.as(); - if (clhs && crhs) return clhs->value == crhs->value; - return analyzer_->CanProve(lhs - rhs == 0); - } - /*! * \brief Create a IterSumExpr from expr. * \param expr The input expr. @@ -352,22 +503,24 @@ class IterMapRewriter : public ExprMutator { } } - // Try to normalize IterSum into a fused IterMark - // return a corresponding splitexpr if needed. - // IterSum = x1*c1 + x2*c2 + ... + xn*cn - // = (x1*s1 + x2*s2 + ... + xn)*cn - // = y*cn (IterMark y => x1*s1 + x2*s2 + ... + xn) - // = [IterSplit(IterMark(y), scale=cn)] - // return a corresponding IterSplitExpr if needed. + /*! + * \brief IterSum = x1*c1 + x2*c2 + ... + xn*cn + * = (x1*s1 + x2*s2 + ... + xn)*cn + * = y*cn (IterMark y => x1*s1 + x2*s2 + ... + xn) + * = [IterSplit(IterMark(y), scale=cn)] + * return a corresponding IterSplitExpr if needed. + * Try to normalize IterSum into a fused IterMark + * \param expr The input sum. + * \return The split with the fused IterMark if succeed. + */ Optional TryFuseIters(IterSumExpr expr) { if (!is_zero(expr->base)) return NullOpt; if (expr->args.size() == 1) return expr->args[0]; // select the iterators in order std::vector visited(expr->args.size(), false); - std::vector iters; - iters.reserve(expr->args.size()); - // canonicalize the expression - // find the base scale first + std::vector flattened_iters, grouped_iters; + // canonicalize the expression into two different forms: flattened form and structured form + // step0. 
check if find the base scale first Optional base_scale = NullOpt; size_t base_index = 0; for (size_t i = 0; i < expr->args.size(); ++i) { @@ -381,35 +534,87 @@ class IterMapRewriter : public ExprMutator { if (!base_scale) return NullOpt; // check if it can be remapped into a fused pattern. PrimExpr expected_scale = base_scale.value(); - for (size_t i = 0; i < expr->args.size(); ++i) { + for (size_t i = 0; i < expr->args.size();) { + // find j such that expr->args[j] has expected scale size_t j = i == 0 ? base_index : 0; for (; j < expr->args.size(); ++j) { - if (!visited[j] && CanProveEqual(expr->args[j]->scale, expected_scale)) break; + if (!visited[j] && analyzer_->CanProveEqual(expr->args[j]->scale, expected_scale)) break; } - if (j == expr->args.size()) { - return NullOpt; + if (j == expr->args.size()) return NullOpt; + // look for the longest constrained iter started from expr->args[j] + // Example: expr = i*9 + j*2 + k, i in [0, 4) j in [0, 5) k in [0, 2) + // predicate: j*2 + k < 9 + // We need to match the predicate in expr and adjust the expected scale, + // otherwise we expect the scale of i to be 2*5=10 + Optional constraint_to_match; + for (const IterSumExpr& iter : constrained_iters_flattened_) { + if (IterSplitEqual(expr->args[j], iter->args.back(), false)) { + // find a predicate started from expr->args[j] + if (!constraint_to_match || + constraint_to_match.value()->args.size() < iter->args.size()) { + constraint_to_match = iter; + } + } } - visited[j] = true; - auto arg = expr->args[j]; - arg.CopyOnWrite()->scale = div(expr->args[j]->scale, base_scale.value()); - iters.push_back(arg); - expected_scale *= expr->args[j]->extent; - } - // update the iterator to use the canonicalized form - expr.CopyOnWrite()->args = Array(iters.rbegin(), iters.rend()); - auto it = sum_fuse_map_.find(expr); - if (it != sum_fuse_map_.end()) return it->second; - auto mark = IterMark(expr, div(expected_scale, base_scale.value())); - IterSplitExpr split(mark, base_scale.value()); - sum_fuse_map_[expr] = split; - return split; + if (constraint_to_match) { + // match the predicate and mark the iterators in the constraint_to_match as visited + // Example: expr = i*9 + j*2 + k, i in [0, 4) j in [0, 5) k in [0, 2) + // predicate = j*2 + k < 9 + // then j*2 + k matches the lower two splits of expr + for (auto it = constraint_to_match.value()->args.rbegin(); + it != constraint_to_match.value()->args.rend(); ++it) { + size_t k = 0; + for (; k < expr->args.size(); ++k) { + if (!visited[k] && IterSplitEqual(expr->args[k], *it, false)) { + if (analyzer_->CanProveEqual((*it)->scale * expected_scale, expr->args[k]->scale)) + break; + } + } + if (k == expr->args.size()) return NullOpt; + visited[k] = true; + flattened_iters.push_back(expr->args[k]); + } + auto iter = sum_fuse_map_.find(constraint_to_match.value()); + ICHECK(iter != sum_fuse_map_.end()); + IterMark iter_matched = iter->second; + grouped_iters.emplace_back(iter_matched, expected_scale); + expected_scale *= iter_matched->extent; + // move forward + i += constraint_to_match.value()->args.size(); + } else { + // constraint_to_match not found, skip this iterator + visited[j] = true; + flattened_iters.push_back(expr->args[j]); + grouped_iters.push_back(expr->args[j]); + expected_scale *= expr->args[j]->extent; + ++i; + } + } + // Get the flattened form and structured form + // both forms have splits from outermost to innermost + IterSumExpr structured_form = expr, flattened_form = expr; + flattened_form.CopyOnWrite()->args = + 
        Array<IterSplitExpr>(flattened_iters.rbegin(), flattened_iters.rend());
+    structured_form.CopyOnWrite()->args =
+        Array<IterSplitExpr>(grouped_iters.rbegin(), grouped_iters.rend());
+    auto it = sum_fuse_map_.find(flattened_form);
+    if (it != sum_fuse_map_.end()) {
+      // old iter
+      return IterSplitExpr(it->second, base_scale.value());
+    } else {
+      // new iter, form a new mark
+      IterMark mark = IterMark(structured_form, div(expected_scale, base_scale.value()));
+      sum_fuse_map_[flattened_form] = mark;
+      flattened_map_[structured_form] = flattened_form;
+      return IterSplitExpr(mark, base_scale.value());
+    }
   }

   bool CanProveDivisible(const PrimExpr& lhs, const PrimExpr& rhs) {
     const auto* clhs = lhs.as<IntImmNode>();
     const auto* crhs = rhs.as<IntImmNode>();
     if (clhs && crhs) return clhs->value % crhs->value == 0;
-    return analyzer_->CanProve(floormod(lhs, rhs) == 0);
+    return analyzer_->CanProveEqual(lhs, rhs) || analyzer_->CanProve(floormod(lhs, rhs) == 0);
   }

   PrimExpr SplitFloorDivConst(IterSplitExpr lhs, PrimExpr rhs, const PrimExpr& orig);
@@ -459,27 +664,87 @@ class IterMapRewriter : public ExprMutator {
   }
 };

+/*! \brief An internal struct to represent range extent on iterators (iter < upper_bound). */
+struct IterConstraint {
+  // The expr of the iter
+  PrimExpr iter;
+  // The expr of the upper_bound
+  PrimExpr upper_bound;
+  // The size of the iter, which is the number of nodes
+  size_t expr_size = 0;
+
+  IterConstraint(PrimExpr iter, PrimExpr upper_bound, size_t size)
+      : iter(std::move(iter)), upper_bound(std::move(upper_bound)), expr_size(size) {}
+};
+
+/*!
+ * \brief Split the predicate into `(a < b) && (c < d) && ...`
+ * \param pred The predicate to be split.
+ * \return A list of constraints, each holding the lhs and rhs of one '<' sign;
+ *         empty if the split failed.
+ */
+std::vector<IterConstraint> MatchUpperBoundConstraints(PrimExpr pred) {
+  std::vector<IterConstraint> result;
+  arith::PVar<PrimExpr> lhs, rhs, rest;
+  for (;;) {
+    if ((rest && (lhs < rhs)).Match(pred)) {
+      result.emplace_back(lhs.Eval(), rhs.Eval(), 0);
+      pred = rest.Eval();
+    } else if ((lhs < rhs).Match(pred)) {
+      result.emplace_back(lhs.Eval(), rhs.Eval(), 0);
+      break;
+    } else {
+      return std::vector<IterConstraint>();
+    }
+  }
+  return result;
+}
+
 Array<IterSumExpr> DetectIterMap(const Array<PrimExpr>& indices, const Map<Var, Range>& input_iters,
+                                 const PrimExpr& predicate, bool require_bijective,
                                  arith::Analyzer* analyzer) {
   // Overall detection algorithm is divided into two steps:
   // - Step0: IterMapRewriter rewrites the expression to use IterMapExpr patterns.
   // - Step1: IterIndependenceChecker checks if the iterator are independent.
+
+  std::vector<IterConstraint> constraints = MatchUpperBoundConstraints(predicate);
+  if (!is_one(predicate) && constraints.empty()) return Array<IterSumExpr>();
+
+  // We have to make sure that when we visit an iterator, all the constraints related to its
+  // successors in the iter var graph have been visited, since the expression of this iterator
+  // will contain the expressions of its successors. Hence we sort the constraints by size.
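Putting the pieces together: `DetectIterMap` now takes the bindings, the input iterator ranges, an optional upper-bound predicate, and a bijectivity flag. A sketch of driving it from Python, assuming a `tvm.arith.detect_iter_map` wrapper that forwards its arguments to the `arith.DetectIterMap` global registered below (the wrapper name and argument order are assumptions):

    import tvm
    from tvm import tir

    x = tir.Var("x", "int32")
    y = tir.Var("y", "int32")
    dom = {x: tvm.ir.Range(0, 8), y: tvm.ir.Range(0, 6)}

    # Bijective case from CheckMapping's doc: x/4 and x%4 cover x without
    # overlap and y is used, so one IterSumExpr per binding comes back.
    res = tvm.arith.detect_iter_map(
        [x // 4, x % 4, y], dom, tir.IntImm("bool", 1), True)
    assert len(res) == 3

    # Predicate case from the comments above: under j*2 + k < 9 the two
    # lower splits of i*9 + j*2 + k fuse with the tightened extent.
    i, j, k = [tir.Var(n, "int32") for n in "ijk"]
    dom2 = {i: tvm.ir.Range(0, 4), j: tvm.ir.Range(0, 5), k: tvm.ir.Range(0, 2)}
    res = tvm.arith.detect_iter_map(
        [i * 9 + j * 2 + k], dom2, j * 2 + k < 9, False)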
+  for (IterConstraint& constraint : constraints) {
+    constraint.expr_size = CalculateExprComplexity(constraint.iter);
+  }
+
+  std::sort(
+      constraints.begin(), constraints.end(),
+      [](const IterConstraint& a, const IterConstraint& b) { return a.expr_size < b.expr_size; });
+
   IterMapRewriter rewriter(analyzer, input_iters);
+  // Step0.0: rewrite the constraints, from the smallest to the largest
+  for (const IterConstraint& constraint : constraints) {
+    PrimExpr res = rewriter.RewriteIterConstraint(constraint.iter, constraint.upper_bound);
+    if (rewriter.unresolved_count() != 0) return Array<IterSumExpr>();
+  }
+  if (!rewriter.CheckConstraints()) return Array<IterSumExpr>();
+  // Step0.1: rewrite indices
   Array<IterSumExpr> results;
-
   for (PrimExpr value : indices) {
     results.push_back(rewriter.Rewrite(value));
     if (rewriter.unresolved_count() != 0) return Array<IterSumExpr>();
   }
-  if (!rewriter.CheckBijective(results)) return Array<IterSumExpr>();
+  // Step1: IterIndependenceChecker checks if the iterators are independent.
+  if (!rewriter.CheckMapping(results, require_bijective)) return Array<IterSumExpr>();
   return results;
 }

 TVM_REGISTER_GLOBAL("arith.DetectIterMap")
-    .set_body_typed([](const Array<PrimExpr>& indices, const Map<Var, Range>& input_iters) {
+    .set_body_typed([](const Array<PrimExpr>& indices, const Map<Var, Range>& input_iters,
+                       const PrimExpr& input_pred, bool is_bijective) {
       arith::Analyzer ana;
-      return DetectIterMap(indices, input_iters, &ana);
+      return DetectIterMap(indices, input_iters, input_pred, is_bijective, &ana);
     });

 PrimExpr IterMapRewriter::VisitExpr_(const VarNode* op) {
@@ -675,7 +940,7 @@ PrimExpr IterMapRewriter::VisitExpr_(const FloorDivNode* op) {

   if (a->IsInstance<IterSumExprNode>()) {
     IterSumExpr ret = Downcast<IterSumExpr>(a);
-    if (auto opt = TryFuseIters(ret)) {
+    if (Optional<IterSplitExpr> opt = TryFuseIters(ret)) {
       return SplitFloorDivConst(opt.value(), b, GetRef<PrimExpr>(op));
     } else {
       ++unresolved_count_;
@@ -750,7 +1015,7 @@ PrimExpr IterMapRewriter::VisitExpr_(const FloorModNode* op) {

   if (a->IsInstance<IterSumExprNode>()) {
     IterSumExpr ret = Downcast<IterSumExpr>(a);
-    if (auto opt = TryFuseIters(ret)) {
+    if (Optional<IterSplitExpr> opt = TryFuseIters(ret)) {
       return SplitFloorModConst(opt.value(), b, GetRef<PrimExpr>(op));
     } else {
       ++unresolved_count_;
@@ -763,5 +1028,63 @@ PrimExpr IterMapRewriter::VisitExpr_(const FloorModNode* op) {
   }
 }

+/*!
+ * \brief Given an IterMapExpr, transform it to a normal PrimExpr.
*/ +class IterMapToExprNormalizer { + public: + explicit IterMapToExprNormalizer(Analyzer* analyzer) : analyzer_(analyzer) {} + + PrimExpr Convert(const IterMapExpr& expr) { + if (const auto* op = expr.as()) { + return ConvertIterSplitExpr(GetRef(op)); + } else if (const auto* op = expr.as()) { + return ConvertIterSumExpr(GetRef(op)); + } else { + ICHECK(expr.defined()); + LOG(FATAL) << "Unknown IterMapExpr type " << expr->GetTypeKey(); + return 0; + } + } + + PrimExpr ConvertIterSumExpr(const IterSumExpr& expr) { + PrimExpr res = 0; + for (const IterSplitExpr& arg : expr->args) { + res += ConvertIterSplitExpr(arg); + } + res += expr->base; + return res; + } + + PrimExpr ConvertIterSplitExpr(const IterSplitExpr& expr) { + PrimExpr source; + if (const auto* op = expr->source->source.as()) { + source = GetRef(op); + } else if (const auto* op = expr->source->source.as()) { + source = ConvertIterSumExpr(GetRef(op)); + } else { + LOG(FATAL) << "Unexpected source of IterSplitExpr"; + } + if (analyzer_->CanProve(expr->extent == expr->source->extent) && is_one(expr->lower_factor)) { + return source * expr->scale; + } else if (analyzer_->CanProve(expr->source->extent == expr->lower_factor * expr->extent)) { + return floordiv(source, expr->lower_factor) * expr->scale; + } else { + return floormod(floordiv(source, expr->lower_factor), expr->extent) * expr->scale; + } + } + + private: + Analyzer* analyzer_; +}; + +PrimExpr NormalizeIterMapToExpr(const IterMapExpr& expr) { + arith::Analyzer analyzer; + IterMapToExprNormalizer normalizer(&analyzer); + return normalizer.Convert(expr); +} + +TVM_REGISTER_GLOBAL("arith.NormalizeIterMapToExpr").set_body_typed([](const IterMapExpr& expr) { + return NormalizeIterMapToExpr(expr); +}); + } // namespace arith } // namespace tvm diff --git a/src/arith/solve_linear_inequality.cc b/src/arith/solve_linear_inequality.cc index dd9044833546..6aad5b7b0a25 100644 --- a/src/arith/solve_linear_inequality.cc +++ b/src/arith/solve_linear_inequality.cc @@ -39,58 +39,9 @@ namespace arith { using namespace tvm::runtime; using namespace tvm::tir; -#define PLUS_ONE(OP) \ - void VisitExpr_(const OP* op) final { num_symbols_++; } - -#define PLUS_ONE_BINARY(OP) \ - void VisitExpr_(const OP* op) final { \ - num_symbols_++; \ - VisitExpr(op->a); \ - VisitExpr(op->b); \ - } - -/*! - * \brief Calculate the expresion complexity based on number of symbols it contains. 
- */ -class ExprComplexity : public ExprVisitor { - public: - size_t Eval(const PrimExpr& expr) { - VisitExpr(expr); - return num_symbols_; - } - - PLUS_ONE_BINARY(AddNode) - PLUS_ONE_BINARY(SubNode) - PLUS_ONE_BINARY(MulNode) - PLUS_ONE_BINARY(DivNode) - PLUS_ONE_BINARY(ModNode) - PLUS_ONE_BINARY(FloorDivNode) - PLUS_ONE_BINARY(FloorModNode) - PLUS_ONE_BINARY(MinNode) - PLUS_ONE_BINARY(MaxNode) - PLUS_ONE_BINARY(EQNode) - PLUS_ONE_BINARY(NENode) - PLUS_ONE_BINARY(LTNode) - PLUS_ONE_BINARY(LENode) - PLUS_ONE_BINARY(GTNode) - PLUS_ONE_BINARY(GENode) - PLUS_ONE_BINARY(AndNode) - PLUS_ONE_BINARY(OrNode) - PLUS_ONE(VarNode) - PLUS_ONE(FloatImmNode) - PLUS_ONE(IntImmNode) - void VisitExpr_(const NotNode* op) final { - num_symbols_++; - VisitExpr(op->a); - } - - private: - size_t num_symbols_{0}; -}; - struct ExprLess { bool operator()(const PrimExpr& l, const PrimExpr& r) const { - return ExprComplexity().Eval(l) < ExprComplexity().Eval(r); + return CalculateExprComplexity(l) < CalculateExprComplexity(r); } }; diff --git a/src/auto_scheduler/compute_dag.cc b/src/auto_scheduler/compute_dag.cc index 4e7fb05660a4..abbcba234848 100644 --- a/src/auto_scheduler/compute_dag.cc +++ b/src/auto_scheduler/compute_dag.cc @@ -1367,7 +1367,7 @@ Array ComputeDAG::InferBound(const Array& states) const { support::parallel_for(0, states.size(), [this, &states, &out_states](int i) { try { out_states.Set(i, (states[i].defined()) ? this->InferBound(states[i]) : states[i]); - } catch (dmlc::Error& e) { + } catch (Error& e) { LOG(WARNING) << "InferBound fails on the state:\n" << states[i] << "\n" << "with: " << e.what() << std::endl; diff --git a/src/auto_scheduler/feature.cc b/src/auto_scheduler/feature.cc index cdfb71fe8fa0..be78bc4aa9f9 100755 --- a/src/auto_scheduler/feature.cc +++ b/src/auto_scheduler/feature.cc @@ -1328,7 +1328,7 @@ void GetPerStoreFeaturesWorkerFunc(const SearchTask& task, const State& state, i const auto& prim_func = (*it).second.as(); GetPerStoreFeature(prim_func->body, task->hardware_params->cache_line_bytes, max_n_bufs, feature); - } catch (dmlc::Error& e) { + } catch (Error& e) { (*error_ct)++; } } diff --git a/src/auto_scheduler/search_policy/sketch_policy_rules.cc b/src/auto_scheduler/search_policy/sketch_policy_rules.cc index 110be6bd6f68..8df69fc7ce3b 100644 --- a/src/auto_scheduler/search_policy/sketch_policy_rules.cc +++ b/src/auto_scheduler/search_policy/sketch_policy_rules.cc @@ -164,6 +164,7 @@ SketchGenerationRule::ConditionKind RuleAddCacheRead::MeetCondition(const Sketch // Don't cache_read a stage if it has multiple consumers const std::set& consumers = GetConsumers(task, state, stage_id); + if (consumers.size() == 0) return ConditionKind::kSkip; // Don't cache_read a stage if its consumer does not need multi-level tiling int target_stage_id = *consumers.begin(); if (!NeedsMultilevelTiling(task, state, target_stage_id)) { @@ -1106,7 +1107,7 @@ PopulationGenerationRule::ResultKind MutateComputeLocation::Apply(SketchPolicyNo } try { StepApplyToState(tmp_s->transform_steps.back(), &tmp_s, policy->search_task->compute_dag); - } catch (dmlc::Error& e) { + } catch (Error& e) { return ResultKind::kInvalid; } } @@ -1228,7 +1229,7 @@ PopulationGenerationRule::ResultKind MutateParallel::Apply(SketchPolicyNode* pol tmp_s.CopyOnWrite()->transform_steps.push_back(step); try { StepApplyToState(tmp_s->transform_steps.back(), &tmp_s, policy->search_task->compute_dag); - } catch (dmlc::Error& e) { + } catch (Error& e) { return ResultKind::kInvalid; } } diff --git 
a/src/auto_scheduler/search_task.cc b/src/auto_scheduler/search_task.cc index 46db045b663a..db53a325fdc4 100755 --- a/src/auto_scheduler/search_task.cc +++ b/src/auto_scheduler/search_task.cc @@ -58,24 +58,24 @@ HardwareParams HardwareParamsNode::GetDefaultHardwareParams(const Target& target if (device_type == kDLCPU) { return HardwareParams(tvm::runtime::threading::MaxConcurrency(), 64, 64, 0, 0, 0, 0, 0); } else if (device_type == kDLGPU || device_type == kDLROCM) { - auto ctx = TVMContext{static_cast(device_type), 0}; + auto dev = Device{static_cast(device_type), 0}; auto device_name = device_type == kDLGPU ? "device_api.gpu" : "device_api.rocm"; auto func = tvm::runtime::Registry::Get(device_name); ICHECK(func != nullptr) << "Cannot find GPU device_api in registry"; auto device_api = static_cast(((*func)()).operator void*()); tvm::runtime::TVMRetValue ret; - device_api->GetAttr(ctx, tvm::runtime::DeviceAttrKind::kMaxSharedMemoryPerBlock, &ret); + device_api->GetAttr(dev, tvm::runtime::DeviceAttrKind::kMaxSharedMemoryPerBlock, &ret); int max_shared_memory_per_block = ret; // There is no explicit local memory limition in CUDA runtime, // so we can use INT32_MAX to disalbe the check on local_memory. int max_local_memory_per_block = INT32_MAX; - device_api->GetAttr(ctx, tvm::runtime::DeviceAttrKind::kMaxThreadsPerBlock, &ret); + device_api->GetAttr(dev, tvm::runtime::DeviceAttrKind::kMaxThreadsPerBlock, &ret); int max_threads_per_block = ret; - device_api->GetAttr(ctx, tvm::runtime::DeviceAttrKind::kWarpSize, &ret); + device_api->GetAttr(dev, tvm::runtime::DeviceAttrKind::kWarpSize, &ret); int warp_size = ret; int max_vthread_extent = warp_size / 4; @@ -107,6 +107,29 @@ HardwareParams HardwareParamsNode::GetDefaultHardwareParams(const Target& target auto target_device = target->GetAttr("device", ""); LOG(FATAL) << "No default hardware parameters for opencl target device: " << target_device; } + } else if (device_type == kDLVulkan) { + auto dev = Device{static_cast(device_type), 0}; + auto device_name = "device_api.vulkan"; + auto func = tvm::runtime::Registry::Get(device_name); + ICHECK(func != nullptr) << "Cannot find Vulkan device_api in registry"; + auto device_api = static_cast(((*func)()).operator void*()); + + tvm::runtime::TVMRetValue ret; + device_api->GetAttr(dev, tvm::runtime::DeviceAttrKind::kMaxSharedMemoryPerBlock, &ret); + int max_shared_memory_per_block = ret; + + int max_local_memory_per_block = INT32_MAX; + + device_api->GetAttr(dev, tvm::runtime::DeviceAttrKind::kMaxThreadsPerBlock, &ret); + int max_threads_per_block = ret; + + device_api->GetAttr(dev, tvm::runtime::DeviceAttrKind::kWarpSize, &ret); + int warp_size = ret; + + int max_vthread_extent = std::max(1, warp_size / 4); + + return HardwareParams(-1, 16, 64, max_shared_memory_per_block, max_local_memory_per_block, + max_threads_per_block, max_vthread_extent, warp_size); } else { LOG(FATAL) << "No default hardware parameters for target: " << target; } diff --git a/src/auto_scheduler/transform_step.cc b/src/auto_scheduler/transform_step.cc old mode 100755 new mode 100644 index 5ba3eee07098..b67d5cdd7bd9 --- a/src/auto_scheduler/transform_step.cc +++ b/src/auto_scheduler/transform_step.cc @@ -26,8 +26,8 @@ #include #include #include +#include #include -#include #include #include diff --git a/src/contrib/tf_op/tvm_dso_op_kernels.cc b/src/contrib/tf_op/tvm_dso_op_kernels.cc index 5c119b64b93d..c816119d0fad 100644 --- a/src/contrib/tf_op/tvm_dso_op_kernels.cc +++ b/src/contrib/tf_op/tvm_dso_op_kernels.cc @@ -154,7 +154,7 
@@ void EnsureAlignment(OpKernelContext* ctx, const tensorflow::Tensor& tensor, Ten } // Create DLPack tensor from TensorFlow tensor -tensorflow::Status MakeDLTensor(const TensorAsBuf& src, const DLContext& ctx, int64_t* tf_shape, +tensorflow::Status MakeDLTensor(const TensorAsBuf& src, const DLDevice& dev, int64_t* tf_shape, DLTensor* out) { DLDataType dlpack_type; const tensorflow::Tensor& tensor = *src.tensor; @@ -163,7 +163,7 @@ tensorflow::Status MakeDLTensor(const TensorAsBuf& src, const DLContext& ctx, in if (!status.ok()) { return status; } - out->ctx = ctx; + out->device = dev; out->ndim = tensor.shape().dims(); out->shape = tf_shape; out->strides = nullptr; @@ -256,7 +256,7 @@ class TVMDSOOp : public OpKernel { int device_id = TVMDSOOpTrait::device_id(context); int device_type = TVMDSOOpTrait::device_type; - DLContext dl_ctx = {DLDeviceType(device_type), device_id}; + DLDevice dl_dev = {DLDeviceType(device_type), device_id}; // Get output shape tensorflow::TensorShape output_shape; @@ -287,7 +287,7 @@ class TVMDSOOp : public OpKernel { EnsureAlignment(context, input_tensor, &input); input.CopyFromOrigin(); - status = MakeDLTensor(input, dl_ctx, shape_ptr, &args[i]); + status = MakeDLTensor(input, dl_dev, shape_ptr, &args[i]); OP_REQUIRES_OK(context, status); } @@ -302,7 +302,7 @@ class TVMDSOOp : public OpKernel { output.device_type = device_type; EnsureAlignment(context, *output_tensor, &output); - status = MakeDLTensor(output, dl_ctx, output_shape_ptr, &args[num_inputs]); + status = MakeDLTensor(output, dl_dev, output_shape_ptr, &args[num_inputs]); OP_REQUIRES_OK(context, status); // Prepare PackedFunc arguments diff --git a/src/ir/error.cc b/src/ir/error.cc index 5d3978dda4ff..0089f55a4da8 100644 --- a/src/ir/error.cc +++ b/src/ir/error.cc @@ -132,7 +132,8 @@ void ErrorReporter::RenderErrors(const IRModule& module, bool use_color) { LOG(FATAL) << annotated_prog.str() << std::endl; } -void ErrorReporter::ReportAt(const GlobalVar& global, const ObjectRef& node, const Error& err) { +void ErrorReporter::ReportAt(const GlobalVar& global, const ObjectRef& node, + const CompileError& err) { size_t index_to_insert = this->errors_.size(); this->errors_.push_back(err); auto it = this->node_to_error_.find(node); diff --git a/src/node/structural_hash.cc b/src/node/structural_hash.cc index efedd1b99d6d..23911efe39c5 100644 --- a/src/node/structural_hash.cc +++ b/src/node/structural_hash.cc @@ -328,7 +328,7 @@ struct NDArrayContainerTrait { static constexpr const std::nullptr_t VisitAttrs = nullptr; static void SHashReduce(const runtime::NDArray::Container* key, SHashReducer hash_reduce) { - ICHECK_EQ(key->dl_tensor.ctx.device_type, kDLCPU) << "can only compare CPU tensor"; + ICHECK_EQ(key->dl_tensor.device.device_type, kDLCPU) << "can only compare CPU tensor"; ICHECK(runtime::IsContiguous(key->dl_tensor)) << "Can only hash contiguous tensor"; hash_reduce(runtime::DataType(key->dl_tensor.dtype)); hash_reduce(key->dl_tensor.ndim); @@ -345,8 +345,8 @@ struct NDArrayContainerTrait { auto ldt = lhs->dl_tensor.dtype; auto rdt = rhs->dl_tensor.dtype; - ICHECK_EQ(lhs->dl_tensor.ctx.device_type, kDLCPU) << "can only compare CPU tensor"; - ICHECK_EQ(rhs->dl_tensor.ctx.device_type, kDLCPU) << "can only compare CPU tensor"; + ICHECK_EQ(lhs->dl_tensor.device.device_type, kDLCPU) << "can only compare CPU tensor"; + ICHECK_EQ(rhs->dl_tensor.device.device_type, kDLCPU) << "can only compare CPU tensor"; ICHECK(runtime::IsContiguous(lhs->dl_tensor)) << "Can only compare contiguous tensor"; 
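Both the TensorFlow bridge above and `GetDefaultHardwareParams` in `search_task.cc` go through the same `DeviceAPI::GetAttr` queries. The same attributes surface as properties on the Python `Device` object, which gives a quick way to inspect what the new Vulkan branch will pick up; a sketch, since which attributes resolve depends on the runtime build:

    import tvm

    dev = tvm.device("vulkan", 0)
    if dev.exist:
        # The three attributes the Vulkan branch queries above.
        print(dev.max_shared_memory_per_block)
        print(dev.max_threads_per_block)
        print(dev.warp_size)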
ICHECK(runtime::IsContiguous(rhs->dl_tensor)) << "Can only compare contiguous tensor"; diff --git a/src/parser/parser.cc b/src/parser/parser.cc index 3061735eff7c..b72a632635d9 100644 --- a/src/parser/parser.cc +++ b/src/parser/parser.cc @@ -28,9 +28,9 @@ #include #include #include +#include #include #include -#include #include @@ -172,8 +172,8 @@ class ScopeStack { void PopStack() { this->scope_stack.pop_back(); } }; -struct DuplicateKeyError : public dmlc::Error { - explicit DuplicateKeyError(const std::string& msg) : dmlc::Error(msg) {} +struct DuplicateKeyError : public Error { + explicit DuplicateKeyError(const std::string& msg) : Error(msg) {} }; /*! \brief A table of interning strings as global function and type names. */ @@ -523,18 +523,18 @@ class Parser { /*! \brief Convert a numeric token to an NDArray for embedding into the Relay program. */ NDArray NumberToNDArray(const Token& token) { if (token->token_type == TokenType::kInteger) { - DLContext ctx = {DLDeviceType::kDLCPU, 0}; + DLDevice dev = {DLDeviceType::kDLCPU, 0}; auto dtype = String2DLDataType("int32"); - auto data = NDArray::Empty({}, dtype, ctx); + auto data = NDArray::Empty({}, dtype, dev); auto array = reinterpret_cast(data->data); // revisit this, literal node issue. int64_t value = Downcast(token->data); array[0] = (int32_t)value; return data; } else if (token->token_type == TokenType::kFloat) { - DLContext ctx = {DLDeviceType::kDLCPU, 0}; + DLDevice dev = {DLDeviceType::kDLCPU, 0}; auto float_imm = Downcast(token->data); - auto data = NDArray::Empty({}, float_imm->dtype, ctx); + auto data = NDArray::Empty({}, float_imm->dtype, dev); auto array = reinterpret_cast(data->data); // revisit this, literal node issue. // TODO(@jroesch): bounds checking @@ -549,9 +549,9 @@ class Parser { /*! \brief Convert a boolean value to an NDArray for embedding into the Relay program. */ NDArray BooleanToNDarray(bool value) { - DLContext ctx = {DLDeviceType::kDLCPU, 0}; + DLDevice dev = {DLDeviceType::kDLCPU, 0}; auto dtype = String2DLDataType("bool"); - auto data = NDArray::Empty({}, dtype, ctx); + auto data = NDArray::Empty({}, dtype, dev); auto array = reinterpret_cast(data->data); array[0] = value; return data; @@ -1492,7 +1492,7 @@ class Parser { DLOG(INFO) << "op_name=" << op_name << " span=" << span; try { return Op::Get(op_name); - } catch (const dmlc::Error& e) { + } catch (const Error& e) { // we can relax this, but probably need to relax checks or return non-null here. 
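The parser helpers above embed literals as rank-0 NDArrays pinned to `{kDLCPU, 0}`, which is also the only layout the structural-hash code earlier accepts. A sketch of the Python-side shape of what `NumberToNDArray` produces (the `.device` accessor assumes this series' NDArray field rename; older builds spell it `.ctx`):

    import numpy as np
    import tvm

    scalar = tvm.nd.array(np.array(42, dtype="int32"), tvm.cpu(0))
    assert scalar.shape == ()              # rank-0, like NDArray::Empty({}, ...)
    assert scalar.device.device_type == 1  # kDLCPU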
this->diag_ctx.EmitFatal(Diagnostic::Error(span) << "operator `" << op_name diff --git a/src/parser/span_check.h b/src/parser/span_check.h index 9a887474fe67..ab71d30a54f5 100644 --- a/src/parser/span_check.h +++ b/src/parser/span_check.h @@ -30,8 +30,8 @@ #include #include #include +#include #include -#include #include #include diff --git a/src/printer/relay_text_printer.cc b/src/printer/relay_text_printer.cc index cbee04f96096..31f98ce4d270 100644 --- a/src/printer/relay_text_printer.cc +++ b/src/printer/relay_text_printer.cc @@ -322,7 +322,7 @@ Doc RelayTextPrinter::VisitExpr_(const ConstantNode* op) { if (op->is_scalar()) { std::ostringstream os; DataType dtype = DataType(op->data->dtype); - ICHECK_EQ(op->data->ctx.device_type, kDLCPU); + ICHECK_EQ(op->data->device.device_type, kDLCPU); if (dtype == DataType::Int(32)) { return ScalarLiteral(dtype, static_cast(op->data->data)[0]); } else if (dtype == DataType::Int(64)) { diff --git a/src/printer/text_printer.h b/src/printer/text_printer.h index 6ec32a9e104c..90e46c5624fa 100644 --- a/src/printer/text_printer.h +++ b/src/printer/text_printer.h @@ -310,6 +310,7 @@ class TIRTextPrinter : public StmtFunctor, Doc VisitStmt_(const ForNode* op) override; Doc VisitStmt_(const WhileNode* op) override; Doc VisitStmt_(const PrefetchNode* op) override; + Doc VisitStmt_(const BlockRealizeNode* op) override; Doc VisitStmtDefault_(const Object* op) override; Doc VisitType_(const PrimTypeNode* node) override; @@ -324,6 +325,7 @@ class TIRTextPrinter : public StmtFunctor, Doc PrintBuffer(const BufferNode* op); Doc BufferNode2Doc(const BufferNode* op, Doc doc); Doc PrintString(const StringObj* op) { return Doc::StrLiteral(op->data); } + Doc PrintBufferRegion(const BufferRegionNode* op); /*! * \brief special method to print out data type diff --git a/src/printer/tir_text_printer.cc b/src/printer/tir_text_printer.cc index 8d5bba5e5bb0..2976f4871f64 100644 --- a/src/printer/tir_text_printer.cc +++ b/src/printer/tir_text_printer.cc @@ -66,6 +66,8 @@ Doc TIRTextPrinter::Print(const ObjectRef& node) { return PrintBuffer(node.as()); } else if (node->IsInstance()) { return PrintString(node.as()); + } else if (node->IsInstance()) { + return PrintBufferRegion(node.as()); } else { return this->meta_->GetMetaNode(node); } @@ -217,6 +219,24 @@ Doc TIRTextPrinter::BufferNode2Doc(const BufferNode* buf, Doc doc) { return doc << ")"; } +Doc TIRTextPrinter::PrintBufferRegion(const BufferRegionNode* op) { + Doc doc; + doc << Print(op->buffer) << "["; + for (size_t i = 0; i < op->region.size(); ++i) { + if (i != 0) { + doc << ", "; + } + const auto& range = op->region[i]; + if (!is_one(range->extent)) { + doc << Print(range->min) << ":" << Print(range->min + range->extent); + } else { + doc << Print(range->min); + } + } + doc << "]"; + return doc; +} + Doc TIRTextPrinter::VisitExprDefault_(const Object* op) { return this->meta_->GetMetaNode(GetRef(op)); } @@ -476,8 +496,7 @@ inline const char* ForKind2String(ForKind t) { case ForKind::kUnrolled: return "unroll"; case ForKind::kThreadBinding: - LOG(FATAL) << "Loop ThreadBinding is reserved for future used and " - << "not yet supported in TIR"; + return "thread_binding"; } LOG(FATAL) << "Unknown ForKind"; return "Unknown"; @@ -507,6 +526,92 @@ Doc TIRTextPrinter::VisitStmt_(const PrefetchNode* op) { return doc; } +Doc TIRTextPrinter::VisitStmt_(const BlockRealizeNode* op) { + const auto* block_op = op->block.as(); + // print block name and block vars + Doc doc; + doc << "block(["; + std::vector block_var_docs; + for (const 
auto& iter_var : block_op->iter_vars) { + Doc block_var_doc; + if (is_zero(iter_var->dom->min) && iter_var->iter_type == kDataPar) { + block_var_doc << Print(iter_var->dom->extent); + } else { + block_var_doc << "tir."; + switch (iter_var->iter_type) { + case kDataPar: + block_var_doc << "range"; + break; + case kCommReduce: + block_var_doc << "reduce_axis"; + break; + case kOrdered: + block_var_doc << "scan_axis"; + break; + case kOpaque: + block_var_doc << "opaque_axis"; + break; + default: + LOG(FATAL) << "Unknown block var iter type"; + break; + } + block_var_doc << "(" << Print(iter_var->dom->min) << ", " + << Print(iter_var->dom->min + iter_var->dom->extent) << ")"; + } + block_var_docs.push_back(block_var_doc); + } + doc << PrintSep(block_var_docs, Doc::Text(", ")) << "], "; + doc << Doc::StrLiteral(block_op->name_hint) << ")"; + std::vector block_var_names; + for (const auto& iter_var : block_op->iter_vars) { + Doc block_var_name; + AllocVar(iter_var->var); + block_var_names.push_back(Print(iter_var->var)); + } + if (!block_var_names.empty()) { + doc << " as [" << PrintSep(block_var_names, Doc::Text(", ")) << "]"; + } + doc << " {"; + Doc block_attr_doc; + // print predicate, binding, read/write tensor region, annotations + if (!is_one(op->predicate)) { + block_attr_doc << Doc::NewLine() << "where(" << Print(op->predicate) << ")"; + } + for (size_t i = 0; i < block_op->iter_vars.size(); ++i) + block_attr_doc << Doc::NewLine() << "bind(" << Print(block_op->iter_vars[i]->var) << ", " + << Print(op->iter_values[i]) << ")"; + block_attr_doc << Doc::NewLine() << "tir.reads(" << Print(block_op->reads) << ")"; + block_attr_doc << Doc::NewLine() << "tir.writes(" << Print(block_op->writes) << ")"; + if (!block_op->annotations.empty()) { + std::vector attr_docs; + for (const auto& it : block_op->annotations) { + attr_docs.push_back(Doc::StrLiteral(it.first) << ": " << Print(it.second)); + } + block_attr_doc << Doc::NewLine() << "tir.attrs({" << PrintSep(attr_docs, Doc::Text(", ")) + << "})"; + } + // print body + Doc body; + body << Doc::NewLine(); + for (const auto& alloc_buf : block_op->alloc_buffers) { + body << AllocBuf(alloc_buf) << " = alloc_buffer(" << PrintDType(alloc_buf->dtype) + << Print(alloc_buf->shape) << ")" << Doc::NewLine(); + } + for (const auto& match_buf : block_op->match_buffers) { + body << AllocBuf(match_buf->buffer) << " = match_buffer_region(" << Print(match_buf->source) + << ")" << Doc::NewLine(); + } + if (block_op->init.defined()) { + Doc init_block; + init_block << "with init()"; + init_block << PrintBody(block_op->init.value()); + body << init_block << Doc::NewLine(); + } + body << Print(block_op->body); + doc << Doc::Indent(2, block_attr_doc << body); + return doc; +} + Doc TIRTextPrinter::VisitType_(const PrimTypeNode* node) { Doc doc; doc << PrintDType(node->dtype); diff --git a/src/printer/tvmscript_printer.cc b/src/printer/tvmscript_printer.cc index 86b175e1676c..438079502306 100644 --- a/src/printer/tvmscript_printer.cc +++ b/src/printer/tvmscript_printer.cc @@ -22,6 +22,7 @@ * \brief Printer class to print Tensor IR to python syntax script */ +#include #include #include #include @@ -66,7 +67,10 @@ class TVMScriptPrinter : public StmtFunctor, std::unordered_map func2var_; /*! \brief var collector (var defined by For/Loop/Block) */ std::unordered_set var_not_in_headers; - /*! \brief buffer collector (buffer defined in BufferMap and BufferAllocation)*/ + /*! 
+ * \brief buffer collector + * (buffer defined in BufferMap, BufferAllocation and MatchBufferRegion) + */ std::unordered_set buf_not_in_headers; /*! \brief Map from Var to thread env name */ std::unordered_map var_env_map_; @@ -84,6 +88,8 @@ class TVMScriptPrinter : public StmtFunctor, int num_child_; /*! \brief the number of current node */ int current_num_; + /*! \brief loop stack without annotations */ + std::vector loop_stack_; Doc VisitExpr_(const CastNode* op) override; Doc VisitExpr_(const VarNode* op) override; @@ -131,6 +137,7 @@ class TVMScriptPrinter : public StmtFunctor, Doc VisitStmt_(const ForNode* op) override; Doc VisitStmt_(const PrefetchNode* op) override; Doc VisitStmt_(const EvaluateNode* op) override; + Doc VisitStmt_(const BlockRealizeNode* op) override; Doc VisitStmtDefault_(const Object* op) override; Doc VisitType_(const PrimTypeNode* node) override; @@ -145,12 +152,24 @@ class TVMScriptPrinter : public StmtFunctor, Doc PrintArray(const ArrayNode* op); Doc PrintBuffer(const BufferNode* op); Doc AllocBufferDeclaration(const Buffer& buf); + Doc PrintBufferRegion(const BufferRegionNode* op); + Doc PrintMatchBufferRegion(const MatchBufferRegionNode* op); + Doc PrintAnnotations(const Map& annotations); static Doc PrintString(const StringObj* op) { return Doc::StrLiteral(op->data); } Doc GetUniqueName(std::string prefix); Doc AllocVar(const Var& var); Doc AllocBuf(const Buffer& buffer); + /*! Helper functions for loop printing. */ + /*! + * \brief Print a single for loop + * \param loop The for loop to be printed + */ + Doc PrintLoop(const For& loop); + /*! \brief Print all simple loops in stack into one line using tir.grid(). */ + Doc PrintLoopStack(); + /*! * \brief Print additional info about expr in comment. * \param expr The expression. 
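Both the TIR text printer and the TVMScript printer collapse a unit-extent range to a bare index when printing a BufferRegion. A self-contained sketch of just that formatting rule (Range and FormatRegion are stand-ins, not TVM types):

    #include <iostream>
    #include <sstream>
    #include <string>
    #include <vector>

    struct Range { int min, extent; };

    // Mirrors PrintBufferRegion: a unit extent prints as a point, else min:max.
    std::string FormatRegion(const std::string& buf, const std::vector<Range>& region) {
      std::ostringstream os;
      os << buf << "[";
      for (size_t i = 0; i < region.size(); ++i) {
        if (i != 0) os << ", ";
        if (region[i].extent != 1)
          os << region[i].min << ":" << region[i].min + region[i].extent;
        else
          os << region[i].min;
      }
      os << "]";
      return os.str();
    }

    int main() {
      std::cout << FormatRegion("B", {{0, 16}, {2, 1}}) << "\n";  // prints B[0:16, 2]
    }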
@@ -308,6 +327,36 @@ Doc TVMScriptPrinter::AllocBuf(const Buffer& buffer) { return val; } +Doc TVMScriptPrinter::PrintMatchBufferRegion(const MatchBufferRegionNode* op) { + const Buffer& buf = op->buffer; + buf_not_in_headers.insert(buf.get()); + + Doc doc = Print(op->buffer) << " = tir.match_buffer_region(" << Print(op->source); + if (!buf->strides.empty()) { + doc << ", strides=" << Print(buf->strides); + } + if (buf->offset_factor != 0 && buf->elem_offset->IsInstance()) { + Var elem_offset = Downcast(buf->elem_offset); + if (memo_var_.find(elem_offset) != memo_var_.end()) { + doc << ", elem_offset=" << Print(buf->elem_offset); + } else { + // implicitly define elem_offset + memo_var_[elem_offset] = Doc::Text(memo_buf_[buf].str() + ".elem_offset"); + var_not_in_headers.insert(elem_offset.get()); + } + } else { + doc << ", elem_offset=" << Print(buf->elem_offset); + } + if (buf->data_alignment != -1) { + doc << ", align=" << buf->data_alignment; + } + if (buf->offset_factor != 0) { + doc << ", offset_factor=" << buf->offset_factor; + } + doc << ")"; + return doc; +} + Doc TVMScriptPrinter::Print(const ObjectRef& node) { if (!node.defined()) return Doc::Text("None"); if (node->IsInstance()) { @@ -330,6 +379,10 @@ Doc TVMScriptPrinter::Print(const ObjectRef& node) { return PrintIterVar(node.as()); } else if (node->IsInstance()) { return PrintRange(node.as()); + } else if (node->IsInstance()) { + return PrintBufferRegion(node.as()); + } else if (node->IsInstance()) { + return PrintMatchBufferRegion(node.as()); } else { meta_collector_.Collect(node); return this->meta_.GetMetaNode(node); @@ -660,9 +713,7 @@ inline const char* ForKind2String(ForKind t) { case ForKind::kUnrolled: return "unroll"; case ForKind::kThreadBinding: - LOG(FATAL) << "Loop ThreadBinding is reserved for future used and " - << "not yet supported in TIR"; - return "threadbinding"; + return "thread_binding"; } LOG(FATAL) << "Unknown ForKind"; return "Unknown"; @@ -671,9 +722,27 @@ inline const char* ForKind2String(ForKind t) { Doc TVMScriptPrinter::VisitStmt_(const ForNode* op) { Doc doc; var_not_in_headers.insert(op->loop_var.get()); - doc << "for " << Print(op->loop_var) << " in tir." + std::string(ForKind2String(op->kind)) + "(" - << Print(op->min) << ", " << Print(op->min + op->extent) - << "):" << Doc::Indent(4, Doc::NewLine() << PrintBody(op->body)); + const auto* body = op->body.as(); + bool simple_loop = op->kind == ForKind::kSerial && op->annotations.empty() && is_zero(op->min); + if (simple_loop) loop_stack_.push_back(GetRef(op)); + // It is a loop that can be compressed, let the loops below print it out + if (simple_loop && body != nullptr) return Print(GetRef(body)); + // It is a loop that can not be compressed + bool print_above = !loop_stack_.empty(); + // print loops above if needed + if (print_above) { + doc << PrintLoopStack(); + loop_stack_.clear(); + } + if (!simple_loop) { + // print current loop if needed + Doc current_loop; + current_loop << PrintLoop(GetRef(op)); + current_loop << Doc::Indent(4, Doc::NewLine() << PrintBody(op->body)); + doc << (print_above ? 
Doc::Indent(4, Doc::NewLine() << current_loop) : current_loop); + } else { + doc << Doc::Indent(4, Doc::NewLine() << PrintBody(op->body)); + } return doc; } @@ -713,6 +782,88 @@ Doc TVMScriptPrinter::VisitStmt_(const BufferStoreNode* op) { return doc; } +Doc TVMScriptPrinter::VisitStmt_(const BlockRealizeNode* op) { + const auto* block_op = op->block.as(); + // print block name and block vars + Doc doc; + doc << "with tir.block(["; + std::vector block_var_docs; + for (const auto& iter_var : block_op->iter_vars) { + Doc block_var_doc; + if (is_zero(iter_var->dom->min) && iter_var->iter_type == kDataPar) { + block_var_doc << Print(iter_var->dom->extent); + } else { + block_var_doc << "tir."; + switch (iter_var->iter_type) { + case kDataPar: + block_var_doc << "range"; + break; + case kCommReduce: + block_var_doc << "reduce_axis"; + break; + case kOrdered: + block_var_doc << "scan_axis"; + break; + case kOpaque: + block_var_doc << "opaque_axis"; + break; + default: + LOG(FATAL) << "Unknown block var iter type: " << iter_var->iter_type; + break; + } + block_var_doc << "(" << Print(iter_var->dom->min) << ", " + << Print(iter_var->dom->min + iter_var->dom->extent) << ")"; + } + block_var_docs.push_back(block_var_doc); + } + doc << PrintSep(block_var_docs, Doc::Text(", ")) << "], "; + doc << Doc::StrLiteral(block_op->name_hint) << ")"; + std::vector block_var_names; + for (const auto& iter_var : block_op->iter_vars) { + var_not_in_headers.insert(iter_var->var.get()); + block_var_names.push_back(Print(iter_var->var)); + } + if (!block_var_names.empty()) { + doc << " as [" << PrintSep(block_var_names, Doc::Text(", ")) << "]"; + } + doc << ":"; + Doc block_attr_doc; + // print predicate, binding, read/write tensor region, annotations + if (!is_one(op->predicate)) { + block_attr_doc << Doc::NewLine() << "tir.where(" << Print(op->predicate) << ")"; + } + for (size_t i = 0; i < block_op->iter_vars.size(); ++i) + block_attr_doc << Doc::NewLine() << "tir.bind(" << Print(block_op->iter_vars[i]->var) << ", " + << Print(op->iter_values[i]) << ")"; + block_attr_doc << Doc::NewLine() << "tir.reads(" << Print(block_op->reads) << ")"; + block_attr_doc << Doc::NewLine() << "tir.writes(" << Print(block_op->writes) << ")"; + if (!block_op->annotations.empty()) { + block_attr_doc << Doc::NewLine() << "tir.block_attr({"; + block_attr_doc << PrintAnnotations(block_op->annotations); + block_attr_doc << "})"; + } + // print body + Doc body; + body << Doc::NewLine(); + for (const auto& alloc_buf : block_op->alloc_buffers) { + buf_not_in_headers.insert(alloc_buf.get()); + body << Print(alloc_buf) << " = tir.alloc_buffer(" << memo_buf_decl_[alloc_buf] << ")" + << Doc::NewLine(); + } + for (const auto& match_buf : block_op->match_buffers) { + body << Print(match_buf) << Doc::NewLine(); + } + if (block_op->init.defined()) { + Doc init_block; + init_block << "with tir.init():"; + init_block << Doc::Indent(4, Doc::NewLine() << PrintBody(block_op->init.value())); + body << init_block << Doc::NewLine(); + } + body << PrintBody(block_op->body); + doc << Doc::Indent(4, block_attr_doc << body); + return doc; +} + Doc TVMScriptPrinter::PrintBody(const Stmt& body) { int memo_num_child, memo_current_num; std::swap(memo_num_child, num_child_); @@ -890,6 +1041,73 @@ Doc TVMScriptPrinter::PrintBuffer(const BufferNode* op) { return meta_.InMeta(buffer) ? 
meta_.GetMetaNode(buffer) : AllocBuf(buffer); } +Doc TVMScriptPrinter::PrintBufferRegion(const BufferRegionNode* op) { + Doc doc; + doc << Print(op->buffer) << "["; + for (size_t i = 0; i < op->region.size(); ++i) { + if (i != 0) doc << ", "; + const auto& range = op->region[i]; + if (!is_one(range->extent)) { + doc << Print(range->min) << ":" << Print(range->min + range->extent); + } else { + doc << Print(range->min); + } + } + doc << "]"; + return doc; +} + +Doc TVMScriptPrinter::PrintAnnotations(const Map& annotations) { + Doc res; + std::vector> anno_list; + anno_list.reserve(annotations.size()); + for (const auto& pair : annotations) { + anno_list.emplace_back(pair); + } + sort(anno_list.begin(), anno_list.end()); + for (size_t i = 0; i < anno_list.size(); ++i) { + if (i != 0) { + res << ", "; + } + res << "\"" << anno_list[i].first << "\":" << Print(anno_list[i].second); + } + return res; +} + +Doc TVMScriptPrinter::PrintLoop(const For& loop) { + Doc res; + res << "for " << Print(loop->loop_var) + << " in tir." + std::string(ForKind2String(loop->kind)) + "(" << Print(loop->min) << ", " + << Print(loop->min + loop->extent); + if (loop->thread_binding.defined()) { + res << ", thread = "; + res << Print(loop->thread_binding.value()->thread_tag); + } + if (!loop->annotations.empty()) { + res << ", annotation = {"; + res << PrintAnnotations(loop->annotations); + res << "}"; + } + res << "):"; + return res; +} + +Doc TVMScriptPrinter::PrintLoopStack() { + Doc res; + if (loop_stack_.size() == 1) { + res << PrintLoop(loop_stack_[0]); + } else if (loop_stack_.size() > 1) { + std::vector vars, extents; + for (const auto& loop : loop_stack_) { + vars.push_back(Print(loop->loop_var)); + extents.push_back(Print(loop->extent)); + } + res << "for " << PrintSep(vars, Doc::Text(", ")) << " in tir.grid(" + << PrintSep(extents, Doc::Text(", ")) << "):"; + } + return res; +} + TVM_REGISTER_GLOBAL("script.AsTVMScript") .set_body_typed([](const ObjectRef& functions, bool show_meta) { diff --git a/src/relay/analysis/annotated_region_set.cc b/src/relay/analysis/annotated_region_set.cc index 04a18c4b7351..85a9c51a2fa8 100644 --- a/src/relay/analysis/annotated_region_set.cc +++ b/src/relay/analysis/annotated_region_set.cc @@ -157,8 +157,9 @@ class AnnotatedRegionSet::Creator : protected MixedModeVisitor { // Check if the argument already belongs to a region auto region = region_set_->GetRegion(call->args[0]); if (!region.defined()) { - throw Error(ErrorBuilder() << "Cannot find the corresponding region for end annotation:\n" - << AsText(GetRef(call), false)); + throw CompileError(ErrorBuilder() + << "Cannot find the corresponding region for end annotation:\n" + << AsText(GetRef(call), false)); } else { // If the argument is belonged to a region, it must have the same target. // Otherwise we should see a region_begin op. diff --git a/src/relay/analysis/context_analysis.cc b/src/relay/analysis/context_analysis.cc index a648b7af8fd3..970ceda070df 100644 --- a/src/relay/analysis/context_analysis.cc +++ b/src/relay/analysis/context_analysis.cc @@ -67,7 +67,7 @@ namespace relay { using PackedAnalysisResultMap = Map>; using AnalysisResultMap = - std::unordered_map; + std::unordered_map; namespace analysis { @@ -90,21 +90,22 @@ class DeviceDomain { public: // Construct an empty domain. DeviceDomain() { - ctx_.device_type = static_cast(-1); - ctx_.device_id = -1; + device_.device_type = static_cast(-1); + device_.device_id = -1; } // Construct a domain based on a given context. 
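The loop printing above compresses runs of serial, zero-based, annotation-free loops into one tir.grid(...) line; a single leftover loop falls back to PrintLoop. A standalone approximation of the PrintLoopStack logic (Loop is a stand-in struct, and the tir.serial spelling in the one-loop case is an assumption based on ForKind2String):

    #include <iostream>
    #include <sstream>
    #include <string>
    #include <vector>

    struct Loop { std::string var; int extent; };  // simple loop: serial, min = 0

    std::string PrintLoopStack(const std::vector<Loop>& stack) {
      if (stack.size() == 1)  // single loop: no grid sugar
        return "for " + stack[0].var + " in tir.serial(0, " +
               std::to_string(stack[0].extent) + "):";
      std::ostringstream vars, extents;
      for (size_t i = 0; i < stack.size(); ++i) {
        if (i != 0) { vars << ", "; extents << ", "; }
        vars << stack[i].var;
        extents << stack[i].extent;
      }
      return "for " + vars.str() + " in tir.grid(" + extents.str() + "):";
    }

    int main() {
      // Prints: for i, j, k in tir.grid(16, 16, 16):
      std::cout << PrintLoopStack({{"i", 16}, {"j", 16}, {"k", 16}}) << "\n";
    }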
- explicit DeviceDomain(const TVMContext& ctx) : ctx_(ctx) {} + explicit DeviceDomain(const Device& dev) : device_(dev) {} // Check if the current domain is empty. bool IsEmptyDomain() const { - return static_cast(ctx_.device_type) == -1 && ctx_.device_id == -1; + return static_cast(device_.device_type) == -1 && device_.device_id == -1; } // Check if the current domain equals the other one. bool operator==(const DeviceDomain& other) const { - return ctx_.device_type == other.ctx_.device_type && ctx_.device_id == other.ctx_.device_id; + return device_.device_type == other.device_.device_type && + device_.device_id == other.device_.device_id; } bool operator!=(const DeviceDomain& other) const { return !(*this == other); } @@ -116,8 +117,8 @@ class DeviceDomain { if (domain->IsEmptyDomain()) { return (size_t)(domain.get()); } else { - size_t const h1(std::hash()(static_cast(domain->ctx_.device_type))); - size_t const h2(std::hash()(domain->ctx_.device_id)); + size_t const h1(std::hash()(static_cast(domain->device_.device_type))); + size_t const h2(std::hash()(domain->device_.device_id)); return h1 ^ (h2 << 1); } } @@ -136,7 +137,7 @@ class DeviceDomain { }; /* \brief The device to be assigned to the current domain. */ - TVMContext ctx_; + Device device_; friend DeviceDomainPtr Join(const DeviceDomainPtr& lhs, const DeviceDomainPtr& rhs); friend class ContextAnalyzer; @@ -163,13 +164,13 @@ DeviceDomainPtr Join(const DeviceDomainPtr& lhs, const DeviceDomainPtr& rhs) { class ContextAnalyzer : public MixedModeVisitor { public: ContextAnalyzer(const IRModule& mod, const GlobalVar& current_func, - const TVMContext& default_context) + const Device& default_device) : MixedModeVisitor(9), // the number of repeated visits a node can perform mod_(mod), current_func_(current_func), - default_context_(default_context) { - cpu_ctx_.device_type = kDLCPU; - cpu_ctx_.device_id = 0; + default_device_(default_device) { + cpu_dev_.device_type = kDLCPU; + cpu_dev_.device_id = 0; } // Create an empty domain. @@ -177,8 +178,8 @@ class ContextAnalyzer : public MixedModeVisitor { DeviceDomainPtr Bottom() { return std::make_shared(DeviceDomain()); } // Create a domain with the given device context. - DeviceDomainPtr DeviceType(const TVMContext& ctx) { - return std::make_shared(DeviceDomain(ctx)); + DeviceDomainPtr DeviceType(const Device& dev) { + return std::make_shared(DeviceDomain(dev)); } // Find the root of a device. @@ -233,19 +234,19 @@ class ContextAnalyzer : public MixedModeVisitor { // attribute of other nodes can be propagated from it. 
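Context for the renamed DeviceDomain above: the sentinel (-1, -1) marks an unconstrained domain, which hashes by object identity so distinct unknowns never collide by value, while a concrete domain mixes its two fields. A standalone approximation with plain ints in place of TVM's enums:

    #include <cstddef>
    #include <functional>
    #include <iostream>

    struct DeviceDomain {
      int device_type = -1;  // (-1, -1) marks the empty/unconstrained domain
      int device_id = -1;
      bool IsEmpty() const { return device_type == -1 && device_id == -1; }
    };

    size_t HashDomain(const DeviceDomain* d) {
      if (d->IsEmpty()) return reinterpret_cast<size_t>(d);  // identity hash
      size_t h1 = std::hash<int>()(d->device_type);
      size_t h2 = std::hash<int>()(d->device_id);
      return h1 ^ (h2 << 1);  // same combine as the patch
    }

    int main() {
      DeviceDomain dom{2, 0};  // 2 is CUDA in DLPack; illustrative only
      std::cout << HashDomain(&dom) << "\n";
    }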
void UnifyDeviceCopy(const std::vector& inps, const std::vector& outputs, DLDeviceType src_dev_type, DLDeviceType dst_dev_type) { - TVMContext src_ctx; - src_ctx.device_type = src_dev_type; - src_ctx.device_id = 0; - auto src_domain = DeviceType(src_ctx); + Device src_dev; + src_dev.device_type = src_dev_type; + src_dev.device_id = 0; + auto src_domain = DeviceType(src_dev); for (const auto& it : inps) { auto lhs = DeviceFor(it); Unify(lhs, src_domain); } - TVMContext dst_ctx; - dst_ctx.device_type = dst_dev_type; - dst_ctx.device_id = 0; - auto dst_domain = DeviceType(dst_ctx); + Device dst_dev; + dst_dev.device_type = dst_dev_type; + dst_dev.device_id = 0; + auto dst_domain = DeviceType(dst_dev); for (const auto& it : outputs) { auto lhs = DeviceFor(it); Unify(lhs, dst_domain); @@ -387,9 +388,9 @@ class ContextAnalyzer : public MixedModeVisitor { for (const auto& it : expr_to_device_) { auto device = Lookup(it.second); if (device->IsEmptyDomain()) { - ret[it.first] = default_context_; + ret[it.first] = default_device_; } else { - ret[it.first] = device->ctx_; + ret[it.first] = device->device_; } } @@ -478,14 +479,14 @@ class ContextAnalyzer : public MixedModeVisitor { // The arguments of alloc storage should be on CPU. for (int i = 0; i < 2; i++) { - Unify(DeviceFor(call->args[i]), DeviceType(cpu_ctx_)); + Unify(DeviceFor(call->args[i]), DeviceType(cpu_dev_)); MixedModeVisitor::VisitExpr(call->args[i]); } - TVMContext ctx; + Device dev; const auto* attrs = call->attrs.as(); - ctx.device_type = static_cast(attrs->device_type); - ctx.device_id = attrs->device_id; - Unify(DeviceFor(GetRef(call)), DeviceType(ctx)); + dev.device_type = static_cast(attrs->device_type); + dev.device_id = attrs->device_id; + Unify(DeviceFor(GetRef(call)), DeviceType(dev)); } void UnifyAllocTensorCall(const CallNode* call) { @@ -497,14 +498,14 @@ class ContextAnalyzer : public MixedModeVisitor { Unify(DeviceFor(storage), DeviceFor(GetRef(call))); // The shape for alloc_tensor should be on CPU. - Unify(DeviceFor(shape), DeviceType(cpu_ctx_)); + Unify(DeviceFor(shape), DeviceType(cpu_dev_)); MixedModeVisitor::VisitExpr(shape); } void UnifyShapeFuncCall(const CallNode* call) { // [func, inputs, outputs] ICHECK_EQ(call->args.size(), 3U); - auto shape_func_domain = DeviceType(cpu_ctx_); + auto shape_func_domain = DeviceType(cpu_dev_); // No need to unify the op of a shape_func as shape_func doesn't // invoke the op itself. It should be handled by invoke_tvm_op. @@ -539,7 +540,7 @@ class ContextAnalyzer : public MixedModeVisitor { // a tensor regardless its device type. // Instead, the device type of the input is left for its other consumers to // unify or it will fallback to the default context. - Unify(DeviceFor(GetRef(call)), DeviceType(cpu_ctx_)); + Unify(DeviceFor(GetRef(call)), DeviceType(cpu_dev_)); } void UnifyReshapeTensorCall(const CallNode* call) { @@ -550,7 +551,7 @@ class ContextAnalyzer : public MixedModeVisitor { Unify(DeviceFor(GetRef(call)), DeviceFor(data)); // The shape field of reshape_tensor is always on the CPU. - Unify(DeviceFor(shape), DeviceType(cpu_ctx_)); + Unify(DeviceFor(shape), DeviceType(cpu_dev_)); MixedModeVisitor::VisitExpr(data); MixedModeVisitor::VisitExpr(shape); } @@ -668,13 +669,13 @@ class ContextAnalyzer : public MixedModeVisitor { private: /* \brief The cpu context. */ - TVMContext cpu_ctx_; + Device cpu_dev_; /* \brief The module that helps context analysis. */ const IRModule& mod_; /* \brief The current function that is being analyzed. 
*/ GlobalVar current_func_; /* \brief The default device that could be attached to an expression. */ - const TVMContext& default_context_; + const Device& default_device_; /* \brief The IR node to device domain mapping. */ std::unordered_map expr_to_device_; @@ -690,21 +691,20 @@ } // namespace analysis -AnalysisResultMap ContextAnalysis(const IRModule& mod, const TVMContext& default_context) { +AnalysisResultMap ContextAnalysis(const IRModule& mod, const Device& default_device) { // TODO(@zhiics) Apply the pass to all functions/entries auto entry = mod->GetGlobalVar("main"); - auto ca = analysis::ContextAnalyzer(mod, entry, default_context); + auto ca = analysis::ContextAnalyzer(mod, entry, default_device); auto expr = mod->Lookup(entry); ca.VisitExpr(expr); return ca.Results(); } -// Unpack the device type and deivce id fields in TVMContext for PackedFunc calls -// as TVMContext is not in the object system. -PackedAnalysisResultMap ContextAnalysisPacked(const IRModule& mod, - const TVMContext& default_context) { +// Unpack the device type and device id fields in Device for PackedFunc calls +// as Device is not in the object system. +PackedAnalysisResultMap ContextAnalysisPacked(const IRModule& mod, const Device& default_device) { PackedAnalysisResultMap ret; - auto res = ContextAnalysis(mod, default_context); + auto res = ContextAnalysis(mod, default_device); for (const auto& it : res) { Integer dev_ty = static_cast(it.second.device_type); Integer dev_id = it.second.device_id; diff --git a/src/relay/analysis/get_calibration_data.cc b/src/relay/analysis/get_calibration_data.cc index 70fe2a68f21e..12bab1e38ddd 100644 --- a/src/relay/analysis/get_calibration_data.cc +++ b/src/relay/analysis/get_calibration_data.cc @@ -36,7 +36,7 @@ namespace relay { /*! * \brief This function returns a module that will be used by - * the relay graph runtime for collecting the calibration data. + * the relay graph executor for collecting the calibration data. * To do that, we first make all inputs and outputs of each * function into the final output (i.e., the final output is a * tuple of tensors). 
Then, we change the compiler attribute of @@ -106,7 +106,7 @@ IRModule GetCalibrateModule(IRModule module) { } } } - // reset the attribute of functions for running graph runtime + // reset the attribute of functions for running graph executor for (const auto& pair : glob_funcs) { if (auto* fn = pair.second.as()) { auto func = GetRef(fn); diff --git a/src/relay/analysis/kind_check.cc b/src/relay/analysis/kind_check.cc index c7c5a0a9f083..65b8516cb16c 100644 --- a/src/relay/analysis/kind_check.cc +++ b/src/relay/analysis/kind_check.cc @@ -139,7 +139,7 @@ struct KindChecker : TypeFunctor { << "Expected " << data->type_vars.size() << "arguments for " << tc << "; got " << op->args.size()); } - } catch (const dmlc::Error& err) { + } catch (const Error& err) { // TODO(@jroesch): can probably relax to just emit EmitFatal(Diagnostic::Error(op->span) << "the type variable : `" << var->name_hint << "` is undefined"); diff --git a/src/relay/analysis/type_solver.cc b/src/relay/analysis/type_solver.cc index cc1ada677c65..22e2e9a71040 100644 --- a/src/relay/analysis/type_solver.cc +++ b/src/relay/analysis/type_solver.cc @@ -617,10 +617,10 @@ bool TypeSolver::Solve() { } rnode->resolved = resolved; - } catch (const Error& err) { + } catch (const CompileError& err) { this->diag_ctx_.Emit(Diagnostic::Error(rnode->span) << err.what()); rnode->resolved = false; - } catch (const dmlc::Error& e) { + } catch (const Error& e) { ICHECK(false) << e.what(); } diff --git a/src/relay/analysis/util.cc b/src/relay/analysis/util.cc index 90750575b9d4..a4120d20288f 100644 --- a/src/relay/analysis/util.cc +++ b/src/relay/analysis/util.cc @@ -370,7 +370,7 @@ std::unordered_map GetExprRefCount(const Expr& body) { template bool IsNDArrayAllGreaterEqual(const runtime::NDArray& tensor, T value) { - ICHECK_EQ(tensor->ctx.device_type, kDLCPU); + ICHECK_EQ(tensor->device.device_type, kDLCPU); ICHECK(tensor->strides == nullptr); ICHECK_EQ(tensor->byte_offset, 0); const T* data = static_cast(tensor->data); diff --git a/src/relay/analysis/well_formed.cc b/src/relay/analysis/well_formed.cc index 856c5dc7aac1..acc1a9adc9f4 100644 --- a/src/relay/analysis/well_formed.cc +++ b/src/relay/analysis/well_formed.cc @@ -24,7 +24,7 @@ #include #include #include -#include +#include #include diff --git a/src/relay/backend/build_module.cc b/src/relay/backend/build_module.cc index 54d8b0056859..07bb51150bee 100644 --- a/src/relay/backend/build_module.cc +++ b/src/relay/backend/build_module.cc @@ -19,7 +19,7 @@ /*! * \file relay/backend/build_module.cc - * \brief Code generation for TVM's graph runtime. + * \brief Code generation for TVM's graph executor. */ #include #include @@ -60,7 +60,7 @@ struct BuildOutput { struct GraphCodegen { public: GraphCodegen() { - auto pf = GetPackedFunc("relay.build_module._GraphRuntimeCodegen"); + auto pf = GetPackedFunc("relay.build_module._GraphExecutorCodegen"); mod = (*pf)(); } ~GraphCodegen() {} @@ -228,7 +228,7 @@ class RelayBuildModule : public runtime::ModuleNode { const char* type_key() const final { return "RelayBuildModule"; } /*! 
- * \brief Build relay IRModule for graph runtime + * \brief Build relay IRModule for graph executor * * \param mod Relay IRModule * \param target Target device diff --git a/src/relay/backend/compile_engine.cc b/src/relay/backend/compile_engine.cc index ae975a5f3240..0777b19ec557 100644 --- a/src/relay/backend/compile_engine.cc +++ b/src/relay/backend/compile_engine.cc @@ -157,7 +157,7 @@ class ScheduleGetter : public backend::MemoizedExprTranslator> runtime::Registry::Get("auto_scheduler.relay_integration.auto_schedule_topi_compute"); ICHECK(fauto_schedule != nullptr) << "auto_scheduler.relay_integration.auto_schedule_topi_compute is not registered"; - ObjectRef obj = (*fauto_schedule)(tensor_outs); + ObjectRef obj = (*fauto_schedule)(String(cache_node->func_name), tensor_outs); if (obj.defined()) { schedule = Downcast(obj); } @@ -262,7 +262,7 @@ class ScheduleGetter : public backend::MemoizedExprTranslator> ICHECK(tuple_type) << "Expect output to be a tuple type"; ICHECK_EQ(tuple_type->fields.size(), outputs.size()); } - // Set the name to `__copy`. It will be detected in graph runtime to perform + // Set the name to `__copy`. It will be detected in graph executor to perform // data copy across devices. if (op == device_copy_op_) { readable_name_stream_.str(std::string()); diff --git a/src/relay/backend/contrib/arm_compute_lib/codegen.cc b/src/relay/backend/contrib/arm_compute_lib/codegen.cc index e0669ae64bdb..8098c8d51274 100644 --- a/src/relay/backend/contrib/arm_compute_lib/codegen.cc +++ b/src/relay/backend/contrib/arm_compute_lib/codegen.cc @@ -376,12 +376,12 @@ runtime::Module ACLCompiler(const ObjectRef& ref) { TVM_REGISTER_GLOBAL("relay.ext.arm_compute_lib").set_body_typed(ACLCompiler); /*! - * \brief Check whether ACL graph runtime is used. + * \brief Check whether ACL graph executor is used. * - * \return True if ACL graph runtime is enabled, False if not. + * \return True if ACL graph executor is enabled, False if not. */ inline constexpr bool IsACLRuntimeEnabled() { -#if TVM_GRAPH_RUNTIME_ARM_COMPUTE_LIB +#if TVM_GRAPH_EXECUTOR_ARM_COMPUTE_LIB return true; #else return false; diff --git a/src/relay/backend/contrib/bnns/codegen.cc b/src/relay/backend/contrib/bnns/codegen.cc new file mode 100644 index 000000000000..72c32fb5b19e --- /dev/null +++ b/src/relay/backend/contrib/bnns/codegen.cc @@ -0,0 +1,215 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file + * \brief Implementation of BNNS codegen APIs. + */ + +#include +#include +#include +#include + +#include +#include + +#include "../../../../runtime/contrib/json/json_node.h" +#include "../../utils.h" +#include "../codegen_json/codegen_json.h" + +namespace tvm { +namespace relay { +namespace contrib { + +using namespace backend; + +/*! 
+ * \brief Retrieve the expected "root" op nested inside a fused call, such as conv2d in + * relu(add(conv2d)) + * \param call A Relay call node. Typically nn.relu when called the first time. + * \param max_depth The maximum number of calls before the root op, counting from current_call. + * \param root_name The name of expected "root" op in this fused call. + * \return A CallNode corresponding to the root op + */ +inline const CallNode* FindCallWithName(const CallNode* current_call, int max_depth, + const std::string& root_name) { + ICHECK(current_call && max_depth >= 0); + + if (max_depth == 0) { + ICHECK(current_call && IsOp(current_call, root_name)); + return current_call; + } + if (IsOp(current_call, root_name)) { + return current_call; + } + + ICHECK_GT(current_call->args.size(), 0); + + const auto* next_call = current_call->args[0].as(); + return FindCallWithName(next_call, max_depth - 1, root_name); +} + +class BNNSJSONSerializer : public backend::contrib::JSONSerializer { + using JSONGraphNode = tvm::runtime::json::JSONGraphNode; + using JSONGraphNodeEntry = tvm::runtime::json::JSONGraphNodeEntry; + + public: + BNNSJSONSerializer(const std::string& symbol, const Expr& expr) : JSONSerializer(symbol, expr) {} + + std::vector VisitExpr_(const CallNode* cn) override { + Expr expr = GetRef(cn); + std::string name; + const CallNode* call = cn; + if (const auto* op_node = cn->op.as()) { + name = op_node->name; + } else if (const auto* fn = cn->op.as()) { + auto comp = fn->GetAttr(attr::kComposite); + ICHECK(comp.defined()) << "BNNS JSON runtime only supports composite functions."; + name = comp.value(); + + auto body = fn->body.as(); + if (name == "bnns.conv2d_bias_relu") { + auto add_op_type = IsOp(body->args[0].as(), "add") ? "add" : "nn.bias_add"; + call = GetRootCall(body, 2, {"nn.conv2d", add_op_type, "nn.relu"}); + } else if (name == "bnns.conv2d_bias") { + auto add_op_type = IsOp(body, "add") ? "add" : "nn.bias_add"; + call = GetRootCall(body, 1, {"nn.conv2d", add_op_type}); + } else if (name == "bnns.conv2d_relu") { + call = GetRootCall(body, 1, {"nn.conv2d", "nn.relu"}); + ICHECK(call->op.as()) << "Not op node"; + } else if (name == "bnns.conv2d_bias_sigmoid") { + auto add_op_type = IsOp(body->args[0].as(), "add") ? "add" : "nn.bias_add"; + call = GetRootCall(body, 2, {"nn.conv2d", add_op_type, "sigmoid"}); + ICHECK(call->op.as()) << "Not op node"; + } else if (name == "bnns.conv2d_sigmoid") { + call = GetRootCall(body, 1, {"nn.conv2d", "sigmoid"}); + ICHECK(call->op.as()) << "Not op node"; + } else if (name == "bnns.dense_bias") { + call = GetRootCall(fn->body.as(), 1, {"nn.dense", "add"}); + } else if (name == "bnns.dense_bias_gelu") { + call = FindCallWithName(fn->body.as(), 10, "nn.dense"); + } else { + LOG(FATAL) << "Unrecognized BNNS pattern: " << name; + } + } else { + LOG(FATAL) << "BNNS JSON runtime does not support calls to " << cn->op->GetTypeKey(); + } + + std::vector inputs; + for (const auto& arg : cn->args) { + auto res = VisitExpr(arg); + inputs.insert(inputs.end(), res.begin(), res.end()); + } + auto node = std::make_shared(name, /* name_ */ + "kernel", /* op_type_ */ + inputs, 1 /* num_outputs_ */); + SetCallNodeAttribute(node, call); + return AddNode(node, GetRef(cn)); + } +}; + +/*! + * \brief The external compiler/codegen tool. It takes a Relay expression/module and + * compile it into a runtime module. 
+ */ +runtime::Module BNNSCompiler(const ObjectRef& ref) { + ICHECK(ref->IsInstance()); + auto func = Downcast(ref); + auto func_name = GetExtSymbol(func); + BNNSJSONSerializer serializer(func_name, func); + serializer.serialize(); + std::string graph_json = serializer.GetJSON(); + auto params = serializer.GetParams(); + + const auto* pf = runtime::Registry::Get("runtime.BNNSJSONRuntimeCreate"); + ICHECK(pf != nullptr) << "Cannot find JSON runtime module to create"; + auto mod = (*pf)(func_name, graph_json, params); + return mod; +} + +TVM_REGISTER_GLOBAL("relay.ext.bnns").set_body_typed(BNNSCompiler); + +/** + * \brief A helper to expand the params by adding ones which used by BNNS runtime + * for a given expression. Same as default ConstantUpdater but skip constant from + * essential BNNS composed function ops. + */ +struct BNNSConstantUpdater : public ConstantUpdater { + public: + BNNSConstantUpdater(const std::string& symbol, + std::unordered_map* params, + const std::vector& skip_mask) + : ConstantUpdater(symbol, params), skip_mask_(skip_mask) {} + using ConstantUpdater::VisitExpr_; + + /**! + * Like an original implementation but avoid visiting of body nodes + * for BNNS specific composite primitives. + */ + void VisitExpr_(const FunctionNode* op) final { + this->VisitSpan(op->span); + for (auto param : op->params) { + this->VisitExpr(param); + } + + if (!isBNNSSpecificCompositeFunc(op)) { + this->VisitExpr(op->body); + } + } + + private: + bool isBNNSSpecificCompositeFunc(const FunctionNode* op) { + auto comp = op->GetAttr(attr::kComposite); + if (!comp) return false; + + auto comp_name = comp.value(); + + bool is_match = false; + for (const auto& mask : skip_mask_) { + if (std::string(comp_name).substr(0, mask.size()) == mask) { + is_match = true; + break; + } + } + return is_match; + } + + std::vector skip_mask_; +}; + +Map BNNSConstantUpdaterFunc(Expr expr, std::string symbol) { + std::vector bnns_composite_filter = {"bnns."}; + + // Visit all suitable constant nodes + std::unordered_map res; + BNNSConstantUpdater const_updater(symbol, &res, bnns_composite_filter); + const_updater(expr); + + // Convert to tvm::Map + Map ret; + for (const auto& kvp : res) ret.Set(kvp.first, kvp.second); + return ret; +} + +TVM_REGISTER_GLOBAL("relay.ext.bnns.constant_updater").set_body_typed(BNNSConstantUpdaterFunc); + +} // namespace contrib +} // namespace relay +} // namespace tvm diff --git a/src/relay/backend/contrib/codegen_c/codegen_c.h b/src/relay/backend/contrib/codegen_c/codegen_c.h index af835cfca02e..b81fd14b99c2 100644 --- a/src/relay/backend/contrib/codegen_c/codegen_c.h +++ b/src/relay/backend/contrib/codegen_c/codegen_c.h @@ -343,6 +343,8 @@ class CodegenCBase { std::string dtype; if (runtime::TypeMatch(ttype->dtype, kDLFloat, 32)) { dtype = "float"; + } else if (runtime::TypeMatch(ttype->dtype, kDLFloat, 16)) { + dtype = "half"; } else if (runtime::TypeMatch(ttype->dtype, kDLInt, 32)) { dtype = "int"; } else if (runtime::TypeMatch(ttype->dtype, kDLInt, 64)) { diff --git a/src/relay/backend/contrib/ethosn/codegen.cc b/src/relay/backend/contrib/ethosn/codegen.cc index 5e052b3e4fd6..dab0e6c42f80 100644 --- a/src/relay/backend/contrib/ethosn/codegen.cc +++ b/src/relay/backend/contrib/ethosn/codegen.cc @@ -198,7 +198,7 @@ sl::TensorsAndId MakeOps(const sl::TensorAndId& op) { NetworkWithIDs ConstructNetworkVisitor::Construct(const Function& func) { // Initialise everything -#if _ETHOSN_API_VERSION_ == 2011 +#if _ETHOSN_API_VERSION_ >= 2011 auto ctx = transform::PassContext::Current(); 
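A few hunks above, BNNSConstantUpdater skips the bodies of composite functions whose kComposite name carries a registered prefix ("bnns."), so their constants are not hoisted as graph params. The test reduces to a prefix match; standalone sketch (MatchesAnyPrefix is an illustrative name):

    #include <iostream>
    #include <string>
    #include <vector>

    // Equivalent to the patch's name.substr(0, mask.size()) == mask, per mask.
    bool MatchesAnyPrefix(const std::string& name, const std::vector<std::string>& masks) {
      for (const auto& mask : masks) {
        if (name.compare(0, mask.size(), mask) == 0) return true;
      }
      return false;
    }

    int main() {
      const std::vector<std::string> masks = {"bnns."};
      std::cout << MatchesAnyPrefix("bnns.conv2d_bias_relu", masks) << "\n";  // 1
      std::cout << MatchesAnyPrefix("dnnl.conv2d", masks) << "\n";            // 0
    }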
auto cfg = ctx->GetConfig("relay.ext.ethos-n.options"); if (!cfg.defined()) { @@ -206,7 +206,7 @@ NetworkWithIDs ConstructNetworkVisitor::Construct(const Function& func) { } #endif NetworkWithIDs network_with_ids; -#if _ETHOSN_API_VERSION_ == 2011 +#if _ETHOSN_API_VERSION_ >= 2011 network_ = sl::CreateNetwork(variants[cfg.value()->variant]); #else network_ = sl::CreateNetwork(); @@ -572,7 +572,7 @@ sl::CompilationOptions EthosnCompiler::CreateOptions() { cfg = AttrsWithDefaultValues(); } -#if _ETHOSN_API_VERSION_ == 2011 +#if _ETHOSN_API_VERSION_ >= 2011 sl::CompilationOptions options; #else sl::CompilationOptions options(variants[cfg.value()->variant]); @@ -619,7 +619,7 @@ std::pair, std::vector> EthosnCompiler::GetInput return std::make_pair(input_order, output_order); } -#if _ETHOSN_API_VERSION_ == 2011 +#if _ETHOSN_API_VERSION_ >= 2011 auto ctx = transform::PassContext::Current(); auto cfg = ctx -> GetConfig("relay.ext.ethos-n.options").defined() ? ctx -> GetConfig("relay.ext.ethos-n.options") @@ -632,7 +632,7 @@ TVM_REGISTER_GLOBAL("relay.ethos-n.support.conv2d") Call call = args[0]; ConvolutionParams params; auto err = EthosnAPI::QnnConv2d(call, ¶ms); -#if _ETHOSN_API_VERSION_ == 2011 +#if _ETHOSN_API_VERSION_ >= 2011 if (params.is_depthwise) { *rv = !err && m_Queries.IsDepthwiseConvolutionSupported(params.bias_info, params.weights_info, @@ -657,7 +657,7 @@ TVM_REGISTER_GLOBAL("relay.ethos-n.support.fc") Call call = args[0]; FullyConnectedParams params; auto err = EthosnAPI::QnnFullyConnected(call, ¶ms); -#if _ETHOSN_API_VERSION_ == 2011 +#if _ETHOSN_API_VERSION_ >= 2011 *rv = !err && m_Queries.IsFullyConnectedSupported(params.bias_info, params.weights_info, params.fc_info, params.input_info); #else @@ -671,7 +671,7 @@ TVM_REGISTER_GLOBAL("relay.ethos-n.support.max_pool2d") Call call = args[0]; MaxPool2DParams params; auto err = EthosnAPI::MaxPool2D(call, ¶ms); -#if _ETHOSN_API_VERSION_ == 2011 +#if _ETHOSN_API_VERSION_ >= 2011 *rv = !err && m_Queries.IsPoolingSupported(params.pool_info, params.input_info); #else *rv = !err && sl::IsPoolingSupported(params.pool_info, params.input_info); @@ -683,7 +683,7 @@ TVM_REGISTER_GLOBAL("relay.ethos-n.support.avg_pool2d") Call call = args[0]; AvgPool2DParams params; auto err = EthosnAPI::AvgPool2D(call, ¶ms); -#if _ETHOSN_API_VERSION_ == 2011 +#if _ETHOSN_API_VERSION_ >= 2011 *rv = !err && m_Queries.IsPoolingSupported(params.pool_info, params.input_info); #else *rv = !err && sl::IsPoolingSupported(params.pool_info, params.input_info); @@ -695,7 +695,7 @@ TVM_REGISTER_GLOBAL("relay.ethos-n.support.reshape") Call call = args[0]; ReshapeParams params; auto err = EthosnAPI::Reshape(call, ¶ms); -#if _ETHOSN_API_VERSION_ == 2011 +#if _ETHOSN_API_VERSION_ >= 2011 *rv = !err && m_Queries.IsReshapeSupported(params.new_shape, params.input_info); #else *rv = !err && sl::IsReshapeSupported(params.new_shape, params.input_info); @@ -707,7 +707,7 @@ TVM_REGISTER_GLOBAL("relay.ethos-n.support.addition") Call call = args[0]; AdditionParams params; auto err = EthosnAPI::Addition(call, ¶ms); -#if _ETHOSN_API_VERSION_ == 2011 +#if _ETHOSN_API_VERSION_ >= 2011 *rv = !err && m_Queries.IsAdditionSupported(params.lhs_info, params.rhs_info, params.output_quantization_info); #else @@ -721,7 +721,7 @@ TVM_REGISTER_GLOBAL("relay.ethos-n.support.sigmoid") Call call = args[0]; SigmoidParams params; auto err = EthosnAPI::Sigmoid(call, ¶ms); -#if _ETHOSN_API_VERSION_ == 2011 +#if _ETHOSN_API_VERSION_ >= 2011 *rv = !err && m_Queries.IsSigmoidSupported(params.input_info); #else 
*rv = !err && sl::IsSigmoidSupported(params.input_info); @@ -733,7 +733,7 @@ TVM_REGISTER_GLOBAL("relay.ethos-n.support.concatenate") Call call = args[0]; ConcatenateParams params; auto err = EthosnAPI::Concatenate(call, ¶ms); -#if _ETHOSN_API_VERSION_ == 2011 +#if _ETHOSN_API_VERSION_ >= 2011 *rv = !err && m_Queries.IsConcatenationSupported(params.input_infos, params.concat_info); #else *rv = !err && sl::IsConcatenationSupported(params.input_infos, params.concat_info); @@ -745,7 +745,7 @@ TVM_REGISTER_GLOBAL("relay.ethos-n.support.split") Call call = args[0]; SplitParams params; auto err = EthosnAPI::Split(call, ¶ms); -#if _ETHOSN_API_VERSION_ == 2011 +#if _ETHOSN_API_VERSION_ >= 2011 *rv = !err && m_Queries.IsSplitSupported(params.input_info, params.split_info); #else *rv = !err && sl::IsSplitSupported(params.input_info, params.split_info); @@ -757,7 +757,7 @@ TVM_REGISTER_GLOBAL("relay.ethos-n.support.depth_to_space") Call call = args[0]; DepthToSpaceParams params; auto err = EthosnAPI::DepthToSpace(call, ¶ms); -#if _ETHOSN_API_VERSION_ == 2011 +#if _ETHOSN_API_VERSION_ >= 2011 *rv = !err && m_Queries.IsDepthToSpaceSupported(params.input_info, params.depth_info); #else *rv = !err && sl::IsDepthToSpaceSupported(params.input_info, params.depth_info); @@ -769,7 +769,7 @@ TVM_REGISTER_GLOBAL("relay.ethos-n.support.relu") Call call = args[0]; ReluParams params; auto err = EthosnAPI::Relu(call, ¶ms); -#if _ETHOSN_API_VERSION_ == 2011 +#if _ETHOSN_API_VERSION_ >= 2011 *rv = !err && m_Queries.IsReluSupported(params.relu_info, params.input_info); #else *rv = !err && sl::IsReluSupported(params.relu_info, params.input_info); diff --git a/src/relay/backend/contrib/ethosn/ethosn_api_version.h b/src/relay/backend/contrib/ethosn/ethosn_api_version.h index 78f08950bb48..c975ee6e8ae8 100644 --- a/src/relay/backend/contrib/ethosn/ethosn_api_version.h +++ b/src/relay/backend/contrib/ethosn/ethosn_api_version.h @@ -29,6 +29,9 @@ * along with associated compatibility measures when no * longer necessary. */ +#if ETHOSN_SUPPORT_LIBRARY_VERSION_PATCH == 2 +#define _ETHOSN_API_VERSION_ 2102 +#else #ifndef ETHOSN_API_VERSION #define _ETHOSN_API_VERSION_ 2008 #elif ~(~ETHOSN_API_VERSION + 0) == 0 && ~(~ETHOSN_API_VERSION + 1) == 1 @@ -36,5 +39,6 @@ #else #define _ETHOSN_API_VERSION_ ETHOSN_API_VERSION #endif +#endif #endif // TVM_RELAY_BACKEND_CONTRIB_ETHOSN_ETHOSN_API_VERSION_H_ diff --git a/src/relay/backend/contrib/tensorrt/codegen.cc b/src/relay/backend/contrib/tensorrt/codegen.cc index 059dbc192a04..e121b6010ad8 100644 --- a/src/relay/backend/contrib/tensorrt/codegen.cc +++ b/src/relay/backend/contrib/tensorrt/codegen.cc @@ -32,7 +32,7 @@ #include "../../utils.h" #include "../codegen_json/codegen_json.h" -#if TVM_GRAPH_RUNTIME_TENSORRT +#if TVM_GRAPH_EXECUTOR_TENSORRT #include "NvInfer.h" #endif @@ -217,15 +217,15 @@ runtime::Module TensorRTCompiler(const ObjectRef& ref) { TVM_REGISTER_GLOBAL("relay.ext.tensorrt").set_body_typed(TensorRTCompiler); /*! - * \brief Check whether TensorRT graph runtime is enabled. + * \brief Check whether TensorRT graph executor is enabled. * \return True if enabled, False if not. */ inline constexpr bool IsTensorRTRuntimeEnabled() { -#if TVM_GRAPH_RUNTIME_TENSORRT +#if TVM_GRAPH_EXECUTOR_TENSORRT return true; #else return false; -#endif // TVM_GRAPH_RUNTIME_TENSORRT +#endif // TVM_GRAPH_EXECUTOR_TENSORRT } /*! @@ -234,11 +234,11 @@ inline constexpr bool IsTensorRTRuntimeEnabled() { * runtime is not enabled. 
*/ Array GetTensorRTVersion() { -#if TVM_GRAPH_RUNTIME_TENSORRT +#if TVM_GRAPH_EXECUTOR_TENSORRT return {Integer(NV_TENSORRT_MAJOR), Integer(NV_TENSORRT_MINOR), Integer(NV_TENSORRT_PATCH)}; #else return {}; -#endif // TVM_GRAPH_RUNTIME_TENSORRT +#endif // TVM_GRAPH_EXECUTOR_TENSORRT } TVM_REGISTER_GLOBAL("relay.op.is_tensorrt_runtime_enabled") diff --git a/src/relay/backend/graph_runtime_codegen.cc b/src/relay/backend/graph_executor_codegen.cc similarity index 96% rename from src/relay/backend/graph_runtime_codegen.cc rename to src/relay/backend/graph_executor_codegen.cc index 7ed150495104..72989b5ba46a 100644 --- a/src/relay/backend/graph_runtime_codegen.cc +++ b/src/relay/backend/graph_executor_codegen.cc @@ -19,7 +19,7 @@ /*! * \file relay/backend/graph_codegen.cc - * \brief Graph runtime codegen + * \brief Graph executor codegen */ #include @@ -181,10 +181,10 @@ class GraphOpNode : public GraphNode { const std::string op_type_name_{"tvm_op"}; }; -/*! \brief Code generator for graph runtime */ -class GraphRuntimeCodegen : public backend::MemoizedExprTranslator> { +/*! \brief Code generator for graph executor */ +class GraphExecutorCodegen : public backend::MemoizedExprTranslator> { public: - GraphRuntimeCodegen(runtime::Module* mod, const TargetsMap& targets) : mod_(mod) { + GraphExecutorCodegen(runtime::Module* mod, const TargetsMap& targets) : mod_(mod) { compile_engine_ = CompileEngine::Global(); targets_ = targets; } @@ -541,7 +541,7 @@ class GraphRuntimeCodegen : public backend::MemoizedExprTranslator& sptr_to_self) { if (name == "init") { return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { @@ -573,8 +573,8 @@ class GraphRuntimeCodegenModule : public runtime::ModuleNode { ICHECK(dev_type); targets[dev_type->value] = it.second; } - codegen_ = - std::make_shared(reinterpret_cast(mod), targets); + codegen_ = std::make_shared(reinterpret_cast(mod), + targets); }); } else if (name == "codegen") { return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { @@ -619,19 +619,19 @@ class GraphRuntimeCodegenModule : public runtime::ModuleNode { } } - const char* type_key() const final { return "RelayGraphRuntimeCodegenModule"; } + const char* type_key() const final { return "RelayGraphExecutorCodegenModule"; } private: - std::shared_ptr codegen_; + std::shared_ptr codegen_; LoweredOutput output_; }; runtime::Module CreateGraphCodegenMod() { - auto ptr = make_object(); + auto ptr = make_object(); return runtime::Module(ptr); } -TVM_REGISTER_GLOBAL("relay.build_module._GraphRuntimeCodegen") +TVM_REGISTER_GLOBAL("relay.build_module._GraphExecutorCodegen") .set_body([](TVMArgs args, TVMRetValue* rv) { *rv = CreateGraphCodegenMod(); }); } // namespace backend diff --git a/src/relay/backend/graph_plan_memory.cc b/src/relay/backend/graph_plan_memory.cc index 15173c2c79db..4260f052d2c0 100644 --- a/src/relay/backend/graph_plan_memory.cc +++ b/src/relay/backend/graph_plan_memory.cc @@ -20,7 +20,7 @@ /*! * \file relay/backend/graph_plan_memory.cc * \brief Memory index assignment pass for executing - * the program in the graph runtime. + * the program in the graph executor. */ #include #include @@ -42,7 +42,7 @@ struct StorageToken { /*! \brief The corresponding tensor type node. */ const TensorTypeNode* ttype{nullptr}; /*! \brief virtual device index that corresponds to the device_type in - * DLContext. */ + * DLDevice. */ int device_type{0}; /*! 
\brief The storage id */ int64_t storage_id{-1}; diff --git a/src/relay/backend/interpreter.cc b/src/relay/backend/interpreter.cc index 993fb1a62787..eeba010dc164 100644 --- a/src/relay/backend/interpreter.cc +++ b/src/relay/backend/interpreter.cc @@ -212,8 +212,8 @@ InterpreterState::InterpreterState(Expr current_expr, InterpreterState::Stack st class Interpreter : public ExprFunctor, PatternFunctor { public: - Interpreter(IRModule mod, DLContext context, Target target) - : mod_(mod), context_(context), target_(target), debug_op_(Op::Get("debug")) { + Interpreter(IRModule mod, Device device, Target target) + : mod_(mod), device_(device), target_(target), debug_op_(Op::Get("debug")) { engine_ = CompileEngine::Global(); } @@ -243,7 +243,7 @@ class Interpreter : public ExprFunctor, return ObjectRef(); } - ObjectRef VisitExpr_(const ConstantNode* op) final { return op->data.CopyTo(context_); } + ObjectRef VisitExpr_(const ConstantNode* op) final { return op->data.CopyTo(device_); } ObjectRef VisitExpr_(const TupleNode* op) final { std::vector values; @@ -294,9 +294,9 @@ class Interpreter : public ExprFunctor, std::vector inputs(cfunc->inputs.size()); std::vector outputs(cfunc->outputs.size()); - DLContext cpu_ctx; - cpu_ctx.device_type = kDLCPU; - cpu_ctx.device_id = 0; + Device cpu_dev; + cpu_dev.device_type = kDLCPU; + cpu_dev.device_id = 0; auto fset_input = [&](size_t i, ObjectRef val, bool need_shape) { auto nd_array = Downcast(val); @@ -304,9 +304,9 @@ class Interpreter : public ExprFunctor, int64_t ndim = nd_array.Shape().size(); NDArray shape_arr; if (ndim == 0) { - shape_arr = NDArray::Empty({}, DataType::Int(64), cpu_ctx); + shape_arr = NDArray::Empty({}, DataType::Int(64), cpu_dev); } else { - shape_arr = NDArray::Empty({ndim}, DataType::Int(64), cpu_ctx); + shape_arr = NDArray::Empty({ndim}, DataType::Int(64), cpu_dev); int64_t* data = reinterpret_cast(shape_arr->data); for (auto j = 0; j < ndim; ++j) { data[j] = nd_array.Shape()[j]; @@ -315,7 +315,7 @@ class Interpreter : public ExprFunctor, inputs[i] = shape_arr; setter(i, shape_arr); } else { - auto arr = nd_array.CopyTo(cpu_ctx); + auto arr = nd_array.CopyTo(cpu_dev); inputs[i] = arr; setter(i, arr); } @@ -354,7 +354,7 @@ class Interpreter : public ExprFunctor, const TensorTypeNode* rtype = val_type.as(); ICHECK(rtype != nullptr); int64_t ndim = rtype->shape.size(); - auto arr = NDArray::Empty({ndim}, DataType::Int(64), cpu_ctx); + auto arr = NDArray::Empty({ndim}, DataType::Int(64), cpu_dev); outputs[i] = arr; setter(arg_counter + i, arr); }; @@ -438,9 +438,9 @@ class Interpreter : public ExprFunctor, auto fset_input = [&](size_t i, ObjectRef val) { const auto nd_array = Downcast(val); setter(i, nd_array); - DLContext arg_ctx = nd_array->ctx; - ICHECK(arg_ctx.device_type == context_.device_type && arg_ctx.device_id == context_.device_id) - << "Interpreter expect context to be " << context_ << ", but get " << arg_ctx; + Device arg_dev = nd_array->device; + ICHECK(arg_dev.device_type == device_.device_type && arg_dev.device_id == device_.device_id) + << "Interpreter expect device to be " << device_ << ", but get " << arg_dev; }; int arg_counter = 0; @@ -470,7 +470,7 @@ class Interpreter : public ExprFunctor, shape.push_back(ivalue[0]); } DLDataType dtype = rtype->dtype; - NDArray nd_array = NDArray::Empty(shape, dtype, context_); + NDArray nd_array = NDArray::Empty(shape, dtype, device_); setter(num_inputs + i, nd_array); return nd_array; }; @@ -603,10 +603,10 @@ class Interpreter : public ExprFunctor, ObjectRef v = 
Eval(op->cond); if (v->IsInstance()) { auto nd_array = Downcast(v); - DLContext cpu_ctx; - cpu_ctx.device_type = kDLCPU; - cpu_ctx.device_id = 0; - NDArray cpu_array = nd_array.CopyTo(cpu_ctx); + Device cpu_dev; + cpu_dev.device_type = kDLCPU; + cpu_dev.device_id = 0; + NDArray cpu_array = nd_array.CopyTo(cpu_dev); ICHECK_EQ(DataType(cpu_array->dtype), DataType::Bool()); // TODO(@jroesch, @MK): Refactor code into helper from DCE. if (reinterpret_cast(cpu_array->data)[0]) { @@ -704,7 +704,7 @@ class Interpreter : public ExprFunctor, IRModule mod_; // For simplicity we only run the interpreter on a single context. // Context to run the interpreter on. - DLContext context_; + Device device_; // Target parameter being used by the interpreter. Target target_; // Object stack. @@ -715,7 +715,7 @@ class Interpreter : public ExprFunctor, const Op& debug_op_; }; -TypedPackedFunc CreateInterpreter(IRModule mod, DLContext context, Target target) { +TypedPackedFunc CreateInterpreter(IRModule mod, Device device, Target target) { if (mod.defined()) { // eta expand to support constructors in argument position transform::Sequential seq({transform::EtaExpand( @@ -727,7 +727,7 @@ TypedPackedFunc CreateInterpreter(IRModule mod, DLContext conte mod = seq(mod); } - auto intrp = std::make_shared(mod, context, target); + auto intrp = std::make_shared(mod, device, target); auto packed = [intrp](Expr expr) { auto f = DetectFeature(expr); ICHECK(f.is_subset_of(FeatureSet::All() - fGraph)); diff --git a/src/relay/backend/vm/compiler.cc b/src/relay/backend/vm/compiler.cc index af58a8a2747f..f734bb3ba126 100644 --- a/src/relay/backend/vm/compiler.cc +++ b/src/relay/backend/vm/compiler.cc @@ -33,8 +33,8 @@ #include #include #include +#include #include -#include #include #include @@ -1176,7 +1176,7 @@ void VMCompiler::Codegen() { } ExprDeviceMap VMCompiler::AnalyzeContext() const { - TVMContext default_device; + Device default_device; ExprDeviceMap expr_device_map; if (targets_.size() > 1) { int fallback_dev = GetFallbackDevice(); diff --git a/src/relay/backend/vm/compiler.h b/src/relay/backend/vm/compiler.h index 615a8181b387..3a3796373a61 100644 --- a/src/relay/backend/vm/compiler.h +++ b/src/relay/backend/vm/compiler.h @@ -29,8 +29,8 @@ #include #include #include +#include #include -#include #include #include @@ -62,7 +62,7 @@ using GlobalMap = NodeMap; using ConstMap = NodeMap; using ConstTensorShapeMap = NodeMap>; using TargetsMap = Map; -using ExprDeviceMap = std::unordered_map; +using ExprDeviceMap = std::unordered_map; struct VMCompilerContext { // The module context for the compilation diff --git a/src/relay/backend/vm/inline_primitives.cc b/src/relay/backend/vm/inline_primitives.cc index eb848eb7a828..05fb2a120620 100644 --- a/src/relay/backend/vm/inline_primitives.cc +++ b/src/relay/backend/vm/inline_primitives.cc @@ -25,7 +25,7 @@ #include #include #include -#include +#include #include #include diff --git a/src/relay/backend/vm/lambda_lift.cc b/src/relay/backend/vm/lambda_lift.cc index cc530a10188e..c768a2c300ec 100644 --- a/src/relay/backend/vm/lambda_lift.cc +++ b/src/relay/backend/vm/lambda_lift.cc @@ -28,7 +28,7 @@ #include #include #include -#include +#include #include #include diff --git a/src/relay/backend/vm/removed_unused_funcs.cc b/src/relay/backend/vm/removed_unused_funcs.cc index cdf898fca756..5e9b1b7978f9 100644 --- a/src/relay/backend/vm/removed_unused_funcs.cc +++ b/src/relay/backend/vm/removed_unused_funcs.cc @@ -26,7 +26,7 @@ #include #include #include -#include +#include #include #include 
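The interpreter hunks above settle on one pattern for branching on a device-resident condition: copy the scalar to the host first, then read it as a byte. A hedged sketch of that pattern, assuming TVM's NDArray API as it appears in this series (ReadBoolScalar is an illustrative helper, not code from the patch):

    #include <cstdint>
    #include <tvm/runtime/ndarray.h>

    bool ReadBoolScalar(const tvm::runtime::NDArray& cond) {
      DLDevice cpu_dev{kDLCPU, 0};                        // host landing pad
      tvm::runtime::NDArray host = cond.CopyTo(cpu_dev);  // device -> host copy
      return static_cast<uint8_t*>(host->data)[0] != 0;   // bool stored as a byte
    }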
diff --git a/src/relay/op/make_op.h b/src/relay/op/make_op.h index 79f7e135e29d..36a5ec1c0e72 100644 --- a/src/relay/op/make_op.h +++ b/src/relay/op/make_op.h @@ -75,6 +75,8 @@ Expr MakeSqueeze(Expr data, Array axis); Expr MakeStack(Expr data, int axis); +Expr MakeTranspose(Expr data, Array axes); + Expr MakeStridedSlice(Expr data, Array begin, Array end, Array strides, String slice_mode); diff --git a/src/relay/op/memory/memory.cc b/src/relay/op/memory/memory.cc index 287564ba4f21..c2997fb6cf95 100644 --- a/src/relay/op/memory/memory.cc +++ b/src/relay/op/memory/memory.cc @@ -48,11 +48,11 @@ TVM_REGISTER_NODE_TYPE(AllocTensorAttrs); // The passing value in attrs and args doesn't seem super great. // We should consider a better solution, i.e the type relation // being able to see the arguments as well? -Expr AllocStorage(Expr size, Expr alignment, TVMContext ctx, DataType dtype_hint) { +Expr AllocStorage(Expr size, Expr alignment, Device dev, DataType dtype_hint) { auto attrs = make_object(); attrs->dtype = dtype_hint; - attrs->device_id = ctx.device_id; - attrs->device_type = ctx.device_type; + attrs->device_id = dev.device_id; + attrs->device_type = dev.device_type; static const Op& op = Op::Get("memory.alloc_storage"); return Call(op, {size, alignment}, Attrs(attrs), {}); } diff --git a/src/relay/op/memory/memory.h b/src/relay/op/memory/memory.h index 6e184507bad5..bbbd11867549 100644 --- a/src/relay/op/memory/memory.h +++ b/src/relay/op/memory/memory.h @@ -32,7 +32,7 @@ namespace tvm { namespace relay { -Expr AllocStorage(Expr size, Expr alignment, TVMContext ctx, DataType dtype_hint); +Expr AllocStorage(Expr size, Expr alignment, Device dev, DataType dtype_hint); Expr DeviceCopy(Expr data, int src_dev_type, int dst_dev_type); Expr AllocTensor(Expr storage, Expr offset, tvm::relay::Expr shape, DataType dtype, Array assert_shape); diff --git a/src/relay/op/nn/convolution.h b/src/relay/op/nn/convolution.h index 2a49a2e251f8..379fa3fa71d3 100644 --- a/src/relay/op/nn/convolution.h +++ b/src/relay/op/nn/convolution.h @@ -25,7 +25,7 @@ #define TVM_RELAY_OP_NN_CONVOLUTION_H_ #include -#include +#include #include #include diff --git a/src/relay/op/nn/nn.cc b/src/relay/op/nn/nn.cc index 0ea71de367fa..b2404cc1954b 100644 --- a/src/relay/op/nn/nn.cc +++ b/src/relay/op/nn/nn.cc @@ -590,6 +590,7 @@ The whole array is rescaled by ``1/(1-p)`` to keep the expected sum of the input .set_num_inputs(1) .add_argument("data", "Tensor", "Input to which dropout will be applied.") .set_support_level(1) + .set_attr("TOpPattern", kOpaque) .set_attr("FInferCorrectLayout", ElemwiseArbitraryLayout) .add_type_rel("Dropout", DropoutRel) .set_attr("TOpIsStateful", true); diff --git a/src/relay/op/nn/sparse.cc b/src/relay/op/nn/sparse.cc index 6322cfffd7c2..b1a16f18b623 100644 --- a/src/relay/op/nn/sparse.cc +++ b/src/relay/op/nn/sparse.cc @@ -196,5 +196,46 @@ RELAY_REGISTER_OP("nn.sparse_transpose") .set_support_level(1) .add_type_rel("SparseTranspose", SparseTransposeRel); +// relay.nn.sparse_add +bool SparseAddRel(const Array& types, int num_inputs, const Attrs& attrs, + const TypeReporter& reporter) { + ICHECK_EQ(types.size(), 5) << "expecting 4 inputs and 1 output."; + const auto* dense_data = types[0].as(); + const auto* sparse_data = types[1].as(); + ICHECK(reporter->Assert(sparse_data->dtype == dense_data->dtype)) + << "sparse tensor and dense tensor datatype should match."; + ICHECK(reporter->Assert(sparse_data->shape.size() == 1)) << "sparse data tensor should be 1D."; + const auto* sparse_indices = 
types[2].as(); + ICHECK(reporter->Assert(sparse_indices->shape.size() == 1)) + << "sparse indices tensor should be 1D."; + + reporter->Assign(types[4], TensorType(dense_data->shape, dense_data->dtype)); + return true; +} + +Expr MakeSparseAdd(Expr dense_data, Expr sparse_data, Expr sparse_indices, Expr sparse_indptr) { + static const Op& op = Op::Get("nn.sparse_add"); + return Call(op, {dense_data, sparse_data, sparse_indices, sparse_indptr}, Attrs(), {}); +} + +TVM_REGISTER_GLOBAL("relay.op.nn._make.sparse_add").set_body_typed(MakeSparseAdd); + +RELAY_REGISTER_OP("nn.sparse_add") + .describe(R"code(Add a dense matrix X with sparse matrix Y. + +- **dense**: `(M, N)` +- **sparse**: `(M, N)` + +- **out**: `(M, N)`. + +)code" TVM_ADD_FILELINE) + .set_num_inputs(4) + .add_argument("dense_data", "2D Tensor", "Dense data matrix.") + .add_argument("sparse_data", "1D Tensor", "Sparse data vector.") + .add_argument("sparse_indices", "1D Tensor", "Sparse indices vector.") + .add_argument("sparse_indptr", "1D Tensor", "Sparse index pointer vector.") + .set_support_level(1) + .add_type_rel("SparseAdd", SparseAddRel); + } // namespace relay } // namespace tvm diff --git a/src/relay/op/tensor/transform.cc b/src/relay/op/tensor/transform.cc index e3929bf8b77e..6fb9f77f99ea 100644 --- a/src/relay/op/tensor/transform.cc +++ b/src/relay/op/tensor/transform.cc @@ -312,7 +312,7 @@ bool StackRel(const Array& types, int num_inputs, const Attrs& attrs, if (first->shape[j].as() || e->shape[j].as() || reporter->AssertEQ(first->shape[j], e->shape[j])) continue; - throw Error( + throw CompileError( "relay.stack requires all tensors have the same shape " "on non-stacking axes"); } @@ -483,7 +483,7 @@ Array> TransposeInferCorrectLayout(const Attrs& attrs, } try { return Array>({{Layout(in_layout_str)}, {Layout(out_layout_str)}}); - } catch (const dmlc::Error& e) { + } catch (const tvm::Error& e) { // If the layout string is invalid for any reason, give up. 
return Array>({{Layout::Undef()}, {Layout::Undef()}}); } @@ -1691,8 +1691,8 @@ bool MeshgridRel(const Array& types, int num_inputs, const Attrs& raw_attr const MeshgridAttrs* attrs = raw_attrs.as(); const auto* tensor_tuple = types[0].as(); if (tensor_tuple == nullptr) { - throw Error( - ErrorBuilder() << "meshgrid requires a tuple of tensors as the first argument, found " + throw CompileError(ErrorBuilder() + << "meshgrid requires a tuple of tensors as the first argument, found " << PrettyPrint(types[0])); } else if (types[0].as() != nullptr) { return false; @@ -1714,14 +1714,14 @@ bool MeshgridRel(const Array& types, int num_inputs, const Attrs& raw_attr int e_ndim = static_cast(e->shape.size()); const DataType& e_dtype = e->dtype; if (e_dtype != dtype) { - throw Error("relay.meshgrid requires all tensors have the same dtype"); + throw CompileError("relay.meshgrid requires all tensors have the same dtype"); } if (e_ndim == 0) { grid_shape.emplace_back(1); } else if (e_ndim == 1) { grid_shape.emplace_back(e->shape[0]); } else { - throw Error("relay.meshgrid requires all tensors be either scalars or 1-D vectors."); + throw CompileError("relay.meshgrid requires all tensors be either scalars or 1-D vectors."); } } @@ -3772,20 +3772,20 @@ RELAY_REGISTER_OP("adv_index") .set_attr("TOpPattern", kInjective) .set_attr("FTVMCompute", AdvIndexCompute); -TVM_REGISTER_NODE_TYPE(CumsumAttrs); +TVM_REGISTER_NODE_TYPE(ScanopAttrs); -bool CumsumRel(const Array& types, int num_inputs, const Attrs& attrs, +bool ScanopRel(const Array& types, int num_inputs, const Attrs& attrs, const TypeReporter& reporter) { // types: [data, output] ICHECK_EQ(types.size(), 2) << "Expects two types, one for the input and another for the output"; const auto* data = types[0].as(); if (data == nullptr) { ICHECK(types[0].as()) - << "cumsum: expect input type to be TensorType but get " << types[0]; + << "Scanop: expect input type to be TensorType but get " << types[0]; return false; } - const auto* param = attrs.as(); + const auto* param = attrs.as(); auto dtype = param->dtype; if (dtype.is_void()) { @@ -3805,8 +3805,8 @@ bool CumsumRel(const Array& types, int num_inputs, const Attrs& attrs, return true; } -Expr MakeCumsum(Expr data, Integer axis, DataType dtype, Integer exclusive) { - auto attrs = make_object(); +Expr MakeCumsum(Expr data, Integer axis, DataType dtype, Bool exclusive) { + auto attrs = make_object(); attrs->dtype = dtype; attrs->axis = axis; attrs->exclusive = exclusive; @@ -3822,7 +3822,27 @@ RELAY_REGISTER_OP("cumsum") .set_num_inputs(1) .add_argument("data", "Tensor", "The input tensor.") .set_support_level(3) - .add_type_rel("Cumsum", CumsumRel) + .add_type_rel("Cumsum", ScanopRel) + .set_attr("TOpPattern", kOpaque); + +Expr MakeCumprod(Expr data, Integer axis, DataType dtype, Bool exclusive) { + auto attrs = make_object(); + attrs->dtype = dtype; + attrs->axis = axis; + attrs->exclusive = exclusive; + static const Op& op = Op::Get("cumprod"); + return Call(op, {data}, Attrs(attrs), {}); +} + +TVM_REGISTER_GLOBAL("relay.op._make.cumprod").set_body_typed(MakeCumprod); + +RELAY_REGISTER_OP("cumprod") + .describe( + R"doc(Return the cumulative product of the elements along a given axis.)doc" TVM_ADD_FILELINE) + .set_num_inputs(1) + .add_argument("data", "Tensor", "The input tensor.") + .set_support_level(3) + .add_type_rel("Cumprod", ScanopRel) .set_attr("TOpPattern", kOpaque); TVM_REGISTER_NODE_TYPE(UniqueAttrs); diff --git a/src/relay/op/tensor/transform.h b/src/relay/op/tensor/transform.h index 
dbf8537e0dad..3c670bcaaa51 100644 --- a/src/relay/op/tensor/transform.h +++ b/src/relay/op/tensor/transform.h @@ -78,8 +78,8 @@ bool ConcatenateRel(const Array& types, int num_inputs, const Attrs& attrs // Sanity check: axis int axis = param->axis; if (!(-ndim <= axis && axis < ndim)) { - throw Error(ErrorBuilder() << "concatenate only accepts `axis` in [-ndim, ndim)" - << ", but got axis = " << axis << ", and ndim = " << ndim); + throw CompileError(ErrorBuilder() << "concatenate only accepts `axis` in [-ndim, ndim)" + << ", but got axis = " << axis << ", and ndim = " << ndim); } axis = axis < 0 ? ndim + axis : axis; diff --git a/src/relay/op/type_relations.cc b/src/relay/op/type_relations.cc index 7b30aea2eb57..6e30ad9624c4 100644 --- a/src/relay/op/type_relations.cc +++ b/src/relay/op/type_relations.cc @@ -85,7 +85,7 @@ TensorType ConcreteBroadcast(const TensorType& t1, const TensorType& t2, DataTyp } else if (EqualCheck(s1, s2)) { oshape.push_back(s1); } else { - throw Error(ErrorBuilder() << "Incompatible broadcast type " << t1 << " and " << t2); + throw CompileError(ErrorBuilder() << "Incompatible broadcast type " << t1 << " and " << t2); } } diff --git a/src/relay/qnn/op/concatenate.cc b/src/relay/qnn/op/concatenate.cc index 59a519d66436..eb0f83836a54 100644 --- a/src/relay/qnn/op/concatenate.cc +++ b/src/relay/qnn/op/concatenate.cc @@ -51,9 +51,10 @@ bool QnnConcatenateRel(const Array& types, int num_inputs, const Attrs& at if (types[1].as()) { return false; } else { - throw Error(ErrorBuilder() - << "qnn concatenate requires a tuple of scales as the second argument, found " - << PrettyPrint(types[1])); + throw CompileError( + ErrorBuilder() + << "qnn concatenate requires a tuple of scales as the second argument, found " + << PrettyPrint(types[1])); } } for (const auto& input_scale : input_scales_tuple->fields) { @@ -68,9 +69,10 @@ bool QnnConcatenateRel(const Array& types, int num_inputs, const Attrs& at if (types[2].as()) { return false; } else { - throw Error(ErrorBuilder() - << "qnn concatenate requires a tuple of zero_points as the third argument, found " - << PrettyPrint(types[2])); + throw CompileError( + ErrorBuilder() + << "qnn concatenate requires a tuple of zero_points as the third argument, found " + << PrettyPrint(types[2])); } } for (const auto& input_zero_point : input_zero_points_tuple->fields) { diff --git a/src/relay/qnn/op/dequantize.cc b/src/relay/qnn/op/dequantize.cc index 724441e0c523..b0fe9356a758 100644 --- a/src/relay/qnn/op/dequantize.cc +++ b/src/relay/qnn/op/dequantize.cc @@ -53,7 +53,7 @@ bool DequantizeRel(const Array& types, int num_inputs, const Attrs& attrs, const auto* dequantize_attrs = attrs.as(); int axis = dequantize_attrs->axis; - axis = (axis == -1) ? data->shape.size() - 1 : axis; + axis = (axis < 0) ? 
data->shape.size() + axis : axis; ICHECK_LT(axis, static_cast(data->shape.size())) << "axis " << dequantize_attrs->axis << " is out of range"; ICHECK_GE(axis, 0) << "axis " << dequantize_attrs->axis << " is out of range"; @@ -81,7 +81,7 @@ Expr MakeDequantize(Expr data, Expr input_scale, Expr input_zero_point, int axis Expr DequantizeLower(const Expr& input_tensor, const Expr& input_scale, const Expr& input_zero_point, const Array& types, const DequantizeAttrs* attrs) { - const auto axis = attrs->axis; + auto axis = attrs->axis; ICHECK_EQ(types.size(), 4); auto in_type = types[0]; @@ -92,6 +92,11 @@ Expr DequantizeLower(const Expr& input_tensor, const Expr& input_scale, size_t n_dim = input_shape.size(); + // Wrap axis from negative to positive if needed. + if (axis < 0) { + axis = static_cast(n_dim) + axis; + } + // Expand scale and zero point if the input tensor is channel quantized auto expanded_input_scale = input_scale; if (!IsConstScalar(input_scale) && !IsScalarType(types[1])) { diff --git a/src/relay/qnn/op/quantize.cc b/src/relay/qnn/op/quantize.cc index 9829834f43a3..751abfc5ca81 100644 --- a/src/relay/qnn/op/quantize.cc +++ b/src/relay/qnn/op/quantize.cc @@ -19,8 +19,8 @@ /*! * \file src/relay/qnn/op/quantize.cc - * \brief QNN dequantize operator. Dequantize operator converts from quantized - * domain to unquantized domain. + * \brief QNN quantize operator. Quantize operator converts from unquantized + * domain to quantized domain. */ #include @@ -51,7 +51,7 @@ bool QuantizeRel(const Array& types, int num_inputs, const Attrs& attrs, const auto* quantize_attrs = attrs.as(); int axis = quantize_attrs->axis; - axis = (axis == -1) ? data->shape.size() - 1 : axis; + axis = (axis < 0) ? data->shape.size() + axis : axis; ICHECK_LT(axis, static_cast(data->shape.size())) << "axis " << quantize_attrs->axis << " is out of range"; ICHECK_GE(axis, 0) << "axis " << quantize_attrs->axis << " is out of range"; @@ -93,10 +93,15 @@ Expr QuantizeLower(const Expr& input_tensor, const Expr& output_scale, Array input_shape = in_tensor_type->shape; const auto out_dtype = attrs->out_dtype; - const auto axis = attrs->axis; + auto axis = attrs->axis; size_t n_dim = input_shape.size(); + // Wrap axis from negative to positive if needed. + if (axis < 0) { + axis = static_cast(n_dim) + axis; + } + auto expanded_output_scale = output_scale; if (!IsConstScalar(output_scale) && !IsScalarType(types[1])) { expanded_output_scale = ExpandBiasToMatchAxis(output_scale, n_dim, {axis}); diff --git a/src/relay/qnn/op/simulated_dequantize.cc b/src/relay/qnn/op/simulated_dequantize.cc new file mode 100644 index 000000000000..e1fc47d700c9 --- /dev/null +++ b/src/relay/qnn/op/simulated_dequantize.cc @@ -0,0 +1,80 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/*! + * \file src/relay/qnn/op/simulated_dequantize.cc + * \brief QNN simulated dequantize operator. Mimics the behavior + * of QNN dequantize in floating point with added flexibility. + */ + +#include +#include +#include + +#include "../../transforms/pattern_utils.h" +#include "../utils.h" + +namespace tvm { +namespace relay { +namespace qnn { + +bool SimulatedDequantizeRel(const Array& types, int num_inputs, const Attrs& attrs, + const TypeReporter& reporter) { + // types = [data_type, datatype_type, scale_type, zp_type, ret_type] + ICHECK_EQ(types.size(), 5); + const auto* data = types[0].as(); + const auto* dtype = types[1].as(); + + if ((data == nullptr) || (dtype == nullptr)) { + return false; + } + + // assign output type + reporter->Assign(types[4], TensorType(data->shape, data->dtype)); + return true; +} + +Expr MakeSimulatedDequantize(Expr data, Expr in_dtype, Expr input_scale, Expr input_zero_point, + int axis) { + auto attrs = make_object(); + attrs->axis = axis; + static const Op& op = Op::Get("qnn.simulated_dequantize"); + return Call(op, {data, in_dtype, input_scale, input_zero_point}, Attrs(attrs), {}); +} + +RELAY_REGISTER_OP("qnn.simulated_dequantize") + .describe(R"code(Simulates the functionality of qnn.dequantize but allows more flexible + dynamic input type conversion and always operates on float values. +)code" TVM_ADD_FILELINE) + .set_attrs_type() + .set_num_inputs(4) + .add_argument("data", "Tensor", "The tensor to dequantize.") + .add_argument("in_dtype", "Tensor", + "A code corresponding to the type of quantization to convert from.") + .add_argument("input_scale", "Tensor", "The quantization scale of the input tensor.") + .add_argument("input_zero_point", "Tensor", "The quantization zero_point of the input tensor.") + .set_support_level(11) + .add_type_rel("QNNSimulatedDequantize", SimulatedDequantizeRel); + +TVM_REGISTER_GLOBAL("relay.qnn.op._make.simulated_dequantize") + .set_body_typed(MakeSimulatedDequantize); + +} // namespace qnn +} // namespace relay +} // namespace tvm diff --git a/src/relay/qnn/op/simulated_quantize.cc b/src/relay/qnn/op/simulated_quantize.cc new file mode 100644 index 000000000000..089762a6ade0 --- /dev/null +++ b/src/relay/qnn/op/simulated_quantize.cc @@ -0,0 +1,82 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/relay/qnn/op/simulated_quantize.cc + * \brief QNN simulated quantize operator. Mimics the behavior + * of QNN quantize in floating point with added flexibility. 
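+ *
+ * Unlike qnn.quantize, the target dtype here is carried as a runtime
+ * tensor argument (a type code) rather than a static attribute; that is
+ * the added flexibility referred to above, letting one compiled graph
+ * simulate several quantization dtypes.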
+ */ + +#include +#include +#include + +#include "../../transforms/pattern_utils.h" +#include "../utils.h" + +namespace tvm { +namespace relay { +namespace qnn { + +TVM_REGISTER_NODE_TYPE(SimulatedQuantizeAttrs); + +bool SimulatedQuantizeRel(const Array& types, int num_inputs, const Attrs& attrs, + const TypeReporter& reporter) { + // types = [data_type, datatype_type, scale_type, zp_type, ret_type] + ICHECK_EQ(types.size(), 5); + const auto* data = types[0].as(); + const auto* dtype = types[1].as(); + + if ((data == nullptr) || (dtype == nullptr)) { + return false; + } + + // assign output type + reporter->Assign(types[4], TensorType(data->shape, data->dtype)); + return true; +} + +Expr MakeSimulatedQuantize(Expr data, Expr out_dtype, Expr output_scale, Expr output_zero_point, + int axis) { + auto attrs = make_object(); + attrs->axis = axis; + static const Op& op = Op::Get("qnn.simulated_quantize"); + return Call(op, {data, out_dtype, output_scale, output_zero_point}, Attrs(attrs), {}); +} + +RELAY_REGISTER_OP("qnn.simulated_quantize") + .describe(R"code(Simulates the functionality of qnn.quantize but allows more flexible + dynamic input type conversion and always outputs float values. +)code" TVM_ADD_FILELINE) + .set_attrs_type() + .set_num_inputs(4) + .add_argument("data", "Tensor", "The tensor to quantize.") + .add_argument("out_dtype", "Tensor", + "A code corresponding to the type of quantization to apply.") + .add_argument("output_scale", "Tensor", "The quantization scale of the output tensor.") + .add_argument("output_zero_point", "Tensor", + "The quantization zero_point of the output tensor.") + .set_support_level(11) + .add_type_rel("QNNSimulatedQuantize", SimulatedQuantizeRel); + +TVM_REGISTER_GLOBAL("relay.qnn.op._make.simulated_quantize").set_body_typed(MakeSimulatedQuantize); + +} // namespace qnn +} // namespace relay +} // namespace tvm diff --git a/src/relay/transforms/first_order_gradient.cc b/src/relay/transforms/first_order_gradient.cc new file mode 100644 index 000000000000..55714592ded7 --- /dev/null +++ b/src/relay/transforms/first_order_gradient.cc @@ -0,0 +1,309 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file first_order_gradient.cc + * \brief First-order Automatic Differentiation in Relay for pure dataflow graphs. 
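+ *
+ * "Pure dataflow" here means programs built from calls, tuples, and
+ * constants, without control flow, references, or closures; programs
+ * that use those features go through the higher-order AD pass in
+ * higher_order_gradient.cc instead.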
+ */
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "gradient.h"
+#include "let_list.h"
+#include "pass_utils.h"
+#include "pattern_utils.h"
+
+namespace tvm {
+namespace relay {
+
+template <typename F>
+Expr MultiFactory(const Type& t, F factory, DiagnosticContext diag_ctx) {
+  if (auto* tt = t.as<TensorTypeNode>()) {
+    return factory(tt->shape, tt->dtype);
+  } else if (auto* tt = t.as<TupleTypeNode>()) {
+    std::vector<Expr> res;
+    for (size_t i = 0; i < tt->fields.size(); i++) {
+      res.push_back(MultiFactory(tt->fields[i], factory, diag_ctx));
+    }
+    return Tuple(res);
+  } else {
+    diag_ctx.EmitFatal(Diagnostic::Error(t->span)
+                       << "could not build tensors using factory for type " << PrettyPrint(t));
+    throw;
+  }
+}
+
+template <typename F, typename F2>
+Expr MultiFactoryLike(const Expr& e, const Type& t, F factory, F2 factory_like,
+                      DiagnosticContext diag_ctx) {
+  if (t.as<TensorTypeNode>()) {
+    return factory_like(e);
+  } else if (auto* tt = t.as<TupleTypeNode>()) {
+    return MultiFactory(t, factory, diag_ctx);
+  } else {
+    diag_ctx.EmitFatal(Diagnostic::Error(t->span)
+                       << "could not build tensors using factory for type " << PrettyPrint(t));
+    throw;
+  }
+}
+
+/*! \brief A fragment of the program being built by the automatic differentiation
+ * pass.
+ */
+struct ADValueNode {
+  virtual ~ADValueNode() {}
+  template <typename T>
+  T& get() {
+    auto ret = dynamic_cast<T*>(this);
+    ICHECK(ret) << "cannot downcast";
+    return *ret;
+  }
+};
+
+using ADValue = std::shared_ptr<ADValueNode>;
+
+/*! \brief AD over a program which generates a tensor output. */
+struct ADTensor : ADValueNode {
+  Expr forward;
+  mutable Expr reverse;  // must be a variable to avoid duplication
+  ADTensor(LetList* ll, const Expr& forward, DiagnosticContext diag_ctx)
+      : forward(ll->Push(forward)),
+        reverse(ll->Push(
+            MultiFactoryLike(this->forward, forward->checked_type(), Zeros, ZerosLike, diag_ctx))) {
+    this->forward->checked_type_ = forward->checked_type();
+  }
+};
+
+/*! \brief A staged representation of the program, we reflect
+ * Relay functions into a function over fragments of AD. We
+ * can compute away this function to obtain a reverse mode program.
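+ *
+ * Illustrative sketch (not part of the original comment): for
+ * f(x) = x * x the pass stages roughly
+ *   let t0 = x * x;          // forward fragment (ADTensor)
+ *   let g  = ones_like(t0);  // seed adjoint for the output
+ *   let gx = g * x + g * x;  // FPrimalGradient of multiply, accumulated
+ * and the rewritten function returns (t0, (gx,)).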
+ */
+struct ADFunction : ADValueNode {
+  // (ad_args, orig) -> ad_ret
+  using ADFunctionType = ADValue(const std::vector<ADValue>&, const Call&);
+  std::function<ADFunctionType> func;
+  explicit ADFunction(const std::function<ADFunctionType>& func) : func(func) {}
+};
+
+struct FirstOrderReverseAD : ExprFunctor<ADValue(const Expr&)> {
+  const OpAttrMap<FPrimalGradient> rev_map = Op::GetAttrMap<FPrimalGradient>("FPrimalGradient");
+  std::vector<std::function<void(LetList* ll)>> backprop_actions;
+  // we assume no closure so no need for lexical scoping
+  std::unordered_map<Expr, ADValue, ObjectPtrHash, ObjectPtrEqual> env;
+  LetList* ll;
+  DiagnosticContext diag_ctx;
+
+  FirstOrderReverseAD(LetList* ll, DiagnosticContext diag_ctx) : ll(ll), diag_ctx(diag_ctx) {}
+
+  ADValue VisitExpr(const Expr& n) final {
+    if (env.count(n)) {
+      return env.at(n);
+    }
+    auto ret = ExprFunctor::VisitExpr(n);
+    env[n] = ret;
+    return ret;
+  }
+
+  static Expr LiftedAdd(const Type& t, const Expr& x, const Expr& y, LetList* ll) {
+    if (t.as<TensorTypeNode>()) {
+      return ll->Push(Add(x, y));
+    } else if (auto* tt = t.as<TupleTypeNode>()) {
+      Array<Expr> fields;
+      for (size_t i = 0; i < tt->fields.size(); ++i) {
+        fields.push_back(
+            LiftedAdd(tt->fields[i], ll->Push(GetField(x, i)), ll->Push(GetField(y, i)), ll));
+      }
+      return ll->Push(Tuple(fields));
+    } else {
+      LOG(FATAL) << "cannot lift addition for type " << PrettyPrint(t);
+      throw;
+    }
+  }
+
+  ADValue VisitExpr_(const OpNode* op) final {
+    Op op_ref = GetRef<Op>(op);
+    if (!rev_map.count(op_ref)) {
+      diag_ctx.EmitFatal(Diagnostic::Error(op->span)
+                         << "the operator " << op->name << " does not have a registered gradient.");
+    }
+    return std::make_shared<ADFunction>([this, op_ref](const std::vector<ADValue>& ad_args,
+                                                       const Call& orig) {
+      std::vector<Expr> orig_args;
+      for (const ADValue& adval : ad_args) {
+        orig_args.push_back(adval->get<ADTensor>().forward);
+      }
+      auto orig_new = Call(op_ref, orig_args, orig->attrs, orig->type_args);
+      orig_new->checked_type_ = orig->checked_type();
+      auto ret = std::make_shared<ADTensor>(ll, orig_new, diag_ctx);
+      backprop_actions.push_back([this, ad_args, orig_new, ret, op_ref](LetList* ll) {
+        tvm::Array<Expr> rev = rev_map[op_ref](orig_new, ret->reverse);
+        if (ad_args.size() != rev.size()) {
+          diag_ctx.EmitFatal(Diagnostic::Error(op_ref->span)
+                             << "arity mismatch for operator " << op_ref->name
+                             << " and its registered gradient: expected " << ad_args.size()
+                             << " but got " << rev.size() << " gradients.");
+        }
+        for (size_t i = 0; i < ad_args.size(); ++i) {
+          auto& ad_arg = ad_args[i]->get<ADTensor>();
+          ad_arg.reverse = LiftedAdd(ad_arg.forward->checked_type(), ad_arg.reverse, rev[i], ll);
+        }
+      });
+      return ret;
+    });
+  }
+
+  ADValue VisitExpr_(const TupleGetItemNode* op) final {
+    Expr e = GetRef<Expr>(op);
+    ADValue tup = VisitExpr(op->tuple);
+    auto tt = op->tuple->checked_type().as<TupleTypeNode>();
+    size_t idx = op->index;
+    auto ret = std::make_shared<ADTensor>(ll, e, diag_ctx);
+    backprop_actions.push_back([tup, tt, idx, ret](LetList* ll) {
+      auto& ad_tup = tup->get<ADTensor>();
+      std::vector<Expr> updated_grads;
+      for (size_t i = 0; i < tt->fields.size(); ++i) {
+        Expr grad_pre = GetField(ad_tup.reverse, i);
+        updated_grads.push_back(i != idx ? grad_pre
+                                         : LiftedAdd(tt->fields[i], grad_pre, ret->reverse, ll));
+      }
+      ad_tup.reverse = ll->Push(Tuple(updated_grads));
+    });
+    return ret;
+  }
+
+  ADValue VisitExpr_(const TupleNode* op) final {
+    Expr e = GetRef<Expr>(op);
+    std::vector<ADValue> fields;
+    for (const auto& f : op->fields) {
+      fields.push_back(VisitExpr(f));
+    }
+    auto tt = op->checked_type().as<TupleTypeNode>();
+    auto ret = std::make_shared<ADTensor>(ll, e, diag_ctx);
+    backprop_actions.push_back([fields, tt, ret](LetList* ll) {
+      for (size_t i = 0; i < fields.size(); ++i) {
+        auto& ad_field = fields[i]->get<ADTensor>();
+        ad_field.reverse =
+            LiftedAdd(tt->fields[i], ad_field.reverse, GetField(ret->reverse, i), ll);
+      }
+    });
+    return ret;
+  }
+
+  ADValue VisitExpr_(const ConstantNode* op) final {
+    Expr e = GetRef<Expr>(op);
+    return std::make_shared<ADTensor>(ll, e, diag_ctx);
+  }
+
+  ADValue VisitExpr_(const CallNode* op) final {
+    ADValue f = VisitExpr(op->op);
+    std::vector<ADValue> args;
+    for (const auto& arg : op->args) {
+      args.push_back(VisitExpr(arg));
+    }
+    return f->get<ADFunction>().func(args, GetRef<Call>(op));
+  }
+
+  ADValue VisitExpr_(const FunctionNode* op) final {
+    Function f = GetRef<Function>(op);
+    // todo: assert no closure
+    return std::make_shared<ADFunction>(
+        [this, f](const std::vector<ADValue>& ad_args, const Call& orig) {
+          ICHECK_EQ(f->params.size(), ad_args.size());
+          for (size_t i = 0; i < f->params.size(); ++i) {
+            env[f->params[i]] = ad_args[i];
+          }
+          return VisitExpr(f->body);
+        });
+  }
+
+  // Var will always be in env, handled in VisitExpr (without _), so we don't need
+  // to implement its VisitExpr_.
+};
+
+namespace transform {
+
+Pass FirstOrderGradient() {
+  runtime::TypedPackedFunc<IRModule(IRModule, PassContext)> f = [](IRModule mod, PassContext ctx) {
+    CheckFeature(
+        mod, FeatureSet({fVar, fConstant, fTuple, fTupleGetItem, fFunction, fOp, fCall, fGraph}));
+    IRModule ad_mod = GetRef<IRModule>(mod.CopyOnWrite());
+    DiagnosticContext diag_ctx = DiagnosticContext::Default(ad_mod);
+
+    if (mod->functions.size() > 1) {
+      LOG(WARNING) << "IRModule contains multiple global functions: first-order AD will transform "
+                      "them independently!";
+    }
+
+    for (const auto& pr : mod->functions) {
+      const FunctionNode* func = pr.second.as<FunctionNode>();
+      if (!func) {
+        diag_ctx.Emit(Diagnostic::Warning(pr.second->span)
+                      << "AD can only be performed on Relay functions, skipping "
+                      << PrettyPrint(pr.first));
+        // Nothing to differentiate here; skip so the checks below do not
+        // dereference a null FunctionNode.
+        continue;
+      }
+      if (func->type_params.size() > 0) {
+        diag_ctx.EmitFatal(Diagnostic::Error(pr.second->span)
+                           << "first-order AD does not support polymorphism yet.");
+      }
+      Expr body = LetList::With([&](LetList* ll) {
+        FirstOrderReverseAD reverse_ad(ll, diag_ctx);
+        ADValue rev = reverse_ad(pr.second);
+        std::vector<ADValue> args;
+        for (const auto& p : func->params) {
+          args.push_back(std::make_shared<ADTensor>(ll, p, diag_ctx));
+        }
+        Call placeholder = Call(GetRef<Function>(func), {});
+        placeholder->checked_type_ = func->checked_type().as<FuncTypeNode>()->ret_type;
+        auto grad_call = rev->get<ADFunction>().func(args, placeholder);
+        auto& res = grad_call->get<ADTensor>();
+        Expr grad_tuple = LetList::With([&](LetList* ll) {
+          res.reverse =
+              MultiFactoryLike(res.forward, res.forward->checked_type(), Ones, OnesLike, diag_ctx);
+          for (auto it = reverse_ad.backprop_actions.rbegin();
+               it != reverse_ad.backprop_actions.rend(); ++it) {
+            (*it)(ll);
+          }
+          std::vector<Expr> grads;
+          for (const auto& a : args) {
+            grads.push_back(a->get<ADTensor>().reverse);
+          }
+          return Tuple(grads);
+        });
+        return Pair(res.forward, grad_tuple);
+      });
+      ad_mod->Update(pr.first,
+                     Function(func->params, body, GradRetType(GetRef<Function>(func)), {}));
+    }
+
+    return ad_mod;
+  };
+  return CreateModulePass(f, 0, "FirstOrderGradient", {});
+}
+
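+// Usage sketch (illustrative, assuming a type-inferred IRModule `mod`
+// whose functions are monomorphic): in C++ the pass can be applied
+// directly, e.g.
+//
+//   IRModule grad_mod = transform::FirstOrderGradient()(mod);
+//
+// From Python, relay.transform.gradient(..., mode="first_order") is the
+// usual entry point that dispatches to the global registered below.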
+TVM_REGISTER_GLOBAL("relay._transform.FirstOrderGradient").set_body_typed(FirstOrderGradient); + +} // namespace transform + +} // namespace relay +} // namespace tvm diff --git a/src/relay/transforms/fold_constant.cc b/src/relay/transforms/fold_constant.cc index 9416b0ec4580..fe5f547449ad 100644 --- a/src/relay/transforms/fold_constant.cc +++ b/src/relay/transforms/fold_constant.cc @@ -248,16 +248,16 @@ class ConstantFolder : public MixedModeMutator { expr = expr.as() == nullptr ? entry_func->body : entry_func; using tvm::transform::PassContext; - DLContext ctx; - ctx.device_type = kDLCPU; - ctx.device_id = 0; + Device dev; + dev.device_type = kDLCPU; + dev.device_id = 0; Target target = Target("llvm"); // use a fresh build context // in case we are already in a build context. // needed for both execution and creation(due to JIT) With fresh_build_ctx(PassContext::Create()); - FInterpreter executor = CreateInterpreter(mod, ctx, target); + FInterpreter executor = CreateInterpreter(mod, dev, target); return ObjectToExpr(executor(expr)); } @@ -276,17 +276,17 @@ class ConstantFolder : public MixedModeMutator { } // Get the constant shape - DLContext ctx; - ctx.device_type = kDLCPU; - ctx.device_id = 0; + Device dev; + dev.device_type = kDLCPU; + dev.device_id = 0; runtime::NDArray value; DLDataType cdtype = DataType::Int(32); if (ishape.size() == 0) { - value = runtime::NDArray::Empty({}, cdtype, ctx); + value = runtime::NDArray::Empty({}, cdtype, dev); } else { ICHECK_NE(ishape.size(), 0); std::vector cshape = {static_cast(ishape.size())}; - value = runtime::NDArray::Empty(cshape, cdtype, ctx); + value = runtime::NDArray::Empty(cshape, cdtype, dev); int32_t* dims = static_cast(value->data); using ::tvm::tir::IntImmNode; for (size_t i = 0; i < ishape.size(); ++i) { @@ -301,7 +301,7 @@ class ConstantFolder : public MixedModeMutator { Constant shape = Downcast(ObjectToExpr(value)); if (shape->data.Shape().size() == 0 && GetScalarFromConstant(shape) == 0) { - auto ndarray = runtime::NDArray::Empty({}, cdtype, ctx); + auto ndarray = runtime::NDArray::Empty({}, cdtype, dev); shape = Constant(ndarray); } @@ -323,12 +323,12 @@ class ConstantFolder : public MixedModeMutator { } // Get the constant size - DLContext ctx; - ctx.device_type = kDLCPU; - ctx.device_id = 0; + Device dev; + dev.device_type = kDLCPU; + dev.device_id = 0; runtime::NDArray value; DLDataType cdtype = DataType::Int(32); - value = runtime::NDArray::Empty({}, cdtype, ctx); + value = runtime::NDArray::Empty({}, cdtype, dev); int32_t* data = static_cast(value->data); if (ishape.size() == 0) { *data = 0; diff --git a/src/relay/transforms/fold_explicit_padding.cc b/src/relay/transforms/fold_explicit_padding.cc index bab8b814df05..d959e5b75e40 100644 --- a/src/relay/transforms/fold_explicit_padding.cc +++ b/src/relay/transforms/fold_explicit_padding.cc @@ -26,7 +26,7 @@ #include #include #include -#include +#include #include "../op/tensor/transform.h" #include "pattern_utils.h" diff --git a/src/relay/transforms/gradient.h b/src/relay/transforms/gradient.h new file mode 100644 index 000000000000..2e6ffbcc7c9e --- /dev/null +++ b/src/relay/transforms/gradient.h @@ -0,0 +1,54 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file gradient.h + * \brief Utility functions for Automatic Differentiation in Relay. + */ +#ifndef TVM_RELAY_TRANSFORMS_GRADIENT_H_ +#define TVM_RELAY_TRANSFORMS_GRADIENT_H_ + +#include +#include + +#include + +namespace tvm { +namespace relay { + +inline Type GradRetType(const Function& f) { + // if type annotations are provided, we will construct a ret type; + // otherwise, leave it to be inferred + if (!f->ret_type.defined()) { + return Type(); + } + std::vector vt; + for (const auto& p : f->params) { + if (!p->type_annotation.defined()) { + return Type(); + } + vt.push_back(p->type_annotation); + } + + return TupleType({f->ret_type, TupleType(vt)}); +} + +} // namespace relay +} // namespace tvm +#endif // TVM_RELAY_TRANSFORMS_GRADIENT_H_ diff --git a/src/relay/transforms/gradient.cc b/src/relay/transforms/higher_order_gradient.cc similarity index 64% rename from src/relay/transforms/gradient.cc rename to src/relay/transforms/higher_order_gradient.cc index cd3a99655341..202275626d5d 100644 --- a/src/relay/transforms/gradient.cc +++ b/src/relay/transforms/higher_order_gradient.cc @@ -18,8 +18,8 @@ */ /*! - * \file gradient.cc - * \brief API for Automatic Differentiation for the Relay IR. + * \file higher_order_gradient.cc + * \brief Higher-order Automatic Differentiation in Relay IR, for non-graph programs. */ #include #include @@ -28,6 +28,7 @@ #include #include +#include "gradient.h" #include "let_list.h" #include "pass_utils.h" #include "pattern_utils.h" @@ -64,13 +65,6 @@ using namespace tvm::runtime; * output. There are multiple implementation of AD in relay, with different characteristic. However, * they all transform the input expr according to WithGradientType. */ -Type WithGradientType(const Type&); - -/*! return an expression that represent differentiation of e (according to WithGradientType). - * This version only work on first order code without control flow. - */ -Expr FirstOrderGradient(const Expr& e, const Optional& mod); - Type WithGradientType(const Type& t) { // TODO(@M.K.): stricter checking auto ty = t.as(); @@ -94,268 +88,6 @@ Expr DeGlobal(const Optional& mod, const Expr& e) { } } -/*! \brief A fragment of the program being built by the automatic differentation - * pass. 
- */ -struct ADValueNode { - virtual ~ADValueNode() {} - template - T& get() { - auto ret = dynamic_cast(this); - ICHECK(ret) << "cannot downcast"; - return *ret; - } -}; - -template -Expr MultiFactory(const Type& t, F factory) { - if (auto* tt = t.as()) { - return factory(tt->shape, tt->dtype); - } else if (auto* tt = t.as()) { - std::vector res; - for (size_t i = 0; i < tt->fields.size(); i++) { - res.push_back(MultiFactory(tt->fields[i], factory)); - } - return Tuple(res); - } else { - LOG(FATAL) << "unsupported type to create tensors of: " << tt; - throw; - } -} - -template -Expr MultiFactoryLike(const Expr& e, const Type& t, F factory, F2 factory_like) { - if (t.as()) { - return factory_like(e); - } else if (auto* tt = t.as()) { - return MultiFactory(t, factory); - } else { - LOG(FATAL) << "unsupported type to tensors of: " << tt; - throw; - } -} - -using ADValue = std::shared_ptr; - -/*! \brief AD over a program which generates a tensor output. */ -struct ADTensor : ADValueNode { - Expr forward; - mutable Expr reverse; // must be a variable to avoid duplication - ADTensor(LetList* ll, const Expr& forward) - : forward(ll->Push(forward)), - reverse( - ll->Push(MultiFactoryLike(this->forward, forward->checked_type(), Zeros, ZerosLike))) { - this->forward->checked_type_ = forward->checked_type(); - } -}; - -/*! \brief A staged representation of the program, we reflect - * Relay functions into a function over fragments of AD. We - * can compute away this function to obtain a reverse mode program. - */ -struct ADFunction : ADValueNode { - std::function&, const Attrs&, - const tvm::Array&)> - func; - explicit ADFunction(const std::function&, - const Attrs&, const tvm::Array&)>& func) - : func(func) {} -}; - -struct FirstOrderReverseAD : ExprFunctor { - using TBase = ExprFunctor; - const OpAttrMap rev_map = Op::GetAttrMap("FPrimalGradient"); - std::vector> backprop_actions; - // we assume no closure so no need for lexical scoping - std::unordered_map env; - LetList* ll; - - FirstOrderReverseAD(LetList* ll) : ll(ll) {} - - ADValue VisitExpr(const Expr& n) final { - if (env.count(n)) { - return env.at(n); - } - auto ret = TBase::VisitExpr(n); - env[n] = ret; - return ret; - } - - Expr UpdateGrad(const Type& t, const Expr& arg, const Expr& grad, LetList* ll) { - if (t.as()) { - return ll->Push(Add(arg, grad)); - } else if (auto* tt = t.as()) { - Array updates; - for (size_t i = 0; i < tt->fields.size(); ++i) { - updates.push_back(this->UpdateGrad(tt->fields[i], ll->Push(GetField(arg, i)), - ll->Push(GetField(grad, i)), ll)); - } - return ll->Push(Tuple(updates)); - } else { - LOG(FATAL) << "unsupported arg type of operator: " << t; - throw; - } - } - - ADValue VisitExpr_(const OpNode* op) final { - Op op_ref = GetRef(op); - ICHECK(rev_map.count(op_ref)) << op->name << " does not have reverse mode defined"; - return std::make_shared( - [this, op_ref](const Type& orig_type, const std::vector& args, const Attrs& attrs, - const tvm::Array& type_args) { - std::vector call_args; - for (const ADValue& adval : args) { - call_args.push_back(adval->get().forward); - } - auto orig = Call(op_ref, call_args, attrs, type_args); - orig->checked_type_ = orig_type; - auto ret = std::make_shared(ll, orig); - backprop_actions.push_back([this, args, orig, ret, op_ref](LetList* ll) { - tvm::Array rev = rev_map[op_ref](orig, ret->reverse); - ICHECK(args.size() == rev.size()); - for (size_t i = 0; i < args.size(); ++i) { - auto ad_arg = args[i]->get(); - auto ad_arg_type = ad_arg.forward->checked_type(); - 
args[i]->get().reverse = - this->UpdateGrad(ad_arg_type, ad_arg.reverse, rev[i], ll); - } - }); - return ret; - }); - } - - ADValue VisitExpr_(const TupleGetItemNode* op) final { - Expr e = GetRef(op); - ADValue tup = VisitExpr(op->tuple); - auto tt = op->tuple->checked_type().as(); - size_t size = tt->fields.size(); - size_t idx = op->index; - auto ret = std::make_shared(ll, e); - backprop_actions.push_back([tup, idx, size, ret](LetList* ll) { - auto rev = tup->get().reverse; - // special-case Tuple, to avoid long chains of GetItem/Tuple, - // but we might have functions using tuples, so we don't know - // that the reverse node is always a tuple - std::vector grfields; - if (auto tup_node = rev.as()) { - for (size_t i = 0; i < size; ++i) { - grfields.push_back(i != idx ? tup_node->fields[i] - : Add(tup_node->fields[i], ret->reverse)); - } - } else { - for (size_t i = 0; i < size; ++i) { - grfields.push_back(i != idx ? TupleGetItem(rev, i) - : Add(TupleGetItem(rev, i), ret->reverse)); - } - } - tup->get().reverse = ll->Push(Tuple(grfields)); - }); - return ret; - } - - ADValue VisitExpr_(const TupleNode* op) final { - Expr e = GetRef(op); - std::vector fields; - for (const auto& f : op->fields) { - fields.push_back(VisitExpr(f)); - } - auto ret = std::make_shared(ll, e); - backprop_actions.push_back([fields, ret](LetList* ll) { - for (size_t i = 0; i < fields.size(); ++i) { - fields[i]->get().reverse = - ll->Push(Add(fields[i]->get().reverse, TupleGetItem(ret->reverse, i))); - } - }); - return ret; - } - - ADValue VisitExpr_(const ConstantNode* op) final { - Expr e = GetRef(op); - return std::make_shared(ll, e); - } - - ADValue VisitExpr_(const CallNode* op) final { - ADValue f = VisitExpr(op->op); - std::vector args; - for (const auto& arg : op->args) { - args.push_back(VisitExpr(arg)); - } - return f->get().func(op->checked_type(), args, op->attrs, op->type_args); - } - - ADValue VisitExpr_(const FunctionNode* op) final { - Function f = GetRef(op); - // todo: assert no closure - return std::make_shared( - [this, f](const Type& orig_type, const std::vector& args, const Attrs& attrs, - const tvm::Array& type_args) { - ICHECK_EQ(f->params.size(), args.size()); - for (size_t i = 0; i < f->params.size(); ++i) { - env[f->params[i]] = args[i]; - } - return VisitExpr(f->body); - }); - } - - // Var will always be in env, handled in VisitExpr (without _), so we don't need - // to implement its VisitExpr_. -}; - -Type GradRetType(const Function& f) { - // if type annotations are provided, we will construct a ret type; - // otherwise, leave it to be inferred - if (!f->ret_type.defined()) { - return Type(); - } - std::vector vt; - for (const auto& p : f->params) { - if (!p->type_annotation.defined()) { - return Type(); - } - vt.push_back(p->type_annotation); - } - - return TupleType({f->ret_type, TupleType(vt)}); -} - -Expr FirstOrderGradient(const Expr& re, const Optional& mod) { - // Currently we first remove any global functions for the first - // order case. - auto e = DeGlobal(mod, re); - auto f = e.as(); - ICHECK(f) << "FOWithGradient expects its argument to be a function: " << f; - ICHECK(f->type_params.size() == 0) << "no polymorphism supported for now"; - - // We will then build a sequence of lets which implement reverse mode. 
- Expr body = LetList::With([&](LetList* ll) { - FirstOrderReverseAD reverse_ad(ll); - ADValue rev = reverse_ad(e); - std::vector args; - for (const auto& p : f->params) { - args.push_back(std::make_shared(ll, p)); - } - auto c = rev->get().func(f->checked_type(), args, Attrs(), {}); - const auto& res = c->get(); - Expr grad = LetList::With([&](LetList* ll) { - res.reverse = MultiFactoryLike(res.forward, res.forward->checked_type(), Ones, OnesLike); - for (auto it = reverse_ad.backprop_actions.rbegin(); it != reverse_ad.backprop_actions.rend(); - ++it) { - (*it)(ll); - } - std::vector grad_res; - for (const auto& a : args) { - grad_res.push_back(a->get().reverse); - } - return Tuple(grad_res); - }); - return Pair(res.forward, grad); - }); - - return Function(f->params, body, GradRetType(GetRef(f)), {}); -} - -TVM_REGISTER_GLOBAL("relay._transform.first_order_gradient").set_body_typed(FirstOrderGradient); - static Type bpt = RelayRefType(FuncType({}, TupleType(Array()), {}, {})); struct ReverseADType : TypeMutator { diff --git a/src/relay/transforms/inline.cc b/src/relay/transforms/inline.cc index dae34674de77..6e6505b28dc6 100644 --- a/src/relay/transforms/inline.cc +++ b/src/relay/transforms/inline.cc @@ -36,7 +36,7 @@ #include #include #include -#include +#include #include #include diff --git a/src/relay/transforms/memory_alloc.cc b/src/relay/transforms/memory_alloc.cc index dd1b1ecdc066..1dc204d43ba1 100644 --- a/src/relay/transforms/memory_alloc.cc +++ b/src/relay/transforms/memory_alloc.cc @@ -31,7 +31,7 @@ #include #include #include -#include +#include #include #include @@ -52,7 +52,7 @@ namespace tvm { namespace relay { using AnalysisResultMap = - std::unordered_map; + std::unordered_map; inline Constant MakeConstant(const std::vector& value) { return MakeConstantTensor(DataType::Int(64), {static_cast(value.size())}, value); @@ -104,8 +104,8 @@ class DialectRewriter : public ExprMutator { DialectRewriter(const Target& target_host, const AnalysisResultMap& context_analysis_map) : target_host_(target_host), context_analysis_map_(context_analysis_map) {} - // Get the context of an expression. - TVMContext GetContext(const Expr& expr) const { + // Get the device of an expression. + Device GetDevice(const Expr& expr) const { auto it = context_analysis_map_.find(expr); CHECK(it != context_analysis_map_.end()) << "Cannot find expr in the context analysis map:\n" << AsText(expr, false); @@ -189,8 +189,8 @@ class DialectRewriter : public ExprMutator { // Handle the static case Array outs; for (size_t i = 0; i < out_types.size(); ++i) { - TVMContext ctx = GetContext(GetRef(cn)); - auto out = MakeStaticAllocation(&scope, out_types[i], ctx, std::to_string(i)); + Device dev = GetDevice(GetRef(cn)); + auto out = MakeStaticAllocation(&scope, out_types[i], dev, std::to_string(i)); outs.push_back(out); } Tuple output(outs); @@ -206,8 +206,8 @@ class DialectRewriter : public ExprMutator { private: // Insert a device copy node. - Expr DeviceCopy(const Expr& inp, int src_ctx, int dst_ctx) { - return ExprMutator::Mutate(relay::DeviceCopy(inp, src_ctx, dst_ctx)); + Expr DeviceCopy(const Expr& inp, int src_dev, int dst_dev) { + return ExprMutator::Mutate(relay::DeviceCopy(inp, src_dev, dst_dev)); } // Check if a call invokes a primitive function. @@ -262,8 +262,7 @@ class DialectRewriter : public ExprMutator { } // Allocate a tensor with a statically known shape. 
- Var MakeStaticAllocation(LetList* scope, const TensorType& type, TVMContext ctx, - String name_hint) { + Var MakeStaticAllocation(LetList* scope, const TensorType& type, Device dev, String name_hint) { std::vector int_shape; for (auto it : type->shape) { const auto* imm = it.as(); @@ -275,7 +274,7 @@ class DialectRewriter : public ExprMutator { Expr alignment = ComputeAlignment(type->dtype); // Run type inference later to get the correct type. Var var("storage_" + name_hint, Type(nullptr)); - Expr value = AllocStorage(size, alignment, ctx, type->dtype); + Expr value = AllocStorage(size, alignment, dev, type->dtype); auto sto = scope->Push(var, value); // TODO(@jroesch): There is a bug with typing based on the constant shape. @@ -295,7 +294,7 @@ class DialectRewriter : public ExprMutator { Array is_inputs; int input_pos = 0; - TVMContext cpu_ctx = default_context_; + Device cpu_dev = default_device_; CHECK_EQ(new_args.size(), input_states.size()); for (size_t i = 0; i < new_args.size(); ++i) { Expr arg = new_args[i]; @@ -318,9 +317,9 @@ class DialectRewriter : public ExprMutator { is_inputs.push_back(0); } else if (state == 1) { auto new_arg = ExprMutator::Mutate(arg); - auto ctx = GetContext(arg); - if (ctx.device_type != cpu_ctx.device_type) { - new_arg = DeviceCopy(new_arg, ctx.device_type, cpu_ctx.device_type); + auto dev = GetDevice(arg); + if (dev.device_type != cpu_dev.device_type) { + new_arg = DeviceCopy(new_arg, dev.device_type, cpu_dev.device_type); } Var in_shape_var("in_shape_" + std::to_string(input_pos), Type(nullptr)); shape_func_ins.push_back(scope->Push(in_shape_var, new_arg)); @@ -338,7 +337,7 @@ class DialectRewriter : public ExprMutator { auto tt = TensorType(out->shape, out->dtype); // Put shape func on CPU. This also ensures that everything between // shape_of and shape_func are on CPU. 
- auto alloc = MakeStaticAllocation(scope, tt, cpu_ctx, std::to_string(i)); + auto alloc = MakeStaticAllocation(scope, tt, cpu_dev, std::to_string(i)); Var shape_func_out_var("shape_func_out_" + std::to_string(i), Type(nullptr)); alloc = scope->Push(shape_func_out_var, alloc); out_shapes.push_back(alloc); @@ -355,7 +354,7 @@ class DialectRewriter : public ExprMutator { const Type& ret_type) { auto out_shapes = EmitShapeFunc(scope, func, new_args); std::vector storages; - auto func_ctx = GetContext(func); + auto func_dev = GetDevice(func); CHECK_EQ(out_shapes.size(), out_types.size()); for (size_t i = 0; i < out_shapes.size(); ++i) { auto out_shape = out_shapes[i]; @@ -363,7 +362,7 @@ class DialectRewriter : public ExprMutator { auto size = ComputeStorageInRelay(out_shape, out_type); auto alignment = ComputeAlignment(out_type->dtype); Var sto_var("storage_" + std::to_string(i), Type(nullptr)); - auto val = AllocStorage(size, alignment, func_ctx, out_type->dtype); + auto val = AllocStorage(size, alignment, func_dev, out_type->dtype); storages.push_back(scope->Push(sto_var, val)); } @@ -409,7 +408,7 @@ class DialectRewriter : public ExprMutator { std::vector scopes_; runtime::DataType compute_dtype_ = runtime::DataType::Int(64); - TVMContext default_context_{kDLCPU, 0}; + Device default_device_{kDLCPU, 0}; }; namespace transform { @@ -424,21 +423,21 @@ Pass ManifestAlloc(Target target_host, Map targets) { mod->ImportFromStd("core.rly"); mod = relay::transform::InferType()(mod); - TVMContext fallback_ctx; + Device fallback_dev; if (targets.size() > 1) { auto pass_ctx = PassContext::Current(); - Optional opt_fallback_dev = + Optional opt_fallback_dev_type = pass_ctx->GetConfig("relay.fallback_device_type", Integer(static_cast(kDLCPU))); - auto fallback_dev = opt_fallback_dev.value(); - CHECK_GT(fallback_dev->value, 0U); - fallback_ctx.device_type = static_cast(fallback_dev->value); - fallback_ctx.device_id = 0; + auto fallback_dev_type = opt_fallback_dev_type.value(); + CHECK_GT(fallback_dev_type->value, 0U); + fallback_dev.device_type = static_cast(fallback_dev_type->value); + fallback_dev.device_id = 0; } else { const auto& it = targets.begin(); - fallback_ctx.device_type = static_cast((*it).first->value); - fallback_ctx.device_id = 0; + fallback_dev.device_type = static_cast((*it).first->value); + fallback_dev.device_id = 0; } - auto ca = ContextAnalysis(mod, fallback_ctx); + auto ca = ContextAnalysis(mod, fallback_dev); auto glob_funcs = mod->functions; for (const auto& it : glob_funcs) { diff --git a/src/relay/transforms/partial_eval.cc b/src/relay/transforms/partial_eval.cc index fa080a7ff22c..9572faf08714 100644 --- a/src/relay/transforms/partial_eval.cc +++ b/src/relay/transforms/partial_eval.cc @@ -526,11 +526,11 @@ bool StatefulOp(const Expr& e) { using FInterpreter = runtime::TypedPackedFunc; -DLContext CPUContext() { - DLContext ctx; - ctx.device_type = kDLCPU; - ctx.device_id = 0; - return ctx; +Device CPUDevice() { + Device dev; + dev.device_type = kDLCPU; + dev.device_id = 0; + return dev; } FInterpreter CPUInterpreter() { @@ -541,7 +541,7 @@ FInterpreter CPUInterpreter() { // in case we are already in a build context. 
With fresh_build_ctx(PassContext::Create()); - return CreateInterpreter(IRModule(nullptr), CPUContext(), target); + return CreateInterpreter(IRModule(nullptr), CPUDevice(), target); } using FuncId = int; @@ -613,7 +613,7 @@ class PartialEvaluator : public ExprFunctor } PStatic VisitExpr_(const ConstantNode* op, LetList* ll) final { - return HasStatic(MkSTensor(op->data.CopyTo(context_)), ll->Push(GetRef(op))); + return HasStatic(MkSTensor(op->data.CopyTo(device_)), ll->Push(GetRef(op))); } PStatic VisitExpr_(const TupleNode* op, LetList* ll) final { @@ -669,7 +669,7 @@ class PartialEvaluator : public ExprFunctor PStatic VisitExpr_(const IfNode* op, LetList* ll) final { PStatic c = VisitExpr(op->cond, ll); if (c->pstatic.defined()) { - NDArray cpu_array = Downcast(c->pstatic)->data.CopyTo(CPUContext()); + NDArray cpu_array = Downcast(c->pstatic)->data.CopyTo(CPUDevice()); ICHECK_EQ(DataType(cpu_array->dtype), DataType::Bool()); if (reinterpret_cast(cpu_array->data)[0]) { return VisitExpr(op->true_branch, ll); @@ -754,7 +754,7 @@ class PartialEvaluator : public ExprFunctor if (ps->pstatic.defined()) { if (auto* st = ps->pstatic.as()) { if (st->data.Shape().empty()) { - NDArray cpu_array = st->data.CopyTo(CPUContext()); + NDArray cpu_array = st->data.CopyTo(CPUDevice()); DataType dtype = DataType(cpu_array->dtype); if (dtype == DataType::Int(32)) { return std::max(0, *static_cast(cpu_array->data)); @@ -861,8 +861,8 @@ class PartialEvaluator : public ExprFunctor return VisitFunc(GetRef(op), ll); } - struct ReflectError : dmlc::Error { - ReflectError() : dmlc::Error("static value not found") {} + struct ReflectError : Error { + ReflectError() : Error("static value not found") {} }; Expr Reflect(const PStatic& st) { @@ -1136,7 +1136,7 @@ class PartialEvaluator : public ExprFunctor std::unordered_map func_map_; std::unordered_map fuel_map_; Store store_; - DLContext context_ = CPUContext(); + Device device_ = CPUDevice(); FInterpreter executor_ = CPUInterpreter(); }; diff --git a/src/relay/transforms/partition_graph.cc b/src/relay/transforms/partition_graph.cc index 404c7efb10b0..94891c3c98ea 100644 --- a/src/relay/transforms/partition_graph.cc +++ b/src/relay/transforms/partition_graph.cc @@ -428,7 +428,7 @@ IRModule RemoveDefaultAnnotations(IRModule module) { * could be a tuple output. Such tuple outputs needs to be flattened * otherwise the function would create tuples of tuples. Moreover, tuple * of tuples are valid relay, however they are not currently supported by - * graph runtime or relay VM. + * graph executor or relay VM. */ // New annotations would be required to be added for each flattened output diff --git a/src/relay/transforms/simplify_expr.cc b/src/relay/transforms/simplify_expr.cc index 74e48dc4bc54..b4f4cc16e9df 100644 --- a/src/relay/transforms/simplify_expr.cc +++ b/src/relay/transforms/simplify_expr.cc @@ -26,7 +26,7 @@ #include #include #include -#include +#include #include "../op/tensor/transform.h" #include "pattern_utils.h" @@ -82,6 +82,99 @@ class SimplifyReshape : public SimplifyPattern { DFPattern x_; }; +/*! + * \brief SimplifyTranspose matches the pattern of consecutive transpose op, + * and merges or cancels them. 
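+ *
+ * Example (illustrative): transpose(transpose(x, [1, 0, 2]), [2, 0, 1])
+ * folds into the single transpose(x, [2, 1, 0]), while
+ * transpose(transpose(x, [1, 0]), [1, 0]) cancels to x.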
+ */ +class SimplifyTranspose : public SimplifyPattern { + public: + SimplifyTranspose() { + x_ = IsWildcard(); + auto trans1 = IsOp("transpose") || IsOp("layout_transform"); + auto trans2 = IsOp("transpose") || IsOp("layout_transform"); + pattern_ = trans1({trans2({x_})}); + } + + Expr callback(const Expr& pre, const Expr& post, + const Map>& node_map) const override { + // Helper function to get the axes from call node attribute + auto get_axes_from_call = [](const Call trans_call, int ndim) { + std::vector attr_axes; + if (auto attr = trans_call->attrs.as()) { + if (attr->axes.defined()) { + for (int i = 0; i < ndim; ++i) { + int64_t axis = attr->axes[i]; + axis += (axis < 0) ? ndim : 0; + attr_axes.push_back(axis); + } + } else { + // Empty axes means reverse + for (int i = ndim - 1; i >= 0; --i) { + attr_axes.push_back(i); + } + } + } else if (auto attr = trans_call->attrs.as()) { + Layout src_layout(attr->src_layout); + Layout dst_layout(attr->dst_layout); + for (int i = 0; i < ndim; ++i) { + attr_axes.push_back(src_layout.IndexOf(dst_layout[i])); + } + } else { + CHECK(false) << "Expected transpose or layout_transform, but got " + << Downcast(trans_call->op)->name; + } + return std::move(attr_axes); + }; + + auto x = node_map[x_][0]; + + // Initialize axes + int ndim = Downcast(pre->checked_type())->shape.size(); + Array axes; + for (int i = 0; i < ndim; ++i) { + axes.push_back(i); + } + + // Collect axes changes from the matched pattern, including two consecutive transposes. + std::vector> interm_axes; + Call trans_call = Downcast(post); + interm_axes.push_back(get_axes_from_call(trans_call, ndim)); + trans_call = Downcast(trans_call->args[0]); + interm_axes.push_back(get_axes_from_call(trans_call, ndim)); + + // Calculate the final axes in reverse order (from root to output) + auto it = interm_axes.rbegin(); + while (it != interm_axes.rend()) { + auto interm = *it; + + Array new_axes; + for (int i = 0; i < ndim; ++i) { + new_axes.push_back(axes[interm[i]]); + } + axes = new_axes; + it++; + } + + // Check if the transpose is still required + bool need_transpose = false; + for (int i = 0; i < ndim; ++i) { + if (axes[i] != i) { + need_transpose = true; + break; + } + } + + if (need_transpose) { + return MakeTranspose(x, axes); + } + return x; + } + + private: + /*! \brief Pattern input */ + DFPattern x_; +}; + /*! 
* \brief FullArgwhere finds full followed by argwhere and turns it into an Arange op */ @@ -162,6 +255,7 @@ class ExprSimplifier { public: explicit ExprSimplifier(IRModule mod) : mod_(mod) { CreateCallback(SimplifyReshape()); + CreateCallback(SimplifyTranspose()); CreateCallback(FullElementwise()); } template diff --git a/src/relay/transforms/to_a_normal_form.cc b/src/relay/transforms/to_a_normal_form.cc index 05844477cc5b..91e8d90c1232 100644 --- a/src/relay/transforms/to_a_normal_form.cc +++ b/src/relay/transforms/to_a_normal_form.cc @@ -26,7 +26,7 @@ #include #include #include -#include +#include #include "../../support/arena.h" #include "../analysis/dependency_graph.h" diff --git a/src/relay/transforms/to_basic_block_normal_form.cc b/src/relay/transforms/to_basic_block_normal_form.cc index 1aab367cf22a..79157bba1918 100644 --- a/src/relay/transforms/to_basic_block_normal_form.cc +++ b/src/relay/transforms/to_basic_block_normal_form.cc @@ -26,7 +26,7 @@ #include #include #include -#include +#include #include "../../support/arena.h" #include "../analysis/dependency_graph.h" diff --git a/src/relay/transforms/type_infer.cc b/src/relay/transforms/type_infer.cc index b4ccd1659865..4c6013792426 100644 --- a/src/relay/transforms/type_infer.cc +++ b/src/relay/transforms/type_infer.cc @@ -166,7 +166,7 @@ class TypeInferencer : private ExprFunctor, bool assign_rhs = true) { try { return solver_.Unify(t1, t2, span, assign_lhs, assign_rhs); - } catch (const dmlc::Error& e) { + } catch (const Error& e) { this->EmitFatal(Diagnostic::Error(span) << "Error unifying `" << t1 << "` and `" << t2 << "`: " << e.what()); return Type(); diff --git a/src/runtime/c_runtime_api.cc b/src/runtime/c_runtime_api.cc index 7fd27cba6136..b9e8c2549fd5 100644 --- a/src/runtime/c_runtime_api.cc +++ b/src/runtime/c_runtime_api.cc @@ -92,7 +92,7 @@ class DeviceAPIManager { public: static const int kMaxDeviceAPI = 32; // Get API - static DeviceAPI* Get(const TVMContext& ctx) { return Get(ctx.device_type); } + static DeviceAPI* Get(const Device& dev) { return Get(dev.device_type); } static DeviceAPI* Get(int dev_type, bool allow_missing = false) { return Global()->GetAPI(dev_type, allow_missing); } @@ -136,12 +136,12 @@ class DeviceAPIManager { } }; -DeviceAPI* DeviceAPI::Get(TVMContext ctx, bool allow_missing) { - return DeviceAPIManager::Get(static_cast(ctx.device_type), allow_missing); +DeviceAPI* DeviceAPI::Get(Device dev, bool allow_missing) { + return DeviceAPIManager::Get(static_cast(dev.device_type), allow_missing); } -void* DeviceAPI::AllocWorkspace(TVMContext ctx, size_t size, DLDataType type_hint) { - return AllocDataSpace(ctx, size, kTempAllocaAlignment, type_hint); +void* DeviceAPI::AllocWorkspace(Device dev, size_t size, DLDataType type_hint) { + return AllocDataSpace(dev, size, kTempAllocaAlignment, type_hint); } static size_t GetDataAlignment(const DLDataType dtype) { @@ -150,13 +150,13 @@ static size_t GetDataAlignment(const DLDataType dtype) { return align; } -void* DeviceAPI::AllocDataSpace(TVMContext ctx, int ndim, const int64_t* shape, DLDataType dtype, +void* DeviceAPI::AllocDataSpace(Device dev, int ndim, const int64_t* shape, DLDataType dtype, Optional mem_scope) { if (!mem_scope.defined() || mem_scope.value() == "global") { // by default, we can always redirect to the flat memory allocations DLTensor temp; temp.data = nullptr; - temp.ctx = ctx; + temp.device = dev; temp.ndim = ndim; temp.dtype = dtype; temp.shape = const_cast(shape); @@ -164,7 +164,7 @@ void* DeviceAPI::AllocDataSpace(TVMContext ctx, 
int ndim, const int64_t* shape, temp.byte_offset = 0; size_t size = GetDataSize(temp); size_t alignment = GetDataAlignment(temp.dtype); - return AllocDataSpace(ctx, size, alignment, dtype); + return AllocDataSpace(dev, size, alignment, dtype); } LOG(FATAL) << "Device does not support allocate data space with " << "specified memory scope: " << mem_scope.value(); @@ -178,29 +178,28 @@ void DeviceAPI::CopyDataFromTo(DLTensor* from, DLTensor* to, TVMStreamHandle str ICHECK(IsContiguous(*from) && IsContiguous(*to)) << "CopyDataFromTo only support contiguous array for now"; - CopyDataFromTo(from->data, from->byte_offset, to->data, to->byte_offset, nbytes, from->ctx, - to->ctx, from->dtype, stream); + CopyDataFromTo(from->data, from->byte_offset, to->data, to->byte_offset, nbytes, from->device, + to->device, from->dtype, stream); } void DeviceAPI::CopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset, - size_t num_bytes, TVMContext ctx_from, TVMContext ctx_to, + size_t num_bytes, Device dev_from, Device dev_to, DLDataType type_hint, TVMStreamHandle stream) { LOG(FATAL) << "Device does not support CopyDataFromTo."; } -void DeviceAPI::FreeWorkspace(TVMContext ctx, void* ptr) { FreeDataSpace(ctx, ptr); } +void DeviceAPI::FreeWorkspace(Device dev, void* ptr) { FreeDataSpace(dev, ptr); } -TVMStreamHandle DeviceAPI::CreateStream(TVMContext ctx) { +TVMStreamHandle DeviceAPI::CreateStream(Device dev) { LOG(FATAL) << "Device does not support stream api."; return nullptr; } -void DeviceAPI::FreeStream(TVMContext ctx, TVMStreamHandle stream) { +void DeviceAPI::FreeStream(Device dev, TVMStreamHandle stream) { LOG(FATAL) << "Device does not support stream api."; } -void DeviceAPI::SyncStreamFromTo(TVMContext ctx, TVMStreamHandle event_src, - TVMStreamHandle event_dst) { +void DeviceAPI::SyncStreamFromTo(Device dev, TVMStreamHandle event_src, TVMStreamHandle event_dst) { LOG(FATAL) << "Device does not support stream api."; } @@ -384,7 +383,7 @@ typedef dmlc::ThreadLocalStore TVMAPIRuntimeStore; const char* TVMGetLastError() { return TVMAPIRuntimeStore::Get()->last_error.c_str(); } -int TVMAPIHandleException(const std::runtime_error& e) { +int TVMAPIHandleException(const std::exception& e) { TVMAPISetLastError(NormalizeError(e.what()).c_str()); return -1; } @@ -430,23 +429,23 @@ int TVMBackendGetFuncFromEnv(void* mod_node, const char* func_name, TVMFunctionH void* TVMBackendAllocWorkspace(int device_type, int device_id, uint64_t size, int dtype_code_hint, int dtype_bits_hint) { - TVMContext ctx; - ctx.device_type = static_cast(device_type); - ctx.device_id = device_id; + DLDevice dev; + dev.device_type = static_cast(device_type); + dev.device_id = device_id; DLDataType type_hint; type_hint.code = static_cast(dtype_code_hint); type_hint.bits = static_cast(dtype_bits_hint); type_hint.lanes = 1; - return DeviceAPIManager::Get(ctx)->AllocWorkspace(ctx, static_cast(size), type_hint); + return DeviceAPIManager::Get(dev)->AllocWorkspace(dev, static_cast(size), type_hint); } int TVMBackendFreeWorkspace(int device_type, int device_id, void* ptr) { - TVMContext ctx; - ctx.device_type = static_cast(device_type); - ctx.device_id = device_id; - DeviceAPIManager::Get(ctx)->FreeWorkspace(ctx, ptr); + DLDevice dev; + dev.device_type = static_cast(device_type); + dev.device_id = device_id; + DeviceAPIManager::Get(dev)->FreeWorkspace(dev, ptr); return 0; } @@ -518,7 +517,7 @@ int TVMFuncCreateFromCFunc(TVMPackedCFunc func, void* resource_handle, TVMPacked int ret = func(const_cast(args.values), 
const_cast(args.type_codes), args.num_args, rv, resource_handle); if (ret != 0) { - throw dmlc::Error(TVMGetLastError() + ::dmlc::StackTrace()); + throw tvm::Error(TVMGetLastError() + tvm::runtime::Backtrace()); } }); } else { @@ -529,7 +528,7 @@ int TVMFuncCreateFromCFunc(TVMPackedCFunc func, void* resource_handle, TVMPacked int ret = func(const_cast(args.values), const_cast(args.type_codes), args.num_args, rv, rpack.get()); if (ret != 0) { - throw dmlc::Error(TVMGetLastError() + ::dmlc::StackTrace()); + throw tvm::Error(TVMGetLastError() + tvm::runtime::Backtrace()); } }); } @@ -538,47 +537,47 @@ int TVMFuncCreateFromCFunc(TVMPackedCFunc func, void* resource_handle, TVMPacked int TVMStreamCreate(int device_type, int device_id, TVMStreamHandle* out) { API_BEGIN(); - TVMContext ctx; - ctx.device_type = static_cast(device_type); - ctx.device_id = device_id; - *out = DeviceAPIManager::Get(ctx)->CreateStream(ctx); + DLDevice dev; + dev.device_type = static_cast(device_type); + dev.device_id = device_id; + *out = DeviceAPIManager::Get(dev)->CreateStream(dev); API_END(); } int TVMStreamFree(int device_type, int device_id, TVMStreamHandle stream) { API_BEGIN(); - TVMContext ctx; - ctx.device_type = static_cast(device_type); - ctx.device_id = device_id; - DeviceAPIManager::Get(ctx)->FreeStream(ctx, stream); + DLDevice dev; + dev.device_type = static_cast(device_type); + dev.device_id = device_id; + DeviceAPIManager::Get(dev)->FreeStream(dev, stream); API_END(); } int TVMSetStream(int device_type, int device_id, TVMStreamHandle stream) { API_BEGIN(); - TVMContext ctx; - ctx.device_type = static_cast(device_type); - ctx.device_id = device_id; - DeviceAPIManager::Get(ctx)->SetStream(ctx, stream); + DLDevice dev; + dev.device_type = static_cast(device_type); + dev.device_id = device_id; + DeviceAPIManager::Get(dev)->SetStream(dev, stream); API_END(); } int TVMSynchronize(int device_type, int device_id, TVMStreamHandle stream) { API_BEGIN(); - TVMContext ctx; - ctx.device_type = static_cast(device_type); - ctx.device_id = device_id; - DeviceAPIManager::Get(ctx)->StreamSync(ctx, stream); + DLDevice dev; + dev.device_type = static_cast(device_type); + dev.device_id = device_id; + DeviceAPIManager::Get(dev)->StreamSync(dev, stream); API_END(); } int TVMStreamStreamSynchronize(int device_type, int device_id, TVMStreamHandle src, TVMStreamHandle dst) { API_BEGIN(); - TVMContext ctx; - ctx.device_type = static_cast(device_type); - ctx.device_id = device_id; - DeviceAPIManager::Get(ctx)->SyncStreamFromTo(ctx, src, dst); + DLDevice dev; + dev.device_type = static_cast(device_type); + dev.device_id = device_id; + DeviceAPIManager::Get(dev)->SyncStreamFromTo(dev, src, dst); API_END(); } @@ -590,64 +589,64 @@ int TVMCbArgToReturn(TVMValue* value, int* code) { API_END(); } -int TVMDeviceAllocDataSpace(DLContext ctx, size_t nbytes, size_t alignment, DLDataType type_hint, +int TVMDeviceAllocDataSpace(DLDevice dev, size_t nbytes, size_t alignment, DLDataType type_hint, void** out_data) { API_BEGIN(); - out_data[0] = DeviceAPIManager::Get(ctx)->AllocDataSpace(ctx, nbytes, alignment, type_hint); + out_data[0] = DeviceAPIManager::Get(dev)->AllocDataSpace(dev, nbytes, alignment, type_hint); API_END(); } -int TVMDeviceAllocDataSpaceWithScope(DLContext ctx, int ndim, const int64_t* shape, - DLDataType dtype, const char* mem_scope, void** out_data) { +int TVMDeviceAllocDataSpaceWithScope(DLDevice dev, int ndim, const int64_t* shape, DLDataType dtype, + const char* mem_scope, void** out_data) { API_BEGIN(); Optional scope; 
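// A null mem_scope selects the default "global" scope; otherwise the string
// is wrapped into an Optional and forwarded to the device API, which may
// reject memory scopes it does not implement (see AllocDataSpace above).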
if (mem_scope != nullptr) { scope = String(std::string(mem_scope)); } - out_data[0] = DeviceAPIManager::Get(ctx)->AllocDataSpace(ctx, ndim, shape, dtype, scope); + out_data[0] = DeviceAPIManager::Get(dev)->AllocDataSpace(dev, ndim, shape, dtype, scope); API_END(); } -int TVMDeviceFreeDataSpace(DLContext ctx, void* ptr) { +int TVMDeviceFreeDataSpace(DLDevice dev, void* ptr) { API_BEGIN(); - DeviceAPIManager::Get(ctx)->FreeDataSpace(ctx, ptr); + DeviceAPIManager::Get(dev)->FreeDataSpace(dev, ptr); API_END(); } int TVMDeviceCopyDataFromTo(DLTensor* from, DLTensor* to, TVMStreamHandle stream) { API_BEGIN(); - TVMContext ctx_from = from->ctx; - TVMContext ctx_to = to->ctx; - TVMContext ctx = ctx_from.device_type != kDLCPU ? ctx_from : ctx_to; - DeviceAPIManager::Get(ctx)->CopyDataFromTo(from, to, stream); + DLDevice dev_from = from->device; + DLDevice dev_to = to->device; + DLDevice dev = dev_from.device_type != kDLCPU ? dev_from : dev_to; + DeviceAPIManager::Get(dev)->CopyDataFromTo(from, to, stream); API_END(); } // set device api TVM_REGISTER_GLOBAL(tvm::runtime::symbol::tvm_set_device) .set_body([](TVMArgs args, TVMRetValue* ret) { - TVMContext ctx; - ctx.device_type = static_cast(args[0].operator int()); - ctx.device_id = args[1]; - DeviceAPIManager::Get(ctx)->SetDevice(ctx); + DLDevice dev; + dev.device_type = static_cast(args[0].operator int()); + dev.device_id = args[1]; + DeviceAPIManager::Get(dev)->SetDevice(dev); }); // set device api TVM_REGISTER_GLOBAL("runtime.GetDeviceAttr").set_body([](TVMArgs args, TVMRetValue* ret) { - TVMContext ctx; - ctx.device_type = static_cast(args[0].operator int()); - ctx.device_id = args[1]; + DLDevice dev; + dev.device_type = static_cast(args[0].operator int()); + dev.device_id = args[1]; DeviceAttrKind kind = static_cast(args[2].operator int()); if (kind == kExist) { - DeviceAPI* api = DeviceAPIManager::Get(ctx.device_type, true); + DeviceAPI* api = DeviceAPIManager::Get(dev.device_type, true); if (api != nullptr) { - api->GetAttr(ctx, kind, ret); + api->GetAttr(dev, kind, ret); } else { *ret = 0; } } else { - DeviceAPIManager::Get(ctx)->GetAttr(ctx, kind, ret); + DeviceAPIManager::Get(dev)->GetAttr(dev, kind, ret); } }); diff --git a/src/runtime/contrib/arm_compute_lib/acl_allocator.cc b/src/runtime/contrib/arm_compute_lib/acl_allocator.cc index f9a67010e6e2..b843841f5755 100644 --- a/src/runtime/contrib/arm_compute_lib/acl_allocator.cc +++ b/src/runtime/contrib/arm_compute_lib/acl_allocator.cc @@ -30,10 +30,10 @@ namespace contrib { void* ACLAllocator::allocate(size_t size, size_t alignment) { ICHECK_GT(size, 0) << "Cannot allocate size less than or equal to zero"; - return this->device_api_->AllocWorkspace(this->ctx_, size, {}); + return this->device_api_->AllocWorkspace(this->device_, size, {}); } -void ACLAllocator::free(void* ptr) { this->device_api_->FreeWorkspace(this->ctx_, ptr); } +void ACLAllocator::free(void* ptr) { this->device_api_->FreeWorkspace(this->device_, ptr); } std::unique_ptr ACLAllocator::make_region(size_t size, size_t alignment) { @@ -43,7 +43,7 @@ std::unique_ptr ACLAllocator::make_region(size_t siz ACLMemoryRegion::ACLMemoryRegion(size_t size, size_t alignment) : IMemoryRegion(size), ptr_(nullptr) { if (size != 0) { - this->ptr_ = this->device_api_->AllocDataSpace(this->ctx_, size, alignment, {}); + this->ptr_ = this->device_api_->AllocDataSpace(this->device_, size, alignment, {}); } } @@ -56,7 +56,7 @@ ACLMemoryRegion::ACLMemoryRegion(void* ptr, size_t size) ACLMemoryRegion::~ACLMemoryRegion() { if (this->ptr_ != nullptr 
&& !is_subregion_) { - this->device_api_->FreeDataSpace(this->ctx_, this->ptr_); + this->device_api_->FreeDataSpace(this->device_, this->ptr_); } } diff --git a/src/runtime/contrib/arm_compute_lib/acl_allocator.h b/src/runtime/contrib/arm_compute_lib/acl_allocator.h index 49d0d0c764e8..d4e72a73314f 100644 --- a/src/runtime/contrib/arm_compute_lib/acl_allocator.h +++ b/src/runtime/contrib/arm_compute_lib/acl_allocator.h @@ -74,9 +74,9 @@ class ACLAllocator : public arm_compute::IAllocator { private: /*! \brief Always allocate data in the context of the current CPU. */ - const TVMContext ctx_{kDLCPU, 0}; + const Device device_{kDLCPU, 0}; /*! \brief Device API which allows requests for memory from TVM. */ - runtime::DeviceAPI* device_api_ = runtime::DeviceAPI::Get(ctx_); + runtime::DeviceAPI* device_api_ = runtime::DeviceAPI::Get(device_); }; /*! @@ -125,9 +125,9 @@ class ACLMemoryRegion : public arm_compute::IMemoryRegion { /*! \brief A subregion doesn't manage TVM memory so we don't need to free it. */ bool is_subregion_ = false; /*! \brief Always allocate data in the context of the current CPU. */ - const TVMContext ctx_{kDLCPU, 0}; + const Device device_{kDLCPU, 0}; /*! \brief Device API which allows requests for memory from TVM. */ - runtime::DeviceAPI* device_api_ = runtime::DeviceAPI::Get(ctx_); + runtime::DeviceAPI* device_api_ = runtime::DeviceAPI::Get(device_); }; } // namespace contrib diff --git a/src/runtime/contrib/arm_compute_lib/acl_runtime.cc b/src/runtime/contrib/arm_compute_lib/acl_runtime.cc index ed8f6adbd083..6562d1bfc62d 100644 --- a/src/runtime/contrib/arm_compute_lib/acl_runtime.cc +++ b/src/runtime/contrib/arm_compute_lib/acl_runtime.cc @@ -28,7 +28,7 @@ #include "../json/json_node.h" #include "../json/json_runtime.h" -#ifdef TVM_GRAPH_RUNTIME_ARM_COMPUTE_LIB +#ifdef TVM_GRAPH_EXECUTOR_ARM_COMPUTE_LIB #include #include #include @@ -82,7 +82,7 @@ class ACLRuntime : public JSONRuntimeBase { BuildEngine(); } -#ifdef TVM_GRAPH_RUNTIME_ARM_COMPUTE_LIB +#ifdef TVM_GRAPH_EXECUTOR_ARM_COMPUTE_LIB /*! * \brief Unpack inputs and outputs and run inference on a given layer. * @@ -518,12 +518,12 @@ class ACLRuntime : public JSONRuntimeBase { #else void Run() override { LOG(FATAL) << "Cannot call run on Arm Compute Library module without runtime enabled. " - << "Please build with USE_ARM_COMPUTE_LIB_GRAPH_RUNTIME."; + << "Please build with USE_ARM_COMPUTE_LIB_GRAPH_EXECUTOR."; } void BuildEngine() { LOG(WARNING) << "Arm Compute Library engine is not initialized. " - << "Please build with USE_ARM_COMPUTE_LIB_GRAPH_RUNTIME."; + << "Please build with USE_ARM_COMPUTE_LIB_GRAPH_EXECUTOR."; } #endif }; diff --git a/src/runtime/contrib/bnns/bnns_json_runtime.cc b/src/runtime/contrib/bnns/bnns_json_runtime.cc new file mode 100644 index 000000000000..87b01567cd30 --- /dev/null +++ b/src/runtime/contrib/bnns/bnns_json_runtime.cc @@ -0,0 +1,573 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/** + * \file + * \brief Simple JSON runtime for Apple BNNS primitives + */ + +#include +#include +#include + +#include +#include +#include +#include + +#include "../json/json_node.h" +#include "../json/json_runtime.h" +#include "bnns_wrp.h" + +namespace tvm { +namespace runtime { +namespace contrib { + +using namespace ::tvm::runtime; +using namespace ::tvm::runtime::json; +using namespace ::tvm::runtime::contrib::BNNS; + +struct ThreadingConfig { + /** + * Internal parallelism level of a BNNS primitive, specified via the BNNSFilterParameters + * struct. BNNS doesn't provide real control of internal threading, so it may be + * ignored by the BNNS implementation. + * + * Valid values: + * 0 use default num of threads suggested by BNNS implementation + * >0 suggests to use this num of internal BNNS threads + */ + size_t internalConcurrency = 0; + + /** + * TVM level parallelism for BNNS runtime. + * BNNS runtime will split a primitive into a set of independent sub-primitives which + * can be executed in parallel. As a rule the splitting is performed along output + * channels, so the effective shape of the executed primitive is changed. + * + * Valid values: + * 0 do not use graph level threading + * >0 split into this num of primitives + */ + size_t externalConcurrency = 0; +}; + +/** + * Depending on the platform hardware, the optimal ThreadingConfig may differ. + * This function contains a priori knowledge about some Apple platforms + * and their specifics. + * + * @return default ThreadingConfig suggested for this platform + */ +ThreadingConfig getDefaultThreadingConfig() { + // TODO(apeskov): have to implement CPU/iOS version check. + // meanwhile will use {0, 2} stub to utilize big cores of A13/A14 CPU.
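+  // A stub of {0, 2} means: internalConcurrency = 0 (keep the BNNS-suggested
+  // thread count) and externalConcurrency = 2 (split each primitive into two
+  // sub-primitives executed in parallel at the TVM level, see ThreadingConfig).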
+ return {0, 2}; +} + +/** + * Main entry point to BNNS runtime + */ +class BNNSJSONRuntime : public JSONRuntimeBase { + public: + BNNSJSONRuntime(const std::string& symbol_name, const std::string& graph_json, + const Array const_names) + : JSONRuntimeBase(symbol_name, graph_json, const_names) {} + + const char* type_key() const override { return "bnns_json"; } + + void Init(const Array& consts) override { + ICHECK_EQ(consts.size(), const_idx_.size()) + << "The number of input constants must match the number required."; + + SetupConstants(consts); + BindInputsAndOutputs(); + AllocateIntermediateTensors(); + BuildEngine(); + } + + void Run() override { + // Wrap external handler into BNNS tensor representation + auto bind_ext_hdl_to_tensor = [this](uint32_t eid) { + const auto& ext_dlt = *data_entry_[eid]; + auto& bnns_tensor = tensors_eid_[eid]; + bnns_tensor->set_data_hdl(ext_dlt.data); + }; + + // Bind all input/output external data objects into internal abstractions + for (const auto& eid : input_var_eid_) bind_ext_hdl_to_tensor(eid); + for (const auto& out_entity : outputs_) bind_ext_hdl_to_tensor(EntryID(out_entity)); + + // Invoke primitives in topological order + for (const auto& prim : primitives_) prim->execute(); + } + + private: + /** Make corresponding input/output tensor stubs */ + void BindInputsAndOutputs() { + tensors_eid_.resize(data_entry_.size()); + auto createTensor = [&](JSONGraphNodeEntry entry) { + auto node = nodes_[entry.id_]; + auto dlshape = node.GetOpShape()[entry.index_]; + auto dltype = node.GetOpDataType()[entry.index_]; + void* data = nullptr; + if (data_entry_[entry.id_] != nullptr) data = data_entry_[entry.id_]->data; + tensors_eid_[entry.id_] = std::make_shared( + BNNS::Shape{dlshape.begin(), dlshape.end()}, convertToBNNS(dltype), data); + }; + + for (auto& id : input_nodes_) { + auto eid = JSONGraphNodeEntry(id, 0); + createTensor(eid); + } + + for (auto entry : outputs_) { + createTensor(entry); + } + } + + /** Allocate intermediate tensors */ + void AllocateIntermediateTensors() { + for (int i = 0; i < nodes_.size(); ++i) { + auto eid = JSONGraphNodeEntry(i, 0); + if (tensors_eid_[eid.id_] != nullptr) continue; + auto node = nodes_[i]; + auto dlshape = node.GetOpShape()[0]; + auto dltype = node.GetOpDataType()[0]; + tensors_eid_[eid.id_] = std::make_shared( + BNNS::Shape{dlshape.begin(), dlshape.end()}, convertToBNNS(dltype), nullptr); + tensors_eid_[eid.id_]->allocate_memory(); + } + } + + // Build up the engine based on the input graph. + void BuildEngine() { + // Build subgraph engine.
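+    // Each "kernel" node is dispatched on its op name to one of the builders
+    // below (Conv2d / Dense / MatMul / InstanceNormalization / Pooling). Fused
+    // patterns such as "bnns.conv2d_bias_relu" become a single BNNS filter
+    // with the bias and activation folded in.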
+ for (size_t nid = 0; nid < nodes_.size(); ++nid) { + const auto& node = nodes_[nid]; + if (node.GetOpType() == "kernel") { + ICHECK_EQ(node.GetOpType(), "kernel"); + auto op_name = node.GetOpName(); + if ("nn.conv2d" == op_name) { + Conv2d(nid); + } else if ("bnns.conv2d_relu" == op_name) { + Conv2d(nid, false, "relu"); + } else if ("bnns.conv2d_bias_relu" == op_name) { + Conv2d(nid, true, "relu"); + } else if ("bnns.conv2d_sigmoid" == op_name) { + Conv2d(nid, false, "sigmoid"); + } else if ("bnns.conv2d_bias_sigmoid" == op_name) { + Conv2d(nid, true, "sigmoid"); + } else if ("bnns.conv2d_bias" == op_name) { + Conv2d(nid, true); + } else if ("nn.dense" == op_name) { + Dense(nid); + } else if ("bnns.dense_bias" == op_name) { + Dense(nid, true); + } else if ("bnns.dense_bias_gelu" == op_name) { + Dense(nid, true, true); + } else if ("nn.batch_matmul" == op_name) { + MatMul(nid); + } else if ("nn.instance_norm" == op_name) { + InstanceNormalization(nid); + } else if ("nn.max_pool2d" == op_name) { + Pooling(nid, false); + } else if ("nn.avg_pool2d" == op_name) { + Pooling(nid, true); + } else if ("nn.global_max_pool2d" == op_name) { + Pooling(nid, false, true); + } else if ("nn.global_avg_pool2d" == op_name) { + Pooling(nid, true, true); + } else { + LOG(FATAL) << "Unsupported op: " << op_name; + } + } + } + } + + // Get BNNS tensor. + std::shared_ptr GetBNNSTensor(const JSONGraphNodeEntry& entry) { + auto eid = EntryID(entry); + ICHECK(eid < tensors_eid_.size()); + return tensors_eid_[eid]; + } + + void Conv2d(const size_t& nid, const bool has_bias = false, + const std::string activation_type = "none") { + auto node = nodes_[nid]; + + // Setup attributes. + auto src_entry = node.GetInputs()[0]; + auto wgh_entry = node.GetInputs()[1]; + auto dst_entry = JSONGraphNodeEntry(nid, 0); + + auto dl_input_shape = nodes_[src_entry.id_].GetOpShape()[src_entry.index_]; + auto dl_weight_shape = nodes_[wgh_entry.id_].GetOpShape()[wgh_entry.index_]; + BNNS::Shape input_shape{dl_input_shape.begin(), dl_input_shape.end()}; + BNNS::Shape weight_shape{dl_weight_shape.begin(), dl_weight_shape.end()}; + std::vector str_strides = node.GetAttr>("strides"); + std::vector str_dilation = node.GetAttr>("dilation"); + std::vector str_padding = node.GetAttr>("padding"); + BNNS::Dim groups = std::stoi(node.GetAttr>("groups")[0]); + + BNNS::Dim PH_L = std::stoi(str_padding[0]), // height padding: left + PH_R = std::stoi(str_padding[2]), // height padding: right + PW_L = std::stoi(str_padding[1]), // width padding: left + PW_R = std::stoi(str_padding[3]), // width padding: right + SH = std::stoi(str_strides[0]), // height-wise stride + SW = std::stoi(str_strides[1]), // width-wise stride + DH = std::stoi(str_dilation[0]), // height kernel dilation + DW = std::stoi(str_dilation[1]); // width kernel dilation + + // Memory descriptions.
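+    // The views below adapt plain NDArray descriptors to BNNS expectations:
+    // the outer (batch) dimension is moved into a separate batch field and the
+    // remaining dims are declared a CHW image, weights keep the OIHW layout,
+    // and the optional bias is squeezed down to a plain vector.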
+ const auto& src_t = GetBNNSTensor(src_entry); + const auto& wgh_t = GetBNNSTensor(wgh_entry); + const auto& dst_t = GetBNNSTensor(dst_entry); + + auto src_view = TView::as_is(src_t).extract_outer_dim().with_layout(BNNSDataLayoutImageCHW); + auto wgh_view = TView::as_is(wgh_t).with_layout(BNNSDataLayoutConvolutionWeightsOIHW); + auto dst_view = TView::as_is(dst_t).extract_outer_dim().with_layout(BNNSDataLayoutImageCHW); + TView bias_view; + + if (has_bias) { + auto bias_entry = node.GetInputs()[2]; + + auto bias_t = GetBNNSTensor(bias_entry); + bias_view = TView::as_is(bias_t).squeeze().with_layout(BNNSDataLayoutVector); + } + + BNNSActivation activation = {BNNSActivationFunctionIdentity}; + if (activation_type == "relu") + activation = {BNNSActivationFunctionRectifiedLinear}; + else if (activation_type == "sigmoid") + activation = {BNNSActivationFunctionSigmoid}; + + BNNSLayerParametersConvolution conv_param = { + src_view.get_bnns_view(), + wgh_view.get_bnns_view(), + dst_view.get_bnns_view(), + bias_view.get_bnns_view(), + activation, + SW, /* x_stride */ + SH, /* y_stride */ + DW, /* x_dilation_stride */ + DH, /* y_dilation_stride */ + 0, /* x_padding, explicit pads will be used */ + 0, /* y_padding, explicit pads will be used */ + groups, /* groups */ + {PW_L, PW_R, PH_L, PH_R} /* explicit pad values */ + }; + + size_t num_sub_prim = default_thread_config.externalConcurrency; + std::vector params; + std::tie(params, src_view, dst_view) = + split_to_n(num_sub_prim, conv_param, src_view, wgh_view, bias_view, dst_view); + + std::vector filters(params.size(), nullptr); + for (int i = 0; i < params.size(); i++) { + auto common_filter_param = getCommonFilterParams(); + filters[i] = BNNSFilterCreateLayerConvolution(¶ms[i], &common_filter_param); + ICHECK(filters[i]) << "BNNS primitive was not created. Unsupported attributes configuration"; + } + + primitives_.emplace_back(std::make_shared(filters, src_view, dst_view)); + } + + void Dense(const size_t& nid, const bool has_bias = false, const bool has_gelu = false) { + auto node = nodes_[nid]; + + // Setup attributes. + auto src_entry = node.GetInputs()[0]; + auto weight_entry = node.GetInputs()[1]; + auto dst_entry = JSONGraphNodeEntry(nid, 0); + + // Memory descriptions. + auto src_t = GetBNNSTensor(src_entry); + auto wgh_t = GetBNNSTensor(weight_entry); + auto dst_t = GetBNNSTensor(dst_entry); + + auto src_view = TView::as_is(src_t).extract_outer_dim().with_layout(BNNSDataLayoutVector); + auto wgh_view = TView::as_is(wgh_t).with_layout(BNNSDataLayoutRowMajorMatrix); + auto dst_view = TView::as_is(dst_t).extract_outer_dim().with_layout(BNNSDataLayoutVector); + + TView bias_view; + if (has_bias) { + auto bias_entry = node.GetInputs()[2]; + auto bias_md = GetBNNSTensor(bias_entry); + bias_view = TView::as_is(bias_md).with_layout(BNNSDataLayoutVector); + } + + BNNSActivation activation = {BNNSActivationFunctionIdentity}; + if (has_gelu) { + activation = {BNNSActivationFunctionGELUApproximation}; + activation.alpha = std::sqrt(2.0 / M_PI); + activation.beta = 0.044715; + } + + BNNSLayerParametersFullyConnected layerParameters = { + src_view.get_bnns_view(), + wgh_view.get_bnns_view(), + dst_view.get_bnns_view(), + bias_view.get_bnns_view(), + activation, + }; + + auto common_filter_param = getCommonFilterParams(); + auto filter = BNNSFilterCreateLayerFullyConnected(&layerParameters, &common_filter_param); + ICHECK(filter) << "BNNS primitive was not created. 
Unsupported attributes configuration"; + std::vector filters = {filter}; + primitives_.emplace_back(std::make_shared(filters, src_view, dst_view)); + } + + void MatMul(const size_t& nid) { + auto node = nodes_[nid]; + + // Setup attributes. + auto a_entry = node.GetInputs()[0]; + auto b_entry = node.GetInputs()[1]; + auto dst_entry = JSONGraphNodeEntry(nid, 0); + bool a_is_weighted = data_entry_[EntryID(a_entry)] != nullptr; + bool b_is_weighted = data_entry_[EntryID(b_entry)] != nullptr; + + // Memory descriptions. + auto a_t = GetBNNSTensor(a_entry); + auto b_t = GetBNNSTensor(b_entry); + auto dst_t = GetBNNSTensor(dst_entry); + + auto a_view = TView::as_is(a_t); + auto b_view = TView::as_is(b_t); + auto dst_view = TView::as_is(dst_t); + + BNNSLayerParametersBroadcastMatMul layerParameters = {1, // alpha + 0, // beta + false, // transA + true, // transB + false, // quadratic + a_is_weighted, + b_is_weighted, + a_view.get_bnns_view(), + b_view.get_bnns_view(), + dst_view.get_bnns_view()}; + + // BNNS limitation: MatMul uses reversed dims values. However, strides are calculated correctly + // based on BNNSNDArrayDescriptor::layout value. + std::reverse(layerParameters.iA_desc.size, layerParameters.iA_desc.size + 3); + std::reverse(layerParameters.iB_desc.size, layerParameters.iB_desc.size + 3); + std::reverse(layerParameters.o_desc.size, layerParameters.o_desc.size + 3); + + auto common_filter_param = getCommonFilterParams(); + auto filter = BNNSFilterCreateLayerBroadcastMatMul(&layerParameters, &common_filter_param); + ICHECK(filter) << "BNNS primitive was not created. Unsupported attributes configuration"; + + std::vector filters{filter}; + if (a_is_weighted || b_is_weighted) { + auto src_view = a_is_weighted ? b_view : a_view; + primitives_.emplace_back(std::make_shared(filters, src_view, dst_view)); + } else { + primitives_.emplace_back( + std::make_shared(filters, a_view, b_view, dst_view)); + } + } + + void InstanceNormalization(const size_t& nid) { + auto node = nodes_[nid]; + size_t axis = std::stoi(node.GetAttr>("axis")[0]); + float epsilon = std::stof(node.GetAttr>("epsilon")[0]); + bool center = std::stoi(node.GetAttr>("center")[0]); + bool scale = std::stoi(node.GetAttr>("scale")[0]); + + // Setup attributes. + auto src_entry = node.GetInputs()[0]; + auto scale_entry = node.GetInputs()[1]; + auto bias_entry = node.GetInputs()[2]; + auto dst_entry = JSONGraphNodeEntry(nid, 0); + + // Memory descriptions. + auto src_t = GetBNNSTensor(src_entry); + auto scale_t = GetBNNSTensor(scale_entry); + auto bias_t = GetBNNSTensor(bias_entry); + auto dst_t = GetBNNSTensor(dst_entry); + + auto src_view = TView::as_is(src_t); + auto dst_view = TView::as_is(dst_t); + size_t src_rank = Tensor::getRank(src_view.get_bnns_view()); + size_t dst_rank = Tensor::getRank(dst_view.get_bnns_view()); + ICHECK_EQ(src_rank, dst_rank); + ICHECK_LE(src_rank, 4); + if (src_rank < 4) { + src_view = src_view.unsqueeze(4); + dst_view = dst_view.unsqueeze(4); + } + src_view = src_view.extract_outer_dim().with_layout(BNNSDataLayoutImageCHW); + dst_view = dst_view.extract_outer_dim().with_layout(BNNSDataLayoutImageCHW); + auto scale_view = TView::as_is(scale_t).with_layout(BNNSDataLayoutVector); + auto bias_view = TView::as_is(bias_t).with_layout(BNNSDataLayoutVector); + BNNSActivation activation = {BNNSActivationFunctionIdentity}; + + auto b_desc = bias_view.get_bnns_view(); + if (!center) b_desc = {}; + auto s_desc = scale_view.get_bnns_view(); + if (!scale) s_desc = {}; + + // NOTE: Axis option is ignored in BNNS.
The result doesn't depend on the value of axis. + BNNSLayerParametersNormalization layerParameters = {src_view.get_bnns_view(), // i_desc + dst_view.get_bnns_view(), // o_desc + b_desc, // beta_desc + s_desc, // gamma_desc + {}, // moving_mean_desc + {}, // moving_variance_desc + 1.f, // momentum + epsilon, // epsilon + activation, // activation + 1, // num_groups + axis}; // normalization_axis + + BNNSFilterType filter_type = BNNSInstanceNorm; + auto common_filter_param = getCommonFilterParams(); + auto filter = + BNNSFilterCreateLayerNormalization(filter_type, &layerParameters, &common_filter_param); + ICHECK(filter) << "BNNS primitive was not created. Unsupported attributes configuration"; + + std::vector filters{filter}; + primitives_.emplace_back(std::make_shared(filters, src_view, dst_view)); + } + + void Pooling(const size_t& nid, bool avg_pooling, bool global = false) { + auto node = nodes_[nid]; + + auto src_entry = node.GetInputs()[0]; + auto dst_entry = JSONGraphNodeEntry(nid, 0); + + // Memory descriptions. + auto src_t = GetBNNSTensor(src_entry); + auto dst_t = GetBNNSTensor(dst_entry); + + auto src_view = TView::as_is(src_t); + auto dst_view = TView::as_is(dst_t); + size_t src_rank = Tensor::getRank(src_view.get_bnns_view()); + size_t dst_rank = Tensor::getRank(dst_view.get_bnns_view()); + ICHECK_EQ(src_rank, dst_rank); + ICHECK_LE(src_rank, 4); + if (src_rank < 4) { + src_view = src_view.unsqueeze(4); + dst_view = dst_view.unsqueeze(4); + } + src_view = src_view.extract_outer_dim().with_layout(BNNSDataLayoutImageCHW); + dst_view = dst_view.extract_outer_dim().with_layout(BNNSDataLayoutImageCHW); + BNNSActivation activation = {BNNSActivationFunctionIdentity}; + BNNSPoolingFunction pf = {BNNSPoolingFunctionMax}; + if (avg_pooling) pf = {BNNSPoolingFunctionAverageCountExcludePadding}; + + // Setup attributes. + size_t k_height = 0; + size_t k_width = 0; + size_t y_padding = 0; + size_t x_padding = 0; + size_t y_stride = 1; + size_t x_stride = 1; + if (!global) { + std::vector pool_size = node.GetAttr>("pool_size"); + std::vector padding = node.GetAttr>("padding"); + std::vector strides = node.GetAttr>("strides"); + k_height = std::stoi(pool_size[0]); + k_width = std::stoi(pool_size[1]); + y_padding = std::stoi(padding[0]); + x_padding = std::stoi(padding[1]); + y_stride = std::stoi(strides[0]); + x_stride = std::stoi(strides[1]); + } else { + auto sv = src_view.get_bnns_view(); + k_height = sv.size[1]; + k_width = sv.size[0]; + } + + BNNSLayerParametersPooling layerParameters = {src_view.get_bnns_view(), // i_desc + dst_view.get_bnns_view(), // o_desc + {}, // bias + activation, // activation + pf, // pooling_function + k_width, // k_width + k_height, // k_height + x_stride, // x_stride + y_stride, // y_stride + 0, // x_dilation_stride + 0, // y_dilation_stride + x_padding, // x_padding + y_padding, // y_padding + {}}; // pad left, right, up, down padding + + auto common_filter_param = getCommonFilterParams(); + auto filter = BNNSFilterCreateLayerPooling(&layerParameters, &common_filter_param); + ICHECK(filter) << "BNNS primitive was not created.
Unsupported attributes configuration"; + + std::vector filters{filter}; + primitives_.emplace_back(std::make_shared(filters, src_view, dst_view)); + } + + BNNS::Dtype convertToBNNS(const DLDataType& dl_dtype) { + if (dl_dtype.code == DLDataTypeCode::kDLFloat) { + if (dl_dtype.bits == 32) return BNNSDataTypeFloat32; + if (dl_dtype.bits == 16) return BNNSDataTypeFloat16; + } + if (dl_dtype.code == DLDataTypeCode::kDLInt) { + if (dl_dtype.bits == 32) return BNNSDataTypeInt32; + if (dl_dtype.bits == 16) return BNNSDataTypeInt16; + if (dl_dtype.bits == 8) return BNNSDataTypeInt8; + } + if (dl_dtype.code == DLDataTypeCode::kDLUInt) { + if (dl_dtype.bits == 32) return BNNSDataTypeUInt32; + if (dl_dtype.bits == 16) return BNNSDataTypeUInt16; + if (dl_dtype.bits == 8) return BNNSDataTypeUInt8; + } + LOG(FATAL) << "Unsupported data type for BNNS runtime"; + return BNNS::Dtype(0); + } + + BNNSFilterParameters getCommonFilterParams() { + // NOTE: To force a weights tensor copy at the filter create stage + // just change: BNNSFlagsUseClientPtr -> 0 + return {BNNSFlagsUseClientPtr, default_thread_config.internalConcurrency}; + } + + /** Default threading config. Should be used if there is + * no other threading specifier. */ + const ThreadingConfig default_thread_config = getDefaultThreadingConfig(); + + /** Collection of all primitives in topological order */ + std::vector> primitives_; + + /** Vector with BNNS tensors. Index of tensor matches the + * corresponding EntryID from base JSONRuntimeBase. */ + std::vector tensors_eid_; +}; + +runtime::Module BNNSJSONRuntimeCreate(String symbol_name, String graph_json, + const Array& const_names) { + auto n = make_object(symbol_name, graph_json, const_names); + return runtime::Module(n); +} + +TVM_REGISTER_GLOBAL("runtime.BNNSJSONRuntimeCreate").set_body_typed(BNNSJSONRuntimeCreate); + +TVM_REGISTER_GLOBAL("runtime.module.loadbinary_bnns_json") + .set_body_typed(BNNSJSONRuntime::LoadFromBinary); + +} // namespace contrib +} // namespace runtime +} // namespace tvm diff --git a/src/runtime/contrib/bnns/bnns_wrp.h b/src/runtime/contrib/bnns/bnns_wrp.h new file mode 100644 index 000000000000..b31e97e554da --- /dev/null +++ b/src/runtime/contrib/bnns/bnns_wrp.h @@ -0,0 +1,495 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License.
+ */ + +/** + * \file + * \brief C++ wrappers and helpers to handle BNNS objects + */ + +#ifndef TVM_RUNTIME_CONTRIB_BNNS_BNNS_WRP_H_ +#define TVM_RUNTIME_CONTRIB_BNNS_BNNS_WRP_H_ + +#include + +#include +#include +#include +#include +#include +#include + +namespace tvm { +namespace runtime { +namespace contrib { +namespace BNNS { + +using Dim = size_t; +using Shape = std::vector; +using Dtype = BNNSDataType; +using HDL = void*; + +void* default_alloc(size_t size) { return malloc(size); } + +void default_free(void* ptr) { free(ptr); } + +/** + * Main abstraction for tensor representation + * + * Contains the buffer handler and common attributes like shape and dtype. + */ +class Tensor { + public: + Tensor() = delete; + Tensor(Tensor&) = delete; + + Tensor(Shape shape, Dtype dtype, void* hdl) { + auto rank = shape.size(); + ICHECK(rank < BNNS_MAX_TENSOR_DIMENSION); + + desc_ = {BNNSNDArrayFlags(0), + getPlainLayout(rank), + {}, // shape + {}, // strides + hdl, // data handler + dtype, // data type + nullptr, // table_data (clustering case), is not used + dtype, + 1.f, + 0.f}; + std::copy(shape.rbegin(), shape.rend(), std::begin(desc_.size)); + + desc_.data = hdl; + is_external_data = true; + } + + ~Tensor() { + if (desc_.data && !is_external_data) { + default_free(desc_.data); + desc_.data = nullptr; + } + } + + void allocate_memory() { + if (desc_.data && !is_external_data) { + default_free(desc_.data); + } + const size_t buff_size = getSize(desc_) * getElementSize(desc_); + desc_.data = default_alloc(buff_size); + ICHECK(desc_.data); + is_external_data = false; + } + + void* get_data_hdl() const { return desc_.data; } + + void set_data_hdl(void* hdl) { + if (desc_.data && !is_external_data) { + default_free(desc_.data); + desc_.data = nullptr; + } + + desc_.data = hdl; + is_external_data = true; + } + + const BNNSNDArrayDescriptor& get_desc() const { return desc_; } + + static BNNSDataLayout getPlainLayout(size_t rank) { + ICHECK(rank <= BNNS_MAX_TENSOR_DIMENSION); + return static_cast((rank << 16) | 0x8001); + } + + static size_t getRank(BNNSDataLayout layout) { return (layout & 0xF0000) >> 16; } + + static size_t getRank(BNNSNDArrayDescriptor desc) { return getRank(desc.layout); } + + static size_t getSize(BNNSNDArrayDescriptor desc) { + auto rank = getRank(desc); + return std::accumulate(desc.size, desc.size + rank, 1, std::multiplies()); + } + + /** return size of element in bytes */ + static size_t getElementSize(Dtype dtype) { return (dtype & 0xFFFF) / 8; } + + /** return size of element in bytes */ + static size_t getElementSize(const BNNSNDArrayDescriptor& desc) { + return getElementSize(desc.data_type); + } + + private: + bool is_external_data = false; + BNNSNDArrayDescriptor desc_; +}; + +using TensorPtr = std::shared_ptr; + +/** + * Tensor View object which represents how a provided BNNS::Tensor will be considered + * + * A single BNNS::Tensor can be treated in different forms depending on particular primitive + * expectations. Moreover, some primitives support only an external form of batching. So we have + * some abstraction to describe how a primitive will handle a provided tensor.
+ * + * Batched View + * View with an extracted dimension as an external batch value + * example: Tensor [2, 3, 224, 224] -> View [3, 224, 224] with ext batch 2 + * + * Party View + * A collection of views on the same tensor; each can be the same view or offset by some stride + * example: Tensor [6, 5, 3, 3] -> 3 x View [2, 5, 3, 3] with stride 45 + */ +class TView { + public: + /** Make a view on the provided tensor as is */ + static TView as_is(const TensorPtr& origin) { + TView res; + res.origin_ = origin; + res.view_desc_ = origin->get_desc(); + return res; + } + + /** Extract outer dimension to separate batch field. The TView will become a batched view */ + TView extract_outer_dim() const { + auto rank = Tensor::getRank(view_desc_); + TView res = *this; + res.batch_size_ = view_desc_.size[rank - 1]; + res.batch_stride_ = + std::accumulate(view_desc_.size, view_desc_.size + rank - 1, 1, std::multiplies<>()); + res.view_desc_.size[rank - 1] = 0; + res.view_desc_.layout = Tensor::getPlainLayout(rank - 1); + return res; + } + + /** Squeeze all dims equal to 1 */ + TView squeeze(size_t min_rank = 1) const { + auto rank = Tensor::getRank(view_desc_); + size_t squeezed_shape[BNNS_MAX_TENSOR_DIMENSION] = {}; + size_t squeezed_rank = 0; + for (int i = 0; i < rank; i++) + if (view_desc_.size[i] != 1) squeezed_shape[squeezed_rank++] = view_desc_.size[i]; + + if (min_rank > squeezed_rank) { + std::fill(squeezed_shape + squeezed_rank, squeezed_shape + min_rank, 1); + squeezed_rank = min_rank; + } + + TView res = *this; + std::copy(squeezed_shape, squeezed_shape + squeezed_rank, res.view_desc_.size); + std::fill(res.view_desc_.size + squeezed_rank, res.view_desc_.size + rank, 0); + res.view_desc_.layout = Tensor::getPlainLayout(squeezed_rank); + return res; + } + + /** Expand the shape of an array */ + TView expand_dims(std::vector axes) const { + auto rank = Tensor::getRank(view_desc_); + TView res = *this; + size_t unsqueezed_shape[BNNS_MAX_TENSOR_DIMENSION] = {}; + size_t unsqueezed_rank = axes.size() + rank; + ICHECK_LE(unsqueezed_rank, BNNS_MAX_TENSOR_DIMENSION); + for (const auto& axis : axes) { + ICHECK_LT(axis, unsqueezed_rank); + unsqueezed_shape[axis] = 1; + } + for (int i = 0, orig_idx = 0; i < unsqueezed_rank; ++i) { + if (unsqueezed_shape[i] == 1) continue; + unsqueezed_shape[i] = view_desc_.size[orig_idx++]; + } + std::copy(unsqueezed_shape, unsqueezed_shape + unsqueezed_rank, res.view_desc_.size); + res.view_desc_.layout = Tensor::getPlainLayout(unsqueezed_rank); + return res; + } + + /** Unsqueeze tensor to a new rank */ + TView unsqueeze(size_t new_rank) const { + ICHECK_LE(new_rank, BNNS_MAX_TENSOR_DIMENSION); + auto rank = Tensor::getRank(view_desc_); + ICHECK_GT(new_rank, rank); + std::vector axes(new_rank - rank); + std::iota(axes.begin(), axes.end(), rank); + return expand_dims(axes); + } + + /** Construct new TView with specified layout if it is applicable */ + TView with_layout(BNNSDataLayout layout) const { + ICHECK_EQ(Tensor::getRank(view_desc_), Tensor::getRank(layout)); + + TView res = *this; + res.view_desc_.layout = layout; + return res; + } + + /** Construct party TView by splitting original TView into num parts */ + TView party_split_n(size_t num) const { + ICHECK_EQ(party_size_, 1); + + TView res = *this; + size_t rank = Tensor::getRank(view_desc_); + size_t size = Tensor::getSize(view_desc_); + res.party_size_ = num; + res.party_stride_ = size / num; + + if (res.batch_size_ != 1) { + res.batch_size_ /= num; + } else { + res.view_desc_.size[rank - 1] /= num; + res.batch_stride_ /= num; + } + return
res; + } + + /** Construct party TView by duplicating original TView num times */ + TView party_duplicate_n(size_t num) const { + ICHECK_EQ(party_size_, 1); + + TView res = *this; + res.party_size_ = num; + res.party_stride_ = 0; + + return res; + } + + /** Return data buffer handler */ + HDL get_data_hdl() const { return view_desc_.data; } + + /** Return external batch dimension value */ + size_t get_batch_size() const { return batch_size_; } + + /** Return external batch dimension stride */ + size_t get_stride() const { return batch_stride_; } + + /** Return party element by index */ + TView operator[](size_t i) const { + ICHECK_LT(i, party_size_); + + TView res = *this; + res.party_size_ = 1; + if (origin_) { + auto hdl = reinterpret_cast(origin_->get_data_hdl()); + hdl += i * party_stride_ * Tensor::getElementSize(view_desc_.data_type); + res.view_desc_.data = hdl; + } + return res; + } + + /** Check if view is empty and doesn't refer to any tensor */ + operator bool() const { return origin_ != nullptr; } + + /** Get BNNS descriptor for particular View. Batch and Party attributes are ignored. */ + const BNNSNDArrayDescriptor& get_bnns_view() const { return view_desc_; } + + private: + /** Original tensor object to view on */ + TensorPtr origin_; + + /** Batched view parameters */ + BNNSNDArrayDescriptor view_desc_ = {}; + size_t batch_size_ = 1; + size_t batch_stride_ = 0; + + /** Party representation parameters */ + size_t party_size_ = 1; + size_t party_stride_ = 0; +}; + +/** + * Wrapper on top of BNNSFilter and src/dst TensorView. + * + * Supports a decomposed representation of the filter and can execute sub-primitives in parallel. + */ +class Primitive { + public: + Primitive(const std::vector fs, const TView& src, const TView& dst) + : filters(fs), src_view(src), dst_view(dst) {} + + virtual ~Primitive() { + for (auto& filter : filters) + if (filter) { + BNNSFilterDestroy(filter); + filter = nullptr; + } + } + + /** Execute primitive using the specified src/dst */ + void execute() { + auto res = TVMBackendParallelLaunch(run_task, this, filters.size()); + ICHECK_EQ(res, 0) << "BNNS runtime. Primitive was not executed properly"; + } + + private: + virtual int execute_impl(int part_idx) { + const auto filter = this->filters[part_idx]; + const auto src_view = this->src_view[part_idx]; + const auto dst_view = this->dst_view[part_idx]; + + size_t mb = src_view.get_batch_size(); + + // NB! BNNS limitations + // * Do not use simple BNNSFilterApply. There is a bug inside BNNS, + // BNNSFilterApply doesn't work for grouped convolution. + // * Group convolution doesn't support arbitrary stride for Batch dim. + // The tensor should be dense. + return BNNSFilterApplyBatch(filter, mb, src_view.get_data_hdl(), src_view.get_stride(), + dst_view.get_data_hdl(), dst_view.get_stride()); + } + + static int run_task(int task_id, TVMParallelGroupEnv* penv, void* cdata) { + auto prim = reinterpret_cast(cdata); + return prim->execute_impl(task_id); + } + + protected: + /** Collection of BNNS kernels/filters which will execute the primitive */ + std::vector filters = {}; + const TView src_view; + const TView dst_view; +}; + +/** + * Wrapper on top of BNNS::Primitive + * + * This primitive should be used for executing a primitive with two inputs.
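+ * For example, nn.batch_matmul with two non-constant operands is lowered to
+ * this primitive (see MatMul() in bnns_json_runtime.cc).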
+ */ +class TwoInputPrimitive : public Primitive { + public: + TwoInputPrimitive(const std::vector fs, const TView& src, const TView& src2, + const TView& dst) + : Primitive(fs, src, dst), src2_view(src2) {} + + private: + int execute_impl(int task_id) override { + const auto filter = this->filters[task_id]; + const auto src_view = this->src_view[task_id]; + const auto src2_view = this->src2_view[task_id]; + const auto dst_view = this->dst_view[task_id]; + + size_t mb = src_view.get_batch_size(); + + return BNNSFilterApplyTwoInputBatch(filter, mb, src_view.get_data_hdl(), src_view.get_stride(), + src2_view.get_data_hdl(), src2_view.get_stride(), + dst_view.get_data_hdl(), dst_view.get_stride()); + } + + protected: + const TView src2_view; +}; + +/** + * Wrapper on top of BNNS::Primitive + * + * This primitive should be used for executing a normalization filter + */ +class NormPrimitive : public Primitive { + public: + using Primitive::Primitive; + + private: + int execute_impl(int task_id) override { + const auto filter = this->filters[task_id]; + const auto src_view = this->src_view[task_id]; + const auto dst_view = this->dst_view[task_id]; + + size_t mb = src_view.get_batch_size(); + return BNNSNormalizationFilterApplyBatch(filter, mb, src_view.get_data_hdl(), + src_view.get_stride(), dst_view.get_data_hdl(), + dst_view.get_stride(), false); + } +}; + +/** + * Wrapper on top of BNNS::Primitive + * + * This primitive should be used for executing a pooling filter + */ +class PoolingPrimitive : public Primitive { + public: + using Primitive::Primitive; + + private: + int execute_impl(int task_id) override { + const auto filter = this->filters[task_id]; + const auto src_view = this->src_view[task_id]; + const auto dst_view = this->dst_view[task_id]; + + size_t mb = src_view.get_batch_size(); + return BNNSPoolingFilterApplyBatch(filter, mb, src_view.get_data_hdl(), src_view.get_stride(), + dst_view.get_data_hdl(), dst_view.get_stride(), nullptr, 0); + } +}; + +/** + * Function which splits a primitive into sub-primitives for parallel execution + * + * @param num requested num of sub-primitives to split into + * @param orig_conv_param original convolution descriptor + * @param src_view source tensor view + * @param wgh_view weight tensor view + * @param b_view bias tensor view + * @param dst_view destination tensor view + * @return collection of Convolution descriptors plus corresponding src/dst tensor views + */ +static std::tuple, TView, TView> split_to_n( + size_t num, const BNNSLayerParametersConvolution& orig_conv_param, const TView& src_view, + const TView& wgh_view, const TView& b_view, const TView& dst_view) { + size_t batch = src_view.get_batch_size(); + size_t oc = dst_view.get_bnns_view().size[2]; + size_t groups = orig_conv_param.groups; + + BNNS::TView src_view_new; + BNNS::TView wgh_view_new; + BNNS::TView b_view_new; + BNNS::TView dst_view_new; + + // TODO(apeskov): Add split by batch dim. Meanwhile we just disable it...
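+  // Splitting is declined (the original un-split convolution is returned) when
+  // the batch is larger than one, the output channels do not divide evenly by
+  // num, or a grouped convolution cannot be partitioned group-wise.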
+ if (batch > 1 || oc % num != 0 || (groups > 1 && groups % num != 0)) { + return {{orig_conv_param}, src_view, dst_view}; + } + + // if groups > 1 split only by groups + // otherwise split inside one convolution by output channels + if (groups > 1) { + src_view_new = src_view.party_split_n(num); + groups = groups / num; + } else { + src_view_new = src_view.party_duplicate_n(num); + } + + wgh_view_new = wgh_view.party_split_n(num); + b_view_new = b_view.party_split_n(num); + dst_view_new = dst_view.party_split_n(num); + + std::vector res(num); + for (size_t i = 0; i < num; i++) { + auto& cur = res[i]; + cur = orig_conv_param; + + cur.i_desc = src_view_new[i].get_bnns_view(); + cur.o_desc = dst_view_new[i].get_bnns_view(); + cur.w_desc = wgh_view_new[i].get_bnns_view(); + cur.bias = b_view_new[i].get_bnns_view(); + cur.groups = groups; + } + return {res, src_view_new, dst_view_new}; +} + +} // namespace BNNS +} // namespace contrib +} // namespace runtime +} // namespace tvm +#endif // TVM_RUNTIME_CONTRIB_BNNS_BNNS_WRP_H_ diff --git a/src/runtime/contrib/cblas/cblas.cc b/src/runtime/contrib/cblas/cblas.cc index 16496e06aae3..fbac6222488d 100644 --- a/src/runtime/contrib/cblas/cblas.cc +++ b/src/runtime/contrib/cblas/cblas.cc @@ -21,8 +21,8 @@ * \file Use external cblas library call. */ #include +#include #include -#include extern "C" { #include diff --git a/src/runtime/contrib/cblas/mkl.cc b/src/runtime/contrib/cblas/mkl.cc index 273aa45367dd..4323878db276 100644 --- a/src/runtime/contrib/cblas/mkl.cc +++ b/src/runtime/contrib/cblas/mkl.cc @@ -21,8 +21,8 @@ * \file Use external mkl library call. */ #include +#include #include -#include extern "C" { #include diff --git a/src/runtime/contrib/cblas/mkldnn.cc b/src/runtime/contrib/cblas/mkldnn.cc index 1c3fa023dcc7..31abd317c6a4 100644 --- a/src/runtime/contrib/cblas/mkldnn.cc +++ b/src/runtime/contrib/cblas/mkldnn.cc @@ -21,8 +21,8 @@ * \file Use external cblas library call. */ #include +#include #include -#include extern "C" { #include diff --git a/src/runtime/contrib/coreml/coreml_runtime.mm b/src/runtime/contrib/coreml/coreml_runtime.mm index 18d4f735a55e..5aef10ed8adf 100644 --- a/src/runtime/contrib/coreml/coreml_runtime.mm +++ b/src/runtime/contrib/coreml/coreml_runtime.mm @@ -98,11 +98,11 @@ LOG(FATAL) << "unexpected data type " << data_desc.dataType; } MLMultiArray* src = [output_ featureValueForName:name].multiArrayValue; - TVMContext cpu_ctx = { + Device cpu_dev = { .device_type = kDLCPU, .device_id = 0, }; - NDArray ret = NDArray::Empty(shape, dtype, cpu_ctx); + NDArray ret = NDArray::Empty(shape, dtype, cpu_dev); ret.CopyFromBytes(src.dataPointer, size); return ret; diff --git a/src/runtime/contrib/cublas/cublas.cc b/src/runtime/contrib/cublas/cublas.cc index b12992f57159..9af1602cf3c0 100644 --- a/src/runtime/contrib/cublas/cublas.cc +++ b/src/runtime/contrib/cublas/cublas.cc @@ -21,8 +21,8 @@ * \file Use external cblas library call. 
*/ #include +#include #include -#include #include "../cblas/gemm_common.h" #include "cublas_utils.h" diff --git a/src/runtime/contrib/cublas/cublas_utils.h b/src/runtime/contrib/cublas/cublas_utils.h index 32c3b03ddbb0..3edb8300be88 100644 --- a/src/runtime/contrib/cublas/cublas_utils.h +++ b/src/runtime/contrib/cublas/cublas_utils.h @@ -28,7 +28,7 @@ #include #include #include -#include +#include #include #if CUDART_VERSION >= 10010 diff --git a/src/runtime/contrib/cudnn/conv_forward.cc b/src/runtime/contrib/cudnn/conv_forward.cc index 223a5b4fe435..ad3b959338bb 100644 --- a/src/runtime/contrib/cudnn/conv_forward.cc +++ b/src/runtime/contrib/cudnn/conv_forward.cc @@ -41,8 +41,8 @@ void ConvolutionForward(int mode, int format, int algo, int dims, int groups, co entry_ptr->conv_entry.tensor_format = static_cast(format); // Set Algo entry_ptr->conv_entry.fwd_algo = static_cast(algo); - // Set Ctx - entry_ptr->conv_entry.ctx = x->ctx; + // Set Device + entry_ptr->conv_entry.device = x->device; // Set Data Type entry_ptr->conv_entry.data_type = CuDNNDataType::DLTypeToCuDNNType(String2DLDataType(conv_dtype)); cudnnDataType_t data_type = CuDNNDataType::DLTypeToCuDNNType(x->dtype); diff --git a/src/runtime/contrib/cudnn/cudnn_utils.cc b/src/runtime/contrib/cudnn/cudnn_utils.cc index cd934bcb7081..006064e57a19 100644 --- a/src/runtime/contrib/cudnn/cudnn_utils.cc +++ b/src/runtime/contrib/cudnn/cudnn_utils.cc @@ -133,12 +133,12 @@ void ConvEntry::UpdateWorkspace(const size_t wsize) { CleanWorkspace(); } workspace_size = wsize; - workspace = cuda_api->AllocWorkspace(ctx, workspace_size); + workspace = cuda_api->AllocWorkspace(device, workspace_size); } } void ConvEntry::CleanWorkspace() { - if (workspace) cuda_api->FreeWorkspace(ctx, workspace); + if (workspace) cuda_api->FreeWorkspace(device, workspace); workspace_size = 0; } diff --git a/src/runtime/contrib/cudnn/cudnn_utils.h b/src/runtime/contrib/cudnn/cudnn_utils.h index 528298b75187..72380b64121a 100644 --- a/src/runtime/contrib/cudnn/cudnn_utils.h +++ b/src/runtime/contrib/cudnn/cudnn_utils.h @@ -26,7 +26,7 @@ #include #include -#include +#include #include "../../cuda/cuda_common.h" @@ -72,7 +72,7 @@ struct ConvEntry { cudnnTensorDescriptor_t output_desc; cudnnConvolutionFwdAlgo_t fwd_algo; // cudnnMathType_t math_type; - TVMContext ctx; + Device device; runtime::DeviceAPI* cuda_api; void* workspace{nullptr}; size_t workspace_size{0}; diff --git a/src/runtime/contrib/edgetpu/edgetpu_runtime.cc b/src/runtime/contrib/edgetpu/edgetpu_runtime.cc index 13b3c34a6b17..af6b3cc42bfd 100644 --- a/src/runtime/contrib/edgetpu/edgetpu_runtime.cc +++ b/src/runtime/contrib/edgetpu/edgetpu_runtime.cc @@ -31,7 +31,7 @@ namespace tvm { namespace runtime { -void EdgeTPURuntime::Init(const std::string& tflite_model_bytes, TVMContext ctx) { +void EdgeTPURuntime::Init(const std::string& tflite_model_bytes, Device dev) { const char* buffer = tflite_model_bytes.c_str(); size_t buffer_size = tflite_model_bytes.size(); // Load compiled model as a FlatBufferModel @@ -53,12 +53,12 @@ void EdgeTPURuntime::Init(const std::string& tflite_model_bytes, TVMContext ctx) status = interpreter_->AllocateTensors(); CHECK_TFLITE_STATUS(status) << "Failed to allocate tensors."; - ctx_ = ctx; + device_ = dev; } -Module EdgeTPURuntimeCreate(const std::string& tflite_model_bytes, TVMContext ctx) { +Module EdgeTPURuntimeCreate(const std::string& tflite_model_bytes, Device dev) { auto exec = make_object(); - exec->Init(tflite_model_bytes, ctx); + exec->Init(tflite_model_bytes, dev); 
return Module(exec); } diff --git a/src/runtime/contrib/edgetpu/edgetpu_runtime.h b/src/runtime/contrib/edgetpu/edgetpu_runtime.h index af3517ba76f3..a7a57ff422e3 100644 --- a/src/runtime/contrib/edgetpu/edgetpu_runtime.h +++ b/src/runtime/contrib/edgetpu/edgetpu_runtime.h @@ -25,6 +25,8 @@ #ifndef TVM_RUNTIME_CONTRIB_EDGETPU_EDGETPU_RUNTIME_H_ #define TVM_RUNTIME_CONTRIB_EDGETPU_EDGETPU_RUNTIME_H_ +#include + #include #include @@ -47,11 +49,11 @@ class EdgeTPURuntime : public TFLiteRuntime { const char* type_key() const final { return "EdgeTPURuntime"; } /*! - * \brief Initialize the edge TPU tflite runtime with tflite model and context. + * \brief Initialize the edge TPU tflite runtime with tflite model and device. * \param tflite_model_bytes The tflite model. - * \param ctx The context where the tflite model will be executed on. + * \param dev The device where the tflite model will be executed on. */ - void Init(const std::string& tflite_model_bytes, TVMContext ctx); + void Init(const std::string& tflite_model_bytes, Device dev); private: std::shared_ptr edgetpu_context_; diff --git a/src/runtime/contrib/ethosn/ethosn_device.cc b/src/runtime/contrib/ethosn/ethosn_device.cc index c51c4288b709..0ffbee29032b 100644 --- a/src/runtime/contrib/ethosn/ethosn_device.cc +++ b/src/runtime/contrib/ethosn/ethosn_device.cc @@ -190,7 +190,8 @@ TVM_REGISTER_GLOBAL("relay.ethos-n.test.infra.inference_result") for (int argc = 0; argc < args.size(); argc++) { const DLTensor* tensor = args[argc]; auto shape = std::vector(tensor->shape, tensor->shape + tensor->ndim); - test_outputs.emplace_back(tvm::runtime::NDArray::Empty(shape, tensor->dtype, tensor->ctx)); + test_outputs.emplace_back( + tvm::runtime::NDArray::Empty(shape, tensor->dtype, tensor->device)); test_outputs[test_outputs.size() - 1].CopyFrom(tensor); } }); diff --git a/src/runtime/contrib/miopen/conv_forward.cc b/src/runtime/contrib/miopen/conv_forward.cc index 1353e2f996bb..139801feef15 100644 --- a/src/runtime/contrib/miopen/conv_forward.cc +++ b/src/runtime/contrib/miopen/conv_forward.cc @@ -24,6 +24,8 @@ #include #include +#include + #include "miopen_utils.h" namespace tvm { @@ -57,8 +59,8 @@ TVM_REGISTER_GLOBAL("tvm.contrib.miopen.conv2d.setup").set_body([](TVMArgs args, if (n_group > 1) assert(mode > 1 && "Group /Depthwise Conv mode when num of groups > 1"); // Set Mode entry_ptr->conv_entry.mode = static_cast(mode); - // Set Ctx - entry_ptr->conv_entry.ctx = TVMContext{kDLROCM, 0}; + // Set Device + entry_ptr->conv_entry.device = Device{kDLROCM, 0}; // Set Data Type entry_ptr->conv_entry.data_type = static_cast(dtype); // MIOpen supports fp32(miopenFloat), fp16(miopenHalf), @@ -104,11 +106,11 @@ TVM_REGISTER_GLOBAL("tvm.contrib.miopen.conv2d.setup").set_body([](TVMArgs args, runtime::DeviceAPI* rocm_api = entry_ptr->conv_entry.rocm_api; float* input_buf = static_cast( - rocm_api->AllocWorkspace(entry_ptr->conv_entry.ctx, input_size * sizeof(float))); + rocm_api->AllocWorkspace(entry_ptr->conv_entry.device, input_size * sizeof(float))); float* filter_buf = static_cast( - rocm_api->AllocWorkspace(entry_ptr->conv_entry.ctx, filter_size * sizeof(float))); + rocm_api->AllocWorkspace(entry_ptr->conv_entry.device, filter_size * sizeof(float))); float* output_buf = static_cast( - rocm_api->AllocWorkspace(entry_ptr->conv_entry.ctx, output_size * sizeof(float))); + rocm_api->AllocWorkspace(entry_ptr->conv_entry.device, output_size * sizeof(float))); const int request_algo_count = 4; const bool exhaustive_search = false; @@ -123,9 +125,9 @@ 
TVM_REGISTER_GLOBAL("tvm.contrib.miopen.conv2d.setup").set_body([](TVMArgs args, entry_ptr->conv_entry.output_desc, output_buf, request_algo_count, &returned_algo_count, perfs, workspace, workspace_size, exhaustive_search)); - rocm_api->FreeWorkspace(entry_ptr->conv_entry.ctx, input_buf); - rocm_api->FreeWorkspace(entry_ptr->conv_entry.ctx, filter_buf); - rocm_api->FreeWorkspace(entry_ptr->conv_entry.ctx, output_buf); + rocm_api->FreeWorkspace(entry_ptr->conv_entry.device, input_buf); + rocm_api->FreeWorkspace(entry_ptr->conv_entry.device, filter_buf); + rocm_api->FreeWorkspace(entry_ptr->conv_entry.device, output_buf); const std::vector fwd_algo_names{ "miopenConvolutionFwdAlgoGEMM", @@ -164,8 +166,8 @@ TVM_REGISTER_GLOBAL("tvm.contrib.miopen.conv2d.forward") entry_ptr->conv_entry.fwd_algo = static_cast(algo); // Set Mode entry_ptr->conv_entry.mode = static_cast(mode); - // Set Ctx - entry_ptr->conv_entry.ctx = x->ctx; + // Set Device + entry_ptr->conv_entry.device = x->device; // Set Data Type entry_ptr->conv_entry.data_type = static_cast(dtype); // MIOpen supports fp32(miopenFloat), diff --git a/src/runtime/contrib/miopen/miopen_utils.cc b/src/runtime/contrib/miopen/miopen_utils.cc index a57918045d87..426d2f24ddf5 100644 --- a/src/runtime/contrib/miopen/miopen_utils.cc +++ b/src/runtime/contrib/miopen/miopen_utils.cc @@ -80,12 +80,12 @@ void ConvEntry::UpdateWorkspace(const size_t wsize) { CleanWorkspace(); } workspace_size = wsize; - workspace = rocm_api->AllocWorkspace(ctx, workspace_size); + workspace = rocm_api->AllocWorkspace(device, workspace_size); } } void ConvEntry::CleanWorkspace() { - if (workspace) rocm_api->FreeWorkspace(ctx, workspace); + if (workspace) rocm_api->FreeWorkspace(device, workspace); workspace_size = 0; } diff --git a/src/runtime/contrib/miopen/miopen_utils.h b/src/runtime/contrib/miopen/miopen_utils.h index 9982f0914f6b..d3a8c7b9ad64 100644 --- a/src/runtime/contrib/miopen/miopen_utils.h +++ b/src/runtime/contrib/miopen/miopen_utils.h @@ -26,7 +26,7 @@ #include #include -#include +#include #include @@ -52,7 +52,7 @@ struct ConvEntry { miopenTensorDescriptor_t input_desc; miopenTensorDescriptor_t output_desc; miopenConvFwdAlgorithm_t fwd_algo; - TVMContext ctx; + Device device; runtime::DeviceAPI* rocm_api; void* workspace{nullptr}; size_t workspace_size{0}; diff --git a/src/runtime/contrib/mps/conv.mm b/src/runtime/contrib/mps/conv.mm index b860ee29bdf5..84ca5e10f980 100644 --- a/src/runtime/contrib/mps/conv.mm +++ b/src/runtime/contrib/mps/conv.mm @@ -31,10 +31,10 @@ id mtlbuf = (__bridge id)(buf->data); MetalThreadEntry* entry_ptr = MetalThreadEntry::ThreadLocal(); runtime::metal::MetalThreadEntry* rt = runtime::metal::MetalThreadEntry::ThreadLocal(); - id dev = entry_ptr->metal_api->GetDevice(buf->ctx); - id temp = rt->GetTempBuffer(buf->ctx, [mtlbuf length]); + id dev = entry_ptr->metal_api->GetDevice(buf->device); + id temp = rt->GetTempBuffer(buf->device, [mtlbuf length]); entry_ptr->metal_api->CopyDataFromTo((__bridge void*)mtlbuf, 0, (__bridge void*)temp, 0, - [mtlbuf length], buf -> ctx, buf -> ctx, buf -> dtype, + [mtlbuf length], buf -> device, buf -> device, buf -> dtype, nullptr); MPSImageDescriptor* desc = @@ -63,14 +63,14 @@ MPSImage* mpsimg = (__bridge MPSImage*)(img->data); MetalThreadEntry* entry_ptr = MetalThreadEntry::ThreadLocal(); runtime::metal::MetalThreadEntry* rt = runtime::metal::MetalThreadEntry::ThreadLocal(); - id temp = rt->GetTempBuffer(buf->ctx, [mtlbuf length]); + id temp = rt->GetTempBuffer(buf->device, [mtlbuf length]); 
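// The MIOpen hunks above build the device value in place; Device is just the
// DLPack DLDevice struct, so only the type name changes relative to the old
// TVMContext spelling. A minimal sketch (hypothetical helper, assuming the
// public DeviceAPI::AllocWorkspace signature used above):

#include <tvm/runtime/device_api.h>

float* AllocRocmScratchSketch(tvm::runtime::DeviceAPI* rocm_api, size_t n) {
  Device dev{kDLROCM, 0};  // was: TVMContext{kDLROCM, 0}
  return static_cast<float*>(rocm_api->AllocWorkspace(dev, n * sizeof(float)));
}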
[mpsimg readBytes:[temp contents] dataLayout:MPSDataLayoutHeightxWidthxFeatureChannels imageIndex:0]; entry_ptr->metal_api->CopyDataFromTo((__bridge void*)temp, 0, (__bridge void*)mtlbuf, 0, - [mtlbuf length], buf -> ctx, buf -> ctx, buf -> dtype, + [mtlbuf length], buf -> device, buf -> device, buf -> dtype, nullptr); }); @@ -102,8 +102,8 @@ // Get Metal device API MetalThreadEntry* entry_ptr = MetalThreadEntry::ThreadLocal(); runtime::metal::MetalThreadEntry* rt = runtime::metal::MetalThreadEntry::ThreadLocal(); - id dev = entry_ptr->metal_api->GetDevice(data->ctx); - id queue = entry_ptr->metal_api->GetCommandQueue(data->ctx); + id dev = entry_ptr->metal_api->GetDevice(data->device); + id queue = entry_ptr->metal_api->GetCommandQueue(data->device); id cb = [queue commandBuffer]; // data to MPSImage DLTensor tmp_in; @@ -111,10 +111,10 @@ MPSImage* tempA = (__bridge MPSImage*)tmp_in.data; // weight to temp memory id bufB = (__bridge id)(weight->data); - id tempB = rt->GetTempBuffer(weight->ctx, [bufB length]); + id tempB = rt->GetTempBuffer(weight->device, [bufB length]); entry_ptr->metal_api->CopyDataFromTo((__bridge void*)bufB, 0, (__bridge void*)tempB, 0, - [bufB length], weight -> ctx, weight -> ctx, tmp_in.dtype, - nullptr); + [bufB length], weight -> device, weight -> device, + tmp_in.dtype, nullptr); float* ptr_w = (float*)[tempB contents]; // output to MPSImage DLTensor tmp_out; diff --git a/src/runtime/contrib/mps/gemm.mm b/src/runtime/contrib/mps/gemm.mm index c1d80dbed7f3..db3a80ba49d8 100644 --- a/src/runtime/contrib/mps/gemm.mm +++ b/src/runtime/contrib/mps/gemm.mm @@ -42,10 +42,10 @@ ICHECK(TypeMatch(C->dtype, kDLFloat, 32)); // Get Metal device API MetalThreadEntry* entry_ptr = MetalThreadEntry::ThreadLocal(); - // ICHECK_EQ(A->ctx, B->ctx); - // ICHECK_EQ(A->ctx, C->ctx); - id dev = entry_ptr->metal_api->GetDevice(A->ctx); - id queue = entry_ptr->metal_api->GetCommandQueue(A->ctx); + // ICHECK_EQ(A->device, B->device); + // ICHECK_EQ(A->device, C->device); + id dev = entry_ptr->metal_api->GetDevice(A->device); + id queue = entry_ptr->metal_api->GetCommandQueue(A->device); id cb = [queue commandBuffer]; NSUInteger M = A->shape[0 + (transa ? 1 : 0)]; NSUInteger N = B->shape[1 - (transb ? 
1 : 0)]; diff --git a/src/runtime/contrib/mps/mps_utils.h b/src/runtime/contrib/mps/mps_utils.h index d1c49732318a..c2b7e3c7aa99 100644 --- a/src/runtime/contrib/mps/mps_utils.h +++ b/src/runtime/contrib/mps/mps_utils.h @@ -28,8 +28,8 @@ #include #include #include +#include #include -#include #include diff --git a/src/runtime/contrib/nnpack/convolution.cc b/src/runtime/contrib/nnpack/convolution.cc index b3ea6c891d43..2362e31f92ee 100644 --- a/src/runtime/contrib/nnpack/convolution.cc +++ b/src/runtime/contrib/nnpack/convolution.cc @@ -23,8 +23,8 @@ #include #include #include +#include #include -#include #include "nnpack_utils.h" @@ -99,12 +99,12 @@ TVM_REGISTER_GLOBAL("tvm.contrib.nnpack.convolution_inference") // Division with rounding up, in case size is not multiple of sizeof(float) const size_t workspace_elements = (workspace_size + sizeof(float) - 1) / sizeof(float); - TVMContext ctx = input->ctx; + Device dev = input->device; DLDataType type_hint = input->dtype; - DeviceAPI* cpu_api = DeviceAPI::Get(ctx); + DeviceAPI* cpu_api = DeviceAPI::Get(dev); void* workspace_buffer = - cpu_api->AllocWorkspace(ctx, workspace_elements * sizeof(float), type_hint); + cpu_api->AllocWorkspace(dev, workspace_elements * sizeof(float), type_hint); ICHECK(workspace_buffer != nullptr); for (auto n = 0; n < input->shape[0]; ++n) { @@ -122,7 +122,7 @@ TVM_REGISTER_GLOBAL("tvm.contrib.nnpack.convolution_inference") ICHECK_EQ(status, nnp_status_success); } - cpu_api->FreeWorkspace(ctx, workspace_buffer); + cpu_api->FreeWorkspace(dev, workspace_buffer); }); TVM_REGISTER_GLOBAL("tvm.contrib.nnpack.convolution_inference_without_weight_transform") @@ -188,12 +188,12 @@ TVM_REGISTER_GLOBAL("tvm.contrib.nnpack.convolution_inference_without_weight_tra // Division with rounding up, in case size is not multiple of sizeof(float) const size_t workspace_elements = (workspace_size + sizeof(float) - 1) / sizeof(float); - TVMContext ctx = input->ctx; + Device dev = input->device; DLDataType type_hint = input->dtype; - DeviceAPI* cpu_api = DeviceAPI::Get(ctx); + DeviceAPI* cpu_api = DeviceAPI::Get(dev); void* workspace_buffer = - cpu_api->AllocWorkspace(ctx, workspace_elements * sizeof(float), type_hint); + cpu_api->AllocWorkspace(dev, workspace_elements * sizeof(float), type_hint); ICHECK(workspace_buffer != nullptr); for (auto n = 0; n < input->shape[0]; ++n) { @@ -211,7 +211,7 @@ TVM_REGISTER_GLOBAL("tvm.contrib.nnpack.convolution_inference_without_weight_tra ICHECK_EQ(status, nnp_status_success); } - cpu_api->FreeWorkspace(ctx, workspace_buffer); + cpu_api->FreeWorkspace(dev, workspace_buffer); }); TVM_REGISTER_GLOBAL("tvm.contrib.nnpack.convolution_inference_weight_transform") diff --git a/src/runtime/contrib/nnpack/fully_connected.cc b/src/runtime/contrib/nnpack/fully_connected.cc index 8b72eb38e08c..28570026ada3 100644 --- a/src/runtime/contrib/nnpack/fully_connected.cc +++ b/src/runtime/contrib/nnpack/fully_connected.cc @@ -22,8 +22,8 @@ */ #include #include +#include #include -#include #include "nnpack_utils.h" diff --git a/src/runtime/contrib/nnpack/nnpack_utils.h b/src/runtime/contrib/nnpack/nnpack_utils.h index 231309baaa8e..4396ea0bcde6 100644 --- a/src/runtime/contrib/nnpack/nnpack_utils.h +++ b/src/runtime/contrib/nnpack/nnpack_utils.h @@ -25,8 +25,8 @@ #include #include #include +#include #include -#include namespace tvm { namespace contrib { diff --git a/src/runtime/contrib/random/mt_random_engine.cc b/src/runtime/contrib/random/mt_random_engine.cc index 49bc056dcafb..a1c6dc2498c8 100644 --- 
a/src/runtime/contrib/random/mt_random_engine.cc +++ b/src/runtime/contrib/random/mt_random_engine.cc @@ -22,8 +22,8 @@ * \brief mt19937 random engine */ #include +#include #include -#include #include #include @@ -82,7 +82,7 @@ class RandomEngine { ICHECK(dtype.code == kDLFloat && dtype.bits == 32 && dtype.lanes == 1); - if (data->ctx.device_type == kDLCPU) { + if (data->device.device_type == kDLCPU) { std::uniform_real_distribution uniform_dist(low, high); std::generate_n(static_cast(data->data), size, [&]() { return uniform_dist(rnd_engine_); }); @@ -106,7 +106,7 @@ class RandomEngine { ICHECK(dtype.code == kDLFloat && dtype.bits == 32 && dtype.lanes == 1); - if (data->ctx.device_type == kDLCPU) { + if (data->device.device_type == kDLCPU) { std::normal_distribution normal_dist(loc, scale); std::generate_n(static_cast(data->data), size, [&]() { return normal_dist(rnd_engine_); }); @@ -121,13 +121,14 @@ class RandomEngine { size *= data->shape[i]; } - if (data->ctx.device_type == kDLCPU) { + if (data->device.device_type == kDLCPU) { FillData(data, size); } else { runtime::NDArray local = runtime::NDArray::Empty( std::vector{data->shape, data->shape + data->ndim}, data->dtype, {kDLCPU, 0}); - FillData(&local.ToDLPack()->dl_tensor, size); - runtime::NDArray::CopyFromTo(&local.ToDLPack()->dl_tensor, data); + DLTensor* tensor = const_cast(local.operator->()); + FillData(tensor, size); + runtime::NDArray::CopyFromTo(tensor, data); } } diff --git a/src/runtime/contrib/random/random.cc b/src/runtime/contrib/random/random.cc index edcd20883369..2cb56b87fdf5 100644 --- a/src/runtime/contrib/random/random.cc +++ b/src/runtime/contrib/random/random.cc @@ -22,8 +22,8 @@ */ #include #include +#include #include -#include #include @@ -89,7 +89,7 @@ TVM_REGISTER_GLOBAL("tvm.contrib.random.randint").set_body([](TVMArgs args, TVMR low = std::max(low, numeric_low); high = std::min(high, numeric_high); - if (out->ctx.device_type == kDLCPU) { + if (out->device.device_type == kDLCPU) { // file the data with random byte std::generate_n(static_cast(out->data), size, [&]() { unsigned rint = entry->random_engine.GetRandInt(); diff --git a/src/runtime/contrib/rocblas/rocblas.cc b/src/runtime/contrib/rocblas/rocblas.cc index dca1ebc6ed83..d977b1a211b0 100644 --- a/src/runtime/contrib/rocblas/rocblas.cc +++ b/src/runtime/contrib/rocblas/rocblas.cc @@ -23,8 +23,8 @@ #include "rocblas.h" #include +#include #include -#include namespace tvm { namespace contrib { diff --git a/src/runtime/contrib/tensorrt/tensorrt_builder.cc b/src/runtime/contrib/tensorrt/tensorrt_builder.cc index 09b36d720877..e98413eacc7c 100644 --- a/src/runtime/contrib/tensorrt/tensorrt_builder.cc +++ b/src/runtime/contrib/tensorrt/tensorrt_builder.cc @@ -191,7 +191,7 @@ TensorRTEngineAndContext TensorRTBuilder::BuildEngine() { nvinfer1::Weights TensorRTBuilder::GetDLTensorAsWeights(const DLTensor* dptr, DLDeviceType src_device) { - ICHECK_EQ(dptr->ctx.device_type, src_device); + ICHECK_EQ(dptr->device.device_type, src_device); ICHECK(static_cast(dptr->dtype.code) == kDLFloat || static_cast(dptr->dtype.code) == kDLInt); const auto trt_dtype = static_cast(dptr->dtype.code) == kDLFloat @@ -248,7 +248,7 @@ void TensorRTBuilder::CleanUp() { void TensorRTBuilder::AllocateDeviceBuffer(nvinfer1::ICudaEngine* engine, const std::string& name, std::vector* device_buffers) { const uint32_t entry_id = entry_id_map_[name]; - if (data_entry_[entry_id]->ctx.device_type != kDLGPU) { + if (data_entry_[entry_id]->device.device_type != kDLGPU) { const int binding_index = 
engine->getBindingIndex(name.c_str()); ICHECK_NE(binding_index, -1); std::vector shape(data_entry_[entry_id]->shape, diff --git a/src/runtime/contrib/tensorrt/tensorrt_logger.h b/src/runtime/contrib/tensorrt/tensorrt_logger.h index 087cb010189c..eb0164210dbb 100644 --- a/src/runtime/contrib/tensorrt/tensorrt_logger.h +++ b/src/runtime/contrib/tensorrt/tensorrt_logger.h @@ -25,7 +25,7 @@ #ifndef TVM_RUNTIME_CONTRIB_TENSORRT_TENSORRT_LOGGER_H_ #define TVM_RUNTIME_CONTRIB_TENSORRT_TENSORRT_LOGGER_H_ -#include +#include #include "NvInfer.h" #include "tensorrt_utils.h" diff --git a/src/runtime/contrib/tensorrt/tensorrt_runtime.cc b/src/runtime/contrib/tensorrt/tensorrt_runtime.cc index 3f87f8d00ee6..21031c67863f 100644 --- a/src/runtime/contrib/tensorrt/tensorrt_runtime.cc +++ b/src/runtime/contrib/tensorrt/tensorrt_runtime.cc @@ -32,7 +32,7 @@ #include "../json/json_node.h" #include "../json/json_runtime.h" -#ifdef TVM_GRAPH_RUNTIME_TENSORRT +#ifdef TVM_GRAPH_EXECUTOR_TENSORRT #include "NvInfer.h" #include "tensorrt_builder.h" #endif @@ -108,7 +108,15 @@ class TensorRTRuntime : public JSONRuntimeBase { } } -#ifdef TVM_GRAPH_RUNTIME_TENSORRT +#ifdef TVM_GRAPH_EXECUTOR_TENSORRT + /*! \brief Destroy engines and contexts. */ + ~TensorRTRuntime() { + for (auto& it : trt_engine_cache_) { + it.second.context->destroy(); + it.second.engine->destroy(); + } + } + /*! \brief Run inference using built engine. */ void Run() override { BuildEngine(); @@ -127,7 +135,7 @@ class TensorRTRuntime : public JSONRuntimeBase { const std::string name = nodes_[nid].GetOpName() + "_" + std::to_string(j); int binding_index = engine->getBindingIndex(name.c_str()); ICHECK_NE(binding_index, -1); - if (data_entry_[eid]->ctx.device_type == kDLGPU) { + if (data_entry_[eid]->device.device_type == kDLGPU) { bindings[binding_index] = data_entry_[eid]->data; } else { device_buffers[binding_index].CopyFrom(data_entry_[eid]); @@ -142,7 +150,7 @@ class TensorRTRuntime : public JSONRuntimeBase { const std::string& name = engine_and_context.outputs[i]; int binding_index = engine->getBindingIndex(name.c_str()); ICHECK_NE(binding_index, -1); - if (data_entry_[eid]->ctx.device_type == kDLGPU) { + if (data_entry_[eid]->device.device_type == kDLGPU) { bindings[binding_index] = data_entry_[eid]->data; } else { bindings[binding_index] = device_buffers[binding_index]->data; @@ -165,7 +173,7 @@ class TensorRTRuntime : public JSONRuntimeBase { const std::string& name = engine_and_context.outputs[i]; int binding_index = engine->getBindingIndex(name.c_str()); ICHECK_NE(binding_index, -1); - if (data_entry_[eid]->ctx.device_type != kDLGPU) { + if (data_entry_[eid]->device.device_type != kDLGPU) { device_buffers[binding_index].CopyTo(const_cast(data_entry_[eid])); } } diff --git a/src/runtime/contrib/tflite/tflite_runtime.cc b/src/runtime/contrib/tflite/tflite_runtime.cc index 9a434fde2955..2d323ba51d98 100644 --- a/src/runtime/contrib/tflite/tflite_runtime.cc +++ b/src/runtime/contrib/tflite/tflite_runtime.cc @@ -90,7 +90,7 @@ DataType TfLiteDType2TVMDType(TfLiteType dtype) { } } -void TFLiteRuntime::Init(const std::string& tflite_model_bytes, TVMContext ctx) { +void TFLiteRuntime::Init(const std::string& tflite_model_bytes, Device dev) { const char* buffer = tflite_model_bytes.c_str(); size_t buffer_size = tflite_model_bytes.size(); // The buffer used to construct the model must be kept alive for @@ -107,7 +107,7 @@ void TFLiteRuntime::Init(const std::string& tflite_model_bytes, TVMContext ctx) status = interpreter_->AllocateTensors(); 
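// The TensorRT hunks above choose between zero-copy binding and a staging
// copy by inspecting the tensor's device (previously: its TVMContext). A
// minimal sketch of that dispatch, assuming the post-rename field names;
// `entry` and `staging` are stand-ins for the runtime's data_entry_ /
// device_buffers state, not the actual implementation:

#include <tvm/runtime/ndarray.h>

void* BindingForSketch(const DLTensor* entry, tvm::runtime::NDArray* staging) {
  if (entry->device.device_type == kDLGPU) {
    return entry->data;     // tensor already lives on the GPU: bind it in place
  }
  staging->CopyFrom(entry);  // host-side tensor: stage it into the device buffer
  return (*staging)->data;   // bind the staging buffer instead
}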
CHECK_TFLITE_STATUS(status) << "Failed to allocate tensors."; - ctx_ = ctx; + device_ = dev; } void TFLiteRuntime::Invoke() { interpreter_->Invoke(); } @@ -140,7 +140,7 @@ NDArray TFLiteRuntime::GetOutput(int index) const { shape.push_back(dims->data[i]); size *= dims->data[i]; } - NDArray ret = NDArray::Empty(shape, dtype, ctx_); + NDArray ret = NDArray::Empty(shape, dtype, device_); TVM_DTYPE_DISPATCH(dtype, DType, { DType* dest = static_cast(ret->data); DType* src = interpreter_->typed_output_tensor(index); @@ -176,9 +176,9 @@ PackedFunc TFLiteRuntime::GetFunction(const std::string& name, } } -Module TFLiteRuntimeCreate(const std::string& tflite_model_bytes, TVMContext ctx) { +Module TFLiteRuntimeCreate(const std::string& tflite_model_bytes, Device dev) { auto exec = make_object(); - exec->Init(tflite_model_bytes, ctx); + exec->Init(tflite_model_bytes, dev); return Module(exec); } diff --git a/src/runtime/contrib/tflite/tflite_runtime.h b/src/runtime/contrib/tflite/tflite_runtime.h index 3311f10975be..759be24b94ec 100644 --- a/src/runtime/contrib/tflite/tflite_runtime.h +++ b/src/runtime/contrib/tflite/tflite_runtime.h @@ -67,11 +67,11 @@ class TFLiteRuntime : public ModuleNode { void Invoke(); /*! - * \brief Initialize the tflite runtime with tflite model and context. + * \brief Initialize the tflite runtime with tflite model and device. * \param tflite_model_bytes The tflite model. - * \param ctx The context where the tflite model will be executed on. + * \param dev The device where the tflite model will be executed on. */ - void Init(const std::string& tflite_model_bytes, TVMContext ctx); + void Init(const std::string& tflite_model_bytes, Device dev); /*! * \brief set index-th input to the model. @@ -103,8 +103,8 @@ class TFLiteRuntime : public ModuleNode { std::unique_ptr flatBuffersBuffer_; // TFLite interpreter std::unique_ptr interpreter_; - // TVM context - TVMContext ctx_; + // TVM device + Device device_; }; } // namespace runtime diff --git a/src/runtime/contrib/vitis_ai/vitis_ai_runtime.cc b/src/runtime/contrib/vitis_ai/vitis_ai_runtime.cc index 37dc767d31af..0e5e2ce4c4fa 100755 --- a/src/runtime/contrib/vitis_ai/vitis_ai_runtime.cc +++ b/src/runtime/contrib/vitis_ai/vitis_ai_runtime.cc @@ -25,6 +25,7 @@ #include +#include #include #include #include diff --git a/src/runtime/cpu_device_api.cc b/src/runtime/cpu_device_api.cc index b745be33b456..774335f5660b 100644 --- a/src/runtime/cpu_device_api.cc +++ b/src/runtime/cpu_device_api.cc @@ -22,8 +22,8 @@ */ #include #include +#include #include -#include #include #include @@ -38,14 +38,13 @@ namespace tvm { namespace runtime { class CPUDeviceAPI final : public DeviceAPI { public: - void SetDevice(TVMContext ctx) final {} - void GetAttr(TVMContext ctx, DeviceAttrKind kind, TVMRetValue* rv) final { + void SetDevice(Device dev) final {} + void GetAttr(Device dev, DeviceAttrKind kind, TVMRetValue* rv) final { if (kind == kExist) { *rv = 1; } } - void* AllocDataSpace(TVMContext ctx, size_t nbytes, size_t alignment, - DLDataType type_hint) final { + void* AllocDataSpace(Device dev, size_t nbytes, size_t alignment, DLDataType type_hint) final { void* ptr; #if _MSC_VER ptr = _aligned_malloc(nbytes, alignment); @@ -61,7 +60,7 @@ class CPUDeviceAPI final : public DeviceAPI { return ptr; } - void FreeDataSpace(TVMContext ctx, void* ptr) final { + void FreeDataSpace(Device dev, void* ptr) final { #if _MSC_VER _aligned_free(ptr); #else @@ -69,10 +68,10 @@ class CPUDeviceAPI final : public DeviceAPI { #endif } - void StreamSync(TVMContext ctx, 
TVMStreamHandle stream) final {} + void StreamSync(Device dev, TVMStreamHandle stream) final {} - void* AllocWorkspace(TVMContext ctx, size_t size, DLDataType type_hint) final; - void FreeWorkspace(TVMContext ctx, void* data) final; + void* AllocWorkspace(Device dev, size_t size, DLDataType type_hint) final; + void FreeWorkspace(Device dev, void* data) final; static CPUDeviceAPI* Global() { // NOTE: explicitly use new to avoid exit-time destruction of global state @@ -83,7 +82,7 @@ class CPUDeviceAPI final : public DeviceAPI { protected: void CopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset, size_t size, - TVMContext ctx_from, TVMContext ctx_to, DLDataType type_hint, + Device dev_from, Device dev_to, DLDataType type_hint, TVMStreamHandle stream) final { memcpy(static_cast(to) + to_offset, static_cast(from) + from_offset, size); } @@ -93,12 +92,12 @@ struct CPUWorkspacePool : public WorkspacePool { CPUWorkspacePool() : WorkspacePool(kDLCPU, CPUDeviceAPI::Global()) {} }; -void* CPUDeviceAPI::AllocWorkspace(TVMContext ctx, size_t size, DLDataType type_hint) { - return dmlc::ThreadLocalStore::Get()->AllocWorkspace(ctx, size); +void* CPUDeviceAPI::AllocWorkspace(Device dev, size_t size, DLDataType type_hint) { + return dmlc::ThreadLocalStore::Get()->AllocWorkspace(dev, size); } -void CPUDeviceAPI::FreeWorkspace(TVMContext ctx, void* data) { - dmlc::ThreadLocalStore::Get()->FreeWorkspace(ctx, data); +void CPUDeviceAPI::FreeWorkspace(Device dev, void* data) { + dmlc::ThreadLocalStore::Get()->FreeWorkspace(dev, data); } TVM_REGISTER_GLOBAL("device_api.cpu").set_body([](TVMArgs args, TVMRetValue* rv) { diff --git a/src/runtime/crt/Makefile b/src/runtime/crt/Makefile index 0f3e3096e319..8d3acab1858b 100644 --- a/src/runtime/crt/Makefile +++ b/src/runtime/crt/Makefile @@ -45,8 +45,8 @@ QUIET ?= @ CRT_PREFIX = $(wildcard src/crt) INCLUDES ?= -isystem include -iquote $(dir ${CRT_CONFIG}) -CFLAGS += ${INCLUDES} -Werror -g $(EXTRA_CFLAGS) -CXXFLAGS += ${INCLUDES} -std=c++11 -Werror -g $(EXTRA_CXXFLAGS) +CFLAGS += ${INCLUDES} -Werror -g $(EXTRA_CFLAGS) -DDMLC_USE_LOGGING_LIBRARY=\ +CXXFLAGS += ${INCLUDES} -std=c++11 -Werror -g $(EXTRA_CXXFLAGS) -DDMLC_USE_LOGGING_LIBRARY=\ LDFLAGS += -Werror -g $(EXTRA_LDFLAGS) ${BUILD_DIR}/%.o: src/%.c $(CRT_CONFIG) @@ -67,8 +67,8 @@ endef LIBS = \ src/runtime/crt/common \ - src/runtime/crt/graph_runtime \ - src/runtime/crt/graph_runtime_module \ + src/runtime/crt/graph_executor \ + src/runtime/crt/graph_executor_module \ src/runtime/crt/memory \ src/runtime/crt/utvm_rpc_common \ src/runtime/crt/utvm_rpc_server diff --git a/src/runtime/crt/common/crt_backend_api.c b/src/runtime/crt/common/crt_backend_api.c index 9656b07a7681..9a12bc28240a 100644 --- a/src/runtime/crt/common/crt_backend_api.c +++ b/src/runtime/crt/common/crt_backend_api.c @@ -36,9 +36,9 @@ void* TVMBackendAllocWorkspace(int device_type, int device_id, uint64_t nbytes, int dtype_bits_hint) { tvm_crt_error_t err = kTvmErrorNoError; void* ptr = 0; - DLContext ctx = {device_type, device_id}; + DLDevice dev = {device_type, device_id}; assert(nbytes > 0); - err = TVMPlatformMemoryAllocate(nbytes, ctx, &ptr); + err = TVMPlatformMemoryAllocate(nbytes, dev, &ptr); CHECK_EQ(err, kTvmErrorNoError, "TVMBackendAllocWorkspace(%d, %d, %" PRIu64 ", %d, %d) -> %" PRId32, device_type, device_id, nbytes, dtype_code_hint, dtype_bits_hint, err); @@ -47,8 +47,8 @@ void* TVMBackendAllocWorkspace(int device_type, int device_id, uint64_t nbytes, int TVMBackendFreeWorkspace(int device_type, int 
device_id, void* ptr) { tvm_crt_error_t err = kTvmErrorNoError; - DLContext ctx = {device_type, device_id}; - err = TVMPlatformMemoryFree(ptr, ctx); + DLDevice dev = {device_type, device_id}; + err = TVMPlatformMemoryFree(ptr, dev); return err; } diff --git a/src/runtime/crt/common/crt_runtime_api.c b/src/runtime/crt/common/crt_runtime_api.c index c2eb1ff903e3..e7fa7bcb5d5e 100644 --- a/src/runtime/crt/common/crt_runtime_api.c +++ b/src/runtime/crt/common/crt_runtime_api.c @@ -30,7 +30,7 @@ #include #include #include -#include +#include #include #include #include @@ -65,11 +65,11 @@ int TVMArrayAlloc(const tvm_index_t* shape, int ndim, int dtype_code, int dtype_ dtype.code = dtype_code; dtype.bits = dtype_bits; dtype.lanes = dtype_lanes; - DLContext ctx; - ctx.device_type = (DLDeviceType)device_type; - ctx.device_id = device_id; + DLDevice dev; + dev.device_type = (DLDeviceType)device_type; + dev.device_id = device_id; TVMNDArray arr; - int status = TVMNDArray_Empty(ndim, shape, dtype, ctx, &arr); + int status = TVMNDArray_Empty(ndim, shape, dtype, dev, &arr); if (status != 0) { return status; } @@ -83,16 +83,16 @@ int TVMArrayFree(TVMArrayHandle handle) { return TVMNDArray_Release(&arr); } -int TVMDeviceAllocDataSpace(DLContext ctx, size_t nbytes, size_t alignment, DLDataType type_hint, +int TVMDeviceAllocDataSpace(DLDevice dev, size_t nbytes, size_t alignment, DLDataType type_hint, void** out_data) { if (alignment != 1) { nbytes = (nbytes + alignment - 1) / alignment * alignment; } - return TVMPlatformMemoryAllocate(nbytes, ctx, out_data); + return TVMPlatformMemoryAllocate(nbytes, dev, out_data); } -int TVMDeviceAllocDataSpaceWithScope(DLContext ctx, int ndim, const int64_t* shape, - DLDataType dtype, const char* mem_scope, void** out_data) { +int TVMDeviceAllocDataSpaceWithScope(DLDevice dev, int ndim, const int64_t* shape, DLDataType dtype, + const char* mem_scope, void** out_data) { size_t nbytes = 1; for (int i = 0; i < ndim; ++i) { nbytes *= shape[i]; @@ -102,10 +102,10 @@ int TVMDeviceAllocDataSpaceWithScope(DLContext ctx, int ndim, const int64_t* sha int kAllocAlignment = 128; size_t align = (dtype.bits / 8) * dtype.lanes; if (align < kAllocAlignment) align = kAllocAlignment; - return TVMDeviceAllocDataSpace(ctx, nbytes, align, dtype, out_data); + return TVMDeviceAllocDataSpace(dev, nbytes, align, dtype, out_data); } -int TVMDeviceFreeDataSpace(TVMContext ctx, void* ptr) { return TVMPlatformMemoryFree(ptr, ctx); } +int TVMDeviceFreeDataSpace(DLDevice dev, void* ptr) { return TVMPlatformMemoryFree(ptr, dev); } static bool IsContiguous(const DLTensor* arr) { if (arr->strides == NULL) return true; @@ -237,13 +237,13 @@ static int DecodeFunctionHandle(TVMFunctionHandle handle, tvm_module_index_t* mo } int TVMByteArrayFree(TVMByteArray* arr) { - DLContext ctx = {kDLCPU, 0}; - int to_return = TVMPlatformMemoryFree((void*)arr->data, ctx); + DLDevice dev = {kDLCPU, 0}; + int to_return = TVMPlatformMemoryFree((void*)arr->data, dev); if (to_return != 0) { return to_return; } - return TVMPlatformMemoryFree((void*)arr, ctx); + return TVMPlatformMemoryFree((void*)arr, dev); } tvm_crt_error_t RunTimeEvaluator(tvm_function_index_t function_index, TVMValue* args, @@ -376,18 +376,18 @@ tvm_crt_error_t TVMInitializeRuntime() { tvm_crt_error_t error = kTvmErrorNoError; void* func_registry_memory = NULL; - DLContext ctx = {kDLCPU, 0}; - error = TVMPlatformMemoryAllocate(TVM_CRT_GLOBAL_FUNC_REGISTRY_SIZE_BYTES, ctx, + DLDevice dev = {kDLCPU, 0}; + error = 
TVMPlatformMemoryAllocate(TVM_CRT_GLOBAL_FUNC_REGISTRY_SIZE_BYTES, dev, &func_registry_memory); if (error != kTvmErrorNoError) { return error; } void* registry_backing_memory; - error = TVMPlatformMemoryAllocate(TVM_CRT_GLOBAL_FUNC_REGISTRY_SIZE_BYTES, ctx, + error = TVMPlatformMemoryAllocate(TVM_CRT_GLOBAL_FUNC_REGISTRY_SIZE_BYTES, dev, ®istry_backing_memory); if (error != kTvmErrorNoError) { - TVMPlatformMemoryFree(func_registry_memory, ctx); + TVMPlatformMemoryFree(func_registry_memory, dev); return error; } @@ -412,8 +412,8 @@ tvm_crt_error_t TVMInitializeRuntime() { } if (error != kTvmErrorNoError) { - TVMPlatformMemoryFree(registry_backing_memory, ctx); - TVMPlatformMemoryFree(func_registry_memory, ctx); + TVMPlatformMemoryFree(registry_backing_memory, dev); + TVMPlatformMemoryFree(func_registry_memory, dev); } return error; @@ -422,7 +422,7 @@ tvm_crt_error_t TVMInitializeRuntime() { typedef struct { uint16_t function_index; TVMFunctionHandle func_to_time; - TVMContext ctx; + DLDevice device; int number; int repeat; int min_repeat_ms; @@ -447,8 +447,8 @@ int RPCTimeEvaluator(TVMValue* args, int* type_codes, int num_args, TVMValue* re TVMModuleHandle mod = (TVMModuleHandle)args[0].v_handle; const char* name = args[1].v_str; - g_time_evaluator_state.ctx.device_type = args[2].v_int64; - g_time_evaluator_state.ctx.device_id = args[3].v_int64; + g_time_evaluator_state.device.device_type = args[2].v_int64; + g_time_evaluator_state.device.device_id = args[3].v_int64; g_time_evaluator_state.number = args[4].v_int64; g_time_evaluator_state.repeat = args[5].v_int64; g_time_evaluator_state.min_repeat_ms = args[6].v_int64; @@ -474,16 +474,16 @@ tvm_crt_error_t RunTimeEvaluator(tvm_function_index_t function_index, TVMValue* } // TODO(areusch): should *really* rethink needing to return doubles - DLContext result_byte_ctx = {kDLCPU, 0}; + DLDevice result_byte_dev = {kDLCPU, 0}; TVMByteArray* result_byte_arr = NULL; tvm_crt_error_t err = - TVMPlatformMemoryAllocate(sizeof(TVMByteArray), result_byte_ctx, (void*)&result_byte_arr); + TVMPlatformMemoryAllocate(sizeof(TVMByteArray), result_byte_dev, (void*)&result_byte_arr); if (err != kTvmErrorNoError) { goto release_and_return; } result_byte_arr->data = NULL; size_t data_size = sizeof(double) * g_time_evaluator_state.repeat; - err = TVMPlatformMemoryAllocate(data_size, result_byte_ctx, (void*)&result_byte_arr->data); + err = TVMPlatformMemoryAllocate(data_size, result_byte_dev, (void*)&result_byte_arr->data); if (err != kTvmErrorNoError) { goto release_and_return; } @@ -527,9 +527,9 @@ tvm_crt_error_t RunTimeEvaluator(tvm_function_index_t function_index, TVMValue* release_and_return : { tvm_crt_error_t release_err = - TVMPlatformMemoryFree((void*)&result_byte_arr->data, result_byte_ctx); + TVMPlatformMemoryFree((void*)&result_byte_arr->data, result_byte_dev); if (release_err != kTvmErrorNoError) { - release_err = TVMPlatformMemoryFree((void*)&result_byte_arr, result_byte_ctx); + release_err = TVMPlatformMemoryFree((void*)&result_byte_arr, result_byte_dev); } if (err == kTvmErrorNoError && release_err != kTvmErrorNoError) { diff --git a/src/runtime/crt/common/ndarray.c b/src/runtime/crt/common/ndarray.c index c90a4667903c..fb8fc8022f43 100644 --- a/src/runtime/crt/common/ndarray.c +++ b/src/runtime/crt/common/ndarray.c @@ -30,26 +30,26 @@ #include "crt_config.h" -int TVMNDArray_Create(int32_t ndim, const tvm_index_t* shape, DLDataType dtype, DLContext ctx, +int TVMNDArray_Create(int32_t ndim, const tvm_index_t* shape, DLDataType dtype, DLDevice dev, 
TVMNDArray* array) { memset(array, 0, sizeof(TVMNDArray)); array->dl_tensor.ndim = ndim; tvm_crt_error_t err; - DLContext dlctx = {kDLCPU, 0}; - err = TVMPlatformMemoryAllocate(sizeof(int64_t) * ndim, dlctx, (void*)&array->dl_tensor.shape); + DLDevice dldev = {kDLCPU, 0}; + err = TVMPlatformMemoryAllocate(sizeof(int64_t) * ndim, dldev, (void*)&array->dl_tensor.shape); if (err != kTvmErrorNoError) { return -1; } memcpy(array->dl_tensor.shape, shape, sizeof(int64_t) * ndim); array->dl_tensor.dtype = dtype; - array->dl_tensor.ctx = ctx; + array->dl_tensor.device = dev; array->dl_tensor.data = 0; return 0; } -int TVMNDArray_Empty(int32_t ndim, const tvm_index_t* shape, DLDataType dtype, DLContext ctx, +int TVMNDArray_Empty(int32_t ndim, const tvm_index_t* shape, DLDataType dtype, DLDevice dev, TVMNDArray* array) { - int status = TVMNDArray_Create(ndim, shape, dtype, ctx, array); + int status = TVMNDArray_Create(ndim, shape, dtype, dev, array); if (status != 0) { return status; } @@ -76,11 +76,11 @@ int TVMNDArray_Load(TVMNDArray* ret, const char** strm) { } memcpy(&reserved, *strm, sizeof(reserved)); *strm += sizeof(reserved); - DLContext ctx; + DLDevice dev; int ndim; // sizeof ndim should match dlpack DLDataType dtype; - memcpy(&ctx, *strm, sizeof(ctx)); - *strm += sizeof(ctx); + memcpy(&dev, *strm, sizeof(dev)); + *strm += sizeof(dev); memcpy(&ndim, *strm, sizeof(ndim)); *strm += sizeof(ndim); memcpy(&dtype, *strm, sizeof(dtype)); @@ -89,8 +89,8 @@ int TVMNDArray_Load(TVMNDArray* ret, const char** strm) { fprintf(stderr, "Invalid ndim=%d: expected to be 0 ~ %d.\n", ndim, TVM_CRT_MAX_NDIM); status = -1; } - if (ctx.device_type != kDLCPU) { - fprintf(stderr, "Invalid DLTensor context: can only save as CPU tensor\n"); + if (dev.device_type != kDLCPU) { + fprintf(stderr, "Invalid DLTensor device: can only save as CPU tensor\n"); status = -1; } int64_t shape[TVM_CRT_MAX_NDIM] = {0}; @@ -101,7 +101,7 @@ int TVMNDArray_Load(TVMNDArray* ret, const char** strm) { *strm += sizeof(shape[idx]); } } - status = TVMNDArray_Empty(ndim, shape, dtype, ctx, ret); + status = TVMNDArray_Empty(ndim, shape, dtype, dev, ret); if (status != 0) { return status; } @@ -128,7 +128,7 @@ int TVMNDArray_Load(TVMNDArray* ret, const char** strm) { int TVMNDArray_CreateView(TVMNDArray* arr, const tvm_index_t* shape, int32_t ndim, DLDataType dtype, TVMNDArray* array_view) { - int status = TVMNDArray_Create(ndim, shape, dtype, arr->dl_tensor.ctx, array_view); + int status = TVMNDArray_Create(ndim, shape, dtype, arr->dl_tensor.device, array_view); if (status != 0) { return status; } @@ -138,15 +138,15 @@ int TVMNDArray_CreateView(TVMNDArray* arr, const tvm_index_t* shape, int32_t ndi int TVMNDArray_Release(TVMNDArray* arr) { tvm_crt_error_t err; - DLContext ctx = {kDLCPU, 0}; + DLDevice dev = {kDLCPU, 0}; - err = TVMPlatformMemoryFree(arr->dl_tensor.data, ctx); + err = TVMPlatformMemoryFree(arr->dl_tensor.data, dev); if (err != kTvmErrorNoError) { return err; } arr->dl_tensor.data = 0; - err = TVMPlatformMemoryFree(arr->dl_tensor.shape, ctx); + err = TVMPlatformMemoryFree(arr->dl_tensor.shape, dev); if (err != kTvmErrorNoError) { return err; } diff --git a/src/runtime/crt/graph_runtime/graph_runtime.c b/src/runtime/crt/graph_executor/graph_executor.c similarity index 71% rename from src/runtime/crt/graph_runtime/graph_runtime.c rename to src/runtime/crt/graph_executor/graph_executor.c index 21b72f0e400c..2fe9e73aeddc 100644 --- a/src/runtime/crt/graph_runtime/graph_runtime.c +++ b/src/runtime/crt/graph_executor/graph_executor.c 
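The renamed graph executor below keeps the old graph runtime's behavior; only
identifiers change: TVMGraphRuntime* types and TVMGraphRuntime_* functions
become TVMGraphExecutor* / TVMGraphExecutor_*, and every CRT DLContext becomes
a DLDevice. A minimal caller-side sketch against the renamed entry points (the
header path and the input name "data" are assumptions for illustration, and
obtaining the executor handle is elided):

    #include <tvm/runtime/crt/graph_executor.h>

    int RunOnceSketch(TVMGraphExecutor* executor, DLTensor* input, DLTensor* output) {
      TVMGraphExecutor_SetInput(executor, "data", input);  /* was TVMGraphRuntime_SetInput */
      TVMGraphExecutor_Run(executor);                      /* was TVMGraphRuntime_Run */
      return TVMGraphExecutor_GetOutput(executor, 0, output);
    }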
@@ -20,12 +20,12 @@ // LINT_C_FILE /*! - * \file graph_runtime.c - * \brief implement graph runtime in pure C + * \file graph_executor.c + * \brief implement graph executor in pure C */ #include -#include +#include #include #include #include @@ -49,7 +49,7 @@ uint32_t Shape_Accumulate(int64_t* shape, uint32_t ndim) { return accum; } -int NodeEntry_Load(TVMGraphRuntimeNodeEntry* entry, JSONReader* reader) { +int NodeEntry_Load(TVMGraphExecutorNodeEntry* entry, JSONReader* reader) { int status = 0; reader->BeginArray(reader); if (!(reader->NextArrayItem(reader))) { @@ -74,8 +74,8 @@ int NodeEntry_Load(TVMGraphRuntimeNodeEntry* entry, JSONReader* reader) { return status; } -void TVMGraphRuntimeNode_LoadAttrs(TVMGraphRuntimeNode* node, JSONReader* reader, - TVMOpParam* param) { +void TVMGraphExecutorNode_LoadAttrs(TVMGraphExecutorNode* node, JSONReader* reader, + TVMOpParam* param) { int bitmask = 0; char key[20], value[120]; memset(param, 0, sizeof(TVMOpParam)); @@ -109,7 +109,7 @@ void TVMGraphRuntimeNode_LoadAttrs(TVMGraphRuntimeNode* node, JSONReader* reader } } -int TVMGraphRuntimeNode_Load(TVMGraphRuntimeNode* node, JSONReader* reader) { +int TVMGraphExecutorNode_Load(TVMGraphExecutorNode* node, JSONReader* reader) { int status = 0; reader->BeginObject(reader); int bitmask = 0; @@ -137,9 +137,9 @@ int TVMGraphRuntimeNode_Load(TVMGraphRuntimeNode* node, JSONReader* reader) { fprintf(stderr, "error determining inputs array length\n"); break; } - DLContext ctx = {kDLCPU, 0}; - tvm_crt_error_t err = TVMPlatformMemoryAllocate(sizeof(TVMGraphRuntimeNodeEntry) * num_inputs, - ctx, (void**)&node->inputs); + DLDevice dev = {kDLCPU, 0}; + tvm_crt_error_t err = TVMPlatformMemoryAllocate( + sizeof(TVMGraphExecutorNodeEntry) * num_inputs, dev, (void**)&node->inputs); if (err != kTvmErrorNoError) { fprintf(stderr, "memory allocate error: %08x", err); return -1; @@ -150,7 +150,7 @@ int TVMGraphRuntimeNode_Load(TVMGraphRuntimeNode* node, JSONReader* reader) { return -1; } - TVMGraphRuntimeNodeEntry* inputs = node->inputs + count; + TVMGraphExecutorNodeEntry* inputs = node->inputs + count; reader->BeginArray(reader); if (!reader->NextArrayItem(reader)) { fprintf(stderr, "invalid json format\n"); @@ -181,7 +181,7 @@ int TVMGraphRuntimeNode_Load(TVMGraphRuntimeNode* node, JSONReader* reader) { } else if (!strcmp(key, "attr") || !strcmp(key, "attrs")) { TVMOpParam param; - TVMGraphRuntimeNode_LoadAttrs(node, reader, ¶m); + TVMGraphExecutorNode_LoadAttrs(node, reader, ¶m); memcpy(&node->param, ¶m, sizeof(param)); } else if (!strcmp(key, "control_deps")) { fprintf(stderr, "do not support key %s", key); @@ -201,21 +201,21 @@ int TVMGraphRuntimeNode_Load(TVMGraphRuntimeNode* node, JSONReader* reader) { return status; } -TVMGraphRuntimeNode TVMGraphRuntimeNodeCreate() { - TVMGraphRuntimeNode node; - memset(&node, 0, sizeof(TVMGraphRuntimeNode)); - node.LoadAttrs = TVMGraphRuntimeNode_LoadAttrs; - node.Load = TVMGraphRuntimeNode_Load; +TVMGraphExecutorNode TVMGraphExecutorNodeCreate() { + TVMGraphExecutorNode node; + memset(&node, 0, sizeof(TVMGraphExecutorNode)); + node.LoadAttrs = TVMGraphExecutorNode_LoadAttrs; + node.Load = TVMGraphExecutorNode_Load; return node; } -int TVMGraphRuntimeNodeRelease(TVMGraphRuntimeNode* node) { +int TVMGraphExecutorNodeRelease(TVMGraphExecutorNode* node) { if (!node) { return 0; } if (node->inputs) { - DLContext ctx = {kDLCPU, 0}; - tvm_crt_error_t err = TVMPlatformMemoryFree(node->inputs, ctx); + DLDevice dev = {kDLCPU, 0}; + tvm_crt_error_t err = 
TVMPlatformMemoryFree(node->inputs, dev); node->inputs = 0; if (err != kTvmErrorNoError) { return -1; @@ -225,7 +225,7 @@ int TVMGraphRuntimeNodeRelease(TVMGraphRuntimeNode* node) { return 0; } -int TVMGraphRuntimeGraphAttr_Load(TVMGraphRuntimeGraphAttr* attr, JSONReader* reader) { +int TVMGraphExecutorGraphAttr_Load(TVMGraphExecutorGraphAttr* attr, JSONReader* reader) { int status = 0; int bitmask = 0; char key[16], type[16]; @@ -264,9 +264,9 @@ int TVMGraphRuntimeGraphAttr_Load(TVMGraphRuntimeGraphAttr* attr, JSONReader* re status = -1; break; } - DLContext ctx = {kDLCPU, 0}; + DLDevice dev = {kDLCPU, 0}; tvm_crt_error_t err = - TVMPlatformMemoryAllocate(TVM_CRT_STRLEN_DLTYPE * num_items, ctx, (void**)&attr->dltype); + TVMPlatformMemoryAllocate(TVM_CRT_STRLEN_DLTYPE * num_items, dev, (void**)&attr->dltype); if (err != kTvmErrorNoError) { fprintf(stderr, "memory allocate error: %08x", err); return -1; @@ -322,9 +322,9 @@ int TVMGraphRuntimeGraphAttr_Load(TVMGraphRuntimeGraphAttr* attr, JSONReader* re status = -1; break; } - DLContext ctx = {kDLCPU, 0}; + DLDevice dev = {kDLCPU, 0}; tvm_crt_error_t err = - TVMPlatformMemoryAllocate(sizeof(uint32_t) * num_items, ctx, (void**)&attr->storage_id); + TVMPlatformMemoryAllocate(sizeof(uint32_t) * num_items, dev, (void**)&attr->storage_id); if (err != kTvmErrorNoError) { fprintf(stderr, "memory allocate error: %08x", err); return -1; @@ -374,15 +374,15 @@ int TVMGraphRuntimeGraphAttr_Load(TVMGraphRuntimeGraphAttr* attr, JSONReader* re status = -1; break; } - DLContext ctx = {kDLCPU, 0}; + DLDevice dev = {kDLCPU, 0}; tvm_crt_error_t err = TVMPlatformMemoryAllocate( - sizeof(int64_t) * TVM_CRT_MAX_NDIM * num_items, ctx, (void**)&attr->shape); + sizeof(int64_t) * TVM_CRT_MAX_NDIM * num_items, dev, (void**)&attr->shape); if (err != kTvmErrorNoError) { fprintf(stderr, "memory allocate error: %08x", err); status = -1; break; } - err = TVMPlatformMemoryAllocate(sizeof(uint32_t) * num_items, ctx, (void**)&attr->ndim); + err = TVMPlatformMemoryAllocate(sizeof(uint32_t) * num_items, dev, (void**)&attr->ndim); if (err != kTvmErrorNoError) { fprintf(stderr, "memory allocate error: %08x", err); status = -1; @@ -449,9 +449,9 @@ int TVMGraphRuntimeGraphAttr_Load(TVMGraphRuntimeGraphAttr* attr, JSONReader* re status = -1; break; } - DLContext ctx = {kDLCPU, 0}; + DLDevice dev = {kDLCPU, 0}; tvm_crt_error_t err = - TVMPlatformMemoryAllocate(sizeof(uint32_t) * num_items, ctx, (void**)&attr->device_index); + TVMPlatformMemoryAllocate(sizeof(uint32_t) * num_items, dev, (void**)&attr->device_index); if (err != kTvmErrorNoError) { fprintf(stderr, "memory allocate error: %08x", err); status = -1; @@ -520,45 +520,45 @@ int TVMGraphRuntimeGraphAttr_Load(TVMGraphRuntimeGraphAttr* attr, JSONReader* re return status; } -int TVMGraphRuntimeGraphAttr_Release(TVMGraphRuntimeGraphAttr* attr) { +int TVMGraphExecutorGraphAttr_Release(TVMGraphExecutorGraphAttr* attr) { if (!attr) { return 0; } if (attr->storage_id) { - DLContext ctx = {kDLCPU, 0}; - tvm_crt_error_t err = TVMPlatformMemoryFree(attr->storage_id, ctx); + DLDevice dev = {kDLCPU, 0}; + tvm_crt_error_t err = TVMPlatformMemoryFree(attr->storage_id, dev); attr->storage_id = 0; if (err != kTvmErrorNoError) { return -1; } } if (attr->device_index) { - DLContext ctx = {kDLCPU, 0}; - tvm_crt_error_t err = TVMPlatformMemoryFree(attr->device_index, ctx); + DLDevice dev = {kDLCPU, 0}; + tvm_crt_error_t err = TVMPlatformMemoryFree(attr->device_index, dev); attr->device_index = 0; if (err != kTvmErrorNoError) { return -1; } } if 
(attr->dltype) { - DLContext ctx = {kDLCPU, 0}; - tvm_crt_error_t err = TVMPlatformMemoryFree(attr->dltype, ctx); + DLDevice dev = {kDLCPU, 0}; + tvm_crt_error_t err = TVMPlatformMemoryFree(attr->dltype, dev); attr->dltype = 0; if (err != kTvmErrorNoError) { return -1; } } if (attr->shape) { - DLContext ctx = {kDLCPU, 0}; - tvm_crt_error_t err = TVMPlatformMemoryFree(attr->shape, ctx); + DLDevice dev = {kDLCPU, 0}; + tvm_crt_error_t err = TVMPlatformMemoryFree(attr->shape, dev); attr->shape = 0; if (err != kTvmErrorNoError) { return -1; } } if (attr->ndim) { - DLContext ctx = {kDLCPU, 0}; - tvm_crt_error_t err = TVMPlatformMemoryFree(attr->ndim, ctx); + DLDevice dev = {kDLCPU, 0}; + tvm_crt_error_t err = TVMPlatformMemoryFree(attr->ndim, dev); attr->ndim = 0; if (err != kTvmErrorNoError) { return -1; @@ -568,7 +568,7 @@ int TVMGraphRuntimeGraphAttr_Release(TVMGraphRuntimeGraphAttr* attr) { return 0; } -int TVMGraphRuntime_Load(TVMGraphRuntime* runtime, JSONReader* reader) { +int TVMGraphExecutor_Load(TVMGraphExecutor* executor, JSONReader* reader) { int status = 0; reader->BeginObject(reader); int bitmask = 0; @@ -582,31 +582,31 @@ int TVMGraphRuntime_Load(TVMGraphRuntime* runtime, JSONReader* reader) { status = -1; break; } - DLContext ctx = {kDLCPU, 0}; - tvm_crt_error_t err = TVMPlatformMemoryAllocate(sizeof(TVMGraphRuntimeNode) * num_items, ctx, - (void**)&runtime->nodes); + DLDevice dev = {kDLCPU, 0}; + tvm_crt_error_t err = TVMPlatformMemoryAllocate(sizeof(TVMGraphExecutorNode) * num_items, dev, + (void**)&executor->nodes); if (err != kTvmErrorNoError) { fprintf(stderr, "memory allocate error: %08x", err); status = -1; break; } while (reader->NextArrayItem(reader)) { - if (runtime->nodes_count == num_items) { + if (executor->nodes_count == num_items) { fprintf(stderr, "array too big\n"); status = -1; return status; } - TVMGraphRuntimeNode* node = runtime->nodes + runtime->nodes_count; - status = TVMGraphRuntimeNode_Load(node, reader); + TVMGraphExecutorNode* node = executor->nodes + executor->nodes_count; + status = TVMGraphExecutorNode_Load(node, reader); if (status != 0) { - fprintf(stderr, "failed to load an element in `nodes` field in graph runtime node.\n"); + fprintf(stderr, "failed to load an element in `nodes` field in graph executor node.\n"); break; #if TVM_CRT_DEBUG } else { - printf("loading: node (%u) %s loaded.\n", runtime->nodes_count, node->name); + printf("loading: node (%u) %s loaded.\n", executor->nodes_count, node->name); #endif // TVM_CRT_DEBUG } - runtime->nodes_count++; + executor->nodes_count++; } bitmask |= 1; } else if (!strcmp(key, "arg_nodes")) { @@ -617,23 +617,24 @@ int TVMGraphRuntime_Load(TVMGraphRuntime* runtime, JSONReader* reader) { status = -1; break; } - DLContext ctx = {kDLCPU, 0}; - tvm_crt_error_t err = TVMPlatformMemoryAllocate(sizeof(uint32_t) * num_items, ctx, - (void**)&runtime->input_nodes); + DLDevice dev = {kDLCPU, 0}; + tvm_crt_error_t err = TVMPlatformMemoryAllocate(sizeof(uint32_t) * num_items, dev, + (void**)&executor->input_nodes); + if (err != kTvmErrorNoError) { fprintf(stderr, "memory allocate error: %08x", err); status = -1; break; } while (reader->NextArrayItem(reader)) { - if (runtime->input_nodes_count == num_items) { + if (executor->input_nodes_count == num_items) { fprintf(stderr, "array too big\n"); status = -1; return status; } - uint32_t* node = runtime->input_nodes + runtime->input_nodes_count; + uint32_t* node = executor->input_nodes + executor->input_nodes_count; reader->ReadUnsignedInteger(reader, node); - 
runtime->input_nodes_count++; + executor->input_nodes_count++; } bitmask |= 2; } else if (!strcmp(key, "node_row_ptr")) { @@ -644,24 +645,24 @@ int TVMGraphRuntime_Load(TVMGraphRuntime* runtime, JSONReader* reader) { status = -1; break; } - DLContext ctx = {kDLCPU, 0}; - tvm_crt_error_t err = TVMPlatformMemoryAllocate(sizeof(uint32_t) * num_items, ctx, - (void**)&runtime->node_row_ptr); + DLDevice dev = {kDLCPU, 0}; + tvm_crt_error_t err = TVMPlatformMemoryAllocate(sizeof(uint32_t) * num_items, dev, + (void**)&executor->node_row_ptr); if (err != kTvmErrorNoError) { fprintf(stderr, "memory allocate error: %08x", err); status = -1; break; } while (reader->NextArrayItem(reader)) { - if (runtime->node_row_ptr_count == num_items) { + if (executor->node_row_ptr_count == num_items) { fprintf(stderr, "array too big\n"); status = -1; return status; } - uint32_t count = runtime->node_row_ptr_count; - uint32_t* node = runtime->node_row_ptr + count; + uint32_t count = executor->node_row_ptr_count; + uint32_t* node = executor->node_row_ptr + count; reader->ReadUnsignedInteger(reader, node); - runtime->node_row_ptr_count++; + executor->node_row_ptr_count++; } bitmask |= 4; } else if (!strcmp(key, "heads")) { @@ -672,33 +673,33 @@ int TVMGraphRuntime_Load(TVMGraphRuntime* runtime, JSONReader* reader) { status = -1; break; } - DLContext ctx = {kDLCPU, 0}; - tvm_crt_error_t err = TVMPlatformMemoryAllocate(sizeof(TVMGraphRuntimeNodeEntry) * num_items, - ctx, (void**)&runtime->outputs); + DLDevice dev = {kDLCPU, 0}; + tvm_crt_error_t err = TVMPlatformMemoryAllocate(sizeof(TVMGraphExecutorNodeEntry) * num_items, + dev, (void**)&executor->outputs); if (err != kTvmErrorNoError) { fprintf(stderr, "memory allocate error: %08x", err); status = -1; break; } while (reader->NextArrayItem(reader)) { - if (runtime->outputs_count == num_items) { + if (executor->outputs_count == num_items) { fprintf(stderr, "array too big\n"); status = -1; return status; } - TVMGraphRuntimeNodeEntry* entry = runtime->outputs + runtime->outputs_count; + TVMGraphExecutorNodeEntry* entry = executor->outputs + executor->outputs_count; status = NodeEntry_Load(entry, reader); if (status != 0) { - fprintf(stderr, "Fail to load an element in `heads` field in graph runtime node.\n"); + fprintf(stderr, "Fail to load an element in `heads` field in graph executor node.\n"); break; } - runtime->outputs_count++; + executor->outputs_count++; } bitmask |= 8; } else if (!strcmp(key, "attrs")) { - status = TVMGraphRuntimeGraphAttr_Load(&(runtime->attrs), reader); + status = TVMGraphExecutorGraphAttr_Load(&(executor->attrs), reader); if (status != 0) { - fprintf(stderr, "Fail to load an element in `heads` field in graph runtime node.\n"); + fprintf(stderr, "Fail to load an element in `heads` field in graph executor node.\n"); break; } bitmask |= 16; @@ -719,29 +720,31 @@ int TVMGraphRuntime_Load(TVMGraphRuntime* runtime, JSONReader* reader) { return status; } -uint32_t TVMGraphRuntime_GetEntryId(TVMGraphRuntime* runtime, uint32_t nid, uint32_t index) { - return runtime->node_row_ptr[nid] + index; +uint32_t TVMGraphExecutor_GetEntryId(TVMGraphExecutor* executor, uint32_t nid, uint32_t index) { + return executor->node_row_ptr[nid] + index; } /*! * \brief Get the number of input tensors allocated. - * \param runtime The graph runtime. + * \param executor The graph executor. * \return the number of input tensors allocated. 
*/ -int TVMGraphRuntime_GetNumInputs(TVMGraphRuntime* runtime) { return runtime->input_nodes_count; } +int TVMGraphExecutor_GetNumInputs(TVMGraphExecutor* executor) { + return executor->input_nodes_count; +} /*! * \brief Get the input index given the name of input. - * \param runtime The graph runtime. + * \param executor The graph executor. * \param name The name of the input. * \return The index of input. */ -int TVMGraphRuntime_GetInputIndex(TVMGraphRuntime* runtime, const char* name) { +int TVMGraphExecutor_GetInputIndex(TVMGraphExecutor* executor, const char* name) { uint32_t i; int32_t rv = -1; - for (i = 0; i < runtime->input_nodes_count; ++i) { - uint32_t nid = runtime->input_nodes[i]; - if (!strcmp(runtime->nodes[nid].name, name)) { + for (i = 0; i < executor->input_nodes_count; ++i) { + uint32_t nid = executor->input_nodes[i]; + if (!strcmp(executor->nodes[nid].name, name)) { rv = i; break; } @@ -752,28 +755,28 @@ int TVMGraphRuntime_GetInputIndex(TVMGraphRuntime* runtime, const char* name) { /*! * \brief set input to the graph based on name. - * \param runtime The graph runtime. + * \param executor The graph executor. * \param name The name of the input. * \param data_in The input data. */ -void TVMGraphRuntime_SetInput(TVMGraphRuntime* runtime, const char* name, DLTensor* data_in) { - uint32_t index = TVMGraphRuntime_GetInputIndex(runtime, name); - if (index >= runtime->input_nodes_count) { +void TVMGraphExecutor_SetInput(TVMGraphExecutor* executor, const char* name, DLTensor* data_in) { + uint32_t index = TVMGraphExecutor_GetInputIndex(executor, name); + if (index >= executor->input_nodes_count) { fprintf(stderr, "given index is greater than num of input nodes.\n"); } - uint32_t eid = TVMGraphRuntime_GetEntryId(runtime, runtime->input_nodes[index], 0); - runtime->data_entry[eid].dl_tensor.data = data_in->data; + uint32_t eid = TVMGraphExecutor_GetEntryId(executor, executor->input_nodes[index], 0); + executor->data_entry[eid].dl_tensor.data = data_in->data; } /*! * \brief Load parameters from parameter blob. - * \param runtime The graph runtime. + * \param executor The graph executor. * \param param_blob A binary blob of parameter. * \param param_size The parameter size. * \return The result of this function execution. 
*/ -int TVMGraphRuntime_LoadParams(TVMGraphRuntime* runtime, const char* param_blob, - const uint32_t param_size) { +int TVMGraphExecutor_LoadParams(TVMGraphExecutor* executor, const char* param_blob, + const uint32_t param_size) { int status = 0; const char* bptr = param_blob; uint64_t header, reserved; @@ -788,15 +791,15 @@ int TVMGraphRuntime_LoadParams(TVMGraphRuntime* runtime, const char* param_blob, // read names char* names = NULL; - DLContext ctx = {kDLCPU, 0}; + DLDevice dev = {kDLCPU, 0}; tvm_crt_error_t err = - TVMPlatformMemoryAllocate(TVM_CRT_STRLEN_NAME * runtime->nodes_count, ctx, (void**)&names); + TVMPlatformMemoryAllocate(TVM_CRT_STRLEN_NAME * executor->nodes_count, dev, (void**)&names); if (err != kTvmErrorNoError) { fprintf(stderr, "memory allocate error: %08x", err); status = -1; return status; } - memset(names, 0, TVM_CRT_STRLEN_NAME * runtime->nodes_count); + memset(names, 0, TVM_CRT_STRLEN_NAME * executor->nodes_count); uint64_t names_count; int idx; memcpy(&names_count, bptr, sizeof(names_count)); @@ -824,33 +827,33 @@ int TVMGraphRuntime_LoadParams(TVMGraphRuntime* runtime, const char* param_blob, } for (idx = 0; idx < size; idx++) { - int32_t in_idx = TVMGraphRuntime_GetInputIndex(runtime, names + TVM_CRT_STRLEN_NAME * idx); + int32_t in_idx = TVMGraphExecutor_GetInputIndex(executor, names + TVM_CRT_STRLEN_NAME * idx); CHECK_GT(in_idx, 0, "Found param for non-existent input: %s\n", names + TVM_CRT_STRLEN_NAME * idx); - uint32_t eid = TVMGraphRuntime_GetEntryId(runtime, runtime->input_nodes[in_idx], 0); - if (!(eid < runtime->data_entry_count)) { + uint32_t eid = TVMGraphExecutor_GetEntryId(executor, executor->input_nodes[in_idx], 0); + if (!(eid < executor->data_entry_count)) { fprintf(stderr, "`entry_id`=%d is greater than expected(%d).\n", eid, - runtime->data_entry_count); + executor->data_entry_count); status = -1; } - if (runtime->data_entry[eid].dl_tensor.shape) { - err = TVMPlatformMemoryFree(runtime->data_entry[eid].dl_tensor.shape, ctx); + if (executor->data_entry[eid].dl_tensor.shape) { + err = TVMPlatformMemoryFree(executor->data_entry[eid].dl_tensor.shape, dev); if (err != kTvmErrorNoError) { status = -1; } - runtime->data_entry[eid].dl_tensor.shape = 0; + executor->data_entry[eid].dl_tensor.shape = 0; } - if (runtime->data_entry[eid].dl_tensor.data) { - err = TVMPlatformMemoryFree(runtime->data_entry[eid].dl_tensor.data, ctx); + if (executor->data_entry[eid].dl_tensor.data) { + err = TVMPlatformMemoryFree(executor->data_entry[eid].dl_tensor.data, dev); if (err != kTvmErrorNoError) { status = -1; } - runtime->data_entry[eid].dl_tensor.data = 0; + executor->data_entry[eid].dl_tensor.data = 0; } - status |= TVMNDArray_Load(&(runtime->data_entry[eid]), &bptr); + status |= TVMNDArray_Load(&(executor->data_entry[eid]), &bptr); #if TVM_CRT_DEBUG - TVMNDArray* entry = &(runtime->data_entry[eid]); + TVMNDArray* entry = &(executor->data_entry[eid]); printf("loading: param %s loaded, in_idx=%d, eid=%d, ndim=%d, data[0]=%f\n", names + TVM_CRT_STRLEN_NAME * idx, in_idx, eid, entry->dl_tensor.ndim, ((float*)entry->dl_tensor.data)[0]); // NOLINT(*) @@ -858,7 +861,7 @@ int TVMGraphRuntime_LoadParams(TVMGraphRuntime* runtime, const char* param_blob, } // Release memory - err = TVMPlatformMemoryFree(names, ctx); + err = TVMPlatformMemoryFree(names, dev); if (err != kTvmErrorNoError) { status = -1; return status; @@ -869,38 +872,38 @@ int TVMGraphRuntime_LoadParams(TVMGraphRuntime* runtime, const char* param_blob, /*! * \brief Run all the operations one by one. 
- * \param runtime The graph runtime. + * \param executor The graph executor. */ -void TVMGraphRuntime_Run(TVMGraphRuntime* runtime) { +void TVMGraphExecutor_Run(TVMGraphExecutor* executor) { // setup the array and requirements. uint32_t idx; - for (idx = 0; idx < runtime->op_execs_count; ++idx) { - if (runtime->op_execs[idx].fexec) { + for (idx = 0; idx < executor->op_execs_count; ++idx) { + if (executor->op_execs[idx].fexec) { #if TVM_CRT_DEBUG - printf("calling: %s (%d)\n", runtime->op_execs[idx].name, idx); + printf("calling: %s (%d)\n", executor->op_execs[idx].name, idx); #endif // TVM_CRT_DEBUG - runtime->op_execs[idx].Call(&(runtime->op_execs[idx])); + executor->op_execs[idx].Call(&(executor->op_execs[idx])); } } } /*! * \brief Get the number of output tensors allocated. - * \param runtime The graph runtime. + * \param executor The graph executor. * \return the number of output tensors allocated. */ -int TVMGraphRuntime_GetNumOutputs(TVMGraphRuntime* runtime) { return runtime->outputs_count; } +int TVMGraphExecutor_GetNumOutputs(TVMGraphExecutor* executor) { return executor->outputs_count; } -int TVMGraphRuntime_GetOutput(TVMGraphRuntime* runtime, const int32_t idx, DLTensor* out) { +int TVMGraphExecutor_GetOutput(TVMGraphExecutor* executor, const int32_t idx, DLTensor* out) { int status = 0; - uint32_t nid = runtime->outputs[idx].node_id; - uint32_t index = runtime->outputs[idx].index; - uint32_t eid = TVMGraphRuntime_GetEntryId(runtime, nid, index); + uint32_t nid = executor->outputs[idx].node_id; + uint32_t index = executor->outputs[idx].index; + uint32_t eid = TVMGraphExecutor_GetEntryId(executor, nid, index); // copy data section to allocated output tensor int32_t elem_bytes = out->dtype.bits / 8; int64_t size = Shape_Accumulate(out->shape, out->ndim); - DLTensor* tensor = &(runtime->data_entry[eid].dl_tensor); + DLTensor* tensor = &(executor->data_entry[eid].dl_tensor); CHECK(out->ndim == tensor->ndim); CHECK(out->dtype.bits == tensor->dtype.bits); CHECK(Shape_Accumulate(out->shape, out->ndim) == Shape_Accumulate(tensor->shape, tensor->ndim)); @@ -908,7 +911,7 @@ int TVMGraphRuntime_GetOutput(TVMGraphRuntime* runtime, const int32_t idx, DLTen return status; } -int TVMGraphRuntime_SetupStorage(TVMGraphRuntime* runtime) { +int TVMGraphExecutor_SetupStorage(TVMGraphExecutor* executor) { TVMPackedFunc lookup_linked_param; int lookup_linked_param_valid; uint32_t idx; @@ -919,16 +922,16 @@ int TVMGraphRuntime_SetupStorage(TVMGraphRuntime* runtime) { temp_args.tcodes[0] = kTVMArgInt; temp_args.values_count = 1; lookup_linked_param_valid = - (TVMPackedFunc_InitModuleFunc(&lookup_linked_param, runtime->module_handle, + (TVMPackedFunc_InitModuleFunc(&lookup_linked_param, executor->module_handle, "_lookup_linked_param", &temp_args) == 0); } // Grab saved optimization plan from graph. - TVMGraphRuntimeGraphAttr* attrs = &(runtime->attrs); + TVMGraphExecutorGraphAttr* attrs = &(executor->attrs); DLDataType* vtype = NULL; - DLContext alloc_ctx = {kDLCPU, 0}; + DLDevice alloc_dev = {kDLCPU, 0}; tvm_crt_error_t err = TVMPlatformMemoryAllocate(sizeof(DLDataType) * attrs->dltype_count, - alloc_ctx, (void**)&vtype); + alloc_dev, (void**)&vtype); if (err != kTvmErrorNoError) { fprintf(stderr, "memory allocate error: %08x", err); return -1; @@ -938,20 +941,20 @@ int TVMGraphRuntime_SetupStorage(TVMGraphRuntime* runtime) { } // Size and device type of each storage pool entry. 
-  TVMGraphRuntimePoolEntry* pool_entry = NULL;
-  err = TVMPlatformMemoryAllocate(sizeof(TVMGraphRuntimePoolEntry) * runtime->nodes_count,
-                                  alloc_ctx, (void**)&pool_entry);
+  TVMGraphExecutorPoolEntry* pool_entry = NULL;
+  err = TVMPlatformMemoryAllocate(sizeof(TVMGraphExecutorPoolEntry) * executor->nodes_count,
+                                  alloc_dev, (void**)&pool_entry);
   if (err != kTvmErrorNoError) {
     fprintf(stderr, "memory allocate error: %08x", err);
     return -1;
   }
-  memset(pool_entry, 0, sizeof(TVMGraphRuntimePoolEntry) * runtime->nodes_count);
+  memset(pool_entry, 0, sizeof(TVMGraphExecutorPoolEntry) * executor->nodes_count);
   uint32_t pool_entry_count = 0;
   // Find the maximum space size.
   for (idx = 0; idx < attrs->shape_count; idx++) {
     int storage_id = attrs->storage_id[idx];
     // Use the fallback device if no device index is available.
-    int device_type = runtime->ctxs[0].device_type;
+    int device_type = executor->devices[0].device_type;
     uint32_t size = Shape_Accumulate(attrs->shape + idx * TVM_CRT_MAX_NDIM, attrs->ndim[idx]);
     DLDataType t = vtype[idx];
     uint32_t bits = t.bits * t.lanes;
@@ -967,15 +970,15 @@ int TVMGraphRuntime_SetupStorage(TVMGraphRuntime* runtime) {
   }

   // Allocate the space.
-  err = TVMPlatformMemoryAllocate(sizeof(TVMNDArray) * pool_entry_count, alloc_ctx,
-                                  (void**)&runtime->storage_pool);
+  err = TVMPlatformMemoryAllocate(sizeof(TVMGraphExecutorStorageEntry) * pool_entry_count,
+                                  alloc_dev, (void**)&executor->storage_pool);
   if (err != kTvmErrorNoError) {
     fprintf(stderr, "memory allocate error: %08x", err);
     return -1;
   }
   for (idx = 0; idx < pool_entry_count; idx++) {
-    TVMGraphRuntimePoolEntry pit = pool_entry[idx];
-    TVMContext ctx = runtime->ctxs[0];
+    TVMGraphExecutorPoolEntry pit = pool_entry[idx];
+    DLDevice dev = executor->devices[0];
     uint8_t did_find_linked_param = 0;
     if (lookup_linked_param_valid) {
       lookup_linked_param.args.values[0].v_int64 = idx;
@@ -983,10 +986,10 @@
       void* linked_param_data = lookup_linked_param.ret_value.values[0].v_handle;
       if (linked_param_data != NULL) {
-        runtime->storage_pool[runtime->storage_pool_count].is_linked_param = 1;
-        DLTensor* tensor = &runtime->storage_pool[runtime->storage_pool_count].array.dl_tensor;
+        executor->storage_pool[executor->storage_pool_count].is_linked_param = 1;
+        DLTensor* tensor = &executor->storage_pool[executor->storage_pool_count].array.dl_tensor;
         tensor->data = linked_param_data;
-        tensor->ctx = ctx;
+        tensor->device = dev;
         tensor->ndim = attrs->ndim[pit.entry_id];
         tensor->shape = attrs->shape + idx * TVM_CRT_MAX_NDIM;
         tensor->strides = NULL;
@@ -1000,40 +1003,40 @@
            0,
        };
        shape[0] = (pit.size + 3) / 4;
-        int status = TVMNDArray_Empty(1, shape, dtype, ctx,
-                                      &runtime->storage_pool[runtime->storage_pool_count].array);
+        int status = TVMNDArray_Empty(1, shape, dtype, dev,
+                                      &executor->storage_pool[executor->storage_pool_count].array);
        CHECK_EQ(status, 0, "fail to create storage_pool with idx=%d\n", idx);
      }
-    runtime->storage_pool_count++;
+    executor->storage_pool_count++;
  }

   // Assign the pooled entries. A unified memory pool is used to simplify
   // memory assignment for each node entry. The allocated memory on each device
   // is mapped to this pool.
- runtime->data_entry_count = runtime->node_row_ptr[runtime->node_row_ptr_count - 1]; - err = TVMPlatformMemoryAllocate(sizeof(TVMNDArray) * runtime->data_entry_count, alloc_ctx, - (void**)&runtime->data_entry); + executor->data_entry_count = executor->node_row_ptr[executor->node_row_ptr_count - 1]; + err = TVMPlatformMemoryAllocate(sizeof(TVMNDArray) * executor->data_entry_count, alloc_dev, + (void**)&executor->data_entry); if (err != kTvmErrorNoError) { fprintf(stderr, "memory allocate error: %08x", err); return -1; } - for (idx = 0; idx < runtime->data_entry_count; ++idx) { + for (idx = 0; idx < executor->data_entry_count; ++idx) { uint32_t storage_id = attrs->storage_id[idx]; - CHECK(storage_id < runtime->storage_pool_count); - int status = TVMNDArray_CreateView(&(runtime->storage_pool[storage_id].array), + CHECK(storage_id < executor->storage_pool_count); + int status = TVMNDArray_CreateView(&(executor->storage_pool[storage_id].array), attrs->shape + idx * TVM_CRT_MAX_NDIM, attrs->ndim[idx], - vtype[idx], &runtime->data_entry[idx]); + vtype[idx], &executor->data_entry[idx]); CHECK_EQ(status, 0, "fail to create for node with idx=%d, storage_id=%u\n", idx, storage_id); } // Release memory - err = TVMPlatformMemoryFree(vtype, alloc_ctx); + err = TVMPlatformMemoryFree(vtype, alloc_dev); if (err != kTvmErrorNoError) { fprintf(stderr, "memory free error: %08x", err); return err; } - err = TVMPlatformMemoryFree(pool_entry, alloc_ctx); + err = TVMPlatformMemoryFree(pool_entry, alloc_dev); if (err != kTvmErrorNoError) { fprintf(stderr, "memory free error: %08x", err); return -1; @@ -1042,32 +1045,32 @@ int TVMGraphRuntime_SetupStorage(TVMGraphRuntime* runtime) { return 0; } -int TVMGraphRuntime_SetupOpExecs(TVMGraphRuntime* runtime) { +int TVMGraphExecutor_SetupOpExecs(TVMGraphExecutor* executor) { int status = 0; uint32_t nid, idx; - runtime->op_execs_count = runtime->nodes_count; - DLContext ctx = {kDLCPU, 0}; - tvm_crt_error_t err = TVMPlatformMemoryAllocate(sizeof(TVMPackedFunc) * runtime->op_execs_count, - ctx, (void**)&runtime->op_execs); + executor->op_execs_count = executor->nodes_count; + DLDevice dev = {kDLCPU, 0}; + tvm_crt_error_t err = TVMPlatformMemoryAllocate(sizeof(TVMPackedFunc) * executor->op_execs_count, + dev, (void**)&executor->op_execs); if (err != kTvmErrorNoError) { fprintf(stderr, "memory allocate error: %08x", err); status = -1; return status; } - for (nid = 0; nid < runtime->nodes_count; nid++) { - const TVMGraphRuntimeNode* inode = runtime->nodes + nid; + for (nid = 0; nid < executor->nodes_count; nid++) { + const TVMGraphExecutorNode* inode = executor->nodes + nid; if (strcmp(inode->op_type, "null")) { DLTensorPtr args[TVM_CRT_MAX_ARGS]; uint32_t args_count = 0; for (idx = 0; idx < inode->inputs_count; idx++) { - const TVMGraphRuntimeNodeEntry* entry = inode->inputs + idx; - uint32_t eid = TVMGraphRuntime_GetEntryId(runtime, entry->node_id, entry->index); - args[idx] = &(runtime->data_entry[eid].dl_tensor); + const TVMGraphExecutorNodeEntry* entry = inode->inputs + idx; + uint32_t eid = TVMGraphExecutor_GetEntryId(executor, entry->node_id, entry->index); + args[idx] = &(executor->data_entry[eid].dl_tensor); args_count++; } for (idx = 0; idx < inode->param.num_outputs; idx++) { - uint32_t eid = TVMGraphRuntime_GetEntryId(runtime, nid, idx); - args[args_count] = &(runtime->data_entry[eid].dl_tensor); + uint32_t eid = TVMGraphExecutor_GetEntryId(executor, nid, idx); + args[args_count] = &(executor->data_entry[eid].dl_tensor); args_count++; } if (strcmp(inode->op_type, 
"tvm_op")) { @@ -1085,9 +1088,9 @@ int TVMGraphRuntime_SetupOpExecs(TVMGraphRuntime* runtime) { printf("tvm_op: creating %s with node_id=%d\n", inode->param.func_name, nid); #endif // TVM_CRT_DEBUG TVMPackedFunc pf; - TVMGraphRuntime_CreateTVMOp(runtime, &(inode->param), args, args_count, inode->inputs_count, - &pf); - runtime->op_execs[nid] = pf; + TVMGraphExecutor_CreateTVMOp(executor, &(inode->param), args, args_count, inode->inputs_count, + &pf); + executor->op_execs[nid] = pf; } } return status; @@ -1104,9 +1107,9 @@ typedef struct TVMOpArgs { uint32_t shape_data_count; } TVMOpArgs; -int32_t TVMGraphRuntime_CreateTVMOp(TVMGraphRuntime* runtime, const TVMOpParam* param, - DLTensorPtr* args, const uint32_t args_count, - uint32_t num_inputs, TVMPackedFunc* pf) { +int32_t TVMGraphExecutor_CreateTVMOp(TVMGraphExecutor* executor, const TVMOpParam* param, + DLTensorPtr* args, const uint32_t args_count, + uint32_t num_inputs, TVMPackedFunc* pf) { int status = 0; uint32_t idx; TVMOpArgs arg_ptr; @@ -1137,42 +1140,42 @@ int32_t TVMGraphRuntime_CreateTVMOp(TVMGraphRuntime* runtime, const TVMOpParam* } TVMArgs targs = TVMArgs_Create(arg_ptr.arg_values, arg_ptr.arg_tcodes, arg_ptr.arg_values_count); - status = TVMPackedFunc_InitModuleFunc(pf, runtime->module_handle, param->func_name, &targs); + status = TVMPackedFunc_InitModuleFunc(pf, executor->module_handle, param->func_name, &targs); return status; } /*! - * \brief Initialize the graph executor with graph and context. + * \brief Initialize the graph executor with graph and device. * \param graph_json The execution graph. * \param module_handle The module containing the compiled functions for the host * processor. - * \param ctxs The context of the host and devices where graph nodes will be + * \param devs The device of the host and devices where graph nodes will be * executed on. * \return 0 on success. 
 */
-int TVMGraphRuntime_Init(TVMGraphRuntime* runtime, const char* graph_json,
-                         TVMModuleHandle module_handle, const TVMContext* ctxs) {
+int TVMGraphExecutor_Init(TVMGraphExecutor* executor, const char* graph_json,
+                          TVMModuleHandle module_handle, const DLDevice* devs) {
   JSONReader reader;
   tvm_crt_error_t err = JSONReader_Create(graph_json, &reader);
   if (err != kTvmErrorNoError) {
     return -1;
   }
-  TVMGraphRuntime_Load(runtime, &reader);
+  TVMGraphExecutor_Load(executor, &reader);
   err = JSONReader_Release(&reader);
   if (err != kTvmErrorNoError) {
     return -1;
   }
-  runtime->module_handle = module_handle;
-  runtime->ctxs[0] = ctxs[0];
+  executor->module_handle = module_handle;
+  executor->devices[0] = devs[0];
   int status;
-  status = TVMGraphRuntime_SetupStorage(runtime);
+  status = TVMGraphExecutor_SetupStorage(executor);
   if (status != 0) {
     return status;
   }
-  status = TVMGraphRuntime_SetupOpExecs(runtime);
+  status = TVMGraphExecutor_SetupOpExecs(executor);
   if (status != 0) {
     return status;
   }
@@ -1184,84 +1187,84 @@ int TVMGraphRuntime_Init(TVMGraphRuntime* runtime, const char* graph_json,
   return status;
 }

-int TVMGraphRuntime_Create(const char* sym_json, TVMModuleHandle module_handle,
-                           const TVMContext* ctxs, TVMGraphRuntime** runtime) {
-  DLContext ctx = {kDLCPU, 0};
-  tvm_crt_error_t err = TVMPlatformMemoryAllocate(sizeof(TVMGraphRuntime), ctx, (void**)runtime);
+int TVMGraphExecutor_Create(const char* sym_json, TVMModuleHandle module_handle,
+                            const DLDevice* devs, TVMGraphExecutor** executor) {
+  DLDevice dev = {kDLCPU, 0};
+  tvm_crt_error_t err = TVMPlatformMemoryAllocate(sizeof(TVMGraphExecutor), dev, (void**)executor);
   if (err != kTvmErrorNoError) {
     fprintf(stderr, "memory allocate error: %08x", err);
     return -1;
   }
-  memset(*runtime, 0, sizeof(TVMGraphRuntime));
+  memset(*executor, 0, sizeof(TVMGraphExecutor));
   // init
-  return TVMGraphRuntime_Init(*runtime, sym_json, module_handle, ctxs);
+  return TVMGraphExecutor_Init(*executor, sym_json, module_handle, devs);
 }

-int TVMGraphRuntime_Release(TVMGraphRuntime** pptr) {
+int TVMGraphExecutor_Release(TVMGraphExecutor** pptr) {
   int status = 0;
   int32_t idx;
-  TVMGraphRuntime* runtime = (TVMGraphRuntime*)(*pptr);
-  for (idx = 0; idx < runtime->nodes_count; ++idx) {
-    status = TVMGraphRuntimeNodeRelease(&(runtime->nodes[idx]));
+  TVMGraphExecutor* executor = (TVMGraphExecutor*)(*pptr);
+  for (idx = 0; idx < executor->nodes_count; ++idx) {
+    status = TVMGraphExecutorNodeRelease(&(executor->nodes[idx]));
    if (status != 0) {
      return status;
    }
  }
-  DLContext ctx = {kDLCPU, 0};
-  status = TVMPlatformMemoryFree(runtime->nodes, ctx);
+  DLDevice dev = {kDLCPU, 0};
+  status = TVMPlatformMemoryFree(executor->nodes, dev);
   if (status != 0) {
     return status;
   }
-  status = TVMGraphRuntimeGraphAttr_Release(&(runtime->attrs));
+  status = TVMGraphExecutorGraphAttr_Release(&(executor->attrs));
   if (status != 0) {
     return status;
   }
-  for (idx = 0; idx < runtime->storage_pool_count; ++idx) {
-    if (runtime->storage_pool[idx].is_linked_param == 0) {
-      status = TVMNDArray_Release(&(runtime->storage_pool[idx]).array);
+  for (idx = 0; idx < executor->storage_pool_count; ++idx) {
+    if (executor->storage_pool[idx].is_linked_param == 0) {
+      status = TVMNDArray_Release(&(executor->storage_pool[idx]).array);
       if (status != 0) {
         return status;
       }
     }
  }
-  for (idx = 0; idx < runtime->data_entry_count; ++idx) {
-    status = TVMPlatformMemoryFree(runtime->data_entry[idx].dl_tensor.shape, ctx);
+  for (idx = 0; idx < executor->data_entry_count; ++idx) {
+    status =
TVMPlatformMemoryFree(executor->data_entry[idx].dl_tensor.shape, dev); if (status != 0) { return status; } } - status = TVMPlatformMemoryFree(runtime->input_nodes, ctx); + status = TVMPlatformMemoryFree(executor->input_nodes, dev); if (status != 0) { return status; } - status = TVMPlatformMemoryFree(runtime->node_row_ptr, ctx); + status = TVMPlatformMemoryFree(executor->node_row_ptr, dev); if (status != 0) { return status; } - status = TVMPlatformMemoryFree(runtime->outputs, ctx); + status = TVMPlatformMemoryFree(executor->outputs, dev); if (status != 0) { return status; } - status = TVMPlatformMemoryFree(runtime->storage_pool, ctx); + status = TVMPlatformMemoryFree(executor->storage_pool, dev); if (status != 0) { return status; } - status = TVMPlatformMemoryFree(runtime->data_entry, ctx); + status = TVMPlatformMemoryFree(executor->data_entry, dev); if (status != 0) { return status; } - status = TVMPlatformMemoryFree(runtime->op_execs, ctx); + status = TVMPlatformMemoryFree(executor->op_execs, dev); if (status != 0) { return status; } - status = TVMPlatformMemoryFree(*pptr, ctx); + status = TVMPlatformMemoryFree(*pptr, dev); if (status != 0) { return status; } if (g_fexecs) { - status = TVMPlatformMemoryFree(g_fexecs, ctx); + status = TVMPlatformMemoryFree(g_fexecs, dev); g_fexecs = 0; if (status != 0) { return status; diff --git a/src/runtime/crt/graph_runtime/load_json.c b/src/runtime/crt/graph_executor/load_json.c similarity index 95% rename from src/runtime/crt/graph_runtime/load_json.c rename to src/runtime/crt/graph_executor/load_json.c index 6de49a3f9789..dd2faecdc538 100644 --- a/src/runtime/crt/graph_runtime/load_json.c +++ b/src/runtime/crt/graph_executor/load_json.c @@ -25,7 +25,7 @@ */ #include #include -#include +#include #include #include @@ -85,15 +85,15 @@ void SeqPop(Seq* seq) { } tvm_crt_error_t SeqCreate(uint64_t len, Seq** seq) { - DLContext ctx = {kDLCPU, 0}; - tvm_crt_error_t err = TVMPlatformMemoryAllocate(sizeof(Seq), ctx, (void**)seq); + DLDevice dev = {kDLCPU, 0}; + tvm_crt_error_t err = TVMPlatformMemoryAllocate(sizeof(Seq), dev, (void**)seq); if (err != kTvmErrorNoError) { return err; } memset(*seq, 0, sizeof(Seq)); (*seq)->allocated = len; - err = TVMPlatformMemoryAllocate(sizeof(uint32_t) * len, ctx, (void**)&(*seq)->data); + err = TVMPlatformMemoryAllocate(sizeof(uint32_t) * len, dev, (void**)&(*seq)->data); if (err != kTvmErrorNoError) { return err; } @@ -104,12 +104,12 @@ tvm_crt_error_t SeqCreate(uint64_t len, Seq** seq) { } tvm_crt_error_t SeqRelease(Seq* seq) { - DLContext ctx = {kDLCPU, 0}; - tvm_crt_error_t err = TVMPlatformMemoryFree(seq->data, ctx); + DLDevice dev = {kDLCPU, 0}; + tvm_crt_error_t err = TVMPlatformMemoryFree(seq->data, dev); if (err != kTvmErrorNoError) { return err; } - return TVMPlatformMemoryFree(seq, ctx); + return TVMPlatformMemoryFree(seq, dev); } // implementations of JSONReader @@ -173,7 +173,7 @@ char JSONReader_PeekNextNonSpace(JSONReader* reader) { * \param out_str the output string. NULL to merely consume input and discard it. * \param out_str_size Number of bytes available to write starting from out_str. Includes * terminating \0. 
- * \throw dmlc::Error when next token is not string + * \throw tvm::Error when next token is not string */ int JSONReader_ReadString(JSONReader* reader, char* out_str, size_t out_str_size) { int status = 0; @@ -472,8 +472,8 @@ tvm_crt_error_t JSONReader_Create(const char* is, JSONReader* reader) { reader->NextObjectItem = JSONReader_NextObjectItem; reader->ArrayLength = JSONReader_ArrayLength; - DLContext ctx = {kDLCPU, 0}; - err = TVMPlatformMemoryAllocate(strlen(is) + 1, ctx, (void**)&reader->is_); + DLDevice dev = {kDLCPU, 0}; + err = TVMPlatformMemoryAllocate(strlen(is) + 1, dev, (void**)&reader->is_); if (err != kTvmErrorNoError) { return err; } @@ -490,6 +490,6 @@ tvm_crt_error_t JSONReader_Release(JSONReader* reader) { return err; } - DLContext ctx = {kDLCPU, 0}; - return TVMPlatformMemoryFree(reader->is_, ctx); + DLDevice dev = {kDLCPU, 0}; + return TVMPlatformMemoryFree(reader->is_, dev); } diff --git a/src/runtime/crt/graph_runtime_module/graph_runtime_module.c b/src/runtime/crt/graph_executor_module/graph_executor_module.c similarity index 50% rename from src/runtime/crt/graph_runtime_module/graph_runtime_module.c rename to src/runtime/crt/graph_executor_module/graph_executor_module.c index 8f479e9108f3..7b2a25040d08 100644 --- a/src/runtime/crt/graph_runtime_module/graph_runtime_module.c +++ b/src/runtime/crt/graph_executor_module/graph_executor_module.c @@ -20,27 +20,27 @@ // LINT_C_FILE /*! - * \file graph_runtime_module.c - * \brief wrap graph_runtime into a TVMModule for use with RPC. + * \file graph_executor_module.c + * \brief wrap graph_executor into a TVMModule for use with RPC. */ #include -#include -#include +#include +#include #include -#include "tvm/runtime/crt/internal/graph_runtime/graph_runtime.h" +#include "tvm/runtime/crt/internal/graph_executor/graph_executor.h" typedef struct { TVMModule mod; - TVMGraphRuntime* runtime; -} GraphRuntimeModule; + TVMGraphExecutor* executor; +} GraphExecutorModule; -static GraphRuntimeModule graph_runtime; +static GraphExecutorModule graph_executor; -int32_t TVMGraphRuntimeModule_Create(TVMValue* args, int* tcodes, int nargs, TVMValue* ret_values, - int* ret_tcodes, void* resource_handle) { - if (graph_runtime.runtime != NULL) { +int32_t TVMGraphExecutorModule_Create(TVMValue* args, int* tcodes, int nargs, TVMValue* ret_values, + int* ret_tcodes, void* resource_handle) { + if (graph_executor.executor != NULL) { return kTvmErrorGraphModuleAlreadyCreated; } @@ -57,18 +57,18 @@ int32_t TVMGraphRuntimeModule_Create(TVMValue* args, int* tcodes, int nargs, TVM return kTvmErrorGraphModuleBadContext; } - TVMContext ctx = {(DLDeviceType)args[2].v_int64, (int)args[3].v_int64}; + DLDevice dev = {(DLDeviceType)args[2].v_int64, (int)args[3].v_int64}; int ret_value = - TVMGraphRuntime_Create(args[0].v_str, args[1].v_handle, &ctx, &graph_runtime.runtime); + TVMGraphExecutor_Create(args[0].v_str, args[1].v_handle, &dev, &graph_executor.executor); if (ret_value != 0) { return ret_value; } TVMModuleHandle out; - ret_value = TVMModCreateFromCModule(&graph_runtime.mod, &out); + ret_value = TVMModCreateFromCModule(&graph_executor.mod, &out); if (ret_value != 0) { ret_tcodes[0] = kTVMNullptr; - TVMGraphRuntime_Release(&graph_runtime.runtime); + TVMGraphExecutor_Release(&graph_executor.executor); return ret_value; } @@ -77,8 +77,9 @@ int32_t TVMGraphRuntimeModule_Create(TVMValue* args, int* tcodes, int nargs, TVM return kTvmErrorNoError; } -int32_t TVMGraphRuntimeModule_GetInput(TVMValue* args, int* tcodes, int nargs, TVMValue* ret_values, - int* 
ret_tcodes, void* resource_handle) { +int32_t TVMGraphExecutorModule_GetInput(TVMValue* args, int* tcodes, int nargs, + TVMValue* ret_values, int* ret_tcodes, + void* resource_handle) { if (nargs != 1) { return kTvmErrorFunctionCallNumArguments; } @@ -87,45 +88,45 @@ int32_t TVMGraphRuntimeModule_GetInput(TVMValue* args, int* tcodes, int nargs, T return kTvmErrorFunctionCallWrongArgType; } - int index = TVMGraphRuntime_GetInputIndex(graph_runtime.runtime, args[0].v_str); + int index = TVMGraphExecutor_GetInputIndex(graph_executor.executor, args[0].v_str); if (index < 0) { return kTvmErrorGraphModuleNoSuchInput; } - uint32_t eid = TVMGraphRuntime_GetEntryId(graph_runtime.runtime, - graph_runtime.runtime->input_nodes[index], 0); - ret_values[0].v_handle = (void*)&graph_runtime.runtime->data_entry[eid].dl_tensor; + uint32_t eid = TVMGraphExecutor_GetEntryId(graph_executor.executor, + graph_executor.executor->input_nodes[index], 0); + ret_values[0].v_handle = (void*)&graph_executor.executor->data_entry[eid].dl_tensor; ret_tcodes[0] = kTVMNDArrayHandle; return 0; } -int32_t TVMGraphRuntimeModule_GetNumInputs(TVMValue* args, int* tcodes, int nargs, - TVMValue* ret_values, int* ret_tcodes, - void* resource_handle) { +int32_t TVMGraphExecutorModule_GetNumInputs(TVMValue* args, int* tcodes, int nargs, + TVMValue* ret_values, int* ret_tcodes, + void* resource_handle) { if (nargs != 0) { return kTvmErrorFunctionCallNumArguments; } - ret_values[0].v_int64 = TVMGraphRuntime_GetNumInputs(); + ret_values[0].v_int64 = TVMGraphExecutor_GetNumInputs(); ret_tcodes[0] = kTVMArgInt; return 0; } -int32_t TVMGraphRuntimeModule_GetNumOutputs(TVMValue* args, int* tcodes, int nargs, - TVMValue* ret_values, int* ret_tcodes, - void* resource_handle) { +int32_t TVMGraphExecutorModule_GetNumOutputs(TVMValue* args, int* tcodes, int nargs, + TVMValue* ret_values, int* ret_tcodes, + void* resource_handle) { if (nargs != 0) { return kTvmErrorFunctionCallNumArguments; } - ret_values[0].v_int64 = TVMGraphRuntime_GetNumOutputs(graph_runtime.runtime); + ret_values[0].v_int64 = TVMGraphExecutor_GetNumOutputs(graph_executor.executor); ret_tcodes[0] = kTVMArgInt; return 0; } -int32_t TVMGraphRuntimeModule_GetOutput(TVMValue* args, int* tcodes, int nargs, - TVMValue* ret_values, int* ret_tcodes, - void* resource_handle) { +int32_t TVMGraphExecutorModule_GetOutput(TVMValue* args, int* tcodes, int nargs, + TVMValue* ret_values, int* ret_tcodes, + void* resource_handle) { if (nargs != 1) { return kTvmErrorFunctionCallNumArguments; } @@ -135,22 +136,22 @@ int32_t TVMGraphRuntimeModule_GetOutput(TVMValue* args, int* tcodes, int nargs, } int output_index = args[0].v_int64; - if (output_index < 0 || output_index > TVMGraphRuntime_GetNumOutputs(graph_runtime.runtime)) { + if (output_index < 0 || output_index > TVMGraphExecutor_GetNumOutputs(graph_executor.executor)) { return kTvmErrorGraphModuleNoSuchInput; } - uint32_t nid = graph_runtime.runtime->outputs[output_index].node_id; - uint32_t index = graph_runtime.runtime->outputs[output_index].index; - uint32_t eid = TVMGraphRuntime_GetEntryId(graph_runtime.runtime, nid, index); + uint32_t nid = graph_executor.executor->outputs[output_index].node_id; + uint32_t index = graph_executor.executor->outputs[output_index].index; + uint32_t eid = TVMGraphExecutor_GetEntryId(graph_executor.executor, nid, index); - ret_values[0].v_handle = (void*)&(graph_runtime.runtime->data_entry[eid].dl_tensor); + ret_values[0].v_handle = (void*)&(graph_executor.executor->data_entry[eid].dl_tensor); ret_tcodes[0] 
= kTVMNDArrayHandle; return 0; } -int32_t TVMGraphRuntimeModule_LoadParams(TVMValue* args, int* tcodes, int nargs, - TVMValue* ret_values, int* ret_tcodes, - void* resource_handle) { +int32_t TVMGraphExecutorModule_LoadParams(TVMValue* args, int* tcodes, int nargs, + TVMValue* ret_values, int* ret_tcodes, + void* resource_handle) { if (nargs != 1) { return kTvmErrorFunctionCallNumArguments; } @@ -162,23 +163,24 @@ int32_t TVMGraphRuntimeModule_LoadParams(TVMValue* args, int* tcodes, int nargs, ret_tcodes[0] = kTVMNullptr; TVMByteArray* arr = (TVMByteArray*)args[0].v_handle; - return TVMGraphRuntime_LoadParams(graph_runtime.runtime, arr->data, arr->size); + return TVMGraphExecutor_LoadParams(graph_executor.executor, arr->data, arr->size); } -int32_t TVMGraphRuntimeModule_Run(TVMValue* args, int* tcodes, int nargs, TVMValue* ret_values, - int* ret_tcodes, void* resource_handle) { +int32_t TVMGraphExecutorModule_Run(TVMValue* args, int* tcodes, int nargs, TVMValue* ret_values, + int* ret_tcodes, void* resource_handle) { if (nargs != 0) { return kTvmErrorFunctionCallNumArguments; } - TVMGraphRuntime_Run(graph_runtime.runtime); + TVMGraphExecutor_Run(graph_executor.executor); ret_tcodes[0] = kTVMNullptr; return 0; } -int32_t TVMGraphRuntimeModule_SetInput(TVMValue* args, int* tcodes, int nargs, TVMValue* ret_values, - int* ret_tcodes, void* resource_handle) { +int32_t TVMGraphExecutorModule_SetInput(TVMValue* args, int* tcodes, int nargs, + TVMValue* ret_values, int* ret_tcodes, + void* resource_handle) { if (nargs != 2) { return kTvmErrorFunctionCallNumArguments; } @@ -187,26 +189,26 @@ int32_t TVMGraphRuntimeModule_SetInput(TVMValue* args, int* tcodes, int nargs, T return kTvmErrorFunctionCallWrongArgType; } - TVMGraphRuntime_SetInput(graph_runtime.runtime, args[0].v_str, (DLTensor*)args[1].v_handle); + TVMGraphExecutor_SetInput(graph_executor.executor, args[0].v_str, (DLTensor*)args[1].v_handle); ret_tcodes[0] = kTVMNullptr; return 0; } -int32_t TVMGraphRuntimeModule_NotImplemented(TVMValue* args, int* tcodes, int nargs, - TVMValue* ret_values, int* ret_tcodes, - void* resource_handle) { +int32_t TVMGraphExecutorModule_NotImplemented(TVMValue* args, int* tcodes, int nargs, + TVMValue* ret_values, int* ret_tcodes, + void* resource_handle) { return kTvmErrorFunctionCallNotImplemented; } -static const TVMBackendPackedCFunc graph_runtime_registry_funcs[] = { - &TVMGraphRuntimeModule_GetInput, &TVMGraphRuntimeModule_GetNumInputs, - &TVMGraphRuntimeModule_GetNumOutputs, &TVMGraphRuntimeModule_GetOutput, - &TVMGraphRuntimeModule_LoadParams, &TVMGraphRuntimeModule_Run, - &TVMGraphRuntimeModule_SetInput, &TVMGraphRuntimeModule_NotImplemented, +static const TVMBackendPackedCFunc graph_executor_registry_funcs[] = { + &TVMGraphExecutorModule_GetInput, &TVMGraphExecutorModule_GetNumInputs, + &TVMGraphExecutorModule_GetNumOutputs, &TVMGraphExecutorModule_GetOutput, + &TVMGraphExecutorModule_LoadParams, &TVMGraphExecutorModule_Run, + &TVMGraphExecutorModule_SetInput, &TVMGraphExecutorModule_NotImplemented, }; -static const TVMFuncRegistry graph_runtime_registry = { +static const TVMFuncRegistry graph_executor_registry = { "\x08get_input\0" "get_num_inputs\0" "get_num_outputs\0" @@ -215,11 +217,11 @@ static const TVMFuncRegistry graph_runtime_registry = { "run\0" "set_input\0" "share_params\0", - graph_runtime_registry_funcs}; + graph_executor_registry_funcs}; -tvm_crt_error_t TVMGraphRuntimeModule_Register() { - graph_runtime.mod.registry = &graph_runtime_registry; - graph_runtime.runtime = NULL; 
+tvm_crt_error_t TVMGraphExecutorModule_Register() { + graph_executor.mod.registry = &graph_executor_registry; + graph_executor.executor = NULL; - return TVMFuncRegisterGlobal("tvm.graph_runtime.create", &TVMGraphRuntimeModule_Create, 0); + return TVMFuncRegisterGlobal("tvm.graph_executor.create", &TVMGraphExecutorModule_Create, 0); } diff --git a/src/runtime/crt/host/main.cc b/src/runtime/crt/host/main.cc index bf36deacb938..e64455417928 100644 --- a/src/runtime/crt/host/main.cc +++ b/src/runtime/crt/host/main.cc @@ -34,8 +34,8 @@ #include "crt_config.h" -#ifdef TVM_HOST_USE_GRAPH_RUNTIME_MODULE -#include +#ifdef TVM_HOST_USE_GRAPH_EXECUTOR_MODULE +#include #endif using namespace std::chrono; @@ -61,12 +61,12 @@ void TVMPlatformAbort(tvm_crt_error_t error_code) { MemoryManagerInterface* memory_manager; -tvm_crt_error_t TVMPlatformMemoryAllocate(size_t num_bytes, DLContext ctx, void** out_ptr) { - return memory_manager->Allocate(memory_manager, num_bytes, ctx, out_ptr); +tvm_crt_error_t TVMPlatformMemoryAllocate(size_t num_bytes, DLDevice dev, void** out_ptr) { + return memory_manager->Allocate(memory_manager, num_bytes, dev, out_ptr); } -tvm_crt_error_t TVMPlatformMemoryFree(void* ptr, DLContext ctx) { - return memory_manager->Free(memory_manager, ptr, ctx); +tvm_crt_error_t TVMPlatformMemoryFree(void* ptr, DLDevice dev) { + return memory_manager->Free(memory_manager, ptr, dev); } steady_clock::time_point g_utvm_start_time; @@ -131,9 +131,9 @@ int main(int argc, char** argv) { utvm_rpc_server_t rpc_server = UTvmRpcServerInit(&UTvmWriteFunc, nullptr); -#ifdef TVM_HOST_USE_GRAPH_RUNTIME_MODULE - CHECK_EQ(TVMGraphRuntimeModule_Register(), kTvmErrorNoError, - "failed to register GraphRuntime TVMModule"); +#ifdef TVM_HOST_USE_GRAPH_EXECUTOR_MODULE + CHECK_EQ(TVMGraphExecutorModule_Register(), kTvmErrorNoError, + "failed to register GraphExecutor TVMModule"); #endif if (TVMFuncRegisterGlobal("tvm.testing.reset_server", (TVMFunctionHandle)&testonly_reset_server, diff --git a/src/runtime/crt/include/tvm/runtime/crt/internal/common/ndarray.h b/src/runtime/crt/include/tvm/runtime/crt/internal/common/ndarray.h index bfe83b59b5bc..f878477e7b42 100644 --- a/src/runtime/crt/include/tvm/runtime/crt/internal/common/ndarray.h +++ b/src/runtime/crt/include/tvm/runtime/crt/internal/common/ndarray.h @@ -41,10 +41,10 @@ typedef struct TVMNDArray { DLTensor dl_tensor; } TVMNDArray; -int TVMNDArray_Create(int32_t ndim, const tvm_index_t* shape, DLDataType dtype, DLContext ctx, +int TVMNDArray_Create(int32_t ndim, const tvm_index_t* shape, DLDataType dtype, DLDevice dev, TVMNDArray* array); -int TVMNDArray_Empty(int32_t ndim, const tvm_index_t* shape, DLDataType dtype, DLContext ctx, +int TVMNDArray_Empty(int32_t ndim, const tvm_index_t* shape, DLDataType dtype, DLDevice dev, TVMNDArray* array); int TVMNDArray_Load(TVMNDArray* ret, const char** strm); diff --git a/src/runtime/crt/include/tvm/runtime/crt/internal/graph_runtime/graph_runtime.h b/src/runtime/crt/include/tvm/runtime/crt/internal/graph_executor/graph_executor.h similarity index 56% rename from src/runtime/crt/include/tvm/runtime/crt/internal/graph_runtime/graph_runtime.h rename to src/runtime/crt/include/tvm/runtime/crt/internal/graph_executor/graph_executor.h index 8e0faaa4f199..47ef474778e0 100644 --- a/src/runtime/crt/include/tvm/runtime/crt/internal/graph_runtime/graph_runtime.h +++ b/src/runtime/crt/include/tvm/runtime/crt/internal/graph_executor/graph_executor.h @@ -18,41 +18,41 @@ */ /*! 
- * \file src/runtime/crt/include/tvm/runtime/crt/internal/graph_runtime/graph_runtime.h - * \brief Tiny graph runtime that can run graph containing only tvm PackedFunc. + * \file src/runtime/crt/include/tvm/runtime/crt/internal/graph_executor/graph_executor.h + * \brief Tiny graph executor that can run graph containing only tvm PackedFunc. */ -#ifndef TVM_RUNTIME_CRT_INCLUDE_TVM_RUNTIME_CRT_INTERNAL_GRAPH_RUNTIME_GRAPH_RUNTIME_H_ -#define TVM_RUNTIME_CRT_INCLUDE_TVM_RUNTIME_CRT_INTERNAL_GRAPH_RUNTIME_GRAPH_RUNTIME_H_ +#ifndef TVM_RUNTIME_CRT_INCLUDE_TVM_RUNTIME_CRT_INTERNAL_GRAPH_EXECUTOR_GRAPH_EXECUTOR_H_ +#define TVM_RUNTIME_CRT_INCLUDE_TVM_RUNTIME_CRT_INTERNAL_GRAPH_EXECUTOR_GRAPH_EXECUTOR_H_ -#include +#include #include -#include +#include #include // Memory pool entry. -typedef struct TVMGraphRuntimePoolEntry { +typedef struct TVMGraphExecutorPoolEntry { size_t size; int device_type; int entry_id; -} TVMGraphRuntimePoolEntry; +} TVMGraphExecutorPoolEntry; // Node entry -typedef struct TVMGraphRuntimeNodeEntry { +typedef struct TVMGraphExecutorNodeEntry { uint32_t node_id; uint32_t index; uint32_t version; // JSON Loader void (*Load)(JSONReader* reader); -} TVMGraphRuntimeNodeEntry; +} TVMGraphExecutorNodeEntry; // Storage entry. -typedef struct TVMGraphRuntimeStorageEntry { +typedef struct TVMGraphExecutorStorageEntry { uint8_t is_linked_param; TVMNDArray array; -} TVMGraphRuntimeStorageEntry; +} TVMGraphExecutorStorageEntry; // Node -typedef struct TVMGraphRuntimeNode { +typedef struct TVMGraphExecutorNode { // operator type in string char op_type[16]; // name of the op @@ -60,20 +60,20 @@ typedef struct TVMGraphRuntimeNode { // parameters TVMOpParam param; // inputs - TVMGraphRuntimeNodeEntry* inputs; + TVMGraphExecutorNodeEntry* inputs; // number of inputs size_t inputs_count; // control deps uint32_t control_deps[20]; // JSON Loader - void (*LoadAttrs)(struct TVMGraphRuntimeNode* node, JSONReader* reader, TVMOpParam* param); + void (*LoadAttrs)(struct TVMGraphExecutorNode* node, JSONReader* reader, TVMOpParam* param); // JSON Loader - int (*Load)(struct TVMGraphRuntimeNode* node, JSONReader* reader); -} TVMGraphRuntimeNode; + int (*Load)(struct TVMGraphExecutorNode* node, JSONReader* reader); +} TVMGraphExecutorNode; -typedef struct TVMGraphRuntime { +typedef struct TVMGraphExecutor { /*! \brief The graph nodes. */ - TVMGraphRuntimeNode* nodes; + TVMGraphExecutorNode* nodes; /*! \brief The graph nodes counter. */ uint32_t nodes_count; /*! \brief The argument nodes. */ @@ -83,18 +83,18 @@ typedef struct TVMGraphRuntime { uint32_t* node_row_ptr; uint32_t node_row_ptr_count; /*! \brief Output entries. */ - TVMGraphRuntimeNodeEntry* outputs; + TVMGraphExecutorNodeEntry* outputs; /*! \brief Output entries counter. */ uint32_t outputs_count; /*! \brief Additional graph attributes. */ - TVMGraphRuntimeGraphAttr attrs; + TVMGraphExecutorGraphAttr attrs; /*! \brief The code module that contains both host and device code. */ TVMModuleHandle module_handle; /*! \brief Execution context of all devices including the host. */ - TVMContext ctxs[1]; - uint32_t ctxs_count; + DLDevice devices[1]; + uint32_t devices_count; /*! \brief Common storage pool for all devices. */ - TVMGraphRuntimeStorageEntry* storage_pool; + TVMGraphExecutorStorageEntry* storage_pool; uint32_t storage_pool_count; /*! \brief Data entry of each node. */ TVMNDArray* data_entry; @@ -102,20 +102,20 @@ typedef struct TVMGraphRuntime { /*! \brief Operator on each node. 
*/ TVMPackedFunc* op_execs; uint32_t op_execs_count; -} TVMGraphRuntime; +} TVMGraphExecutor; typedef DLTensor* DLTensorPtr; // private functions -uint32_t TVMGraphRuntime_GetEntryId(TVMGraphRuntime* runtime, uint32_t nid, uint32_t index); -void TVMGraphRuntime_SetInput(TVMGraphRuntime* runtime, const char* name, DLTensor* data_in); -int TVMGraphRuntime_LoadParams(TVMGraphRuntime* runtime, const char* param_blob, - const uint32_t param_size); -void TVMGraphRuntime_Run(TVMGraphRuntime* runtime); -int TVMGraphRuntime_GetOutput(TVMGraphRuntime* runtime, const int32_t idx, DLTensor* out); +uint32_t TVMGraphExecutor_GetEntryId(TVMGraphExecutor* executor, uint32_t nid, uint32_t index); +void TVMGraphExecutor_SetInput(TVMGraphExecutor* executor, const char* name, DLTensor* data_in); +int TVMGraphExecutor_LoadParams(TVMGraphExecutor* executor, const char* param_blob, + const uint32_t param_size); +void TVMGraphExecutor_Run(TVMGraphExecutor* executor); +int TVMGraphExecutor_GetOutput(TVMGraphExecutor* executor, const int32_t idx, DLTensor* out); -int32_t TVMGraphRuntime_CreateTVMOp(TVMGraphRuntime* runtime, const TVMOpParam* param, - DLTensorPtr* args, const uint32_t args_count, - uint32_t num_inputs, TVMPackedFunc* pf); +int32_t TVMGraphExecutor_CreateTVMOp(TVMGraphExecutor* executor, const TVMOpParam* param, + DLTensorPtr* args, const uint32_t args_count, + uint32_t num_inputs, TVMPackedFunc* pf); -#endif // TVM_RUNTIME_CRT_INCLUDE_TVM_RUNTIME_CRT_INTERNAL_GRAPH_RUNTIME_GRAPH_RUNTIME_H_ +#endif // TVM_RUNTIME_CRT_INCLUDE_TVM_RUNTIME_CRT_INTERNAL_GRAPH_EXECUTOR_GRAPH_EXECUTOR_H_ diff --git a/src/runtime/crt/include/tvm/runtime/crt/internal/graph_runtime/load_json.h b/src/runtime/crt/include/tvm/runtime/crt/internal/graph_executor/load_json.h similarity index 90% rename from src/runtime/crt/include/tvm/runtime/crt/internal/graph_runtime/load_json.h rename to src/runtime/crt/include/tvm/runtime/crt/internal/graph_executor/load_json.h index af69506b0634..ac5adc842b62 100644 --- a/src/runtime/crt/include/tvm/runtime/crt/internal/graph_runtime/load_json.h +++ b/src/runtime/crt/include/tvm/runtime/crt/internal/graph_executor/load_json.h @@ -18,11 +18,11 @@ */ /*! - * \file src/runtime/crt/include/tvm/runtime/crt/internal/graph_runtime/load_json.h + * \file src/runtime/crt/include/tvm/runtime/crt/internal/graph_executor/load_json.h * \brief Lightweight JSON Reader that read save into C++ data structs. 
*/ -#ifndef TVM_RUNTIME_CRT_INCLUDE_TVM_RUNTIME_CRT_INTERNAL_GRAPH_RUNTIME_LOAD_JSON_H_ -#define TVM_RUNTIME_CRT_INCLUDE_TVM_RUNTIME_CRT_INTERNAL_GRAPH_RUNTIME_LOAD_JSON_H_ +#ifndef TVM_RUNTIME_CRT_INCLUDE_TVM_RUNTIME_CRT_INTERNAL_GRAPH_EXECUTOR_LOAD_JSON_H_ +#define TVM_RUNTIME_CRT_INCLUDE_TVM_RUNTIME_CRT_INTERNAL_GRAPH_EXECUTOR_LOAD_JSON_H_ #include #include @@ -38,9 +38,9 @@ enum { JSON_READ_TYPE_S32 = 6, JSON_READ_TYPE_F32 = 7, JSON_READ_TYPE_F64 = 8, - JSON_READ_TYPE_GRAPH_RUNTIME_NODE = 9, - JSON_READ_TYPE_GRAPH_RUNTIME_NODE_ENTRY = 10, - JSON_READ_TYPE_GRAPH_RUNTIME_GRAPH_ATTR = 11 + JSON_READ_TYPE_GRAPH_EXECUTOR_NODE = 9, + JSON_READ_TYPE_GRAPH_EXECUTOR_NODE_ENTRY = 10, + JSON_READ_TYPE_GRAPH_EXECUTOR_GRAPH_ATTR = 11 }; typedef struct Seq { @@ -100,4 +100,4 @@ tvm_crt_error_t JSONReader_Create(const char* is, JSONReader* reader); */ tvm_crt_error_t JSONReader_Release(JSONReader* reader); -#endif // TVM_RUNTIME_CRT_INCLUDE_TVM_RUNTIME_CRT_INTERNAL_GRAPH_RUNTIME_LOAD_JSON_H_ +#endif // TVM_RUNTIME_CRT_INCLUDE_TVM_RUNTIME_CRT_INTERNAL_GRAPH_EXECUTOR_LOAD_JSON_H_ diff --git a/src/runtime/crt/memory/memory.c b/src/runtime/crt/memory/memory.c index 105d07870842..ed18544c2181 100644 --- a/src/runtime/crt/memory/memory.c +++ b/src/runtime/crt/memory/memory.c @@ -124,7 +124,7 @@ void MultiMap_Insert(struct MultiMap* map, uint32_t npage, Page* p) { * \return The virtual address */ tvm_crt_error_t MemoryManager_Allocate(MemoryManagerInterface* interface, size_t num_bytes, - DLContext ctx, void** out_ptr) { + DLDevice dev, void** out_ptr) { MemoryManager* mgr = (MemoryManager*)interface; *out_ptr = 0; @@ -256,10 +256,10 @@ tvm_crt_error_t MemoryManager_Realloc(MemoryManagerInterface* interface, void** * \brief Free the memory. * \param interface Pointer to this structure. * \param ptr A pointer returned from TVMPlatformMemoryAllocate which should be free'd. - * \param ctx Execution context passed to TVMPlatformMemoryAllocate. Fixed to {kDLCPU, 0}. + * \param dev Execution device passed to TVMPlatformMemoryAllocate. Fixed to {kDLCPU, 0}. * \return kTvmErrorNoError if successful; a descriptive error code otherwise. 
*/ -tvm_crt_error_t MemoryManager_Free(MemoryManagerInterface* interface, void* ptr, DLContext ctx) { +tvm_crt_error_t MemoryManager_Free(MemoryManagerInterface* interface, void* ptr, DLDevice dev) { MemoryManager* mgr = (MemoryManager*)interface; TLB* pmap = &(mgr->pmap); diff --git a/src/runtime/crt/utvm_rpc_server/rpc_server.cc b/src/runtime/crt/utvm_rpc_server/rpc_server.cc index 0b9e96cd660f..8b7c0eb01840 100644 --- a/src/runtime/crt/utvm_rpc_server/rpc_server.cc +++ b/src/runtime/crt/utvm_rpc_server/rpc_server.cc @@ -208,15 +208,15 @@ utvm_rpc_server_t UTvmRpcServerInit(utvm_rpc_channel_write_t write_func, void* w TVMPlatformAbort(err); } - DLContext ctx = {kDLCPU, 0}; + DLDevice dev = {kDLCPU, 0}; void* receive_buffer_memory; - err = TVMPlatformMemoryAllocate(TVM_CRT_MAX_PACKET_SIZE_BYTES, ctx, &receive_buffer_memory); + err = TVMPlatformMemoryAllocate(TVM_CRT_MAX_PACKET_SIZE_BYTES, dev, &receive_buffer_memory); if (err != kTvmErrorNoError) { TVMPlatformAbort(err); } auto receive_buffer = new (receive_buffer_memory) uint8_t[TVM_CRT_MAX_PACKET_SIZE_BYTES]; void* rpc_server_memory; - err = TVMPlatformMemoryAllocate(sizeof(tvm::runtime::micro_rpc::MicroRPCServer), ctx, + err = TVMPlatformMemoryAllocate(sizeof(tvm::runtime::micro_rpc::MicroRPCServer), dev, &rpc_server_memory); if (err != kTvmErrorNoError) { TVMPlatformAbort(err); diff --git a/src/runtime/cuda/cuda_device_api.cc b/src/runtime/cuda/cuda_device_api.cc index f156d68d283e..5d6ff2b263c2 100644 --- a/src/runtime/cuda/cuda_device_api.cc +++ b/src/runtime/cuda/cuda_device_api.cc @@ -37,56 +37,56 @@ namespace runtime { class CUDADeviceAPI final : public DeviceAPI { public: - void SetDevice(TVMContext ctx) final { CUDA_CALL(cudaSetDevice(ctx.device_id)); } - void GetAttr(TVMContext ctx, DeviceAttrKind kind, TVMRetValue* rv) final { + void SetDevice(Device dev) final { CUDA_CALL(cudaSetDevice(dev.device_id)); } + void GetAttr(Device dev, DeviceAttrKind kind, TVMRetValue* rv) final { int value = 0; switch (kind) { case kExist: - value = (cudaDeviceGetAttribute(&value, cudaDevAttrMaxThreadsPerBlock, ctx.device_id) == + value = (cudaDeviceGetAttribute(&value, cudaDevAttrMaxThreadsPerBlock, dev.device_id) == cudaSuccess); break; case kMaxThreadsPerBlock: { - CUDA_CALL(cudaDeviceGetAttribute(&value, cudaDevAttrMaxThreadsPerBlock, ctx.device_id)); + CUDA_CALL(cudaDeviceGetAttribute(&value, cudaDevAttrMaxThreadsPerBlock, dev.device_id)); break; } case kWarpSize: { - CUDA_CALL(cudaDeviceGetAttribute(&value, cudaDevAttrWarpSize, ctx.device_id)); + CUDA_CALL(cudaDeviceGetAttribute(&value, cudaDevAttrWarpSize, dev.device_id)); break; } case kMaxSharedMemoryPerBlock: { CUDA_CALL( - cudaDeviceGetAttribute(&value, cudaDevAttrMaxSharedMemoryPerBlock, ctx.device_id)); + cudaDeviceGetAttribute(&value, cudaDevAttrMaxSharedMemoryPerBlock, dev.device_id)); break; } case kComputeVersion: { std::ostringstream os; - CUDA_CALL(cudaDeviceGetAttribute(&value, cudaDevAttrComputeCapabilityMajor, ctx.device_id)); + CUDA_CALL(cudaDeviceGetAttribute(&value, cudaDevAttrComputeCapabilityMajor, dev.device_id)); os << value << "."; - CUDA_CALL(cudaDeviceGetAttribute(&value, cudaDevAttrComputeCapabilityMinor, ctx.device_id)); + CUDA_CALL(cudaDeviceGetAttribute(&value, cudaDevAttrComputeCapabilityMinor, dev.device_id)); os << value; *rv = os.str(); return; } case kDeviceName: { std::string name(256, 0); - CUDA_DRIVER_CALL(cuDeviceGetName(&name[0], name.size(), ctx.device_id)); + CUDA_DRIVER_CALL(cuDeviceGetName(&name[0], name.size(), dev.device_id)); 
name.resize(strlen(name.c_str())); *rv = std::move(name); return; } case kMaxClockRate: { - CUDA_CALL(cudaDeviceGetAttribute(&value, cudaDevAttrClockRate, ctx.device_id)); + CUDA_CALL(cudaDeviceGetAttribute(&value, cudaDevAttrClockRate, dev.device_id)); break; } case kMultiProcessorCount: { - CUDA_CALL(cudaDeviceGetAttribute(&value, cudaDevAttrMultiProcessorCount, ctx.device_id)); + CUDA_CALL(cudaDeviceGetAttribute(&value, cudaDevAttrMultiProcessorCount, dev.device_id)); break; } case kMaxThreadDimensions: { int dims[3]; - CUDA_CALL(cudaDeviceGetAttribute(&dims[0], cudaDevAttrMaxBlockDimX, ctx.device_id)); - CUDA_CALL(cudaDeviceGetAttribute(&dims[1], cudaDevAttrMaxBlockDimY, ctx.device_id)); - CUDA_CALL(cudaDeviceGetAttribute(&dims[2], cudaDevAttrMaxBlockDimZ, ctx.device_id)); + CUDA_CALL(cudaDeviceGetAttribute(&dims[0], cudaDevAttrMaxBlockDimX, dev.device_id)); + CUDA_CALL(cudaDeviceGetAttribute(&dims[1], cudaDevAttrMaxBlockDimY, dev.device_id)); + CUDA_CALL(cudaDeviceGetAttribute(&dims[2], cudaDevAttrMaxBlockDimZ, dev.device_id)); std::stringstream ss; // use json string to return multiple int values; ss << "[" << dims[0] << ", " << dims[1] << ", " << dims[2] << "]"; @@ -94,7 +94,7 @@ class CUDADeviceAPI final : public DeviceAPI { return; } case kMaxRegistersPerBlock: { - CUDA_CALL(cudaDeviceGetAttribute(&value, cudaDevAttrMaxRegistersPerBlock, ctx.device_id)); + CUDA_CALL(cudaDeviceGetAttribute(&value, cudaDevAttrMaxRegistersPerBlock, dev.device_id)); break; } case kGcnArch: @@ -106,62 +106,61 @@ class CUDADeviceAPI final : public DeviceAPI { } *rv = value; } - void* AllocDataSpace(TVMContext ctx, size_t nbytes, size_t alignment, - DLDataType type_hint) final { + void* AllocDataSpace(Device dev, size_t nbytes, size_t alignment, DLDataType type_hint) final { ICHECK_EQ(256 % alignment, 0U) << "CUDA space is aligned at 256 bytes"; void* ret; - if (ctx.device_type == kDLCPUPinned) { + if (dev.device_type == kDLCPUPinned) { CUDA_CALL(cudaMallocHost(&ret, nbytes)); } else { - CUDA_CALL(cudaSetDevice(ctx.device_id)); + CUDA_CALL(cudaSetDevice(dev.device_id)); CUDA_CALL(cudaMalloc(&ret, nbytes)); } return ret; } - void FreeDataSpace(TVMContext ctx, void* ptr) final { - if (ctx.device_type == kDLCPUPinned) { + void FreeDataSpace(Device dev, void* ptr) final { + if (dev.device_type == kDLCPUPinned) { CUDA_CALL(cudaFreeHost(ptr)); } else { - CUDA_CALL(cudaSetDevice(ctx.device_id)); + CUDA_CALL(cudaSetDevice(dev.device_id)); CUDA_CALL(cudaFree(ptr)); } } protected: void CopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset, size_t size, - TVMContext ctx_from, TVMContext ctx_to, DLDataType type_hint, + Device dev_from, Device dev_to, DLDataType type_hint, TVMStreamHandle stream) final { cudaStream_t cu_stream = static_cast(stream); from = static_cast(from) + from_offset; to = static_cast(to) + to_offset; - if (ctx_from.device_type == kDLCPUPinned) { - ctx_from.device_type = kDLCPU; + if (dev_from.device_type == kDLCPUPinned) { + dev_from.device_type = kDLCPU; } - if (ctx_to.device_type == kDLCPUPinned) { - ctx_to.device_type = kDLCPU; + if (dev_to.device_type == kDLCPUPinned) { + dev_to.device_type = kDLCPU; } // In case there is a copy from host mem to host mem */ - if (ctx_to.device_type == kDLCPU && ctx_from.device_type == kDLCPU) { + if (dev_to.device_type == kDLCPU && dev_from.device_type == kDLCPU) { memcpy(to, from, size); return; } - if (ctx_from.device_type == kDLGPU && ctx_to.device_type == kDLGPU) { - CUDA_CALL(cudaSetDevice(ctx_from.device_id)); - if 
(ctx_from.device_id == ctx_to.device_id) { + if (dev_from.device_type == kDLGPU && dev_to.device_type == kDLGPU) { + CUDA_CALL(cudaSetDevice(dev_from.device_id)); + if (dev_from.device_id == dev_to.device_id) { GPUCopy(from, to, size, cudaMemcpyDeviceToDevice, cu_stream); } else { - cudaMemcpyPeerAsync(to, ctx_to.device_id, from, ctx_from.device_id, size, cu_stream); + cudaMemcpyPeerAsync(to, dev_to.device_id, from, dev_from.device_id, size, cu_stream); } - } else if (ctx_from.device_type == kDLGPU && ctx_to.device_type == kDLCPU) { - CUDA_CALL(cudaSetDevice(ctx_from.device_id)); + } else if (dev_from.device_type == kDLGPU && dev_to.device_type == kDLCPU) { + CUDA_CALL(cudaSetDevice(dev_from.device_id)); GPUCopy(from, to, size, cudaMemcpyDeviceToHost, cu_stream); - } else if (ctx_from.device_type == kDLCPU && ctx_to.device_type == kDLGPU) { - CUDA_CALL(cudaSetDevice(ctx_to.device_id)); + } else if (dev_from.device_type == kDLCPU && dev_to.device_type == kDLGPU) { + CUDA_CALL(cudaSetDevice(dev_to.device_id)); GPUCopy(from, to, size, cudaMemcpyHostToDevice, cu_stream); } else { LOG(FATAL) << "expect copy from/to GPU or between GPU"; @@ -169,21 +168,21 @@ class CUDADeviceAPI final : public DeviceAPI { } public: - TVMStreamHandle CreateStream(TVMContext ctx) { - CUDA_CALL(cudaSetDevice(ctx.device_id)); + TVMStreamHandle CreateStream(Device dev) { + CUDA_CALL(cudaSetDevice(dev.device_id)); cudaStream_t retval; CUDA_CALL(cudaStreamCreate(&retval)); return static_cast(retval); } - void FreeStream(TVMContext ctx, TVMStreamHandle stream) { - CUDA_CALL(cudaSetDevice(ctx.device_id)); + void FreeStream(Device dev, TVMStreamHandle stream) { + CUDA_CALL(cudaSetDevice(dev.device_id)); cudaStream_t cu_stream = static_cast(stream); CUDA_CALL(cudaStreamDestroy(cu_stream)); } - void SyncStreamFromTo(TVMContext ctx, TVMStreamHandle event_src, TVMStreamHandle event_dst) { - CUDA_CALL(cudaSetDevice(ctx.device_id)); + void SyncStreamFromTo(Device dev, TVMStreamHandle event_src, TVMStreamHandle event_dst) { + CUDA_CALL(cudaSetDevice(dev.device_id)); cudaStream_t src_stream = static_cast(event_src); cudaStream_t dst_stream = static_cast(event_dst); cudaEvent_t evt; @@ -193,21 +192,21 @@ class CUDADeviceAPI final : public DeviceAPI { CUDA_CALL(cudaEventDestroy(evt)); } - void StreamSync(TVMContext ctx, TVMStreamHandle stream) final { - CUDA_CALL(cudaSetDevice(ctx.device_id)); + void StreamSync(Device dev, TVMStreamHandle stream) final { + CUDA_CALL(cudaSetDevice(dev.device_id)); CUDA_CALL(cudaStreamSynchronize(static_cast(stream))); } - void SetStream(TVMContext ctx, TVMStreamHandle stream) final { + void SetStream(Device dev, TVMStreamHandle stream) final { CUDAThreadEntry::ThreadLocal()->stream = static_cast(stream); } - void* AllocWorkspace(TVMContext ctx, size_t size, DLDataType type_hint) final { - return CUDAThreadEntry::ThreadLocal()->pool.AllocWorkspace(ctx, size); + void* AllocWorkspace(Device dev, size_t size, DLDataType type_hint) final { + return CUDAThreadEntry::ThreadLocal()->pool.AllocWorkspace(dev, size); } - void FreeWorkspace(TVMContext ctx, void* data) final { - CUDAThreadEntry::ThreadLocal()->pool.FreeWorkspace(ctx, data); + void FreeWorkspace(Device dev, void* data) final { + CUDAThreadEntry::ThreadLocal()->pool.FreeWorkspace(dev, data); } static CUDADeviceAPI* Global() { @@ -275,7 +274,7 @@ class GPUTimerNode : public TimerNode { TVM_REGISTER_OBJECT_TYPE(GPUTimerNode); -TVM_REGISTER_GLOBAL("profiling.timer.gpu").set_body_typed([](TVMContext ctx) { 
+TVM_REGISTER_GLOBAL("profiling.timer.gpu").set_body_typed([](Device dev) { return Timer(make_object()); }); diff --git a/src/runtime/file_utils.cc b/src/runtime/file_utils.cc index 92c398b559d2..32dd1d8020c9 100644 --- a/src/runtime/file_utils.cc +++ b/src/runtime/file_utils.cc @@ -24,9 +24,9 @@ #include #include +#include #include #include -#include #include #include diff --git a/src/runtime/graph/graph_runtime_factory.cc b/src/runtime/graph/graph_runtime_factory.cc deleted file mode 100644 index 605d6b0ce892..000000000000 --- a/src/runtime/graph/graph_runtime_factory.cc +++ /dev/null @@ -1,176 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! - * \file graph_runtime_factory.cc - * \brief Graph runtime factory implementations - */ - -#include "./graph_runtime_factory.h" - -#include -#include -#include - -#include -#include - -namespace tvm { -namespace runtime { - -GraphRuntimeFactory::GraphRuntimeFactory( - const std::string& graph_json, - const std::unordered_map& params, - const std::string& module_name) { - graph_json_ = graph_json; - params_ = params; - module_name_ = module_name; -} - -PackedFunc GraphRuntimeFactory::GetFunction( - const std::string& name, const tvm::runtime::ObjectPtr& sptr_to_self) { - if (name == module_name_) { - return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { - std::vector contexts; - for (int i = 0; i < args.num_args; ++i) { - contexts.emplace_back(args[i].operator TVMContext()); - } - *rv = this->RuntimeCreate(contexts); - }); - } else if (name == "debug_create") { - return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { - ICHECK_GE(args.size(), 2); - std::string module_name = args[0].operator String(); - ICHECK(module_name == module_name_) << "Currently we only support single model for now."; - std::vector contexts; - for (int i = 1; i < args.num_args; ++i) { - contexts.emplace_back(args[i].operator TVMContext()); - } - *rv = this->DebugRuntimeCreate(contexts); - }); - } else if (name == "remove_params") { - return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { - std::unordered_map empty_params{}; - auto exec = - make_object(this->graph_json_, empty_params, this->module_name_); - exec->Import(this->imports_[0]); - *rv = Module(exec); - }); - } else { - return PackedFunc(); - } -} - -void GraphRuntimeFactory::SaveToBinary(dmlc::Stream* stream) { - stream->Write(graph_json_); - std::vector names; - std::vector arrays; - for (const auto& v : params_) { - names.emplace_back(v.first); - arrays.emplace_back(const_cast(v.second.operator->())); - } - uint64_t sz = arrays.size(); - ICHECK(sz == names.size()); - stream->Write(sz); - stream->Write(names); - for (size_t i = 0; i < sz; ++i) { - tvm::runtime::SaveDLTensor(stream, arrays[i]); - 
} - stream->Write(module_name_); -} - -Module GraphRuntimeFactory::RuntimeCreate(const std::vector& ctxs) { - auto exec = make_object(); - exec->Init(this->graph_json_, this->imports_[0], ctxs, PackedFunc()); - // set params - SetParams(exec.get(), this->params_); - return Module(exec); -} - -Module GraphRuntimeFactory::DebugRuntimeCreate(const std::vector& ctxs) { - const PackedFunc* pf = tvm::runtime::Registry::Get("tvm.graph_runtime_debug.create"); - ICHECK(pf != nullptr) << "Cannot find function tvm.graph_runtime_debug.create in registry. " - "Do you enable debug graph runtime build?"; - // Debug runtime create packed function will call GetAllContexs, so we unpack the ctxs. - std::vector unpacked_ctxs; - for (const auto& ctx : ctxs) { - unpacked_ctxs.emplace_back(ctx.device_type); - unpacked_ctxs.emplace_back(ctx.device_id); - } - size_t args_size = unpacked_ctxs.size() + 2; - std::vector values(args_size); - std::vector codes(args_size); - runtime::TVMArgsSetter setter(values.data(), codes.data()); - setter(0, this->graph_json_); - setter(1, this->imports_[0]); - for (size_t i = 0; i < unpacked_ctxs.size(); ++i) { - setter(i + 2, unpacked_ctxs[i]); - } - TVMRetValue rv; - pf->CallPacked(TVMArgs(values.data(), codes.data(), args_size), &rv); - Module mod = rv.operator Module(); - // debug graph runtime is one child class of graph runtime. - SetParams(const_cast(mod.as()), this->params_); - return mod; -} - -Module GraphRuntimeFactoryModuleLoadBinary(void* strm) { - dmlc::Stream* stream = static_cast(strm); - std::string graph_json; - std::unordered_map params; - std::string module_name; - ICHECK(stream->Read(&graph_json)); - uint64_t sz; - ICHECK(stream->Read(&sz)); - std::vector names; - ICHECK(stream->Read(&names)); - ICHECK(sz == names.size()); - for (size_t i = 0; i < sz; ++i) { - tvm::runtime::NDArray temp; - temp.Load(stream); - params[names[i]] = temp; - } - ICHECK(stream->Read(&module_name)); - auto exec = make_object(graph_json, params, module_name); - return Module(exec); -} - -TVM_REGISTER_GLOBAL("tvm.graph_runtime_factory.create").set_body([](TVMArgs args, TVMRetValue* rv) { - ICHECK_GE(args.num_args, 3) << "The expected number of arguments for " - "graph_runtime_factory.create needs at least 3, " - "but it has " - << args.num_args; - // The argument order is graph_json, module, module_name, param0_name, param0_tensor, - // [param1_name, param1_tensor], ... - ICHECK_EQ((args.size() - 3) % 2, 0); - std::unordered_map params; - for (size_t i = 3; i < static_cast(args.size()); i += 2) { - std::string name = args[i].operator String(); - params[name] = args[i + 1].operator tvm::runtime::NDArray(); - } - auto exec = make_object(args[0], params, args[2]); - exec->Import(args[1]); - *rv = Module(exec); -}); - -TVM_REGISTER_GLOBAL("runtime.module.loadbinary_GraphRuntimeFactory") - .set_body_typed(GraphRuntimeFactoryModuleLoadBinary); - -} // namespace runtime -} // namespace tvm diff --git a/src/runtime/graph_executor/cuda_graph/graph_runtime_cuda_graph.cc b/src/runtime/graph_executor/cuda_graph/graph_runtime_cuda_graph.cc new file mode 100644 index 000000000000..53f225403be6 --- /dev/null +++ b/src/runtime/graph_executor/cuda_graph/graph_runtime_cuda_graph.cc @@ -0,0 +1,136 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file graph_executor_cuda_graph.cc
+ */
+
+#include <tvm/runtime/registry.h>
+
+#include "../../cuda/cuda_common.h"
+#include "../graph_executor.h"
+
+namespace tvm {
+namespace runtime {
+
+/*!
+ * \brief Graph executor with CUDA Graph Support.
+ *
+ * This is the extension of GraphExecutor class used for CUDA graph launch
+ * instead of CUDA kernel launch. CUDA graph launch requires CUDA 10.0 or
+ * above; currently there are two ways of constructing CUDA graphs:
+ * (1) using the CUDA stream capture API to capture a series of operations
+ * on a CUDA stream, which automatically generates a graph, or (2) building
+ * a graph manually with the CUDA graph API. This implementation uses
+ * stream capture.
+ */
+class GraphExecutorCudaGraph : public GraphExecutor {
+ public:
+  /*!
+   * \brief Begin CUDA graph capture on the stream; the stream enters capture mode.
+   */
+  void StartCapture() {
+    const Device& dev = data_entry_[entry_id(0, 0)]->device;
+
+    TVMStreamCreate(dev.device_type, dev.device_id, &capture_stream_);
+    TVMSetStream(dev.device_type, dev.device_id, capture_stream_);
+
+    CUDA_CALL(cudaStreamBeginCapture(static_cast<cudaStream_t>(capture_stream_),
+                                     cudaStreamCaptureModeGlobal));
+  }
+
+  /*!
+   * \brief Launch the instantiated graph on the stream.
+   */
+  void RunCudaGraph() {
+    cudaStream_t cuStream = static_cast<cudaStream_t>(capture_stream_);
+    CUDA_CALL(cudaGraphLaunch(cuda_graph_exec_, cuStream));
+    CUDA_CALL(cudaStreamSynchronize(cuStream));
+  }
+
+  /*!
+   * \brief End CUDA graph capture on the stream; a graph is created and
+   * instantiated.
+   */
+  void EndCapture() {
+    cudaGraph_t graph;
+    CUDA_CALL(cudaStreamEndCapture(static_cast<cudaStream_t>(capture_stream_), &graph));
+
+    cudaGraphNode_t* nodes = NULL;
+    size_t numNodes = 0;
+    CUDA_CALL(cudaGraphGetNodes(graph, nodes, &numNodes));
+    LOG(INFO) << "Num of nodes in the cuda graph created using stream capture API = " << numNodes;
+
+    CUDA_CALL(cudaGraphInstantiate(&cuda_graph_exec_, graph, NULL, NULL, 0));
+  }
+
+  /*!
+   * \brief GetFunction Get the function based on input.
+   * \param name The function which needs to be invoked.
+   * \param sptr_to_self Packed function pointer.
+   */
+  PackedFunc GetFunction(const std::string& name, const ObjectPtr<Object>& sptr_to_self);
+
+ private:
+  /*! \brief The CUDA stream on which to capture a CUDA graph. */
+  TVMStreamHandle capture_stream_;
+  /*! \brief The captured CUDA graph will be instantiated to this.
*/ + cudaGraphExec_t cuda_graph_exec_; +}; + +PackedFunc GraphExecutorCudaGraph::GetFunction(const std::string& name, + const ObjectPtr& sptr_to_self) { + if (name == "run_cuda_graph") { + return PackedFunc( + [sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { this->RunCudaGraph(); }); + } else if (name == "start_capture") { + return PackedFunc( + [sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { this->StartCapture(); }); + } else if (name == "end_capture") { + return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { this->EndCapture(); }); + } else { + return GraphExecutor::GetFunction(name, sptr_to_self); + } +} + +Module GraphExecutorCudaGraphCreate(const std::string& sym_json, const tvm::runtime::Module& m, + const std::vector& devs, + PackedFunc lookup_linked_param_func) { + auto exec = make_object(); + exec->Init(sym_json, m, devs, lookup_linked_param_func); + return Module(exec); +} + +TVM_REGISTER_GLOBAL("tvm.graph_executor_cuda_graph.create") + .set_body([](TVMArgs args, TVMRetValue* rv) { + ICHECK_GE(args.num_args, 4) + << "The expected number of arguments for graph_executor.create is " + "at least 4, but it has " + << args.num_args; + PackedFunc lookup_linked_param_func; + int dev_start_arg = 2; + if (args[2].type_code() == kTVMPackedFuncHandle) { + lookup_linked_param_func = args[2]; + dev_start_arg++; + } + + *rv = GraphExecutorCudaGraphCreate(args[0], args[1], GetAllDevice(args, dev_start_arg), + lookup_linked_param_func); + }); +} // namespace runtime +} // namespace tvm diff --git a/src/runtime/graph/debug/graph_runtime_debug.cc b/src/runtime/graph_executor/debug/graph_executor_debug.cc similarity index 85% rename from src/runtime/graph/debug/graph_runtime_debug.cc rename to src/runtime/graph_executor/debug/graph_executor_debug.cc index 0e3003aa42c3..7c1e6960f9f5 100644 --- a/src/runtime/graph/debug/graph_runtime_debug.cc +++ b/src/runtime/graph_executor/debug/graph_executor_debug.cc @@ -18,7 +18,7 @@ */ /*! - * \file graph_runtime_debug.cc + * \file graph_executor_debug.cc */ #include #include @@ -29,18 +29,18 @@ #include #include -#include "../graph_runtime.h" +#include "../graph_executor.h" namespace tvm { namespace runtime { /*! - * \brief Graph runtime with debug . + * \brief Graph executor with debug . * - * This is the extension of GraphRuntime class used for debugging + * This is the extension of GraphExecutor class used for debugging * TVM runtime PackedFunc API. */ -class GraphRuntimeDebug : public GraphRuntime { +class GraphExecutorDebug : public GraphExecutor { public: /*! * \brief Run each operation in the graph and get the time per op for all ops. @@ -58,7 +58,7 @@ class GraphRuntimeDebug : public GraphRuntime { */ std::string RunIndividual(int number, int repeat, int min_repeat_ms) { // warmup run - GraphRuntime::Run(); + GraphExecutor::Run(); std::string tkey = module_->type_key(); std::vector time_sec_per_op(op_execs_.size(), 0); if (tkey == "rpc") { @@ -128,12 +128,12 @@ class GraphRuntimeDebug : public GraphRuntime { << "Don't know how to run op type " << nodes_[index].op_type << " remotely over RPC right now"; - // NOTE: GraphRuntimeDebug expects graph nodes to have an "op" attribute of "tvm_op" or "null" - // and "null" is a placeholder node for a parameter or input. + // NOTE: GraphExecutorDebug expects graph nodes to have an "op" attribute of "tvm_op" or + // "null" and "null" is a placeholder node for a parameter or input. 
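Taken together, StartCapture, EndCapture, and RunCudaGraph above implement the standard capture-instantiate-replay workflow. For reference, the same pattern in a free-standing program looks roughly like this (sketch only; needs a recent CUDA toolkit, and the cudaMemsetAsync is an illustrative stand-in for the model's kernel launches):

#include <cuda_runtime.h>
#include <cstdio>

#define CUDA_OK(x)                                            \
  do {                                                        \
    cudaError_t e = (x);                                      \
    if (e != cudaSuccess) {                                   \
      std::printf("CUDA: %s\n", cudaGetErrorString(e));       \
      return 1;                                               \
    }                                                         \
  } while (0)

int main() {
  cudaStream_t stream;
  CUDA_OK(cudaStreamCreate(&stream));
  void* buf = nullptr;
  CUDA_OK(cudaMalloc(&buf, 1 << 20));

  // StartCapture: work issued on the stream is recorded, not executed.
  CUDA_OK(cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal));
  CUDA_OK(cudaMemsetAsync(buf, 0, 1 << 20, stream));  // stand-in for kernel launches

  // EndCapture: materialize the graph and instantiate it once...
  cudaGraph_t graph;
  CUDA_OK(cudaStreamEndCapture(stream, &graph));
  cudaGraphExec_t exec;
  CUDA_OK(cudaGraphInstantiate(&exec, graph, nullptr, nullptr, 0));

  // RunCudaGraph: ...then replay it cheaply, skipping per-launch setup cost.
  for (int i = 0; i < 10; ++i) CUDA_OK(cudaGraphLaunch(exec, stream));
  CUDA_OK(cudaStreamSynchronize(stream));

  cudaGraphExecDestroy(exec);
  cudaGraphDestroy(graph);
  cudaFree(buf);
  cudaStreamDestroy(stream);
  return 0;
}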
return 0; } - const TVMContext& ctx = data_entry_[entry_id(index, 0)]->ctx; + const Device& dev = data_entry_[entry_id(index, 0)]->device; TVMOpParam param = nodes_[index].param; std::string name = param.func_name; uint32_t num_inputs = param.num_inputs; @@ -141,8 +141,8 @@ class GraphRuntimeDebug : public GraphRuntime { PackedFunc time_eval = runtime::Registry::Get("runtime.RPCTimeEvaluator") -> - operator()(module_, name, static_cast(ctx.device_type), - ctx.device_id, number, repeat, min_repeat_ms, ""); + operator()(module_, name, static_cast(dev.device_type), + dev.device_id, number, repeat, min_repeat_ms, ""); int num_flat_args = num_inputs + num_outputs; std::unique_ptr values(new TVMValue[num_flat_args]); @@ -171,8 +171,8 @@ class GraphRuntimeDebug : public GraphRuntime { } Timer RunOpHost(int index) { - const TVMContext& ctx = data_entry_[entry_id(index, 0)]->ctx; - Timer t = Timer::Start(ctx); + const Device& dev = data_entry_[entry_id(index, 0)]->device; + Timer t = Timer::Start(dev); op_execs_[index](); t->Stop(); return t; @@ -235,8 +235,8 @@ class GraphRuntimeDebug : public GraphRuntime { * \param name The function which needs to be invoked. * \param sptr_to_self Packed function pointer. */ -PackedFunc GraphRuntimeDebug::GetFunction(const std::string& name, - const ObjectPtr& sptr_to_self) { +PackedFunc GraphExecutorDebug::GetFunction(const std::string& name, + const ObjectPtr& sptr_to_self) { // return member functions during query. if (name == "get_output_by_layer") { return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { @@ -261,37 +261,37 @@ PackedFunc GraphRuntimeDebug::GetFunction(const std::string& name, *rv = this->RunIndividual(number, repeat, min_repeat_ms); }); } else { - return GraphRuntime::GetFunction(name, sptr_to_self); + return GraphExecutor::GetFunction(name, sptr_to_self); } } /*! - * \brief GraphRuntimeDebugCreate Get the function based on input. + * \brief GraphExecutorDebugCreate Get the function based on input. * \param sym_json The graph symbol in json format. * \param m Compiled module which will be loaded. - * \param ctxs All devices contexts. + * \param devs All devices. 
*/ -Module GraphRuntimeDebugCreate(const std::string& sym_json, const tvm::runtime::Module& m, - const std::vector& ctxs, - PackedFunc lookup_linked_param_func) { - auto exec = make_object(); - exec->Init(sym_json, m, ctxs, lookup_linked_param_func); +Module GraphExecutorDebugCreate(const std::string& sym_json, const tvm::runtime::Module& m, + const std::vector& devs, + PackedFunc lookup_linked_param_func) { + auto exec = make_object(); + exec->Init(sym_json, m, devs, lookup_linked_param_func); return Module(exec); } -TVM_REGISTER_GLOBAL("tvm.graph_runtime_debug.create").set_body([](TVMArgs args, TVMRetValue* rv) { - ICHECK_GE(args.num_args, 4) << "The expected number of arguments for graph_runtime.create is " +TVM_REGISTER_GLOBAL("tvm.graph_executor_debug.create").set_body([](TVMArgs args, TVMRetValue* rv) { + ICHECK_GE(args.num_args, 4) << "The expected number of arguments for graph_executor.create is " "at least 4, but it has " << args.num_args; PackedFunc lookup_linked_param_func; - int ctx_start_arg = 2; + int dev_start_arg = 2; if (args[2].type_code() == kTVMPackedFuncHandle) { lookup_linked_param_func = args[2]; - ctx_start_arg++; + dev_start_arg++; } - *rv = GraphRuntimeDebugCreate(args[0], args[1], GetAllContext(args, ctx_start_arg), - lookup_linked_param_func); + *rv = GraphExecutorDebugCreate(args[0], args[1], GetAllDevice(args, dev_start_arg), + lookup_linked_param_func); }); } // namespace runtime } // namespace tvm diff --git a/src/runtime/graph/graph_runtime.cc b/src/runtime/graph_executor/graph_executor.cc similarity index 80% rename from src/runtime/graph/graph_runtime.cc rename to src/runtime/graph_executor/graph_executor.cc index 6c51e711aef1..c4d984fe9633 100644 --- a/src/runtime/graph/graph_runtime.cc +++ b/src/runtime/graph_executor/graph_executor.cc @@ -18,9 +18,9 @@ */ /*! - * \file graph_runtime.cc + * \file graph_executor.cc */ -#include "graph_runtime.h" +#include "graph_executor.h" #include #include @@ -53,28 +53,29 @@ inline size_t GetDataAlignment(const DLTensor& arr) { /*! * \brief Run all the operations one by one. */ -void GraphRuntime::Run() { +void GraphExecutor::Run() { // setup the array and requirements. for (size_t i = 0; i < op_execs_.size(); ++i) { if (op_execs_[i]) op_execs_[i](); } } /*! - * \brief Initialize the graph executor with graph and context. + * \brief Initialize the graph executor with graph and device. * \param graph_json The execution graph. * \param module The module containing the compiled functions for the host * processor. - * \param ctxs The context of the host and devices where graph nodes will be + * \param devs The devices of the host and devices where graph nodes will be * executed on. - * \param lookup_linked_param_func Linked parameter lookup function. + * \param lookup_linked_param_func Linked parameter lookup function. Default is nullptr. 
*/ -void GraphRuntime::Init(const std::string& graph_json, tvm::runtime::Module module, - const std::vector& ctxs, PackedFunc lookup_linked_param_func) { +void GraphExecutor::Init(const std::string& graph_json, tvm::runtime::Module module, + const std::vector& devs, + const PackedFunc lookup_linked_param_func) { std::istringstream is(graph_json); dmlc::JSONReader reader(&is); this->Load(&reader); module_ = module; - ctxs_ = ctxs; + devices_ = devs; lookup_linked_param_ = lookup_linked_param_func; if (lookup_linked_param_ == nullptr) { lookup_linked_param_ = PackedFunc( @@ -93,7 +94,7 @@ void GraphRuntime::Init(const std::string& graph_json, tvm::runtime::Module modu * \param name The name of the input. * \return The index of input. */ -int GraphRuntime::GetInputIndex(const std::string& name) { +int GraphExecutor::GetInputIndex(const std::string& name) { auto it = input_map_.find(name); if (it != input_map_.end()) { return it->second; @@ -105,7 +106,7 @@ int GraphRuntime::GetInputIndex(const std::string& name) { * \param index The input index. * \param data_in The input data. */ -void GraphRuntime::SetInput(int index, DLTensor* data_in) { +void GraphExecutor::SetInput(int index, DLTensor* data_in) { ICHECK_LT(static_cast(index), input_nodes_.size()); uint32_t eid = this->entry_id(input_nodes_[index], 0); data_entry_[eid].CopyFrom(data_in); @@ -115,7 +116,7 @@ void GraphRuntime::SetInput(int index, DLTensor* data_in) { * \param index The input index. * \param data_ref The input data that is referred. */ -void GraphRuntime::SetInputZeroCopy(int index, DLTensor* data_ref) { +void GraphExecutor::SetInputZeroCopy(int index, DLTensor* data_ref) { ICHECK_LT(static_cast(index), input_nodes_.size()); uint32_t eid = this->entry_id(input_nodes_[index], 0); const DLTensor* old_t = data_entry_[eid].operator->(); @@ -124,8 +125,8 @@ void GraphRuntime::SetInputZeroCopy(int index, DLTensor* data_ref) { ICHECK_EQ(data_alignment_[eid], details::GetDataAlignment(*data_ref)); ICHECK_EQ(reinterpret_cast(data_ref->data) % kAllocAlignment, 0); ICHECK_EQ(old_t->ndim, static_cast(data_ref->ndim)); - ICHECK_EQ(old_t->ctx.device_type, data_ref->ctx.device_type); - ICHECK_EQ(old_t->ctx.device_id, data_ref->ctx.device_id); + ICHECK_EQ(old_t->device.device_type, data_ref->device.device_type); + ICHECK_EQ(old_t->device.device_id, data_ref->device.device_id); for (auto i = 0; i < data_ref->ndim; ++i) { ICHECK_EQ(old_t->shape[i], data_ref->shape[i]); } @@ -140,20 +141,20 @@ void GraphRuntime::SetInputZeroCopy(int index, DLTensor* data_ref) { * * \return The number of outputs from graph. */ -int GraphRuntime::NumOutputs() const { return outputs_.size(); } +int GraphExecutor::NumOutputs() const { return outputs_.size(); } /*! * \brief Get the number of inputs * * \return The number of inputs to the graph. */ -int GraphRuntime::NumInputs() const { return input_nodes_.size(); } +int GraphExecutor::NumInputs() const { return input_nodes_.size(); } /*! * \brief Return NDArray for given input index. * \param index The input index. * * \return NDArray corresponding to given input node index. */ -NDArray GraphRuntime::GetInput(int index) const { +NDArray GraphExecutor::GetInput(int index) const { ICHECK_LT(static_cast(index), input_nodes_.size()); uint32_t eid = this->entry_id(input_nodes_[index], 0); return data_entry_[eid]; @@ -164,7 +165,7 @@ NDArray GraphRuntime::GetInput(int index) const { * * \return NDArray corresponding to given output node index. 
*/ -NDArray GraphRuntime::GetOutput(int index) const { +NDArray GraphExecutor::GetOutput(int index) const { ICHECK_LT(static_cast(index), outputs_.size()); uint32_t eid = this->entry_id(outputs_[index]); return data_entry_[eid]; @@ -174,7 +175,7 @@ NDArray GraphRuntime::GetOutput(int index) const { * \param index The output index. * \param data_out the output data. */ -void GraphRuntime::CopyOutputTo(int index, DLTensor* data_out) { +void GraphExecutor::CopyOutputTo(int index, DLTensor* data_out) { ICHECK_LT(static_cast(index), outputs_.size()); uint32_t eid = this->entry_id(outputs_[index]); @@ -192,20 +193,22 @@ void GraphRuntime::CopyOutputTo(int index, DLTensor* data_out) { * \brief Load parameters from parameter blob. * \param param_blob A binary blob of parameter. */ -void GraphRuntime::LoadParams(const std::string& param_blob) { +void GraphExecutor::LoadParams(const std::string& param_blob) { dmlc::MemoryStringStream strm(const_cast(¶m_blob)); this->LoadParams(&strm); } -void GraphRuntime::LoadParams(dmlc::Stream* strm) { +void GraphExecutor::LoadParams(dmlc::Stream* strm) { Map params = ::tvm::runtime::LoadParams(strm); for (auto& p : params) { - uint32_t eid = this->entry_id(input_nodes_[GetInputIndex(p.first)], 0); + int in_idx = GetInputIndex(p.first); + if (in_idx < 0) continue; + uint32_t eid = this->entry_id(input_nodes_[in_idx], 0); data_entry_[eid].CopyFrom(p.second); } } -void GraphRuntime::ShareParams(const GraphRuntime& other, dmlc::Stream* strm) { +void GraphExecutor::ShareParams(const GraphExecutor& other, dmlc::Stream* strm) { uint64_t header, reserved; ICHECK(strm->Read(&header)) << "Invalid parameters file format"; ICHECK(header == kTVMNDArrayListMagic) << "Invalid parameters file format"; @@ -230,17 +233,17 @@ void GraphRuntime::ShareParams(const GraphRuntime& other, dmlc::Stream* strm) { this->SetupOpExecs(); } -void GraphRuntime::LinkedNDArrayDeleter(Object* container) { +void GraphExecutor::LinkedNDArrayDeleter(Object* container) { // container is the NDArray::Container which needs to get deleted. // The data member points to global const memory, so it does not need deleting. delete static_cast(container); } -void GraphRuntime::DefaultLookupLinkedParam(TVMArgs args, TVMRetValue* rv) { +void GraphExecutor::DefaultLookupLinkedParam(TVMArgs args, TVMRetValue* rv) { Module mod = args[0]; int64_t storage_id = args[1]; DLTensor* template_tensor = args[2]; - TVMContext ctx = args[3]; + Device dev = args[3]; // Get pre-linked parameter lookup function, if it was generated. When pf == nullptr, no linked // params are present. if (!module_lookup_linked_param_valid_) { @@ -262,12 +265,12 @@ void GraphRuntime::DefaultLookupLinkedParam(TVMArgs args, TVMRetValue* rv) { template_tensor->shape + template_tensor->ndim}; std::unique_ptr container{new NDArray::Container( - static_cast(opaque_handle), shape_vec, template_tensor->dtype, ctx)}; - container->SetDeleter(GraphRuntime::LinkedNDArrayDeleter); + static_cast(opaque_handle), shape_vec, template_tensor->dtype, dev)}; + container->SetDeleter(GraphExecutor::LinkedNDArrayDeleter); *rv = NDArray(GetObjectPtr(container.release())); } -void GraphRuntime::SetupStorage() { +void GraphExecutor::SetupStorage() { // Grab saved optimization plan from graph. std::vector vtype; for (const std::string& s_type : attrs_.dltype) { @@ -280,7 +283,7 @@ void GraphRuntime::SetupStorage() { for (size_t i = 0; i < attrs_.shape.size(); ++i) { int storage_id = attrs_.storage_id[i]; // Use the fallback device if no device index is available. 
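One behavioral fix in the LoadParams hunk above deserves a callout: parameters with no matching graph input are now skipped rather than used to index input_nodes_ with -1. Reduced to plain containers, the pattern is as follows (all names here are illustrative stand-ins, not the executor's real types):

#include <iostream>
#include <map>
#include <string>
#include <vector>

// Stand-ins for input_map_ / data_entry_.
std::map<std::string, int> input_map = {{"data", 0}, {"weight", 1}};
std::vector<std::vector<float>> data_entry(2);

// Mirrors GetInputIndex: a negative result means "not an input of this graph".
int GetInputIndex(const std::string& name) {
  auto it = input_map.find(name);
  return it != input_map.end() ? it->second : -1;
}

void LoadParams(const std::map<std::string, std::vector<float>>& params) {
  for (const auto& p : params) {
    int in_idx = GetInputIndex(p.first);
    if (in_idx < 0) continue;  // tolerate params absent from this graph
    data_entry[in_idx] = p.second;
  }
}

int main() {
  LoadParams({{"weight", {1.f, 2.f}}, {"unused_param", {3.f}}});  // extras no longer crash
  std::cout << "weight entries: " << data_entry[1].size() << "\n";
  return 0;
}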
- int device_type = static_cast(ctxs_[0].device_type); + int device_type = static_cast(devices_[0].device_type); if (!attrs_.device_index.empty()) { device_type = attrs_.device_index[i]; } @@ -304,10 +307,10 @@ void GraphRuntime::SetupStorage() { TVMRetValue lookup_rv; { std::vector shape_vec{attrs_.shape[i].begin(), attrs_.shape[i].end()}; - DLTensor template_tensor{nullptr, TVMContext{kDLCPU, 0}, static_cast(shape_vec.size()), - vtype[i], shape_vec.data(), nullptr, + DLTensor template_tensor{nullptr, Device{kDLCPU, 0}, static_cast(shape_vec.size()), + vtype[i], shape_vec.data(), nullptr, 0}; - lookup_rv = lookup_linked_param_(module_, sid, &template_tensor, ctxs_[0]); + lookup_rv = lookup_linked_param_(module_, sid, &template_tensor, devices_[0]); } if (lookup_rv.type_code() != kTVMNullptr) { pool_entry[sid].linked_param = lookup_rv; @@ -321,16 +324,16 @@ void GraphRuntime::SetupStorage() { for (const auto& pit : pool_entry) { // This for loop is very fast since there are usually only a couple of // devices available on the same hardware. - const auto& cit = std::find_if(ctxs_.begin(), ctxs_.end(), [&pit](const TVMContext& c) { - return pit.device_type == static_cast(c.device_type); + const auto& cit = std::find_if(devices_.begin(), devices_.end(), [&pit](const Device& d) { + return pit.device_type == static_cast(d.device_type); }); - TVMContext ctx = cit == ctxs_.end() ? ctxs_[0] : *cit; + Device dev = cit == devices_.end() ? devices_[0] : *cit; if (pit.linked_param.defined()) { storage_pool_.push_back(pit.linked_param); } else { std::vector shape; shape.push_back(static_cast(pit.size + 3) / 4); - storage_pool_.push_back(NDArray::Empty(shape, DLDataType{kDLFloat, 32, 1}, ctx)); + storage_pool_.push_back(NDArray::Empty(shape, DLDataType{kDLFloat, 32, 1}, dev)); } } @@ -349,7 +352,7 @@ void GraphRuntime::SetupStorage() { } } -void GraphRuntime::SetupOpExecs() { +void GraphExecutor::SetupOpExecs() { op_execs_.resize(this->GetNumOfNodes()); input_dltensors_.resize(num_node_entries()); std::unordered_set input_node_eids; @@ -386,9 +389,10 @@ void GraphRuntime::SetupOpExecs() { } } -std::pair, std::shared_ptr > GraphRuntime::CreateTVMOp( - const TVMOpParam& param, const std::vector& args, size_t num_inputs) { - std::shared_ptr arg_ptr = std::make_shared(); +std::pair, std::shared_ptr > +GraphExecutor::CreateTVMOp(const TVMOpParam& param, const std::vector& args, + size_t num_inputs) { + std::shared_ptr arg_ptr = std::make_shared(); // setup address. arg_ptr->args = args; if (param.flatten_data) { @@ -435,8 +439,8 @@ std::pair, std::shared_ptr > GraphRu return {fexec, arg_ptr}; } -PackedFunc GraphRuntime::GetFunction(const std::string& name, - const ObjectPtr& sptr_to_self) { +PackedFunc GraphExecutor::GetFunction(const std::string& name, + const ObjectPtr& sptr_to_self) { // Return member functions during query. 
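For readers new to this file, CreateTVMOp (just above) returns the executable closure together with a shared_ptr to its argument block, so the packed argument storage lives exactly as long as the closure that uses it. The essence of that ownership trick in standard C++, with illustrative names:

#include <functional>
#include <iostream>
#include <memory>
#include <utility>
#include <vector>

struct OpArgs {
  std::vector<double> values;  // stand-in for the packed TVMValue / type-code arrays
};

// Like CreateTVMOp: capturing arg_ptr by value keeps the argument block alive
// for as long as any copy of the closure exists, with no extra bookkeeping.
std::pair<std::function<void()>, std::shared_ptr<OpArgs>> MakeOp(std::vector<double> vals) {
  auto arg_ptr = std::make_shared<OpArgs>();
  arg_ptr->values = std::move(vals);
  auto fexec = [arg_ptr]() {
    double sum = 0;
    for (double v : arg_ptr->values) sum += v;
    std::cout << "op executed, sum = " << sum << "\n";
  };
  return {fexec, arg_ptr};
}

int main() {
  auto op = MakeOp({1.0, 2.0, 3.0});
  op.first();  // run the "operator"
  return 0;
}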
if (name == "set_input") { return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { @@ -491,34 +495,34 @@ PackedFunc GraphRuntime::GetFunction(const std::string& name, } else if (name == "share_params") { return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { const auto& module = args[0].operator Module(); - ICHECK_EQ(module.operator->()->type_key(), "GraphRuntime"); + ICHECK_EQ(module.operator->()->type_key(), std::string("GraphExecutor")); const auto& param_blob = args[1].operator std::string(); dmlc::MemoryStringStream strm(const_cast(¶m_blob)); - this->ShareParams(dynamic_cast(*module.operator->()), &strm); + this->ShareParams(dynamic_cast(*module.operator->()), &strm); }); } else { return PackedFunc(); } } -Module GraphRuntimeCreate(const std::string& sym_json, const tvm::runtime::Module& m, - const std::vector& ctxs, - const PackedFunc lookup_linked_param_func) { - auto exec = make_object(); - exec->Init(sym_json, m, ctxs, lookup_linked_param_func); +Module GraphExecutorCreate(const std::string& sym_json, const tvm::runtime::Module& m, + const std::vector& devs, + const PackedFunc lookup_linked_param_func) { + auto exec = make_object(); + exec->Init(sym_json, m, devs, lookup_linked_param_func); return Module(exec); } -// Get all context for the host and other runtime devices. -std::vector GetAllContext(const TVMArgs& args, int ctx_start_arg) { +// Get all devices for the host and other runtime devices. +std::vector GetAllDevice(const TVMArgs& args, int dev_start_arg) { // Reserve the first item as the fallback device. - std::vector ret; - TVMContext ctx; - for (int i = ctx_start_arg; i < args.num_args; i += 2) { + std::vector ret; + Device dev; + for (int i = dev_start_arg; i < args.num_args; i += 2) { int dev_type = args[i]; - ctx.device_type = static_cast(dev_type); - ctx.device_id = args[i + 1]; - ret.push_back(ctx); + dev.device_type = static_cast(dev_type); + dev.device_id = args[i + 1]; + ret.push_back(dev); } return ret; } @@ -527,19 +531,19 @@ std::vector GetAllContext(const TVMArgs& args, int ctx_start_arg) { // from tvm4j and javascript, since they don't have heterogeneous // execution support yet. For heterogenenous execution, at least 5 arguments will // be passed in. The third one is the number of devices. -// Eventually, we will only probably pass TVMContext for all the languages. -TVM_REGISTER_GLOBAL("tvm.graph_runtime.create").set_body([](TVMArgs args, TVMRetValue* rv) { - ICHECK_GE(args.num_args, 4) << "The expected number of arguments for graph_runtime.create is " +// Eventually, we will only probably pass Device for all the languages. 
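The GetAllDevice helper above pins down the calling convention: devices cross the PackedFunc boundary as a flat list of (device_type, device_id) integer pairs, and the first pair doubles as the fallback device. A condensed sketch, with plain ints standing in for TVMArgs:

#include <iostream>
#include <vector>

struct Dev {
  int device_type;
  int device_id;
};

// Mirrors GetAllDevice: pairs start at dev_start_arg; entry 0 is the fallback.
std::vector<Dev> UnpackDevices(const std::vector<int>& flat_args, int dev_start_arg) {
  std::vector<Dev> ret;
  for (int i = dev_start_arg; i + 1 < static_cast<int>(flat_args.size()); i += 2) {
    ret.push_back(Dev{flat_args[i], flat_args[i + 1]});
  }
  return ret;
}

int main() {
  // Two leading slots stand in for (graph_json, module), then (2, 0) and (1, 0),
  // i.e. a GPU and a CPU under DLPack's device-type codes.
  std::vector<int> flat = {0, 0, 2, 0, 1, 0};
  for (const auto& d : UnpackDevices(flat, 2)) {
    std::cout << "device_type=" << d.device_type << " device_id=" << d.device_id << "\n";
  }
  return 0;
}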
+TVM_REGISTER_GLOBAL("tvm.graph_executor.create").set_body([](TVMArgs args, TVMRetValue* rv) { + ICHECK_GE(args.num_args, 4) << "The expected number of arguments for graph_executor.create is " "at least 4, but it has " << args.num_args; PackedFunc lookup_linked_param_func; - int ctx_start_arg = 2; + int dev_start_arg = 2; if (args[2].type_code() == kTVMPackedFuncHandle) { lookup_linked_param_func = args[2]; - ctx_start_arg++; + dev_start_arg++; } - const auto& contexts = GetAllContext(args, ctx_start_arg); - *rv = GraphRuntimeCreate(args[0], args[1], contexts, lookup_linked_param_func); + const auto& devices = GetAllDevice(args, dev_start_arg); + *rv = GraphExecutorCreate(args[0], args[1], devices, lookup_linked_param_func); }); } // namespace runtime } // namespace tvm diff --git a/src/runtime/graph/graph_runtime.h b/src/runtime/graph_executor/graph_executor.h similarity index 92% rename from src/runtime/graph/graph_runtime.h rename to src/runtime/graph_executor/graph_executor.h index a1e2ee3b5d74..37a47f6971e6 100644 --- a/src/runtime/graph/graph_runtime.h +++ b/src/runtime/graph_executor/graph_executor.h @@ -18,12 +18,12 @@ */ /*! - * \brief Tiny graph runtime that can run graph + * \brief Tiny graph executor that can run graph * containing only tvm PackedFunc. - * \file graph_runtime.h + * \file graph_executor.h */ -#ifndef TVM_RUNTIME_GRAPH_GRAPH_RUNTIME_H_ -#define TVM_RUNTIME_GRAPH_GRAPH_RUNTIME_H_ +#ifndef TVM_RUNTIME_GRAPH_EXECUTOR_GRAPH_EXECUTOR_H_ +#define TVM_RUNTIME_GRAPH_EXECUTOR_GRAPH_EXECUTOR_H_ #include #include @@ -56,12 +56,12 @@ struct TVMOpParam { }; /*! - * \brief Tiny graph runtime. + * \brief Tiny graph executor. * * This runtime can be acccesibly in various language via * TVM runtime PackedFunc API. */ -class TVM_DLL GraphRuntime : public ModuleNode { +class TVM_DLL GraphExecutor : public ModuleNode { struct OpArgs { std::vector args; std::vector arg_values; @@ -81,23 +81,23 @@ class TVM_DLL GraphRuntime : public ModuleNode { /*! * \return The type key of the executor. */ - const char* type_key() const final { return "GraphRuntime"; } + const char* type_key() const final { return "GraphExecutor"; } void Run(); /*! - * \brief Initialize the graph executor with graph and context. + * \brief Initialize the graph executor with graph and device. * \param graph_json The execution graph. * \param module The module containing the compiled functions for the host * processor. - * \param ctxs The context of the host and devices where graph nodes will be + * \param devs The device of the host and devices where graph nodes will be * executed on. * \param lookup_linked_param_func If given, a PackedFunc invoked to lookup linked parameters * by storage_id. If not given, linked parameters are looked-up using an internal implementation, - * which is not compatible with RPCModules. + * which is not compatible with RPCModules. Default is nullptr. */ void Init(const std::string& graph_json, tvm::runtime::Module module, - const std::vector& ctxs, const PackedFunc lookup_linked_param_func); + const std::vector& devs, const PackedFunc lookup_linked_param_func = nullptr); /*! * \brief Get the input index given the name of input. @@ -162,12 +162,12 @@ class TVM_DLL GraphRuntime : public ModuleNode { void LoadParams(const std::string& param_blob); /*! - * \brief Share parameters from pre-existing GraphRuntime instance. - * \param other A GraphRuntime instance, previously with |LoadParams| called with the + * \brief Share parameters from pre-existing GraphExecutor instance. 
+ * \param other A GraphExecutor instance, previously with |LoadParams| called with the * identical input |param_blob|. * \param strm The input stream. */ - void ShareParams(const GraphRuntime& other, dmlc::Stream* strm); + void ShareParams(const GraphExecutor& other, dmlc::Stream* strm); /*! * \brief Get total number of nodes. @@ -406,7 +406,7 @@ class TVM_DLL GraphRuntime : public ModuleNode { /*! \brief The code module that contains both host and device code. */ tvm::runtime::Module module_; /*! \brief Execution context of all devices including the host. */ - std::vector ctxs_; + std::vector devices_; /*! \brief Common storage pool for all devices. */ std::vector storage_pool_; /*! \brief Data entry of each node. */ @@ -426,8 +426,8 @@ class TVM_DLL GraphRuntime : public ModuleNode { bool module_lookup_linked_param_valid_; }; -std::vector GetAllContext(const TVMArgs& args, int ctx_start_arg); +std::vector GetAllDevice(const TVMArgs& args, int dev_start_arg); } // namespace runtime } // namespace tvm -#endif // TVM_RUNTIME_GRAPH_GRAPH_RUNTIME_H_ +#endif // TVM_RUNTIME_GRAPH_EXECUTOR_GRAPH_EXECUTOR_H_ diff --git a/src/runtime/graph_executor/graph_executor_factory.cc b/src/runtime/graph_executor/graph_executor_factory.cc new file mode 100644 index 000000000000..a6cef931421b --- /dev/null +++ b/src/runtime/graph_executor/graph_executor_factory.cc @@ -0,0 +1,210 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * \file graph_executor_factory.cc
+ * \brief Graph executor factory implementations
+ */
+
+#include "./graph_executor_factory.h"
+
+#include
+#include
+#include
+
+#include
+#include
+
+namespace tvm {
+namespace runtime {
+
+GraphExecutorFactory::GraphExecutorFactory(
+    const std::string& graph_json,
+    const std::unordered_map& params,
+    const std::string& module_name) {
+  graph_json_ = graph_json;
+  params_ = params;
+  module_name_ = module_name;
+}
+
+PackedFunc GraphExecutorFactory::GetFunction(
+    const std::string& name, const tvm::runtime::ObjectPtr& sptr_to_self) {
+  if (name == module_name_) {
+    return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
+      std::vector devices;
+      for (int i = 0; i < args.num_args; ++i) {
+        devices.emplace_back(args[i].operator Device());
+      }
+      *rv = this->ExecutorCreate(devices);
+    });
+  } else if (name == "debug_create") {
+    return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
+      ICHECK_GE(args.size(), 2);
+      std::string module_name = args[0].operator String();
+      ICHECK(module_name == module_name_) << "Currently we only support a single model.";
+      std::vector devices;
+      for (int i = 1; i < args.num_args; ++i) {
+        devices.emplace_back(args[i].operator Device());
+      }
+      *rv = this->DebugExecutorCreate(devices);
+    });
+  } else if (name == "remove_params") {
+    return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
+      std::unordered_map empty_params{};
+      auto exec =
+          make_object(this->graph_json_, empty_params, this->module_name_);
+      exec->Import(this->imports_[0]);
+      *rv = Module(exec);
+    });
+  } else if (name == "cuda_graph_create") {
+    return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
+      std::vector devices;
+      for (int i = 0; i < args.num_args; ++i) {
+        devices.emplace_back(args[i].operator Device());
+      }
+      *rv = this->CudaGraphExecutorCreate(devices);
+    });
+  } else {
+    return PackedFunc();
+  }
+}
+
+void GraphExecutorFactory::SaveToBinary(dmlc::Stream* stream) {
+  stream->Write(graph_json_);
+  std::vector names;
+  std::vector arrays;
+  for (const auto& v : params_) {
+    names.emplace_back(v.first);
+    arrays.emplace_back(const_cast(v.second.operator->()));
+  }
+  uint64_t sz = arrays.size();
+  ICHECK(sz == names.size());
+  stream->Write(sz);
+  stream->Write(names);
+  for (size_t i = 0; i < sz; ++i) {
+    tvm::runtime::SaveDLTensor(stream, arrays[i]);
+  }
+  stream->Write(module_name_);
+}
+
+Module GraphExecutorFactory::ExecutorCreate(const std::vector& devs) {
+  auto exec = make_object();
+  exec->Init(this->graph_json_, this->imports_[0], devs, PackedFunc());
+  // set params
+  SetParams(exec.get(), this->params_);
+  return Module(exec);
+}
+
+Module GraphExecutorFactory::DebugExecutorCreate(const std::vector& devs) {
+  const PackedFunc* pf = tvm::runtime::Registry::Get("tvm.graph_executor_debug.create");
+  ICHECK(pf != nullptr) << "Cannot find function tvm.graph_executor_debug.create in registry. "
+                           "Did you enable the debug graph executor build?";
+  // The debug executor create packed function will call GetAllDevice, so we unpack the devs.
+ std::vector unpacked_devs; + for (const auto& dev : devs) { + unpacked_devs.emplace_back(dev.device_type); + unpacked_devs.emplace_back(dev.device_id); + } + size_t args_size = unpacked_devs.size() + 2; + std::vector values(args_size); + std::vector codes(args_size); + runtime::TVMArgsSetter setter(values.data(), codes.data()); + setter(0, this->graph_json_); + setter(1, this->imports_[0]); + for (size_t i = 0; i < unpacked_devs.size(); ++i) { + setter(i + 2, unpacked_devs[i]); + } + TVMRetValue rv; + pf->CallPacked(TVMArgs(values.data(), codes.data(), args_size), &rv); + Module mod = rv.operator Module(); + // debug graph executor is one child class of graph executor. + SetParams(const_cast(mod.as()), this->params_); + return mod; +} + +Module GraphExecutorFactory::CudaGraphExecutorCreate(const std::vector& devs) { + const PackedFunc* pf = tvm::runtime::Registry::Get("tvm.graph_executor_cuda_graph.create"); + ICHECK(pf != nullptr) << "Cannot find function tvm.graph_executor_cuda_graph.create in registry. " + "Did you set(USE_GRAPH_EXECUTOR_CUGRAPH=ON)?"; + std::vector unpacked_devs; + for (const auto& dev : devs) { + unpacked_devs.emplace_back(dev.device_type); + unpacked_devs.emplace_back(dev.device_id); + } + size_t args_size = unpacked_devs.size() + 2; + std::vector values(args_size); + std::vector codes(args_size); + runtime::TVMArgsSetter setter(values.data(), codes.data()); + setter(0, this->graph_json_); + setter(1, this->imports_[0]); + for (size_t i = 0; i < unpacked_devs.size(); ++i) { + setter(i + 2, unpacked_devs[i]); + } + TVMRetValue rv; + pf->CallPacked(TVMArgs(values.data(), codes.data(), args_size), &rv); + Module mod = rv.operator Module(); + SetParams(const_cast(mod.as()), this->params_); + return mod; +} + +Module GraphExecutorFactoryModuleLoadBinary(void* strm) { + dmlc::Stream* stream = static_cast(strm); + std::string graph_json; + std::unordered_map params; + std::string module_name; + ICHECK(stream->Read(&graph_json)); + uint64_t sz; + ICHECK(stream->Read(&sz)); + std::vector names; + ICHECK(stream->Read(&names)); + ICHECK(sz == names.size()); + for (size_t i = 0; i < sz; ++i) { + tvm::runtime::NDArray temp; + temp.Load(stream); + params[names[i]] = temp; + } + ICHECK(stream->Read(&module_name)); + auto exec = make_object(graph_json, params, module_name); + return Module(exec); +} + +TVM_REGISTER_GLOBAL("tvm.graph_executor_factory.create") + .set_body([](TVMArgs args, TVMRetValue* rv) { + ICHECK_GE(args.num_args, 3) << "The expected number of arguments for " + "graph_executor_factory.create needs at least 3, " + "but it has " + << args.num_args; + // The argument order is graph_json, module, module_name, param0_name, param0_tensor, + // [param1_name, param1_tensor], ... 
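DebugExecutorCreate and CudaGraphExecutorCreate above both drive a registered packed function through manually packed arguments. For reviewers, this is the general shape of that call, mirroring the code above rather than prescribing an API (CallDebugCreate is a hypothetical helper name; assumes the TVM runtime headers and a debug-executor build are available):

#include <tvm/runtime/packed_func.h>
#include <tvm/runtime/registry.h>

#include <string>
#include <vector>

tvm::runtime::Module CallDebugCreate(const std::string& graph_json, tvm::runtime::Module lib,
                                     const std::vector<int>& flat_devs) {
  using namespace tvm::runtime;
  const PackedFunc* pf = Registry::Get("tvm.graph_executor_debug.create");
  ICHECK(pf != nullptr) << "debug executor not compiled in";
  size_t args_size = flat_devs.size() + 2;
  std::vector<TVMValue> values(args_size);
  std::vector<int> codes(args_size);
  TVMArgsSetter setter(values.data(), codes.data());
  setter(0, graph_json);  // graph JSON
  setter(1, lib);         // compiled library module
  for (size_t i = 0; i < flat_devs.size(); ++i) {
    setter(i + 2, flat_devs[i]);  // flattened (device_type, device_id) pairs
  }
  TVMRetValue rv;
  pf->CallPacked(TVMArgs(values.data(), codes.data(), static_cast<int>(args_size)), &rv);
  return rv.operator Module();
}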
+ ICHECK_EQ((args.size() - 3) % 2, 0); + std::unordered_map params; + for (size_t i = 3; i < static_cast(args.size()); i += 2) { + std::string name = args[i].operator String(); + params[name] = args[i + 1].operator tvm::runtime::NDArray(); + } + auto exec = make_object(args[0], params, args[2]); + exec->Import(args[1]); + *rv = Module(exec); + }); + +TVM_REGISTER_GLOBAL("runtime.module.loadbinary_GraphExecutorFactory") + .set_body_typed(GraphExecutorFactoryModuleLoadBinary); + +} // namespace runtime +} // namespace tvm diff --git a/src/runtime/graph/graph_runtime_factory.h b/src/runtime/graph_executor/graph_executor_factory.h similarity index 63% rename from src/runtime/graph/graph_runtime_factory.h rename to src/runtime/graph_executor/graph_executor_factory.h index 98fb27c43ea2..46346cbea002 100644 --- a/src/runtime/graph/graph_runtime_factory.h +++ b/src/runtime/graph_executor/graph_executor_factory.h @@ -18,12 +18,12 @@ */ /*! - * \file tvm/runtime/graph_runtime_factory.h - * \brief Graph runtime factory creating graph runtime. + * \file tvm/runtime/graph_executor/graph_executor_factory.h + * \brief Graph executor factory creating graph executor. */ -#ifndef TVM_RUNTIME_GRAPH_GRAPH_RUNTIME_FACTORY_H_ -#define TVM_RUNTIME_GRAPH_GRAPH_RUNTIME_FACTORY_H_ +#ifndef TVM_RUNTIME_GRAPH_EXECUTOR_GRAPH_EXECUTOR_FACTORY_H_ +#define TVM_RUNTIME_GRAPH_EXECUTOR_GRAPH_EXECUTOR_FACTORY_H_ #include #include @@ -37,22 +37,22 @@ #include #include -#include "./graph_runtime.h" +#include "./graph_executor.h" namespace tvm { namespace runtime { -class TVM_DLL GraphRuntimeFactory : public runtime::ModuleNode { +class TVM_DLL GraphExecutorFactory : public runtime::ModuleNode { public: /*! - * \brief Construct the GraphRuntimeFactory. + * \brief Construct the GraphExecutorFactory. * \param graph_json The execution graph. * \param params The params of graph. * \param module_name The module name of graph. */ - GraphRuntimeFactory(const std::string& graph_json, - const std::unordered_map& params, - const std::string& module_name = "default"); + GraphExecutorFactory(const std::string& graph_json, + const std::unordered_map& params, + const std::string& module_name = "default"); /*! * \brief Get member function to front-end @@ -65,7 +65,7 @@ class TVM_DLL GraphRuntimeFactory : public runtime::ModuleNode { /*! * \return The type key of the executor. */ - const char* type_key() const override { return "GraphRuntimeFactory"; } + const char* type_key() const override { return "GraphExecutorFactory"; } /*! * \brief Save the module to binary stream. @@ -74,27 +74,35 @@ class TVM_DLL GraphRuntimeFactory : public runtime::ModuleNode { void SaveToBinary(dmlc::Stream* stream) override; /*! - * \brief Create a specific runtime module - * \param ctxs The context of the host and devices where graph nodes will be + * \brief Create a specific executor module + * \param devs The device of the host and devices where graph nodes will be * executed on. - * \return created runtime module + * \return created executor module */ - Module RuntimeCreate(const std::vector& ctxs); + Module ExecutorCreate(const std::vector& devs); /*! - * \brief Create a specific debug runtime module - * \param ctxs The context of the host and devices where graph nodes will be + * \brief Create a specific debug executor module + * \param devs The device of the host and devices where graph nodes will be * executed on. 
- * \return created debug runtime module + * \return created debug executor module */ - Module DebugRuntimeCreate(const std::vector& ctxs); + Module DebugExecutorCreate(const std::vector& devs); + + /*! + * \brief Create a specific cuda graph executor module + * \param devs The device of the host and devices where graph nodes will be + * executed on. + * \return created cuda graph executor module + */ + Module CudaGraphExecutorCreate(const std::vector& devs); /*! * \brief Set params. - * \param graph_runtime The graph runtime we want to set the params into. + * \param graph_executor The graph executor we want to set the params into. * \param params The graph params value we want to set. */ - void SetParams(GraphRuntime* graph_runtime, + void SetParams(GraphExecutor* graph_executor, const std::unordered_map& params) const { std::unordered_map value = params; // upload big arrays first to avoid memory issue in rpc mode @@ -109,9 +117,9 @@ class TVM_DLL GraphRuntimeFactory : public runtime::ModuleNode { return lhs_size > rhs_size; }); for (const auto& key : keys) { - int in_idx = graph_runtime->GetInputIndex(key); + int in_idx = graph_executor->GetInputIndex(key); if (in_idx >= 0) { - graph_runtime->SetInput(in_idx, const_cast(value[key].operator->())); + graph_executor->SetInput(in_idx, const_cast(value[key].operator->())); } } } @@ -128,4 +136,4 @@ class TVM_DLL GraphRuntimeFactory : public runtime::ModuleNode { } // namespace runtime } // namespace tvm -#endif // TVM_RUNTIME_GRAPH_GRAPH_RUNTIME_FACTORY_H_ +#endif // TVM_RUNTIME_GRAPH_EXECUTOR_GRAPH_EXECUTOR_FACTORY_H_ diff --git a/src/runtime/hexagon/hexagon_device_api.cc b/src/runtime/hexagon/hexagon_device_api.cc index 70cebf5afa44..a07a7c683026 100644 --- a/src/runtime/hexagon/hexagon_device_api.cc +++ b/src/runtime/hexagon/hexagon_device_api.cc @@ -18,8 +18,8 @@ */ #include +#include #include -#include #include #include @@ -31,13 +31,13 @@ namespace runtime { class HexagonDeviceAPI : public DeviceAPI { public: - void SetDevice(TVMContext ctx) final; - void GetAttr(TVMContext ctx, DeviceAttrKind kind, TVMRetValue* rv) final; - void* AllocDataSpace(TVMContext ctx, size_t nbytes, size_t alignment, DLDataType type_hint) final; - void FreeDataSpace(TVMContext ctx, void* ptr) final; - void StreamSync(TVMContext ctx, TVMStreamHandle stream) final; - void* AllocWorkspace(TVMContext ctx, size_t nbytes, DLDataType type_hint = {}) final; - void FreeWorkspace(TVMContext ctx, void* ptr) final; + void SetDevice(Device dev) final; + void GetAttr(Device dev, DeviceAttrKind kind, TVMRetValue* rv) final; + void* AllocDataSpace(Device dev, size_t nbytes, size_t alignment, DLDataType type_hint) final; + void FreeDataSpace(Device dev, void* ptr) final; + void StreamSync(Device dev, TVMStreamHandle stream) final; + void* AllocWorkspace(Device dev, size_t nbytes, DLDataType type_hint = {}) final; + void FreeWorkspace(Device dev, void* ptr) final; static HexagonDeviceAPI* Global() { // NOTE: explicitly use new to avoid destruction of global state @@ -48,33 +48,33 @@ class HexagonDeviceAPI : public DeviceAPI { protected: void CopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset, - size_t num_bytes, TVMContext ctx_from, TVMContext ctx_to, - DLDataType type_hint, TVMStreamHandle stream) final; + size_t num_bytes, Device dev_from, Device dev_to, DLDataType type_hint, + TVMStreamHandle stream) final; }; // HexagonDeviceAPI. 
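Back in the factory header above, SetParams sorts the parameter names so the biggest arrays upload first; per its comment, this avoids memory issues in RPC mode and surfaces transfer failures early. A toy version of that ordering, with plain structs instead of NDArray (illustrative only):

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

struct ParamRec {
  std::string name;
  std::vector<int64_t> shape;
  int dtype_bytes;
};

int64_t NumBytes(const ParamRec& p) {
  int64_t n = p.dtype_bytes;
  for (int64_t d : p.shape) n *= d;
  return n;
}

int main() {
  std::vector<ParamRec> params = {
      {"bias", {128}, 4}, {"weight", {128, 256, 3, 3}, 4}, {"gamma", {128}, 4}};
  // Largest first, matching the upload order SetParams uses over RPC.
  std::sort(params.begin(), params.end(),
            [](const ParamRec& a, const ParamRec& b) { return NumBytes(a) > NumBytes(b); });
  for (const auto& p : params) std::cout << p.name << ": " << NumBytes(p) << " bytes\n";
  return 0;
}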
-inline void HexagonDeviceAPI::SetDevice(TVMContext ctx) {} +inline void HexagonDeviceAPI::SetDevice(Device dev) {} -inline void HexagonDeviceAPI::GetAttr(TVMContext ctx, DeviceAttrKind kind, TVMRetValue* rv) { +inline void HexagonDeviceAPI::GetAttr(Device dev, DeviceAttrKind kind, TVMRetValue* rv) { if (kind == kExist) *rv = 1; } -inline void* HexagonDeviceAPI::AllocDataSpace(TVMContext ctx, size_t nbytes, size_t alignment, +inline void* HexagonDeviceAPI::AllocDataSpace(Device dev, size_t nbytes, size_t alignment, DLDataType type_hint) { - ICHECK(hexagon::Device::ValidateDeviceId(ctx.device_id)); + ICHECK(hexagon::Device::ValidateDeviceId(dev.device_id)); return hexagon::Device::Global()->Alloc(nbytes, alignment); } -inline void HexagonDeviceAPI::FreeDataSpace(TVMContext ctx, void* ptr) { - ICHECK(hexagon::Device::ValidateDeviceId(ctx.device_id)); +inline void HexagonDeviceAPI::FreeDataSpace(Device dev, void* ptr) { + ICHECK(hexagon::Device::ValidateDeviceId(dev.device_id)); hexagon::Device::Global()->Free(ptr); } inline void HexagonDeviceAPI::CopyDataFromTo(const void* from, size_t from_offset, void* to, - size_t to_offset, size_t num_bytes, - TVMContext ctx_from, TVMContext ctx_to, - DLDataType type_hint, TVMStreamHandle stream) { + size_t to_offset, size_t num_bytes, Device dev_from, + Device dev_to, DLDataType type_hint, + TVMStreamHandle stream) { const char* src = static_cast(from) + from_offset; char* dst = static_cast(to) + to_offset; @@ -83,45 +83,45 @@ inline void HexagonDeviceAPI::CopyDataFromTo(const void* from, size_t from_offse }; (void)Is32bit; - if (ctx_from.device_type == ctx_to.device_type) { - if (ctx_from.device_type == kDLCPU) { + if (dev_from.device_type == dev_to.device_type) { + if (dev_from.device_type == kDLCPU) { memmove(dst, src, num_bytes); - } else if (static_cast(ctx_from.device_type) == kDLHexagon) { - ICHECK(hexagon::Device::ValidateDeviceId(ctx_from.device_id)); - ICHECK_EQ(ctx_from.device_id, ctx_to.device_id); + } else if (static_cast(dev_from.device_type) == kDLHexagon) { + ICHECK(hexagon::Device::ValidateDeviceId(dev_from.device_id)); + ICHECK_EQ(dev_from.device_id, dev_to.device_id); ICHECK(Is32bit(dst) && Is32bit(src)); hexagon::Device::Global()->CopyDeviceToDevice(dst, src, num_bytes); } } else { - if (ctx_from.device_type == kDLCPU) { - ICHECK_EQ(static_cast(ctx_to.device_type), kDLHexagon); + if (dev_from.device_type == kDLCPU) { + ICHECK_EQ(static_cast(dev_to.device_type), kDLHexagon); ICHECK(Is32bit(dst)); - ICHECK(hexagon::Device::ValidateDeviceId(ctx_to.device_id)); + ICHECK(hexagon::Device::ValidateDeviceId(dev_to.device_id)); hexagon::Device::Global()->CopyHostToDevice(dst, src, num_bytes); } else { - ICHECK_EQ(static_cast(ctx_from.device_type), kDLHexagon); - ICHECK_EQ(ctx_to.device_type, kDLCPU); + ICHECK_EQ(static_cast(dev_from.device_type), kDLHexagon); + ICHECK_EQ(dev_to.device_type, kDLCPU); ICHECK(Is32bit(src)); - ICHECK(hexagon::Device::ValidateDeviceId(ctx_from.device_id)); + ICHECK(hexagon::Device::ValidateDeviceId(dev_from.device_id)); hexagon::Device::Global()->CopyDeviceToHost(dst, src, num_bytes); } } } -inline void HexagonDeviceAPI::StreamSync(TVMContext ctx, TVMStreamHandle stream) {} +inline void HexagonDeviceAPI::StreamSync(Device dev, TVMStreamHandle stream) {} -inline void* HexagonDeviceAPI::AllocWorkspace(TVMContext ctx, size_t nbytes, DLDataType type_hint) { - ICHECK(hexagon::Device::ValidateDeviceId(ctx.device_id)); +inline void* HexagonDeviceAPI::AllocWorkspace(Device dev, size_t nbytes, DLDataType type_hint) { + 
ICHECK(hexagon::Device::ValidateDeviceId(dev.device_id)); if (type_hint.code == 100) { size_t align = std::min(nbytes, 2048lu); return hexagon::Device::Global()->AllocVtcm(nbytes, align); } - return DeviceAPI::AllocWorkspace(ctx, nbytes, type_hint); + return DeviceAPI::AllocWorkspace(dev, nbytes, type_hint); } -inline void HexagonDeviceAPI::FreeWorkspace(TVMContext ctx, void* ptr) { - ICHECK(hexagon::Device::ValidateDeviceId(ctx.device_id)); - DeviceAPI::FreeWorkspace(ctx, ptr); +inline void HexagonDeviceAPI::FreeWorkspace(Device dev, void* ptr) { + ICHECK(hexagon::Device::ValidateDeviceId(dev.device_id)); + DeviceAPI::FreeWorkspace(dev, ptr); } TVM_REGISTER_GLOBAL("device_api.hexagon").set_body([](TVMArgs args, TVMRetValue* rv) { diff --git a/src/runtime/hexagon/hexagon_module.cc b/src/runtime/hexagon/hexagon_module.cc index 994e24b99084..73c830fe5fc9 100644 --- a/src/runtime/hexagon/hexagon_module.cc +++ b/src/runtime/hexagon/hexagon_module.cc @@ -22,8 +22,8 @@ #ifdef __ANDROID__ #include #endif +#include #include -#include #include #include @@ -379,7 +379,7 @@ void HexagonModuleNode::RemapArgs(const TVMArgs& args, std::vector& va case kTVMNDArrayHandle: case kTVMDLTensorHandle: { DLTensor* t = static_cast(a); - assert(TVMDeviceExtType(t->ctx.device_type) == kDLHexagon); + assert(TVMDeviceExtType(t->device.device_type) == kDLHexagon); TVMValue v; v.v_handle = CreateRemoteTensor(t); remote_tensors.push_back(v.v_handle); @@ -401,25 +401,25 @@ void* HexagonModuleNode::CreateRemoteTensor(const DLTensor* t) const { Layout of the DLTensor structure on Hexagon. DLTensor: Size offset - data void* 4 0 - ctx.device_type enum 1 4 - 3 5 - ctx.device_id int 4 8 - ndim int 4 12 - dtype.code uint8_t 1 16 - dtype.bits uint8_t 1 17 - dtype.lanes uint16_t 2 18 - shape int64_t* 4 20 - strides int64_t* 4 24 - 4 28 - byte_offset uint64_t 8 32 + data void* 4 0 + device.device_type enum 1 4 + 3 5 + device.device_id int 4 8 + ndim int 4 12 + dtype.code uint8_t 1 16 + dtype.bits uint8_t 1 17 + dtype.lanes uint16_t 2 18 + shape int64_t* 4 20 + strides int64_t* 4 24 + 4 28 + byte_offset uint64_t 8 32 .. end ................................ 40 */ struct __attribute__((packed)) HexagonDLTensor { uint32_t data; - uint8_t ctx_device_type; + uint8_t device_type; uint8_t pad0[3]; // MUST BE ZERO! 
- int32_t ctx_device_id; + int32_t device_id; int32_t ndim; uint8_t dtype_code; uint8_t dtype_bits; @@ -444,9 +444,9 @@ void* HexagonModuleNode::CreateRemoteTensor(const DLTensor* t) const { HexagonDLTensor local; local.data = static_cast(reinterpret_cast(t->data)); - local.ctx_device_type = uint8_t(t->ctx.device_type); + local.device_type = uint8_t(t->device.device_type); local.pad0[0] = local.pad0[1] = local.pad0[2] = 0; - local.ctx_device_id = t->ctx.device_id; + local.device_id = t->device.device_id; local.ndim = t->ndim; local.dtype_code = t->dtype.code; local.dtype_bits = t->dtype.bits; diff --git a/src/runtime/hexagon/hexagon_module.h b/src/runtime/hexagon/hexagon_module.h index e558997b7a4c..1288b933410c 100644 --- a/src/runtime/hexagon/hexagon_module.h +++ b/src/runtime/hexagon/hexagon_module.h @@ -20,8 +20,8 @@ #ifndef TVM_RUNTIME_HEXAGON_HEXAGON_MODULE_H_ #define TVM_RUNTIME_HEXAGON_HEXAGON_MODULE_H_ +#include #include -#include #include #include @@ -143,7 +143,7 @@ class Device { virtual ~Device() = 0; static std::shared_ptr Global(); - static bool ValidateDeviceId(decltype(DLContext::device_id) device_id) { + static bool ValidateDeviceId(decltype(DLDevice::device_id) device_id) { // Only supporting a single device for now. return device_id == 0; } diff --git a/src/runtime/hexagon/sim/hexagon_device_sim.cc b/src/runtime/hexagon/sim/hexagon_device_sim.cc index 6cc7dcf3209f..1d3f0fd1006f 100644 --- a/src/runtime/hexagon/sim/hexagon_device_sim.cc +++ b/src/runtime/hexagon/sim/hexagon_device_sim.cc @@ -22,7 +22,7 @@ #include #include #include -#include +#include #include #include diff --git a/src/runtime/hexagon/target/hexagon_dsprpcapi.cc b/src/runtime/hexagon/target/hexagon_dsprpcapi.cc index d494db82e2c7..a089684c4188 100644 --- a/src/runtime/hexagon/target/hexagon_dsprpcapi.cc +++ b/src/runtime/hexagon/target/hexagon_dsprpcapi.cc @@ -22,7 +22,7 @@ #include #include -#include +#include #include "hexagon_target_log.h" diff --git a/src/runtime/hexagon/target/hexagon_dsprpcapi.h b/src/runtime/hexagon/target/hexagon_dsprpcapi.h index c0e40805ecbf..e4711e3da584 100644 --- a/src/runtime/hexagon/target/hexagon_dsprpcapi.h +++ b/src/runtime/hexagon/target/hexagon_dsprpcapi.h @@ -22,7 +22,7 @@ #ifdef __ANDROID__ #include -#include +#include #include "remote.h" #include "remote64.h" diff --git a/src/runtime/hexagon/target/hexagon_stubapi.cc b/src/runtime/hexagon/target/hexagon_stubapi.cc index 5428ae7c1cff..1fb7d942e968 100644 --- a/src/runtime/hexagon/target/hexagon_stubapi.cc +++ b/src/runtime/hexagon/target/hexagon_stubapi.cc @@ -23,7 +23,7 @@ #include #include #include -#include +#include #include "hexagon_target_log.h" diff --git a/src/runtime/hexagon/target/hexagon_stubapi.h b/src/runtime/hexagon/target/hexagon_stubapi.h index cc5b7b7413ca..fba22b10247c 100644 --- a/src/runtime/hexagon/target/hexagon_stubapi.h +++ b/src/runtime/hexagon/target/hexagon_stubapi.h @@ -24,7 +24,7 @@ #include #include #include -#include +#include #include diff --git a/src/runtime/logging.cc b/src/runtime/logging.cc new file mode 100644 index 000000000000..8f0537ad7adc --- /dev/null +++ b/src/runtime/logging.cc @@ -0,0 +1,176 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +#include + +#include + +#if TVM_LOG_STACK_TRACE +#if TVM_USE_LIBBACKTRACE + +#include +#include + +#include +#include +#include +#include + +namespace tvm { +namespace runtime { +namespace { + +struct BacktraceInfo { + std::vector lines; + size_t max_size; + std::string error_message; +}; + +void BacktraceCreateErrorCallback(void* data, const char* msg, int errnum) { + std::cerr << "Could not initialize backtrace state: " << msg << std::endl; +} + +backtrace_state* BacktraceCreate() { + return backtrace_create_state(nullptr, 1, BacktraceCreateErrorCallback, nullptr); +} + +static backtrace_state* _bt_state = BacktraceCreate(); + +std::string DemangleName(std::string name) { + int status = 0; + size_t length = name.size(); + std::unique_ptr demangled_name = { + abi::__cxa_demangle(name.c_str(), nullptr, &length, &status), &std::free}; + if (demangled_name && status == 0 && length > 0) { + return demangled_name.get(); + } else { + return name; + } +} + +void BacktraceErrorCallback(void* data, const char* msg, int errnum) { + // do nothing +} + +void BacktraceSyminfoCallback(void* data, uintptr_t pc, const char* symname, uintptr_t symval, + uintptr_t symsize) { + auto str = reinterpret_cast(data); + + if (symname != nullptr) { + std::string tmp(symname, symsize); + *str = DemangleName(tmp.c_str()); + } else { + std::ostringstream s; + s << "0x" << std::setfill('0') << std::setw(sizeof(uintptr_t) * 2) << std::hex << pc; + *str = s.str(); + } +} + +int BacktraceFullCallback(void* data, uintptr_t pc, const char* filename, int lineno, + const char* symbol) { + auto stack_trace = reinterpret_cast(data); + std::stringstream s; + + std::unique_ptr symbol_str = std::make_unique(""); + if (symbol != nullptr) { + *symbol_str = DemangleName(symbol); + } else { + // see if syminfo gives anything + backtrace_syminfo(_bt_state, pc, BacktraceSyminfoCallback, BacktraceErrorCallback, + symbol_str.get()); + } + s << *symbol_str; + + if (filename != nullptr) { + s << std::endl << " at " << filename; + if (lineno != 0) { + s << ":" << lineno; + } + } + // Skip tvm::backtrace and tvm::LogFatal::~LogFatal at the beginning of the trace as they don't + // add anything useful to the backtrace. + if (!(stack_trace->lines.size() == 0 && + (symbol_str->find("tvm::runtime::Backtrace", 0) == 0 || + symbol_str->find("tvm::runtime::detail::LogFatal", 0) == 0))) { + stack_trace->lines.push_back(s.str()); + } + // TVMFuncCall denotes the API boundary so we stop there. Exceptions should be caught there. 
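Stripped of the demangling and frame-skipping details, the libbacktrace path above reduces to roughly the following (a sketch, assuming backtrace.h and -lbacktrace are available; it omits the mutex the real implementation adds because libbacktrace misbehaves under concurrent use):

#include <backtrace.h>

#include <iostream>
#include <sstream>
#include <string>
#include <vector>

namespace {

int OnFrame(void* data, uintptr_t pc, const char* filename, int lineno, const char* function) {
  auto* lines = static_cast<std::vector<std::string>*>(data);
  std::ostringstream s;
  s << (function ? function : "<unknown>");
  if (filename) s << " at " << filename << ":" << lineno;
  lines->push_back(s.str());
  return lines->size() >= 100 ? 1 : 0;  // non-zero stops the walk, like the max_size cap
}

void OnError(void* /*data*/, const char* /*msg*/, int /*errnum*/) { /* swallow, as above */ }

}  // namespace

std::string Backtrace() {
  static backtrace_state* state = backtrace_create_state(nullptr, 1, OnError, nullptr);
  if (state == nullptr) return "";
  std::vector<std::string> lines;
  backtrace_full(state, /*skip=*/0, OnFrame, OnError, &lines);
  std::ostringstream s;
  s << "Stack trace:\n";
  for (size_t i = 0; i < lines.size(); ++i) s << "  " << i << ": " << lines[i] << "\n";
  return s.str();
}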
+ if (*symbol_str == "TVMFuncCall" || stack_trace->lines.size() >= stack_trace->max_size) { + return 1; + } + return 0; +} +} // namespace + +std::string Backtrace() { + BacktraceInfo bt; + bt.max_size = 100; + if (_bt_state == nullptr) { + return ""; + } + // libbacktrace eats memory if run on multiple threads at the same time, so we guard against it + static std::mutex m; + std::lock_guard lock(m); + backtrace_full(_bt_state, 0, BacktraceFullCallback, BacktraceErrorCallback, &bt); + + std::ostringstream s; + s << "Stack trace:\n"; + for (size_t i = 0; i < bt.lines.size(); i++) { + s << " " << i << ": " << bt.lines[i] << "\n"; + } + + return s.str(); +} +} // namespace runtime +} // namespace tvm + +#else + +#include + +namespace tvm { +namespace runtime { +// Fallback to the dmlc implementation when backtrace is not available. +std::string Backtrace() { return dmlc::StackTrace(); } +} // namespace runtime +} // namespace tvm + +#endif // TVM_USE_LIBBACKTRACE +#else + +namespace tvm { +namespace runtime { +// stacktrace logging is completely disabled +std::string Backtrace() { return ""; } +} // namespace runtime +} // namespace tvm +#endif // TVM_LOG_STACK_TRACE + +#if (TVM_LOG_CUSTOMIZE == 0) +namespace tvm { +namespace runtime { +namespace detail { + +LogFatal::Entry& LogFatal::GetEntry() { + static thread_local LogFatal::Entry result; + return result; +} +} // namespace detail +} // namespace runtime +} // namespace tvm +#endif // TVM_LOG_CUSTOMIZE diff --git a/src/runtime/metadata_module.cc b/src/runtime/metadata_module.cc index 665c72cc5e0d..4a1d89ce1a1f 100644 --- a/src/runtime/metadata_module.cc +++ b/src/runtime/metadata_module.cc @@ -21,7 +21,7 @@ * \file src/runtime/metadata_module.cc * \brief A wrapper for initializing imported modules using metadata. This * module is intended to be used by various runtime in the TVM stack, i.e. - * graph runtime, relay VM, AOT runtime, and various user defined runtimes. It + * graph executor, relay VM, AOT runtime, and various user defined runtimes. It * paves the way to separate the code and metedata, which makes compilation * and/or interpretation more convenient. In addition, the clear separation of * code and metadata significantly reduces the efforts for handling external diff --git a/src/runtime/metal/metal_common.h b/src/runtime/metal/metal_common.h index bd07dbfde9d0..55f9022a6b96 100644 --- a/src/runtime/metal/metal_common.h +++ b/src/runtime/metal/metal_common.h @@ -32,8 +32,8 @@ #import #include #include +#include #include -#include #include #include @@ -62,57 +62,57 @@ class MetalWorkspace final : public DeviceAPI { std::mutex mutex; // Destructor ~MetalWorkspace(); - // Get command queue for given context. - id GetCommandQueue(TVMContext ctx) { - ICHECK_EQ(ctx.device_type, kDLMetal); - ICHECK(ctx.device_id >= 0 && static_cast(ctx.device_id) < queues.size()) - << "Invalid Metal device_id=" << ctx.device_id; - return queues[ctx.device_id]; + // Get command queue for given device. 
+ id GetCommandQueue(Device dev) { + ICHECK_EQ(dev.device_type, kDLMetal); + ICHECK(dev.device_id >= 0 && static_cast(dev.device_id) < queues.size()) + << "Invalid Metal device_id=" << dev.device_id; + return queues[dev.device_id]; } - // Get device for given context - id GetDevice(TVMContext ctx) { - ICHECK_EQ(ctx.device_type, kDLMetal); - ICHECK(ctx.device_id >= 0 && static_cast(ctx.device_id) < devices.size()) - << "Invalid Metal device_id=" << ctx.device_id; - return devices[ctx.device_id]; + // Get device for given device + id GetDevice(Device dev) { + ICHECK_EQ(dev.device_type, kDLMetal); + ICHECK(dev.device_id >= 0 && static_cast(dev.device_id) < devices.size()) + << "Invalid Metal device_id=" << dev.device_id; + return devices[dev.device_id]; } // Initialize workspace // Return false if already initialized, otherwise return true. void Init(); // override device API - void SetDevice(TVMContext ctx) final; - void GetAttr(TVMContext ctx, DeviceAttrKind kind, TVMRetValue* rv) final; - void* AllocDataSpace(TVMContext ctx, size_t nbytes, size_t alignment, DLDataType type_hint) final; - void FreeDataSpace(TVMContext ctx, void* ptr) final; - void StreamSync(TVMContext ctx, TVMStreamHandle stream) final; - void* AllocWorkspace(TVMContext ctx, size_t size, DLDataType type_hint) final; - void FreeWorkspace(TVMContext ctx, void* data) final; + void SetDevice(Device dev) final; + void GetAttr(Device dev, DeviceAttrKind kind, TVMRetValue* rv) final; + void* AllocDataSpace(Device dev, size_t nbytes, size_t alignment, DLDataType type_hint) final; + void FreeDataSpace(Device dev, void* ptr) final; + void StreamSync(Device dev, TVMStreamHandle stream) final; + void* AllocWorkspace(Device dev, size_t size, DLDataType type_hint) final; + void FreeWorkspace(Device dev, void* data) final; // get the global workspace static MetalWorkspace* Global(); protected: void CopyDataFromTo(const void* from, size_t from_size, void* to, size_t to_size, size_t size, - TVMContext ctx_from, TVMContext ctx_to, DLDataType type_hint, + Device dev_from, Device dev_to, DLDataType type_hint, TVMStreamHandle stream) final; }; /*! \brief Thread local workspace */ class MetalThreadEntry { public: - /*! \brief The current context */ - TVMContext context; + /*! \brief The current device */ + Device device; /*! \brief The shared buffer used for copy. */ std::vector > temp_buffer_; /*! \brief workspace pool */ WorkspacePool pool; // constructor MetalThreadEntry() : pool(static_cast(kDLMetal), MetalWorkspace::Global()) { - context.device_id = 0; - context.device_type = static_cast(kDLMetal); + device.device_id = 0; + device.device_type = static_cast(kDLMetal); } ~MetalThreadEntry(); - // Get temp buffer with at least size under ctx. - id GetTempBuffer(TVMContext ctx, size_t size); + // Get temp buffer with at least size under dev. + id GetTempBuffer(Device dev, size_t size); // get the global workspace static MetalThreadEntry* ThreadLocal(); }; diff --git a/src/runtime/metal/metal_device_api.mm b/src/runtime/metal/metal_device_api.mm index 0169a4c2ec28..cf8520864e99 100644 --- a/src/runtime/metal/metal_device_api.mm +++ b/src/runtime/metal/metal_device_api.mm @@ -30,50 +30,54 @@ namespace metal { MetalWorkspace* MetalWorkspace::Global() { - // NOTE: explicitly use new to avoid exit-time destruction of global state - // Global state will be recycled by OS as the process exits. 
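The NOTE carried along in this hunk describes the deliberately leaked singleton idiom: construct with new on first use, never destroy, and let the OS reclaim the memory at process exit so static-destruction-order problems cannot bite. In isolation (illustrative):

#include <iostream>

class Workspace {
 public:
  static Workspace* Global() {
    // Intentionally leaked: code running during other objects' static
    // destruction can still use it; the OS recycles it at process exit.
    static Workspace* inst = new Workspace();
    return inst;
  }
  void Hello() { std::cout << "workspace alive\n"; }

 private:
  Workspace() = default;
};

int main() {
  Workspace::Global()->Hello();
  return 0;
}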
-  static MetalWorkspace* inst = new MetalWorkspace();
-  return inst;
+  @autoreleasepool {
+    // NOTE: explicitly use new to avoid exit-time destruction of global state
+    // Global state will be recycled by OS as the process exits.
+    static MetalWorkspace* inst = new MetalWorkspace();
+    return inst;
+  }
 }

-void MetalWorkspace::GetAttr(TVMContext ctx, DeviceAttrKind kind, TVMRetValue* rv) {
-  this->Init();
-  size_t index = static_cast<size_t>(ctx.device_id);
-  if (kind == kExist) {
-    *rv = int(index < devices.size());
-    return;
-  }
-  ICHECK_LT(index, devices.size()) << "Invalid device id " << index;
-  switch (kind) {
-    case kMaxThreadsPerBlock: {
-      *rv = static_cast<int>([devices[ctx.device_id] maxThreadsPerThreadgroup].width);
-      break;
+void MetalWorkspace::GetAttr(Device dev, DeviceAttrKind kind, TVMRetValue* rv) {
+  @autoreleasepool {
+    this->Init();
+    size_t index = static_cast<size_t>(dev.device_id);
+    if (kind == kExist) {
+      *rv = int(index < devices.size());
+      return;
     }
-    case kWarpSize: {
-      // Set warp size to be 1 for safty reason.
-      *rv = 1;
-      break;
+    ICHECK_LT(index, devices.size()) << "Invalid device id " << index;
+    switch (kind) {
+      case kMaxThreadsPerBlock: {
+        *rv = static_cast<int>([devices[dev.device_id] maxThreadsPerThreadgroup].width);
+        break;
+      }
+      case kWarpSize: {
+        // Set warp size to be 1 for safety reasons.
+        *rv = 1;
+        break;
+      }
+      case kMaxSharedMemoryPerBlock:
+        return;
+      case kComputeVersion:
+        return;
+      case kDeviceName:
+        return;
+      case kMaxClockRate:
+        return;
+      case kMultiProcessorCount:
+        return;
+      case kMaxThreadDimensions:
+        return;
+      case kExist:
+        return;
+      case kMaxRegistersPerBlock:
+        return;
+      case kGcnArch:
+        return;
+      case kApiVersion:
+        return;
     }
-    case kMaxSharedMemoryPerBlock:
-      return;
-    case kComputeVersion:
-      return;
-    case kDeviceName:
-      return;
-    case kMaxClockRate:
-      return;
-    case kMultiProcessorCount:
-      return;
-    case kMaxThreadDimensions:
-      return;
-    case kExist:
-      return;
-    case kMaxRegistersPerBlock:
-      return;
-    case kGcnArch:
-      return;
-    case kApiVersion:
-      return;
   }
 }

@@ -106,7 +110,11 @@ int GetWarpSize(id<MTLDevice> dev) {
   ICHECK(f != nil);
   id<MTLComputePipelineState> state = [dev newComputePipelineStateWithFunction:f error:&error_msg];
   ICHECK(state != nil) << [[error_msg localizedDescription] UTF8String];
-  return static_cast<int>(state.threadExecutionWidth);
+  int size = static_cast<int>(state.threadExecutionWidth);
+  [state release];
+  [f release];
+  [lib release];
+  return size;
 }

 MetalWorkspace::~MetalWorkspace() {
@@ -127,130 +135,137 @@ int GetWarpSize(id<MTLDevice> dev) {
 #if TARGET_OS_IPHONE
   // on iPhone
   id<MTLDevice> d = MTLCreateSystemDefaultDevice();
-  devices.push_back([d retain]);
-  queues.push_back([[d newCommandQueue] retain]);
+  devices.push_back(d);
+  queues.push_back([d newCommandQueue]);
 #else
   NSArray<id<MTLDevice>>* devs = MTLCopyAllDevices();
   for (size_t i = 0; i < devs.count; ++i) {
     id<MTLDevice> d = [devs objectAtIndex:i];
-    devices.push_back([d retain]);
-    queues.push_back([[d newCommandQueue] retain]);
+    devices.push_back(d);
+    queues.push_back([d newCommandQueue]);
     LOG(INFO) << "Initializing Metal device " << i << ", name=" << [d.name UTF8String];
     warp_size.push_back(GetWarpSize(d));
   }
 #endif
 }

-void MetalWorkspace::SetDevice(TVMContext ctx) {
-  MetalThreadEntry::ThreadLocal()->context.device_id = ctx.device_id;
+void MetalWorkspace::SetDevice(Device dev) {
+  MetalThreadEntry::ThreadLocal()->device.device_id = dev.device_id;
 }

-void* MetalWorkspace::AllocDataSpace(TVMContext ctx, size_t nbytes, size_t alignment,
+void* MetalWorkspace::AllocDataSpace(Device device, size_t nbytes, size_t alignment,
                                      DLDataType type_hint) {
-
this->Init(); - id dev = GetDevice(ctx); - // GPU memory only - MTLResourceOptions storage_mode = MTLResourceStorageModePrivate; - /* - #if TARGET_OS_IPHONE - storage_mode = MTLResourceStorageModeShared; - #else - storage_mode = MTLResourceStorageModeManaged; - #endif - */ - id buf = [dev newBufferWithLength:nbytes options:storage_mode]; - ICHECK(buf != nil); - return (void*)(CFBridgingRetain(buf)); + @autoreleasepool { + this->Init(); + id dev = GetDevice(device); + // GPU memory only + MTLResourceOptions storage_mode = MTLResourceStorageModePrivate; + /* + #if TARGET_OS_IPHONE + storage_mode = MTLResourceStorageModeShared; + #else + storage_mode = MTLResourceStorageModeManaged; + #endif + */ + id buf = [dev newBufferWithLength:nbytes options:storage_mode]; + ICHECK(buf != nil); + return (void*)(buf); + } } -void MetalWorkspace::FreeDataSpace(TVMContext ctx, void* ptr) { - // MTLBuffer PurgeableState should be set to empty before manual - // release in order to prevent memory leak - [(id)ptr setPurgeableState:MTLPurgeableStateEmpty]; - // release the ptr. - CFRelease(ptr); +void MetalWorkspace::FreeDataSpace(Device dev, void* ptr) { + @autoreleasepool { + // MTLBuffer PurgeableState should be set to empty before manual + // release in order to prevent memory leak + [(id)ptr setPurgeableState:MTLPurgeableStateEmpty]; + // release the ptr. + CFRelease(ptr); + } } void MetalWorkspace::CopyDataFromTo(const void* from, size_t from_offset, void* to, - size_t to_offset, size_t size, TVMContext ctx_from, - TVMContext ctx_to, DLDataType type_hint, - TVMStreamHandle stream) { - this->Init(); - ICHECK(stream == nullptr); - TVMContext ctx = ctx_from; - if (ctx_from.device_type == kDLCPU) ctx = ctx_to; - id queue = GetCommandQueue(ctx); - id cb = [queue commandBuffer]; - int from_dev_type = static_cast(ctx_from.device_type); - int to_dev_type = static_cast(ctx_to.device_type); + size_t to_offset, size_t size, Device dev_from, Device dev_to, + DLDataType type_hint, TVMStreamHandle stream) { + @autoreleasepool { + this->Init(); + ICHECK(stream == nullptr); + Device dev = dev_from; + if (dev_from.device_type == kDLCPU) dev = dev_to; + id queue = GetCommandQueue(dev); + id cb = [queue commandBuffer]; + int from_dev_type = static_cast(dev_from.device_type); + int to_dev_type = static_cast(dev_to.device_type); - if (from_dev_type == kDLMetal && to_dev_type == kDLMetal) { - ICHECK_EQ(ctx_from.device_id, ctx_to.device_id) << "Metal disallow cross device copy."; - id encoder = [cb blitCommandEncoder]; - [encoder copyFromBuffer:(__bridge id)(from) - sourceOffset:from_offset - toBuffer:(__bridge id)(to)destinationOffset:to_offset - size:size]; - [encoder endEncoding]; - [cb commit]; - } else if (from_dev_type == kDLMetal && to_dev_type == kDLCPU) { - // copy to a local buffer before get into global buffer. 
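In the copy path above, whenever exactly one endpoint of the transfer is the host, the Metal side's command queue drives the copy. That selection rule, reduced to a standalone sketch over dlpack's DLDevice type (assuming dlpack/dlpack.h is available):

#include <dlpack/dlpack.h>

// The endpoint that is not the CPU owns the transfer, since only the
// accelerator side has a command queue; device-to-device copies default to
// the source device.
DLDevice PickCopyDevice(DLDevice dev_from, DLDevice dev_to) {
  return dev_from.device_type == kDLCPU ? dev_to : dev_from;
}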
-    id<MTLBuffer> from_buf = (__bridge id<MTLBuffer>)(from);
-    if (from_buf.storageMode != MTLStorageModeShared) {
-      id<MTLBuffer> temp = MetalThreadEntry::ThreadLocal()->GetTempBuffer(ctx_from, size);
+    if (from_dev_type == kDLMetal && to_dev_type == kDLMetal) {
+      ICHECK_EQ(dev_from.device_id, dev_to.device_id) << "Metal disallows cross-device copy.";
       id<MTLBlitCommandEncoder> encoder = [cb blitCommandEncoder];
-      [encoder copyFromBuffer:from_buf
+      [encoder copyFromBuffer:(id<MTLBuffer>)(from)
                  sourceOffset:from_offset
-                     toBuffer:temp
-            destinationOffset:0
-                         size:size];
-      [encoder endEncoding];
-      [cb commit];
-      [cb waitUntilCompleted];
-      memcpy(static_cast<char*>(to) + to_offset, static_cast<char*>([temp contents]), size);
-    } else {
-      memcpy(static_cast<char*>(to) + to_offset,
-             static_cast<const char*>([from_buf contents]) + from_offset, size);
-    }
-  } else if (from_dev_type == kDLCPU && to_dev_type == kDLMetal) {
-    id<MTLBuffer> to_buf = (__bridge id<MTLBuffer>)(to);
-    if (to_buf.storageMode != MTLStorageModeShared) {
-      id<MTLBuffer> temp = MetalThreadEntry::ThreadLocal()->GetTempBuffer(ctx_to, size);
-      memcpy([temp contents], static_cast<const char*>(from) + from_offset, size);
-      id<MTLBlitCommandEncoder> encoder = [cb blitCommandEncoder];
-      [encoder copyFromBuffer:temp
-                 sourceOffset:0
-                     toBuffer:to_buf
-            destinationOffset:to_offset
+                     toBuffer:(id<MTLBuffer>)(to)destinationOffset:to_offset
                          size:size];
       [encoder endEncoding];
       [cb commit];
-      [cb waitUntilCompleted];
+    } else if (from_dev_type == kDLMetal && to_dev_type == kDLCPU) {
+      // copy to a local buffer before moving into the global buffer.
+      id<MTLBuffer> from_buf = (id<MTLBuffer>)(from);
+      if (from_buf.storageMode != MTLStorageModeShared) {
+        id<MTLBuffer> temp = MetalThreadEntry::ThreadLocal()->GetTempBuffer(dev_from, size);
+        id<MTLBlitCommandEncoder> encoder = [cb blitCommandEncoder];
+        [encoder copyFromBuffer:from_buf
+                   sourceOffset:from_offset
+                       toBuffer:temp
+              destinationOffset:0
+                           size:size];
+        [encoder endEncoding];
+        [cb commit];
+        [cb waitUntilCompleted];
+        memcpy(static_cast<char*>(to) + to_offset, static_cast<char*>([temp contents]), size);
+      } else {
+        memcpy(static_cast<char*>(to) + to_offset,
+               static_cast<const char*>([from_buf contents]) + from_offset, size);
+      }
+    } else if (from_dev_type == kDLCPU && to_dev_type == kDLMetal) {
+      id<MTLBuffer> to_buf = (id<MTLBuffer>)(to);
+      if (to_buf.storageMode != MTLStorageModeShared) {
+        id<MTLBuffer> temp = MetalThreadEntry::ThreadLocal()->GetTempBuffer(dev_to, size);
+        memcpy([temp contents], static_cast<const char*>(from) + from_offset, size);
+        id<MTLBlitCommandEncoder> encoder = [cb blitCommandEncoder];
+        [encoder copyFromBuffer:temp
+                   sourceOffset:0
+                       toBuffer:to_buf
+              destinationOffset:to_offset
+                           size:size];
+        [encoder endEncoding];
+        [cb commit];
+        [cb waitUntilCompleted];
+      } else {
+        memcpy(static_cast<char*>([to_buf contents]) + to_offset,
+               static_cast<const char*>(from) + from_offset, size);
+      }
     } else {
-      memcpy(static_cast<char*>([to_buf contents]) + to_offset,
-             static_cast<const char*>(from) + from_offset, size);
+      LOG(FATAL) << "Expect copy from/to Metal or between Metal"
+                 << ", from=" << from_dev_type << ", to=" << to_dev_type;
     }
-  } else {
-    LOG(FATAL) << "Expect copy from/to Metal or between Metal"
-               << ", from=" << from_dev_type << ", to=" << to_dev_type;
   }
 }

-void MetalWorkspace::StreamSync(TVMContext ctx, TVMStreamHandle stream) {
-  ICHECK(stream == nullptr);
-  // commit an empty command buffer and wait until it completes.
-  id<MTLCommandQueue> queue = GetCommandQueue(ctx);
-  id<MTLCommandBuffer> cb = [queue commandBuffer];
-  [cb commit];
-  [cb waitUntilCompleted];
+void MetalWorkspace::StreamSync(Device dev, TVMStreamHandle stream) {
+  @autoreleasepool {
+    ICHECK(stream == nullptr);
+    // commit an empty command buffer and wait until it completes.
+    id<MTLCommandQueue> queue = GetCommandQueue(dev);
+    id<MTLCommandBuffer> cb = [queue commandBuffer];
+    [cb commit];
+    [cb waitUntilCompleted];
+  }
 }

-void* MetalWorkspace::AllocWorkspace(TVMContext ctx, size_t size, DLDataType type_hint) {
-  return MetalThreadEntry::ThreadLocal()->pool.AllocWorkspace(ctx, size);
+void* MetalWorkspace::AllocWorkspace(Device dev, size_t size, DLDataType type_hint) {
+  return MetalThreadEntry::ThreadLocal()->pool.AllocWorkspace(dev, size);
 }

-void MetalWorkspace::FreeWorkspace(TVMContext ctx, void* data) {
-  MetalThreadEntry::ThreadLocal()->pool.FreeWorkspace(ctx, data);
+void MetalWorkspace::FreeWorkspace(Device dev, void* data) {
+  MetalThreadEntry::ThreadLocal()->pool.FreeWorkspace(dev, data);
 }

 MetalThreadEntry::~MetalThreadEntry() {
@@ -262,19 +277,19 @@ int GetWarpSize(id<MTLDevice> dev) {
   }
 }

-id<MTLBuffer> MetalThreadEntry::GetTempBuffer(TVMContext ctx, size_t size) {
-  if (temp_buffer_.size() <= static_cast<size_t>(ctx.device_id)) {
-    temp_buffer_.resize(ctx.device_id + 1, nil);
+id<MTLBuffer> MetalThreadEntry::GetTempBuffer(Device dev, size_t size) {
+  if (temp_buffer_.size() <= static_cast<size_t>(dev.device_id)) {
+    temp_buffer_.resize(dev.device_id + 1, nil);
   }
-  if (temp_buffer_[ctx.device_id] == nil || temp_buffer_[ctx.device_id].length < size) {
-    id<MTLDevice> dev = MetalWorkspace::Global()->GetDevice(ctx);
-    if (temp_buffer_[ctx.device_id] != nil) {
-      [temp_buffer_[ctx.device_id] release];
+  if (temp_buffer_[dev.device_id] == nil || temp_buffer_[dev.device_id].length < size) {
+    id<MTLDevice> mtl_dev = MetalWorkspace::Global()->GetDevice(dev);
+    if (temp_buffer_[dev.device_id] != nil) {
+      [temp_buffer_[dev.device_id] setPurgeableState:MTLPurgeableStateEmpty];
+      [temp_buffer_[dev.device_id] release];
     }
-    temp_buffer_[ctx.device_id] = [[dev newBufferWithLength:size
-                                                    options:MTLStorageModeShared] retain];
+    temp_buffer_[dev.device_id] = [mtl_dev newBufferWithLength:size options:MTLStorageModeShared];
   }
-  return temp_buffer_[ctx.device_id];
+  return temp_buffer_[dev.device_id];
 }

 typedef dmlc::ThreadLocalStore<MetalThreadEntry> MetalThreadStore;
diff --git a/src/runtime/metal/metal_module.mm b/src/runtime/metal/metal_module.mm
index 8f1fde86f074..a8b01815bf68 100644
--- a/src/runtime/metal/metal_module.mm
+++ b/src/runtime/metal/metal_module.mm
@@ -113,7 +113,6 @@ void SaveToBinary(dmlc::Stream* stream) final {
       LOG(FATAL) << "Fail to compile metal lib:" << [[err_msg localizedDescription] UTF8String];
     }
   }
-  [e.lib retain];
 }
 id<MTLFunction> f = [e.lib newFunctionWithName:[NSString stringWithUTF8String:func_name.c_str()]];
@@ -123,11 +122,13 @@ void SaveToBinary(dmlc::Stream* stream) final {
   ICHECK(state != nil) << "cannot get state:"
                        << " for function " << func_name
                        << [[error_msg localizedDescription] UTF8String];
+  [f release];
   // The state.threadExecutionWidth can change dynamically according
   // to the resource constraints in the kernel, so it does not strictly hold.
   // Turn off warp-aware optimization for now.
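The disabled assertion (kept below for reference) is effectively replaced in this patch by a launch-time bound: the flattened block size must not exceed the pipeline's maxTotalThreadsPerThreadgroup. That check, as a standalone sketch with hypothetical names:

#include <cstdint>

// True when a 3-D threadgroup shape fits the pipeline's cap on threads per
// threadgroup; the product is computed in 64 bits to avoid overflow.
bool LaunchFits(uint32_t bx, uint32_t by, uint32_t bz, uint32_t max_per_group) {
  uint64_t block_size = static_cast<uint64_t>(bx) * by * bz;
  return block_size <= max_per_group;
}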
// ICHECK_EQ(state.threadExecutionWidth, w->warp_size[device_id]); - e.smap[func_name] = [state retain]; + if (e.smap[func_name] != nil) [e.smap[func_name] release]; + e.smap[func_name] = state; return state; } @@ -176,36 +177,41 @@ void Init(MetalModuleNode* m, ObjectPtr sptr, const std::string& func_na std::fill(scache_.begin(), scache_.end(), (id)nil); thread_axis_cfg_.Init(num_buffer_args + num_pack_args, thread_axis_tags); metal::MetalThreadEntry* t = metal::MetalThreadEntry::ThreadLocal(); - int dev_id = t->context.device_id; + int dev_id = t->device.device_id; scache_[dev_id] = m->GetPipelineState(dev_id, func_name); } // invoke the function with void arguments void operator()(TVMArgs args, TVMRetValue* rv, const ArgUnion64* pack_args) const { - metal::MetalThreadEntry* t = metal::MetalThreadEntry::ThreadLocal(); - int device_id = t->context.device_id; - if (scache_[device_id] == nil) { - scache_[device_id] = m_->GetPipelineState(device_id, func_name_); - } - ThreadWorkLoad wl = thread_axis_cfg_.Extract(args); - id queue = w_->GetCommandQueue(t->context); - id cb = [queue commandBuffer]; - id encoder = [cb computeCommandEncoder]; - [encoder setComputePipelineState:scache_[device_id]]; - for (size_t i = 0; i < num_buffer_args_; ++i) { - void* buf = args[static_cast(i)]; - [encoder setBuffer:(__bridge id)(buf) offset:0 atIndex:i]; - } - if (num_pack_args_ != 0) { - [encoder setBytes:pack_args - length:num_pack_args_ * sizeof(ArgUnion64) - atIndex:num_buffer_args_]; + @autoreleasepool { + metal::MetalThreadEntry* t = metal::MetalThreadEntry::ThreadLocal(); + int device_id = t->device.device_id; + if (scache_[device_id] == nil) { + scache_[device_id] = m_->GetPipelineState(device_id, func_name_); + } + ThreadWorkLoad wl = thread_axis_cfg_.Extract(args); + int blockSize = wl.block_dim(0) * wl.block_dim(1) * wl.block_dim(2); + auto maxTotalThreadsPerThreadgroup = scache_[device_id].maxTotalThreadsPerThreadgroup; + CHECK_LE(blockSize, maxTotalThreadsPerThreadgroup); + id queue = w_->GetCommandQueue(t->device); + id cb = [queue commandBuffer]; + id encoder = [cb computeCommandEncoder]; + [encoder setComputePipelineState:scache_[device_id]]; + for (size_t i = 0; i < num_buffer_args_; ++i) { + void* buf = args[static_cast(i)]; + [encoder setBuffer:(id)(buf) offset:0 atIndex:i]; + } + if (num_pack_args_ != 0) { + [encoder setBytes:pack_args + length:num_pack_args_ * sizeof(ArgUnion64) + atIndex:num_buffer_args_]; + } + // launch + MTLSize dimGrid = MTLSizeMake(wl.grid_dim(0), wl.grid_dim(1), wl.grid_dim(2)); + MTLSize dimBlock = MTLSizeMake(wl.block_dim(0), wl.block_dim(1), wl.block_dim(2)); + [encoder dispatchThreadgroups:dimGrid threadsPerThreadgroup:dimBlock]; + [encoder endEncoding]; + [cb commit]; } - // launch - MTLSize dimGrid = MTLSizeMake(wl.grid_dim(0), wl.grid_dim(1), wl.grid_dim(2)); - MTLSize dimBlock = MTLSizeMake(wl.block_dim(0), wl.block_dim(1), wl.block_dim(2)); - [encoder dispatchThreadgroups:dimGrid threadsPerThreadgroup:dimBlock]; - [encoder endEncoding]; - [cb commit]; } private: @@ -230,23 +236,27 @@ void operator()(TVMArgs args, TVMRetValue* rv, const ArgUnion64* pack_args) cons PackedFunc MetalModuleNode::GetFunction(const std::string& name, const ObjectPtr& sptr_to_self) { - ICHECK_EQ(sptr_to_self.get(), this); - ICHECK_NE(name, symbol::tvm_module_main) << "Device function do not have main"; - auto it = fmap_.find(name); - if (it == fmap_.end()) return PackedFunc(); - const FunctionInfo& info = it->second; - MetalWrappedFunc f; - size_t num_buffer_args = 
NumBufferArgs(info.arg_types); - f.Init(this, sptr_to_self, name, num_buffer_args, info.arg_types.size() - num_buffer_args, - info.thread_axis_tags); - return PackFuncNonBufferArg(f, info.arg_types); + @autoreleasepool { + ICHECK_EQ(sptr_to_self.get(), this); + ICHECK_NE(name, symbol::tvm_module_main) << "Device function do not have main"; + auto it = fmap_.find(name); + if (it == fmap_.end()) return PackedFunc(); + const FunctionInfo& info = it->second; + MetalWrappedFunc f; + size_t num_buffer_args = NumBufferArgs(info.arg_types); + f.Init(this, sptr_to_self, name, num_buffer_args, info.arg_types.size() - num_buffer_args, + info.thread_axis_tags); + return PackFuncNonBufferArg(f, info.arg_types); + } } Module MetalModuleCreate(std::string data, std::string fmt, std::unordered_map fmap, std::string source) { - metal::MetalWorkspace::Global()->Init(); - auto n = make_object(data, fmt, fmap, source); - return Module(n); + @autoreleasepool { + metal::MetalWorkspace::Global()->Init(); + auto n = make_object(data, fmt, fmap, source); + return Module(n); + } } // Load module from module. diff --git a/src/runtime/micro/micro_session.cc b/src/runtime/micro/micro_session.cc index 6c0d0c4c40fe..cd916d46971d 100644 --- a/src/runtime/micro/micro_session.cc +++ b/src/runtime/micro/micro_session.cc @@ -25,8 +25,8 @@ #include #include +#include #include -#include #include #include diff --git a/src/runtime/micro/standalone/utvm_graph_runtime.cc b/src/runtime/micro/standalone/utvm_graph_executor.cc similarity index 94% rename from src/runtime/micro/standalone/utvm_graph_runtime.cc rename to src/runtime/micro/standalone/utvm_graph_executor.cc index e19ee347a45e..920faa134cf5 100644 --- a/src/runtime/micro/standalone/utvm_graph_runtime.cc +++ b/src/runtime/micro/standalone/utvm_graph_executor.cc @@ -17,7 +17,7 @@ * under the License. 
*/ -#include "utvm_graph_runtime.h" +#include "utvm_graph_executor.h" #include @@ -122,7 +122,7 @@ void ParseArgNodes(const picojson::array& jinput_nodes, DynArray* inpu NDArray::~NDArray() {} -NDArray NDArray::Empty(const DynArray& shape, DLDataType dtype, DLContext ctx) { +NDArray NDArray::Empty(const DynArray& shape, DLDataType dtype, DLDevice dev) { NDArray r; int64_t nbytes = (dtype.bits * dtype.lanes + 7) / 8; for (const auto& s : shape) { @@ -130,16 +130,16 @@ NDArray NDArray::Empty(const DynArray& shape, DLDataType dtype, DLConte } r.storage_ = std::shared_ptr( - TVMBackendAllocWorkspace(static_cast(ctx.device_type), static_cast(ctx.device_id), + TVMBackendAllocWorkspace(static_cast(dev.device_type), static_cast(dev.device_id), nbytes, dtype.code, dtype.bits), [=](void* ptr) { if (ptr) { - TVMBackendFreeWorkspace(ctx.device_type, ctx.device_id, ptr); + TVMBackendFreeWorkspace(dev.device_type, dev.device_id, ptr); } }); r.shape_ = shape; r.dtype_ = dtype; - r.ctx_ = ctx; + r.device_ = dev; return r; } @@ -148,7 +148,7 @@ NDArray NDArray::CreateView(const DynArray& shape, DLDataType dtype) { r.storage_ = storage_; r.shape_ = shape; r.dtype_ = dtype; - r.ctx_ = ctx_; + r.device_ = device_; return r; } @@ -156,7 +156,7 @@ DLTensor NDArray::ToDLTensor() { DLTensor r; r.data = storage_.get(); assert(r.data != nullptr); - r.ctx = ctx_; + r.device = device_; r.ndim = shape_.size(); r.dtype = dtype_; r.shape = shape_.data(); @@ -226,7 +226,7 @@ void* DSOModule::GetSymbol(const char* name) const { return f; } -MicroGraphRuntime::MicroGraphRuntime(const std::string& graph_json, DSOModule* module) { +MicroGraphExecutor::MicroGraphExecutor(const std::string& graph_json, DSOModule* module) { assert(module); module_ = module; picojson::value v; @@ -240,28 +240,28 @@ MicroGraphRuntime::MicroGraphRuntime(const std::string& graph_json, DSOModule* m SetupOpExecs(); } -MicroGraphRuntime::~MicroGraphRuntime() {} +MicroGraphExecutor::~MicroGraphExecutor() {} -void MicroGraphRuntime::Run() { +void MicroGraphExecutor::Run() { for (size_t i = 0; i < op_execs_.size(); ++i) { if (op_execs_[i]) op_execs_[i](); } } -void MicroGraphRuntime::SetInput(int index, DLTensor* data_in) { +void MicroGraphExecutor::SetInput(int index, DLTensor* data_in) { assert(static_cast(index) < input_nodes_.size()); uint32_t eid = this->entry_id(input_nodes_[index], 0); data_entry_[eid].CopyFrom(data_in); } -void MicroGraphRuntime::CopyOutputTo(int index, DLTensor* data_out) { +void MicroGraphExecutor::CopyOutputTo(int index, DLTensor* data_out) { assert(static_cast(index) < outputs_.size()); uint32_t eid = this->entry_id(outputs_[index]); const NDArray& data = data_entry_[eid]; data.CopyTo(data_out); } -void MicroGraphRuntime::SetupStorage() { +void MicroGraphExecutor::SetupStorage() { // Grab saved optimization plan from graph. DynArray vtype(attrs_.dltype.size()); for (size_t i = 0; i < attrs_.dltype.size(); ++i) { @@ -279,7 +279,7 @@ void MicroGraphRuntime::SetupStorage() { for (size_t i = 0; i < attrs_.shape.size(); ++i) { int storage_id = attrs_.storage_id[i]; // Use the fallback device if no device index is available. 
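NDArray::Empty above pairs TVMBackendAllocWorkspace with a shared_ptr custom deleter, so the workspace is freed exactly once, when the last view of the storage dies. The same idiom in miniature, with malloc/free standing in for the backend alloc/free pair:

#include <cstdlib>
#include <memory>

std::shared_ptr<void> MakeOwnedBuffer(std::size_t nbytes) {
  void* ptr = std::malloc(nbytes);
  // The lambda deleter runs once, when the last shared_ptr copy is
  // destroyed, mirroring how storage_ releases the device workspace.
  return std::shared_ptr<void>(ptr, [](void* p) {
    if (p) std::free(p);
  });
}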
- int device_type = static_cast(ctx_.device_type); + int device_type = static_cast(device_.device_type); size_t size = 1; for (int64_t sz : attrs_.shape[i]) { size *= static_cast(sz); @@ -306,7 +306,7 @@ void MicroGraphRuntime::SetupStorage() { const auto& pit = pool_entry[i]; DynArray shape(1); shape[0] = static_cast(pit.size + 3) / 4; - storage_pool_[i] = NDArray::Empty(shape, DLDataType{kDLFloat, 32, 1}, ctx_); + storage_pool_[i] = NDArray::Empty(shape, DLDataType{kDLFloat, 32, 1}, device_); } // Assign the pooled entries. A unified memory pool is used to simplify @@ -373,7 +373,7 @@ std::function CreateTVMOp(const DSOModule& module, const TVMOpParam& par return fexec; } -void MicroGraphRuntime::SetupOpExecs() { +void MicroGraphExecutor::SetupOpExecs() { op_execs_.resize(nodes_.size()); // setup the array and requirements. for (uint32_t nid = 0; nid < nodes_.size(); ++nid) { diff --git a/src/runtime/micro/standalone/utvm_graph_runtime.h b/src/runtime/micro/standalone/utvm_graph_executor.h similarity index 87% rename from src/runtime/micro/standalone/utvm_graph_runtime.h rename to src/runtime/micro/standalone/utvm_graph_executor.h index b479193861bb..afede6a7b30a 100644 --- a/src/runtime/micro/standalone/utvm_graph_runtime.h +++ b/src/runtime/micro/standalone/utvm_graph_executor.h @@ -17,8 +17,8 @@ * under the License. */ -#ifndef TVM_RUNTIME_MICRO_STANDALONE_UTVM_GRAPH_RUNTIME_H_ -#define TVM_RUNTIME_MICRO_STANDALONE_UTVM_GRAPH_RUNTIME_H_ +#ifndef TVM_RUNTIME_MICRO_STANDALONE_UTVM_GRAPH_EXECUTOR_H_ +#define TVM_RUNTIME_MICRO_STANDALONE_UTVM_GRAPH_EXECUTOR_H_ #include @@ -93,8 +93,8 @@ struct Node { // Minimal NDArray abstraction class NDArray { public: - // initialize NDArray with shape/dtype/ctx - static NDArray Empty(const DynArray& shape, DLDataType dtype, DLContext ctx); + // initialize NDArray with shape/dtype/device + static NDArray Empty(const DynArray& shape, DLDataType dtype, DLDevice dev); // create a view of the NDArray storage, with the given shape/dtype NDArray CreateView(const DynArray& shape, DLDataType dtype); // Copy into the internal storage. @@ -112,16 +112,16 @@ class NDArray { DynArray shape_; // tensor dtype DLDataType dtype_; - // tensor context - DLContext ctx_; + // tensor device + DLDevice device_; }; -// Minimal GraphRuntime implementation -class MicroGraphRuntime { +// Minimal GraphExecutor implementation +class MicroGraphExecutor { public: - // Construct a GraphRuntime with the given graph and DSOModule. - MicroGraphRuntime(const std::string& graph_json, DSOModule* module); - ~MicroGraphRuntime(); + // Construct a GraphExecutor with the given graph and DSOModule. 
+ MicroGraphExecutor(const std::string& graph_json, DSOModule* module); + ~MicroGraphExecutor(); // Run the graph void Run(); // Set the input at `index` to a copy of the tensor `data_in` @@ -150,8 +150,8 @@ class MicroGraphRuntime { DynArray outputs_; // Additional graph attributes GraphAttr attrs_; - // Execution context - DLContext ctx_{kDLCPU, 0}; + // Execution device + DLDevice device_{kDLCPU, 0}; // Common storage pool DynArray storage_pool_; @@ -164,4 +164,4 @@ class MicroGraphRuntime { } // namespace micro } // namespace tvm -#endif // TVM_RUNTIME_MICRO_STANDALONE_UTVM_GRAPH_RUNTIME_H_ +#endif // TVM_RUNTIME_MICRO_STANDALONE_UTVM_GRAPH_EXECUTOR_H_ diff --git a/src/runtime/micro/standalone/utvm_runtime.cc b/src/runtime/micro/standalone/utvm_runtime.cc index 73d616b6d482..585da9300128 100644 --- a/src/runtime/micro/standalone/utvm_runtime.cc +++ b/src/runtime/micro/standalone/utvm_runtime.cc @@ -20,28 +20,28 @@ #include -#include "utvm_graph_runtime.h" +#include "utvm_graph_executor.h" void* UTVMRuntimeCreate(const char* json, size_t json_len, void* module) { - return new tvm::micro::MicroGraphRuntime(std::string(json, json + json_len), - reinterpret_cast(module)); + return new tvm::micro::MicroGraphExecutor(std::string(json, json + json_len), + reinterpret_cast(module)); } void UTVMRuntimeDestroy(void* handle) { - delete reinterpret_cast(handle); + delete reinterpret_cast(handle); } void UTVMRuntimeSetInput(void* handle, int index, void* tensor) { - reinterpret_cast(handle)->SetInput( + reinterpret_cast(handle)->SetInput( index, reinterpret_cast(tensor)); } void UTVMRuntimeRun(void* handle) { - reinterpret_cast(handle)->Run(); + reinterpret_cast(handle)->Run(); } void UTVMRuntimeGetOutput(void* handle, int index, void* tensor) { - reinterpret_cast(handle)->CopyOutputTo( + reinterpret_cast(handle)->CopyOutputTo( index, reinterpret_cast(tensor)); } void* UTVMRuntimeDSOModuleCreate(const char* so, size_t so_len) { diff --git a/src/runtime/minrpc/minrpc_server.h b/src/runtime/minrpc/minrpc_server.h index d5c61eccfd6d..732e1e49d4a4 100644 --- a/src/runtime/minrpc/minrpc_server.h +++ b/src/runtime/minrpc/minrpc_server.h @@ -46,7 +46,7 @@ #endif #if TVM_MINRPC_ENABLE_LOGGING -#include +#include #endif namespace tvm { @@ -173,7 +173,7 @@ class MinRPCServer { uint64_t data_handle; this->Read(&data_handle); arr->data = reinterpret_cast(data_handle); - this->Read(&(arr->ctx)); + this->Read(&(arr->device)); this->Read(&(arr->ndim)); this->Read(&(arr->dtype)); arr->shape = this->ArenaAlloc(arr->ndim); @@ -186,13 +186,13 @@ class MinRPCServer { uint8_t* data_ptr; int call_ecode = 0; - if (arr->ctx.device_type == kDLCPU) { + if (arr->device.device_type == kDLCPU) { data_ptr = reinterpret_cast(data_handle) + arr->byte_offset; } else { data_ptr = this->ArenaAlloc(num_bytes); DLTensor temp; temp.data = reinterpret_cast(data_ptr); - temp.ctx = arr->ctx; + temp.device = arr->device; temp.ndim = arr->ndim; temp.dtype = arr->dtype; temp.shape = arr->shape; @@ -201,7 +201,7 @@ class MinRPCServer { call_ecode = TVMDeviceCopyDataFromTo(arr, &temp, nullptr); // need sync to make sure that the copy is completed. 
if (call_ecode == 0) { - call_ecode = TVMSynchronize(arr->ctx.device_type, arr->ctx.device_id, nullptr); + call_ecode = TVMSynchronize(arr->device.device_type, arr->device.device_id, nullptr); } } @@ -224,7 +224,7 @@ class MinRPCServer { uint64_t data_handle; this->Read(&data_handle); arr->data = reinterpret_cast(data_handle); - this->Read(&(arr->ctx)); + this->Read(&(arr->device)); this->Read(&(arr->ndim)); this->Read(&(arr->dtype)); arr->shape = this->ArenaAlloc(arr->ndim); @@ -235,7 +235,7 @@ class MinRPCServer { this->Read(&num_bytes); int call_ecode = 0; - if (arr->ctx.device_type == kDLCPU) { + if (arr->device.device_type == kDLCPU) { uint8_t* dptr = reinterpret_cast(data_handle) + arr->byte_offset; this->ReadArray(dptr, num_bytes); } else { @@ -243,7 +243,7 @@ class MinRPCServer { this->ReadArray(temp_data, num_bytes); DLTensor temp; temp.data = temp_data; - temp.ctx = DLContext{kDLCPU, 0}; + temp.device = DLDevice{kDLCPU, 0}; temp.ndim = arr->ndim; temp.dtype = arr->dtype; temp.shape = arr->shape; @@ -252,7 +252,7 @@ class MinRPCServer { call_ecode = TVMDeviceCopyDataFromTo(&temp, arr, nullptr); // need sync to make sure that the copy is completed. if (call_ecode == 0) { - call_ecode = TVMSynchronize(arr->ctx.device_type, arr->ctx.device_id, nullptr); + call_ecode = TVMSynchronize(arr->device.device_type, arr->device.device_id, nullptr); } } @@ -390,18 +390,18 @@ class MinRPCServer { void SyscallDevAllocData(TVMValue* values, int* tcodes, int num_args) { MINRPC_CHECK(num_args == 4); - MINRPC_CHECK(tcodes[0] == kTVMContext); + MINRPC_CHECK(tcodes[0] == kDLDevice); MINRPC_CHECK(tcodes[1] == kDLInt); MINRPC_CHECK(tcodes[2] == kDLInt); MINRPC_CHECK(tcodes[3] == kTVMDataType); - TVMContext ctx = values[0].v_ctx; + DLDevice dev = values[0].v_device; int64_t nbytes = values[1].v_int64; int64_t alignment = values[2].v_int64; DLDataType type_hint = values[3].v_type; void* handle; - int call_ecode = TVMDeviceAllocDataSpace(ctx, nbytes, alignment, type_hint, &handle); + int call_ecode = TVMDeviceAllocDataSpace(dev, nbytes, alignment, type_hint, &handle); if (call_ecode == 0) { this->ReturnHandle(handle); @@ -418,8 +418,8 @@ class MinRPCServer { DLTensor* arr = reinterpret_cast(values[0].v_handle); const char* mem_scope = (tcodes[1] == kTVMNullptr ? 
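The "need sync" comments in this hunk exist because TVMDeviceCopyDataFromTo may execute asynchronously; the destination bytes are only safe to read, or to stream back to the client, after the device has drained. A condensed sketch of the copy-then-synchronize sequence, assuming the C runtime declarations used in this hunk come from tvm/runtime/c_runtime_api.h:

#include <tvm/runtime/c_runtime_api.h>

// Copy between two prepared DLTensors, then synchronize so the destination
// bytes are valid before use; returns the first failing error code, as the
// code directly below does.
int CopyAndSync(DLTensor* from, DLTensor* to, DLDevice dev) {
  int ecode = TVMDeviceCopyDataFromTo(from, to, /*stream=*/nullptr);
  if (ecode == 0) {
    ecode = TVMSynchronize(dev.device_type, dev.device_id, /*stream=*/nullptr);
  }
  return ecode;
}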
nullptr : values[1].v_str); void* handle; - int call_ecode = TVMDeviceAllocDataSpaceWithScope(arr->ctx, arr->ndim, arr->shape, arr->dtype, - mem_scope, &handle); + int call_ecode = TVMDeviceAllocDataSpaceWithScope(arr->device, arr->ndim, arr->shape, + arr->dtype, mem_scope, &handle); if (call_ecode == 0) { this->ReturnHandle(handle); } else { @@ -429,13 +429,13 @@ class MinRPCServer { void SyscallDevFreeData(TVMValue* values, int* tcodes, int num_args) { MINRPC_CHECK(num_args == 2); - MINRPC_CHECK(tcodes[0] == kTVMContext); + MINRPC_CHECK(tcodes[0] == kDLDevice); MINRPC_CHECK(tcodes[1] == kTVMOpaqueHandle); - TVMContext ctx = values[0].v_ctx; + DLDevice dev = values[0].v_device; void* handle = values[1].v_handle; - int call_ecode = TVMDeviceFreeDataSpace(ctx, handle); + int call_ecode = TVMDeviceFreeDataSpace(dev, handle); if (call_ecode == 0) { this->ReturnVoid(); @@ -446,13 +446,13 @@ class MinRPCServer { void SyscallDevStreamSync(TVMValue* values, int* tcodes, int num_args) { MINRPC_CHECK(num_args == 2); - MINRPC_CHECK(tcodes[0] == kTVMContext); + MINRPC_CHECK(tcodes[0] == kDLDevice); MINRPC_CHECK(tcodes[1] == kTVMOpaqueHandle); - TVMContext ctx = values[0].v_ctx; + DLDevice dev = values[0].v_device; void* handle = values[1].v_handle; - int call_ecode = TVMSynchronize(ctx.device_type, ctx.device_id, handle); + int call_ecode = TVMSynchronize(dev.device_type, dev.device_id, handle); if (call_ecode == 0) { this->ReturnVoid(); @@ -511,7 +511,7 @@ class MinRPCServer { size_t npages = ((min_size + kPageSize - 1) / kPageSize); void* data; - if (TVMDeviceAllocDataSpace(DLContext{kDLCPU, 0}, npages * kPageSize, kPageAlign, + if (TVMDeviceAllocDataSpace(DLDevice{kDLCPU, 0}, npages * kPageSize, kPageAlign, DLDataType{kDLInt, 1, 1}, &data) != 0) { io_->Exit(static_cast(RPCServerStatus::kAllocError)); } @@ -523,7 +523,7 @@ class MinRPCServer { } void deallocate(ArenaPageHeader* page) { - if (TVMDeviceFreeDataSpace(DLContext{kDLCPU, 0}, page) != 0) { + if (TVMDeviceFreeDataSpace(DLDevice{kDLCPU, 0}, page) != 0) { io_->Exit(static_cast(RPCServerStatus::kAllocError)); } } diff --git a/src/runtime/minrpc/rpc_reference.h b/src/runtime/minrpc/rpc_reference.h index 07d13a7ff67b..e42508a73959 100644 --- a/src/runtime/minrpc/rpc_reference.h +++ b/src/runtime/minrpc/rpc_reference.h @@ -223,15 +223,15 @@ struct RPCReference { template static void SendDLTensor(TChannelPtr channel, DLTensor* arr) { - TVMContext ctx; + DLDevice dev; uint64_t data; // When we return NDArray, we directly return // the space and the context // The client will be further wrapping - ctx = arr->ctx; + dev = arr->device; data = reinterpret_cast(arr->data); channel->Write(data); - channel->Write(ctx); + channel->Write(dev); channel->Write(arr->ndim); channel->Write(arr->dtype); channel->WriteArray(arr->shape, arr->ndim); @@ -249,7 +249,7 @@ struct RPCReference { DLTensor* arr = channel->template ArenaAlloc(1); DLTensor& tensor = *arr; tensor.data = reinterpret_cast(handle); - channel->Read(&(tensor.ctx)); + channel->Read(&(tensor.device)); channel->Read(&(tensor.ndim)); channel->Read(&(tensor.dtype)); tensor.shape = channel->template ArenaAlloc(tensor.ndim); @@ -306,8 +306,8 @@ struct RPCReference { channel->template Write(padding); break; } - case kTVMContext: { - channel->Write(value.v_ctx); + case kDLDevice: { + channel->Write(value.v_device); break; } @@ -408,8 +408,8 @@ struct RPCReference { channel->template Read(&padding); break; } - case kTVMContext: { - channel->Read(&(value.v_ctx)); + case kDLDevice: { + 
channel->Read(&(value.v_device)); break; } case kTVMPackedFuncHandle: diff --git a/src/runtime/ndarray.cc b/src/runtime/ndarray.cc index d3ddbf8c0229..4b52a7d37ce2 100644 --- a/src/runtime/ndarray.cc +++ b/src/runtime/ndarray.cc @@ -23,9 +23,9 @@ */ #include #include +#include #include #include -#include #include "runtime_base.h" @@ -66,15 +66,15 @@ void ArrayCopyFromBytes(DLTensor* handle, const void* data, size_t nbytes) { DLTensor from; from.data = const_cast(data); - from.ctx = DLContext{kDLCPU, 0}; + from.device = Device{kDLCPU, 0}; from.ndim = handle->ndim; from.dtype = handle->dtype; from.shape = handle->shape; from.strides = nullptr; from.byte_offset = 0; - DeviceAPI::Get(handle->ctx)->CopyDataFromTo(&from, handle, nullptr); + DeviceAPI::Get(handle->device)->CopyDataFromTo(&from, handle, nullptr); // Synchronize in case data become unavailable later. - DeviceAPI::Get(handle->ctx)->StreamSync(handle->ctx, nullptr); + DeviceAPI::Get(handle->device)->StreamSync(handle->device, nullptr); } void ArrayCopyToBytes(const DLTensor* handle, void* data, size_t nbytes) { @@ -84,16 +84,16 @@ void ArrayCopyToBytes(const DLTensor* handle, void* data, size_t nbytes) { DLTensor to; to.data = const_cast(data); - to.ctx = DLContext{kDLCPU, 0}; + to.device = Device{kDLCPU, 0}; to.ndim = handle->ndim; to.dtype = handle->dtype; to.shape = handle->shape; to.strides = nullptr; to.byte_offset = 0; - DeviceAPI::Get(handle->ctx)->CopyDataFromTo(const_cast(handle), &to, nullptr); + DeviceAPI::Get(handle->device)->CopyDataFromTo(const_cast(handle), &to, nullptr); // Synchronize in case data become unavailable later. - DeviceAPI::Get(handle->ctx)->StreamSync(handle->ctx, nullptr); + DeviceAPI::Get(handle->device)->StreamSync(handle->device, nullptr); } struct NDArray::Internal { @@ -103,8 +103,8 @@ struct NDArray::Internal { if (ptr->manager_ctx != nullptr) { static_cast(ptr->manager_ctx)->DecRef(); } else if (ptr->dl_tensor.data != nullptr) { - tvm::runtime::DeviceAPI::Get(ptr->dl_tensor.ctx) - ->FreeDataSpace(ptr->dl_tensor.ctx, ptr->dl_tensor.data); + tvm::runtime::DeviceAPI::Get(ptr->dl_tensor.device) + ->FreeDataSpace(ptr->dl_tensor.device, ptr->dl_tensor.data); } delete ptr; } @@ -123,7 +123,7 @@ struct NDArray::Internal { } // Local create function which allocates tensor metadata // but does not allocate space for the data. 
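ArrayCopyFromBytes and ArrayCopyToBytes above wrap raw host memory in a temporary CPU-resident DLTensor, so a single CopyDataFromTo path serves every direction. A sketch of such a byte-view helper, simplified to a flat 1-D view (the caller keeps the shape storage alive):

#include <dlpack/dlpack.h>
#include <cstdint>

// View a contiguous host buffer as a compact 1-D byte tensor on the CPU.
// `nbytes_shape` must point at a single int64_t holding the byte count.
DLTensor MakeHostByteView(void* data, int64_t* nbytes_shape) {
  DLTensor t;
  t.data = data;
  t.device = DLDevice{kDLCPU, 0};
  t.ndim = 1;
  t.dtype.code = kDLUInt;
  t.dtype.bits = 8;
  t.dtype.lanes = 1;
  t.shape = nbytes_shape;
  t.strides = nullptr;  // nullptr marks a compact row-major layout
  t.byte_offset = 0;
  return t;
}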
- static NDArray Create(std::vector shape, DLDataType dtype, DLContext ctx) { + static NDArray Create(std::vector shape, DLDataType dtype, Device dev) { VerifyDataType(dtype); // critical zone: construct header @@ -138,8 +138,8 @@ struct NDArray::Internal { data->dl_tensor.ndim = static_cast(data->shape_.size()); // setup dtype data->dl_tensor.dtype = dtype; - // setup ctx - data->dl_tensor.ctx = ctx; + // setup device + data->dl_tensor.device = dev; return ret; } // Implementation of API function @@ -175,7 +175,7 @@ struct NDArray::Internal { NDArray NDArray::CreateView(std::vector shape, DLDataType dtype) { ICHECK(data_ != nullptr); ICHECK(get_mutable()->dl_tensor.strides == nullptr) << "Can only create view for compact tensor"; - NDArray ret = Internal::Create(shape, dtype, get_mutable()->dl_tensor.ctx); + NDArray ret = Internal::Create(shape, dtype, get_mutable()->dl_tensor.device); ret.get_mutable()->dl_tensor.byte_offset = this->get_mutable()->dl_tensor.byte_offset; size_t curr_size = GetDataSize(this->get_mutable()->dl_tensor); size_t view_size = GetDataSize(ret.get_mutable()->dl_tensor); @@ -190,11 +190,12 @@ NDArray NDArray::CreateView(std::vector shape, DLDataType dtype) { DLManagedTensor* NDArray::ToDLPack() const { return Internal::ToDLPack(get_mutable()); } -NDArray NDArray::Empty(std::vector shape, DLDataType dtype, DLContext ctx, +NDArray NDArray::Empty(std::vector shape, DLDataType dtype, Device dev, Optional mem_scope) { - NDArray ret = Internal::Create(shape, dtype, ctx); - ret.get_mutable()->dl_tensor.data = DeviceAPI::Get(ret->ctx)->AllocDataSpace( - ret->ctx, shape.size(), shape.data(), ret->dtype, mem_scope); + NDArray ret = Internal::Create(shape, dtype, dev); + ret.get_mutable()->dl_tensor.data = + DeviceAPI::Get(ret->device) + ->AllocDataSpace(ret->device, shape.size(), shape.data(), ret->dtype, mem_scope); return ret; } @@ -229,16 +230,16 @@ void NDArray::CopyFromTo(const DLTensor* from, DLTensor* to, TVMStreamHandle str size_t to_size = GetDataSize(*to); ICHECK_EQ(from_size, to_size) << "TVMArrayCopyFromTo: The size must exactly match"; - ICHECK(from->ctx.device_type == to->ctx.device_type || from->ctx.device_type == kDLCPU || - to->ctx.device_type == kDLCPU || from->ctx.device_type == kDLCPUPinned || - to->ctx.device_type == kDLCPUPinned) - << "Can not copy across different ctx types directly"; + ICHECK(from->device.device_type == to->device.device_type || from->device.device_type == kDLCPU || + to->device.device_type == kDLCPU || from->device.device_type == kDLCPUPinned || + to->device.device_type == kDLCPUPinned) + << "Can not copy across different device types directly"; - // Use the context that is *not* a cpu context to get the correct device + // Use the device that is *not* a cpu device to get the correct device // api manager. - TVMContext ctx = from->ctx.device_type != kDLCPU ? from->ctx : to->ctx; + Device dev = from->device.device_type != kDLCPU ? 
from->device : to->device; - DeviceAPI::Get(ctx)->CopyDataFromTo(const_cast(from), to, stream); + DeviceAPI::Get(dev)->CopyDataFromTo(const_cast(from), to, stream); } std::vector NDArray::Shape() const { return get_mutable()->shape_; } @@ -270,10 +271,10 @@ int TVMArrayAlloc(const tvm_index_t* shape, int ndim, int dtype_code, int dtype_ dtype.code = static_cast(dtype_code); dtype.bits = static_cast(dtype_bits); dtype.lanes = static_cast(dtype_lanes); - DLContext ctx; - ctx.device_type = static_cast(device_type); - ctx.device_id = device_id; - auto ndarray = NDArray::Empty(std::vector(shape, shape + ndim), dtype, ctx); + Device dev; + dev.device_type = static_cast(device_type); + dev.device_id = device_id; + auto ndarray = NDArray::Empty(std::vector(shape, shape + ndim), dtype, dev); *out = NDArray::Internal::MoveToFFIHandle(ndarray); API_END(); @@ -284,9 +285,9 @@ TVM_REGISTER_GLOBAL("runtime.TVMArrayAllocWithScope").set_body([](TVMArgs args, int ndim = args[1]; std::vector shape(shape_ptr, shape_ptr + ndim); DataType dtype = args[2]; - TVMContext ctx = args[3]; + Device dev = args[3]; Optional mem_scope = args[4]; - auto ndarray = NDArray::Empty(shape, dtype, ctx, mem_scope); + auto ndarray = NDArray::Empty(shape, dtype, dev, mem_scope); *ret = ndarray; }); diff --git a/src/runtime/object.cc b/src/runtime/object.cc index ad68c70698ea..23594b5d7d8a 100644 --- a/src/runtime/object.cc +++ b/src/runtime/object.cc @@ -20,9 +20,9 @@ * \file src/runtime/object.cc * \brief Object type management system. */ +#include #include #include -#include #include #include @@ -88,7 +88,7 @@ class TypeContext { } // try to allocate from parent's type table. ICHECK_LT(parent_tindex, type_table_.size()) - << " skey= " << skey << "static_index=" << static_tindex; + << " skey=" << skey << ", static_index=" << static_tindex; TypeInfo& pinfo = type_table_[parent_tindex]; ICHECK_EQ(pinfo.index, parent_tindex); diff --git a/src/runtime/opencl/aocl/aocl_common.h b/src/runtime/opencl/aocl/aocl_common.h index ae1a4a8cc31f..448f5d0ac6d7 100644 --- a/src/runtime/opencl/aocl/aocl_common.h +++ b/src/runtime/opencl/aocl/aocl_common.h @@ -39,7 +39,7 @@ class AOCLWorkspace final : public OpenCLWorkspace { public: // override OpenCL device API void Init() final; - bool IsOpenCLDevice(TVMContext ctx) final; + bool IsOpenCLDevice(Device dev) final; OpenCLThreadEntry* GetThreadEntry() final; // get the global workspace static OpenCLWorkspace* Global(); diff --git a/src/runtime/opencl/aocl/aocl_device_api.cc b/src/runtime/opencl/aocl/aocl_device_api.cc index 5432507087ca..e407837f6a7f 100644 --- a/src/runtime/opencl/aocl/aocl_device_api.cc +++ b/src/runtime/opencl/aocl/aocl_device_api.cc @@ -40,8 +40,8 @@ void AOCLWorkspace::Init() { OpenCLWorkspace::Init("aocl", "accelerator", "Intel(R) FPGA SDK for OpenCL(TM)"); } -bool AOCLWorkspace::IsOpenCLDevice(TVMContext ctx) { - return ctx.device_type == static_cast(kDLAOCL); +bool AOCLWorkspace::IsOpenCLDevice(Device dev) { + return dev.device_type == static_cast(kDLAOCL); } typedef dmlc::ThreadLocalStore AOCLThreadStore; diff --git a/src/runtime/opencl/opencl_common.h b/src/runtime/opencl/opencl_common.h index 2e7f05f91020..b4377119e4c7 100644 --- a/src/runtime/opencl/opencl_common.h +++ b/src/runtime/opencl/opencl_common.h @@ -26,8 +26,9 @@ #include #include +#include +#include #include -#include /* There are many OpenCL platforms that do not yet support OpenCL 2.0, * hence we use 1.2 APIs, some of which are now deprecated. 
In order @@ -218,23 +219,23 @@ class OpenCLWorkspace : public DeviceAPI { const std::string& platform_name = ""); virtual void Init() { Init("opencl", "gpu"); } // Check whether the context is OpenCL or not. - virtual bool IsOpenCLDevice(TVMContext ctx) { return ctx.device_type == kDLOpenCL; } - // get the queue of the context - cl_command_queue GetQueue(TVMContext ctx) { - ICHECK(IsOpenCLDevice(ctx)); + virtual bool IsOpenCLDevice(Device dev) { return dev.device_type == kDLOpenCL; } + // get the queue of the device + cl_command_queue GetQueue(Device dev) { + ICHECK(IsOpenCLDevice(dev)); this->Init(); - ICHECK(ctx.device_id >= 0 && static_cast(ctx.device_id) < queues.size()) - << "Invalid OpenCL device_id=" << ctx.device_id; - return queues[ctx.device_id]; + ICHECK(dev.device_id >= 0 && static_cast(dev.device_id) < queues.size()) + << "Invalid OpenCL device_id=" << dev.device_id; + return queues[dev.device_id]; } // override device API - void SetDevice(TVMContext ctx) final; - void GetAttr(TVMContext ctx, DeviceAttrKind kind, TVMRetValue* rv) final; - void* AllocDataSpace(TVMContext ctx, size_t size, size_t alignment, DLDataType type_hint) final; - void FreeDataSpace(TVMContext ctx, void* ptr) final; - void StreamSync(TVMContext ctx, TVMStreamHandle stream) final; - void* AllocWorkspace(TVMContext ctx, size_t size, DLDataType type_hint) final; - void FreeWorkspace(TVMContext ctx, void* data) final; + void SetDevice(Device dev) final; + void GetAttr(Device dev, DeviceAttrKind kind, TVMRetValue* rv) final; + void* AllocDataSpace(Device dev, size_t size, size_t alignment, DLDataType type_hint) final; + void FreeDataSpace(Device dev, void* ptr) final; + void StreamSync(Device dev, TVMStreamHandle stream) final; + void* AllocWorkspace(Device dev, size_t size, DLDataType type_hint) final; + void FreeWorkspace(Device dev, void* data) final; /*! * \brief Get the thread local ThreadEntry @@ -246,7 +247,7 @@ class OpenCLWorkspace : public DeviceAPI { protected: void CopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset, size_t size, - TVMContext ctx_from, TVMContext ctx_to, DLDataType type_hint, + Device dev_from, Device dev_to, DLDataType type_hint, TVMStreamHandle stream) final; }; @@ -260,16 +261,17 @@ class OpenCLThreadEntry { // timestamp used to recognize stale kernel size_t version{0}; }; - /*! \brief The current context */ - TVMContext context; + /*! \brief The current device */ + Device device; /*! \brief The thread-local kernel table */ std::vector kernel_table; /*! 
\brief workspace pool */ WorkspacePool pool; // constructor - OpenCLThreadEntry(DLDeviceType device_type, DeviceAPI* device) : pool(device_type, device) { - context.device_id = 0; - context.device_type = device_type; + OpenCLThreadEntry(DLDeviceType device_type, DeviceAPI* device_api) + : pool(device_type, device_api) { + device.device_id = 0; + device.device_type = device_type; } OpenCLThreadEntry() : OpenCLThreadEntry(kDLOpenCL, OpenCLWorkspace::Global()) {} diff --git a/src/runtime/opencl/opencl_device_api.cc b/src/runtime/opencl/opencl_device_api.cc index a3ec21e28f1d..8f49279243c2 100644 --- a/src/runtime/opencl/opencl_device_api.cc +++ b/src/runtime/opencl/opencl_device_api.cc @@ -36,13 +36,11 @@ OpenCLWorkspace* OpenCLWorkspace::Global() { return inst; } -void OpenCLWorkspace::SetDevice(TVMContext ctx) { - GetThreadEntry()->context.device_id = ctx.device_id; -} +void OpenCLWorkspace::SetDevice(Device dev) { GetThreadEntry()->device.device_id = dev.device_id; } -void OpenCLWorkspace::GetAttr(TVMContext ctx, DeviceAttrKind kind, TVMRetValue* rv) { +void OpenCLWorkspace::GetAttr(Device dev, DeviceAttrKind kind, TVMRetValue* rv) { this->Init(); - size_t index = static_cast(ctx.device_id); + size_t index = static_cast(dev.device_id); if (kind == kExist) { *rv = static_cast(index < devices.size()); return; @@ -116,7 +114,7 @@ void OpenCLWorkspace::GetAttr(TVMContext ctx, DeviceAttrKind kind, TVMRetValue* } } -void* OpenCLWorkspace::AllocDataSpace(TVMContext ctx, size_t size, size_t alignment, +void* OpenCLWorkspace::AllocDataSpace(Device dev, size_t size, size_t alignment, DLDataType type_hint) { this->Init(); ICHECK(context != nullptr) << "No OpenCL device"; @@ -126,53 +124,52 @@ void* OpenCLWorkspace::AllocDataSpace(TVMContext ctx, size_t size, size_t alignm return mptr; } -void OpenCLWorkspace::FreeDataSpace(TVMContext ctx, void* ptr) { +void OpenCLWorkspace::FreeDataSpace(Device dev, void* ptr) { // We have to make sure that the memory object is not in the command queue // for some OpenCL platforms. 
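The comment ending the hunk above is the crux of FreeDataSpace on this backend: some OpenCL drivers misbehave if a cl_mem is released while still referenced by enqueued commands, so the queue is drained first. A sketch of that ordering under the standard OpenCL C API:

#include <CL/cl.h>

// Drain every command that may still reference `buf`, then release it.
// Assumes `queue` is the only queue the buffer was ever enqueued on.
void SafeReleaseBuffer(cl_command_queue queue, cl_mem buf) {
  clFinish(queue);          // block until all enqueued commands complete
  clReleaseMemObject(buf);  // the driver can now free the allocation safely
}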
- OPENCL_CALL(clFinish(this->GetQueue(ctx))); + OPENCL_CALL(clFinish(this->GetQueue(dev))); cl_mem mptr = static_cast(ptr); OPENCL_CALL(clReleaseMemObject(mptr)); } void OpenCLWorkspace::CopyDataFromTo(const void* from, size_t from_offset, void* to, - size_t to_offset, size_t size, TVMContext ctx_from, - TVMContext ctx_to, DLDataType type_hint, - TVMStreamHandle stream) { + size_t to_offset, size_t size, Device dev_from, Device dev_to, + DLDataType type_hint, TVMStreamHandle stream) { this->Init(); ICHECK(stream == nullptr); - if (IsOpenCLDevice(ctx_from) && IsOpenCLDevice(ctx_to)) { - OPENCL_CALL(clEnqueueCopyBuffer(this->GetQueue(ctx_to), + if (IsOpenCLDevice(dev_from) && IsOpenCLDevice(dev_to)) { + OPENCL_CALL(clEnqueueCopyBuffer(this->GetQueue(dev_to), static_cast((void*)from), // NOLINT(*) static_cast(to), from_offset, to_offset, size, 0, nullptr, nullptr)); - } else if (IsOpenCLDevice(ctx_from) && ctx_to.device_type == kDLCPU) { - OPENCL_CALL(clEnqueueReadBuffer(this->GetQueue(ctx_from), + } else if (IsOpenCLDevice(dev_from) && dev_to.device_type == kDLCPU) { + OPENCL_CALL(clEnqueueReadBuffer(this->GetQueue(dev_from), static_cast((void*)from), // NOLINT(*) CL_FALSE, from_offset, size, static_cast(to) + to_offset, 0, nullptr, nullptr)); - OPENCL_CALL(clFinish(this->GetQueue(ctx_from))); - } else if (ctx_from.device_type == kDLCPU && IsOpenCLDevice(ctx_to)) { - OPENCL_CALL(clEnqueueWriteBuffer(this->GetQueue(ctx_to), static_cast(to), CL_FALSE, + OPENCL_CALL(clFinish(this->GetQueue(dev_from))); + } else if (dev_from.device_type == kDLCPU && IsOpenCLDevice(dev_to)) { + OPENCL_CALL(clEnqueueWriteBuffer(this->GetQueue(dev_to), static_cast(to), CL_FALSE, to_offset, size, static_cast(from) + from_offset, 0, nullptr, nullptr)); - OPENCL_CALL(clFinish(this->GetQueue(ctx_to))); + OPENCL_CALL(clFinish(this->GetQueue(dev_to))); } else { LOG(FATAL) << "Expect copy from/to OpenCL or between OpenCL"; } } -void OpenCLWorkspace::StreamSync(TVMContext ctx, TVMStreamHandle stream) { +void OpenCLWorkspace::StreamSync(Device dev, TVMStreamHandle stream) { ICHECK(stream == nullptr); - OPENCL_CALL(clFinish(this->GetQueue(ctx))); + OPENCL_CALL(clFinish(this->GetQueue(dev))); } -void* OpenCLWorkspace::AllocWorkspace(TVMContext ctx, size_t size, DLDataType type_hint) { - return GetThreadEntry()->pool.AllocWorkspace(ctx, size); +void* OpenCLWorkspace::AllocWorkspace(Device dev, size_t size, DLDataType type_hint) { + return GetThreadEntry()->pool.AllocWorkspace(dev, size); } -void OpenCLWorkspace::FreeWorkspace(TVMContext ctx, void* data) { - GetThreadEntry()->pool.FreeWorkspace(ctx, data); +void OpenCLWorkspace::FreeWorkspace(Device dev, void* data) { + GetThreadEntry()->pool.FreeWorkspace(dev, data); } typedef dmlc::ThreadLocalStore OpenCLThreadStore; diff --git a/src/runtime/opencl/opencl_module.cc b/src/runtime/opencl/opencl_module.cc index a4c61e47b376..8c22c3c8cb23 100644 --- a/src/runtime/opencl/opencl_module.cc +++ b/src/runtime/opencl/opencl_module.cc @@ -65,7 +65,7 @@ class OpenCLWrappedFunc { for (cl_uint i = 0; i < arg_size_.size(); ++i) { OPENCL_CALL(clSetKernelArg(kernel, i, arg_size_[i], void_args[i])); } - cl_command_queue queue = w_->GetQueue(t->context); + cl_command_queue queue = w_->GetQueue(t->device); ThreadWorkLoad wl = thread_axis_cfg_.Extract(args); cl_uint work_dim = static_cast(thread_axis_cfg_.work_dim()); for (cl_uint i = 0; i < work_dim; ++i) { @@ -186,7 +186,7 @@ void OpenCLModuleNode::Init() { cl_kernel OpenCLModuleNode::InstallKernel(cl::OpenCLWorkspace* w, cl::OpenCLThreadEntry* 
t, const std::string& func_name, const KTRefEntry& e) { std::lock_guard lock(build_lock_); - int device_id = t->context.device_id; + int device_id = t->device.device_id; if (!device_built_flag_[device_id]) { // create program if (fmt_ == "cl") { diff --git a/src/runtime/opencl/sdaccel/sdaccel_common.h b/src/runtime/opencl/sdaccel/sdaccel_common.h index feeab0bc89ce..80bc770cc0a4 100644 --- a/src/runtime/opencl/sdaccel/sdaccel_common.h +++ b/src/runtime/opencl/sdaccel/sdaccel_common.h @@ -39,7 +39,7 @@ class SDAccelWorkspace final : public OpenCLWorkspace { public: // override OpenCL device API void Init() final; - bool IsOpenCLDevice(TVMContext ctx) final; + bool IsOpenCLDevice(Device dev) final; OpenCLThreadEntry* GetThreadEntry() final; // get the global workspace static OpenCLWorkspace* Global(); diff --git a/src/runtime/opencl/sdaccel/sdaccel_device_api.cc b/src/runtime/opencl/sdaccel/sdaccel_device_api.cc index ebe387b1ddb3..7d4b673324a0 100644 --- a/src/runtime/opencl/sdaccel/sdaccel_device_api.cc +++ b/src/runtime/opencl/sdaccel/sdaccel_device_api.cc @@ -38,8 +38,8 @@ OpenCLWorkspace* SDAccelWorkspace::Global() { void SDAccelWorkspace::Init() { OpenCLWorkspace::Init("sdaccel", "accelerator", "Xilinx"); } -bool SDAccelWorkspace::IsOpenCLDevice(TVMContext ctx) { - return ctx.device_type == static_cast(kDLSDAccel); +bool SDAccelWorkspace::IsOpenCLDevice(Device dev) { + return dev.device_type == static_cast(kDLSDAccel); } typedef dmlc::ThreadLocalStore SDAccelThreadStore; diff --git a/src/runtime/profiling.cc b/src/runtime/profiling.cc index 3d204166986d..6cf1034a3837 100644 --- a/src/runtime/profiling.cc +++ b/src/runtime/profiling.cc @@ -34,30 +34,30 @@ namespace runtime { class DefaultTimerNode : public TimerNode { public: virtual void Start() { - TVMSynchronize(ctx_.device_type, ctx_.device_id, nullptr); + TVMSynchronize(device_.device_type, device_.device_id, nullptr); start_ = std::chrono::high_resolution_clock::now(); } virtual void Stop() { - TVMSynchronize(ctx_.device_type, ctx_.device_id, nullptr); + TVMSynchronize(device_.device_type, device_.device_id, nullptr); duration_ = std::chrono::high_resolution_clock::now() - start_; } virtual int64_t SyncAndGetElapsedNanos() { return duration_.count(); } virtual ~DefaultTimerNode() {} - explicit DefaultTimerNode(TVMContext ctx) : ctx_(ctx) {} + explicit DefaultTimerNode(Device dev) : device_(dev) {} static constexpr const char* _type_key = "DefaultTimerNode"; TVM_DECLARE_FINAL_OBJECT_INFO(DefaultTimerNode, TimerNode); private: std::chrono::high_resolution_clock::time_point start_; std::chrono::duration duration_; - TVMContext ctx_; + Device device_; }; TVM_REGISTER_OBJECT_TYPE(DefaultTimerNode); TVM_REGISTER_OBJECT_TYPE(TimerNode); -Timer DefaultTimer(TVMContext ctx) { return Timer(make_object(ctx)); } +Timer DefaultTimer(Device dev) { return Timer(make_object(dev)); } class CPUTimerNode : public TimerNode { public: @@ -75,18 +75,18 @@ class CPUTimerNode : public TimerNode { }; TVM_REGISTER_OBJECT_TYPE(CPUTimerNode); -TVM_REGISTER_GLOBAL("profiling.timer.cpu").set_body_typed([](TVMContext ctx) { +TVM_REGISTER_GLOBAL("profiling.timer.cpu").set_body_typed([](Device dev) { return Timer(make_object()); }); -Timer Timer::Start(TVMContext ctx) { - auto f = Registry::Get(std::string("profiling.timer.") + DeviceName(ctx.device_type)); +Timer Timer::Start(Device dev) { + auto f = Registry::Get(std::string("profiling.timer.") + DeviceName(dev.device_type)); if (f == nullptr) { - Timer t = DefaultTimer(ctx); + Timer t = DefaultTimer(dev); 
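Timer::Start above resolves a per-backend timer factory from the global registry under the key "profiling.timer." plus the device name, and falls back to the chrono-based default when none is registered. The lookup-or-fallback shape, reduced to a standalone sketch with the registry modeled as a plain map:

#include <chrono>
#include <cstdint>
#include <functional>
#include <map>
#include <string>

// Minimal stand-in timer: nanoseconds elapsed since construction.
struct SimpleTimer {
  std::chrono::steady_clock::time_point start = std::chrono::steady_clock::now();
  int64_t ElapsedNanos() const {
    return std::chrono::duration_cast<std::chrono::nanoseconds>(
               std::chrono::steady_clock::now() - start)
        .count();
  }
};

using TimerFactory = std::function<SimpleTimer()>;

// Prefer a backend-specific factory registered under a well-known key;
// otherwise fall back to the default host timer.
SimpleTimer StartTimer(const std::map<std::string, TimerFactory>& registry,
                       const std::string& device_name) {
  auto it = registry.find("profiling.timer." + device_name);
  return it != registry.end() ? it->second() : SimpleTimer{};
}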
t->Start(); return t; } else { - Timer t = f->operator()(ctx); + Timer t = f->operator()(dev); t->Start(); return t; } diff --git a/src/runtime/registry.cc b/src/runtime/registry.cc index a65235090bfd..bb5a794a030b 100644 --- a/src/runtime/registry.cc +++ b/src/runtime/registry.cc @@ -22,8 +22,8 @@ * \brief The global registry of packed function. */ #include +#include #include -#include #include #include diff --git a/src/runtime/rocm/rocm_device_api.cc b/src/runtime/rocm/rocm_device_api.cc index 5f24ce0eec48..70c614551884 100644 --- a/src/runtime/rocm/rocm_device_api.cc +++ b/src/runtime/rocm/rocm_device_api.cc @@ -25,9 +25,9 @@ #include #include #include +#include #include #include -#include #include "rocm_common.h" @@ -36,15 +36,15 @@ namespace runtime { class ROCMDeviceAPI final : public DeviceAPI { public: - void SetDevice(TVMContext ctx) final { ROCM_CALL(hipSetDevice(ctx.device_id)); } - void GetAttr(TVMContext ctx, DeviceAttrKind kind, TVMRetValue* rv) final { + void SetDevice(Device dev) final { ROCM_CALL(hipSetDevice(dev.device_id)); } + void GetAttr(Device device, DeviceAttrKind kind, TVMRetValue* rv) final { int value = 0; switch (kind) { case kExist: { if (hsa_init() == HSA_STATUS_SUCCESS) { int dev; ROCM_CALL(hipGetDeviceCount(&dev)); - value = dev > ctx.device_id ? 1 : 0; + value = dev > device.device_id ? 1 : 0; hsa_shut_down(); } else { value = 0; @@ -53,50 +53,53 @@ class ROCMDeviceAPI final : public DeviceAPI { } case kMaxThreadsPerBlock: { ROCM_CALL( - hipDeviceGetAttribute(&value, hipDeviceAttributeMaxThreadsPerBlock, ctx.device_id)); + hipDeviceGetAttribute(&value, hipDeviceAttributeMaxThreadsPerBlock, device.device_id)); break; } case kWarpSize: { - ROCM_CALL(hipDeviceGetAttribute(&value, hipDeviceAttributeWarpSize, ctx.device_id)); + ROCM_CALL(hipDeviceGetAttribute(&value, hipDeviceAttributeWarpSize, device.device_id)); break; } case kMaxSharedMemoryPerBlock: { ROCM_CALL(hipDeviceGetAttribute(&value, hipDeviceAttributeMaxSharedMemoryPerBlock, - ctx.device_id)); + device.device_id)); break; } case kComputeVersion: { std::ostringstream os; - ROCM_CALL( - hipDeviceGetAttribute(&value, hipDeviceAttributeComputeCapabilityMajor, ctx.device_id)); + ROCM_CALL(hipDeviceGetAttribute(&value, hipDeviceAttributeComputeCapabilityMajor, + device.device_id)); os << value << "."; - ROCM_CALL( - hipDeviceGetAttribute(&value, hipDeviceAttributeComputeCapabilityMinor, ctx.device_id)); + ROCM_CALL(hipDeviceGetAttribute(&value, hipDeviceAttributeComputeCapabilityMinor, + device.device_id)); os << value; *rv = os.str(); return; } case kDeviceName: { std::string name(256, 0); - ROCM_CALL(hipDeviceGetName(&name[0], name.size(), ctx.device_id)); + ROCM_CALL(hipDeviceGetName(&name[0], name.size(), device.device_id)); name.resize(strlen(name.c_str())); *rv = std::move(name); return; } case kMaxClockRate: { - ROCM_CALL(hipDeviceGetAttribute(&value, hipDeviceAttributeClockRate, ctx.device_id)); + ROCM_CALL(hipDeviceGetAttribute(&value, hipDeviceAttributeClockRate, device.device_id)); break; } case kMultiProcessorCount: { ROCM_CALL( - hipDeviceGetAttribute(&value, hipDeviceAttributeMultiprocessorCount, ctx.device_id)); + hipDeviceGetAttribute(&value, hipDeviceAttributeMultiprocessorCount, device.device_id)); break; } case kMaxThreadDimensions: { int dims[3]; - ROCM_CALL(hipDeviceGetAttribute(&dims[0], hipDeviceAttributeMaxBlockDimX, ctx.device_id)); - ROCM_CALL(hipDeviceGetAttribute(&dims[1], hipDeviceAttributeMaxBlockDimY, ctx.device_id)); - ROCM_CALL(hipDeviceGetAttribute(&dims[2], 
hipDeviceAttributeMaxBlockDimZ, ctx.device_id)); + ROCM_CALL( + hipDeviceGetAttribute(&dims[0], hipDeviceAttributeMaxBlockDimX, device.device_id)); + ROCM_CALL( + hipDeviceGetAttribute(&dims[1], hipDeviceAttributeMaxBlockDimY, device.device_id)); + ROCM_CALL( + hipDeviceGetAttribute(&dims[2], hipDeviceAttributeMaxBlockDimZ, device.device_id)); std::stringstream ss; ss << "[" << dims[0] << ", " << dims[1] << ", " << dims[2] << "]"; @@ -104,12 +107,12 @@ class ROCMDeviceAPI final : public DeviceAPI { return; } case kMaxRegistersPerBlock: - ROCM_CALL( - hipDeviceGetAttribute(&value, hipDeviceAttributeMaxRegistersPerBlock, ctx.device_id)); + ROCM_CALL(hipDeviceGetAttribute(&value, hipDeviceAttributeMaxRegistersPerBlock, + device.device_id)); break; case kGcnArch: { hipDeviceProp_t prop; - ROCM_CALL(hipGetDeviceProperties(&prop, ctx.device_id)); + ROCM_CALL(hipGetDeviceProperties(&prop, device.device_id)); *rv = prop.gcnArch; return; } @@ -120,59 +123,58 @@ class ROCMDeviceAPI final : public DeviceAPI { } *rv = value; } - void* AllocDataSpace(TVMContext ctx, size_t nbytes, size_t alignment, - DLDataType type_hint) final { - ROCM_CALL(hipSetDevice(ctx.device_id)); + void* AllocDataSpace(Device dev, size_t nbytes, size_t alignment, DLDataType type_hint) final { + ROCM_CALL(hipSetDevice(dev.device_id)); ICHECK_EQ(256 % alignment, 0U) << "ROCM space is aligned at 256 bytes"; void* ret; ROCM_CALL(hipMalloc(&ret, nbytes)); return ret; } - void FreeDataSpace(TVMContext ctx, void* ptr) final { - ROCM_CALL(hipSetDevice(ctx.device_id)); + void FreeDataSpace(Device dev, void* ptr) final { + ROCM_CALL(hipSetDevice(dev.device_id)); ROCM_CALL(hipFree(ptr)); } void CopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset, size_t size, - TVMContext ctx_from, TVMContext ctx_to, DLDataType type_hint, + Device dev_from, Device dev_to, DLDataType type_hint, TVMStreamHandle stream) final { hipStream_t hip_stream = static_cast(stream); from = static_cast(from) + from_offset; to = static_cast(to) + to_offset; - if (ctx_from.device_type == kDLROCM && ctx_to.device_type == kDLROCM) { - ROCM_CALL(hipSetDevice(ctx_from.device_id)); - if (ctx_from.device_id == ctx_to.device_id) { + if (dev_from.device_type == kDLROCM && dev_to.device_type == kDLROCM) { + ROCM_CALL(hipSetDevice(dev_from.device_id)); + if (dev_from.device_id == dev_to.device_id) { GPUCopy(from, to, size, hipMemcpyDeviceToDevice, hip_stream); } else { - hipMemcpyPeerAsync(to, ctx_to.device_id, from, ctx_from.device_id, size, hip_stream); + hipMemcpyPeerAsync(to, dev_to.device_id, from, dev_from.device_id, size, hip_stream); } - } else if (ctx_from.device_type == kDLROCM && ctx_to.device_type == kDLCPU) { - ROCM_CALL(hipSetDevice(ctx_from.device_id)); + } else if (dev_from.device_type == kDLROCM && dev_to.device_type == kDLCPU) { + ROCM_CALL(hipSetDevice(dev_from.device_id)); GPUCopy(from, to, size, hipMemcpyDeviceToHost, hip_stream); - } else if (ctx_from.device_type == kDLCPU && ctx_to.device_type == kDLROCM) { - ROCM_CALL(hipSetDevice(ctx_to.device_id)); + } else if (dev_from.device_type == kDLCPU && dev_to.device_type == kDLROCM) { + ROCM_CALL(hipSetDevice(dev_to.device_id)); GPUCopy(from, to, size, hipMemcpyHostToDevice, hip_stream); } else { LOG(FATAL) << "expect copy from/to GPU or between GPU"; } } - void StreamSync(TVMContext ctx, TVMStreamHandle stream) final { - ROCM_CALL(hipSetDevice(ctx.device_id)); + void StreamSync(Device dev, TVMStreamHandle stream) final { + ROCM_CALL(hipSetDevice(dev.device_id)); 
ROCM_CALL(hipStreamSynchronize(static_cast(stream))); } - void SetStream(TVMContext ctx, TVMStreamHandle stream) final { + void SetStream(Device dev, TVMStreamHandle stream) final { ROCMThreadEntry::ThreadLocal()->stream = static_cast(stream); } - void* AllocWorkspace(TVMContext ctx, size_t size, DLDataType type_hint) final { - return ROCMThreadEntry::ThreadLocal()->pool.AllocWorkspace(ctx, size); + void* AllocWorkspace(Device dev, size_t size, DLDataType type_hint) final { + return ROCMThreadEntry::ThreadLocal()->pool.AllocWorkspace(dev, size); } - void FreeWorkspace(TVMContext ctx, void* data) final { - ROCMThreadEntry::ThreadLocal()->pool.FreeWorkspace(ctx, data); + void FreeWorkspace(Device dev, void* data) final { + ROCMThreadEntry::ThreadLocal()->pool.FreeWorkspace(dev, data); } static ROCMDeviceAPI* Global() { @@ -233,7 +235,7 @@ class ROCMTimerNode : public TimerNode { TVM_REGISTER_OBJECT_TYPE(ROCMTimerNode); -TVM_REGISTER_GLOBAL("profiling.timer.rocm").set_body_typed([](TVMContext ctx) { +TVM_REGISTER_GLOBAL("profiling.timer.rocm").set_body_typed([](Device dev) { return Timer(make_object()); }); diff --git a/src/runtime/rpc/rpc_device_api.cc b/src/runtime/rpc/rpc_device_api.cc index 06737f99a4de..1d6fb85d9495 100644 --- a/src/runtime/rpc/rpc_device_api.cc +++ b/src/runtime/rpc/rpc_device_api.cc @@ -21,8 +21,8 @@ * \file rpc_device_api.cc */ #include +#include #include -#include #include @@ -33,100 +33,99 @@ namespace runtime { class RPCDeviceAPI final : public DeviceAPI { public: - void SetDevice(TVMContext ctx) final { - auto remote_ctx = RemoveRPCSessionMask(ctx); - GetSess(ctx)->GetDeviceAPI(remote_ctx)->SetDevice(remote_ctx); + void SetDevice(Device dev) final { + auto remote_dev = RemoveRPCSessionMask(dev); + GetSess(dev)->GetDeviceAPI(remote_dev)->SetDevice(remote_dev); } - void GetAttr(TVMContext ctx, DeviceAttrKind kind, TVMRetValue* rv) final { - auto remote_ctx = RemoveRPCSessionMask(ctx); - GetSess(ctx)->GetDeviceAPI(remote_ctx)->GetAttr(remote_ctx, kind, rv); + void GetAttr(Device dev, DeviceAttrKind kind, TVMRetValue* rv) final { + auto remote_dev = RemoveRPCSessionMask(dev); + GetSess(dev)->GetDeviceAPI(remote_dev)->GetAttr(remote_dev, kind, rv); } - void* AllocDataSpace(TVMContext ctx, int ndim, const int64_t* shape, DLDataType dtype, + void* AllocDataSpace(Device dev, int ndim, const int64_t* shape, DLDataType dtype, Optional mem_scope) final { - auto sess = GetSess(ctx); - auto remote_ctx = RemoveRPCSessionMask(ctx); + auto sess = GetSess(dev); + auto remote_dev = RemoveRPCSessionMask(dev); void* data = - sess->GetDeviceAPI(remote_ctx)->AllocDataSpace(remote_ctx, ndim, shape, dtype, mem_scope); + sess->GetDeviceAPI(remote_dev)->AllocDataSpace(remote_dev, ndim, shape, dtype, mem_scope); RemoteSpace* space = new RemoteSpace(); space->data = data; space->sess = std::move(sess); return space; } - void* AllocDataSpace(TVMContext ctx, size_t nbytes, size_t alignment, - DLDataType type_hint) final { - auto sess = GetSess(ctx); - auto remote_ctx = RemoveRPCSessionMask(ctx); + void* AllocDataSpace(Device dev, size_t nbytes, size_t alignment, DLDataType type_hint) final { + auto sess = GetSess(dev); + auto remote_dev = RemoveRPCSessionMask(dev); void* data = - sess->GetDeviceAPI(remote_ctx)->AllocDataSpace(remote_ctx, nbytes, alignment, type_hint); + sess->GetDeviceAPI(remote_dev)->AllocDataSpace(remote_dev, nbytes, alignment, type_hint); RemoteSpace* space = new RemoteSpace(); space->data = data; space->sess = std::move(sess); return space; } - void 
FreeDataSpace(TVMContext ctx, void* ptr) final { + void FreeDataSpace(Device dev, void* ptr) final { RemoteSpace* space = static_cast(ptr); - auto remote_ctx = RemoveRPCSessionMask(ctx); + auto remote_dev = RemoveRPCSessionMask(dev); try { - GetSess(ctx)->GetDeviceAPI(remote_ctx)->FreeDataSpace(remote_ctx, space->data); - } catch (const dmlc::Error& e) { + GetSess(dev)->GetDeviceAPI(remote_dev)->FreeDataSpace(remote_dev, space->data); + } catch (const Error& e) { // fault tolerance to remote close. } delete space; } void CopyDataFromTo(DLTensor* from, DLTensor* to, TVMStreamHandle stream) final { - DLContext ctx_from = from->ctx; - DLContext ctx_to = to->ctx; - if (IsRPCSessionContext(ctx_from) && IsRPCSessionContext(ctx_to)) { - ICHECK(ctx_from.device_type == ctx_to.device_type) + DLDevice dev_from = from->device; + DLDevice dev_to = to->device; + if (IsRPCSessionDevice(dev_from) && IsRPCSessionDevice(dev_to)) { + ICHECK(dev_from.device_type == dev_to.device_type) << "Cannot copy across two different remote session"; DLTensor from_tensor = *from; - from_tensor.ctx = RemoveRPCSessionMask(ctx_from); + from_tensor.device = RemoveRPCSessionMask(dev_from); from_tensor.data = static_cast(from->data)->data; DLTensor to_tensor = *to; - to_tensor.ctx = RemoveRPCSessionMask(ctx_to); + to_tensor.device = RemoveRPCSessionMask(dev_to); to_tensor.data = static_cast(to->data)->data; - auto remote_ctx = from_tensor.ctx; - if (remote_ctx.device_type == kDLCPU) remote_ctx = to_tensor.ctx; - GetSess(ctx_from)->GetDeviceAPI(remote_ctx)->CopyDataFromTo(&from_tensor, &to_tensor, stream); - } else if (IsRPCSessionContext(ctx_from) && ctx_to.device_type == kDLCPU) { + auto remote_dev = from_tensor.device; + if (remote_dev.device_type == kDLCPU) remote_dev = to_tensor.device; + GetSess(dev_from)->GetDeviceAPI(remote_dev)->CopyDataFromTo(&from_tensor, &to_tensor, stream); + } else if (IsRPCSessionDevice(dev_from) && dev_to.device_type == kDLCPU) { DLTensor from_tensor = *from; - from_tensor.ctx = RemoveRPCSessionMask(ctx_from); + from_tensor.device = RemoveRPCSessionMask(dev_from); from_tensor.data = static_cast(from->data)->data; void* to_bytes = static_cast(to->data) + to->byte_offset; size_t nbytes = GetDataSize(*to); - GetSess(ctx_from)->CopyFromRemote(&from_tensor, to_bytes, nbytes); - } else if (ctx_from.device_type == kDLCPU && IsRPCSessionContext(ctx_to)) { + GetSess(dev_from)->CopyFromRemote(&from_tensor, to_bytes, nbytes); + } else if (dev_from.device_type == kDLCPU && IsRPCSessionDevice(dev_to)) { DLTensor to_tensor = *to; - to_tensor.ctx = RemoveRPCSessionMask(ctx_to); + to_tensor.device = RemoveRPCSessionMask(dev_to); to_tensor.data = static_cast(to->data)->data; void* from_bytes = static_cast(from->data) + from->byte_offset; size_t nbytes = GetDataSize(*from); - GetSess(ctx_to)->CopyToRemote(from_bytes, &to_tensor, nbytes); + GetSess(dev_to)->CopyToRemote(from_bytes, &to_tensor, nbytes); } else { LOG(FATAL) << "expect copy from/to remote or between remote"; } } - void StreamSync(TVMContext ctx, TVMStreamHandle stream) final { - auto remote_ctx = RemoveRPCSessionMask(ctx); - GetSess(ctx)->GetDeviceAPI(remote_ctx)->StreamSync(remote_ctx, stream); + void StreamSync(Device dev, TVMStreamHandle stream) final { + auto remote_dev = RemoveRPCSessionMask(dev); + GetSess(dev)->GetDeviceAPI(remote_dev)->StreamSync(remote_dev, stream); } protected: void CopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset, - size_t num_bytes, TVMContext ctx_from, TVMContext ctx_to, - DLDataType 
type_hint, TVMStreamHandle stream) final { + size_t num_bytes, Device dev_from, Device dev_to, DLDataType type_hint, + TVMStreamHandle stream) final { LOG(FATAL) << "Not implemented."; } private: - std::shared_ptr GetSess(TVMContext ctx) { - int tbl_index = GetRPCSessionIndex(ctx); + std::shared_ptr GetSess(Device dev) { + int tbl_index = GetRPCSessionIndex(dev); return RPCSession::Get(tbl_index); } }; diff --git a/src/runtime/rpc/rpc_endpoint.cc b/src/runtime/rpc/rpc_endpoint.cc index 8716355fd68f..b5768146b3f7 100644 --- a/src/runtime/rpc/rpc_endpoint.cc +++ b/src/runtime/rpc/rpc_endpoint.cc @@ -176,10 +176,9 @@ class RPCEndpoint::EventHandler : public dmlc::Stream { if (tcode == kTVMObjectHandle || tcode == kTVMObjectRValueRefArg) { LOG(FATAL) << "ValueError: Cannot pass argument " << i << ", type " << args[i].AsObjectRef()->GetTypeKey() << " is not supported by RPC"; - } else if (tcode == kTVMContext) { - DLContext ctx = args[i]; - ICHECK(!IsRPCSessionContext(ctx)) - << "InternalError: cannot pass RPC context in the channel"; + } else if (tcode == kDLDevice) { + DLDevice dev = args[i]; + ICHECK(!IsRPCSessionDevice(dev)) << "InternalError: cannot pass RPC device in the channel"; } } } @@ -405,7 +404,7 @@ class RPCEndpoint::EventHandler : public dmlc::Stream { // When session is local, we can directly treat handle // as the cpu pointer without allocating a temp space. - if (arr->ctx.device_type == kDLCPU && sess->IsLocalSession() && DMLC_IO_NO_ENDIAN_SWAP) { + if (arr->device.device_type == kDLCPU && sess->IsLocalSession() && DMLC_IO_NO_ENDIAN_SWAP) { char* data_ptr = reinterpret_cast(arr->data) + arr->byte_offset; fcopyack(data_ptr, data_bytes); } else { @@ -438,7 +437,7 @@ class RPCEndpoint::EventHandler : public dmlc::Stream { // When session is local, we can directly treat handle // as the cpu pointer without allocating a temp space. 
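          // i.e. the incoming bytes can be streamed straight into the
          // tensor's memory. For any other device type (or a non-local
          // session) the bytes are first read into a temporary host
          // buffer and then copied over through the session's device API.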
- if (arr->ctx.device_type == kDLCPU && sess->IsLocalSession()) { + if (arr->device.device_type == kDLCPU && sess->IsLocalSession()) { char* dptr = reinterpret_cast(arr->data) + arr->byte_offset; this->ReadArray(dptr, data_bytes); @@ -526,7 +525,7 @@ class RPCEndpoint::EventHandler : public dmlc::Stream { try { fconstructor->CallPacked(constructor_args, &con_ret); - } catch (const dmlc::Error& e) { + } catch (const Error& e) { LOG(FATAL) << "Server[" << name_ << "]:" << " Error caught from session constructor " << constructor_name << ":\n" << e.what(); @@ -540,7 +539,7 @@ class RPCEndpoint::EventHandler : public dmlc::Stream { ICHECK_EQ(tkey, "rpc") << "Constructor " << constructor_name << " to return an RPCModule"; serving_session_ = RPCModuleGetSession(mod); this->ReturnVoid(); - } catch (const std::runtime_error& e) { + } catch (const std::exception& e) { this->ReturnException(e.what()); } @@ -550,11 +549,11 @@ class RPCEndpoint::EventHandler : public dmlc::Stream { void HandleSyscallStreamSync() { TVMArgs args = RecvPackedSeq(); try { - TVMContext ctx = args[0]; + Device dev = args[0]; TVMStreamHandle handle = args[1]; this->SwitchToState(kWaitForAsyncCallback); - GetServingSession()->AsyncStreamWait(ctx, handle, [this](RPCCode status, TVMArgs args) { + GetServingSession()->AsyncStreamWait(dev, handle, [this](RPCCode status, TVMArgs args) { if (status == RPCCode::kException) { this->ReturnException(args.values[0].v_str); } else { @@ -562,7 +561,7 @@ class RPCEndpoint::EventHandler : public dmlc::Stream { } this->SwitchToState(kRecvPacketNumBytes); }); - } catch (const std::runtime_error& e) { + } catch (const std::exception& e) { this->ReturnException(e.what()); this->SwitchToState(kRecvPacketNumBytes); } @@ -581,7 +580,7 @@ class RPCEndpoint::EventHandler : public dmlc::Stream { setter(0, rv); this->ReturnPackedSeq(TVMArgs(&ret_value, &ret_tcode, 1)); - } catch (const std::runtime_error& e) { + } catch (const std::exception& e) { this->ReturnException(e.what()); } this->SwitchToState(kRecvPacketNumBytes); @@ -719,7 +718,7 @@ void RPCEndpoint::Shutdown() { writer_.bytes_available()); if (n == 0) break; } - } catch (const dmlc::Error& e) { + } catch (const Error& e) { } channel_.reset(nullptr); } @@ -807,7 +806,7 @@ void RPCEndpoint::CopyToRemote(void* from_bytes, DLTensor* to, uint64_t nbytes) uint64_t to_data = reinterpret_cast(to->data); uint64_t shape_bytes = to->ndim * sizeof(int64_t); - uint64_t packet_nbytes = sizeof(code) + sizeof(to_data) + sizeof(to->ctx) + sizeof(to->ndim) + + uint64_t packet_nbytes = sizeof(code) + sizeof(to_data) + sizeof(to->device) + sizeof(to->ndim) + sizeof(to->dtype) + sizeof(to->byte_offset) + shape_bytes + sizeof(nbytes) + num_data_bytes; @@ -828,7 +827,7 @@ void RPCEndpoint::CopyFromRemote(DLTensor* from, void* to_bytes, uint64_t nbytes uint64_t from_data = reinterpret_cast(from->data); uint64_t shape_bytes = from->ndim * sizeof(int64_t); - uint64_t packet_nbytes = sizeof(code) + sizeof(from_data) + sizeof(from->ctx) + + uint64_t packet_nbytes = sizeof(code) + sizeof(from_data) + sizeof(from->device) + sizeof(from->ndim) + sizeof(from->dtype) + sizeof(from->byte_offset) + shape_bytes + sizeof(nbytes); @@ -855,37 +854,37 @@ void RPCFreeHandle(RPCSession* handler, TVMArgs args, TVMRetValue* rv) { } void RPCDevSetDevice(RPCSession* handler, TVMArgs args, TVMRetValue* rv) { - TVMContext ctx = args[0]; - handler->GetDeviceAPI(ctx)->SetDevice(ctx); + Device dev = args[0]; + handler->GetDeviceAPI(dev)->SetDevice(dev); } void RPCDevGetAttr(RPCSession* 
handler, TVMArgs args, TVMRetValue* rv) { - TVMContext ctx = args[0]; + Device dev = args[0]; DeviceAttrKind kind = static_cast(args[1].operator int()); if (kind == kExist) { - DeviceAPI* api = handler->GetDeviceAPI(ctx, true); + DeviceAPI* api = handler->GetDeviceAPI(dev, true); if (api != nullptr) { - api->GetAttr(ctx, kind, rv); + api->GetAttr(dev, kind, rv); } else { *rv = 0; } } else { - handler->GetDeviceAPI(ctx)->GetAttr(ctx, static_cast(kind), rv); + handler->GetDeviceAPI(dev)->GetAttr(dev, static_cast(kind), rv); } } void RPCDevAllocData(RPCSession* handler, TVMArgs args, TVMRetValue* rv) { - TVMContext ctx = args[0]; + Device dev = args[0]; uint64_t nbytes = args[1]; uint64_t alignment = args[2]; DLDataType type_hint = args[3]; - void* data = handler->GetDeviceAPI(ctx)->AllocDataSpace(ctx, nbytes, alignment, type_hint); + void* data = handler->GetDeviceAPI(dev)->AllocDataSpace(dev, nbytes, alignment, type_hint); *rv = data; } void RPCDevAllocDataWithScope(RPCSession* handler, TVMArgs args, TVMRetValue* rv) { DLTensor* arr = args[0]; - TVMContext ctx = arr->ctx; + Device dev = arr->device; int ndim = arr->ndim; int64_t* shape = arr->shape; DLDataType dtype = arr->dtype; @@ -896,14 +895,14 @@ void RPCDevAllocDataWithScope(RPCSession* handler, TVMArgs args, TVMRetValue* rv } else { ICHECK_EQ(tcode, kTVMNullptr); } - void* data = handler->GetDeviceAPI(ctx)->AllocDataSpace(ctx, ndim, shape, dtype, mem_scope); + void* data = handler->GetDeviceAPI(dev)->AllocDataSpace(dev, ndim, shape, dtype, mem_scope); *rv = data; } void RPCDevFreeData(RPCSession* handler, TVMArgs args, TVMRetValue* rv) { - TVMContext ctx = args[0]; + Device dev = args[0]; void* ptr = args[1]; - handler->GetDeviceAPI(ctx)->FreeDataSpace(ctx, ptr); + handler->GetDeviceAPI(dev)->FreeDataSpace(dev, ptr); } void RPCCopyAmongRemote(RPCSession* handler, TVMArgs args, TVMRetValue* rv) { @@ -911,14 +910,14 @@ void RPCCopyAmongRemote(RPCSession* handler, TVMArgs args, TVMRetValue* rv) { DLTensor* to = args[1]; TVMStreamHandle stream = args[2]; - TVMContext ctx = from->ctx; - if (ctx.device_type == kDLCPU) { - ctx = to->ctx; + Device dev = from->device; + if (dev.device_type == kDLCPU) { + dev = to->device; } else { - ICHECK(to->ctx.device_type == kDLCPU || to->ctx.device_type == from->ctx.device_type) - << "Can not copy across different ctx types directly"; + ICHECK(to->device.device_type == kDLCPU || to->device.device_type == from->device.device_type) + << "Can not copy across different dev types directly"; } - handler->GetDeviceAPI(ctx)->CopyDataFromTo(from, to, stream); + handler->GetDeviceAPI(dev)->CopyDataFromTo(from, to, stream); } void RPCEndpoint::EventHandler::HandleSyscall(RPCCode code) { @@ -993,27 +992,26 @@ class RPCClientSession : public RPCSession, public DeviceAPI { endpoint_->SysCallRemote(RPCCode::kFreeHandle, handle, type_code); } - void SetDevice(TVMContext ctx) final { endpoint_->SysCallRemote(RPCCode::kDevSetDevice, ctx); } + void SetDevice(Device dev) final { endpoint_->SysCallRemote(RPCCode::kDevSetDevice, dev); } - void GetAttr(TVMContext ctx, DeviceAttrKind kind, TVMRetValue* rv) final { - if (ctx.device_type == kDLCPU && kind == kExist) { + void GetAttr(Device dev, DeviceAttrKind kind, TVMRetValue* rv) final { + if (dev.device_type == kDLCPU && kind == kExist) { // cpu always exists. 
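      // Answering locally avoids a blocking RPCCode::kDevGetAttr round
      // trip: every server process exposes a CPU device, so an
      // existence probe for kDLCPU is trivially true.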
*rv = 1; } else { - *rv = endpoint_->SysCallRemote(RPCCode::kDevGetAttr, ctx, static_cast(kind)); + *rv = endpoint_->SysCallRemote(RPCCode::kDevGetAttr, dev, static_cast(kind)); } } - void* AllocDataSpace(TVMContext ctx, size_t nbytes, size_t alignment, - DLDataType type_hint) final { - return endpoint_->SysCallRemote(RPCCode::kDevAllocData, ctx, nbytes, alignment, type_hint); + void* AllocDataSpace(Device dev, size_t nbytes, size_t alignment, DLDataType type_hint) final { + return endpoint_->SysCallRemote(RPCCode::kDevAllocData, dev, nbytes, alignment, type_hint); } - void* AllocDataSpace(TVMContext ctx, int ndim, const int64_t* shape, DLDataType dtype, + void* AllocDataSpace(Device dev, int ndim, const int64_t* shape, DLDataType dtype, Optional mem_scope) final { DLTensor temp; temp.data = nullptr; - temp.ctx = ctx; + temp.device = dev; temp.ndim = ndim; temp.dtype = dtype; temp.shape = const_cast(shape); @@ -1027,19 +1025,19 @@ class RPCClientSession : public RPCSession, public DeviceAPI { } } - void FreeDataSpace(TVMContext ctx, void* ptr) final { - endpoint_->SysCallRemote(RPCCode::kDevFreeData, ctx, ptr); + void FreeDataSpace(Device dev, void* ptr) final { + endpoint_->SysCallRemote(RPCCode::kDevFreeData, dev, ptr); } void CopyDataFromTo(DLTensor* from, DLTensor* to, TVMStreamHandle stream) final { endpoint_->SysCallRemote(RPCCode::kCopyAmongRemote, from, to, stream); } - void StreamSync(TVMContext ctx, TVMStreamHandle stream) final { - endpoint_->SysCallRemote(RPCCode::kDevStreamSync, ctx, stream); + void StreamSync(Device dev, TVMStreamHandle stream) final { + endpoint_->SysCallRemote(RPCCode::kDevStreamSync, dev, stream); } - DeviceAPI* GetDeviceAPI(TVMContext ctx, bool allow_missing) final { return this; } + DeviceAPI* GetDeviceAPI(Device dev, bool allow_missing) final { return this; } bool IsLocalSession() const final { return false; } diff --git a/src/runtime/rpc/rpc_endpoint.h b/src/runtime/rpc/rpc_endpoint.h index 8e08bfa75623..cd3c9b2bec72 100644 --- a/src/runtime/rpc/rpc_endpoint.h +++ b/src/runtime/rpc/rpc_endpoint.h @@ -132,7 +132,7 @@ class RPCEndpoint { * \param to The target array. * \param to_offset The byte offset in the to. * \param nbytes The size of the memory in bytes. - * \param ctx_to The target context. + * \param dev_to The target device. * \param type_hint Hint of content data type. */ void CopyToRemote(void* from_bytes, DLTensor* to, uint64_t nbytes); @@ -143,7 +143,7 @@ class RPCEndpoint { * \param to The target array. * \param to_offset The byte offset in the to. * \param nbytes The size of the memory in bytes. - * \param ctx_from The source context. + * \param dev_from The source device. * \param type_hint Hint of content data type. 
*/ void CopyFromRemote(DLTensor* from, void* to_bytes, uint64_t nbytes); diff --git a/src/runtime/rpc/rpc_local_session.cc b/src/runtime/rpc/rpc_local_session.cc index 0650b55d0d7c..4b1c1f7fe998 100644 --- a/src/runtime/rpc/rpc_local_session.cc +++ b/src/runtime/rpc/rpc_local_session.cc @@ -91,35 +91,35 @@ void LocalSession::CopyToRemote(void* from_bytes, DLTensor* to, uint64_t nbytes) ICHECK_EQ(nbytes, GetDataSize(*to)); DLTensor from; from.data = from_bytes; - from.ctx = {kDLCPU, 0}; + from.device = {kDLCPU, 0}; from.ndim = to->ndim; from.shape = to->shape; from.dtype = to->dtype; from.strides = nullptr; from.byte_offset = 0; - TVMContext ctx_to = to->ctx; - this->GetDeviceAPI(ctx_to)->CopyDataFromTo(&from, to, nullptr); + Device dev_to = to->device; + this->GetDeviceAPI(dev_to)->CopyDataFromTo(&from, to, nullptr); // Copy can happen asynchrously // synchronize to make sure that copy is completed - this->GetDeviceAPI(ctx_to)->StreamSync(ctx_to, nullptr); + this->GetDeviceAPI(dev_to)->StreamSync(dev_to, nullptr); } void LocalSession::CopyFromRemote(DLTensor* from, void* to_bytes, uint64_t nbytes) { ICHECK_EQ(nbytes, GetDataSize(*from)); DLTensor to; to.data = to_bytes; - to.ctx = {kDLCPU, 0}; + to.device = {kDLCPU, 0}; to.ndim = from->ndim; to.shape = from->shape; to.dtype = from->dtype; to.strides = nullptr; to.byte_offset = 0; - TVMContext ctx_from = from->ctx; - this->GetDeviceAPI(ctx_from)->CopyDataFromTo(from, &to, nullptr); + Device dev_from = from->device; + this->GetDeviceAPI(dev_from)->CopyDataFromTo(from, &to, nullptr); // Copy can happen asynchrously // synchronize to make sure that copy is completed - this->GetDeviceAPI(ctx_from)->StreamSync(ctx_from, nullptr); + this->GetDeviceAPI(dev_from)->StreamSync(dev_from, nullptr); } void LocalSession::FreeHandle(void* handle, int type_code) { @@ -129,8 +129,8 @@ void LocalSession::FreeHandle(void* handle, int type_code) { TVMRetValue rv = TVMRetValue::MoveFromCHost(value, type_code); } -DeviceAPI* LocalSession::GetDeviceAPI(TVMContext ctx, bool allow_missing) { - return DeviceAPI::Get(ctx, allow_missing); +DeviceAPI* LocalSession::GetDeviceAPI(Device dev, bool allow_missing) { + return DeviceAPI::Get(dev, allow_missing); } TVM_REGISTER_GLOBAL("rpc.LocalSession").set_body_typed([]() { diff --git a/src/runtime/rpc/rpc_local_session.h b/src/runtime/rpc/rpc_local_session.h index ea070e34bd35..d1b54d5be65b 100644 --- a/src/runtime/rpc/rpc_local_session.h +++ b/src/runtime/rpc/rpc_local_session.h @@ -54,7 +54,7 @@ class LocalSession : public RPCSession { void FreeHandle(void* handle, int type_code) override; - DeviceAPI* GetDeviceAPI(TVMContext ctx, bool allow_missing = false) override; + DeviceAPI* GetDeviceAPI(Device dev, bool allow_missing = false) override; bool IsLocalSession() const override { return true; } diff --git a/src/runtime/rpc/rpc_module.cc b/src/runtime/rpc/rpc_module.cc index 4f721e122a4c..7db84862604f 100644 --- a/src/runtime/rpc/rpc_module.cc +++ b/src/runtime/rpc/rpc_module.cc @@ -23,6 +23,7 @@ */ #include #include +#include #include #include @@ -55,21 +56,21 @@ static void RemoteNDArrayDeleter(Object* obj) { * underlying DLTensor. * \param template_tensor An empty DLTensor whose shape and dtype fields are used to fill the newly * created array. Needed because it's difficult to pass a shape vector as a PackedFunc arg. - * \param ctx Remote context used with this tensor. Must have non-zero RPCSessMask. + * \param dev Remote device used with this tensor. Must have non-zero RPCSessMask. 
* \param remote_ndarray_handle The handle returned by RPC server to identify the NDArray. */ NDArray NDArrayFromRemoteOpaqueHandle(std::shared_ptr sess, void* handle, - DLTensor* template_tensor, TVMContext ctx, + DLTensor* template_tensor, Device dev, void* remote_ndarray_handle) { - ICHECK_EQ(sess->table_index(), GetRPCSessionIndex(ctx)) - << "The TVMContext given does not belong to the given session"; + ICHECK_EQ(sess->table_index(), GetRPCSessionIndex(dev)) + << "The Device given does not belong to the given session"; RemoteSpace* space = new RemoteSpace(); space->sess = sess; space->data = handle; std::vector shape_vec{template_tensor->shape, template_tensor->shape + template_tensor->ndim}; NDArray::Container* data = new NDArray::Container(static_cast(space), std::move(shape_vec), - template_tensor->dtype, ctx); + template_tensor->dtype, dev); data->manager_ctx = remote_ndarray_handle; data->SetDeleter(RemoteNDArrayDeleter); return NDArray(GetObjectPtr(data)); @@ -105,14 +106,14 @@ class RPCWrappedFunc : public Object { type_codes[i] = kTVMDLTensorHandle; // translate to a remote view of DLTensor auto dptr = std::make_unique(*static_cast(values[i].v_handle)); - dptr->ctx = RemoveSessMask(dptr->ctx); + dptr->device = RemoveSessMask(dptr->device); dptr->data = static_cast(dptr->data)->data; values[i].v_handle = dptr.get(); temp_dltensors.emplace_back(std::move(dptr)); break; } - case kTVMContext: { - values[i].v_ctx = RemoveSessMask(values[i].v_ctx); + case kDLDevice: { + values[i].v_device = RemoveSessMask(values[i].v_device); break; } case kTVMPackedFuncHandle: @@ -129,7 +130,7 @@ class RPCWrappedFunc : public Object { ~RPCWrappedFunc() { try { sess_->FreeHandle(handle_, kTVMPackedFuncHandle); - } catch (const dmlc::Error& e) { + } catch (const Error& e) { // fault tolerance to remote close } } @@ -146,11 +147,11 @@ class RPCWrappedFunc : public Object { void WrapRemoteReturnToValue(TVMArgs args, TVMRetValue* rv) const; // remove a remote session mask - TVMContext RemoveSessMask(TVMContext ctx) const { - ICHECK(IsRPCSessionContext(ctx)) << "Can not pass in local context"; - ICHECK_EQ(GetRPCSessionIndex(ctx), sess_->table_index()) - << "Can not pass in context with a different remote session"; - return RemoveRPCSessionMask(ctx); + Device RemoveSessMask(Device dev) const { + ICHECK(IsRPCSessionDevice(dev)) << "Can not pass in local device"; + ICHECK_EQ(GetRPCSessionIndex(dev), sess_->table_index()) + << "Can not pass in device with a different remote session"; + return RemoveRPCSessionMask(dev); } }; @@ -164,7 +165,7 @@ class RPCModuleNode final : public ModuleNode { if (module_handle_ != nullptr) { try { sess_->FreeHandle(module_handle_, kTVMModuleHandle); - } catch (const dmlc::Error& e) { + } catch (const Error& e) { // fault tolerance to remote close } module_handle_ = nullptr; @@ -187,21 +188,21 @@ class RPCModuleNode final : public ModuleNode { return ""; } - PackedFunc GetTimeEvaluator(const std::string& name, TVMContext ctx, int number, int repeat, + PackedFunc GetTimeEvaluator(const std::string& name, Device dev, int number, int repeat, int min_repeat_ms, const std::string& f_preproc_name) { InitRemoteFunc(&remote_get_time_evaluator_, "runtime.RPCTimeEvaluator"); - // Remove session mask because we pass ctx by parts. - ICHECK_EQ(GetRPCSessionIndex(ctx), sess_->table_index()) - << "ValueError: Need to pass the matched remote context to RPCModule.GetTimeEvaluator"; - ctx = RemoveRPCSessionMask(ctx); + // Remove session mask because we pass dev by parts. 
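    // Layout note: an RPC-masked device stores the session table index
    // in the high bits of device_type (see AddRPCSessionMask /
    // RemoveRPCSessionMask), so the check below rejects devices from a
    // different session before the unmasked device_type and device_id
    // ints are sent separately over the wire.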
+ ICHECK_EQ(GetRPCSessionIndex(dev), sess_->table_index()) + << "ValueError: Need to pass the matched remote device to RPCModule.GetTimeEvaluator"; + dev = RemoveRPCSessionMask(dev); if (module_handle_ != nullptr) { return remote_get_time_evaluator_(GetRef(this), name, - static_cast(ctx.device_type), ctx.device_id, number, + static_cast(dev.device_type), dev.device_id, number, repeat, min_repeat_ms, f_preproc_name); } else { return remote_get_time_evaluator_(Optional(nullptr), name, - static_cast(ctx.device_type), ctx.device_id, number, + static_cast(dev.device_type), dev.device_id, number, repeat, min_repeat_ms, f_preproc_name); } } @@ -285,7 +286,7 @@ void RPCWrappedFunc::WrapRemoteReturnToValue(TVMArgs args, TVMRetValue* rv) cons DLTensor* tensor = args[1]; void* nd_handle = args[2]; *rv = NDArrayFromRemoteOpaqueHandle(sess_, tensor->data, tensor, - AddRPCSessionMask(tensor->ctx, sess_->table_index()), + AddRPCSessionMask(tensor->device, sess_->table_index()), nd_handle); } else { ICHECK_EQ(args.size(), 2); @@ -341,31 +342,29 @@ inline void CPUCacheFlush(int begin_index, const TVMArgs& args) { } } -PackedFunc WrapTimeEvaluator(PackedFunc pf, TVMContext ctx, int number, int repeat, - int min_repeat_ms, PackedFunc f_preproc) { +PackedFunc WrapTimeEvaluator(PackedFunc pf, Device dev, int number, int repeat, int min_repeat_ms, + PackedFunc f_preproc) { ICHECK(pf != nullptr); - if (static_cast(ctx.device_type) == static_cast(kDLMicroDev)) { + if (static_cast(dev.device_type) == static_cast(kDLMicroDev)) { auto get_micro_time_evaluator = runtime::Registry::Get("micro._GetMicroTimeEvaluator"); ICHECK(get_micro_time_evaluator != nullptr) << "micro backend not enabled"; - return (*get_micro_time_evaluator)(pf, ctx, number, repeat); + return (*get_micro_time_evaluator)(pf, dev, number, repeat); } - auto ftimer = [pf, ctx, number, repeat, min_repeat_ms, f_preproc](TVMArgs args, + auto ftimer = [pf, dev, number, repeat, min_repeat_ms, f_preproc](TVMArgs args, TVMRetValue* rv) mutable { TVMRetValue temp; std::ostringstream os; // skip first time call, to activate lazy compilation components. 
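      // The warm-up call just below is excluded from timing. After it,
      // the loop runs `repeat` outer iterations, each timing `number`
      // back-to-back calls; `number` grows (by ~1.618x) until a timed
      // window reaches `min_repeat_ms`, amortizing timer overhead for
      // fast kernels. Each repeat's mean seconds-per-call is appended
      // to `os` as a raw double, forming the blob the evaluator returns.
      // e.g. (hypothetical numbers) number=4 on a ~2 ms kernel times an
      // ~8 ms window and records ~0.002 as that repeat's mean.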
pf.CallPacked(args, &temp); - DeviceAPI::Get(ctx)->StreamSync(ctx, nullptr); + DeviceAPI::Get(dev)->StreamSync(dev, nullptr); for (int i = 0; i < repeat; ++i) { if (f_preproc != nullptr) { f_preproc.CallPacked(args, &temp); } - std::chrono::time_point tbegin, - tend; double duration_ms = 0.0; do { @@ -374,20 +373,17 @@ PackedFunc WrapTimeEvaluator(PackedFunc pf, TVMContext ctx, int number, int repe number * 1.618)); // 1.618 is chosen by random } - tbegin = std::chrono::high_resolution_clock::now(); + Timer t = Timer::Start(dev); // start timing for (int i = 0; i < number; ++i) { pf.CallPacked(args, &temp); } - DeviceAPI::Get(ctx)->StreamSync(ctx, nullptr); - tend = std::chrono::high_resolution_clock::now(); - - duration_ms = - std::chrono::duration_cast>(tend - tbegin).count() * 1000; + t->Stop(); + int64_t t_nanos = t->SyncAndGetElapsedNanos(); + duration_ms = t_nanos / 1e6; } while (duration_ms < min_repeat_ms); - double speed = - std::chrono::duration_cast>(tend - tbegin).count() / number; + double speed = duration_ms / 1e3 / number; os.write(reinterpret_cast(&speed), sizeof(speed)); } @@ -404,15 +400,15 @@ PackedFunc WrapTimeEvaluator(PackedFunc pf, TVMContext ctx, int number, int repe TVM_REGISTER_GLOBAL("runtime.RPCTimeEvaluator") .set_body_typed([](Optional opt_mod, std::string name, int device_type, int device_id, int number, int repeat, int min_repeat_ms, std::string f_preproc_name) { - TVMContext ctx; - ctx.device_type = static_cast(device_type); - ctx.device_id = device_id; + Device dev; + dev.device_type = static_cast(device_type); + dev.device_id = device_id; if (opt_mod.defined()) { Module m = opt_mod.value(); std::string tkey = m->type_key(); if (tkey == "rpc") { return static_cast(m.operator->()) - ->GetTimeEvaluator(name, ctx, number, repeat, min_repeat_ms, f_preproc_name); + ->GetTimeEvaluator(name, dev, number, repeat, min_repeat_ms, f_preproc_name); } else { PackedFunc f_preproc; if (!f_preproc_name.empty()) { @@ -421,7 +417,7 @@ TVM_REGISTER_GLOBAL("runtime.RPCTimeEvaluator") << "Cannot find " << f_preproc_name << " in the global function"; f_preproc = *pf_preproc; } - return WrapTimeEvaluator(m.GetFunction(name, false), ctx, number, repeat, min_repeat_ms, + return WrapTimeEvaluator(m.GetFunction(name, false), dev, number, repeat, min_repeat_ms, f_preproc); } } else { @@ -434,7 +430,7 @@ TVM_REGISTER_GLOBAL("runtime.RPCTimeEvaluator") << "Cannot find " << f_preproc_name << " in the global function"; f_preproc = *pf_preproc; } - return WrapTimeEvaluator(*pf, ctx, number, repeat, min_repeat_ms, f_preproc); + return WrapTimeEvaluator(*pf, dev, number, repeat, min_repeat_ms, f_preproc); } }); @@ -473,10 +469,10 @@ TVM_REGISTER_GLOBAL("rpc.SessTableIndex").set_body([](TVMArgs args, TVMRetValue* }); TVM_REGISTER_GLOBAL("tvm.rpc.NDArrayFromRemoteOpaqueHandle") - .set_body_typed([](Module mod, void* remote_array, DLTensor* template_tensor, TVMContext ctx, + .set_body_typed([](Module mod, void* remote_array, DLTensor* template_tensor, Device dev, void* ndarray_handle) -> NDArray { return NDArrayFromRemoteOpaqueHandle(RPCModuleGetSession(mod), remote_array, template_tensor, - ctx, ndarray_handle); + dev, ndarray_handle); }); } // namespace runtime diff --git a/src/runtime/rpc/rpc_session.cc b/src/runtime/rpc/rpc_session.cc index 0ac5b8dc74ef..df4f1ce42998 100644 --- a/src/runtime/rpc/rpc_session.cc +++ b/src/runtime/rpc/rpc_session.cc @@ -46,7 +46,7 @@ void RPCSession::AsyncCallFunc(PackedFuncHandle func, const TVMValue* arg_values try { this->CallFunc(func, arg_values, 
arg_type_codes, num_args, [&callback](TVMArgs args) { callback(RPCCode::kReturn, args); }); - } catch (const std::runtime_error& e) { + } catch (const std::exception& e) { this->SendException(callback, e.what()); } } @@ -60,7 +60,7 @@ void RPCSession::AsyncCopyToRemote(void* local_from_bytes, DLTensor* remote_to, try { this->CopyToRemote(local_from_bytes, remote_to, nbytes); callback(RPCCode::kReturn, TVMArgs(&value, &tcode, 1)); - } catch (const std::runtime_error& e) { + } catch (const std::exception& e) { this->SendException(callback, e.what()); } } @@ -74,21 +74,21 @@ void RPCSession::AsyncCopyFromRemote(DLTensor* remote_from, void* local_to_bytes try { this->CopyFromRemote(remote_from, local_to_bytes, nbytes); callback(RPCCode::kReturn, TVMArgs(&value, &tcode, 1)); - } catch (const std::runtime_error& e) { + } catch (const std::exception& e) { this->SendException(callback, e.what()); } } -void RPCSession::AsyncStreamWait(TVMContext ctx, TVMStreamHandle stream, +void RPCSession::AsyncStreamWait(Device dev, TVMStreamHandle stream, RPCSession::FAsyncCallback callback) { TVMValue value; int32_t tcode = kTVMNullptr; value.v_handle = nullptr; try { - this->GetDeviceAPI(ctx)->StreamSync(ctx, stream); + this->GetDeviceAPI(dev)->StreamSync(dev, stream); callback(RPCCode::kReturn, TVMArgs(&value, &tcode, 1)); - } catch (const std::runtime_error& e) { + } catch (const std::exception& e) { this->SendException(callback, e.what()); } } diff --git a/src/runtime/rpc/rpc_session.h b/src/runtime/rpc/rpc_session.h index 4b942f2230ba..8923103157d5 100644 --- a/src/runtime/rpc/rpc_session.h +++ b/src/runtime/rpc/rpc_session.h @@ -157,12 +157,12 @@ class RPCSession { * The device API is guaranteed to be alive during the * lifetime of the Session. * - * \param ctx The remote context. + * \param dev The remote device. * \param allow_missing Whether can we return nullptr if it is not available. * * \return The device API. */ - virtual DeviceAPI* GetDeviceAPI(TVMContext ctx, bool allow_missing = false) = 0; + virtual DeviceAPI* GetDeviceAPI(Device dev, bool allow_missing = false) = 0; /*! * \brief Whether the session is a local session and we can directly @@ -234,12 +234,12 @@ class RPCSession { virtual void AsyncCopyFromRemote(DLTensor* remote_from, void* local_to_bytes, uint64_t nbytes, FAsyncCallback on_complete); /*! - * \brief Asynchrously wait for all events in ctx, stream compeletes. - * \param ctx The device context. + * \brief Asynchrously wait for all events in dev, stream compeletes. + * \param dev The device. * \param stream The stream to wait on. * \param on_complete The callback to signal copy complete. */ - virtual void AsyncStreamWait(TVMContext ctx, TVMStreamHandle stream, FAsyncCallback on_compelte); + virtual void AsyncStreamWait(Device dev, TVMStreamHandle stream, FAsyncCallback on_compelte); /*! * \return The session table index of the session. @@ -272,7 +272,7 @@ class RPCSession { /*! * \brief Remote space handle cell used by the RPC runtime API. * - * When we allocate space using a rpc context, the data pointer + * When we allocate space using a rpc device, the data pointer * points to an allocated RemoteSpace. */ struct RemoteSpace { @@ -285,7 +285,7 @@ struct RemoteSpace { /*! * \brief Wrap a timer function to measure the time cost of a given packed function. * \param f The function argument. - * \param ctx The context. + * \param dev The device. * \param number The number of times to run this function for taking average. * We call these runs as one `repeat` of measurement. 
* \param repeat The number of times to repeat the measurement. @@ -302,8 +302,8 @@ struct RemoteSpace { * \param f_preproc The function to be executed before we excetute time evaluator. * \return f_timer A timer function. */ -PackedFunc WrapTimeEvaluator(PackedFunc f, TVMContext ctx, int number, int repeat, - int min_repeat_ms, PackedFunc f_preproc = nullptr); +PackedFunc WrapTimeEvaluator(PackedFunc f, Device dev, int number, int repeat, int min_repeat_ms, + PackedFunc f_preproc = nullptr); /*! * \brief Create a Global RPC module that refers to the session. diff --git a/src/runtime/runtime_base.h b/src/runtime/runtime_base.h index 21601df1ad39..7abb32935a2b 100644 --- a/src/runtime/runtime_base.h +++ b/src/runtime/runtime_base.h @@ -34,7 +34,7 @@ and finishes with API_END() or API_END_HANDLE_ERROR */ #define API_END() \ } \ - catch (std::runtime_error & _except_) { \ + catch (std::exception & _except_) { \ return TVMAPIHandleException(_except_); \ } \ return 0; // NOLINT(*) @@ -45,7 +45,7 @@ */ #define API_END_HANDLE_ERROR(Finalize) \ } \ - catch (std::runtime_error & _except_) { \ + catch (std::exception & _except_) { \ Finalize; \ return TVMAPIHandleException(_except_); \ } \ @@ -56,6 +56,6 @@ * \param e the exception * \return the return value of API after exception is handled */ -int TVMAPIHandleException(const std::runtime_error& e); +int TVMAPIHandleException(const std::exception& e); #endif // TVM_RUNTIME_RUNTIME_BASE_H_ diff --git a/src/runtime/stackvm/stackvm.cc b/src/runtime/stackvm/stackvm.cc index 4a5211e9c829..808dc4063c8b 100644 --- a/src/runtime/stackvm/stackvm.cc +++ b/src/runtime/stackvm/stackvm.cc @@ -478,11 +478,11 @@ void StackVM::Run(State* s) const { break; } case StackVM::kArrDeviceId: { - stack[sp].v_int64 = arr[index].ctx.device_id; + stack[sp].v_int64 = arr[index].device.device_id; break; } case StackVM::kArrDeviceType: { - stack[sp].v_int64 = static_cast(arr[index].ctx.device_type); + stack[sp].v_int64 = static_cast(arr[index].device.device_type); break; } case StackVM::kArrAddr: { @@ -537,11 +537,11 @@ void StackVM::Run(State* s) const { break; } case StackVM::kArrDeviceId: { - arr[index].ctx.device_id = static_cast(stack[sp].v_int64); + arr[index].device.device_id = static_cast(stack[sp].v_int64); break; } case StackVM::kArrDeviceType: { - arr[index].ctx.device_type = static_cast(stack[sp].v_int64); + arr[index].device.device_type = static_cast(stack[sp].v_int64); break; } case StackVM::kTVMValueContent: { diff --git a/src/runtime/thread_pool.cc b/src/runtime/thread_pool.cc index 5f5a811c2d30..cab04ec0db4a 100644 --- a/src/runtime/thread_pool.cc +++ b/src/runtime/thread_pool.cc @@ -24,10 +24,10 @@ #include #include #include +#include #include #include #include -#include #if TVM_THREADPOOL_USE_OPENMP #include #endif diff --git a/src/runtime/threading_backend.cc b/src/runtime/threading_backend.cc index 2527f4799086..7f9cfaa8730c 100644 --- a/src/runtime/threading_backend.cc +++ b/src/runtime/threading_backend.cc @@ -21,8 +21,8 @@ * \file threading_backend.cc * \brief Native threading backend */ +#include #include -#include #include #include diff --git a/src/runtime/vm/bytecode.cc b/src/runtime/vm/bytecode.cc index f82d708468f7..09b928fa1e39 100644 --- a/src/runtime/vm/bytecode.cc +++ b/src/runtime/vm/bytecode.cc @@ -22,8 +22,8 @@ * \brief The bytecode for Relay virtual machine. 
*/ +#include #include -#include #include diff --git a/src/runtime/vm/memory_manager.cc b/src/runtime/vm/memory_manager.cc index 960b2e20145a..fa5edf18335c 100644 --- a/src/runtime/vm/memory_manager.cc +++ b/src/runtime/vm/memory_manager.cc @@ -37,7 +37,7 @@ static void BufferDeleter(Object* obj) { auto* ptr = static_cast(obj); ICHECK(ptr->manager_ctx != nullptr); Buffer* buffer = reinterpret_cast(ptr->manager_ctx); - MemoryManager::GetAllocator(buffer->ctx)->Free(*(buffer)); + MemoryManager::GetAllocator(buffer->device)->Free(*(buffer)); delete buffer; delete ptr; } @@ -80,7 +80,8 @@ NDArray StorageObj::AllocNDArray(size_t offset, std::vector shape, DLDa VerifyDataType(dtype); // crtical zone: allocate header, cannot throw - NDArray::Container* container = new NDArray::Container(nullptr, shape, dtype, this->buffer.ctx); + NDArray::Container* container = + new NDArray::Container(nullptr, shape, dtype, this->buffer.device); container->SetDeleter(StorageObj::Deleter); size_t needed_size = GetDataSize(container->dl_tensor); @@ -116,54 +117,54 @@ MemoryManager* MemoryManager::Global() { return inst; } -Allocator* MemoryManager::GetOrCreateAllocator(TVMContext ctx, AllocatorType type) { +Allocator* MemoryManager::GetOrCreateAllocator(Device dev, AllocatorType type) { MemoryManager* m = MemoryManager::Global(); std::lock_guard lock(m->mu_); - if (m->allocators_.find(ctx) == m->allocators_.end()) { + if (m->allocators_.find(dev) == m->allocators_.end()) { std::unique_ptr alloc; switch (type) { case kNaive: { - DLOG(INFO) << "New naive allocator for " << DeviceName(ctx.device_type) << "(" - << ctx.device_id << ")"; - alloc.reset(new NaiveAllocator(ctx)); + DLOG(INFO) << "New naive allocator for " << DeviceName(dev.device_type) << "(" + << dev.device_id << ")"; + alloc.reset(new NaiveAllocator(dev)); break; } case kPooled: { - DLOG(INFO) << "New pooled allocator for " << DeviceName(ctx.device_type) << "(" - << ctx.device_id << ")"; - alloc.reset(new PooledAllocator(ctx)); + DLOG(INFO) << "New pooled allocator for " << DeviceName(dev.device_type) << "(" + << dev.device_id << ")"; + alloc.reset(new PooledAllocator(dev)); break; } default: LOG(FATAL) << "Unknown allocator type: " << type; } auto ret = alloc.get(); - m->allocators_.emplace(ctx, std::move(alloc)); + m->allocators_.emplace(dev, std::move(alloc)); return ret; } - auto alloc = m->allocators_.at(ctx).get(); + auto alloc = m->allocators_.at(dev).get(); if (alloc->type() != type) { - LOG(WARNING) << "The type of existing allocator for " << DeviceName(ctx.device_type) << "(" - << ctx.device_id << ") is different from the request type (" << alloc->type() + LOG(WARNING) << "The type of existing allocator for " << DeviceName(dev.device_type) << "(" + << dev.device_id << ") is different from the request type (" << alloc->type() << " vs " << type << ")"; } return alloc; } -Allocator* MemoryManager::GetAllocator(TVMContext ctx) { +Allocator* MemoryManager::GetAllocator(Device dev) { MemoryManager* m = MemoryManager::Global(); std::lock_guard lock(m->mu_); - auto it = m->allocators_.find(ctx); + auto it = m->allocators_.find(dev); if (it == m->allocators_.end()) { - LOG(FATAL) << "Allocator for " << DeviceName(ctx.device_type) << "(" << ctx.device_id + LOG(FATAL) << "Allocator for " << DeviceName(dev.device_type) << "(" << dev.device_id << ") has not been created yet."; } return it->second.get(); } -NDArray Allocator::Empty(std::vector shape, DLDataType dtype, DLContext ctx) { +NDArray Allocator::Empty(std::vector shape, DLDataType dtype, DLDevice 
dev) { VerifyDataType(dtype); - NDArray::Container* container = new NDArray::Container(nullptr, shape, dtype, ctx); + NDArray::Container* container = new NDArray::Container(nullptr, shape, dtype, dev); container->SetDeleter(BufferDeleter); size_t size = GetDataSize(container->dl_tensor); size_t alignment = GetDataAlignment(container->dl_tensor); diff --git a/src/runtime/vm/naive_allocator.h b/src/runtime/vm/naive_allocator.h index 301acf81a9c9..9fce66f60669 100644 --- a/src/runtime/vm/naive_allocator.h +++ b/src/runtime/vm/naive_allocator.h @@ -34,20 +34,20 @@ namespace vm { class NaiveAllocator final : public Allocator { public: - explicit NaiveAllocator(TVMContext ctx) : Allocator(kNaive), used_memory_(0), ctx_(ctx) {} + explicit NaiveAllocator(Device dev) : Allocator(kNaive), used_memory_(0), device_(dev) {} Buffer Alloc(size_t nbytes, size_t alignment, DLDataType type_hint) override { Buffer buf; - buf.ctx = ctx_; + buf.device = device_; buf.size = nbytes; - buf.data = DeviceAPI::Get(ctx_)->AllocDataSpace(ctx_, nbytes, alignment, type_hint); + buf.data = DeviceAPI::Get(device_)->AllocDataSpace(device_, nbytes, alignment, type_hint); used_memory_.fetch_add(nbytes, std::memory_order_relaxed); DLOG(INFO) << "allocate " << nbytes << " B, used memory " << used_memory_ << " B"; return buf; } void Free(const Buffer& buffer) override { - DeviceAPI::Get(ctx_)->FreeDataSpace(buffer.ctx, buffer.data); + DeviceAPI::Get(device_)->FreeDataSpace(buffer.device, buffer.data); used_memory_.fetch_sub(buffer.size, std::memory_order_relaxed); DLOG(INFO) << "free " << buffer.size << " B, used memory " << used_memory_ << " B"; } @@ -56,7 +56,7 @@ class NaiveAllocator final : public Allocator { private: std::atomic used_memory_; - TVMContext ctx_; + Device device_; }; } // namespace vm diff --git a/src/runtime/vm/pooled_allocator.h b/src/runtime/vm/pooled_allocator.h index 4226ef74daa4..bb088c5653f2 100644 --- a/src/runtime/vm/pooled_allocator.h +++ b/src/runtime/vm/pooled_allocator.h @@ -39,8 +39,8 @@ class PooledAllocator final : public Allocator { public: static constexpr size_t kDefaultPageSize = 4096; - explicit PooledAllocator(TVMContext ctx, size_t page_size = kDefaultPageSize) - : Allocator(kPooled), page_size_(page_size), used_memory_(0), ctx_(ctx) {} + explicit PooledAllocator(Device dev, size_t page_size = kDefaultPageSize) + : Allocator(kPooled), page_size_(page_size), used_memory_(0), device_(dev) {} ~PooledAllocator() { ReleaseAll(); } @@ -55,9 +55,9 @@ class PooledAllocator final : public Allocator { return ret; } Buffer buf; - buf.ctx = ctx_; + buf.device = device_; buf.size = size; - buf.data = DeviceAPI::Get(ctx_)->AllocDataSpace(ctx_, size, alignment, type_hint); + buf.data = DeviceAPI::Get(device_)->AllocDataSpace(device_, size, alignment, type_hint); used_memory_.fetch_add(size, std::memory_order_relaxed); DLOG(INFO) << "allocate " << size << " B, used memory " << used_memory_ << " B"; return buf; @@ -80,7 +80,7 @@ class PooledAllocator final : public Allocator { for (auto const& it : memory_pool_) { auto const& pool = it.second; for (auto const& buf : pool) { - DeviceAPI::Get(buf.ctx)->FreeDataSpace(buf.ctx, buf.data); + DeviceAPI::Get(buf.device)->FreeDataSpace(buf.device, buf.data); } } memory_pool_.clear(); @@ -93,7 +93,7 @@ class PooledAllocator final : public Allocator { std::atomic used_memory_; std::unordered_map > memory_pool_; std::mutex mu_; - TVMContext ctx_; + Device device_; }; } // namespace vm diff --git a/src/runtime/vm/profiler/vm.cc b/src/runtime/vm/profiler/vm.cc index 
fc01a754ca50..3f3dee437cb3 100644 --- a/src/runtime/vm/profiler/vm.cc +++ b/src/runtime/vm/profiler/vm.cc @@ -113,9 +113,8 @@ void VirtualMachineDebug::LoadExecutable(const Executable* exec) { void VirtualMachineDebug::InvokePacked(Index packed_index, const PackedFunc& func, Index arg_count, Index output_size, const std::vector& args) { ICHECK(exec_); - ICHECK(!ctxs_.empty()) << "Context has not been initialized yet."; - // The device context of any input of the operator is used for - // synchronization. + ICHECK(!devices_.empty()) << "Device has not been initialized yet."; + // The device of any input of the operator is used for synchronization. ICHECK_GT(arg_count, 0U); ObjectRef arg = args[0]; while (arg->IsInstance()) { @@ -124,9 +123,9 @@ void VirtualMachineDebug::InvokePacked(Index packed_index, const PackedFunc& fun } ICHECK(arg->IsInstance()); auto nd_array = Downcast(arg); - auto ctx = nd_array->ctx; + auto dev = nd_array->device; - Timer t = Timer::Start(ctx); + Timer t = Timer::Start(dev); VirtualMachine::InvokePacked(packed_index, func, arg_count, output_size, args); t->Stop(); diff --git a/src/runtime/vm/vm.cc b/src/runtime/vm/vm.cc index 6d121aa67733..ee06da83bd92 100644 --- a/src/runtime/vm/vm.cc +++ b/src/runtime/vm/vm.cc @@ -24,10 +24,10 @@ #include #include +#include #include #include #include -#include #include #include @@ -64,11 +64,11 @@ std::ostream& operator<<(std::ostream& os, const VMFunction& vm_func) { return os; } -inline ObjectRef CopyTo(ObjectRef src, const DLContext& ctx) { +inline ObjectRef CopyTo(ObjectRef src, const DLDevice& dev) { if (src->IsInstance()) { auto nd_array = Downcast(src); - if (nd_array->ctx.device_type != ctx.device_type) { - return nd_array.CopyTo(ctx); + if (nd_array->device.device_type != dev.device_type) { + return nd_array.CopyTo(dev); } return src; } else { @@ -77,7 +77,7 @@ inline ObjectRef CopyTo(ObjectRef src, const DLContext& ctx) { std::vector ret; ADT adt = Downcast(src); for (size_t i = 0; i < adt.size(); i++) { - ret.push_back(CopyTo(adt[i], ctx)); + ret.push_back(CopyTo(adt[i], dev)); } return ADT(adt->tag, ret.begin(), ret.end()); } @@ -135,18 +135,18 @@ PackedFunc VirtualMachine::GetFunction(const std::string& name, } else if (name == "init") { return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { ICHECK_EQ(args.size() % 3, 0); - std::vector contexts; + std::vector devices; std::vector alloc_types; for (int i = 0; i < args.size() / 3; ++i) { - TVMContext ctx; + Device dev; int device_type = args[i * 3]; - ctx.device_type = DLDeviceType(device_type); - ctx.device_id = args[i * 3 + 1]; + dev.device_type = DLDeviceType(device_type); + dev.device_id = args[i * 3 + 1]; int type = args[i * 3 + 2]; - contexts.push_back(ctx); + devices.push_back(dev); alloc_types.push_back(AllocatorType(type)); } - this->Init(contexts, alloc_types); + this->Init(devices, alloc_types); }); } else if (name == "set_input") { return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { @@ -164,8 +164,8 @@ PackedFunc VirtualMachine::GetFunction(const std::string& name, std::vector func_args(param_names.size()); for (int i = 1; i < args.size(); ++i) { Index device_type = vm_func.params_device_type[i - 1]; - DLContext ctx = GetContext(device_type); - ObjectRef obj = CopyTo(args[i], ctx); + Device dev = GetDevice(device_type); + ObjectRef obj = CopyTo(args[i], dev); func_args[i - 1] = obj; } inputs_.erase(func_name); @@ -177,13 +177,13 @@ PackedFunc VirtualMachine::GetFunction(const std::string& name, } } -inline TVMContext 
VirtualMachine::GetContext(Index device_type) const { - ICHECK_GE(ctxs_.size(), device_type) << "ctxs_ list doesn't contain device:" << device_type; +inline Device VirtualMachine::GetDevice(Index device_type) const { + ICHECK_GE(devices_.size(), device_type) << "devices_ doesn't contain device:" << device_type; - auto ctx = ctxs_[device_type]; - ICHECK_EQ(static_cast(ctx.device_type), device_type) - << "device type " << device_type << " has not been initialized int the context list."; - return ctx; + auto dev = devices_[device_type]; + ICHECK_EQ(static_cast(dev.device_type), device_type) + << "device type " << device_type << " has not been initialized in the device list."; + return dev; } void VirtualMachine::PushFrame(Index arg_count, Index ret_pc, const VMFunction& vm_func) { @@ -301,18 +301,18 @@ void VirtualMachine::LoadExecutable(const Executable* exec) { } } -void VirtualMachine::Init(const std::vector& ctxs, +void VirtualMachine::Init(const std::vector& devs, const std::vector& alloc_types) { - ICHECK_EQ(ctxs.size(), alloc_types.size()); - // Cache the context - for (size_t i = 0; i < ctxs.size(); i++) { - auto dev_type = static_cast(ctxs[i].device_type); - auto alloc = MemoryManager::GetOrCreateAllocator(ctxs[i], alloc_types[i]); - if (ctxs_.size() <= dev_type) { - ctxs_.resize(dev_type + 1); + ICHECK_EQ(devs.size(), alloc_types.size()); + // Cache the device + for (size_t i = 0; i < devs.size(); i++) { + auto dev_type = static_cast(devs[i].device_type); + auto alloc = MemoryManager::GetOrCreateAllocator(devs[i], alloc_types[i]); + if (devices_.size() <= dev_type) { + devices_.resize(dev_type + 1); allocators_.resize(dev_type + 1); } - ctxs_[dev_type] = ctxs[i]; + devices_[dev_type] = devs[i]; allocators_[dev_type] = alloc; } } @@ -388,8 +388,8 @@ void VirtualMachine::RunLoop() { } if (!const_pool_[instr.const_index].defined()) { - TVMContext ctx = GetContext(exec_->const_device_type[instr.const_index]); - const_pool_[instr.const_index] = CopyTo(constant_obj, ctx); + Device dev = GetDevice(exec_->const_device_type[instr.const_index]); + const_pool_[instr.const_index] = CopyTo(constant_obj, dev); } WriteRegister(instr.dst, const_pool_[instr.const_index]); pc_++; @@ -497,9 +497,9 @@ void VirtualMachine::RunLoop() { goto main_loop; } case Opcode::AllocTensorReg: { - DLContext cpu_ctx = GetContext(static_cast(kDLCPU)); + Device cpu_dev = GetDevice(static_cast(kDLCPU)); auto shape_obj = ReadRegister(instr.alloc_tensor_reg.shape_register); - NDArray shape_tensor = Downcast(CopyTo(shape_obj, cpu_ctx)); + NDArray shape_tensor = Downcast(CopyTo(shape_obj, cpu_dev)); auto shape = ToShape(shape_tensor); auto storage_obj = ReadRegister(instr.alloc_tensor_reg.storage); auto storage = Downcast(storage_obj); @@ -542,7 +542,7 @@ void VirtualMachine::RunLoop() { ICHECK_LT(static_cast(dev_type), allocators_.size()) << "Memory allocator for device " << dev_type << " has not been initialized"; auto* alloc = allocators_[dev_type]; - ICHECK(alloc) << "Did you forget to init the VirtualMachine with contexts?"; + ICHECK(alloc) << "Did you forget to init the VirtualMachine with devices?"; storage_obj->buffer = alloc->Alloc(size, alignment, instr.alloc_storage.dtype_hint); Storage storage(storage_obj); WriteRegister(instr.dst, storage); @@ -577,12 +577,12 @@ void VirtualMachine::RunLoop() { } } case Opcode::ReshapeTensor: { - DLContext cpu_ctx = GetContext(static_cast(kDLCPU)); + Device cpu_dev = GetDevice(static_cast(kDLCPU)); auto tensor_obj = ReadRegister(instr.reshape_tensor.tensor); NDArray tensor_arr 
= Downcast(tensor_obj); // Read the shape from shape tensor auto shape_obj = ReadRegister(instr.reshape_tensor.newshape); - NDArray shape_tensor = Downcast(CopyTo(shape_obj, cpu_ctx)); + NDArray shape_tensor = Downcast(CopyTo(shape_obj, cpu_dev)); const DLTensor* dl_tensor = shape_tensor.operator->(); ICHECK_EQ(dl_tensor->dtype.code, 0u); ICHECK_EQ(dl_tensor->dtype.bits, 64); @@ -598,14 +598,14 @@ void VirtualMachine::RunLoop() { case Opcode::DeviceCopy: { auto tensor_src = ReadRegister(instr.src); NDArray src_data = Downcast(tensor_src); - DLContext src_ctx = src_data->ctx; - ICHECK_EQ(static_cast(src_ctx.device_type), instr.src_device_type); + Device src_dev = src_data->device; + ICHECK_EQ(static_cast(src_dev.device_type), instr.src_device_type); - DLContext dst_ctx; - dst_ctx.device_type = static_cast(instr.dst_device_type); - dst_ctx.device_id = 0; + Device dst_dev; + dst_dev.device_type = static_cast(instr.dst_device_type); + dst_dev.device_id = 0; - NDArray dst_data = src_data.CopyTo(dst_ctx); + NDArray dst_data = src_data.CopyTo(dst_dev); WriteRegister(instr.dst, dst_data); pc_++; goto main_loop; diff --git a/src/runtime/vulkan/vulkan.cc b/src/runtime/vulkan/vulkan.cc index 794f3c570f96..5cd4812f41c4 100644 --- a/src/runtime/vulkan/vulkan.cc +++ b/src/runtime/vulkan/vulkan.cc @@ -76,7 +76,7 @@ class VulkanThreadEntry { } } - TVMContext ctx; + Device device; std::unique_ptr pool; VulkanStream* Stream(size_t device_id); VulkanStagingBuffer* StagingBuffer(int device_id, size_t size); @@ -115,12 +115,15 @@ class VulkanDeviceAPI final : public DeviceAPI { vkDestroyInstance(instance_, nullptr); } } - void SetDevice(TVMContext ctx) final { VulkanThreadEntry::ThreadLocal()->ctx = ctx; } - void GetAttr(TVMContext ctx, DeviceAttrKind kind, TVMRetValue* rv) final; + void SetDevice(Device dev) final { VulkanThreadEntry::ThreadLocal()->device = dev; } + void GetAttr(Device dev, DeviceAttrKind kind, TVMRetValue* rv) final; std::vector GetComputeQueueFamilies(VkPhysicalDevice phy_dev); - void* AllocDataSpace(TVMContext ctx, size_t nbytes, size_t alignment, - DLDataType type_hint) final { - const auto& vctx = context(ctx.device_id); + void* AllocDataSpace(Device dev, size_t nbytes, size_t alignment, DLDataType type_hint) final { + if (nbytes == 0) { + // Vulkan seems to have issues if we return nullptr on zero size alloc + nbytes = 1; + } + const auto& vctx = context(dev.device_id); VkBufferCreateInfo info; info.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO; info.pNext = nullptr; @@ -187,12 +190,12 @@ class VulkanDeviceAPI final : public DeviceAPI { return pbuf; } - void FreeDataSpace(TVMContext ctx, void* ptr) final { + void FreeDataSpace(Device dev, void* ptr) final { // Before releasing the vkBuffer, call sync to // finish all the vulkan commands that reference the buffer. 
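    // Vulkan does no implicit lifetime tracking: destroying a VkBuffer
    // that an in-flight command buffer still references is undefined
    // behavior, so the full synchronization below is used as a
    // conservative safe point before vkDestroyBuffer/vkFreeMemory run.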
-    StreamSync(ctx, nullptr);
+    StreamSync(dev, nullptr);

-    const auto& vctx = context(ctx.device_id);
+    const auto& vctx = context(dev.device_id);
     auto* pbuf = static_cast(ptr);
     vkDestroyBuffer(vctx.device, pbuf->buffer, nullptr);
     vkFreeMemory(vctx.device, pbuf->memory, nullptr);
@@ -201,19 +204,19 @@ class VulkanDeviceAPI final : public DeviceAPI {

  protected:
   void CopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset, size_t size,
-                      TVMContext ctx_from, TVMContext ctx_to, DLDataType type_hint,
+                      Device dev_from, Device dev_to, DLDataType type_hint,
                       TVMStreamHandle stream) final {
     ICHECK(stream == nullptr);
-    TVMContext ctx = ctx_from;
-    if (ctx_from.device_type == kDLCPU) {
-      ctx = ctx_to;
+    Device dev = dev_from;
+    if (dev_from.device_type == kDLCPU) {
+      dev = dev_to;
     }

-    int from_dev_type = static_cast(ctx_from.device_type);
-    int to_dev_type = static_cast(ctx_to.device_type);
+    int from_dev_type = static_cast(dev_from.device_type);
+    int to_dev_type = static_cast(dev_to.device_type);
     if (from_dev_type == kDLVulkan && to_dev_type == kDLVulkan) {
       VulkanThreadEntry::ThreadLocal()
-          ->Stream(ctx_from.device_id)
+          ->Stream(dev_from.device_id)
           ->Launch([=](VulkanStreamState* state) {
             // 1: copy
             const auto* from_buf = static_cast(from);
@@ -224,7 +227,7 @@ class VulkanDeviceAPI final : public DeviceAPI {
             copy_info.size = size;
             vkCmdCopyBuffer(state->cmd_buffer_, from_buf->buffer, to_buf->buffer, 1, &copy_info);
             // 2: barrier(transfer-> compute|transfer)
-            ICHECK_EQ(ctx_from.device_id, ctx_to.device_id) << "Vulkan disallow cross device copy.";
+            ICHECK_EQ(dev_from.device_id, dev_to.device_id) << "Vulkan disallows cross-device copy.";
             VkMemoryBarrier barrier_info;
             barrier_info.sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER;
             barrier_info.pNext = nullptr;
@@ -240,10 +243,10 @@ class VulkanDeviceAPI final : public DeviceAPI {
     } else if (from_dev_type == kDLVulkan && to_dev_type == kDLCPU) {
       const auto* from_buf = static_cast(from);
-      const auto& vctx = context(ctx_from.device_id);
-      auto* temp = VulkanThreadEntry::ThreadLocal()->StagingBuffer(ctx_from.device_id, size);
+      const auto& vctx = context(dev_from.device_id);
+      auto* temp = VulkanThreadEntry::ThreadLocal()->StagingBuffer(dev_from.device_id, size);
       VulkanThreadEntry::ThreadLocal()
-          ->Stream(ctx_from.device_id)
+          ->Stream(dev_from.device_id)
           ->Launch([&](VulkanStreamState* state) {
             VkBufferCopy copy_info;
             copy_info.srcOffset = from_offset;
@@ -251,7 +254,7 @@ class VulkanDeviceAPI final : public DeviceAPI {
             copy_info.size = size;
             vkCmdCopyBuffer(state->cmd_buffer_, from_buf->buffer, temp->buffer, 1, &copy_info);
           });
-      VulkanThreadEntry::ThreadLocal()->Stream(ctx_from.device_id)->Synchronize();
+      VulkanThreadEntry::ThreadLocal()->Stream(dev_from.device_id)->Synchronize();
       if (!vctx.coherent_staging) {
         VkMappedMemoryRange mrange;
         mrange.sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE;
@@ -263,10 +266,10 @@ class VulkanDeviceAPI final : public DeviceAPI {
       }
       memcpy(static_cast(to) + to_offset, static_cast(temp->host_addr), size);
     } else if (from_dev_type == kDLCPU && to_dev_type == kDLVulkan) {
-      const auto& vctx = context(ctx_to.device_id);
+      const auto& vctx = context(dev_to.device_id);
       const auto* to_buf = static_cast(to);
       VulkanStagingBuffer* temp =
-          VulkanThreadEntry::ThreadLocal()->StagingBuffer(ctx_to.device_id, size);
+          VulkanThreadEntry::ThreadLocal()->StagingBuffer(dev_to.device_id, size);
       memcpy(temp->host_addr, static_cast(from) + from_offset, size);
       // host side flush if access is not coherent,
       // so writes from CPU are visible to GPU
@@ -281,7 +284,7 @@ class VulkanDeviceAPI final : public DeviceAPI {
       }
       VulkanThreadEntry::ThreadLocal()
-          ->Stream(ctx_from.device_id)
+          ->Stream(dev_from.device_id)
           ->Launch([&](VulkanStreamState* state) {
             // 0: barrier(host->transfer)
             VkMemoryBarrier barrier_info;
@@ -301,7 +304,7 @@ class VulkanDeviceAPI final : public DeviceAPI {
           });
       // TODO(tulloch): should we instead make the staging buffer a property of the
       // Stream? This would allow us to elide synchronizations here.
-      VulkanThreadEntry::ThreadLocal()->Stream(ctx_from.device_id)->Synchronize();
+      VulkanThreadEntry::ThreadLocal()->Stream(dev_from.device_id)->Synchronize();
     } else {
       LOG(FATAL) << "Expect copy from/to Vulkan or between Vulkan"
                  << ", from=" << from_dev_type << ", to=" << to_dev_type;
@@ -310,37 +313,37 @@ class VulkanDeviceAPI final : public DeviceAPI {

  public:
   // Always use the default stream
-  TVMStreamHandle CreateStream(TVMContext ctx) {
+  TVMStreamHandle CreateStream(Device dev) {
     LOG(FATAL) << "Not implemented";
     return nullptr;
   }

-  void FreeStream(TVMContext ctx, TVMStreamHandle stream) {
+  void FreeStream(Device dev, TVMStreamHandle stream) {
     LOG(FATAL) << "Not implemented";
     return;
   }

-  void SyncStreamFromTo(TVMContext ctx, TVMStreamHandle event_src, TVMStreamHandle event_dst) {
+  void SyncStreamFromTo(Device dev, TVMStreamHandle event_src, TVMStreamHandle event_dst) {
     LOG(FATAL) << "Not implemented";
     return;
   }

-  void StreamSync(TVMContext ctx, TVMStreamHandle stream) final {
+  void StreamSync(Device dev, TVMStreamHandle stream) final {
     ICHECK(stream == nullptr);
-    VulkanThreadEntry::ThreadLocal()->Stream(ctx.device_id)->Synchronize();
+    VulkanThreadEntry::ThreadLocal()->Stream(dev.device_id)->Synchronize();
   }

-  void SetStream(TVMContext ctx, TVMStreamHandle stream) final {
+  void SetStream(Device dev, TVMStreamHandle stream) final {
     LOG(FATAL) << "Not implemented";
     return;
   }

-  void* AllocWorkspace(TVMContext ctx, size_t size, DLDataType type_hint) final {
-    return VulkanThreadEntry::ThreadLocal()->pool->AllocWorkspace(ctx, size);
+  void* AllocWorkspace(Device dev, size_t size, DLDataType type_hint) final {
+    return VulkanThreadEntry::ThreadLocal()->pool->AllocWorkspace(dev, size);
   }

-  void FreeWorkspace(TVMContext ctx, void* data) final {
-    VulkanThreadEntry::ThreadLocal()->pool->FreeWorkspace(ctx, data);
+  void FreeWorkspace(Device dev, void* data) final {
+    VulkanThreadEntry::ThreadLocal()->pool->FreeWorkspace(dev, data);
   }

   static VulkanDeviceAPI* Global() {
@@ -359,36 +362,45 @@ class VulkanDeviceAPI final : public DeviceAPI {
   std::vector context_;
 };

-void VulkanDeviceAPI::GetAttr(TVMContext ctx, DeviceAttrKind kind, TVMRetValue* rv) {
-  size_t index = static_cast(ctx.device_id);
+void VulkanDeviceAPI::GetAttr(Device dev, DeviceAttrKind kind, TVMRetValue* rv) {
+  size_t index = static_cast(dev.device_id);
   if (kind == kExist) {
     *rv = static_cast(index < context_.size());
     return;
   }
   ICHECK_LT(index, context_.size()) << "Invalid device id " << index;
   const auto& vctx = context(index);
+  VkPhysicalDeviceProperties phy_prop;
+  vkGetPhysicalDeviceProperties(vctx.phy_device, &phy_prop);
+
   switch (kind) {
     case kMaxThreadsPerBlock: {
-      VkPhysicalDeviceProperties phy_prop;
-      vkGetPhysicalDeviceProperties(vctx.phy_device, &phy_prop);
       int64_t value = phy_prop.limits.maxComputeWorkGroupInvocations;
       *rv = value;
       break;
     }
     case kMaxSharedMemoryPerBlock: {
-      VkPhysicalDeviceProperties phy_prop;
-      vkGetPhysicalDeviceProperties(vctx.phy_device, &phy_prop);
       int64_t value =
phy_prop.limits.maxComputeSharedMemorySize; *rv = value; break; } case kWarpSize: { - *rv = 1; + VkPhysicalDeviceSubgroupProperties subgroup_prop; + subgroup_prop.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SUBGROUP_PROPERTIES; + subgroup_prop.pNext = NULL; + + VkPhysicalDeviceProperties2 phy_prop2; + phy_prop2.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2; + phy_prop2.pNext = &subgroup_prop; + + vkGetPhysicalDeviceProperties2(vctx.phy_device, &phy_prop2); + int64_t subgroup_size = subgroup_prop.subgroupSize; + ICHECK(subgroup_size >= 1); + + *rv = subgroup_size; break; } case kComputeVersion: { - VkPhysicalDeviceProperties phy_prop; - vkGetPhysicalDeviceProperties(vctx.phy_device, &phy_prop); int64_t value = phy_prop.apiVersion; std::ostringstream os; os << VK_VERSION_MAJOR(value) << "." << VK_VERSION_MINOR(value) << "." @@ -405,8 +417,6 @@ void VulkanDeviceAPI::GetAttr(TVMContext ctx, DeviceAttrKind kind, TVMRetValue* case kExist: break; case kMaxThreadDimensions: { - VkPhysicalDeviceProperties phy_prop; - vkGetPhysicalDeviceProperties(vctx.phy_device, &phy_prop); int64_t dims[3]; dims[0] = phy_prop.limits.maxComputeWorkGroupSize[0]; dims[1] = phy_prop.limits.maxComputeWorkGroupSize[1]; @@ -1034,8 +1044,8 @@ VulkanStagingBuffer* VulkanThreadEntry::StagingBuffer(int device_id, size_t size VulkanThreadEntry::VulkanThreadEntry() : pool(std::make_unique(static_cast(kDLVulkan), VulkanDeviceAPI::Global())) { - ctx.device_id = 0; - ctx.device_type = static_cast(kDLVulkan); + device.device_id = 0; + device.device_type = static_cast(kDLVulkan); } VulkanStream* VulkanThreadEntry::Stream(size_t device_id) { @@ -1048,7 +1058,7 @@ VulkanStream* VulkanThreadEntry::Stream(size_t device_id) { void VulkanWrappedFunc::operator()(TVMArgs args, TVMRetValue* rv, const ArgUnion64* pack_args) const { - int device_id = VulkanThreadEntry::ThreadLocal()->ctx.device_id; + int device_id = VulkanThreadEntry::ThreadLocal()->device.device_id; ICHECK_LT(device_id, kVulkanMaxNumDevice); const auto& vctx = VulkanDeviceAPI::Global()->context(device_id); if (!scache_[device_id]) { diff --git a/src/runtime/vulkan/vulkan_common.h b/src/runtime/vulkan/vulkan_common.h index 9cd1f257f091..3083ba6f9ce4 100644 --- a/src/runtime/vulkan/vulkan_common.h +++ b/src/runtime/vulkan/vulkan_common.h @@ -22,8 +22,8 @@ #include #include +#include #include -#include #include #include diff --git a/src/runtime/vulkan/vulkan_shader.h b/src/runtime/vulkan/vulkan_shader.h index c9fbb13e938d..513e3bccc36e 100644 --- a/src/runtime/vulkan/vulkan_shader.h +++ b/src/runtime/vulkan/vulkan_shader.h @@ -22,8 +22,8 @@ #include #include +#include #include -#include #include diff --git a/src/runtime/workspace_pool.cc b/src/runtime/workspace_pool.cc index 2d347c32ac10..40d488df700a 100644 --- a/src/runtime/workspace_pool.cc +++ b/src/runtime/workspace_pool.cc @@ -43,7 +43,7 @@ class WorkspacePool::Pool { allocated_.push_back(e); } // allocate from pool - void* Alloc(TVMContext ctx, DeviceAPI* device, size_t nbytes) { + void* Alloc(Device dev, DeviceAPI* device, size_t nbytes) { // Allocate align to page. 
nbytes = (nbytes + (kWorkspacePageSize - 1)) / kWorkspacePageSize * kWorkspacePageSize; if (nbytes == 0) nbytes = kWorkspacePageSize; @@ -57,12 +57,12 @@ class WorkspacePool::Pool { free_list_.pop_back(); if (e.size < nbytes) { // resize the page - device->FreeDataSpace(ctx, e.data); - e.data = device->AllocDataSpace(ctx, nbytes, kTempAllocaAlignment, type); + device->FreeDataSpace(dev, e.data); + e.data = device->AllocDataSpace(dev, nbytes, kTempAllocaAlignment, type); e.size = nbytes; } } else if (free_list_.size() == 1) { - e.data = device->AllocDataSpace(ctx, nbytes, kTempAllocaAlignment, type); + e.data = device->AllocDataSpace(dev, nbytes, kTempAllocaAlignment, type); e.size = nbytes; } else { if (free_list_.back().size >= nbytes) { @@ -76,8 +76,8 @@ class WorkspacePool::Pool { // resize the page e = free_list_.back(); free_list_.pop_back(); - device->FreeDataSpace(ctx, e.data); - e.data = device->AllocDataSpace(ctx, nbytes, kTempAllocaAlignment, type); + device->FreeDataSpace(dev, e.data); + e.data = device->AllocDataSpace(dev, nbytes, kTempAllocaAlignment, type); e.size = nbytes; } } @@ -114,10 +114,10 @@ class WorkspacePool::Pool { } } // Release all resources - void Release(TVMContext ctx, DeviceAPI* device) { + void Release(Device dev, DeviceAPI* device) { ICHECK_EQ(allocated_.size(), 1); for (size_t i = 1; i < free_list_.size(); ++i) { - device->FreeDataSpace(ctx, free_list_[i].data); + device->FreeDataSpace(dev, free_list_[i].data); } free_list_.clear(); } @@ -140,28 +140,28 @@ WorkspacePool::WorkspacePool(DLDeviceType device_type, DeviceAPI* device) WorkspacePool::~WorkspacePool() { for (size_t i = 0; i < array_.size(); ++i) { if (array_[i] != nullptr) { - TVMContext ctx; - ctx.device_type = device_type_; - ctx.device_id = static_cast(i); - array_[i]->Release(ctx, device_); + Device dev; + dev.device_type = device_type_; + dev.device_id = static_cast(i); + array_[i]->Release(dev, device_); delete array_[i]; } } } -void* WorkspacePool::AllocWorkspace(TVMContext ctx, size_t size) { - if (static_cast(ctx.device_id) >= array_.size()) { - array_.resize(ctx.device_id + 1, nullptr); +void* WorkspacePool::AllocWorkspace(Device dev, size_t size) { + if (static_cast(dev.device_id) >= array_.size()) { + array_.resize(dev.device_id + 1, nullptr); } - if (array_[ctx.device_id] == nullptr) { - array_[ctx.device_id] = new Pool(); + if (array_[dev.device_id] == nullptr) { + array_[dev.device_id] = new Pool(); } - return array_[ctx.device_id]->Alloc(ctx, device_, size); + return array_[dev.device_id]->Alloc(dev, device_, size); } -void WorkspacePool::FreeWorkspace(TVMContext ctx, void* ptr) { - ICHECK(static_cast(ctx.device_id) < array_.size() && array_[ctx.device_id] != nullptr); - array_[ctx.device_id]->Free(ptr); +void WorkspacePool::FreeWorkspace(Device dev, void* ptr) { + ICHECK(static_cast(dev.device_id) < array_.size() && array_[dev.device_id] != nullptr); + array_[dev.device_id]->Free(ptr); } } // namespace runtime diff --git a/src/runtime/workspace_pool.h b/src/runtime/workspace_pool.h index 887afc5cbb57..0db9758fac86 100644 --- a/src/runtime/workspace_pool.h +++ b/src/runtime/workspace_pool.h @@ -54,17 +54,17 @@ class TVM_DLL WorkspacePool { ~WorkspacePool(); /*! * \brief Allocate temporal workspace. - * \param ctx The context of allocation. + * \param dev The device of allocation. * \param size The size to be allocated. */ - void* AllocWorkspace(TVMContext ctx, size_t size); + void* AllocWorkspace(Device dev, size_t size); /*! * \brief Free temporal workspace in backend execution. 
* - * \param ctx The context of allocation. + * \param dev The device of allocation. * \param ptr The pointer to be freed. */ - void FreeWorkspace(TVMContext ctx, void* ptr); + void FreeWorkspace(Device dev, void* ptr); private: class Pool; diff --git a/src/support/base64.h b/src/support/base64.h index 901922db8edc..3aac9920a075 100644 --- a/src/support/base64.h +++ b/src/support/base64.h @@ -26,7 +26,7 @@ #ifndef TVM_SUPPORT_BASE64_H_ #define TVM_SUPPORT_BASE64_H_ -#include +#include #include #include diff --git a/src/support/ffi_testing.cc b/src/support/ffi_testing.cc index b06a8bb461be..bac888a81aea 100644 --- a/src/support/ffi_testing.cc +++ b/src/support/ffi_testing.cc @@ -72,13 +72,13 @@ TVM_REGISTER_GLOBAL("testing.test_check_eq_callback").set_body([](TVMArgs args, runtime::TypedPackedFunc([msg](int x, int y) { CHECK_EQ(x, y) << msg; }); }); -TVM_REGISTER_GLOBAL("testing.context_test").set_body([](TVMArgs args, TVMRetValue* ret) { - DLContext ctx = args[0]; +TVM_REGISTER_GLOBAL("testing.device_test").set_body([](TVMArgs args, TVMRetValue* ret) { + Device dev = args[0]; int dtype = args[1]; int did = args[2]; - CHECK_EQ(static_cast(ctx.device_type), dtype); - CHECK_EQ(static_cast(ctx.device_id), did); - *ret = ctx; + CHECK_EQ(static_cast(dev.device_type), dtype); + CHECK_EQ(static_cast(dev.device_id), did); + *ret = dev; }); // in src/api_test.cc diff --git a/src/support/libinfo.cc b/src/support/libinfo.cc index d6c8f1799596..ea3a22e8ab01 100644 --- a/src/support/libinfo.cc +++ b/src/support/libinfo.cc @@ -76,12 +76,12 @@ #define TVM_INFO_USE_STACKVM_RUNTIME "NOT-FOUND" #endif -#ifndef TVM_INFO_USE_GRAPH_RUNTIME -#define TVM_INFO_USE_GRAPH_RUNTIME "NOT-FOUND" +#ifndef TVM_INFO_USE_GRAPH_EXECUTOR +#define TVM_INFO_USE_GRAPH_EXECUTOR "NOT-FOUND" #endif -#ifndef TVM_INFO_USE_GRAPH_RUNTIME_DEBUG -#define TVM_INFO_USE_GRAPH_RUNTIME_DEBUG "NOT-FOUND" +#ifndef TVM_INFO_USE_GRAPH_EXECUTOR_DEBUG +#define TVM_INFO_USE_GRAPH_EXECUTOR_DEBUG "NOT-FOUND" #endif #ifndef TVM_INFO_USE_OPENMP @@ -200,8 +200,8 @@ #define TVM_INFO_USE_ARM_COMPUTE_LIB "NOT-FOUND" #endif -#ifndef TVM_INFO_USE_ARM_COMPUTE_LIB_GRAPH_RUNTIME -#define TVM_INFO_USE_ARM_COMPUTE_LIB_GRAPH_RUNTIME "NOT-FOUND" +#ifndef TVM_INFO_USE_ARM_COMPUTE_LIB_GRAPH_EXECUTOR +#define TVM_INFO_USE_ARM_COMPUTE_LIB_GRAPH_EXECUTOR "NOT-FOUND" #endif #ifndef TVM_INFO_INDEX_DEFAULT_I64 @@ -234,8 +234,8 @@ TVM_DLL Map GetLibInfo() { {"USE_LLVM", TVM_INFO_USE_LLVM}, {"LLVM_VERSION", TVM_INFO_LLVM_VERSION}, {"USE_STACKVM_RUNTIME", TVM_INFO_USE_STACKVM_RUNTIME}, - {"USE_GRAPH_RUNTIME", TVM_INFO_USE_GRAPH_RUNTIME}, - {"USE_GRAPH_RUNTIME_DEBUG", TVM_INFO_USE_GRAPH_RUNTIME_DEBUG}, + {"USE_GRAPH_EXECUTOR", TVM_INFO_USE_GRAPH_EXECUTOR}, + {"USE_GRAPH_EXECUTOR_DEBUG", TVM_INFO_USE_GRAPH_EXECUTOR_DEBUG}, {"USE_OPENMP", TVM_INFO_USE_OPENMP}, {"USE_RELAY_DEBUG", TVM_INFO_USE_RELAY_DEBUG}, {"USE_RTTI", TVM_INFO_USE_RTTI}, @@ -265,7 +265,7 @@ TVM_DLL Map GetLibInfo() { {"USE_COREML", TVM_INFO_USE_COREML}, {"USE_TARGET_ONNX", TVM_INFO_USE_TARGET_ONNX}, {"USE_ARM_COMPUTE_LIB", TVM_INFO_USE_ARM_COMPUTE_LIB}, - {"USE_ARM_COMPUTE_LIB_GRAPH_RUNTIME", TVM_INFO_USE_ARM_COMPUTE_LIB_GRAPH_RUNTIME}, + {"USE_ARM_COMPUTE_LIB_GRAPH_EXECUTOR", TVM_INFO_USE_ARM_COMPUTE_LIB_GRAPH_EXECUTOR}, {"INDEX_DEFAULT_I64", TVM_INFO_INDEX_DEFAULT_I64}, {"TVM_CXX_COMPILER_PATH", TVM_CXX_COMPILER_PATH}}; return result; diff --git a/src/support/parallel_for.cc b/src/support/parallel_for.cc index f4756c29adeb..4ced0df6ddf3 100644 --- a/src/support/parallel_for.cc +++ b/src/support/parallel_for.cc 
@@ -21,7 +21,7 @@ * \file parallel_for.cc * \brief An implementation to run loop in parallel. */ -#include +#include #include #include diff --git a/src/support/pipe.h b/src/support/pipe.h index 3c1356ba174c..a2803638e1f3 100644 --- a/src/support/pipe.h +++ b/src/support/pipe.h @@ -25,7 +25,7 @@ #define TVM_SUPPORT_PIPE_H_ #include -#include +#include #ifdef _WIN32 #include diff --git a/src/support/socket.h b/src/support/socket.h index 16fba6b58e3d..11060ae8aae1 100644 --- a/src/support/socket.h +++ b/src/support/socket.h @@ -49,7 +49,7 @@ using ssize_t = int; #include #include #endif -#include +#include #include #include diff --git a/src/target/llvm/codegen_amdgpu.cc b/src/target/llvm/codegen_amdgpu.cc index ca21892ccc5f..c7c44c9b0f8f 100644 --- a/src/target/llvm/codegen_amdgpu.cc +++ b/src/target/llvm/codegen_amdgpu.cc @@ -38,15 +38,15 @@ namespace { // calls the device api to get the max threads per block static inline int DetectROCMmaxThreadsPerBlock() { - TVMContext tvm_ctx; - tvm_ctx.device_type = kDLROCM; - tvm_ctx.device_id = 0; - tvm::runtime::DeviceAPI* api = tvm::runtime::DeviceAPI::Get(tvm_ctx, true); + Device tvm_dev; + tvm_dev.device_type = kDLROCM; + tvm_dev.device_id = 0; + tvm::runtime::DeviceAPI* api = tvm::runtime::DeviceAPI::Get(tvm_dev, true); if (api != nullptr) { TVMRetValue val; - api->GetAttr(tvm_ctx, tvm::runtime::kExist, &val); + api->GetAttr(tvm_dev, tvm::runtime::kExist, &val); if (val.operator int() == 1) { - tvm::runtime::DeviceAPI::Get(tvm_ctx)->GetAttr(tvm_ctx, tvm::runtime::kMaxThreadsPerBlock, + tvm::runtime::DeviceAPI::Get(tvm_dev)->GetAttr(tvm_dev, tvm::runtime::kMaxThreadsPerBlock, &val); return val.operator int(); } diff --git a/src/target/llvm/codegen_cpu.cc b/src/target/llvm/codegen_cpu.cc index b37cd73ece04..b49f850b2d90 100644 --- a/src/target/llvm/codegen_cpu.cc +++ b/src/target/llvm/codegen_cpu.cc @@ -437,11 +437,14 @@ void CodeGenCPU::CreateComputeScope(const AttrStmtNode* op) { arg_types.push_back(value->getType()); } llvm::FunctionType* ftype = llvm::FunctionType::get(t_int_, arg_types, false); + // $xxx_compute_ functions are not global. They should be marked as static (via InternalLinkage) + // to call them correctly on MIPS platform (CALL16 issue) + // Linkage ld Error: CALL16 reloc at 0x290 not against global symbol llvm::Function* fcompute = llvm::Function::Create( - ftype, llvm::Function::PrivateLinkage, + ftype, llvm::Function::InternalLinkage, op->value.as()->value.operator llvm::StringRef(), module_.get()); BasicBlock* compute_call_end = CheckCallSuccess(builder_->CreateCall(fcompute, arg_values)); - // setup compute fuinction. + // setup compute function. 
std::unordered_map new_vmap; size_t idx = 0; for (auto it = fcompute->arg_begin(); it != fcompute->arg_end(); ++it, ++idx) { diff --git a/src/target/llvm/codegen_hexagon.cc b/src/target/llvm/codegen_hexagon.cc index c1af2a366a6b..9d324d56887f 100644 --- a/src/target/llvm/codegen_hexagon.cc +++ b/src/target/llvm/codegen_hexagon.cc @@ -564,8 +564,8 @@ llvm::Value* CodeGenHexagon::CreateStructRefPtr(DataType t, llvm::Value* buf, ll /* The following "kinds" are accessing the members of DLTensor: typedef struct { void* data; kArrData - DLContext ctx; kArrDeviceType (ctx.device_type) - kArrDeviceId (ctx.device_id) + DLDevice device; kArrDeviceType (device.device_type) + kArrDeviceId (device.device_id) int ndim; kArrNDim DLDataType dtype; kArrTypeCode (dtype.code) kArrTypeBits (dtype.bits) @@ -602,7 +602,7 @@ llvm::Value* CodeGenHexagon::CreateStructRefPtr(DataType t, llvm::Value* buf, ll void* v_handle; const char* v_str; TVMType v_type; - TVMContext v_ctx; + DLDevice v_device; } TVMValue; */ ICHECK_EQ(t.lanes(), 1); diff --git a/src/target/llvm/codegen_params.cc b/src/target/llvm/codegen_params.cc index 694be5621606..3b4cae9197b0 100644 --- a/src/target/llvm/codegen_params.cc +++ b/src/target/llvm/codegen_params.cc @@ -69,7 +69,7 @@ llvm::ConstantArray* NDArrayToLLVMArray(llvm::LLVMContext* ctx, ::tvm::runtime:: auto arr_type = arr.DataType(); CHECK(arr.IsContiguous()) << "CodegenParams: only support contiguous arrays"; - CHECK_EQ(arr->ctx.device_type, kDLCPU) << "CodegenParams: only support contiguous arrays"; + CHECK_EQ(arr->device.device_type, kDLCPU) << "CodegenParams: only support contiguous arrays"; CHECK_EQ(arr_type.lanes(), 1) << "CodegenParams: only support generating 1-lane parameters; saw " << arr_type.lanes(); diff --git a/src/target/llvm/llvm_common.cc b/src/target/llvm/llvm_common.cc index 35bfc8dc2e5b..61dd7024ff05 100644 --- a/src/target/llvm/llvm_common.cc +++ b/src/target/llvm/llvm_common.cc @@ -24,7 +24,7 @@ #include "llvm_common.h" -#include +#include #include #include diff --git a/src/target/metadata_module.cc b/src/target/metadata_module.cc index 0b30d42c876c..8184e9189c4b 100644 --- a/src/target/metadata_module.cc +++ b/src/target/metadata_module.cc @@ -35,7 +35,7 @@ namespace codegen { /*! * \brief Create a metadata module wrapper. The helper is used by different - * codegens, such as graph runtime codegen and the vm compiler. + * codegens, such as graph executor codegen and the vm compiler. * * \param params The metadata for initialization of all modules. * \param target_module the internal module that is compiled by tvm. 
diff --git a/src/target/source/codegen_c.cc b/src/target/source/codegen_c.cc index 55db59f8d842..986ef8ed0749 100644 --- a/src/target/source/codegen_c.cc +++ b/src/target/source/codegen_c.cc @@ -267,10 +267,10 @@ std::string CodeGenC::GetStructRef(DataType t, const PrimExpr& buffer, const Pri os << "dtype.lanes"; break; case builtin::kArrDeviceId: - os << "ctx.device_id"; + os << "device.device_id"; break; case builtin::kArrDeviceType: - os << "ctx.device_type"; + os << "device.device_type"; break; default: LOG(FATAL) << "unknown field code"; diff --git a/src/target/source/source_module.cc b/src/target/source/source_module.cc index a7732719a699..26f1850c0e47 100644 --- a/src/target/source/source_module.cc +++ b/src/target/source/source_module.cc @@ -104,7 +104,7 @@ class CSourceModuleNode : public runtime::ModuleNode { void SaveToFile(const std::string& file_name, const std::string& format) final { std::string fmt = GetFileFormat(file_name, format); std::string meta_file = GetMetaFilePath(file_name); - if (fmt == "c") { + if (fmt == "c" || fmt == "cu") { ICHECK_NE(code_.length(), 0); SaveBinaryToFile(file_name, code_); } else { diff --git a/src/target/spirv/ir_builder.cc b/src/target/spirv/ir_builder.cc index 3a9de4e077dc..5a1457387ae5 100644 --- a/src/target/spirv/ir_builder.cc +++ b/src/target/spirv/ir_builder.cc @@ -48,6 +48,8 @@ void IRBuilder::InitHeader() { header_.push_back(0U); // shader ib_.Begin(spv::OpCapability).Add(spv::CapabilityShader).Commit(&header_); + // Declare int64 capability by default + ib_.Begin(spv::OpCapability).Add(spv::CapabilityInt64).Commit(&header_); // memory model ib_.Begin(spv::OpMemoryModel) .AddSeq(spv::AddressingModelLogical, spv::MemoryModelGLSL450) diff --git a/src/target/target.cc b/src/target/target.cc index 9d0b01bf3202..396e264ede4d 100644 --- a/src/target/target.cc +++ b/src/target/target.cc @@ -112,7 +112,7 @@ static const TObj* ObjTypeCheck(const ObjectRef& obj, const std::string& expecte std::ostringstream os; os << ": Expects type \"" << expected_type << "\", but gets \"" << obj->GetTypeKey() << "\" for object: " << obj; - throw dmlc::Error(os.str()); + throw Error(os.str()); } return ptr; } @@ -120,7 +120,7 @@ static const TObj* ObjTypeCheck(const ObjectRef& obj, const std::string& expecte static TargetKind GetTargetKind(const String& name) { Optional kind = TargetKind::Get(name); if (!kind.defined()) { - throw dmlc::Error(": Target kind \"" + name + "\" is not defined"); + throw Error(": Target kind \"" + name + "\" is not defined"); } return kind.value(); } @@ -131,10 +131,10 @@ static std::string RemovePrefixDashes(const std::string& s) { for (; n_dashes < len && s[n_dashes] == '-'; ++n_dashes) { } if (n_dashes == 0) { - throw dmlc::Error(": Attribute keys should start with '-', not an attribute key: " + s); + throw Error(": Attribute keys should start with '-', not an attribute key: " + s); } if (n_dashes >= len) { - throw dmlc::Error(": Not an attribute key: " + s); + throw Error(": Not an attribute key: " + s); } return s.substr(n_dashes); } @@ -166,7 +166,7 @@ static int ParseKVPair(const std::string& s, const std::string& s_next, std::str result_k = s.substr(0, pos); result_v = s.substr(pos + 1); if (result_k.empty() || result_v.empty()) { - throw dmlc::Error(": Empty attribute key or value in \"" + s + "\""); + throw Error(": Empty attribute key or value in \"" + s + "\""); } return 1; } else if (!s_next.empty() && s_next[0] != '-') { @@ -196,7 +196,7 @@ const TargetKindNode::ValueTypeInfo& TargetInternal::FindTypeInfo(const 
TargetKi } os << kv.first; } - throw dmlc::Error(os.str()); + throw Error(os.str()); } return it->second; } @@ -210,14 +210,14 @@ ObjectRef TargetInternal::ParseType(const std::string& str, // Parsing integer int v; if (!(is >> v)) { - throw dmlc::Error(": Cannot parse into type \"Integer\" from string: " + str); + throw Error(": Cannot parse into type \"Integer\" from string: " + str); } return Integer(v); } else if (info.type_index == String::ContainerType::_GetOrAllocRuntimeTypeIndex()) { // Parsing string std::string v; if (!(is >> v)) { - throw dmlc::Error(": Cannot parse into type \"String\" from string: " + str); + throw Error(": Cannot parse into type \"String\" from string: " + str); } return String(v); } else if (info.type_index == Target::ContainerType::_GetOrAllocRuntimeTypeIndex()) { @@ -230,14 +230,14 @@ ObjectRef TargetInternal::ParseType(const std::string& str, try { ObjectRef parsed = TargetInternal::ParseType(substr, *info.key); result.push_back(parsed); - } catch (const dmlc::Error& e) { + } catch (const Error& e) { std::string index = "[" + std::to_string(result.size()) + "]"; - throw dmlc::Error(index + e.what()); + throw Error(index + e.what()); } } return Array(result); } - throw dmlc::Error(": Unsupported type \"" + info.type_key + "\" for parsing from string: " + str); + throw Error(": Unsupported type \"" + info.type_key + "\" for parsing from string: " + str); } ObjectRef TargetInternal::ParseType(const ObjectRef& obj, @@ -257,15 +257,14 @@ ObjectRef TargetInternal::ParseType(const ObjectRef& obj, } else if (const auto* ptr = obj.as()) { for (const auto& kv : *ptr) { if (!kv.first->IsInstance()) { - throw dmlc::Error(": Target object requires key of dict to be str, but get: " + - kv.first->GetTypeKey()); + throw Error(": Target object requires key of dict to be str, but get: " + + kv.first->GetTypeKey()); } } Map config = GetRef>(ptr); return Target(TargetInternal::FromConfig({config.begin(), config.end()})); } - throw dmlc::Error(": Expect type 'dict' or 'str' to construct Target, but get: " + - obj->GetTypeKey()); + throw Error(": Expect type 'dict' or 'str' to construct Target, but get: " + obj->GetTypeKey()); } else if (info.type_index == ArrayNode::_GetOrAllocRuntimeTypeIndex()) { // Parsing array const auto* array = ObjTypeCheck(obj, "Array"); @@ -273,9 +272,9 @@ ObjectRef TargetInternal::ParseType(const ObjectRef& obj, for (const ObjectRef& e : *array) { try { result.push_back(TargetInternal::ParseType(e, *info.key)); - } catch (const dmlc::Error& e) { + } catch (const Error& e) { std::string index = '[' + std::to_string(result.size()) + ']'; - throw dmlc::Error(index + e.what()); + throw Error(index + e.what()); } } return Array(result); @@ -287,17 +286,17 @@ ObjectRef TargetInternal::ParseType(const ObjectRef& obj, ObjectRef key, val; try { key = TargetInternal::ParseType(kv.first, *info.key); - } catch (const dmlc::Error& e) { + } catch (const Error& e) { std::ostringstream os; os << "'s key \"" << key << "\"" << e.what(); - throw dmlc::Error(os.str()); + throw Error(os.str()); } try { val = TargetInternal::ParseType(kv.second, *info.val); - } catch (const dmlc::Error& e) { + } catch (const Error& e) { std::ostringstream os; os << "[\"" << key << "\"]" << e.what(); - throw dmlc::Error(os.str()); + throw Error(os.str()); } result[key] = val; } @@ -308,7 +307,7 @@ ObjectRef TargetInternal::ParseType(const ObjectRef& obj, os << ": Parsing type \"" << info.type_key << "\" is not supported for the given object of type \"" << obj->GetTypeKey() << "\". 
The object is: " << obj; - throw dmlc::Error(os.str()); + throw Error(os.str()); } return obj; } @@ -388,7 +387,7 @@ Target::Target(const String& tag_or_config_or_target_str) { ObjectPtr target; try { target = TargetInternal::FromString(tag_or_config_or_target_str); - } catch (const dmlc::Error& e) { + } catch (const Error& e) { LOG(FATAL) << "ValueError" << e.what() << ". Target creation from string failed: " << tag_or_config_or_target_str; } @@ -399,7 +398,7 @@ Target::Target(const Map& config) { ObjectPtr target; try { target = TargetInternal::FromConfig({config.begin(), config.end()}); - } catch (const dmlc::Error& e) { + } catch (const Error& e) { LOG(FATAL) << "ValueError" << e.what() << ". Target creation from config dict failed: " << config; } @@ -536,7 +535,7 @@ ObjectPtr TargetInternal::FromConfigString(const String& config_str) { "if the python module is properly loaded"; Optional> config = (*loader)(config_str); if (!config.defined()) { - throw dmlc::Error(": Cannot load config dict with python JSON loader"); + throw Error(": Cannot load config dict with python JSON loader"); } return TargetInternal::FromConfig({config.value().begin(), config.value().end()}); } @@ -554,7 +553,7 @@ ObjectPtr TargetInternal::FromRawString(const String& target_str) { } } if (name.empty()) { - throw dmlc::Error(": Cannot parse empty target string"); + throw Error(": Cannot parse empty target string"); } // Create the target config std::unordered_map config = {{"kind", String(name)}}; @@ -565,17 +564,17 @@ ObjectPtr TargetInternal::FromRawString(const String& target_str) { // Parse key-value pair std::string s_next = (iter + 1 < options.size()) ? options[iter + 1] : ""; iter += ParseKVPair(RemovePrefixDashes(options[iter]), s_next, &key, &value); - } catch (const dmlc::Error& e) { - throw dmlc::Error(": Error when parsing target" + std::string(e.what())); + } catch (const Error& e) { + throw Error(": Error when parsing target" + std::string(e.what())); } try { // check if `key` has been used if (config.count(key)) { - throw dmlc::Error(": The key \"" + key + "\" appears more than once"); + throw Error(": The key \"" + key + "\" appears more than once"); } config[key] = TargetInternal::ParseType(value, TargetInternal::FindTypeInfo(kind, key)); - } catch (const dmlc::Error& e) { - throw dmlc::Error(": Error when parsing target[\"" + key + "\"]" + e.what()); + } catch (const Error& e) { + throw Error(": Error when parsing target[\"" + key + "\"]" + e.what()); } } return TargetInternal::FromConfig(config); @@ -594,11 +593,11 @@ ObjectPtr TargetInternal::FromConfig(std::unordered_mapkind = GetTargetKind(GetRef(kind)); config.erase(kKind); } else { - throw dmlc::Error(": Expect type of field \"kind\" is String, but get type: " + - config[kKind]->GetTypeKey()); + throw Error(": Expect type of field \"kind\" is String, but get type: " + + config[kKind]->GetTypeKey()); } } else { - throw dmlc::Error(": Field \"kind\" is not found"); + throw Error(": Field \"kind\" is not found"); } // parse "tag" if (config.count(kTag)) { @@ -606,8 +605,8 @@ ObjectPtr TargetInternal::FromConfig(std::unordered_maptag = GetRef(tag); config.erase(kTag); } else { - throw dmlc::Error(": Expect type of field \"tag\" is String, but get type: " + - config[kTag]->GetTypeKey()); + throw Error(": Expect type of field \"tag\" is String, but get type: " + + config[kTag]->GetTypeKey()); } } else { target->tag = ""; @@ -622,15 +621,15 @@ ObjectPtr TargetInternal::FromConfig(std::unordered_map()) { keys.push_back(GetRef(key)); } else { - 
throw dmlc::Error( + throw Error( ": Expect 'keys' to be an array of strings, but it " "contains an element of type: " + e->GetTypeKey()); } } } else { - throw dmlc::Error(": Expect type of field \"keys\" is Array, but get type: " + - config[kKeys]->GetTypeKey()); + throw Error(": Expect type of field \"keys\" is Array, but get type: " + + config[kKeys]->GetTypeKey()); } } // add device name @@ -662,8 +661,8 @@ ObjectPtr TargetInternal::FromConfig(std::unordered_mapkind, key); attrs[key] = TargetInternal::ParseType(value, info); - } catch (const dmlc::Error& e) { - throw dmlc::Error(": Error when parsing target[\"" + key + "\"]" + e.what()); + } catch (const Error& e) { + throw Error(": Error when parsing target[\"" + key + "\"]" + e.what()); } } // set default attribute values if they do not exist diff --git a/src/target/target_kind.cc b/src/target/target_kind.cc index 863d99993f4a..08842554257b 100644 --- a/src/target/target_kind.cc +++ b/src/target/target_kind.cc @@ -103,7 +103,7 @@ static int ExtractIntWithPrefix(const std::string& str, const std::string& prefi * \param val The detected value * \return A boolean indicating if detection succeeds */ -static bool DetectDeviceFlag(TVMContext device, runtime::DeviceAttrKind flag, TVMRetValue* val) { +static bool DetectDeviceFlag(Device device, runtime::DeviceAttrKind flag, TVMRetValue* val) { using runtime::DeviceAPI; DeviceAPI* api = DeviceAPI::Get(device, true); // Check if compiled with the corresponding device api diff --git a/src/te/schedule/schedule_postproc_rewrite_for_tensor_core.cc b/src/te/schedule/schedule_postproc_rewrite_for_tensor_core.cc index 74d1a19d2cfe..377ad5c7a40a 100644 --- a/src/te/schedule/schedule_postproc_rewrite_for_tensor_core.cc +++ b/src/te/schedule/schedule_postproc_rewrite_for_tensor_core.cc @@ -1089,8 +1089,8 @@ Stmt SchedulePostProcRewriteForTensorCore(Stmt stmt, Schedule schedule, } // Check if current runtime support GPU CUDA - TVMContext ctx{kDLGPU, 0}; - auto api = tvm::runtime::DeviceAPI::Get(ctx, true); + Device dev{kDLGPU, 0}; + auto api = tvm::runtime::DeviceAPI::Get(dev, true); if (api == nullptr) { return stmt; } diff --git a/src/tir/analysis/block_access_region_detector.cc b/src/tir/analysis/block_access_region_detector.cc new file mode 100644 index 000000000000..b1da536f1dad --- /dev/null +++ b/src/tir/analysis/block_access_region_detector.cc @@ -0,0 +1,246 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file tir/analysis/block_region_detector.cc + * \brief Detect block read/write regions by visiting its body + */ + +#include +#include +#include + +namespace tvm { +namespace tir { + +/*! + * \brief Detect which regions of tensors in this block are read or written to. Regions are sorted + * by order of appearance in the AST. 
\note This detector can only visit blocks and will not visit
 * child blocks recursively
 */
+class BlockReadWriteDetector : public StmtExprVisitor {
+ public:
+  explicit BlockReadWriteDetector(const Map& buffer_var_map)
+      : buffer_var_map_(buffer_var_map) {}
+
+  /*! \brief Return the read regions of the block */
+  Array CollectReads();
+  /*! \brief Return the write regions of the block */
+  Array CollectWrites();
+  /*!
+   * \brief Return the opaque buffer regions of the block
+   * \note A buffer accessed by load/store or by a call through buffer.data will
+   *       be marked as opaque.
+   */
+  Array CollectOpaques();
+  /*! \brief Overload operator() to make sure it accepts a block node */
+  void operator()(const Stmt& stmt);
+
+ private:
+  /*! \brief Iteration ranges for loop_vars */
+  std::unordered_map dom_map_;
+  /*! \brief The buffers that the current block reads */
+  std::vector read_buffers_;
+  /*! \brief The buffers that the current block writes */
+  std::vector writes_buffers_;
+  /*! \brief The opaque buffers, which are accessed via buffer.data */
+  std::vector opaque_buffers_;
+  /*! \brief The read regions of the current block */
+  std::vector> read_regions_;
+  /*! \brief The write regions of the current block */
+  std::vector> write_regions_;
+  /*! \brief The mapping from outside buffer data vars to their buffers */
+  Map buffer_var_map_;
+  /*! \brief The analyzer used for simplification */
+  arith::Analyzer analyzer_;
+
+  /*!
+   * \brief Update the read/write buffers and regions with the provided buffer and region
+   * \param buffers The buffers to be updated
+   * \param regions The access regions to be updated
+   * \param buffer The provided buffer
+   * \param region The provided region
+   */
+  void Update(std::vector* buffers, std::vector>* regions,
+              const Buffer& buffer, const std::vector& region);
+
+  /*! \brief Helper function to collect access regions. */
+  Array CollectRegions(const std::vector& buffers,
+                       const std::vector>& regions);
+
+  /*! \brief Helper function to add an opaque buffer.
*/ + void AddOpaque(const Var& buffer_var); + + void VisitStmt_(const ForNode* op) override; + void VisitStmt_(const BlockRealizeNode* op) override; + void VisitStmt_(const BufferStoreNode* op) override; + void VisitStmt_(const StoreNode* op) override; + void VisitExpr_(const BufferLoadNode* op) override; + void VisitExpr_(const LoadNode* op) override; + void VisitExpr_(const VarNode* op) override; +}; + +void BlockReadWriteDetector::operator()(const Stmt& stmt) { + ICHECK(stmt.as() != nullptr) + << "Only visiting Blocks is allowed, but got " << stmt->GetTypeKey(); + StmtExprVisitor::operator()(stmt); +} + +Array BlockReadWriteDetector::CollectReads() { + return CollectRegions(read_buffers_, read_regions_); +} + +Array BlockReadWriteDetector::CollectWrites() { + return CollectRegions(writes_buffers_, write_regions_); +} + +Array BlockReadWriteDetector::CollectOpaques() { + Array res; + res.reserve(opaque_buffers_.size()); + for (const Buffer& buffer : opaque_buffers_) { + res.push_back(BufferRegion::FullRegion(buffer)); + } + return res; +} + +void BlockReadWriteDetector::VisitExpr_(const VarNode* op) { AddOpaque(GetRef(op)); } + +void BlockReadWriteDetector::VisitExpr_(const LoadNode* op) { + AddOpaque(op->buffer_var); + ExprVisitor::VisitExpr_(op); +} + +void BlockReadWriteDetector::VisitExpr_(const BufferLoadNode* op) { + std::vector relaxed_region; + for (const PrimExpr& index : op->indices) { + relaxed_region.push_back(arith::EvalSet(index, dom_map_)); + } + Update(&read_buffers_, &read_regions_, op->buffer, relaxed_region); + ExprVisitor::VisitExpr_(op); +} + +void BlockReadWriteDetector::VisitStmt_(const ForNode* op) { + Range range = Range::FromMinExtent(op->min, op->extent); + dom_map_[op->loop_var.get()] = arith::IntSet::FromRange(range); + StmtVisitor::VisitStmt_(op); + dom_map_.erase(op->loop_var.get()); +} + +void BlockReadWriteDetector::VisitStmt_(const StoreNode* op) { + AddOpaque(op->buffer_var); + StmtVisitor::VisitStmt_(op); +} + +void BlockReadWriteDetector::VisitStmt_(const BufferStoreNode* op) { + std::vector relaxed_region; + for (const PrimExpr& index : op->indices) { + relaxed_region.push_back(arith::EvalSet(index, dom_map_)); + } + Update(&writes_buffers_, &write_regions_, op->buffer, relaxed_region); + StmtVisitor::VisitStmt_(op); +} + +void BlockReadWriteDetector::VisitStmt_(const BlockRealizeNode* op) { + /*! 
\note The detector will not visit child blocks recursively, so it will stop here */
+  std::unordered_map vmap;
+  for (size_t i = 0; i < op->block->iter_vars.size(); ++i) {
+    vmap[op->block->iter_vars[i]->var.get()] = op->iter_values[i];
+  }
+  for (const auto& read : op->block->reads) {
+    std::vector relaxed_region;
+    for (const auto& range : read->region) {
+      relaxed_region.push_back(
+          arith::EvalSet(arith::IntSet::FromRange(Range::FromMinExtent(
+                             Substitute(range->min, vmap), Substitute(range->extent, vmap))),
+                         dom_map_));
+    }
+    Update(&read_buffers_, &read_regions_, read->buffer, relaxed_region);
+  }
+  for (const auto& write : op->block->writes) {
+    std::vector relaxed_region;
+    for (const auto& range : write->region) {
+      relaxed_region.push_back(
+          arith::EvalSet(arith::IntSet::FromRange(Range::FromMinExtent(
+                             Substitute(range->min, vmap), Substitute(range->extent, vmap))),
+                         dom_map_));
+    }
+    Update(&writes_buffers_, &write_regions_, write->buffer, relaxed_region);
+  }
+}
+
+void BlockReadWriteDetector::Update(std::vector* buffers,
+                                    std::vector>* regions,
+                                    const Buffer& buffer,
+                                    const std::vector& region) {
+  if (buffer_var_map_.find(buffer->data) == buffer_var_map_.end()) return;
+  ICHECK_EQ(buffers->size(), regions->size())
+      << "Expected the buffers and regions to have the same size";
+  for (size_t i = 0; i < regions->size(); ++i) {
+    if ((*buffers)[i].same_as(buffer)) {
+      ICHECK_EQ((*regions)[i].size(), region.size()) << "Inconsistent buffer dimension";
+      for (size_t j = 0; j < region.size(); ++j) {
+        (*regions)[i][j] = arith::Union({(*regions)[i][j], region[j]});
+      }
+      return;
+    }
+  }
+  buffers->push_back(buffer);
+  regions->push_back(region);
+}
+
+Array BlockReadWriteDetector::CollectRegions(
+    const std::vector& buffers,
+    const std::vector>& regions) {
+  ICHECK_EQ(buffers.size(), regions.size());
+  Array res;
+  res.reserve(buffers.size());
+  for (size_t i = 0; i < regions.size(); ++i) {
+    Array region;
+    region.reserve(regions[i].size());
+    for (size_t j = 0; j < regions[i].size(); j++) {
+      tvm::arith::IntSet range = regions[i][j];
+      region.push_back(range.CoverRange(Range::FromMinExtent(0, buffers[i]->shape[j])));
+    }
+    res.push_back(BufferRegion(buffers[i], region));
+  }
+  return res;
+}
+
+void BlockReadWriteDetector::AddOpaque(const Var& buffer_var) {
+  auto it = buffer_var_map_.find(buffer_var);
+  if (it != buffer_var_map_.end()) {
+    const Buffer& buffer = (*it).second;
+    for (const Buffer& opaque_buffer : opaque_buffers_) {
+      if (buffer.same_as(opaque_buffer)) return;
+    }
+    opaque_buffers_.push_back(buffer);
+  }
+}
+
+Array> GetBlockAccessRegion(const Block& block,
+                            const Map& buffer_var_map) {
+  BlockReadWriteDetector detector(buffer_var_map);
+  detector(block);
+  return {detector.CollectReads(), detector.CollectWrites(), detector.CollectOpaques()};
+}
+
+TVM_REGISTER_GLOBAL("tir.analysis.get_block_access_region").set_body_typed(GetBlockAccessRegion);
+
+}  // namespace tir
+}  // namespace tvm
diff --git a/src/tir/analysis/expr_complexity.cc b/src/tir/analysis/expr_complexity.cc
new file mode 100644
index 000000000000..e809668bb624
--- /dev/null
+++ b/src/tir/analysis/expr_complexity.cc
@@ -0,0 +1,53 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file tir/analysis/expr_complexity.cc + * \brief Calculate expr complexity. + */ +#include +#include + +namespace tvm { +namespace tir { + +/*! \brief Count the size of the PrimExpr. */ +class PrimExprSizeCounter : public ExprVisitor { + public: + PrimExprSizeCounter() = default; + + static size_t Count(const PrimExpr& expr) { + PrimExprSizeCounter prim_expr_size_counter; + prim_expr_size_counter.VisitExpr(expr); + return prim_expr_size_counter.counter_; + } + + private: + void VisitExpr(const PrimExpr& expr) final { + counter_++; + ExprVisitor::VisitExpr(expr); + } + + size_t counter_{0}; +}; + +size_t CalculateExprComplexity(const PrimExpr& expr) { return PrimExprSizeCounter::Count(expr); } + +} // namespace tir +} // namespace tvm diff --git a/src/tir/ir/script/script_complete.cc b/src/tir/ir/script/script_complete.cc new file mode 100644 index 000000000000..a42b5ea5b3a0 --- /dev/null +++ b/src/tir/ir/script/script_complete.cc @@ -0,0 +1,136 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file tir/ir/script/script_complete.cc + * \brief Used by TVM Script parser to expand incomplete TIR input + */ + +#include +#include +#include +#include +#include + +#include + +namespace tvm { +namespace tir { + +/*! \brief Generate surrounding loops automatically */ +class ScriptCompleter : public StmtMutator { + public: + explicit ScriptCompleter(Map* buffer_var_map, bool contain_root) + : buffer_var_map_(buffer_var_map), contain_root_(contain_root) {} + /*! \brief Whether the stmt contains at least one block. 
*/
+  bool contains_block = false;
+
+ private:
+  Map* buffer_var_map_;
+  bool contain_root_;
+  bool visited_root_ = false;
+  Stmt VisitStmt_(const BlockRealizeNode* op) override {
+    contains_block = true;
+    Stmt body = StmtMutator::VisitStmt_(op);
+    if (!op->iter_values.empty() && !op->iter_values[0].dtype().is_int()) {
+      auto block_with_binding = CopyOnWrite(Downcast(body).get());
+      std::vector bindings;
+      for (size_t i = 0; i < op->iter_values.size(); ++i) {
+        bindings.push_back(Var("i" + std::to_string(i)));
+      }
+      block_with_binding->iter_values = bindings;
+      body = BlockRealize(block_with_binding);
+      for (int i = op->iter_values.size() - 1; i >= 0; --i) {
+        body = For(Downcast(bindings[i]), op->block->iter_vars[i]->dom->min,
+                   op->block->iter_vars[i]->dom->extent, {}, body);
+      }
+    }
+    return body;
+  }
+
+  Stmt VisitStmt_(const BlockNode* op) override {
+    bool is_root_block = contain_root_ && !visited_root_;
+    visited_root_ = true;
+    // Buffers allocated in the block can be accessed by its body.
+    for (const auto& alloc_buffer : op->alloc_buffers) {
+      buffer_var_map_->Set(alloc_buffer->data, alloc_buffer);
+    }
+    Block block = Downcast(StmtMutator::VisitStmt_(op));
+    // Remove buffers allocated inside the block to detect its access region
+    for (const auto& alloc_buffer : op->alloc_buffers) {
+      buffer_var_map_->erase(alloc_buffer->data);
+    }
+    // ignore the root block or blocks which already have read/write regions
+    if (block->reads.empty() || block->writes.empty()) {
+      if (op->iter_vars.empty()) {
+        // non-root opaque block is not allowed
+        CHECK(is_root_block)
+            << "ValueError: Cannot auto-detect the buffer access region for an opaque block. Please "
+               "annotate the access region manually.";
+        return std::move(block);
+      }
+      auto access_region = GetBlockAccessRegion(block, *buffer_var_map_);
+      const Array& reads = access_region[0];
+      const Array& writes = access_region[1];
+      const Array& opaque = access_region[2];
+      CHECK(opaque.empty())
+          << "ValueError: Cannot auto-detect the buffer access region from tir.Load, tir.Store or "
+             "direct access by buffer data. Please annotate the access region manually.";
+      auto n = CopyOnWrite(block.operator->());
+      if (n->reads.empty()) n->reads = reads;
+      if (n->writes.empty()) n->writes = writes;
+      return Block(n);
+    } else {
+      return std::move(block);
+    }
+  }
+};
+
+PrimFunc ScriptComplete(PrimFunc func, const Array& root_allocates) {
+  Map buffer_var_map;
+  for (const auto& pair : func->buffer_map) {
+    const Buffer& buffer = pair.second;
+    buffer_var_map.Set(buffer->data, buffer);
+  }
+  for (const auto& alloc : root_allocates) {
+    buffer_var_map.Set(alloc->data, alloc);
+  }
+  bool contain_root = root_allocates.empty() && func->body->IsInstance() &&
+                      Downcast(func->body)->block->iter_vars.empty();
+  ScriptCompleter script_completer(&buffer_var_map, contain_root);
+  // generate surrounding loops automatically
+  Stmt res = script_completer(func->body);
+  // generate the root block automatically
+  if (script_completer.contains_block && !contain_root) {
+    res = Block({}, {}, {}, "root", res, NullOpt, root_allocates);
+    res = BlockRealize({}, Bool(true), Downcast(res));
+  }
+  if (func->body.same_as(res)) {
+    return func;
+  } else {
+    auto fptr = func.CopyOnWrite();
+    fptr->body = res;
+    return func;
+  }
+}
+
+TVM_REGISTER_GLOBAL("script.Complete").set_body_typed(ScriptComplete);
+
+}  // namespace tir
+}  // namespace tvm
diff --git a/src/tir/ir/stmt_functor.cc b/src/tir/ir/stmt_functor.cc
index 639d38db0a81..07574e4fb2f1 100644
--- a/src/tir/ir/stmt_functor.cc
+++ b/src/tir/ir/stmt_functor.cc
@@ -19,12 +19,14 @@
 /*!
  * \file stmt_functor.cc
  */
+#include
 #include
+#include
 #include
 #include

-#include "functor_common.h"
+#include "./functor_common.h"

 namespace tvm {
 namespace tir {
@@ -631,9 +633,9 @@ Stmt IRTransform(Stmt ir_node, const runtime::PackedFunc& f_preorder,
   return transform(std::move(ir_node));
 }

-class IRSubstitue : public StmtExprMutator {
+class IRSubstitute : public StmtExprMutator {
  public:
-  explicit IRSubstitue(std::function(const Var&)> vmap) : vmap_(vmap) {}
+  explicit IRSubstitute(std::function(const Var&)> vmap) : vmap_(vmap) {}

   PrimExpr VisitExpr_(const VarNode* op) final {
     Var var = GetRef(op);
@@ -679,11 +681,53 @@ class IRSubstitue : public StmtExprMutator {
 };

 Stmt Substitute(Stmt stmt, std::function(const Var&)> vmap) {
-  return IRSubstitue(vmap)(std::move(stmt));
+  return IRSubstitute(vmap)(std::move(stmt));
 }

 PrimExpr Substitute(PrimExpr expr, std::function(const Var&)> vmap) {
-  return IRSubstitue(vmap)(std::move(expr));
+  return IRSubstitute(vmap)(std::move(expr));
+}
+
+void PreOrderVisit(const ObjectRef& stmt_or_expr,
+                   const std::function& fvisit) {
+  class PreOrderVisitor : public StmtExprVisitor {
+   public:
+    explicit PreOrderVisitor(const std::function& f) : f_(f) {}
+
+   private:
+    void VisitExpr(const PrimExpr& expr) final {
+      const PrimExprNode* p_expr = expr.get();
+      if (visited_.count(p_expr) == 0) {
+        visited_.insert(p_expr);
+        if (f_(expr)) {
+          ExprVisitor::VisitExpr(expr);
+        }
+      }
+    }
+
+    void VisitStmt(const Stmt& stmt) final {
+      const StmtNode* p_stmt = stmt.get();
+      if (visited_.count(p_stmt) == 0) {
+        visited_.insert(p_stmt);
+        if (f_(stmt)) {
+          StmtVisitor::VisitStmt(stmt);
+        }
+      }
+    }
+
+    const std::function& f_;
+    std::unordered_set visited_;
+  };
+
+  PreOrderVisitor visitor(fvisit);
+  if (const auto* stmt = stmt_or_expr.as()) {
+    visitor(GetRef(stmt));
+  } else if (const auto* expr = stmt_or_expr.as()) {
+    visitor(GetRef(expr));
+  } else {
+    LOG(FATAL) << "InternalError: PreOrderVisit does not accept object with type: "
+               << stmt_or_expr->GetTypeKey();
+  }
 }
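(A minimal usage sketch of the PreOrderVisit helper introduced above, assuming its declaration is exposed in tvm/tir/stmt_functor.h alongside Substitute; the CountLoadNodes wrapper below is hypothetical and not part of this patch. The callback returns true to keep descending and false to prune a subtree; since visited nodes are memoized, a shared sub-expression is visited only once.)

#include <tvm/tir/stmt_functor.h>

// Count tir::Load nodes in a statement while skipping anything nested
// under an AttrStmt: returning false from the callback stops PreOrderVisit
// from descending into that node's children.
int CountLoadNodes(const tvm::tir::Stmt& stmt) {
  int n_loads = 0;
  tvm::tir::PreOrderVisit(stmt, [&](const tvm::ObjectRef& node) -> bool {
    if (node->IsInstance<tvm::tir::LoadNode>()) {
      ++n_loads;
    }
    return !node->IsInstance<tvm::tir::AttrStmtNode>();
  });
  return n_loads;
}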
TVM_REGISTER_GLOBAL("tir.IRTransform").set_body_typed(IRTransform); diff --git a/src/tir/transforms/lower_tvm_builtin.cc b/src/tir/transforms/lower_tvm_builtin.cc index 1d12d57d10b4..a4488c8986e6 100644 --- a/src/tir/transforms/lower_tvm_builtin.cc +++ b/src/tir/transforms/lower_tvm_builtin.cc @@ -128,11 +128,11 @@ class BuiltinLower : public StmtExprMutator { } Stmt VisitStmt_(const AttrStmtNode* op) final { - if (op->attr_key == attr::device_context_id) { + if (op->attr_key == attr::device_id) { ICHECK(!device_id_.defined()); device_id_ = op->value; return this->VisitStmt(op->body); - } else if (op->attr_key == attr::device_context_type) { + } else if (op->attr_key == attr::device_type) { ICHECK(!device_type_.defined()); device_type_ = op->value; return this->VisitStmt(op->body); diff --git a/src/tir/transforms/make_packed_api.cc b/src/tir/transforms/make_packed_api.cc index 0946af6f640a..0cc0086897d8 100644 --- a/src/tir/transforms/make_packed_api.cc +++ b/src/tir/transforms/make_packed_api.cc @@ -229,7 +229,7 @@ PrimFunc MakePackedAPI(PrimFunc&& func, int num_unpacked_args) { // // For example, for auto broadcasting, checks are required to guarantee that // either 0 or the original stride will be correctly used. Checks here have - // to use the args that may have no let bining yet. Therefore, hoisting let + // to use the args that may have no let binding yet. Therefore, hoisting let // binding for args before buffer declaration is needed. for (const auto& kv : var_def) { binder.Bind(kv.second, kv.first, kv.first->name_hint, true); @@ -249,10 +249,10 @@ PrimFunc MakePackedAPI(PrimFunc&& func, int num_unpacked_args) { // Set device context if (vmap.count(device_id.get())) { PrimExpr node = StringImm("default"); - seq_check.push_back(AttrStmt(node, attr::device_context_id, device_id, nop)); - seq_check.push_back(AttrStmt(node, attr::device_context_type, device_type, nop)); + seq_check.push_back(AttrStmt(node, attr::device_id, device_id, nop)); + seq_check.push_back(AttrStmt(node, attr::device_type, device_type, nop)); - if (runtime::DeviceAPI::NeedSetDeviceContext(target_device_type)) { + if (runtime::DeviceAPI::NeedSetDevice(target_device_type)) { Stmt set_device = Evaluate(Call(DataType::Int(32), builtin::tvm_call_packed(), {StringImm(runtime::symbol::tvm_set_device), device_type, device_id})); diff --git a/src/tir/transforms/storage_access.cc b/src/tir/transforms/storage_access.cc index 38143c14b021..00002d3587db 100644 --- a/src/tir/transforms/storage_access.cc +++ b/src/tir/transforms/storage_access.cc @@ -132,6 +132,10 @@ void StorageAccessVisitor::VisitStmt_(const AttrStmtNode* op) { StmtExprVisitor::VisitStmt_(op); } env_threads_.pop_back(); + } else if (op->attr_key == attr::hand_threaded) { + // skip this pass on blocks that were hand_threaded + // this avoids control flow and read/write conflicts + // between hand-threaded kernels and automatic threading } else { StmtExprVisitor::VisitStmt_(op); } diff --git a/tests/azure-pipelines/main.yml b/tests/azure-pipelines/main.yml index 094c1df12739..49d488aba5fd 100644 --- a/tests/azure-pipelines/main.yml +++ b/tests/azure-pipelines/main.yml @@ -35,7 +35,7 @@ jobs: cmakeArgs: > -DUSE_SORT=ON -DUSE_RPC=ON - -DUSE_GRAPH_RUNTIME=ON + -DUSE_GRAPH_EXECUTOR=ON .. - task: MSBuild@1 inputs: @@ -56,7 +56,7 @@ jobs: cmakeArgs: > -DUSE_SORT=ON -DUSE_RPC=ON - -DUSE_GRAPH_RUNTIME=ON + -DUSE_GRAPH_EXECUTOR=ON .. 
- task: MSBuild@1 inputs: @@ -75,7 +75,7 @@ jobs: cmakeArgs: > -DUSE_SORT=ON -DUSE_RPC=ON - -DUSE_GRAPH_RUNTIME=ON + -DUSE_GRAPH_EXECUTOR=ON .. - script: cd build.common && make -j`sysctl -n hw.ncpu` displayName: Build the project diff --git a/tests/cpp/build_module_test.cc b/tests/cpp/build_module_test.cc index ed50e3c86e85..e9373936e0d4 100644 --- a/tests/cpp/build_module_test.cc +++ b/tests/cpp/build_module_test.cc @@ -163,16 +163,16 @@ TEST(BuildModule, Heterogeneous) { pc[i] = i - 1.0; } - // Initialize graph runtime. + // Initialize graph executor. int cpu_dev_ty = static_cast(kDLCPU); int cpu_dev_id = 0; int gpu_dev_ty = static_cast(kDLGPU); int gpu_dev_id = 0; - const runtime::PackedFunc* graph_runtime = - tvm::runtime::Registry::Get("tvm.graph_runtime.create"); + const runtime::PackedFunc* graph_executor = + tvm::runtime::Registry::Get("tvm.graph_executor.create"); runtime::Module mod = - (*graph_runtime)(json, module, cpu_dev_ty, cpu_dev_id, gpu_dev_ty, gpu_dev_id); + (*graph_executor)(json, module, cpu_dev_ty, cpu_dev_id, gpu_dev_ty, gpu_dev_id); // test FFI for module. auto test_ffi = PackedFunc([](TVMArgs args, TVMRetValue* rv) { diff --git a/tests/cpp/container_test.cc b/tests/cpp/container_test.cc index 35fd5b1c45b1..41632ff8d561 100644 --- a/tests/cpp/container_test.cc +++ b/tests/cpp/container_test.cc @@ -676,7 +676,7 @@ TEST(Optional, PackedCall) { ICHECK_EQ(args[0].type_code(), tcode); }); String s = "xyz"; - auto nd = NDArray::Empty({0, 1}, DataType::Float(32), DLContext{kDLCPU, 0}); + auto nd = NDArray::Empty({0, 1}, DataType::Float(32), DLDevice{kDLCPU, 0}); test_ffi(Optional(nd), static_cast(kTVMNDArrayHandle)); test_ffi(Optional(s), static_cast(kTVMObjectRValueRefArg)); test_ffi(s, static_cast(kTVMObjectHandle)); diff --git a/tests/cpp/contrib/bnns.cc b/tests/cpp/contrib/bnns.cc new file mode 100644 index 000000000000..f7d40f176fb6 --- /dev/null +++ b/tests/cpp/contrib/bnns.cc @@ -0,0 +1,307 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +TEST(PackedFunc, Basic) { + using namespace tvm; + using namespace tvm::tir; + using namespace tvm::runtime; + int x = 0; + void* handle = &x; + DLTensor a; + + Var v = PackedFunc([&](TVMArgs args, TVMRetValue* rv) { + ICHECK(args.num_args == 3); + ICHECK(args.values[0].v_float64 == 1.0); + ICHECK(args.type_codes[0] == kDLFloat); + ICHECK(args.values[1].v_handle == &a); + ICHECK(args.type_codes[1] == kTVMDLTensorHandle); + ICHECK(args.values[2].v_handle == &x); + ICHECK(args.type_codes[2] == kTVMOpaqueHandle); + *rv = Var("a"); + })(1.0, &a, handle); + ICHECK(v->name_hint == "a"); +} + +TEST(PackedFunc, Node) { + using namespace tvm; + using namespace tvm::tir; + using namespace tvm::runtime; + Var x; + Var t = PackedFunc([&](TVMArgs args, TVMRetValue* rv) { + ICHECK(args.num_args == 1); + ICHECK(args[0].IsObjectRef()); + Var b = args[0]; + ICHECK(x.same_as(b)); + *rv = b; + })(x); + ICHECK(t.same_as(x)); +} + +TEST(PackedFunc, NDArray) { + using namespace tvm; + using namespace tvm::runtime; + auto x = NDArray::Empty({}, String2DLDataType("float32"), Device{kDLCPU, 0}); + reinterpret_cast(x->data)[0] = 10.0f; + ICHECK(x.use_count() == 1); + + PackedFunc forward([&](TVMArgs args, TVMRetValue* rv) { *rv = args[0]; }); + + NDArray ret = PackedFunc([&](TVMArgs args, TVMRetValue* rv) { + NDArray y = args[0]; + DLTensor* ptr = args[0]; + ICHECK(ptr == x.operator->()); + ICHECK(x.same_as(y)); + ICHECK(x.use_count() == 2); + *rv = forward(y); + })(x); + ICHECK(ret.use_count() == 2); + ICHECK(ret.same_as(x)); +} + +TEST(PackedFunc, str) { + using namespace tvm; + using namespace tvm::runtime; + PackedFunc([&](TVMArgs args, TVMRetValue* rv) { + ICHECK(args.num_args == 1); + std::string x = args[0]; + ICHECK(x == "hello"); + String y = args[0]; + ICHECK(y == "hello"); + *rv = x; + })("hello"); + + PackedFunc([&](TVMArgs args, TVMRetValue* rv) { + ICHECK(args.num_args == 1); + runtime::String s = args[0]; + ICHECK(s == "hello"); + })(runtime::String("hello")); +} + +TEST(PackedFunc, func) { + using namespace tvm; + using namespace tvm::runtime; + PackedFunc addone([&](TVMArgs args, TVMRetValue* rv) { *rv = args[0].operator int() + 1; }); + // function as arguments + int r0 = PackedFunc([](TVMArgs args, TVMRetValue* rv) { + PackedFunc f = args[0]; + // TVMArgValue -> Arguments as function + *rv = f(args[1]).operator int(); + })(addone, 1); + ICHECK_EQ(r0, 2); + + int r1 = PackedFunc([](TVMArgs args, TVMRetValue* rv) { + // TVMArgValue -> TVMRetValue + *rv = args[1]; + })(2, 100); + ICHECK_EQ(r1, 100); + + int r2 = PackedFunc([&](TVMArgs args, TVMRetValue* rv) { + // re-assignment + *rv = args[0]; + // TVMRetValue -> Function argument + *rv = addone(args[0].operator PackedFunc()(args[1], 1)); + })(addone, 100); + ICHECK_EQ(r2, 102); +} + +TEST(PackedFunc, Expr) { + using namespace tvm; + using namespace tvm::runtime; + // automatic conversion of int to expr + PackedFunc addone([](TVMArgs args, TVMRetValue* rv) { + PrimExpr x = args[0]; + *rv = x.as()->value + 1; + }); + int r0 = PackedFunc([](TVMArgs args, TVMRetValue* rv) { + PackedFunc f = args[0]; + // TVMArgValue -> Arguments as function + *rv = f(args[1]).operator int(); + })(addone, 1); + ICHECK_EQ(r0, 2); +} + +TEST(PackedFunc, Type) { + using namespace tvm; + using namespace tvm::runtime; + auto get_type = PackedFunc([](TVMArgs args, TVMRetValue* rv) { + DataType x = args[0]; + *rv = x; + }); + auto get_type2 = PackedFunc([](TVMArgs args, TVMRetValue* rv) { *rv = args[0]; }); + ICHECK(get_type("int32").operator DataType() == 
DataType::Int(32));
+  ICHECK(get_type("float").operator DataType() == DataType::Float(32));
+  ICHECK(get_type2("float32x2").operator DataType() == DataType::Float(32, 2));
+}
+
+TEST(TypedPackedFunc, HighOrder) {
+  using namespace tvm;
+  using namespace tvm::runtime;
+  using Int1Func = TypedPackedFunc<int(int)>;
+  using Int2Func = TypedPackedFunc<int(int, int)>;
+  using BindFunc = TypedPackedFunc<Int1Func(Int2Func, int)>;
+  BindFunc ftyped;
+  ftyped = [](Int2Func f1, int value) -> Int1Func {
+    auto binded = [f1, value](int x) { return f1(value, x); };
+    Int1Func x(binded);
+    return x;
+  };
+  auto add = [](int x, int y) { return x + y; };
+  ICHECK_EQ(ftyped(Int2Func(add), 1)(2), 3);
+  PackedFunc f = ftyped(Int2Func(add), 1);
+  ICHECK_EQ(f(3).operator int(), 4);
+  // call the type erased version.
+  Int1Func f1 = ftyped.packed()(Int2Func(add), 1);
+  ICHECK_EQ(f1(3), 4);
+}
+
+TEST(TypedPackedFunc, Deduce) {
+  using namespace tvm::runtime;
+  using tvm::runtime::detail::function_signature;
+
+  TypedPackedFunc<int(float)> x;
+  auto f = [](int x) -> int { return x + 1; };
+  std::function<void(float)> y;
+
+  static_assert(std::is_same<function_signature<decltype(x)>::FType, int(float)>::value,
+                "invariant1");
+  static_assert(std::is_same<function_signature<decltype(f)>::FType, int(int)>::value,
+                "invariant2");
+  static_assert(std::is_same<function_signature<decltype(y)>::FType, void(float)>::value,
+                "invariant3");
+}
+
+TEST(PackedFunc, ObjectConversion) {
+  using namespace tvm;
+  using namespace tvm::tir;
+  using namespace tvm::runtime;
+  TVMRetValue rv;
+  auto x = NDArray::Empty({}, String2DLDataType("float32"), Device{kDLCPU, 0});
+  // assign null
+  rv = ObjectRef();
+  ICHECK_EQ(rv.type_code(), kTVMNullptr);
+
+  // Can assign NDArray to ret type
+  rv = x;
+  ICHECK_EQ(rv.type_code(), kTVMNDArrayHandle);
+  // Even if we assign base type it still shows as NDArray
+  rv = ObjectRef(x);
+  ICHECK_EQ(rv.type_code(), kTVMNDArrayHandle);
+  // Check convert back
+  ICHECK(rv.operator NDArray().same_as(x));
+  ICHECK(rv.operator ObjectRef().same_as(x));
+  ICHECK(!rv.IsObjectRef<PrimExpr>());
+
+  auto pf1 = PackedFunc([&](TVMArgs args, TVMRetValue* rv) {
+    ICHECK_EQ(args[0].type_code(), kTVMNDArrayHandle);
+    ICHECK(args[0].operator NDArray().same_as(x));
+    ICHECK(args[0].operator ObjectRef().same_as(x));
+    ICHECK(args[1].operator ObjectRef().get() == nullptr);
+    ICHECK(args[1].operator NDArray().get() == nullptr);
+    ICHECK(args[1].operator Module().get() == nullptr);
+    ICHECK(args[1].operator Array<NDArray>().get() == nullptr);
+    ICHECK(!args[0].IsObjectRef<PrimExpr>());
+  });
+  pf1(x, ObjectRef());
+  pf1(ObjectRef(x), NDArray());
+
+  // testcases for modules
+  auto* pf = tvm::runtime::Registry::Get("runtime.SourceModuleCreate");
+  ICHECK(pf != nullptr);
+  Module m = (*pf)("", "xyz");
+  rv = m;
+  ICHECK_EQ(rv.type_code(), kTVMModuleHandle);
+  // Even if we assign base type it still shows as module
+  rv = ObjectRef(m);
+  ICHECK_EQ(rv.type_code(), kTVMModuleHandle);
+  // Check convert back
+  ICHECK(rv.operator Module().same_as(m));
+  ICHECK(rv.operator ObjectRef().same_as(m));
+  ICHECK(!rv.IsObjectRef<NDArray>());
+
+  auto pf2 = PackedFunc([&](TVMArgs args, TVMRetValue* rv) {
+    ICHECK_EQ(args[0].type_code(), kTVMModuleHandle);
+    ICHECK(args[0].operator Module().same_as(m));
+    ICHECK(args[0].operator ObjectRef().same_as(m));
+    ICHECK(args[1].operator ObjectRef().get() == nullptr);
+    ICHECK(args[1].operator NDArray().get() == nullptr);
+    ICHECK(args[1].operator Module().get() == nullptr);
+    ICHECK(!args[0].IsObjectRef<NDArray>());
+  });
+  pf2(m, ObjectRef());
+  pf2(ObjectRef(m), Module());
+}
+
+TEST(TypedPackedFunc, RValue) {
+  using namespace tvm;
+  using namespace tvm::runtime;
+  {
+    auto inspect = [](TVMArgs args,
TVMRetValue* rv) {
+      for (int i = 0; i < args.size(); ++i) {
+        ICHECK_EQ(args[i].type_code(), kTVMObjectRValueRefArg);
+      }
+    };
+    PackedFunc finspect(inspect);
+    finspect(tir::Var("x"));
+  }
+  {
+    auto f = [](tir::Var x, bool move) {
+      if (move) {
+        ICHECK(x.unique());
+      } else {
+        ICHECK(!x.unique());
+      }
+      ICHECK(x->name_hint == "x");
+      return x;
+    };
+    TypedPackedFunc<tir::Var(tir::Var, bool)> tf(f);
+
+    tir::Var var("x");
+    ICHECK(var.unique());
+    tf(var, false);
+    // move the result to the function.
+    tir::Var ret = tf(std::move(var), true);
+    ICHECK(!var.defined());
+  }
+
+  {
+    // pass child class.
+    auto f = [](PrimExpr x, bool move) {
+      if (move) {
+        ICHECK(x.unique());
+      } else {
+        ICHECK(!x.unique());
+      }
+      return x;
+    };
+    TypedPackedFunc<PrimExpr(PrimExpr, bool)> tf(f);
+
+    tir::Var var("x");
+    ICHECK(var.unique());
+    tf(var, false);
+    tf(std::move(var), true);
+    // auto conversion.
+    tf(1, true);
+  }
+}
+
+int main(int argc, char** argv) {
+  testing::InitGoogleTest(&argc, argv);
+  testing::FLAGS_gtest_death_test_style = "threadsafe";
+  return RUN_ALL_TESTS();
+}
diff --git a/tests/cpp/ir_functor_test.cc b/tests/cpp/ir_functor_test.cc
index 237dc46b99ca..9e8595d6809c 100644
--- a/tests/cpp/ir_functor_test.cc
+++ b/tests/cpp/ir_functor_test.cc
@@ -19,10 +19,14 @@
 #include
 #include
+#include
 #include
+#include
+#include
 #include
 #include
 #include
+#include
 #include
 #include

@@ -52,6 +56,55 @@ TEST(IRF, CountVar) {
   ICHECK_EQ(n_var, 2);
 }

+TEST(IRF, VisitPrimFuncs) {
+  using namespace tvm;
+  using namespace tvm::tir;
+  PrimFunc prim_func(/*params=*/{}, /*body=*/Evaluate(Integer(0)));
+  relay::Function relay_func(/*params=*/{}, /*body=*/relay::Expr(nullptr),
+                             /*ret_type=*/relay::Type{nullptr}, /*ty_params=*/{});
+  IRModule mod({
+      {GlobalVar("main"), prim_func},
+      {GlobalVar("main2"), relay_func},
+  });
+  int n_visited = 0;
+  VisitPrimFuncs(mod, [&](const PrimFuncNode* func) { ++n_visited; });
+  ASSERT_EQ(n_visited, 1);
+}
+
+TEST(IRF, PreOrderVisit) {
+  using namespace tvm;
+  using namespace tvm::tir;
+  Stmt init = IfThenElse(const_true(), Evaluate(Integer(0)), Evaluate(Integer(0)));
+  Stmt body = Evaluate(Integer(1));
+  Block block(/*iter_vars=*/{}, /*reads=*/{},
+              /*writes=*/{}, /*name_hint=*/"block", /*body=*/body,
+              /*init=*/init);
+  bool init_visited = false;
+  bool stopped_at_if = true;
+  bool body_visited = false;
+  PreOrderVisit(block, [&](const ObjectRef& n) -> bool {
+    if (n->IsInstance<IfThenElseNode>()) {
+      init_visited = true;
+      return false;
+    }
+    if (const auto* eval = n.as<EvaluateNode>()) {
+      if (const auto* int_imm = eval->value.as<IntImmNode>()) {
+        if (int_imm->value == 0) {
+          stopped_at_if = false;
+        } else if (int_imm->value == 1) {
+          body_visited = true;
+        } else {
+          LOG(FATAL) << "Unreachable";
+        }
+      }
+    }
+    return true;
+  });
+  ASSERT_EQ(init_visited, true);
+  ASSERT_EQ(stopped_at_if, true);
+  ASSERT_EQ(body_visited, true);
+}
+
 TEST(IRF, ExprTransform) {
   using namespace tvm;
   using namespace tvm::tir;
@@ -72,7 +125,7 @@ TEST(IRF, ExprTransform) {
   try {
     f(z - 1, 2);
     LOG(FATAL) << "should fail";
-  } catch (dmlc::Error&) {
+  } catch (Error&) {
   }
 }
diff --git a/tests/cpp/packed_func_test.cc b/tests/cpp/packed_func_test.cc
index 53a3f40388cb..cf22577a791a 100644
--- a/tests/cpp/packed_func_test.cc
+++ b/tests/cpp/packed_func_test.cc
@@ -64,7 +64,7 @@ TEST(PackedFunc, Node)
 TEST(PackedFunc, NDArray) {
   using namespace tvm;
   using namespace tvm::runtime;
-  auto x = NDArray::Empty({}, String2DLDataType("float32"), TVMContext{kDLCPU, 0});
+  auto x = NDArray::Empty({}, String2DLDataType("float32"), Device{kDLCPU, 0});
   reinterpret_cast<float*>(x->data)[0] = 10.0f;
ICHECK(x.use_count() == 1); @@ -199,7 +199,7 @@ TEST(PackedFunc, ObjectConversion) { using namespace tvm::tir; using namespace tvm::runtime; TVMRetValue rv; - auto x = NDArray::Empty({}, String2DLDataType("float32"), TVMContext{kDLCPU, 0}); + auto x = NDArray::Empty({}, String2DLDataType("float32"), Device{kDLCPU, 0}); // assign null rv = ObjectRef(); ICHECK_EQ(rv.type_code(), kTVMNullptr); diff --git a/tests/cpp/parallel_for_test.cc b/tests/cpp/parallel_for_test.cc index bf5fe94b83ff..a4549344bd11 100644 --- a/tests/cpp/parallel_for_test.cc +++ b/tests/cpp/parallel_for_test.cc @@ -19,7 +19,7 @@ #include #include -#include +#include #include #include diff --git a/tests/cpp/profiling.cc b/tests/cpp/profiling.cc index 6ec2fc060f9f..f770bfda8e5b 100644 --- a/tests/cpp/profiling.cc +++ b/tests/cpp/profiling.cc @@ -27,11 +27,11 @@ namespace tvm { namespace runtime { TEST(DefaultTimer, Basic) { using namespace tvm::runtime; - DLContext ctx; - ctx.device_type = kDLCPU; - ctx.device_id = 0; + Device dev; + dev.device_type = kDLCPU; + dev.device_id = 0; - Timer t = Timer::Start(ctx); + Timer t = Timer::Start(dev); std::this_thread::sleep_for(std::chrono::milliseconds(10)); t->Stop(); int64_t elapsed = t->SyncAndGetElapsedNanos(); diff --git a/tests/cpp/relay_build_module_test.cc b/tests/cpp/relay_build_module_test.cc index a15cdcd3926b..b7b5abfd697d 100644 --- a/tests/cpp/relay_build_module_test.cc +++ b/tests/cpp/relay_build_module_test.cc @@ -123,10 +123,10 @@ TEST(Relay, BuildModule) { std::string json = json_f(); tvm::runtime::Module mod = mod_f(); // run - auto ctx = A->ctx; - auto pfr = tvm::runtime::Registry::Get("tvm.graph_runtime.create"); + auto dev = A->device; + auto pfr = tvm::runtime::Registry::Get("tvm.graph_executor.create"); ICHECK(mod.defined()) << "Module must be defined"; - tvm::runtime::Module run_mod = (*pfr)(json, mod, (int)ctx.device_type, (int)ctx.device_id); + tvm::runtime::Module run_mod = (*pfr)(json, mod, (int)dev.device_type, (int)dev.device_id); auto set_input_f = run_mod.GetFunction("set_input_zero_copy", false); auto run_f = run_mod.GetFunction("run", false); auto get_output_f = run_mod.GetFunction("get_output", false); diff --git a/tests/crt/memory_test.cc b/tests/crt/memory_test.cc index 101a5f008394..d876e5c96da9 100644 --- a/tests/crt/memory_test.cc +++ b/tests/crt/memory_test.cc @@ -40,7 +40,7 @@ class MemoryManagerTest : public ::testing::Test { MemoryManagerCreate(&interface, memory_pool, kMemoryPoolSizeBytes, kPageSizeBytesLog); mgr = (MemoryManager*)interface; ASSERT_EQ(kNumUsablePages, mgr->ptable.max_pages); - ctx_ = {kDLCPU, 0}; + dev_ = {kDLCPU, 0}; } unsigned int AddressToPageNumber(void* a) { @@ -52,7 +52,7 @@ class MemoryManagerTest : public ::testing::Test { uint8_t* memory_pool; MemoryManagerInterface* interface; MemoryManager* mgr; - DLContext ctx_; + DLDevice dev_; }; #define EXPECT_PAGE(expected, actual) EXPECT_EQ(expected, AddressToPageNumber(actual)) @@ -64,7 +64,7 @@ TEST_F(MemoryManagerTest, AllocFreeFifo) { void* ptrs[kNumUsablePages]; for (size_t idx = 0; idx < kNumUsablePages; idx++) { void* a; - EXPECT_EQ(interface->Allocate(interface, 1, ctx_, &a), kTvmErrorNoError); + EXPECT_EQ(interface->Allocate(interface, 1, dev_, &a), kTvmErrorNoError); if (i == 0) { EXPECT_PAGE(idx, a); } else { @@ -75,7 +75,7 @@ TEST_F(MemoryManagerTest, AllocFreeFifo) { } for (int idx = kNumUsablePages - 1; idx >= 0; idx--) { - interface->Free(interface, ptrs[idx], ctx_); + interface->Free(interface, ptrs[idx], dev_); EXPECT_EQ(interface->vleak_size, idx); } } 
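The `TVMContext`/`DLContext` to `Device` rename in the C++ tests above has a matching Python-side spelling that the rest of this series uses. As a minimal sketch of the new-style calls (CPU-only, so it runs anywhere TVM with this patch set is installed):

```python
import numpy as np
import tvm

# tvm.device(...) replaces the old tvm.context(...).
dev = tvm.device("llvm", 0)

# NDArray creation now takes `device=` (previously `ctx=`).
a = tvm.nd.array(np.zeros((2, 2), dtype="float32"), device=dev)

# Assuming the attribute rename follows the same pattern as the C++ `->device`.
print(a.device)
```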
diff --git a/tests/lint/check_file_type.py b/tests/lint/check_file_type.py index ab51b6c79c83..8d8b34322de3 100644 --- a/tests/lint/check_file_type.py +++ b/tests/lint/check_file_type.py @@ -125,9 +125,16 @@ "docs/_static/img/tvm-logo-square.png", # pytest config "pytest.ini", - # Zephyr tests - "tests/micro/qemu/zephyr-runtime/prj.conf", - "tests/micro/qemu/zephyr-runtime/qemu-hack/qemu-system-i386", + # microTVM tests + "tests/micro/zephyr/testdata/digit-2.jpg", + "tests/micro/zephyr/testdata/digit-9.jpg", + "tests/micro/zephyr/testdata/mnist-8.onnx", + # microTVM Zephyr runtime + "apps/microtvm/zephyr/demo_runtime/prj.conf", + "apps/microtvm/zephyr/demo_runtime/boards/nrf5340dk_nrf5340_cpuapp.conf", + "apps/microtvm/zephyr/demo_runtime/boards/nucleo_f746zg.conf", + "apps/microtvm/zephyr/demo_runtime/boards/qemu_x86.conf", + "apps/microtvm/zephyr/demo_runtime/qemu-hack/qemu-system-i386", # microTVM Virtual Machines "apps/microtvm/reference-vm/zephyr/Vagrantfile", "apps/microtvm/reference-vm/zephyr/base-box/Vagrantfile.packer-template", diff --git a/tests/micro/qemu/.gitignore b/tests/micro/qemu/.gitignore deleted file mode 100644 index c920d8f93ff8..000000000000 --- a/tests/micro/qemu/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -/test_zephyr*-workspace -/*.micro-binary diff --git a/tests/micro/qemu/zephyr-runtime/.gitignore b/tests/micro/qemu/zephyr-runtime/.gitignore deleted file mode 100644 index 64be5d3a487c..000000000000 --- a/tests/micro/qemu/zephyr-runtime/.gitignore +++ /dev/null @@ -1,3 +0,0 @@ -__tvm* -libtvm__* -/build diff --git a/tests/micro/test_runtime_micro_on_arm.py b/tests/micro/test_runtime_micro_on_arm.py index 45ca8e74323c..7d19d9510062 100644 --- a/tests/micro/test_runtime_micro_on_arm.py +++ b/tests/micro/test_runtime_micro_on_arm.py @@ -19,7 +19,7 @@ import numpy as np import tvm from tvm import te -from tvm.contrib import graph_runtime, utils +from tvm.contrib import graph_executor, utils from tvm import relay import tvm.micro as micro from tvm.micro import create_micro_mod @@ -36,7 +36,7 @@ def relay_micro_build(func, dev_config, params=None): - """Create a graph runtime module with a micro device context from a Relay function. + """Create a graph executor module with a micro device context from a Relay function. 
    Parameters
    ----------
@@ -52,7 +52,7 @@ def relay_micro_build(func, dev_config, params=None):
     Return
     ------
     mod : tvm.runtime.Module
-        graph runtime module for the target device
+        graph executor module for the target device
     """
     with tvm.transform.PassContext(
         disabled_pass={"FuseOps"}, config={"tir.disable_vectorize": True}
@@ -60,7 +60,7 @@ def relay_micro_build(func, dev_config, params=None):
         graph, c_mod, params = relay.build(func, target=TARGET, params=params)
     micro_mod = micro.create_micro_mod(c_mod, dev_config)
     ctx = tvm.micro_dev(0)
-    mod = graph_runtime.create(graph, micro_mod, ctx)
+    mod = graph_executor.create(graph, micro_mod, ctx)
     mod.set_input(**params)
     return mod

@@ -171,8 +171,8 @@ def test_workspace_add():
     tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + 2.0)


-def test_graph_runtime():
-    """Test a program which uses the graph runtime."""
+def test_graph_executor():
+    """Test a program which uses the graph executor."""
     if not tvm.runtime.enabled("micro_dev"):
         return
     shape = (1024,)
@@ -347,9 +347,9 @@ def test_inactive_session_use():
     print()
     print("finished workspace add test")
     input("[press enter to continue]")
-    test_graph_runtime()
+    test_graph_executor()
     print()
-    print("finished graph runtime test")
+    print("finished graph executor test")
     input("[press enter to continue]")
     test_conv2d()
     print()
diff --git a/tests/micro/zephyr/README.md b/tests/micro/zephyr/README.md
new file mode 100644
index 000000000000..9769cae2b53b
--- /dev/null
+++ b/tests/micro/zephyr/README.md
@@ -0,0 +1,42 @@
+<!--- Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements.  See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership.  The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License.  You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied.  See the License for the
+ specific language governing permissions and limitations
+ under the License. -->
+
+This directory contains tests for MicroTVM's integration with Zephyr.
+
+To run the test, you first need to be running in a Python environment with
+all of the appropriate TVM dependencies installed. If you have [Poetry](https://python-poetry.org/)
+installed, you can do the following to get an appropriately-configured Python
+environment:
+
+```
+$ cd tvm/apps/microtvm/
+$ poetry lock && poetry install && poetry shell
+```
+
+You can then run this test (either on real hardware or on a QEMU-emulated
+device) using:
+
+```
+$ cd tvm/tests/micro/zephyr
+$ pytest test_zephyr.py --microtvm-platforms=host       # For QEMU emulation
+$ pytest test_zephyr.py --microtvm-platforms=nrf5340dk  # For nRF5340DK
+```
+
+To see the list of supported values for `--microtvm-platforms`, run:
+```
+$ pytest test_zephyr.py --help
+```
diff --git a/tests/micro/qemu/conftest.py b/tests/micro/zephyr/conftest.py
similarity index 94%
rename from tests/micro/qemu/conftest.py
rename to tests/micro/zephyr/conftest.py
index 3fc54df02063..e8ce443adfaf 100644
--- a/tests/micro/qemu/conftest.py
+++ b/tests/micro/zephyr/conftest.py
@@ -16,11 +16,14 @@
 # under the License.

 import pytest
+import tvm.target.target
+

 def pytest_addoption(parser):
     parser.addoption(
         "--microtvm-platforms",
         default="host",
+        choices=tvm.target.target.MICRO_SUPPORTED_MODELS.keys(),
         help=(
             "Specify a comma-separated list of test models (i.e. as passed to tvm.target.micro()) "
             "for microTVM tests."
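For orientation, here is a minimal sketch of how a conftest hook could fan the `--microtvm-platforms` value out to the `platform` fixture that the tests below consume. The hook body is an assumption for illustration only, not a copy of the real conftest:

```python
# Hypothetical companion to the pytest_addoption() above; the real conftest may differ.
def pytest_generate_tests(metafunc):
    if "platform" in metafunc.fixturenames:
        # Parametrize each test once per requested platform.
        platforms = metafunc.config.getoption("--microtvm-platforms").split(",")
        metafunc.parametrize("platform", platforms)
```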
diff --git a/tests/micro/qemu/test_zephyr.py b/tests/micro/zephyr/test_zephyr.py similarity index 75% rename from tests/micro/qemu/test_zephyr.py rename to tests/micro/zephyr/test_zephyr.py index 51d5f990e710..1db3d505f490 100644 --- a/tests/micro/qemu/test_zephyr.py +++ b/tests/micro/zephyr/test_zephyr.py @@ -19,12 +19,15 @@ import copy import datetime import glob +import logging import os import subprocess import sys import pytest import numpy as np +import onnx +from PIL import Image import tvm import tvm.rpc @@ -36,11 +39,14 @@ from tvm.relay.expr_functor import ExprMutator from tvm.relay.op.annotation import compiler_begin, compiler_end +# If set, build the uTVM binary from scratch on each test. +# Otherwise, reuses the build from the previous test run. BUILD = True -DEBUG = False - -TARGET = None +# If set, enable a debug session while the test is running. +# Before running the test, in a separate shell, you should run: +# python -m tvm.exec.microtvm_debug_shell +DEBUG = False def _make_sess_from_op(model, zephyr_board, west_cmd, op_name, sched, arg_bufs): @@ -63,15 +69,17 @@ def _make_session(model, target, zephyr_board, west_cmd, mod): os.makedirs(workspace_parent) workspace = tvm.micro.Workspace(debug=True, root=workspace_root) - project_dir = os.path.join(os.path.dirname(__file__) or ".", "zephyr-runtime") + test_dir = os.path.dirname(os.path.realpath(os.path.expanduser(__file__))) + tvm_source_dir = os.path.join(test_dir, "..", "..", "..") + runtime_path = os.path.join(tvm_source_dir, "apps", "microtvm", "zephyr", "demo_runtime") compiler = zephyr.ZephyrCompiler( - project_dir=project_dir, + project_dir=runtime_path, board=zephyr_board, zephyr_toolchain_variant="zephyr", west_cmd=west_cmd, ) - opts = tvm.micro.default_options(f"{project_dir}/crt") + opts = tvm.micro.default_options(os.path.join(runtime_path, "crt")) # TODO(weberlo) verify this is necessary opts["bin_opts"]["ccflags"] = ["-std=gnu++14"] opts["lib_opts"]["ccflags"] = ["-std=gnu++14"] @@ -132,11 +140,11 @@ def test_compile_runtime(platform, west_cmd): # NOTE: run test in a nested function so cPython will delete arrays before closing the session. def test_basic_add(sess): - A_data = tvm.nd.array(np.array([2, 3], dtype="int8"), ctx=sess.context) + A_data = tvm.nd.array(np.array([2, 3], dtype="int8"), device=sess.device) assert (A_data.asnumpy() == np.array([2, 3])).all() - B_data = tvm.nd.array(np.array([4], dtype="int8"), ctx=sess.context) + B_data = tvm.nd.array(np.array([4], dtype="int8"), device=sess.device) assert (B_data.asnumpy() == np.array([4])).all() - C_data = tvm.nd.array(np.array([0, 0], dtype="int8"), ctx=sess.context) + C_data = tvm.nd.array(np.array([0, 0], dtype="int8"), device=sess.device) assert (C_data.asnumpy() == np.array([0, 0])).all() system_lib = sess.get_system_lib() @@ -154,16 +162,16 @@ def test_platform_timer(platform, west_cmd): # NOTE: run test in a nested function so cPython will delete arrays before closing the session. 
def test_basic_add(sess): - A_data = tvm.nd.array(np.array([2, 3], dtype="int8"), ctx=sess.context) + A_data = tvm.nd.array(np.array([2, 3], dtype="int8"), device=sess.device) assert (A_data.asnumpy() == np.array([2, 3])).all() - B_data = tvm.nd.array(np.array([4], dtype="int8"), ctx=sess.context) + B_data = tvm.nd.array(np.array([4], dtype="int8"), device=sess.device) assert (B_data.asnumpy() == np.array([4])).all() - C_data = tvm.nd.array(np.array([0, 0], dtype="int8"), ctx=sess.context) + C_data = tvm.nd.array(np.array([0, 0], dtype="int8"), device=sess.device) assert (C_data.asnumpy() == np.array([0, 0])).all() system_lib = sess.get_system_lib() time_eval_f = system_lib.time_evaluator( - "add", sess.context, number=20, repeat=3, min_repeat_ms=40 + "add", sess.device, number=20, repeat=3, min_repeat_ms=40 ) result = time_eval_f(A_data, B_data, C_data) assert (C_data.asnumpy() == np.array([6, 7])).all() @@ -191,8 +199,8 @@ def test_relay(platform, west_cmd): graph, mod, params = tvm.relay.build(func, target=target) with _make_session(model, target, zephyr_board, west_cmd, mod) as session: - graph_mod = tvm.micro.create_local_graph_runtime( - graph, session.get_system_lib(), session.context + graph_mod = tvm.micro.create_local_graph_executor( + graph, session.get_system_lib(), session.device ) graph_mod.set_input(**params) x_in = np.random.randint(10, size=shape[0], dtype=dtype) @@ -202,6 +210,53 @@ def test_relay(platform, west_cmd): tvm.testing.assert_allclose(result, x_in * x_in + 1) +def test_onnx(platform, west_cmd): + """Testing a simple ONNX model.""" + model, zephyr_board = PLATFORMS[platform] + + # Load test images. + this_dir = os.path.dirname(__file__) + digit_2 = Image.open(f"{this_dir}/testdata/digit-2.jpg").resize((28, 28)) + digit_2 = np.asarray(digit_2).astype("float32") + digit_2 = np.expand_dims(digit_2, axis=0) + + digit_9 = Image.open(f"{this_dir}/testdata/digit-9.jpg").resize((28, 28)) + digit_9 = np.asarray(digit_9).astype("float32") + digit_9 = np.expand_dims(digit_9, axis=0) + + # Load ONNX model and convert to Relay. + onnx_model = onnx.load(f"{this_dir}/testdata/mnist-8.onnx") + shape = {"Input3": (1, 1, 28, 28)} + relay_mod, params = relay.frontend.from_onnx(onnx_model, shape=shape, freeze_params=True) + relay_mod = relay.transform.DynamicToStatic()(relay_mod) + + # We add the -link-params=1 option to ensure the model parameters are compiled in. + # There is currently a bug preventing the demo_runtime environment from receiving + # the model weights when set using graph_mod.set_input(). + # See: https://github.com/apache/tvm/issues/7567 + target = tvm.target.target.micro(model, options=["-link-params=1"]) + with tvm.transform.PassContext(opt_level=3, config={"tir.disable_vectorize": True}): + lowered = relay.build(relay_mod, target, params=params) + graph = lowered.get_json() + + with _make_session(model, target, zephyr_board, west_cmd, lowered.lib) as session: + graph_mod = tvm.micro.create_local_graph_executor( + graph, session.get_system_lib(), session.device + ) + + # Send the digit-2 image and confirm that the correct result is returned. + graph_mod.set_input("Input3", tvm.nd.array(digit_2)) + graph_mod.run() + result = graph_mod.get_output(0).asnumpy() + assert np.argmax(result) == 2 + + # Send the digit-9 image and confirm that the correct result is returned. 
+    graph_mod.set_input("Input3", tvm.nd.array(digit_9))
+    graph_mod.run()
+    result = graph_mod.get_output(0).asnumpy()
+    assert np.argmax(result) == 9
+
+
 class CcompilerAnnotator(ExprMutator):
     """
     This is used to create external functions for ccompiler.
@@ -264,8 +319,8 @@ def check_result(relay_mod, model, zephyr_board, west_cmd, map_inputs, out_shape
         graph, mod, params = tvm.relay.build(relay_mod, target=target)

     with _make_session(model, target, zephyr_board, west_cmd, mod) as session:
-        rt_mod = tvm.micro.create_local_graph_runtime(
-            graph, session.get_system_lib(), session.context
+        rt_mod = tvm.micro.create_local_graph_executor(
+            graph, session.get_system_lib(), session.device
         )
         rt_mod.set_input(**params)
         for name, data in map_inputs.items():
@@ -277,7 +332,7 @@ def check_result(relay_mod, model, zephyr_board, west_cmd, map_inputs, out_shape
     results = result if isinstance(result, list) else [result]

     for idx, shape in enumerate(out_shapes):
-        out = tvm.nd.empty(shape, ctx=session.context)
+        out = tvm.nd.empty(shape, device=session.device)
         out = rt_mod.get_output(idx, out)
         tvm.testing.assert_allclose(out.asnumpy(), results[idx], rtol=TOL, atol=TOL)
diff --git a/tests/micro/zephyr/testdata/digit-2.jpg b/tests/micro/zephyr/testdata/digit-2.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..b709a206b8d776215dcaa78643b22fe628b3c43a
Binary files /dev/null and b/tests/micro/zephyr/testdata/digit-2.jpg differ
diff --git a/tests/micro/zephyr/testdata/digit-9.jpg b/tests/micro/zephyr/testdata/digit-9.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..6ce9cde3b322b351847da179555aba35358dbf23
Binary files /dev/null and b/tests/micro/zephyr/testdata/digit-9.jpg differ
diff --git a/tests/micro/zephyr/testdata/mnist-8.onnx b/tests/micro/zephyr/testdata/mnist-8.onnx
new file mode 100644
Binary files /dev/null and b/tests/micro/zephyr/testdata/mnist-8.onnx differ
diff --git a/tests/python/contrib/test_bnns/test_normalization.py b/tests/python/contrib/test_bnns/test_normalization.py
new file mode 100644
+    def check_normalization(rank, axis):
+        if rank < 3 or rank > 4:
+            return False
+        if axis == 0 and rank == 3 or axis == 1 and rank == 4:
+            return True
+        return False
+
+    for shape in shapes_config:
+        for axis in axes:
+            if len(shape) == 2 and axis != 0:
+                continue
+            for center in [False, True]:
+                for scale in [False, True]:
+                    inputs = {"src"}
+
+                    args = (shape, axis, center, scale, dtype)
+
+                    func, params = _get_model(
+                        shape,
+                        [shape[axis]],
+                        [shape[axis]],
+                        dtype,
+                        var_names=iter(inputs),
+                        axis=axis,
+                        center=center,
+                        scale=scale,
+                    )
+
+                    offload_on_bnns = check_normalization(len(shape), axis)
+                    if offload_on_bnns is True:
+                        bnns_blocks = 1
+                    else:
+                        bnns_blocks = 0
+                    exp_codegen = _get_expected_codegen(*args, offload_on_bnns)
+                    verify_codegen(func, exp_codegen, bnns_blocks)
+
+
+if __name__ == "__main__":
+    test_normalization()
+    test_codegen_normalization()
diff --git a/tests/python/contrib/test_bnns/test_onnx_topologies.py b/tests/python/contrib/test_bnns/test_onnx_topologies.py
new file mode 100644
index 000000000000..25c4bc483333
--- /dev/null
+++ b/tests/python/contrib/test_bnns/test_onnx_topologies.py
@@ -0,0 +1,140 @@
+# Licensed to the
Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""BNNS pattern detection check"""
+
+import pytest
+
+import tvm
+from tvm import relay
+from tvm.relay import transform
+from tvm.contrib import utils, graph_executor
+from tvm.contrib.download import download_testdata
+from tvm.relay.op.contrib.bnns import partition_for_bnns
+
+import numpy as np
+
+onnx = pytest.importorskip("onnx")
+
+bnns_is_absent = tvm.get_global_func("relay.ext.bnns", True) is None
+
+TARGET = "llvm"
+INPUT_SHAPE = [1, 3, 224, 224]
+
+BASE_MODEL_URL = "https://github.com/onnx/models/raw/master/"
+MODEL_URL_COLLECTION = {
+    "BERT": "text/machine_comprehension/bert-squad/model/bertsquad-10.onnx",
+    "MobileNet-v2": "vision/classification/mobilenet/model/mobilenetv2-7.onnx",
+    "ResNet50-v1": "vision/classification/resnet/model/resnet50-v1-7.onnx",
+    "ResNet50-v2": "vision/classification/resnet/model/resnet50-v2-7.onnx",
+    "SqueezeNet-v1.1": "vision/classification/squeezenet/model/squeezenet1.1-7.onnx",
+    "SqueezeNet-v1.0": "vision/classification/squeezenet/model/squeezenet1.0-7.onnx",
+    "Inception-v1": "vision/classification/inception_and_googlenet/inception_v1/model/inception-v1-7.onnx",
+    "Inception-v2": "vision/classification/inception_and_googlenet/inception_v2/model/inception-v2-7.onnx",
+}
+
+
+def get_onnx_input_name(model):
+    inputs = [node.name for node in model.graph.input]
+    initializer = [node.name for node in model.graph.initializer]
+
+    inputs = list(set(inputs) - set(initializer))
+    return inputs
+
+
+def get_model_url(model_name):
+    return BASE_MODEL_URL + MODEL_URL_COLLECTION[model_name]
+
+
+def get_name_from_url(url):
+    return url[url.rfind("/") + 1 :].strip()
+
+
+def find_of_download(model_name):
+    model_url = get_model_url(model_name)
+    model_file_name = get_name_from_url(model_url)
+    return download_testdata(model_url, model_file_name, module="models")
+
+
+def get_model(model_name):
+    model_path = find_of_download(model_name)
+    onnx_model = onnx.load(model_path)
+    input_names = get_onnx_input_name(onnx_model)
+    input_dict = {}
+    for name in input_names:
+        input_dict[name] = INPUT_SHAPE  # TODO: input shape is hardcoded for all models
+    mod, params = relay.frontend.from_onnx(onnx_model, input_dict, freeze_params=True)
+    return mod, params, input_dict
+
+
+def simplify_model(mod):
+    """
+    Simplify the execution graph.
+
+    At a minimum, merge BatchNorm into convolution: the BN primitive is
+    decomposed into simple operations that can be folded as constant
+    expressions and then merged into the nearest conv/dense primitive.
+ """ + seq = tvm.transform.Sequential( + [ + transform.InferType(), + transform.FoldConstant(), + transform.SimplifyInference(), + transform.FoldScaleAxis(), + ] + ) + return seq(mod) + + +def process(model_name): + temp = utils.tempdir() + model, params, input_dict = get_model(model_name) + + def run(mod, target, simplify=True, with_bnns=False): + with tvm.transform.PassContext(opt_level=3): + if simplify: + mod = simplify_model(mod) + if with_bnns: + mod = partition_for_bnns(mod) + graph_module = relay.build(mod, target=target, target_host=target, params=params) + + lib_name = "deploy.tar" + path_dso = temp.relpath(lib_name) + graph_module.export_library(path_dso) + + dev = tvm.cpu(0) + loaded_lib = tvm.runtime.load_module(path_dso) + + module = graph_executor.GraphModule(loaded_lib["default"](dev)) + module.run() + return module.get_output(0).asnumpy() + + res_llvm = run(model, TARGET, simplify=True, with_bnns=False) + res_bnns = run(model, TARGET, simplify=True, with_bnns=True) + + tvm.testing.assert_allclose( + res_llvm, + res_bnns, + atol=0.002, + rtol=0.007, + ) + + +@pytest.mark.skip(reason="Manually disabled because of huge complexity") +@pytest.mark.skipif(bnns_is_absent, reason="BNNS runtime is absent") +@pytest.mark.parametrize("model_name", MODEL_URL_COLLECTION.keys()) +def test_topology(model_name): + process(model_name) diff --git a/tests/python/contrib/test_bnns/test_pooling.py b/tests/python/contrib/test_bnns/test_pooling.py new file mode 100644 index 000000000000..77a78d4bf7e1 --- /dev/null +++ b/tests/python/contrib/test_bnns/test_pooling.py @@ -0,0 +1,289 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+"""BNNS integration pooling tests.""" + +import numpy as np +import pytest +import tvm +from tvm import relay +from tvm import testing +from .infrastructure import ( + skip_runtime_test, + skip_codegen_test, + build_and_run, + verify, + verify_codegen, +) +from .infrastructure import Device + + +def _calculate_output_shape(shape, sizes, padding, strides): + """Calculate pooling output shape.""" + output_height = ((shape[2] - sizes[0] + padding[0] + padding[2]) / strides[0]) + 1 + output_width = ((shape[3] - sizes[1] + padding[1] + padding[3]) / strides[1]) + 1 + return 1, shape[1], int(output_height), int(output_width) + + +def _get_pooling_model( + shape, dtype, typef, sizes, strides, padding, ceil_mode, count_include_pad, var_names +): + """Return a model and any parameters it may have.""" + if len(padding) == 2: + padding = (padding[0], padding[1], padding[0], padding[1]) + out = relay.var(next(var_names), shape=shape, dtype=dtype) + + if typef == "nn.max_pool2d": + out = relay.nn.max_pool2d( + out, + pool_size=sizes, + strides=strides, + padding=padding, + ceil_mode=ceil_mode, + ) + elif typef == "nn.avg_pool2d": + out = relay.nn.avg_pool2d( + out, + pool_size=sizes, + strides=strides, + padding=padding, + ceil_mode=ceil_mode, + count_include_pad=count_include_pad, + ) + else: + raise ValueError("Function not supported") + + return out + + +def _get_global_pooling_model(shape, dtype, typef, var_names): + """Return a model and any parameters it may have.""" + out = relay.var(next(var_names), shape=shape, dtype=dtype) + + if typef == "nn.global_max_pool2d": + out = relay.nn.global_max_pool2d(out) + elif typef == "nn.global_avg_pool2d": + out = relay.nn.global_avg_pool2d(out) + else: + raise ValueError("Function not supported") + + return out + + +def _get_expected_pooling_codegen( + shape, dtype, typef, sizes, strides, padding, ceil_mode, count_include_pad +): + if len(padding) == 2: + padding = (padding[0], padding[1], padding[0], padding[1]) + output_shape = _calculate_output_shape(shape, sizes, padding, strides) + + node = { + "op": "kernel", + "name": typef, + "inputs": [[0, 0, 0]], + "attrs": { + "num_inputs": "1", + "num_outputs": "1", + "layout": [["NCHW"]], + "shape": [[list(output_shape)]], + "dtype": [[dtype]], + "padding": [[str(p) for p in padding]], + "strides": [[str(s) for s in strides]], + "pool_size": [[str(s) for s in sizes]], + "ceil_mode": [[str(1 if ceil_mode else 0)]], + }, + } + + if typef == "nn.avg_pool2d" or typef == "nn.l2_pool2d": + node["attrs"]["count_include_pad"] = [["1" if count_include_pad else "0"]] + + input = {"op": "input", "name": "", "attrs": {"shape": [[list(shape)]], "dtype": [[dtype]]}} + return [input, node] + + +def _get_expected_global_pooling_codegen(shape, dtype, typef): + node = { + "op": "kernel", + "name": typef, + "inputs": [[0, 0, 0]], + "attrs": { + "num_inputs": "1", + "num_outputs": "1", + "layout": [["NCHW"]], + "shape": [[[1, shape[1], 1, 1]]], + "dtype": [[dtype]], + }, + } + + input = {"op": "input", "name": "", "attrs": {"shape": [[list(shape)]], "dtype": [[dtype]]}} + return [input, node] + + +@pytest.mark.skipif(skip_runtime_test(), reason="Skip because BNNS codegen is not available") +def test_pooling(): + device = Device() + np.random.seed(0) + + dtype = "float32" + trials = [ + ["nn.max_pool2d", (3, 3), (2, 2), (0, 0), False, False, (27, 27, 512)], + ["nn.max_pool2d", (2, 2), (2, 2), (0, 0), False, True, (16, 16, 16)], + ["nn.max_pool2d", (3, 3), (2, 2), (1, 1), True, True, (15, 15, 16)], + ["nn.max_pool2d", (2, 2), (2, 
2), (0, 1), False, False, (16, 16, 16)], + ["nn.avg_pool2d", (2, 2), (2, 2), (1, 1), False, False, (16, 16, 16)], + ["nn.avg_pool2d", (2, 2), (2, 2), (0, 0), False, True, (16, 16, 16)], + ["nn.avg_pool2d", (3, 3), (2, 2), (0, 1), True, False, (15, 15, 16)], + ] + + for ( + typef, + size, + stride, + pad, + ceil_mode, + count_include_pad, + input_shape, + ) in trials: + shape = (1, *input_shape) + outputs = [] + inputs = { + "a": tvm.nd.array(np.random.uniform(-127, 128, shape).astype(dtype)), + } + + func = _get_pooling_model( + shape, dtype, typef, size, stride, pad, ceil_mode, count_include_pad, iter(inputs) + ) + + config = { + "size": size, + "stride": stride, + "shape": shape, + "pooling type": typef, + "dtype": dtype, + "padding": pad, + "ceil_mode": ceil_mode, + "count_include_pad": count_include_pad, + "inputs": inputs, + } + + params = None + for enable_bnns in [False, True]: + outputs.append( + build_and_run( + func, inputs, 1, params, device, enable_bnns=enable_bnns, config=config + )[0] + ) + + verify(outputs, atol=0.001, rtol=0.001, config=config) + + +@pytest.mark.skipif(skip_runtime_test(), reason="Skip because BNNS codegen is not available") +def test_global_pooling(): + device = Device() + np.random.seed(0) + + dtype = "float32" + + trials = [ + ["nn.global_max_pool2d", (8, 8, 16)], + ["nn.global_max_pool2d", (9, 9, 16)], + ["nn.global_max_pool2d", (8, 8, 16)], + ["nn.global_avg_pool2d", (8, 8, 16)], + ["nn.global_avg_pool2d", (8, 8, 16)], + ["nn.global_avg_pool2d", (9, 9, 16)], + ] + + for typef, input_shape in trials: + shape = (1, *input_shape) + outputs = [] + inputs = { + "a": tvm.nd.array(np.random.uniform(-127, 128, shape).astype(dtype)), + } + + func = _get_global_pooling_model(shape, dtype, typef, iter(inputs)) + config = { + "shape": shape, + "pooling type": typef, + "dtype": dtype, + } + + for enable_bnns in [False, True]: + outputs.append( + build_and_run( + func, inputs, 1, None, device, enable_bnns=enable_bnns, config=config + )[0] + ) + + verify(outputs, atol=0.001, rtol=0.001, config=config) + + +@pytest.mark.skipif(skip_codegen_test(), reason="Skip because BNNS codegen is not available") +def test_codegen_pooling(): + dtype = "float32" + + trials = [ + ["nn.max_pool2d", (2, 2), (2, 2), (0, 0), False, True, (16, 16, 16)], + ["nn.max_pool2d", (3, 3), (2, 2), (1, 1), True, True, (15, 15, 16)], + ["nn.max_pool2d", (2, 2), (2, 2), (0, 1), False, False, (16, 16, 16)], + ["nn.avg_pool2d", (2, 2), (2, 2), (1, 1), False, False, (16, 16, 16)], + ["nn.avg_pool2d", (2, 2), (2, 2), (0, 0), False, True, (16, 16, 16)], + ["nn.avg_pool2d", (3, 3), (2, 2), (0, 1), True, False, (15, 15, 16)], + ] + + for ( + typef, + size, + stride, + pad, + ceil_mode, + count_include_pad, + input_shape, + ) in trials: + shape = (1, *input_shape) + inputs = {"a"} + args = (shape, dtype, typef, size, stride, pad, False, False) + func = _get_pooling_model(*args, iter(inputs)) + exp_codegen = _get_expected_pooling_codegen(*args) + verify_codegen(func, exp_codegen, 1) + + +@pytest.mark.skipif(skip_codegen_test(), reason="Skip because BNNS codegen is not available") +def test_codegen_global_pooling(): + dtype = "float32" + + trials = [ + ["nn.global_max_pool2d", (8, 8, 16)], + ["nn.global_max_pool2d", (9, 9, 16)], + ["nn.global_max_pool2d", (8, 8, 16)], + ["nn.global_avg_pool2d", (8, 8, 16)], + ["nn.global_avg_pool2d", (8, 8, 16)], + ["nn.global_avg_pool2d", (9, 9, 16)], + ] + + for typef, input_shape in trials: + shape = (1, *input_shape) + inputs = {"a"} + args = (shape, dtype, typef) + func = 
_get_global_pooling_model(*args, iter(inputs)) + exp_codegen = _get_expected_global_pooling_codegen(*args) + verify_codegen(func, exp_codegen, 1) + + +if __name__ == "__main__": + test_pooling() + test_global_pooling() + test_codegen_pooling() + test_codegen_global_pooling() diff --git a/tests/python/contrib/test_cblas.py b/tests/python/contrib/test_cblas.py index dd9f7775a7ac..946d93385e6f 100644 --- a/tests/python/contrib/test_cblas.py +++ b/tests/python/contrib/test_cblas.py @@ -49,11 +49,11 @@ def verify(target="llvm"): if not tvm.get_global_func(lib.__name__ + ".matmul", True): print("skip because extern function is not available") return - ctx = tvm.cpu(0) + dev = tvm.cpu(0) f = tvm.build(s, [A, B, D, bias], target) - a = tvm.nd.array(np.random.uniform(size=ashape).astype(A.dtype), ctx) - b = tvm.nd.array(np.random.uniform(size=bshape).astype(B.dtype), ctx) - d = tvm.nd.array(np.zeros((n, m), dtype=D.dtype), ctx) + a = tvm.nd.array(np.random.uniform(size=ashape).astype(A.dtype), dev) + b = tvm.nd.array(np.random.uniform(size=bshape).astype(B.dtype), dev) + d = tvm.nd.array(np.zeros((n, m), dtype=D.dtype), dev) bb = 10.0 f(a, b, d, bb) tvm.testing.assert_allclose( @@ -119,11 +119,11 @@ def verify(target="llvm"): if not tvm.get_global_func("tvm.contrib.mkl.matmul_u8s8s32", True): print("skip because extern function is not available") return - ctx = tvm.cpu(0) + dev = tvm.cpu(0) f = tvm.build(s, [A, B, D, bias], target) - a = tvm.nd.array(np.random.randint(low=0, high=50, size=ashape).astype(A.dtype), ctx) - b = tvm.nd.array(np.random.randint(low=0, high=50, size=bshape).astype(B.dtype), ctx) - d = tvm.nd.array(np.zeros((n, m), dtype=D.dtype), ctx) + a = tvm.nd.array(np.random.randint(low=0, high=50, size=ashape).astype(A.dtype), dev) + b = tvm.nd.array(np.random.randint(low=0, high=50, size=bshape).astype(B.dtype), dev) + d = tvm.nd.array(np.zeros((n, m), dtype=D.dtype), dev) bb = 10 f(a, b, d, bb) tvm.testing.assert_allclose( @@ -171,11 +171,11 @@ def verify(target="llvm"): if not tvm.get_global_func(lib.__name__ + ".matmul", True): print("skip because extern function is not available") return - ctx = tvm.cpu(0) + dev = tvm.cpu(0) f = tvm.build(s, [A, B, D], target) - a = tvm.nd.array(np.random.uniform(size=ashape).astype(A.dtype), ctx) - b = tvm.nd.array(np.random.uniform(size=bshape).astype(B.dtype), ctx) - d = tvm.nd.array(np.zeros((batch, n, m), dtype=D.dtype), ctx) + a = tvm.nd.array(np.random.uniform(size=ashape).astype(A.dtype), dev) + b = tvm.nd.array(np.random.uniform(size=bshape).astype(B.dtype), dev) + d = tvm.nd.array(np.zeros((batch, n, m), dtype=D.dtype), dev) f(a, b, d) tvm.testing.assert_allclose( d.asnumpy(), get_numpy(a.asnumpy(), b.asnumpy(), transa, transb), rtol=1e-5 diff --git a/tests/python/contrib/test_coreml_codegen.py b/tests/python/contrib/test_coreml_codegen.py index dd10b6d9fcbd..b93c489fdac6 100644 --- a/tests/python/contrib/test_coreml_codegen.py +++ b/tests/python/contrib/test_coreml_codegen.py @@ -99,13 +99,13 @@ def test_annotate(): @pytest.mark.skipif(not _has_xcode(), reason="Xcode is not available") def test_compile_and_run(): - ctx = tvm.cpu() + dev = tvm.cpu() target = "llvm" tol = 1e-3 with relay.build_config(opt_level=3): lib = relay.build(_create_graph_annotated(), target=target) - m = tvm.contrib.graph_runtime.GraphModule(lib["default"](ctx)) + m = tvm.contrib.graph_executor.GraphModule(lib["default"](dev)) shape = (10, 10) x_data = np.random.rand(*shape).astype("float32") @@ -114,7 +114,7 @@ def test_compile_and_run(): m.set_input("x", 
x_data) m.set_input("y", y_data) m.run() - out = tvm.nd.empty(shape, ctx=ctx) + out = tvm.nd.empty(shape, device=dev) out = m.get_output(0, out) expected = (y_data * y_data) - (x_data + x_data) diff --git a/tests/python/contrib/test_coreml_runtime.py b/tests/python/contrib/test_coreml_runtime.py index c0076d6eb12f..447b412595c3 100644 --- a/tests/python/contrib/test_coreml_runtime.py +++ b/tests/python/contrib/test_coreml_runtime.py @@ -56,7 +56,7 @@ def create_coreml_model(): ) return coremltools.models.MLModel(builder.spec) - def verify(coreml_model, model_path, ctx): + def verify(coreml_model, model_path, dev): coreml_model = create_coreml_model() out_spec = coreml_model.output_description._fd_spec @@ -72,9 +72,9 @@ def verify(coreml_model, model_path, ctx): coreml_outputs = [coreml_model.predict(inputs)[name] for name in out_names] # inference via tvm coreml runtime - runtime = coreml_runtime.create("main", model_path, ctx) + runtime = coreml_runtime.create("main", model_path, dev) for name in inputs: - runtime.set_input(name, tvm.nd.array(inputs[name], ctx)) + runtime.set_input(name, tvm.nd.array(inputs[name], dev)) runtime.invoke() tvm_outputs = [runtime.get_output(i).asnumpy() for i in range(runtime.get_num_outputs())] @@ -89,14 +89,14 @@ def check_remote(coreml_model): ) compiled_model = os.path.basename(compiled_model) remote = rpc.connect(proxy_host, proxy_port, key=key) - ctx = remote.cpu(0) - verify(coreml_model, compiled_model, ctx) + dev = remote.cpu(0) + verify(coreml_model, compiled_model, dev) def check_local(coreml_model): temp = utils.tempdir() compiled_model = xcode.compile_coreml(coreml_model, out_dir=temp.temp_dir) - ctx = tvm.cpu(0) - verify(coreml_model, compiled_model, ctx) + dev = tvm.cpu(0) + verify(coreml_model, compiled_model, dev) coreml_model = create_coreml_model() check_remote(coreml_model) diff --git a/tests/python/contrib/test_cublas.py b/tests/python/contrib/test_cublas.py index 175a747bba42..c4e6f89deadc 100644 --- a/tests/python/contrib/test_cublas.py +++ b/tests/python/contrib/test_cublas.py @@ -35,11 +35,11 @@ def verify(target="cuda"): if not tvm.get_global_func("tvm.contrib.cublas.matmul", True): print("skip because extern function is not available") return - ctx = tvm.gpu(0) + dev = tvm.gpu(0) f = tvm.build(s, [A, B, C], target) - a = tvm.nd.array(np.random.uniform(0, 128, size=(n, l)).astype(A.dtype), ctx) - b = tvm.nd.array(np.random.uniform(0, 128, size=(l, m)).astype(B.dtype), ctx) - c = tvm.nd.array(np.zeros((n, m), dtype=C.dtype), ctx) + a = tvm.nd.array(np.random.uniform(0, 128, size=(n, l)).astype(A.dtype), dev) + b = tvm.nd.array(np.random.uniform(0, 128, size=(l, m)).astype(B.dtype), dev) + c = tvm.nd.array(np.zeros((n, m), dtype=C.dtype), dev) f(a, b, c) tvm.testing.assert_allclose( c.asnumpy(), np.dot(a.asnumpy().astype(C.dtype), b.asnumpy().astype(C.dtype)), rtol=rtol @@ -70,7 +70,7 @@ def verify(target="cuda"): if not tvm.get_global_func("tvm.contrib.cublaslt.matmul", True): print("skip because extern function is not available") return - ctx = tvm.gpu(0) + dev = tvm.gpu(0) f = tvm.build(s, [A, B, C], target) a_old = np.random.uniform(0, 128, size=(n, l)) b_old = np.random.uniform(0, 128, size=(l, m)) @@ -95,9 +95,9 @@ def verify(target="cuda"): ) b_new = b_new.reshape([m, L]) - a = tvm.nd.array(a_new.astype(A.dtype), ctx) - b = tvm.nd.array(b_new.astype(B.dtype), ctx) - c = tvm.nd.array(np.zeros((m, N_out), dtype=C.dtype), ctx) + a = tvm.nd.array(a_new.astype(A.dtype), dev) + b = tvm.nd.array(b_new.astype(B.dtype), dev) + c = 
tvm.nd.array(np.zeros((m, N_out), dtype=C.dtype), dev) f(a, b, c) # Transform output c from layout CUBLASLT_ORDER_COL32 to row major layout c_out = c.asnumpy() @@ -126,11 +126,11 @@ def verify(target="cuda"): if not tvm.get_global_func("tvm.contrib.cublas.matmul", True): print("skip because extern function is not available") return - ctx = tvm.gpu(0) + dev = tvm.gpu(0) f = tvm.build(s, [A, B, C], target) - a = tvm.nd.array(np.random.uniform(size=(j, n, l)).astype(A.dtype), ctx) - b = tvm.nd.array(np.random.uniform(size=(j, l, m)).astype(B.dtype), ctx) - c = tvm.nd.array(np.zeros((j, n, m), dtype=C.dtype), ctx) + a = tvm.nd.array(np.random.uniform(size=(j, n, l)).astype(A.dtype), dev) + b = tvm.nd.array(np.random.uniform(size=(j, l, m)).astype(B.dtype), dev) + c = tvm.nd.array(np.zeros((j, n, m), dtype=C.dtype), dev) f(a, b, c) tvm.testing.assert_allclose( c.asnumpy(), diff --git a/tests/python/contrib/test_cudnn.py b/tests/python/contrib/test_cudnn.py index a4425ab6fd90..690589cf173b 100644 --- a/tests/python/contrib/test_cudnn.py +++ b/tests/python/contrib/test_cudnn.py @@ -71,14 +71,14 @@ def verify_conv2d(data_dtype, conv_dtype, tensor_format=0, groups=1): s = te.create_schedule(Y.op) # validation - ctx = tvm.gpu(0) + dev = tvm.gpu(0) f = tvm.build(s, [X, W, Y], "cuda --host=llvm", name="conv2d") x_np = np.random.uniform(-1, 1, xshape).astype(data_dtype) w_np = np.random.uniform(-1, 1, wshape).astype(data_dtype) y_np = np.zeros(yshape).astype(data_dtype) - x = tvm.nd.array(x_np, ctx) - w = tvm.nd.array(w_np, ctx) - y = tvm.nd.array(y_np, ctx) + x = tvm.nd.array(x_np, dev) + w = tvm.nd.array(w_np, dev) + y = tvm.nd.array(y_np, dev) if tensor_format == 0: c_np = tvm.topi.testing.conv2d_nchw_python(x_np, w_np, 1, 1, groups=groups) elif tensor_format == 1: @@ -149,14 +149,14 @@ def verify_conv3d(data_dtype, conv_dtype, tensor_format=0, groups=1): s = te.create_schedule(Y.op) # validation - ctx = tvm.gpu(0) + dev = tvm.gpu(0) f = tvm.build(s, [X, W, Y], target="cuda --host=llvm", name="conv3d") x_np = np.random.uniform(-1, 1, xshape).astype(data_dtype) w_np = np.random.uniform(-1, 1, wshape).astype(data_dtype) y_np = np.zeros(yshape).astype(data_dtype) - x = tvm.nd.array(x_np, ctx) - w = tvm.nd.array(w_np, ctx) - y = tvm.nd.array(y_np, ctx) + x = tvm.nd.array(x_np, dev) + w = tvm.nd.array(w_np, dev) + y = tvm.nd.array(y_np, dev) if tensor_format == 0: c_np = tvm.topi.testing.conv3d_ncdhw_python(x_np, w_np, 1, 1, groups) else: @@ -177,11 +177,11 @@ def verify_softmax(shape, axis, dtype="float32"): B = cudnn.softmax(A, axis) s = te.create_schedule([B.op]) - ctx = tvm.gpu(0) + dev = tvm.gpu(0) a_np = np.random.uniform(size=shape).astype(dtype) b_np = tvm.topi.testing.softmax_python(a_np) - a = tvm.nd.array(a_np, ctx) - b = tvm.nd.array(b_np, ctx) + a = tvm.nd.array(a_np, dev) + b = tvm.nd.array(b_np, dev) f = tvm.build(s, [A, B], target="cuda --host=llvm", name="softmax") f(a, b) tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-3) @@ -192,13 +192,13 @@ def verify_softmax_4d(shape, dtype="float32"): B = cudnn.softmax(A, axis=1) s = te.create_schedule([B.op]) - ctx = tvm.gpu(0) + dev = tvm.gpu(0) n, c, h, w = shape a_np = np.random.uniform(size=shape).astype(dtype) b_np = tvm.topi.testing.softmax_python(a_np.transpose(0, 2, 3, 1).reshape(h * w, c)) b_np = b_np.reshape(n, h, w, c).transpose(0, 3, 1, 2) - a = tvm.nd.array(a_np, ctx) - b = tvm.nd.array(b_np, ctx) + a = tvm.nd.array(a_np, dev) + b = tvm.nd.array(b_np, dev) f = tvm.build(s, [A, B], target="cuda --host=llvm", name="softmax") f(a, 
b) tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-3) diff --git a/tests/python/contrib/test_edgetpu_runtime.py b/tests/python/contrib/test_edgetpu_runtime.py index 8c6113cee3d4..a3a01604bed1 100644 --- a/tests/python/contrib/test_edgetpu_runtime.py +++ b/tests/python/contrib/test_edgetpu_runtime.py @@ -67,11 +67,11 @@ def check_remote(target_edgetpu=False): # inference via remote tvm tflite runtime server = rpc.Server("localhost") remote = rpc.connect(server.host, server.port) - ctx = remote.cpu(0) + dev = remote.cpu(0) with open(tflite_model_path, "rb") as model_fin: - runtime = tflite_runtime.create(model_fin.read(), ctx) - runtime.set_input(0, tvm.nd.array(tflite_input, ctx)) + runtime = tflite_runtime.create(model_fin.read(), dev) + runtime.set_input(0, tvm.nd.array(tflite_input, dev)) runtime.invoke() out = runtime.get_output(0) np.testing.assert_equal(out.asnumpy(), tflite_output) diff --git a/tests/python/contrib/test_ethosn/infrastructure.py b/tests/python/contrib/test_ethosn/infrastructure.py index cd9e9e91292d..59021cf86211 100644 --- a/tests/python/contrib/test_ethosn/infrastructure.py +++ b/tests/python/contrib/test_ethosn/infrastructure.py @@ -20,7 +20,7 @@ from __future__ import absolute_import, print_function import tvm from tvm import relay -from tvm.contrib import utils, graph_runtime, download +from tvm.contrib import utils, graph_executor, download from hashlib import md5 from itertools import zip_longest, combinations import numpy as np @@ -211,7 +211,7 @@ def run(lib, inputs, outputs, npu=True): lib_path = temp.relpath(lib_name) lib.export_library(lib_path) lib = tvm.runtime.load_module(lib_path) - module = graph_runtime.GraphModule(lib["default"](tvm.cpu())) + module = graph_executor.GraphModule(lib["default"](tvm.cpu())) module.set_input(**inputs) module.run() out = [module.get_output(i) for i in range(outputs)] @@ -221,7 +221,7 @@ def run(lib, inputs, outputs, npu=True): def build_and_run( - mod, inputs, outputs, params, ctx=tvm.cpu(), npu=True, expected_host_ops=0, npu_partitions=1 + mod, inputs, outputs, params, device=tvm.cpu(), npu=True, expected_host_ops=0, npu_partitions=1 ): lib = build(mod, params, npu, expected_host_ops, npu_partitions) return run(lib, inputs, outputs, npu) diff --git a/tests/python/contrib/test_ethosn/test_networks.py b/tests/python/contrib/test_ethosn/test_networks.py index 06ce93b2aba5..ce89c90d9379 100644 --- a/tests/python/contrib/test_ethosn/test_networks.py +++ b/tests/python/contrib/test_ethosn/test_networks.py @@ -122,7 +122,9 @@ def test_mobilenet_v1(): # codegen, which could come about from either a change in Support Library # version or a change in the Ethos-N codegen. To update this requires running # on hardware that isn't available in CI. - _compile_hash = {"81637c89339201a07dc96e3b5dbf836a"} + _compile_hash = {"bfb5a50607edb50009c58ae9d4287e4d"} + if tei.get_ethosn_variant() == 3: + _compile_hash = {"896c28b4f06341ea638ead3a593e1aed"} if tei.get_ethosn_api_version() == 2008: _compile_hash = {"47e216d8ab2bf491708ccf5620bc0d02"} if tei.get_ethosn_variant() == 3: @@ -150,7 +152,9 @@ def test_inception_v3(): # codegen, which could come about from either a change in Support Library # version or a change in the Ethos-N codegen. To update this requires running # on hardware that isn't available in CI. 
- _compile_hash = {"de0e175af610ebd45ccb03d170dc9664"} + _compile_hash = {"96116d7e6c7385de0688074a3f889983"} + if tei.get_ethosn_variant() == 3: + _compile_hash = {"551cde850c6ef960d19be4f317fb8e68"} if tei.get_ethosn_api_version() == 2008: _compile_hash = {"8c9d75659cd7bc9ff6dd6d490d28f9b2"} if tei.get_ethosn_variant() == 3: @@ -177,7 +181,9 @@ def test_inception_v4(): # codegen, which could come about from either a change in Support Library # version or a change in the Ethos-N codegen. To update this requires running # on hardware that isn't available in CI. - _compile_hash = {"06bf6cb56344f3904bcb108e54edfe87"} + _compile_hash = {"b34aec2a48c591818761ed6b42c133e5"} + if tei.get_ethosn_variant() == 3: + _compile_hash = {"30f078bd42757e8686eafa1f28d0d352"} if tei.get_ethosn_api_version() == 2008: if not tei.get_ethosn_variant() == 0: pytest.skip( @@ -206,7 +212,9 @@ def test_ssd_mobilenet_v1(): # codegen, which could come about from either a change in Support Library # version or a change in the Ethos-N codegen. To update this requires running # on hardware that isn't available in CI. - _compile_hash = {"29aec6b184b09454b4323271aadf89b1", "6211d96103880b016baa85e638abddef"} + _compile_hash = {"c312edfc9a946ed4dc7c049d472dae6e", "3183f0fa5eba8f6b9557d14eaf47842d"} + if tei.get_ethosn_variant() == 3: + _compile_hash = {"deee52e136327436411fc725624ae2ea", "6526509d3cbee014e38c79e22bb29d7f"} if tei.get_ethosn_api_version() == 2008: _compile_hash = {"5999f26e140dee0d7866491997ef78c5", "24e3a690a7e95780052792d5626c85be"} if tei.get_ethosn_variant() == 3: diff --git a/tests/python/contrib/test_gemm_acc16.py b/tests/python/contrib/test_gemm_acc16.py index 5475978cb3cc..4d2ed795d6e0 100644 --- a/tests/python/contrib/test_gemm_acc16.py +++ b/tests/python/contrib/test_gemm_acc16.py @@ -38,7 +38,7 @@ def verify(target="llvm -mcpu=skylake-avx512"): print("skip because %s is not enabled..." % target) return - ctx = tvm.context(target, 0) + dev = tvm.device(target, 0) X = te.placeholder((m, k), name="X", dtype="uint8") W = te.placeholder((n, k), name="W", dtype="int8") pc = dot_16x1x16_uint8_int8_int16() @@ -69,7 +69,7 @@ def verify(target="llvm -mcpu=skylake-avx512"): t_sch[t_fc].tensorize(a_yi, pc) # print(tvm.lower(t_sch, [X, packedW, t_fc], simple_mode=True)) t_func = tvm.build(t_sch, [X, packedW, t_fc], target, name="intrinsic") - t_evaluator = t_func.time_evaluator(t_func.entry_name, ctx, number=10) + t_evaluator = t_func.time_evaluator(t_func.entry_name, dev, number=10) # generate the plain data a_ = np.random.uniform(1, 10, size=(m, k)).astype("uint8") @@ -84,9 +84,9 @@ def verify(target="llvm -mcpu=skylake-avx512"): s_idx // 128 * 2 + t_idx ] - x = tvm.nd.array(a_, ctx) - w = tvm.nd.array(packW, ctx) - y = tvm.nd.array(np.zeros((m, n), dtype="int16"), ctx) + x = tvm.nd.array(a_, dev) + w = tvm.nd.array(packW, dev) + y = tvm.nd.array(np.zeros((m, n), dtype="int16"), dev) result = t_evaluator(x, w, y) gops_per_sec = gops_per_mm / result.mean / 1e9 diff --git a/tests/python/contrib/test_gemm_acc32_vnni.py b/tests/python/contrib/test_gemm_acc32_vnni.py index 3e0d5db95379..02538e88c39e 100644 --- a/tests/python/contrib/test_gemm_acc32_vnni.py +++ b/tests/python/contrib/test_gemm_acc32_vnni.py @@ -48,7 +48,7 @@ def verify(target="llvm -mcpu=cascadelake"): print("skip because %s is not enabled..." 
% target) return - ctx = tvm.context(target, 0) + dev = tvm.device(target, 0) pc = dot_16x1x16_uint8_int8_int32_cascadelake() ak = te.reduce_axis((0, k), name="k") packedW = te.placeholder((n // 16, 16 * (k // 4), 4), name="packedW", dtype="int8") @@ -76,7 +76,7 @@ def verify(target="llvm -mcpu=cascadelake"): t_sch[t_fc].tensorize(a_yi, pc) t_func = tvm.build(t_sch, [X, packedW, t_fc], target, name="intrinsic") - t_evaluator = t_func.time_evaluator(t_func.entry_name, ctx, number=10) + t_evaluator = t_func.time_evaluator(t_func.entry_name, dev, number=10) # generate the plain data a_ = np.random.uniform(1, 10, size=(m, k)).astype("uint8") @@ -91,9 +91,9 @@ def verify(target="llvm -mcpu=cascadelake"): (s_idx // 16) * 4 + t_idx ] - x = tvm.nd.array(a_, ctx) - w = tvm.nd.array(packW, ctx) - y = tvm.nd.array(np.zeros((m, n), dtype="int32"), ctx) + x = tvm.nd.array(a_, dev) + w = tvm.nd.array(packW, dev) + y = tvm.nd.array(np.zeros((m, n), dtype="int32"), dev) result = t_evaluator(x, w, y) gops_per_sec = gops_per_mm / result.mean / 1e9 diff --git a/tests/python/contrib/test_miopen.py b/tests/python/contrib/test_miopen.py index 7a7c10fc7f86..630bfc011038 100644 --- a/tests/python/contrib/test_miopen.py +++ b/tests/python/contrib/test_miopen.py @@ -52,11 +52,11 @@ def test_conv2d(): s = te.create_schedule(Y.op) def verify(): - ctx = tvm.rocm(0) + dev = tvm.rocm(0) f = tvm.build(s, [X, W, Y], "rocm --host=llvm", name="conv2d") - x = tvm.nd.array(np.random.uniform(-1, 1, xshape).astype(np.float32), ctx) - w = tvm.nd.array(np.random.uniform(-1, 1, wshape).astype(np.float32), ctx) - y = tvm.nd.array(np.random.uniform(-1, 1, yshape).astype(np.float32), ctx) + x = tvm.nd.array(np.random.uniform(-1, 1, xshape).astype(np.float32), dev) + w = tvm.nd.array(np.random.uniform(-1, 1, wshape).astype(np.float32), dev) + y = tvm.nd.array(np.random.uniform(-1, 1, yshape).astype(np.float32), dev) f(x, w, y) Y_ref = topi.nn.conv2d_nchw( @@ -64,7 +64,7 @@ def verify(): ) s_ref = te.create_schedule(Y_ref.op) f_ref = tvm.build(s_ref, [X, W, Y_ref], "rocm --host=llvm") - y_ref = tvm.nd.array(np.random.uniform(-1, 1, yshape).astype(np.float32), ctx) + y_ref = tvm.nd.array(np.random.uniform(-1, 1, yshape).astype(np.float32), dev) f_ref(x, w, y_ref) print("Max abs diff:", np.max(np.abs(y.asnumpy() - y_ref.asnumpy()))) tvm.testing.assert_allclose(y.asnumpy(), y_ref.asnumpy(), atol=1e-3) diff --git a/tests/python/contrib/test_mps.py b/tests/python/contrib/test_mps.py index 75a76f02ab40..597e87778866 100644 --- a/tests/python/contrib/test_mps.py +++ b/tests/python/contrib/test_mps.py @@ -47,11 +47,11 @@ def verify(A, B, D, s, target="metal"): if not tvm.get_global_func("tvm.contrib.mps.matmul", True): print("skip because extern function is not available") return - ctx = tvm.metal(0) + dev = tvm.metal(0) f = tvm.build(s, [A, B, D], "metal") - a = tvm.nd.array(np.random.uniform(size=(n, l)).astype(A.dtype), ctx) - b = tvm.nd.array(np.random.uniform(size=(l, m)).astype(B.dtype), ctx) - c = tvm.nd.array(np.zeros((n, m), dtype=C.dtype), ctx) + a = tvm.nd.array(np.random.uniform(size=(n, l)).astype(A.dtype), dev) + b = tvm.nd.array(np.random.uniform(size=(l, m)).astype(B.dtype), dev) + c = tvm.nd.array(np.zeros((n, m), dtype=C.dtype), dev) f(a, b, c) tvm.testing.assert_allclose(c.asnumpy(), np.dot(a.asnumpy(), b.asnumpy()) + 1, rtol=1e-5) @@ -77,11 +77,11 @@ def verify(A, B, C, target="llvm"): if not tvm.get_global_func("tvm.contrib.mps.conv2d", True): print("skip because extern function is not available") return - ctx = 
tvm.metal(0) + dev = tvm.metal(0) f = tvm.build(s1, [A, B, C], "metal") - a = tvm.nd.array(np.random.uniform(size=(n, h, w, ci)).astype(A.dtype), ctx) - b = tvm.nd.array(np.random.uniform(size=(co, kh, kw, ci)).astype(B.dtype), ctx) - c = tvm.nd.array(np.zeros((n, h // stride, w // stride, co), dtype=C.dtype), ctx) + a = tvm.nd.array(np.random.uniform(size=(n, h, w, ci)).astype(A.dtype), dev) + b = tvm.nd.array(np.random.uniform(size=(co, kh, kw, ci)).astype(B.dtype), dev) + c = tvm.nd.array(np.zeros((n, h // stride, w // stride, co), dtype=C.dtype), dev) f(a, b, c) # print(c.asnumpy()) # print(c.shape) diff --git a/tests/python/contrib/test_mxnet_bridge.py b/tests/python/contrib/test_mxnet_bridge.py index afe739c47dc5..308bd82988ef 100644 --- a/tests/python/contrib/test_mxnet_bridge.py +++ b/tests/python/contrib/test_mxnet_bridge.py @@ -47,10 +47,10 @@ def mxnet_check(): # get a mxnet version mxf = to_mxnet_func(f, const_loc=[0, 1]) - ctx = mx.gpu(0) - xx = mx.nd.uniform(shape=shape, ctx=ctx) - yy = mx.nd.uniform(shape=shape, ctx=ctx) - zz = mx.nd.empty(shape=shape, ctx=ctx) + dev = mx.gpu(0) + xx = mx.nd.uniform(shape=shape, ctx=dev) + yy = mx.nd.uniform(shape=shape, ctx=dev) + zz = mx.nd.empty(shape=shape, ctx=dev) # invoke myf: this runs in mxnet engine mxf(xx, yy, zz, 10.0) diff --git a/tests/python/contrib/test_nnpack.py b/tests/python/contrib/test_nnpack.py index bcb4358596b2..0208d7211960 100644 --- a/tests/python/contrib/test_nnpack.py +++ b/tests/python/contrib/test_nnpack.py @@ -42,11 +42,11 @@ def verify(target="llvm"): if not nnpack.is_available(): pytest.skip("nnpack is not available") - ctx = tvm.cpu(0) + dev = tvm.cpu(0) f = tvm.build(s, [A, B, D, bias], target) - a = tvm.nd.array(np.random.uniform(size=(l)).astype(A.dtype), ctx) - b = tvm.nd.array(np.random.uniform(size=(m, l)).astype(B.dtype), ctx) - d = tvm.nd.array(np.zeros((m,), dtype=D.dtype), ctx) + a = tvm.nd.array(np.random.uniform(size=(l)).astype(A.dtype), dev) + b = tvm.nd.array(np.random.uniform(size=(m, l)).astype(B.dtype), dev) + d = tvm.nd.array(np.zeros((m,), dtype=D.dtype), dev) bb = 10.0 f(a, b, d, bb) tvm.testing.assert_allclose(d.asnumpy(), np.dot(a.asnumpy(), b.asnumpy().T) + bb, rtol=1e-5) @@ -111,7 +111,7 @@ def verify(target="llvm", algorithm=nnpack.ConvolutionAlgorithm.AUTO, with_bias= if not nnpack.is_available(): pytest.skip("nnpack is not available") - ctx = tvm.cpu(0) + dev = tvm.cpu(0) output = nnpack.convolution_inference( data, kernel, @@ -127,10 +127,10 @@ def verify(target="llvm", algorithm=nnpack.ConvolutionAlgorithm.AUTO, with_bias= na = np.random.uniform(size=dshape).astype(data.dtype) nb = np.random.uniform(size=kshape).astype(kernel.dtype) nc = np.zeros(bshape, dtype=bias.dtype) - ta = tvm.nd.array(na, ctx) - tb = tvm.nd.array(nb, ctx) - tc = tvm.nd.array(nc, ctx) - td = tvm.nd.array(np.zeros(oshape, dtype=output.dtype), ctx) + ta = tvm.nd.array(na, dev) + tb = tvm.nd.array(nb, dev) + tc = tvm.nd.array(nc, dev) + td = tvm.nd.array(np.zeros(oshape, dtype=output.dtype), dev) f(ta, tb, tc, td) nd = np_conv(np.reshape(na, (BATCH, IC, IH, IW)), nb, PAD, STRIDE) + nc.reshape( 1, bshape[0], 1, 1 @@ -177,7 +177,7 @@ def verify(target="llvm", algorithm=nnpack.ConvolutionAlgorithm.AUTO, with_bias= if not nnpack.is_available(): pytest.skip("nnpack is not available") - ctx = tvm.cpu(0) + dev = tvm.cpu(0) transformed_kernel = nnpack.convolution_inference_weight_transform( kernel, algorithm=algorithm ) @@ -201,10 +201,10 @@ def verify(target="llvm",
algorithm=nnpack.ConvolutionAlgorithm.AUTO, with_bias= if with_bias else np.zeros(bshape, dtype=bias.dtype) ) - ta = tvm.nd.array(na, ctx) - tb = tvm.nd.array(nb, ctx) - tc = tvm.nd.array(nc, ctx) - td = tvm.nd.array(np.zeros(oshape, dtype=output.dtype), ctx) + ta = tvm.nd.array(na, dev) + tb = tvm.nd.array(nb, dev) + tc = tvm.nd.array(nc, dev) + td = tvm.nd.array(np.zeros(oshape, dtype=output.dtype), dev) f(ta, tb, tc, td) nd = np_conv(np.reshape(na, (BATCH, IC, IH, IW)), nb, PAD, STRIDE) + nc.reshape( 1, bshape[0], 1, 1 diff --git a/tests/python/contrib/test_onnx.py b/tests/python/contrib/test_onnx.py index 6e9cf3afd5ba..9b29b33caaf6 100644 --- a/tests/python/contrib/test_onnx.py +++ b/tests/python/contrib/test_onnx.py @@ -48,8 +48,8 @@ def run_onnx(onnx_model, input_data): def run_relay(func, data_tuple): target = "llvm" - ctx = tvm.context("llvm", 0) - intrp = relay.create_executor("graph", ctx=ctx, target=target) + dev = tvm.device("llvm", 0) + intrp = relay.create_executor("graph", device=dev, target=target) relay_res = intrp.evaluate(func)(*data_tuple) result = [] diff --git a/tests/python/contrib/test_onnx_model.py b/tests/python/contrib/test_onnx_model.py index a3f3717e3872..addb13732550 100644 --- a/tests/python/contrib/test_onnx_model.py +++ b/tests/python/contrib/test_onnx_model.py @@ -58,8 +58,8 @@ def get_data(in_data_shapes, dtype="float32"): def run_relay(mod, params, in_data): target = "llvm" - ctx = tvm.context("llvm", 0) - intrp = relay.create_executor("graph", mod, ctx=ctx, target=target) + dev = tvm.device("llvm", 0) + intrp = relay.create_executor("graph", mod, device=dev, target=target) in_data = [tvm.nd.array(value) for value in in_data.values()] return intrp.evaluate()(*in_data, **params).asnumpy() diff --git a/tests/python/contrib/test_random.py b/tests/python/contrib/test_random.py index fd87a065fffa..0740521b5fa5 100644 --- a/tests/python/contrib/test_random.py +++ b/tests/python/contrib/test_random.py @@ -35,9 +35,9 @@ def verify(target="llvm"): if not tvm.get_global_func("tvm.contrib.random.randint", True): print("skip because extern function is not available") return - ctx = tvm.cpu(0) + dev = tvm.cpu(0) f = tvm.build(s, [A], target) - a = tvm.nd.array(np.zeros((m, n), dtype=A.dtype), ctx) + a = tvm.nd.array(np.zeros((m, n), dtype=A.dtype), dev) f(a) na = a.asnumpy() assert abs(np.mean(na)) < 0.3 @@ -60,9 +60,9 @@ def verify(target="llvm"): if not tvm.get_global_func("tvm.contrib.random.uniform", True): print("skip because extern function is not available") return - ctx = tvm.cpu(0) + dev = tvm.cpu(0) f = tvm.build(s, [A], target) - a = tvm.nd.array(np.zeros((m, n), dtype=A.dtype), ctx) + a = tvm.nd.array(np.zeros((m, n), dtype=A.dtype), dev) f(a) na = a.asnumpy() assert abs(np.mean(na) - 0.5) < 1e-1 @@ -85,9 +85,9 @@ def verify(target="llvm"): if not tvm.get_global_func("tvm.contrib.random.normal", True): print("skip because extern function is not available") return - ctx = tvm.cpu(0) + dev = tvm.cpu(0) f = tvm.build(s, [A], target) - a = tvm.nd.array(np.zeros((m, n), dtype=A.dtype), ctx) + a = tvm.nd.array(np.zeros((m, n), dtype=A.dtype), dev) f(a) na = a.asnumpy() assert abs(np.mean(na) - 3) < 1e-1 @@ -98,12 +98,12 @@ def verify(target="llvm"): @tvm.testing.uses_gpu def test_random_fill(): - def test_local(ctx, dtype): + def test_local(dev, dtype): if not tvm.get_global_func("tvm.contrib.random.random_fill", True): print("skip because extern function is not available") return np_ones = np.ones((512, 512), dtype=dtype) - value = tvm.nd.empty(np_ones.shape, 
np_ones.dtype, ctx) + value = tvm.nd.empty(np_ones.shape, np_ones.dtype, dev) random_fill = tvm.get_global_func("tvm.contrib.random.random_fill") random_fill(value) @@ -146,8 +146,8 @@ def test_rpc(dtype): "float32", "float64", ]: - for _, ctx in tvm.testing.enabled_targets(): - test_local(ctx, dtype) + for _, dev in tvm.testing.enabled_targets(): + test_local(dev, dtype) test_rpc(dtype) diff --git a/tests/python/contrib/test_rocblas.py b/tests/python/contrib/test_rocblas.py index 6f1783daa74c..bcbec60265d3 100644 --- a/tests/python/contrib/test_rocblas.py +++ b/tests/python/contrib/test_rocblas.py @@ -37,11 +37,11 @@ def verify(target="rocm"): if not tvm.get_global_func("tvm.contrib.rocblas.matmul", True): print("skip because extern function is not available") return - ctx = tvm.rocm(0) + dev = tvm.rocm(0) f = tvm.build(s, [A, B, C], target) - a = tvm.nd.array(np.random.uniform(size=(n, l)).astype(A.dtype), ctx) - b = tvm.nd.array(np.random.uniform(size=(l, m)).astype(B.dtype), ctx) - c = tvm.nd.array(np.zeros((n, m), dtype=C.dtype), ctx) + a = tvm.nd.array(np.random.uniform(size=(n, l)).astype(A.dtype), dev) + b = tvm.nd.array(np.random.uniform(size=(l, m)).astype(B.dtype), dev) + c = tvm.nd.array(np.zeros((n, m), dtype=C.dtype), dev) f(a, b, c) tvm.testing.assert_allclose(c.asnumpy(), np.dot(a.asnumpy(), b.asnumpy()), rtol=1e-5) @@ -70,11 +70,11 @@ def verify(target="rocm"): if not tvm.get_global_func(lib.__name__ + ".batch_matmul", True): print("skip because extern function is not available") return - ctx = tvm.rocm(0) + dev = tvm.rocm(0) f = tvm.build(s, [A, B, C], target) - a = tvm.nd.array(np.random.uniform(size=ashape).astype(A.dtype), ctx) - b = tvm.nd.array(np.random.uniform(size=bshape).astype(B.dtype), ctx) - c = tvm.nd.array(np.zeros((batch, m, n), dtype=C.dtype), ctx) + a = tvm.nd.array(np.random.uniform(size=ashape).astype(A.dtype), dev) + b = tvm.nd.array(np.random.uniform(size=bshape).astype(B.dtype), dev) + c = tvm.nd.array(np.zeros((batch, m, n), dtype=C.dtype), dev) f(a, b, c) tvm.testing.assert_allclose( c.asnumpy(), get_numpy(a.asnumpy(), b.asnumpy(), transa, transb), rtol=1e-5 diff --git a/tests/python/contrib/test_sort.py b/tests/python/contrib/test_sort.py index a049602ac265..cdb3a00dc492 100644 --- a/tests/python/contrib/test_sort.py +++ b/tests/python/contrib/test_sort.py @@ -48,13 +48,13 @@ def test_sort(): [[3, 4, 4], [2, 3, 3], [1, 2, 2], [0, 1, 1], [4, 0, 0]], ] - ctx = tvm.cpu(0) + dev = tvm.cpu(0) target = "llvm" s = te.create_schedule(out.op) f = tvm.build(s, [data, sort_num, out], target) - a = tvm.nd.array(np.array(input).astype(data.dtype), ctx) - b = tvm.nd.array(np.array(sort_num_input).astype(sort_num.dtype), ctx) - c = tvm.nd.array(np.zeros(a.shape, dtype=out.dtype), ctx) + a = tvm.nd.array(np.array(input).astype(data.dtype), dev) + b = tvm.nd.array(np.array(sort_num_input).astype(sort_num.dtype), dev) + c = tvm.nd.array(np.zeros(a.shape, dtype=out.dtype), dev) f(a, b, c) tvm.testing.assert_allclose(c.asnumpy(), np.array(sorted_index).astype(out.dtype), rtol=1e-5) @@ -76,7 +76,7 @@ def test_sort_np(): name="sort_tensor", ) - ctx = tvm.cpu(0) + dev = tvm.cpu(0) target = "llvm" s = te.create_schedule(out.op) f = tvm.build(s, [data, sort_num, out], target) @@ -84,9 +84,9 @@ def test_sort_np(): np_data = np.random.uniform(size=dshape) np_out = np.argsort(np_data, axis=axis) sort_num_input = np.full(reduced_shape, dshape[axis]) - a = tvm.nd.array(np.array(np_data).astype(data.dtype), ctx) - b = 
tvm.nd.array(np.array(sort_num_input).astype(sort_num.dtype), ctx) - c = tvm.nd.array(np.zeros(a.shape, dtype=out.dtype), ctx) + a = tvm.nd.array(np.array(np_data).astype(data.dtype), dev) + b = tvm.nd.array(np.array(sort_num_input).astype(sort_num.dtype), dev) + c = tvm.nd.array(np.zeros(a.shape, dtype=out.dtype), dev) f(a, b, c) tvm.testing.assert_allclose(c.asnumpy(), np_out, rtol=1e-5) @@ -103,7 +103,7 @@ def test_sort_by_key_gpu(): with tvm.target.Target(target): keys_out, values_out = sort_by_key(keys, values) - ctx = tvm.context(target) + dev = tvm.device(target) s = te.create_schedule([keys_out.op, values_out.op]) f = tvm.build(s, [keys, values, keys_out, values_out], target) @@ -111,10 +111,10 @@ def test_sort_by_key_gpu(): values_np = np.random.randint(0, 10, size=(size,)).astype(np.int32) keys_np_out = np.zeros(keys_np.shape, np.int32) values_np_out = np.zeros(values_np.shape, np.int32) - keys_in = tvm.nd.array(keys_np, ctx) - values_in = tvm.nd.array(values_np, ctx) - keys_out = tvm.nd.array(keys_np_out, ctx) - values_out = tvm.nd.array(values_np_out, ctx) + keys_in = tvm.nd.array(keys_np, dev) + values_in = tvm.nd.array(values_np, dev) + keys_out = tvm.nd.array(keys_np_out, dev) + values_out = tvm.nd.array(values_np_out, dev) f(keys_in, values_in, keys_out, values_out) ref_keys_out = np.sort(keys_np) diff --git a/tests/python/contrib/test_sparse.py b/tests/python/contrib/test_sparse.py index 39cff18a5212..d9618391ce40 100644 --- a/tests/python/contrib/test_sparse.py +++ b/tests/python/contrib/test_sparse.py @@ -27,21 +27,21 @@ def test_static_tensor(): dtype = "float32" stype = "csr" target = "llvm" - ctx = tvm.context(target, 0) + dev = tvm.device(target, 0) m = te.size_var("m") n = te.size_var("n") A = tvmsp.placeholder(shape=(m, n), name="A", dtype=dtype) assert A.stype == "csr" n = 3 a = np.maximum(np.random.uniform(size=(n, n)).astype(dtype) - 0.6, 0.0) - a = tvmsp.array(a, ctx) + a = tvmsp.array(a, dev) A.data = te.placeholder(a.data.shape, dtype, name="A_data") Ab = tvm.tir.decl_buffer(a.data.shape, dtype, name="A_data") binds = {A.data: Ab} C = te.compute(A.data.shape, lambda i: A.data[i] * 2.0, tag="cs_scatter") s = te.create_schedule(C.op) f = tvm.build(s, [A.data, C], target, binds=binds) - c = tvmsp.array(np.zeros((n, n), dtype), ctx) + c = tvmsp.array(np.zeros((n, n), dtype), dev) c.data = tvm.nd.empty(a.data.shape, dtype) c.indices = a.indices c.indptr = a.indptr @@ -53,7 +53,7 @@ def test_dynamic_tensor(): dtype = "float32" stype = "csr" target = "llvm" - ctx = tvm.context(target, 0) + dev = tvm.device(target, 0) nr, nc, n = te.size_var("nr"), te.size_var("nc"), te.size_var("n") A = tvmsp.placeholder(shape=(nr, nc), nonzeros=n, name="A", dtype=dtype) assert A.stype == "csr" @@ -61,14 +61,14 @@ def test_dynamic_tensor(): s = te.create_schedule(C.op) _nr, _nc = 3, 5 a = np.maximum(np.random.uniform(size=(_nr, _nc)).astype(dtype) - 0.6, 0.0) - a = tvmsp.array(a, ctx) + a = tvmsp.array(a, dev) assert a.data.dtype == a.dtype Ab = namedtuple("CSRBuffer", ["data", "indices", "indptr"]) Ab.data = tvm.tir.decl_buffer(a.data.shape, a.data.dtype, name="A_data") Ab.indices = tvm.tir.decl_buffer(a.data.shape, a.data.dtype, name="A_indices") binds = {A.data: Ab.data, A.indices: Ab.indices} f = tvm.build(s, [nr, A.data, C], target, binds=binds) - c = tvmsp.array(np.zeros((_nr, _nc), dtype), ctx) + c = tvmsp.array(np.zeros((_nr, _nc), dtype), dev) c.data = tvm.nd.empty(a.data.shape, dtype) c.indices = a.indices c.indptr = a.indptr @@ -80,7 +80,7 @@ def 
test_sparse_array_tuple(): dtype, itype = "float32", "int32" stype = "csr" target = "llvm" - ctx = tvm.context(target, 0) + dev = tvm.device(target, 0) nr, nc, n = te.size_var("nr"), te.size_var("nc"), te.size_var("n") A = tvmsp.placeholder(shape=(nr, nc), nonzeros=n, name="A", dtype=dtype) assert A.stype == "csr" @@ -92,22 +92,22 @@ def test_sparse_array_tuple(): source_array = a ridx, cidx = np.nonzero(source_array) data = source_array[ridx, cidx] - a_data = _nd.array(data, ctx) + a_data = _nd.array(data, dev) indices = np.nonzero(source_array)[1].astype(itype) - a_indices = _nd.array(indices, ctx) + a_indices = _nd.array(indices, dev) indptr = [0] + np.apply_along_axis(np.count_nonzero, axis=1, arr=source_array).tolist() indptr = np.cumsum(np.array(indptr, itype)).astype(itype) - a_indptr = _nd.array(indptr, ctx) + a_indptr = _nd.array(indptr, dev) a_init = (a_data, a_indices, a_indptr) # construct tvm sparse array with tuple - a = tvmsp.array(a_init, shape=source_array.shape, ctx=ctx) + a = tvmsp.array(a_init, shape=source_array.shape, device=dev) assert a.data.dtype == a.dtype Ab = namedtuple("CSRBuffer", ["data", "indices", "indptr"]) Ab.data = tvm.tir.decl_buffer(a.data.shape, a.data.dtype, name="A_data") Ab.indices = tvm.tir.decl_buffer(a.data.shape, a.data.dtype, name="A_indices") binds = {A.data: Ab.data, A.indices: Ab.indices} f = tvm.build(s, [nr, A.data, C], target, binds=binds) - c = tvmsp.array(np.zeros((_nr, _nc), dtype), ctx) + c = tvmsp.array(np.zeros((_nr, _nc), dtype), dev) c.data = tvm.nd.empty(a.data.shape, dtype) c.indices = a.indices c.indptr = a.indptr diff --git a/tests/python/contrib/test_tensorrt.py b/tests/python/contrib/test_tensorrt.py index ae8214d6463c..2bef7be65938 100644 --- a/tests/python/contrib/test_tensorrt.py +++ b/tests/python/contrib/test_tensorrt.py @@ -24,7 +24,7 @@ from tvm import relay, runtime from tvm.relay.op.contrib import tensorrt -from tvm.contrib import graph_runtime, utils +from tvm.contrib import graph_executor, utils from tvm.runtime.vm import VirtualMachine from tvm.relay import Any, GlobalVar, transform from tvm.relay.expr_functor import ExprVisitor @@ -97,7 +97,7 @@ def run_and_verify_func(config, target="cuda"): for k, v in input_shapes.items() if k not in is_param } - ctx = tvm.context(target) + dev = tvm.device(target) result_dict = dict() for mode in ["graph", "vm"]: @@ -110,10 +110,10 @@ def run_and_verify_func(config, target="cuda"): with tvm.transform.PassContext( opt_level=3, config={"relay.ext.tensorrt.options": config} ): - exec = relay.create_executor(mode, mod=mod, ctx=ctx, target=target) + exec = relay.create_executor(mode, mod=mod, device=dev, target=target) else: with tvm.transform.PassContext(opt_level=3): - exec = relay.create_executor(mode, mod=mod, ctx=ctx, target=target) + exec = relay.create_executor(mode, mod=mod, device=dev, target=target) if not skip_runtime_test(): result_dict[result_key] = exec.evaluate()(**input_dict, **params) @@ -143,10 +143,10 @@ def compile_and_run(mod, params, i_data, mode="vm", use_trt=True): with tvm.transform.PassContext( opt_level=3, config={"relay.ext.tensorrt.options": config} ): - exec = relay.create_executor(mode, mod=mod, ctx=tvm.gpu(0), target="cuda") + exec = relay.create_executor(mode, mod=mod, device=tvm.gpu(0), target="cuda") else: with tvm.transform.PassContext(opt_level=3): - exec = relay.create_executor(mode, mod=mod, ctx=tvm.gpu(0), target="cuda") + exec = relay.create_executor(mode, mod=mod, device=tvm.gpu(0), target="cuda") res = exec.evaluate()(i_data, **params) 
if not skip_runtime_test() else None return res @@ -198,10 +198,14 @@ def test_tensorrt_simple(): with tvm.transform.PassContext( opt_level=3, config={"relay.ext.tensorrt.options": config} ): - relay_exec = relay.create_executor(mode, mod=mod, ctx=tvm.gpu(0), target="cuda") + relay_exec = relay.create_executor( + mode, mod=mod, device=tvm.gpu(0), target="cuda" + ) else: with tvm.transform.PassContext(opt_level=3): - relay_exec = relay.create_executor(mode, mod=mod, ctx=tvm.gpu(0), target="cuda") + relay_exec = relay.create_executor( + mode, mod=mod, device=tvm.gpu(0), target="cuda" + ) if not skip_runtime_test(): result_dict[result_key] = relay_exec.evaluate()(x_data, y_data, z_data) @@ -243,12 +247,12 @@ def test_tensorrt_not_compatible(): mod, config = tensorrt.partition_for_tensorrt(mod) for mode in ["graph", "vm"]: with tvm.transform.PassContext(opt_level=3, config={"relay.ext.tensorrt.options": config}): - exec = relay.create_executor(mode, mod=mod, ctx=tvm.gpu(0), target="cuda") + exec = relay.create_executor(mode, mod=mod, device=tvm.gpu(0), target="cuda") if not skip_runtime_test(): results = exec.evaluate()(x_data) -def test_tensorrt_serialize_graph_runtime(): +def test_tensorrt_serialize_graph_executor(): if skip_codegen_test(): return import mxnet as mx @@ -269,7 +273,7 @@ def compile_graph(mod, params): return graph, lib, params def run_graph(graph, lib, params): - mod_ = graph_runtime.create(graph, lib, ctx=tvm.gpu(0)) + mod_ = graph_executor.create(graph, lib, device=tvm.gpu(0)) mod_.load_params(params) mod_.run(data=i_data) res = mod_.get_output(0) @@ -292,7 +296,7 @@ def load_graph(): lib = tvm.runtime.load_module(tmpdir.relpath("compiled.so")) return graph, lib, params - # Test serialization with graph runtime + # Test serialization with graph executor graph, lib, graph_params = compile_graph(mod, params) save_graph(graph, lib, graph_params) loaded_graph, loaded_lib, loaded_params = load_graph() @@ -702,7 +706,9 @@ def test_run(x_data_list, x_shape, new_shape, should_offload_to_trt): assert are_ops_on_trt(mod, op_list=["reshape"]) == should_offload_to_trt if not skip_runtime_test(): with relay.build_config(opt_level=3): - relay_exec = relay.create_executor("vm", mod=mod, ctx=tvm.cpu(0), target="llvm") + relay_exec = relay.create_executor( + "vm", mod=mod, device=tvm.cpu(0), target="llvm" + ) for i, x_data in enumerate(x_data_list): result_arr[i][use_trt] = relay_exec.evaluate()(x_data) @@ -1172,7 +1178,7 @@ def test_tensorrt_dynamic_batch(): if not skip_runtime_test(): with relay.build_config(opt_level=3): - relay_exec = relay.create_executor("vm", mod=mod, ctx=tvm.cpu(0), target="llvm") + relay_exec = relay.create_executor("vm", mod=mod, device=tvm.cpu(0), target="llvm") for i, batch_size in enumerate(batches_to_test): result_arr[i][use_trt] = relay_exec.evaluate()(x_data[:batch_size, ...]) @@ -1203,7 +1209,7 @@ def test_tensorrt_dynamic_batch_conv(): if not skip_runtime_test(): with relay.build_config(opt_level=3): - relay_exec = relay.create_executor("vm", mod=mod, ctx=tvm.cpu(0), target="llvm") + relay_exec = relay.create_executor("vm", mod=mod, device=tvm.cpu(0), target="llvm") for i, batch_size in enumerate(batches_to_test): result_arr[i][use_trt] = relay_exec.evaluate()(x_data[:batch_size, ...], **params) @@ -1301,8 +1307,8 @@ def get_maskrcnn_input(in_size: int) -> np.ndarray: if skip_runtime_test(): return - ctx = tvm.cpu() - vm = tvm.runtime.vm.VirtualMachine(vm_trt_exec, ctx) + dev = tvm.cpu() + vm = tvm.runtime.vm.VirtualMachine(vm_trt_exec, dev) 
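The hunks above and below are all instances of the same mechanical migration: tvm.context(...) becomes tvm.device(...), the conventional variable name moves from ctx to dev, and the shortcut constructors (tvm.cpu, tvm.gpu, tvm.rocm, tvm.metal) and the way NDArrays bind to a device are unchanged. A minimal sketch of the new spelling, illustrative only and not part of the patch; the shape and dtype are arbitrary:

    import numpy as np
    import tvm

    # tvm.device replaces tvm.context; "llvm" resolves to the local CPU.
    dev = tvm.device("llvm", 0)  # formerly: ctx = tvm.context("llvm", 0)

    # Arrays are placed on a device exactly as before; only the name changed.
    a = tvm.nd.array(np.zeros((2, 2), dtype="float32"), dev)
    b = tvm.nd.empty((2, 2), "float32", dev)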
vm.set_input("main", **{"input0": np_sample_input}) tvm_res = vm.run() @@ -1354,7 +1360,7 @@ def test_empty_subgraph(): x_data = np.random.uniform(-1, 1, x_shape).astype("float32") for mode in ["graph", "vm"]: with tvm.transform.PassContext(opt_level=3): - exec = relay.create_executor(mode, mod=mod, ctx=tvm.gpu(0), target="cuda") + exec = relay.create_executor(mode, mod=mod, device=tvm.gpu(0), target="cuda") if not skip_runtime_test(): results = exec.evaluate()(x_data) diff --git a/tests/python/contrib/test_tflite_runtime.py b/tests/python/contrib/test_tflite_runtime.py index 39d8881f4040..222dcd469401 100644 --- a/tests/python/contrib/test_tflite_runtime.py +++ b/tests/python/contrib/test_tflite_runtime.py @@ -130,7 +130,6 @@ def test_remote(): # inference via remote tvm tflite runtime server = rpc.Server("localhost") remote = rpc.connect(server.host, server.port) - ctx = remote.cpu(0) a = remote.upload(tflite_model_path) with open(tflite_model_path, "rb") as model_fin: diff --git a/tests/python/contrib/test_thrust.py b/tests/python/contrib/test_thrust.py index 4edce0d6a642..7b4b3a3840ae 100644 --- a/tests/python/contrib/test_thrust.py +++ b/tests/python/contrib/test_thrust.py @@ -43,7 +43,7 @@ def test_stable_sort_by_key(): print("skip because thrust is not enabled...") return - ctx = tvm.context(target, 0) + dev = tvm.device(target, 0) s = te.create_schedule([keys_out.op, values_out.op]) f = tvm.build(s, [keys, values, keys_out, values_out], target) @@ -51,10 +51,10 @@ def test_stable_sort_by_key(): values_np = np.random.randint(0, 10, size=(size,)).astype(np.int32) keys_np_out = np.zeros(keys_np.shape, np.int32) values_np_out = np.zeros(values_np.shape, np.int32) - keys_in = tvm.nd.array(keys_np, ctx) - values_in = tvm.nd.array(values_np, ctx) - keys_out = tvm.nd.array(keys_np_out, ctx) - values_out = tvm.nd.array(values_np_out, ctx) + keys_in = tvm.nd.array(keys_np, dev) + values_in = tvm.nd.array(values_np, dev) + keys_out = tvm.nd.array(keys_np_out, dev) + values_out = tvm.nd.array(values_np_out, dev) f(keys_in, values_in, keys_out, values_out) ref_keys_out = np.sort(keys_np) @@ -80,7 +80,7 @@ def test_exclusive_scan(): scan, reduction = exclusive_scan(values, return_reduction=True) s = schedule_scan([scan, reduction]) - ctx = tvm.context(target, 0) + dev = tvm.device(target, 0) f = tvm.build(s, [values, scan, reduction], target) values_np = np.random.randint(0, 10, size=ishape).astype(np.int32) @@ -93,9 +93,9 @@ def test_exclusive_scan(): reduction_np_out = np.zeros(reduction_shape, np.int32) - values_in = tvm.nd.array(values_np, ctx) - values_out = tvm.nd.array(values_np_out, ctx) - reduction_out = tvm.nd.array(reduction_np_out, ctx) + values_in = tvm.nd.array(values_np, dev) + values_out = tvm.nd.array(values_np_out, dev) + reduction_out = tvm.nd.array(reduction_np_out, dev) f(values_in, values_out, reduction_out) ref_values_out = np.cumsum(values_np, axis=-1, dtype="int32") - values_np @@ -123,13 +123,13 @@ def test_inclusive_scan(): scan = scan_thrust(values, out_dtype, exclusive=False) s = tvm.te.create_schedule([scan.op]) - ctx = tvm.context(target, 0) + dev = tvm.device(target, 0) f = tvm.build(s, [values, scan], target) values_np = np.random.randint(0, 10, size=ishape).astype(np.int32) values_np_out = np.zeros(values_np.shape, out_dtype) - values_in = tvm.nd.array(values_np, ctx) - values_out = tvm.nd.array(values_np_out, ctx) + values_in = tvm.nd.array(values_np, dev) + values_out = tvm.nd.array(values_np_out, dev) f(values_in, values_out) ref_values_out = 
np.cumsum(values_np, axis=-1, dtype=out_dtype) diff --git a/tests/python/contrib/test_verilator/infrastructure.py b/tests/python/contrib/test_verilator/infrastructure.py index 7e4c297853d5..cf9f8bd4c6bc 100644 --- a/tests/python/contrib/test_verilator/infrastructure.py +++ b/tests/python/contrib/test_verilator/infrastructure.py @@ -113,6 +113,6 @@ def compile_module(mod): def run_module(exe, inputs): """Run Relay module""" - ctx = tvm.cpu() - vm = runtime.vm.VirtualMachine(exe, ctx) + dev = tvm.cpu() + vm = runtime.vm.VirtualMachine(exe, dev) return vm.run(**inputs) diff --git a/tests/python/contrib/test_vitis_ai/infrastructure.py b/tests/python/contrib/test_vitis_ai/infrastructure.py index df7836a37647..501ee255c143 100644 --- a/tests/python/contrib/test_vitis_ai/infrastructure.py +++ b/tests/python/contrib/test_vitis_ai/infrastructure.py @@ -34,7 +34,7 @@ from tvm.relay.op.contrib.vitis_ai import annotation from tvm.relay.build_module import bind_params_by_name from tvm.contrib.target import vitis_ai -from tvm.contrib import graph_runtime +from tvm.contrib import graph_executor from tvm.contrib import utils @@ -145,7 +145,7 @@ def verify_result( result, tol=1e-5, target="llvm", - ctx=tvm.cpu(), + device=tvm.cpu(), params=None, dpu_target="DPUCADX8G", tvm_ops=0, @@ -154,8 +154,7 @@ def verify_result( lib = build_module(mod, target, params=params, dpu_target=dpu_target, tvm_ops=tvm_ops) lib = update_lib(lib) - ctx = tvm.cpu() - rt_mod = graph_runtime.GraphModule(lib["default"](tvm.cpu())) + rt_mod = graph_executor.GraphModule(lib["default"](tvm.cpu())) for name, data in map_inputs.items(): rt_mod.set_input(name, data) @@ -166,6 +165,6 @@ def verify_result( results = result if isinstance(result, list) else [result] for idx, shape in enumerate(out_shapes): - out = tvm.nd.empty(shape, ctx=ctx) + out = tvm.nd.empty(shape, device=device) out = rt_mod.get_output(idx, out) tvm.testing.assert_allclose(out.asnumpy(), results[idx], rtol=tol, atol=tol) diff --git a/tests/python/contrib/test_vitis_ai/test_vitis_ai_runtime_cpu_part.py b/tests/python/contrib/test_vitis_ai/test_vitis_ai_runtime_cpu_part.py index 030dda372cfe..64071325ef52 100644 --- a/tests/python/contrib/test_vitis_ai/test_vitis_ai_runtime_cpu_part.py +++ b/tests/python/contrib/test_vitis_ai/test_vitis_ai_runtime_cpu_part.py @@ -59,7 +59,7 @@ def test_extern_vitis_ai_resnet18(): mod, params = relay.testing.resnet.get_workload(num_layers=18, batch_size=1) ref_mod, params = relay.testing.resnet.get_workload(num_layers=18, batch_size=1) - ref_ex = relay.create_executor("graph", mod=ref_mod, ctx=tvm.cpu(0)) + ref_ex = relay.create_executor("graph", mod=ref_mod, device=tvm.cpu(0)) i_data = np.random.uniform(0, 1, ishape).astype(dtype) ref_res = ref_ex.evaluate()(i_data, **params) diff --git a/tests/python/driver/tvmc/test_compiler.py b/tests/python/driver/tvmc/test_compiler.py index 6c2125262e0e..17b2834feb11 100644 --- a/tests/python/driver/tvmc/test_compiler.py +++ b/tests/python/driver/tvmc/test_compiler.py @@ -45,7 +45,7 @@ def test_save_dumps(tmpdir_factory): def verify_compile_tflite_module(model, shape_dict=None): pytest.importorskip("tflite") - graph, lib, params, dumps = tvmc.compiler.compile_model( + graph, lib, params, dumps = tvmc.compile( model, target="llvm", dump_code="ll", alter_layout="NCHW", shape_dict=shape_dict ) @@ -74,7 +74,7 @@ def test_compile_tflite_module(tflite_mobilenet_v1_1_quant): def test_cross_compile_aarch64_tflite_module(tflite_mobilenet_v1_1_quant): pytest.importorskip("tflite") - graph, lib, params, dumps 
= tvmc.compiler.compile_model( + graph, lib, params, dumps = tvmc.compile( tflite_mobilenet_v1_1_quant, target="llvm -device=arm_cpu -mtriple=aarch64-linux-gnu -mattr='+neon'", dump_code="asm", @@ -91,9 +91,7 @@ def test_compile_keras__save_module(keras_resnet50, tmpdir_factory): # some CI environments wont offer tensorflow/Keras, so skip in case it is not present pytest.importorskip("tensorflow") - graph, lib, params, dumps = tvmc.compiler.compile_model( - keras_resnet50, target="llvm", dump_code="ll" - ) + graph, lib, params, dumps = tvmc.compile(keras_resnet50, target="llvm", dump_code="ll") expected_temp_dir = tmpdir_factory.mktemp("saved_output") expected_file_name = "saved.tar" @@ -111,7 +109,7 @@ def test_cross_compile_aarch64_keras_module(keras_resnet50): # some CI environments wont offer tensorflow/Keras, so skip in case it is not present pytest.importorskip("tensorflow") - graph, lib, params, dumps = tvmc.compiler.compile_model( + graph, lib, params, dumps = tvmc.compile( keras_resnet50, target="llvm -device=arm_cpu -mtriple=aarch64-linux-gnu -mattr='+neon'", dump_code="asm", @@ -129,7 +127,7 @@ def verify_compile_onnx_module(model, shape_dict=None): # some CI environments wont offer onnx, so skip in case it is not present pytest.importorskip("onnx") - graph, lib, params, dumps = tvmc.compiler.compile_model( + graph, lib, params, dumps = tvmc.compile( model, target="llvm", dump_code="ll", shape_dict=shape_dict ) @@ -158,7 +156,7 @@ def test_cross_compile_aarch64_onnx_module(onnx_resnet50): # some CI environments wont offer onnx, so skip in case it is not present pytest.importorskip("onnx") - graph, lib, params, dumps = tvmc.compiler.compile_model( + graph, lib, params, dumps = tvmc.compile( onnx_resnet50, target="llvm -device=arm_cpu -mtriple=aarch64-linux-gnu -mattr=+neon", dump_code="asm", @@ -176,7 +174,7 @@ def test_cross_compile_aarch64_onnx_module(onnx_resnet50): def test_compile_opencl(tflite_mobilenet_v1_0_25_128): pytest.importorskip("tflite") - graph, lib, params, dumps = tvmc.compiler.compile_model( + graph, lib, params, dumps = tvmc.compile( tflite_mobilenet_v1_0_25_128, target="opencl --host=llvm", alter_layout="NCHW", @@ -196,7 +194,7 @@ def test_compile_opencl(tflite_mobilenet_v1_0_25_128): def test_compile_tflite_module_with_external_codegen(tflite_mobilenet_v1_1_quant): pytest.importorskip("tflite") - graph, lib, params, dumps = tvmc.compiler.compile_model( + graph, lib, params, dumps = tvmc.compile( tflite_mobilenet_v1_1_quant, target="ethos-n77, llvm", dump_code="relay" ) @@ -220,7 +218,7 @@ def test_compile_check_configs_composite_target(mock_pc, mock_fe, mock_ct, mock_ mock_ct.return_value = mock_codegen mock_relay.return_value = mock.MagicMock() - graph, lib, params, dumps = tvmc.compiler.compile_model( + graph, lib, params, dumps = tvmc.compile( "no_file_needed", target="mockcodegen -testopt=value, llvm" ) diff --git a/tests/python/driver/tvmc/test_frontends.py b/tests/python/driver/tvmc/test_frontends.py index b41f4c4dff2d..3da63d43ef29 100644 --- a/tests/python/driver/tvmc/test_frontends.py +++ b/tests/python/driver/tvmc/test_frontends.py @@ -93,7 +93,7 @@ def test_load_model__invalid_path__no_language(): pytest.importorskip("tflite") with pytest.raises(FileNotFoundError): - tvmc.frontends.load_model("not/a/file.tflite") + tvmc.load("not/a/file.tflite") def test_load_model__invalid_path__with_language(): @@ -101,47 +101,55 @@ def test_load_model__invalid_path__with_language(): pytest.importorskip("onnx") with pytest.raises(FileNotFoundError): - 
tvmc.frontends.load_model("not/a/file.txt", model_format="onnx") + tvmc.load("not/a/file.txt", model_format="onnx") def test_load_model__tflite(tflite_mobilenet_v1_1_quant): # some CI environments wont offer TFLite, so skip in case it is not present pytest.importorskip("tflite") - mod, params = tvmc.frontends.load_model(tflite_mobilenet_v1_1_quant) + mod, params = tvmc.load(tflite_mobilenet_v1_1_quant) assert type(mod) is IRModule assert type(params) is dict # check whether one known value is part of the params dict assert "_param_1" in params.keys() -def test_load_model__keras(keras_resnet50): +@pytest.mark.parametrize("load_model_kwargs", [{}, {"layout": "NCHW"}]) +def test_load_model__keras(keras_resnet50, load_model_kwargs): # some CI environments wont offer TensorFlow/Keras, so skip in case it is not present pytest.importorskip("tensorflow") - mod, params = tvmc.frontends.load_model(keras_resnet50) + mod, params = tvmc.frontends.load_model(keras_resnet50, **load_model_kwargs) assert type(mod) is IRModule assert type(params) is dict ## check whether one known value is part of the params dict assert "_param_1" in params.keys() +def verify_load_model__onnx(model, **kwargs): + mod, params = tvmc.frontends.load_model(model, **kwargs) + assert type(mod) is IRModule + assert type(params) is dict + return mod, params + + def test_load_model__onnx(onnx_resnet50): # some CI environments wont offer onnx, so skip in case it is not present pytest.importorskip("onnx") - - mod, params = tvmc.frontends.load_model(onnx_resnet50) - assert type(mod) is IRModule - assert type(params) is dict - ## check whether one known value is part of the params dict + mod, params = verify_load_model__onnx(onnx_resnet50) + # check whether one known value is part of the params dict assert "resnetv24_batchnorm0_gamma" in params.keys() + mod, params = verify_load_model__onnx(onnx_resnet50, freeze_params=True) + # check that the parameter dict is empty, implying that they have been folded into constants + assert params == {} def test_load_model__pb(pb_mobilenet_v1_1_quant): # some CI environments wont offer TensorFlow, so skip in case it is not present pytest.importorskip("tensorflow") - mod, params = tvmc.frontends.load_model(pb_mobilenet_v1_1_quant) + mod, params = tvmc.load(pb_mobilenet_v1_1_quant) assert type(mod) is IRModule assert type(params) is dict # check whether one known value is part of the params dict @@ -153,7 +161,7 @@ def test_load_model___wrong_language__to_keras(tflite_mobilenet_v1_1_quant): pytest.importorskip("tensorflow") with pytest.raises(OSError): - tvmc.frontends.load_model(tflite_mobilenet_v1_1_quant, model_format="keras") + tvmc.load(tflite_mobilenet_v1_1_quant, model_format="keras") def test_load_model___wrong_language__to_tflite(keras_resnet50): @@ -171,7 +179,7 @@ def test_load_model___wrong_language__to_onnx(tflite_mobilenet_v1_1_quant): from google.protobuf.message import DecodeError with pytest.raises(DecodeError): - tvmc.frontends.load_model(tflite_mobilenet_v1_1_quant, model_format="onnx") + tvmc.load(tflite_mobilenet_v1_1_quant, model_format="onnx") @pytest.mark.skip(reason="https://github.com/apache/tvm/issues/7455") @@ -180,9 +188,7 @@ def test_load_model__pth(pytorch_resnet18): pytest.importorskip("torch") pytest.importorskip("torchvision") - mod, params = tvmc.frontends.load_model( - pytorch_resnet18, shape_dict={"input": [1, 3, 224, 224]} - ) + mod, params = tvmc.load(pytorch_resnet18, shape_dict={"input": [1, 3, 224, 224]}) assert type(mod) is IRModule assert type(params) is dict 
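The tvmc hunks in this patch swap the long-form entry points (tvmc.frontends.load_model, tvmc.compiler.compile_model, tvmc.runner.run_module) for the top-level shortcuts tvmc.load, tvmc.compile, and tvmc.run that these tests now exercise. A sketch of the resulting flow under the signatures visible in the hunks; the model path is a placeholder and the import line is an assumption:

    from tvm.driver import tvmc

    # Hypothetical model file; any format a tvmc frontend recognizes would do.
    model_path = "mobilenet_v1_1.0_quant.tflite"

    # tvmc.load returns the Relay module and its parameter dict.
    mod, params = tvmc.load(model_path)

    # tvmc.compile returns the same four-tuple the updated tests unpack.
    graph, lib, params, dumps = tvmc.compile(model_path, target="llvm", dump_code="ll")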
# check whether one known value is part of the params dict @@ -194,7 +200,7 @@ def test_load_model___wrong_language__to_pytorch(tflite_mobilenet_v1_1_quant): pytest.importorskip("torch") with pytest.raises(RuntimeError) as e: - tvmc.frontends.load_model( + tvmc.load( tflite_mobilenet_v1_1_quant, model_format="pytorch", shape_dict={"input": [1, 3, 224, 224]}, diff --git a/tests/python/driver/tvmc/test_runner.py b/tests/python/driver/tvmc/test_runner.py index 544ed9f7e9df..5fdf58fa8d64 100644 --- a/tests/python/driver/tvmc/test_runner.py +++ b/tests/python/driver/tvmc/test_runner.py @@ -73,7 +73,7 @@ def test_run_tflite_module__with_profile__valid_input( # some CI environments wont offer TFLite, so skip in case it is not present pytest.importorskip("tflite") - outputs, times = tvmc.runner.run_module( + outputs, times = tvmc.run( tflite_compiled_module_as_tarfile, inputs_file=imagenet_cat, hostname=None, diff --git a/tests/python/driver/tvmc/test_tvmc_common.py b/tests/python/driver/tvmc/test_tvmc_common.py index b272ceccea39..474649d8b1b3 100644 --- a/tests/python/driver/tvmc/test_tvmc_common.py +++ b/tests/python/driver/tvmc/test_tvmc_common.py @@ -273,3 +273,18 @@ def test_parse_multiple_target_with_opts(): assert "myopt" in targets[0]["opts"] assert "value" == targets[0]["opts"]["myopt"] assert "llvm" == targets[1]["name"] + + +def test_parse_quotes_and_separators_on_options(): + targets_no_quote = tvmc.common.parse_target("foo -option1=+v1.0x,+value,+bar") + targets_single_quote = tvmc.common.parse_target("foo -option1='+v1.0x,+value'") + targets_double_quote = tvmc.common.parse_target('foo -option1="+v1.0x,+value"') + + assert len(targets_no_quote) == 1 + assert "+v1.0x,+value,+bar" == targets_no_quote[0]["opts"]["option1"] + + assert len(targets_single_quote) == 1 + assert "+v1.0x,+value" == targets_single_quote[0]["opts"]["option1"] + + assert len(targets_double_quote) == 1 + assert "+v1.0x,+value" == targets_double_quote[0]["opts"]["option1"] diff --git a/tests/python/frontend/caffe/test_forward.py b/tests/python/frontend/caffe/test_forward.py index d75ecd83a285..d0f87fcc21c7 100644 --- a/tests/python/frontend/caffe/test_forward.py +++ b/tests/python/frontend/caffe/test_forward.py @@ -36,7 +36,7 @@ import tvm from tvm import relay -from tvm.contrib import utils, graph_runtime +from tvm.contrib import utils, graph_executor from tvm.contrib.download import download_testdata CURRENT_DIR = os.path.join(os.path.expanduser("~"), ".tvm_test_data", "caffe_test") @@ -201,11 +201,11 @@ def _run_tvm(data, proto_file, blob_file): target = "llvm" target_host = "llvm" - ctx = tvm.cpu(0) + dev = tvm.cpu(0) with tvm.transform.PassContext(opt_level=3): lib = relay.build(mod, target=target, target_host=target_host, params=params) dtype = "float32" - m = graph_runtime.GraphModule(lib["default"](ctx)) + m = graph_executor.GraphModule(lib["default"](dev)) if isinstance(data, (tuple, list)): for idx, d in enumerate(data): m.set_input("data" + str(idx), tvm.nd.array(d.astype(dtype))) diff --git a/tests/python/frontend/caffe2/test_forward.py b/tests/python/frontend/caffe2/test_forward.py index a45b86ca903f..1081b087c468 100644 --- a/tests/python/frontend/caffe2/test_forward.py +++ b/tests/python/frontend/caffe2/test_forward.py @@ -17,7 +17,7 @@ import numpy as np import tvm from tvm import te -from tvm.contrib import graph_runtime +from tvm.contrib import graph_executor from tvm import relay from model_zoo import c2_squeezenet, c2_resnet50, c2_vgg19 from caffe2.python import workspace, core @@ -26,7 +26,7
@@ import tvm.testing -def get_tvm_output(model, input_data, target, ctx, output_shape, output_dtype="float32"): +def get_tvm_output(model, input_data, target, device, output_shape, output_dtype="float32"): """ Generic function to execute and get tvm output""" # supporting multiple inputs in caffe2 in a bit tricky, # because the input names can appear at the beginning or end of model.predict_net.external_input @@ -42,7 +42,7 @@ def get_tvm_output(model, input_data, target, ctx, output_shape, output_dtype="f with tvm.transform.PassContext(opt_level=3): lib = relay.build(mod, target, params=params) - m = graph_runtime.GraphModule(lib["default"](ctx)) + m = graph_executor.GraphModule(lib["default"](device)) # set inputs m.set_input(input_names, tvm.nd.array(input_data.astype(input_data.dtype))) @@ -78,8 +78,8 @@ def verify_caffe2_forward_impl(model, data_shape, out_shape): dtype = "float32" data = np.random.uniform(size=data_shape).astype(dtype) c2_out = get_caffe2_output(model, data, dtype) - for target, ctx in tvm.testing.enabled_targets(): - tvm_out = get_tvm_output(model, data, target, ctx, out_shape, dtype) + for target, dev in tvm.testing.enabled_targets(): + tvm_out = get_tvm_output(model, data, target, dev, out_shape, dtype) tvm.testing.assert_allclose(c2_out, tvm_out, rtol=1e-5, atol=1e-5) diff --git a/tests/python/frontend/coreml/test_forward.py b/tests/python/frontend/coreml/test_forward.py index 1d3f6c90a48e..c227c3955c5b 100644 --- a/tests/python/frontend/coreml/test_forward.py +++ b/tests/python/frontend/coreml/test_forward.py @@ -21,7 +21,7 @@ import tvm from tvm import te -from tvm.contrib import graph_runtime +from tvm.contrib import graph_executor from tvm import topi import tvm.topi.testing from tvm import relay @@ -33,11 +33,11 @@ def get_tvm_output( - func, x, params, target, ctx, out_shape=(1, 1000), input_name="image", dtype="float32" + func, x, params, target, device, out_shape=(1, 1000), input_name="image", dtype="float32" ): with tvm.transform.PassContext(opt_level=3): lib = relay.build(func, target, params=params) - m = graph_runtime.GraphModule(lib["default"](ctx)) + m = graph_executor.GraphModule(lib["default"](device)) # set inputs m.set_input(input_name, tvm.nd.array(x.astype(dtype))) m.run() @@ -52,10 +52,10 @@ def run_model_checkonly(model_file, model_name="", input_name="image"): shape_dict = {input_name: x.shape} # Some Relay passes change operators on the fly. Ensuring that we generate # new graph for each target. 
- for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): mod, params = relay.frontend.from_coreml(model, shape_dict) - tvm_output = get_tvm_output(mod["main"], x, params, target, ctx) - print(target, ctx, model_name, "prediction id: ", np.argmax(tvm_output.flat)) + tvm_output = get_tvm_output(mod["main"], x, params, target, dev) + print(target, dev, model_name, "prediction id: ", np.argmax(tvm_output.flat)) @tvm.testing.uses_gpu @@ -71,7 +71,7 @@ def test_resnet50_checkonly(): def run_tvm_graph( - coreml_model, target, ctx, input_data, input_name, output_shape, output_dtype="float32" + coreml_model, target, device, input_data, input_name, output_shape, output_dtype="float32" ): """ Generic function to compile on relay and execute on tvm """ if isinstance(input_data, list): @@ -88,9 +88,9 @@ def run_tvm_graph( with tvm.transform.PassContext(opt_level=3): lib = relay.build(mod, target, params=params) - from tvm.contrib import graph_runtime + from tvm.contrib import graph_executor - m = graph_runtime.GraphModule(lib["default"](ctx)) + m = graph_executor.GraphModule(lib["default"](device)) # set inputs if isinstance(input_data, list): for i, e in enumerate(input_name): @@ -129,9 +129,9 @@ def verify_AddLayerParams(input_dim, alpha=2): name="Add", alpha=alpha, input_names=["input1", "input2"], output_name="output", mode="ADD" ) model = cm.models.MLModel(builder.spec) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): out = run_tvm_graph( - model, target, ctx, [a_np1, a_np2], ["input1", "input2"], b_np.shape, dtype + model, target, dev, [a_np1, a_np2], ["input1", "input2"], b_np.shape, dtype ) tvm.testing.assert_allclose(out, b_np, rtol=1e-5) @@ -161,9 +161,9 @@ def verify_MultiplyLayerParams(input_dim, alpha): mode="MULTIPLY", ) model = cm.models.MLModel(builder.spec) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): out = run_tvm_graph( - model, target, ctx, [a_np1, a_np2], ["input1", "input2"], b_np.shape, dtype + model, target, dev, [a_np1, a_np2], ["input1", "input2"], b_np.shape, dtype ) tvm.testing.assert_allclose(out, b_np, rtol=1e-5) @@ -189,9 +189,9 @@ def verify_ConcatLayerParams(input1_dim, input2_dim): name="Concate", input_names=["input1", "input2"], output_name="output", mode="CONCAT" ) model = cm.models.MLModel(builder.spec) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): out = run_tvm_graph( - model, target, ctx, [a_np1, a_np2], ["input1", "input2"], b_np.shape, dtype + model, target, dev, [a_np1, a_np2], ["input1", "input2"], b_np.shape, dtype ) tvm.testing.assert_allclose(out, b_np, rtol=1e-5) @@ -226,8 +226,8 @@ def verify_UpsampleLayerParams(input_dim, scale, mode): ) model = cm.models.MLModel(builder.spec) - for target, ctx in tvm.testing.enabled_targets(): - out = run_tvm_graph(model, target, ctx, a_np, "input", b_np.shape, dtype) + for target, dev in tvm.testing.enabled_targets(): + out = run_tvm_graph(model, target, dev, a_np, "input", b_np.shape, dtype) tvm.testing.assert_allclose(out, b_np, rtol=1e-5) @@ -249,8 +249,8 @@ def verify_l2_normalize(input_dim, eps): builder.add_l2_normalize(name="L2", epsilon=eps, input_name="input", output_name="output") model = cm.models.MLModel(builder.spec) - for target, ctx in tvm.testing.enabled_targets(): - out = run_tvm_graph(model, target, ctx, a_np, "input", b_np.shape, dtype) + for target, dev in 
tvm.testing.enabled_targets(): + out = run_tvm_graph(model, target, dev, a_np, "input", b_np.shape, dtype) tvm.testing.assert_allclose(out, b_np, rtol=1e-5) @@ -279,8 +279,8 @@ def verify_lrn(input_dim, size, bias, alpha, beta): ) model = cm.models.MLModel(builder.spec) - for target, ctx in tvm.testing.enabled_targets(): - out = run_tvm_graph(model, target, ctx, a_np, "input", b_np.shape, dtype) + for target, dev in tvm.testing.enabled_targets(): + out = run_tvm_graph(model, target, dev, a_np, "input", b_np.shape, dtype) tvm.testing.assert_allclose(out, b_np, rtol=1e-5) @@ -304,9 +304,9 @@ def verify_average(input_dim1, input_dim2, axis=0): name="MEAN", input_names=["input1", "input2"], output_name="output", mode="AVE" ) model = cm.models.MLModel(builder.spec) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): out = run_tvm_graph( - model, target, ctx, [a_np1, a_np2], ["input1", "input2"], b_np.shape, dtype + model, target, dev, [a_np1, a_np2], ["input1", "input2"], b_np.shape, dtype ) tvm.testing.assert_allclose(out, b_np, rtol=1e-5) @@ -338,11 +338,11 @@ def verify_max(input_dim): name="Max", input_names=["input1", "input2", "input3"], output_name="output", mode="MAX" ) model = cm.models.MLModel(builder.spec) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): out = run_tvm_graph( model, target, - ctx, + dev, [a_np1, a_np2, a_np3], ["input1", "input2", "input3"], b_np.shape, @@ -377,11 +377,11 @@ def verify_min(input_dim): name="Min", input_names=["input1", "input2", "input3"], output_name="output", mode="MIN" ) model = cm.models.MLModel(builder.spec) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): out = run_tvm_graph( model, target, - ctx, + dev, [a_np1, a_np2, a_np3], ["input1", "input2", "input3"], b_np.shape, @@ -408,8 +408,8 @@ def verify_unary_sqrt(input_dim): builder.add_unary(name="sqrt", input_name="input", output_name="output", mode="sqrt") model = cm.models.MLModel(builder.spec) - for target, ctx in tvm.testing.enabled_targets(): - out = run_tvm_graph(model, target, ctx, [a_np], ["input"], ref_val.shape, dtype) + for target, dev in tvm.testing.enabled_targets(): + out = run_tvm_graph(model, target, dev, [a_np], ["input"], ref_val.shape, dtype) tvm.testing.assert_allclose(out, ref_val, rtol=1e-5) @@ -427,8 +427,8 @@ def verify_unary_rsqrt(input_dim, epsilon=0): ) model = cm.models.MLModel(builder.spec) - for target, ctx in tvm.testing.enabled_targets(): - out = run_tvm_graph(model, target, ctx, [a_np], ["input"], ref_val.shape, dtype) + for target, dev in tvm.testing.enabled_targets(): + out = run_tvm_graph(model, target, dev, [a_np], ["input"], ref_val.shape, dtype) tvm.testing.assert_allclose(out, ref_val, rtol=1e-5) @@ -446,8 +446,8 @@ def verify_unary_inverse(input_dim, epsilon=0): ) model = cm.models.MLModel(builder.spec) - for target, ctx in tvm.testing.enabled_targets(): - out = run_tvm_graph(model, target, ctx, [a_np], ["input"], ref_val.shape, dtype) + for target, dev in tvm.testing.enabled_targets(): + out = run_tvm_graph(model, target, dev, [a_np], ["input"], ref_val.shape, dtype) tvm.testing.assert_allclose(out, ref_val, rtol=1e-5) @@ -465,8 +465,8 @@ def verify_unary_power(input_dim, alpha): ) model = cm.models.MLModel(builder.spec) - for target, ctx in tvm.testing.enabled_targets(): - out = run_tvm_graph(model, target, ctx, [a_np], ["input"], ref_val.shape, dtype) + for target, dev in 
tvm.testing.enabled_targets(): + out = run_tvm_graph(model, target, dev, [a_np], ["input"], ref_val.shape, dtype) tvm.testing.assert_allclose(out, ref_val, rtol=1e-5) @@ -482,8 +482,8 @@ def verify_unary_exp(input_dim): builder.add_unary(name="exp", input_name="input", output_name="output", mode="exp") model = cm.models.MLModel(builder.spec) - for target, ctx in tvm.testing.enabled_targets(): - out = run_tvm_graph(model, target, ctx, [a_np], ["input"], ref_val.shape, dtype) + for target, dev in tvm.testing.enabled_targets(): + out = run_tvm_graph(model, target, dev, [a_np], ["input"], ref_val.shape, dtype) tvm.testing.assert_allclose(out, ref_val, rtol=1e-5) @@ -499,8 +499,8 @@ def verify_unary_log(input_dim): builder.add_unary(name="log", input_name="input", output_name="output", mode="log") model = cm.models.MLModel(builder.spec) - for target, ctx in tvm.testing.enabled_targets(): - out = run_tvm_graph(model, target, ctx, [a_np], ["input"], ref_val.shape, dtype) + for target, dev in tvm.testing.enabled_targets(): + out = run_tvm_graph(model, target, dev, [a_np], ["input"], ref_val.shape, dtype) tvm.testing.assert_allclose(out, ref_val, rtol=1e-5) @@ -516,8 +516,8 @@ def verify_unary_abs(input_dim): builder.add_unary(name="abs", input_name="input", output_name="output", mode="abs") model = cm.models.MLModel(builder.spec) - for target, ctx in tvm.testing.enabled_targets(): - out = run_tvm_graph(model, target, ctx, [a_np], ["input"], ref_val.shape, dtype) + for target, dev in tvm.testing.enabled_targets(): + out = run_tvm_graph(model, target, dev, [a_np], ["input"], ref_val.shape, dtype) tvm.testing.assert_allclose(out, ref_val, rtol=1e-5) @@ -535,8 +535,8 @@ def verify_unary_threshold(input_dim, alpha): ) model = cm.models.MLModel(builder.spec) - for target, ctx in tvm.testing.enabled_targets(): - out = run_tvm_graph(model, target, ctx, [a_np], ["input"], ref_val.shape, dtype) + for target, dev in tvm.testing.enabled_targets(): + out = run_tvm_graph(model, target, dev, [a_np], ["input"], ref_val.shape, dtype) tvm.testing.assert_allclose(out, ref_val, rtol=1e-5) @@ -596,8 +596,8 @@ def _verify_reduce(input_dim, mode, axis, ref_func, dtype="float32"): ) model = cm.models.MLModel(builder.spec) - for target, ctx in tvm.testing.enabled_targets(): - out = run_tvm_graph(model, target, ctx, [a_np], ["input"], ref_val.shape, dtype) + for target, dev in tvm.testing.enabled_targets(): + out = run_tvm_graph(model, target, dev, [a_np], ["input"], ref_val.shape, dtype) tvm.testing.assert_allclose(out, ref_val, rtol=1e-5, atol=1e-5) dshapes = [[10, 10], [1, 10, 10], [1, 3, 10, 10]] @@ -634,8 +634,8 @@ def verify_reshape(input_dim, target_shape, mode): ) model = cm.models.MLModel(builder.spec) - for target, ctx in tvm.testing.enabled_targets(): - out = run_tvm_graph(model, target, ctx, [a_np], ["input"], ref_val.shape, dtype) + for target, dev in tvm.testing.enabled_targets(): + out = run_tvm_graph(model, target, dev, [a_np], ["input"], ref_val.shape, dtype) tvm.testing.assert_allclose(out, ref_val, rtol=1e-5) @@ -666,9 +666,9 @@ def verify_split(input_dim, nOutputs): builder.add_split(name="split", input_name="input", output_names=output_names) model = cm.models.MLModel(builder.spec) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): out = run_tvm_graph( - model, target, ctx, [a_np], ["input"], output_shapes, [dtype] * len(output_shapes) + model, target, dev, [a_np], ["input"], output_shapes, [dtype] * len(output_shapes) ) 
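Nearly every converted test in these frontend files follows the same loop: tvm.testing.enabled_targets() yields (target, device) pairs for whichever backends the current build and hardware enable, and the body runs once per pair. A minimal sketch of the pattern, assuming only that at least one target is enabled; the workload is a trivial round-trip through device memory:

    import numpy as np
    import tvm
    import tvm.testing

    for target, dev in tvm.testing.enabled_targets():
        # Allocate on the device under test, then copy back to the host.
        x = tvm.nd.array(np.ones((4, 4), dtype="float32"), dev)
        tvm.testing.assert_allclose(x.asnumpy(), np.ones((4, 4), dtype="float32"))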
tvm.testing.assert_allclose(out, ref_val, rtol=1e-5) @@ -721,9 +721,9 @@ def verify_image_scaler(input_dim, blue_bias=0.0, green_bias=0.0, red_bias=0.0, name="add", input_names=["input1", "input2"], output_name="output", alpha=0, mode="ADD" ) model = cm.models.MLModel(builder.spec) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): out = run_tvm_graph( - model, target, ctx, [a_np, a_np], ["input1", "input2"], b_np.shape, dtype + model, target, dev, [a_np, a_np], ["input1", "input2"], b_np.shape, dtype ) tvm.testing.assert_allclose(out, b_np, rtol=1e-5) @@ -769,8 +769,8 @@ def verify_convolution(input_dim, filter, padding): output_name="output", ) model = cm.models.MLModel(builder.spec) - for target, ctx in tvm.testing.enabled_targets(): - out = run_tvm_graph(model, target, ctx, [a_np], ["input1"], output_shape=None) + for target, dev in tvm.testing.enabled_targets(): + out = run_tvm_graph(model, target, dev, [a_np], ["input1"], output_shape=None) tvm.testing.assert_allclose(out, b_np, rtol=1e-5) diff --git a/tests/python/frontend/darknet/test_forward.py b/tests/python/frontend/darknet/test_forward.py index 1535c3a1b88f..3bb8e93d3d22 100644 --- a/tests/python/frontend/darknet/test_forward.py +++ b/tests/python/frontend/darknet/test_forward.py @@ -24,7 +24,7 @@ import numpy as np import tvm from tvm import te -from tvm.contrib import graph_runtime +from tvm.contrib import graph_executor from tvm.contrib.download import download_testdata download_testdata.__test__ = False @@ -79,8 +79,8 @@ def _get_tvm_output(net, data, build_dtype="float32", states=None): lib = relay.build(mod, target, params=params) # Execute on TVM - ctx = tvm.cpu(0) - m = graph_runtime.GraphModule(lib["default"](ctx)) + dev = tvm.cpu(0) + m = graph_executor.GraphModule(lib["default"](dev)) # set inputs m.set_input("data", tvm.nd.array(data.astype(dtype))) if states: diff --git a/tests/python/frontend/keras/test_forward.py b/tests/python/frontend/keras/test_forward.py index 561e444f077f..c7f734b891dd 100644 --- a/tests/python/frontend/keras/test_forward.py +++ b/tests/python/frontend/keras/test_forward.py @@ -18,7 +18,7 @@ import tvm from tvm import te from tvm import relay -from tvm.contrib import graph_runtime +from tvm.contrib import graph_executor import keras import tvm.testing @@ -84,12 +84,12 @@ def verify_keras_frontend(keras_model, need_transpose=True, layout="NCHW"): def get_keras_output(xs, dtype="float32"): return keras_model.predict(xs) - def get_tvm_output(xs, target, ctx, dtype="float32"): + def get_tvm_output(xs, target, dev, dtype="float32"): shape_dict = {name: x.shape for (name, x) in zip(keras_model.input_names, xs)} mod, params = relay.frontend.from_keras(keras_model, shape_dict, layout=layout) with tvm.transform.PassContext(opt_level=2): lib = relay.build(mod, target, params=params) - m = graph_runtime.GraphModule(lib["default"](ctx)) + m = graph_executor.GraphModule(lib["default"](dev)) for name, x in zip(keras_model.input_names, xs): m.set_input(name, tvm.nd.array(x.astype(dtype))) m.run() @@ -104,9 +104,9 @@ def to_channels_last(arr): xs = [np.random.uniform(size=shape, low=-1.0, high=1.0) for shape in in_shapes] keras_out = get_keras_output(xs) keras_out = keras_out if isinstance(keras_out, list) else [keras_out] - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): inputs = [to_channels_first(x) for x in xs] if need_transpose else xs - tvm_out = get_tvm_output(inputs, target, ctx) + tvm_out = 
get_tvm_output(inputs, target, dev) for kout, tout in zip(keras_out, tvm_out): if need_transpose: tout = to_channels_last(tout) diff --git a/tests/python/frontend/mxnet/test_forward.py b/tests/python/frontend/mxnet/test_forward.py index 4eb7f6139e8f..c4e8e804b15a 100644 --- a/tests/python/frontend/mxnet/test_forward.py +++ b/tests/python/frontend/mxnet/test_forward.py @@ -19,7 +19,7 @@ import tvm from tvm import te -from tvm.contrib import graph_runtime +from tvm.contrib import graph_executor from tvm import relay import mxnet as mx @@ -68,7 +68,7 @@ def get_mxnet_output(symbol, x, dtype="float32"): args, auxs = mod.get_params() return out, args, auxs - def get_tvm_output(symbol, x, args, auxs, target, ctx, dtype="float32"): + def get_tvm_output(symbol, x, args, auxs, target, dev, dtype="float32"): shape_dict = {"data": x.shape} if gluon_impl: mod, params = relay.frontend.from_mxnet(symbol, shape_dict) @@ -78,7 +78,7 @@ def get_tvm_output(symbol, x, args, auxs, target, ctx, dtype="float32"): ) with tvm.transform.PassContext(opt_level=3): lib = relay.build(mod, target, params=params) - m = graph_runtime.GraphModule(lib["default"](ctx)) + m = graph_executor.GraphModule(lib["default"](dev)) # set inputs m.set_input("data", tvm.nd.array(x.astype(dtype))) m.run() @@ -90,14 +90,14 @@ def get_tvm_output(symbol, x, args, auxs, target, ctx, dtype="float32"): x = np.random.uniform(size=data_shape) if gluon_impl: gluon_out, gluon_sym = get_gluon_output(name, x) - for target, ctx in tvm.testing.enabled_targets(): - tvm_out = get_tvm_output(gluon_sym, x, None, None, target, ctx, dtype) + for target, dev in tvm.testing.enabled_targets(): + tvm_out = get_tvm_output(gluon_sym, x, None, None, target, dev, dtype) tvm.testing.assert_allclose(gluon_out, tvm_out, rtol=1e-5, atol=1e-5) else: mx_out, args, auxs = get_mxnet_output(mx_symbol, x, dtype) assert "data" not in args - for target, ctx in tvm.testing.enabled_targets(): - tvm_out = get_tvm_output(mx_symbol, x, args, auxs, target, ctx, dtype) + for target, dev in tvm.testing.enabled_targets(): + tvm_out = get_tvm_output(mx_symbol, x, args, auxs, target, dev, dtype) tvm.testing.assert_allclose(mx_out, tvm_out, rtol=1e-5, atol=1e-5) @@ -333,9 +333,9 @@ def test_forward_where(): mx_out = mx.nd.where(mx_cond, mx_x, mx_y).asnumpy() mod, _ = relay.frontend.from_mxnet(mx_sym, shapes, args, auxs) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()(np_cond, np_x, np_y) tvm.testing.assert_allclose(op_res.asnumpy(), mx_out) @@ -357,9 +357,9 @@ def verify(start, stop, step): ref_res = _mx_symbol(mx.nd, start, stop, step).asnumpy() mx_sym = _mx_symbol(mx.sym, start, stop, step) mod, _ = relay.frontend.from_mxnet(mx_sym, {}) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()() tvm.testing.assert_allclose(op_res.asnumpy(), ref_res) @@ -416,9 +416,9 @@ def test_forward_broadcast_ops(): ref_res = _mx_symbol(mx.nd, op, [mx.nd.array(a_np), mx.nd.array(b_np)]) shapes = {"a": a_shape, "b": b_shape} mod, _ = relay.frontend.from_mxnet(mx_sym, shapes, dtype) - for target, ctx in 
tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()(a_np, b_np) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res.asnumpy()) @@ -451,9 +451,9 @@ def test_forward_elemwise_ops(): ref_res = op(mx.nd.array(a_np), mx.nd.array(b_np)) shapes = {"a": shape, "b": shape} mod, _ = relay.frontend.from_mxnet(mx_sym, shapes, dtype) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()(a_np, b_np) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res.asnumpy()) @@ -500,9 +500,9 @@ def test_forward_unary_ops(): ref_res = _mx_symbol(mx.nd, op, [mx.nd.array(a_np)]) shapes = {"a": shape} mod, _ = relay.frontend.from_mxnet(mx_sym, shapes, dtype) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()(a_np) tvm.testing.assert_allclose( op_res.asnumpy(), ref_res.asnumpy(), rtol=1e-5, atol=1e-5 @@ -532,9 +532,9 @@ def test_forward_scalar_ops(): ref_res = op(mx.nd.array(a_np), b_scalar) shapes = {"a": a_shape} mod, _ = relay.frontend.from_mxnet(mx_sym, shapes, dtype) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()(a_np) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res.asnumpy()) for op in ["maximum", "minimum"]: @@ -546,9 +546,9 @@ def test_forward_scalar_ops(): ref_res = _mx_symbol(mx.nd, op, [mx.nd.array(a_np), b_scalar]) shapes = {"a": a_shape} mod, _ = relay.frontend.from_mxnet(mx_sym, shapes, dtype) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()(a_np) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res.asnumpy()) @@ -560,9 +560,9 @@ def verify(shape, axis, begin, end): ref_res = mx.nd.slice_axis(mx.nd.array(data_np), axis, begin, end) mx_sym = mx.sym.slice_axis(mx.sym.var("data"), axis, begin, end) mod, _ = relay.frontend.from_mxnet(mx_sym, {"data": shape}) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()(data_np) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res.asnumpy()) @@ -585,9 +585,9 @@ def verify(x_shape, y_shape, axes): ref_res = mx.nd.slice_like(mx.nd.array(x_np), mx.nd.array(y_np), axes=axes) mx_sym = mx.sym.slice_like(mx.sym.var("x"), mx.sym.var("y"), axes=axes) mod, _ = 
relay.frontend.from_mxnet(mx_sym, {"x": x_shape, "y": y_shape}) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()(x_np, y_np) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res.asnumpy()) @@ -619,9 +619,9 @@ def verify(shape, seq_lengths, use_seq_lengths, seq_axis): mx_sym = mx.sym.SequenceReverse(*mx_sym_args) mod, _ = relay.frontend.from_mxnet(mx_sym, *from_mxnet_args) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()(*in_data) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res.asnumpy()) @@ -655,9 +655,9 @@ def test_forward_logistic_regression_output(): ref_res = mx.nd.LogisticRegressionOutput(mx.nd.array(data_np), mx.nd.array(label_np)) shapes = {"data": data_shape} mod, _ = relay.frontend.from_mxnet(mx_sym, shapes, dtype) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()(data_np) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res.asnumpy()) @@ -672,9 +672,9 @@ def verify(a_shape, b_shape, transpose_b=False): ref_res = mx.nd.dot(mx.nd.array(a_np), mx.nd.array(b_np), transpose_b=transpose_b) shapes = {"a": a_shape, "b": b_shape} mod, _ = relay.frontend.from_mxnet(mx_sym, shapes, dtype) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()(a_np, b_np) tvm.testing.assert_allclose( op_res.asnumpy(), ref_res.asnumpy(), rtol=1e-05, atol=1e-05 @@ -691,9 +691,9 @@ def verify(shape): ref_res = mx.nd.shape_array(mx.nd.array(x_np)) mx_sym = mx.sym.shape_array(mx.sym.var("x")) mod, _ = relay.frontend.from_mxnet(mx_sym, {"x": shape}) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["debug"]: - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()(x_np) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res.asnumpy()) @@ -713,9 +713,9 @@ def verify(shape, axis): ref_res = mx.nd.squeeze(mx.nd.array(x_np), axis=axis) mx_sym = mx.sym.squeeze(mx.sym.var("x"), axis=axis) mod, _ = relay.frontend.from_mxnet(mx_sym, {"x": shape}) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()(x_np) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res.asnumpy()) @@ -733,9 +733,9 @@ def verify(shape, axis, size): mx_sym = _mx_symbol(mx.sym, op, [mx.sym.var("x"), axis, size]) 
ref_res = _mx_symbol(mx.nd, op, [mx.nd.array(x_np), axis, size]) mod, _ = relay.frontend.from_mxnet(mx_sym, {"x": shape}) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()(x_np) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res.asnumpy()) @@ -750,9 +750,9 @@ def verify(input_shape, shape): ref_res = mx.nd.broadcast_to(mx.nd.array(x_np), shape=shape) mx_sym = mx.sym.broadcast_to(mx.sym.var("x"), shape=shape) mod, _ = relay.frontend.from_mxnet(mx_sym, {"x": input_shape}) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()(x_np) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res.asnumpy()) @@ -768,9 +768,9 @@ def verify(input_shape, like_shape): ref_res = mx.nd.broadcast_like(mx.nd.array(x_np), mx.nd.array(y_np)) mx_sym = mx.sym.broadcast_like(mx.sym.var("x"), mx.sym.var("y")) mod, _ = relay.frontend.from_mxnet(mx_sym, {"x": input_shape, "y": like_shape}) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()(x_np, y_np) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res.asnumpy()) @@ -787,9 +787,9 @@ def test_forward_logical_not(): ref_res = mx.nd.logical_not(mx.nd.array(a_np)) shapes = {"a": a_shape} mod, _ = relay.frontend.from_mxnet(mx_sym, shapes, dtype) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()(a_np) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res.asnumpy()) @@ -797,15 +797,15 @@ def test_forward_logical_not(): @tvm.testing.uses_gpu def test_forward_full(): def verify(val, shape, dtype): - ctx = mx.cpu() + dev = mx.cpu() ref_res = mx.nd.full(shape, val, dtype=dtype) mx_sym = mx.sym.full(shape, val, dtype=dtype) mod, _ = relay.frontend.from_mxnet(mx_sym, {}) - for target, ctx in tvm.testing.enabled_targets(): - # Skip testing graph runtime because this op will be optimized out + for target, dev in tvm.testing.enabled_targets(): + # Skip testing graph executor because this op will be optimized out # by constant folding. 
for kind in ["debug"]: - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()() tvm.testing.assert_allclose(op_res.asnumpy(), ref_res.asnumpy()) @@ -827,9 +827,9 @@ def verify(data_shape, weight_shape): mx.sym.var("x"), mx.sym.var("w"), input_dim=in_dim, output_dim=out_dim ) mod, _ = relay.frontend.from_mxnet(mx_sym, {"x": data_shape, "w": weight_shape}) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()(x=x_np, w=w_np) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res.asnumpy()) @@ -854,9 +854,9 @@ def verify(shape, indices_src, axis, mode="clip"): ref_res = mx.nd.take(mx.nd.array(x_np), mx.nd.array(indices_np), axis, mode) mx_sym = mx.sym.take(mx.sym.var("x"), mx.sym.var("y"), axis, mode) mod, _ = relay.frontend.from_mxnet(mx_sym, {"x": shape, "y": indices_np.shape}) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()(x_np, indices_np) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res.asnumpy()) @@ -878,9 +878,9 @@ def verify(xshape, yshape, y_data, error=False): mod, _ = relay.frontend.from_mxnet( mx_sym, {"x_data": xshape, "y_data": yshape}, {"x_data": "float32", "y_data": "int32"} ) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()(x_data, y_data) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res.asnumpy()) @@ -907,9 +907,9 @@ def verify(shape, transform_type, target_shape): mx_sym = mx.sym.GridGenerator(mx.sym.var("x"), transform_type, target_shape) shape_dict = {"x": x.shape} mod, _ = relay.frontend.from_mxnet(mx_sym, shape_dict) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()(x) tvm.testing.assert_allclose( op_res.asnumpy(), ref_res.asnumpy(), rtol=1e-5, atol=1e-5 @@ -929,9 +929,9 @@ def verify(data_shape, grid_shape): mx_sym = mx.sym.BilinearSampler(mx.sym.var("data"), mx.sym.var("grid")) shape_dict = {"data": data.shape, "grid": grid.shape} mod, _ = relay.frontend.from_mxnet(mx_sym, shape_dict) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()(data, grid) tvm.testing.assert_allclose( op_res.asnumpy(), ref_res.asnumpy(), rtol=1e-5, atol=1e-5 @@ -993,10 +993,10 @@ def verify( mx_params[name] = param._reduce() mod, params = relay.frontend.from_mxnet(mx_sym, shape=shape_dict, arg_params=mx_params) - 
for target, ctx in tvm.testing.enabled_targets(): - # only test graph runtime because debug runtime is too slow + for target, dev in tvm.testing.enabled_targets(): + # only test graph executor because debug runtime is too slow for kind in ["graph"]: - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()(**inputs, **params) if init_states: assert len(op_res) == len(mx_res) @@ -1028,9 +1028,9 @@ def verify(xshape, yshape, offset=None): mx_sym = mx.sym.Crop(mx.sym.var("x"), mx.sym.var("y"), offset=offset) ref_res = mx.nd.Crop(mx.nd.array(x_data), mx.nd.array(y_data), offset=offset) mod, _ = relay.frontend.from_mxnet(mx_sym, {"x": xshape, "y": yshape}) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) if offset is None or offset == (0, 0): op_res = intrp.evaluate()(x_data, y_data) else: @@ -1051,9 +1051,9 @@ def verify(shape, axis, is_ascend, dtype="float32"): ref_res = mx.nd.argsort(mx.nd.array(x_np), axis=axis, is_ascend=is_ascend, dtype=dtype) mx_sym = mx.sym.argsort(mx.sym.var("x"), axis=axis, is_ascend=is_ascend, dtype=dtype) mod, _ = relay.frontend.from_mxnet(mx_sym, {"x": shape}) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()(x_np) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res.asnumpy()) @@ -1082,9 +1082,9 @@ def verify(shape, k, axis, ret_type, is_ascend=None, dtype="float32"): mx.sym.var("x"), k=k, axis=axis, ret_typ=ret_type, is_ascend=is_ascend, dtype=dtype ) mod, _ = relay.frontend.from_mxnet(mx_sym, {"x": shape}) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()(x_np) if isinstance(ref_res, list): assert len(op_res) == len(ref_res) @@ -1136,12 +1136,12 @@ def verify(shape, use_sequence_length, value, axis, dtype, itype): mx.sym.var("data"), use_sequence_length=use_sequence_length, value=value, axis=axis ) mod, _ = relay.frontend.from_mxnet(mx_sym, {"data": shape}, dtype={"data": dtype}) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: if use_sequence_length is False and kind == "graph": # Disable the test for 'graph' when it's identity. 
continue - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) if use_sequence_length: op_res = intrp.evaluate()(data_np, valid_length_np) else: @@ -1161,9 +1161,9 @@ def verify(shape): ref_res = mx.nd.contrib.div_sqrt_dim(mx.nd.array(x_np)) mx_sym = mx.sym.contrib.div_sqrt_dim(mx.sym.var("x")) mod, _ = relay.frontend.from_mxnet(mx_sym, {"x": shape}) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()(x_np) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res.asnumpy()) @@ -1209,9 +1209,9 @@ def verify(shape, axis=1, fix_gamma=False): } mod, _ = relay.frontend.from_mxnet(mx_sym, shape_dict) # print(mod) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()(x, gamma, beta, moving_mean, moving_var) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res.asnumpy(), rtol=1e-3) @@ -1233,9 +1233,9 @@ def verify(shape, axis=1, epsilon=1e-5): ) shape_dict = {"x": x.shape, "gamma": gamma.shape, "beta": beta.shape} mod, _ = relay.frontend.from_mxnet(mx_sym, shape_dict) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()(x, gamma, beta) tvm.testing.assert_allclose( op_res.asnumpy(), ref_res.asnumpy(), rtol=1e-5, atol=1e-5 @@ -1259,9 +1259,9 @@ def verify(shape, axis=-1): ) shape_dict = {"x": x.shape, "gamma": gamma.shape, "beta": beta.shape} mod, _ = relay.frontend.from_mxnet(mx_sym, shape_dict) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()(x, gamma, beta) tvm.testing.assert_allclose( op_res.asnumpy(), ref_res.asnumpy(), rtol=1e-3, atol=1e-5 @@ -1289,9 +1289,9 @@ def verify(shape, num_groups=1): ) shape_dict = {"x": x.shape, "gamma": gamma.shape, "beta": beta.shape} mod, _ = relay.frontend.from_mxnet(mx_sym, shape_dict) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()(x, gamma, beta) tvm.testing.assert_allclose( op_res.asnumpy(), ref_res.asnumpy(), rtol=1e-3, atol=1e-5 @@ -1312,9 +1312,9 @@ def verify(indices_shape, depth, on_value, off_value, dtype): mx_sym = mx.sym.one_hot(mx.sym.var("x"), depth, on_value, off_value, dtype) shape_dict = {"x": x.shape} mod, _ = relay.frontend.from_mxnet(mx_sym, shape_dict) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: - intrp = 
relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()(x.astype("float32")) tvm.testing.assert_allclose( op_res.asnumpy(), ref_res.asnumpy(), rtol=1e-3, atol=1e-5 @@ -1440,9 +1440,9 @@ def verify(data_shape, kernel_size, stride, pad, num_filter, is_depthwise=False) ) shape_dict = {"x": x.shape, "weight": weight.shape, "bias": bias.shape} mod, _ = relay.frontend.from_mxnet(mx_sym, shape_dict) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()(x, weight, bias) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res.asnumpy(), rtol=1e-3) @@ -1521,9 +1521,9 @@ def verify(data_shape, kernel_size, stride, pad, num_filter): ) shape_dict = {"x": x.shape, "weight": weight.shape, "bias": bias.shape} mod, _ = relay.frontend.from_mxnet(mx_sym, shape_dict) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()(x, weight, bias) tvm.testing.assert_allclose( op_res.asnumpy(), ref_res.asnumpy(), rtol=1e-3, atol=1e-5 @@ -1556,9 +1556,9 @@ def verify(a_np, b_np): shape_dict = {"a": a_np.shape, "b": b_np.shape} mod, _ = relay.frontend.from_mxnet(mx_sym, shape_dict) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["debug", "vm"]: - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()(a_np, b_np) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res.asnumpy(), rtol=1e-3) @@ -1575,9 +1575,9 @@ def verify(from_dtype, to_dtype): shape_dict = {"x": (1, 3, 18)} dtype_dict = {"x": from_dtype} mod, _ = relay.frontend.from_mxnet(mx_sym, shape_dict, dtype_dict) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "vm", "debug"]: - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()(from_np) assert op_res.dtype == to_dtype, op_res.dtype tvm.testing.assert_allclose(op_res.asnumpy(), from_np.astype(to_dtype)) @@ -1598,9 +1598,9 @@ def verify(dtypes, cast_narrow, expected_dtype): shape_dict[str(i)] = (1, 3, 18) dtype_dict[str(i)] = dtype mod, _ = relay.frontend.from_mxnet(mx_sym, shape_dict, dtype_dict) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "vm", "debug"]: - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()(*x_nps) for i, res in enumerate(op_res): assert res.dtype == expected_dtype, res.dtype @@ -1623,9 +1623,9 @@ def verify(x, shape, dtype): shapes = {"a": a_np.shape} mod, _ = relay.frontend.from_mxnet(mx_sym, shapes, dtype) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", 
"vm", "debug"]: - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()(a_np) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res.asnumpy()) @@ -1664,9 +1664,9 @@ def verify(shape, blocksize=2): "x": x.shape, } mod, _ = relay.frontend.from_mxnet(mx_sym, shape_dict) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()(x) tvm.testing.assert_allclose( op_res.asnumpy(), ref_res.asnumpy(), rtol=1e-3, atol=1e-5 @@ -1685,9 +1685,9 @@ def verify(shape, blocksize=2): "x": x.shape, } mod, _ = relay.frontend.from_mxnet(mx_sym, shape_dict) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()(x) tvm.testing.assert_allclose( op_res.asnumpy(), ref_res.asnumpy(), rtol=1e-3, atol=1e-5 @@ -1723,9 +1723,9 @@ def verify(data_shape, kernel_size, max_displacement, stride1, stride2, pad_size ) shape_dict = {"data1": data1.shape, "data2": data2.shape} mod, _ = relay.frontend.from_mxnet(mx_sym, shape_dict) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()(data1, data2) tvm.testing.assert_allclose( op_res.asnumpy(), ref_res.asnumpy(), rtol=1e-3, atol=1e-5 @@ -1830,9 +1830,9 @@ def verify(data_shape, start=None, step=None, axis=None): mx_sym = mx.sym.contrib.arange_like(data, **attrs) mod, _ = relay.frontend.from_mxnet(mx_sym, {"data": data_shape}) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph"]: - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()() tvm.testing.assert_allclose(op_res.asnumpy(), ref_res.asnumpy()) @@ -1852,9 +1852,9 @@ def verify(batch, seq_length, num_heads, head_dim): mx_sym = mx.sym.contrib.interleaved_matmul_selfatt_qk(data, heads=num_heads) mod, _ = relay.frontend.from_mxnet(mx_sym, {"data": data_shape}) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph"]: - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()(data_np) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res.asnumpy(), rtol=1e-5) @@ -1877,9 +1877,9 @@ def verify(batch, seq_length, num_heads, head_dim): mx_sym = mx.sym.contrib.interleaved_matmul_selfatt_valatt(data, weight, heads=num_heads) mod, _ = relay.frontend.from_mxnet(mx_sym, {"data": data_shape, "weight": weight_shape}) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph"]: - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = 
relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()(data=data_np, weight=weight_np) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res.asnumpy(), rtol=1e-5) @@ -1887,6 +1887,66 @@ def verify(batch, seq_length, num_heads, head_dim): verify(3, 10, 6, 8) +@tvm.testing.uses_gpu +def test_forward_box_nms(): + def verify( + data_shape, + overlap_thresh=0.5, + valid_thresh=0, + topk=1, + coord_start=2, + score_index=1, + id_index=0, + force_suppress=False, + in_format="corner", + ): + dtype = "float32" + data = np.random.uniform(low=0, high=1, size=data_shape).astype(dtype) + ref_res = mx.nd.contrib.box_nms( + mx.nd.array(data), + overlap_thresh=overlap_thresh, + valid_thresh=valid_thresh, + topk=topk, + coord_start=coord_start, + score_index=score_index, + id_index=id_index, + force_suppress=force_suppress, + background_id=-1, + in_format=in_format, + out_format=in_format, + ) + mx_sym = mx.sym.contrib.box_nms( + mx.sym.var("data"), + overlap_thresh=overlap_thresh, + valid_thresh=valid_thresh, + topk=topk, + coord_start=coord_start, + score_index=score_index, + id_index=id_index, + force_suppress=force_suppress, + background_id=-1, + in_format=in_format, + out_format=in_format, + ) + shape_dict = {"data": data_shape} + mod, _ = relay.frontend.from_mxnet(mx_sym, shape_dict) + for target, dev in tvm.testing.enabled_targets(): + if tvm.contrib.thrust.can_use_thrust( + tvm.target.Target(target + " -libs=thrust"), "tvm.contrib.thrust.sort" + ): + target += " -libs=thrust" + for kind in ["graph", "debug"]: + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) + op_res = intrp.evaluate()(data) + tvm.testing.assert_allclose( + op_res.asnumpy(), ref_res.asnumpy(), rtol=1e-3, atol=1e-5 + ) + + verify((1, 10, 6)) + # No valid boxes + verify((1, 10, 6), valid_thresh=1) + + @tvm.testing.uses_gpu def test_forward_box_decode(): def verify(data_shape, anchor_shape, stds=[1, 1, 1, 1], clip=-1, in_format="corner"): @@ -1915,9 +1975,9 @@ def verify(data_shape, anchor_shape, stds=[1, 1, 1, 1], clip=-1, in_format="corn ) shape_dict = {"data": data_shape, "anchors": anchor_shape} mod, _ = relay.frontend.from_mxnet(mx_sym, shape_dict) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()(data, anchors) tvm.testing.assert_allclose( op_res.asnumpy(), ref_res.asnumpy(), rtol=1e-3, atol=1e-5 @@ -1957,9 +2017,9 @@ def verify(data_shape, axis, use_length, length): shape_dict = {"data": data_shape} mod, _ = relay.frontend.from_mxnet(mx_sym, shape_dict) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) if use_length: op_res = intrp.evaluate()(x, length) else: @@ -1989,7 +2049,7 @@ def verify(data_shape, axis, use_length, length): @pytest.mark.parametrize("constant_value", [0.0, 3.0]) @tvm.testing.parametrize_targets @pytest.mark.parametrize("kind", ["graph", "vm", "debug"]) -def test_forward_npi_pad(data_shape, pad_width, mode, dtype, constant_value, target, ctx, kind): +def test_forward_npi_pad(data_shape, pad_width, mode, dtype, constant_value, target, dev, kind): data_np = 
np.random.uniform(size=data_shape).astype(dtype) data = mx.sym.var("data") if mode == "constant": @@ -2001,7 +2061,7 @@ def test_forward_npi_pad(data_shape, pad_width, mode, dtype, constant_value, tar ref_res = np.pad(data_np, mode=mode, pad_width=pad_width) mx_sym = mx.sym.np.pad(data.as_np_ndarray(), mode=mode, pad_width=pad_width) mod, _ = relay.frontend.from_mxnet(mx_sym, {"data": data_shape}, dtype=dtype) - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()(data_np) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5) @@ -2014,13 +2074,13 @@ def test_forward_npi_pad(data_shape, pad_width, mode, dtype, constant_value, tar @pytest.mark.parametrize("axes", [(1, 0, 2), None]) @tvm.testing.parametrize_targets @pytest.mark.parametrize("kind", ["graph", "vm", "debug"]) -def test_forward_npi_transpose(data_shape, axes, dtype, target, ctx, kind): +def test_forward_npi_transpose(data_shape, axes, dtype, target, dev, kind): data_np = np.random.uniform(size=data_shape).astype(dtype) data = mx.sym.var("data") ref_res = mx.np.transpose(mx.np.array(data_np), axes=axes) mx_sym = mx.sym.np.transpose(data.as_np_ndarray(), axes=axes) mod, _ = relay.frontend.from_mxnet(mx_sym, {"data": data_shape}, dtype=dtype) - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()(data_np) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res.asnumpy(), rtol=1e-5) @@ -2038,7 +2098,7 @@ def test_forward_npi_transpose(data_shape, axes, dtype, target, ctx, kind): @pytest.mark.parametrize("dtype", ["float64", "float32", "int64", "int32"]) @tvm.testing.parametrize_targets @pytest.mark.parametrize("kind", ["graph", "vm", "debug"]) -def test_forward_npi_concatenate(data_shape1, data_shape2, axis, dtype, target, ctx, kind): +def test_forward_npi_concatenate(data_shape1, data_shape2, axis, dtype, target, dev, kind): data_np1 = np.random.uniform(size=data_shape1).astype(dtype) data_np2 = np.random.uniform(size=data_shape2).astype(dtype) data1 = mx.sym.var("data1") @@ -2048,7 +2108,7 @@ def test_forward_npi_concatenate(data_shape1, data_shape2, axis, dtype, target, mod, _ = relay.frontend.from_mxnet( mx_sym, shape={"data1": data_shape1, "data2": data_shape2}, dtype=dtype ) - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()(data_np1, data_np2) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res.asnumpy(), rtol=1e-5) @@ -2066,7 +2126,7 @@ def test_forward_npi_concatenate(data_shape1, data_shape2, axis, dtype, target, @pytest.mark.parametrize("dtype", ["float64", "float32", "int64", "int32"]) @tvm.testing.parametrize_targets @pytest.mark.parametrize("kind", ["graph", "vm", "debug"]) -def test_forward_npi_stack(data_shape1, data_shape2, axis, dtype, target, ctx, kind): +def test_forward_npi_stack(data_shape1, data_shape2, axis, dtype, target, dev, kind): data_np1 = np.random.uniform(size=data_shape1).astype(dtype) data_np2 = np.random.uniform(size=data_shape2).astype(dtype) data1 = mx.sym.var("data1") @@ -2076,7 +2136,7 @@ def test_forward_npi_stack(data_shape1, data_shape2, axis, dtype, target, ctx, k mod, _ = relay.frontend.from_mxnet( mx_sym, shape={"data1": data_shape1, "data2": data_shape2}, dtype=dtype ) - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, 
target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()(data_np1, data_np2) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res.asnumpy(), rtol=1e-5) @@ -2085,13 +2145,13 @@ def test_forward_npi_stack(data_shape1, data_shape2, axis, dtype, target, ctx, k @pytest.mark.parametrize("dtype", ["float64", "float32", "int64", "int32", "bool"]) @tvm.testing.parametrize_targets @pytest.mark.parametrize("kind", ["graph", "vm", "debug"]) -def test_forward_np_copy(data_shape, dtype, target, ctx, kind): +def test_forward_np_copy(data_shape, dtype, target, dev, kind): data_np = np.random.uniform(size=data_shape).astype(dtype) data = mx.sym.var("data") ref_res = mx.np.copy(mx.np.array(data_np)) mx_sym = mx.sym.np.copy(data.as_np_ndarray()) mod, _ = relay.frontend.from_mxnet(mx_sym, {"data": data_shape}, dtype=dtype) - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()(data_np) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res.asnumpy(), rtol=1e-5) @@ -2113,13 +2173,13 @@ def test_forward_np_copy(data_shape, dtype, target, ctx, kind): ((2, 4, 1, 8), (-4, -3, -1, 2, -6), True), ], ) -def test_forward_npx_reshape(data_shape, out_shape, dtype, target, reverse, ctx, kind): +def test_forward_npx_reshape(data_shape, out_shape, dtype, target, reverse, dev, kind): data_np = np.random.uniform(size=data_shape).astype(dtype) data = mx.sym.var("data") ref_res = mx.npx.reshape(mx.np.array(data_np), newshape=out_shape, reverse=reverse) mx_sym = mx.sym.npx.reshape(data.as_np_ndarray(), newshape=out_shape, reverse=reverse) mod, _ = relay.frontend.from_mxnet(mx_sym, {"data": data_shape}, dtype=dtype) - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()(data_np) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res.asnumpy(), rtol=1e-5) @@ -2130,7 +2190,7 @@ def test_forward_npx_reshape(data_shape, out_shape, dtype, target, reverse, ctx, @pytest.mark.parametrize("dtype", ["float64", "float32", "int64", "int32"]) @tvm.testing.parametrize_targets @pytest.mark.parametrize("kind", ["graph", "vm", "debug"]) -def test_forward_npi_binary(data_shape, dtype, target, ctx, kind): +def test_forward_npi_binary(data_shape, dtype, target, dev, kind): ref_ops = [mx.np.power, mx.np.multiply, mx.np.add, mx.np.subtract, mx.np.less] mx_ops = [ mx.sym.np.power, @@ -2154,7 +2214,7 @@ def test_forward_npi_binary(data_shape, dtype, target, ctx, kind): mod, _ = relay.frontend.from_mxnet( mx_sym, shape={"lhs": data_shape, "rhs": data_shape}, dtype=dtype ) - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()(data_np1, data_np2) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res.asnumpy(), rtol=1e-5) @@ -2166,7 +2226,7 @@ def test_forward_npi_binary(data_shape, dtype, target, ctx, kind): @tvm.testing.parametrize_targets @pytest.mark.parametrize("scalar", [1.0, 2.0, 3.0, 4.0]) @pytest.mark.parametrize("kind", ["graph", "vm", "debug"]) -def test_forward_npi_binary_scalar(data_shape, dtype, scalar, target, ctx, kind): +def test_forward_npi_binary_scalar(data_shape, dtype, scalar, target, dev, kind): ref_ops = [mx.np.power, mx.np.multiply, mx.np.add, mx.np.subtract, mx.np.true_divide] mx_ops = [ mx.sym.np.power, @@ -2186,7 +2246,7 @@ def 
test_forward_npi_binary_scalar(data_shape, dtype, scalar, target, ctx, kind) ref_res = ref_op(mx.np.array(data_np1), scalar) mx_sym = mx_op(data1.as_np_ndarray(), scalar) mod, _ = relay.frontend.from_mxnet(mx_sym, shape={"lhs": data_shape}, dtype=dtype) - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()(data_np1) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res.asnumpy(), rtol=1e-5) @@ -2197,13 +2257,13 @@ def test_forward_npi_binary_scalar(data_shape, dtype, scalar, target, ctx, kind) @pytest.mark.parametrize("dtype", ["float64", "float32"]) @tvm.testing.parametrize_targets @pytest.mark.parametrize("kind", ["graph", "vm", "debug"]) -def test_forward_npi_tanh(data_shape, dtype, target, ctx, kind): +def test_forward_npi_tanh(data_shape, dtype, target, dev, kind): data_np1 = np.random.uniform(size=data_shape).astype(dtype) data1 = mx.sym.var("data") ref_res = mx.np.tanh(mx.np.array(data_np1)) mx_sym = mx.sym.np.tanh(data1.as_np_ndarray()) mod, _ = relay.frontend.from_mxnet(mx_sym, shape={"data": data_shape}, dtype=dtype) - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()(data_np1) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res.asnumpy(), rtol=1e-5) @@ -2219,7 +2279,7 @@ def test_forward_npi_tanh(data_shape, dtype, target, ctx, kind): @tvm.testing.parametrize_targets @pytest.mark.parametrize("kind", ["graph", "vm", "debug"]) def test_forward_npi_where_rscalar( - data_shape, cond_shape, data_dtype, cond_dtype, scalar, target, ctx, kind + data_shape, cond_shape, data_dtype, cond_dtype, scalar, target, dev, kind ): if data_dtype == "bool": scalar = scalar == 0.0 @@ -2235,7 +2295,7 @@ def test_forward_npi_where_rscalar( mod, _ = relay.frontend.from_mxnet( mx_sym, shape={"condition": cond_shape, "x": data_shape}, dtype=dtypeDic ) - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()(cond_np, data_np) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res.asnumpy(), rtol=1e-5) @@ -2253,7 +2313,7 @@ def test_forward_npi_where_rscalar( ], ) def test_forward_split_v2( - data_shape, axis, dtype, indices_or_sections, squeeze_axis, target, ctx, kind + data_shape, axis, dtype, indices_or_sections, squeeze_axis, target, dev, kind ): data_np = np.random.uniform(size=data_shape).astype(dtype) data = mx.sym.var("data") @@ -2264,7 +2324,7 @@ def test_forward_split_v2( data.as_nd_ndarray(), indices_or_sections, axis=axis, squeeze_axis=squeeze_axis ) mod, _ = relay.frontend.from_mxnet(mx_sym, {"data": data_shape}, dtype=dtype) - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()(data_np) op_res_ = [] for arr in op_res: diff --git a/tests/python/frontend/mxnet/test_qnn_ops_utils.py b/tests/python/frontend/mxnet/test_qnn_ops_utils.py index c2e242579f15..a200e06ed2d0 100644 --- a/tests/python/frontend/mxnet/test_qnn_ops_utils.py +++ b/tests/python/frontend/mxnet/test_qnn_ops_utils.py @@ -18,7 +18,7 @@ import numpy as np import tvm from tvm import relay -from tvm.contrib import graph_runtime +from tvm.contrib import graph_executor from tvm.relay.frontend.mxnet_qnn_op_utils import ( dequantize_mxnet_min_max, quantize_mxnet_min_max, @@ -41,7 
+41,7 @@ def dequantize_test_driver(in_dtype, quant_args, in_data, verify_output_data): mod = tvm.IRModule.from_expr(mod) with tvm.transform.PassContext(opt_level=3): graph, lib, params = relay.build(mod, "llvm", params=None) - rt_mod = graph_runtime.create(graph, lib, ctx=tvm.cpu(0)) + rt_mod = graph_executor.create(graph, lib, device=tvm.cpu(0)) rt_mod.set_input(input_data=in_data) rt_mod.set_input(**params) rt_mod.run() @@ -120,7 +120,7 @@ def quantize_test_driver(out_dtype, quant_args, in_data, verify_output_data): mod = tvm.IRModule.from_expr(mod) with tvm.transform.PassContext(opt_level=3): graph, lib, params = relay.build(mod, "llvm", params=None) - rt_mod = graph_runtime.create(graph, lib, ctx=tvm.cpu(0)) + rt_mod = graph_executor.create(graph, lib, device=tvm.cpu(0)) rt_mod.set_input(input_data=in_data) rt_mod.set_input(**params) rt_mod.run() diff --git a/tests/python/frontend/onnx/test_forward.py b/tests/python/frontend/onnx/test_forward.py index 1e1341640ea0..04b6c94a5f53 100644 --- a/tests/python/frontend/onnx/test_forward.py +++ b/tests/python/frontend/onnx/test_forward.py @@ -19,10 +19,11 @@ from onnx import helper, TensorProto, mapping, numpy_helper import torch import torchvision +import pytest import tvm.topi.testing import tvm from tvm import relay -from tvm.contrib import graph_runtime +from tvm.contrib import graph_executor import scipy import tvm.testing @@ -42,7 +43,7 @@ def get_input_data_shape_dict(graph_def, input_data): def get_tvm_output_with_vm( - graph_def, input_data, target, ctx, opset=None, freeze_params=False, convert_to_static=False + graph_def, input_data, target, device, opset=None, freeze_params=False, convert_to_static=False ): """ Generic function to execute and get tvm output with vm executor""" if not isinstance(input_data, list): @@ -56,18 +57,20 @@ def get_tvm_output_with_vm( if convert_to_static: mod = relay.transform.DynamicToStatic()(mod) - ex = relay.create_executor("vm", mod=mod, ctx=ctx, target=target) - result = ex.evaluate()(*input_data) + ex = relay.create_executor("vm", mod=mod, device=device, target=target) + result = ex.evaluate()(*input_data, **params) if isinstance(result, tvm.runtime.NDArray): return result.asnumpy() return [r.asnumpy() for r in result] def get_tvm_output( - graph_def, input_data, target, ctx, output_shape=None, output_dtype="float32", opset=None + graph_def, input_data, target, device, output_shape=None, output_dtype="float32", opset=None ): """ Generic function to execute and get tvm output""" + # TODO: Resolve the issues and remove the following lines target = "llvm" + device = tvm.cpu(0) input_names, shape_dict = get_input_data_shape_dict(graph_def, input_data) @@ -75,8 +78,7 @@ def get_tvm_output( with tvm.transform.PassContext(opt_level=1): graph, lib, params = relay.build(mod, target, params=params) - ctx = tvm.cpu(0) - m = graph_runtime.create(graph, lib, ctx) + m = graph_executor.create(graph, lib, device) # set inputs if isinstance(input_data, list): for i, e in enumerate(input_names): @@ -141,19 +143,19 @@ def verify_with_ort_with_inputs( targets = [tgt for (tgt, _) in tvm.testing.enabled_targets()] for target in targets: - ctx = tvm.context(target, 0) + dev = tvm.device(target, 0) if use_vm: tvm_out = get_tvm_output_with_vm( model, inputs, target, - ctx, + dev, opset=opset, freeze_params=freeze_params, convert_to_static=convert_to_static, ) else: - tvm_out = get_tvm_output(model, inputs, target, ctx, out_shape, dtype, opset=opset) + tvm_out = get_tvm_output(model, inputs, target, dev, out_shape, dtype, 
opset=opset) if not isinstance(tvm_out, list): tvm_out = [tvm_out] if not isinstance(ort_out, list): @@ -232,9 +234,9 @@ def test_reshape(): model = helper.make_model(graph, producer_name="reshape_test") - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): x = np.random.uniform(size=in_shape).astype("int32") - tvm_out = get_tvm_output(model, x, target, ctx, ref_shape, "float32") + tvm_out = get_tvm_output(model, x, target, dev, ref_shape, "float32") tvm.testing.assert_allclose(ref_shape, tvm_out.shape) @@ -268,9 +270,9 @@ def test_double_reshape(): model = helper.make_model(graph, producer_name="reshape_test") - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): x = np.random.uniform(size=in_shape).astype("int32") - tvm_out = get_tvm_output(model, x, target, ctx, ref_shape, "float32") + tvm_out = get_tvm_output(model, x, target, dev, ref_shape, "float32") tvm.testing.assert_allclose(ref_shape, tvm_out.shape) @@ -316,8 +318,8 @@ def _test_expand(name, data, shape, ref_data, dtype="int32"): model = helper.make_model(graph, producer_name=name) - for target, ctx in tvm.testing.enabled_targets(): - tvm_out = get_tvm_output_with_vm(model, data, target, ctx, freeze_params=True) + for target, dev in tvm.testing.enabled_targets(): + tvm_out = get_tvm_output_with_vm(model, data, target, dev, freeze_params=True) tvm.testing.assert_allclose(ref_data, tvm_out) in_shape = (3, 1) @@ -408,9 +410,9 @@ def test_shape(): model = helper.make_model(graph, producer_name="shape_test") - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): x = np.random.uniform(size=in_shape).astype("int32") - tvm_out = get_tvm_output(model, x, target, ctx, ref_shape, "int32") + tvm_out = get_tvm_output(model, x, target, dev, ref_shape, "int32") tvm.testing.assert_allclose(ref_shape, tvm_out) @@ -437,8 +439,8 @@ def _test_power_iteration(x_shape, y_shape): model = helper.make_model(graph, producer_name="power_test") - for target, ctx in tvm.testing.enabled_targets(): - tvm_out = get_tvm_output(model, [x, y], target, ctx, np_res.shape) + for target, dev in tvm.testing.enabled_targets(): + tvm_out = get_tvm_output(model, [x, y], target, dev, np_res.shape) tvm.testing.assert_allclose(np_res, tvm_out, rtol=1e-5, atol=1e-5) @@ -500,7 +502,7 @@ def test_squeeze(): model = helper.make_model(graph, producer_name="squeeze_test") x = np.random.uniform(size=in_shape).astype("float32") - verify_with_ort_with_inputs(model, [x], [out_shape]) + verify_with_ort_with_inputs(model, [x], [out_shape], opset=11) @tvm.testing.uses_gpu @@ -538,7 +540,7 @@ def test_unsqueeze(): ) model = helper.make_model(graph, producer_name="squeeze_test") - verify_with_ort(model, [in_shape]) + verify_with_ort(model, [in_shape], opset=11) def verify_gather(in_shape, indices, axis, dtype): @@ -1001,9 +1003,9 @@ def test_onehot(): model = helper.make_model(graph, producer_name="onehot_test") # TODO(jwfromm): Replace test against np with test against onnxrt once we update versions. 
- for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): tvm_out = get_tvm_output_with_vm( - model, [indices_array, np.array([depth]).astype("int32"), values], target, ctx + model, [indices_array, np.array([depth]).astype("int32"), values], target, dev ) tvm.testing.assert_allclose(out_np, tvm_out, rtol=1e-5, atol=1e-5) @@ -1069,7 +1071,7 @@ def test_matmul(): verify_with_ort_with_inputs(model, [a_array, b_array]) -def verify_batch_matmul(a_shape, b_shape, out_shape, target, ctx): +def verify_batch_matmul(a_shape, b_shape, out_shape, target, dev): a_array = np.random.uniform(size=a_shape).astype("float32") b_array = np.random.uniform(size=b_shape).astype("float32") @@ -1091,17 +1093,17 @@ def verify_batch_matmul(a_shape, b_shape, out_shape, target, ctx): # TODO(mbrookhart): enable cuda once VM supports heterogenous execution @tvm.testing.parametrize_targets("llvm") -def test_batch_matmul(target, ctx): - verify_batch_matmul((2, 3, 4, 3), (2, 3, 3, 4), (2, 3, 4, 4), target, ctx) - verify_batch_matmul((2, 4, 3), (3, 4), (2, 4, 4), target, ctx) - verify_batch_matmul((2, 3, 4, 3), (3, 4), (2, 3, 4, 4), target, ctx) +def test_batch_matmul(target, dev): + verify_batch_matmul((2, 3, 4, 3), (2, 3, 3, 4), (2, 3, 4, 4), target, dev) + verify_batch_matmul((2, 4, 3), (3, 4), (2, 4, 4), target, dev) + verify_batch_matmul((2, 3, 4, 3), (3, 4), (2, 3, 4, 4), target, dev) # Test implicit broadcasting. - verify_batch_matmul((4, 3), (2, 3, 4), (2, 4, 4), target, ctx) - verify_batch_matmul((2, 4, 3), (1, 3, 4), (2, 4, 4), target, ctx) - verify_batch_matmul((1, 4, 3), (2, 3, 4), (2, 4, 4), target, ctx) + verify_batch_matmul((4, 3), (2, 3, 4), (2, 4, 4), target, dev) + verify_batch_matmul((2, 4, 3), (1, 3, 4), (2, 4, 4), target, dev) + verify_batch_matmul((1, 4, 3), (2, 3, 4), (2, 4, 4), target, dev) -def verify_simple_dynamic_model(a_shape, b_shape, target, ctx): +def verify_simple_dynamic_model(a_shape, b_shape, target, dev): def verify_model(ex, a_shape, b_shape): a_array = np.random.uniform(size=a_shape).astype("float32") b_array = np.random.uniform(size=b_shape).astype("float32") @@ -1138,7 +1140,7 @@ def verify_model(ex, a_shape, b_shape): mod, params = relay.frontend.from_onnx(model, {"a": a_anys, "b": b_anys}) - ex = relay.create_executor("vm", mod=mod, ctx=ctx, target=target) + ex = relay.create_executor("vm", mod=mod, device=dev, target=target) verify_model(ex, a_shape, b_shape) verify_model(ex, [a * 2 for a in a_shape], [b * 2 for b in b_shape]) verify_model(ex, [a * 3 for a in a_shape], [b * 3 for b in b_shape]) @@ -1146,10 +1148,10 @@ def verify_model(ex, a_shape, b_shape): # TODO(mbrookhart): enable cuda once VM supports heterogenous execution @tvm.testing.parametrize_targets("llvm") -def test_batch_matmul_dynamic_model(target, ctx): - verify_simple_dynamic_model((2, 3, 4, 3), (2, 3, 3, 4), target, ctx) - verify_simple_dynamic_model((2, 4, 3), (3, 4), target, ctx) - verify_simple_dynamic_model((2, 3, 4, 3), (3, 4), target, ctx) +def test_batch_matmul_dynamic_model(target, dev): + verify_simple_dynamic_model((2, 3, 4, 3), (2, 3, 3, 4), target, dev) + verify_simple_dynamic_model((2, 4, 3), (3, 4), target, dev) + verify_simple_dynamic_model((2, 3, 4, 3), (3, 4), target, dev) def verify_lrn(shape, nsize, dtype, alpha=None, beta=None, bias=None): @@ -1312,8 +1314,8 @@ def verify_upsample3d_trilinear(): model = helper.make_model(graph, producer_name="upsample_trilinear_test") # TODO(jwfromm): Trilinear upsampling not supported in 1.0.0 onnxruntime. 
# Replace topi comparison with verify_with_ort once we update. - for target, ctx in tvm.testing.enabled_targets(): - tvm_out = get_tvm_output(model, in_array, target, ctx, out_shape, "float32") + for target, dev in tvm.testing.enabled_targets(): + tvm_out = get_tvm_output(model, in_array, target, dev, out_shape, "float32") tvm.testing.assert_allclose(out_array, tvm_out, rtol=1e-5, atol=1e-5) @@ -1584,7 +1586,7 @@ def verify_pad_v11(indata, pads, mode="constant", value=0.0): pads = np.array(pads) # onnx graph if mode in ["edge", "reflect"]: - inputs = [indata, pads] + inputs = [indata] outdata = np.pad(indata, pad_width=np_pads, mode=mode) node = helper.make_node("Pad", inputs=["input", "pads"], outputs=["output"], mode=mode) graph = helper.make_graph( @@ -1600,7 +1602,7 @@ def verify_pad_v11(indata, pads, mode="constant", value=0.0): ], ) else: - inputs = [indata, pads, np.array([value]).astype("float32")] + inputs = [indata] outdata = np.pad(indata, pad_width=np_pads, mode="constant", constant_values=value) node = helper.make_node( "Pad", inputs=["input", "pads", "constant_value"], outputs=["output"], mode="constant" @@ -1663,7 +1665,7 @@ def verify_reduce_func(func, data, axis, keepdims): model = helper.make_model(graph, producer_name="reduce_test") - verify_with_ort_with_inputs(model, [data], [outshape]) + verify_with_ort_with_inputs(model, [data], [outshape], opset=11) @tvm.testing.uses_gpu @@ -2489,42 +2491,27 @@ def verify_convtranspose_with_padding( dilations, auto_pad="NOTSET", unset_pad=False, + group=1, ): - if unset_pad: - node = helper.make_node( - "ConvTranspose", - inputs=["x", "W"], - outputs=["y"], - kernel_shape=kernel_shape, - # Default values for other attributes: - strides=strides, - dilations=dilations, - group=1, - ) - elif padding is None: - node = helper.make_node( - "ConvTranspose", - inputs=["x", "W"], - outputs=["y"], - kernel_shape=kernel_shape, - # Default values for other attributes: - strides=strides, - dilations=dilations, - group=1, - auto_pad=auto_pad, - ) - else: - node = helper.make_node( - "ConvTranspose", - inputs=["x", "W"], - outputs=["y"], - kernel_shape=kernel_shape, - # Default values for other attributes: - strides=strides, - dilations=dilations, - group=1, - pads=padding, - ) + node = helper.make_node( + "ConvTranspose", + inputs=["x", "W"], + outputs=["y"], + kernel_shape=kernel_shape, + # Default values for other attributes: + strides=strides, + dilations=dilations, + ) + if not unset_pad: + if padding is None: + pad_attr = helper.make_attribute("auto_pad", auto_pad) + else: + pad_attr = helper.make_attribute("pads", padding) + node.attribute.append(pad_attr) + + if group is not None: + group_attr = helper.make_attribute("group", group) + node.attribute.append(group_attr) graph = helper.make_graph( [node], @@ -2536,22 +2523,25 @@ def verify_convtranspose_with_padding( outputs=[helper.make_tensor_value_info("y", TensorProto.FLOAT, list(y_shape))], ) - model = helper.make_model(graph, producer_name="conv_test") + model = helper.make_model(graph, producer_name="convtranspose_pad_test") verify_with_ort(model, [x_shape, w_shape], [y_shape], use_vm=True, convert_to_static=True) -def verify_convtranspose(x_shape, w_shape, y_shape, p): +def verify_convtranspose(x_shape, w_shape, y_shape, p, group=1): node = onnx.helper.make_node( "ConvTranspose", inputs=["x", "W"], outputs=["y"], strides=[3, 2], - group=1, kernel_shape=[3, 3], pads=p, ) + if group is not None: + group_attr = helper.make_attribute("group", group) + node.attribute.append(group_attr) + 
graph = helper.make_graph( [node], "verify_convtranspose_test", @@ -2562,7 +2552,7 @@ def verify_convtranspose(x_shape, w_shape, y_shape, p): outputs=[helper.make_tensor_value_info("y", TensorProto.FLOAT, list(y_shape))], ) - model = helper.make_model(graph, producer_name="convtranspose_trest") + model = helper.make_model(graph, producer_name="convtranspose_test") verify_with_ort(model, [x_shape, w_shape], y_shape) @@ -2574,6 +2564,8 @@ def test_convtranspose(): # (1, 2, 7, 3) output tensor # [1, 2, 1, 2] list for pads verify_convtranspose((1, 1, 3, 3), (1, 2, 3, 3), (1, 2, 7, 3), [1, 2, 1, 2]) + # Test undefined groups. + verify_convtranspose((1, 1, 3, 3), (1, 2, 3, 3), (1, 2, 7, 3), [1, 2, 1, 2], group=None) def repeat(N, D): return tuple([N for _ in range(D)]) @@ -2711,7 +2703,7 @@ def verify_pooling(x_shape, kernel_shape, strides, pads, out_shape, mode, auto_p ) model = helper.make_model(graph, producer_name="pooling_test") - verify_with_ort(model, [x_shape], [out_shape], use_vm=True, convert_to_static=True) + verify_with_ort(model, [x_shape], [out_shape], use_vm=False, convert_to_static=True) @tvm.testing.uses_gpu @@ -3887,8 +3879,8 @@ def verify_if(cond_array): # TODO(jwfromm): Onnxruntime 1.0.0 is buggy with If statements. Replace this with # verify_with_ort once we update versions. - for target, ctx in tvm.testing.enabled_targets(): - tvm_out = get_tvm_output_with_vm(if_model, [cond], target, ctx, freeze_params=True) + for target, dev in tvm.testing.enabled_targets(): + tvm_out = get_tvm_output_with_vm(if_model, [cond], target, dev, freeze_params=True) for i in range(len(tvm_out)): tvm.testing.assert_allclose(correct_out[i], tvm_out[i], rtol=1e-05, atol=1e-05) @@ -4099,6 +4091,194 @@ def verify_cumsum(indata, axis, exclusive=0, reverse=0, type="float32"): verify_cumsum(data, 1, 1, 1, type="int32") +from onnx import numpy_helper + +f = onnx.__file__ +import glob + +onnx_test_folders = sorted(glob.glob("/".join(f.split("/")[0:-1]) + "/backend/test/data/node/*/")) + +unsupported_onnx_tests = [ + "test_basic_convinteger/", + "test_bitshift_left_uint16/", + "test_bitshift_left_uint32/", + "test_bitshift_left_uint64/", + "test_bitshift_left_uint8/", + "test_bitshift_right_uint16/", + "test_bitshift_right_uint32/", + "test_bitshift_right_uint64/", + "test_bitshift_right_uint8/", + "test_cast_DOUBLE_to_FLOAT16/", + "test_cast_FLOAT16_to_DOUBLE/", + "test_cast_FLOAT16_to_FLOAT/", + "test_cast_FLOAT_to_FLOAT16/", + "test_cast_FLOAT_to_STRING/", + "test_cast_STRING_to_FLOAT/", + "test_compress_0/", + "test_compress_1/", + "test_compress_default_axis/", + "test_compress_negative_axis/", + "test_convinteger_with_padding/", + "test_convtranspose_dilations/", + "test_convtranspose_output_shape/", + "test_cumsum_1d/", + "test_cumsum_1d_exclusive/", + "test_cumsum_1d_reverse/", + "test_cumsum_1d_reverse_exclusive/", + "test_cumsum_2d_axis_0/", + "test_cumsum_2d_axis_1/", + "test_cumsum_2d_negative_axis/", + "test_dequantizelinear/", + "test_det_2d/", + "test_det_nd/", + "test_dynamicquantizelinear/", + "test_dynamicquantizelinear_expanded/", + "test_dynamicquantizelinear_max_adjusted/", + "test_dynamicquantizelinear_max_adjusted_expanded/", + "test_dynamicquantizelinear_min_adjusted/", + "test_dynamicquantizelinear_min_adjusted_expanded/", + "test_eyelike_populate_off_main_diagonal/", + "test_eyelike_with_dtype/", + "test_eyelike_without_dtype/", + "test_hardmax_axis_0/", + "test_hardmax_axis_1/", + "test_hardmax_axis_2/", + "test_hardmax_default_axis/", + "test_hardmax_example/", + 
"test_hardmax_negative_axis/", + "test_hardmax_one_hot/", + "test_isinf_negative/", + "test_isinf_positive/", + "test_lstm_defaults/", + "test_lstm_with_initial_bias/", + "test_lstm_with_peepholes/", + "test_matmulinteger/", + "test_maxpool_2d_dilations/", + "test_maxpool_2d_same_lower/", + "test_maxpool_2d_same_upper/", + "test_maxpool_with_argmax_2d_precomputed_pads/", + "test_maxpool_with_argmax_2d_precomputed_strides/", + "test_maxunpool_export_with_output_shape/", + "test_mvn/", + "test_nonmaxsuppression_center_point_box_format/", + "test_qlinearconv/", + "test_qlinearmatmul_2D/", + "test_qlinearmatmul_3D/", + "test_quantizelinear/", + "test_range_float_type_positive_delta_expanded/", + "test_range_int32_type_negative_delta_expanded/", + "test_resize_downsample_scales_cubic/", + "test_resize_downsample_scales_cubic_A_n0p5_exclude_outside/", + "test_resize_downsample_scales_cubic_align_corners/", + "test_resize_downsample_scales_linear/", + "test_resize_downsample_scales_nearest/", + "test_resize_downsample_sizes_cubic/", + "test_resize_downsample_sizes_linear_pytorch_half_pixel/", + "test_resize_downsample_sizes_nearest/", + "test_resize_downsample_sizes_nearest_tf_half_pixel_for_nn/", + "test_resize_tf_crop_and_resize/", + "test_resize_upsample_scales_cubic/", + "test_resize_upsample_scales_cubic_A_n0p5_exclude_outside/", + "test_resize_upsample_scales_cubic_align_corners/", + "test_resize_upsample_scales_cubic_asymmetric/", + "test_resize_upsample_scales_linear/", + "test_resize_upsample_sizes_cubic/", + "test_resize_upsample_sizes_nearest_ceil_half_pixel/", + "test_resize_upsample_sizes_nearest_floor_align_corners/", + "test_resize_upsample_sizes_nearest_round_prefer_ceil_asymmetric/", + "test_reversesequence_batch/", + "test_reversesequence_time/", + "test_rnn_seq_length/", + "test_roialign/", + "test_round/", + "test_scan9_sum/", + "test_scan_sum/", + "test_scatternd/", + "test_selu_default/", + "test_shrink_hard/", + "test_shrink_soft/", + "test_simple_rnn_defaults/", + "test_simple_rnn_with_initial_bias/", + "test_slice_neg_steps/", + "test_slice_start_out_of_bounds/", + "test_strnormalizer_export_monday_casesensintive_lower/", + "test_strnormalizer_export_monday_casesensintive_nochangecase/", + "test_strnormalizer_export_monday_casesensintive_upper/", + "test_strnormalizer_export_monday_empty_output/", + "test_strnormalizer_export_monday_insensintive_upper_twodim/", + "test_strnormalizer_nostopwords_nochangecase/", + "test_tfidfvectorizer_tf_batch_onlybigrams_skip0/", + "test_tfidfvectorizer_tf_batch_onlybigrams_skip5/", + "test_tfidfvectorizer_tf_batch_uniandbigrams_skip5/", + "test_tfidfvectorizer_tf_only_bigrams_skip0/", + "test_tfidfvectorizer_tf_onlybigrams_levelempty/", + "test_tfidfvectorizer_tf_onlybigrams_skip5/", + "test_tfidfvectorizer_tf_uniandbigrams_skip5/", + "test_top_k_smallest/", + "test_unique_not_sorted_without_axis/", + "test_unique_sorted_with_axis/", + "test_unique_sorted_with_axis_3d/", + "test_unique_sorted_with_negative_axis/", + "test_unique_sorted_without_axis/", + "test_unsqueeze_unsorted_axes/", + "test_upsample_nearest/", +] + + +@pytest.mark.parametrize("test", onnx_test_folders) +def test_onnx_nodes(test): + for failure in unsupported_onnx_tests: + if failure in test: + pytest.skip() + break + onnx_model = onnx.load(test + "/model.onnx") + inputs = [] + outputs = [] + for dataset in glob.glob(test + "/*/"): + tensors = sorted(glob.glob(dataset + "/*.pb")) + for tensor in tensors: + new_tensor = onnx.TensorProto() + with open(tensor, "rb") as f: 
+                new_tensor.ParseFromString(f.read())
+            if "input" in tensor.split("/")[-1]:
+                inputs.append(numpy_helper.to_array(new_tensor))
+            elif "output" in tensor.split("/")[-1]:
+                outputs.append(numpy_helper.to_array(new_tensor))
+            else:
+                raise ImportError(str(tensor) + " not labeled as an input or an output")
+    tvm_val = get_tvm_output_with_vm(onnx_model, inputs, "llvm", tvm.cpu(0))
+    if len(outputs) == 1:
+        tvm.testing.assert_allclose(outputs[0], tvm_val, rtol=1e-5, atol=1e-5)
+    else:
+        for output, val in zip(outputs, tvm_val):
+            tvm.testing.assert_allclose(output, val, rtol=1e-5, atol=1e-5)
+
+
+def test_wrong_input():
+    node = helper.make_node(
+        "Softplus",
+        inputs=["X"],
+        outputs=["Y"],
+    )
+
+    graph = helper.make_graph(
+        [node],
+        "softplus_test",
+        inputs=[helper.make_tensor_value_info("X", TensorProto.FLOAT, list([5]))],
+        outputs=[helper.make_tensor_value_info("Y", TensorProto.FLOAT, list([5]))],
+    )
+    model = helper.make_model(graph, producer_name="softplus_test")
+
+    # Check that the graph can import correctly with proper shape definitions.
+    correct_shape_dict = {"X": [5]}
+    relay.frontend.from_onnx(model, shape=correct_shape_dict)
+
+    # Check that an assertion is triggered when an input not in the graph is provided.
+    wrong_shape_dict = {"Z": [5]}
+    with pytest.raises(AssertionError):
+        relay.frontend.from_onnx(model, shape=wrong_shape_dict)
+
+
 if __name__ == "__main__":
     test_flatten()
     test_reshape()
@@ -4177,3 +4357,4 @@ def verify_cumsum(indata, axis, exclusive=0, reverse=0, type="float32"):
     test_maxunpool()
     test_softplus()
     test_cumsum()
+    test_wrong_input()
diff --git a/tests/python/frontend/pytorch/qnn_test.py b/tests/python/frontend/pytorch/qnn_test.py
index 29c69abba542..5b0b65f7b128 100644
--- a/tests/python/frontend/pytorch/qnn_test.py
+++ b/tests/python/frontend/pytorch/qnn_test.py
@@ -49,7 +49,7 @@ def get_tvm_runtime(script_module, input_name, ishape):
     # also not to make CI too slow
     lib = relay.build(mod, target="llvm", params=params)
 
-    runtime = tvm.contrib.graph_runtime.GraphModule(lib["default"](tvm.cpu(0)))
+    runtime = tvm.contrib.graph_executor.GraphModule(lib["default"](tvm.cpu(0)))
     return runtime
 
 
diff --git a/tests/python/frontend/pytorch/test_forward.py b/tests/python/frontend/pytorch/test_forward.py
index 83c1698799c7..9ec52987c354 100644
--- a/tests/python/frontend/pytorch/test_forward.py
+++ b/tests/python/frontend/pytorch/test_forward.py
@@ -27,7 +27,7 @@
 from torch.nn import functional as F
 import tvm
 from tvm import relay
-from tvm.contrib import graph_runtime
+from tvm.contrib import graph_executor
 from tvm.contrib.nvcc import have_fp16
 import tvm.testing
 from packaging import version as package_version
@@ -206,9 +206,9 @@ def verify_model(model_name, input_data=[], custom_convert_map={}, rtol=1e-5, at
     compiled_input = dict(zip(input_names, [inp.clone().cpu().numpy() for inp in baseline_input]))
 
     with tvm.transform.PassContext(opt_level=3):
-        for target, ctx in tvm.testing.enabled_targets():
+        for target, dev in tvm.testing.enabled_targets():
             relay_graph, relay_lib, relay_params = relay.build(mod, target=target, params=params)
-            relay_model = graph_runtime.create(relay_graph, relay_lib, ctx)
+            relay_model = graph_executor.create(relay_graph, relay_lib, dev)
             relay_model.set_input(**relay_params)
             for name, inp in compiled_input.items():
                 relay_model.set_input(name, inp)
@@ -809,7 +809,24 @@ def forward(self, *args):
 
 
 @tvm.testing.uses_gpu
-def test_forward_avgpool():
+def test_forward_avgpool1d():
+    torch.set_grad_enabled(False)
+    input_shape = [1, 3, 10]
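+    # torch.nn.AvgPool1d operates on (N, C, L) inputs.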
+ + class AvgPool1D2(Module): + def forward(self, *args): + return torch.nn.functional.avg_pool1d(args[0], kernel_size=[10]) + + input_data = torch.rand(input_shape).float() + verify_model(torch.nn.AvgPool1d(kernel_size=[10]).eval(), input_data=input_data) + verify_model(AvgPool1D2().float().eval(), input_data=input_data) + verify_model( + torch.nn.AvgPool1d(kernel_size=[5], stride=2, padding=2).eval(), input_data=input_data + ) + + +@tvm.testing.uses_gpu +def test_forward_avgpool2d(): torch.set_grad_enabled(False) input_shape = [1, 3, 10, 10] @@ -820,6 +837,9 @@ def forward(self, *args): input_data = torch.rand(input_shape).float() verify_model(torch.nn.AvgPool2d(kernel_size=[10, 10]).eval(), input_data=input_data) verify_model(AvgPool2D2().float().eval(), input_data=input_data) + verify_model( + torch.nn.AvgPool2d(kernel_size=5, stride=2, padding=2).eval(), input_data=input_data + ) @tvm.testing.uses_gpu @@ -834,6 +854,9 @@ def forward(self, *args): input_data = torch.rand(input_shape).float() verify_model(torch.nn.AvgPool3d(kernel_size=[10, 10, 10]).eval(), input_data=input_data) verify_model(AvgPool3D1().float().eval(), input_data=input_data) + verify_model( + torch.nn.AvgPool3d(kernel_size=5, stride=2, padding=2).eval(), input_data=input_data + ) @tvm.testing.uses_gpu @@ -2128,9 +2151,9 @@ def verify_model_vm(input_model, ishapes, idtype=None, idata=None, targets=["llv for tgt in targets: print("Running on target", tgt) - ctx = tvm.context(tgt, 0) + dev = tvm.device(tgt, 0) - executor = relay.create_executor("vm", mod=mod, ctx=ctx, target=tgt) + executor = relay.create_executor("vm", mod=mod, device=dev, target=tgt) evaluator = executor.evaluate() # Inference @@ -2622,10 +2645,17 @@ class Clamp3(Module): def forward(self, *args): return torch.clamp(args[0], max=1.0) + class Clamp_MinExpr_MaxConstant(Module): + def forward(self, *args): + h, w = args[0].shape[2:] + amin = h / 100.0 + return torch.clamp(args[0], min=amin, max=w) + input_data = torch.rand(input_shape).float() verify_model(Clamp1().float().eval(), input_data=input_data) verify_model(Clamp2().float().eval(), input_data=input_data) verify_model(Clamp3().float().eval(), input_data=input_data) + verify_model(Clamp_MinExpr_MaxConstant().float().eval(), input_data=input_data) @tvm.testing.uses_gpu @@ -3559,8 +3589,8 @@ def test_forward_pretrained_bert_base_uncased(): # Execute on TVM # -------------- - ctx = tvm.context(target, 0) - relay_model = graph_runtime.create(relay_graph, relay_lib, ctx) + dev = tvm.device(target, 0) + relay_model = graph_executor.create(relay_graph, relay_lib, dev) relay_model.set_input(**relay_params) relay_model.set_input(input_1, tokens_tensor) relay_model.set_input(input_2, segments_tensors) @@ -3831,7 +3861,8 @@ def test_fn(is_sorted, return_inverse, return_counts): test_forward_logsoftmax() test_forward_sigmoid() test_forward_dense() - test_forward_avgpool() + test_forward_avgpool1d() + test_forward_avgpool2d() test_forward_avgpool3d() test_forward_dropout() test_forward_slice() diff --git a/tests/python/frontend/pytorch/test_lstm.py b/tests/python/frontend/pytorch/test_lstm.py index 1197990f54ba..9089a83239e4 100644 --- a/tests/python/frontend/pytorch/test_lstm.py +++ b/tests/python/frontend/pytorch/test_lstm.py @@ -222,8 +222,8 @@ def assert_equal(tvm_result, torch_result): ) -def run_and_compare(mod, params, pt_result, target, ctx): - executor = relay.create_executor("vm", mod=mod, ctx=ctx, target=target) +def run_and_compare(mod, params, pt_result, target, device): + executor = 
relay.create_executor("vm", mod=mod, device=device, target=target) evaluator = executor.evaluate() exec_res = evaluator(**params) @@ -249,7 +249,7 @@ def flatten(nested): def convert_list_to_vmobj(py_lst): def wrap_nd_array(arr): - return tvm.nd.array(arr, ctx=tvm.cpu(0)) + return tvm.nd.array(arr, device=tvm.cpu(0)) mod = tvm.IRModule() prelude = Prelude(mod) @@ -365,6 +365,6 @@ def test_custom_lstm(): else: params[states_name] = states_np - for tgt, ctx in tvm.testing.enabled_targets(): + for tgt, dev in tvm.testing.enabled_targets(): print("Running %s on target %s" % (name, tgt)) - run_and_compare(mod, params, pt_result, target=tgt, ctx=ctx) + run_and_compare(mod, params, pt_result, target=tgt, device=dev) diff --git a/tests/python/frontend/pytorch/test_object_detection.py b/tests/python/frontend/pytorch/test_object_detection.py index a404a88393bc..3d51f0e58655 100644 --- a/tests/python/frontend/pytorch/test_object_detection.py +++ b/tests/python/frontend/pytorch/test_object_detection.py @@ -117,8 +117,8 @@ def compile_and_run_vm(mod, params, data_np, target): with tvm.transform.PassContext(opt_level=3): vm_exec = relay.vm.compile(mod, target=target, params=params) - ctx = tvm.context(target, 0) - vm = VirtualMachine(vm_exec, ctx) + dev = tvm.device(target, 0) + vm = VirtualMachine(vm_exec, dev) vm.set_input("main", **{input_name: data_np}) return vm.run() diff --git a/tests/python/frontend/tensorflow/test_bn_dynamic.py b/tests/python/frontend/tensorflow/test_bn_dynamic.py index ac02ef146968..4eb0d01ef102 100644 --- a/tests/python/frontend/tensorflow/test_bn_dynamic.py +++ b/tests/python/frontend/tensorflow/test_bn_dynamic.py @@ -59,16 +59,16 @@ def verify_fused_batch_norm(shape): constant_graph = graph_util.convert_variables_to_constants(sess, sess.graph_def, ["output"]) for device in ["llvm"]: - ctx = tvm.context(device, 0) + dev = tvm.device(device, 0) if not tvm.testing.device_enabled(device): print("Skip because %s is not enabled" % device) continue mod, params = relay.frontend.from_tensorflow(constant_graph, outputs=["output"]) with tvm.transform.PassContext(opt_level=3): graph, lib, params = relay.build(mod, target=device, params=params) - from tvm.contrib import graph_runtime + from tvm.contrib import graph_executor - m = graph_runtime.create(graph, lib, ctx) + m = graph_executor.create(graph, lib, dev) m.set_input(**params) m.set_input("input", data) m.run() diff --git a/tests/python/frontend/tensorflow/test_forward.py b/tests/python/frontend/tensorflow/test_forward.py index 7ace735789de..b7280ee06f50 100644 --- a/tests/python/frontend/tensorflow/test_forward.py +++ b/tests/python/frontend/tensorflow/test_forward.py @@ -110,7 +110,7 @@ def run_tvm_graph( target="llvm", out_names=None, opt_level=3, - mode="graph_runtime", + mode="graph_executor", cuda_layout="NCHW", layout=None, disabled_pass=None, @@ -132,9 +132,9 @@ def run_tvm_graph( mod, params = relay.frontend.from_tensorflow( graph_def, layout=layout, shape=shape_dict, outputs=out_names ) - ctx = tvm.context(target, 0) + dev = tvm.device(target, 0) if mode == "debug": - ex = relay.create_executor(mode, mod=mod, ctx=tvm.cpu(), target="llvm") + ex = relay.create_executor(mode, mod=mod, device=tvm.cpu(), target="llvm") inputs = [] for param in mod["main"].params: found = False @@ -164,12 +164,13 @@ def run_tvm_graph( return vmobj_to_list(result) else: with tvm.transform.PassContext(opt_level=opt_level, disabled_pass=disabled_pass): + target = tvm.target.Target(target, target_host) graph, lib, params = relay.build( - mod, 
tvm.target.Target(target, target_host), params=params
+                mod, target=target, params=params
             )
-            from tvm.contrib import graph_runtime
+            from tvm.contrib import graph_executor
 
-            m = graph_runtime.create(graph, lib, ctx)
+            m = graph_executor.create(graph, lib, dev)
             # set inputs
             for e, i in zip(input_node, input_data):
                 if e != "":
@@ -209,9 +210,10 @@ def compare_tf_with_tvm(
     init_global_variables=False,
     no_gpu=False,
     opt_level=3,
-    mode="graph_runtime",
+    mode="graph_executor",
     cuda_layout="NCHW",
     add_shapes_to_graph_def=True,
+    targets=None,
 ):
     """Generic function to generate and compare tensorflow and TVM output"""
 
@@ -235,13 +237,18 @@ def name_without_num(name):
 
         tf_output = run_tf_graph(sess, in_data, in_name, out_name)
 
-        for device in ["llvm", "cuda"]:
-            ctx = tvm.context(device, 0)
+        devices = targets if targets else ["llvm", "cuda"]
+
+        for device in devices:
+            dev = tvm.device(device, 0)
             if not tvm.testing.device_enabled(device):
                 print("Skip because %s is not enabled" % device)
                 continue
             if no_gpu and device == "cuda":
                 continue
+            if "cublas" in device and not tvm.get_global_func("tvm.contrib.cublas.matmul", True):
+                print("Skip because cublas is not enabled: %s" % device)
+                continue
 
             tvm_output = run_tvm_graph(
                 final_graph_def,
@@ -1685,7 +1692,7 @@ def test_forward_variable():
 
 
 @tvm.testing.parametrize_targets("llvm", "cuda")
-def test_read_variable_op(target, ctx):
+def test_read_variable_op(target, dev):
     """ Read Variable op test """
     tf.reset_default_graph()
@@ -1783,6 +1790,23 @@ def _test_batch_matmul(A_shape, B_shape, dtype, adjoint_a=False, adjoint_b=False
     compare_tf_with_tvm([A_np, B_np], [A.name, B.name], result.name)
 
 
+def _test_batch_matmul_dynamic(
+    A_shape, B_shape, A_np_shape, B_np_shape, dtype, adjoint_a=False, adjoint_b=False
+):
+    with tf.Graph().as_default():
+        A = tf.placeholder(shape=A_shape, dtype=dtype, name="A")
+        B = tf.placeholder(shape=B_shape, dtype=dtype, name="B")
+        result = tf.matmul(A, B, adjoint_a=adjoint_a, adjoint_b=adjoint_b, name="batchmatmul")
+
+        A_np = np.random.uniform(high=5.0, size=A_np_shape).astype(dtype)
+        B_np = np.random.uniform(high=5.0, size=B_np_shape).astype(dtype)
+        # For now, only the cuBLAS implementation in TOPI supports dynamic shapes.
+        # TODO: add support for more backends in TOPI.
+        compare_tf_with_tvm(
+            [A_np, B_np], [A.name, B.name], result.name, mode="vm", targets=["cuda -libs=cublas"]
+        )
+
+
 def test_forward_batch_matmul():
     """ TF op BatchMatMul, BatchMatMulV2 test"""
     _test_batch_matmul((3, 5, 4), (3, 4, 5), "int32")
@@ -1795,6 +1819,33 @@ def test_forward_batch_matmul():
     _test_batch_matmul((2, 3, 4, 2, 3, 4, 5, 6), (2, 3, 4, 2, 3, 4, 5, 6), "float32", False, True)
 
 
+@tvm.testing.requires_cuda
+def test_forward_batch_matmul_dynamic():
+    _test_batch_matmul_dynamic((None, 5, 4), (None, 4, 5), (3, 5, 4), (3, 4, 5), "int32")
+    _test_batch_matmul_dynamic(
+        (None, 5, 4), (None, 4, 5), (3, 5, 4), (3, 4, 5), "float32", True, True
+    )
+    _test_batch_matmul_dynamic(
+        (None, 5, 4), (None, 5, 4), (3, 5, 4), (3, 5, 4), "int32", True, False
+    )
+    _test_batch_matmul_dynamic(
+        (None, 5, 4), (None, 5, 4), (3, 5, 4), (3, 5, 4), "float32", False, True
+    )
+    _test_batch_matmul_dynamic(
+        (None, 4, 5, 6), (None, 4, 6, 5), (3, 4, 5, 6), (3, 4, 6, 5), "float32"
+    )
+    _test_batch_matmul_dynamic(
+        (None, None, 5, 6), (None, None, 6, 5), (3, 4, 5, 6), (3, 4, 6, 5), "float32"
+    )
+    _test_batch_matmul_dynamic(
+        (None, None, None, 5, 6),
+        (None, None, None, 6, 5),
+        (2, 3, 4, 5, 6),
+        (2, 3, 4, 6, 5),
+        "float32",
+    )
+
+
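As background for the dynamic-shape tests above, here is a minimal standalone
sketch of the path they exercise (a sketch only, assuming a TVM build with CUDA
and cuBLAS enabled; the shapes, variable names, and tolerance are illustrative
and not part of this patch):

    import numpy as np
    import tvm
    from tvm import relay

    # Leave the batch dimension symbolic; per the TODO above, only the
    # cuBLAS-backed TOPI schedule handles this, hence "-libs=cublas".
    a = relay.var("a", shape=(relay.Any(), 5, 4), dtype="float32")
    b = relay.var("b", shape=(relay.Any(), 5, 4), dtype="float32")
    # nn.batch_matmul computes A x B^T: (B, 5, 4) x (B, 5, 4) -> (B, 5, 5).
    mod = tvm.IRModule.from_expr(relay.Function([a, b], relay.nn.batch_matmul(a, b)))

    dev = tvm.device("cuda", 0)
    ex = relay.create_executor("vm", mod=mod, device=dev, target="cuda -libs=cublas")
    a_np = np.random.uniform(size=(3, 5, 4)).astype("float32")
    b_np = np.random.uniform(size=(3, 5, 4)).astype("float32")
    res = ex.evaluate()(a_np, b_np).asnumpy()
    np.testing.assert_allclose(res, np.matmul(a_np, b_np.transpose(0, 2, 1)), rtol=1e-5)

The VM executor is used rather than the graph executor because the graph
executor requires static shapes end to end.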
 #######################################################################
 # SparseTensorDenseMatMul
 # ----------------------------------
@@ -2354,6 +2405,54 @@ def test_forward_sparse_to_dense_v2():
     _test_sparse_to_dense_v2([[0, 0], [1, 3], [4, 3]], [3.0, 6.0, 9.0], [5, 5], "float32", 1.9)
 
 
+#######################################################################
+# tensorflow.sparse.add
+# ----------------------------------
+
+
+def _test_sparse_add(indices, values, A_shape, B_shape, dtype, flip=False):
+    """ One iteration of tf.sparse.add """
+
+    # TODO(ANSHUMAN87): support cuda
+    # TODO(ANSHUMAN87): support the case where both inputs are sparse
+
+    with tf.Graph().as_default():
+        A_sp = tf.sparse.SparseTensor(
+            indices=indices, values=np.array(values).astype(dtype), dense_shape=A_shape
+        )
+        B = tf.placeholder(shape=B_shape, dtype=dtype, name="B")
+
+        # TODO(ANSHUMAN87): support user-provided threshold values
+        if flip:
+            result = tf.sparse.add(B, A_sp, threshold=0)
+        else:
+            result = tf.sparse.add(A_sp, B, threshold=0)
+
+        B_np = np.random.uniform(high=5.0, size=B_shape).astype(dtype)
+
+        compare_tf_with_tvm([B_np], [B.name], result.name, no_gpu=True)
+
+
+def test_sparse_add():
+    """ sparse.add op test"""
+    ###################################################################
+    #
+    # Creating a SparseTensor requires 3 inputs, as below:
+    #   SparseTensor(indices=[[0, 0], [1, 2]], values=[1, 2], dense_shape=[3, 4])
+    #
+    # The above SparseTensor is represented in dense form as:
+    #   [[1, 0, 0, 0]
+    #    [0, 0, 2, 0]
+    #    [0, 0, 0, 0]]
+    #
+    # ------------------------------------------------------------------
+    for dtype_inp in ["float32", "float64", "int32"]:
+        _test_sparse_add([[0, 0], [1, 2]], [4.0, 8.0], [3, 4], [3, 4], dtype_inp)
+        _test_sparse_add([[0, 0], [1, 2]], [4.0, 8.0], [3, 4], [3, 4], dtype_inp, True)
+        _test_sparse_add([[0, 0], [1, 3], [4, 3]], [3.0, 6.0, 9.0], [5, 5], [5, 5], dtype_inp)
+        _test_sparse_add([[0, 0], [1, 3], [4, 3]], [3.0, 6.0, 9.0], [5, 5], [5, 5], dtype_inp, True)
+
+
 #######################################################################
 # StridedSlice
 # ------------
@@ -3621,7 +3720,7 @@ def test_forward_resnetv2():
         with tf.Session() as sess:
             tf_output = run_tf_graph(sess, data, "input_tensor:0", out_node + ":0")
             for device in ["llvm", "cuda"]:
-                ctx = tvm.context(device, 0)
+                dev = tvm.device(device, 0)
                 if not tvm.testing.device_enabled(device):
                     print("Skip because %s is not enabled" % device)
                     continue
@@ -3658,7 +3757,7 @@ def _test_ssd_impl():
     )
     # TODO(kevinthesun): enable gpu test when VM heterogeneous execution is ready.
for device in ["llvm"]: - ctx = tvm.context(device, 0) + dev = tvm.device(device, 0) if not tvm.testing.device_enabled(device): print("Skip because %s is not enabled" % device) continue @@ -3760,10 +3859,10 @@ def _get_tvm_graph_module(graph_def): target = "llvm" with tvm.transform.PassContext(opt_level=0): graph, lib, params = relay.build(mod, target, params=params) - from tvm.contrib import graph_runtime + from tvm.contrib import graph_executor - ctx = tvm.cpu(0) - return params, graph_runtime.create(graph, lib, ctx) + dev = tvm.cpu(0) + return params, graph_executor.create(graph, lib, dev) def _do_tvm_sample(model, data, in_states, params, num_samples): """Sampled from the model""" @@ -3977,7 +4076,7 @@ def test_forward_floor(): def test_forward_relu(): ishape = (1, 3, 10, 10) inp_array = np.random.uniform(-5, 5, size=ishape).astype(np.float32) - for mode in ["graph_runtime", "vm"]: + for mode in ["graph_executor", "vm"]: with tf.Graph().as_default(): in1 = tf.placeholder(shape=inp_array.shape, dtype=inp_array.dtype) tf.nn.relu(in1) @@ -3987,7 +4086,7 @@ def test_forward_relu(): def test_forward_leaky_relu(): ishape = (1, 3, 10, 10) inp_array = np.random.uniform(-5, 5, size=ishape).astype(np.float32) - for mode in ["graph_runtime", "vm"]: + for mode in ["graph_executor", "vm"]: with tf.Graph().as_default(): in1 = tf.placeholder(shape=inp_array.shape, dtype=inp_array.dtype) tf.nn.leaky_relu(in1, alpha=0.4) @@ -5175,7 +5274,7 @@ def test_forward_dynamic_input_shape(): tf_output = run_tf_graph(sess, np_data, "data:0", ["{}:0".format(out_name)]) # TODO(kevinthesun): enable gpu test when VM heterogeneous execution is ready. for device in ["llvm"]: - ctx = tvm.context(device, 0) + dev = tvm.device(device, 0) if not tvm.testing.device_enabled(device): print("Skip because %s is not enabled" % device) continue diff --git a/tests/python/frontend/tflite/test_forward.py b/tests/python/frontend/tflite/test_forward.py index 0d02c15f2eb8..b02d246d1ae5 100644 --- a/tests/python/frontend/tflite/test_forward.py +++ b/tests/python/frontend/tflite/test_forward.py @@ -160,7 +160,7 @@ def run_tvm_graph( num_output=1, target="llvm", out_names=None, - mode="graph_runtime", + mode="graph_executor", ): """ Generic function to compile on relay and execute on tvm """ # TFLite.Model.Model has changed to TFLite.Model from 1.14 to 2.1 @@ -189,7 +189,7 @@ def run_tvm_graph( ) if mode in ["debug", "vm"]: - ex = relay.create_executor(mode, mod=mod, ctx=tvm.cpu(), target="llvm") + ex = relay.create_executor(mode, mod=mod, device=tvm.cpu(), target="llvm") inputs = [] for param in mod["main"].params: found = False @@ -207,10 +207,10 @@ def run_tvm_graph( with tvm.transform.PassContext(opt_level=3): lib = relay.build(mod, target, params=params) - ctx = tvm.context(target, 0) - from tvm.contrib import graph_runtime + dev = tvm.device(target, 0) + from tvm.contrib import graph_executor - m = graph_runtime.GraphModule(lib["default"](ctx)) + m = graph_executor.GraphModule(lib["default"](dev)) # set inputs for i, e in enumerate(input_node): m.set_input(e, tvm.nd.array(input_data[i].astype(input_data[i].dtype))) @@ -264,7 +264,7 @@ def compare_tflite_with_tvm( out_names=None, quantized=False, input_range=None, - mode="graph_runtime", + mode="graph_executor", experimental_new_converter=False, ): """Generic function to generate and compare TFLite and TVM output""" @@ -303,7 +303,7 @@ def compare_tflite_with_tvm( tflite_output = run_tflite_graph(tflite_model_buffer, in_data) for device in ["llvm"]: - ctx = tvm.context(device, 0) + 
dev = tvm.device(device, 0) if not tvm.testing.device_enabled(device): print("Skip because %s is not enabled" % device) continue @@ -647,19 +647,28 @@ def test_forward_transpose(): # ---- -def _test_cast(data, cast_dtype): +def _test_cast(data, cast_dtype, use_mlir=False): """ One iteration of CAST """ with tf.Graph().as_default(): in_data = array_ops.placeholder(shape=data.shape, dtype=data.dtype) out = math_ops.cast(in_data, cast_dtype) - compare_tflite_with_tvm(data, "Placeholder:0", [in_data], [out]) + compare_tflite_with_tvm( + data, "Placeholder:0", [in_data], [out], experimental_new_converter=use_mlir + ) def test_forward_cast(): """ CAST """ - _test_cast(np.arange(6.0, dtype=np.float32).reshape((1, 6)), cast_dtype=tf.int32) - _test_cast(np.arange(6.0, dtype=np.float32).reshape((1, 6)), cast_dtype=tf.uint8) - _test_cast(np.arange(6.0, dtype=np.int32).reshape((1, 6)), cast_dtype=tf.int64) + for use_mlir in [False, True]: + _test_cast( + np.arange(6.0, dtype=np.float32).reshape((1, 6)), cast_dtype=tf.int32, use_mlir=use_mlir + ) + _test_cast( + np.arange(6.0, dtype=np.float32).reshape((1, 6)), cast_dtype=tf.uint8, use_mlir=use_mlir + ) + _test_cast( + np.arange(6.0, dtype=np.int32).reshape((1, 6)), cast_dtype=tf.int64, use_mlir=use_mlir + ) ####################################################################### diff --git a/tests/python/integration/test_dot.py b/tests/python/integration/test_dot.py index 609b6dedfb3a..7d4eca4d33d9 100644 --- a/tests/python/integration/test_dot.py +++ b/tests/python/integration/test_dot.py @@ -33,10 +33,10 @@ def test_dot(): def verify(target): f = tvm.driver.build(s, [A, B, C], target) # verify - ctx = tvm.cpu(0) - a = tvm.nd.array(np.random.uniform(size=(nn,)).astype(A.dtype), ctx) - b = tvm.nd.array(np.random.uniform(size=(nn,)).astype(B.dtype), ctx) - c = tvm.nd.array(np.zeros((), dtype=C.dtype), ctx) + dev = tvm.cpu(0) + a = tvm.nd.array(np.random.uniform(size=(nn,)).astype(A.dtype), dev) + b = tvm.nd.array(np.random.uniform(size=(nn,)).astype(B.dtype), dev) + c = tvm.nd.array(np.zeros((), dtype=C.dtype), dev) f(a, b, c) tvm.testing.assert_allclose(c.asnumpy(), np.dot(a.asnumpy(), b.asnumpy()), rtol=1e-4) diff --git a/tests/python/integration/test_ewise.py b/tests/python/integration/test_ewise.py index dda494d01484..034f89cd3fe2 100644 --- a/tests/python/integration/test_ewise.py +++ b/tests/python/integration/test_ewise.py @@ -39,13 +39,13 @@ def test_exp(): def check_device(device, host="stackvm"): if not tvm.testing.device_enabled(host): return - ctx = tvm.context(device, 0) + dev = tvm.device(device, 0) fexp = tvm.build(s, [A, B], device, host, name="myexp") - ctx = tvm.context(device, 0) + dev = tvm.device(device, 0) # launch the kernel. n = 1024 - a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx) - b = tvm.nd.array(np.zeros(n, dtype=B.dtype), ctx) + a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev) + b = tvm.nd.array(np.zeros(n, dtype=B.dtype), dev) fexp(a, b) tvm.testing.assert_allclose(b.asnumpy(), np.exp(a.asnumpy()), rtol=1e-5) @@ -68,7 +68,7 @@ def run(dtype): bx, tx = s[C].split(C.op.axis[0], factor=num_thread) def check_device(device): - ctx = tvm.context(device, 0) + dev = tvm.device(device, 0) if not tvm.testing.device_enabled(device): print("skip because %s is not enabled.." 
% device) return @@ -87,10 +87,10 @@ def check_device(device): b_np += (b_np < 2.0) * 2 a_np[np.abs(np.fmod(a_np, b_np)) < 1] += 1 - a = tvm.nd.array(a_np, ctx) - b = tvm.nd.array(b_np, ctx) - c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx) - ftimer = fmod.time_evaluator(fmod.entry_name, ctx, number=1) + a = tvm.nd.array(a_np, dev) + b = tvm.nd.array(b_np, dev) + c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev) + ftimer = fmod.time_evaluator(fmod.entry_name, dev, number=1) tcost = ftimer(a, b, c).mean # fmod(a, b, c) np.testing.assert_allclose(c.asnumpy(), np.mod(a.asnumpy(), b.asnumpy()), rtol=1e-5) @@ -123,16 +123,16 @@ def test_multiple_cache_write(): def check_device(device, host="stackvm"): if not tvm.testing.device_enabled(host): return - ctx = tvm.context(device, 0) + dev = tvm.device(device, 0) if not tvm.testing.device_enabled(device): return func = tvm.build(s, [A0, A1, C], device, host, name="multiple_cache_write") - ctx = tvm.context(device, 0) + dev = tvm.device(device, 0) # launch the kernel. n = 1024 - a0 = tvm.nd.array(np.random.uniform(size=n).astype(A0.dtype), ctx) - a1 = tvm.nd.array(np.random.uniform(size=n).astype(A1.dtype), ctx) - c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx) + a0 = tvm.nd.array(np.random.uniform(size=n).astype(A0.dtype), dev) + a1 = tvm.nd.array(np.random.uniform(size=n).astype(A1.dtype), dev) + c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev) func(a0, a1, c) tvm.testing.assert_allclose( c.asnumpy(), a0.asnumpy() + a1.asnumpy() + (a0.asnumpy() * a1.asnumpy()), rtol=1e-5 @@ -156,13 +156,13 @@ def test_log_pow_llvm(): return flog = tvm.build(s, [A, B], "llvm", name="mylog") - ctx = tvm.cpu(0) + dev = tvm.cpu(0) # launch the kernel. n = 1028 - a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx) - b = tvm.nd.array(np.zeros(n, dtype=B.dtype), ctx) + a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev) + b = tvm.nd.array(np.zeros(n, dtype=B.dtype), dev) repeat = 10 - ftimer = flog.time_evaluator(flog.entry_name, ctx, number=1, repeat=repeat) + ftimer = flog.time_evaluator(flog.entry_name, dev, number=1, repeat=repeat) res = ftimer(a, b) assert len(res.results) == repeat tvm.testing.assert_allclose(b.asnumpy(), np.power(np.log(a.asnumpy()), 2.0), rtol=1e-5) @@ -181,7 +181,7 @@ def run(dtype): bx, tx = s[B].split(B.op.axis[0], factor=num_thread) def check_device(device): - ctx = tvm.context(device, 0) + dev = tvm.device(device, 0) if not tvm.testing.device_enabled(device): print("skip because %s is not enabled.." % device) return @@ -192,8 +192,8 @@ def check_device(device): func = tvm.build(s, [A, B], device) # launch the kernel. n = 1024 - a = tvm.nd.array(np.random.randint(low=0, high=1000, size=n, dtype=A.dtype), ctx) - b = tvm.nd.array(np.zeros(shape=n, dtype=B.dtype), ctx) + a = tvm.nd.array(np.random.randint(low=0, high=1000, size=n, dtype=A.dtype), dev) + b = tvm.nd.array(np.zeros(shape=n, dtype=B.dtype), dev) func(a, b) tvm.testing.assert_allclose( b.asnumpy(), list(map(lambda x: bin(x).count("1"), a.asnumpy())), rtol=1e-5 @@ -233,7 +233,7 @@ def run(dtype): # one line to build the function. def check_device(device): - ctx = tvm.context(device, 0) + dev = tvm.device(device, 0) if not tvm.testing.device_enabled(device): print("skip because %s is not enabled.." % device) return @@ -241,10 +241,10 @@ def check_device(device): # launch the kernel. 
n = 1024 - a = tvm.nd.array((np.random.uniform(size=n) * 256).astype(A.dtype), ctx) - b = tvm.nd.array((np.random.uniform(size=n) * 256).astype(B.dtype), ctx) - c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx) - ftimer = fadd.time_evaluator(fadd.entry_name, ctx, number=1) + a = tvm.nd.array((np.random.uniform(size=n) * 256).astype(A.dtype), dev) + b = tvm.nd.array((np.random.uniform(size=n) * 256).astype(B.dtype), dev) + c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev) + ftimer = fadd.time_evaluator(fadd.entry_name, dev, number=1) tcost = ftimer(a, b, c).mean tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy(), rtol=1e-6) @@ -285,13 +285,13 @@ def tvm_callback_cuda_compile(code): # one line to build the function. def check_device(device): - ctx = tvm.context(device, 0) + dev = tvm.device(device, 0) if not tvm.testing.device_enabled(device): print("skip because %s is not enabled.." % device) return f = tvm.build(s, [A, B], device) - a = tvm.nd.array((np.random.uniform(size=m) * 256).astype(A.dtype), ctx) - b = tvm.nd.array(np.zeros(m, dtype=B.dtype), ctx) + a = tvm.nd.array((np.random.uniform(size=m) * 256).astype(A.dtype), dev) + b = tvm.nd.array(np.zeros(m, dtype=B.dtype), dev) f(a, b) tvm.testing.assert_allclose(b.asnumpy(), a.asnumpy() + 3, rtol=1e-6) diff --git a/tests/python/integration/test_ewise_fpga.py b/tests/python/integration/test_ewise_fpga.py index 2e5364b24331..fb2c6b1a3db6 100644 --- a/tests/python/integration/test_ewise_fpga.py +++ b/tests/python/integration/test_ewise_fpga.py @@ -45,13 +45,13 @@ def test_exp(): def check_device(device, host="llvm"): if not tvm.testing.device_enabled(device): return - ctx = tvm.context(device, 0) + dev = tvm.device(device, 0) fexp = tvm.build(s, [A, B], device, host, name="myexp") - ctx = tvm.context(device, 0) + dev = tvm.device(device, 0) # launch the kernel. n = 1024 - a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx) - b = tvm.nd.array(np.zeros(n, dtype=B.dtype), ctx) + a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev) + b = tvm.nd.array(np.zeros(n, dtype=B.dtype), dev) fexp(a, b) tvm.testing.assert_allclose(b.asnumpy(), np.exp(a.asnumpy()), rtol=1e-5) @@ -80,15 +80,15 @@ def test_multi_kernel(): def check_device(device, host="llvm"): if not tvm.testing.device_enabled(device): return - ctx = tvm.context(device, 0) + dev = tvm.device(device, 0) fadd = tvm.build(s, [A, B, C, D], device, host, name="myadd") - ctx = tvm.context(device, 0) + dev = tvm.device(device, 0) # launch the kernel. n = 1024 - a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx) - b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx) - c = tvm.nd.array(np.random.uniform(size=n).astype(C.dtype), ctx) - d = tvm.nd.array(np.random.uniform(size=n).astype(D.dtype), ctx) + a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev) + b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), dev) + c = tvm.nd.array(np.random.uniform(size=n).astype(C.dtype), dev) + d = tvm.nd.array(np.random.uniform(size=n).astype(D.dtype), dev) fadd(a, b, c, d) tvm.testing.assert_allclose(d.asnumpy(), a.asnumpy() * 2 + b.asnumpy(), rtol=1e-5) diff --git a/tests/python/integration/test_gemm.py b/tests/python/integration/test_gemm.py index 42612c262f18..5faacde30b58 100644 --- a/tests/python/integration/test_gemm.py +++ b/tests/python/integration/test_gemm.py @@ -79,7 +79,7 @@ def test_gemm(): # one line to build the function. 
def check_device(device): - ctx = tvm.context(device, 0) + dev = tvm.device(device, 0) if not tvm.testing.device_enabled(device): print("skip because %s is not enabled.." % device) return @@ -93,12 +93,12 @@ def check_device(device): l = n a_np = np.random.uniform(size=(n, l)).astype(A.dtype) b_np = np.random.uniform(size=(m, l)).astype(B.dtype) - a = tvm.nd.array(a_np, ctx) - b = tvm.nd.array(b_np, ctx) - c = tvm.nd.array(np.zeros((n, m), dtype=C.dtype), ctx) - ftimer = f.time_evaluator(f.entry_name, ctx, number=1) + a = tvm.nd.array(a_np, dev) + b = tvm.nd.array(b_np, dev) + c = tvm.nd.array(np.zeros((n, m), dtype=C.dtype), dev) + ftimer = f.time_evaluator(f.entry_name, dev, number=1) tcost = ftimer(a, b, c).mean - print("%s: exec=%g sec/op" % (ctx, tcost)) + print("%s: exec=%g sec/op" % (dev, tcost)) tvm.testing.assert_allclose(c.asnumpy(), np.dot(a_np, b_np.T), rtol=1e-5) check_device("vulkan") diff --git a/tests/python/integration/test_reduce.py b/tests/python/integration/test_reduce.py index 21c9acb08eb6..19bd03ec79ce 100644 --- a/tests/python/integration/test_reduce.py +++ b/tests/python/integration/test_reduce.py @@ -41,7 +41,7 @@ def test_prim(reducer, np_reducer): # one line to build the function. def check_device(device, host="llvm"): - ctx = tvm.context(device, 0) + dev = tvm.device(device, 0) if not tvm.testing.device_enabled(device): print("skip because %s is not enabled.." % device) return @@ -51,8 +51,8 @@ def check_device(device, host="llvm"): # launch the kernel. n = 1028 m = 129 - x = tvm.nd.array(np.random.uniform(size=(n, m)).astype(A.dtype), ctx) - y = tvm.nd.array(np.zeros(n, dtype=B.dtype), ctx) + x = tvm.nd.array(np.random.uniform(size=(n, m)).astype(A.dtype), dev) + y = tvm.nd.array(np.zeros(n, dtype=B.dtype), dev) freduce(x, y) npy = y.asnumpy() npy[:2] = 0 @@ -82,13 +82,13 @@ def test_init_imm(): def check_target(target="llvm"): if not tvm.runtime.enabled(target): return - ctx = tvm.cpu(0) + dev = tvm.cpu(0) fapi = tvm.lower(s, args=[A, B]) fsum = tvm.build(fapi, target=target, name="mysum") # launch the kernel. n = 1027 - a = tvm.nd.array(np.random.uniform(size=(n,)).astype(A.dtype), ctx) - b = tvm.nd.array(np.zeros((), dtype=B.dtype), ctx) + a = tvm.nd.array(np.random.uniform(size=(n,)).astype(A.dtype), dev) + b = tvm.nd.array(np.zeros((), dtype=B.dtype), dev) fsum(a, b) res = 10.0 + np.sum(a.asnumpy(), axis=0) tvm.testing.assert_allclose(b.asnumpy(), res, rtol=1e-4) @@ -110,16 +110,16 @@ def test_init(): def check_target(target="llvm"): if not tvm.runtime.enabled(target): return - ctx = tvm.cpu(0) + dev = tvm.cpu(0) fapi = tvm.lower(s, args=[A, C, I, B]) print(fapi) mmult = tvm.build(fapi, target=target, name="mmult") # launch the kernel. 
n = 1027 - a = tvm.nd.array(np.random.uniform(size=(n, n)).astype(A.dtype), ctx) - c = tvm.nd.array(np.random.uniform(size=(n, n)).astype(C.dtype), ctx) - ii = tvm.nd.array(np.random.uniform(size=(n, n)).astype(B.dtype), ctx) - b = tvm.nd.array(np.zeros((n, n), dtype=B.dtype), ctx) + a = tvm.nd.array(np.random.uniform(size=(n, n)).astype(A.dtype), dev) + c = tvm.nd.array(np.random.uniform(size=(n, n)).astype(C.dtype), dev) + ii = tvm.nd.array(np.random.uniform(size=(n, n)).astype(B.dtype), dev) + b = tvm.nd.array(np.zeros((n, n), dtype=B.dtype), dev) mmult(a, c, ii, b) res = ii.asnumpy() + np.matmul(a.asnumpy(), c.asnumpy()) tvm.testing.assert_allclose(b.asnumpy(), res, rtol=1e-4) @@ -141,13 +141,13 @@ def test_rfactor(): def check_target(target="llvm"): if not tvm.testing.device_enabled(target): return - ctx = tvm.cpu(0) + dev = tvm.cpu(0) fapi = tvm.lower(s, args=[A, B]) fsum = tvm.build(fapi, target=target, name="mysum") # launch the kernel. n = 1027 - a = tvm.nd.array(np.random.uniform(size=(n,)).astype(A.dtype), ctx) - b = tvm.nd.array(np.zeros((), dtype=B.dtype), ctx) + a = tvm.nd.array(np.random.uniform(size=(n,)).astype(A.dtype), dev) + b = tvm.nd.array(np.zeros((), dtype=B.dtype), dev) fsum(a, b) res = np.sum(a.asnumpy(), axis=0) tvm.testing.assert_allclose(b.asnumpy(), res, rtol=1e-4) @@ -172,16 +172,16 @@ def test_rfactor_init(): def check_target(target="llvm"): if not tvm.runtime.enabled(target): return - ctx = tvm.cpu(0) + dev = tvm.cpu(0) fapi = tvm.lower(s, args=[A, C, I, B]) print(fapi) mmult = tvm.build(fapi, target=target, name="mmult") # launch the kernel. n = 1027 - a = tvm.nd.array(np.random.uniform(size=(n, n)).astype(A.dtype), ctx) - c = tvm.nd.array(np.random.uniform(size=(n, n)).astype(C.dtype), ctx) - ii = tvm.nd.array(np.random.uniform(size=(n, n)).astype(B.dtype), ctx) - b = tvm.nd.array(np.zeros((n, n), dtype=B.dtype), ctx) + a = tvm.nd.array(np.random.uniform(size=(n, n)).astype(A.dtype), dev) + c = tvm.nd.array(np.random.uniform(size=(n, n)).astype(C.dtype), dev) + ii = tvm.nd.array(np.random.uniform(size=(n, n)).astype(B.dtype), dev) + b = tvm.nd.array(np.zeros((n, n), dtype=B.dtype), dev) mmult(a, c, ii, b) res = ii.asnumpy() + np.matmul(a.asnumpy(), c.asnumpy()) tvm.testing.assert_allclose(b.asnumpy(), res, rtol=1e-4) @@ -203,13 +203,13 @@ def test_rfactor_factor_axis(): def check_target(target="llvm"): if not tvm.testing.device_enabled(target): return - ctx = tvm.cpu(0) + dev = tvm.cpu(0) fapi = tvm.lower(s, args=[A, B]) fsum = tvm.build(fapi, target=target, name="mysum") # launch the kernel. n = 1027 - a = tvm.nd.array(np.random.uniform(size=(n,)).astype(A.dtype), ctx) - b = tvm.nd.array(np.zeros((), dtype=B.dtype), ctx) + a = tvm.nd.array(np.random.uniform(size=(n,)).astype(A.dtype), dev) + b = tvm.nd.array(np.zeros((), dtype=B.dtype), dev) fsum(a, b) res = np.sum(a.asnumpy(), axis=0) tvm.testing.assert_allclose(b.asnumpy(), res, rtol=1e-4) @@ -242,7 +242,7 @@ def test_rfactor_threads(): # one line to build the function. def check_target(device, host="stackvm"): - ctx = tvm.context(device, 0) + dev = tvm.device(device, 0) if not tvm.testing.device_enabled(device): print("skip because %s is not enabled.." % device) return @@ -252,8 +252,8 @@ def check_target(device, host="stackvm"): # launch the kernel. 
n = nn m = mm - a = tvm.nd.array(np.random.uniform(size=(m, n)).astype(A.dtype), ctx) - b = tvm.nd.array(np.zeros(m, dtype=B.dtype), ctx) + a = tvm.nd.array(np.random.uniform(size=(m, n)).astype(A.dtype), dev) + b = tvm.nd.array(np.zeros(m, dtype=B.dtype), dev) fsum(a, b) res = np.sum(a.asnumpy(), axis=1) res[:2] = 0 @@ -296,15 +296,15 @@ def test_rfactor_elemwise_threads(): # one line to build the function. def check_target(device, host="stackvm"): - ctx = tvm.context(device, 0) + dev = tvm.device(device, 0) if not tvm.testing.device_enabled(device): print("skip because %s is not enabled.." % device) return fapi = tvm.lower(s, args=[A, C]) fsum = tvm.build(fapi, target=device, name="mysum") # launch the kernel. - a = tvm.nd.array(np.random.uniform(size=(m, n)).astype(A.dtype), ctx) - b = tvm.nd.array(np.zeros(m, dtype=B.dtype), ctx) + a = tvm.nd.array(np.random.uniform(size=(m, n)).astype(A.dtype), dev) + b = tvm.nd.array(np.zeros(m, dtype=B.dtype), dev) fsum(a, b) res = np.sum(a.asnumpy(), axis=1) + 2 tvm.testing.assert_allclose(b.asnumpy(), res, rtol=1e-4) @@ -339,7 +339,7 @@ def check_target(): if not tvm.testing.device_enabled(device): print("skip because %s is not enabled.." % device) return - ctx = tvm.context(device, 0) + dev = tvm.device(device, 0) fapi = tvm.lower(s, args=[idx, val, T0, T1]) fargmax = tvm.build(fapi, target="llvm", name="argmax") @@ -349,10 +349,10 @@ def check_target(): np_val = np.random.uniform(size=(mm, nn)).astype("float32") np_res = np.argmax(np_val, axis=1) - nd_idx = tvm.nd.array(np_idx, ctx) - nd_val = tvm.nd.array(np_val, ctx) - nd_res0 = tvm.nd.array(np.zeros(mm, dtype="int32"), ctx) - nd_res1 = tvm.nd.array(np.zeros(mm, dtype="float32"), ctx) + nd_idx = tvm.nd.array(np_idx, dev) + nd_val = tvm.nd.array(np_val, dev) + nd_res0 = tvm.nd.array(np.zeros(mm, dtype="int32"), dev) + nd_res1 = tvm.nd.array(np.zeros(mm, dtype="float32"), dev) fargmax(nd_idx, nd_val, nd_res0, nd_res1) tvm.testing.assert_allclose(np_res, nd_res0.asnumpy()) @@ -395,7 +395,7 @@ def fidentity(t0, t1): s[B0].set_store_predicate(thread_x.var.equal(0)) def check_target(device): - ctx = tvm.context(device, 0) + dev = tvm.device(device, 0) if not tvm.testing.device_enabled(device): print("skip because %s is not enabled.." % device) return @@ -406,10 +406,10 @@ def check_target(device): np_val = np.random.uniform(size=(mm, nn)).astype("float32") np_res = np.argmax(np_val, axis=1) - nd_idx = tvm.nd.array(np_idx, ctx) - nd_val = tvm.nd.array(np_val, ctx) - nd_res0 = tvm.nd.array(np.zeros(mm, dtype="int32"), ctx) - nd_res1 = tvm.nd.array(np.zeros(mm, dtype="float32"), ctx) + nd_idx = tvm.nd.array(np_idx, dev) + nd_val = tvm.nd.array(np_val, dev) + nd_res0 = tvm.nd.array(np.zeros(mm, dtype="int32"), dev) + nd_res1 = tvm.nd.array(np.zeros(mm, dtype="float32"), dev) fargmax(nd_idx, nd_val, nd_res0, nd_res1) tvm.testing.assert_allclose(np_res, nd_res0.asnumpy()) @@ -427,7 +427,7 @@ def test_warp_reduction1(): thread_y = te.thread_axis((0, nthy), "threadIdx.y") def check_target(device, m, n): - ctx = tvm.context(device, 0) + dev = tvm.device(device, 0) if not tvm.testing.device_enabled(device): print("skip because %s is not enabled.." 
% device) return @@ -452,8 +452,8 @@ def check_target(device, m, n): func = tvm.build(s, [A, B], device, name="warp_reduction") a_np = np.random.uniform(size=(m, n)).astype(A.dtype) b_np = np.zeros((m,), dtype=A.dtype) - a = tvm.nd.array(a_np, ctx) - b = tvm.nd.array(b_np, ctx) + a = tvm.nd.array(a_np, dev) + b = tvm.nd.array(b_np, dev) b_np = np.max(a_np, axis=1) func(a, b) tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-3, atol=1e-3) @@ -490,7 +490,7 @@ def fidentity(t0, t1): thread_y = te.thread_axis((0, nthdy), "threadIdx.y") def check_target(device): - ctx = tvm.context(device, 0) + dev = tvm.device(device, 0) if not tvm.testing.device_enabled(device): print("skip because %s is not enabled.." % device) return @@ -504,15 +504,15 @@ def check_target(device): s[T0].bind(xo, block_x) # validation - ctx = tvm.context(device, 0) + dev = tvm.device(device, 0) a0_np = np.random.uniform(size=(m, n)).astype(A0.dtype) a1_np = np.random.uniform(size=(m, n)).astype(A1.dtype) t0_np = np.zeros((m,), dtype=A0.dtype) t1_np = np.zeros((m,), dtype=A1.dtype) - a0 = tvm.nd.array(a0_np, ctx) - a1 = tvm.nd.array(a1_np, ctx) - t0 = tvm.nd.array(t0_np, ctx) - t1 = tvm.nd.array(t1_np, ctx) + a0 = tvm.nd.array(a0_np, dev) + a1 = tvm.nd.array(a1_np, dev) + t0 = tvm.nd.array(t0_np, dev) + t1 = tvm.nd.array(t1_np, dev) func = tvm.build(s, [A0, A1, T0, T1], device, name="reduction") func(a0, a1, t0, t1) t0_np = np.sum(a0_np, axis=1) diff --git a/tests/python/integration/test_scan.py b/tests/python/integration/test_scan.py index 73be68c3b6e2..54a8f1e92ed1 100644 --- a/tests/python/integration/test_scan.py +++ b/tests/python/integration/test_scan.py @@ -49,7 +49,7 @@ def test_scan(): # one line to build the function. def check_device(device): - ctx = tvm.context(device, 0) + dev = tvm.device(device, 0) if not tvm.testing.device_enabled(device): print("skip because %s is not enabled.." 
% device)
             return
@@ -58,8 +58,8 @@ def check_device(device):
         n = 1024
         m = 10
         a_np = np.random.uniform(size=(m, n)).astype(res.dtype)
-        a = tvm.nd.array(a_np, ctx)
-        b = tvm.nd.array(np.zeros((m, n), dtype=res.dtype), ctx)
+        a = tvm.nd.array(a_np, dev)
+        b = tvm.nd.array(np.zeros((m, n), dtype=res.dtype), dev)
         fscan(a, b)
         tvm.testing.assert_allclose(b.asnumpy(), np.cumsum(a_np, axis=0))
diff --git a/tests/python/integration/test_tuning.py b/tests/python/integration/test_tuning.py
index b7b7298f6e30..45e0958a0240 100644
--- a/tests/python/integration/test_tuning.py
+++ b/tests/python/integration/test_tuning.py
@@ -141,7 +141,7 @@ def get_sample_task(target=tvm.target.cuda(), target_host=None):
 
 
 @tvm.testing.parametrize_targets("cuda", "opencl")
-def test_tuning_gpu(target, ctx):
+def test_tuning_gpu(target, dev):
     # init task
     task, target = get_sample_task(target, None)
     logging.info("task config space: %s", task.config_space)
diff --git a/tests/python/integration/test_winograd_nnpack.py b/tests/python/integration/test_winograd_nnpack.py
index d32699375050..aa018616c87f 100644
--- a/tests/python/integration/test_winograd_nnpack.py
+++ b/tests/python/integration/test_winograd_nnpack.py
@@ -74,7 +74,7 @@ def get_ref_data():
     a_np, w_np, b_np, c_np = get_ref_data()
 
     def check_device(device):
-        ctx = tvm.context(device, 0)
+        dev = tvm.device(device, 0)
         if not tvm.testing.device_enabled(device):
             print("Skipping %s because it is not enabled" % device)
         print("Running on target: %s" % device)
@@ -86,10 +86,10 @@ def check_device(device):
             C = topi.nn.relu(C)
         s = topi.generic.schedule_conv2d_nchw([C])
 
-        a = tvm.nd.array(a_np, ctx)
-        w = tvm.nd.array(w_np, ctx)
-        b = tvm.nd.array(b_np, ctx)
-        c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx)
+        a = tvm.nd.array(a_np, dev)
+        w = tvm.nd.array(w_np, dev)
+        b = tvm.nd.array(b_np, dev)
+        c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev)
         if add_bias:
             func = tvm.build(
                 s,
diff --git a/tests/python/nightly/quantization/test_quantization_accuracy.py b/tests/python/nightly/quantization/test_quantization_accuracy.py
index 55971de17b03..57fa49e93a04 100644
--- a/tests/python/nightly/quantization/test_quantization_accuracy.py
+++ b/tests/python/nightly/quantization/test_quantization_accuracy.py
@@ -92,11 +92,13 @@ def get_model(model_name, batch_size, qconfig, target=None, original=False, simu
     return qfunc
 
 
-def eval_acc(model, dataset, batch_fn, target=tvm.target.cuda(), ctx=tvm.gpu(), log_interval=100):
+def eval_acc(
+    model, dataset, batch_fn, target=tvm.target.cuda(), device=tvm.gpu(), log_interval=100
+):
     with tvm.transform.PassContext(opt_level=3):
         graph, lib, params = relay.build(model, target)
     # create runtime module
-    m = tvm.contrib.graph_runtime.create(graph, lib, ctx)
+    m = tvm.contrib.graph_executor.create(graph, lib, device)
     m.set_input(**params)
 
     # setup evaluation metric
diff --git a/tests/python/relay/benchmarking/benchmark_vm.py b/tests/python/relay/benchmarking/benchmark_vm.py
index 02e6ae6f51e5..44ce9be766d2 100644
--- a/tests/python/relay/benchmarking/benchmark_vm.py
+++ b/tests/python/relay/benchmarking/benchmark_vm.py
@@ -19,7 +19,7 @@
 import tvm
 from tvm import te
-from tvm.contrib import graph_runtime
+from tvm.contrib import graph_executor
 from tvm import relay
 from tvm.runtime import container
 from tvm.runtime import vm as vm_rt
@@ -36,39 +36,39 @@ def benchmark_execution(
     dtype="float32",
     model="unknown",
 ):
-    def get_graph_runtime_output(
-        mod, data, params, target, ctx, dtype="float32", number=2,
repeat=20 + def get_graph_executor_output( + mod, data, params, target, dev, dtype="float32", number=2, repeat=20 ): with tvm.transform.PassContext(opt_level=3): lib = relay.build(mod, target, params=params) - m = graph_runtime.GraphModule(lib["default"](ctx)) + m = graph_executor.GraphModule(lib["default"](dev)) # set inputs m.set_input("data", data) m.run() out = m.get_output(0, tvm.nd.empty(out_shape, dtype)) if measure: - print("Evaluate graph runtime inference cost of {} on " "{}".format(model, repr(ctx))) - ftimer = m.module.time_evaluator("run", ctx, number=1, repeat=20) + print("Evaluate graph executor inference cost of {} on " "{}".format(model, repr(dev))) + ftimer = m.module.time_evaluator("run", dev, number=1, repeat=20) # Measure in millisecond. prof_res = np.array(ftimer().results) * 1000 print( - "Mean graph runtime inference time (std dev): %.2f ms (%.2f ms)" + "Mean graph executor inference time (std dev): %.2f ms (%.2f ms)" % (np.mean(prof_res), np.std(prof_res)) ) return out.asnumpy() - def get_vm_output(mod, data, params, target, ctx, dtype="float32", number=2, repeat=20): + def get_vm_output(mod, data, params, target, dev, dtype="float32", number=2, repeat=20): with tvm.transform.PassContext(opt_level=3): exe = vm.compile(mod, target, params=params) - rly_vm = vm_rt.VirtualMachine(exe, ctx) + rly_vm = vm_rt.VirtualMachine(exe, dev) result = rly_vm.run(data) if measure: - print("Evaluate vm inference cost of {} on {}".format(model, repr(ctx))) - ftimer = rly_vm.module.time_evaluator("invoke", ctx, number=number, repeat=repeat) + print("Evaluate vm inference cost of {} on {}".format(model, repr(dev))) + ftimer = rly_vm.module.time_evaluator("invoke", dev, number=number, repeat=repeat) # Measure in millisecond. prof_res = np.array(ftimer("main", data).results) * 1000 print( @@ -81,11 +81,11 @@ def get_vm_output(mod, data, params, target, ctx, dtype="float32", number=2, rep # random input data = np.random.uniform(size=data_shape).astype(dtype) - for target, ctx in testing.enabled_targets(): - tvm_out = get_graph_runtime_output( - mod, tvm.nd.array(data.astype(dtype)), params, target, ctx, dtype + for target, dev in testing.enabled_targets(): + tvm_out = get_graph_executor_output( + mod, tvm.nd.array(data.astype(dtype)), params, target, dev, dtype ) - vm_out = get_vm_output(mod, tvm.nd.array(data.astype(dtype)), params, target, ctx, dtype) + vm_out = get_vm_output(mod, tvm.nd.array(data.astype(dtype)), params, target, dev, dtype) tvm.testing.assert_allclose(vm_out, tvm_out, rtol=1e-5, atol=1e-5) diff --git a/tests/python/relay/dyn/test_dynamic_op_level10.py b/tests/python/relay/dyn/test_dynamic_op_level10.py index a520f6c2c368..9bcb656bd246 100644 --- a/tests/python/relay/dyn/test_dynamic_op_level10.py +++ b/tests/python/relay/dyn/test_dynamic_op_level10.py @@ -14,7 +14,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -""" +""" Support level10 operator test cases. 
""" @@ -44,10 +44,10 @@ def verify_more_dynamic_broadcast_to(x_shape, out_shape): x = np.random.uniform(size=np.prod(x_shape)).astype(dtype) ref_res = np.broadcast_to(np.reshape(x, x_shape), out_shape) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["vm", "debug"]: mod = tvm.ir.IRModule.from_expr(func) - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate(func)( x, np.array(x_shape).astype(shape_type), np.array(out_shape).astype(shape_type) ) @@ -70,10 +70,10 @@ def verify_broadcast_to(x_shape, out_shape): x = np.random.uniform(size=x_shape).astype(dtype) ref_res = np.broadcast_to(x, out_shape) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["vm", "debug"]: mod = tvm.ir.IRModule.from_expr(func) - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate(func)(x, np.array(out_shape).astype(shape_type)) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5) @@ -100,10 +100,10 @@ def test_dyn_broadcast_to(): x = np.random.uniform(size=x_shape).astype(dtype) dyn_shape = (1,) * rank ref_res = np.broadcast_to(x, dyn_shape) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["vm", "debug"]: mod = tvm.ir.IRModule.from_expr(func) - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate(func)(x, np.array(dyn_shape).astype(shape_type)) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5) @@ -133,10 +133,10 @@ def _verify(indices_shape, depth, on_value, off_value, axis, dtype): func = relay.Function([indices, depth_var], out) indices_np = np.random.randint(0, depth, size=indices_shape).astype("int32") out_np = tvm.topi.testing.one_hot(indices_np, on_value, off_value, depth, axis, dtype) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["vm", "debug"]: mod = tvm.ir.IRModule.from_expr(func) - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) out_relay = intrp.evaluate()(indices_np, np.array(depth).astype("int32")) tvm.testing.assert_allclose(out_relay.asnumpy(), out_np) diff --git a/tests/python/relay/dyn/test_dynamic_op_level2.py b/tests/python/relay/dyn/test_dynamic_op_level2.py index 5ef975f97d2c..c0cbce17e9d0 100644 --- a/tests/python/relay/dyn/test_dynamic_op_level2.py +++ b/tests/python/relay/dyn/test_dynamic_op_level2.py @@ -56,10 +56,10 @@ def verify_upsampling(dshape, scale_h, scale_w, layout, method, align_corners=Fa zz = run_infer_type(z) func = relay.Function([x, scale_h_var, scale_w_var], z) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["vm", "debug"]: mod = tvm.ir.IRModule.from_expr(func) - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()( x_data, np.array(scale_h).astype("float32"), np.array(scale_w).astype("float32") ) @@ -125,10 +125,10 @@ def verify_upsampling3d( zz = run_infer_type(z) 
func = relay.Function([x, scale_d_var, scale_h_var, scale_w_var], z) - for target, ctx in enabled_targets(): + for target, dev in enabled_targets(): for kind in ["vm", "debug"]: mod = tvm.ir.IRModule.from_expr(func) - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()( x_data, np.array(scale_d).astype("float32"), diff --git a/tests/python/relay/dyn/test_dynamic_op_level3.py b/tests/python/relay/dyn/test_dynamic_op_level3.py index d5f81e84e39d..78d12f9b1c8e 100644 --- a/tests/python/relay/dyn/test_dynamic_op_level3.py +++ b/tests/python/relay/dyn/test_dynamic_op_level3.py @@ -26,12 +26,12 @@ import tvm.testing -def verify_func(func, data, ref_res, target_ctx=tvm.testing.enabled_targets()): +def verify_func(func, data, ref_res, target_device=tvm.testing.enabled_targets()): assert isinstance(data, list) - for target, ctx in target_ctx: + for target, dev in target_device: for kind in ["vm", "debug"]: mod = tvm.ir.IRModule.from_expr(func) - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()(*data) if isinstance(op_res, tvm.runtime.container.ADT): assert len(op_res) == len( diff --git a/tests/python/relay/dyn/test_dynamic_op_level4.py b/tests/python/relay/dyn/test_dynamic_op_level4.py index 3d7a99a28e33..3cb706440cad 100644 --- a/tests/python/relay/dyn/test_dynamic_op_level4.py +++ b/tests/python/relay/dyn/test_dynamic_op_level4.py @@ -57,9 +57,9 @@ def verify(dshape, begin, end, strides, output, slice_mode="end", test_ref=True, if not test_ref: return - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): mod = tvm.ir.IRModule.from_expr(func) - intrp = relay.create_executor("vm", mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor("vm", mod=mod, device=dev, target=target) op_res = intrp.evaluate()(x_data) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res) diff --git a/tests/python/relay/dyn/test_dynamic_op_level5.py b/tests/python/relay/dyn/test_dynamic_op_level5.py index 9273b019ec96..c49ac9680266 100644 --- a/tests/python/relay/dyn/test_dynamic_op_level5.py +++ b/tests/python/relay/dyn/test_dynamic_op_level5.py @@ -60,10 +60,10 @@ def verify_resize(dshape, scale, method, layout): zz = run_infer_type(z) func = relay.Function([x, size_var], z) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["vm", "debug"]: mod = tvm.ir.IRModule.from_expr(func) - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()(x_data, size) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-4, atol=1e-6) diff --git a/tests/python/relay/dyn/test_dynamic_op_level6.py b/tests/python/relay/dyn/test_dynamic_op_level6.py index 52abbe2a15b6..9ceb9ab9db97 100644 --- a/tests/python/relay/dyn/test_dynamic_op_level6.py +++ b/tests/python/relay/dyn/test_dynamic_op_level6.py @@ -52,10 +52,10 @@ def verify_topk(k, axis, ret_type, is_ascend, dtype): np_values[i, :] = np_data[i, np_indices[i, :]] np_indices = np_indices.astype(dtype) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["vm", "debug"]: mod = tvm.ir.IRModule.from_expr(func) - intrp = relay.create_executor(kind, mod=mod, 
ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()(np_data, np.array([k]).astype("float32")) if ret_type == "both": tvm.testing.assert_allclose(op_res[0].asnumpy(), np_values) diff --git a/tests/python/relay/test_adt.py b/tests/python/relay/test_adt.py index 122fa67d65df..c432596a2a9f 100644 --- a/tests/python/relay/test_adt.py +++ b/tests/python/relay/test_adt.py @@ -33,8 +33,8 @@ def count(e): return count_(p, e) -ctx = tvm.context("llvm", 0) -intrp = create_executor(mod=prelude.mod, ctx=ctx, target="llvm") +dev = tvm.device("llvm", 0) +intrp = create_executor(mod=prelude.mod, device=dev, target="llvm") nat, z, s = prelude.mod.get_type("nat") diff --git a/tests/python/relay/test_any.py b/tests/python/relay/test_any.py index 32292de4c8ea..9f777c29bef6 100644 --- a/tests/python/relay/test_any.py +++ b/tests/python/relay/test_any.py @@ -53,12 +53,12 @@ def check_result( expected = [expected] for kind in ["debug", "vm"]: targets = targets or tvm.testing.enabled_targets() - for tgt, ctx in targets: + for tgt, dev in targets: if disable_targets and tgt in disable_targets: continue - if kind == "debug" and (only_vm or ctx.device_type != tvm.cpu().device_type): + if kind == "debug" and (only_vm or dev.device_type != tvm.cpu().device_type): continue - ex = relay.create_executor(kind, mod=mod, ctx=ctx, target=tgt) + ex = relay.create_executor(kind, mod=mod, device=dev, target=tgt) result = ex.evaluate()(*args) if isinstance(result, tvm.runtime.container.ADT): result = [r.asnumpy() for r in result] @@ -751,7 +751,7 @@ def verify_any_split(data_shape, indices_or_sections, axis, static_data_shape, r mod["main"] = relay.Function([data], y.astuple()) data_np = np.random.uniform(size=static_data_shape).astype(dtype) for kind in ["vm"]: - ex = relay.create_executor(kind, mod=mod, ctx=tvm.cpu(), target="llvm") + ex = relay.create_executor(kind, mod=mod, device=tvm.cpu(), target="llvm") result = ex.evaluate()(data_np) for ret, ref_ret in zip(result, ref_out_shape): assert ret.asnumpy().shape == ref_ret, "Shape mismatch: expect %s but got %s." 
% ( @@ -964,9 +964,9 @@ def test_any_get_valid_counts(): # Check failed: err_code == CL_SUCCESS == false: OpenCL Error, # code=-61: CL_INVALID_BUFFER_SIZE targets = [] - for tgt, ctx in tvm.testing.enabled_targets(): + for tgt, dev in tvm.testing.enabled_targets(): if "opencl" not in tgt: - targets.append((tgt, ctx)) + targets.append((tgt, dev)) verify_any_get_valid_counts(0, "float32", targets=targets) diff --git a/tests/python/relay/test_auto_scheduler_layout_rewrite_networks.py b/tests/python/relay/test_auto_scheduler_layout_rewrite_networks.py index 56b57e1fad76..8466fc1700b0 100644 --- a/tests/python/relay/test_auto_scheduler_layout_rewrite_networks.py +++ b/tests/python/relay/test_auto_scheduler_layout_rewrite_networks.py @@ -21,7 +21,7 @@ import tvm from tvm import relay, auto_scheduler -from tvm.contrib import graph_runtime +from tvm.contrib import graph_executor import tvm.testing @@ -168,8 +168,8 @@ def tune_and_check(mod, data, weight): lib2 = relay.build(mod, target=target, params={"weight": weight}) def get_output(data, lib): - ctx = tvm.cpu() - module = graph_runtime.GraphModule(lib["default"](ctx)) + dev = tvm.cpu() + module = graph_executor.GraphModule(lib["default"](dev)) module.set_input("data", data) module.run() diff --git a/tests/python/relay/test_auto_scheduler_tuning.py b/tests/python/relay/test_auto_scheduler_tuning.py index 1ec0e305311a..1250543a13ae 100644 --- a/tests/python/relay/test_auto_scheduler_tuning.py +++ b/tests/python/relay/test_auto_scheduler_tuning.py @@ -20,7 +20,7 @@ import numpy as np from tvm import auto_scheduler, relay -from tvm.contrib import graph_runtime +from tvm.contrib import graph_executor import tvm.testing from test_auto_scheduler_task_extraction import get_network @@ -69,8 +69,8 @@ def tune_network(network, target): # Check the correctness def get_output(data, lib): - ctx = tvm.gpu() - module = graph_runtime.GraphModule(lib["default"](ctx)) + dev = tvm.gpu() + module = graph_executor.GraphModule(lib["default"](dev)) module.set_input("data", data) module.run() return module.get_output(0).asnumpy() diff --git a/tests/python/relay/test_backend_compile_engine.py b/tests/python/relay/test_backend_compile_engine.py index bf53dc5360e3..42b6373b45f1 100644 --- a/tests/python/relay/test_backend_compile_engine.py +++ b/tests/python/relay/test_backend_compile_engine.py @@ -184,11 +184,11 @@ def get_func(shape): # Test JIT target for target in ["llvm"]: - ctx = tvm.context(target) + dev = tvm.device(target) if tvm.testing.device_enabled(target): f = engine.jit(get_func((10,)), target) - x = tvm.nd.array(np.ones(10).astype("float32"), ctx=ctx) - y = tvm.nd.empty((10,), ctx=ctx) + x = tvm.nd.array(np.ones(10).astype("float32"), device=dev) + y = tvm.nd.empty((10,), device=dev) f(x, y) tvm.testing.assert_allclose(y.asnumpy(), x.asnumpy() * 3) engine.dump() diff --git a/tests/python/relay/test_backend_graph_runtime.py b/tests/python/relay/test_backend_graph_executor.py similarity index 95% rename from tests/python/relay/test_backend_graph_runtime.py rename to tests/python/relay/test_backend_graph_executor.py index 68708aaeb413..b9553d79c3b6 100644 --- a/tests/python/relay/test_backend_graph_runtime.py +++ b/tests/python/relay/test_backend_graph_executor.py @@ -18,7 +18,7 @@ import tvm from tvm import relay -from tvm.contrib import graph_runtime +from tvm.contrib import graph_executor from tvm.relay.op import add import tvm.testing @@ -102,7 +102,7 @@ def test_with_params(): y_data = np.random.rand(1, 5).astype("float32") params = {"y": y_data} graph, 
lib, params = relay.build(tvm.IRModule.from_expr(func), "llvm", params=params) - mod = graph_runtime.create(graph, lib, ctx=tvm.cpu(0)) + mod = graph_executor.create(graph, lib, device=tvm.cpu(0)) mod.set_input(**params) mod.set_input(x=x_data) mod.run() @@ -171,10 +171,10 @@ def unit_numpy(X, W): out_shape = (1, rnn_dim) z = unit(rnn_dim) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): with tvm.transform.PassContext(opt_level=2): graph, lib, params = relay.build(tvm.IRModule.from_expr(z), target) - m = graph_runtime.create(graph, lib, ctx) + m = graph_executor.create(graph, lib, dev) m.set_input("X", tvm.nd.array(x.astype(dtype))) m.set_input("y", tvm.nd.array(y.astype(dtype))) m.set_input(**params) @@ -194,7 +194,7 @@ def test_compile_nested_tuples(): func = relay.Function([x], out) graph, lib, _ = relay.build(tvm.IRModule.from_expr(func), "llvm") - mod = graph_runtime.create(graph, lib, ctx=tvm.cpu(0)) + mod = graph_executor.create(graph, lib, device=tvm.cpu(0)) x_data = np.random.uniform(size=(10,)).astype(np.float32) mod.set_input(x=x_data) @@ -215,7 +215,7 @@ def test_graph_executor_nested_tuples(): func = relay.Function([x, y, z, w], out) exe = relay.create_executor( - kind="graph", mod=tvm.IRModule.from_expr(func), ctx=tvm.cpu(0), target="llvm" + kind="graph", mod=tvm.IRModule.from_expr(func), device=tvm.cpu(0), target="llvm" ) f = exe.evaluate() diff --git a/tests/python/relay/test_backend_interpreter.py b/tests/python/relay/test_backend_interpreter.py index 0beb93deaef5..b5d76030f41d 100644 --- a/tests/python/relay/test_backend_interpreter.py +++ b/tests/python/relay/test_backend_interpreter.py @@ -29,10 +29,10 @@ def check_eval(expr, args, expected_result, mod=None, rtol=1e-07): # TODO(tqchen) add more types once the schedule register is fixed. for target in ["llvm"]: - ctx = tvm.context(target, 0) + dev = tvm.device(target, 0) if not tvm.testing.device_enabled(target): return - intrp = create_executor(mod=mod, ctx=ctx, target=target) + intrp = create_executor(mod=mod, device=dev, target=target) result = intrp.evaluate(expr)(*args) # use tvm.testing which also set atol tvm.testing.assert_allclose(result.asnumpy(), expected_result, rtol=rtol) @@ -220,9 +220,9 @@ def test_tuple_passing(): mod[gv] = fn mod = relay.transform.InferType()(mod) - ctx = tvm.cpu() + dev = tvm.cpu() target = tvm.target.Target("llvm") - exec = relay.create_executor(mod=mod, ctx=ctx, target=target) + exec = relay.create_executor(mod=mod, device=dev, target=target) f = exec.evaluate(gv) # First use a Python tuple. 
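# Illustrative sketch (not part of this patch; _sketch_* is a placeholder
# name): the hunks above rename tvm.contrib.graph_runtime to
# tvm.contrib.graph_executor and pass tvm.device(...) handles where
# tvm.context(...) was used. End to end, the post-rename spelling is:
import numpy as np
import tvm
from tvm import relay
from tvm.contrib import graph_executor

def _sketch_graph_executor_rename():
    x = relay.var("x", shape=(4,), dtype="float32")
    func = relay.Function([x], relay.add(x, relay.const(1.0)))
    lib = relay.build(tvm.IRModule.from_expr(func), target="llvm")
    dev = tvm.device("llvm", 0)  # replaces tvm.context("llvm", 0)
    module = graph_executor.GraphModule(lib["default"](dev))
    module.set_input("x", np.zeros((4,), dtype="float32"))
    module.run()
    return module.get_output(0).asnumpy()  # expected: an array of ones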
out = f((10, 8)) diff --git a/tests/python/relay/test_cpp_build_module.py b/tests/python/relay/test_cpp_build_module.py index 60f3dfa76e38..7d2209a34835 100644 --- a/tests/python/relay/test_cpp_build_module.py +++ b/tests/python/relay/test_cpp_build_module.py @@ -25,7 +25,7 @@ def test_basic_build(): tgt = "llvm" - ctx = tvm.cpu() + dev = tvm.cpu() # func a = relay.var("a", dtype="float32", shape=(16, 8)) b = relay.var("b", dtype="float32", shape=(8, 8)) @@ -34,12 +34,12 @@ def test_basic_build(): y = relay.nn.relu(x) z = y + c func = relay.Function([a, b, c], z) - A = tvm.nd.array(np.random.uniform(-1, 1, (16, 8)).astype("float32"), ctx=ctx) - B = tvm.nd.array(np.random.uniform(-1, 1, (8, 8)).astype("float32"), ctx=ctx) - C = tvm.nd.array(np.random.uniform(-1, 1, (16, 8)).astype("float32"), ctx=ctx) + A = tvm.nd.array(np.random.uniform(-1, 1, (16, 8)).astype("float32"), device=dev) + B = tvm.nd.array(np.random.uniform(-1, 1, (8, 8)).astype("float32"), device=dev) + C = tvm.nd.array(np.random.uniform(-1, 1, (16, 8)).astype("float32"), device=dev) params = {"b": B, "c": C} # build - targets = {tvm.tir.IntImm("int32", ctx.device_type): tgt} + targets = {tvm.tir.IntImm("int32", dev.device_type): tgt} mod = tvm.IRModule.from_expr(func) func_in_mod = mod["main"] assert mod["main"] == func_in_mod, "cannot compare function to itself" @@ -48,7 +48,7 @@ def test_basic_build(): assert mod["main"] == func_in_mod, "relay.build changed module in-place" # test - rt = tvm.contrib.graph_runtime.GraphModule(lib["default"](ctx)) + rt = tvm.contrib.graph_executor.GraphModule(lib["default"](dev)) rt.set_input("a", A) rt.run() out = rt.get_output(0) @@ -65,8 +65,8 @@ def test_basic_build(): def test_fp16_build(): dtype = "float16" - ctx = tvm.gpu(0) - if dtype == "float16" and not have_fp16(ctx.compute_version): + dev = tvm.gpu(0) + if dtype == "float16" and not have_fp16(dev.compute_version): print("skip because gpu does not support fp16") return @@ -74,8 +74,8 @@ def test_fp16_build(): y = relay.var("y", dtype=dtype, shape=(4, 4)) z = x + y func = relay.Function([x, y], z) - X = tvm.nd.array(np.random.uniform(-1, 1, (4, 4)).astype(dtype), ctx=ctx) - Y = tvm.nd.array(np.random.uniform(-1, 1, (4, 4)).astype(dtype), ctx=ctx) + X = tvm.nd.array(np.random.uniform(-1, 1, (4, 4)).astype(dtype), device=dev) + Y = tvm.nd.array(np.random.uniform(-1, 1, (4, 4)).astype(dtype), device=dev) params = { "x": X, "y": Y, @@ -85,7 +85,7 @@ def test_fp16_build(): g_json, mmod, params = relay.build(func, "cuda", params=params) # test - rt = tvm.contrib.graph_runtime.create(g_json, mmod, ctx) + rt = tvm.contrib.graph_executor.create(g_json, mmod, dev) rt.load_params(runtime.save_param_dict(params)) rt.run() out = rt.get_output(0) @@ -94,8 +94,8 @@ def test_fp16_build(): @tvm.testing.parametrize_targets("llvm", "cuda") -def test_fp16_conversion(target, ctx): - if target == "cuda" and not have_fp16(ctx.compute_version): +def test_fp16_conversion(target, dev): + if target == "cuda" and not have_fp16(dev.compute_version): print("skip because gpu does not support fp16") return @@ -114,7 +114,7 @@ def test_fp16_conversion(target, ctx): g_json, mmod, params = relay.build(tvm.IRModule.from_expr(func), target) # test - rt = tvm.contrib.graph_runtime.create(g_json, mmod, ctx) + rt = tvm.contrib.graph_executor.create(g_json, mmod, dev) rt.set_input("x", X) rt.run() out = rt.get_output(0) diff --git a/tests/python/relay/test_external_codegen.py b/tests/python/relay/test_external_codegen.py index 0d729b7b1b94..9f6d88e47f0b 100644 --- 
a/tests/python/relay/test_external_codegen.py +++ b/tests/python/relay/test_external_codegen.py @@ -23,30 +23,35 @@ from tvm import te import tvm.relay.testing import tvm.relay.transform + from tvm import relay from tvm import runtime +from tvm.relay import transform from tvm.contrib import utils +from tvm.relay.build_module import bind_params_by_name +from tvm.relay.op.annotation import compiler_begin, compiler_end -def check_result(mod, map_inputs, out_shape, result, tol=1e-5, target="llvm", ctx=tvm.cpu()): - if sys.platform == "win32": - print("Skip test on Windows for now") - return +def update_lib(lib): + test_dir = os.path.dirname(os.path.realpath(os.path.expanduser(__file__))) + source_dir = os.path.join(test_dir, "..", "..", "..") + contrib_path = os.path.join(source_dir, "src", "runtime", "contrib") - def update_lib(lib): - test_dir = os.path.dirname(os.path.realpath(os.path.expanduser(__file__))) - source_dir = os.path.join(test_dir, "..", "..", "..") - contrib_path = os.path.join(source_dir, "src", "runtime", "contrib") + kwargs = {} + kwargs["options"] = ["-O2", "-std=c++14", "-I" + contrib_path] + tmp_path = utils.tempdir() + lib_name = "lib.so" + lib_path = tmp_path.relpath(lib_name) + lib.export_library(lib_path, fcompile=False, **kwargs) + lib = tvm.runtime.load_module(lib_path) - kwargs = {} - kwargs["options"] = ["-O2", "-std=c++14", "-I" + contrib_path] - tmp_path = utils.tempdir() - lib_name = "lib.so" - lib_path = tmp_path.relpath(lib_name) - lib.export_library(lib_path, fcompile=False, **kwargs) - lib = tvm.runtime.load_module(lib_path) + return lib - return lib + +def check_result(mod, map_inputs, out_shape, result, tol=1e-5, target="llvm", device=tvm.cpu()): + if sys.platform == "win32": + print("Skip test on Windows for now") + return def check_vm_result(): with tvm.transform.PassContext(opt_level=3, disabled_pass=["AlterOpLayout"]): @@ -54,26 +59,26 @@ def check_vm_result(): code, lib = exe.save() lib = update_lib(lib) exe = runtime.vm.Executable.load_exec(code, lib) - vm = runtime.vm.VirtualMachine(exe, ctx) + vm = runtime.vm.VirtualMachine(exe, device) out = vm.run(**map_inputs) tvm.testing.assert_allclose(out.asnumpy(), result, rtol=tol, atol=tol) - def check_graph_runtime_result(): + def check_graph_executor_result(): with tvm.transform.PassContext(opt_level=3, disabled_pass=["AlterOpLayout"]): json, lib, _ = relay.build(mod, target=target) lib = update_lib(lib) - rt_mod = tvm.contrib.graph_runtime.create(json, lib, ctx) + rt_mod = tvm.contrib.graph_executor.create(json, lib, device) for name, data in map_inputs.items(): rt_mod.set_input(name, data) rt_mod.run() - out = tvm.nd.empty(out_shape, ctx=ctx) + out = tvm.nd.empty(out_shape, device=device) out = rt_mod.get_output(0, out) tvm.testing.assert_allclose(out.asnumpy(), result, rtol=tol, atol=tol) check_vm_result() - check_graph_runtime_result() + check_graph_executor_result() def set_external_func_attr(func, compiler, ext_symbol): @@ -285,7 +290,7 @@ def test_extern_dnnl(): i_data = np.random.uniform(0, 1, ishape).astype(dtype) w_data = np.random.uniform(0, 1, w1shape).astype(dtype) - ref_ex = relay.create_executor("graph", mod=ref_mod, ctx=tvm.cpu()) + ref_ex = relay.create_executor("graph", mod=ref_mod, device=tvm.cpu()) ref_res = ref_ex.evaluate()(i_data, w_data, w_data) check_result( mod, {"data0": i_data, "weight0": w_data}, (1, 32, 14, 14), ref_res.asnumpy(), tol=1e-5 @@ -324,11 +329,34 @@ def test_extern_dnnl_const(): i_data = np.random.uniform(0, 1, ishape).astype(dtype) - ref_ex = 
relay.create_executor("graph", mod=ref_mod, ctx=tvm.cpu()) + ref_ex = relay.create_executor("graph", mod=ref_mod, device=tvm.cpu()) ref_res = ref_ex.evaluate()(i_data) check_result(mod, {"data0": i_data}, (1, 32, 14, 14), ref_res.asnumpy(), tol=1e-5) +def test_load_params_with_constants_in_ext_codegen(): + # After binding params and partitioning graph_module.get_params() + # might contain parameters that are not an graph executor input but + # for example constants in external function. + y_in = np.ones((1,)).astype("float32") + params = {"y": y_in} + mod = tvm.IRModule() + x = relay.var("x", shape=(1, 10)) + y = relay.var("y", shape=(1,)) + xcb = compiler_begin(x, "ccompiler") + ycb = compiler_begin(y, "ccompiler") + z = relay.add(xcb, ycb) + zce = compiler_end(z, "ccompiler") + mod["main"] = relay.Function([x, y], zce) + mod["main"] = bind_params_by_name(mod["main"], params) + mod = transform.PartitionGraph()(mod) + + graph_module = relay.build(mod, target="llvm", params=params) + lib = update_lib(graph_module.get_lib()) + rt_mod = tvm.contrib.graph_executor.create(graph_module.get_json(), lib, tvm.cpu(0)) + rt_mod.load_params(runtime.save_param_dict(graph_module.get_params())) + + if __name__ == "__main__": test_multi_node_subgraph() test_extern_gcc_single_op() @@ -337,3 +365,4 @@ def test_extern_dnnl_const(): test_extern_gcc_consts() test_extern_dnnl() test_extern_dnnl_const() + test_load_params_with_constants_in_ext_codegen() diff --git a/tests/python/relay/test_ir_parser.py b/tests/python/relay/test_ir_parser.py index 62e52abefeb4..8b6b39e3df15 100644 --- a/tests/python/relay/test_ir_parser.py +++ b/tests/python/relay/test_ir_parser.py @@ -827,8 +827,8 @@ def test_import_grad(): mod.import_from_std("gradient.rly") -def test_resnet(): - mod, _ = relay.testing.resnet.get_workload() +def test_mlp(): + mod, _ = relay.testing.mlp.get_workload(1) text = mod.astext() parsed_mod = tvm.parser.parse(text) tvm.ir.assert_structural_equal(mod, parsed_mod) @@ -850,8 +850,8 @@ def inline_params(mod, params): return mod -def test_resnet_inlined_params(): - mod, params = relay.testing.resnet.get_workload() +def test_mlp_inlined_params(): + mod, params = relay.testing.mlp.get_workload(1) mod = inline_params(mod, params) mod = relay.transform.InferType()(mod) text = mod.astext() diff --git a/tests/python/relay/test_ir_text_printer.py b/tests/python/relay/test_ir_text_printer.py index 72a243dbbb67..b2ae28649e6a 100644 --- a/tests/python/relay/test_ir_text_printer.py +++ b/tests/python/relay/test_ir_text_printer.py @@ -181,11 +181,6 @@ def test_squeezenet(): astext(net) -def test_vgg(): - net, _ = tvm.relay.testing.vgg.get_workload(batch_size=1) - astext(net) - - def test_densenet(): net, _ = tvm.relay.testing.densenet.get_workload(batch_size=1) astext(net) diff --git a/tests/python/relay/test_json_runtime.py b/tests/python/relay/test_json_runtime.py index df4dff81b03e..bf5676d096f1 100644 --- a/tests/python/relay/test_json_runtime.py +++ b/tests/python/relay/test_json_runtime.py @@ -40,7 +40,7 @@ def set_func_attr(func, compile_name, symbol_name): def check_result( - mod, ref_mod, map_inputs, out_shape, tol=1e-5, target="llvm", ctx=tvm.cpu(), params=None + mod, ref_mod, map_inputs, out_shape, tol=1e-5, target="llvm", device=tvm.cpu(), params=None ): if sys.platform == "win32": print("Skip test on Windows for now") @@ -50,13 +50,13 @@ def check_result( compile_engine.get().clear() with tvm.transform.PassContext(opt_level=3): json, lib, param = relay.build(ref_mod, target=target, params=params) - rt_mod = 
tvm.contrib.graph_runtime.create(json, lib, ctx) + rt_mod = tvm.contrib.graph_executor.create(json, lib, device) for name, data in map_inputs.items(): rt_mod.set_input(name, data) rt_mod.set_input(**param) rt_mod.run() - out = tvm.nd.empty(out_shape, ctx=ctx) + out = tvm.nd.empty(out_shape, device=device) out = rt_mod.get_output(0, out) ref_result = out.asnumpy() @@ -66,26 +66,26 @@ def check_vm_result(): exe = relay.vm.compile(mod, target=target, params=params) code, lib = exe.save() exe = runtime.vm.Executable.load_exec(code, lib) - vm = runtime.vm.VirtualMachine(exe, ctx) + vm = runtime.vm.VirtualMachine(exe, device) out = vm.run(**map_inputs) tvm.testing.assert_allclose(out.asnumpy(), ref_result, rtol=tol, atol=tol) - def check_graph_runtime_result(): + def check_graph_executor_result(): compile_engine.get().clear() with relay.build_config(opt_level=3): json, lib, param = relay.build(mod, target=target, params=params) - rt_mod = tvm.contrib.graph_runtime.create(json, lib, ctx) + rt_mod = tvm.contrib.graph_executor.create(json, lib, device) for name, data in map_inputs.items(): rt_mod.set_input(name, data) rt_mod.set_input(**param) rt_mod.run() - out = tvm.nd.empty(out_shape, ctx=ctx) + out = tvm.nd.empty(out_shape, device=device) out = rt_mod.get_output(0, out) tvm.testing.assert_allclose(out.asnumpy(), ref_result, rtol=tol, atol=tol) check_vm_result() - check_graph_runtime_result() + check_graph_executor_result() def test_conv2d(): @@ -636,8 +636,8 @@ def test_partial_constant(): data3 = np.random.uniform(0, 1, ishape).astype(dtype) params = { - "in_1": tvm.nd.array(data1, ctx=tvm.cpu(0)), - "in_3": tvm.nd.array(data3, ctx=tvm.cpu(0)), + "in_1": tvm.nd.array(data1, device=tvm.cpu(0)), + "in_3": tvm.nd.array(data3, device=tvm.cpu(0)), } ref_mod["main"] = bind_params_by_name(ref_mod["main"], params) diff --git a/tests/python/relay/test_op_fast_math.py b/tests/python/relay/test_op_fast_math.py index 1658658be279..7bcbc6839c4f 100644 --- a/tests/python/relay/test_op_fast_math.py +++ b/tests/python/relay/test_op_fast_math.py @@ -22,7 +22,7 @@ import tvm.relay as relay from tvm import topi from tvm import te -from tvm.contrib import graph_runtime +from tvm.contrib import graph_executor def test_fastmath(): @@ -42,10 +42,10 @@ def test_apply(relay_op, name, f_numpy, low, high, step, dtype="float32"): func_name = "fused_" + name assert lib.get_function(func_name) - ctx = tvm.cpu(0) - m = graph_runtime.create(graph, lib, ctx) + dev = tvm.cpu(0) + m = graph_executor.create(graph, lib, dev) # Set inputs - m.set_input("x", tvm.nd.array(a_np, ctx)) + m.set_input("x", tvm.nd.array(a_np, dev)) m.set_input(**params) # Execute m.run() diff --git a/tests/python/relay/test_op_grad_level1.py b/tests/python/relay/test_op_grad_level1.py index 0ac604c6bca1..6c6c727c788f 100644 --- a/tests/python/relay/test_op_grad_level1.py +++ b/tests/python/relay/test_op_grad_level1.py @@ -53,8 +53,8 @@ def check_single_op(opfunc, ref, dtype): fwd_func = run_infer_type(fwd_func) bwd_func = run_infer_type(gradient(fwd_func)) - for target, ctx in tvm.testing.enabled_targets(): - intrp = relay.create_executor(ctx=ctx, target=target) + for target, dev in tvm.testing.enabled_targets(): + intrp = relay.create_executor(device=dev, target=target) op_res, (op_grad, _) = intrp.evaluate(bwd_func)(data, grad_in) np.testing.assert_allclose(op_grad.asnumpy(), ref_grad, rtol=0.01) @@ -104,8 +104,8 @@ def check_binary_op(opfunc, ref, dtype): fwd_func = run_infer_type(fwd_func) bwd_func = run_infer_type(gradient(fwd_func)) - for target, 
ctx in tvm.testing.enabled_targets(): - intrp = relay.create_executor(ctx=ctx, target=target) + for target, dev in tvm.testing.enabled_targets(): + intrp = relay.create_executor(device=dev, target=target) op_res, (op_grad0, op_grad1) = intrp.evaluate(bwd_func)(x_data, y_data) np.testing.assert_allclose(op_grad0.asnumpy(), ref_grad0, rtol=0.01) np.testing.assert_allclose(op_grad1.asnumpy(), ref_grad1, rtol=0.01) diff --git a/tests/python/relay/test_op_grad_level2.py b/tests/python/relay/test_op_grad_level2.py index bcf75de7915b..b855065186c2 100644 --- a/tests/python/relay/test_op_grad_level2.py +++ b/tests/python/relay/test_op_grad_level2.py @@ -50,8 +50,8 @@ def verify_max_pool2d_grad(x_shape, pool_size, strides, padding, ceil_mode): ceil_mode=ceil_mode, ) - for target, ctx in tvm.testing.enabled_targets(): - intrp = relay.create_executor(ctx=ctx, target=target) + for target, dev in tvm.testing.enabled_targets(): + intrp = relay.create_executor(device=dev, target=target) op_res, (op_grad,) = intrp.evaluate(bwd_func)(data) np.testing.assert_allclose(op_grad.asnumpy(), ref_grad, rtol=0.01) @@ -99,8 +99,8 @@ def verify_avg_pool2d_grad( ceil_mode=ceil_mode, ) - for target, ctx in tvm.testing.enabled_targets(): - intrp = relay.create_executor(ctx=ctx, target=target) + for target, dev in tvm.testing.enabled_targets(): + intrp = relay.create_executor(device=dev, target=target) op_res, (op_grad,) = intrp.evaluate(bwd_func)(data) np.testing.assert_allclose(op_grad.asnumpy(), ref_grad, rtol=0.01) @@ -155,8 +155,8 @@ def verify_global_avg_pool2d_grad(x_shape): ceil_mode=False, ) - for target, ctx in tvm.testing.enabled_targets(): - intrp = relay.create_executor(ctx=ctx, target=target) + for target, dev in tvm.testing.enabled_targets(): + intrp = relay.create_executor(device=dev, target=target) op_res, (op_grad,) = intrp.evaluate(bwd_func)(data) np.testing.assert_allclose(op_grad.asnumpy(), ref_grad, rtol=0.01) diff --git a/tests/python/relay/test_op_grad_level3.py b/tests/python/relay/test_op_grad_level3.py index d43744b38e3e..e394eaa45a82 100644 --- a/tests/python/relay/test_op_grad_level3.py +++ b/tests/python/relay/test_op_grad_level3.py @@ -40,8 +40,8 @@ def test_clip(): fwd_func = run_infer_type(fwd_func) bwd_func = run_infer_type(gradient(fwd_func)) - for target, ctx in tvm.testing.enabled_targets(): - intrp = relay.create_executor(ctx=ctx, target=target) + for target, dev in tvm.testing.enabled_targets(): + intrp = relay.create_executor(device=dev, target=target) op_res, (op_grad,) = intrp.evaluate(bwd_func)(data) np.testing.assert_allclose(op_grad.asnumpy(), ref_grad, rtol=0.01) @@ -173,8 +173,8 @@ def test_zeros_ones_grad_dynamic(): fwd_func = relay.Function([shape_data], op(shape_data, dtype="float32")) bwd_func = run_infer_type(gradient(run_infer_type(fwd_func))) - for target, ctx in tvm.testing.enabled_targets(): - intrp = relay.create_executor(ctx=ctx, target=target) + for target, dev in tvm.testing.enabled_targets(): + intrp = relay.create_executor(device=dev, target=target) res, (grad,) = intrp.evaluate(bwd_func)(dyn_shape) tvm.testing.assert_allclose(res.asnumpy(), op_ref(dyn_shape, dtype="float32")) tvm.testing.assert_allclose(grad.asnumpy(), np.zeros((rank,), dtype="int32")) diff --git a/tests/python/relay/test_op_level1.py b/tests/python/relay/test_op_level1.py index dfd350486c3b..91b37135fbe1 100644 --- a/tests/python/relay/test_op_level1.py +++ b/tests/python/relay/test_op_level1.py @@ -61,7 +61,7 @@ def check_single_op(opfunc, ref, dtype): data = 
np.random.rand(*shape).astype(dtype) ref_res = ref(data) func = relay.Function([x], y) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): # use graph by execuor default for testing, as we need # create function explicitly to avoid constant-folding. if ( @@ -70,7 +70,7 @@ def check_single_op(opfunc, ref, dtype): and not have_fp16(tvm.gpu(0).compute_version) ): continue - intrp = relay.create_executor("graph", ctx=ctx, target=target) + intrp = relay.create_executor("graph", device=dev, target=target) op_res = intrp.evaluate(func)(data) np.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=0.01) @@ -123,7 +123,7 @@ def check_binary_op(opfunc, ref, dtype): ref_res = ref(x_data, y_data) func = relay.Function([x, y], z) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): # use graph by execuor default for testing, as we need # create function explicitly to avoid constant-folding. if ( @@ -132,7 +132,7 @@ def check_binary_op(opfunc, ref, dtype): and not have_fp16(tvm.gpu(0).compute_version) ): continue - intrp = relay.create_executor("graph", ctx=ctx, target=target) + intrp = relay.create_executor("graph", device=dev, target=target) op_res = intrp.evaluate(func)(x_data, y_data) np.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=0.01, atol=1e-3) @@ -154,7 +154,7 @@ def test_expand_dims(): def verify_expand_dims(dshape, dtype, oshape, axis, num_newaxis): x = relay.Var("x", relay.TensorType(dshape, dtype)) func = relay.Function([x], relay.expand_dims(x, axis, num_newaxis)) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): if ( dtype == "float16" and target == "cuda" @@ -163,7 +163,7 @@ def verify_expand_dims(dshape, dtype, oshape, axis, num_newaxis): continue data = np.random.uniform(size=dshape).astype(dtype) ref_res = data.reshape(oshape) - intrp = relay.create_executor("graph", ctx=ctx, target=target) + intrp = relay.create_executor("graph", device=dev, target=target) op_res = intrp.evaluate(func)(data) np.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=0.01) @@ -189,14 +189,14 @@ def test_bias_add(): x_data = np.random.uniform(size=xshape).astype(dtype) y_data = np.random.uniform(size=bshape).astype(dtype) ref_res = x_data + y_data.reshape((2, 1, 1)) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): if ( dtype == "float16" and target == "cuda" and not have_fp16(tvm.gpu(0).compute_version) ): continue - intrp = relay.create_executor("graph", ctx=ctx, target=target) + intrp = relay.create_executor("graph", device=dev, target=target) op_res = intrp.evaluate(func)(x_data, y_data) np.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=rtol) @@ -239,8 +239,8 @@ def test_softmax(): func = relay.Function([x], y) x_data = np.random.uniform(size=shape).astype(dtype) ref_res = tvm.topi.testing.softmax_python(x_data) - for target, ctx in tvm.testing.enabled_targets(): - intrp = relay.create_executor("graph", ctx=ctx, target=target) + for target, dev in tvm.testing.enabled_targets(): + intrp = relay.create_executor("graph", device=dev, target=target) op_res = intrp.evaluate(func)(x_data) np.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5) @@ -260,8 +260,8 @@ def test_log_softmax(): func = relay.Function([x], y) x_data = np.random.uniform(size=shape).astype(dtype) ref_res = tvm.topi.testing.log_softmax_python(x_data) - for target, ctx in 
tvm.testing.enabled_targets(): - intrp = relay.create_executor("graph", ctx=ctx, target=target) + for target, dev in tvm.testing.enabled_targets(): + intrp = relay.create_executor("graph", device=dev, target=target) op_res = intrp.evaluate(func)(x_data) np.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5) @@ -310,15 +310,15 @@ def test_concatenate(): t_data = np.random.uniform(size=()).astype(dtype) ref_res = np.concatenate((x_data, y_data), axis=1) + t_data - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): if ( dtype == "float16" and target == "cuda" and not have_fp16(tvm.gpu(0).compute_version) ): continue - intrp1 = relay.create_executor("graph", ctx=ctx, target=target) - intrp2 = relay.create_executor("debug", ctx=ctx, target=target) + intrp1 = relay.create_executor("graph", device=dev, target=target) + intrp2 = relay.create_executor("debug", device=dev, target=target) op_res1 = intrp1.evaluate(func)(x_data, y_data, t_data) tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=0.01) op_res2 = intrp2.evaluate(func)(x_data, y_data, t_data) @@ -339,9 +339,9 @@ def test_dropout(): x = relay.const(in_np) y = relay.nn.dropout(x, rate=0.5) func = relay.Function([], y) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for backend in ["debug", "graph"]: - intrp = relay.create_executor("debug", ctx=ctx, target=target) + intrp = relay.create_executor("debug", device=dev, target=target) op_res = intrp.evaluate(func)() tvm.testing.assert_allclose(op_res.asnumpy(), in_np, rtol=0.01) @@ -460,9 +460,9 @@ def test_dense(): w_data = np.random.rand(2, 5).astype(dtype) ref_res = np.dot(x_data, w_data.T) - for target, ctx in tvm.testing.enabled_targets(): - intrp1 = relay.create_executor("graph", ctx=ctx, target=target) - intrp2 = relay.create_executor("debug", ctx=ctx, target=target) + for target, dev in tvm.testing.enabled_targets(): + intrp1 = relay.create_executor("graph", device=dev, target=target) + intrp2 = relay.create_executor("debug", device=dev, target=target) op_res1 = intrp1.evaluate(func)(x_data, w_data) tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5) op_res2 = intrp2.evaluate(func)(x_data, w_data) diff --git a/tests/python/relay/test_op_level10.py b/tests/python/relay/test_op_level10.py index 3ec1a5bb6129..597a1c69e8ee 100644 --- a/tests/python/relay/test_op_level10.py +++ b/tests/python/relay/test_op_level10.py @@ -40,9 +40,9 @@ def test_checkpoint(): assert f.checked_type == f_checkpoint.checked_type inputs = [np.random.uniform() for _ in range(len(xs))] - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: - intrp = relay.create_executor(kind, ctx=ctx, target=target) + intrp = relay.create_executor(kind, device=dev, target=target) f_res = intrp.evaluate(f)(*inputs) f_checkpoint_res = intrp.evaluate(f_checkpoint)(*inputs) tvm.testing.assert_allclose(f_res.asnumpy(), f_checkpoint_res.asnumpy(), 0, 0) @@ -172,9 +172,9 @@ def test_collapse_sum_like(): x = np.random.uniform(size=shape).astype(dtype) y = np.random.uniform(size=shape_like).astype(dtype) ref_res = np.sum(x, 0) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: - intrp = relay.create_executor(kind, ctx=ctx, target=target) + intrp = relay.create_executor(kind, device=dev, target=target) op_res = intrp.evaluate(func)(x, y) 
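# Illustrative sketch (not part of this patch; _sketch_* is a placeholder
# name): the same rename reaches NDArray allocation, as in the
# test_backend_compile_engine.py hunk earlier in this patch: tvm.nd.array and
# tvm.nd.empty take device= instead of ctx=, with handles from tvm.device(...).
import numpy as np
import tvm

def _sketch_nd_device_kwarg():
    dev = tvm.device("llvm", 0)                                 # was tvm.context("llvm", 0)
    x = tvm.nd.array(np.ones(10, dtype="float32"), device=dev)  # was ctx=ctx
    y = tvm.nd.empty((10,), device=dev)                         # was ctx=ctx
    return x, y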
tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5) @@ -192,9 +192,9 @@ def test_collapse_sum_to(): func = relay.Function([x], z) x = np.random.uniform(size=shape).astype(dtype) ref_res = np.sum(x, 0) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: - intrp = relay.create_executor(kind, ctx=ctx, target=target) + intrp = relay.create_executor(kind, device=dev, target=target) op_res = intrp.evaluate(func)(x) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5) @@ -212,9 +212,9 @@ def test_broadcast_to(): func = relay.Function([x], z) x = np.random.uniform(size=shape).astype(dtype) ref_res = np.broadcast_to(x, shape_like) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: - intrp = relay.create_executor(kind, ctx=ctx, target=target) + intrp = relay.create_executor(kind, device=dev, target=target) op_res = intrp.evaluate(func)(x) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5) @@ -236,9 +236,9 @@ def test_broadcast_to_like(): y = np.random.uniform(size=shape_like).astype(dtype) ref_res = np.broadcast_to(x, shape_like) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: - intrp = relay.create_executor(kind, ctx=ctx, target=target) + intrp = relay.create_executor(kind, device=dev, target=target) op_res = intrp.evaluate(func)(x, y) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5) @@ -281,9 +281,9 @@ def verify_slice_like(data, slice_like, axes, output, dtype="float32"): y_data = np.random.uniform(size=slice_like).astype(dtype) ref_res = np_slice_like(x_data, y_data, axes) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: - intrp = relay.create_executor(kind, ctx=ctx, target=target) + intrp = relay.create_executor(kind, device=dev, target=target) op_res = intrp.evaluate(func)(x_data, y_data) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5) @@ -315,9 +315,9 @@ def verify_reverse_reshape(shape, newshape, oshape): func = relay.Function([x], z) x_data = np.random.uniform(low=-1, high=1, size=shape).astype("float32") ref_res = np.reshape(x_data, oshape) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: - intrp = relay.create_executor(kind, ctx=ctx, target=target) + intrp = relay.create_executor(kind, device=dev, target=target) op_res = intrp.evaluate(func)(x_data) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5) @@ -340,9 +340,9 @@ def verify_batch_matmul(x_shape, y_shape, out_shape, dtype="float32"): y_np = np.random.uniform(size=y_shape).astype(dtype) z_np = tvm.topi.testing.batch_matmul(x_np, y_np) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: - intrp = relay.create_executor(kind, ctx=ctx, target=target) + intrp = relay.create_executor(kind, device=dev, target=target) z = intrp.evaluate(func)(x_np, y_np) tvm.testing.assert_allclose(z.asnumpy(), z_np, rtol=1e-5) @@ -372,10 +372,10 @@ def verify_dynamic_batch_matmul(x_shape, y_shape, out_shape, dtype="float32"): y_np = np.random.uniform(size=y_shape).astype(dtype) z_np = tvm.topi.testing.batch_matmul(x_np, y_np) - for target, ctx in 
tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["vm", "debug"]: mod = tvm.ir.IRModule.from_expr(func) - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) z = intrp.evaluate()(x_np, y_np) tvm.testing.assert_allclose(z.asnumpy(), z_np, rtol=1e-5) @@ -396,11 +396,11 @@ def test_shape_of(): func = relay.Function([x], relay.op.shape_of(x)) func = run_infer_type(func) x_data = np.random.rand(*shape).astype("float32") - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): # Because using graph executor, this op will be optimized after # constant folding pass, here we only test with interpreter for kind in ["debug"]: - intrp = relay.create_executor(kind, ctx=ctx, target=target) + intrp = relay.create_executor(kind, device=dev, target=target) op_res = intrp.evaluate(func)(x_data) tvm.testing.assert_allclose(op_res.asnumpy(), np.array(shape).astype("int32")) @@ -414,9 +414,9 @@ def verify_ndarray_size(shape): x_data = np.random.uniform(size=shape).astype("float32") ref_res = np.size(x_data) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: - intrp = relay.create_executor(kind, ctx=ctx, target=target) + intrp = relay.create_executor(kind, device=dev, target=target) op_res = intrp.evaluate(func)(x_data) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res) @@ -433,8 +433,8 @@ def verify_adaptive_pool(dshape, out_size, pool_type, layout, dtype, opfunc): np_data = np.random.uniform(low=0, high=255, size=dshape).astype(dtype) np_out = tvm.topi.testing.adaptive_pool(np_data, out_size, pool_type, layout) - for target, ctx in tvm.testing.enabled_targets(): - intrp1 = relay.create_executor("graph", ctx=ctx, target=target) + for target, dev in tvm.testing.enabled_targets(): + intrp1 = relay.create_executor("graph", device=dev, target=target) relay_out = intrp1.evaluate(func)(np_data) tvm.testing.assert_allclose(relay_out.asnumpy(), np_out, rtol=1e-5, atol=1e-5) @@ -481,9 +481,9 @@ def _verify(data_shape, mask_value, axis, dtype, itype): valid_length_np = np.random.randint(0, max_length, size=nbatch).astype(itype) gt_out_np = tvm.topi.testing.sequence_mask(data_np, valid_length_np, mask_value, axis) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: - intrp = relay.create_executor(kind, ctx=ctx, target=target) + intrp = relay.create_executor(kind, device=dev, target=target) out_relay = intrp.evaluate(func)(data_np, valid_length_np) tvm.testing.assert_allclose(out_relay.asnumpy(), gt_out_np) @@ -521,9 +521,9 @@ def _verify(indices_shape, depth, on_value, off_value, axis, dtype): indices_np = np.random.randint(0, depth, size=indices_shape).astype("int32") out_np = tvm.topi.testing.one_hot(indices_np, on_value, off_value, depth, axis, dtype) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: - intrp = relay.create_executor(kind, ctx=ctx, target=target) + intrp = relay.create_executor(kind, device=dev, target=target) out_relay = intrp.evaluate(func)(indices_np) tvm.testing.assert_allclose(out_relay.asnumpy(), out_np) @@ -551,9 +551,9 @@ def _verify(input_shape, diagonal_shape, dtype, k=0, align="RIGHT_LEFT"): diagonal_np = np.random.randint(-100, 100, 
size=diagonal_shape).astype(dtype) out_np = tvm.topi.testing.matrix_set_diag(input_np, diagonal_np, k, align) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: - intrp = relay.create_executor(kind, ctx=ctx, target=target) + intrp = relay.create_executor(kind, device=dev, target=target) out_relay = intrp.evaluate(func)(input_np, diagonal_np) tvm.testing.assert_allclose(out_relay.asnumpy(), out_np) diff --git a/tests/python/relay/test_op_level2.py b/tests/python/relay/test_op_level2.py index 1a1f451f4c74..c5843758c3d2 100644 --- a/tests/python/relay/test_op_level2.py +++ b/tests/python/relay/test_op_level2.py @@ -96,11 +96,11 @@ def run_test_conv1d( data.astype(out_dtype), kernel.astype(out_dtype), 1, padding, dilation ) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): if target in except_targets: continue - ctx = tvm.context(target, 0) - intrp1 = relay.create_executor("graph", ctx=ctx, target=target) + dev = tvm.device(target, 0) + intrp1 = relay.create_executor("graph", device=dev, target=target) op_res1 = intrp1.evaluate(func)(data, kernel) tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5, atol=1e-5) @@ -224,11 +224,11 @@ def run_test_conv2d( else: ref_res = fref(data.astype(out_dtype), dkernel.astype(out_dtype)) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): if target in except_targets: continue - ctx = tvm.context(target, 0) - intrp1 = relay.create_executor("graph", ctx=ctx, target=target) + dev = tvm.device(target, 0) + intrp1 = relay.create_executor("graph", device=dev, target=target) op_res1 = intrp1.evaluate(func)(data, kernel) tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-4, atol=1e-4) @@ -399,13 +399,13 @@ def run_test_conv2d_cuda( ) with WinogradFallback(), tvm.transform.PassContext(opt_level=3): - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): if target != "cuda": continue - ctx = tvm.context(target, 0) + dev = tvm.device(target, 0) params = {"w": tvm.nd.array(kernel)} graph, lib, params = relay.build_module.build(mod, target=target, params=params) - module = tvm.contrib.graph_runtime.create(graph, lib, ctx) + module = tvm.contrib.graph_executor.create(graph, lib, dev) module.set_input("x", tvm.nd.array(data)) module.set_input(**params) module.run() @@ -510,12 +510,12 @@ def run_test_conv3d( else: ref_res = fref(data.astype(out_dtype), dkernel.astype(out_dtype)) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): if target in except_targets: continue - ctx = tvm.context(target, 0) + dev = tvm.device(target, 0) - intrp1 = relay.create_executor("graph", ctx=ctx, target=target) + intrp1 = relay.create_executor("graph", device=dev, target=target) op_res1 = intrp1.evaluate(func)(data, kernel) tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5, atol=1e-5) @@ -575,12 +575,12 @@ def run_test_conv3d( else: ref_res = fref(data.astype(out_dtype), dkernel.astype(out_dtype)) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): if target in except_targets: continue - ctx = tvm.context(target, 0) + dev = tvm.device(target, 0) - intrp1 = relay.create_executor("graph", ctx=ctx, target=target) + intrp1 = relay.create_executor("graph", device=dev, target=target) op_res1 = intrp1.evaluate(func)(data, 
kernel) tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5, atol=1e-5) @@ -662,13 +662,13 @@ def run_test_conv3d_cuda( ) with WinogradFallback(), tvm.transform.PassContext(opt_level=3): - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): if target != "cuda": continue - ctx = tvm.context(target, 0) + dev = tvm.device(target, 0) params = {"w": tvm.nd.array(kernel)} graph, lib, params = relay.build_module.build(mod, target=target, params=params) - module = tvm.contrib.graph_runtime.create(graph, lib, ctx) + module = tvm.contrib.graph_executor.create(graph, lib, dev) module.set_input("x", tvm.nd.array(data)) module.set_input(**params) module.run() @@ -762,8 +762,8 @@ def test_conv3d_transpose_ncdhw_run(): kernel = np.random.uniform(size=kshape).astype(dtype) ref_res = tvm.topi.testing.conv3d_transpose_ncdhw_python(data, kernel, 1, 1, 0) - for target, ctx in tvm.testing.enabled_targets(): - intrp1 = relay.create_executor("graph", ctx=ctx, target=target) + for target, dev in tvm.testing.enabled_targets(): + intrp1 = relay.create_executor("graph", device=dev, target=target) op_res1 = intrp1.evaluate(func)(data, kernel) tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5, atol=1e-5) @@ -805,8 +805,8 @@ def test_conv2d_transpose_nchw_run(): kernel = np.random.uniform(size=kshape).astype(dtype) ref_res = tvm.topi.testing.conv2d_transpose_nchw_python(data, kernel, 2, 1, (1, 1)) - for target, ctx in tvm.testing.enabled_targets(): - intrp1 = relay.create_executor("graph", ctx=ctx, target=target) + for target, dev in tvm.testing.enabled_targets(): + intrp1 = relay.create_executor("graph", device=dev, target=target) op_res1 = intrp1.evaluate(func)(data, kernel) tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5, atol=1e-5) @@ -841,8 +841,8 @@ def test_conv2d_transpose_nhwc_run(): data, kernel, "HWOI", 2, 1, output_padding=(1, 1) ) - for target, ctx in tvm.testing.enabled_targets(): - intrp1 = relay.create_executor("graph", ctx=ctx, target=target) + for target, dev in tvm.testing.enabled_targets(): + intrp1 = relay.create_executor("graph", device=dev, target=target) op_res1 = intrp1.evaluate(func)(data, kernel) tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5, atol=1e-5) @@ -863,8 +863,8 @@ def test_conv1d_transpose_ncw_run(): kernel = np.random.uniform(size=kshape).astype(dtype) ref_res = tvm.topi.testing.conv1d_transpose_ncw_python(data, kernel, 2, 1, output_padding=(1,)) - for target, ctx in tvm.testing.enabled_targets(): - intrp1 = relay.create_executor("graph", ctx=ctx, target=target) + for target, dev in tvm.testing.enabled_targets(): + intrp1 = relay.create_executor("graph", device=dev, target=target) op_res1 = intrp1.evaluate(func)(data, kernel) tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5, atol=1e-5) @@ -943,8 +943,8 @@ def _test_pool2d(opfunc, reffunc, pool_size=(2, 2), strides=(2, 2), padding=(0, func = relay.Function([x], y) data = np.random.uniform(size=dshape).astype(dtype) ref_res = reffunc(data.reshape(1, 3, 14, 2, 14, 2), axis=(3, 5)) - for target, ctx in tvm.testing.enabled_targets(): - intrp1 = relay.create_executor("graph", ctx=ctx, target=target) + for target, dev in tvm.testing.enabled_targets(): + intrp1 = relay.create_executor("graph", device=dev, target=target) op_res1 = intrp1.evaluate(func)(data) tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5, atol=1e-5) @@ -965,8 +965,8 @@ def _test_pool2d_int(opfunc, reffunc, dtype): func = 
relay.Function([x], y) data = np.random.randint(low=-128, high=128, size=dshape) ref_res = reffunc(data.reshape(1, 3, 14, 2, 14, 2), axis=(3, 5)).astype(dtype) - for target, ctx in tvm.testing.enabled_targets(): - intrp1 = relay.create_executor("graph", ctx=ctx, target=target) + for target, dev in tvm.testing.enabled_targets(): + intrp1 = relay.create_executor("graph", device=dev, target=target) op_res1 = intrp1.evaluate(func)(data) tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5, atol=1e-5) @@ -991,8 +991,8 @@ def _test_global_pool2d(opfunc, reffunc): func = relay.Function([x], y) data = np.random.uniform(size=dshape).astype(dtype) ref_res = reffunc(data, axis=(2, 3), keepdims=True) - for target, ctx in tvm.testing.enabled_targets(): - intrp1 = relay.create_executor("graph", ctx=ctx, target=target) + for target, dev in tvm.testing.enabled_targets(): + intrp1 = relay.create_executor("graph", device=dev, target=target) op_res1 = intrp1.evaluate(func)(data) tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5, atol=1e-5) @@ -1029,8 +1029,8 @@ def _test_pool1d(opfunc, pool_size=(2,), strides=(2,), padding=(0, 0), dtype="fl ref_res = tvm.topi.testing.pool1d_ncw_python( data, (2,), (2,), (0, 0), (1, 3, 16), pool_type, False ) - for target, ctx in tvm.testing.enabled_targets(): - intrp1 = relay.create_executor("graph", ctx=ctx, target=target) + for target, dev in tvm.testing.enabled_targets(): + intrp1 = relay.create_executor("graph", device=dev, target=target) op_res1 = intrp1.evaluate(func)(data) tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5, atol=1e-5) @@ -1075,8 +1075,8 @@ def _test_pool3d( ref_res = tvm.topi.testing.pool3d_ncdhw_python( data, pool_size, strides, padding, out_shape, pool_type, False ) - for target, ctx in tvm.testing.enabled_targets(): - intrp1 = relay.create_executor("graph", ctx=ctx, target=target) + for target, dev in tvm.testing.enabled_targets(): + intrp1 = relay.create_executor("graph", device=dev, target=target) op_res1 = intrp1.evaluate(func)(data) tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5, atol=1e-5) @@ -1125,8 +1125,8 @@ def test_avg_pool2d_no_count_pad(): ref_res = np.maximum(b_np, 0.0) data = a_np - for target, ctx in tvm.testing.enabled_targets(): - intrp1 = relay.create_executor("graph", ctx=ctx, target=target) + for target, dev in tvm.testing.enabled_targets(): + intrp1 = relay.create_executor("graph", device=dev, target=target) op_res1 = intrp1.evaluate(func)(data) tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5, atol=1e-5) @@ -1160,9 +1160,9 @@ def test_flatten_infer_type(): x_data = np.random.uniform(low=-1, high=1, size=shape).astype(dtype) ref_res = x_data.flatten().reshape(o_shape) - for target, ctx in tvm.testing.enabled_targets(): - intrp1 = relay.create_executor("graph", ctx=ctx, target=target) - intrp2 = relay.create_executor("debug", ctx=ctx, target=target) + for target, dev in tvm.testing.enabled_targets(): + intrp1 = relay.create_executor("graph", device=dev, target=target) + intrp2 = relay.create_executor("debug", device=dev, target=target) op_res1 = intrp1.evaluate(func)(x_data) tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5) op_res2 = intrp2.evaluate(func)(x_data) @@ -1223,8 +1223,8 @@ def _test_run(dtype): mod_pad.append((pad_x, pad_y)) ref_res = np.pad(mod_data, tuple(mod_pad), "constant") - for target, ctx in tvm.testing.enabled_targets(): - intrp1 = relay.create_executor("graph", ctx=ctx, target=target) + for target, dev in 
tvm.testing.enabled_targets(): + intrp1 = relay.create_executor("graph", device=dev, target=target) op_res1 = intrp1.evaluate(func)(data) tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5, atol=1e-5) @@ -1256,9 +1256,9 @@ def test_lrn(): x_data = np.random.uniform(low=-1, high=1, size=shape).astype(dtype) ref_res = tvm.topi.testing.lrn_python(x_data, size, axis, bias, alpha, beta) - for target, ctx in tvm.testing.enabled_targets(): - intrp1 = relay.create_executor("graph", ctx=ctx, target=target) - intrp2 = relay.create_executor("debug", ctx=ctx, target=target) + for target, dev in tvm.testing.enabled_targets(): + intrp1 = relay.create_executor("graph", device=dev, target=target) + intrp2 = relay.create_executor("debug", device=dev, target=target) op_res1 = intrp1.evaluate(func)(x_data) tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5) op_res2 = intrp2.evaluate(func)(x_data) @@ -1286,9 +1286,9 @@ def test_l2_normalize(): x_data = np.random.uniform(low=-1, high=1, size=shape).astype(dtype) ref_res = tvm.topi.testing.l2_normalize_python(x_data, eps, axis) - for target, ctx in tvm.testing.enabled_targets(): - intrp1 = relay.create_executor("graph", ctx=ctx, target=target) - intrp2 = relay.create_executor("debug", ctx=ctx, target=target) + for target, dev in tvm.testing.enabled_targets(): + intrp1 = relay.create_executor("graph", device=dev, target=target) + intrp2 = relay.create_executor("debug", device=dev, target=target) op_res1 = intrp1.evaluate(func)(x_data) tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5) op_res2 = intrp2.evaluate(func)(x_data) @@ -1311,8 +1311,8 @@ def test_batch_flatten(): data = np.random.rand(5, 10, 5).astype(t1.dtype) ref_res = batch_flatten(data) - for target, ctx in tvm.testing.enabled_targets(): - intrp = relay.create_executor("graph", ctx=ctx, target=target) + for target, dev in tvm.testing.enabled_targets(): + intrp = relay.create_executor("graph", device=dev, target=target) op_res = intrp.evaluate(func)(data) np.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=0.01) @@ -1359,8 +1359,8 @@ def get_shape(): ref = tvm.topi.testing.bilinear_resize_python( data, (int(round(h * scale_h)), int(round(w * scale_w))), layout ) - for target, ctx in tvm.testing.enabled_targets(): - executor = relay.create_executor("graph", ctx=ctx, target=target) + for target, dev in tvm.testing.enabled_targets(): + executor = relay.create_executor("graph", device=dev, target=target) out = executor.evaluate(func)(data) tvm.testing.assert_allclose(out.asnumpy(), ref, rtol=1e-5, atol=1e-5) @@ -1431,8 +1431,8 @@ def get_shape(): (int(round(d * scale_d)), int(round(h * scale_h)), int(round(w * scale_w))), layout, ) - for target, ctx in tvm.testing.enabled_targets(): - executor = relay.create_executor("graph", ctx=ctx, target=target) + for target, dev in tvm.testing.enabled_targets(): + executor = relay.create_executor("graph", device=dev, target=target) out = executor.evaluate(func)(data) tvm.testing.assert_allclose(out.asnumpy(), ref, rtol=1e-5, atol=1e-5) @@ -1698,8 +1698,8 @@ def _test_correlation( is_multiply, ) - for target, ctx in tvm.testing.enabled_targets(): - intrp1 = relay.create_executor("graph", ctx=ctx, target=target) + for target, dev in tvm.testing.enabled_targets(): + intrp1 = relay.create_executor("graph", device=dev, target=target) op_res1 = intrp1.evaluate(func)(data1_np, data2_np) tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5, atol=1e-5) diff --git a/tests/python/relay/test_op_level3.py 
b/tests/python/relay/test_op_level3.py index d2a5090943c3..bf0a7e4952e5 100644 --- a/tests/python/relay/test_op_level3.py +++ b/tests/python/relay/test_op_level3.py @@ -16,16 +16,16 @@ # under the License. """ Support level3 operator test cases. """ +from typing import Callable, Optional + import numpy as np import pytest import tvm -from tvm import te -from tvm import relay +import tvm.testing +from tvm import relay, te from tvm.error import TVMError from tvm.relay import create_executor, transform from tvm.relay.testing import check_grad, run_infer_type -from typing import Optional -import tvm.testing def test_zeros_ones(): @@ -217,9 +217,9 @@ def verify_transpose(dshape, axes): x_data = np.random.uniform(low=-1, high=1, size=dshape).astype("float32") ref_res = np.transpose(x_data, axes=axes) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: - intrp = relay.create_executor(kind, ctx=ctx, target=target) + intrp = relay.create_executor(kind, device=dev, target=target) op_res = intrp.evaluate(func)(x_data) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5) @@ -272,9 +272,9 @@ def verify_reshape(shape, newshape, oshape): check_grad(func) x_data = np.random.uniform(low=-1, high=1, size=shape).astype("float32") ref_res = np.reshape(x_data, oshape) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: - intrp = relay.create_executor(kind, ctx=ctx, target=target) + intrp = relay.create_executor(kind, device=dev, target=target) op_res = intrp.evaluate(func)(x_data) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5) @@ -361,9 +361,9 @@ def verify_reshape_like(shape, oshape, shape_like=None, reshape_like_kwargs={}): func = relay.Function([x, y], z) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: - intrp = relay.create_executor(kind, ctx=ctx, target=target) + intrp = relay.create_executor(kind, device=dev, target=target) op_res = intrp.evaluate(func)(x_data, y_data) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5) @@ -407,9 +407,9 @@ def verify_take(src_shape, indices_src, axis=None, mode="clip"): np_mode = "raise" if mode == "fast" else mode ref_res = np.take(x_data, indices=indices_src, axis=axis, mode=np_mode) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: - intrp = relay.create_executor(kind, ctx=ctx, target=target) + intrp = relay.create_executor(kind, device=dev, target=target) op_res = intrp.evaluate(func)(x_data, indices_src) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5) @@ -542,9 +542,9 @@ def verify_full(fill_value, src_shape, dtype): z = relay.full(x, src_shape, dtype) func = relay.Function([x], z) ref_res = np.full(src_shape, fill_value) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: - intrp = relay.create_executor(kind, ctx=ctx, target=target) + intrp = relay.create_executor(kind, device=dev, target=target) op_res = intrp.evaluate(func)(np.array(fill_value, dtype)) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5) @@ -581,9 +581,9 @@ def verify_full_like(base, fill_value, dtype): func = relay.Function([x, y], z) ref_res = np.full_like(x_data, fill_value) - for target, ctx in 
tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: - intrp = relay.create_executor(kind, ctx=ctx, target=target) + intrp = relay.create_executor(kind, device=dev, target=target) op_res = intrp.evaluate(func)(x_data, np.array(fill_value, dtype)) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5) @@ -611,9 +611,9 @@ def test_infer_type_leaky_relu(): x_data = np.random.uniform(low=-1, high=1, size=shape).astype(dtype) ref_res = np.where(x_data > 0, x_data, x_data * 0.1) - for target, ctx in tvm.testing.enabled_targets(): - intrp1 = relay.create_executor("graph", ctx=ctx, target=target) - intrp2 = relay.create_executor("debug", ctx=ctx, target=target) + for target, dev in tvm.testing.enabled_targets(): + intrp1 = relay.create_executor("graph", device=dev, target=target) + intrp2 = relay.create_executor("debug", device=dev, target=target) op_res1 = intrp1.evaluate(func)(x_data) tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5) op_res2 = intrp2.evaluate(func)(x_data) @@ -648,9 +648,9 @@ def verify_infer_type_prelu(data, alpha, axis, output, dtype="float32"): else: ref_res = (x_data < 0) * (x_data * a_data.reshape(1, 1, 3)) + (x_data >= 0) * x_data - for target, ctx in tvm.testing.enabled_targets(): - intrp1 = relay.create_executor("graph", ctx=ctx, target=target) - intrp2 = relay.create_executor("debug", ctx=ctx, target=target) + for target, dev in tvm.testing.enabled_targets(): + intrp1 = relay.create_executor("graph", device=dev, target=target) + intrp2 = relay.create_executor("debug", device=dev, target=target) op_res1 = intrp1.evaluate(func)(x_data, a_data) tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5) op_res2 = intrp2.evaluate(func)(x_data, a_data) @@ -692,9 +692,9 @@ def verify_arange(start, stop, step): ref_res = np.arange(start, stop, step).astype(dtype) func = relay.Function([], x) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: - intrp = relay.create_executor(kind, ctx=ctx, target=target) + intrp = relay.create_executor(kind, device=dev, target=target) op_res = intrp.evaluate(func)() tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5) @@ -731,9 +731,9 @@ def verify_meshgrid(lengths, indexing="ij"): # Get ref ref_res = np.meshgrid(*input_data, indexing=indexing) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: - intrp = relay.create_executor(kind, ctx=ctx, target=target) + intrp = relay.create_executor(kind, device=dev, target=target) op_res = intrp.evaluate(func)(*input_data) assert len(op_res) == len(ref_res) for i in range(len(op_res)): @@ -757,9 +757,9 @@ def verify_tile(dshape, reps): x_data = np.random.uniform(low=-1, high=1, size=dshape).astype("float32") ref_res = np.tile(x_data, reps=reps) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: - intrp = relay.create_executor(kind, ctx=ctx, target=target) + intrp = relay.create_executor(kind, device=dev, target=target) op_res = intrp.evaluate(func)(x_data) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5) @@ -775,9 +775,9 @@ def verify_repeat(dshape, repeats, axis): func = relay.Function([x], relay.repeat(x, repeats, axis)) data = np.random.uniform(size=dshape).astype("float32") ref_res = np.repeat(data, repeats, axis) - for target, 
ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: - intrp = relay.create_executor(kind, ctx=ctx, target=target) + intrp = relay.create_executor(kind, device=dev, target=target) op_res = intrp.evaluate(func)(data) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5) @@ -800,9 +800,9 @@ def verify_stack(input_expr, relay_args, ref_res, axis): inp_vars = relay.analysis.free_vars(z) func = relay.Function(inp_vars, z) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: - intrp = relay.create_executor(kind, ctx=ctx, target=target) + intrp = relay.create_executor(kind, device=dev, target=target) op_res = intrp.evaluate(func)(*relay_args) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5) @@ -852,9 +852,9 @@ def verify_reverse(dshape, axis): func = relay.Function([x], z) x_data = np.random.uniform(low=-1, high=1, size=dshape).astype("float32") ref_res = np.flip(x_data, axis) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: - intrp = relay.create_executor(kind, ctx=ctx, target=target) + intrp = relay.create_executor(kind, device=dev, target=target) op_res = intrp.evaluate(func)(x_data) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5) @@ -873,9 +873,9 @@ def verify_reverse_sequence(x_data, seq_lengths, batch_axis, seq_axis, ref_res): assert zz.checked_type == x.type_annotation func = relay.Function([x], z) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: - intrp = relay.create_executor(kind, ctx=ctx, target=target) + intrp = relay.create_executor(kind, device=dev, target=target) op_res = intrp.evaluate(func)(x_data) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5) @@ -967,9 +967,9 @@ def verify_scatter(dshape, ishape, axis=0): ref_res = ref_scatter(data_np, indices_np, updates_np, axis) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: - intrp = relay.create_executor(kind, ctx=ctx, target=target) + intrp = relay.create_executor(kind, device=dev, target=target) op_res = intrp.evaluate(func)(data_np, indices_np, updates_np) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5) @@ -987,10 +987,10 @@ def verify_dynamic_scatter(dshape, ishape, axis=0): ref_res = ref_scatter(data_np, indices_np, updates_np, axis) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["vm", "debug"]: mod = tvm.ir.IRModule.from_expr(func) - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()(data_np, indices_np, updates_np) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5) @@ -1241,9 +1241,9 @@ def verify_gather(data, axis, indices, ref_res): func = relay.Function([d, i], z) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: - intrp = relay.create_executor(kind, ctx=ctx, target=target) + intrp = relay.create_executor(kind, device=dev, target=target) op_res = intrp.evaluate(func)(data, indices) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5) @@ -1261,9 
+1261,9 @@ def verify_gather_nd(xshape, yshape, y_data): x_data = np.random.uniform(size=xshape).astype("float32") ref_res = x_data[tuple(y_data)] - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: - intrp = relay.create_executor(kind, ctx=ctx, target=target) + intrp = relay.create_executor(kind, device=dev, target=target) op_res = intrp.evaluate(func)(x_data, y_data) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5) @@ -1321,9 +1321,9 @@ def verify_unravel_index(indices, shape, dtype): func = relay.Function([x, y], z) ref_res = np.unravel_index(x_data, y_data) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: - intrp = relay.create_executor(kind, ctx=ctx, target=target) + intrp = relay.create_executor(kind, device=dev, target=target) op_res = intrp.evaluate(func)(x_data, y_data) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5) @@ -1367,9 +1367,9 @@ def verify_sparse_to_dense(sparse_indices, sparse_values, default_value, output_ assert zz.checked_type == relay.ty.TensorType(output_shape, str(sparse_values_data.dtype)) func = relay.Function(args, d) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: - intrp = relay.create_executor(kind, ctx=ctx, target=target) + intrp = relay.create_executor(kind, device=dev, target=target) if default_value is None: op_res = intrp.evaluate(func)(sparse_indices_data, sparse_values_data) else: @@ -1707,12 +1707,12 @@ def verify_segment_sum( verify_segment_sum(data_np, segment_ids_np, num_segments) -def verify_func(func, data, ref_res, target_ctx=tvm.testing.enabled_targets()): +def verify_func(func, data, ref_res, target_device=tvm.testing.enabled_targets()): assert isinstance(data, list) - for target, ctx in target_ctx: + for target, dev in target_device: for kind in ["vm"]: mod = tvm.ir.IRModule.from_expr(func) - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()(*data) if isinstance(op_res, tvm.runtime.container.ADT): assert len(op_res) == len( @@ -1742,9 +1742,9 @@ def verify_adv_index(data_shape, index_shapes): out = relay.op.adv_index(inputs) func = relay.Function(inputs, out) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: - intrp = relay.create_executor(kind, ctx=ctx, target=target) + intrp = relay.create_executor(kind, device=dev, target=target) op_res = intrp.evaluate(func)(*np_args) tvm.testing.assert_allclose(op_res.asnumpy(), np_out, rtol=1e-5) @@ -1758,42 +1758,81 @@ def verify_adv_index(data_shape, index_shapes): verify_adv_index((10, 5, 15), [(1, 2, 1), (1, 2, 7)]) -@tvm.testing.parametrize_targets -def test_cumsum(target, ctx): - def verify_cumsum(data_np, np_out, axis=None, out_dtype=None, rtol=1e-5, atol=1e-5): +# Helper for testing binop functions +scanops_supported = {"cumsum": relay.op.cumsum, "cumprod": relay.op.cumprod} + + +def run_binop_tests( + target, dev, binop_type: str, gt_func: Callable[..., np.array], identity_value: int +): + def assert_relay_scanop( + data_np: np.array, + np_out: np.array, + axis: int = None, + out_dtype: str = None, + rtol: float = 1e-5, + atol: float = 1e-5, + exclusive: bool = False, + ): inp = relay.var("data", 
relay.TensorType(data_np.shape, str(data_np.dtype))) - out = relay.op.cumsum(inp, axis, out_dtype) + if binop_type not in scanops_supported.keys(): + raise ValueError(f"Unknown function {binop_type}. Options: {scanops_supported.keys()}") + out = scanops_supported[binop_type](inp, axis, out_dtype, exclusive=exclusive) func = relay.Function([inp], out) for kind in ["graph", "debug"]: - intrp = relay.create_executor(kind, ctx=ctx, target=target) + intrp = relay.create_executor(kind, device=dev, target=target) op_res = intrp.evaluate(func)(data_np) tvm.testing.assert_allclose(op_res.asnumpy(), np_out, rtol=rtol, atol=atol) data = np.array([2, 3, 0]) - verify_cumsum(data, np.cumsum(data)) - verify_cumsum(data, np.cumsum(data), out_dtype="int64") + assert_relay_scanop(data, gt_func(data)) + assert_relay_scanop(data, gt_func(data), out_dtype="int64") data = np.random.randn(10, 10) - verify_cumsum(data, np.cumsum(data)) - verify_cumsum(data, np.cumsum(data, axis=0), axis=0) - verify_cumsum(data, np.cumsum(data, axis=1), axis=1) + assert_relay_scanop(data, gt_func(data)) + assert_relay_scanop(data, gt_func(data, axis=0), axis=0) + assert_relay_scanop(data, gt_func(data, axis=1), axis=1) data = np.random.randn(10, 5, 10).astype("float32") - verify_cumsum(data, np.cumsum(data), rtol=1e-4, atol=1e-4) - verify_cumsum(data, np.cumsum(data, axis=0), axis=0, rtol=1e-4, atol=1e-4) - verify_cumsum(data, np.cumsum(data, axis=1), axis=1, rtol=1e-4, atol=1e-4) - verify_cumsum(data, np.cumsum(data, axis=-1), axis=-1, rtol=1e-4, atol=1e-4) + assert_relay_scanop(data, gt_func(data), rtol=1e-4, atol=1e-4) + assert_relay_scanop(data, gt_func(data, axis=0), axis=0, rtol=1e-4, atol=1e-4) + assert_relay_scanop(data, gt_func(data, axis=1), axis=1, rtol=1e-4, atol=1e-4) + assert_relay_scanop(data, gt_func(data, axis=-1), axis=-1, rtol=1e-4, atol=1e-4) data = np.random.rand(10) > 0.5 data = data.astype(np.int32) - verify_cumsum(data, np.cumsum(data, dtype=np.int32)) - verify_cumsum(data, np.cumsum(data, dtype="int64"), out_dtype="int64") + assert_relay_scanop(data, gt_func(data, dtype=np.int32)) + assert_relay_scanop(data, gt_func(data, dtype="int64"), out_dtype="int64") + + # Test exclusivity operations + data = np.random.randint(-100, 100, size=(10, 10)).astype("int64") + expected_result = np.roll(gt_func(data), 1) + expected_result[0] = identity_value + assert_relay_scanop(data, expected_result, exclusive=True) + + expected_result = np.roll(gt_func(data, axis=0), 1, axis=0) + expected_result[0, :] = identity_value + assert_relay_scanop(data, expected_result, exclusive=True, axis=0) + + expected_result = np.roll(gt_func(data, axis=1), 1, axis=1) + expected_result[:, 0] = identity_value + assert_relay_scanop(data, expected_result, exclusive=True, axis=1) + + +@tvm.testing.parametrize_targets +def test_cumsum(target, dev): + run_binop_tests(target, dev, binop_type="cumsum", gt_func=np.cumsum, identity_value=0) + + +@tvm.testing.parametrize_targets +def test_cumprod(target, dev): + run_binop_tests(target, dev, binop_type="cumprod", gt_func=np.cumprod, identity_value=1) @tvm.testing.parametrize_targets -def test_scatter_nd(target, ctx): +def test_scatter_nd(target, dev): def verify_scatter_nd(data_np, indices_np, shape, ref_res, rtol=1e-5, atol=1e-5): data = relay.var("data", shape=data_np.shape, dtype=str(data_np.dtype)) indices = relay.var("indices", shape=indices_np.shape, dtype=str(indices_np.dtype)) @@ -1802,7 +1841,7 @@ def verify_scatter_nd(data_np, indices_np, shape, ref_res, rtol=1e-5, atol=1e-5) func = 
relay.Function([data, indices], out) for kind in ["graph", "debug"]: - intrp = relay.create_executor(kind, ctx=ctx, target=target) + intrp = relay.create_executor(kind, device=dev, target=target) op_res = intrp.evaluate(func)(data_np, indices_np) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=rtol, atol=atol) @@ -1829,7 +1868,7 @@ def verify_scatter_nd_with_stack(data_np, indices_np, shape, ref_res, rtol=1e-5, for a in indices_np: fargs.append(a) for kind in ["graph", "debug"]: - intrp = relay.create_executor(kind, ctx=ctx, target=target) + intrp = relay.create_executor(kind, device=dev, target=target) op_res = intrp.evaluate(func)(*fargs) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=rtol, atol=atol) @@ -1897,10 +1936,10 @@ def verify_unique(n, dtype, is_dyn=False, is_sorted=False, return_counts=False): else: backends = ["graph", "debug"] - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in backends: mod = tvm.ir.IRModule.from_expr(func) - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) tvm_res = intrp.evaluate()(x_data) np_res = calc_numpy_unique(x_data, is_sorted) num_unique = np_res[3][0] diff --git a/tests/python/relay/test_op_level4.py b/tests/python/relay/test_op_level4.py index 114783e55f20..036d4a0f6044 100644 --- a/tests/python/relay/test_op_level4.py +++ b/tests/python/relay/test_op_level4.py @@ -49,8 +49,8 @@ def check_binary_op(opfunc, ref): ref_res = ref(x_data, y_data) func = relay.Function([x, y], z) - for target, ctx in tvm.testing.enabled_targets(): - intrp = relay.create_executor("graph", ctx=ctx, target=target) + for target, dev in tvm.testing.enabled_targets(): + intrp = relay.create_executor("graph", device=dev, target=target) op_res = intrp.evaluate(func)(x_data, y_data) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res) @@ -87,8 +87,8 @@ def test_cmp_type(): ref_res = ref(x_data, y_data) func = relay.Function([x, y], z) - for target, ctx in tvm.testing.enabled_targets(): - intrp = relay.create_executor("graph", ctx=ctx, target=target) + for target, dev in tvm.testing.enabled_targets(): + intrp = relay.create_executor("graph", device=dev, target=target) op_res = intrp.evaluate(func)(x_data, y_data) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res) @@ -112,8 +112,8 @@ def test_binary_int_broadcast_1(): func = relay.Function([x, y], z) ref_res = ref(x_data, y_data) - for target, ctx in tvm.testing.enabled_targets(): - intrp = relay.create_executor("graph", ctx=ctx, target=target) + for target, dev in tvm.testing.enabled_targets(): + intrp = relay.create_executor("graph", device=dev, target=target) op_res = intrp.evaluate(func)(x_data, y_data) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res) @@ -137,8 +137,8 @@ def test_binary_int_broadcast_2(): func = relay.Function([x, y], z) ref_res = ref(x_data, y_data) - for target, ctx in tvm.testing.enabled_targets(): - intrp = relay.create_executor("graph", ctx=ctx, target=target) + for target, dev in tvm.testing.enabled_targets(): + intrp = relay.create_executor("graph", device=dev, target=target) op_res = intrp.evaluate(func)(x_data, y_data) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res) @@ -146,9 +146,9 @@ def test_binary_int_broadcast_2(): @tvm.testing.uses_gpu def test_where(): def run(func, inputs, ref_res): - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind 
in ["graph", "debug"]: - intrp = relay.create_executor(kind, ctx=ctx, target=target) + intrp = relay.create_executor(kind, device=dev, target=target) op_res = intrp.evaluate(func)(*inputs) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5) @@ -257,9 +257,9 @@ def verify_reduce(funcs, data, axis, keepdims, exclude, output, dtype="float32") return ref_res = ref_func(x_data + 0, axis=axis, keepdims=keepdims) - for target, ctx in tvm.testing.enabled_targets(): - intrp1 = relay.create_executor("graph", ctx=ctx, target=target) - intrp2 = relay.create_executor("debug", ctx=ctx, target=target) + for target, dev in tvm.testing.enabled_targets(): + intrp1 = relay.create_executor("graph", device=dev, target=target) + intrp2 = relay.create_executor("debug", device=dev, target=target) op_res1 = intrp1.evaluate(func)(x_data) tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5) op_res2 = intrp2.evaluate(func)(x_data) @@ -351,9 +351,9 @@ def verify_mean_var_std(funcs, shape, axis, keepdims): ref_mean = np.mean(x_data, axis=axis, dtype=dtype, keepdims=keepdims) ref_res = ref_func(x_data, axis=axis, dtype=dtype, keepdims=keepdims) - for target, ctx in tvm.testing.enabled_targets(): - intrp1 = relay.create_executor("graph", ctx=ctx, target=target) - intrp2 = relay.create_executor("debug", ctx=ctx, target=target) + for target, dev in tvm.testing.enabled_targets(): + intrp1 = relay.create_executor("graph", device=dev, target=target) + intrp2 = relay.create_executor("debug", device=dev, target=target) op_res1 = intrp1.evaluate(func)(x_data) tvm.testing.assert_allclose(op_res1[0].asnumpy(), ref_mean, rtol=1e-5) tvm.testing.assert_allclose(op_res1[1].asnumpy(), ref_res, rtol=1e-5) @@ -405,8 +405,8 @@ def verify(dshape, begin, end, strides, output, slice_mode="end", test_ref=True, if not test_ref: return - for target, ctx in tvm.testing.enabled_targets(): - intrp = relay.create_executor("graph", ctx=ctx, target=target) + for target, dev in tvm.testing.enabled_targets(): + intrp = relay.create_executor("graph", device=dev, target=target) op_res = intrp.evaluate(func)(x_data) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res) @@ -461,9 +461,9 @@ def verify(dshape, begin, end, strides, output, slice_mode="end", test_ref=True, if not test_ref: return - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): mod = tvm.ir.IRModule.from_expr(func) - intrp = relay.create_executor("vm", mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor("vm", mod=mod, device=dev, target=target) op_res = intrp.evaluate()(x_data) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res) @@ -513,8 +513,8 @@ def verify(dshape, begin, end, strides, vshape, test_ref=True): x_data = np.random.uniform(size=dshape).astype("float32") v_data = np.random.uniform(size=vshape).astype("float32") ref_res = tvm.topi.testing.strided_set_python(x_data, v_data, begin, end, strides) - for target, ctx in tvm.testing.enabled_targets(): - intrp = relay.create_executor("graph", ctx=ctx, target=target) + for target, dev in tvm.testing.enabled_targets(): + intrp = relay.create_executor("graph", device=dev, target=target) op_res = intrp.evaluate(func)(x_data, v_data) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res) diff --git a/tests/python/relay/test_op_level5.py b/tests/python/relay/test_op_level5.py index 929764b6e40a..2d6c8b50fd37 100644 --- a/tests/python/relay/test_op_level5.py +++ b/tests/python/relay/test_op_level5.py @@ -63,9 +63,9 @@ def verify_resize(dshape, 
scale, method, layout, coord_trans): assert zz.checked_type == relay.TensorType(ref_res.shape, "float32") func = relay.Function([x], z) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: - intrp = relay.create_executor(kind, ctx=ctx, target=target) + intrp = relay.create_executor(kind, device=dev, target=target) op_res = intrp.evaluate(func)(x_data) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-3, atol=1e-4) @@ -104,7 +104,7 @@ def test_resize3d_infer_type(): @tvm.testing.parametrize_targets -def test_resize3d(target, ctx): +def test_resize3d(target, dev): def verify_resize(dshape, scale, method, layout): if layout == "NDHWC": size = (dshape[1] * scale, dshape[2] * scale, dshape[3] * scale) @@ -124,7 +124,7 @@ def verify_resize(dshape, scale, method, layout): func = relay.Function([x], z) for kind in ["graph", "debug"]: - intrp = relay.create_executor(kind, ctx=ctx, target=target) + intrp = relay.create_executor(kind, device=dev, target=target) op_res = intrp.evaluate(func)(x_data) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-4, atol=1e-6) @@ -156,9 +156,9 @@ def verify_crop_and_resize( assert zz.checked_type == relay.TensorType(ref_res.shape, "float32") func = relay.Function([img, bx, bx_idx], z) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: - intrp = relay.create_executor(kind, ctx=ctx, target=target) + intrp = relay.create_executor(kind, device=dev, target=target) op_res = intrp.evaluate(func)(image_data, boxes, box_indices) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-3, atol=1e-04) @@ -257,11 +257,11 @@ def verify_multibox_prior( data = np.random.uniform(low=-1, high=1, size=dshape).astype("float32") func = relay.Function([x], z) func = run_infer_type(func) - for target, ctx in tvm.testing.enabled_targets(): - intrp1 = relay.create_executor("graph", ctx=ctx, target=target) + for target, dev in tvm.testing.enabled_targets(): + intrp1 = relay.create_executor("graph", device=dev, target=target) op_res1 = intrp1.evaluate(func)(data) tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5) - intrp2 = relay.create_executor("debug", ctx=ctx, target=target) + intrp2 = relay.create_executor("debug", device=dev, target=target) op_res2 = intrp2.evaluate(func)(data) tvm.testing.assert_allclose(op_res2.asnumpy(), ref_res, rtol=1e-5) @@ -316,8 +316,8 @@ def verify_get_valid_counts(dshape, score_threshold, id_index, score_index): assert "score_threshold" in z.astext() func = relay.Function([x], z.astuple()) func = run_infer_type(func) - for target, ctx in tvm.testing.enabled_targets(): - intrp = relay.create_executor("debug", ctx=ctx, target=target) + for target, dev in tvm.testing.enabled_targets(): + intrp = relay.create_executor("debug", device=dev, target=target) out = intrp.evaluate(func)(np_data) tvm.testing.assert_allclose(out[0].asnumpy(), np_out1, rtol=1e-3, atol=1e-04) @@ -390,11 +390,11 @@ def verify_nms( func = run_infer_type(func) func_indices = relay.Function([x0, x1, x2, x3], z_indices) func_indices = run_infer_type(func_indices) - for target, ctx in tvm.testing.enabled_targets(): - intrp1 = relay.create_executor("graph", ctx=ctx, target=target) + for target, dev in tvm.testing.enabled_targets(): + intrp1 = relay.create_executor("graph", device=dev, target=target) op_res1 = intrp1.evaluate(func)(x0_data, x1_data, x2_data, x3_data) 
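# The hunk above is the mechanical core of this patch: the `ctx` keyword of
# relay.create_executor becomes `device`, and tvm.context() becomes
# tvm.device(). A minimal sketch of the new convention, assuming an enabled
# "llvm" target (the relu function is illustrative, not taken from this test):
#
#   import numpy as np
#   import tvm
#   from tvm import relay
#
#   x = relay.var("x", shape=(2, 2), dtype="float32")
#   func = relay.Function([x], relay.nn.relu(x))
#   dev = tvm.device("llvm", 0)  # was: tvm.context("llvm", 0)
#   intrp = relay.create_executor("graph", device=dev, target="llvm")  # was: ctx=dev
#   out = intrp.evaluate(func)(np.zeros((2, 2), "float32"))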
tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5) - intrp2 = relay.create_executor("debug", ctx=ctx, target=target) + intrp2 = relay.create_executor("debug", device=dev, target=target) op_res2 = intrp2.evaluate(func)(x0_data, x1_data, x2_data, x3_data) tvm.testing.assert_allclose(op_res2.asnumpy(), ref_res, rtol=1e-5) op_indices_res1 = intrp1.evaluate(func_indices)(x0_data, x1_data, x2_data, x3_data) @@ -581,11 +581,11 @@ def test_default_value(): nms = relay.vision.non_max_suppression(mtl[0], mtl[1], mtl[0], return_indices=False) func = relay.Function([cls_prob, loc_pred, anchors], nms) func = run_infer_type(func) - for target, ctx in tvm.testing.enabled_targets(): - intrp1 = relay.create_executor("graph", ctx=ctx, target=target) + for target, dev in tvm.testing.enabled_targets(): + intrp1 = relay.create_executor("graph", device=dev, target=target) op_res1 = intrp1.evaluate(func)(np_cls_prob, np_loc_preds, np_anchors) tvm.testing.assert_allclose(op_res1.asnumpy(), expected_np_out, rtol=1e-5) - intrp2 = relay.create_executor("debug", ctx=ctx, target=target) + intrp2 = relay.create_executor("debug", device=dev, target=target) op_res2 = intrp2.evaluate(func)(np_cls_prob, np_loc_preds, np_anchors) tvm.testing.assert_allclose(op_res2.asnumpy(), expected_np_out, rtol=1e-5) @@ -674,12 +674,12 @@ def verify_roi_align( sample_ratio=sample_ratio, mode=mode, ) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): print("test on", target) - intrp1 = relay.create_executor("graph", ctx=ctx, target=target) + intrp1 = relay.create_executor("graph", device=dev, target=target) op_res1 = intrp1.evaluate(func)(np_data, np_rois) tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-4) - intrp2 = relay.create_executor("debug", ctx=ctx, target=target) + intrp2 = relay.create_executor("debug", device=dev, target=target) op_res2 = intrp2.evaluate(func)(np_data, np_rois) tvm.testing.assert_allclose(op_res2.asnumpy(), ref_res, rtol=1e-4) @@ -770,11 +770,11 @@ def verify_roi_pool(data_shape, rois_shape, pooled_size, spatial_scale): ref_res = tvm.topi.testing.roi_pool_nchw_python( np_data, np_rois, pooled_size=pooled_size, spatial_scale=spatial_scale ) - for target, ctx in tvm.testing.enabled_targets(): - intrp1 = relay.create_executor("graph", ctx=ctx, target=target) + for target, dev in tvm.testing.enabled_targets(): + intrp1 = relay.create_executor("graph", device=dev, target=target) op_res1 = intrp1.evaluate(func)(np_data, np_rois) tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-4) - intrp2 = relay.create_executor("debug", ctx=ctx, target=target) + intrp2 = relay.create_executor("debug", device=dev, target=target) op_res2 = intrp2.evaluate(func)(np_data, np_rois) tvm.testing.assert_allclose(op_res2.asnumpy(), ref_res, rtol=1e-4) @@ -798,11 +798,11 @@ def verify_proposal(np_cls_prob, np_bbox_pred, np_im_info, np_out, attrs): if not tvm.testing.device_enabled(target): print("Skip test because %s is not enabled." 
% target) continue - ctx = tvm.context(target, 0) - intrp1 = relay.create_executor("graph", ctx=ctx, target=target) + dev = tvm.device(target, 0) + intrp1 = relay.create_executor("graph", device=dev, target=target) op_res1 = intrp1.evaluate(func)(np_cls_prob, np_bbox_pred, np_im_info) tvm.testing.assert_allclose(op_res1.asnumpy(), np_out, rtol=1e-4) - intrp2 = relay.create_executor("debug", ctx=ctx, target=target) + intrp2 = relay.create_executor("debug", device=dev, target=target) op_res2 = intrp2.evaluate(func)(np_cls_prob, np_bbox_pred, np_im_info) tvm.testing.assert_allclose(op_res2.asnumpy(), np_out, rtol=1e-4) @@ -891,9 +891,9 @@ def verify_yolo_reorg(shape, stride): func = relay.Function([x], z) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: - intrp = relay.create_executor(kind, ctx=ctx, target=target) + intrp = relay.create_executor(kind, device=dev, target=target) op_res = intrp.evaluate(func)(x_data) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5) @@ -1024,11 +1024,11 @@ def test_run(batch, in_channel, size, out_channel, deformable_groups, groups, la deformable_groups=deformable_groups, groups=groups, ) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): if target == "cuda" and layout == "NHWC": continue # Cannot run NHWC layout on cuda target, only on llvm for kind in ["graph", "debug"]: - intrp1 = relay.create_executor(kind, ctx=ctx, target=target) + intrp1 = relay.create_executor(kind, device=dev, target=target) op_res1 = intrp1.evaluate(func)(data, offset, kernel) tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5, atol=1e-5) @@ -1071,9 +1071,9 @@ def verify_depth_to_space(dshape, block_size, layout, mode): assert zz.checked_type == relay.TensorType(ref_res.shape, "float32") func = relay.Function([x], z) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: - intrp = relay.create_executor(kind, ctx=ctx, target=target) + intrp = relay.create_executor(kind, device=dev, target=target) op_res = intrp.evaluate(func)(x_data) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-4) @@ -1115,9 +1115,9 @@ def verify_space_to_depth(dshape, block_size, layout): assert zz.checked_type == relay.TensorType(ref_res.shape, "float32") func = relay.Function([x], z) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: - intrp = relay.create_executor(kind, ctx=ctx, target=target) + intrp = relay.create_executor(kind, device=dev, target=target) op_res = intrp.evaluate(func)(x_data) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-4) @@ -1170,10 +1170,10 @@ def run_test_dilation2d( ) func = relay.Function([x, w], y) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): if target in except_targets: continue - intrp = relay.create_executor("graph", ctx=ctx, target=target) + intrp = relay.create_executor("graph", device=dev, target=target) op_res = intrp.evaluate(func)(indata, kernel) tvm.testing.assert_allclose(op_res.asnumpy(), out, rtol=1e-5, atol=1e-5) @@ -1273,9 +1273,9 @@ def verify_affine_grid(num_batch, target_shape): data_np = np.random.uniform(size=data_shape).astype(dtype) ref_res = tvm.topi.testing.affine_grid_python(data_np, target_shape) - for target, ctx in 
tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: - intrp1 = relay.create_executor(kind, ctx=ctx, target=target) + intrp1 = relay.create_executor(kind, device=dev, target=target) op_res1 = intrp1.evaluate(func)(data_np) tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5, atol=1e-5) @@ -1300,9 +1300,9 @@ def verify_grid_sample(data_shape, grid_shape): grid_np = np.random.uniform(size=grid_shape, low=-1.5, high=1.5).astype(dtype) ref_res = tvm.topi.testing.grid_sample_nchw_python(data_np, grid_np, method="bilinear") - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: - intrp1 = relay.create_executor(kind, ctx=ctx, target=target) + intrp1 = relay.create_executor(kind, device=dev, target=target) op_res1 = intrp1.evaluate(func)(data_np, grid_np) tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5, atol=1e-5) @@ -1327,9 +1327,9 @@ def verify_space_to_batch_nd(dshape, block_shape, paddings): assert zz.checked_type == relay.TensorType(ref_res.shape, "float32") func = relay.Function([x], z) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: - intrp = relay.create_executor(kind, ctx=ctx, target=target) + intrp = relay.create_executor(kind, device=dev, target=target) op_res = intrp.evaluate(func)(x_data) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-4) @@ -1354,9 +1354,9 @@ def verify_batch_to_space_nd(dshape, block_shape, crops): assert zz.checked_type == relay.TensorType(ref_res.shape, "float32") func = relay.Function([x], z) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: - intrp = relay.create_executor(kind, ctx=ctx, target=target) + intrp = relay.create_executor(kind, device=dev, target=target) op_res = intrp.evaluate(func)(x_data) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-4) diff --git a/tests/python/relay/test_op_level6.py b/tests/python/relay/test_op_level6.py index 0dac69e36025..f0c66247329d 100644 --- a/tests/python/relay/test_op_level6.py +++ b/tests/python/relay/test_op_level6.py @@ -26,6 +26,7 @@ @tvm.testing.uses_gpu def test_sort(): def verify_sort(shape, axis, is_ascend, is_dyn=False): + if is_dyn: x = relay.var("x", relay.TensorType([relay.Any()] * len(shape), "float32")) else: @@ -42,10 +43,10 @@ def verify_sort(shape, axis, is_ascend, is_dyn=False): backends = ["vm", "debug"] else: backends = ["graph", "debug"] - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in backends: mod = tvm.ir.IRModule.from_expr(func) - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()(x_data) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5) @@ -76,10 +77,10 @@ def verify_argsort(shape, axis, is_ascend, dtype, is_dyn=False): backends = ["vm", "debug"] else: backends = ["graph", "debug"] - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in backends: mod = tvm.ir.IRModule.from_expr(func) - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()(x_data) 
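# tvm.testing.enabled_targets() yields (target-string, device) pairs, which is
# why the loops in these files are updated by renaming `ctx` to `dev` alone.
# A short sketch of the pattern, assuming at least one target is enabled:
#
#   import tvm
#   import tvm.testing
#   from tvm import relay
#
#   for target, dev in tvm.testing.enabled_targets():
#       intrp = relay.create_executor("debug", device=dev, target=target)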
tvm.testing.assert_allclose(op_res.asnumpy(), ref_res.astype(dtype), rtol=1e-5) @@ -87,9 +88,11 @@ def verify_argsort(shape, axis, is_ascend, dtype, is_dyn=False): for dtype in ["int32", "int64", "float32", "float64"]: verify_argsort((2, 3, 4), axis=0, is_ascend=False, dtype=dtype, is_dyn=is_dyn) verify_argsort((1, 4, 6), axis=1, is_ascend=True, dtype=dtype, is_dyn=is_dyn) - verify_argsort((3, 5, 6), axis=-1, is_ascend=False, dtype=dtype, is_dyn=is_dyn) - verify_argsort((3, 2000, 6), axis=1, is_ascend=False, dtype=dtype, is_dyn=is_dyn) - verify_argsort((1, 122640), axis=1, is_ascend=False, dtype=dtype, is_dyn=is_dyn) + dtype = "int32" + verify_argsort((3, 5, 6), axis=-1, is_ascend=False, dtype=dtype, is_dyn=is_dyn) + verify_argsort((3, 6000, 6), axis=1, is_ascend=False, dtype=dtype, is_dyn=is_dyn) + verify_argsort((1000, 1, 1), axis=0, is_ascend=False, dtype=dtype, is_dyn=is_dyn) + verify_argsort((1, 122640), axis=1, is_ascend=False, dtype=dtype, is_dyn=is_dyn) @tvm.testing.uses_gpu @@ -119,9 +122,9 @@ def verify_topk(k, axis, ret_type, is_ascend, dtype): np_values[i, :] = np_data[i, np_indices[i, :]] np_indices = np_indices.astype(dtype) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: - intrp = relay.create_executor(kind, ctx=ctx, target=target) + intrp = relay.create_executor(kind, device=dev, target=target) op_res = intrp.evaluate(func)(np_data) if ret_type == "both": tvm.testing.assert_allclose(op_res[0].asnumpy(), np_values) diff --git a/tests/python/relay/test_op_qnn_add.py b/tests/python/relay/test_op_qnn_add.py index 6f33a7bb0b51..b37ddc2c227c 100644 --- a/tests/python/relay/test_op_qnn_add.py +++ b/tests/python/relay/test_op_qnn_add.py @@ -63,7 +63,7 @@ def test_tflite_same_io_qnn_params(): y_data = y_datas[i] golden_output = golden_outputs[i] - intrp = relay.create_executor("graph", ctx=tvm.cpu(0), target="llvm") + intrp = relay.create_executor("graph", device=tvm.cpu(0), target="llvm") op_res = intrp.evaluate(func)(x_data, y_data) np.testing.assert_equal(op_res.asnumpy(), golden_output) @@ -111,7 +111,7 @@ def test_tflite_different_io_qnn_params(): y_data = y_datas[i] golden_output = golden_outputs[i] - intrp = relay.create_executor("graph", ctx=tvm.cpu(0), target="llvm") + intrp = relay.create_executor("graph", device=tvm.cpu(0), target="llvm") op_res = intrp.evaluate(func)(x_data, y_data) np.testing.assert_equal(op_res.asnumpy(), golden_output) @@ -143,7 +143,7 @@ def test_saturation(): y_data = np.array((255, 255, 128, 0)).reshape((1, 4)) golden_output = np.array((255, 255, 129, 0)).reshape((1, 4)) - intrp = relay.create_executor("graph", ctx=tvm.cpu(0), target="llvm") + intrp = relay.create_executor("graph", device=tvm.cpu(0), target="llvm") op_res = intrp.evaluate(func)(x_data, y_data) np.testing.assert_equal(op_res.asnumpy(), golden_output) @@ -169,7 +169,7 @@ def test_saturation(): y_data = np.array((255, 255, 127, 0)).reshape((1, 4)) golden_output = np.array((255, 129, 65, 0)).reshape((1, 4)) - intrp = relay.create_executor("graph", ctx=tvm.cpu(0), target="llvm") + intrp = relay.create_executor("graph", device=tvm.cpu(0), target="llvm") op_res = intrp.evaluate(func)(x_data, y_data) np.testing.assert_equal(op_res.asnumpy(), golden_output) @@ -195,7 +195,7 @@ def test_saturation(): y_data = np.array((255, 255, 127, 0)).reshape((1, 4)) golden_output = np.array((255, 129, 65, 0)).reshape((1, 4)) - intrp = relay.create_executor("graph", ctx=tvm.cpu(0), target="llvm") + intrp = 
relay.create_executor("graph", device=tvm.cpu(0), target="llvm") op_res = intrp.evaluate(func)(x_data, y_data) np.testing.assert_equal(op_res.asnumpy(), golden_output) @@ -221,7 +221,7 @@ def test_saturation(): y_data = np.array((0, 128, 64, 0)).reshape((1, 4)) golden_output = np.array((255, 255, 132, 0)).reshape((1, 4)) - intrp = relay.create_executor("graph", ctx=tvm.cpu(0), target="llvm") + intrp = relay.create_executor("graph", device=tvm.cpu(0), target="llvm") op_res = intrp.evaluate(func)(x_data, y_data) np.testing.assert_equal(op_res.asnumpy(), golden_output) diff --git a/tests/python/relay/test_op_qnn_concatenate.py b/tests/python/relay/test_op_qnn_concatenate.py index 55836dc1ee52..453875301af9 100644 --- a/tests/python/relay/test_op_qnn_concatenate.py +++ b/tests/python/relay/test_op_qnn_concatenate.py @@ -19,7 +19,7 @@ from tvm import te import numpy as np from tvm import relay -from tvm.contrib import graph_runtime +from tvm.contrib import graph_executor import tvm.topi.testing @@ -51,7 +51,7 @@ def test_same_io_qnn_params(): golden_output = np.concatenate((x_data, y_data), axis=axis) - intrp = relay.create_executor("graph", ctx=tvm.cpu(0), target="llvm") + intrp = relay.create_executor("graph", device=tvm.cpu(0), target="llvm") op_res = intrp.evaluate(func)(x_data, y_data) np.testing.assert_equal(op_res.asnumpy(), golden_output) @@ -86,7 +86,7 @@ def test_different_io_qnn_params(): golden_output = np.concatenate((x_data - 2, y_data - 3), axis=axis) - intrp = relay.create_executor("graph", ctx=tvm.cpu(0), target="llvm") + intrp = relay.create_executor("graph", device=tvm.cpu(0), target="llvm") op_res = intrp.evaluate(func)(x_data, y_data) np.testing.assert_equal(op_res.asnumpy(), golden_output) @@ -121,7 +121,7 @@ def test_few_same_io_qnn_params(): golden_output = np.concatenate((x_data + 1, y_data), axis=axis) - intrp = relay.create_executor("graph", ctx=tvm.cpu(0), target="llvm") + intrp = relay.create_executor("graph", device=tvm.cpu(0), target="llvm") op_res = intrp.evaluate(func)(x_data, y_data) np.testing.assert_equal(op_res.asnumpy(), golden_output) @@ -156,7 +156,7 @@ def test_same_i_qnn_params(): golden_output = np.concatenate((x_data + 1, y_data + 1), axis=axis) - intrp = relay.create_executor("graph", ctx=tvm.cpu(0), target="llvm") + intrp = relay.create_executor("graph", device=tvm.cpu(0), target="llvm") op_res = intrp.evaluate(func)(x_data, y_data) np.testing.assert_equal(op_res.asnumpy(), golden_output) @@ -183,7 +183,7 @@ def test_call_input(): ) func = relay.Function([x], z) - intrp = relay.create_executor("graph", ctx=tvm.cpu(0), target="llvm") + intrp = relay.create_executor("graph", device=tvm.cpu(0), target="llvm") op_res = intrp.evaluate(func)(x_data) np.testing.assert_equal(op_res.asnumpy(), x_data) diff --git a/tests/python/relay/test_op_qnn_conv2_transpose.py b/tests/python/relay/test_op_qnn_conv2_transpose.py index a86f9e1c6a80..e4e02279efd6 100644 --- a/tests/python/relay/test_op_qnn_conv2_transpose.py +++ b/tests/python/relay/test_op_qnn_conv2_transpose.py @@ -21,7 +21,7 @@ from tvm import relay from tvm.relay import transform from tvm.relay.testing import run_infer_type -from tvm.contrib import graph_runtime +from tvm.contrib import graph_executor from tvm.relay.testing.temp_op_attr import TempOpAttr @@ -191,7 +191,7 @@ def get_output(func, golden_inputs): golden_data, golden_weight = golden_inputs params = {"kernel": golden_weight} graph, lib, params = relay.build(func, "llvm", params=params) - mod = graph_runtime.create(graph, lib, ctx=tvm.cpu(0)) 
+ mod = graph_executor.create(graph, lib, device=tvm.cpu(0)) mod.set_input("data", golden_data) mod.set_input(**params) mod.run() diff --git a/tests/python/relay/test_op_qnn_conv2d.py b/tests/python/relay/test_op_qnn_conv2d.py index 67d4c6f0b807..928450312147 100644 --- a/tests/python/relay/test_op_qnn_conv2d.py +++ b/tests/python/relay/test_op_qnn_conv2d.py @@ -21,7 +21,7 @@ from tvm import relay from tvm.relay import transform from tvm.relay.testing import run_infer_type -from tvm.contrib import graph_runtime +from tvm.contrib import graph_executor from tvm.relay.testing.temp_op_attr import TempOpAttr # We use llvm target for testing functionality. `llvm` points to an older Intel @@ -198,7 +198,7 @@ def get_output(func, golden_inputs): golden_data, golden_weight = golden_inputs params = {"kernel": golden_weight} graph, lib, params = relay.build(func, "llvm", params=params) - mod = graph_runtime.create(graph, lib, ctx=tvm.cpu(0)) + mod = graph_executor.create(graph, lib, device=tvm.cpu(0)) mod.set_input("data", golden_data) mod.set_input(**params) mod.run() @@ -722,7 +722,7 @@ def test_tflite_large_irregular(): with tvm.transform.PassContext(opt_level=2): params = {"kernel": golden_weight} graph, lib, params = relay.build(qnn_func, "llvm", params=params) - mod = graph_runtime.create(graph, lib, ctx=tvm.cpu(0)) + mod = graph_executor.create(graph, lib, device=tvm.cpu(0)) mod.set_input("data", golden_data) mod.set_input(**params) mod.run() @@ -767,7 +767,7 @@ def test_tflite_output_multiplier_greater_than_one(): with tvm.transform.PassContext(opt_level=2): params = {"kernel": golden_weight} graph, lib, params = relay.build(qnn_func, "llvm", params=params) - mod = graph_runtime.create(graph, lib, ctx=tvm.cpu(0)) + mod = graph_executor.create(graph, lib, device=tvm.cpu(0)) mod.set_input("data", golden_data) mod.set_input(**params) mod.run() @@ -830,7 +830,7 @@ def test_tflite_anistropic_strides(): with tvm.transform.PassContext(opt_level=2): params = {"kernel": golden_weight} graph, lib, params = relay.build(qnn_func, "llvm", params=params) - mod = graph_runtime.create(graph, lib, ctx=tvm.cpu(0)) + mod = graph_executor.create(graph, lib, device=tvm.cpu(0)) mod.set_input("data", golden_data) mod.set_input(**params) mod.run() diff --git a/tests/python/relay/test_op_qnn_dense.py b/tests/python/relay/test_op_qnn_dense.py index 923940b5382d..c47ac6b35ec7 100644 --- a/tests/python/relay/test_op_qnn_dense.py +++ b/tests/python/relay/test_op_qnn_dense.py @@ -19,7 +19,7 @@ from tvm import te import numpy as np from tvm import relay -from tvm.contrib import graph_runtime +from tvm.contrib import graph_executor from tvm.relay.testing.temp_op_attr import TempOpAttr @@ -211,7 +211,7 @@ def qnn_dense_driver(test_configuration): mod = relay.qnn.transform.CanonicalizeOps()(mod) with tvm.transform.PassContext(opt_level=2): graph, lib, params = relay.build(mod, "llvm", params=None) - mod = graph_runtime.create(graph, lib, ctx=tvm.cpu(0)) + mod = graph_executor.create(graph, lib, device=tvm.cpu(0)) mod.set_input(quantized_data_name, test_configuration[quantized_data_name]) mod.set_input(quantized_kernel_name, test_configuration[quantized_kernel_name]) if test_configuration[bias_name] is not None: diff --git a/tests/python/relay/test_op_qnn_dequantize.py b/tests/python/relay/test_op_qnn_dequantize.py index e7fb161a13cb..ab398bbc1316 100644 --- a/tests/python/relay/test_op_qnn_dequantize.py +++ b/tests/python/relay/test_op_qnn_dequantize.py @@ -19,7 +19,7 @@ from tvm import te import numpy as np from tvm import 
relay -from tvm.contrib import graph_runtime +from tvm.contrib import graph_executor from tvm.relay.testing import run_infer_type @@ -35,7 +35,7 @@ def dequantize_test_driver(in_dtype, quant_args, in_data, verify_output_data, ax mod = tvm.IRModule.from_expr(mod) with tvm.transform.PassContext(opt_level=3): graph, lib, params = relay.build(mod, "llvm", params=None) - rt_mod = graph_runtime.create(graph, lib, ctx=tvm.cpu(0)) + rt_mod = graph_executor.create(graph, lib, device=tvm.cpu(0)) rt_mod.set_input(input_data=in_data) rt_mod.set_input(**params) rt_mod.run() @@ -98,7 +98,7 @@ def test_channelwise_axis_1(): } dequantize_test_driver( - in_dtype="uint8", quant_args=quant_args, in_data=data, verify_output_data=output, axis=1 + in_dtype="uint8", quant_args=quant_args, in_data=data, verify_output_data=output, axis=-1 ) @@ -135,12 +135,12 @@ def test_dynamic_dequantize(): mod = tvm.ir.IRModule.from_expr(func) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): # TODO: (electriclilies) enable AlterOpLayout when it is fixed with relay.build_config(opt_level=3, disabled_pass=["AlterOpLayout"]): lib = relay.build(mod, target=target) - module = graph_runtime.GraphModule(lib["default"](ctx)) + module = graph_executor.GraphModule(lib["default"](dev)) module.set_input(**{"x": data, "scale": scale, "zp": zp}) module.run() diff --git a/tests/python/relay/test_op_qnn_mul.py b/tests/python/relay/test_op_qnn_mul.py index 7a846cbf4717..8ff3ab5c3df2 100644 --- a/tests/python/relay/test_op_qnn_mul.py +++ b/tests/python/relay/test_op_qnn_mul.py @@ -19,7 +19,7 @@ from tvm import te import numpy as np from tvm import relay -from tvm.contrib import graph_runtime +from tvm.contrib import graph_executor import tvm.topi.testing # "unquantize" a quantized tensor @@ -80,7 +80,7 @@ def test_tflite_same_io_qnn_params(): y_rec = recover(y_data, rhs_scale, rhs_zero_point) golden = generate_golden_output(x_rec, y_rec, output_scale, output_zero_point) - intrp = relay.create_executor("graph", ctx=tvm.cpu(0), target="llvm") + intrp = relay.create_executor("graph", device=tvm.cpu(0), target="llvm") op_res = intrp.evaluate(func)(x_data, y_data) np.testing.assert_equal(op_res.asnumpy(), np.uint8(golden)) @@ -134,7 +134,7 @@ def test_tflite_different_io_qnn_params(): y_rec = recover(y_data, rhs_scale, rhs_zero_point) golden = generate_golden_output(x_rec, y_rec, output_scale, output_zero_point) - intrp = relay.create_executor("graph", ctx=tvm.cpu(0), target="llvm") + intrp = relay.create_executor("graph", device=tvm.cpu(0), target="llvm") op_res = intrp.evaluate(func)(x_data, y_data) np.testing.assert_equal(op_res.asnumpy(), np.uint8(golden)) @@ -172,7 +172,7 @@ def test_saturation(): golden = generate_golden_output(x_rec, y_rec, output_scale, output_zero_point) - intrp = relay.create_executor("graph", ctx=tvm.cpu(0), target="llvm") + intrp = relay.create_executor("graph", device=tvm.cpu(0), target="llvm") op_res = intrp.evaluate(func)(x_data, y_data) np.testing.assert_equal(op_res.asnumpy(), np.uint8(golden)) @@ -206,7 +206,7 @@ def test_saturation(): golden = generate_golden_output(x_rec, y_rec, output_scale, output_zero_point) - intrp = relay.create_executor("graph", ctx=tvm.cpu(0), target="llvm") + intrp = relay.create_executor("graph", device=tvm.cpu(0), target="llvm") op_res = intrp.evaluate(func)(x_data, y_data) np.testing.assert_equal(op_res.asnumpy(), np.uint8(golden)) @@ -241,7 +241,7 @@ def test_saturation(): golden = generate_golden_output(x_rec, y_rec, output_scale, 
output_zero_point) - intrp = relay.create_executor("graph", ctx=tvm.cpu(0), target="llvm") + intrp = relay.create_executor("graph", device=tvm.cpu(0), target="llvm") op_res = intrp.evaluate(func)(x_data, y_data) np.testing.assert_equal(op_res.asnumpy(), np.uint8(golden)) diff --git a/tests/python/relay/test_op_qnn_quantize.py b/tests/python/relay/test_op_qnn_quantize.py index 2ef298679904..2ae688ef4784 100644 --- a/tests/python/relay/test_op_qnn_quantize.py +++ b/tests/python/relay/test_op_qnn_quantize.py @@ -19,7 +19,7 @@ from tvm import te import numpy as np from tvm import relay -from tvm.contrib import graph_runtime +from tvm.contrib import graph_executor from tvm.relay.testing import run_infer_type @@ -39,7 +39,7 @@ def quantize_test_driver(in_dtype, quant_args, axis, out_dtype, in_data, verify_ mod = tvm.IRModule.from_expr(mod) with tvm.transform.PassContext(opt_level=3): graph, lib, params = relay.build(mod, "llvm", params=None) - rt_mod = graph_runtime.create(graph, lib, ctx=tvm.cpu(0)) + rt_mod = graph_executor.create(graph, lib, device=tvm.cpu(0)) rt_mod.set_input(input_data=in_data) rt_mod.set_input(**params) rt_mod.run() @@ -127,7 +127,7 @@ def test_channelwise_axis_1(): quantize_test_driver( in_dtype="float32", quant_args=quant_args, - axis=1, + axis=-1, out_dtype="uint8", in_data=data, verify_output_data=output, @@ -150,12 +150,12 @@ def test_dynamic_quantize(): mod = tvm.ir.IRModule.from_expr(func) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): # TODO: (electriclilies) enable AlterOpLayout when it is fixed with relay.build_config(opt_level=3, disabled_pass=["AlterOpLayout"]): lib = relay.build(mod, target=target) - module = graph_runtime.GraphModule(lib["default"](ctx)) + module = graph_executor.GraphModule(lib["default"](dev)) module.set_input(**{"x": data, "scale": scale, "zp": zp}) module.run() diff --git a/tests/python/relay/test_op_qnn_requantize.py b/tests/python/relay/test_op_qnn_requantize.py index f40a08711451..5e61fad7676d 100644 --- a/tests/python/relay/test_op_qnn_requantize.py +++ b/tests/python/relay/test_op_qnn_requantize.py @@ -19,7 +19,7 @@ from tvm import te import numpy as np from tvm import relay -from tvm.contrib import graph_runtime +from tvm.contrib import graph_executor roundings = ["UPWARD", "TONEAREST"] @@ -28,7 +28,7 @@ def verify(mod, goldens): with tvm.transform.PassContext(opt_level=3): graph, lib, params = relay.build(mod, "llvm", params=None) golden_data, golden_output = goldens - rt_mod = graph_runtime.create(graph, lib, ctx=tvm.cpu(0)) + rt_mod = graph_executor.create(graph, lib, device=tvm.cpu(0)) rt_mod.set_input("quantized_data", golden_data) rt_mod.set_input(**params) rt_mod.run() diff --git a/tests/python/relay/test_op_qnn_simulated_dequantize.py b/tests/python/relay/test_op_qnn_simulated_dequantize.py new file mode 100644 index 000000000000..3aecd935b62b --- /dev/null +++ b/tests/python/relay/test_op_qnn_simulated_dequantize.py @@ -0,0 +1,177 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import tvm +from tvm import te +import numpy as np +from tvm import relay +from tvm.contrib import graph_executor +from tvm.runtime.vm import VirtualMachine +from tvm.topi.nn.qnn import SQNN_DTYPE_TO_CODE + + +def dequantize_test_driver(in_dtype, quant_args, axis, in_data): + shape = in_data.shape + input_data = relay.var("input_data", shape=shape, dtype=in_dtype) + input_zero_point = relay.const(quant_args["in_zero_point"]) + input_scale = relay.const(quant_args["in_scale"]) + dequantized_output = relay.qnn.op.dequantize( + input_data, + input_scale=input_scale, + input_zero_point=input_zero_point, + axis=axis, + ) + mod = relay.Function(relay.analysis.free_vars(dequantized_output), dequantized_output) + mod = tvm.IRModule.from_expr(mod) + with tvm.transform.PassContext(opt_level=3): + graph, lib, params = relay.build(mod, "llvm", params=None) + rt_mod = graph_executor.create(graph, lib, device=tvm.cpu(0)) + rt_mod.set_input(input_data=in_data) + rt_mod.set_input(**params) + rt_mod.run() + res = rt_mod.get_output(0).asnumpy() + return res + + +def build_simulated_dequantize(input_data, scale, zp, dtype, axis=-1): + sim_q = relay.qnn.op.simulated_dequantize( + input_data, + scale, + zp, + axis=axis, + in_dtype=dtype, + ) + mod = tvm.IRModule.from_expr(sim_q) + with tvm.transform.PassContext(opt_level=3): + vm_exec = relay.vm.compile(mod, "llvm", params=None) + vm = VirtualMachine(vm_exec, tvm.cpu(0)) + return vm + + +def verify_simulated_dequantize_simple(dtype): + data = np.random.uniform(low=-128, high=127, size=[2, 5]).astype(dtype) + data_fp = data.astype("float32") + scale_np = np.float32(0.5) + zp_np = np.int32(127) + dtype_np = np.int32(SQNN_DTYPE_TO_CODE[dtype]) + quant_args = {"in_zero_point": zp_np, "in_scale": scale_np} + dq_out = dequantize_test_driver( + in_dtype=dtype, + quant_args=quant_args, + axis=-1, + in_data=data, + ) + input_data = relay.var("input_data", shape=data.shape, dtype="float32") + scale = relay.var("scale", shape=[]) + zp = relay.var("zp", shape=[]) + dtype = relay.var("dtype", shape=[]) + vm = build_simulated_dequantize(input_data, scale, zp, dtype) + sim_dq_out = vm.invoke("main", input_data=data_fp, scale=scale_np, zp=zp_np, dtype=dtype_np) + np.testing.assert_allclose(sim_dq_out.asnumpy(), dq_out, rtol=1e-5) + + +def test_simulated_dequantize(): + verify_simulated_dequantize_simple("uint8") + verify_simulated_dequantize_simple("int8") + verify_simulated_dequantize_simple("int32") + + +def test_dynamic_channels(): + # Compile simulated quantize once but support either per-channel or scalar params. + data = np.random.uniform(low=-64, high=64, size=[2, 5]).astype("int8") + data_fp = data.astype("float32") + # Test scalar qnn params. + scale_np = np.asarray([0.5]).astype("float32") + zp_np = np.asarray([0]).astype("int32") + dtype_np = np.int32(SQNN_DTYPE_TO_CODE["int8"]) + quant_args = {"in_zero_point": zp_np[0], "in_scale": scale_np[0]} + dq_out = dequantize_test_driver( + in_dtype="int8", + quant_args=quant_args, + axis=0, + in_data=data, + ) + # Create variables with undefined shape and run with scalar inputs. 
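+ # (shape=[relay.Any()] below leaves the length of the scale/zp inputs
+ # symbolic, so the VM compiled once here also accepts the length-2
+ # per-channel arrays used later in this test without recompiling.)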
+ input_data = relay.var("input_data", shape=data.shape, dtype="float32") + scale = relay.var("scale", shape=[relay.Any()], dtype="float32") + zp = relay.var("zp", shape=[relay.Any()], dtype="int32") + dtype = relay.var("dtype", shape=[]) + vm = build_simulated_dequantize(input_data, scale, zp, dtype, axis=0) + sim_dq_out = vm.invoke("main", input_data=data_fp, scale=scale_np, zp=zp_np, dtype=dtype_np) + np.testing.assert_allclose(sim_dq_out.asnumpy(), dq_out, rtol=1e-5) + + # Now get the perchannel quantize output and compare without recompiling. + scale_np = np.array([0.5, 0.25]).astype("float32") + zp_np = np.array([127, 123]).astype("int32") + + # Get the reference quantize output. + quant_args = {"in_zero_point": zp_np, "in_scale": scale_np} + dq_out = dequantize_test_driver( + in_dtype="int8", + quant_args=quant_args, + axis=0, + in_data=data, + ) + # Run the simulated quantize without recompiling and confirm results match. + sim_dq_out = vm.invoke("main", input_data=data_fp, scale=scale_np, zp=zp_np, dtype=dtype_np) + np.testing.assert_allclose(sim_dq_out.asnumpy(), dq_out, rtol=1e-5) + + +def test_dynamic_dtype(): + # Compile simulated quantize once but support any type of quantization. + data = np.random.uniform(low=0, high=255, size=[2, 5]).astype("uint8") + data_fp = data.astype("float32") + # Test scalar uint8 to fp32. + scale_np = np.asarray([0.5]).astype("float32") + zp_np = np.asarray([127]).astype("int32") + dtype_np = np.int32(SQNN_DTYPE_TO_CODE["uint8"]) + quant_args = {"in_zero_point": zp_np[0], "in_scale": scale_np[0]} + dq_out = dequantize_test_driver( + in_dtype="uint8", + quant_args=quant_args, + axis=-1, + in_data=data, + ) + # Create variables with undefined shape and run with scalar inputs. + input_data = relay.var("input_data", shape=data.shape, dtype="float32") + scale = relay.var("scale", shape=[relay.Any()], dtype="float32") + zp = relay.var("zp", shape=[relay.Any()], dtype="int32") + dtype = relay.var("dtype", shape=[]) + vm = build_simulated_dequantize(input_data, scale, zp, dtype) + sim_dq_out = vm.invoke("main", input_data=data_fp, scale=scale_np, zp=zp_np, dtype=dtype_np) + np.testing.assert_allclose(sim_dq_out.asnumpy(), dq_out, rtol=1e-5) + + # Now test int8 to float32 compilation. + data = np.random.uniform(low=0, high=255, size=[2, 5]).astype("int8") + data_fp = data.astype("float32") + # Get the reference quantize output. + dq_out = dequantize_test_driver( + in_dtype="int8", + quant_args=quant_args, + axis=-1, + in_data=data, + ) + # Run the simulated quantize without recompiling and confirm results match. + dtype_np = np.int32(SQNN_DTYPE_TO_CODE["int8"]) + sim_dq_out = vm.invoke("main", input_data=data_fp, scale=scale_np, zp=zp_np, dtype=dtype_np) + np.testing.assert_allclose(sim_dq_out.asnumpy(), dq_out, rtol=1e-5) + + +if __name__ == "__main__": + test_simulated_dequantize() + test_dynamic_channels() + test_dynamic_dtype() diff --git a/tests/python/relay/test_op_qnn_simulated_quantize.py b/tests/python/relay/test_op_qnn_simulated_quantize.py new file mode 100644 index 000000000000..fd9d13168e01 --- /dev/null +++ b/tests/python/relay/test_op_qnn_simulated_quantize.py @@ -0,0 +1,185 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import tvm +from tvm import te +import numpy as np +from tvm import relay +from tvm.contrib import graph_executor +from tvm.runtime.vm import VirtualMachine +from tvm.topi.nn.qnn import SQNN_DTYPE_TO_CODE + + +def allclose_with_rounding(a, b): + # Find number of mismatches in inputs. + mismatch = a != b + # Allow some rounding errors due to GPU fp32 arithmetic. + assert np.sum(mismatch) <= 3 + + +def quantize_test_driver(in_dtype, quant_args, axis, out_dtype, in_data): + shape = in_data.shape + input_data = relay.var("input_data", shape=shape, dtype=in_dtype) + output_zero_point = relay.const(quant_args["out_zero_point"]) + output_scale = relay.const(quant_args["out_scale"]) + quantized_output = relay.qnn.op.quantize( + input_data, + output_scale=output_scale, + output_zero_point=output_zero_point, + axis=axis, + out_dtype=out_dtype, + ) + mod = relay.Function(relay.analysis.free_vars(quantized_output), quantized_output) + mod = tvm.IRModule.from_expr(mod) + with tvm.transform.PassContext(opt_level=3): + graph, lib, params = relay.build(mod, "llvm", params=None) + rt_mod = graph_executor.create(graph, lib, device=tvm.cpu(0)) + rt_mod.set_input(input_data=in_data) + rt_mod.set_input(**params) + rt_mod.run() + res = rt_mod.get_output(0).asnumpy() + return res + + +def build_simulated_quantize(input_data, scale, zp, dtype, axis=-1): + sim_q = relay.qnn.op.simulated_quantize( + input_data, + scale, + zp, + axis=axis, + out_dtype=dtype, + ) + mod = tvm.IRModule.from_expr(sim_q) + with tvm.transform.PassContext(opt_level=3): + vm_exec = relay.vm.compile(mod, "llvm", params=None) + vm = VirtualMachine(vm_exec, tvm.cpu(0)) + return vm + + +def verify_simulated_quantize_simple(dtype): + data = np.random.uniform(low=-128, high=127, size=[2, 5]).astype("float32") + scale_np = np.float32(0.5) + zp_np = np.int32(127) + dtype_np = np.int32(SQNN_DTYPE_TO_CODE[dtype]) + quant_args = {"out_zero_point": zp_np, "out_scale": scale_np} + q_out = quantize_test_driver( + in_dtype="float32", + quant_args=quant_args, + axis=-1, + out_dtype=dtype, + in_data=data, + ) + input_data = relay.var("input_data", shape=data.shape, dtype="float32") + scale = relay.var("scale", shape=[]) + zp = relay.var("zp", shape=[]) + dtype = relay.var("dtype", shape=[]) + vm = build_simulated_quantize(input_data, scale, zp, dtype) + sim_q_out = vm.invoke("main", input_data=data, scale=scale_np, zp=zp_np, dtype=dtype_np) + allclose_with_rounding(sim_q_out.asnumpy(), q_out) + + +def test_simulated_quantize(): + verify_simulated_quantize_simple("uint8") + verify_simulated_quantize_simple("int8") + verify_simulated_quantize_simple("int32") + + +def test_dynamic_channels(): + # Compile simulated quantize once but support either per-channel or scalar params. + data = np.random.uniform(low=-64, high=64, size=[2, 5]).astype("float32") + # Test scalar qnn params. 
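+    # SQNN_DTYPE_TO_CODE (imported above) maps a dtype string such as "uint8"
+    # to the integer code the simulated op consumes, so the target dtype is a
+    # runtime input rather than a compile-time constant.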
+    scale_np = np.asarray([0.5]).astype("float32")
+    zp_np = np.asarray([127]).astype("int32")
+    dtype_np = np.int32(SQNN_DTYPE_TO_CODE["uint8"])
+    quant_args = {"out_zero_point": zp_np[0], "out_scale": scale_np[0]}
+    q_out = quantize_test_driver(
+        in_dtype="float32",
+        quant_args=quant_args,
+        axis=0,
+        out_dtype="uint8",
+        in_data=data,
+    )
+    # Create variables with undefined shape and run with scalar inputs.
+    input_data = relay.var("input_data", shape=data.shape, dtype="float32")
+    scale = relay.var("scale", shape=[relay.Any()], dtype="float32")
+    zp = relay.var("zp", shape=[relay.Any()], dtype="int32")
+    dtype = relay.var("dtype", shape=[])
+    vm = build_simulated_quantize(input_data, scale, zp, dtype, axis=0)
+    sim_q_out = vm.invoke("main", input_data=data, scale=scale_np, zp=zp_np, dtype=dtype_np)
+    allclose_with_rounding(sim_q_out.asnumpy(), q_out)
+
+    # Now get the per-channel quantize output and compare without recompiling.
+    scale_np = np.array([0.5, 0.25]).astype("float32")
+    zp_np = np.array([127, 123]).astype("int32")
+
+    # Get the reference quantize output.
+    quant_args = {"out_zero_point": zp_np, "out_scale": scale_np}
+    q_out = quantize_test_driver(
+        in_dtype="float32",
+        quant_args=quant_args,
+        axis=0,
+        out_dtype="uint8",
+        in_data=data,
+    )
+    # Run the simulated quantize without recompiling and confirm results match.
+    sim_q_out = vm.invoke("main", input_data=data, scale=scale_np, zp=zp_np, dtype=dtype_np)
+    allclose_with_rounding(sim_q_out.asnumpy(), q_out)
+
+
+def test_dynamic_dtype():
+    # Compile simulated quantize once but support any output dtype.
+    data = np.random.uniform(low=-64, high=64, size=[2, 5]).astype("float32")
+    # Test scalar float32 to uint8.
+    scale_np = np.asarray([0.5]).astype("float32")
+    zp_np = np.asarray([127]).astype("int32")
+    dtype_np = np.int32(SQNN_DTYPE_TO_CODE["uint8"])
+    quant_args = {"out_zero_point": zp_np[0], "out_scale": scale_np[0]}
+    q_out = quantize_test_driver(
+        in_dtype="float32",
+        quant_args=quant_args,
+        axis=-1,
+        out_dtype="uint8",
+        in_data=data,
+    )
+    # Create variables with undefined shape and run with scalar inputs.
+    input_data = relay.var("input_data", shape=data.shape, dtype="float32")
+    scale = relay.var("scale", shape=[relay.Any()], dtype="float32")
+    zp = relay.var("zp", shape=[relay.Any()], dtype="int32")
+    dtype = relay.var("dtype", shape=[])
+    vm = build_simulated_quantize(input_data, scale, zp, dtype)
+    sim_q_out = vm.invoke("main", input_data=data, scale=scale_np, zp=zp_np, dtype=dtype_np)
+    allclose_with_rounding(sim_q_out.asnumpy(), q_out)
+
+    # Now test float32 to int32 compilation.
+    # Get the reference quantize output.
+    q_out = quantize_test_driver(
+        in_dtype="float32",
+        quant_args=quant_args,
+        axis=-1,
+        out_dtype="int32",
+        in_data=data,
+    )
+    # Run the simulated quantize without recompiling and confirm results match.
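+    # Only the dtype code changes below; the VM compiled above is reused as-is.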
+ dtype_np = np.int32(SQNN_DTYPE_TO_CODE["int32"]) + sim_q_out = vm.invoke("main", input_data=data, scale=scale_np, zp=zp_np, dtype=dtype_np) + allclose_with_rounding(sim_q_out.asnumpy(), q_out) + + +if __name__ == "__main__": + test_simulated_quantize() + test_dynamic_channels() + test_dynamic_dtype() diff --git a/tests/python/relay/test_op_qnn_subtract.py b/tests/python/relay/test_op_qnn_subtract.py index a76b05c31564..fb55cdc94844 100644 --- a/tests/python/relay/test_op_qnn_subtract.py +++ b/tests/python/relay/test_op_qnn_subtract.py @@ -52,7 +52,7 @@ def qnn_subtract_driver(x_datas, y_datas, golden_outputs, scale_and_zp, data_dty x_data = x_datas[i] y_data = y_datas[i] golden_output = golden_outputs[i] - intrp = relay.create_executor("graph", ctx=tvm.cpu(0), target="llvm") + intrp = relay.create_executor("graph", device=tvm.cpu(0), target="llvm") op_res = intrp.evaluate(func)(x_data, y_data) np.testing.assert_equal(op_res.asnumpy(), golden_output) diff --git a/tests/python/relay/test_param_dict.py b/tests/python/relay/test_param_dict.py index 29e0b5c0463b..2272883fc39c 100644 --- a/tests/python/relay/test_param_dict.py +++ b/tests/python/relay/test_param_dict.py @@ -24,7 +24,7 @@ from tvm.relay.op import add from tvm import relay from tvm import rpc -from tvm.contrib import utils, graph_runtime +from tvm.contrib import utils, graph_executor def test_save_load(): @@ -60,7 +60,7 @@ def test_bigendian_rpc_param(): if host is None: return - def verify_graph_runtime(remote, target, shape, dtype): + def verify_graph_executor(remote, target, shape, dtype): x = relay.var("x") y = relay.const(1) z = relay.add(x, y) @@ -75,18 +75,18 @@ def verify_graph_runtime(remote, target, shape, dtype): lib.save(path_dso) remote.upload(path_dso) lib = remote.load_module("dev_lib.o") - ctx = remote.cpu(0) - mod = graph_runtime.create(graph, lib, ctx) + dev = remote.cpu(0) + mod = graph_executor.create(graph, lib, dev) mod.load_params(runtime.save_param_dict(params)) mod.run() - out = mod.get_output(0, tvm.nd.empty(shape, dtype=dtype, ctx=ctx)) + out = mod.get_output(0, tvm.nd.empty(shape, dtype=dtype, device=dev)) tvm.testing.assert_allclose(x_in + 1, out.asnumpy()) print("Test RPC connection to PowerPC...") remote = rpc.connect(host, port) target = "llvm -mtriple=powerpc-linux-gnu" for dtype in ["float32", "float64", "int32", "int8"]: - verify_graph_runtime(remote, target, (10,), dtype) + verify_graph_executor(remote, target, (10,), dtype) if __name__ == "__main__": diff --git a/tests/python/relay/test_pass_alter_op_layout.py b/tests/python/relay/test_pass_alter_op_layout.py index 41186884bdb2..aeaf1f89c388 100644 --- a/tests/python/relay/test_pass_alter_op_layout.py +++ b/tests/python/relay/test_pass_alter_op_layout.py @@ -757,10 +757,10 @@ def expected(): mod_before = transform.InferType()(mod_before) mod_new = transform.InferType()(mod_new) with relay.build_config(opt_level=3): - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "debug", "vm"]: - ex_before = relay.create_executor(kind, mod=mod_before, ctx=ctx, target=target) - ex_new = relay.create_executor(kind, mod=mod_new, ctx=ctx, target=target) + ex_before = relay.create_executor(kind, mod=mod_before, device=dev, target=target) + ex_new = relay.create_executor(kind, mod=mod_new, device=dev, target=target) np_data = np.random.uniform(size=(1, 32, 28, 28)).astype("float32") np_weight = np.random.uniform(size=(32, 32, 3, 3)).astype("float32") result_before = 
ex_before.evaluate()(np_data, np_weight) diff --git a/tests/python/relay/test_pass_annotate_target.py b/tests/python/relay/test_pass_annotate_target.py index ce86cc603d6d..c756d74ff0be 100644 --- a/tests/python/relay/test_pass_annotate_target.py +++ b/tests/python/relay/test_pass_annotate_target.py @@ -29,7 +29,7 @@ def check_result( - mod, map_inputs, out_shape, result, tol=1e-5, target="llvm", ctx=tvm.cpu(), params=None + mod, map_inputs, out_shape, result, tol=1e-5, target="llvm", device=tvm.cpu(), params=None ): if sys.platform == "win32": print("Skip test on Windows for now") @@ -56,27 +56,27 @@ def check_vm_result(): code, lib = exe.save() lib = update_lib(lib) exe = runtime.vm.Executable.load_exec(code, lib) - vm = runtime.vm.VirtualMachine(exe, ctx) + vm = runtime.vm.VirtualMachine(exe, device) out = vm.run(**map_inputs) tvm.testing.assert_allclose(out.asnumpy(), result, rtol=tol, atol=tol) - def check_graph_runtime_result(): + def check_graph_executor_result(): with tvm.transform.PassContext(opt_level=3, disabled_pass=["AlterOpLayout"]): json, lib, param = relay.build(mod, target=target, params=params) lib = update_lib(lib) - rt_mod = tvm.contrib.graph_runtime.create(json, lib, ctx) + rt_mod = tvm.contrib.graph_executor.create(json, lib, device) for name, data in map_inputs.items(): rt_mod.set_input(name, data) rt_mod.set_input(**param) rt_mod.run() - out = tvm.nd.empty(out_shape, ctx=ctx) + out = tvm.nd.empty(out_shape, device=device) out = rt_mod.get_output(0, out) tvm.testing.assert_allclose(out.asnumpy(), result, rtol=tol, atol=tol) check_vm_result() - check_graph_runtime_result() + check_graph_executor_result() def test_extern_dnnl(): @@ -144,7 +144,7 @@ def test_run(): i_data = np.random.uniform(0, 1, ishape).astype(dtype) w1_data = np.random.uniform(0, 1, w1shape).astype(dtype) - ref_ex = relay.create_executor("graph", mod=ref_mod, ctx=tvm.cpu()) + ref_ex = relay.create_executor("graph", mod=ref_mod, device=tvm.cpu()) ref_res = ref_ex.evaluate()(i_data, w1_data) check_result( @@ -171,7 +171,7 @@ def test_extern_dnnl_mobilenet(): i_data = np.random.uniform(0, 1, ishape).astype(dtype) ref_mod, params = relay.testing.mobilenet.get_workload(batch_size=1, dtype="float32") - ref_ex = relay.create_executor("graph", mod=ref_mod, ctx=tvm.cpu(0)) + ref_ex = relay.create_executor("graph", mod=ref_mod, device=tvm.cpu(0)) ref_res = ref_ex.evaluate()(i_data, **params) check_result(mod, {"data": i_data}, (1, 1000), ref_res.asnumpy(), tol=1e-5, params=params) diff --git a/tests/python/relay/test_pass_annotation.py b/tests/python/relay/test_pass_annotation.py index ff68d489c7c5..a9c31f5ccedd 100644 --- a/tests/python/relay/test_pass_annotation.py +++ b/tests/python/relay/test_pass_annotation.py @@ -20,7 +20,7 @@ import tvm from tvm import relay -from tvm.contrib import graph_runtime +from tvm.contrib import graph_executor from tvm.relay.expr_functor import ExprMutator from tvm.relay import transform import tvm.testing @@ -31,17 +31,17 @@ def _trace(module, metadata, _): pass # import pdb; pdb.set_trace() -def check_graph_runtime( +def check_graph_executor( target, ref_res, device, func, params, config, opt_level, expected_index=None ): with tvm.transform.PassContext(opt_level=opt_level, config=config): graph, lib, new_params = relay.build(func, target, params=params) - contexts = [tvm.cpu(0), tvm.context(device)] + contexts = [tvm.cpu(0), tvm.device(device)] graph_json = json.loads(graph) if "device_index" in graph_json["attrs"]: device_index = graph_json["attrs"]["device_index"][1] assert 
device_index == expected_index - mod = graph_runtime.create(graph, lib, contexts) + mod = graph_executor.create(graph, lib, contexts) mod.set_input(**new_params) mod.run() res = mod.get_output(0).asnumpy() @@ -53,8 +53,8 @@ def check_vm_runtime(target, ref_res, device, func, params, config, opt_level, e mod = tvm.IRModule() mod["main"] = func exe = relay.vm.compile(mod, target) - ctx = [tvm.cpu(0), tvm.context(device)] - vm = tvm.runtime.vm.VirtualMachine(exe, ctx) + dev = [tvm.cpu(0), tvm.device(device)] + vm = tvm.runtime.vm.VirtualMachine(exe, dev) res = vm.invoke("main", **params) tvm.testing.assert_allclose(res.asnumpy(), ref_res, rtol=1e-5, atol=1e-5) @@ -69,28 +69,28 @@ def run_opt_pass(expr, passes): def test_redundant_annotation(): - ctx1 = tvm.context(1) - ctx2 = tvm.context(2) + dev1 = tvm.device(1) + dev2 = tvm.device(2) x = relay.var("x", shape=(3,)) y = relay.var("y", shape=(3,)) z = relay.var("z", shape=(3,)) def annotated(): add = relay.add(x, y) - _add1 = relay.annotation.on_device(add, ctx2) - _add2 = relay.annotation.on_device(add, ctx2) + _add1 = relay.annotation.on_device(add, dev2) + _add2 = relay.annotation.on_device(add, dev2) sub1 = relay.subtract(_add1, z) sub2 = relay.subtract(_add2, z) func = relay.Function([x, y, z], relay.Tuple([sub1, sub2])) - func = run_opt_pass(func, transform.RewriteAnnotatedOps(ctx1.device_type)) + func = run_opt_pass(func, transform.RewriteAnnotatedOps(dev1.device_type)) return func def expected(): add = relay.add(x, y) - copy_add_sub1 = relay.device_copy(add, ctx2, ctx1) + copy_add_sub1 = relay.device_copy(add, dev2, dev1) sub1 = relay.subtract(copy_add_sub1, z) - copy_add_sub2 = relay.device_copy(add, ctx2, ctx1) + copy_add_sub2 = relay.device_copy(add, dev2, dev1) sub2 = relay.subtract(copy_add_sub2, z) func = relay.Function([x, y, z], relay.Tuple([sub1, sub2])) return func @@ -101,23 +101,23 @@ def expected(): def test_annotate_expr(): - ctx1 = tvm.context(1) - ctx2 = tvm.context(2) + dev1 = tvm.device(1) + dev2 = tvm.device(2) x = relay.var("x", shape=(3,)) y = relay.var("y", shape=(3,)) z = relay.var("z", shape=(3,)) def annotated(): add = relay.add(x, y) - _add = relay.annotation.on_device(add, ctx1) + _add = relay.annotation.on_device(add, dev1) sub = relay.subtract(_add, z) - _sub = relay.annotation.on_device(sub, ctx2) - expr = run_opt_pass(_sub, transform.RewriteAnnotatedOps(ctx1.device_type)) + _sub = relay.annotation.on_device(sub, dev2) + expr = run_opt_pass(_sub, transform.RewriteAnnotatedOps(dev1.device_type)) return expr def expected(): add = relay.add(x, y) - copy_add_sub = relay.device_copy(add, ctx1, ctx2) + copy_add_sub = relay.device_copy(add, dev1, dev2) sub = relay.subtract(copy_add_sub, z) return sub @@ -127,20 +127,20 @@ def expected(): def test_annotate_all(): - ctx1 = tvm.context(1) - ctx2 = tvm.context(2) + dev1 = tvm.device(1) + dev2 = tvm.device(2) x = relay.var("x", shape=(3,)) y = relay.var("y", shape=(3,)) z = relay.var("z", shape=(3,)) def annotated(): add = relay.add(x, y) - _add = relay.annotation.on_device(add, ctx2) + _add = relay.annotation.on_device(add, dev2) sub = relay.subtract(_add, z) - _sub = relay.annotation.on_device(sub, ctx2) + _sub = relay.annotation.on_device(sub, dev2) func = relay.Function([x, y, z], _sub) - func = run_opt_pass(func, transform.RewriteAnnotatedOps(ctx1.device_type)) + func = run_opt_pass(func, transform.RewriteAnnotatedOps(dev1.device_type)) return func def expected(): @@ -155,8 +155,8 @@ def expected(): def test_annotate_none(): - ctx1 = tvm.context(1) - ctx2 = 
tvm.context(2) + dev1 = tvm.device(1) + dev2 = tvm.device(2) x = relay.var("x", shape=(3,)) y = relay.var("y", shape=(3,)) z = relay.var("z", shape=(3,)) @@ -165,7 +165,7 @@ def annotated(): add = relay.add(x, y) sub = relay.subtract(add, z) func = relay.Function([x, y, z], sub) - func = run_opt_pass(func, transform.RewriteAnnotatedOps(ctx1.device_type)) + func = run_opt_pass(func, transform.RewriteAnnotatedOps(dev1.device_type)) return func def expected(): @@ -200,8 +200,8 @@ def test_conv_network(): weight = relay.var("weight", shape=(64, 64, 3, 3)) data1 = relay.var("data1", shape=dshape) data2 = relay.var("data2", shape=dshape) - dev1 = tvm.context(1) - dev2 = tvm.context(2) + dev1 = tvm.device(1) + dev2 = tvm.device(2) def original(): conv2d_1 = relay.nn.conv2d(data1, weight, channels=64, kernel_size=(3, 3), padding=(1, 1)) @@ -210,7 +210,7 @@ def original(): conv2d_3 = relay.nn.conv2d(add, weight, channels=64, kernel_size=(3, 3), padding=(1, 1)) func = relay.Function([data1, data2, weight], conv2d_3) - func = run_opt_pass(func, transform.RewriteAnnotatedOps(tvm.context(3).device_type)) + func = run_opt_pass(func, transform.RewriteAnnotatedOps(tvm.device(3).device_type)) return func def annotated(): @@ -224,7 +224,7 @@ def annotated(): _conv2d_3 = relay.annotation.on_device(conv2d_3, dev2) func = relay.Function([data1, data2, weight], _conv2d_3) - func = run_opt_pass(func, transform.RewriteAnnotatedOps(tvm.context(3).device_type)) + func = run_opt_pass(func, transform.RewriteAnnotatedOps(tvm.device(3).device_type)) return func class ScheduleConv2d(ExprMutator): @@ -303,10 +303,10 @@ def test_propogation(): | tan 1 """ - ctx1 = tvm.context(1) - ctx2 = tvm.context(2) + dev1 = tvm.device(1) + dev2 = tvm.device(2) - expected_dev_type = {"log": ctx1, "log2": ctx2, "log10": ctx2, "add": ctx2, "tan": ctx1} + expected_dev_type = {"log": dev1, "log2": dev2, "log10": dev2, "add": dev2, "tan": dev1} x = relay.var("x", shape=(3,)) @@ -322,17 +322,17 @@ def annotated(): tan = relay.tan(_add) _tan = relay.annotation.on_device(tan, expected_dev_type["tan"]) - func = run_opt_pass(_tan, transform.RewriteAnnotatedOps(ctx1.device_type)) + func = run_opt_pass(_tan, transform.RewriteAnnotatedOps(dev1.device_type)) return func def expected(): log = relay.log(x) - _log_left = relay.device_copy(log, ctx1, ctx2) - _log_right = relay.device_copy(log, ctx1, ctx2) + _log_left = relay.device_copy(log, dev1, dev2) + _log_right = relay.device_copy(log, dev1, dev2) log2 = relay.log2(_log_left) log10 = relay.log10(_log_right) add = relay.add(log2, log10) - _add = relay.device_copy(add, ctx2, ctx1) + _add = relay.device_copy(add, dev2, dev1) tan = relay.tan(_add) func = run_opt_pass(tan, transform.InferType()) @@ -344,9 +344,9 @@ def expected(): smap = relay.backend._backend.GraphPlanMemory(annotated_expr) for expr, storage_dev_type in smap.items(): - # x is ctx1 as output is ctx1 + # x is dev1 as output is dev1 if isinstance(expr, tvm.relay.expr.Var): - assert storage_dev_type[1][0] == ctx1.device_type + assert storage_dev_type[1][0] == dev1.device_type else: # device_copy op should be its dst_dev_type if isinstance(expr.attrs, tvm.relay.op.op_attrs.DeviceCopyAttrs): @@ -390,32 +390,32 @@ def get_func(): def test_fuse_log_add(device, tgt): """ Only log and add are fused.""" - fallback_device = tvm.context("cpu") + fallback_device = tvm.device("cpu") target = {"cpu": "llvm", device: tgt} - cpu_ctx = fallback_device - dev_ctx = tvm.context(device) + cpu_dev = fallback_device + dev_dev = tvm.device(device) def 
annotated(): add = relay.add(x, y) sqrt = relay.sqrt(add) - _sqrt = relay.annotation.on_device(sqrt, dev_ctx) + _sqrt = relay.annotation.on_device(sqrt, dev_dev) log = relay.log(add) subtract = relay.subtract(_sqrt, log) exp = relay.exp(subtract) - _exp = relay.annotation.on_device(exp, dev_ctx) + _exp = relay.annotation.on_device(exp, dev_dev) func = relay.Function([x, y], _exp) - func = run_opt_pass(func, transform.RewriteAnnotatedOps(cpu_ctx.device_type)) + func = run_opt_pass(func, transform.RewriteAnnotatedOps(cpu_dev.device_type)) return func def expected(): add = relay.add(x, y) - copy_add_sqrt = relay.device_copy(add, cpu_ctx, dev_ctx) + copy_add_sqrt = relay.device_copy(add, cpu_dev, dev_dev) sqrt = relay.sqrt(copy_add_sqrt) log = relay.log(add) - copy_sqrt_subtract = relay.device_copy(sqrt, dev_ctx, cpu_ctx) + copy_sqrt_subtract = relay.device_copy(sqrt, dev_dev, cpu_dev) subtract = relay.subtract(copy_sqrt_subtract, log) - copy_sub_exp = relay.device_copy(subtract, cpu_ctx, dev_ctx) + copy_sub_exp = relay.device_copy(subtract, cpu_dev, dev_dev) exp = relay.exp(copy_sub_exp) func = relay.Function([x, y], exp) @@ -423,13 +423,13 @@ def expected(): annotated_func = annotated() expected_func = expected() - ctx = tvm.context(device, 0) - dev_idx = ctx.device_type + dev = tvm.device(device, 0) + dev_idx = dev.device_type expected_index = [1, 1, 1, dev_idx, dev_idx, 1, 1, dev_idx, dev_idx] check_annotated_graph(annotated_func, expected_func) opt_level = 1 config = {"relay.fallback_device_type": fallback_device.device_type} - check_graph_runtime( + check_graph_executor( target, ref_res, device, annotated_func, params, config, opt_level, expected_index ) opt_level = 2 @@ -439,25 +439,25 @@ def expected(): def test_fuse_all(device, tgt): """Fuse all operators.""" - fallback_device = tvm.context("cpu") + fallback_device = tvm.device("cpu") target = {"cpu": "llvm", device: tgt} - cpu_ctx = fallback_device - dev_ctx = tvm.context(device) + cpu_dev = fallback_device + dev_dev = tvm.device(device) def annotated(): add = relay.add(x, y) - _add = relay.annotation.on_device(add, dev_ctx) + _add = relay.annotation.on_device(add, dev_dev) sqrt = relay.sqrt(_add) - _sqrt = relay.annotation.on_device(sqrt, dev_ctx) + _sqrt = relay.annotation.on_device(sqrt, dev_dev) log = relay.log(_add) - _log = relay.annotation.on_device(log, dev_ctx) + _log = relay.annotation.on_device(log, dev_dev) subtract = relay.subtract(_sqrt, _log) - _subtract = relay.annotation.on_device(subtract, dev_ctx) + _subtract = relay.annotation.on_device(subtract, dev_dev) exp = relay.exp(_subtract) - _exp = relay.annotation.on_device(exp, dev_ctx) + _exp = relay.annotation.on_device(exp, dev_dev) func = relay.Function([x, y], _exp) - func = run_opt_pass(func, transform.RewriteAnnotatedOps(cpu_ctx.device_type)) + func = run_opt_pass(func, transform.RewriteAnnotatedOps(cpu_dev.device_type)) return func annotated_func = annotated() @@ -465,15 +465,15 @@ def annotated(): check_annotated_graph(annotated_func, expected_func) opt_level = 1 config = {"relay.fallback_device_type": fallback_device.device_type} - check_graph_runtime(target, ref_res, device, annotated_func, params, config, opt_level) + check_graph_executor(target, ref_res, device, annotated_func, params, config, opt_level) opt_level = 2 check_vm_runtime(target, ref_res, device, annotated_func, params, config, opt_level) def test_fallback_exp(device, tgt): - fallback_device = tvm.context("cpu") + fallback_device = tvm.device("cpu") target = {"cpu": "llvm", device: tgt} - 
cpu_ctx = fallback_device - dev_ctx = tvm.context(device) + cpu_dev = fallback_device + dev_dev = tvm.device(device) def annotated(): add = relay.add(x, y) @@ -481,10 +481,10 @@ def annotated(): log = relay.log(add) subtract = relay.subtract(sqrt, log) exp = relay.exp(subtract) - _exp = relay.annotation.on_device(exp, cpu_ctx) + _exp = relay.annotation.on_device(exp, cpu_dev) func = relay.Function([x, y], _exp) - func = run_opt_pass(func, transform.RewriteAnnotatedOps(dev_ctx.device_type)) + func = run_opt_pass(func, transform.RewriteAnnotatedOps(dev_dev.device_type)) return func def expected(): @@ -492,7 +492,7 @@ def expected(): sqrt = relay.sqrt(add) log = relay.log(add) subtract = relay.subtract(sqrt, log) - copy_sub_exp = relay.device_copy(subtract, dev_ctx, cpu_ctx) + copy_sub_exp = relay.device_copy(subtract, dev_dev, cpu_dev) exp = relay.exp(copy_sub_exp) func = relay.Function([x, y], exp) @@ -500,13 +500,13 @@ def expected(): annotated_func = annotated() expected_func = expected() - ctx = tvm.context(device, 0) - dev_idx = ctx.device_type + dev = tvm.device(device, 0) + dev_idx = dev.device_type expected_index = [dev_idx, dev_idx, dev_idx, 1, 1] opt_level = 1 config = {"relay.fallback_device_type": fallback_device.device_type} check_annotated_graph(annotated_func, expected_func) - check_graph_runtime( + check_graph_executor( target, ref_res, device, annotated_func, params, config, opt_level, expected_index ) opt_level = 2 @@ -520,7 +520,7 @@ def test_fallback_all_operators(device, tgt): expected_func = get_func() check_annotated_graph(annotated_func, expected_func) opt_level = 2 - check_graph_runtime(target, ref_res, device, annotated_func, params, {}, opt_level) + check_graph_executor(target, ref_res, device, annotated_func, params, {}, opt_level) check_vm_runtime(target, ref_res, device, annotated_func, params, {}, opt_level) test_fuse_log_add(dev, tgt) @@ -550,26 +550,26 @@ def run_unpropagatable_graph(dev, tgt): tmp_mul = np.multiply(c_data, d_data) ref_res = np.subtract(tmp_add, tmp_mul) - fallback_device = tvm.context("cpu") + fallback_device = tvm.device("cpu") target = {"cpu": "llvm", dev: tgt} - cpu_ctx = fallback_device - dev_ctx = tvm.context(dev) + cpu_dev = fallback_device + dev_dev = tvm.device(dev) def annotated(): add = relay.add(a, b) - _add = relay.annotation.on_device(add, dev_ctx) + _add = relay.annotation.on_device(add, dev_dev) mul = relay.multiply(c, d) - _mul = relay.annotation.on_device(mul, cpu_ctx) + _mul = relay.annotation.on_device(mul, cpu_dev) sub = relay.subtract(_add, _mul) - _sub = relay.annotation.on_device(sub, dev_ctx) + _sub = relay.annotation.on_device(sub, dev_dev) func = relay.Function([a, b, c, d], _sub) - func = run_opt_pass(func, transform.RewriteAnnotatedOps(dev_ctx.device_type)) + func = run_opt_pass(func, transform.RewriteAnnotatedOps(dev_dev.device_type)) return func def expected(): add = relay.add(a, b) mul = relay.multiply(c, d) - copy_mul_sub = relay.device_copy(mul, cpu_ctx, dev_ctx) + copy_mul_sub = relay.device_copy(mul, cpu_dev, dev_dev) sub = relay.subtract(add, copy_mul_sub) func = relay.Function([a, b, c, d], sub) return func @@ -582,7 +582,7 @@ def expected(): opt_level = 0 config = {"relay.fallback_device_type": fallback_device.device_type} - check_graph_runtime( + check_graph_executor( target, ref_res, dev, annotated_func, params, config, opt_level, expected_index ) @@ -617,14 +617,14 @@ def test_check_run_cuda(): @tvm.testing.requires_cuda def test_tuple_get_item(): dev = "cuda" - cpu_ctx = tvm.cpu(0) - gpu_ctx = 
tvm.context(dev) + cpu_dev = tvm.cpu(0) + gpu_dev = tvm.device(dev) def expected(): x = relay.var("x", relay.ty.TensorType((3, 3, 4), "float32")) split = relay.op.split(x, 3) - elem0 = relay.device_copy(split[0], gpu_ctx, cpu_ctx) - elem1 = relay.device_copy(split[1], gpu_ctx, cpu_ctx) + elem0 = relay.device_copy(split[0], gpu_dev, cpu_dev) + elem1 = relay.device_copy(split[1], gpu_dev, cpu_dev) sub = elem0 - elem1 func = relay.Function(relay.analysis.free_vars(sub), sub) return func @@ -633,11 +633,11 @@ def annotated(): x = relay.var("x", relay.ty.TensorType((3, 3, 4), "float32")) split = relay.op.split(x, 3) split = split.astuple() - split = relay.annotation.on_device(split, gpu_ctx) + split = relay.annotation.on_device(split, gpu_dev) split = relay.TupleWrapper(split, 3) sub = split[0] - split[1] func = relay.Function(relay.analysis.free_vars(sub), sub) - func = run_opt_pass(func, transform.RewriteAnnotatedOps(cpu_ctx.device_type)) + func = run_opt_pass(func, transform.RewriteAnnotatedOps(cpu_dev.device_type)) return func annotated_func = annotated() diff --git a/tests/python/relay/test_pass_auto_quantize.py b/tests/python/relay/test_pass_auto_quantize.py index 8a7c4cbfbbd6..326416f3c501 100644 --- a/tests/python/relay/test_pass_auto_quantize.py +++ b/tests/python/relay/test_pass_auto_quantize.py @@ -126,7 +126,7 @@ def gen_rand_tvm(tt, low, high): data_np = np.random.uniform(low, high, size=get_const_tuple(tt.shape)).astype(tt.dtype) else: assert False, "unknown dtype" - return tvm.nd.array(data_np, ctx=tvm.cpu(0)) + return tvm.nd.array(data_np, device=tvm.cpu(0)) def verify_partition_fails(mod, params): @@ -155,7 +155,7 @@ def verify_partition(mod, params): params = [gen_rand_tvm(param.type_annotation, 0, 1) for param in partitioned_mod["main"].params] def _eval_mod(mod): - vm = relay.create_executor("vm", ctx=tvm.cpu(0), target="llvm", mod=mod) + vm = relay.create_executor("vm", device=tvm.cpu(0), target="llvm", mod=mod) return vm.evaluate()(*params) partitioned_mod_result = _eval_mod(partitioned_mod) @@ -307,6 +307,39 @@ def @main( verify_partition_fails(mod, params) +def test_left_shift_negative(): + data = relay.var("data", shape=(1, 16, 64, 64)) + weight = relay.const(np.full((16, 16, 3, 3), 256.0)) + conv2d = relay.nn.conv2d(data, weight, kernel_size=(3, 3), padding=(1, 1), channels=16) + relu = relay.nn.relu(conv2d) + + mod = tvm.IRModule.from_expr(relu) + + with tvm.transform.PassContext(opt_level=3): + with relay.quantize.qconfig( + calibrate_mode="global_scale", global_scale=8.0, skip_conv_layers=None + ): + qnn_mod = relay.quantize.quantize(mod) + + class OpFinder(relay.ExprVisitor): + def __init__(self, op_name): + super(OpFinder, self).__init__() + self._op_name = op_name + self.ops = list() + + def visit_call(self, call): + super().visit_call(call) + if call.op.name == self._op_name: + self.ops.append(call) + + opf = OpFinder("left_shift") + opf.visit(qnn_mod["main"]) + assert len(opf.ops) > 0, 'Broken case, can\'t find any "left_shift" operators.' + for left_shift_op in opf.ops: + shift_amount = left_shift_op.args[1].data.asnumpy() + assert shift_amount >= 0, "Shift amount must be non-negative." 
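+            # A negative shift amount would be undefined behavior once the
+            # graph is lowered to C/LLVM, which is what this check guards
+            # against.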
+ + if __name__ == "__main__": test_mul_rewrite() test_batch_flatten_rewrite() @@ -320,3 +353,4 @@ def @main( test_unquantizable_prefix_partition() test_unquantizable_core_partition() test_unquantizable_suffix_partition() + test_left_shift_negative() diff --git a/tests/python/relay/test_pass_dynamic_to_static.py b/tests/python/relay/test_pass_dynamic_to_static.py index c9e047a38540..b9d3a8ef357c 100644 --- a/tests/python/relay/test_pass_dynamic_to_static.py +++ b/tests/python/relay/test_pass_dynamic_to_static.py @@ -37,10 +37,10 @@ def run_opt_pass(expr, opt_pass): def verify_func(func, data, ref_res, rtol=1e-5, atol=1e-7): assert isinstance(data, list) - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): for kind in ["graph", "vm", "debug"]: mod = tvm.ir.IRModule.from_expr(func) - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()(*data) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=rtol, atol=atol) @@ -176,12 +176,12 @@ def verify_topk(k, axis, ret_type, is_ascend, dtype): assert isinstance(zz, relay.Call) assert zz.op == relay.op.get("topk") - for target, ctx in tvm.testing.enabled_targets(): + for target, dev in tvm.testing.enabled_targets(): if "llvm" not in target: continue for kind in ["graph", "vm", "debug"]: mod = tvm.ir.IRModule.from_expr(func2) - intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + intrp = relay.create_executor(kind, mod=mod, device=dev, target=target) op_res = intrp.evaluate()(np_data) if ret_type == "both": tvm.testing.assert_allclose(op_res[0].asnumpy(), np_values) diff --git a/tests/python/relay/test_pass_fold_explicit_padding.py b/tests/python/relay/test_pass_fold_explicit_padding.py index 302a2b91bb8f..a3f82dd6d270 100644 --- a/tests/python/relay/test_pass_fold_explicit_padding.py +++ b/tests/python/relay/test_pass_fold_explicit_padding.py @@ -70,8 +70,8 @@ def validate(ndim, pad_width, pad_value, pad_mode, orig_padding, layout): mod2 = tvm.IRModule.from_expr(zz) with tvm.transform.PassContext(): - ex1 = relay.create_executor("vm", mod=mod1, ctx=tvm.cpu(), target="llvm") - ex2 = relay.create_executor("vm", mod=mod2, ctx=tvm.cpu(), target="llvm") + ex1 = relay.create_executor("vm", mod=mod1, device=tvm.cpu(), target="llvm") + ex2 = relay.create_executor("vm", mod=mod2, device=tvm.cpu(), target="llvm") x_np = np.random.rand(*shape).astype("float32") w_np = np.random.rand(*wshape).astype("float32") result1 = ex1.evaluate()(x_np, w_np) diff --git a/tests/python/relay/test_pass_fuse_ops.py b/tests/python/relay/test_pass_fuse_ops.py index 30ee29525daa..9b7471f9a5ed 100644 --- a/tests/python/relay/test_pass_fuse_ops.py +++ b/tests/python/relay/test_pass_fuse_ops.py @@ -712,7 +712,7 @@ def expected(): orig = before() m = fuse2(tvm.IRModule.from_expr(orig)) - for tgt, ctx in tvm.testing.enabled_targets(): + for tgt, dev in tvm.testing.enabled_targets(): relay.build(m, tgt) after = run_opt_pass(expected(), transform.InferType()) assert tvm.ir.structural_equal(m["main"], after) @@ -775,7 +775,7 @@ def test_fuse_dynamic_squeeze_slice_take(): take = relay.op.take(strided_slice, take_val, axis=0) mod = tvm.IRModule.from_expr(take) - ex = relay.create_executor("vm", mod=mod, ctx=tvm.cpu(), target="llvm") + ex = relay.create_executor("vm", mod=mod, device=tvm.cpu(), target="llvm") result = ex.evaluate()(*input_data) diff --git a/tests/python/relay/test_pass_legalize.py 
b/tests/python/relay/test_pass_legalize.py index 0d14f6611db9..8a37da33a10f 100644 --- a/tests/python/relay/test_pass_legalize.py +++ b/tests/python/relay/test_pass_legalize.py @@ -20,7 +20,7 @@ from tvm import te from tvm import relay -from tvm.contrib import graph_runtime +from tvm.contrib import graph_executor from tvm.relay import transform, analysis from tvm.relay.testing.temp_op_attr import TempOpAttr diff --git a/tests/python/relay/test_pass_legalize_tensorcore.py b/tests/python/relay/test_pass_legalize_tensorcore.py index 5ecda4ba07a8..f45e39047238 100644 --- a/tests/python/relay/test_pass_legalize_tensorcore.py +++ b/tests/python/relay/test_pass_legalize_tensorcore.py @@ -20,7 +20,7 @@ from tvm import te from tvm import topi from tvm import relay -from tvm.contrib import graph_runtime +from tvm.contrib import graph_executor from tvm.relay import transform, analysis from tvm.relay.testing.temp_op_attr import TempOpAttr diff --git a/tests/python/relay/test_pass_manager.py b/tests/python/relay/test_pass_manager.py index 7e2282809f76..5a29d1acd171 100644 --- a/tests/python/relay/test_pass_manager.py +++ b/tests/python/relay/test_pass_manager.py @@ -178,9 +178,9 @@ def test_pass_run(): x_nd = get_rand(shape, dtype) y_nd = get_rand(shape, dtype) ref_res = x_nd.asnumpy() + y_nd.asnumpy() - for target, ctx in tvm.testing.enabled_targets(): - exe1 = relay.create_executor("graph", ctx=ctx, target=target) - exe2 = relay.create_executor("debug", ctx=ctx, target=target) + for target, dev in tvm.testing.enabled_targets(): + exe1 = relay.create_executor("graph", device=dev, target=target) + exe2 = relay.create_executor("debug", device=dev, target=target) res1 = exe1.evaluate(new_add)(x_nd, y_nd) tvm.testing.assert_allclose(res1.asnumpy(), ref_res, rtol=1e-5) res2 = exe2.evaluate(new_add)(x_nd, y_nd) @@ -275,9 +275,9 @@ def test_pass_run(): # Execute the add function. x_nd = get_rand(shape, dtype) ref_res = np.log(x_nd.asnumpy() * 2) - for target, ctx in tvm.testing.enabled_targets(): - exe1 = relay.create_executor("graph", ctx=ctx, target=target) - exe2 = relay.create_executor("debug", ctx=ctx, target=target) + for target, dev in tvm.testing.enabled_targets(): + exe1 = relay.create_executor("graph", device=dev, target=target) + exe2 = relay.create_executor("debug", device=dev, target=target) res1 = exe1.evaluate(new_log)(x_nd) tvm.testing.assert_allclose(res1.asnumpy(), ref_res, rtol=1e-5) res2 = exe2.evaluate(new_log)(x_nd) @@ -437,9 +437,9 @@ def test_multiple_passes(): x_nd = get_rand(shape, dtype) y_nd = get_rand(shape, dtype) ref_res = np.subtract(x_nd.asnumpy() * 2, y_nd.asnumpy() * 2) - for target, ctx in tvm.testing.enabled_targets(): - exe1 = relay.create_executor("graph", ctx=ctx, target=target) - exe2 = relay.create_executor("debug", ctx=ctx, target=target) + for target, dev in tvm.testing.enabled_targets(): + exe1 = relay.create_executor("graph", device=dev, target=target) + exe2 = relay.create_executor("debug", device=dev, target=target) res1 = exe1.evaluate(new_sub)(x_nd, y_nd) tvm.testing.assert_allclose(res1.asnumpy(), ref_res, rtol=1e-5) res2 = exe2.evaluate(new_sub)(x_nd, y_nd) @@ -448,9 +448,9 @@ def test_multiple_passes(): # Execute the updated abs function. 
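+    # The pass pipeline doubled the input (x * 2) before abs, so
+    # np.abs(x_nd * 2) below is the correct numpy reference.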
x_nd = get_rand((5, 10), dtype) ref_res = np.abs(x_nd.asnumpy() * 2) - for target, ctx in tvm.testing.enabled_targets(): - exe1 = relay.create_executor("graph", ctx=ctx, target=target) - exe2 = relay.create_executor("debug", ctx=ctx, target=target) + for target, dev in tvm.testing.enabled_targets(): + exe1 = relay.create_executor("graph", device=dev, target=target) + exe2 = relay.create_executor("debug", device=dev, target=target) res1 = exe1.evaluate(new_abs)(x_nd) tvm.testing.assert_allclose(res1.asnumpy(), ref_res, rtol=1e-5) res2 = exe2.evaluate(new_abs)(x_nd) diff --git a/tests/python/relay/test_pass_partial_eval.py b/tests/python/relay/test_pass_partial_eval.py index 45749c31f38f..57286670f06d 100644 --- a/tests/python/relay/test_pass_partial_eval.py +++ b/tests/python/relay/test_pass_partial_eval.py @@ -30,8 +30,8 @@ def check_eval(expr, expected_result, mod=None, rtol=1e-07): - ctx = tvm.context("llvm", 0) - intrp = create_executor(mod=mod, ctx=ctx, target="llvm") + dev = tvm.device("llvm", 0) + intrp = create_executor(mod=mod, device=dev, target="llvm") result = intrp.evaluate(expr) np.testing.assert_allclose(result.asnumpy(), expected_result, rtol=rtol) diff --git a/tests/python/relay/test_pass_partition_graph.py b/tests/python/relay/test_pass_partition_graph.py index d8f674eeff34..01a1e48f832a 100644 --- a/tests/python/relay/test_pass_partition_graph.py +++ b/tests/python/relay/test_pass_partition_graph.py @@ -43,7 +43,7 @@ def __init__(self, op_list, compiler): self.op_list = op_list self.compiler = compiler - def transform_function(self, func, mod, ctx): + def transform_function(self, func, mod, dev): annotator = self @@ -173,7 +173,7 @@ def visit_call(self, call): def check_result( - mod, map_inputs, out_shape, result, tol=1e-5, target="llvm", ctx=tvm.cpu(), params=None + mod, map_inputs, out_shape, result, tol=1e-5, target="llvm", device=tvm.cpu(), params=None ): if sys.platform == "win32": print("Skip test on Windows for now") @@ -201,19 +201,19 @@ def check_vm_result(): code, lib = exe.save() lib = update_lib(lib) exe = runtime.vm.Executable.load_exec(code, lib) - vm = runtime.vm.VirtualMachine(exe, ctx) + vm = runtime.vm.VirtualMachine(exe, device) outs = vm.run(**map_inputs) outs = outs if isinstance(outs, runtime.container.ADT) else [outs] results = result if isinstance(result, list) else [result] for out, ref in zip(outs, results): tvm.testing.assert_allclose(out.asnumpy(), ref, rtol=tol, atol=tol) - def check_graph_runtime_result(): + def check_graph_executor_result(): compile_engine.get().clear() with tvm.transform.PassContext(opt_level=3): json, lib, param = relay.build(mod, target=target, params=params) lib = update_lib(lib) - rt_mod = tvm.contrib.graph_runtime.create(json, lib, ctx) + rt_mod = tvm.contrib.graph_executor.create(json, lib, device) for name, data in map_inputs.items(): rt_mod.set_input(name, data) @@ -224,12 +224,12 @@ def check_graph_runtime_result(): results = result if isinstance(result, list) else [result] for idx, shape in enumerate(out_shapes): - out = tvm.nd.empty(shape, ctx=ctx) + out = tvm.nd.empty(shape, device=device) out = rt_mod.get_output(idx, out) tvm.testing.assert_allclose(out.asnumpy(), results[idx], rtol=tol, atol=tol) check_vm_result() - check_graph_runtime_result() + check_graph_executor_result() def test_multi_node_compiler(): @@ -295,7 +295,7 @@ def test_multi_node_compiler(): def test_extern_ccompiler_single_op(): @transform.function_pass(opt_level=0) class MyAnnotator: - def transform_function(self, func, mod, ctx): + def 
transform_function(self, func, mod, dev): class Annotator(tvm.relay.ExprMutator): def visit_call(self, call): new_args = [] @@ -456,7 +456,7 @@ def get_func(): i_data = np.random.uniform(0, 1, ishape).astype(dtype) w1_data = np.random.uniform(0, 1, w1shape).astype(dtype) - ref_ex = relay.create_executor("graph", mod=ref_mod, ctx=tvm.cpu()) + ref_ex = relay.create_executor("graph", mod=ref_mod, device=tvm.cpu()) ref_res = ref_ex.evaluate()(i_data, w1_data) check_result( mod, {"data": i_data, "weight1": w1_data}, (1, 32, 14, 14), ref_res.asnumpy(), tol=1e-5 @@ -476,7 +476,7 @@ def test_extern_dnnl_mobilenet(): mod = transform.PartitionGraph()(mod) i_data = np.random.uniform(0, 1, ishape).astype(dtype) - ref_ex = relay.create_executor("graph", mod=ref_mod, ctx=tvm.cpu(0)) + ref_ex = relay.create_executor("graph", mod=ref_mod, device=tvm.cpu(0)) ref_res = ref_ex.evaluate()(i_data, **params) compile_engine.get().clear() @@ -917,7 +917,7 @@ def test_partition_mobilenet(): def test_exec(mod, params, ref_mod, ref_params, out_shape): ishape = (1, 3, 224, 224) i_data = np.random.randn(*ishape).astype(np.float32) - ref_ex = relay.create_executor("graph", mod=ref_mod, ctx=tvm.cpu(0)) + ref_ex = relay.create_executor("graph", mod=ref_mod, device=tvm.cpu(0)) ref_res = ref_ex.evaluate()(i_data, **ref_params) compile_engine.get().clear() diff --git a/tests/python/relay/test_pass_qnn_legalize.py b/tests/python/relay/test_pass_qnn_legalize.py index 6a5c8f7cd647..a30cd1e73e3f 100644 --- a/tests/python/relay/test_pass_qnn_legalize.py +++ b/tests/python/relay/test_pass_qnn_legalize.py @@ -20,7 +20,7 @@ from tvm import te from tvm import relay -from tvm.contrib import graph_runtime +from tvm.contrib import graph_executor from tvm.relay import transform, analysis from tvm.relay.testing.temp_op_attr import TempOpAttr diff --git a/tests/python/relay/test_pass_simplify_expr.py b/tests/python/relay/test_pass_simplify_expr.py index 9531d896b2ed..897f90b9ee2a 100644 --- a/tests/python/relay/test_pass_simplify_expr.py +++ b/tests/python/relay/test_pass_simplify_expr.py @@ -60,6 +60,63 @@ def symbolic(): assert tvm.ir.structural_equal(zz, after) +def test_simplify_transpose(): + # Test a series of transpose and layout_transform ops + def before1(): + x = relay.var("x", shape=(1, 3, 224, 224), dtype="float32") # NCHW + y = relay.transpose(x, axes=[0, 2, 3, 1]) # To NHWC + y = relay.layout_transform(y, "NHWC", "HWCN") # To HWCN + y = relay.transpose(y, axes=[3, 0, 1, 2]) # To NHWC + return relay.Function([x], y) + + def expected1(): + x = relay.var("x", shape=(1, 3, 224, 224), dtype="float32") # NCHW + y = relay.transpose(x, axes=[0, 2, 3, 1]) # To NHWC + return relay.Function([x], y) + + # Test that all transpose ops can be cancelled + def before2(): + x = relay.var("x", shape=(1, 3, 224, 224), dtype="float32") # NCHW + y = relay.nn.relu(x) + y = relay.transpose(y, axes=[0, 2, 3, 1]) # To NHWC + y = relay.transpose(y, axes=[1, 2, 3, 0]) # To HWCN + y = relay.transpose(y, axes=[3, 2, 0, 1]) # To NCHW + return relay.Function([x], y) + + def expected2(): + x = relay.var("x", shape=(1, 3, 224, 224), dtype="float32") # NCHW + y = relay.nn.relu(x) + return relay.Function([x], y) + + # Test default axis (reverse) and negative axis + def before3(): + x = relay.var("x", shape=(1, 3, 224, 224), dtype="float32") # NCHW + y = relay.nn.relu(x) + y = relay.transpose(y) # Reverse + y = relay.transpose(y) # Reverse + y = relay.transpose(y, axes=[0, 2, -1, 1]) + y = relay.transpose(y) # Reverse + y = relay.transpose(y) # Reverse + return 
relay.Function([x], y) + + def expected3(): + x = relay.var("x", shape=(1, 3, 224, 224), dtype="float32") # NCHW + y = relay.nn.relu(x) + y = relay.transpose(y, axes=[0, 2, 3, 1]) + return relay.Function([x], y) + + for before, expected in [ + [before1(), expected1()], + [before2(), expected2()], + [before3(), expected3()], + ]: + after = run_opt_pass(before, transform.SimplifyExpr()) + expected = run_opt_pass(expected, transform.InferType()) + assert tvm.ir.structural_equal(after, expected), "\nafter: {} \nexpected: {}".format( + after, expected + ) + + def test_simplify_full_elementwise(): def validate(shape, value, dtype): def before_left(x, elem_op, full): @@ -126,4 +183,5 @@ def after_right(x, elem_op, value): if __name__ == "__main__": test_simplify_reshape() + test_simplify_transpose() test_simplify_full_elementwise() diff --git a/tests/python/relay/test_pass_to_a_normal_form.py b/tests/python/relay/test_pass_to_a_normal_form.py index 72325e537c0e..e7aee5fae00b 100644 --- a/tests/python/relay/test_pass_to_a_normal_form.py +++ b/tests/python/relay/test_pass_to_a_normal_form.py @@ -36,8 +36,8 @@ def run_opt_pass(expr, passes): def check_eval(expr, expected_result, mod=None, rtol=1e-07): - ctx = tvm.context("llvm", 0) - intrp = create_executor(mod=mod, ctx=ctx, target="llvm") + dev = tvm.device("llvm", 0) + intrp = create_executor(mod=mod, device=dev, target="llvm") result = intrp.evaluate(expr) np.testing.assert_allclose(result.asnumpy(), expected_result, rtol=rtol) @@ -149,8 +149,8 @@ def test_nat_add(): p.mod.import_from_std("nat.rly") nat, z, s = p.mod.get_type("nat") add = p.mod.get_global_var("nat_add") - ctx = tvm.context("llvm", 0) - intrp = create_executor(mod=mod, ctx=ctx, target="llvm") + dev = tvm.device("llvm", 0) + intrp = create_executor(mod=mod, device=dev, target="llvm") assert mod[add].checked_type == relay.FuncType([nat(), nat()], nat()) assert count(p, intrp.evaluate(add(s(z()), s(z())))) == 2 expr = add(s(z()), s(z())) diff --git a/tests/python/relay/test_pass_to_basic_block_normal_form.py b/tests/python/relay/test_pass_to_basic_block_normal_form.py index a52d51ad4960..2085c8a2799c 100644 --- a/tests/python/relay/test_pass_to_basic_block_normal_form.py +++ b/tests/python/relay/test_pass_to_basic_block_normal_form.py @@ -38,8 +38,8 @@ def run_opt_pass(expr, passes): def check_eval(expr, expected_result, mod=None, rtol=1e-07): - ctx = tvm.context("llvm", 0) - intrp = create_executor(mod=mod, ctx=ctx, target="llvm") + dev = tvm.device("llvm", 0) + intrp = create_executor(mod=mod, device=dev, target="llvm") result = intrp.evaluate(expr) np.testing.assert_allclose(result.asnumpy(), expected_result, rtol=rtol) @@ -266,8 +266,8 @@ def test_nat_add(): p.mod.import_from_std("nat.rly") nat, z, s = p.mod.get_type("nat") add = p.mod.get_global_var("nat_add") - ctx = tvm.context("llvm", 0) - intrp = create_executor(mod=mod, ctx=ctx, target="llvm") + dev = tvm.device("llvm", 0) + intrp = create_executor(mod=mod, device=dev, target="llvm") assert mod[add].checked_type == relay.FuncType([nat(), nat()], nat()) assert count(p, intrp.evaluate(add(s(z()), s(z())))) == 2 expr = add(s(z()), s(z())) diff --git a/tests/python/relay/test_pass_to_graph_normal_form.py b/tests/python/relay/test_pass_to_graph_normal_form.py index 88d6829bd16e..f4c1a32b2566 100644 --- a/tests/python/relay/test_pass_to_graph_normal_form.py +++ b/tests/python/relay/test_pass_to_graph_normal_form.py @@ -33,8 +33,8 @@ def check_eval(expr, args, expected_result, mod=None, rtol=1e-07): if mod is None: mod = 
tvm.IRModule() - ctx = tvm.context("llvm", 0) - intrp = create_executor(mod=mod, ctx=ctx, target="llvm") + dev = tvm.device("llvm", 0) + intrp = create_executor(mod=mod, device=dev, target="llvm") result = intrp.evaluate(expr)(*args) np.testing.assert_allclose(result.asnumpy(), expected_result, rtol=rtol) diff --git a/tests/python/relay/test_prng.py b/tests/python/relay/test_prng.py index 2109d3b30a82..ba4fdc466ecc 100644 --- a/tests/python/relay/test_prng.py +++ b/tests/python/relay/test_prng.py @@ -22,18 +22,17 @@ @tvm.testing.parametrize_targets -def test_threefry_repeatability(target, ctx): - target, ctx = "llvm", tvm.cpu(0) +def test_threefry_repeatability(target, dev): key1 = tvm.relay.random.threefry_key(1) rand1 = tvm.relay.random.threefry_generate(key1, (12,)) out_key1, out1 = tvm.relay.create_executor( - "vm", tvm.IRModule.from_expr(tvm.relay.Function([], rand1)), target=target, ctx=ctx + "vm", tvm.IRModule.from_expr(tvm.relay.Function([], rand1)), target=target, device=dev ).evaluate()() key2 = tvm.relay.random.threefry_key(1) rand2 = tvm.relay.random.threefry_generate(key2, (12,)) out_key2, out2 = tvm.relay.create_executor( - "vm", tvm.IRModule.from_expr(tvm.relay.Function([], rand2)), target=target, ctx=ctx + "vm", tvm.IRModule.from_expr(tvm.relay.Function([], rand2)), target=target, device=dev ).evaluate()() assert ( @@ -46,7 +45,7 @@ def test_threefry_repeatability(target, ctx): @tvm.testing.parametrize_targets -def test_threefry_split(target, ctx): +def test_threefry_split(target, dev): key = tvm.relay.random.threefry_key(1) left, right = tvm.relay.TupleWrapper(tvm.relay.random.threefry_split(key), 2) _, rand1 = tvm.relay.TupleWrapper(tvm.relay.random.threefry_generate(left, (16,)), 2) @@ -55,7 +54,7 @@ def test_threefry_split(target, ctx): "vm", tvm.IRModule.from_expr(tvm.relay.Function([], tvm.relay.Tuple((rand1, rand2)))), target=target, - ctx=ctx, + device=dev, ).evaluate()() assert ( @@ -64,7 +63,7 @@ def test_threefry_split(target, ctx): @tvm.testing.parametrize_targets -def test_threefry_sequential_generate(target, ctx): +def test_threefry_sequential_generate(target, dev): key = tvm.relay.random.threefry_key(1) key, rand1 = tvm.relay.TupleWrapper(tvm.relay.random.threefry_generate(key, (4,)), 2) _, rand2 = tvm.relay.TupleWrapper(tvm.relay.random.threefry_generate(key, (4,)), 2) @@ -72,7 +71,7 @@ def test_threefry_sequential_generate(target, ctx): "vm", tvm.IRModule.from_expr(tvm.relay.Function([], tvm.relay.Tuple((rand1, rand2)))), target=target, - ctx=ctx, + device=dev, ).evaluate()() assert ( @@ -132,11 +131,11 @@ def test_threefry_generate_incorrect_out_size(): "vm", tvm.IRModule.from_expr(tvm.relay.Function([], rand1)), target=tvm.target.Target("llvm"), - ctx=tvm.context("cpu"), + device=tvm.device("cpu"), ).evaluate()() if __name__ == "__main__": - test_threefry_repeatability(tvm.target.Target("llvm"), tvm.context("cpu")) - test_threefry_split(tvm.target.Target("llvm"), tvm.context("cpu")) - test_threefry_sequential_generate(tvm.target.Target("llvm"), tvm.context("cpu")) + test_threefry_repeatability(tvm.target.Target("llvm"), tvm.device("cpu")) + test_threefry_split(tvm.target.Target("llvm"), tvm.device("cpu")) + test_threefry_sequential_generate(tvm.target.Target("llvm"), tvm.device("cpu")) diff --git a/tests/python/relay/test_simplify_fc_transpose.py b/tests/python/relay/test_simplify_fc_transpose.py index d5d195d3ff1d..fa5f332e6cd5 100644 --- a/tests/python/relay/test_simplify_fc_transpose.py +++ b/tests/python/relay/test_simplify_fc_transpose.py @@ -31,11 
+31,11 @@ def run_func(func, params, x): with tvm.transform.PassContext(opt_level=3): lib = relay.build(func, "llvm", params=params) - from tvm.contrib import graph_runtime + from tvm.contrib import graph_executor - ctx = tvm.cpu(0) + dev = tvm.cpu(0) dtype = "float32" - m = graph_runtime.GraphModule(lib["default"](ctx)) + m = graph_executor.GraphModule(lib["default"](dev)) # set inputs m.set_input("data", tvm.nd.array(x.astype(dtype))) # execute diff --git a/tests/python/relay/test_sparse_dense_convert.py b/tests/python/relay/test_sparse_dense_convert.py index e3644e9704fc..1efa813ebfb0 100644 --- a/tests/python/relay/test_sparse_dense_convert.py +++ b/tests/python/relay/test_sparse_dense_convert.py @@ -52,11 +52,11 @@ def run_func(func, params, x): with tvm.transform.PassContext(opt_level=3): graph, lib, new_params = relay.build(func, "llvm", params=params) - from tvm.contrib import graph_runtime + from tvm.contrib import graph_executor - ctx = tvm.cpu(0) + dev = tvm.cpu(0) dtype = "float32" - m = graph_runtime.create(graph, lib, ctx) + m = graph_executor.create(graph, lib, dev) # set inputs m.set_input("data", tvm.nd.array(x.astype(dtype))) m.set_input(**new_params) diff --git a/tests/python/relay/test_tensor_array.py b/tests/python/relay/test_tensor_array.py index 76e9d4a6d8a0..3b950b45f1e2 100644 --- a/tests/python/relay/test_tensor_array.py +++ b/tests/python/relay/test_tensor_array.py @@ -60,11 +60,10 @@ def vmobj_to_list(mod, o, dtype="float32"): def check_tensor_array(ta_mod, ref_res, *args, dtype="float32", rtol=1e-5): for kind in ["debug", "vm"]: - for target, ctx in [("llvm", tvm.cpu(0))]: # testing.enabled_targets(): - # for target, ctx in testing.enabled_targets(): - if kind == "debug" and ctx.device_type != tvm.cpu().device_type: + for target, dev in [("llvm", tvm.cpu(0))]: # testing.enabled_targets(): + if kind == "debug" and dev.device_type != tvm.cpu().device_type: continue - ex = relay.create_executor(kind, mod=ta_mod, ctx=ctx, target=target) + ex = relay.create_executor(kind, mod=ta_mod, device=dev, target=target) result = ex.evaluate()(*args) got = vmobj_to_list(ta_mod, result, dtype) tvm.testing.assert_allclose(ref_res, got, rtol=rtol, atol=rtol) diff --git a/tests/python/relay/test_vm.py b/tests/python/relay/test_vm.py index 975070ad1aaa..4ecd0d9189ea 100644 --- a/tests/python/relay/test_vm.py +++ b/tests/python/relay/test_vm.py @@ -40,13 +40,13 @@ def check_result(args, expected_result, mod=None): expected_result: The expected result of running the expression. 
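+    mod: The module containing the expression to evaluate; a fresh module is
+        created by the executor when it is None.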
""" - for target, ctx in tvm.testing.enabled_targets(): - vm = relay.create_executor("vm", ctx=ctx, target=target, mod=mod) + for target, dev in tvm.testing.enabled_targets(): + vm = relay.create_executor("vm", device=dev, target=target, mod=mod) rts_result = vm.evaluate()(*args) tvm.testing.assert_allclose(expected_result, rts_result.asnumpy()) -def veval(f, *args, ctx=tvm.cpu(), target="llvm"): +def veval(f, *args, device=tvm.cpu(), target="llvm"): if isinstance(f, relay.Expr): mod = tvm.IRModule() mod["main"] = f @@ -54,7 +54,7 @@ def veval(f, *args, ctx=tvm.cpu(), target="llvm"): assert isinstance(f, tvm.IRModule), "expected expression or module" mod = f exe = relay.vm.compile(mod, target) - vm = runtime.vm.VirtualMachine(exe, ctx) + vm = runtime.vm.VirtualMachine(exe, device) return vm.invoke("main", *args) @@ -80,8 +80,8 @@ def test_split(): 12, ).astype("float32") ref_res = np.split(x_data, 3, axis=0) - for tgt, ctx in tvm.testing.enabled_targets(): - res = veval(f, x_data, ctx=ctx, target=tgt) + for tgt, dev in tvm.testing.enabled_targets(): + res = veval(f, x_data, device=dev, target=tgt) for i in range(3): tvm.testing.assert_allclose(res[i].asnumpy(), ref_res[i]) @@ -96,8 +96,8 @@ def test_split_no_fuse(): x_data = np.random.rand( 12, ).astype("float32") - for tgt, ctx in tvm.testing.enabled_targets(): - res = veval(f, x_data, ctx=ctx, target=tgt) + for tgt, dev in tvm.testing.enabled_targets(): + res = veval(f, x_data, device=dev, target=tgt) tvm.testing.assert_allclose(res.asnumpy(), np.split(x_data, 3, axis=0)[0]) @@ -176,8 +176,8 @@ def test_multiple_ifs(): out = relay.Let(v0, relay.Tuple([relay.const(0)]), out) fn = relay.Function([b], out) mod["main"] = fn - ctx = tvm.runtime.ndarray.context("llvm", 0) - vm = relay.create_executor(ctx=ctx, mod=mod, kind="vm") + dev = tvm.runtime.device("llvm", 0) + vm = relay.create_executor(device=dev, mod=mod, kind="vm") res = vmobj_to_list(vm.evaluate()(False)) assert res == [1, 0] @@ -214,8 +214,8 @@ def test_count_loop(): i_data = np.array(0, dtype="int32") iarg = relay.var("i", shape=[], dtype="int32") mod["main"] = relay.Function([iarg], sum_up(iarg)) - for tgt, ctx in tvm.testing.enabled_targets(): - result = veval(mod, i_data, ctx=ctx, target=tgt) + for tgt, dev in tvm.testing.enabled_targets(): + result = veval(mod, i_data, device=dev, target=tgt) tvm.testing.assert_allclose(result.asnumpy(), i_data) check_result([i_data], i_data, mod=mod) @@ -283,8 +283,8 @@ def test_list_constructor(): mod["main"] = f - for tgt, ctx in tvm.testing.enabled_targets(): - result = veval(mod, ctx=ctx, target=tgt) + for tgt, dev in tvm.testing.enabled_targets(): + result = veval(mod, device=dev, target=tgt) assert len(result) == 2 assert len(result[1]) == 2 @@ -361,8 +361,8 @@ def test_compose(): mod["main"] = f x_data = np.array(np.random.rand()).astype("float32") - for tgt, ctx in tvm.testing.enabled_targets(): - result = veval(mod, [x_data], ctx=ctx, target=tgt) + for tgt, dev in tvm.testing.enabled_targets(): + result = veval(mod, [x_data], device=dev, target=tgt) tvm.testing.assert_allclose(result.asnumpy(), x_data + 2.0) @@ -382,8 +382,8 @@ def test_list_hd(): mod["main"] = f - for tgt, ctx in tvm.testing.enabled_targets(): - result = veval(mod, ctx=ctx, target=tgt) + for tgt, dev in tvm.testing.enabled_targets(): + result = veval(mod, device=dev, target=tgt) tvm.testing.assert_allclose(result.asnumpy(), 3) @@ -399,8 +399,8 @@ def test_list_tl_empty_list(): mod["main"] = f - for tgt, ctx in tvm.testing.enabled_targets(): - result = veval(mod, 
ctx=ctx, target=tgt) + for tgt, dev in tvm.testing.enabled_targets(): + result = veval(mod, device=dev, target=tgt) @tvm.testing.uses_gpu @@ -419,8 +419,8 @@ def test_list_tl(): mod["main"] = f - for tgt, ctx in tvm.testing.enabled_targets(): - result = veval(mod, ctx=ctx, target=tgt) + for tgt, dev in tvm.testing.enabled_targets(): + result = veval(mod, device=dev, target=tgt) tvm.testing.assert_allclose(vmobj_to_list(result), np.array([2, 1])) @@ -441,8 +441,8 @@ def test_list_nth(): f = relay.Function([], nth(l, relay.const(i))) mod["main"] = f - for tgt, ctx in tvm.testing.enabled_targets(): - result = veval(mod, ctx=ctx, target=tgt) + for tgt, dev in tvm.testing.enabled_targets(): + result = veval(mod, device=dev, target=tgt) tvm.testing.assert_allclose(result.asnumpy(), expected[i]) @@ -467,8 +467,8 @@ def test_list_update(): f = relay.Function([], l) mod["main"] = f - for tgt, ctx in tvm.testing.enabled_targets(): - result = veval(mod, ctx=ctx, target=tgt) + for tgt, dev in tvm.testing.enabled_targets(): + result = veval(mod, device=dev, target=tgt) tvm.testing.assert_allclose(vmobj_to_list(result), np.array(expected)) @@ -491,8 +491,8 @@ def test_list_length(): f = relay.Function([], l) mod["main"] = f - for tgt, ctx in tvm.testing.enabled_targets(): - result = veval(mod, ctx=ctx, target=tgt) + for tgt, dev in tvm.testing.enabled_targets(): + result = veval(mod, device=dev, target=tgt) tvm.testing.assert_allclose(result.asnumpy(), 10) @@ -511,8 +511,8 @@ def test_list_map(): f = relay.Function([], map(add_one_func, l)) mod["main"] = f - for tgt, ctx in tvm.testing.enabled_targets(): - result = veval(mod, ctx=ctx, target=tgt) + for tgt, dev in tvm.testing.enabled_targets(): + result = veval(mod, device=dev, target=tgt) tvm.testing.assert_allclose(vmobj_to_list(result), np.array([3, 2])) @@ -531,8 +531,8 @@ def test_list_foldl(): l = cons(relay.const(1), cons(relay.const(2), cons(relay.const(3), nil()))) f = relay.Function([], foldl(rev_dup_func, nil(), l)) mod["main"] = f - for tgt, ctx in tvm.testing.enabled_targets(): - result = veval(mod, ctx=ctx, target=tgt) + for tgt, dev in tvm.testing.enabled_targets(): + result = veval(mod, device=dev, target=tgt) tvm.testing.assert_allclose(vmobj_to_list(result), np.array([3, 3, 2, 2, 1, 1])) @@ -551,8 +551,8 @@ def test_list_foldr(): l = cons(relay.const(1), cons(relay.const(2), cons(relay.const(3), nil()))) f = relay.Function([], foldr(identity_func, nil(), l)) mod["main"] = f - for tgt, ctx in tvm.testing.enabled_targets(): - result = veval(mod, ctx=ctx, target=tgt) + for tgt, dev in tvm.testing.enabled_targets(): + result = veval(mod, device=dev, target=tgt) tvm.testing.assert_allclose(vmobj_to_list(result), np.array([1, 2, 3])) @@ -567,8 +567,8 @@ def test_list_sum(): l = cons(relay.const(1), cons(relay.const(2), cons(relay.const(3), nil()))) f = relay.Function([], sum(l)) mod["main"] = f - for tgt, ctx in tvm.testing.enabled_targets(): - result = veval(mod, ctx=ctx, target=tgt) + for tgt, dev in tvm.testing.enabled_targets(): + result = veval(mod, device=dev, target=tgt) tvm.testing.assert_allclose(result.asnumpy(), 6) @@ -590,8 +590,8 @@ def test_list_filter(): ) f = relay.Function([], filter(greater_than_one, l)) mod["main"] = f - for tgt, ctx in tvm.testing.enabled_targets(): - result = veval(mod, ctx=ctx, target=tgt) + for tgt, dev in tvm.testing.enabled_targets(): + result = veval(mod, device=dev, target=tgt) tvm.testing.assert_allclose(vmobj_to_list(result), np.array([3, 5])) @@ -603,8 +603,8 @@ def test_closure(): ff = 
relay.Function([y], f) clo = ff(relay.const(1.0)) main = clo(relay.const(2.0)) - for tgt, ctx in tvm.testing.enabled_targets(): - res = veval(main, ctx=ctx, target=tgt) + for tgt, dev in tvm.testing.enabled_targets(): + res = veval(main, device=dev, target=tgt) tvm.testing.assert_allclose(res.asnumpy(), 3.0) @@ -769,8 +769,8 @@ def test_vm_reshape_tuple(x_shape=(1, 4, 2), y_shape=(1, 2, 10)): x_data = np.random.uniform(size=x_shape).astype("float32") y_data = np.random.uniform(size=y_shape).astype("float32") - for tgt, ctx in tvm.testing.enabled_targets(): - res = veval(f, (x_data, y_data), ctx=ctx, target=tgt) + for tgt, dev in tvm.testing.enabled_targets(): + res = veval(f, (x_data, y_data), device=dev, target=tgt) tvm.testing.assert_allclose(res.asnumpy(), np.reshape(x_data, (1, -1))) diff --git a/tests/python/relay/test_vm_serialization.py b/tests/python/relay/test_vm_serialization.py index b2a695dc5434..36f97f815e6e 100644 --- a/tests/python/relay/test_vm_serialization.py +++ b/tests/python/relay/test_vm_serialization.py @@ -43,29 +43,29 @@ def create_exec(f, target="llvm", params=None): return executable -def get_serialized_output(mod, *data, params=None, target="llvm", ctx=tvm.cpu()): +def get_serialized_output(mod, *data, params=None, target="llvm", device=tvm.cpu()): exe = create_exec(mod, target, params=params) code, lib = exe.save() des_exec = _vm.Executable.load_exec(code, lib) - des_vm = _vm.VirtualMachine(des_exec, ctx) + des_vm = _vm.VirtualMachine(des_exec, device) result = des_vm.run(*data) return result def run_network(mod, params, dtype="float32"): - def get_vm_output(mod, data, params, target, ctx, dtype="float32"): - ex = relay.create_executor("vm", mod=mod, ctx=ctx) + def get_vm_output(mod, data, params, target, device, dtype="float32"): + ex = relay.create_executor("vm", mod=mod, device=device) result = ex.evaluate()(data, **params) return result.asnumpy().astype(dtype) data_shape = [int(x) for x in mod["main"].checked_type.arg_types[0].shape] data = np.random.uniform(size=data_shape).astype(dtype) target = "llvm" - ctx = tvm.cpu(0) + dev = tvm.cpu(0) - tvm_out = get_vm_output(mod, tvm.nd.array(data.astype(dtype)), params, target, ctx, dtype) + tvm_out = get_vm_output(mod, tvm.nd.array(data.astype(dtype)), params, target, dev, dtype) vm_out = get_serialized_output( - mod, tvm.nd.array(data.astype(dtype)), params=params, target=target, ctx=ctx + mod, tvm.nd.array(data.astype(dtype)), params=params, target=target, device=dev ) tvm.testing.assert_allclose(vm_out.asnumpy().astype(dtype), tvm_out, rtol=1e-5, atol=1e-5) @@ -311,8 +311,8 @@ def test_dynamic_bcast(): x_data = np.random.uniform(size=(1, 2)).astype(dtype) y_data = np.random.uniform(size=(3, 2)).astype(dtype) res_np = np.add(x_data, y_data) - for target, ctx in testing.enabled_targets(): - res = get_serialized_output(mod, *(x_data, y_data), target=target, ctx=ctx) + for target, dev in testing.enabled_targets(): + res = get_serialized_output(mod, *(x_data, y_data), target=target, device=dev) tvm.testing.assert_allclose(res.asnumpy(), res_np) diff --git a/tests/python/topi/python/test_fifo_buffer.py b/tests/python/topi/python/test_fifo_buffer.py index 458fabf4e1f3..c2a4f8e7dd84 100644 --- a/tests/python/topi/python/test_fifo_buffer.py +++ b/tests/python/topi/python/test_fifo_buffer.py @@ -46,22 +46,22 @@ def get_ref_data(): # Get the test data buffer_np, data_np, out_np = get_ref_data() - def check_device(device, ctx): - print(" Running on target: {}".format(device)) + def check_device(target, dev): + print(" 
Running on target: {}".format(target)) - with tvm.target.Target(device): + with tvm.target.Target(target): out = topi.nn.fifo_buffer(data, buffer, axis=axis) - s = tvm.topi.testing.get_injective_schedule(device)([out]) + s = tvm.topi.testing.get_injective_schedule(target)([out]) - buffer_tvm = tvm.nd.array(buffer_np, ctx=ctx) - data_tvm = tvm.nd.array(data_np, ctx=ctx) - out_tvm = tvm.nd.empty(shape=buffer_shape, ctx=ctx, dtype=dtype) - f = tvm.build(s, [data, buffer, out], device, name="fifo") + buffer_tvm = tvm.nd.array(buffer_np, device=dev) + data_tvm = tvm.nd.array(data_np, device=dev) + out_tvm = tvm.nd.empty(shape=buffer_shape, device=dev, dtype=dtype) + f = tvm.build(s, [data, buffer, out], target, name="fifo") f(data_tvm, buffer_tvm, out_tvm) tvm.testing.assert_allclose(out_tvm.asnumpy(), out_np) - for device, ctx in tvm.testing.enabled_targets(): - check_device(device, ctx) + for target, dev in tvm.testing.enabled_targets(): + check_device(target, dev) def verify_conv1d_integration(): @@ -120,49 +120,49 @@ def get_data(): # Get the test data inc_input_np, input_window_np, kernel_np, context_np, output_window_np = get_data() - def check_device(device, ctx): - print(" Running on target: {}".format(device)) + def check_device(target, dev): + print(" Running on target: {}".format(target)) - conv2d_nchw, schedule_conv2d_nchw = tvm.topi.testing.get_conv2d_nchw_implement(device) + conv2d_nchw, schedule_conv2d_nchw = tvm.topi.testing.get_conv2d_nchw_implement(target) - with tvm.target.Target(device): + with tvm.target.Target(target): out = topi.nn.fifo_buffer(inc_input, context, axis=buffer_axis) - s = tvm.topi.testing.get_injective_schedule(device)([out]) - update_context = tvm.build(s, [inc_input, context, out], device, name="update_context") + s = tvm.topi.testing.get_injective_schedule(target)([out]) + update_context = tvm.build(s, [inc_input, context, out], target, name="update_context") out = conv2d_nchw(context, kernel, stride, padding, dilate, dtype) s = schedule_conv2d_nchw([out]) - conv2d_inc = tvm.build(s, [context, kernel, out], device, name="conv2d_inc") + conv2d_inc = tvm.build(s, [context, kernel, out], target, name="conv2d_inc") out = topi.nn.fifo_buffer(inc_output, output_window, axis=buffer_axis) - s = tvm.topi.testing.get_injective_schedule(device)([out]) + s = tvm.topi.testing.get_injective_schedule(target)([out]) update_output_window = tvm.build( - s, [inc_output, output_window, out], device, name="update_output_window" + s, [inc_output, output_window, out], target, name="update_output_window" ) out = topi.nn.fifo_buffer(inc_input, input_window, axis=buffer_axis) - s = tvm.topi.testing.get_injective_schedule(device)([out]) + s = tvm.topi.testing.get_injective_schedule(target)([out]) update_input_window = tvm.build( - s, [inc_input, input_window, out], device, name="update_input_window" + s, [inc_input, input_window, out], target, name="update_input_window" ) out = conv2d_nchw(input_window, kernel, stride, padding, dilate, dtype) s = schedule_conv2d_nchw([out]) - conv2d = tvm.build(s, [input_window, kernel, out], device, name="conv2d") - - input_window_tvm = tvm.nd.array(input_window_np, ctx=ctx) - new_input_window_tvm = tvm.nd.empty(shape=input_window_shape, ctx=ctx, dtype=dtype) - kernel_tvm = tvm.nd.array(kernel_np, ctx=ctx) - context_tvm = tvm.nd.array(context_np, ctx=ctx) - new_context_tvm = tvm.nd.empty(shape=context_shape, ctx=ctx, dtype=dtype) - inc_output_tvm = tvm.nd.empty(shape=inc_output_shape, ctx=ctx, dtype=dtype) - output_window_tvm = 
tvm.nd.array(output_window_np, ctx=ctx) - new_output_window_tvm = tvm.nd.empty(shape=output_window_shape, ctx=ctx, dtype=dtype) - output_window_ref_tvm = tvm.nd.empty(shape=output_window_shape, ctx=ctx, dtype=dtype) + conv2d = tvm.build(s, [input_window, kernel, out], target, name="conv2d") + + input_window_tvm = tvm.nd.array(input_window_np, device=dev) + new_input_window_tvm = tvm.nd.empty(shape=input_window_shape, device=dev, dtype=dtype) + kernel_tvm = tvm.nd.array(kernel_np, device=dev) + context_tvm = tvm.nd.array(context_np, device=dev) + new_context_tvm = tvm.nd.empty(shape=context_shape, device=dev, dtype=dtype) + inc_output_tvm = tvm.nd.empty(shape=inc_output_shape, device=dev, dtype=dtype) + output_window_tvm = tvm.nd.array(output_window_np, device=dev) + new_output_window_tvm = tvm.nd.empty(shape=output_window_shape, device=dev, dtype=dtype) + output_window_ref_tvm = tvm.nd.empty(shape=output_window_shape, device=dev, dtype=dtype) for i in range(num_iteration): # Take i-th slice of inc_input_np - inc_input_tvm = tvm.nd.array(inc_input_np[i], ctx=ctx) + inc_input_tvm = tvm.nd.array(inc_input_np[i], device=dev) # Compute new output window incrementally, using the FIFO buffer op update_context(inc_input_tvm, context_tvm, new_context_tvm) @@ -181,8 +181,8 @@ def check_device(device, ctx): output_window_tvm.asnumpy(), output_window_ref_tvm.asnumpy() ) - for device, ctx in tvm.testing.enabled_targets(): - check_device(device, ctx) + for target, dev in tvm.testing.enabled_targets(): + check_device(target, dev) @tvm.testing.uses_gpu diff --git a/tests/python/topi/python/test_topi_argwhere.py b/tests/python/topi/python/test_topi_argwhere.py index 69993d287b79..2b75dada3f1e 100644 --- a/tests/python/topi/python/test_topi_argwhere.py +++ b/tests/python/topi/python/test_topi_argwhere.py @@ -40,27 +40,27 @@ def verify_argwhere(data_shape): out_shape = te.placeholder(shape=(out_shape, len(data_shape)), name="out_shape", dtype=dtype) condition = te.placeholder(shape=data_shape, name="condition", dtype=dtype) - def check_device(device, ctx): - ctx = tvm.context(device, 0) - if not ctx.exist or device not in _argwhere_compute: + def check_device(target): + dev = tvm.device(target, 0) + if not dev.exist or target not in _argwhere_compute: return - with tvm.target.Target(device): - out = _argwhere_compute[device](out_shape, condition) - s_func = tvm.topi.testing.dispatch(device, _argwhere_schedule) + with tvm.target.Target(target): + out = _argwhere_compute[target](out_shape, condition) + s_func = tvm.topi.testing.dispatch(target, _argwhere_schedule) sch = s_func(out) - func = tvm.build(sch, [out_shape, condition, out], device, name="argwhere") + func = tvm.build(sch, [out_shape, condition, out], target, name="argwhere") - args = [tvm.nd.array(np_shape, ctx)] - args.append(tvm.nd.array(np_data, ctx)) - args.append(tvm.nd.empty(out.shape, ctx=ctx, dtype=condition.dtype)) + args = [tvm.nd.array(np_shape, dev)] + args.append(tvm.nd.array(np_data, dev)) + args.append(tvm.nd.empty(out.shape, device=dev, dtype=condition.dtype)) func(*args) np.set_printoptions(threshold=np.inf) tvm.testing.assert_allclose(args[-1].asnumpy(), np.array(np_out)) - for target, ctx in tvm.testing.enabled_targets(): - check_device(target, ctx) + for target, _ in tvm.testing.enabled_targets(): + check_device(target) @tvm.testing.uses_gpu diff --git a/tests/python/topi/python/test_topi_batch_matmul.py b/tests/python/topi/python/test_topi_batch_matmul.py index 78f85d079f58..05f2c3029bc9 100644 --- 
a/tests/python/topi/python/test_topi_batch_matmul.py
+++ b/tests/python/topi/python/test_topi_batch_matmul.py
@@ -61,10 +61,10 @@ def get_ref_data():
 # get the test data
 a_np, b_np, c_np = get_ref_data()

- def check_device(device, ctx):
- print("Running on target: %s" % device)
- with tvm.target.Target(device):
- fcompute, fschedule = tvm.topi.testing.dispatch(device, _batch_matmul_implement)
+ def check_device(target, dev):
+ print("Running on target: %s" % target)
+ with tvm.target.Target(target):
+ fcompute, fschedule = tvm.topi.testing.dispatch(target, _batch_matmul_implement)
 out = fcompute(x, y)
 if not dynamic:
 s = fschedule([out])
@@ -76,19 +76,19 @@ def check_device(device, ctx):
 if debug:
 print(tvm.lower(s, [x, y, out], simple_mode=True))

- a = tvm.nd.array(a_np, ctx)
- b = tvm.nd.array(b_np, ctx)
- c = tvm.nd.array(np.zeros(get_const_tuple(out_shape), dtype=dtype), ctx)
- f = tvm.build(s, [x, y, out], device, name="dense")
+ a = tvm.nd.array(a_np, dev)
+ b = tvm.nd.array(b_np, dev)
+ c = tvm.nd.array(np.zeros(get_const_tuple(out_shape), dtype=dtype), dev)
+ f = tvm.build(s, [x, y, out], target, name="dense")
 f(a, b, c)
 tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5)

- for device, ctx in tvm.testing.enabled_targets():
- if dynamic and (device == "cuda" or device == "nvptx"):
- print("Dynamic batch matmul test is skippped on %s" % device)
+ for target, dev in tvm.testing.enabled_targets():
+ if dynamic and (target == "cuda" or target == "nvptx"):
+ print("Dynamic batch matmul test is skipped on %s" % target)
 continue
- check_device(device, ctx)
+ check_device(target, dev)


 @tvm.testing.uses_gpu
diff --git a/tests/python/topi/python/test_topi_batch_matmul_tensorcore.py b/tests/python/topi/python/test_topi_batch_matmul_tensorcore.py
index 77df5be0a491..9712aa561d51 100644
--- a/tests/python/topi/python/test_topi_batch_matmul_tensorcore.py
+++ b/tests/python/topi/python/test_topi_batch_matmul_tensorcore.py
@@ -47,15 +47,15 @@ def get_ref_data():
 a_np, b_np, c_np = get_ref_data()

 def check_device(device):
- ctx = tvm.context(device, 0)
+ dev = tvm.device(device, 0)
 print("Running on target: %s" % device)
 with tvm.target.Target(device):
 fcompute, fschedule = tvm.topi.testing.dispatch(device, _batch_matmul_implement)
 out = fcompute(x, y)
 s = fschedule([out])
- a = tvm.nd.array(a_np, ctx)
- b = tvm.nd.array(b_np, ctx)
- c = tvm.nd.array(np.zeros(get_const_tuple(out.shape), dtype=dtype), ctx)
+ a = tvm.nd.array(a_np, dev)
+ b = tvm.nd.array(b_np, dev)
+ c = tvm.nd.array(np.zeros(get_const_tuple(out.shape), dtype=dtype), dev)
 f = tvm.build(s, [x, y, out], device, name="dense")
 f(a, b, c)
 tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-3)
diff --git a/tests/python/topi/python/test_topi_batch_to_space_nd.py b/tests/python/topi/python/test_topi_batch_to_space_nd.py
index 89d044fed963..7a8a813a196b 100644
--- a/tests/python/topi/python/test_topi_batch_to_space_nd.py
+++ b/tests/python/topi/python/test_topi_batch_to_space_nd.py
@@ -42,18 +42,18 @@ def verify_batch_to_space_nd(input_shape, block_shape, crop_begin_list, crop_end
 a_np, block_shape, crop_begin_list, crop_end_list
 )

- def check_device(device, ctx):
- print("Running on target: %s" % device)
- with tvm.target.create(device):
- s = tvm.topi.testing.get_injective_schedule(device)(B)
- a = tvm.nd.array(a_np, ctx)
- b = tvm.nd.array(np.zeros(out_shape, dtype=dtype), ctx)
- f = tvm.build(s, [A, B], device)
+ def check_device(target, dev):
+ print("Running on target: %s" % target)
+ with tvm.target.create(target):
+ s = 
tvm.topi.testing.get_injective_schedule(target)(B) + a = tvm.nd.array(a_np, dev) + b = tvm.nd.array(np.zeros(out_shape, dtype=dtype), dev) + f = tvm.build(s, [A, B], target) f(a, b) tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-3, atol=1e-3) - for device, ctx in tvm.testing.enabled_targets(): - check_device(device, ctx) + for target, dev in tvm.testing.enabled_targets(): + check_device(target, dev) @tvm.testing.uses_gpu diff --git a/tests/python/topi/python/test_topi_bitserial_conv2d.py b/tests/python/topi/python/test_topi_bitserial_conv2d.py index b0bce44a03f9..4834b9069f9c 100644 --- a/tests/python/topi/python/test_topi_bitserial_conv2d.py +++ b/tests/python/topi/python/test_topi_bitserial_conv2d.py @@ -72,10 +72,10 @@ def get_ref_data(): a_np, w_np, b_np = get_ref_data() - ctx = tvm.cpu(0) - a = tvm.nd.array(a_np, ctx) - w = tvm.nd.array(w_np, ctx) - b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx) + dev = tvm.cpu(0) + a = tvm.nd.array(a_np, dev) + w = tvm.nd.array(w_np, dev) + b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), dev) func = tvm.build(s, [A, W, B], "llvm") func(a, w, b) tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5) @@ -125,10 +125,10 @@ def get_ref_data(): a_np, w_np, b_np = get_ref_data() - ctx = tvm.cpu(0) - a = tvm.nd.array(a_np, ctx) - w = tvm.nd.array(w_np, ctx) - b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx) + dev = tvm.cpu(0) + a = tvm.nd.array(a_np, dev) + w = tvm.nd.array(w_np, dev) + b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), dev) func = tvm.build(s, [A, W, B], "llvm") func(a, w, b) diff --git a/tests/python/topi/python/test_topi_bitserial_conv2d_rasp.py b/tests/python/topi/python/test_topi_bitserial_conv2d_rasp.py index 1cd982db5450..2478e92625f7 100644 --- a/tests/python/topi/python/test_topi_bitserial_conv2d_rasp.py +++ b/tests/python/topi/python/test_topi_bitserial_conv2d_rasp.py @@ -67,7 +67,7 @@ def verify_bitserial_conv2d_nhwc( matches = re.findall("vpadd", assembly) assert len(matches) > 0 - ctx = tvm.context(device, 0) + dev = tvm.device(device, 0) if "arm" not in os.uname()[4]: print("Skipped running code, not an arm device") return @@ -89,9 +89,9 @@ def get_ref_data(): return a_np, w_np, b_np a_np, w_np, b_np = get_ref_data() - a = tvm.nd.array(a_np, ctx) - w = tvm.nd.array(w_np, ctx) - b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx) + a = tvm.nd.array(a_np, dev) + w = tvm.nd.array(w_np, dev) + b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), dev) func = tvm.build(s, [A, W, B], device) func(a, w, b) diff --git a/tests/python/topi/python/test_topi_bitserial_dense.py b/tests/python/topi/python/test_topi_bitserial_dense.py index a624b1b1fede..1e68fddcede9 100644 --- a/tests/python/topi/python/test_topi_bitserial_dense.py +++ b/tests/python/topi/python/test_topi_bitserial_dense.py @@ -68,10 +68,10 @@ def get_ref_data(a_shape, b_shape, input_dtype): b_shape = get_const_tuple(B.shape) a_np, b_np, c_np = get_ref_data(a_shape, b_shape, input_dtype) - ctx = tvm.cpu(0) - a = tvm.nd.array(a_np, ctx) - b = tvm.nd.array(b_np, ctx) - c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx) + dev = tvm.cpu(0) + a = tvm.nd.array(a_np, dev) + b = tvm.nd.array(b_np, dev) + c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev) func = tvm.build(s, [A, B, C], target) func(a, b, c) tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5) diff --git 
a/tests/python/topi/python/test_topi_bnn.py b/tests/python/topi/python/test_topi_bnn.py index fbd9ac5d66c6..710489e41d45 100644 --- a/tests/python/topi/python/test_topi_bnn.py +++ b/tests/python/topi/python/test_topi_bnn.py @@ -51,12 +51,12 @@ def get_ref_data(): a_np, b_np, c_np = get_ref_data() - ctx = tvm.cpu(0) - a = tvm.nd.array(a_np, ctx) - b = tvm.nd.array(b_np, ctx) - bnn_a = tvm.nd.array(np.zeros(get_const_tuple(bnn_A.shape), dtype=bnn_A.dtype), ctx) - bnn_b = tvm.nd.array(np.zeros(get_const_tuple(bnn_B.shape), dtype=bnn_B.dtype), ctx) - bnn_c = tvm.nd.array(np.zeros(get_const_tuple(bnn_C.shape), dtype=bnn_C.dtype), ctx) + dev = tvm.cpu(0) + a = tvm.nd.array(a_np, dev) + b = tvm.nd.array(b_np, dev) + bnn_a = tvm.nd.array(np.zeros(get_const_tuple(bnn_A.shape), dtype=bnn_A.dtype), dev) + bnn_b = tvm.nd.array(np.zeros(get_const_tuple(bnn_B.shape), dtype=bnn_B.dtype), dev) + bnn_c = tvm.nd.array(np.zeros(get_const_tuple(bnn_C.shape), dtype=bnn_C.dtype), dev) f1 = tvm.build(s1, [A, bnn_A], "llvm") f2 = tvm.build(s2, [B, bnn_B], "llvm") f3 = tvm.build(s3, [bnn_A1, bnn_B1, bnn_C], "llvm") diff --git a/tests/python/topi/python/test_topi_broadcast.py b/tests/python/topi/python/test_topi_broadcast.py index ada03ea5377b..1abd2cfc5e50 100644 --- a/tests/python/topi/python/test_topi_broadcast.py +++ b/tests/python/topi/python/test_topi_broadcast.py @@ -28,25 +28,25 @@ def verify_broadcast_to_ele(in_shape, out_shape, fbcast): A = te.placeholder(shape=in_shape, name="A") B = fbcast(A, out_shape) - def check_device(device): - ctx = tvm.context(device, 0) - if not tvm.testing.device_enabled(device): - print("Skip because %s is not enabled" % device) + def check_target(target): + dev = tvm.device(target, 0) + if not tvm.testing.device_enabled(target): + print("Skip because %s is not enabled" % target) return - print("Running on target: %s" % device) - with tvm.target.Target(device): - s = tvm.topi.testing.get_broadcast_schedule(device)(B) - foo = tvm.build(s, [A, B], device, name="broadcast_to") + print("Running on target: %s" % target) + with tvm.target.Target(target): + s = tvm.topi.testing.get_broadcast_schedule(target)(B) + foo = tvm.build(s, [A, B], target, name="broadcast_to") data_npy = np.random.uniform(size=in_shape).astype(A.dtype) out_npy = np.broadcast_to(data_npy, out_shape) - data_nd = tvm.nd.array(data_npy, ctx) - out_nd = tvm.nd.array(np.empty(out_shape).astype(B.dtype), ctx) + data_nd = tvm.nd.array(data_npy, dev) + out_nd = tvm.nd.array(np.empty(out_shape).astype(B.dtype), dev) foo(data_nd, out_nd) tvm.testing.assert_allclose(out_nd.asnumpy(), out_npy) - for target, ctx in tvm.testing.enabled_targets(): - check_device(target) - check_device("sdaccel") + for target, dev in tvm.testing.enabled_targets(): + check_target(target) + check_target("sdaccel") def verify_broadcast_binary_ele( @@ -76,7 +76,7 @@ def verify_broadcast_binary_ele( assert isinstance(C, tvm.tir.PrimExpr) return - def gen_operand(shape, low, high, ctx): + def gen_operand(shape, low, high, dev): if shape is None: npy = float(np.random.uniform(low=low, high=high)) if dtype.startswith("int"): @@ -84,30 +84,30 @@ def gen_operand(shape, low, high, ctx): nd = npy else: npy = np.random.uniform(low=low, high=high, size=shape).astype(dtype) - nd = tvm.nd.array(npy, ctx) + nd = tvm.nd.array(npy, dev) return npy, nd - def check_device(device): - ctx = tvm.context(device, 0) - if not tvm.testing.device_enabled(device): - print("Skip because %s is not enabled" % device) + def check_target(target): + dev = tvm.device(target, 0) 
+ if not tvm.testing.device_enabled(target): + print("Skip because %s is not enabled" % target) return - print("Running on target: %s" % device) - with tvm.target.Target(device): - s = tvm.topi.testing.get_broadcast_schedule(device)(C) - foo = tvm.build(s, [A, B, C], device, name="broadcast_binary" + "_" + ftopi.__name__) + print("Running on target: %s" % target) + with tvm.target.Target(target): + s = tvm.topi.testing.get_broadcast_schedule(target)(C) + foo = tvm.build(s, [A, B, C], target, name="broadcast_binary" + "_" + ftopi.__name__) - lhs_npy, lhs_nd = gen_operand(lhs_shape, lhs_min, lhs_max, ctx) - rhs_npy, rhs_nd = gen_operand(rhs_shape, rhs_min, rhs_max, ctx) + lhs_npy, lhs_nd = gen_operand(lhs_shape, lhs_min, lhs_max, dev) + rhs_npy, rhs_nd = gen_operand(rhs_shape, rhs_min, rhs_max, dev) out_npy = fnumpy(lhs_npy, rhs_npy) - out_nd = tvm.nd.array(np.empty(out_npy.shape).astype(C.dtype), ctx) + out_nd = tvm.nd.array(np.empty(out_npy.shape).astype(C.dtype), dev) foo(lhs_nd, rhs_nd, out_nd) tvm.testing.assert_allclose(out_nd.asnumpy(), out_npy, rtol=1e-4, atol=1e-4) - for target, ctx in tvm.testing.enabled_targets(): - check_device(target) - check_device("sdaccel") + for target, dev in tvm.testing.enabled_targets(): + check_target(target) + check_target("sdaccel") @tvm.testing.uses_gpu @@ -304,22 +304,22 @@ def test_apply( assert isinstance(B, tvm.tir.PrimExpr) return - def check_device(device, ctx): - print("Running on target: %s" % device) - with tvm.target.Target(device): - s = tvm.topi.testing.get_broadcast_schedule(device)(B) - foo = tvm.build(s, [A, B], device, name=name) + def check_target(target, dev): + print("Running on target: %s" % target) + with tvm.target.Target(target): + s = tvm.topi.testing.get_broadcast_schedule(target)(B) + foo = tvm.build(s, [A, B], target, name=name) data_npy = indata.astype(A.dtype) - data_nd = tvm.nd.array(data_npy, ctx) + data_nd = tvm.nd.array(data_npy, dev) out_npy = f_numpy(indata) - out_nd = tvm.nd.array(np.empty(data_npy.shape).astype(B.dtype), ctx) + out_nd = tvm.nd.array(np.empty(data_npy.shape).astype(B.dtype), dev) foo(data_nd, out_nd) tvm.testing.assert_allclose(out_nd.asnumpy(), out_npy) - for device, ctx in tvm.testing.enabled_targets(): - check_device(device, ctx) + for target, dev in tvm.testing.enabled_targets(): + check_target(target, dev) test_apply(topi.logical_not, "logical_not", np.logical_not, np.array([True, False, 0, 1])) test_apply(topi.logical_not, "logical_not", np.logical_not, np.array(np.arange(5) < 3)) @@ -342,22 +342,22 @@ def test_apply( assert isinstance(B, tvm.tir.PrimExpr) return - def check_device(device, ctx): - print("Running on target: %s" % device) - with tvm.target.Target(device): - s = tvm.topi.testing.get_broadcast_schedule(device)(B) - foo = tvm.build(s, [A, B], device, name=name) + def check_target(target, dev): + print("Running on target: %s" % target) + with tvm.target.Target(target): + s = tvm.topi.testing.get_broadcast_schedule(target)(B) + foo = tvm.build(s, [A, B], target, name=name) data_npy = np.random.uniform(size=shape).astype(A.dtype) - data_nd = tvm.nd.array(data_npy, ctx) + data_nd = tvm.nd.array(data_npy, dev) out_npy = f_numpy(data_npy) - out_nd = tvm.nd.array(np.empty(data_npy.shape).astype(B.dtype), ctx) + out_nd = tvm.nd.array(np.empty(data_npy.shape).astype(B.dtype), dev) foo(data_nd, out_nd) tvm.testing.assert_allclose(out_nd.asnumpy(), out_npy) - for device, ctx in tvm.testing.enabled_targets(): - check_device(device, ctx) + for target, dev in tvm.testing.enabled_targets(): + 
check_target(target, dev) test_apply(topi.bitwise_not, "bitwise_not", np.bitwise_not, ()) test_apply(topi.bitwise_not, "bitwise_not", np.bitwise_not, (2, 1, 2)) @@ -381,22 +381,22 @@ def test_apply( assert isinstance(C, tvm.tir.PrimExpr) return - def check_device(device, ctx): - print("Running on target: %s" % device) - with tvm.target.Target(device): - s = tvm.topi.testing.get_broadcast_schedule(device)(C) - foo = tvm.build(s, [A, B, C], device, name=name) + def check_target(target, dev): + print("Running on target: %s" % target) + with tvm.target.Target(target): + s = tvm.topi.testing.get_broadcast_schedule(target)(C) + foo = tvm.build(s, [A, B, C], target, name=name) - lhs_nd = tvm.nd.array(lhs, ctx) - rhs_nd = tvm.nd.array(rhs, ctx) + lhs_nd = tvm.nd.array(lhs, dev) + rhs_nd = tvm.nd.array(rhs, dev) out_npy = f_numpy(lhs, rhs) - out_nd = tvm.nd.array(np.empty(out_npy.shape).astype(C.dtype), ctx) + out_nd = tvm.nd.array(np.empty(out_npy.shape).astype(C.dtype), dev) foo(lhs_nd, rhs_nd, out_nd) tvm.testing.assert_allclose(out_nd.asnumpy(), out_npy, rtol=1e-4, atol=1e-4) - for device, ctx in tvm.testing.enabled_targets(): - check_device(device, ctx) + for target, dev in tvm.testing.enabled_targets(): + check_target(target, dev) test_apply(topi.logical_and, "logical_and", np.logical_and, True, False) test_apply(topi.logical_and, "logical_and", np.logical_and, [True, False], [False, False]) diff --git a/tests/python/topi/python/test_topi_clip.py b/tests/python/topi/python/test_topi_clip.py index 704ffe7e6843..b8d5321d40f4 100644 --- a/tests/python/topi/python/test_topi_clip.py +++ b/tests/python/topi/python/test_topi_clip.py @@ -39,19 +39,19 @@ def get_ref_data(): a_np, b_np = get_ref_data() - def check_device(device, ctx): - print("Running on target: %s" % device) - with tvm.target.Target(device): - s = tvm.topi.testing.get_injective_schedule(device)(B) + def check_target(target, dev): + print("Running on target: %s" % target) + with tvm.target.Target(target): + s = tvm.topi.testing.get_injective_schedule(target)(B) - a = tvm.nd.array(a_np, ctx) - b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=dtype), ctx) - f = tvm.build(s, [A, B], device, name="clip") + a = tvm.nd.array(a_np, dev) + b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=dtype), dev) + f = tvm.build(s, [A, B], target, name="clip") f(a, b) tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5) - for device, ctx in tvm.testing.enabled_targets(): - check_device(device, ctx) + for target, dev in tvm.testing.enabled_targets(): + check_target(target, dev) @tvm.testing.uses_gpu diff --git a/tests/python/topi/python/test_topi_conv1d.py b/tests/python/topi/python/test_topi_conv1d.py index aad029ce3ce5..4b1d71282484 100644 --- a/tests/python/topi/python/test_topi_conv1d.py +++ b/tests/python/topi/python/test_topi_conv1d.py @@ -77,25 +77,25 @@ def get_ref_data(layout): a_np, w_np, b_np = get_ref_data(layout) - def check_device(device, ctx): + def check_target(target, dev): if layout == "NCW": - fcompute, fschedule = tvm.topi.testing.dispatch(device, _conv1d_ncw_implement) + fcompute, fschedule = tvm.topi.testing.dispatch(target, _conv1d_ncw_implement) else: - fcompute, fschedule = tvm.topi.testing.dispatch(device, _conv1d_nwc_implement) - with tvm.target.Target(device): + fcompute, fschedule = tvm.topi.testing.dispatch(target, _conv1d_nwc_implement) + with tvm.target.Target(target): B = fcompute(A, W, stride, padding, dilation, "float32") s = fschedule([B]) - a = tvm.nd.array(a_np, ctx) - w = tvm.nd.array(w_np, ctx) 
- b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=dtype), ctx) + a = tvm.nd.array(a_np, dev) + w = tvm.nd.array(w_np, dev) + b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=dtype), dev) - func = tvm.build(s, [A, W, B], device) + func = tvm.build(s, [A, W, B], target) func(a, w, b) tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5) - for device, ctx in tvm.testing.enabled_targets(): - check_device(device, ctx) + for target, dev in tvm.testing.enabled_targets(): + check_target(target, dev) @tvm.testing.uses_gpu diff --git a/tests/python/topi/python/test_topi_conv1d_transpose_ncw.py b/tests/python/topi/python/test_topi_conv1d_transpose_ncw.py index 2b8c486b8cd1..bfb60a9168d9 100644 --- a/tests/python/topi/python/test_topi_conv1d_transpose_ncw.py +++ b/tests/python/topi/python/test_topi_conv1d_transpose_ncw.py @@ -54,28 +54,28 @@ def get_ref_data(): a_np, w_np, b_np, c_np = get_ref_data() - def check_device(device, ctx): - ctx = tvm.context(device, 0) - with tvm.target.Target(device): - fcompute, fschedule = tvm.topi.testing.dispatch(device, _conv1d_transpose_ncw_implement) + def check_target(target, dev): + dev = tvm.device(target, 0) + with tvm.target.Target(target): + fcompute, fschedule = tvm.topi.testing.dispatch(target, _conv1d_transpose_ncw_implement) B = fcompute(A, W, stride, padding, A.dtype, output_padding) C = topi.nn.relu(B) s1 = fschedule([B]) s2 = fschedule([C]) - a = tvm.nd.array(a_np, ctx) - w = tvm.nd.array(w_np, ctx) - b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx) - c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx) + a = tvm.nd.array(a_np, dev) + w = tvm.nd.array(w_np, dev) + b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), dev) + c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev) - func1 = tvm.build(s1, [A, W, B], device) - func2 = tvm.build(s2, [A, W, C], device) + func1 = tvm.build(s1, [A, W, B], target) + func2 = tvm.build(s2, [A, W, C], target) func1(a, w, b) func2(a, w, c) tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5) tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5) - for device, ctx in tvm.testing.enabled_targets(): - check_device(device, ctx) + for target, dev in tvm.testing.enabled_targets(): + check_target(target, dev) @tvm.testing.uses_gpu diff --git a/tests/python/topi/python/test_topi_conv2d_NCHWc.py b/tests/python/topi/python/test_topi_conv2d_NCHWc.py index b1955ef5fa3b..09bd17ab2a72 100644 --- a/tests/python/topi/python/test_topi_conv2d_NCHWc.py +++ b/tests/python/topi/python/test_topi_conv2d_NCHWc.py @@ -116,7 +116,7 @@ def get_ref_data(): a_np, w_np, b_np, c_np = get_ref_data() def check_device(device): - ctx = tvm.context(device, 0) + dev = tvm.device(device, 0) if not tvm.testing.device_enabled(device): print("Skip because %s is not enabled" % device) return @@ -138,10 +138,10 @@ def check_device(device): C = topi.nn.relu(C) s = topi.x86.schedule_conv2d_NCHWc([C]) - a = tvm.nd.array(a_np, ctx) - w = tvm.nd.array(w_np, ctx) - b = tvm.nd.array(b_np, ctx) - c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx) + a = tvm.nd.array(a_np, dev) + w = tvm.nd.array(w_np, dev) + b = tvm.nd.array(b_np, dev) + c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev) if add_bias: func = tvm.build( s, diff --git a/tests/python/topi/python/test_topi_conv2d_hwcn.py b/tests/python/topi/python/test_topi_conv2d_hwcn.py index bd88839c9c15..74aa1bfe0880 100644 --- 
a/tests/python/topi/python/test_topi_conv2d_hwcn.py +++ b/tests/python/topi/python/test_topi_conv2d_hwcn.py @@ -58,30 +58,30 @@ def get_ref_data(): a_np, w_np, b_np, c1_np, c2_np, c3_np = get_ref_data() - def check_device(device): - ctx = tvm.context(device, 0) - if not tvm.testing.device_enabled(device): - print("Skip because %s is not enabled" % device) + def check_target(target): + dev = tvm.device(target, 0) + if not tvm.testing.device_enabled(target): + print("Skip because %s is not enabled" % target) return - print("Running on target: %s" % device) - with tvm.target.Target(device): - fcompute, fschedule = tvm.topi.testing.dispatch(device, _conv2d_hwcn_implement) + print("Running on target: %s" % target) + with tvm.target.Target(target): + fcompute, fschedule = tvm.topi.testing.dispatch(target, _conv2d_hwcn_implement) t_conv = fcompute(A, W, stride, padding, dilation) t_bias = topi.add(t_conv, B) t_relu = topi.nn.relu(t_bias) s1 = fschedule([t_conv]) s2 = fschedule([t_bias]) s3 = fschedule([t_relu]) - a = tvm.nd.array(a_np, ctx) - w = tvm.nd.array(w_np, ctx) - b = tvm.nd.array(b_np, ctx) + a = tvm.nd.array(a_np, dev) + w = tvm.nd.array(w_np, dev) + b = tvm.nd.array(b_np, dev) - conv_out = tvm.nd.array(np.zeros(get_const_tuple(t_conv.shape), dtype=t_conv.dtype), ctx) - bias_out = tvm.nd.array(np.zeros(get_const_tuple(t_bias.shape), dtype=t_bias.dtype), ctx) - relu_out = tvm.nd.array(np.zeros(get_const_tuple(t_relu.shape), dtype=t_relu.dtype), ctx) - func1 = tvm.build(s1, [A, W, t_conv], device) - func2 = tvm.build(s2, [A, W, B, t_bias], device) - func3 = tvm.build(s3, [A, W, B, t_relu], device) + conv_out = tvm.nd.array(np.zeros(get_const_tuple(t_conv.shape), dtype=t_conv.dtype), dev) + bias_out = tvm.nd.array(np.zeros(get_const_tuple(t_bias.shape), dtype=t_bias.dtype), dev) + relu_out = tvm.nd.array(np.zeros(get_const_tuple(t_relu.shape), dtype=t_relu.dtype), dev) + func1 = tvm.build(s1, [A, W, t_conv], target) + func2 = tvm.build(s2, [A, W, B, t_bias], target) + func3 = tvm.build(s3, [A, W, B, t_relu], target) func1(a, w, conv_out) func2(a, w, b, bias_out) func3(a, w, b, relu_out) @@ -89,8 +89,8 @@ def check_device(device): tvm.testing.assert_allclose(bias_out.asnumpy(), c2_np, rtol=1e-5) tvm.testing.assert_allclose(relu_out.asnumpy(), c3_np, rtol=1e-5) - for device in ["cuda", "opencl", "metal", "rocm", "vulkan", "nvptx"]: - check_device(device) + for target in ["cuda", "opencl", "metal", "rocm", "vulkan", "nvptx"]: + check_target(target) @tvm.testing.requires_gpu diff --git a/tests/python/topi/python/test_topi_conv2d_hwnc_tensorcore.py b/tests/python/topi/python/test_topi_conv2d_hwnc_tensorcore.py index 9d63175d2e84..bb11a56cdce3 100644 --- a/tests/python/topi/python/test_topi_conv2d_hwnc_tensorcore.py +++ b/tests/python/topi/python/test_topi_conv2d_hwnc_tensorcore.py @@ -103,28 +103,28 @@ def convert_int32_into_int4(a_int32): a_np = convert_int32_into_int4(a_np) w_np = convert_int32_into_int4(w_np) - def check_device(device): - ctx = tvm.context(device, 0) - if not tvm.testing.device_enabled(device): - print("Skip because %s is not enabled" % device) + def check_target(target): + dev = tvm.device(target, 0) + if not tvm.testing.device_enabled(target): + print("Skip because %s is not enabled" % target) return - if not nvcc.have_tensorcore(ctx.compute_version): + if not nvcc.have_tensorcore(dev.compute_version): print("skip because gpu does not support Tensor Cores") return - print("Running on target: %s" % device) - with tvm.target.Target(device): - fcompute, fschedule = 
topi.testing.dispatch(device, _conv2d_hwnc_tensorcore_implement) + print("Running on target: %s" % target) + with tvm.target.Target(target): + fcompute, fschedule = topi.testing.dispatch(target, _conv2d_hwnc_tensorcore_implement) C = fcompute(A, W, stride, padding, dilation, dtype, "int32") s = fschedule([C]) - a = tvm.nd.array(a_np.transpose((1, 2, 0, 3)), ctx) - w = tvm.nd.array(w_np, ctx) - c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx) + a = tvm.nd.array(a_np.transpose((1, 2, 0, 3)), dev) + w = tvm.nd.array(w_np, dev) + c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev) func = tvm.build( s, [A, W, C], - device, + target, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), ) @@ -133,7 +133,7 @@ def check_device(device): rtol = 1e-3 tvm.testing.assert_allclose(c.asnumpy().transpose((2, 0, 1, 3)), c_np, rtol=rtol) - check_device("cuda") + check_target("cuda") @tvm.testing.requires_tensorcore diff --git a/tests/python/topi/python/test_topi_conv2d_int8.py b/tests/python/topi/python/test_topi_conv2d_int8.py index a934e3ef2fd2..07f7895f47f7 100644 --- a/tests/python/topi/python/test_topi_conv2d_int8.py +++ b/tests/python/topi/python/test_topi_conv2d_int8.py @@ -83,16 +83,16 @@ def compile_conv2d_NHWC_gemm_int8_arm( ] for device_tuple in devices: - device = device_tuple[0] + target = device_tuple[0] compute = device_tuple[1] schedule = device_tuple[2] - ctx = tvm.context(device, 0) - if not tvm.testing.device_enabled(device): - print("Skip because %s is not enabled" % device) + dev = tvm.device(target, 0) + if not tvm.testing.device_enabled(target): + print("Skip because %s is not enabled" % target) return - print("Compiling on arm AArch64 target: %s" % device) - with tvm.target.Target(device): + print("Compiling on arm AArch64 target: %s" % target) + with tvm.target.Target(target): assert is_aarch64_arm(), "AArch64 target not recognized" C = compute(A, W, (stride, stride), padding, (dilation, dilation), dtype) @@ -106,14 +106,14 @@ def compile_conv2d_NHWC_gemm_int8_arm( tvm.build( s, [A, W, bias, C], - device, + target, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), ) func = tvm.build( s, [A, W, bias, C], - device, + target, name="relu_%dnnn_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), ) @@ -121,7 +121,7 @@ def compile_conv2d_NHWC_gemm_int8_arm( func = tvm.build( s, [A, W, C], - device, + target, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), ) @@ -175,13 +175,13 @@ def get_ref_data(): a_np, w_np, b_np, c_np = get_ref_data() - def check_device(device): - ctx = tvm.context(device, 0) - if not tvm.testing.device_enabled(device): - print("Skip because %s is not enabled" % device) + def check_target(target): + dev = tvm.device(target, 0) + if not tvm.testing.device_enabled(target): + print("Skip because %s is not enabled" % target) return - print("Running on target: %s" % device) - with tvm.target.Target(device): + print("Running on target: %s" % target) + with tvm.target.Target(target): C = topi.arm_cpu.compute_conv2d_NHWC_quantized_interleaved( A, W, (stride, stride), padding, (dilation, dilation), dtype ) @@ -191,22 +191,22 @@ def check_device(device): C = topi.nn.relu(C) s = topi.arm_cpu.schedule_conv2d_NHWC_quantized_interleaved([C]) - a = tvm.nd.array(a_np, ctx) - w = 
tvm.nd.array(w_np, ctx) - b = tvm.nd.array(b_np, ctx) - c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx) + a = tvm.nd.array(a_np, dev) + w = tvm.nd.array(w_np, dev) + b = tvm.nd.array(b_np, dev) + c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev) if add_bias: tvm.build( s, [A, W, bias, C], - device, + target, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), ) func = tvm.build( s, [A, W, bias, C], - device, + target, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), ) @@ -215,14 +215,14 @@ def check_device(device): func = tvm.build( s, [A, W, C], - device, + target, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), ) func(a, w, c) tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5) - check_device("llvm") + check_target("llvm") oc_block_factor = 4 @@ -284,17 +284,17 @@ def get_ref_data(): a_np, w_np, b_np, c_np = get_ref_data() - def check_device(device): - ctx = tvm.context(device, 0) - if not tvm.testing.device_enabled(device): - print("Skip because %s is not enabled" % device) + def check_target(target): + dev = tvm.device(target, 0) + if not tvm.testing.device_enabled(target): + print("Skip because %s is not enabled" % target) return - if device == "cuda" and not tvm.contrib.nvcc.have_int8(ctx.compute_version): + if target == "cuda" and not tvm.contrib.nvcc.have_int8(dev.compute_version): print("Skip because int8 intrinsics are not available") return - print("Running on target: %s" % device) - with tvm.target.Target(device): + print("Running on target: %s" % target) + with tvm.target.Target(target): C = topi.cuda.conv2d_NCHWc_int8( A, W, (stride, stride), padding, (dilation, dilation), "NCHW", dtype ) @@ -304,22 +304,22 @@ def check_device(device): C = topi.nn.relu(C) s = topi.cuda.schedule_conv2d_NCHWc_int8([C]) - a = tvm.nd.array(a_np, ctx) - w = tvm.nd.array(w_np, ctx) - b = tvm.nd.array(b_np, ctx) - c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx) + a = tvm.nd.array(a_np, dev) + w = tvm.nd.array(w_np, dev) + b = tvm.nd.array(b_np, dev) + c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev) if add_bias: tvm.build( s, [A, W, bias, C], - device, + target, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), ) func = tvm.build( s, [A, W, bias, C], - device, + target, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), ) @@ -328,15 +328,15 @@ def check_device(device): func = tvm.build( s, [A, W, C], - device, + target, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), ) func(a, w, c) tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5) - for device in ["cuda"]: - check_device(device) + for target in ["cuda"]: + check_target(target) def verify_conv2d_nchw_int8( @@ -403,17 +403,17 @@ def verify_workload_padding(): tvm.testing.assert_allclose(ow_tile, out_width) - def check_device(device): - ctx = tvm.context(device, 0) - if not tvm.testing.device_enabled(device): - print("Skip because %s is not enabled" % device) + def check_target(target): + dev = tvm.device(target, 0) + if not tvm.testing.device_enabled(target): + print("Skip because %s is not enabled" % target) return - if device == 
"cuda" and not tvm.contrib.nvcc.have_int8(ctx.compute_version): + if target == "cuda" and not tvm.contrib.nvcc.have_int8(dev.compute_version): print("Skip because int8 intrinsics are not available") return - print("Running on target: %s" % device) - with tvm.target.Target(device): + print("Running on target: %s" % target) + with tvm.target.Target(target): C = topi.cuda.conv2d_nchw_int8( A, W, (stride, stride), padding, (dilation, dilation), dtype ) @@ -423,22 +423,22 @@ def check_device(device): C = topi.nn.relu(C) s = topi.cuda.schedule_conv2d_nchw_int8([C]) - a = tvm.nd.array(a_np, ctx) - w = tvm.nd.array(w_np, ctx) - b = tvm.nd.array(b_np, ctx) - c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx) + a = tvm.nd.array(a_np, dev) + w = tvm.nd.array(w_np, dev) + b = tvm.nd.array(b_np, dev) + c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev) if add_bias: tvm.build( s, [A, W, bias, C], - device, + target, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), ) func = tvm.build( s, [A, W, bias, C], - device, + target, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), ) @@ -447,7 +447,7 @@ def check_device(device): func = tvm.build( s, [A, W, C], - device, + target, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), ) @@ -456,8 +456,8 @@ def check_device(device): verify_workload_padding() - for device in ["cuda"]: - check_device(device) + for target in ["cuda"]: + check_target(target) @tvm.testing.requires_cuda diff --git a/tests/python/topi/python/test_topi_conv2d_nchw.py b/tests/python/topi/python/test_topi_conv2d_nchw.py index 07ad45c971df..5aff6e807633 100644 --- a/tests/python/topi/python/test_topi_conv2d_nchw.py +++ b/tests/python/topi/python/test_topi_conv2d_nchw.py @@ -89,20 +89,20 @@ def verify_workload_padding(): tvm.testing.assert_allclose(ow_tile, out_width) - def check_device(device): - ctx = tvm.context(device, 0) - if not tvm.testing.device_enabled(device): - print("Skip because %s is not enabled" % device) + def check_target(target): + dev = tvm.device(target, 0) + if not tvm.testing.device_enabled(target): + print("Skip because %s is not enabled" % target) return - print("Running on target: %s" % device) + print("Running on target: %s" % target) - if "cudnn" in device: + if "cudnn" in target: fcompute, fschedule = topi.cuda.conv2d_cudnn, topi.cuda.schedule_conv2d_cudnn else: - fcompute, fschedule = tvm.topi.testing.get_conv2d_nchw_implement(device) + fcompute, fschedule = tvm.topi.testing.get_conv2d_nchw_implement(target) - with tvm.target.Target(device): - if "cudnn" in device: + with tvm.target.Target(target): + if "cudnn" in target: C = fcompute( A, W, (stride, stride), padding, (dilation, dilation), 1, "NCHW", dtype ) @@ -114,19 +114,19 @@ def check_device(device): C = topi.nn.relu(C) s = fschedule([C]) - if "llvm" in device: + if "llvm" in target: verify_workload_padding() - a = tvm.nd.array(a_np, ctx) - w = tvm.nd.array(w_np, ctx) - b = tvm.nd.array(b_np, ctx) + a = tvm.nd.array(a_np, dev) + w = tvm.nd.array(w_np, dev) + b = tvm.nd.array(b_np, dev) - c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx) + c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev) if add_bias: func = tvm.build( s, [A, W, bias, C], - device, + target, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, 
num_filter, kernel, stride, padding_sum, dilation), ) @@ -135,19 +135,19 @@ def check_device(device): func = tvm.build( s, [A, W, C], - device, + target, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), ) func(a, w, c) tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-4) - for device, ctx in tvm.testing.enabled_targets(): - with autotvm.tophub.context(device): # load tophub pre-tuned parameters - check_device(device) + for target, dev in tvm.testing.enabled_targets(): + with autotvm.tophub.context(target): # load tophub pre-tuned parameters + check_target(target) if use_cudnn: - check_device("cuda -model=unknown -libs=cudnn") + check_target("cuda -model=unknown -libs=cudnn") @tvm.testing.uses_gpu diff --git a/tests/python/topi/python/test_topi_conv2d_nhwc.py b/tests/python/topi/python/test_topi_conv2d_nhwc.py index 8c3b9e931eea..98a9387e8777 100644 --- a/tests/python/topi/python/test_topi_conv2d_nhwc.py +++ b/tests/python/topi/python/test_topi_conv2d_nhwc.py @@ -67,10 +67,10 @@ def check_device(device): fcompute, fschedule = tvm.topi.testing.dispatch(device, _conv2d_nhwc_implement) B = fcompute(A, W, stride, padding, dilation, dtype) s = fschedule([B]) - ctx = tvm.context(device, 0) - a = tvm.nd.array(a_np, ctx) - w = tvm.nd.array(w_np, ctx) - b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx) + dev = tvm.device(device, 0) + a = tvm.nd.array(a_np, dev) + w = tvm.nd.array(w_np, dev) + b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), dev) func = tvm.build(s, [A, W, B], device) func(a, w, b) tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5) diff --git a/tests/python/topi/python/test_topi_conv2d_nhwc_pack_int8.py b/tests/python/topi/python/test_topi_conv2d_nhwc_pack_int8.py index 66ce6ffe41f4..a191f2eb9d37 100644 --- a/tests/python/topi/python/test_topi_conv2d_nhwc_pack_int8.py +++ b/tests/python/topi/python/test_topi_conv2d_nhwc_pack_int8.py @@ -52,7 +52,7 @@ def get_ref_data(): a_np, w_np, b_np = get_ref_data() def check_device(device): - ctx = tvm.context(device, 0) + dev = tvm.device(device, 0) if not tvm.testing.device_enabled(device): print("Skip because %s is not enabled" % device) return @@ -61,9 +61,9 @@ def check_device(device): with tvm.target.Target(device): B = topi.nn.conv2d(A, W, stride, padding, dilation, layout="NHWC", out_dtype="int32") s = topi.x86.schedule_conv2d_nhwc_pack_int8([B]) - a = tvm.nd.array(a_np, ctx) - w = tvm.nd.array(w_np, ctx) - b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx) + a = tvm.nd.array(a_np, dev) + w = tvm.nd.array(w_np, dev) + b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), dev) func = tvm.build(s, [A, W, B], device) func(a, w, b) tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5) diff --git a/tests/python/topi/python/test_topi_conv2d_nhwc_tensorcore.py b/tests/python/topi/python/test_topi_conv2d_nhwc_tensorcore.py index eab73410dbe6..5761dccf48fc 100644 --- a/tests/python/topi/python/test_topi_conv2d_nhwc_tensorcore.py +++ b/tests/python/topi/python/test_topi_conv2d_nhwc_tensorcore.py @@ -83,11 +83,11 @@ def get_ref_data(): a_np, w_np, b_np, c_np = get_ref_data() def check_device(device): - ctx = tvm.context(device, 0) + dev = tvm.device(device, 0) if not tvm.testing.device_enabled(device): print("Skip because %s is not enabled" % device) return - if not nvcc.have_tensorcore(ctx.compute_version): + if not nvcc.have_tensorcore(dev.compute_version): print("skip because gpu does 
not support Tensor Cores") return print("Running on target: %s" % device) @@ -102,10 +102,10 @@ def check_device(device): C = topi.nn.relu(C) s = fschedule([C]) - a = tvm.nd.array(a_np, ctx) - w = tvm.nd.array(w_np, ctx) - b = tvm.nd.array(b_np, ctx) - c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx) + a = tvm.nd.array(a_np, dev) + w = tvm.nd.array(w_np, dev) + b = tvm.nd.array(b_np, dev) + c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev) if add_bias: func = tvm.build( s, diff --git a/tests/python/topi/python/test_topi_conv2d_nhwc_winograd.py b/tests/python/topi/python/test_topi_conv2d_nhwc_winograd.py index 436270173316..cb1fd3d233fa 100644 --- a/tests/python/topi/python/test_topi_conv2d_nhwc_winograd.py +++ b/tests/python/topi/python/test_topi_conv2d_nhwc_winograd.py @@ -91,7 +91,7 @@ def get_ref_data(): a_np, w_np, b_np, c_np = get_ref_data() def check_device(device): - ctx = tvm.context(device, 0) + dev = tvm.device(device, 0) print("Running on target: %s" % device) with tvm.target.Target(device): if bgemm == "direct": @@ -109,10 +109,10 @@ def check_device(device): C = topi.nn.relu(C) s = fschedule([C]) - a = tvm.nd.array(a_np, ctx) - w = tvm.nd.array(w_np, ctx) - b = tvm.nd.array(b_np, ctx) - c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx) + a = tvm.nd.array(a_np, dev) + w = tvm.nd.array(w_np, dev) + b = tvm.nd.array(b_np, dev) + c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev) if add_bias: func = tvm.build( s, diff --git a/tests/python/topi/python/test_topi_conv2d_transpose_nchw.py b/tests/python/topi/python/test_topi_conv2d_transpose_nchw.py index 5cc2c2eb6f5d..95159a05d54f 100644 --- a/tests/python/topi/python/test_topi_conv2d_transpose_nchw.py +++ b/tests/python/topi/python/test_topi_conv2d_transpose_nchw.py @@ -62,7 +62,7 @@ def get_ref_data(): a_np, w_np, b_np, c_np = get_ref_data() - def check(fcompute, fschedule, device, ctx): + def check(fcompute, fschedule, target, dev): B = fcompute( A, W, @@ -74,36 +74,36 @@ def check(fcompute, fschedule, device, ctx): C = topi.nn.relu(B) s1 = fschedule([B]) s2 = fschedule([C]) - a = tvm.nd.array(a_np, ctx) - w = tvm.nd.array(w_np, ctx) - b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx) - c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx) + a = tvm.nd.array(a_np, dev) + w = tvm.nd.array(w_np, dev) + b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), dev) + c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev) - func1 = tvm.build(s1, [A, W, B], device) - func2 = tvm.build(s2, [A, W, C], device) + func1 = tvm.build(s1, [A, W, B], target) + func2 = tvm.build(s2, [A, W, C], target) func1(a, w, b) func2(a, w, c) tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5) tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5) - def check_generic(device, ctx): - print("Running generic on target: %s" % device) - with tvm.target.Target(device): + def check_generic(target, dev): + print("Running generic on target: %s" % target) + with tvm.target.Target(target): fcompute, fschedule = _conv2d_transpose_nchw_implement["generic"] - check(fcompute, fschedule, device, ctx) + check(fcompute, fschedule, target, dev) check_generic("llvm", tvm.cpu(0)) - def check_device(device, ctx): - print("Running on target: %s" % device) - with tvm.target.Target(device): + def check_target(target, dev): + print("Running on target: %s" % target) + with tvm.target.Target(target): fcompute, 
fschedule = tvm.topi.testing.dispatch( - device, _conv2d_transpose_nchw_implement + target, _conv2d_transpose_nchw_implement ) - check(fcompute, fschedule, device, ctx) + check(fcompute, fschedule, target, dev) - for device, ctx in tvm.testing.enabled_targets(): - check_device(device, ctx) + for target, dev in tvm.testing.enabled_targets(): + check_target(target, dev) @tvm.testing.uses_gpu diff --git a/tests/python/topi/python/test_topi_conv2d_winograd.py b/tests/python/topi/python/test_topi_conv2d_winograd.py index 34febfd9460a..c91447f1096f 100644 --- a/tests/python/topi/python/test_topi_conv2d_winograd.py +++ b/tests/python/topi/python/test_topi_conv2d_winograd.py @@ -84,7 +84,7 @@ def get_ref_data(): a_np, w_np, b_np, c_np = get_ref_data() def check_device(device): - ctx = tvm.context(device, 0) + dev = tvm.device(device, 0) if not tvm.testing.device_enabled(device): print("Skip because %s is not enabled" % device) return @@ -98,10 +98,10 @@ def check_device(device): C = topi.nn.relu(C) s = fschedule([C]) - a = tvm.nd.array(a_np, ctx) - w = tvm.nd.array(w_np, ctx) - b = tvm.nd.array(b_np, ctx) - c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx) + a = tvm.nd.array(a_np, dev) + w = tvm.nd.array(w_np, dev) + b = tvm.nd.array(b_np, dev) + c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev) if add_bias: func = tvm.build( s, @@ -205,15 +205,15 @@ def get_ref_data(): a_np, w_np, b_np, c_np = get_ref_data() target = "llvm" - ctx = tvm.context(target) + dev = tvm.device(target) C = topi.nn.conv2d_winograd_nhwc(A, W, stride, padding, dilation, dtype) s = te.create_schedule([C.op]) - a = tvm.nd.array(a_np, ctx=ctx) - w = tvm.nd.array(w_np, ctx=ctx) - b = tvm.nd.array(b_np, ctx=ctx) - c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx=ctx) + a = tvm.nd.array(a_np, device=dev) + w = tvm.nd.array(w_np, device=dev) + b = tvm.nd.array(b_np, device=dev) + c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), device=dev) func = tvm.build(s, [A, W, C], target=target) func(a, w, c) diff --git a/tests/python/topi/python/test_topi_conv3d_ncdhw.py b/tests/python/topi/python/test_topi_conv3d_ncdhw.py index 094a71074fa0..e28aea5bc4f6 100644 --- a/tests/python/topi/python/test_topi_conv3d_ncdhw.py +++ b/tests/python/topi/python/test_topi_conv3d_ncdhw.py @@ -81,10 +81,10 @@ def get_ref_data(): a_np, w_np, b_np, c_np = get_ref_data() - def check_device(device, ctx): - print("Running on target: %s" % device) - fcompute, fschedule = tvm.topi.testing.dispatch(device, _conv3d_ncdhw_implement) - with tvm.target.Target(device): + def check_target(target, dev): + print("Running on target: %s" % target) + fcompute, fschedule = tvm.topi.testing.dispatch(target, _conv3d_ncdhw_implement) + with tvm.target.Target(target): C = fcompute( A, W, (stride, stride, stride), padding, (dilation, dilation, dilation), dtype ) @@ -94,15 +94,15 @@ def check_device(device, ctx): C = topi.nn.relu(C) s = fschedule([C]) - a = tvm.nd.array(a_np, ctx) - w = tvm.nd.array(w_np, ctx) - b = tvm.nd.array(b_np, ctx) - c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx) + a = tvm.nd.array(a_np, dev) + w = tvm.nd.array(w_np, dev) + b = tvm.nd.array(b_np, dev) + c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev) if add_bias: func = tvm.build( s, [A, W, bias, C], - device, + target, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), ) @@ -111,16 +111,16 @@ def 
check_device(device, ctx): func = tvm.build( s, [A, W, C], - device, + target, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), ) func(a, w, c) tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-4) - for device, ctx in tvm.testing.enabled_targets(): - with autotvm.tophub.context(device): # load tophub pre-tuned parameters - check_device(device, ctx) + for target, dev in tvm.testing.enabled_targets(): + with autotvm.tophub.context(target): # load tophub pre-tuned parameters + check_target(target, dev) @tvm.testing.uses_gpu diff --git a/tests/python/topi/python/test_topi_conv3d_ndhwc.py b/tests/python/topi/python/test_topi_conv3d_ndhwc.py index 2d2541af5979..e5791c3bb482 100644 --- a/tests/python/topi/python/test_topi_conv3d_ndhwc.py +++ b/tests/python/topi/python/test_topi_conv3d_ndhwc.py @@ -64,22 +64,22 @@ def get_ref_data(): a_np, w_np, b_np = get_ref_data() - def check_device(device, ctx): - print("Running on target: %s" % device) - fcompute, fschedule = tvm.topi.testing.dispatch(device, _conv3d_ndhwc_implement) - with tvm.target.Target(device): + def check_target(target, dev): + print("Running on target: %s" % target) + fcompute, fschedule = tvm.topi.testing.dispatch(target, _conv3d_ndhwc_implement) + with tvm.target.Target(target): B = fcompute(A, W, stride, padding, dilation, dtype) s = fschedule([B]) - ctx = tvm.context(device, 0) - a = tvm.nd.array(a_np, ctx) - w = tvm.nd.array(w_np, ctx) - b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx) - func = tvm.build(s, [A, W, B], device) + dev = tvm.device(target, 0) + a = tvm.nd.array(a_np, dev) + w = tvm.nd.array(w_np, dev) + b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), dev) + func = tvm.build(s, [A, W, B], target) func(a, w, b) tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5) - for device, ctx in tvm.testing.enabled_targets(): - check_device(device, ctx) + for target, dev in tvm.testing.enabled_targets(): + check_target(target, dev) @tvm.testing.uses_gpu diff --git a/tests/python/topi/python/test_topi_conv3d_ndhwc_tensorcore.py b/tests/python/topi/python/test_topi_conv3d_ndhwc_tensorcore.py index 1e027aba4cd3..8340ff27fddf 100644 --- a/tests/python/topi/python/test_topi_conv3d_ndhwc_tensorcore.py +++ b/tests/python/topi/python/test_topi_conv3d_ndhwc_tensorcore.py @@ -85,7 +85,7 @@ def get_ref_data(): a_np, w_np, b_np, c_np = get_ref_data() def check_device(device): - ctx = tvm.context(device, 0) + dev = tvm.device(device, 0) print("Running on target: %s" % device) with tvm.target.Target(device): fcompute, fschedule = tvm.topi.testing.dispatch( @@ -98,10 +98,10 @@ def check_device(device): C = topi.nn.relu(C) s = fschedule([C]) - a = tvm.nd.array(a_np, ctx) - w = tvm.nd.array(w_np, ctx) - b = tvm.nd.array(b_np, ctx) - c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx) + a = tvm.nd.array(a_np, dev) + w = tvm.nd.array(w_np, dev) + b = tvm.nd.array(b_np, dev) + c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev) if add_bias: func = tvm.build( s, diff --git a/tests/python/topi/python/test_topi_conv3d_transpose_ncdhw.py b/tests/python/topi/python/test_topi_conv3d_transpose_ncdhw.py index 2ac7ccc708ec..fadbc8d48403 100644 --- a/tests/python/topi/python/test_topi_conv3d_transpose_ncdhw.py +++ b/tests/python/topi/python/test_topi_conv3d_transpose_ncdhw.py @@ -61,11 +61,11 @@ def get_ref_data(): a_np, w_np, b_np, c_np = get_ref_data() - def check_device(device, ctx): - 
print("Running on target: %s" % device) - with tvm.target.Target(device): + def check_target(target, dev): + print("Running on target: %s" % target) + with tvm.target.Target(target): fcompute, fschedule = tvm.topi.testing.dispatch( - device, _conv3d_transpose_ncdhw_implement + target, _conv3d_transpose_ncdhw_implement ) B = fcompute( A, @@ -78,20 +78,20 @@ def check_device(device, ctx): C = topi.nn.relu(B) s1 = fschedule([B]) s2 = fschedule([C]) - a = tvm.nd.array(a_np, ctx) - w = tvm.nd.array(w_np, ctx) - b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx) - c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx) + a = tvm.nd.array(a_np, dev) + w = tvm.nd.array(w_np, dev) + b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), dev) + c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev) - func1 = tvm.build(s1, [A, W, B], device) - func2 = tvm.build(s2, [A, W, C], device) + func1 = tvm.build(s1, [A, W, B], target) + func2 = tvm.build(s2, [A, W, C], target) func1(a, w, b) func2(a, w, c) tvm.testing.assert_allclose(b.asnumpy(), b_np, atol=1e-4, rtol=1e-4) tvm.testing.assert_allclose(c.asnumpy(), c_np, atol=1e-4, rtol=1e-4) - for device, ctx in tvm.testing.enabled_targets(): - check_device(device, ctx) + for target, dev in tvm.testing.enabled_targets(): + check_target(target, dev) @tvm.testing.uses_gpu diff --git a/tests/python/topi/python/test_topi_conv3d_winograd.py b/tests/python/topi/python/test_topi_conv3d_winograd.py index d00249ba4392..650ead3ff009 100644 --- a/tests/python/topi/python/test_topi_conv3d_winograd.py +++ b/tests/python/topi/python/test_topi_conv3d_winograd.py @@ -82,7 +82,7 @@ def get_ref_data(): a_np, w_np, b_np, c_np = get_ref_data() def check_device(device): - ctx = tvm.context(device, 0) + dev = tvm.device(device, 0) if not tvm.testing.device_enabled(device): print("Skip because %s is not enabled" % device) return @@ -98,10 +98,10 @@ def check_device(device): C = topi.nn.relu(C) s = fschedule([C]) - a = tvm.nd.array(a_np, ctx) - w = tvm.nd.array(w_np, ctx) - b = tvm.nd.array(b_np, ctx) - c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx) + a = tvm.nd.array(a_np, dev) + w = tvm.nd.array(w_np, dev) + b = tvm.nd.array(b_np, dev) + c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev) if add_bias: func = tvm.build( s, diff --git a/tests/python/topi/python/test_topi_correlation.py b/tests/python/topi/python/test_topi_correlation.py index 4709fb7d68f9..33df5a1a00f4 100644 --- a/tests/python/topi/python/test_topi_correlation.py +++ b/tests/python/topi/python/test_topi_correlation.py @@ -65,25 +65,25 @@ def get_ref_data(): a_np, b_np, c_np = get_ref_data() - def check_device(device, ctx): - print("Running on target: %s" % device) - fcompute, fschedule = tvm.topi.testing.dispatch(device, _correlation_implement) - with tvm.target.Target(device): + def check_device(target, dev): + print("Running on target: %s" % target) + fcompute, fschedule = tvm.topi.testing.dispatch(target, _correlation_implement) + with tvm.target.Target(target): C = fcompute( A, B, kernel_size, max_displacement, stride1, stride2, pad_size, is_multiply ) s = fschedule([C]) - a = tvm.nd.array(a_np, ctx) - b = tvm.nd.array(b_np, ctx) - c = tvm.nd.empty(c_np.shape, dtype=dtype, ctx=ctx) + a = tvm.nd.array(a_np, dev) + b = tvm.nd.array(b_np, dev) + c = tvm.nd.empty(c_np.shape, dtype=dtype, device=dev) - func = tvm.build(s, [A, B, C], device) + func = tvm.build(s, [A, B, C], target) func(a, b, c) 
tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5) - for device, ctx in tvm.testing.enabled_targets(): - check_device(device, ctx) + for target, dev in tvm.testing.enabled_targets(): + check_device(target, dev) @tvm.testing.uses_gpu diff --git a/tests/python/topi/python/test_topi_cumsum.py b/tests/python/topi/python/test_topi_cumsum.py deleted file mode 100644 index cfe5130643c5..000000000000 --- a/tests/python/topi/python/test_topi_cumsum.py +++ /dev/null @@ -1,79 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -import numpy as np -import tvm -import tvm.testing -from tvm import topi -import tvm.topi.testing - - -@tvm.testing.parametrize_targets -def test_cumsum(ctx, target): - def check_cumsum(np_ref, data, axis=None, dtype=None): - implementations = { - "generic": (lambda x: topi.cumsum(x, axis, dtype), topi.generic.schedule_extern), - "cuda": (lambda x: topi.cuda.cumsum(x, axis, dtype), topi.cuda.schedule_scan), - "nvptx": (lambda x: topi.cuda.cumsum(x, axis, dtype), topi.cuda.schedule_scan), - "vulkan": (lambda x: topi.cuda.cumsum(x, axis, dtype), topi.cuda.schedule_scan), - "metal": (lambda x: topi.cuda.cumsum(x, axis, dtype), topi.cuda.schedule_scan), - } - fcompute, fschedule = tvm.topi.testing.dispatch(target, implementations) - tvm.topi.testing.compare_numpy_tvm([data], np_ref, target, ctx, fcompute, fschedule) - - data = np.array([2, 3, 0]) - check_cumsum(np.cumsum(data), data) - - data = np.random.rand(10) > 0.5 - data = data.astype(np.int32) - check_cumsum(np.cumsum(data, dtype=np.int32), data) - check_cumsum(np.cumsum(data), data, dtype="int64") - - data = np.random.rand(10) > 0.5 - check_cumsum(np.cumsum(data, dtype=np.int32), data, dtype="int32") - - for in_dtype in ["float32", "float64"]: - if target == "metal" and in_dtype == "float64": - # float64 is not supported in metal - continue - data = np.random.randn(10, 10).astype(in_dtype) - check_cumsum(np.cumsum(data), data) - check_cumsum(np.cumsum(data, axis=0), data, axis=0) - check_cumsum(np.cumsum(data, axis=1), data, axis=1) - - data = np.random.randn(10, 5, 10).astype(in_dtype) - check_cumsum(np.cumsum(data), data) - check_cumsum(np.cumsum(data, axis=0), data, axis=0) - check_cumsum(np.cumsum(data, axis=1), data, axis=1) - check_cumsum(np.cumsum(data, axis=-1), data, axis=-1) - - for in_dtype in ["int32", "int64"]: - data = np.random.randint(-100, 100, size=(100, 100)).astype(in_dtype) - check_cumsum(np.cumsum(data, dtype=in_dtype), data) - check_cumsum(np.cumsum(data), data, dtype="int64") - check_cumsum(np.cumsum(data, axis=0, dtype=in_dtype), data, axis=0) - check_cumsum(np.cumsum(data, axis=1, dtype=in_dtype), data, axis=1) - - data = np.random.randint(1 << 30, (1 << 31) - 1, size=(100)).astype(in_dtype) - check_cumsum(np.cumsum(data), data, dtype="int64") - - -if 
__name__ == "__main__": - test_cumsum(tvm.context("cpu"), tvm.target.Target("llvm")) - test_cumsum(tvm.context("cuda"), tvm.target.Target("cuda")) - test_cumsum(tvm.context("nvptx"), tvm.target.Target("nvptx")) - test_cumsum(tvm.context("vulkan"), tvm.target.Target("vulkan")) - test_cumsum(tvm.context("metal"), tvm.target.Target("metal")) diff --git a/tests/python/topi/python/test_topi_deformable_conv2d.py b/tests/python/topi/python/test_topi_deformable_conv2d.py index cd6f33f14fd7..20df09f30d12 100644 --- a/tests/python/topi/python/test_topi_deformable_conv2d.py +++ b/tests/python/topi/python/test_topi_deformable_conv2d.py @@ -93,7 +93,7 @@ def get_ref_data(): a_np, offset_np, w_np, c_np = get_ref_data() def check_device(device): - ctx = tvm.context(device, 0) + dev = tvm.device(device, 0) if not tvm.testing.device_enabled(device): print("Skip because %s is not enabled" % device) return @@ -103,10 +103,10 @@ def check_device(device): C = fcompute(A, Offset, W, stride, padding, dilation, deformable_groups, groups, dtype) s = fschedule([C]) - a = tvm.nd.array(a_np, ctx) - offset = tvm.nd.array(offset_np, ctx) - w = tvm.nd.array(w_np, ctx) - c = tvm.nd.empty(c_np.shape, dtype=c_np.dtype, ctx=ctx) + a = tvm.nd.array(a_np, dev) + offset = tvm.nd.array(offset_np, dev) + w = tvm.nd.array(w_np, dev) + c = tvm.nd.empty(c_np.shape, dtype=c_np.dtype, device=dev) func = tvm.build(s, [A, Offset, W, C], device) func(a, offset, w, c) @@ -173,7 +173,7 @@ def get_ref_data(): a_np, offset_np, w_np, c_np = get_ref_data() def check_device(device): - ctx = tvm.context(device, 0) + dev = tvm.device(device, 0) if not tvm.testing.device_enabled(device): print("Skip because %s is not enabled" % device) return @@ -183,10 +183,10 @@ def check_device(device): C = fcompute(A, Offset, W, stride, padding, dilation, deformable_groups, groups, dtype) s = fschedule([C]) - a = tvm.nd.array(a_np, ctx) - offset = tvm.nd.array(offset_np, ctx) - w = tvm.nd.array(w_np, ctx) - c = tvm.nd.empty(c_np.shape, dtype=c_np.dtype, ctx=ctx) + a = tvm.nd.array(a_np, dev) + offset = tvm.nd.array(offset_np, dev) + w = tvm.nd.array(w_np, dev) + c = tvm.nd.empty(c_np.shape, dtype=c_np.dtype, device=dev) func = tvm.build(s, [A, Offset, W, C], device) func(a, offset, w, c) diff --git a/tests/python/topi/python/test_topi_dense.py b/tests/python/topi/python/test_topi_dense.py index 95ebce43497b..fa966b6f00e5 100644 --- a/tests/python/topi/python/test_topi_dense.py +++ b/tests/python/topi/python/test_topi_dense.py @@ -64,23 +64,23 @@ def get_ref_data(): # get the test data a_np, b_np, c_np, d_np = get_ref_data() - def check_device(device, ctx): + def check_device(device, dev): print("Running on target: %s" % device) for fcompute, fschedule in tvm.topi.testing.dispatch(device, _dense_implement): with tvm.target.Target(device): D = fcompute(A, B, C if use_bias else None) D = topi.nn.relu(D) s = fschedule([D]) - a = tvm.nd.array(a_np, ctx) - b = tvm.nd.array(b_np, ctx) - c = tvm.nd.array(c_np, ctx) - d = tvm.nd.array(np.zeros(get_const_tuple(D.shape), dtype=dtype), ctx) + a = tvm.nd.array(a_np, dev) + b = tvm.nd.array(b_np, dev) + c = tvm.nd.array(c_np, dev) + d = tvm.nd.array(np.zeros(get_const_tuple(D.shape), dtype=dtype), dev) f = tvm.build(s, [A, B, C, D], device, name="dense") f(a, b, c, d) tvm.testing.assert_allclose(d.asnumpy(), d_np, rtol=1e-5) - for device, ctx in tvm.testing.enabled_targets(): - check_device(device, ctx) + for device, dev in tvm.testing.enabled_targets(): + check_device(device, dev) def verify_dense_int8(batch, in_dim, 
out_dim, use_bias=True): @@ -106,8 +106,8 @@ def get_ref_data(): a_np, b_np, c_np, d_np = get_ref_data() def check_device(device): - ctx = tvm.context(device, 0) - if device == "cuda" and not tvm.contrib.nvcc.have_int8(ctx.compute_version): + dev = tvm.device(device, 0) + if device == "cuda" and not tvm.contrib.nvcc.have_int8(dev.compute_version): print("Skip because int8 intrinsics are not available") return @@ -116,10 +116,10 @@ def check_device(device): D = topi.cuda.dense_int8(A, B, C if use_bias else None, out_dtype) D = topi.nn.relu(D) s = topi.cuda.schedule_dense_int8([D]) - a = tvm.nd.array(a_np, ctx) - b = tvm.nd.array(b_np, ctx) - c = tvm.nd.array(c_np, ctx) - d = tvm.nd.array(np.zeros(get_const_tuple(D.shape), dtype=out_dtype), ctx) + a = tvm.nd.array(a_np, dev) + b = tvm.nd.array(b_np, dev) + c = tvm.nd.array(c_np, dev) + d = tvm.nd.array(np.zeros(get_const_tuple(D.shape), dtype=out_dtype), dev) f = tvm.build(s, [A, B, C, D], device, name="dense") f(a, b, c, d) tvm.testing.assert_allclose(d.asnumpy(), d_np, rtol=1e-5) diff --git a/tests/python/topi/python/test_topi_dense_tensorcore.py b/tests/python/topi/python/test_topi_dense_tensorcore.py index 3ffdea50d660..4ea3202569cb 100644 --- a/tests/python/topi/python/test_topi_dense_tensorcore.py +++ b/tests/python/topi/python/test_topi_dense_tensorcore.py @@ -52,17 +52,17 @@ def get_ref_data(): a_np, b_np, c_np, d_np = get_ref_data() def check_device(device): - ctx = tvm.context(device, 0) + dev = tvm.device(device, 0) print("Running on target: %s" % device) for fcompute, fschedule in tvm.topi.testing.dispatch(device, _dense_implement): with tvm.target.Target(device): D = fcompute(A, B, C if use_bias else None) D = topi.nn.relu(D) s = fschedule([D]) - a = tvm.nd.array(a_np, ctx) - b = tvm.nd.array(b_np, ctx) - c = tvm.nd.array(c_np, ctx) - d = tvm.nd.array(np.zeros(get_const_tuple(D.shape), dtype=dtype), ctx) + a = tvm.nd.array(a_np, dev) + b = tvm.nd.array(b_np, dev) + c = tvm.nd.array(c_np, dev) + d = tvm.nd.array(np.zeros(get_const_tuple(D.shape), dtype=dtype), dev) f = tvm.build(s, [A, B, C, D], device, name="dense") f(a, b, c, d) tvm.testing.assert_allclose(d.asnumpy(), d_np, rtol=1e-3) diff --git a/tests/python/topi/python/test_topi_depth_to_space.py b/tests/python/topi/python/test_topi_depth_to_space.py index cb16f9bf438f..93731868e922 100644 --- a/tests/python/topi/python/test_topi_depth_to_space.py +++ b/tests/python/topi/python/test_topi_depth_to_space.py @@ -51,18 +51,18 @@ def verify_depth_to_space( a_np = np.transpose(a_np, axes=[0, 2, 3, 1]) b_np = np.transpose(b_np, axes=[0, 2, 3, 1]) - def check_device(device, ctx): + def check_device(device, dev): print("Running on target: %s" % device) with tvm.target.Target(device): s = tvm.topi.testing.get_injective_schedule(device)(B) - a = tvm.nd.array(a_np, ctx) - b = tvm.nd.array(np.zeros(out_shape, dtype=dtype), ctx) + a = tvm.nd.array(a_np, dev) + b = tvm.nd.array(np.zeros(out_shape, dtype=dtype), dev) f = tvm.build(s, [A, B], device) f(a, b) tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-3, atol=1e-3) - for device, ctx in tvm.testing.enabled_targets(): - check_device(device, ctx) + for device, dev in tvm.testing.enabled_targets(): + check_device(device, dev) @tvm.testing.uses_gpu diff --git a/tests/python/topi/python/test_topi_depthwise_conv2d.py b/tests/python/topi/python/test_topi_depthwise_conv2d.py index 804c486d27d7..91ee1ee02fe4 100644 --- a/tests/python/topi/python/test_topi_depthwise_conv2d.py +++ b/tests/python/topi/python/test_topi_depthwise_conv2d.py @@ 
-78,16 +78,16 @@ def compile_depthwise_NHWC_int8_arm(
     bias = te.placeholder((in_channel * depth_multiplier,), name="bias", dtype="int32")
     dtype = "int32"
 
-    device = "llvm -device=arm_cpu -mtriple=aarch64-linux-gnu"
+    target = "llvm -device=arm_cpu -mtriple=aarch64-linux-gnu"
     compute = topi.arm_cpu.compute_depthwise_conv2d_nhwc
     schedule = topi.arm_cpu.schedule_depthwise_conv2d_nhwc
 
-    if not tvm.testing.device_enabled(device):
-        print("Skip because %s is not enabled" % device)
+    if not tvm.testing.device_enabled(target):
+        print("Skip because %s is not enabled" % target)
         return
 
-    print("Compiling on arm AArch64 target: %s" % device)
-    with tvm.target.Target(device):
+    print("Compiling on arm AArch64 target: %s" % target)
+    with tvm.target.Target(target):
         assert topi.arm_cpu.arm_utils.is_aarch64_arm(), "AArch64 target not recognized"
 
         C = compute(A, W, (stride, stride), padding, (dilation, dilation), dtype)
@@ -102,7 +102,7 @@ def compile_depthwise_NHWC_int8_arm(
         func = tvm.build(
             s,
             ins_outs,
-            device,
+            target,
             name="depthwise_conv2d",
         )
 
@@ -133,17 +133,17 @@ def depthwise_conv2d_with_workload_nchw(
 
     dtype = "float32"
 
-    def check_device(device, ctx):
-        print("Running on target: %s" % device)
+    def check_target(target, dev):
+        print("Running on target: %s" % target)
 
-        impl_list = tvm.topi.testing.dispatch(device, _depthwise_conv2d_nchw_implement)[:]
-        if device == "llvm" and channel_multiplier == 1 and dilation == 1:
+        impl_list = tvm.topi.testing.dispatch(target, _depthwise_conv2d_nchw_implement)[:]
+        if target == "llvm" and channel_multiplier == 1 and dilation == 1:
             impl_list.append(
                 (topi.x86.depthwise_conv2d_nchw, topi.x86.schedule_depthwise_conv2d_nchw)
             )
 
         for fcompute, fschedule in impl_list:
-            with tvm.target.Target(device):
+            with tvm.target.Target(target):
                 # declare
                 DepthwiseConv2d = fcompute(
                     Input, Filter, (stride_h, stride_w), padding_args, dilation, dtype
@@ -155,9 +155,9 @@ def check_device(device, ctx):
                 s2 = fschedule(ScaleShift)
                 s3 = fschedule(Relu)
             # build the kernels
-            f1 = tvm.build(s1, [Input, Filter, DepthwiseConv2d], device)
-            f2 = tvm.build(s2, [Input, Filter, Scale, Shift, ScaleShift], device)
-            f3 = tvm.build(s3, [Input, Filter, Scale, Shift, Relu], device)
+            f1 = tvm.build(s1, [Input, Filter, DepthwiseConv2d], target)
+            f2 = tvm.build(s2, [Input, Filter, Scale, Shift, ScaleShift], target)
+            f3 = tvm.build(s3, [Input, Filter, Scale, Shift, Relu], target)
 
             # Prepare pod type for test data closure
             input_shape = get_const_tuple(Input.shape)
@@ -214,38 +214,38 @@ def verify_workload_padding():
             )
 
            # check if tile_ow candidates are the factors of the right output width.
- with tvm.target.Target(device): + with tvm.target.Target(target): cfg = autotvm.get_config() _fallback_schedule(cfg, wkl) ow_tile = np.prod(cfg["tile_ow"].size) tvm.testing.assert_allclose(ow_tile, out_width) - if "llvm" in device: + if "llvm" in target: verify_workload_padding() - input_tvm = tvm.nd.array(input_np, ctx) - filter_tvm = tvm.nd.array(filter_np, ctx) - scale_tvm = tvm.nd.array(scale_np, ctx) - shift_tvm = tvm.nd.array(shift_np, ctx) + input_tvm = tvm.nd.array(input_np, dev) + filter_tvm = tvm.nd.array(filter_np, dev) + scale_tvm = tvm.nd.array(scale_np, dev) + shift_tvm = tvm.nd.array(shift_np, dev) depthwise_conv2d_tvm = tvm.nd.array( np.zeros(shape=get_const_tuple(DepthwiseConv2d.shape), dtype=DepthwiseConv2d.dtype), - ctx, + dev, ) scale_shift_tvm = tvm.nd.array( - np.zeros(shape=get_const_tuple(ScaleShift.shape), dtype=ScaleShift.dtype), ctx + np.zeros(shape=get_const_tuple(ScaleShift.shape), dtype=ScaleShift.dtype), dev ) relu_tvm = tvm.nd.array( - np.zeros(shape=get_const_tuple(Relu.shape), dtype=Relu.dtype), ctx + np.zeros(shape=get_const_tuple(Relu.shape), dtype=Relu.dtype), dev ) # launch kernel 1 (depthwise_conv2d) - timer_1 = f1.time_evaluator(f1.entry_name, ctx, number=1) + timer_1 = f1.time_evaluator(f1.entry_name, dev, number=1) tcost_1 = timer_1(input_tvm, filter_tvm, depthwise_conv2d_tvm).mean # launch kernel 2 (depthwise_conv2d + scale_shift) - timer_2 = f2.time_evaluator(f2.entry_name, ctx, number=1) + timer_2 = f2.time_evaluator(f2.entry_name, dev, number=1) tcost_2 = timer_2(input_tvm, filter_tvm, scale_tvm, shift_tvm, scale_shift_tvm).mean # launch kernel 3 (depthwise_conv2d + scale_shift + relu) - timer_3 = f3.time_evaluator(f3.entry_name, ctx, number=1) + timer_3 = f3.time_evaluator(f3.entry_name, dev, number=1) tcost_3 = timer_3(input_tvm, filter_tvm, scale_tvm, shift_tvm, relu_tvm).mean tvm.testing.assert_allclose( depthwise_conv2d_tvm.asnumpy(), depthwise_conv2d_scipy, rtol=1e-5 @@ -253,9 +253,9 @@ def verify_workload_padding(): tvm.testing.assert_allclose(scale_shift_tvm.asnumpy(), scale_shift_scipy, rtol=1e-5) tvm.testing.assert_allclose(relu_tvm.asnumpy(), relu_scipy, rtol=1e-5) - for device, ctx in tvm.testing.enabled_targets(): - with autotvm.tophub.context(device): # load tophub pre-tuned parameters - check_device(device, ctx) + for target, dev in tvm.testing.enabled_targets(): + with autotvm.tophub.context(target): # load tophub pre-tuned parameters + check_target(target, dev) def depthwise_conv2d_with_workload_nhwc( @@ -284,11 +284,11 @@ def depthwise_conv2d_with_workload_nhwc( dtype = "float32" - def check_device(device, ctx): - print("Running on target: %s" % device) + def check_target(target, dev): + print("Running on target: %s" % target) - fcompute, fschedule = tvm.topi.testing.dispatch(device, _depthwise_conv2d_nhwc_implement) - with tvm.target.Target(device): + fcompute, fschedule = tvm.topi.testing.dispatch(target, _depthwise_conv2d_nhwc_implement) + with tvm.target.Target(target): # declare DepthwiseConv2d = fcompute( Input, Filter, (stride_h, stride_w), padding_args, dilation, dtype @@ -300,9 +300,9 @@ def check_device(device, ctx): s2 = fschedule(ScaleShift) s3 = fschedule(Relu) # build the kernels - f1 = tvm.build(s1, [Input, Filter, DepthwiseConv2d], device) - f2 = tvm.build(s2, [Input, Filter, Scale, Shift, ScaleShift], device) - f3 = tvm.build(s3, [Input, Filter, Scale, Shift, Relu], device) + f1 = tvm.build(s1, [Input, Filter, DepthwiseConv2d], target) + f2 = tvm.build(s2, [Input, Filter, Scale, Shift, ScaleShift], target) + 
f3 = tvm.build(s3, [Input, Filter, Scale, Shift, Relu], target) # Prepare pod type for test data closure input_shape = get_const_tuple(Input.shape) @@ -353,25 +353,25 @@ def get_ref_data(): ) = get_ref_data() # prepare data - input_tvm = tvm.nd.array(input_np, ctx) - filter_tvm = tvm.nd.array(filter_np, ctx) - scale_tvm = tvm.nd.array(scale_np, ctx) - shift_tvm = tvm.nd.array(shift_np, ctx) + input_tvm = tvm.nd.array(input_np, dev) + filter_tvm = tvm.nd.array(filter_np, dev) + scale_tvm = tvm.nd.array(scale_np, dev) + shift_tvm = tvm.nd.array(shift_np, dev) depthwise_conv2d_tvm = tvm.nd.array( - np.zeros(shape=get_const_tuple(DepthwiseConv2d.shape), dtype=DepthwiseConv2d.dtype), ctx + np.zeros(shape=get_const_tuple(DepthwiseConv2d.shape), dtype=DepthwiseConv2d.dtype), dev ) scale_shift_tvm = tvm.nd.array( - np.zeros(shape=get_const_tuple(ScaleShift.shape), dtype=ScaleShift.dtype), ctx + np.zeros(shape=get_const_tuple(ScaleShift.shape), dtype=ScaleShift.dtype), dev ) - relu_tvm = tvm.nd.array(np.zeros(shape=get_const_tuple(Relu.shape), dtype=Relu.dtype), ctx) + relu_tvm = tvm.nd.array(np.zeros(shape=get_const_tuple(Relu.shape), dtype=Relu.dtype), dev) # launch kernel 1 (depthwise_conv2d) - timer_1 = f1.time_evaluator(f1.entry_name, ctx, number=1) + timer_1 = f1.time_evaluator(f1.entry_name, dev, number=1) tcost_1 = timer_1(input_tvm, filter_tvm, depthwise_conv2d_tvm).mean # launch kernel 2 (depthwise_conv2d + scale_shift) - timer_2 = f2.time_evaluator(f2.entry_name, ctx, number=1) + timer_2 = f2.time_evaluator(f2.entry_name, dev, number=1) tcost_2 = timer_2(input_tvm, filter_tvm, scale_tvm, shift_tvm, scale_shift_tvm).mean # launch kernel 3 (depthwise_conv2d + scale_shift + relu) - timer_3 = f3.time_evaluator(f3.entry_name, ctx, number=1) + timer_3 = f3.time_evaluator(f3.entry_name, dev, number=1) tcost_3 = timer_3(input_tvm, filter_tvm, scale_tvm, shift_tvm, relu_tvm).mean relu_scipy = np.maximum(scale_shift_scipy, 0) tvm.testing.assert_allclose( @@ -380,9 +380,9 @@ def get_ref_data(): tvm.testing.assert_allclose(scale_shift_tvm.asnumpy(), scale_shift_scipy, rtol=1e-5) tvm.testing.assert_allclose(relu_tvm.asnumpy(), relu_scipy, rtol=1e-5) - for device, ctx in tvm.testing.enabled_targets(): - with autotvm.tophub.context(device): # load tophub pre-tuned parameters - check_device(device, ctx) + for target, dev in tvm.testing.enabled_targets(): + with autotvm.tophub.context(target): # load tophub pre-tuned parameters + check_target(target, dev) def _transform_data(data, bn): @@ -444,13 +444,13 @@ def depthwise_conv2d_with_workload_NCHWc( out_layout = "NCHW%dc" % oc_block dtype = "float32" - def check_device(device): - ctx = tvm.context(device, 0) - if not tvm.testing.device_enabled(device): - print("Skip because %s is not enabled" % device) + def check_target(target): + dev = tvm.device(target, 0) + if not tvm.testing.device_enabled(target): + print("Skip because %s is not enabled" % target) return - print("Running on target: %s" % device) - with tvm.target.Target(device): + print("Running on target: %s" % target) + with tvm.target.Target(target): # declare DepthwiseConv2d = topi.x86.depthwise_conv2d_NCHWc( Input, @@ -468,8 +468,8 @@ def check_device(device): s1 = topi.x86.schedule_depthwise_conv2d_NCHWc(DepthwiseConv2d) s2 = topi.x86.schedule_depthwise_conv2d_NCHWc(Relu) # build the kernels - f1 = tvm.build(s1, [Input, Filter, DepthwiseConv2d], device) - f2 = tvm.build(s2, [Input, Filter, Relu], device) + f1 = tvm.build(s1, [Input, Filter, DepthwiseConv2d], target) + f2 = tvm.build(s2, 
[Input, Filter, Relu], target) # Prepare pod type for test data closure input_shape = (batch, in_channel, in_height, in_width) @@ -498,13 +498,13 @@ def get_ref_data(): # Get the test data (input_np, filter_np, depthwise_conv2d_scipy, relu_scipy) = get_ref_data() - input_tvm = tvm.nd.array(input_np, ctx) - filter_tvm = tvm.nd.array(filter_np, ctx) + input_tvm = tvm.nd.array(input_np, dev) + filter_tvm = tvm.nd.array(filter_np, dev) depthwise_conv2d_tvm = tvm.nd.array( - np.zeros(shape=get_const_tuple(DepthwiseConv2d.shape), dtype=DepthwiseConv2d.dtype), ctx + np.zeros(shape=get_const_tuple(DepthwiseConv2d.shape), dtype=DepthwiseConv2d.dtype), dev ) - relu_tvm = tvm.nd.array(np.zeros(shape=get_const_tuple(Relu.shape), dtype=Relu.dtype), ctx) + relu_tvm = tvm.nd.array(np.zeros(shape=get_const_tuple(Relu.shape), dtype=Relu.dtype), dev) # launch kernel 1 (depthwise_conv2d) f1(input_tvm, filter_tvm, depthwise_conv2d_tvm) # launch kernel 2 (depthwise_conv2d + relu) @@ -515,9 +515,9 @@ def get_ref_data(): tvm.testing.assert_allclose(relu_tvm.asnumpy(), relu_scipy, rtol=1e-5) # test llvm only for now since depthwise_conv2d_NCHWc implement is missing in other backend. - for device in ["llvm"]: - with autotvm.tophub.context(device): # load tophub pre-tuned parameters - check_device(device) + for target in ["llvm"]: + with autotvm.tophub.context(target): # load tophub pre-tuned parameters + check_target(target) @tvm.testing.uses_gpu @@ -556,7 +556,7 @@ def test_depthwise_conv2d(): depthwise_conv2d_with_workload_NCHWc(1, 728, 32, 1, 3, 1, "SAME") depthwise_conv2d_with_workload_NCHWc(1, 728, 32, 1, 3, 1, "VALID") - # Test compilation on arm devices + # Test compilation on arm targets compile_depthwise_NHWC_int8_arm(1, 728, 32, 1, 3, 1, "SAME") compile_depthwise_NHWC_int8_arm(1, 728, 32, 1, 1, 1, "SAME", True) diff --git a/tests/python/topi/python/test_topi_depthwise_conv2d_back_input.py b/tests/python/topi/python/test_topi_depthwise_conv2d_back_input.py index 72ad1e29004a..d8c8f0e195c4 100644 --- a/tests/python/topi/python/test_topi_depthwise_conv2d_back_input.py +++ b/tests/python/topi/python/test_topi_depthwise_conv2d_back_input.py @@ -59,7 +59,7 @@ def verify_depthwise_conv2d_back_input( schedule = schedule_depthwise_conv2d_backward_input_nhwc(In_grad) def check_device(device): - ctx = tvm.context(device, 0) + dev = tvm.device(device, 0) if not tvm.testing.device_enabled(device): print("Skip because %s is not enabled" % device) return @@ -117,11 +117,11 @@ def get_ref_data(): (out_grad_np, filter_np, in_grad_np) = get_ref_data() - out_grad_tvm = tvm.nd.array(out_grad_np, ctx) - filter_tvm = tvm.nd.array(filter_np, ctx) - in_grad_tvm = tvm.nd.array(np.zeros(shape=ishape, dtype=dtype), ctx) + out_grad_tvm = tvm.nd.array(out_grad_np, dev) + filter_tvm = tvm.nd.array(filter_np, dev) + in_grad_tvm = tvm.nd.array(np.zeros(shape=ishape, dtype=dtype), dev) # launch the kernel - timer = f.time_evaluator(f.entry_name, ctx, number=1) + timer = f.time_evaluator(f.entry_name, dev, number=1) tcost = timer(filter_tvm, out_grad_tvm, in_grad_tvm).mean tvm.testing.assert_allclose(in_grad_np, in_grad_tvm.asnumpy(), rtol=1e-5) diff --git a/tests/python/topi/python/test_topi_depthwise_conv2d_back_weight.py b/tests/python/topi/python/test_topi_depthwise_conv2d_back_weight.py index 53328113aa71..daf7b5c82d41 100644 --- a/tests/python/topi/python/test_topi_depthwise_conv2d_back_weight.py +++ b/tests/python/topi/python/test_topi_depthwise_conv2d_back_weight.py @@ -54,7 +54,7 @@ def verify_depthwise_conv2d_back_weight( 
schedule = schedule_depthwise_conv2d_backward_weight_nhwc(Weight_grad) def check_device(device): - ctx = tvm.context(device, 0) + dev = tvm.device(device, 0) if not tvm.testing.device_enabled(device): print("Skip because %s is not enabled" % device) return @@ -101,11 +101,11 @@ def get_ref_data(): (out_grad_np, input_np, weight_grad_np) = get_ref_data() - out_grad_tvm = tvm.nd.array(out_grad_np, ctx) - input_tvm = tvm.nd.array(input_np, ctx) - weight_grad_tvm = tvm.nd.array(np.zeros(shape=fshape, dtype=dtype), ctx) + out_grad_tvm = tvm.nd.array(out_grad_np, dev) + input_tvm = tvm.nd.array(input_np, dev) + weight_grad_tvm = tvm.nd.array(np.zeros(shape=fshape, dtype=dtype), dev) # launch the kernel - timer = f.time_evaluator(f.entry_name, ctx, number=1) + timer = f.time_evaluator(f.entry_name, dev, number=1) tcost = timer(input_tvm, out_grad_tvm, weight_grad_tvm).mean tvm.testing.assert_allclose(weight_grad_np, weight_grad_tvm.asnumpy(), rtol=1e-4) diff --git a/tests/python/topi/python/test_topi_dilate.py b/tests/python/topi/python/test_topi_dilate.py index 27e71735c565..c09bcc0deaa6 100644 --- a/tests/python/topi/python/test_topi_dilate.py +++ b/tests/python/topi/python/test_topi_dilate.py @@ -24,7 +24,7 @@ def test_dilate(): target = "llvm" - ctx = tvm.cpu(0) + dev = tvm.cpu(0) def _test_dilate(input_size, strides, dilation_value=None): Input = te.placeholder((input_size)) @@ -38,9 +38,9 @@ def _test_dilate(input_size, strides, dilation_value=None): output_np = tvm.topi.testing.dilate_python(input_np, strides) else: output_np = tvm.topi.testing.dilate_python(input_np, strides, dilation_value) - input_tvm = tvm.nd.array(input_np, ctx=ctx) + input_tvm = tvm.nd.array(input_np, device=dev) output_size = topi.utils.get_const_tuple(Output.shape) - output_tvm = tvm.nd.array(np.zeros(shape=output_size).astype(Output.dtype), ctx=ctx) + output_tvm = tvm.nd.array(np.zeros(shape=output_size).astype(Output.dtype), device=dev) f = tvm.build(schedule, [Input, Output], target) f(input_tvm, output_tvm) tvm.testing.assert_allclose(output_tvm.asnumpy(), output_np, rtol=1e-5) diff --git a/tests/python/topi/python/test_topi_einsum.py b/tests/python/topi/python/test_topi_einsum.py index 49e951398f40..35de9306deaf 100644 --- a/tests/python/topi/python/test_topi_einsum.py +++ b/tests/python/topi/python/test_topi_einsum.py @@ -26,15 +26,15 @@ def with_tvm(lam, *args): """Take numpy arrays as args, convert them to TVM tensors and call `lam`. Result of lambda is converted back to numpy array and returned. 
""" - ctx = tvm.cpu(0) + dev = tvm.cpu(0) pls = [] # placeholders vals_nd = [] # initial values for i, arg in enumerate(args): pls.append(te.placeholder(arg.shape, name="pl" + str(i))) - vals_nd.append(tvm.nd.array(arg, ctx)) + vals_nd.append(tvm.nd.array(arg, dev)) out = lam(*pls) - out_nd = tvm.nd.array(np.zeros(get_const_tuple(out.shape), dtype=out.dtype), ctx) + out_nd = tvm.nd.array(np.zeros(get_const_tuple(out.shape), dtype=out.dtype), dev) s = te.create_schedule([out.op]) m = tvm.build(s, pls + [out], "llvm") m(*(vals_nd + [out_nd])) diff --git a/tests/python/topi/python/test_topi_group_conv2d.py b/tests/python/topi/python/test_topi_group_conv2d.py index 9c4da5c2c849..7f4803b9a8cd 100644 --- a/tests/python/topi/python/test_topi_group_conv2d.py +++ b/tests/python/topi/python/test_topi_group_conv2d.py @@ -89,15 +89,15 @@ def get_ref_data(): a_np, w_np, b_np, c_np = get_ref_data() - def check_device(device): - ctx = tvm.context(device, 0) - if not tvm.testing.device_enabled(device): - print("Skip because %s is not enabled" % device) + def check_target(target): + dev = tvm.device(target, 0) + if not tvm.testing.device_enabled(target): + print("Skip because %s is not enabled" % target) return - print("Running on target: %s" % device) - with tvm.target.Target(device): - fcompute, fschedule = tvm.topi.testing.dispatch(device, _group_conv2d_nchw_implement) + print("Running on target: %s" % target) + with tvm.target.Target(target): + fcompute, fschedule = tvm.topi.testing.dispatch(target, _group_conv2d_nchw_implement) C = fcompute(A, W, stride, padding, dilation, groups, dtype) if add_bias: C = topi.add(C, bias) @@ -105,15 +105,15 @@ def check_device(device): C = topi.nn.relu(C) s = fschedule([C]) - a = tvm.nd.array(a_np, ctx) - w = tvm.nd.array(w_np, ctx) - b = tvm.nd.array(b_np, ctx) - c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx) + a = tvm.nd.array(a_np, dev) + w = tvm.nd.array(w_np, dev) + b = tvm.nd.array(b_np, dev) + c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev) if add_bias: func = tvm.build( s, [A, W, bias, C], - device, + target, name="relu_%d_%d_%d_%d_%d_%d_%d_%d_%d" % ( batch, @@ -132,7 +132,7 @@ def check_device(device): func = tvm.build( s, [A, W, C], - device, + target, name="relu_%d_%d_%d_%d_%d_%d_%d_%d_%d" % ( batch, @@ -149,8 +149,8 @@ def check_device(device): func(a, w, c) tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5) - for device in ["llvm", "cuda"]: - check_device(device) + for target in ["llvm", "cuda"]: + check_target(target) oc_block_factor = 4 @@ -213,17 +213,17 @@ def get_ref_data(): a_np, w_np, b_np, c_np = get_ref_data() - def check_device(device): - ctx = tvm.context(device, 0) - if not tvm.testing.device_enabled(device): - print("Skip because %s is not enabled" % device) + def check_target(target): + dev = tvm.device(target, 0) + if not tvm.testing.device_enabled(target): + print("Skip because %s is not enabled" % target) return - if device == "cuda" and not tvm.contrib.nvcc.have_int8(ctx.compute_version): + if target == "cuda" and not tvm.contrib.nvcc.have_int8(dev.compute_version): print("Skip because int8 intrinsics are not available") return - print("Running on target: %s" % device) - with tvm.target.Target(device): + print("Running on target: %s" % target) + with tvm.target.Target(target): C = topi.cuda.group_conv2d_NCHWc_int8(A, W, stride, padding, dilation, groups, dtype) if add_bias: C = topi.add(C, bias) @@ -231,15 +231,15 @@ def check_device(device): C = topi.nn.relu(C) s = 
topi.cuda.schedule_group_conv2d_NCHWc_int8([C]) - a = tvm.nd.array(a_np, ctx) - w = tvm.nd.array(w_np, ctx) - b = tvm.nd.array(b_np, ctx) - c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx) + a = tvm.nd.array(a_np, dev) + w = tvm.nd.array(w_np, dev) + b = tvm.nd.array(b_np, dev) + c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev) if add_bias: func = tvm.build( s, [A, W, bias, C], - device, + target, name="relu_%d_%d_%d_%d_%d_%d_%d_%d_%d" % ( batch, @@ -258,7 +258,7 @@ def check_device(device): func = tvm.build( s, [A, W, C], - device, + target, name="relu_%d_%d_%d_%d_%d_%d_%d_%d_%d" % ( batch, @@ -275,8 +275,8 @@ def check_device(device): func(a, w, c) tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5) - for device in ["cuda"]: - check_device(device) + for target in ["cuda"]: + check_target(target) def verify_group_conv2d_nhwc( @@ -328,15 +328,15 @@ def get_ref_data(): a_np, w_np, b_np, c_np = get_ref_data() - def check_device(device): - ctx = tvm.context(device, 0) - if not tvm.testing.device_enabled(device): - print("Skip because %s is not enabled" % device) + def check_target(target): + dev = tvm.device(target, 0) + if not tvm.testing.device_enabled(target): + print("Skip because %s is not enabled" % target) return - print("Running on target: %s" % device) - with tvm.target.Target(device): - fcompute, fschedule = tvm.topi.testing.dispatch(device, _group_conv2d_nhwc_implement) + print("Running on target: %s" % target) + with tvm.target.Target(target): + fcompute, fschedule = tvm.topi.testing.dispatch(target, _group_conv2d_nhwc_implement) C = fcompute(A, W, stride, padding, dilation, groups, dtype) if add_bias: C = topi.add(C, bias) @@ -344,15 +344,15 @@ def check_device(device): C = topi.nn.relu(C) s = fschedule([C]) - a = tvm.nd.array(a_np, ctx) - w = tvm.nd.array(w_np, ctx) - b = tvm.nd.array(b_np, ctx) - c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx) + a = tvm.nd.array(a_np, dev) + w = tvm.nd.array(w_np, dev) + b = tvm.nd.array(b_np, dev) + c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev) if add_bias: func = tvm.build( s, [A, W, bias, C], - device, + target, name="relu_%d_%d_%d_%d_%d_%d_%d_%d_%d" % ( batch, @@ -371,7 +371,7 @@ def check_device(device): func = tvm.build( s, [A, W, C], - device, + target, name="relu_%d_%d_%d_%d_%d_%d_%d_%d_%d" % ( batch, @@ -388,8 +388,8 @@ def check_device(device): func(a, w, c) tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5) - for device in ["llvm"]: - check_device(device) + for target in ["llvm"]: + check_target(target) @tvm.testing.uses_gpu diff --git a/tests/python/topi/python/test_topi_group_conv2d_NCHWc_int8.py b/tests/python/topi/python/test_topi_group_conv2d_NCHWc_int8.py index b6cef2e97662..e69a3094ac4d 100644 --- a/tests/python/topi/python/test_topi_group_conv2d_NCHWc_int8.py +++ b/tests/python/topi/python/test_topi_group_conv2d_NCHWc_int8.py @@ -113,8 +113,8 @@ def get_ref_data(): a_np, w_np, c_np = get_ref_data() def check_device(device): - ctx = tvm.context(device, 0) - if not tvm.testing.device_enabled(ctx): + dev = tvm.device(device, 0) + if not tvm.testing.device_enabled(dev): print("Skip because %s is not enabled" % device) return print("Running on target: %s" % device) @@ -131,9 +131,9 @@ def check_device(device): ) s = topi.x86.schedule_conv2d_NCHWc([C]) - a = tvm.nd.array(a_np, ctx) - w = tvm.nd.array(w_np, ctx) - c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx) + a = tvm.nd.array(a_np, dev) + w = 
tvm.nd.array(w_np, dev) + c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev) func = tvm.build( s, [A, W, C], diff --git a/tests/python/topi/python/test_topi_image.py b/tests/python/topi/python/test_topi_image.py index c605df7037e4..b766e599c679 100644 --- a/tests/python/topi/python/test_topi_image.py +++ b/tests/python/topi/python/test_topi_image.py @@ -66,19 +66,19 @@ def verify_resize( scale_w = out_width / in_width b_np = tvm.topi.testing.upsampling_python(a_np, (scale_h, scale_w), layout) - def check_device(device, ctx): - print("Running on target: %s" % device) - with tvm.target.Target(device): - s = tvm.topi.testing.get_injective_schedule(device)(B) - a = tvm.nd.array(a_np, ctx) - b = tvm.nd.array(np.zeros(out_shape, dtype=dtype), ctx) - f = tvm.build(s, [A, B], device) + def check_target(target, dev): + print("Running on target: %s" % target) + with tvm.target.Target(target): + s = tvm.topi.testing.get_injective_schedule(target)(B) + a = tvm.nd.array(a_np, dev) + b = tvm.nd.array(np.zeros(out_shape, dtype=dtype), dev) + f = tvm.build(s, [A, B], target) f(a, b) tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-3, atol=1e-3) - for device, ctx in tvm.testing.enabled_targets(): - check_device(device, ctx) + for target, dev in tvm.testing.enabled_targets(): + check_target(target, dev) @tvm.testing.uses_gpu @@ -153,19 +153,19 @@ def verify_resize3d( scale_w = out_width / in_width b_np = tvm.topi.testing.upsampling3d_python(a_np, (scale_d, scale_h, scale_w), layout) - def check_device(device, ctx): - print("Running on target: %s" % device) - with tvm.target.Target(device): - s = tvm.topi.testing.get_injective_schedule(device)(B) - a = tvm.nd.array(a_np, ctx) - b = tvm.nd.array(np.zeros(out_shape, dtype=dtype), ctx) - f = tvm.build(s, [A, B], device) + def check_target(target, dev): + print("Running on target: %s" % target) + with tvm.target.Target(target): + s = tvm.topi.testing.get_injective_schedule(target)(B) + a = tvm.nd.array(a_np, dev) + b = tvm.nd.array(np.zeros(out_shape, dtype=dtype), dev) + f = tvm.build(s, [A, B], target) f(a, b) tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-3, atol=1e-3) - for device, ctx in tvm.testing.enabled_targets(): - check_device(device, ctx) + for target, dev in tvm.testing.enabled_targets(): + check_target(target, dev) @tvm.testing.uses_gpu @@ -225,21 +225,21 @@ def verify_crop_and_resize( np_images, np_boxes, np_box_indices, np_crop_size, layout, method, extrapolation_value ) - def check_device(device, ctx): - print("Running on target: %s" % device) - with tvm.target.Target(device): - s = tvm.topi.testing.get_injective_schedule(device)(out) - tvm_images = tvm.nd.array(np_images, ctx) - tvm_boxes = tvm.nd.array(np_boxes, ctx) - tvm_indices = tvm.nd.array(np_box_indices, ctx) - tvm_out = tvm.nd.array(np.zeros(out_shape, dtype="float32"), ctx) - f = tvm.build(s, [images, boxes, box_ind, out], device, name="crop_and_resize") + def check_target(target, dev): + print("Running on target: %s" % target) + with tvm.target.Target(target): + s = tvm.topi.testing.get_injective_schedule(target)(out) + tvm_images = tvm.nd.array(np_images, dev) + tvm_boxes = tvm.nd.array(np_boxes, dev) + tvm_indices = tvm.nd.array(np_box_indices, dev) + tvm_out = tvm.nd.array(np.zeros(out_shape, dtype="float32"), dev) + f = tvm.build(s, [images, boxes, box_ind, out], target, name="crop_and_resize") f(tvm_images, tvm_boxes, tvm_indices, tvm_out) tvm.testing.assert_allclose(tvm_out.asnumpy(), baseline_np, rtol=1e-3, atol=1e-3) - for device, ctx in 
tvm.testing.enabled_targets(): - check_device(device, ctx) + for target, dev in tvm.testing.enabled_targets(): + check_target(target, dev) boxes_1 = np.array([[0.2, 0.3, 0.7, 0.9]], dtype="float32") boxes_2 = np.array([[0.2, 0.3, 0.7, 0.9], [0, 0.1, 0.8, 1]], dtype="float32") @@ -272,19 +272,19 @@ def get_ref_data(): data_np, out_np = get_ref_data() - def check_device(device, ctx): - print("Running on target: %s" % device) - with tvm.target.Target(device): - s = tvm.topi.testing.get_injective_schedule(device)(out) - tvm_data = tvm.nd.array(data_np, ctx) - tvm_out = tvm.nd.empty(out_np.shape, dtype, ctx) - f = tvm.build(s, [data, out], device) + def check_target(target, dev): + print("Running on target: %s" % target) + with tvm.target.Target(target): + s = tvm.topi.testing.get_injective_schedule(target)(out) + tvm_data = tvm.nd.array(data_np, dev) + tvm_out = tvm.nd.empty(out_np.shape, dtype, dev) + f = tvm.build(s, [data, out], target) f(tvm_data, tvm_out) tvm.testing.assert_allclose(tvm_out.asnumpy(), out_np, rtol=1e-5, atol=1e-5) - for device, ctx in tvm.testing.enabled_targets(): - check_device(device, ctx) + for target, dev in tvm.testing.enabled_targets(): + check_target(target, dev) verify_affine_grid(1, (16, 32)) verify_affine_grid(4, (16, 32)) @@ -308,20 +308,20 @@ def get_ref_data(): data_np, grid_np, out_np = get_ref_data() - def check_device(device, ctx): - print("Running on target: %s" % device) - with tvm.target.Target(device): - s = tvm.topi.testing.get_injective_schedule(device)(out) - tvm_data = tvm.nd.array(data_np, ctx) - tvm_grid = tvm.nd.array(grid_np, ctx) - tvm_out = tvm.nd.empty(out_np.shape, dtype, ctx) - f = tvm.build(s, [data, grid, out], device) + def check_target(target, dev): + print("Running on target: %s" % target) + with tvm.target.Target(target): + s = tvm.topi.testing.get_injective_schedule(target)(out) + tvm_data = tvm.nd.array(data_np, dev) + tvm_grid = tvm.nd.array(grid_np, dev) + tvm_out = tvm.nd.empty(out_np.shape, dtype, dev) + f = tvm.build(s, [data, grid, out], target) f(tvm_data, tvm_grid, tvm_out) tvm.testing.assert_allclose(tvm_out.asnumpy(), out_np, rtol=1e-5, atol=1e-5) - for device, ctx in tvm.testing.enabled_targets(): - check_device(device, ctx) + for target, dev in tvm.testing.enabled_targets(): + check_target(target, dev) verify_grid_sample((4, 4, 16, 32), (4, 2, 8, 8)) verify_grid_sample((4, 4, 16, 32), (4, 2, 32, 32)) diff --git a/tests/python/topi/python/test_topi_lrn.py b/tests/python/topi/python/test_topi_lrn.py index 278926479977..203680b14781 100644 --- a/tests/python/topi/python/test_topi_lrn.py +++ b/tests/python/topi/python/test_topi_lrn.py @@ -50,9 +50,9 @@ def check_device(device): with tvm.target.Target(device): s_func = tvm.topi.testing.dispatch(device, _lrn_schedule) s = s_func([B]) - ctx = tvm.context(device, 0) - a = tvm.nd.array(a_np, ctx) - b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=dtype), ctx) + dev = tvm.device(device, 0) + a = tvm.nd.array(a_np, dev) + b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=dtype), dev) f = tvm.build(s, [A, B], device) f(a, b) tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5) diff --git a/tests/python/topi/python/test_topi_math.py b/tests/python/topi/python/test_topi_math.py index 74575ddba1c3..e2d978190877 100644 --- a/tests/python/topi/python/test_topi_math.py +++ b/tests/python/topi/python/test_topi_math.py @@ -58,18 +58,18 @@ def test_apply( a_np += ((np.abs(np.fmod(a_np, 1)) - 0.5) < 1e-6) * 1e-4 b_np = f_numpy(a_np) - def check_device(device, ctx): - 
print("Running on target: %s" % device) - with tvm.target.Target(device): - s = tvm.topi.testing.get_injective_schedule(device)(B) - foo = tvm.build(s, [A, B], device, name=name) - a = tvm.nd.array(a_np, ctx) - b = tvm.nd.array(np.zeros_like(b_np), ctx) + def check_target(target, dev): + print("Running on target: %s" % target) + with tvm.target.Target(target): + s = tvm.topi.testing.get_injective_schedule(target)(B) + foo = tvm.build(s, [A, B], target, name=name) + a = tvm.nd.array(a_np, dev) + b = tvm.nd.array(np.zeros_like(b_np), dev) foo(a, b) tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5, atol=1e-5) - for target, ctx in tvm.testing.enabled_targets(): - check_device(target, ctx) + for target, dev in tvm.testing.enabled_targets(): + check_target(target, dev) def test_isnan( low, @@ -94,18 +94,18 @@ def test_isnan( a_np += ((np.abs(np.fmod(a_np, 1)) - 0.5) < 1e-6) * 1e-5 b_np = np.isnan(a_np) - def check_device(device, ctx): - print("Running on target: %s" % device) - with tvm.target.Target(device): - s = tvm.topi.testing.get_injective_schedule(device)(B) - foo = tvm.build(s, [A, B], device, name="isnan") - a = tvm.nd.array(a_np, ctx) - b = tvm.nd.array(np.zeros_like(b_np), ctx) + def check_target(target, dev): + print("Running on target: %s" % target) + with tvm.target.Target(target): + s = tvm.topi.testing.get_injective_schedule(target)(B) + foo = tvm.build(s, [A, B], target, name="isnan") + a = tvm.nd.array(a_np, dev) + b = tvm.nd.array(np.zeros_like(b_np), dev) foo(a, b) tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5, atol=1e-5) - for target, ctx in tvm.testing.enabled_targets(): - check_device(target, ctx) + for target, dev in tvm.testing.enabled_targets(): + check_target(target, dev) def test_infiniteness_ops(topi_op, ref_op, name): for dtype in ["float32", "float64", "int32", "int16"]: @@ -125,17 +125,17 @@ def test_infiniteness_ops(topi_op, ref_op, name): ] = np.nan b_np = ref_op(a_np) - def check_device(device, ctx): - with tvm.target.Target(device): - s = tvm.topi.testing.get_injective_schedule(device)(B) - foo = tvm.build(s, [A, B], device, name=name) - a = tvm.nd.array(a_np, ctx) - b = tvm.nd.array(np.zeros_like(b_np), ctx) + def check_target(target, dev): + with tvm.target.Target(target): + s = tvm.topi.testing.get_injective_schedule(target)(B) + foo = tvm.build(s, [A, B], target, name=name) + a = tvm.nd.array(a_np, dev) + b = tvm.nd.array(np.zeros_like(b_np), dev) foo(a, b) tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5, atol=1e-5) - for target, ctx in tvm.testing.enabled_targets(): - check_device(target, ctx) + for target, dev in tvm.testing.enabled_targets(): + check_target(target, dev) test_apply(topi.floor, "floor", np.floor, -100, 100) test_apply(topi.ceil, "ceil", np.ceil, -100, 100) @@ -177,13 +177,13 @@ def verify(from_dtype, to_dtype, low=-100, high=100): a_np = a_np - a_np[2, 3] b_np = a_np.astype(to_dtype) - for device, ctx in tvm.testing.enabled_targets(): - print("Running on target: %s" % device) - with tvm.target.Target(device): - s = tvm.topi.testing.get_injective_schedule(device)(B) - foo = tvm.build(s, [A, B], device) - a = tvm.nd.array(a_np, ctx) - b = tvm.nd.empty(shape=shape, dtype=to_dtype, ctx=ctx) + for target, dev in tvm.testing.enabled_targets(): + print("Running on target: %s" % target) + with tvm.target.Target(target): + s = tvm.topi.testing.get_injective_schedule(target)(B) + foo = tvm.build(s, [A, B], target) + a = tvm.nd.array(a_np, dev) + b = tvm.nd.empty(shape=shape, dtype=to_dtype, device=dev) foo(a, b) 
tvm.testing.assert_allclose(b.asnumpy(), b_np) @@ -205,21 +205,21 @@ def test_apply(func, name, f_numpy, low, high, step, dtype="float32"): B = func(A) assert tuple(B.shape) == tuple(A.shape) - def check_device(device): - ctx = tvm.context(device, 0) - if not tvm.testing.device_enabled(device): - print("Skip because %s is not enabled" % device) + def check_target(target): + dev = tvm.device(target, 0) + if not tvm.testing.device_enabled(target): + print("Skip because %s is not enabled" % target) return - with tvm.target.Target(device): + with tvm.target.Target(target): s = topi.generic.schedule_injective(B) - func = tvm.build(s, [A, B], device, name=name) - a = tvm.nd.array(a_np, ctx) - b = tvm.nd.array(np.zeros_like(b_np), ctx) + func = tvm.build(s, [A, B], target, name=name) + a = tvm.nd.array(a_np, dev) + b = tvm.nd.array(np.zeros_like(b_np), dev) func(a, b) tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5, atol=1e-5) - check_device("llvm") - check_device("llvm -device=arm-cpu") + check_target("llvm") + check_target("llvm -device=arm-cpu") test_apply(topi.fast_exp, "fast_exp", np.exp, low=-88, high=88, step=0.01) test_apply(topi.fast_erf, "fast_erf", scipy.special.erf, low=-10, high=10, step=0.01) diff --git a/tests/python/topi/python/test_topi_matmul.py b/tests/python/topi/python/test_topi_matmul.py index 26ba6f8142b4..b0d71b315add 100644 --- a/tests/python/topi/python/test_topi_matmul.py +++ b/tests/python/topi/python/test_topi_matmul.py @@ -26,15 +26,15 @@ def with_tvm(lam, *args): """Take numpy arrays as args, convert them to TVM tensors and call `lam`. Result of lambda is converted back to numpy array and returned. """ - ctx = tvm.cpu(0) + dev = tvm.cpu(0) pls = [] # placeholders vals_nd = [] # initial values for i, arg in enumerate(args): pls.append(te.placeholder(arg.shape, name="pl" + str(i))) - vals_nd.append(tvm.nd.array(arg, ctx)) + vals_nd.append(tvm.nd.array(arg, dev)) out = lam(*pls) - out_nd = tvm.nd.array(np.zeros(get_const_tuple(out.shape), dtype=out.dtype), ctx) + out_nd = tvm.nd.array(np.zeros(get_const_tuple(out.shape), dtype=out.dtype), dev) s = te.create_schedule([out.op]) m = tvm.build(s, pls + [out], "llvm") m(*(vals_nd + [out_nd])) diff --git a/tests/python/topi/python/test_topi_pooling.py b/tests/python/topi/python/test_topi_pooling.py index 6f62b8ad969b..1451d18e42dd 100644 --- a/tests/python/topi/python/test_topi_pooling.py +++ b/tests/python/topi/python/test_topi_pooling.py @@ -106,20 +106,20 @@ def verify_pool(n, ic, ih, kh, sh, padding, pool_type, ceil_mode, count_include_ ) b_np = np.maximum(b_np, 0.0) - def check_device(device, ctx): - print("Running on target: %s" % device) - with tvm.target.Target(device): - s_func = tvm.topi.testing.dispatch(device, _pool_schedule) + def check_target(target, dev): + print("Running on target: %s" % target) + with tvm.target.Target(target): + s_func = tvm.topi.testing.dispatch(target, _pool_schedule) s = s_func(B, layout) - a = tvm.nd.array(a_np, ctx) - b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=dtype), ctx) - f = tvm.build(s, [A, B], device) + a = tvm.nd.array(a_np, dev) + b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=dtype), dev) + f = tvm.build(s, [A, B], target) f(a, b) tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=2e-5, atol=1e-5) - for device, ctx in tvm.testing.enabled_targets(): - check_device(device, ctx) + for target, dev in tvm.testing.enabled_targets(): + check_target(target, dev) def verify_pool_grad( @@ -181,21 +181,21 @@ def verify_pool_grad( if add_relu: 
pool_grad_np = np.maximum(pool_grad_np, 0.0) - def check_device(device, ctx): - print("Running on target: %s" % device) - with tvm.target.Target(device): - s_func = tvm.topi.testing.dispatch(device, _pool_grad_schedule) + def check_target(target, dev): + print("Running on target: %s" % target) + with tvm.target.Target(target): + s_func = tvm.topi.testing.dispatch(target, _pool_grad_schedule) s = s_func(PoolGrad) - a = tvm.nd.array(a_np, ctx) - out_grad = tvm.nd.array(out_grad_np, ctx) - pool_grad = tvm.nd.array(np.zeros(get_const_tuple(PoolGrad.shape), dtype=dtype), ctx) - f = tvm.build(s, [A, OutGrad, PoolGrad], device) + a = tvm.nd.array(a_np, dev) + out_grad = tvm.nd.array(out_grad_np, dev) + pool_grad = tvm.nd.array(np.zeros(get_const_tuple(PoolGrad.shape), dtype=dtype), dev) + f = tvm.build(s, [A, OutGrad, PoolGrad], target) f(a, out_grad, pool_grad) tvm.testing.assert_allclose(pool_grad.asnumpy(), pool_grad_np, rtol=1e-5) - for device, ctx in tvm.testing.enabled_targets(): - check_device(device, ctx) + for target, dev in tvm.testing.enabled_targets(): + check_target(target, dev) @tvm.testing.uses_gpu @@ -256,22 +256,22 @@ def verify_global_pool(dshape, pool_type, layout="NCHW"): b_np = np.max(a_np, axis=axis, keepdims=True) b_np = np.maximum(b_np, 0.0) - def check_device(device, ctx): - print("Running on target: %s" % device) - with tvm.target.Target(device): - s_func = tvm.topi.testing.dispatch(device, _adaptive_pool_schedule) - if device == "cuda": + def check_target(target, dev): + print("Running on target: %s" % target) + with tvm.target.Target(target): + s_func = tvm.topi.testing.dispatch(target, _adaptive_pool_schedule) + if target == "cuda": s = s_func(B, layout) else: s = s_func(B) - a = tvm.nd.array(a_np, ctx) - b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx) - f = tvm.build(s, [A, B], device) + a = tvm.nd.array(a_np, dev) + b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), dev) + f = tvm.build(s, [A, B], target) f(a, b) tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5) - for device, ctx in tvm.testing.enabled_targets(): - check_device(device, ctx) + for target, dev in tvm.testing.enabled_targets(): + check_target(target, dev) @tvm.testing.uses_gpu @@ -300,22 +300,22 @@ def verify_adaptive_pool(dshape, out_size, pool_type, layout="NCHW", dtype="floa assert len(out_size) == 3 out = topi.nn.adaptive_pool3d(data, out_size, pool_type, layout) - def check_device(device, ctx): - print("Running on target: %s" % device) - with tvm.target.Target(device): - s_func = tvm.topi.testing.dispatch(device, _adaptive_pool_schedule) - if device == "cuda": + def check_target(target, dev): + print("Running on target: %s" % target) + with tvm.target.Target(target): + s_func = tvm.topi.testing.dispatch(target, _adaptive_pool_schedule) + if target == "cuda": s = s_func(out, layout) else: s = s_func(out) - a = tvm.nd.array(np_data, ctx) - b = tvm.nd.array(np.zeros(get_const_tuple(oshape), dtype=out.dtype), ctx) - f = tvm.build(s, [data, out], device) + a = tvm.nd.array(np_data, dev) + b = tvm.nd.array(np.zeros(get_const_tuple(oshape), dtype=out.dtype), dev) + f = tvm.build(s, [data, out], target) f(a, b) tvm.testing.assert_allclose(b.asnumpy(), np_out, rtol=4e-5, atol=1e-6) - for device, ctx in tvm.testing.enabled_targets(): - check_device(device, ctx) + for target, dev in tvm.testing.enabled_targets(): + check_target(target, dev) @tvm.testing.uses_gpu @@ -367,20 +367,20 @@ def verify_pool3d( input_np, kernel, stride, padding, output_shape, 
pool_type, count_include_pad, ceil_mode
     )
 
-    def check_device(device, ctx):
-        print("Running on target: %s" % device)
-        with tvm.target.Target(device):
-            s_func = tvm.topi.testing.dispatch(device, _pool_schedule)
+    def check_target(target, dev):
+        print("Running on target: %s" % target)
+        with tvm.target.Target(target):
+            s_func = tvm.topi.testing.dispatch(target, _pool_schedule)
             s = s_func(B, layout)
 
-        a = tvm.nd.array(input_np, ctx)
-        b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=dtype), ctx)
-        f = tvm.build(s, [A, B], device)
+        a = tvm.nd.array(input_np, dev)
+        b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=dtype), dev)
+        f = tvm.build(s, [A, B], target)
         f(a, b)
         tvm.testing.assert_allclose(b.asnumpy(), ref_np, rtol=1e-5)
 
-    for device, ctx in tvm.testing.enabled_targets():
-        check_device(device, ctx)
+    for target, dev in tvm.testing.enabled_targets():
+        check_target(target, dev)
 
 
 @tvm.testing.uses_gpu
@@ -428,20 +428,20 @@ def verify_pool1d(
         input_np, kernel, stride, padding, output_shape, pool_type, count_include_pad, ceil_mode
     )
 
-    def check_device(device, ctx):
-        print("Running on target: %s" % device)
-        with tvm.target.Target(device):
-            s_func = tvm.topi.testing.dispatch(device, _pool_schedule)
+    def check_target(target, dev):
+        print("Running on target: %s" % target)
+        with tvm.target.Target(target):
+            s_func = tvm.topi.testing.dispatch(target, _pool_schedule)
             s = s_func(B, layout)
 
-        a = tvm.nd.array(input_np, ctx)
-        b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=dtype), ctx)
-        f = tvm.build(s, [A, B], device)
+        a = tvm.nd.array(input_np, dev)
+        b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=dtype), dev)
+        f = tvm.build(s, [A, B], target)
         f(a, b)
         tvm.testing.assert_allclose(b.asnumpy(), ref_np, rtol=1e-5)
 
-    for device, ctx in tvm.testing.enabled_targets():
-        check_device(device, ctx)
+    for target, dev in tvm.testing.enabled_targets():
+        check_target(target, dev)
 
 
 @tvm.testing.uses_gpu
diff --git a/tests/python/topi/python/test_topi_prng.py b/tests/python/topi/python/test_topi_prng.py
index 649e5410c147..4ad3a80c6a9e 100644
--- a/tests/python/topi/python/test_topi_prng.py
+++ b/tests/python/topi/python/test_topi_prng.py
@@ -21,7 +21,7 @@
 import numpy as np
 
 
-def threefry_split(target, ctx, gen):
+def threefry_split(target, dev, gen):
     gen_placeholder = tvm.te.placeholder(gen.shape, name="gen", dtype="uint64")
     left_placeholder, right_placeholder = tvm.topi.random.threefry_split(gen_placeholder)
     s = tvm.topi.generic.schedule_extern([left_placeholder, right_placeholder])
@@ -32,7 +32,7 @@ def threefry_split(target, ctx, gen):
     return left.asnumpy(), right.asnumpy()
 
 
-def threefry_generate(target, ctx, gen, size):
+def threefry_generate(target, dev, gen, size):
     gen_placeholder = tvm.te.placeholder(gen.shape, name="gen", dtype="uint64")
     left_placeholder, right_placeholder = tvm.topi.random.threefry_generate(gen_placeholder, size)
     s = tvm.topi.generic.schedule_extern([left_placeholder, right_placeholder])
@@ -44,10 +44,10 @@
 
 
 @tvm.testing.parametrize_targets
-def test_threefry_split(target, ctx):
+def test_threefry_split(target, dev):
     # test that results of split do not equal each other or the input
     gen = tvm.relay.random.threefry_key(0).data.asnumpy()
-    a, b = threefry_split(target, ctx, gen)
+    a, b = threefry_split(target, dev, gen)
     assert (a != b).any() and (
         a != gen
     ).any(), "Splitting a gen should result in different output gens"
@@ -57,39 +57,39 @@ def test_threefry_split(target, ctx):
     # test enough
splits to go over path length
     for i in range(129):
-        a, b = threefry_split(target, ctx, b)
+        a, b = threefry_split(target, dev, b)
     assert (a[0:4] == b[0:4]).all(), "State part of split should be the same"
     assert (b[0:4] != np.zeros(4, dtype="uint64")).any()
 
     # check that split then generate does not generate the same for both sides
-    a, a_rands = threefry_generate(target, ctx, a, (100,))
-    b, b_rands = threefry_generate(target, ctx, b, (100,))
+    a, a_rands = threefry_generate(target, dev, a, (100,))
+    b, b_rands = threefry_generate(target, dev, b, (100,))
     assert (
         a_rands != b_rands
     ).all(), "Numbers generated from different initial states should be different"
 
     # check repeatability
-    _, rands1 = threefry_generate(target, ctx, a, (100,))
-    _, rands2 = threefry_generate(target, ctx, a, (100,))
+    _, rands1 = threefry_generate(target, dev, a, (100,))
+    _, rands2 = threefry_generate(target, dev, a, (100,))
     assert (
         rands1 == rands2
     ).all(), "Numbers generated from the same initial state should be the same"
 
-    a1, b1 = threefry_split(target, ctx, a)
-    a2, b2 = threefry_split(target, ctx, a)
+    a1, b1 = threefry_split(target, dev, a)
+    a2, b2 = threefry_split(target, dev, a)
     assert (a1 == a2).all() and (
         b1 == b2
     ).all(), "Split called on the same input should return the same result"
 
 
 @tvm.testing.parametrize_targets
-def test_threefry_generate(target, ctx):
+def test_threefry_generate(target, dev):
     gen = tvm.relay.random.threefry_key(0).data.asnumpy()
 
     # check that we can generate some data
-    a, rands = threefry_generate(target, ctx, gen, (100,))
+    a, rands = threefry_generate(target, dev, gen, (2048,))
     assert (
-        rands.shape[0] == 100 and len(rands.shape) == 1
+        rands.shape[0] == 2048 and len(rands.shape) == 1
     ), "Output shape should match requested shape"
 
     # check that gen out does not equal input
@@ -99,26 +99,26 @@ def test_threefry_generate(target, ctx):
     gen = np.array(
         [0, 0, 0, 0, 0, 0, 0, 2 ** 64 - 2, 1 << 63, 0], dtype="uint64"
     )  # make counter large
-    a, rands = threefry_generate(target, ctx, gen, (100,))
+    a, rands = threefry_generate(target, dev, gen, (2048,))
     assert gen[4] != a[4], "Overflow of counter should trigger path change"
-    assert a[7] == 100, "Overflow of counter should still update counter"
+    assert a[7] == 2048, "Overflow of counter should still update counter"
 
     # check generate with path at length limit
     gen = np.array([0, 0, 0, 0, 0, 0, 0, 2 ** 64 - 2, 0, 0], dtype="uint64")  # make counter large
-    a, rands = threefry_generate(target, ctx, gen, (100,))
+    a, rands = threefry_generate(target, dev, gen, (2048,))
     assert (
         gen[0:4] != a[0:4]
     ).any(), "Overflowing counter with no space left in path should change state"
 
 
 @tvm.testing.parametrize_targets
-def test_threefry_wrapping(target, ctx):
+def test_threefry_wrapping(target, dev):
     assert tvm.topi.random.threefry_test_wrapping(
-        target, ctx
+        target, dev
     ), f"{target} does not support wrapping unsigned integer arithmetic"
 
 
 if __name__ == "__main__":
-    test_threefry_split(tvm.target.Target("llvm"), tvm.context("cpu"))
-    test_threefry_generate(tvm.target.Target("llvm"), tvm.context("cpu"))
-    test_threefry_wrapping(tvm.target.Target("llvm"), tvm.context("cpu"))
+    test_threefry_split(tvm.target.Target("llvm"), tvm.device("cpu"))
+    test_threefry_generate(tvm.target.Target("llvm"), tvm.device("cpu"))
+    test_threefry_wrapping(tvm.target.Target("llvm"), tvm.device("cpu"))
diff --git a/tests/python/topi/python/test_topi_qnn.py b/tests/python/topi/python/test_topi_qnn.py
new file mode 100644
index 000000000000..995cfd2df666
--- /dev/null
+++ 
b/tests/python/topi/python/test_topi_qnn.py @@ -0,0 +1,161 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Test code for QNN operators.""" +import numpy as np +import tvm +from tvm import topi, relay, te +from tvm.contrib import graph_executor +import tvm.topi.testing + + +def verify_simulated_quantize(data_shape, out_dtype, channels, axis): + # Create placeholder variables for all qnn inputs. + A = te.placeholder(data_shape, name="value", dtype="float32") + D = te.placeholder([], name="dtype", dtype="int32") + S = te.placeholder([te.size_var("scale_dim")], name="scale", dtype="float32") + Z = te.placeholder([te.size_var("zp_dim")], name="zp", dtype="int32") + SIM_Q = topi.nn.simulated_quantize(A, D, output_scale=S, output_zero_point=Z, axis=axis) + + # Create random numpy values to assign to inputs. + a_np = np.random.uniform(size=data_shape).astype("float32") + d_np = np.int32(topi.nn.SQNN_DTYPE_TO_CODE[out_dtype]) + s_np = np.random.uniform(low=1e-4, high=0.1, size=channels).astype("float32") + z_np = np.random.uniform(low=-10, high=10, size=channels).astype("int32") + q_np = np.zeros(shape=data_shape, dtype="float32") + + def check_target(target, dev): + # Wrap the numpy arrays in nd arrays. + a = tvm.nd.array(a_np, dev) + d = tvm.nd.array(d_np, dev) + s = tvm.nd.array(s_np, dev) + z = tvm.nd.array(z_np, dev) + q = tvm.nd.array(q_np, dev) + + # Construct equivalent relay graph. + per_channel = channels[0] != 1 + a_var = relay.var("a", shape=data_shape, dtype="float32") + if per_channel: + s_var = relay.const(s_np) + z_var = relay.const(z_np) + else: + s_var = relay.const(s_np[0]) + z_var = relay.const(z_np[0]) + real_q_op = relay.qnn.op.quantize(a_var, s_var, z_var, axis=axis, out_dtype=out_dtype) + with tvm.transform.PassContext(opt_level=3): + lib = relay.build(tvm.IRModule.from_expr(real_q_op), target=target) + + # Get real qnn quantize output. + m = graph_executor.GraphModule(lib["default"](dev)) + m.set_input("a", a_np) + + m.run() + real_q_out = m.get_output(0) + + # Compile the simulated quantize function. + with tvm.target.Target(target): + sched = tvm.topi.testing.get_injective_schedule(target)(SIM_Q) + func = tvm.build(sched, [A, D, S, Z, SIM_Q], target, name="sim_quantize") + func(a, d, s, z, q) + + # Check correctness against the true qnn output. + mismatch = q.asnumpy() != real_q_out.asnumpy().astype("float32") + # Allow some rounding errors due to GPU fp32 arithmetic. 
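+        # (the simulated op computes in fp32, so an element that lands exactly on
+        # a rounding boundary can round differently from the integer qnn kernel;
+        # counting mismatches is therefore more robust than an elementwise check)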
+        assert np.sum(mismatch) <= 3
+
+    for target, dev in tvm.testing.enabled_targets():
+        check_target(target, dev)
+
+
+def test_simulated_quantize():
+    verify_simulated_quantize([1], "int8", [1], -1)
+    verify_simulated_quantize([2, 5], "int8", [5], 1)
+    verify_simulated_quantize([1, 32, 32, 32], "int8", [32], -1)
+    verify_simulated_quantize([1, 32, 32, 32], "uint8", [32], -2)
+    verify_simulated_quantize([2, 5], "int32", [5], 1)
+
+
+def verify_simulated_dequantize(data_shape, in_dtype, channels, axis):
+    # Create placeholder variables for all qnn inputs.
+    A = te.placeholder(data_shape, name="value", dtype="float32")
+    D = te.placeholder([], name="dtype", dtype="int32")
+    S = te.placeholder([te.size_var("scale_dim")], name="scale", dtype="float32")
+    Z = te.placeholder([te.size_var("zp_dim")], name="zp", dtype="int32")
+    SIM_DQ = topi.nn.simulated_dequantize(A, D, input_scale=S, input_zero_point=Z, axis=axis)
+
+    # Create random numpy values to assign to inputs.
+    a_np = np.random.uniform(low=-128, high=127, size=data_shape).astype(in_dtype)
+    a_np_f = a_np.astype("float32")
+    d_np = np.int32(topi.nn.SQNN_DTYPE_TO_CODE[in_dtype])
+    s_np = np.random.uniform(low=1e-4, high=0.1, size=channels).astype("float32")
+    z_np = np.random.uniform(low=-10, high=10, size=channels).astype("int32")
+    dq_np = np.zeros(shape=data_shape, dtype="float32")
+
+    def check_target(target, dev):
+        # Wrap the numpy arrays in nd arrays.
+        a = tvm.nd.array(a_np_f, dev)
+        d = tvm.nd.array(d_np, dev)
+        s = tvm.nd.array(s_np, dev)
+        z = tvm.nd.array(z_np, dev)
+        dq = tvm.nd.array(dq_np, dev)
+
+        # Construct equivalent relay graph.
+        per_channel = channels[0] != 1
+        a_var = relay.var("a", shape=data_shape, dtype=in_dtype)
+        if per_channel:
+            s_var = relay.const(s_np)
+            z_var = relay.const(z_np)
+        else:
+            s_var = relay.const(s_np[0])
+            z_var = relay.const(z_np[0])
+        real_dq_op = relay.qnn.op.dequantize(a_var, s_var, z_var, axis=axis)
+        with tvm.transform.PassContext(opt_level=3):
+            lib = relay.build(tvm.IRModule.from_expr(real_dq_op), target=target)
+
+        # Get real qnn dequantize output.
+        m = graph_executor.GraphModule(lib["default"](dev))
+        m.set_input("a", a_np)
+
+        m.run()
+        real_dq_out = m.get_output(0)
+
+        # Compile the simulated dequantize function.
+        with tvm.target.Target(target):
+            sched = tvm.topi.testing.get_injective_schedule(target)(SIM_DQ)
+            func = tvm.build(sched, [A, D, S, Z, SIM_DQ], target, name="sim_dequantize")
+            func(a, d, s, z, dq)
+
+        # Check correctness against the true qnn output.
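+        # (dequantize is a pure floating-point computation, so unlike the
+        # quantize test above an elementwise tolerance comparison is sufficient)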
+ tvm.testing.assert_allclose( + dq.asnumpy(), real_dq_out.asnumpy().astype("float32"), rtol=1e-5 + ) + + for target, dev in tvm.testing.enabled_targets(): + check_target(target, dev) + + +def test_simulated_dequantize(): + verify_simulated_dequantize([1], "int8", [1], -1) + verify_simulated_dequantize([2, 5], "int8", [5], 1) + verify_simulated_dequantize([2, 5], "int8", [2], 0) + verify_simulated_dequantize([1, 32, 32, 32], "int8", [32], -1) + verify_simulated_dequantize([1, 32, 32, 32], "uint8", [32], -2) + verify_simulated_dequantize([2, 5], "int32", [5], 1) + + +if __name__ == "__main__": + test_simulated_quantize() + test_simulated_dequantize() diff --git a/tests/python/topi/python/test_topi_reduce.py b/tests/python/topi/python/test_topi_reduce.py index 9ddcb0d3884b..c6de8d7c7f4d 100644 --- a/tests/python/topi/python/test_topi_reduce.py +++ b/tests/python/topi/python/test_topi_reduce.py @@ -69,7 +69,7 @@ def verify_reduce_map_ele(in_shape, axis, keepdims, type="sum", dtype="float32") else: raise NotImplementedError - def check_device(device, ctx): + def check_device(device, dev): print("Running on target: %s" % device) with tvm.target.Target(device): s = tvm.topi.testing.get_reduce_schedule(device)(B) @@ -98,8 +98,8 @@ def check_device(device, ctx): out_npy = _my_npy_argmin(in_npy_map, axis=axis, keepdims=keepdims) else: raise NotImplementedError - data_tvm = tvm.nd.array(in_npy, ctx=ctx) - out_tvm = tvm.nd.empty(shape=out_npy.shape, ctx=ctx, dtype=out_dtype) + data_tvm = tvm.nd.array(in_npy, device=dev) + out_tvm = tvm.nd.empty(shape=out_npy.shape, device=dev, dtype=out_dtype) for _ in range(1): foo(data_tvm, out_tvm) if type == "argmax" or type == "argmin": @@ -119,8 +119,8 @@ def check_device(device, ctx): else: tvm.testing.assert_allclose(out_tvm.asnumpy(), out_npy, 1e-3, 1e-3) - for device, ctx in tvm.testing.enabled_targets(): - check_device(device, ctx) + for device, dev in tvm.testing.enabled_targets(): + check_device(device, dev) @tvm.testing.uses_gpu @@ -163,7 +163,7 @@ def test_complex_reduce(): C = topi.add(B, B) D = topi.multiply(B, B) E = topi.add(C, D) - for device, ctx in tvm.testing.enabled_targets(): + for device, dev in tvm.testing.enabled_targets(): print("Running on target: %s" % device) with tvm.target.Target(device): s = tvm.topi.testing.get_reduce_schedule(device)(E) @@ -171,8 +171,8 @@ def test_complex_reduce(): in_npy = np.random.uniform(-1, 1, size=in_shape).astype(dtype) sum_npy = in_npy.sum(axis=axis, keepdims=keepdims) out_npy = sum_npy * 2 + sum_npy * sum_npy - data_tvm = tvm.nd.array(in_npy, ctx=ctx) - out_tvm = tvm.nd.empty(shape=out_npy.shape, ctx=ctx, dtype=dtype) + data_tvm = tvm.nd.array(in_npy, device=dev) + out_tvm = tvm.nd.empty(shape=out_npy.shape, device=dev, dtype=dtype) foo(data_tvm, out_tvm) tvm.testing.assert_allclose(out_tvm.asnumpy(), out_npy, 1e-3, 1e-3) diff --git a/tests/python/topi/python/test_topi_relu.py b/tests/python/topi/python/test_topi_relu.py index 7c45acae0570..9acf98d8259f 100644 --- a/tests/python/topi/python/test_topi_relu.py +++ b/tests/python/topi/python/test_topi_relu.py @@ -34,22 +34,22 @@ def verify_relu(m, n, dtype="float32"): a_np = np.random.uniform(low=-1.0, high=1.0, size=get_const_tuple(A.shape)).astype(A.dtype) b_np = a_np * (a_np > 0) - def check_device(device, ctx): - if dtype == "float16" and device == "cuda" and not have_fp16(tvm.gpu(0).compute_version): - print("Skip because %s does not have fp16 support" % device) + def check_target(target, dev): + if dtype == "float16" and target == "cuda" and not 
have_fp16(tvm.gpu(0).compute_version):
+            print("Skip because %s does not have fp16 support" % target)
             return
-        print("Running on target: %s" % device)
-        with tvm.target.Target(device):
-            s = tvm.topi.testing.get_elemwise_schedule(device)(B)
+        print("Running on target: %s" % target)
+        with tvm.target.Target(target):
+            s = tvm.topi.testing.get_elemwise_schedule(target)(B)
 
-        a = tvm.nd.array(a_np, ctx)
-        b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx)
-        foo = tvm.build(s, [A, B], device, name="relu")
+        a = tvm.nd.array(a_np, dev)
+        b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), dev)
+        foo = tvm.build(s, [A, B], target, name="relu")
         foo(a, b)
         tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
 
-    for device, ctx in tvm.testing.enabled_targets():
-        check_device(device, ctx)
+    for target, dev in tvm.testing.enabled_targets():
+        check_target(target, dev)
 
 
 def verify_leaky_relu(m, alpha):
@@ -59,9 +59,9 @@ def verify_leaky_relu(m, alpha):
     a_np = np.random.uniform(size=get_const_tuple(A.shape)).astype(A.dtype)
     b_np = a_np * (a_np > 0) + a_np * (a_np < 0) * alpha
 
-    ctx = tvm.cpu(0)
-    a = tvm.nd.array(a_np, ctx)
-    b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx)
+    dev = tvm.cpu(0)
+    a = tvm.nd.array(a_np, dev)
+    b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), dev)
     foo = tvm.build(s, [A, B], "llvm", name="leaky_relu")
     foo(a, b)
     tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
@@ -79,11 +79,11 @@ def _prelu_numpy(x, W):
     B = topi.nn.prelu(X, W, axis)
     s = te.create_schedule([B.op])
 
-    ctx = tvm.cpu(0)
-    x_tvm = tvm.nd.array(x_np, ctx)
-    w_tvm = tvm.nd.array(w_np, ctx)
+    dev = tvm.cpu(0)
+    x_tvm = tvm.nd.array(x_np, dev)
+    w_tvm = tvm.nd.array(w_np, dev)
 
-    b = tvm.nd.array(np.zeros(get_const_tuple(X.shape), dtype=B.dtype), ctx)
+    b = tvm.nd.array(np.zeros(get_const_tuple(X.shape), dtype=B.dtype), dev)
     foo = tvm.build(s, [X, W, B], "llvm", name="prelu")
     foo(x_tvm, w_tvm, b)
     out_np = _prelu_numpy(x_np, w_np)
diff --git a/tests/python/topi/python/test_topi_reorg.py b/tests/python/topi/python/test_topi_reorg.py
index 93464d9bef03..e26a05287e05 100644
--- a/tests/python/topi/python/test_topi_reorg.py
+++ b/tests/python/topi/python/test_topi_reorg.py
@@ -48,7 +48,7 @@ def get_ref_data_reorg():
 
     def check_device(device):
         """Check whether the device is enabled"""
-        ctx = tvm.context(device, 0)
+        dev = tvm.device(device, 0)
         if not tvm.testing.device_enabled(device):
             print("Skip because %s is not enabled" % device)
             return
@@ -56,8 +56,8 @@ def check_device(device):
         with tvm.target.Target(device):
             s_func = tvm.topi.testing.dispatch(device, _reorg_schedule)
             s = s_func([B])
-        a = tvm.nd.array(a_np, ctx)
-        b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx)
+        a = tvm.nd.array(a_np, dev)
+        b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), dev)
         func = tvm.build(s, [A, B], device)
         func(a, b)
         tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
diff --git a/tests/python/topi/python/test_topi_scan.py b/tests/python/topi/python/test_topi_scan.py
new file mode 100644
index 000000000000..cd77a1ccfbce
--- /dev/null
+++ b/tests/python/topi/python/test_topi_scan.py
@@ -0,0 +1,144 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +from typing import Callable + +import numpy as np +import tvm +import tvm.testing +import tvm.topi.testing +from tvm import topi + +topi_funcs = { + "cumsum": {"generic": topi.cumsum, "cuda": topi.cuda.cumsum}, + "cumprod": {"generic": topi.cumprod, "cuda": topi.cuda.cumprod}, +} + +identity_value = {"cumsum": 0, "cumprod": 1} + + +def get_implementations(name, axis, dtype, exclusive): + topi_func_generic = topi_funcs[name]["generic"] + topi_func_cuda = topi_funcs[name]["cuda"] + + return { + "generic": ( + lambda x: topi_func_generic(x, axis, dtype, exclusive=exclusive), + topi.generic.schedule_extern, + ), + "cuda": ( + lambda x: topi_func_cuda(x, axis, dtype, exclusive=exclusive), + topi.cuda.schedule_scan, + ), + "nvptx": ( + lambda x: topi_func_cuda(x, axis, dtype, exclusive=exclusive), + topi.cuda.schedule_scan, + ), + "vulkan": ( + lambda x: topi_func_cuda(x, axis, dtype, exclusive=exclusive), + topi.cuda.schedule_scan, + ), + "metal": ( + lambda x: topi_func_cuda(x, axis, dtype, exclusive=exclusive), + topi.cuda.schedule_scan, + ), + } + + +def _run_tests( + dev, + target, + op_name: str = "cumsum", + gt_func: Callable[..., np.array] = np.cumsum, +): + def check_scan(np_ref, data, axis=None, dtype=None, exclusive=False): + implementations = get_implementations(op_name, axis, dtype, exclusive) + fcompute, fschedule = tvm.topi.testing.dispatch(target, implementations) + tvm.topi.testing.compare_numpy_tvm([data], np_ref, target, dev, fcompute, fschedule) + + data = np.array([2, 3, 0]) + check_scan(gt_func(data), data) + + data = np.random.rand(10) > 0.5 + data = data.astype(np.int32) + check_scan(gt_func(data, dtype=np.int32), data) + check_scan(gt_func(data), data, dtype="int64") + + data = np.random.rand(10) > 0.5 + check_scan(gt_func(data, dtype=np.int32), data, dtype="int32") + + for in_dtype in ["float32", "float64"]: + if target == "metal" and in_dtype == "float64": + # float64 is not supported in metal + continue + data = np.random.randn(10, 10).astype(in_dtype) + check_scan(gt_func(data), data) + check_scan(gt_func(data, axis=0), data, axis=0) + check_scan(gt_func(data, axis=1), data, axis=1) + + data = np.random.randn(10, 5, 10).astype(in_dtype) + check_scan(gt_func(data), data) + check_scan(gt_func(data, axis=0), data, axis=0) + check_scan(gt_func(data, axis=1), data, axis=1) + check_scan(gt_func(data, axis=-1), data, axis=-1) + + for in_dtype in ["int32", "int64"]: + data = np.random.randint(-100, 100, size=(100, 100)).astype(in_dtype) + check_scan(gt_func(data, dtype=in_dtype), data) + check_scan(gt_func(data), data, dtype="int64") + check_scan(gt_func(data, axis=0, dtype=in_dtype), data, axis=0) + check_scan(gt_func(data, axis=1, dtype=in_dtype), data, axis=1) + + data = np.random.randint(1 << 30, (1 << 31) - 1, size=(100)).astype(in_dtype) + check_scan(gt_func(data), data, dtype="int64") + + data = np.random.randint(-100, 100, size=(100, 100)).astype("int64") + + expected_result = 
np.roll(gt_func(data), 1)
+    expected_result[0] = identity_value[op_name]
+    check_scan(expected_result, data, dtype="int64", exclusive=True)
+
+    expected_result = np.roll(gt_func(data, axis=0, dtype=in_dtype), 1, axis=0)
+    expected_result[0, :] = identity_value[op_name]
+    check_scan(expected_result, data, axis=0, exclusive=True)
+
+    expected_result = np.roll(gt_func(data, axis=1, dtype=in_dtype), 1, axis=1)
+    expected_result[:, 0] = identity_value[op_name]
+    check_scan(expected_result, data, axis=1, exclusive=True)
+
+
+@tvm.testing.parametrize_targets
+def test_cumsum(dev, target):
+    _run_tests(dev, target, op_name="cumsum", gt_func=np.cumsum)
+
+
+@tvm.testing.parametrize_targets
+def test_cumprod(dev, target):
+    _run_tests(dev, target, op_name="cumprod", gt_func=np.cumprod)
+
+
+if __name__ == "__main__":
+    test_cumsum(tvm.device("cpu"), tvm.target.Target("llvm"))
+    test_cumsum(tvm.device("cuda"), tvm.target.Target("cuda"))
+    test_cumsum(tvm.device("nvptx"), tvm.target.Target("nvptx"))
+    test_cumsum(tvm.device("vulkan"), tvm.target.Target("vulkan"))
+    test_cumsum(tvm.device("metal"), tvm.target.Target("metal"))
+
+    test_cumprod(tvm.device("cpu"), tvm.target.Target("llvm"))
+    test_cumprod(tvm.device("cuda"), tvm.target.Target("cuda"))
+    test_cumprod(tvm.device("nvptx"), tvm.target.Target("nvptx"))
+    test_cumprod(tvm.device("vulkan"), tvm.target.Target("vulkan"))
+    test_cumprod(tvm.device("metal"), tvm.target.Target("metal"))
diff --git a/tests/python/topi/python/test_topi_scatter.py b/tests/python/topi/python/test_topi_scatter.py
index 2e701e2903d9..ad73bb51f2d3 100644
--- a/tests/python/topi/python/test_topi_scatter.py
+++ b/tests/python/topi/python/test_topi_scatter.py
@@ -22,7 +22,7 @@
 
 
 @tvm.testing.parametrize_targets
-def test_scatter_nd(ctx, target):
+def test_scatter_nd(dev, target):
     def check_scatter_nd(data, indices, shape, out):
         implementations = {
             "generic": (lambda x, y: topi.scatter_nd(x, y, shape), topi.generic.schedule_extern),
@@ -30,7 +30,7 @@ def check_scatter_nd(data, indices, shape, out):
             "cpu": (lambda x, y: topi.x86.scatter_nd(x, y, shape), topi.generic.schedule_extern),
         }
         fcompute, fschedule = tvm.topi.testing.dispatch(target, implementations)
-        tvm.topi.testing.compare_numpy_tvm([data, indices], out, target, ctx, fcompute, fschedule)
+        tvm.topi.testing.compare_numpy_tvm([data, indices], out, target, dev, fcompute, fschedule)
 
     data = np.array([2, 3, 0])
     indices = np.array([[1, 1, 0], [0, 1, 0]])
@@ -64,4 +64,4 @@ def check_scatter_nd(data, indices, shape, out):
 
 
 if __name__ == "__main__":
-    test_scatter_nd(tvm.context("cpu"), tvm.target.Target("llvm"))
+    test_scatter_nd(tvm.device("cpu"), tvm.target.Target("llvm"))
diff --git a/tests/python/topi/python/test_topi_softmax.py b/tests/python/topi/python/test_topi_softmax.py
index 66c44f937c5e..84fa0d24e434 100644
--- a/tests/python/topi/python/test_topi_softmax.py
+++ b/tests/python/topi/python/test_topi_softmax.py
@@ -34,15 +34,15 @@
 }
 
 
-def check_device(A, B, a_np, b_np, device, ctx, name):
-    print("Running on target: %s" % device)
-    with tvm.target.Target(device):
-        s_func = tvm.topi.testing.dispatch(device, _softmax_schedule)
+def check_target(A, B, a_np, b_np, target, dev, name):
+    print("Running on target: %s" % target)
+    with tvm.target.Target(target):
+        s_func = tvm.topi.testing.dispatch(target, _softmax_schedule)
         s = s_func(B)
 
-    a = tvm.nd.array(a_np, ctx)
-    b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx)
-    f = tvm.build(s, [A, B], device, name=name)
+    a = tvm.nd.array(a_np, dev)
+    b
= tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), dev) + f = tvm.build(s, [A, B], target, name=name) f(a, b) tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5) @@ -57,8 +57,8 @@ def verify_softmax(m, n, dtype="float32"): a_np = np.random.uniform(size=get_const_tuple(A.shape)).astype(A.dtype) b_np = tvm.topi.testing.softmax_python(a_np) - for device, ctx in tvm.testing.enabled_targets(): - check_device(A, B, a_np, b_np, device, ctx, "softmax") + for target, dev in tvm.testing.enabled_targets(): + check_target(A, B, a_np, b_np, target, dev, "softmax") def verify_softmax_4d(shape, dtype="float32"): @@ -70,8 +70,8 @@ def verify_softmax_4d(shape, dtype="float32"): b_np = tvm.topi.testing.softmax_python(a_np.transpose(0, 2, 3, 1).reshape(h * w, c)) b_np = b_np.reshape(1, h, w, c).transpose(0, 3, 1, 2) - for device, ctx in tvm.testing.enabled_targets(): - check_device(A, B, a_np, b_np, device, ctx, "softmax") + for target, dev in tvm.testing.enabled_targets(): + check_target(A, B, a_np, b_np, target, dev, "softmax") @tvm.testing.uses_gpu @@ -91,8 +91,8 @@ def verify_log_softmax(m, n, dtype="float32"): a_np = np.random.uniform(size=get_const_tuple(A.shape)).astype(A.dtype) b_np = tvm.topi.testing.log_softmax_python(a_np) - for device, ctx in tvm.testing.enabled_targets(): - check_device(A, B, a_np, b_np, device, ctx, "log_softmax") + for target, dev in tvm.testing.enabled_targets(): + check_target(A, B, a_np, b_np, target, dev, "log_softmax") @tvm.testing.uses_gpu diff --git a/tests/python/topi/python/test_topi_sort.py b/tests/python/topi/python/test_topi_sort.py index 85a35488ab22..c52dc8d3929a 100644 --- a/tests/python/topi/python/test_topi_sort.py +++ b/tests/python/topi/python/test_topi_sort.py @@ -58,25 +58,25 @@ def verify_sort(axis, is_ascend): else: np_sort = np_sort[:, : dshape[axis]] - def check_device(device): - if not tvm.testing.device_enabled(device): - print("Skip because %s is not enabled" % device) + def check_target(target): + if not tvm.testing.device_enabled(target): + print("Skip because %s is not enabled" % target) return - ctx = tvm.context(device, 0) - print("Running on target: %s" % device) - with tvm.target.Target(device): - fcompute, fschedule = tvm.topi.testing.dispatch(device, _sort_implement) + dev = tvm.device(target, 0) + print("Running on target: %s" % target) + with tvm.target.Target(target): + fcompute, fschedule = tvm.topi.testing.dispatch(target, _sort_implement) out = fcompute(data, axis=axis, is_ascend=is_ascend) s = fschedule(out) - tvm_data = tvm.nd.array(np_data, ctx) - tvm_out = tvm.nd.array(np.zeros(dshape, dtype=data_dtype), ctx) - f = tvm.build(s, [data, out], device) + tvm_data = tvm.nd.array(np_data, dev) + tvm_out = tvm.nd.array(np.zeros(dshape, dtype=data_dtype), dev) + f = tvm.build(s, [data, out], target) f(tvm_data, tvm_out) tvm.testing.assert_allclose(tvm_out.asnumpy(), np_sort, rtol=1e0) - for device in ["llvm", "cuda", "opencl", "vulkan", "nvptx"]: - check_device(device) + for target in ["llvm", "cuda", "opencl", "vulkan", "nvptx"]: + check_target(target) def verify_argsort(axis, is_ascend): @@ -98,25 +98,25 @@ def verify_argsort(axis, is_ascend): else: np_indices = np_indices[:, : dshape[axis]] - def check_device(device): - if not tvm.testing.device_enabled(device): - print("Skip because %s is not enabled" % device) + def check_target(target): + if not tvm.testing.device_enabled(target): + print("Skip because %s is not enabled" % target) return - ctx = tvm.context(device, 0) - print("Running on target: %s" % device) - 
with tvm.target.Target(device): - fcompute, fschedule = tvm.topi.testing.dispatch(device, _argsort_implement) + dev = tvm.device(target, 0) + print("Running on target: %s" % target) + with tvm.target.Target(target): + fcompute, fschedule = tvm.topi.testing.dispatch(target, _argsort_implement) out = fcompute(data, axis=axis, is_ascend=is_ascend) s = fschedule(out) - tvm_data = tvm.nd.array(np_data, ctx) - tvm_out = tvm.nd.array(np.zeros(dshape, dtype=data_dtype), ctx) - f = tvm.build(s, [data, out], device) + tvm_data = tvm.nd.array(np_data, dev) + tvm_out = tvm.nd.array(np.zeros(dshape, dtype=data_dtype), dev) + f = tvm.build(s, [data, out], target) f(tvm_data, tvm_out) tvm.testing.assert_allclose(tvm_out.asnumpy(), np_indices.astype(data_dtype), rtol=1e0) - for device in ["llvm", "cuda", "opencl", "vulkan", "nvptx"]: - check_device(device) + for target in ["llvm", "cuda", "opencl", "vulkan", "nvptx"]: + check_target(target) def verify_topk(k, axis, ret_type, is_ascend, dtype): @@ -142,22 +142,22 @@ def verify_topk(k, axis, ret_type, is_ascend, dtype): np_values[i, :] = np_data[i, np_indices[i, :]] np_indices = np_indices.astype(dtype) - def check_device(device): - ctx = tvm.context(device, 0) - if not tvm.testing.device_enabled(device): - print("Skip because %s is not enabled" % device) + def check_target(target): + dev = tvm.device(target, 0) + if not tvm.testing.device_enabled(target): + print("Skip because %s is not enabled" % target) return - print("Running on target: %s" % device) - with tvm.target.Target(device): - fcompute, fschedule = tvm.topi.testing.dispatch(device, _topk_implement) + print("Running on target: %s" % target) + with tvm.target.Target(target): + fcompute, fschedule = tvm.topi.testing.dispatch(target, _topk_implement) outs = fcompute(data, k, axis, ret_type, is_ascend, dtype) outs = outs if isinstance(outs, list) else [outs] s = fschedule(outs) - tvm_data = tvm.nd.array(np_data, ctx) + tvm_data = tvm.nd.array(np_data, dev) tvm_res = [] for t in outs: - tvm_res.append(tvm.nd.empty(t.shape, dtype=t.dtype, ctx=ctx)) - f = tvm.build(s, [data] + outs, device) + tvm_res.append(tvm.nd.empty(t.shape, dtype=t.dtype, device=dev)) + f = tvm.build(s, [data] + outs, target) f(tvm_data, *tvm_res) if ret_type == "both": tvm.testing.assert_allclose(tvm_res[0].asnumpy(), np_values) @@ -167,8 +167,8 @@ def check_device(device): else: tvm.testing.assert_allclose(tvm_res[0].asnumpy(), np_indices) - for device in ["llvm", "cuda", "opencl", "vulkan", "nvptx"]: - check_device(device) + for target in ["llvm", "cuda", "opencl", "vulkan", "nvptx"]: + check_target(target) @tvm.testing.uses_gpu diff --git a/tests/python/topi/python/test_topi_space_to_batch_nd.py b/tests/python/topi/python/test_topi_space_to_batch_nd.py index 6f969f391002..21654dd9f084 100644 --- a/tests/python/topi/python/test_topi_space_to_batch_nd.py +++ b/tests/python/topi/python/test_topi_space_to_batch_nd.py @@ -42,18 +42,18 @@ def verify_space_to_batch_nd(input_shape, block_shape, pad_before, pad_after, pa a_np, block_shape, pad_before, pad_after, pad_value ) - def check_device(device, ctx): - print("Running on target: %s" % device) - with tvm.target.create(device): - s = tvm.topi.testing.get_injective_schedule(device)(B) - a = tvm.nd.array(a_np, ctx) - b = tvm.nd.array(np.zeros(out_shape, dtype=dtype), ctx) - f = tvm.build(s, [A, B], device) + def check_target(target, dev): + print("Running on target: %s" % target) + with tvm.target.create(target): + s = tvm.topi.testing.get_injective_schedule(target)(B) + a = 
tvm.nd.array(a_np, dev) + b = tvm.nd.array(np.zeros(out_shape, dtype=dtype), dev) + f = tvm.build(s, [A, B], target) f(a, b) tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-3, atol=1e-3) - for device, ctx in tvm.testing.enabled_targets(): - check_device(device, ctx) + for target, dev in tvm.testing.enabled_targets(): + check_target(target, dev) @tvm.testing.uses_gpu diff --git a/tests/python/topi/python/test_topi_space_to_depth.py b/tests/python/topi/python/test_topi_space_to_depth.py index 397018688eb4..7fd49dc363cb 100644 --- a/tests/python/topi/python/test_topi_space_to_depth.py +++ b/tests/python/topi/python/test_topi_space_to_depth.py @@ -49,18 +49,18 @@ def verify_space_to_depth(block_size, batch, in_channel, in_height, in_width, la a_np = np.transpose(a_np, axes=[0, 2, 3, 1]) b_np = np.transpose(b_np, axes=[0, 2, 3, 1]) - def check_device(device, ctx): + def check_device(device, dev): print("Running on target: %s" % device) with tvm.target.Target(device): s = tvm.topi.testing.get_injective_schedule(device)(B) - a = tvm.nd.array(a_np, ctx) - b = tvm.nd.array(np.zeros(out_shape, dtype=dtype), ctx) + a = tvm.nd.array(a_np, dev) + b = tvm.nd.array(np.zeros(out_shape, dtype=dtype), dev) f = tvm.build(s, [A, B], device) f(a, b) tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-3, atol=1e-3) - for device, ctx in tvm.testing.enabled_targets(): - check_device(device, ctx) + for device, dev in tvm.testing.enabled_targets(): + check_device(device, dev) @tvm.testing.uses_gpu diff --git a/tests/python/topi/python/test_topi_sparse.py b/tests/python/topi/python/test_topi_sparse.py index d5bd7aa1a21e..500384b23f2a 100644 --- a/tests/python/topi/python/test_topi_sparse.py +++ b/tests/python/topi/python/test_topi_sparse.py @@ -59,17 +59,17 @@ def get_ref_data(): a_np, b_np, c_np, d_np = get_ref_data() def check_device(device): - ctx = tvm.context(device, 0) + dev = tvm.device(device, 0) if not tvm.testing.device_enabled(device): print("Skip because %s is not enabled" % device) return print("Running on target: %s" % device) - a = tvmsp.array(a_np, ctx) + a = tvmsp.array(a_np, dev) _nr, _nc, _n = a.shape[0], a.shape[1], a.data.shape[0] assert a.shape[0] == a.indptr.shape[0] - 1 - b = tvm.nd.array(b_np, ctx) - c = tvm.nd.array(c_np, ctx) - d = tvm.nd.array(np.zeros((_nr, 1), dtype=dtype), ctx) + b = tvm.nd.array(b_np, dev) + c = tvm.nd.array(c_np, dev) + d = tvm.nd.array(np.zeros((_nr, 1), dtype=dtype), dev) assert a.data.dtype == A.data.dtype assert a.indices.dtype == A.indices.dtype assert a.indptr.dtype == A.indptr.dtype @@ -105,17 +105,17 @@ def get_ref_data(): a_np, b_np, c_np, d_np = get_ref_data() def check_device(device): - ctx = tvm.context(device, 0) + dev = tvm.device(device, 0) if not tvm.testing.device_enabled(device): print("Skip because %s is not enabled" % device) return print("Running on target: %s" % device) - a = tvmsp.array(a_np, ctx) + a = tvmsp.array(a_np, dev) _nr, _nc, _n = a.shape[0], a.shape[1], a.data.shape[0] assert a.shape[0] == a.indptr.shape[0] - 1 - b = tvm.nd.array(b_np, ctx) - c = tvm.nd.array(c_np, ctx) - d = tvm.nd.array(np.zeros((_nr, out_dim), dtype=dtype), ctx) + b = tvm.nd.array(b_np, dev) + c = tvm.nd.array(c_np, dev) + d = tvm.nd.array(np.zeros((_nr, out_dim), dtype=dtype), dev) f = tvm.build(s, [nr, A.data, A.indices, A.indptr, B, C, D], device, name="csrmm") f(_nr, a.data, a.indices, a.indptr, b, c, d) @@ -152,15 +152,15 @@ def get_ref_data(): a_np, b_np, c_np, d_np = get_ref_data() def check_device(device): - ctx = tvm.context(device, 0) + dev = 
tvm.device(device, 0) if not tvm.testing.device_enabled(device): print("Skip because %s is not enabled" % device) return print("Running on target: %s" % device) - a = tvmsp.array(a_np, ctx) - b = tvm.nd.array(b_np, ctx) - c = tvm.nd.array(c_np, ctx) - d = tvm.nd.array(np.zeros(get_const_tuple(D.shape), dtype=dtype), ctx) + a = tvmsp.array(a_np, dev) + b = tvm.nd.array(b_np, dev) + c = tvm.nd.array(c_np, dev) + d = tvm.nd.array(np.zeros(get_const_tuple(D.shape), dtype=dtype), dev) f = tvm.build(s, [A.data, A.indices, A.indptr, B, C, D], device, name="dense") f(a.data, a.indices, a.indptr, b, c, d) tvm.testing.assert_allclose(d.asnumpy(), d_np, rtol=1e-4, atol=1e-4) @@ -195,15 +195,15 @@ def get_ref_data(): a_np, b_np, c_np, d_np = get_ref_data() def check_device(device): - ctx = tvm.context(device, 0) + dev = tvm.device(device, 0) if not tvm.testing.device_enabled(device): print("Skip because %s is not enabled" % device) return print("Running on target: %s" % device) - a = tvm.nd.array(a_np, ctx) - b = tvmsp.array(b_np, ctx) - c = tvm.nd.array(c_np, ctx) - d = tvm.nd.array(np.zeros(get_const_tuple(D.shape), dtype=dtype), ctx) + a = tvm.nd.array(a_np, dev) + b = tvmsp.array(b_np, dev) + c = tvm.nd.array(c_np, dev) + d = tvm.nd.array(np.zeros(get_const_tuple(D.shape), dtype=dtype), dev) f = tvm.build(s, [A, B.data, B.indices, B.indptr, C, D], device, name="dense") f(a, b.data, b.indices, b.indptr, c, d) tvm.testing.assert_allclose(d.asnumpy(), d_np, rtol=1e-4, atol=1e-4) @@ -355,7 +355,7 @@ def random_bsr_matrix(M, N, BS_R, BS_C, density, dtype): return s -def verify_sparse_dense_bsr(M, N, K, BS_R, BS_C, density, use_relu, ctx, target): +def verify_sparse_dense_bsr(M, N, K, BS_R, BS_C, density, use_relu, device, target): X_np = np.random.randn(M, K).astype("float32") W_sp_np = random_bsr_matrix(N, K, BS_R, BS_C, density=density, dtype="float32") W_np = W_sp_np.todense() @@ -375,22 +375,22 @@ def verify_sparse_dense_bsr(M, N, K, BS_R, BS_C, density, use_relu, ctx, target) Y = topi.nn.relu(Y) s = fschedule([Y]) func = tvm.build(s, [X, W_data, W_indices, W_indptr, Y]) - Y_tvm = tvm.nd.array(np.zeros(Y_np.shape, dtype=Y_np.dtype), ctx=ctx) + Y_tvm = tvm.nd.array(np.zeros(Y_np.shape, dtype=Y_np.dtype), device=device) func( - tvm.nd.array(X_np, ctx=ctx), - tvm.nd.array(W_sp_np.data, ctx=ctx), - tvm.nd.array(W_sp_np.indices, ctx=ctx), - tvm.nd.array(W_sp_np.indptr, ctx=ctx), + tvm.nd.array(X_np, device=device), + tvm.nd.array(W_sp_np.data, device=device), + tvm.nd.array(W_sp_np.indices, device=device), + tvm.nd.array(W_sp_np.indptr, device=device), Y_tvm, ) tvm.testing.assert_allclose(Y_tvm.asnumpy(), Y_np, atol=1e-4, rtol=1e-4) @tvm.testing.parametrize_targets("llvm", "cuda") -def test_sparse_dense_bsr_relu(ctx, target): +def test_sparse_dense_bsr_relu(dev, target): M, N, K, BS_R, BS_C, density = 1, 64, 128, 8, 16, 0.9 - verify_sparse_dense_bsr(M, N, K, BS_R, BS_C, density, True, ctx, target) - verify_sparse_dense_bsr(M, N, K, BS_R, BS_C, density, False, ctx, target) + verify_sparse_dense_bsr(M, N, K, BS_R, BS_C, density, True, dev, target) + verify_sparse_dense_bsr(M, N, K, BS_R, BS_C, density, False, dev, target) def test_sparse_dense_bsr_reverse(): @@ -439,7 +439,7 @@ def test_sparse_dense_bsr_randomized(): X = te.placeholder(shape=X_np.shape, dtype=str(X_np.dtype)) def check_device(device): - ctx = tvm.context(device, 0) + dev = tvm.device(device, 0) if not tvm.testing.device_enabled(device): print("Skip because %s is not enabled" % device) return @@ -449,12 +449,12 @@ def check_device(device): 
Y = fcompute(X, W_data, W_indices, W_indptr) s = fschedule([Y]) func = tvm.build(s, [X, W_data, W_indices, W_indptr, Y]) - Y_tvm = tvm.nd.array(np.zeros(Y_np.shape, dtype=Y_np.dtype), ctx=ctx) + Y_tvm = tvm.nd.array(np.zeros(Y_np.shape, dtype=Y_np.dtype), device=dev) func( - tvm.nd.array(X_np, ctx=ctx), - tvm.nd.array(W_sp_np.data, ctx=ctx), - tvm.nd.array(W_sp_np.indices, ctx=ctx), - tvm.nd.array(W_sp_np.indptr, ctx=ctx), + tvm.nd.array(X_np, device=dev), + tvm.nd.array(W_sp_np.data, device=dev), + tvm.nd.array(W_sp_np.indices, device=dev), + tvm.nd.array(W_sp_np.indptr, device=dev), Y_tvm, ) tvm.testing.assert_allclose(Y_tvm.asnumpy(), Y_np, atol=1e-5, rtol=1e-5) @@ -484,16 +484,16 @@ def test_sparse_dense_padded_cuda(): ) X = te.placeholder(shape=X_np.shape, dtype=str(X_np.dtype)) with tvm.target.Target("cuda"): - ctx = tvm.context("gpu") + dev = tvm.device("gpu") Y = topi.cuda.sparse_dense_padded(X, W_data, W_indices, W_indptr) s = topi.cuda.schedule_sparse_dense_padded([Y]) func = tvm.build(s, [X, W_data, W_indices, W_indptr, Y]) - Y_tvm = tvm.nd.array(np.zeros(Y_np.shape, dtype=Y_np.dtype), ctx=ctx) + Y_tvm = tvm.nd.array(np.zeros(Y_np.shape, dtype=Y_np.dtype), device=dev) func( - tvm.nd.array(X_np, ctx=ctx), - tvm.nd.array(W_sp_np_padded.data, ctx=ctx), - tvm.nd.array(W_sp_np_padded.indices, ctx=ctx), - tvm.nd.array(W_sp_np_padded.indptr, ctx=ctx), + tvm.nd.array(X_np, device=dev), + tvm.nd.array(W_sp_np_padded.data, device=dev), + tvm.nd.array(W_sp_np_padded.indices, device=dev), + tvm.nd.array(W_sp_np_padded.indptr, device=dev), Y_tvm, ) tvm.testing.assert_allclose(Y_tvm.asnumpy(), Y_np, atol=1e-5, rtol=1e-5) @@ -526,6 +526,33 @@ def test_sparse_dense_padded_alter_op(): x = relay.build(tvm.IRModule.from_expr(f), target=tvm.target.Target("cuda")) +def test_sparse_add_csr(): + for indices_dtype in ["int32", "int64"]: + for data_dtype in ["float32", "float64"]: + M, K, density = 3, 49, 0.2 + X_np = np.random.randn(M, K).astype(data_dtype) + Y_sp_np = sp.random(M, K, density=density, format="csr", dtype=data_dtype) + Y_np = Y_sp_np.todense() + Z_np = X_np + Y_np + + Y_data = te.placeholder(shape=Y_sp_np.data.shape, dtype=data_dtype) + Y_indices = te.placeholder(shape=Y_sp_np.indices.shape, dtype=indices_dtype) + Y_indptr = te.placeholder(shape=Y_sp_np.indptr.shape, dtype=indices_dtype) + X = te.placeholder(shape=X_np.shape, dtype=data_dtype) + Z = topi.nn.sparse_add(X, Y_data, Y_indices, Y_indptr) + s = te.create_schedule(Z.op) + func = tvm.build(s, [X, Y_data, Y_indices, Y_indptr, Z]) + Z_tvm = tvm.nd.array(np.zeros(Z_np.shape, dtype=Z_np.dtype)) + func( + tvm.nd.array(X_np.astype(data_dtype)), + tvm.nd.array(Y_sp_np.data.astype(data_dtype)), + tvm.nd.array(Y_sp_np.indices.astype(indices_dtype)), + tvm.nd.array(Y_sp_np.indptr.astype(indices_dtype)), + Z_tvm, + ) + tvm.testing.assert_allclose(Z_tvm.asnumpy(), Z_np, atol=1e-4, rtol=1e-4) + + if __name__ == "__main__": test_csrmv() test_csrmm() @@ -537,3 +564,4 @@ def test_sparse_dense_padded_alter_op(): test_sparse_dense_padded_alter_op() test_sparse_dense_csr_reverse() test_sparse_dense_bsr_reverse() + test_sparse_add_csr() diff --git a/tests/python/topi/python/test_topi_tensor.py b/tests/python/topi/python/test_topi_tensor.py index d384767e17a9..d395c0c4e62f 100644 --- a/tests/python/topi/python/test_topi_tensor.py +++ b/tests/python/topi/python/test_topi_tensor.py @@ -41,21 +41,21 @@ def get_ref_data(): np_nd = get_ref_data() - def check_device(device): - if not tvm.testing.device_enabled(device): - print("Skip because %s is not 
enabled" % device) + def check_target(target): + if not tvm.testing.device_enabled(target): + print("Skip because %s is not enabled" % target) return - ctx = tvm.context(device, 0) - out = tvm.nd.array(np.zeros(shape, dtype=dtype), ctx) - f = tvm.build(s, tvm_placeholders + [esum], device, name="elemwise_sum") - tvm_nd = [tvm.nd.array(nd, ctx) for nd in np_nd] + [out] + dev = tvm.device(target, 0) + out = tvm.nd.array(np.zeros(shape, dtype=dtype), dev) + f = tvm.build(s, tvm_placeholders + [esum], target, name="elemwise_sum") + tvm_nd = [tvm.nd.array(nd, dev) for nd in np_nd] + [out] f(*tvm_nd) np_out = np.sum(np.array(np_nd), axis=0) tvm.testing.assert_allclose(out.asnumpy(), np_out, rtol=1e-5) - for device in ["llvm"]: - check_device(device) + for target in ["llvm"]: + check_target(target) def verify_full(shape, dtype, fill_value): @@ -71,47 +71,47 @@ def get_ref_data(): np_nd = get_ref_data() - def check_device(device): - if not tvm.testing.device_enabled(device): - print("Skip because %s is not enabled" % device) + def check_target(target): + if not tvm.testing.device_enabled(target): + print("Skip because %s is not enabled" % target) return - ctx = tvm.context(device, 0) - out = tvm.nd.array(np.zeros(shape, dtype=dtype), ctx) - f = tvm.build(s1, [A, B], device, name="full_like") - f(tvm.nd.array(np.zeros(shape, dtype), ctx), out) + dev = tvm.device(target, 0) + out = tvm.nd.array(np.zeros(shape, dtype=dtype), dev) + f = tvm.build(s1, [A, B], target, name="full_like") + f(tvm.nd.array(np.zeros(shape, dtype), dev), out) tvm.testing.assert_allclose(out.asnumpy(), np_nd, rtol=1e-5) - f = tvm.build(s2, [C], device, name="full") + f = tvm.build(s2, [C], target, name="full") f(out) tvm.testing.assert_allclose(out.asnumpy(), np_nd, rtol=1e-5) - for device in ["llvm"]: - check_device(device) + for target in ["llvm"]: + check_target(target) def verify_vectorization(n, m, dtype): - def check_device(device): - if not tvm.testing.device_enabled(device): - print("Skip because %s is not enabled" % device) + def check_targeta(targeta): + if not tvm.testing.device_enabled(targeta): + print("Skip because %s is not enabled" % targeta) return - if dtype == "float16" and device == "cuda" and not have_fp16(tvm.gpu(0).compute_version): + if dtype == "float16" and targeta == "cuda" and not have_fp16(tvm.gpu(0).compute_version): print("Skip because gpu does not have fp16 support") return - with tvm.target.Target(device): - ctx = tvm.context(device, 0) + with tvm.target.Target(targeta): + dev = tvm.device(targeta, 0) A = te.placeholder((n, m), name="A", dtype=dtype) B = te.compute((n, m), lambda i, j: A[i, j] + tvm.tir.const(1, A.dtype), name="B") - S = tvm.topi.testing.get_elemwise_schedule(device)(B) + S = tvm.topi.testing.get_elemwise_schedule(targeta)(B) - fun = tvm.build(S, [A, B], device) - np_A = tvm.nd.empty((n, m), A.dtype, ctx).copyfrom(np.random.uniform(size=(n, m))) - np_B = tvm.nd.empty((n, m), B.dtype, ctx) + fun = tvm.build(S, [A, B], targeta) + np_A = tvm.nd.empty((n, m), A.dtype, dev).copyfrom(np.random.uniform(size=(n, m))) + np_B = tvm.nd.empty((n, m), B.dtype, dev) fun(np_A, np_B) tvm.testing.assert_allclose(np_B.asnumpy(), np_A.asnumpy() + 1, rtol=1e-5) - for device in ["cuda"]: - check_device(device) + for targeta in ["cuda"]: + check_targeta(targeta) @tvm.testing.requires_gpu diff --git a/tests/python/topi/python/test_topi_transform.py b/tests/python/topi/python/test_topi_transform.py index e0018ba0c0d3..16f9f13f05b0 100644 --- a/tests/python/topi/python/test_topi_transform.py +++ 
b/tests/python/topi/python/test_topi_transform.py @@ -30,105 +30,105 @@ def verify_expand_dims(in_shape, out_shape, axis, num_newaxis): A = te.placeholder(shape=in_shape, name="A") B = topi.expand_dims(A, axis, num_newaxis) - def check_device(device, ctx): - print("Running on target: %s" % device) - with tvm.target.Target(device): - s = tvm.topi.testing.get_broadcast_schedule(device)(B) - foo = tvm.build(s, [A, B], device, name="expand_dims") + def check_device(target, dev): + print("Running on target: %s" % target) + with tvm.target.Target(target): + s = tvm.topi.testing.get_broadcast_schedule(target)(B) + foo = tvm.build(s, [A, B], target, name="expand_dims") data_npy = np.random.uniform(size=in_shape).astype(A.dtype) out_npy = data_npy.reshape(out_shape) - data_nd = tvm.nd.array(data_npy, ctx) - out_nd = tvm.nd.array(np.empty(out_shape).astype(B.dtype), ctx) + data_nd = tvm.nd.array(data_npy, dev) + out_nd = tvm.nd.array(np.empty(out_shape).astype(B.dtype), dev) foo(data_nd, out_nd) tvm.testing.assert_allclose(out_nd.asnumpy(), out_npy) - for device, ctx in tvm.testing.enabled_targets(): - check_device(device, ctx) + for target, dev in tvm.testing.enabled_targets(): + check_device(target, dev) def verify_reinterpret(in_shape, in_dtype, out_dtype, generator): A = te.placeholder(shape=in_shape, name="A", dtype=in_dtype) B = topi.reinterpret(A, out_dtype) - def check_device(device, ctx): - if in_dtype == "float16" and device == "cuda" and not have_fp16(ctx.compute_version): - print("Skip because %s does not have fp16 support" % device) + def check_device(target, dev): + if in_dtype == "float16" and target == "cuda" and not have_fp16(dev.compute_version): + print("Skip because %s does not have fp16 support" % target) return - print("Running on target: %s" % device) - with tvm.target.Target(device): - s = tvm.topi.testing.get_elemwise_schedule(device)(B) - foo = tvm.build(s, [A, B], device, name="reinterpret") + print("Running on target: %s" % target) + with tvm.target.Target(target): + s = tvm.topi.testing.get_elemwise_schedule(target)(B) + foo = tvm.build(s, [A, B], target, name="reinterpret") data_npy = generator(in_shape).astype(in_dtype) out_npy = data_npy.view(B.dtype) - data_nd = tvm.nd.array(data_npy, ctx) - out_nd = tvm.nd.array(np.empty(in_shape).astype(B.dtype), ctx) + data_nd = tvm.nd.array(data_npy, dev) + out_nd = tvm.nd.array(np.empty(in_shape).astype(B.dtype), dev) foo(data_nd, out_nd) np.testing.assert_equal(out_nd.asnumpy(), out_npy) - for device, ctx in tvm.testing.enabled_targets(): - check_device(device, ctx) + for target, dev in tvm.testing.enabled_targets(): + check_device(target, dev) def verify_transpose(in_shape, axes): A = te.placeholder(shape=in_shape, name="A") B = topi.transpose(A, axes) - def check_device(device, ctx): - print("Running on target: %s" % device) - with tvm.target.Target(device): - s = tvm.topi.testing.get_injective_schedule(device)(B) - foo = tvm.build(s, [A, B], device, name="transpose") + def check_device(target, dev): + print("Running on target: %s" % target) + with tvm.target.Target(target): + s = tvm.topi.testing.get_injective_schedule(target)(B) + foo = tvm.build(s, [A, B], target, name="transpose") data_npy = np.arange(np.prod(in_shape)).reshape(in_shape).astype(A.dtype) out_npy = data_npy.transpose(axes) - data_nd = tvm.nd.array(data_npy, ctx) - out_nd = tvm.nd.empty(out_npy.shape, ctx=ctx, dtype=B.dtype) + data_nd = tvm.nd.array(data_npy, dev) + out_nd = tvm.nd.empty(out_npy.shape, device=dev, dtype=B.dtype) foo(data_nd, out_nd) 
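        # transpose only rearranges elements, so the output should match the
        # numpy reference exactly, with no extra tolerance needed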
tvm.testing.assert_allclose(out_nd.asnumpy(), out_npy) - for device, ctx in tvm.testing.enabled_targets(): - check_device(device, ctx) + for target, dev in tvm.testing.enabled_targets(): + check_device(target, dev) def verify_reshape(src_shape, dst_shape): A = te.placeholder(shape=src_shape, name="A") B = topi.reshape(A, dst_shape) - def check_device(device, ctx): - print("Running on target: %s" % device) - with tvm.target.Target(device): - s = tvm.topi.testing.get_injective_schedule(device)(B) - foo = tvm.build(s, [A, B], device, name="reshape") + def check_device(target, dev): + print("Running on target: %s" % target) + with tvm.target.Target(target): + s = tvm.topi.testing.get_injective_schedule(target)(B) + foo = tvm.build(s, [A, B], target, name="reshape") data_npy = np.random.normal(size=src_shape).astype(A.dtype) out_npy = np.reshape(data_npy, newshape=dst_shape) - data_nd = tvm.nd.array(data_npy, ctx) - out_nd = tvm.nd.empty(dst_shape, ctx=ctx, dtype=B.dtype) + data_nd = tvm.nd.array(data_npy, dev) + out_nd = tvm.nd.empty(dst_shape, device=dev, dtype=B.dtype) foo(data_nd, out_nd) tvm.testing.assert_allclose(out_nd.asnumpy(), out_npy) - for device, ctx in tvm.testing.enabled_targets(): - check_device(device, ctx) + for target, dev in tvm.testing.enabled_targets(): + check_device(target, dev) def verify_squeeze(src_shape, axis): A = te.placeholder(shape=src_shape, name="A") B = topi.squeeze(A, axis=axis) - def check_device(device, ctx): - print("Running on target: %s" % device) - with tvm.target.Target(device): - s = tvm.topi.testing.get_injective_schedule(device)(B) + def check_device(target, dev): + print("Running on target: %s" % target) + with tvm.target.Target(target): + s = tvm.topi.testing.get_injective_schedule(target)(B) - foo = tvm.build(s, [A, B], device, name="squeeze") + foo = tvm.build(s, [A, B], target, name="squeeze") data_npy = np.random.normal(size=src_shape).astype(A.dtype) out_npy = np.squeeze(data_npy, axis=axis) - data_nd = tvm.nd.array(data_npy, ctx) + data_nd = tvm.nd.array(data_npy, dev) out_nd_shape = out_npy.shape - out_nd = tvm.nd.empty(out_nd_shape, ctx=ctx, dtype=B.dtype) + out_nd = tvm.nd.empty(out_nd_shape, device=dev, dtype=B.dtype) foo(data_nd, out_nd) tvm.testing.assert_allclose(out_nd.asnumpy(), out_npy) - for device, ctx in tvm.testing.enabled_targets(): - check_device(device, ctx) + for target, dev in tvm.testing.enabled_targets(): + check_device(target, dev) def verify_concatenate(shapes, axis): @@ -149,21 +149,21 @@ def get_concat_schedule(target): tensor_l.append(te.placeholder(shape, name="A" + str(i))) out_tensor = topi.concatenate(a_tuple=tensor_l, axis=axis) - def check_device(device, ctx): - print("Running on target: %s" % device) - with tvm.target.Target(device): - s = get_concat_schedule(device)(out_tensor) + def check_device(target, dev): + print("Running on target: %s" % target) + with tvm.target.Target(target): + s = get_concat_schedule(target)(out_tensor) - foo = tvm.build(s, tensor_l + [out_tensor], device, name="concatenate") + foo = tvm.build(s, tensor_l + [out_tensor], target, name="concatenate") data_npys = [np.random.normal(size=shape).astype(tensor_l[0].dtype) for shape in shapes] out_npy = np.concatenate(data_npys, axis=axis) - data_nds = [tvm.nd.array(data_npy, ctx) for data_npy in data_npys] - out_nd = tvm.nd.empty(out_npy.shape, ctx=ctx, dtype=out_tensor.dtype) + data_nds = [tvm.nd.array(data_npy, dev) for data_npy in data_npys] + out_nd = tvm.nd.empty(out_npy.shape, device=dev, dtype=out_tensor.dtype) foo(*(data_nds + 
[out_nd]))
         tvm.testing.assert_allclose(out_nd.asnumpy(), out_npy)

-    for device, ctx in tvm.testing.enabled_targets():
-        check_device(device, ctx)
+    for target, dev in tvm.testing.enabled_targets():
+        check_device(target, dev)


 def verify_stack(shapes, axis):
@@ -172,45 +172,45 @@ def verify_stack(shapes, axis):
         tensor_l.append(te.placeholder(shape, name="A" + str(i)))
     out_tensor = topi.stack(tensor_l, axis)

-    def check_device(device, ctx):
-        print("Running on target: %s" % device)
-        with tvm.target.Target(device):
-            s = tvm.topi.testing.get_broadcast_schedule(device)(out_tensor)
+    def check_device(target, dev):
+        print("Running on target: %s" % target)
+        with tvm.target.Target(target):
+            s = tvm.topi.testing.get_broadcast_schedule(target)(out_tensor)

-        foo = tvm.build(s, tensor_l + [out_tensor], device, name="stack")
+        foo = tvm.build(s, tensor_l + [out_tensor], target, name="stack")
         data_npys = [np.random.normal(size=shape).astype(tensor_l[0].dtype) for shape in shapes]
         out_npy = np.stack(data_npys, axis=axis)
-        data_nds = [tvm.nd.array(data_npy, ctx) for data_npy in data_npys]
-        out_nd = tvm.nd.empty(out_npy.shape, ctx=ctx, dtype=out_tensor.dtype)
+        data_nds = [tvm.nd.array(data_npy, dev) for data_npy in data_npys]
+        out_nd = tvm.nd.empty(out_npy.shape, device=dev, dtype=out_tensor.dtype)
         foo(*(data_nds + [out_nd]))
         tvm.testing.assert_allclose(out_nd.asnumpy(), out_npy)

-    for device, ctx in tvm.testing.enabled_targets():
-        check_device(device, ctx)
+    for target, dev in tvm.testing.enabled_targets():
+        check_device(target, dev)


 def verify_split(src_shape, indices_or_sections, axis):
     A = te.placeholder(shape=src_shape, name="A")
     tensor_l = topi.split(A, indices_or_sections, axis=axis)

-    def check_device(device, ctx):
-        print("Running on target: %s" % device)
-        with tvm.target.Target(device):
-            s = tvm.topi.testing.get_injective_schedule(device)(tensor_l)
+    def check_device(target, dev):
+        print("Running on target: %s" % target)
+        with tvm.target.Target(target):
+            s = tvm.topi.testing.get_injective_schedule(target)(tensor_l)

-        foo = tvm.build(s, [A] + list(tensor_l), device, name="split")
+        foo = tvm.build(s, [A] + list(tensor_l), target, name="split")
         data_npy = np.random.normal(size=src_shape).astype(A.dtype)
         out_npys = np.split(data_npy, indices_or_sections, axis=axis)
-        data_nd = tvm.nd.array(data_npy, ctx)
+        data_nd = tvm.nd.array(data_npy, dev)
         out_nds = [
-            tvm.nd.empty(out_npy.shape, ctx=ctx, dtype=tensor_l[0].dtype) for out_npy in out_npys
+            tvm.nd.empty(out_npy.shape, device=dev, dtype=tensor_l[0].dtype) for out_npy in out_npys
         ]
         foo(*([data_nd] + out_nds))
         for out_nd, out_npy in zip(out_nds, out_npys):
             tvm.testing.assert_allclose(out_nd.asnumpy(), out_npy)

-    for device, ctx in tvm.testing.enabled_targets():
-        check_device(device, ctx)
+    for target, dev in tvm.testing.enabled_targets():
+        check_device(target, dev)


 def verify_expand_like(in_shape, out_shape, axis):
@@ -219,13 +219,13 @@ def verify_expand_like(in_shape, out_shape, axis):
     C = topi.expand_like(A, B, axis)
     s = te.create_schedule([C.op])

-    def check_device(device):
-        print("Running on target: %s" % device)
+    def check_device(target):
+        print("Running on target: %s" % target)

-        ctx = tvm.context(device, 0)
-        f = tvm.build(s, [A, B, C], device, name="expand_like")
+        dev = tvm.device(target, 0)
+        f = tvm.build(s, [A, B, C], target, name="expand_like")
         input = np.random.uniform(size=in_shape).astype(A.dtype)
-        tvm_input = tvm.nd.array(input, ctx)
+        tvm_input = tvm.nd.array(input, dev)
         odim = len(out_shape)
         real_axis = [x if x >= 0 else x + odim for x in axis]
@@ -236,38 +236,38 @@ def check_device(device):
             input = np.concatenate([input] * out_shape[x], axis=x).astype(A.dtype)
         assert input.shape == out_shape

-        tvm_shape_like = tvm.nd.array(np.zeros(out_shape).astype(B.dtype), ctx)
-        out = tvm.nd.array(np.zeros(out_shape).astype(A.dtype), ctx)
+        tvm_shape_like = tvm.nd.array(np.zeros(out_shape).astype(B.dtype), dev)
+        out = tvm.nd.array(np.zeros(out_shape).astype(A.dtype), dev)
         f(tvm_input, tvm_shape_like, out)
         tvm.testing.assert_allclose(out.asnumpy(), input)

-    for device in ["llvm"]:
-        check_device(device)
+    for target in ["llvm"]:
+        check_device(target)


 def verify_flip(in_shape, axis):
     A = te.placeholder(shape=in_shape, name="A")
     B = topi.flip(A, axis) + 1

-    def check_device(device):
-        ctx = tvm.context(device, 0)
-        if not tvm.testing.device_enabled(device):
-            print("Skip because %s is not enabled" % device)
+    def check_device(target):
+        dev = tvm.device(target, 0)
+        if not tvm.testing.device_enabled(target):
+            print("Skip because %s is not enabled" % target)
             return
-        print("Running on target: %s" % device)
-        with tvm.target.Target(device):
-            s = tvm.topi.testing.get_injective_schedule(device)(B)
+        print("Running on target: %s" % target)
+        with tvm.target.Target(target):
+            s = tvm.topi.testing.get_injective_schedule(target)(B)

-        foo = tvm.build(s, [A, B], device, name="reverse")
+        foo = tvm.build(s, [A, B], target, name="reverse")
         x_np = np.random.uniform(size=in_shape).astype(A.dtype)
         out_npy = np.flip(x_np, axis) + 1
-        data_nd = tvm.nd.array(x_np, ctx)
-        out_nd = tvm.nd.empty(out_npy.shape, ctx=ctx, dtype=A.dtype)
+        data_nd = tvm.nd.array(x_np, dev)
+        out_nd = tvm.nd.empty(out_npy.shape, device=dev, dtype=A.dtype)
         foo(data_nd, out_nd)
         tvm.testing.assert_allclose(out_nd.asnumpy(), out_npy)

-    for device in ["llvm", "cuda", "opencl", "sdaccel", "aocl_sw_emu"]:
-        check_device(device)
+    for target in ["llvm", "cuda", "opencl", "sdaccel", "aocl_sw_emu"]:
+        check_device(target)
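[Note] Every hunk in this file applies the same mechanical migration: the `device` loop variable becomes `target`, `tvm.context(...)` becomes `tvm.device(...)`, and `tvm.nd` calls take `device=` instead of `ctx=`. A minimal self-contained sketch of the new calling convention (a toy one-dimensional workload, not taken from the patch):

    import numpy as np
    import tvm
    from tvm import te

    A = te.placeholder((4,), name="A")
    B = te.compute((4,), lambda i: A[i] + 1.0, name="B")
    s = te.create_schedule(B.op)
    f = tvm.build(s, [A, B], "llvm")

    dev = tvm.device("llvm", 0)  # formerly tvm.context("llvm", 0)
    a = tvm.nd.array(np.zeros(4, dtype="float32"), dev)
    b = tvm.nd.empty((4,), dtype="float32", device=dev)  # formerly ctx=ctx
    f(a, b)
    np.testing.assert_allclose(b.asnumpy(), a.asnumpy() + 1.0)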


 @tvm.testing.uses_gpu
@@ -278,21 +278,21 @@ def verify_reverse_sequence(in_data, seq_lengths, batch_axis, seq_axis, ref_res)
     B = te.placeholder(shape=seq_lengths.shape, name="B", dtype=str(seq_lengths.dtype))
     C = topi.reverse_sequence(A, B, seq_axis, batch_axis)

-    def check_device(device, ctx):
-        print("Running on target: %s" % device)
-        with tvm.target.Target(device):
-            s = tvm.topi.testing.get_injective_schedule(device)(C)
+    def check_device(target, dev):
+        print("Running on target: %s" % target)
+        with tvm.target.Target(target):
+            s = tvm.topi.testing.get_injective_schedule(target)(C)

-        foo = tvm.build(s, [A, B, C], device, name="reverse_sequence")
+        foo = tvm.build(s, [A, B, C], target, name="reverse_sequence")

-        data_nd = tvm.nd.array(in_data, ctx)
-        seq_lengths_nd = tvm.nd.array(seq_lengths, ctx)
-        out_nd = tvm.nd.empty(in_data.shape, ctx=ctx, dtype=A.dtype)
+        data_nd = tvm.nd.array(in_data, dev)
+        seq_lengths_nd = tvm.nd.array(seq_lengths, dev)
+        out_nd = tvm.nd.empty(in_data.shape, device=dev, dtype=A.dtype)
         foo(data_nd, seq_lengths_nd, out_nd)
         tvm.testing.assert_allclose(out_nd.asnumpy(), ref_res)

-    for device, ctx in tvm.testing.enabled_targets():
-        check_device(device, ctx)
+    for target, dev in tvm.testing.enabled_targets():
+        check_device(target, dev)

     indata = np.array(np.arange(0, 16)).reshape([4, 4]).astype("int32")
     result = [[0, 5, 10, 15], [4, 1, 6, 11], [8, 9, 2, 7], [12, 13, 14, 3]]
@@ -366,16 +366,16 @@ def verify_take(src_shape, indices_src, axis=None, mode="clip"):
     else:
         out_tensor = topi.take(a=A, indices=indices, axis=axis, mode=mode)

-    def check_device(device):
-        ctx = tvm.context(device, 0)
-        if not tvm.testing.device_enabled(device):
-            print("Skip because %s is not enabled" % device)
+    def check_device(target):
+        dev = tvm.device(target, 0)
+        if not tvm.testing.device_enabled(target):
+            print("Skip because %s is not enabled" % target)
             return
-        print("Running on target: %s" % device)
-        with tvm.target.Target(device):
-            s = tvm.topi.testing.get_injective_schedule(device)(out_tensor)
+        print("Running on target: %s" % target)
+        with tvm.target.Target(target):
+            s = tvm.topi.testing.get_injective_schedule(target)(out_tensor)

-        foo = tvm.build(s, [A] + [indices] + [out_tensor], device, name="take")
+        foo = tvm.build(s, [A] + [indices] + [out_tensor], target, name="take")
         shape_size = 1
         for i in range(len(src_shape)):
             shape_size = shape_size * src_shape[i]
@@ -387,14 +387,14 @@ def check_device(device):
         else:
             np_mode = "raise" if mode == "fast" else mode
             out_npys = np.take(data_npy, indices_src, axis=axis, mode=np_mode)
-        data_nd = tvm.nd.array(data_npy, ctx)
-        indices_nd = tvm.nd.array(indices_src, ctx)
-        out_nd = tvm.nd.empty(out_npys.shape, ctx=ctx, dtype=src_dtype)
+        data_nd = tvm.nd.array(data_npy, dev)
+        indices_nd = tvm.nd.array(indices_src, dev)
+        out_nd = tvm.nd.empty(out_npys.shape, device=dev, dtype=src_dtype)
         foo(data_nd, indices_nd, out_nd)
         tvm.testing.assert_allclose(out_nd.asnumpy(), out_npys)

-    for device in ["llvm", "opencl", "sdaccel", "aocl_sw_emu"]:
-        check_device(device)
+    for target in ["llvm", "opencl", "sdaccel", "aocl_sw_emu"]:
+        check_device(target)


 def verify_strided_slice(in_shape, begin, end, strides=None):
@@ -402,25 +402,25 @@ def verify_strided_slice(in_shape, begin, end, strides=None):
     A = te.placeholder(shape=in_shape, name="A")
     strides = [1, 1, 1] if strides is None else strides
     B = topi.strided_slice(A, begin, end, strides) + 1

-    def check_device(device):
-        ctx = tvm.context(device, 0)
-        if not tvm.testing.device_enabled(device):
-            print("Skip because %s is not enabled" % device)
+    def check_device(target):
+        dev = tvm.device(target, 0)
+        if not tvm.testing.device_enabled(target):
+            print("Skip because %s is not enabled" % target)
             return
-        print("Running on target: %s" % device)
-        with tvm.target.Target(device):
-            s = tvm.topi.testing.get_injective_schedule(device)(B)
+        print("Running on target: %s" % target)
+        with tvm.target.Target(target):
+            s = tvm.topi.testing.get_injective_schedule(target)(B)

-        foo = tvm.build(s, [A, B], device, name="stride_slice")
+        foo = tvm.build(s, [A, B], target, name="stride_slice")
         x_np = np.random.uniform(size=in_shape).astype(A.dtype)
         out_npy = tvm.topi.testing.strided_slice_python(x_np, begin, end, strides) + 1
-        data_nd = tvm.nd.array(x_np, ctx)
-        out_nd = tvm.nd.empty(out_npy.shape, ctx=ctx, dtype=A.dtype)
+        data_nd = tvm.nd.array(x_np, dev)
+        out_nd = tvm.nd.empty(out_npy.shape, device=dev, dtype=A.dtype)
         foo(data_nd, out_nd)
         tvm.testing.assert_allclose(out_nd.asnumpy(), out_npy)

-    for device in ["llvm", "opencl", "sdaccel", "aocl_sw_emu"]:
-        check_device(device)
+    for target in ["llvm", "opencl", "sdaccel", "aocl_sw_emu"]:
+        check_device(target)


 def verify_dynamic_strided_slice(in_shape, begin, end, strides=None):
@@ -431,28 +431,28 @@ def verify_dynamic_strided_slice(in_shape, begin, end, strides=None):
     strides = [1, 1, 1] if strides is None else strides
     B = topi.strided_slice(A, Begin, End, Strides) + 1

-    def check_device(device):
-        ctx = tvm.context(device, 0)
-        if not tvm.testing.device_enabled(device):
-            print("Skip because %s is not enabled" % device)
+    def check_device(target):
+        dev = tvm.device(target, 0)
+        if not tvm.testing.device_enabled(target):
+            print("Skip because %s is not enabled" % target)
             return
-        print("Running on target: %s" % device)
-        with tvm.target.Target(device):
-            s = tvm.topi.testing.get_injective_schedule(device)(B)
+        print("Running on target: %s" % target)
+        with tvm.target.Target(target):
+            s = tvm.topi.testing.get_injective_schedule(target)(B)

-        foo = tvm.build(s, [A, Begin, End, Strides, B], device, name="stride_slice")
+        foo = tvm.build(s, [A, Begin, End, Strides, B], target, name="stride_slice")
         x_np = np.random.uniform(size=in_shape).astype(A.dtype)
         out_npy = tvm.topi.testing.strided_slice_python(x_np, begin, end, strides) + 1
-        data_nd = tvm.nd.array(x_np, ctx)
-        out_nd = tvm.nd.empty(out_npy.shape, ctx=ctx, dtype=A.dtype)
-        begin_nd = tvm.nd.array(np.array(begin).astype("int64"), ctx)
-        end_nd = tvm.nd.array(np.array(end).astype("int64"), ctx)
-        strides_nd = tvm.nd.array(np.array(strides).astype("int64"), ctx)
+        data_nd = tvm.nd.array(x_np, dev)
+        out_nd = tvm.nd.empty(out_npy.shape, device=dev, dtype=A.dtype)
+        begin_nd = tvm.nd.array(np.array(begin).astype("int64"), dev)
+        end_nd = tvm.nd.array(np.array(end).astype("int64"), dev)
+        strides_nd = tvm.nd.array(np.array(strides).astype("int64"), dev)
         foo(data_nd, begin_nd, end_nd, strides_nd, out_nd)
         tvm.testing.assert_allclose(out_nd.asnumpy(), out_npy)

-    for device in ["llvm", "opencl", "sdaccel", "aocl_sw_emu"]:
-        check_device(device)
+    for target in ["llvm", "opencl", "sdaccel", "aocl_sw_emu"]:
+        check_device(target)
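[Note] The dynamic variant above passes `begin`/`end`/`strides` to the built function as int64 tensors rather than compile-time constants; the expected result is still ordinary numpy slicing. A sketch of the reference computation these tests compare against (example values assumed, not from the patch):

    import numpy as np

    x = np.arange(24, dtype="float32").reshape(2, 3, 4)
    begin, end, strides = [0, 0, 0], [2, 3, 4], [1, 1, 2]
    ref = x[tuple(slice(b, e, s) for b, e, s in zip(begin, end, strides))]
    # the tests then check against ref + 1, since B = topi.strided_slice(...) + 1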


 def verify_strided_set(in_shape, v_shape, begin, end, strides=None):
@@ -466,39 +466,39 @@ def verify_strided_set(in_shape, v_shape, begin, end, strides=None):
     else:
         B = topi.strided_set(A, V, b, e) + 1

-    def check_device(device):
-        ctx = tvm.context(device, 0)
-        if not tvm.testing.device_enabled(device):
-            print("Skip because %s is not enabled" % device)
+    def check_device(target):
+        dev = tvm.device(target, 0)
+        if not tvm.testing.device_enabled(target):
+            print("Skip because %s is not enabled" % target)
             return
-        print("Running on target: %s" % device)
-        with tvm.target.Target(device):
-            s = tvm.topi.testing.get_injective_schedule(device)(B)
+        print("Running on target: %s" % target)
+        with tvm.target.Target(target):
+            s = tvm.topi.testing.get_injective_schedule(target)(B)

         if strides is not None:
-            foo = tvm.build(s, [A, V, b, e, st, B], device, name="stride_set")
+            foo = tvm.build(s, [A, V, b, e, st, B], target, name="stride_set")
             s_np = np.asarray(strides).astype("int32")
-            s_nd = tvm.nd.array(s_np, ctx)
+            s_nd = tvm.nd.array(s_np, dev)
         else:
-            foo = tvm.build(s, [A, V, b, e, B], device, name="stride_set")
+            foo = tvm.build(s, [A, V, b, e, B], target, name="stride_set")
         x_np = np.random.uniform(size=in_shape).astype(A.dtype)
         v_np = np.random.uniform(size=v_shape).astype(V.dtype)
         b_np = np.asarray(begin).astype("int32")
         e_np = np.asarray(end).astype("int32")
         out_npy = tvm.topi.testing.strided_set_python(x_np, v_np, begin, end, strides) + 1
-        data_nd = tvm.nd.array(x_np, ctx)
-        v_nd = tvm.nd.array(v_np, ctx)
-        b_nd = tvm.nd.array(b_np, ctx)
-        e_nd = tvm.nd.array(e_np, ctx)
-        out_nd = tvm.nd.empty(out_npy.shape, ctx=ctx, dtype=A.dtype)
+        data_nd = tvm.nd.array(x_np, dev)
+        v_nd = tvm.nd.array(v_np, dev)
+        b_nd = tvm.nd.array(b_np, dev)
+        e_nd = tvm.nd.array(e_np, dev)
+        out_nd = tvm.nd.empty(out_npy.shape, device=dev, dtype=A.dtype)
         if strides is not None:
             foo(data_nd, v_nd, b_nd, e_nd, s_nd, out_nd)
         else:
             foo(data_nd, v_nd, b_nd, e_nd, out_nd)
         tvm.testing.assert_allclose(out_nd.asnumpy(), out_npy)

-    for device in ["llvm", "opencl", "sdaccel", "aocl_sw_emu"]:
-        check_device(device)
+    for target in ["llvm", "opencl", "sdaccel", "aocl_sw_emu"]:
+        check_device(target)


 def verify_gather(data, axis, indices):
@@ -509,22 +509,22 @@ def verify_gather(data, axis, indices):
     var_indices = te.placeholder(shape=indices.shape, dtype=indices.dtype.name, name="indices")
     out_tensor = topi.gather(var_data, axis, var_indices)

-    def check_device(device, ctx):
-        print("Running on target: %s" % device)
-        with tvm.target.Target(device):
-            s = tvm.topi.testing.get_injective_schedule(device)(out_tensor)
+    def check_device(target, dev):
+        print("Running on target: %s" % target)
+        with tvm.target.Target(target):
+            s = tvm.topi.testing.get_injective_schedule(target)(out_tensor)

-        func = tvm.build(s, [var_data, var_indices, out_tensor], device, name="gather")
+        func = tvm.build(s, [var_data, var_indices, out_tensor], target, name="gather")
         out_npys = tvm.topi.testing.gather_python(data, axis, indices)

-        data_nd = tvm.nd.array(data, ctx)
-        indices_nd = tvm.nd.array(indices, ctx)
-        out_nd = tvm.nd.empty(out_npys.shape, ctx=ctx, dtype=data.dtype.name)
+        data_nd = tvm.nd.array(data, dev)
+        indices_nd = tvm.nd.array(indices, dev)
+        out_nd = tvm.nd.empty(out_npys.shape, device=dev, dtype=data.dtype.name)
         func(data_nd, indices_nd, out_nd)
         tvm.testing.assert_allclose(out_nd.asnumpy(), out_npys)

-    for device, ctx in tvm.testing.enabled_targets():
-        check_device(device, ctx)
+    for target, dev in tvm.testing.enabled_targets():
+        check_device(target, dev)


 def verify_gather_nd(src_shape, indices_src, indices_dtype):
@@ -534,26 +534,26 @@ def verify_gather_nd(src_shape, indices_src, indices_dtype):
     indices = te.placeholder(shape=indices_src.shape, dtype=indices_dtype, name="indices")
     out_tensor = topi.gather_nd(a=A, indices=indices)

-    def check_device(device, ctx):
-        print("Running on target: %s" % device)
-        with tvm.target.Target(device):
-            s = tvm.topi.testing.get_injective_schedule(device)(out_tensor)
+    def check_device(target, dev):
+        print("Running on target: %s" % target)
+        with tvm.target.Target(target):
+            s = tvm.topi.testing.get_injective_schedule(target)(out_tensor)

-        func = tvm.build(s, [A, indices, out_tensor], device, name="take")
+        func = tvm.build(s, [A, indices, out_tensor], target, name="take")
         shape_size = 1
         for i in range(len(src_shape)):
             shape_size = shape_size * src_shape[i]
         data_npy = np.arange(shape_size, dtype=src_dtype).reshape((src_shape))
         out_npys = tvm.topi.testing.gather_nd_python(data_npy, indices_src)

-        data_nd = tvm.nd.array(data_npy, ctx)
-        indices_nd = tvm.nd.array(indices_src, ctx)
-        out_nd = tvm.nd.empty(out_npys.shape, ctx=ctx, dtype=src_dtype)
+        data_nd = tvm.nd.array(data_npy, dev)
+        indices_nd = tvm.nd.array(indices_src, dev)
+        out_nd = tvm.nd.empty(out_npys.shape, device=dev, dtype=src_dtype)
         func(data_nd, indices_nd, out_nd)
         tvm.testing.assert_allclose(out_nd.asnumpy(), out_npys)

-    for device, ctx in tvm.testing.enabled_targets():
-        check_device(device, ctx)
+    for target, dev in tvm.testing.enabled_targets():
+        check_device(target, dev)


 def verify_arange(start, stop, step):
@@ -570,57 +570,57 @@ def verify_arange(start, stop, step):
         A = topi.arange(start, stop, step)
         a_np = np.arange(start, stop, step)

-    def check_device(device, ctx):
-        print("Running on target: %s" % device)
-        with tvm.target.Target(device):
-            s = tvm.topi.testing.get_injective_schedule(device)(A)
-        f = tvm.build(s, [A], device, name="arange")
-        a_nd = tvm.nd.empty(a_np.shape, dtype="float32", ctx=ctx)
+    def check_device(target, dev):
+        print("Running on target: %s" % target)
+        with tvm.target.Target(target):
+            s = tvm.topi.testing.get_injective_schedule(target)(A)
+        f = tvm.build(s, [A], target, name="arange")
+        a_nd = tvm.nd.empty(a_np.shape, dtype="float32", device=dev)
         f(a_nd)
         tvm.testing.assert_allclose(a_nd.asnumpy(), a_np)

-    for device, ctx in tvm.testing.enabled_targets():
-        check_device(device, ctx)
+    for target, dev in tvm.testing.enabled_targets():
+        check_device(target, dev)


 def verify_repeat(in_shape, repeats, axis):
     A = te.placeholder(shape=in_shape, name="A")
     B = topi.repeat(A, repeats, axis)

-    def check_device(device, ctx):
-        print("Running on target: %s" % device)
-        with tvm.target.Target(device):
-            s = tvm.topi.testing.get_broadcast_schedule(device)(B)
-        foo = tvm.build(s, [A, B], device, name="repeat")
+    def check_device(target, dev):
+        print("Running on target: %s" % target)
+        with tvm.target.Target(target):
+            s = tvm.topi.testing.get_broadcast_schedule(target)(B)
+        foo = tvm.build(s, [A, B], target, name="repeat")
         data_npy = np.random.uniform(size=in_shape).astype(A.dtype)
         out_npy = np.repeat(data_npy, repeats, axis)
-        data_nd = tvm.nd.array(data_npy, ctx)
-        out_nd = tvm.nd.array(np.empty(out_npy.shape).astype(B.dtype), ctx)
+        data_nd = tvm.nd.array(data_npy, dev)
+        out_nd = tvm.nd.array(np.empty(out_npy.shape).astype(B.dtype), dev)
         foo(data_nd, out_nd)
         tvm.testing.assert_allclose(out_nd.asnumpy(), out_npy)

-    for device, ctx in tvm.testing.enabled_targets():
-        check_device(device, ctx)
+    for target, dev in tvm.testing.enabled_targets():
+        check_device(target, dev)


 def verify_tile(in_shape, reps):
     A = te.placeholder(shape=in_shape, name="A")
     B = topi.tile(A, reps)

-    def check_device(device, ctx):
-        print("Running on target: %s" % device)
-        with tvm.target.Target(device):
-            s = tvm.topi.testing.get_broadcast_schedule(device)(B)
-        foo = tvm.build(s, [A, B], device, name="tile")
+    def check_device(target, dev):
+        print("Running on target: %s" % target)
+        with tvm.target.Target(target):
+            s = tvm.topi.testing.get_broadcast_schedule(target)(B)
+        foo = tvm.build(s, [A, B], target, name="tile")
         data_npy = np.random.uniform(size=in_shape).astype(A.dtype)
         out_npy = np.tile(data_npy, reps)
-        data_nd = tvm.nd.array(data_npy, ctx)
-        out_nd = tvm.nd.array(np.empty(out_npy.shape).astype(B.dtype), ctx)
+        data_nd = tvm.nd.array(data_npy, dev)
+        out_nd = tvm.nd.array(np.empty(out_npy.shape).astype(B.dtype), dev)
         foo(data_nd, out_nd)
         tvm.testing.assert_allclose(out_nd.asnumpy(), out_npy)

-    for device, ctx in tvm.testing.enabled_targets():
-        check_device(device, ctx)
+    for target, dev in tvm.testing.enabled_targets():
+        check_device(target, dev)


 def verify_where(in_shape):
@@ -630,24 +630,24 @@ def verify_where(in_shape):
     B = te.placeholder(shape=in_shape, name="B")
     C = topi.where(Cond, A, B)

-    def check_device(device, ctx):
-        print("Running on target: %s" % device)
-        with tvm.target.Target(device):
-            s = tvm.topi.testing.get_broadcast_schedule(device)(C)
-        f = tvm.build(s, [Cond, A, B, C], device, name="where")
+    def check_device(target, dev):
+        print("Running on target: %s" % target)
+        with tvm.target.Target(target):
+            s = tvm.topi.testing.get_broadcast_schedule(target)(C)
+        f = tvm.build(s, [Cond, A, B, C], target, name="where")
         cond_npy = np.random.uniform(low=-1, high=1, size=in_shape).astype(dtype)
         x_npy = np.random.uniform(size=in_shape).astype(dtype)
         y_npy = np.random.uniform(size=in_shape).astype(dtype)
         out_npy = np.where(cond_npy, x_npy, y_npy)
-        cond_nd = tvm.nd.array(cond_npy, ctx)
-        x_nd = tvm.nd.array(x_npy, ctx)
-        y_nd = tvm.nd.array(y_npy, ctx)
-        out_nd = tvm.nd.array(np.empty(out_npy.shape).astype(C.dtype), ctx)
+        cond_nd = tvm.nd.array(cond_npy, dev)
+        x_nd = tvm.nd.array(x_npy, dev)
+        y_nd = tvm.nd.array(y_npy, dev)
+        out_nd = tvm.nd.array(np.empty(out_npy.shape).astype(C.dtype), dev)
         f(cond_nd, x_nd, y_nd, out_nd)
         tvm.testing.assert_allclose(out_nd.asnumpy(), out_npy)

-    for device, ctx in tvm.testing.enabled_targets():
-        check_device(device, ctx)
+    for target, dev in tvm.testing.enabled_targets():
+        check_device(target, dev)


 def verify_one_hot(indices_shape, depth, on_value, off_value, axis, dtype):
@@ -658,21 +658,21 @@ def verify_one_hot(indices_shape, depth, on_value, off_value, axis, dtype):
         indices, on_value_const, off_value_const, depth, axis, dtype
     )

-    def check_device(device, ctx):
-        print("Running on target: %s" % device)
-        with tvm.target.Target(device):
-            s = tvm.topi.testing.get_injective_schedule(device)(one_hot_result)
-        fn = tvm.build(s, [indices, one_hot_result], device, name="one_hot")
+    def check_device(target, dev):
+        print("Running on target: %s" % target)
+        with tvm.target.Target(target):
+            s = tvm.topi.testing.get_injective_schedule(target)(one_hot_result)
+        fn = tvm.build(s, [indices, one_hot_result], target, name="one_hot")
         indices_npy = np.random.randint(0, depth, size=indices_shape).astype(indices.dtype)
         out_npy = tvm.topi.testing.one_hot(indices_npy, on_value, off_value, depth, axis, dtype)
-        indices_nd = tvm.nd.array(indices_npy, ctx)
-        out_nd = tvm.nd.array(np.empty(out_npy.shape).astype(one_hot_result.dtype), ctx)
+        indices_nd = tvm.nd.array(indices_npy, dev)
+        out_nd = tvm.nd.array(np.empty(out_npy.shape).astype(one_hot_result.dtype), dev)
         fn(indices_nd, out_nd)
         out_topi = out_nd.asnumpy()
         tvm.testing.assert_allclose(out_topi, out_npy)

-    for device, ctx in tvm.testing.enabled_targets():
-        check_device(device, ctx)
+    for target, dev in tvm.testing.enabled_targets():
+        check_device(target, dev)


 def verify_unravel_index(indices, shape, dtype):
@@ -687,21 +687,21 @@ def verify_unravel_index(indices, shape, dtype):
     Y = te.placeholder(shape=y_data.shape, dtype=dtype, name="Y")
     Z = topi.unravel_index(X, Y)

-    def check_device(device, ctx):
-        print("Running on target: %s" % device)
-        with tvm.target.Target(device):
-            s = tvm.topi.testing.get_injective_schedule(device)(Z)
-        foo = tvm.build(s, [X, Y, Z], device, name="unravel_index")
+    def check_device(target, dev):
+        print("Running on target: %s" % target)
+        with tvm.target.Target(target):
+            s = tvm.topi.testing.get_injective_schedule(target)(Z)
+        foo = tvm.build(s, [X, Y, Z], target, name="unravel_index")

         out_npy = np.unravel_index(x_data, y_data)
-        datax_nd = tvm.nd.array(x_data, ctx)
-        datay_nd = tvm.nd.array(y_data, ctx)
-        out_nd = tvm.nd.empty(dst_shape, ctx=ctx, dtype=Z.dtype)
+        datax_nd = tvm.nd.array(x_data, dev)
+        datay_nd = tvm.nd.array(y_data, dev)
+        out_nd = tvm.nd.empty(dst_shape, device=dev, dtype=Z.dtype)
         foo(datax_nd, datay_nd, out_nd)
         tvm.testing.assert_allclose(out_nd.asnumpy(), out_npy)

-    for device, ctx in tvm.testing.enabled_targets():
-        check_device(device, ctx)
+    for target, dev in tvm.testing.enabled_targets():
+        check_device(target, dev)


 def verify_sparse_to_dense(sparse_indices, sparse_values, default_value, output_shape, xpected):
@@ -724,27 +724,27 @@ def verify_sparse_to_dense(sparse_indices, sparse_values, default_value, output_
         args = [A, B, C]
         D = topi.sparse_to_dense(A, output_shape, B, C)

-    def check_device(device, ctx):
-        print("Running on target: %s" % device)
-        with tvm.target.Target(device):
-            s = tvm.topi.testing.get_injective_schedule(device)(D)
+    def check_device(target, dev):
+        print("Running on target: %s" % target)
+        with tvm.target.Target(target):
+            s = tvm.topi.testing.get_injective_schedule(target)(D)

-        foo = tvm.build(s, args + [D], device, name="sparse_to_dense")
+        foo = tvm.build(s, args + [D], target, name="sparse_to_dense")

-        sparse_indices_nd = tvm.nd.array(sparse_indices_data, ctx)
-        sparse_values_nd = tvm.nd.array(sparse_values_data, ctx)
-        out_nd = tvm.nd.empty(output_shape_data, ctx=ctx, dtype=B.dtype)
+        sparse_indices_nd = tvm.nd.array(sparse_indices_data, dev)
+        sparse_values_nd = tvm.nd.array(sparse_values_data, dev)
+        out_nd = tvm.nd.empty(output_shape_data, device=dev, dtype=B.dtype)

         if default_value is None:
             foo(sparse_indices_nd, sparse_values_nd, out_nd)
         else:
-            default_value_nd = tvm.nd.array(default_value_data, ctx)
+            default_value_nd = tvm.nd.array(default_value_data, dev)
             foo(sparse_indices_nd, sparse_values_nd, default_value_nd, out_nd)

         tvm.testing.assert_allclose(out_nd.asnumpy(), np.array(xpected))

-    for device, ctx in tvm.testing.enabled_targets():
-        check_device(device, ctx)
+    for target, dev in tvm.testing.enabled_targets():
+        check_device(target, dev)


 def verify_matrix_set_diag(input_shape, diagonal_shape, dtype, k=0, align="RIGHT_LEFT"):
@@ -752,24 +752,24 @@ def verify_matrix_set_diag(input_shape, diagonal_shape, dtype, k=0, align="RIGHT
     diagonal = te.placeholder(shape=diagonal_shape, name="diagonal", dtype=dtype)
     matrix_set_diag_result = topi.transform.matrix_set_diag(input, diagonal, k, align)

-    def check_device(device, ctx):
-        ctx = tvm.context(device, 0)
-        print("Running on target: %s" % device)
-        with tvm.target.Target(device):
-            s = tvm.topi.testing.get_injective_schedule(device)(matrix_set_diag_result)
-        fn = tvm.build(s, [input, diagonal, matrix_set_diag_result], device, name="matrix_set_diag")
+    def check_device(target, dev):
+        dev = tvm.device(target, 0)
+        print("Running on target: %s" % target)
+        with tvm.target.Target(target):
+            s = tvm.topi.testing.get_injective_schedule(target)(matrix_set_diag_result)
+        fn = tvm.build(s, [input, diagonal, matrix_set_diag_result], target, name="matrix_set_diag")
         input_npy = np.random.randint(-100, 100, size=input_shape).astype(dtype)
         diagonal_npy = np.random.randint(-100, 100, size=diagonal_shape).astype(dtype)
         out_npy = tvm.topi.testing.matrix_set_diag(input_npy, diagonal_npy, k, align)
-        input_nd = tvm.nd.array(input_npy, ctx)
-        diagonal_nd = tvm.nd.array(diagonal_npy, ctx)
-        out_nd = tvm.nd.array(np.empty(out_npy.shape).astype(matrix_set_diag_result.dtype), ctx)
+        input_nd = tvm.nd.array(input_npy, dev)
+        diagonal_nd = tvm.nd.array(diagonal_npy, dev)
+        out_nd = tvm.nd.array(np.empty(out_npy.shape).astype(matrix_set_diag_result.dtype), dev)
         fn(input_nd, diagonal_nd, out_nd)
         out_topi = out_nd.asnumpy()
         tvm.testing.assert_allclose(out_topi, out_npy)

-    for target, ctx in tvm.testing.enabled_targets():
-        check_device(target, ctx)
+    for target, dev in tvm.testing.enabled_targets():
+        check_device(target, dev)
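[Note] All of the `(target, dev)` helpers above follow one template: build under `tvm.target.Target(target)`, then allocate and run on the paired device from `tvm.testing.enabled_targets()`. A minimal template distilled from these tests (`topi.identity` is chosen here only as a trivial operator; this sketch is not part of the patch):

    import numpy as np
    import tvm
    import tvm.testing
    import tvm.topi.testing
    from tvm import te, topi

    def verify_identity(shape):
        A = te.placeholder(shape, name="A")
        B = topi.identity(A)

        def check_device(target, dev):
            with tvm.target.Target(target):
                s = tvm.topi.testing.get_injective_schedule(target)(B)
            f = tvm.build(s, [A, B], target)
            a = tvm.nd.array(np.random.uniform(size=shape).astype(A.dtype), dev)
            b = tvm.nd.empty(shape, device=dev)
            f(a, b)
            tvm.testing.assert_allclose(b.asnumpy(), a.asnumpy())

        for target, dev in tvm.testing.enabled_targets():
            check_device(target, dev)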


 def verify_adv_index(data_shape, index_shapes):
@@ -785,27 +785,27 @@ def verify_adv_index(data_shape, index_shapes):
     np_out = np_data[tuple(np_indices)]
     out = topi.adv_index(data, indices)

-    def check_device(device, ctx):
-        ctx = tvm.context(device, 0)
-        if not ctx.exist:
-            print("Skip because %s is not enabled" % device)
+    def check_device(target, dev):
+        dev = tvm.device(target, 0)
+        if not dev.exist:
+            print("Skip because %s is not enabled" % target)
             return
-        print("Running on target: %s" % device)
-        with tvm.target.create(device):
-            s = tvm.topi.testing.get_injective_schedule(device)(out)
+        print("Running on target: %s" % target)
+        with tvm.target.Target(target):
+            s = tvm.topi.testing.get_injective_schedule(target)(out)

-        func = tvm.build(s, [data] + indices + [out], device, name="adv_index")
+        func = tvm.build(s, [data] + indices + [out], target, name="adv_index")

-        nd_list = [tvm.nd.array(np_data, ctx)]
+        nd_list = [tvm.nd.array(np_data, dev)]
         for np_index in np_indices:
-            nd_list.append(tvm.nd.array(np_index, ctx))
-        nd_list.append(tvm.nd.empty(out.shape, ctx=ctx, dtype=data.dtype))
+            nd_list.append(tvm.nd.array(np_index, dev))
+        nd_list.append(tvm.nd.empty(out.shape, device=dev, dtype=data.dtype))

         func(*nd_list)
         tvm.testing.assert_allclose(nd_list[-1].asnumpy(), np.array(np_out))

-    for target, ctx in tvm.testing.enabled_targets():
-        check_device(target, ctx)
+    for target, dev in tvm.testing.enabled_targets():
+        check_device(target, dev)


 @tvm.testing.uses_gpu
@@ -896,14 +896,14 @@ def test_squeeze():
     A = te.placeholder((2,), "float32", "A")
     E = topi.squeeze(A)
     C = te.compute((1,), lambda i: E[(2 * A[0] - 1).astype("int32")])
-    for device in ["cuda", "opencl"]:
-        ctx = tvm.context(device, 0)
-        if tvm.testing.device_enabled(device):
-            with tvm.target.Target(device):
-                s = tvm.topi.testing.get_injective_schedule(device)(C)
+    for target in ["cuda", "opencl"]:
+        dev = tvm.device(target, 0)
+        if tvm.testing.device_enabled(target):
+            with tvm.target.Target(target):
+                s = tvm.topi.testing.get_injective_schedule(target)(C)
             func = tvm.build(s, [A, C])
-            a = tvm.nd.array(np.array((1, 2)).astype("float32"), ctx=ctx)
-            c = tvm.nd.empty((1,), dtype="float32", ctx=ctx)
+            a = tvm.nd.array(np.array((1, 2)).astype("float32"), device=dev)
+            c = tvm.nd.empty((1,), dtype="float32", device=dev)
             func(a, c)
             assert c.asnumpy()[0] == 2

@@ -1040,18 +1040,18 @@ def test_layout_transform():
     output = np.reshape(output, newshape=(1, 8, 8, 2, 16))
     output = np.transpose(output, axes=(0, 3, 1, 2, 4))

-    def check_device(device, ctx):
-        tvm_input = tvm.nd.array(input, ctx)
-        tvm_output = tvm.nd.empty(output.shape, ctx=ctx, dtype=B.dtype)
-        print("Running on target: %s" % device)
-        with tvm.target.Target(device):
-            s = tvm.topi.testing.get_injective_schedule(device)(B)
-        f = tvm.build(s, [A, B], device, name="layout_transform")
+    def check_device(target, dev):
+        tvm_input = tvm.nd.array(input, dev)
+        tvm_output = tvm.nd.empty(output.shape, device=dev, dtype=B.dtype)
+        print("Running on target: %s" % target)
+        with tvm.target.Target(target):
+            s = tvm.topi.testing.get_injective_schedule(target)(B)
+        f = tvm.build(s, [A, B], target, name="layout_transform")
         f(tvm_input, tvm_output)
         tvm.testing.assert_allclose(tvm_output.asnumpy(), output)

-    for backend, ctx in tvm.testing.enabled_targets():
-        check_device(backend, ctx)
+    for backend, dev in tvm.testing.enabled_targets():
+        check_device(backend, dev)


 @tvm.testing.uses_gpu
@@ -1064,18 +1064,18 @@ def test_shape():
     input = np.random.uniform(size=in_shape).astype(A.dtype)
     output = np.asarray(in_shape).astype(dtype)

-    def check_device(device, ctx):
-        tvm_input = tvm.nd.array(input, ctx)
-        tvm_output = tvm.nd.empty(output.shape, ctx=ctx, dtype=dtype)
-        print("Running on target: %s" % device)
-        with tvm.target.Target(device):
-            s = tvm.topi.testing.get_injective_schedule(device)(B)
-        f = tvm.build(s, [A, B], device, name="shape")
+    def check_device(target, dev):
+        tvm_input = tvm.nd.array(input, dev)
+        tvm_output = tvm.nd.empty(output.shape, device=dev, dtype=dtype)
+        print("Running on target: %s" % target)
+        with tvm.target.Target(target):
+            s = tvm.topi.testing.get_injective_schedule(target)(B)
+        f = tvm.build(s, [A, B], target, name="shape")
         f(tvm_input, tvm_output)
         tvm.testing.assert_allclose(tvm_output.asnumpy(), output)

-    for backend, ctx in tvm.testing.enabled_targets():
-        check_device(backend, ctx)
+    for backend, dev in tvm.testing.enabled_targets():
+        check_device(backend, dev)


 @tvm.testing.uses_gpu
@@ -1092,19 +1092,19 @@ def test_sequence_mask():
             B_data = np.random.randint(1, max_length, (batch_size,)).astype(np.int32)
             C_gt_data = tvm.topi.testing.sequence_mask(A_data, B_data, mask_value, axis)

-            def check_device(device, ctx):
-                tvm_A = tvm.nd.array(A_data, ctx)
-                tvm_B = tvm.nd.array(B_data, ctx)
-                tvm_C = tvm.nd.empty(in_shape, ctx=ctx, dtype="float32")
-                print("Running on target: %s" % device)
-                with tvm.target.Target(device):
-                    s = tvm.topi.testing.get_injective_schedule(device)(C)
-                f = tvm.build(s, [A, B, C], device, name="SequenceMask")
+            def check_device(target, dev):
+                tvm_A = tvm.nd.array(A_data, dev)
+                tvm_B = tvm.nd.array(B_data, dev)
+                tvm_C = tvm.nd.empty(in_shape, device=dev, dtype="float32")
+                print("Running on target: %s" % target)
+                with tvm.target.Target(target):
+                    s = tvm.topi.testing.get_injective_schedule(target)(C)
+                f = tvm.build(s, [A, B, C], target, name="SequenceMask")
                 f(tvm_A, tvm_B, tvm_C)
                 tvm.testing.assert_allclose(tvm_C.asnumpy(), C_gt_data)

-            for backend, ctx in tvm.testing.enabled_targets():
-                check_device(backend, ctx)
+            for backend, dev in tvm.testing.enabled_targets():
+                check_device(backend, dev)


 @tvm.testing.uses_gpu
@@ -1117,28 +1117,28 @@ def test_ndarray_size():
     input = np.random.uniform(size=in_shape).astype(A.dtype)
     output = np.asarray(np.size(input)).astype(dtype)

-    def check_device(device, ctx):
-        tvm_input = tvm.nd.array(input, ctx=ctx)
-        tvm_output = tvm.nd.empty((), ctx=ctx, dtype=B.dtype)
-        print("Running on target: %s" % device)
-        with tvm.target.Target(device):
-            s = tvm.topi.testing.get_injective_schedule(device)(B)
-        f = tvm.build(s, [A, B], device, name="ndarray_size")
+    def check_device(target, dev):
+        tvm_input = tvm.nd.array(input, device=dev)
+        tvm_output = tvm.nd.empty((), device=dev, dtype=B.dtype)
+        print("Running on target: %s" % target)
+        with tvm.target.Target(target):
+            s = tvm.topi.testing.get_injective_schedule(target)(B)
+        f = tvm.build(s, [A, B], target, name="ndarray_size")
         f(tvm_input, tvm_output)
         tvm.testing.assert_allclose(tvm_output.asnumpy(), output)

-    for backend, ctx in tvm.testing.enabled_targets():
-        check_device(backend, ctx)
+    for backend, dev in tvm.testing.enabled_targets():
+        check_device(backend, dev)


 @tvm.testing.uses_gpu
 def test_where_fusion():
     """integration test that where and zeros should be properly inlined"""

-    def check_device(device, ctx):
-        with tvm.target.Target(device):
-            print("Running on target: %s" % device)
-            conv2d_compute, conv2d_schedule = tvm.topi.testing.get_conv2d_nchw_implement(device)
+    def check_device(target, dev):
+        with tvm.target.Target(target):
+            print("Running on target: %s" % target)
+            conv2d_compute, conv2d_schedule = tvm.topi.testing.get_conv2d_nchw_implement(target)
             data = te.placeholder((2, 1, 2, 4), "int8", "data")
             w = te.placeholder((3, 1, 2, 2), "int8", "w")
             conv1 = conv2d_compute(data, w, 1, 0, 1, "int32")
@@ -1152,8 +1152,8 @@ def check_device(device):
             s = conv2d_schedule(outs)
             tvm.build(s, [data, w, add], target=backend)

-    for backend, ctx in tvm.testing.enabled_targets():
-        check_device(backend, ctx)
+    for backend, dev in tvm.testing.enabled_targets():
+        check_device(backend, dev)


 @tvm.testing.uses_gpu
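[Note] The next diff switches `test_unique` from the old `ctx` fixture name to `dev`; with `@tvm.testing.parametrize_targets`, pytest injects one `(target, dev)` pair per enabled target, so the parameter names must match the fixtures. A sketch of a hypothetical test using the same decorator (not from the patch):

    import tvm
    import tvm.testing

    @tvm.testing.parametrize_targets
    def test_device_matches_target(dev, target):
        # `target` arrives as a target string, `dev` as the matching runtime device
        assert tvm.device(target, 0).device_type == dev.device_type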
diff --git a/tests/python/topi/python/test_topi_unique.py b/tests/python/topi/python/test_topi_unique.py
index d7ee74282922..a97b95029862 100644
--- a/tests/python/topi/python/test_topi_unique.py
+++ b/tests/python/topi/python/test_topi_unique.py
@@ -22,7 +22,7 @@


 @tvm.testing.parametrize_targets
-def test_unique(ctx, target):
+def test_unique(dev, target):
     def calc_numpy_unique(data, is_sorted=False):
         uniq, index, inverse, counts = np.unique(
             data, return_index=True, return_inverse=True, return_counts=True
@@ -56,10 +56,10 @@ def check_unique(data, is_sorted=False):
             ),
         }
         fcompute, fschedule = tvm.topi.testing.dispatch(target, implementations)
-        tvm_data = tvm.nd.array(data, ctx=ctx)
-        tvm_unique = tvm.nd.array(np.zeros(data.shape).astype(data.dtype), ctx=ctx)
-        tvm_indices = tvm.nd.array(np.zeros(data.shape).astype("int32"), ctx=ctx)
-        tvm_num_unique = tvm.nd.array(np.zeros([1]).astype("int32"), ctx=ctx)
+        tvm_data = tvm.nd.array(data, device=dev)
+        tvm_unique = tvm.nd.array(np.zeros(data.shape).astype(data.dtype), device=dev)
+        tvm_indices = tvm.nd.array(np.zeros(data.shape).astype("int32"), device=dev)
+        tvm_num_unique = tvm.nd.array(np.zeros([1]).astype("int32"), device=dev)

         # without counts
         with tvm.target.Target(target):
@@ -76,7 +76,7 @@ def check_unique(data, is_sorted=False):
             np.testing.assert_allclose(tvm_indices.asnumpy(), np_indices, atol=1e-5, rtol=1e-5)

         # with counts
-        tvm_counts = tvm.nd.array(np.zeros(data.shape).astype("int32"), ctx=ctx)
+        tvm_counts = tvm.nd.array(np.zeros(data.shape).astype("int32"), device=dev)
         with tvm.target.Target(target):
             te_input = tvm.te.placeholder(shape=data.shape, dtype=str(data.dtype))
             outs = fcompute(te_input, True)
@@ -106,6 +106,6 @@ def check_unique(data, is_sorted=False):


 if __name__ == "__main__":
-    test_unique(tvm.context("cpu"), tvm.target.Target("llvm"))
-    test_unique(tvm.context("cuda"), tvm.target.Target("cuda"))
-    test_unique(tvm.context("nvptx"), tvm.target.Target("nvptx"))
+    test_unique(tvm.device("cpu"), tvm.target.Target("llvm"))
+    test_unique(tvm.device("cuda"), tvm.target.Target("cuda"))
+    test_unique(tvm.device("nvptx"), tvm.target.Target("nvptx"))
diff --git a/tests/python/topi/python/test_topi_upsampling.py b/tests/python/topi/python/test_topi_upsampling.py
index 0408220bfd65..590043760820 100644
--- a/tests/python/topi/python/test_topi_upsampling.py
+++ b/tests/python/topi/python/test_topi_upsampling.py
@@ -84,19 +84,19 @@ def verify_upsampling(
     else:
         b_np = tvm.topi.testing.upsampling_python(a_np, (scale_h, scale_w), layout)

-    def check_device(device, ctx):
-        print("Running on target: %s" % device)
-        with tvm.target.Target(device):
-            s = tvm.topi.testing.get_injective_schedule(device)(B)
-            a = tvm.nd.array(a_np, ctx)
-            b = tvm.nd.array(np.zeros(out_shape, dtype=dtype), ctx)
-            f = tvm.build(s, [A, B], device)
+    def check_target(target, dev):
+        print("Running on target: %s" % target)
+        with tvm.target.Target(target):
+            s = tvm.topi.testing.get_injective_schedule(target)(B)
+            a = tvm.nd.array(a_np, dev)
+            b = tvm.nd.array(np.zeros(out_shape, dtype=dtype), dev)
+            f = tvm.build(s, [A, B], target)
             f(a, b)

         tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5, atol=1e-5)

-    for device, ctx in tvm.testing.enabled_targets():
-        check_device(device, ctx)
+    for target, dev in tvm.testing.enabled_targets():
+        check_target(target, dev)


 @tvm.testing.uses_gpu
@@ -228,19 +228,19 @@ def verify_upsampling3d(
     else:
         b_np = tvm.topi.testing.upsampling3d_python(a_np, (scale_d, scale_h, scale_w), layout)

-    def check_device(device, ctx):
-        print("Running on target: %s" % device)
-        with tvm.target.Target(device):
-            s = tvm.topi.testing.get_injective_schedule(device)(B)
-            a = tvm.nd.array(a_np, ctx)
-            b = tvm.nd.array(np.zeros(out_shape, dtype=dtype), ctx)
-            f = tvm.build(s, [A, B], device)
+    def check_target(target, dev):
+        print("Running on target: %s" % target)
+        with tvm.target.Target(target):
+            s = tvm.topi.testing.get_injective_schedule(target)(B)
+            a = tvm.nd.array(a_np, dev)
+            b = tvm.nd.array(np.zeros(out_shape, dtype=dtype), dev)
+            f = tvm.build(s, [A, B], target)
             f(a, b)

         tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5, atol=1e-5)

-    for device, ctx in tvm.testing.enabled_targets():
-        check_device(device, ctx)
+    for target, dev in tvm.testing.enabled_targets():
+        check_target(target, dev)


 @tvm.testing.uses_gpu
diff --git a/tests/python/topi/python/test_topi_vision.py b/tests/python/topi/python/test_topi_vision.py
index 2fdf3cf4b170..7f8712c55fd1 100644
--- a/tests/python/topi/python/test_topi_vision.py
+++ b/tests/python/topi/python/test_topi_vision.py
@@ -89,31 +89,31 @@ def verify_get_valid_counts(dshape, score_threshold, id_index, score_index):
                     np_out2[i, j, k] = -1.0
                 np_out3[i, j] = -1

-    def check_device(device):
-        ctx = tvm.context(device, 0)
-        if not tvm.testing.device_enabled(device):
-            print("Skip because %s is not enabled" % device)
+    def check_device(target):
+        dev = tvm.device(target, 0)
+        if not tvm.testing.device_enabled(target):
+            print("Skip because %s is not enabled" % target)
             return
-        print("Running on target: %s" % device)
-        with tvm.target.Target(device):
-            fcompute, fschedule = tvm.topi.testing.dispatch(device, _get_valid_counts_implement)
+        print("Running on target: %s" % target)
+        with tvm.target.Target(target):
+            fcompute, fschedule = tvm.topi.testing.dispatch(target, _get_valid_counts_implement)
             data = te.placeholder(dshape, name="data", dtype=dtype)
             outs = fcompute(data, score_threshold, id_index, score_index)
             s = fschedule(outs)

-        tvm_input_data = tvm.nd.array(np_data, ctx)
-        tvm_out1 = tvm.nd.array(np.zeros(np_out1.shape, dtype="int32"), ctx)
-        tvm_out2 = tvm.nd.array(np.zeros(np_out2.shape, dtype=dtype), ctx)
-        tvm_out3 = tvm.nd.array(np.zeros(np_out3.shape, dtype="int32"), ctx)
+        tvm_input_data = tvm.nd.array(np_data, dev)
+        tvm_out1 = tvm.nd.array(np.zeros(np_out1.shape, dtype="int32"), dev)
+        tvm_out2 = tvm.nd.array(np.zeros(np_out2.shape, dtype=dtype), dev)
+        tvm_out3 = tvm.nd.array(np.zeros(np_out3.shape, dtype="int32"), dev)

-        f = tvm.build(s, [data, outs[0], outs[1], outs[2]], device)
+        f = tvm.build(s, [data, outs[0], outs[1], outs[2]], target)
         f(tvm_input_data, tvm_out1, tvm_out2, tvm_out3)
         tvm.testing.assert_allclose(tvm_out1.asnumpy(), np_out1, rtol=1e-3)
         tvm.testing.assert_allclose(tvm_out2.asnumpy(), np_out2, rtol=1e-3)
         tvm.testing.assert_allclose(tvm_out3.asnumpy(), np_out3, rtol=1e-3)

-    for device in ["llvm", "cuda", "opencl", "vulkan"]:
-        check_device(device)
+    for target in ["llvm", "cuda", "opencl", "vulkan"]:
+        check_device(target)


 @tvm.testing.uses_gpu
@@ -146,14 +146,14 @@ def verify_non_max_suppression(
     valid_count = te.placeholder((batch,), dtype="int32", name="valid_count")
     indices = te.placeholder((batch, num_anchors), dtype="int32", name="indices")

-    def check_device(device):
-        ctx = tvm.context(device, 0)
-        if not tvm.testing.device_enabled(device):
-            print("Skip because %s is not enabled" % device)
+    def check_device(target):
+        dev = tvm.device(target, 0)
+        if not tvm.testing.device_enabled(target):
+            print("Skip because %s is not enabled" % target)
             return
-        print("Running on target: %s" % device)
-        with tvm.target.Target(device):
-            fcompute, fschedule = tvm.topi.testing.dispatch(device, _nms_implement)
+        print("Running on target: %s" % target)
+        with tvm.target.Target(target):
+            fcompute, fschedule = tvm.topi.testing.dispatch(target, _nms_implement)
             out = fcompute(
                 data,
                 valid_count,
@@ -183,22 +183,22 @@ def check_device(device):
             s = fschedule(out)
             indices_s = fschedule(indices_out)

-        tvm_data = tvm.nd.array(np_data, ctx)
-        tvm_valid_count = tvm.nd.array(np_valid_count, ctx)
-        tvm_indices = tvm.nd.array(np_indices, ctx)
+        tvm_data = tvm.nd.array(np_data, dev)
+        tvm_valid_count = tvm.nd.array(np_valid_count, dev)
+        tvm_indices = tvm.nd.array(np_indices, dev)

-        tvm_out = tvm.nd.array(np.zeros(dshape, dtype=data.dtype), ctx)
-        f = tvm.build(s, [data, valid_count, indices, out], device)
+        tvm_out = tvm.nd.array(np.zeros(dshape, dtype=data.dtype), dev)
+        f = tvm.build(s, [data, valid_count, indices, out], target)
         f(tvm_data, tvm_valid_count, tvm_indices, tvm_out)
         tvm.testing.assert_allclose(tvm_out.asnumpy(), np_result, rtol=1e-4)

-        tvm_indices_out = tvm.nd.array(np.zeros(indices_dshape, dtype="int32"), ctx)
-        f = tvm.build(indices_s, [data, valid_count, indices, indices_out[0]], device)
+        tvm_indices_out = tvm.nd.array(np.zeros(indices_dshape, dtype="int32"), dev)
+        f = tvm.build(indices_s, [data, valid_count, indices, indices_out[0]], target)
         f(tvm_data, tvm_valid_count, tvm_indices, tvm_indices_out)
         tvm.testing.assert_allclose(tvm_indices_out.asnumpy(), np_indices_result, rtol=1e-4)

-    for device in ["llvm", "cuda", "opencl", "nvptx"]:
-        check_device(device)
+    for target in ["llvm", "cuda", "opencl", "nvptx"]:
+        check_device(target)


 @tvm.testing.uses_gpu
@@ -339,26 +339,26 @@ def verify_multibox_prior(
     if clip:
         np_out = np.clip(np_out, 0, 1)

-    def check_device(device):
-        ctx = tvm.context(device, 0)
-        if not tvm.testing.device_enabled(device):
-            print("Skip because %s is not enabled" % device)
+    def check_device(target):
+        dev = tvm.device(target, 0)
+        if not tvm.testing.device_enabled(target):
+            print("Skip because %s is not enabled" % target)
             return
-        print("Running on target: %s" % device)
+        print("Running on target: %s" % target)

-        fcompute, fschedule = tvm.topi.testing.dispatch(device, _multibox_prior_implement)
-        with tvm.target.Target(device):
+        fcompute, fschedule = tvm.topi.testing.dispatch(target, _multibox_prior_implement)
+        with tvm.target.Target(target):
             out = fcompute(data, sizes, ratios, steps, offsets, clip)
             s = fschedule(out)

-        tvm_input_data = tvm.nd.array(input_data, ctx)
-        tvm_out = tvm.nd.array(np.zeros(oshape, dtype=dtype), ctx)
-        f = tvm.build(s, [data, out], device)
+        tvm_input_data = tvm.nd.array(input_data, dev)
+        tvm_out = tvm.nd.array(np.zeros(oshape, dtype=dtype), dev)
+        f = tvm.build(s, [data, out], target)
         f(tvm_input_data, tvm_out)
         tvm.testing.assert_allclose(tvm_out.asnumpy(), np_out, rtol=1e-3)

-    for device in ["llvm", "opencl", "cuda"]:
-        check_device(device)
+    for target in ["llvm", "opencl", "cuda"]:
+        check_device(target)


 @tvm.testing.uses_gpu
@@ -394,28 +394,28 @@ def test_multibox_detection():
         ]
     )

-    def check_device(device):
-        ctx = tvm.context(device, 0)
-        if not tvm.testing.device_enabled(device):
-            print("Skip because %s is not enabled" % device)
+    def check_device(target):
+        dev = tvm.device(target, 0)
+        if not tvm.testing.device_enabled(target):
+            print("Skip because %s is not enabled" % target)
             return
-        print("Running on target: %s" % device)
+        print("Running on target: %s" % target)

-        fcompute, fschedule = tvm.topi.testing.dispatch(device, _multibox_detection_implement)
-        with tvm.target.Target(device):
+        fcompute, fschedule = tvm.topi.testing.dispatch(target, _multibox_detection_implement)
+        with tvm.target.Target(target):
             out = fcompute(cls_prob, loc_preds, anchors)
             s = fschedule(out)

-        tvm_cls_prob = tvm.nd.array(np_cls_prob.astype(cls_prob.dtype), ctx)
-        tvm_loc_preds = tvm.nd.array(np_loc_preds.astype(loc_preds.dtype), ctx)
-        tvm_anchors = tvm.nd.array(np_anchors.astype(anchors.dtype), ctx)
-        tvm_out = tvm.nd.array(np.zeros((batch_size, num_anchors, 6)).astype(out.dtype), ctx)
-        f = tvm.build(s, [cls_prob, loc_preds, anchors, out], device)
+        tvm_cls_prob = tvm.nd.array(np_cls_prob.astype(cls_prob.dtype), dev)
+        tvm_loc_preds = tvm.nd.array(np_loc_preds.astype(loc_preds.dtype), dev)
+        tvm_anchors = tvm.nd.array(np_anchors.astype(anchors.dtype), dev)
+        tvm_out = tvm.nd.array(np.zeros((batch_size, num_anchors, 6)).astype(out.dtype), dev)
+        f = tvm.build(s, [cls_prob, loc_preds, anchors, out], target)
         f(tvm_cls_prob, tvm_loc_preds, tvm_anchors, tvm_out)
         tvm.testing.assert_allclose(tvm_out.asnumpy(), expected_np_out, rtol=1e-4)

-    for device in ["llvm", "opencl", "cuda"]:
-        check_device(device)
+    for target in ["llvm", "opencl", "cuda"]:
+        check_device(target)


 def verify_roi_align(
@@ -445,13 +445,13 @@ def get_ref_data():

     a_np, rois_np, b_np = get_ref_data()

-    def check_device(device):
-        ctx = tvm.context(device, 0)
-        if not tvm.testing.device_enabled(device):
-            print("Skip because %s is not enabled" % device)
+    def check_device(target):
+        dev = tvm.device(target, 0)
+        if not tvm.testing.device_enabled(target):
+            print("Skip because %s is not enabled" % target)
             return
-        with tvm.target.Target(device):
-            fcompute, fschedule = tvm.topi.testing.dispatch(device, _roi_align_implement)
+        with tvm.target.Target(target):
+            fcompute, fschedule = tvm.topi.testing.dispatch(target, _roi_align_implement)
             b = fcompute(
                 a,
                 rois,
@@ -462,16 +462,16 @@ def check_device(device):
             )
             s = fschedule(b)

-        tvm_a = tvm.nd.array(a_np, ctx)
-        tvm_rois = tvm.nd.array(rois_np, ctx)
-        tvm_b = tvm.nd.array(np.zeros(get_const_tuple(b.shape), dtype=b.dtype), ctx=ctx)
-        f = tvm.build(s, [a, rois, b], device)
+        tvm_a = tvm.nd.array(a_np, dev)
+        tvm_rois = tvm.nd.array(rois_np, dev)
+        tvm_b = tvm.nd.array(np.zeros(get_const_tuple(b.shape), dtype=b.dtype), device=dev)
+        f = tvm.build(s, [a, rois, b], target)
         f(tvm_a, tvm_rois, tvm_b)
         tvm_val = tvm_b.asnumpy()
         tvm.testing.assert_allclose(tvm_val, b_np, rtol=1e-3, atol=1e-4)

-    for device in ["llvm", "cuda", "opencl"]:
-        check_device(device)
+    for target in ["llvm", "cuda", "opencl"]:
+        check_device(target)


 @tvm.testing.uses_gpu
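[Note] The vision tests iterate over an explicit target list instead of `tvm.testing.enabled_targets()`, so each `check_device` opens with the same guard. Distilled into a standalone sketch (illustrative only, not a hunk from the patch):

    import tvm
    import tvm.testing

    def check_device(target):
        dev = tvm.device(target, 0)  # formerly tvm.context(target, 0)
        if not tvm.testing.device_enabled(target):
            print("Skip because %s is not enabled" % target)
            return
        print("Running on target: %s" % target)
        # ... build with `target`, allocate NDArrays on `dev`, then run ...

    for target in ["llvm", "cuda", "opencl"]:
        check_device(target)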
@@ -506,29 +506,29 @@ def get_ref_data():

     a_np, rois_np, b_np = get_ref_data()

-    def check_device(device):
-        ctx = tvm.context(device, 0)
-        if not tvm.testing.device_enabled(device):
-            print("Skip because %s is not enabled" % device)
+    def check_device(target):
+        dev = tvm.device(target, 0)
+        if not tvm.testing.device_enabled(target):
+            print("Skip because %s is not enabled" % target)
             return
-        print("Running on target: %s" % device)
+        print("Running on target: %s" % target)

-        with tvm.target.Target(device):
+        with tvm.target.Target(target):
             b = topi.vision.rcnn.roi_pool_nchw(
                 a, rois, pooled_size=pooled_size, spatial_scale=spatial_scale
             )
-            s_func = tvm.topi.testing.dispatch(device, _roi_pool_schedule)
+            s_func = tvm.topi.testing.dispatch(target, _roi_pool_schedule)
             s = s_func(b)

-        tvm_a = tvm.nd.array(a_np, ctx)
-        tvm_rois = tvm.nd.array(rois_np, ctx)
-        tvm_b = tvm.nd.array(np.zeros(get_const_tuple(b.shape), dtype=b.dtype), ctx=ctx)
-        f = tvm.build(s, [a, rois, b], device)
+        tvm_a = tvm.nd.array(a_np, dev)
+        tvm_rois = tvm.nd.array(rois_np, dev)
+        tvm_b = tvm.nd.array(np.zeros(get_const_tuple(b.shape), dtype=b.dtype), device=dev)
+        f = tvm.build(s, [a, rois, b], target)
         f(tvm_a, tvm_rois, tvm_b)

         tvm.testing.assert_allclose(tvm_b.asnumpy(), b_np, rtol=1e-4)

-    for device in ["cuda", "llvm"]:
-        check_device(device)
+    for target in ["cuda", "llvm"]:
+        check_device(target)


 @tvm.testing.uses_gpu
@@ -542,26 +542,26 @@ def verify_proposal(np_cls_prob, np_bbox_pred, np_im_info, np_out, attrs):
     bbox_pred = te.placeholder(np_bbox_pred.shape)
     im_info = te.placeholder(np_im_info.shape)

-    def check_device(device):
-        ctx = tvm.context(device, 0)
-        if not tvm.testing.device_enabled(device):
-            print("Skip because %s is not enabled" % device)
+    def check_device(target):
+        dev = tvm.device(target, 0)
+        if not tvm.testing.device_enabled(target):
+            print("Skip because %s is not enabled" % target)
             return
-        print("Running on target: %s" % device)
-        with tvm.target.Target(device):
-            fcompute, fschedule = tvm.topi.testing.dispatch(device, _proposal_implement)
+        print("Running on target: %s" % target)
+        with tvm.target.Target(target):
+            fcompute, fschedule = tvm.topi.testing.dispatch(target, _proposal_implement)
             out = fcompute(cls_prob, bbox_pred, im_info, **attrs)
             s = fschedule(out)
-            f = tvm.build(s, [cls_prob, bbox_pred, im_info, out], device)
-            tvm_cls_prob = tvm.nd.array(np_cls_prob, ctx=ctx)
-            tvm_bbox_pred = tvm.nd.array(np_bbox_pred, ctx=ctx)
-            tvm_im_info = tvm.nd.array(np_im_info, ctx=ctx)
-            tvm_out = tvm.nd.empty(ctx=ctx, shape=out.shape, dtype=out.dtype)
+            f = tvm.build(s, [cls_prob, bbox_pred, im_info, out], target)
+            tvm_cls_prob = tvm.nd.array(np_cls_prob, device=dev)
+            tvm_bbox_pred = tvm.nd.array(np_bbox_pred, device=dev)
+            tvm_im_info = tvm.nd.array(np_im_info, device=dev)
+            tvm_out = tvm.nd.empty(device=dev, shape=out.shape, dtype=out.dtype)
             f(tvm_cls_prob, tvm_bbox_pred, tvm_im_info, tvm_out)
             tvm.testing.assert_allclose(tvm_out.asnumpy(), np_out, rtol=1e-4)

-    for device in ["llvm", "cuda"]:
-        check_device(device)
+    for target in ["llvm", "cuda"]:
+        check_device(target)


 @tvm.testing.uses_gpu
diff --git a/tests/python/unittest/test_arith_iter_affine_map.py b/tests/python/unittest/test_arith_iter_affine_map.py
index 6ab61fdd9592..5ce68aaaf51b 100644
--- a/tests/python/unittest/test_arith_iter_affine_map.py
+++ b/tests/python/unittest/test_arith_iter_affine_map.py
@@ -19,13 +19,13 @@
 from tvm import te


-def ifuse(inputs):
+def ifuse(inputs, pred_extent=None):
     """Fuse iterators"""
     value, extent = 0, 1
     for i, ext in inputs:
         value = value * ext + i
         extent = extent * ext
-    return (value, extent)
+    return value, extent if pred_extent is None else pred_extent


 def isplit(axis, factor):
@@ -67,7 +67,9 @@ def test_trivial():
     assert_iter_sum_pattern(res[2], 1, 3)

     res = tvm.arith.detect_iter_map([x[0], 3], var_dom([x, y]))
-    assert len(res) == 0
+    assert len(res) == 2
+    assert_iter_sum_pattern(res[0], 3, 0)
+    assert_iter_sum_pattern(res[1], 1, 3)

     # not independent
     res = tvm.arith.detect_iter_map([x[0], x[0], 3], var_dom([x, y]))
@@ -79,8 +81,6 @@ def test_fuse():
     y = tvm.tir.Var("y", "int32")
     c = tvm.tir.SizeVar("c", "int32")
     c0 = tvm.tir.SizeVar("c0", "int32")
-    c1 = tvm.tir.SizeVar("c1", "int32")
-    c2 = tvm.tir.SizeVar("c1", "int32")

     res = tvm.arith.detect_iter_map([y * 3 + 1 + c + x], var_dom([(x, 3), (y, 4)]))
     assert len(res) == 1
@@ -121,10 +121,8 @@ def test_fuse():
 def test_split():
     x = tvm.tir.Var("x", "int32")
     y = tvm.tir.Var("y", "int32")
-    z = tvm.tir.Var("y", "int32")
     c0 = tvm.tir.SizeVar("c0", "int32")
     c1 = tvm.tir.SizeVar("c1", "int32")
-    c2 = tvm.tir.SizeVar("c1", "int32")
     fld = tvm.tir.floordiv
     flm = tvm.tir.floormod

@@ -196,8 +194,121 @@ def test_compound():
     tvm.ir.assert_structural_equal(sz, res[0])


+def test_predicate():
+    x = tvm.tir.Var("x", "int32"), 13
+    y = tvm.tir.Var("y", "int32"), 10
+
+    res = tvm.arith.detect_iter_map([x[0] * 10 + y[0]], var_dom([x, y]), x[0] * 10 + y[0] < 128)
+
+    assert len(res) == 1
+    assert_iter_sum_pattern(res[0], 128, 0)
+
+    # duplicate constraint
+    res = tvm.arith.detect_iter_map(
+        [x[0] * 10 + y[0]],
+        var_dom([x, y]),
+        tvm.tir.all(x[0] * 10 + y[0] < 128, x[0] * 10 + y[0] < 64),
+    )
+
+    assert len(res) == 1
+    assert_iter_sum_pattern(res[0], 64, 0)
+
+    # useless constraint
+    res = tvm.arith.detect_iter_map([x[0] * 10 + y[0]], var_dom([x, y]), x[0] * 10 + y[0] < 140)
+
+    assert len(res) == 1
+    assert_iter_sum_pattern(res[0], 130, 0)
+
+    i1 = tvm.tir.Var("i1", "int32"), 7
+    i2 = tvm.tir.Var("i2", "int32"), 2
+    i3 = tvm.tir.Var("i3", "int32"), 4
+    i4 = tvm.tir.Var("i4", "int32"), 3
+    res = tvm.arith.detect_iter_map(
+        [i1[0] * 20 + i2[0] * 10 + i3[0] * 3 + i4[0]],
+        var_dom([i1, i2, i3, i4]),
+        (
+            tvm.tir.all(
+                i1[0] * 2 + i2[0] < 13,
+                i1[0] * 20 + i2[0] * 10 + i3[0] * 3 + i4[0] < 128,
+                i3[0] * 3 + i4[0] < 10,
+            )
+        ),
+    )
+    assert len(res) == 1
+    assert_iter_sum_pattern(res[0], 128, 0)
+
+    i1 = tvm.tir.Var("i1", "int32"), 7
+    i2 = tvm.tir.Var("i2", "int32"), 2
+    i3 = tvm.tir.Var("i3", "int32"), 4
+    i4 = tvm.tir.Var("i4", "int32"), 3
+
+    # wrong constraint
+    res = tvm.arith.detect_iter_map(
+        [i1[0] * 20 + i2[0] * 10 + i3[0] * 3 + i4[0]],
+        var_dom([i1, i2, i3, i4]),
+        (
+            tvm.tir.all(
+                i1[0] * 2 + i2[0] < 13,
+                i1[0] * 20 + i2[0] * 10 + i3[0] * 3 + i4[0] < 128,
+                i3[0] * 3 + i4[0] < 7,
+            )
+        ),
+    )
+    assert len(res) == 0
+
+    # incompatible constraint
+    res = tvm.arith.detect_iter_map(
+        [i1[0] * 20 + i2[0] * 10 + i3[0] * 3 + i4[0]],
+        var_dom([i1, i2, i3, i4]),
+        (
+            tvm.tir.all(
+                i1[0] * 2 + i2[0] < 13,
+                i1[0] * 20 + i2[0] * 10 + i3[0] * 3 + i4[0] < 128,
+                i3[0] * 3 + i4[0] < 10,
+                i1[0] * 4 + i3[0] < 20,
+            )
+        ),
+    )
+    assert len(res) == 0
+
+    res = tvm.arith.detect_iter_map(
+        [i1[0] * 20 + i2[0] * 10 + i3[0] * 3 + i4[0]],
+        var_dom([i1, i2, i3, i4]),
+        (
+            tvm.tir.all(
+                i1[0] * 2 + i2[0] < 13,
+                i1[0] * 20 + i2[0] * 10 + i3[0] * 3 + i4[0] < 128,
+                i1[0] * 4 + i3[0] < 20,
+            )
+        ),
+    )
+    assert len(res) == 0
+
+
+def test_normalize_iter_map_to_expr():
+    fld = tvm.tir.floordiv
+    flm = tvm.tir.floormod
+
+    x = tvm.tir.Var("x", "int32"), 10
+    y = tvm.tir.Var("y", "int32"), 9
+
+    xo, xi = isplit(x, 5)
+    yo, yi = isplit(y, 3)
+    z = ifuse([yo, xo, yi])
+
+    res = tvm.arith.detect_iter_map([z[0], xi[0]], var_dom([x, y]))
+
+    tvm.ir.assert_structural_equal(
+        tvm.arith.normalize_iter_map_to_expr(res[0]),
+        fld(y[0], 3) * 6 + fld(x[0], 5) * 3 + flm(y[0], 3),
+    )
+    tvm.ir.assert_structural_equal(tvm.arith.normalize_iter_map_to_expr(res[1]), flm(x[0], 5))
+
+
 if __name__ == "__main__":
     test_split()
     test_trivial()
     test_fuse()
     test_compound()
+    test_predicate()
+    test_normalize_iter_map_to_expr()
diff --git a/tests/python/unittest/test_auto_scheduler_layout_rewrite.py b/tests/python/unittest/test_auto_scheduler_layout_rewrite.py
index 795c3cb3b0a2..91430599124e 100644
--- a/tests/python/unittest/test_auto_scheduler_layout_rewrite.py
+++ b/tests/python/unittest/test_auto_scheduler_layout_rewrite.py
@@ -119,16 +119,16 @@ def test_correctness_layout_rewrite_rewrite_for_preTransformed():
         func = tvm.build(s, bufs, target=target)
         func_ref = tvm.build(s_ref, bufs_ref, target=target)

-        ctx = tvm.context(str(target))
-        ctx_ref = tvm.cpu()
+        dev = tvm.device(str(target))
+        dev_ref = tvm.cpu()

-        args = [tvm.nd.array(x, ctx=ctx) for x in np_args]
-        args_ref = [tvm.nd.array(x, ctx=ctx_ref) for x in np_args_ref]
-        ctx.sync()
+        args = [tvm.nd.array(x, device=dev) for x in np_args]
+        args_ref = [tvm.nd.array(x, device=dev_ref) for x in np_args_ref]
+        dev.sync()

         func(*args)
         func_ref(*args_ref)
-        ctx.sync()
+        dev.sync()

         tvm.testing.assert_allclose(args[0].asnumpy(), args_ref[0].asnumpy(), atol=1e-3, rtol=1e-3)
         tvm.testing.assert_allclose(args[2].asnumpy(), args_ref[2].asnumpy(), atol=1e-3, rtol=1e-3)
@@ -166,16 +166,16 @@ def test_correctness_layout_rewrite_insert_transform_stage():
         func = tvm.build(s, bufs, target=target)
         func_ref = tvm.build(s_ref, bufs_ref, target=target)

-        ctx = tvm.context(str(target))
-        ctx_ref = tvm.cpu()
+        dev = tvm.device(str(target))
+        dev_ref = tvm.cpu()

-        args = [tvm.nd.array(x, ctx=ctx) for x in np_args]
-        args_ref = [tvm.nd.array(x, ctx=ctx_ref) for x in np_args]
-        ctx.sync()
+        args = [tvm.nd.array(x, device=dev) for x in np_args]
+        args_ref = [tvm.nd.array(x, device=dev_ref) for x in np_args]
+        dev.sync()

         func(*args)
         func_ref(*args_ref)
-        ctx.sync()
+        dev.sync()

         tvm.testing.assert_allclose(args[0].asnumpy(), args_ref[0].asnumpy(), atol=1e-3, rtol=1e-3)
         tvm.testing.assert_allclose(args[1].asnumpy(), args_ref[1].asnumpy(), atol=1e-3, rtol=1e-3)
diff --git a/tests/python/unittest/test_auto_scheduler_measure.py b/tests/python/unittest/test_auto_scheduler_measure.py
index 1f141a2cfd00..d82cfd447a40 100644
--- a/tests/python/unittest/test_auto_scheduler_measure.py
+++ b/tests/python/unittest/test_auto_scheduler_measure.py
@@ -355,7 +355,7 @@ def test_measure_target_host():


 @tvm.testing.requires_llvm
-def test_measure_special_inputs_map_by_name():
+def test_measure_special_inputs_map_by_name_local_runner():
     @auto_scheduler.register_workload
     def foo():
         X = te.placeholder(shape=[10], dtype="int32")
@@ -382,6 +382,38 @@ def foo():
     assert mress[0].error_no == 0


+@tvm.testing.requires_llvm
+def test_measure_special_inputs_map_by_name_rpc_runner():
+    @auto_scheduler.register_workload
+    def foo():
+        X = te.placeholder(shape=[10], dtype="int32")
+        Index = te.placeholder(shape=[1], dtype="int32", name="Index")
+        Y = te.compute((1,), lambda i: X[Index[i]])
+        return [X, Index, Y]
+
+    # This workload cannot use random input for the `Index` input
+    task = auto_scheduler.SearchTask(
+        func=foo,
+        target="llvm",
+        task_inputs={
+            "Index": tvm.nd.array(np.array([5], dtype="int32")),
+        },
+    )
+
+    for enable_cpu_cache_flush in [True, False]:
+        minp = auto_scheduler.MeasureInput(task, task.compute_dag.init_state)
+        local_builder = auto_scheduler.LocalBuilder()
+        measure_ctx = auto_scheduler.LocalRPCMeasureContext(
+            timeout=60, enable_cpu_cache_flush=enable_cpu_cache_flush
+        )
+        rpc_runner = measure_ctx.runner
+
+        bress = local_builder.build([minp])
+        assert bress[0].error_no == 0
+        mress = rpc_runner.run([minp], bress)
+        assert mress[0].error_no == 0
+
+
 if __name__ == "__main__":
     test_record_split_reorder_fuse_annotation()
     test_record_compute_at_root_inline_cache_read_write()
@@ -393,4 +425,5 @@ def foo():
     test_dag_measure_local_builder_runner()
     test_measure_local_builder_rpc_runner()
     test_measure_target_host()
-    test_measure_special_inputs_map_by_name()
+    test_measure_special_inputs_map_by_name_local_runner()
+    test_measure_special_inputs_map_by_name_rpc_runner()
diff --git a/tests/python/unittest/test_auto_scheduler_search_policy.py b/tests/python/unittest/test_auto_scheduler_search_policy.py
index 30aafbd22390..32245d9bba81 100644
--- a/tests/python/unittest/test_auto_scheduler_search_policy.py
+++ b/tests/python/unittest/test_auto_scheduler_search_policy.py
@@ -95,7 +95,7 @@ def search_common(
         sch, args = task.compute_dag.apply_steps_from_state(task.compute_dag.init_state)
         mod_ref = tvm.build(sch, args, "llvm")

-        ctx = tvm.context(str(target), 0)
+        ctx = tvm.device(str(target), 0)
         np_arrays = [np.random.uniform(size=get_const_tuple(x.shape)).astype(x.dtype) for x in args]

         tvm_arrays = [tvm.nd.array(x, ctx) for x in np_arrays]
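[Note] The next diff adds a non-regression case to `test_get_direct_ancestor` built around a diamond-shaped graph, where two unary branches of the same input reconverge in a single `add`. For reference, a standalone sketch of the graph that test constructs (same shape as in the test):

    from tvm import relay

    data = relay.var("data", shape=(1, 16, 224, 224))
    out = relay.add(relay.log(data), relay.sqrt(data))
    net = relay.Function(relay.analysis.free_vars(out), out)
    # node 3 (the add) should trace back through both unary branches to
    # node 0 (the input), hence the expected ancestor list [0]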
% str(out) + def test_get_in_nodes(): data = relay.var("data") diff --git a/tests/python/unittest/test_crt.py b/tests/python/unittest/test_crt.py index def396dad72c..6d678b8a3753 100644 --- a/tests/python/unittest/test_crt.py +++ b/tests/python/unittest/test_crt.py @@ -93,11 +93,11 @@ def test_compile_runtime(): workspace = tvm.micro.Workspace() with _make_add_sess(workspace) as sess: - A_data = tvm.nd.array(np.array([2, 3], dtype="int8"), ctx=sess.context) + A_data = tvm.nd.array(np.array([2, 3], dtype="int8"), device=sess.device) assert (A_data.asnumpy() == np.array([2, 3])).all() - B_data = tvm.nd.array(np.array([4], dtype="int8"), ctx=sess.context) + B_data = tvm.nd.array(np.array([4], dtype="int8"), device=sess.device) assert (B_data.asnumpy() == np.array([4])).all() - C_data = tvm.nd.array(np.array([0, 0], dtype="int8"), ctx=sess.context) + C_data = tvm.nd.array(np.array([0, 0], dtype="int8"), device=sess.device) assert (C_data.asnumpy() == np.array([0, 0])).all() system_lib = sess.get_system_lib() @@ -139,8 +139,8 @@ def test_reset(): @tvm.testing.requires_micro -def test_graph_runtime(): - """Test use of the graph runtime with microTVM.""" +def test_graph_executor(): + """Test use of the graph executor with microTVM.""" import tvm.micro workspace = tvm.micro.Workspace(debug=True) @@ -157,12 +157,12 @@ def @main(%a : Tensor[(1, 2), uint8], %b : Tensor[(1, 2), uint8]) { factory = tvm.relay.build(relay_mod, target=TARGET) with _make_session(workspace, factory.get_lib()) as sess: - graph_mod = tvm.micro.create_local_graph_runtime( - factory.get_json(), sess.get_system_lib(), sess.context + graph_mod = tvm.micro.create_local_graph_executor( + factory.get_json(), sess.get_system_lib(), sess.device ) - A_data = tvm.nd.array(np.array([2, 3], dtype="uint8"), ctx=sess.context) + A_data = tvm.nd.array(np.array([2, 3], dtype="uint8"), device=sess.device) assert (A_data.asnumpy() == np.array([2, 3])).all() - B_data = tvm.nd.array(np.array([4, 7], dtype="uint8"), ctx=sess.context) + B_data = tvm.nd.array(np.array([4, 7], dtype="uint8"), device=sess.device) assert (B_data.asnumpy() == np.array([4, 7])).all() graph_mod.run(a=A_data, b=B_data) @@ -179,11 +179,11 @@ def test_std_math_functions(): workspace = tvm.micro.Workspace() with _make_add_sess(workspace) as sess: - A_data = tvm.nd.array(np.array([2, 3], dtype="int8"), ctx=sess.context) + A_data = tvm.nd.array(np.array([2, 3], dtype="int8"), device=sess.device) assert (A_data.asnumpy() == np.array([2, 3])).all() - B_data = tvm.nd.array(np.array([4], dtype="int8"), ctx=sess.context) + B_data = tvm.nd.array(np.array([4], dtype="int8"), device=sess.device) assert (B_data.asnumpy() == np.array([4])).all() - C_data = tvm.nd.array(np.array([0, 0], dtype="int8"), ctx=sess.context) + C_data = tvm.nd.array(np.array([0, 0], dtype="int8"), device=sess.device) assert (C_data.asnumpy() == np.array([0, 0])).all() system_lib = sess.get_system_lib() @@ -195,8 +195,8 @@ def test_std_math_functions(): s = tvm.te.create_schedule(B.op) with _make_sess_from_op(workspace, "myexpf", s, [A, B]) as sess: - A_data = tvm.nd.array(np.array([2.0, 3.0], dtype="float32"), ctx=sess.context) - B_data = tvm.nd.array(np.array([2.0, 3.0], dtype="float32"), ctx=sess.context) + A_data = tvm.nd.array(np.array([2.0, 3.0], dtype="float32"), device=sess.device) + B_data = tvm.nd.array(np.array([2.0, 3.0], dtype="float32"), device=sess.device) lib = sess.get_system_lib() func = lib["myexpf"] func(A_data, B_data) @@ -214,11 +214,11 @@ def test_platform_timer(): s = 
tvm.te.create_schedule(B.op) with _make_sess_from_op(workspace, "myexpf", s, [A, B]) as sess: - A_data = tvm.nd.array(np.array([2.0, 3.0], dtype="float32"), ctx=sess.context) - B_data = tvm.nd.array(np.array([2.0, 3.0], dtype="float32"), ctx=sess.context) + A_data = tvm.nd.array(np.array([2.0, 3.0], dtype="float32"), device=sess.device) + B_data = tvm.nd.array(np.array([2.0, 3.0], dtype="float32"), device=sess.device) lib = sess.get_system_lib() time_eval_f = lib.time_evaluator( - "myexpf", sess.context, number=2000, repeat=3, min_repeat_ms=40 + "myexpf", sess.device, number=2000, repeat=3, min_repeat_ms=40 ) result = time_eval_f(A_data, B_data) assert result.mean > 0 diff --git a/tests/python/unittest/test_link_params.py b/tests/python/unittest/test_link_params.py index ffe859927ad7..3ad515604d0b 100644 --- a/tests/python/unittest/test_link_params.py +++ b/tests/python/unittest/test_link_params.py @@ -124,7 +124,7 @@ def _verify_linked_param(dtype, lib, mod, graph, name): """Directly read memory from the linked library to verify the linked parameter is correct.""" sid = _lookup_sid(graph, name) # NOTE: query_imports=True because when loading a module from disk (i.e. for C backend), - # a GraphRuntimeFactory module is created instead of the module itself. + # a GraphExecutorFactory module is created instead of the module itself. param_ptr = mod.get_function("_lookup_linked_param", True)(sid) gen_param = lib.params[name] arr_data = (_get_ctypes_dtype(dtype) * np.prod(gen_param.shape)).from_address(param_ptr.value) @@ -154,7 +154,7 @@ def _add_decl(name, dtype): f"def @main(%rand_input : Tensor[{INPUT_SHAPE}, {dtype}], { ', '.join(param_decls.values()) } ) {{", # This program ensures that GraphPlanMemory alternates between the same two storage IDs for a # while. In doing this, it ensures that param %{dtype}_b will be placed into the graph at an - # index unequal to its storage_id. This ensures that GraphRuntimeCodegen encodes the storage_id + # index unequal to its storage_id. This ensures that GraphExecutorCodegen encodes the storage_id # and not the parameter index into the graph. ( f' %0 = nn.conv2d(%rand_input, %{dtype}_a, data_layout="NCHW", kernel_layout="OIHW", ' @@ -206,7 +206,7 @@ def test_llvm_link_params(): # Wrap in function to explicitly deallocate the runtime. def _run_linked(lib, mod): graph_json, _, _ = lib - graph_rt = tvm.contrib.graph_runtime.create(graph_json, mod, tvm.cpu(0)) + graph_rt = tvm.contrib.graph_executor.create(graph_json, mod, tvm.cpu(0)) graph_rt.set_input("rand_input", rand_input) # NOTE: params not required. graph_rt.run() return graph_rt.get_output(0) @@ -218,7 +218,7 @@ def _run_linked(lib, mod): def _run_unlinked(lib): graph_json, mod, lowered_params = lib - graph_rt = tvm.contrib.graph_runtime.create(graph_json, mod, tvm.cpu(0)) + graph_rt = tvm.contrib.graph_executor.create(graph_json, mod, tvm.cpu(0)) graph_rt.set_input("rand_input", rand_input, **lowered_params) graph_rt.run() return graph_rt.get_output(0) @@ -316,7 +316,7 @@ def test_c_link_params(): # Wrap in function to explicitly deallocate the runtime. def _run_linked(lib_mod): - graph_rt = tvm.contrib.graph_runtime.GraphModule(lib_mod["default"](tvm.cpu(0))) + graph_rt = tvm.contrib.graph_executor.GraphModule(lib_mod["default"](tvm.cpu(0))) graph_rt.set_input("rand_input", rand_input) # NOTE: params not required. 
graph_rt.run() @@ -334,7 +334,7 @@ def _run_linked(lib_mod): lib_mod = tvm.runtime.load_module(lib_path) def _run_unlinked(lib_mod): - graph_rt = tvm.contrib.graph_runtime.GraphModule(lib_mod["default"](tvm.cpu(0))) + graph_rt = tvm.contrib.graph_executor.GraphModule(lib_mod["default"](tvm.cpu(0))) graph_rt.set_input("rand_input", rand_input, **params) graph_rt.run() return graph_rt.get_output(0) @@ -365,7 +365,7 @@ def test_crt_link_params(): opts = tvm.micro.default_options( os.path.join(tvm.micro.get_standalone_crt_dir(), "template", "host") ) - opts["bin_opts"]["ldflags"].append("-DTVM_HOST_USE_GRAPH_RUNTIME_MODULE") + opts["bin_opts"]["ldflags"].append("-DTVM_HOST_USE_GRAPH_EXECUTOR_MODULE") micro_binary = tvm.micro.build_static_runtime( workspace, @@ -374,7 +374,7 @@ def test_crt_link_params(): compiler_options=opts, extra_libs=[ tvm.micro.get_standalone_crt_lib(m) - for m in ("memory", "graph_runtime_module", "graph_runtime") + for m in ("memory", "graph_executor_module", "graph_executor") ], ) @@ -383,8 +383,8 @@ def test_crt_link_params(): } flasher = compiler.flasher(**flasher_kw) with tvm.micro.Session(binary=micro_binary, flasher=flasher) as sess: - graph_rt = tvm.micro.session.create_local_graph_runtime( - graph_json, sess.get_system_lib(), sess.context + graph_rt = tvm.micro.session.create_local_graph_executor( + graph_json, sess.get_system_lib(), sess.device ) # NOTE: not setting params here. @@ -397,7 +397,7 @@ def test_crt_link_params(): def _run_unlinked(lib): graph_json, mod, lowered_params = lib - graph_rt = tvm.contrib.graph_runtime.create(graph_json, mod, tvm.cpu(0)) + graph_rt = tvm.contrib.graph_executor.create(graph_json, mod, tvm.cpu(0)) graph_rt.set_input("rand_input", rand_input, **lowered_params) graph_rt.run() return graph_rt.get_output(0).asnumpy() diff --git a/tests/python/unittest/test_micro_model_library_format.py b/tests/python/unittest/test_micro_model_library_format.py index c999091cc3cc..db6c55bca12a 100644 --- a/tests/python/unittest/test_micro_model_library_format.py +++ b/tests/python/unittest/test_micro_model_library_format.py @@ -26,7 +26,7 @@ import tvm import tvm.relay -from tvm.relay.backend import graph_runtime_factory +from tvm.relay.backend import graph_executor_factory import tvm.runtime.module import tvm.testing from tvm.contrib import utils @@ -170,7 +170,7 @@ def @main(%a : Tensor[(1, 2), uint8], %b : Tensor[(1, 2), float32], %c : Tensor[ @tvm.testing.requires_micro def test_export_model(): module = tvm.support.FrontendTestModule() - factory = graph_runtime_factory.GraphRuntimeFactoryModule( + factory = graph_executor_factory.GraphExecutorFactoryModule( None, tvm.target.target.micro("host"), '"graph_json"', module, "test_module", {} ) diff --git a/tests/python/unittest/test_runtime_container.py b/tests/python/unittest/test_runtime_container.py index 3e89a51cc93a..4607892a5a4c 100644 --- a/tests/python/unittest/test_runtime_container.py +++ b/tests/python/unittest/test_runtime_container.py @@ -47,7 +47,7 @@ def test_tuple_object(): fn = relay.Function([x], relay.expr.TupleGetItem(x, 0)) mod = tvm.IRModule.from_expr(fn) - exe = relay.create_executor(kind="vm", mod=mod, ctx=nd.cpu(), target="llvm") + exe = relay.create_executor(kind="vm", mod=mod, device=nd.cpu(), target="llvm") f = exe.evaluate() value_tuple = _container.tuple_object([nd.array(np.array(11)), nd.array(np.array(12))]) # pass an ADT object to evaluate diff --git a/tests/python/unittest/test_runtime_graph.py b/tests/python/unittest/test_runtime_graph.py index 
16e9db42cba3..5f0c7837d4f5 100644 --- a/tests/python/unittest/test_runtime_graph.py +++ b/tests/python/unittest/test_runtime_graph.py @@ -20,7 +20,8 @@ import numpy as np import json from tvm import rpc -from tvm.contrib import utils, graph_runtime +from tvm import relay +from tvm.contrib import utils, graph_executor @tvm.testing.requires_llvm @@ -58,7 +59,7 @@ def test_graph_simple(): def check_verify(): mlib = tvm.build(s, [A, B], "llvm", name="myadd") - mod = graph_runtime.create(graph, mlib, tvm.cpu(0)) + mod = graph_executor.create(graph, mlib, tvm.cpu(0)) a = np.random.uniform(size=(n,)).astype(A.dtype) mod.run(x=a) out = mod.get_output(0, tvm.nd.empty((n,))) @@ -69,21 +70,19 @@ def check_remote(): server = rpc.Server("localhost") remote = rpc.connect(server.host, server.port) temp = utils.tempdir() - ctx = remote.cpu(0) + dev = remote.cpu(0) path_dso = temp.relpath("dev_lib.so") mlib.export_library(path_dso) remote.upload(path_dso) mlib = remote.load_module("dev_lib.so") - mod = graph_runtime.create(graph, mlib, remote.cpu(0)) + mod = graph_executor.create(graph, mlib, remote.cpu(0)) a = np.random.uniform(size=(n,)).astype(A.dtype) - mod.run(x=tvm.nd.array(a, ctx)) - out = tvm.nd.empty((n,), ctx=ctx) + mod.run(x=tvm.nd.array(a, dev)) + out = tvm.nd.empty((n,), device=dev) out = mod.get_output(0, out) np.testing.assert_equal(out.asnumpy(), a + 1) def check_sharing(): - from tvm import relay - x = relay.var("x", shape=(1, 10)) y = relay.var("y", shape=(1, 10)) z = relay.add(x, y) @@ -93,10 +92,10 @@ def check_sharing(): params = {"x": x_in} graph, lib, params = relay.build(func, target="llvm", params=params) - mod_shared = graph_runtime.create(graph, lib, tvm.cpu(0)) + mod_shared = graph_executor.create(graph, lib, tvm.cpu(0)) mod_shared.load_params(runtime.save_param_dict(params)) num_mods = 10 - mods = [graph_runtime.create(graph, lib, tvm.cpu(0)) for _ in range(num_mods)] + mods = [graph_executor.create(graph, lib, tvm.cpu(0)) for _ in range(num_mods)] for mod in mods: mod.share_params(mod_shared, runtime.save_param_dict(params)) @@ -120,5 +119,26 @@ def check_sharing(): check_sharing() +def test_load_unexpected_params(): + # Test whether graph_executor.load_params works if parameters + # are provided that are not an expected input. + mod = tvm.IRModule() + params = {} + x = relay.var("x", shape=(1, 10)) + y = relay.var("y", shape=(1, 10)) + z = relay.add(x, y) + mod["main"] = relay.Function([x, y], z) + + graph_module = relay.build(mod, target="llvm", params=params) + rt_mod = tvm.contrib.graph_executor.create( + graph_module.get_json(), graph_module.get_lib(), tvm.cpu(0) + ) + + new_params = graph_module.get_params() + new_params.update({"y_unknown": np.ones((1,)).astype("float32")}) + rt_mod.load_params(runtime.save_param_dict(new_params)) + + if __name__ == "__main__": test_graph_simple() + test_load_unexpected_params() diff --git a/tests/python/unittest/test_runtime_graph_cuda_graph.py b/tests/python/unittest/test_runtime_graph_cuda_graph.py new file mode 100644 index 000000000000..ee7750e3e142 --- /dev/null +++ b/tests/python/unittest/test_runtime_graph_cuda_graph.py @@ -0,0 +1,100 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +import json +import os +import re +import sys +import time + +import pytest + +import tvm +import tvm.testing +from tvm import te +import numpy as np + +from tvm.contrib import utils, graph_executor +from tvm.contrib.cuda_graph import cuda_graph_executor + + +bx = te.thread_axis("blockIdx.x") +tx = te.thread_axis("threadIdx.x") + + +@tvm.testing.requires_cudagraph +def test_graph_simple(): + n = 32 + A = te.placeholder((n,), name="A") + B = te.compute(A.shape, lambda *i: A(*i) + 1.0, name="B") + s = te.create_schedule(B.op) + xo, xi = s[B].split(B.op.axis[0], factor=8) + s[B].bind(xo, bx) + s[B].bind(xi, tx) + + node0 = {"op": "null", "name": "x", "inputs": []} + node1 = { + "op": "tvm_op", + "name": "add", + "inputs": [[0, 0, 0]], + "attrs": {"func_name": "myadd", "flatten_data": "1", "num_inputs": "1", "num_outputs": "1"}, + } + nodes = [node0, node1] + arg_nodes = [0] + node_row_ptr = [0, 1, 2] + outputs = [[1, 0, 0]] + shape = (n,) + attrs = { + "shape": ["list_shape", [shape, shape]], + "dltype": ["list_str", ["float32", "float32"]], + "storage_id": ["list_int", [0, 1]], + } + graph = { + "nodes": nodes, + "arg_nodes": arg_nodes, + "node_row_ptr": node_row_ptr, + "heads": outputs, + "attrs": attrs, + } + graph = json.dumps(graph) + + def check_verify(): + mlib = tvm.build(s, [A, B], "cuda", name="myadd") + dev = tvm.gpu(0) + try: + mod = cuda_graph_executor.create(graph, mlib, dev) + except ValueError: + return + + for i in range(3): + a = np.random.uniform(size=(n,)).astype(A.dtype) + mod.run(x=a) # The first run captured a CUDA graph + out = mod.get_output(0, tvm.nd.empty((n,))) + np.testing.assert_equal(out.asnumpy(), a + 1) + + # capture / run CUDA graph manually + mod.capture_cuda_graph() + a = np.random.uniform(size=(n,)).astype(A.dtype) + mod.set_input(x=a) + mod.run_cuda_graph() + out = mod.get_output(0, tvm.nd.empty((n,))) + np.testing.assert_equal(out.asnumpy(), a + 1) + + check_verify() + + +if __name__ == "__main__": + test_graph_simple() diff --git a/tests/python/unittest/test_runtime_graph_debug.py b/tests/python/unittest/test_runtime_graph_debug.py index 996d426efaa9..6cab75d9b9fc 100644 --- a/tests/python/unittest/test_runtime_graph_debug.py +++ b/tests/python/unittest/test_runtime_graph_debug.py @@ -28,7 +28,7 @@ import numpy as np from tvm import rpc from tvm.contrib import utils -from tvm.contrib.debugger import debug_runtime +from tvm.contrib.debugger import debug_executor @tvm.testing.requires_llvm @@ -75,7 +75,7 @@ def myadd(*args): mlib_proxy = tvm.support.FrontendTestModule() mlib_proxy["myadd"] = myadd try: - mod = debug_runtime.create(graph, mlib_proxy, tvm.cpu(0)) + mod = debug_executor.create(graph, mlib_proxy, tvm.cpu(0)) except ValueError: return @@ -165,19 +165,19 @@ def check_remote(): server = rpc.Server("localhost") remote = rpc.connect(server.host, server.port) temp = utils.tempdir() - ctx = remote.cpu(0) + dev = remote.cpu(0) path_dso = temp.relpath("dev_lib.so") mlib.export_library(path_dso) remote.upload(path_dso) mlib = remote.load_module("dev_lib.so") try: - mod = debug_runtime.create(graph, mlib, remote.cpu(0)) + mod = 
debug_executor.create(graph, mlib, remote.cpu(0)) except ValueError: print("Skip because debug runtime not enabled") return a = np.random.uniform(size=(n,)).astype(A.dtype) - mod.run(x=tvm.nd.array(a, ctx)) - out = tvm.nd.empty((n,), ctx=ctx) + mod.run(x=tvm.nd.array(a, dev)) + out = tvm.nd.empty((n,), device=dev) out = mod.get_output(0, out) np.testing.assert_equal(out.asnumpy(), a + 1) diff --git a/tests/python/unittest/test_runtime_heterogeneous.py b/tests/python/unittest/test_runtime_heterogeneous.py index 2317248adf4c..5388dee2fa58 100644 --- a/tests/python/unittest/test_runtime_heterogeneous.py +++ b/tests/python/unittest/test_runtime_heterogeneous.py @@ -21,7 +21,7 @@ import tvm from tvm import te -from tvm.contrib import graph_runtime, utils +from tvm.contrib import graph_executor, utils from tvm import topi @@ -130,7 +130,7 @@ def test_simplex_data_transferring(): """ host = "cpu" target_host = "llvm" - host_ctx = tvm.context(host) + host_dev = tvm.device(host) if not tvm.runtime.enabled(target_host): print("Skip test because llvm is not enabled.") return @@ -140,8 +140,8 @@ def check_device(device, target_device): print("Skip test because {} is not enabled.".format(target_device)) return - device_ctx = tvm.context(device) - graph = get_simplex_graph(host_ctx.device_type, device_ctx.device_type) + device_dev = tvm.device(device) + graph = get_simplex_graph(host_dev.device_type, device_dev.device_type) shape = (4,) # Create module for add whose target is the device. @@ -170,12 +170,10 @@ def check_device(device, target_device): ) target_flist = {target_device: lower_add, target_host: lower_sub} - target = tvm.target.Target.current() if target is None else target - target = target if target else "llvm" target = tvm.target.Target(target, target_host) mhost = tvm.build(target_flist, target=target) - ctx = [host_ctx, device_ctx] - mod = graph_runtime.create(graph, mhost, ctx) + dev = [host_dev, device_dev] + mod = graph_executor.create(graph, mhost, dev) params = {} params["A"] = tensor_a = np.random.uniform(size=shape).astype(tensor_a.dtype) params["B"] = tensor_b = np.random.uniform(size=shape).astype(tensor_b.dtype) @@ -351,7 +349,7 @@ def test_duplex_data_transferring(): """ host = "cpu" target_host = "llvm" - host_ctx = tvm.context(host) + host_dev = tvm.device(host) if not tvm.runtime.enabled(target_host): print("Skip test because llvm is not enabled.") return @@ -361,8 +359,8 @@ def check_device(device, target_device): print("Skip test because {} is not enabled.".format(target_device)) return - device_ctx = tvm.context(device) - graph = get_duplex_graph(host_ctx.device_type, device_ctx.device_type) + device_dev = tvm.device(device) + graph = get_duplex_graph(host_dev.device_type, device_dev.device_type) shape = (4,) # Insert copy nodes for data transferring between add and sub nodes. 
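For orientation, the rename applied throughout this file is tvm.context -> tvm.device and graph_runtime -> graph_executor; below is a minimal sketch of the post-rename call pattern, illustrative only and not part of the patch, with `graph` (the serialized graph JSON), `mhost` (the built runtime module), `params`, and `shape` taken to be the objects constructed in the surrounding test:

    import tvm
    from tvm.contrib import graph_executor

    host_dev = tvm.device("cpu")     # previously tvm.context("cpu")
    device_dev = tvm.device("cuda")  # previously tvm.context("cuda")

    # graph_executor.create accepts a list of devices for heterogeneous
    # graphs; the host device comes first, matching the hunks in this file.
    mod = graph_executor.create(graph, mhost, [host_dev, device_dev])
    mod.set_input(**params)
    mod.run()
    out = mod.get_output(0, tvm.nd.empty(shape))

Only the module and constructor names change, so the call pattern is otherwise identical to the old graph_runtime one.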
@@ -402,11 +400,9 @@ def check_device(device, target_device): lower_add0.update(lower_add1) target_flist = {target_device: lower_add0, target_host: lower_sub} - target = tvm.target.Target.current() if target is None else target - target = target if target else "llvm" target = tvm.target.Target(target, target_host) mhost = tvm.build(target_flist, target=target) - ctx = [host_ctx, device_ctx] + dev = [host_dev, device_dev] params = {} params["A"] = tensor_a = np.random.uniform(size=shape).astype(tensor_a.dtype) params["B"] = tensor_b = np.random.uniform(size=shape).astype(tensor_b.dtype) @@ -414,7 +410,7 @@ def check_device(device, target_device): params["D"] = tensor_d = np.random.uniform(size=shape).astype(tensor_d.dtype) def check_verify(): - mod = graph_runtime.create(graph, mhost, ctx) + mod = graph_executor.create(graph, mhost, dev) mod.set_input(**params) mod.run() out = mod.get_output(0, tvm.nd.empty(shape)) @@ -428,7 +424,7 @@ def check_load_module(): out_file.write(graph) loaded_lib = tvm.runtime.load_module(path_lib) loaded_graph = open(temp.relpath("deploy.json")).read() - mod = graph_runtime.create(loaded_graph, loaded_lib, ctx) + mod = graph_executor.create(loaded_graph, loaded_lib, dev) mod.set_input(**params) mod.run() out = mod.get_output(0, tvm.nd.empty(shape)) diff --git a/tests/python/unittest/test_runtime_module_based_interface.py b/tests/python/unittest/test_runtime_module_based_interface.py index a34fe4a062cb..766338de3558 100644 --- a/tests/python/unittest/test_runtime_module_based_interface.py +++ b/tests/python/unittest/test_runtime_module_based_interface.py @@ -18,8 +18,9 @@ from tvm import relay, runtime from tvm.relay import testing import tvm -from tvm.contrib import graph_runtime -from tvm.contrib.debugger import debug_runtime +from tvm.contrib import graph_executor +from tvm.contrib.debugger import debug_executor +from tvm.contrib.cuda_graph import cuda_graph_executor import tvm.testing @@ -35,8 +36,8 @@ def verify(data): with relay.build_config(opt_level=3): graph, lib, graph_params = relay.build_module.build(mod, "llvm", params=params) - ctx = tvm.cpu() - module = graph_runtime.create(graph, lib, ctx) + dev = tvm.cpu() + module = graph_executor.create(graph, lib, dev) module.set_input("data", data) module.set_input(**graph_params) module.run() @@ -53,8 +54,8 @@ def test_legacy_compatibility(): with relay.build_config(opt_level=3): graph, lib, graph_params = relay.build_module.build(mod, "llvm", params=params) data = np.random.uniform(-1, 1, size=input_shape(mod)).astype("float32") - ctx = tvm.cpu() - module = graph_runtime.create(graph, lib, ctx) + dev = tvm.cpu() + module = graph_executor.create(graph, lib, dev) module.set_input("data", data) module.set_input(**graph_params) module.run() @@ -71,8 +72,8 @@ def test_cpu(): complied_graph_lib = relay.build_module.build(mod, "llvm", params=params) data = np.random.uniform(-1, 1, size=input_shape(mod)).astype("float32") # raw api - ctx = tvm.cpu() - gmod = complied_graph_lib["default"](ctx) + dev = tvm.cpu() + gmod = complied_graph_lib["default"](dev) set_input = gmod["set_input"] run = gmod["run"] get_output = gmod["get_output"] @@ -81,8 +82,8 @@ def test_cpu(): out = get_output(0).asnumpy() tvm.testing.assert_allclose(out, verify(data), atol=1e-5) - # graph runtime wrapper - gmod = graph_runtime.GraphModule(complied_graph_lib["default"](ctx)) + # graph executor wrapper + gmod = graph_executor.GraphModule(complied_graph_lib["default"](dev)) gmod.set_input("data", data) gmod.run() out = 
gmod.get_output(0).asnumpy() @@ -96,10 +97,10 @@ def test_gpu(): with relay.build_config(opt_level=3): complied_graph_lib = relay.build_module.build(mod, "cuda", params=params) data = np.random.uniform(-1, 1, size=input_shape(mod)).astype("float32") - ctx = tvm.gpu() + dev = tvm.gpu() # raw api - gmod = complied_graph_lib["default"](ctx) + gmod = complied_graph_lib["default"](dev) set_input = gmod["set_input"] run = gmod["run"] get_output = gmod["get_output"] @@ -108,8 +109,8 @@ def test_gpu(): out = get_output(0).asnumpy() tvm.testing.assert_allclose(out, verify(data), atol=1e-5) - # graph runtime wrapper - gmod = graph_runtime.GraphModule(complied_graph_lib["default"](ctx)) + # graph executor wrapper + gmod = graph_executor.GraphModule(complied_graph_lib["default"](dev)) gmod.set_input("data", data) gmod.run() out = gmod.get_output(0).asnumpy() @@ -137,8 +138,8 @@ def verify_cpu_export(obj_format): path_lib = temp.relpath(file_name) complied_graph_lib.export_library(path_lib) loaded_lib = tvm.runtime.load_module(path_lib) - ctx = tvm.cpu(0) - gmod = loaded_lib["default"](ctx) + dev = tvm.cpu(0) + gmod = loaded_lib["default"](dev) # raw api set_input = gmod["set_input"] @@ -150,8 +151,8 @@ def verify_cpu_export(obj_format): out = get_output(0).asnumpy() tvm.testing.assert_allclose(out, verify(data), atol=1e-5) - # graph runtime wrapper - gmod = graph_runtime.GraphModule(loaded_lib["default"](ctx)) + # graph executor wrapper + gmod = graph_executor.GraphModule(loaded_lib["default"](dev)) gmod.set_input("data", data) gmod.run() out = gmod.get_output(0).asnumpy() @@ -177,10 +178,10 @@ def verify_gpu_export(obj_format): complied_graph_lib.export_library(path_lib) loaded_lib = tvm.runtime.load_module(path_lib) data = np.random.uniform(-1, 1, size=input_shape(mod)).astype("float32") - ctx = tvm.gpu() + dev = tvm.gpu() # raw api - gmod = loaded_lib["default"](ctx) + gmod = loaded_lib["default"](dev) set_input = gmod["set_input"] run = gmod["run"] get_output = gmod["get_output"] @@ -189,8 +190,8 @@ def verify_gpu_export(obj_format): out = get_output(0).asnumpy() tvm.testing.assert_allclose(out, verify(data), atol=1e-5) - # graph runtime wrapper - gmod = graph_runtime.GraphModule(loaded_lib["default"](ctx)) + # graph executor wrapper + gmod = graph_executor.GraphModule(loaded_lib["default"](dev)) gmod.set_input("data", data) gmod.run() out = gmod.get_output(0).asnumpy() @@ -221,20 +222,20 @@ def verify_rpc_cpu_export(obj_format): remote.upload(path_lib) loaded_lib = remote.load_module(path_lib) data = np.random.uniform(-1, 1, size=input_shape(mod)).astype("float32") - ctx = remote.cpu() + dev = remote.cpu() # raw api - gmod = loaded_lib["default"](ctx) + gmod = loaded_lib["default"](dev) set_input = gmod["set_input"] run = gmod["run"] get_output = gmod["get_output"] - set_input("data", tvm.nd.array(data, ctx=ctx)) + set_input("data", tvm.nd.array(data, device=dev)) run() out = get_output(0).asnumpy() tvm.testing.assert_allclose(out, verify(data), atol=1e-5) - # graph runtime wrapper - gmod = graph_runtime.GraphModule(loaded_lib["default"](ctx)) + # graph executor wrapper + gmod = graph_executor.GraphModule(loaded_lib["default"](dev)) gmod.set_input("data", data) gmod.run() out = gmod.get_output(0).asnumpy() @@ -266,20 +267,20 @@ def verify_rpc_gpu_export(obj_format): remote.upload(path_lib) loaded_lib = remote.load_module(path_lib) data = np.random.uniform(-1, 1, size=input_shape(mod)).astype("float32") - ctx = remote.gpu() + dev = remote.gpu() # raw api - gmod = loaded_lib["default"](ctx) + gmod 
= loaded_lib["default"](dev) set_input = gmod["set_input"] run = gmod["run"] get_output = gmod["get_output"] - set_input("data", tvm.nd.array(data, ctx=ctx)) + set_input("data", tvm.nd.array(data, device=dev)) run() out = get_output(0).asnumpy() tvm.testing.assert_allclose(out, verify(data), atol=1e-5) - # graph runtime wrapper - gmod = graph_runtime.GraphModule(loaded_lib["default"](ctx)) + # graph executor wrapper + gmod = graph_executor.GraphModule(loaded_lib["default"](dev)) gmod.set_input("data", data) gmod.run() out = gmod.get_output(0).asnumpy() @@ -317,10 +318,10 @@ def verify_cpu_remove_package_params(obj_format): fo.write(runtime.save_param_dict(complied_graph_lib.get_params())) loaded_lib = tvm.runtime.load_module(path_lib) data = np.random.uniform(-1, 1, size=input_shape(mod)).astype("float32") - ctx = tvm.cpu(0) + dev = tvm.cpu(0) # raw api - gmod = loaded_lib["default"](ctx) + gmod = loaded_lib["default"](dev) set_input = gmod["set_input"] run = gmod["run"] get_output = gmod["get_output"] @@ -332,8 +333,8 @@ def verify_cpu_remove_package_params(obj_format): out = get_output(0).asnumpy() tvm.testing.assert_allclose(out, verify(data), atol=1e-5) - # graph runtime wrapper - gmod = graph_runtime.GraphModule(loaded_lib["default"](ctx)) + # graph executor wrapper + gmod = graph_executor.GraphModule(loaded_lib["default"](dev)) loaded_params = bytearray(open(temp.relpath("deploy_param.params"), "rb").read()) gmod.set_input("data", data) gmod.load_params(loaded_params) @@ -364,10 +365,10 @@ def verify_gpu_remove_package_params(obj_format): fo.write(runtime.save_param_dict(complied_graph_lib.get_params())) loaded_lib = tvm.runtime.load_module(path_lib) data = np.random.uniform(-1, 1, size=input_shape(mod)).astype("float32") - ctx = tvm.gpu(0) + dev = tvm.gpu(0) # raw api - gmod = loaded_lib["default"](ctx) + gmod = loaded_lib["default"](dev) set_input = gmod["set_input"] run = gmod["run"] get_output = gmod["get_output"] @@ -379,8 +380,8 @@ def verify_gpu_remove_package_params(obj_format): out = get_output(0).asnumpy() tvm.testing.assert_allclose(out, verify(data), atol=1e-5) - # graph runtime wrapper - gmod = graph_runtime.GraphModule(loaded_lib["default"](ctx)) + # graph executor wrapper + gmod = graph_executor.GraphModule(loaded_lib["default"](dev)) loaded_params = bytearray(open(temp.relpath("deploy_param.params"), "rb").read()) gmod.set_input("data", data) gmod.load_params(loaded_params) @@ -417,23 +418,23 @@ def verify_rpc_cpu_remove_package_params(obj_format): remote.upload(path_lib) loaded_lib = remote.load_module(path_lib) data = np.random.uniform(-1, 1, size=input_shape(mod)).astype("float32") - ctx = remote.cpu() + dev = remote.cpu() # raw api - gmod = loaded_lib["default"](ctx) + gmod = loaded_lib["default"](dev) set_input = gmod["set_input"] run = gmod["run"] get_output = gmod["get_output"] load_params = gmod["load_params"] loaded_params = bytearray(open(path_params, "rb").read()) - set_input("data", tvm.nd.array(data, ctx=ctx)) + set_input("data", tvm.nd.array(data, device=dev)) load_params(loaded_params) run() out = get_output(0).asnumpy() tvm.testing.assert_allclose(out, verify(data), atol=1e-5) - # graph runtime wrapper - gmod = graph_runtime.GraphModule(loaded_lib["default"](ctx)) + # graph executor wrapper + gmod = graph_executor.GraphModule(loaded_lib["default"](dev)) loaded_params = bytearray(open(path_params, "rb").read()) gmod.set_input("data", data) gmod.load_params(loaded_params) @@ -470,23 +471,23 @@ def verify_rpc_gpu_remove_package_params(obj_format): 
remote.upload(path_lib) loaded_lib = remote.load_module(path_lib) data = np.random.uniform(-1, 1, size=input_shape(mod)).astype("float32") - ctx = remote.gpu() + dev = remote.gpu() # raw api - gmod = loaded_lib["default"](ctx) + gmod = loaded_lib["default"](dev) set_input = gmod["set_input"] run = gmod["run"] get_output = gmod["get_output"] load_params = gmod["load_params"] loaded_params = bytearray(open(path_params, "rb").read()) - set_input("data", tvm.nd.array(data, ctx=ctx)) + set_input("data", tvm.nd.array(data, device=dev)) load_params(loaded_params) run() out = get_output(0).asnumpy() tvm.testing.assert_allclose(out, verify(data), atol=1e-5) - # graph runtime wrapper - gmod = graph_runtime.GraphModule(loaded_lib["default"](ctx)) + # graph executor wrapper + gmod = graph_executor.GraphModule(loaded_lib["default"](dev)) loaded_params = bytearray(open(path_params, "rb").read()) gmod.set_input("data", data) gmod.load_params(loaded_params) @@ -501,7 +502,7 @@ def verify_rpc_gpu_remove_package_params(obj_format): verify_rpc_gpu_remove_package_params(obj_format) -def test_debug_graph_runtime(): +def test_debug_graph_executor(): if not tvm.testing.device_enabled("llvm"): print("Skip because llvm is not enabled") return @@ -511,11 +512,11 @@ def test_debug_graph_runtime(): data = np.random.uniform(-1, 1, size=input_shape(mod)).astype("float32") # raw api - ctx = tvm.cpu() + dev = tvm.cpu() try: - gmod = complied_graph_lib["debug_create"]("default", ctx) + gmod = complied_graph_lib["debug_create"]("default", dev) except: - print("Skip because debug graph_runtime not enabled") + print("Skip because debug graph_executor not enabled") return set_input = gmod["set_input"] run = gmod["run"] @@ -525,10 +526,10 @@ def test_debug_graph_runtime(): out = get_output(0).asnumpy() tvm.testing.assert_allclose(out, verify(data), atol=1e-5) - # debug graph runtime wrapper - debug_g_mod = debug_runtime.GraphModuleDebug( - complied_graph_lib["debug_create"]("default", ctx), - [ctx], + # debug graph executor wrapper + debug_g_mod = debug_executor.GraphModuleDebug( + complied_graph_lib["debug_create"]("default", dev), + [dev], complied_graph_lib.get_json(), None, ) @@ -538,6 +539,35 @@ def test_debug_graph_runtime(): tvm.testing.assert_allclose(out, verify(data), atol=1e-5) +@tvm.testing.requires_cudagraph +def test_cuda_graph_executor(): + mod, params = relay.testing.synthetic.get_workload() + with tvm.transform.PassContext(opt_level=3): + complied_graph_lib = relay.build_module.build(mod, "cuda", params=params) + data = np.random.uniform(-1, 1, size=input_shape(mod)).astype("float32") + + dev = tvm.gpu() + try: + gmod = complied_graph_lib["cuda_graph_create"](dev) + except: + print("Skip because cuda_graph not enabled") + return + set_input = gmod["set_input"] + run = gmod["run"] + get_output = gmod["get_output"] + set_input("data", tvm.nd.array(data)) + run() + out = get_output(0).asnumpy() + tvm.testing.assert_allclose(out, verify(data), atol=1e-5) + + # cuda graph executor wrapper + cu_gmod = cuda_graph_executor.GraphModuleCudaGraph(gmod) + cu_gmod.set_input("data", data) + cu_gmod.run() + out = cu_gmod.get_output(0).asnumpy() + tvm.testing.assert_allclose(out, verify(data), atol=1e-5) + + def test_multiple_imported_modules(): def make_func(symbol): n = tvm.te.size_var("n") @@ -572,5 +602,5 @@ def make_module(mod): test_gpu() test_mod_export() test_remove_package_params() - test_debug_graph_runtime() + test_debug_graph_executor() test_multiple_imported_modules() diff --git 
a/tests/python/unittest/test_runtime_module_load.py b/tests/python/unittest/test_runtime_module_load.py index 38800e8de6ad..c34b2f292d33 100644 --- a/tests/python/unittest/test_runtime_module_load.py +++ b/tests/python/unittest/test_runtime_module_load.py @@ -105,7 +105,7 @@ def test_device_module_dump(): s[B].bind(tx, te.thread_axis("threadIdx.x")) def check_device(device): - ctx = tvm.context(device, 0) + dev = tvm.device(device, 0) if not tvm.testing.device_enabled(device): print("Skip because %s is not enabled" % device) return @@ -123,8 +123,8 @@ def check_device(device): f.export_library(path_dso, cc.cross_compiler("g++")) f1 = tvm.runtime.load_module(path_dso) - a = tvm.nd.array(np.random.uniform(size=1024).astype(A.dtype), ctx) - b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx) + a = tvm.nd.array(np.random.uniform(size=1024).astype(A.dtype), dev) + b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), dev) f1(a, b) np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1) if sys.platform != "win32": @@ -133,7 +133,7 @@ def check_device(device): np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1) def check_stackvm(device): - ctx = tvm.context(device, 0) + dev = tvm.device(device, 0) if not tvm.testing.device_enabled(device): print("Skip because %s is not enabled" % device) return @@ -143,8 +143,8 @@ def check_stackvm(device): path_dso = temp.relpath("dev_lib.stackvm") f.export_library(path_dso) f1 = tvm.runtime.load_module(path_dso) - a = tvm.nd.array(np.random.uniform(size=1024).astype(A.dtype), ctx) - b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx) + a = tvm.nd.array(np.random.uniform(size=1024).astype(A.dtype), dev) + b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), dev) f(a, b) np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1) @@ -163,7 +163,7 @@ def test_combine_module_llvm(): s = te.create_schedule(B.op) def check_llvm(): - ctx = tvm.cpu(0) + dev = tvm.cpu(0) if not tvm.testing.device_enabled("llvm"): print("Skip because llvm is not enabled") return @@ -180,15 +180,15 @@ def check_llvm(): m = tvm.runtime.load_module(path_dso) fadd1 = m["myadd1"] fadd2 = m["myadd2"] - a = tvm.nd.array(np.random.uniform(size=nn).astype(A.dtype), ctx) - b = tvm.nd.array(np.zeros(nn, dtype=A.dtype), ctx) + a = tvm.nd.array(np.random.uniform(size=nn).astype(A.dtype), dev) + b = tvm.nd.array(np.zeros(nn, dtype=A.dtype), dev) fadd1(a, b) np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1) fadd2(a, b) np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1) def check_system_lib(): - ctx = tvm.cpu(0) + dev = tvm.cpu(0) if not tvm.testing.device_enabled("llvm"): print("Skip because llvm is not enabled") return @@ -205,8 +205,8 @@ def check_system_lib(): dll = ctypes.CDLL(path_dso) # Load the system wide library mm = tvm.runtime.system_lib() - a = tvm.nd.array(np.random.uniform(size=nn).astype(A.dtype), ctx) - b = tvm.nd.array(np.zeros(nn, dtype=A.dtype), ctx) + a = tvm.nd.array(np.random.uniform(size=nn).astype(A.dtype), dev) + b = tvm.nd.array(np.zeros(nn, dtype=A.dtype), dev) mm["myadd1"](a, b) np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1) mm["myadd2"](a, b) diff --git a/tests/python/unittest/test_runtime_rpc.py b/tests/python/unittest/test_runtime_rpc.py index a19aea14da2c..256fd33387bf 100644 --- a/tests/python/unittest/test_runtime_rpc.py +++ b/tests/python/unittest/test_runtime_rpc.py @@ -67,9 +67,9 @@ def verify_rpc(remote, target, shape, dtype): s = te.create_schedule(B.op) f = tvm.build(s, [A, B], target, name="myadd") - ctx = remote.cpu(0) - a = tvm.nd.array(np.random.randint(0, 
256, size=shape).astype(A.dtype), ctx=ctx) - b = tvm.nd.array(np.zeros(shape).astype(A.dtype), ctx=ctx) + dev = remote.cpu(0) + a = tvm.nd.array(np.random.randint(0, 256, size=shape).astype(A.dtype), device=dev) + b = tvm.nd.array(np.zeros(shape).astype(A.dtype), device=dev) temp = utils.tempdir() path_dso = temp.relpath("dev_lib.o") f.save(path_dso) @@ -143,7 +143,7 @@ def test_rpc_array(): server = rpc.Server("localhost") remote = rpc.connect(server.host, server.port) r_cpu = tvm.nd.array(x, remote.cpu(0)) - assert str(r_cpu.context).startswith("remote") + assert str(r_cpu.device).startswith("remote") np.testing.assert_equal(r_cpu.asnumpy(), x) fremote = remote.get_function("rpc.test.remote_array_func") fremote(r_cpu) @@ -154,11 +154,11 @@ def test_rpc_large_array(): # testcase of large array creation server = rpc.Server("localhost") remote = rpc.connect(server.host, server.port) - ctx = remote.cpu(0) + dev = remote.cpu(0) a_np = np.ones((5041, 720)).astype("float32") b_np = np.ones((720, 192)).astype("float32") - a = tvm.nd.array(a_np, ctx) - b = tvm.nd.array(b_np, ctx) + a = tvm.nd.array(a_np, dev) + b = tvm.nd.array(b_np, dev) np.testing.assert_equal(a.asnumpy(), a_np) np.testing.assert_equal(b.asnumpy(), b_np) @@ -238,14 +238,14 @@ def test_rpc_remote_module(): def check_remote(remote): temp = utils.tempdir() - ctx = remote.cpu(0) + dev = remote.cpu(0) f = tvm.build(s, [A, B], "llvm", name="myadd") path_dso = temp.relpath("dev_lib.so") f.export_library(path_dso) remote.upload(path_dso) f1 = remote.load_module("dev_lib.so") - a = tvm.nd.array(np.random.uniform(size=102).astype(A.dtype), ctx) - b = tvm.nd.array(np.zeros(102, dtype=A.dtype), ctx) + a = tvm.nd.array(np.random.uniform(size=102).astype(A.dtype), dev) + b = tvm.nd.array(np.zeros(102, dtype=A.dtype), dev) time_f = f1.time_evaluator(f1.entry_name, remote.cpu(0), number=10) cost = time_f(a, b).mean print("%g secs/op" % cost) @@ -278,11 +278,11 @@ def check_minrpc(): # start the minrpc session. 
remote = tvm.rpc.PopenSession(path_minrpc) - ctx = remote.cpu(0) + dev = remote.cpu(0) f1 = remote.system_lib() - a = tvm.nd.array(np.random.uniform(size=102).astype(A.dtype), ctx) - b = tvm.nd.array(np.zeros(102, dtype=A.dtype), ctx) + a = tvm.nd.array(np.random.uniform(size=102).astype(A.dtype), dev) + b = tvm.nd.array(np.zeros(102, dtype=A.dtype), dev) time_f = f1.time_evaluator("myadd", remote.cpu(0), number=1) cost = time_f(a, b).mean np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1) @@ -304,7 +304,7 @@ def check_remote_link_cl(remote): print("Skip because opencl is not enabled") return temp = utils.tempdir() - ctx = remote.cl(0) + dev = remote.cl(0) s = te.create_schedule(B.op) xo, xi = s[B].split(B.op.axis[0], factor=32) s[B].bind(xo, te.thread_axis("blockIdx.x")) @@ -323,8 +323,8 @@ def check_remote_link_cl(remote): fhost = remote.load_module("myadd.o") fdev = remote.load_module("myadd.cl") fhost.import_module(fdev) - a = tvm.nd.array(np.random.uniform(size=102).astype(A.dtype), ctx) - b = tvm.nd.array(np.zeros(102, dtype=A.dtype), ctx) + a = tvm.nd.array(np.random.uniform(size=102).astype(A.dtype), dev) + b = tvm.nd.array(np.zeros(102, dtype=A.dtype), dev) fhost(a, b) np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1) # Option 2: export library as a tar ball then handled by remote compiler @@ -332,8 +332,8 @@ def check_remote_link_cl(remote): f.export_library(path_tar) remote.upload(path_tar) fhost = remote.load_module("myadd.tar") - a = tvm.nd.array(np.random.uniform(size=102).astype(A.dtype), ctx) - b = tvm.nd.array(np.zeros(102, dtype=A.dtype), ctx) + a = tvm.nd.array(np.random.uniform(size=102).astype(A.dtype), dev) + b = tvm.nd.array(np.zeros(102, dtype=A.dtype), dev) fhost(a, b) np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1) @@ -377,7 +377,7 @@ def check_multi_hop(): assert fecho("xyz") == "xyz" assert bytes(fecho(bytearray(b"123"))) == b"123" - nd = tvm.nd.array([1, 2, 3], ctx=client.cpu(0)) + nd = tvm.nd.array([1, 2, 3], device=client.cpu(0)) assert nd.asnumpy()[1] == 2 def check_error_handling(): diff --git a/tests/python/unittest/test_runtime_vm_profiler.py b/tests/python/unittest/test_runtime_vm_profiler.py index 29715222e429..c1c94211a664 100644 --- a/tests/python/unittest/test_runtime_vm_profiler.py +++ b/tests/python/unittest/test_runtime_vm_profiler.py @@ -26,9 +26,9 @@ def test_basic(): if not profiler_vm.enabled(): return - for target, ctx in enabled_targets(): + for target, dev in enabled_targets(): exe = relay.vm.compile(mod, target, params=params) - vm = profiler_vm.VirtualMachineProfiler(exe, ctx) + vm = profiler_vm.VirtualMachineProfiler(exe, dev) data = np.random.rand(1, 3, 224, 224).astype("float32") res = vm.invoke("main", [data]) diff --git a/tests/python/unittest/test_target_codegen_blob.py b/tests/python/unittest/test_target_codegen_blob.py index c17513acecaa..c7698197c111 100644 --- a/tests/python/unittest/test_target_codegen_blob.py +++ b/tests/python/unittest/test_target_codegen_blob.py @@ -18,7 +18,7 @@ import numpy as np from tvm import relay from tvm.relay import testing -from tvm.contrib import graph_runtime +from tvm.contrib import graph_executor import tvm from tvm import te import ctypes @@ -38,8 +38,8 @@ def verify(data): mod, params = relay.testing.synthetic.get_workload(input_shape=input_shape) with tvm.transform.PassContext(opt_level=3): lib = relay.build_module.build(mod, "llvm", params=params) - ctx = tvm.cpu() - module = graph_runtime.GraphModule(lib["default"](ctx)) + dev = tvm.cpu() + module = 
graph_executor.GraphModule(lib["default"](dev)) module.set_input("data", data) module.run() out = module.get_output(0).asnumpy() @@ -57,8 +57,8 @@ def verify(data): loaded_lib = tvm.runtime.load_module(path_lib) data = np.random.uniform(-1, 1, size=input_shape).astype("float32") - ctx = tvm.gpu() - module = graph_runtime.GraphModule(loaded_lib["default"](ctx)) + dev = tvm.gpu() + module = graph_executor.GraphModule(loaded_lib["default"](dev)) module.set_input("data", data) module.run() out = module.get_output(0).asnumpy() @@ -68,7 +68,7 @@ def verify(data): @tvm.testing.uses_gpu def test_cuda_lib(): - ctx = tvm.gpu(0) + dev = tvm.gpu(0) for device in ["llvm", "cuda"]: if not tvm.testing.device_enabled(device): print("skip because %s is not enabled..." % device) @@ -89,8 +89,8 @@ def test_cuda_lib(): path_lib = temp.relpath("deploy_lib.so") fn_add.export_library(path_lib) m = tvm.runtime.load_module(path_lib) - a = tvm.nd.array(np.random.uniform(size=nn).astype(A.dtype), ctx) - b = tvm.nd.array(np.zeros(nn, dtype=A.dtype), ctx) + a = tvm.nd.array(np.random.uniform(size=nn).astype(A.dtype), dev) + b = tvm.nd.array(np.zeros(nn, dtype=A.dtype), dev) m["add"](a, b) np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1) diff --git a/tests/python/unittest/test_target_codegen_bool.py b/tests/python/unittest/test_target_codegen_bool.py index f4b5f90435d2..527741a5ef0d 100644 --- a/tests/python/unittest/test_target_codegen_bool.py +++ b/tests/python/unittest/test_target_codegen_bool.py @@ -39,11 +39,11 @@ def check_llvm(): s[C].parallel(xo2) # BUILD and invoke the kernel. f = tvm.build(s, [A, B, D], "llvm") - ctx = tvm.cpu(0) + dev = tvm.cpu(0) a_np = np.random.uniform(size=n).astype(A.dtype) - a = tvm.nd.array(a_np, ctx) - b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx) - d = tvm.nd.array(np.zeros(n, dtype=D.dtype), ctx) + a = tvm.nd.array(a_np, dev) + b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), dev) + d = tvm.nd.array(np.zeros(n, dtype=D.dtype), dev) f(a, b, d) np.testing.assert_equal( d.asnumpy(), @@ -53,7 +53,7 @@ def check_llvm(): def check_device(device): if not tvm.testing.device_enabled(device): return - ctx = tvm.context(device, 0) + dev = tvm.device(device, 0) s = te.create_schedule(D.op) for stage in [C, D]: xo, xi = s[stage].split(stage.op.axis[0], factor=4) @@ -61,9 +61,9 @@ def check_device(device): s[stage].bind(xi, te.thread_axis("threadIdx.x")) f = tvm.build(s, [A, B, D], device) a_np = np.random.uniform(size=n).astype(A.dtype) - a = tvm.nd.array(a_np, ctx) - b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx) - d = tvm.nd.array(np.zeros(n, dtype=D.dtype), ctx) + a = tvm.nd.array(a_np, dev) + b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), dev) + d = tvm.nd.array(np.zeros(n, dtype=D.dtype), dev) f(a, b, d) np.testing.assert_equal( d.asnumpy(), diff --git a/tests/python/unittest/test_target_codegen_c_host.py b/tests/python/unittest/test_target_codegen_c_host.py index d1ca8b1450f0..6102beba0d25 100644 --- a/tests/python/unittest/test_target_codegen_c_host.py +++ b/tests/python/unittest/test_target_codegen_c_host.py @@ -36,12 +36,12 @@ def check_c(): mhost.export_library(path_dso) m = tvm.runtime.load_module(path_dso) fadd = m["test_fadd"] - ctx = tvm.cpu(0) + dev = tvm.cpu(0) # launch the kernel. 
n = nn - a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx) - b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx) - c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx) + a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev) + b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), dev) + c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev) fadd(a, b, c) tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy()) @@ -81,12 +81,12 @@ def check_c(): mhost.export_library(path_dso) m = tvm.runtime.load_module(path_dso) fadd = m["test_fadd_pipeline"] - ctx = tvm.cpu(0) + dev = tvm.cpu(0) # launch the kernel. n = nn - a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx) - b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx) - c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx) + a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev) + b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), dev) + c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev) fadd(a, b, c) tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy()) @@ -109,10 +109,10 @@ def check_c(): mhost.export_library(path_dso) m = tvm.runtime.load_module(path_dso) fadd = m["test_reinterpret"] - ctx = tvm.cpu(0) + dev = tvm.cpu(0) n = nn - a = tvm.nd.array(np.random.randint(-(2 ** 30), 2 ** 30, size=n).astype(A.dtype), ctx) - b = tvm.nd.array(np.zeros(n, dtype=B.dtype), ctx) + a = tvm.nd.array(np.random.randint(-(2 ** 30), 2 ** 30, size=n).astype(A.dtype), dev) + b = tvm.nd.array(np.zeros(n, dtype=B.dtype), dev) fadd(a, b) tvm.testing.assert_allclose(b.asnumpy(), (2 + a.asnumpy()).view("float32")) @@ -133,10 +133,10 @@ def check_c(): mhost.export_library(path_dso) m = tvm.runtime.load_module(path_dso) fceil = m["test_ceil"] - ctx = tvm.cpu(0) + dev = tvm.cpu(0) n = nn - a = tvm.nd.array(np.random.rand(n).astype(A.dtype), ctx) - b = tvm.nd.array(np.zeros(n, dtype=B.dtype), ctx) + a = tvm.nd.array(np.random.rand(n).astype(A.dtype), dev) + b = tvm.nd.array(np.zeros(n, dtype=B.dtype), dev) fceil(a, b) tvm.testing.assert_allclose(b.asnumpy(), (np.ceil(a.asnumpy()).view("float32"))) @@ -157,10 +157,10 @@ def check_c(): mhost.export_library(path_dso) m = tvm.runtime.load_module(path_dso) ffloor = m["test_floor"] - ctx = tvm.cpu(0) + dev = tvm.cpu(0) n = nn - a = tvm.nd.array(np.random.rand(n).astype(A.dtype), ctx) - b = tvm.nd.array(np.zeros(n, dtype=B.dtype), ctx) + a = tvm.nd.array(np.random.rand(n).astype(A.dtype), dev) + b = tvm.nd.array(np.zeros(n, dtype=B.dtype), dev) ffloor(a, b) tvm.testing.assert_allclose(b.asnumpy(), (np.floor(a.asnumpy()).view("float32"))) @@ -181,10 +181,10 @@ def check_c(): mhost.export_library(path_dso) m = tvm.runtime.load_module(path_dso) fround = m["test_round"] - ctx = tvm.cpu(0) + dev = tvm.cpu(0) n = nn - a = tvm.nd.array(np.random.rand(n).astype(A.dtype), ctx) - b = tvm.nd.array(np.zeros(n, dtype=B.dtype), ctx) + a = tvm.nd.array(np.random.rand(n).astype(A.dtype), dev) + b = tvm.nd.array(np.zeros(n, dtype=B.dtype), dev) fround(a, b) tvm.testing.assert_allclose(b.asnumpy(), (np.round(a.asnumpy()).view("float32"))) diff --git a/tests/python/unittest/test_target_codegen_cross_llvm.py b/tests/python/unittest/test_target_codegen_cross_llvm.py index a55530a090e4..feb1d43d5ce2 100644 --- a/tests/python/unittest/test_target_codegen_cross_llvm.py +++ b/tests/python/unittest/test_target_codegen_cross_llvm.py @@ -78,11 +78,11 @@ def build_arm(): if remote: remote.upload(path) farm = remote.load_module("myadd.o") - ctx = 
remote.cpu(0) + dev = remote.cpu(0) n = nn - a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx) - b = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx) - c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx) + a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev) + b = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev) + c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev) farm(a, b, c) tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy()) print("Verification finish on remote..") diff --git a/tests/python/unittest/test_target_codegen_cuda.py b/tests/python/unittest/test_target_codegen_cuda.py index 06d7cb4bb7bb..a63aeaa40f9d 100644 --- a/tests/python/unittest/test_target_codegen_cuda.py +++ b/tests/python/unittest/test_target_codegen_cuda.py @@ -46,9 +46,9 @@ def check_cuda(dtype, n, lanes): s[B].bind(xo, bx) s[B].bind(xi, tx) fun = tvm.build(s, [A, B], "cuda") - ctx = tvm.gpu(0) - a = tvm.nd.empty((n,), A.dtype, ctx).copyfrom(np.random.uniform(size=(n, lanes))) - c = tvm.nd.empty((n,), B.dtype, ctx) + dev = tvm.gpu(0) + a = tvm.nd.empty((n,), A.dtype, dev).copyfrom(np.random.uniform(size=(n, lanes))) + c = tvm.nd.empty((n,), B.dtype, dev) fun(a, c) tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + 1) @@ -99,13 +99,13 @@ def check_cuda(n, lanes): disabled_pass=["tir.BF16Promote", "tir.BF16CastElimination", "tir.BF16TypeLowering"] ): fun = tvm.build(s, [A, B], "cuda") - ctx = tvm.gpu(0) + dev = tvm.gpu(0) np_a = np.random.uniform(size=(n, lanes)).astype("float32") np_a = np_bf162np_float(np_float2np_bf16(np_a)) - a = tvm.nd.empty((n,), A.dtype, ctx).copyfrom(np_float2np_bf16(np_a)) - c = tvm.nd.empty((n,), B.dtype, ctx) + a = tvm.nd.empty((n,), A.dtype, dev).copyfrom(np_float2np_bf16(np_a)) + c = tvm.nd.empty((n,), B.dtype, dev) fun(a, c) - c = tvm.nd.empty((n, lanes), "uint16", ctx).copyfrom(c) + c = tvm.nd.empty((n, lanes), "uint16", dev).copyfrom(c) tvm.testing.assert_allclose(c.asnumpy(), np_float2np_bf16(np_a + 1)) check_cuda(64, 2) @@ -138,11 +138,11 @@ def check_cuda(dtype, n, lanes): np_b = np.random.randint(low=-128, high=127, size=(n, lanes)) np_c = np.random.randint(low=0, high=127, size=(n,)) np_d = [sum(x * y) + z for x, y, z in zip(np_a, np_b, np_c)] - ctx = tvm.gpu(0) - a = tvm.nd.empty((n,), A.dtype, ctx).copyfrom(np_a) - b = tvm.nd.empty((n,), B.dtype, ctx).copyfrom(np_b) - c = tvm.nd.empty((n,), C.dtype, ctx).copyfrom(np_c) - d = tvm.nd.empty((n,), D.dtype, ctx) + dev = tvm.gpu(0) + a = tvm.nd.empty((n,), A.dtype, dev).copyfrom(np_a) + b = tvm.nd.empty((n,), B.dtype, dev).copyfrom(np_b) + c = tvm.nd.empty((n,), C.dtype, dev).copyfrom(np_c) + d = tvm.nd.empty((n,), D.dtype, dev) fun(a, b, c, d) tvm.testing.assert_allclose(d.asnumpy(), np_d) @@ -155,7 +155,7 @@ def test_cuda_vectorize_load(): num_thread = 8 def check_cuda(dtype, n, lanes): - ctx = tvm.gpu(0) + dev = tvm.gpu(0) A = te.placeholder((n,), name="A", dtype="%sx%d" % (dtype, lanes)) B = te.compute((n,), lambda i: A[i], name="B") s = te.create_schedule(B.op) @@ -164,8 +164,8 @@ def check_cuda(dtype, n, lanes): s[B].bind(thread, tx) fun = tvm.build(s, [A, B], "cuda", name="vector_load") np_a = np.random.randint(low=-128, high=127, size=(n, lanes)) - a = tvm.nd.empty((n,), A.dtype, ctx).copyfrom(np_a) - b = tvm.nd.empty((n,), B.dtype, ctx) + a = tvm.nd.empty((n,), A.dtype, dev).copyfrom(np_a) + b = tvm.nd.empty((n,), B.dtype, dev) fun(a, b) tvm.testing.assert_allclose(a.asnumpy(), b.asnumpy()) @@ -181,7 +181,7 @@ def check_cuda(dtype, n, lanes): def 
test_cuda_make_int8(): def check_cuda(n, value, lanes): dtype = "int8" - ctx = tvm.gpu(0) + dev = tvm.gpu(0) A = te.compute((n, lanes), lambda i, j: tvm.tir.const(value, dtype=dtype)) s = te.create_schedule(A.op) y, x = s[A].op.axis @@ -189,7 +189,7 @@ def check_cuda(n, value, lanes): s[A].bind(y, bx) fun = tvm.build(s, [A], "cuda", name="make_int8x4") np_a = np.full((n, lanes), value, dtype=dtype) - a = tvm.nd.empty(np_a.shape, dtype, ctx) + a = tvm.nd.empty(np_a.shape, dtype, dev) fun(a) np.testing.assert_equal(a.asnumpy(), np_a) @@ -209,26 +209,26 @@ def check_cuda(n, value, lanes): def test_cuda_inf_nan(): target = "cuda" - def check_inf_nan(ctx, n, value, dtype): + def check_inf_nan(dev, n, value, dtype): A = te.placeholder((n,), name="A", dtype=dtype) inf_value = tvm.tir.const(value, dtype=dtype) C = te.compute((n,), lambda i: inf_value, name="C") s = te.create_schedule(C.op) s[C].bind(s[C].op.axis[0], tx) fun = tvm.build(s, [A, C], target) - a = tvm.nd.empty((n,), A.dtype, ctx) - c = tvm.nd.empty((n,), A.dtype, ctx) + a = tvm.nd.empty((n,), A.dtype, dev) + c = tvm.nd.empty((n,), A.dtype, dev) # Only need to test compiling here fun(a, c) - ctx = tvm.context(target, 0) + dev = tvm.device(target, 0) - check_inf_nan(ctx, 1, -float("inf"), "float32") - check_inf_nan(ctx, 1, -float("inf"), "float64") - check_inf_nan(ctx, 1, float("inf"), "float32") - check_inf_nan(ctx, 1, float("inf"), "float64") - check_inf_nan(ctx, 1, float("nan"), "float32") - check_inf_nan(ctx, 1, float("nan"), "float64") + check_inf_nan(dev, 1, -float("inf"), "float32") + check_inf_nan(dev, 1, -float("inf"), "float64") + check_inf_nan(dev, 1, float("inf"), "float32") + check_inf_nan(dev, 1, float("inf"), "float64") + check_inf_nan(dev, 1, float("nan"), "float32") + check_inf_nan(dev, 1, float("nan"), "float64") @tvm.testing.requires_gpu @@ -285,7 +285,7 @@ def _transform(f, *_): @tvm.testing.parametrize_targets("cuda", "rocm") -def test_crossthread_reduction1(target, ctx): +def test_crossthread_reduction1(target, dev): n = te.var("n") m = te.var("m") A = te.placeholder((n, m), name="A") @@ -307,8 +307,8 @@ def verify(nthd): vals = [nthd - 1, nthd, nthd + 1] for kk in [x for x in vals]: size = (nn, kk) - a = tvm.nd.array(np.random.uniform(size=size).astype(A.dtype), ctx) - b = tvm.nd.array(np.zeros(nn, dtype=B.dtype), ctx) + a = tvm.nd.array(np.random.uniform(size=size).astype(A.dtype), dev) + b = tvm.nd.array(np.zeros(nn, dtype=B.dtype), dev) func(a, b) tvm.testing.assert_allclose(b.asnumpy(), np.sum(a.asnumpy(), axis=1), rtol=1e-3) @@ -318,7 +318,7 @@ def verify(nthd): @tvm.testing.parametrize_targets("cuda", "rocm") -def test_crossthread_reduction2(target, ctx): +def test_crossthread_reduction2(target, dev): n = te.var("n") k0 = te.var("k0") k1 = te.var("k1") @@ -345,8 +345,8 @@ def verify(nthdx, nthdy): vy = [nthdy - 1, nthdy, nthdy + 1] for kk0, kk1 in [(x, y) for x in vx for y in vy]: size = (nn, kk0, kk1) - a = tvm.nd.array(np.random.uniform(size=size).astype(A.dtype), ctx) - b = tvm.nd.array(np.zeros(nn, dtype=B.dtype), ctx) + a = tvm.nd.array(np.random.uniform(size=size).astype(A.dtype), dev) + b = tvm.nd.array(np.zeros(nn, dtype=B.dtype), dev) func(a, b) tvm.testing.assert_allclose(b.asnumpy(), np.sum(a.asnumpy(), axis=(1, 2)), rtol=1e-3) @@ -373,7 +373,7 @@ def test_cuda_reduction_binding(): @tvm.testing.parametrize_targets("cuda", "rocm") -def test_rfactor_predicates(target, ctx): +def test_rfactor_predicates(target, dev): n = te.reduce_axis((0, 129), "n") A = te.placeholder((129,), name="A") B = 
te.compute((1,), lambda b: te.sum(A[n], axis=n), name="B") @@ -419,11 +419,11 @@ def test_cuda_const_float_to_half(): s[c].bind(tx, te.thread_axis("threadIdx.x")) func = tvm.build(s, [a, c], "cuda") - ctx = tvm.gpu(0) + dev = tvm.gpu(0) a_np = np.random.uniform(size=shape).astype(a.dtype) c_np = np.zeros(shape=shape, dtype=c.dtype) - a = tvm.nd.array(a_np, ctx) - c = tvm.nd.array(c_np, ctx) + a = tvm.nd.array(a_np, dev) + c = tvm.nd.array(c_np, dev) func(a, c) np.testing.assert_equal(c.asnumpy(), a_np > b.value) @@ -435,7 +435,7 @@ def check(device, dtype, m=32, n=32): if not tvm.testing.device_enabled(device): print("Skipping", device) return - ctx = tvm.context(device, 0) + dev = tvm.device(device, 0) a = te.placeholder((m, n), name="a", dtype=dtype) b = te.placeholder((m, n), name="b", dtype=dtype) c = a + b @@ -448,9 +448,9 @@ def check(device, dtype, m=32, n=32): a_np = np.random.uniform(size=(m, n)).astype(a.dtype) b_np = np.random.uniform(size=(m, n)).astype(b.dtype) g_np = np.sum(np.add(a_np * b_np, a_np + b_np)) - a_nd = tvm.nd.array(a_np, ctx) - b_nd = tvm.nd.array(b_np, ctx) - g_nd = tvm.nd.array(np.zeros(g_np.shape, dtype=g_np.dtype), ctx) + a_nd = tvm.nd.array(a_np, dev) + b_nd = tvm.nd.array(b_np, dev) + g_nd = tvm.nd.array(np.zeros(g_np.shape, dtype=g_np.dtype), dev) func(a_nd, b_nd, g_nd) tvm.testing.assert_allclose(g_nd.asnumpy(), g_np, rtol=1e-3) @@ -466,8 +466,8 @@ def check(device, dtype, m=32, n=32): if not tvm.testing.device_enabled(device): print("Skipping", device) return - ctx = tvm.context(device, 0) - if dtype == "float16" and not have_fp16(ctx.compute_version): + dev = tvm.device(device, 0) + if dtype == "float16" and not have_fp16(dev.compute_version): print("Skip because gpu does not have fp16 support") return @@ -480,8 +480,8 @@ def check(device, dtype, m=32, n=32): func = tvm.build(sb, [a, b], device) a_np = np.random.uniform(size=(m, n)).astype(a.dtype) b_np = np.sum(a_np) - a_nd = tvm.nd.array(a_np, ctx) - b_nd = tvm.nd.array(np.zeros(b_np.shape, dtype=b_np.dtype), ctx) + a_nd = tvm.nd.array(a_np, dev) + b_nd = tvm.nd.array(np.zeros(b_np.shape, dtype=b_np.dtype), dev) func(a_nd, b_nd) tvm.testing.assert_allclose(b_nd.asnumpy(), b_np, rtol=1e-3) @@ -507,11 +507,11 @@ def test_cuda_floordiv_with_vectorization(): s[B].bind(xio, tx) func = tvm.build(s, [A, B], "cuda") - ctx = tvm.gpu(0) + dev = tvm.gpu(0) a_np = np.random.uniform(size=(n,)).astype(A.dtype) b_np = np.array([a_np[i // k] for i in range(0, n)]) - a_nd = tvm.nd.array(a_np, ctx) - b_nd = tvm.nd.array(np.zeros(b_np.shape, dtype=b_np.dtype), ctx) + a_nd = tvm.nd.array(a_np, dev) + b_nd = tvm.nd.array(np.zeros(b_np.shape, dtype=b_np.dtype), dev) func(a_nd, b_nd) tvm.testing.assert_allclose(b_nd.asnumpy(), b_np, rtol=1e-3) @@ -533,11 +533,11 @@ def test_cuda_floormod_with_vectorization(): s[B].bind(xio, tx) func = tvm.build(s, [A, B], "cuda") - ctx = tvm.gpu(0) + dev = tvm.gpu(0) a_np = np.random.uniform(size=(n,)).astype(A.dtype) b_np = np.array([a_np[i % k] for i in range(0, n)]) - a_nd = tvm.nd.array(a_np, ctx) - b_nd = tvm.nd.array(np.zeros(b_np.shape, dtype=b_np.dtype), ctx) + a_nd = tvm.nd.array(a_np, dev) + b_nd = tvm.nd.array(np.zeros(b_np.shape, dtype=b_np.dtype), dev) func(a_nd, b_nd) tvm.testing.assert_allclose(b_nd.asnumpy(), b_np, rtol=1e-3) @@ -564,14 +564,14 @@ def check(t0, t1, factor): func = tvm.build(s, [A, B, C], "cuda") # correctness - ctx = tvm.gpu(0) + dev = tvm.gpu(0) low, high = (0, 20) if t0.startswith("u") or t1.startswith("u") else (-10, 10) a_np = np.random.randint(low, 
high, size=n).astype(A.dtype) b_np = np.random.randint(low, high, size=n).astype(B.dtype) c_np = (a_np + b_np).astype(A.dtype) - a_nd = tvm.nd.array(a_np, ctx) - b_nd = tvm.nd.array(b_np, ctx) - c_nd = tvm.nd.array(np.zeros(c_np.shape, dtype=c_np.dtype), ctx) + a_nd = tvm.nd.array(a_np, dev) + b_nd = tvm.nd.array(b_np, dev) + c_nd = tvm.nd.array(np.zeros(c_np.shape, dtype=c_np.dtype), dev) func(a_nd, b_nd, c_nd) tvm.testing.assert_allclose(c_nd.asnumpy(), c_np, rtol=1e-3) @@ -665,9 +665,9 @@ def run_test(tvm_intrin, np_func, dtype): B = te.compute((n,), lambda *i: tvm_intrin(A(*i)), name="B") s = sched(B) f = tvm.build(s, [A, B], "cuda") - ctx = tvm.gpu(0) - a = tvm.nd.array(np.random.uniform(0, 1, size=n).astype(A.dtype), ctx) - b = tvm.nd.array(np.zeros(shape=(n,)).astype(A.dtype), ctx) + dev = tvm.gpu(0) + a = tvm.nd.array(np.random.uniform(0, 1, size=n).astype(A.dtype), dev) + b = tvm.nd.array(np.zeros(shape=(n,)).astype(A.dtype), dev) f(a, b) tvm.testing.assert_allclose(b.asnumpy(), np_func(a.asnumpy()), atol=1e-3, rtol=1e-3) @@ -691,9 +691,9 @@ def run_test(tvm_intrin, np_func): B = te.compute((n,), lambda i: tvm_intrin(A[i], c2), name="B") s = sched(B) f = tvm.build(s, [A, B], "cuda") - ctx = tvm.gpu(0) - a = tvm.nd.array(np.random.uniform(0, 1, size=n).astype(A.dtype), ctx) - b = tvm.nd.array(np.zeros(shape=(n,)).astype(A.dtype), ctx) + dev = tvm.gpu(0) + a = tvm.nd.array(np.random.uniform(0, 1, size=n).astype(A.dtype), dev) + b = tvm.nd.array(np.zeros(shape=(n,)).astype(A.dtype), dev) f(a, b) tvm.testing.assert_allclose(b.asnumpy(), np_func(a.asnumpy()), atol=1e-3, rtol=1e-3) @@ -717,9 +717,9 @@ def run_test(dtype): B = te.compute((n,), lambda i: tvm.tir.popcount(A[i]), name="B") s = sched(B) f = tvm.build(s, [A, B], "cuda") - ctx = tvm.gpu(0) - a = tvm.nd.array(np.random.randint(0, 100000, size=n).astype(A.dtype), ctx) - b = tvm.nd.array(np.zeros(shape=(n,)).astype(B.dtype), ctx) + dev = tvm.gpu(0) + a = tvm.nd.array(np.random.randint(0, 100000, size=n).astype(A.dtype), dev) + b = tvm.nd.array(np.zeros(shape=(n,)).astype(B.dtype), dev) f(a, b) ref = np.vectorize(ref_popcount)(a.asnumpy()) tvm.testing.assert_allclose(b.asnumpy(), ref) @@ -736,7 +736,7 @@ def check_cuda(dtype, n, l, padding, lanes): print("Skip because gpu does not have fp16 support") return - ctx = tvm.gpu(0) + dev = tvm.gpu(0) A = tvm.te.placeholder((n, l), name="A", dtype=dtype) B = tvm.te.compute( (n // lanes, l + 2 * padding, lanes), @@ -754,8 +754,8 @@ def check_cuda(dtype, n, l, padding, lanes): s[B].vectorize(vectorize) fun = tvm.build(s, [A, B], "cuda", name="vector_load_permute_pad") np_a = np.random.randint(low=-128, high=127, size=(n, l)).astype(A.dtype) - a = tvm.nd.empty((n, l), A.dtype, ctx).copyfrom(np_a) - b = tvm.nd.empty((n // lanes, l + padding * 2, lanes), B.dtype, ctx) + a = tvm.nd.empty((n, l), A.dtype, dev).copyfrom(np_a) + b = tvm.nd.empty((n // lanes, l + padding * 2, lanes), B.dtype, dev) fun(a, b) np_a_reshape = np_a.reshape(n // lanes, lanes, l).transpose(0, 2, 1) ref = np.pad( @@ -812,10 +812,10 @@ def post_visit(stmt): # To check if every vectorize loop transforms to correct instruction # print(mod.imported_modules[0].get_source()) - ctx = tvm.context("cuda", 0) - a = tvm.nd.array(np.random.uniform(size=(512, 512)).astype("float32"), ctx) - b = tvm.nd.array(np.random.uniform(size=(512, 512)).astype("float32"), ctx) - c = tvm.nd.array(np.zeros((512, 512), dtype="float32"), ctx) + dev = tvm.device("cuda", 0) + a = tvm.nd.array(np.random.uniform(size=(512, 512)).astype("float32"), dev) + 
b = tvm.nd.array(np.random.uniform(size=(512, 512)).astype("float32"), dev) + c = tvm.nd.array(np.zeros((512, 512), dtype="float32"), dev) mod(a, b, c) tvm.testing.assert_allclose(c.asnumpy(), np.dot(a.asnumpy(), b.asnumpy()), rtol=1e-5) @@ -957,10 +957,10 @@ def test_unrolled_vectorization(): s[CC].vectorize(j) # Check correctness - ctx = tvm.context(target) - a_tvm = tvm.nd.array(np.ones((N, N)).astype(dtype), ctx=ctx) - b_tvm = tvm.nd.array(np.ones((N, N)).astype(dtype), ctx=ctx) - c_tvm = tvm.nd.empty((N, N), ctx=ctx) + dev = tvm.device(target) + a_tvm = tvm.nd.array(np.ones((N, N)).astype(dtype), device=dev) + b_tvm = tvm.nd.array(np.ones((N, N)).astype(dtype), device=dev) + c_tvm = tvm.nd.empty((N, N), device=dev) func_tvm = tvm.build(s, [A, B, C], target=target) func_tvm(a_tvm, b_tvm, c_tvm) c_np = c_tvm.asnumpy() diff --git a/tests/python/unittest/test_target_codegen_device.py b/tests/python/unittest/test_target_codegen_device.py index 42ddfc516f7e..4ce7a021981d 100644 --- a/tests/python/unittest/test_target_codegen_device.py +++ b/tests/python/unittest/test_target_codegen_device.py @@ -37,10 +37,10 @@ def test_large_uint_imm(): def check_target(device): if not tvm.testing.device_enabled(device): return - ctx = tvm.context(device, 0) + dev = tvm.device(device, 0) f = tvm.build(s, [A], device) # launch the kernel. - a = tvm.nd.empty((n,), dtype=A.dtype, ctx=ctx) + a = tvm.nd.empty((n,), dtype=A.dtype, device=dev) f(a) assert a.asnumpy()[0] == value + 3 @@ -70,14 +70,14 @@ def test_add_pipeline(): def check_target(device, host="stackvm"): if not tvm.testing.device_enabled(device) or not tvm.testing.device_enabled(host): return - ctx = tvm.context(device, 0) + dev = tvm.device(device, 0) mhost = tvm.driver.build(s, [A, B, D], target=tvm.target.Target(device, host)) f = mhost.entry_func # launch the kernel. n = 1027 - a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx) - b = tvm.nd.array(np.random.uniform(size=()).astype(B.dtype), ctx) - d = tvm.nd.array(np.zeros(n, dtype=D.dtype), ctx) + a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev) + b = tvm.nd.array(np.random.uniform(size=()).astype(B.dtype), dev) + d = tvm.nd.array(np.zeros(n, dtype=D.dtype), dev) f(a, b, d) tvm.testing.assert_allclose(d.asnumpy(), a.asnumpy() + b.asnumpy() + 1) diff --git a/tests/python/unittest/test_target_codegen_extern.py b/tests/python/unittest/test_target_codegen_extern.py index 032b1059f583..2aefee97d649 100644 --- a/tests/python/unittest/test_target_codegen_extern.py +++ b/tests/python/unittest/test_target_codegen_extern.py @@ -68,11 +68,11 @@ def check_target(target): C = C_gpu if target in ["opencl", "cuda"] else C_cpu # build and invoke the kernel. f = tvm.build(s, [A, C], target) - ctx = tvm.context(target, 0) + dev = tvm.device(target, 0) # launch the kernel. n = nn - a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx) - c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx) + a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev) + c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev) f(a, c) tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + 1) @@ -102,11 +102,11 @@ def check_target(target): return # build and invoke the kernel. f = tvm.build(s, [A, C], target) - ctx = tvm.cpu(0) + dev = tvm.cpu(0) # launch the kernel. 
n = nn - a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx) - c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx) + a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev) + c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev) f(a, c) tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy()) @@ -133,11 +133,11 @@ def check_target(target): return # build and invoke the kernel. f = tvm.build(s, [A, C], target) - ctx = tvm.cpu(0) + dev = tvm.cpu(0) # launch the kernel. n = nn - a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx) - c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx) + a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev) + c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev) @tvm.register_func def my_extern_array_func2(aa, bb): diff --git a/tests/python/unittest/test_target_codegen_llvm.py b/tests/python/unittest/test_target_codegen_llvm.py index ec7c5aea333f..56a8514b30bf 100644 --- a/tests/python/unittest/test_target_codegen_llvm.py +++ b/tests/python/unittest/test_target_codegen_llvm.py @@ -104,9 +104,9 @@ def test_llvm_large_uintimm(): def check_llvm(): f = tvm.build(s, [A], "llvm") - ctx = tvm.cpu(0) + dev = tvm.cpu(0) # launch the kernel. - a = tvm.nd.empty((), dtype=A.dtype, ctx=ctx) + a = tvm.nd.empty((), dtype=A.dtype, device=dev) f(a) assert a.asnumpy() == value + 3 @@ -132,10 +132,10 @@ def test_llvm_persist_parallel(): def check_llvm(): # BUILD and invoke the kernel. f = tvm.build(s, [A, C], "llvm") - ctx = tvm.cpu(0) + dev = tvm.cpu(0) # launch the kernel. - a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx) - c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx) + a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev) + c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev) f(a, c) tvm.testing.assert_allclose(c.asnumpy(), np.sqrt(a.asnumpy() + 1) * 2 + 2, rtol=1e-5) @@ -154,11 +154,11 @@ def check_llvm(nn, base): s[C].vectorize(xi) # build and invoke the kernel. f = tvm.build(s, [A, C], "llvm") - ctx = tvm.cpu(0) + dev = tvm.cpu(0) # launch the kernel. n = nn - a = tvm.nd.array(np.random.uniform(size=(n + base)).astype(A.dtype), ctx) - c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx) + a = tvm.nd.array(np.random.uniform(size=(n + base)).astype(A.dtype), dev) + c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev) f(a, c) tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy()[::-1][:n]) @@ -184,10 +184,10 @@ def check_llvm(n, lanes): s[B].vectorize(xi) # build and invoke the kernel. f = tvm.build(s, [A, C], "llvm") - ctx = tvm.cpu(0) + dev = tvm.cpu(0) # launch the kernel. a = tvm.nd.empty((n,), A.dtype).copyfrom(np.random.uniform(size=(n, lanes))) - c = tvm.nd.empty((n,), C.dtype, ctx) + c = tvm.nd.empty((n,), C.dtype, dev) f(a, c) tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + 1) @@ -207,11 +207,11 @@ def check_llvm(nn, base, stride): s[C].vectorize(xi) # build and invoke the kernel. f = tvm.build(s, [A, C], "llvm") - ctx = tvm.cpu(0) + dev = tvm.cpu(0) # launch the kernel. n = nn - a = tvm.nd.array(np.random.uniform(size=(n + base, stride)).astype(A.dtype), ctx) - c = tvm.nd.array(np.zeros((n, stride), dtype=C.dtype), ctx) + a = tvm.nd.array(np.random.uniform(size=(n + base, stride)).astype(A.dtype), dev) + c = tvm.nd.array(np.zeros((n, stride), dtype=C.dtype), dev) f(a, c) tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy()[base:] + 1) @@ -234,11 +234,11 @@ def test_llvm_temp_space(): def check_llvm(): # build and invoke the kernel. 
f = tvm.build(s, [A, C], "llvm") - ctx = tvm.cpu(0) + dev = tvm.cpu(0) # launch the kernel. n = nn - a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx) - c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx) + a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev) + c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev) f(a, c) tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + 1 + 1) @@ -265,12 +265,12 @@ def check_llvm(): fadd2 = m["fadd2"] fadd1 = m["fadd1"] - ctx = tvm.cpu(0) + dev = tvm.cpu(0) # launch the kernel. n = nn - a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx) - b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx) - c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx) + a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev) + b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), dev) + c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev) fadd1(a, b, c) tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy()) fadd2(a, b, c) @@ -287,10 +287,10 @@ def check_llvm(n, offset): s = te.create_schedule(C.op) # build and invoke the kernel. f = tvm.build(s, [A, C], "llvm") - ctx = tvm.cpu(0) + dev = tvm.cpu(0) # launch the kernel. - a = tvm.nd.array(np.random.uniform(size=(n,)).astype(A.dtype), ctx) - c = tvm.nd.empty((n,), A.dtype, ctx) + a = tvm.nd.array(np.random.uniform(size=(n,)).astype(A.dtype), dev) + c = tvm.nd.empty((n,), A.dtype, dev) f(a, c) c_np = a.asnumpy() c_np[:offset] = 0 @@ -307,10 +307,10 @@ def check_llvm(n): s = te.create_schedule(C.op) # build and invoke the kernel. f = tvm.build(s, [A, C], "llvm") - ctx = tvm.cpu(0) + dev = tvm.cpu(0) # launch the kernel. - a = tvm.nd.array(np.random.randint(0, 2, size=(n,)).astype(A.dtype), ctx) - c = tvm.nd.empty((n,), C.dtype, ctx) + a = tvm.nd.array(np.random.randint(0, 2, size=(n,)).astype(A.dtype), dev) + c = tvm.nd.empty((n,), C.dtype, dev) f(a, c) c_np = a.asnumpy() == 1 tvm.testing.assert_allclose(c.asnumpy(), c_np) @@ -329,11 +329,11 @@ def check_llvm(n): s = te.create_schedule(D.op) # build and invoke the kernel. f = tvm.build(s, [A, scale, D], "llvm") - ctx = tvm.cpu(0) + dev = tvm.cpu(0) # launch the kernel. - a = tvm.nd.array(np.random.randint(0, 2, size=(n,)).astype(A.dtype), ctx) - sc = tvm.nd.array(np.random.randint(0, 2, size=()).astype(scale.dtype), ctx) - d = tvm.nd.empty((), D.dtype, ctx) + a = tvm.nd.array(np.random.randint(0, 2, size=(n,)).astype(A.dtype), dev) + sc = tvm.nd.array(np.random.randint(0, 2, size=()).astype(scale.dtype), dev) + d = tvm.nd.empty((), D.dtype, dev) f(a, sc, d) d_np = np.sum(a.asnumpy()) * sc.asnumpy() + 1 tvm.testing.assert_allclose(d.asnumpy(), d_np) @@ -353,11 +353,11 @@ def check_llvm(n): s = te.create_schedule(D.op) # build and invoke the kernel. f = tvm.build(s, [A, scale, D], "llvm") - ctx = tvm.cpu(0) + dev = tvm.cpu(0) # launch the kernel. 
- a = tvm.nd.array(np.random.randint(0, 2, size=(n,)).astype(A.dtype), ctx) - sc = tvm.nd.array(np.random.randint(0, 2, size=()).astype(scale.dtype), ctx) - d = tvm.nd.empty((), D.dtype, ctx) + a = tvm.nd.array(np.random.randint(0, 2, size=(n,)).astype(A.dtype), dev) + sc = tvm.nd.array(np.random.randint(0, 2, size=()).astype(scale.dtype), dev) + d = tvm.nd.empty((), D.dtype, dev) f(a, sc, d) d_np = np.sum(a.asnumpy()) * sc.asnumpy() + 1 tvm.testing.assert_allclose(d.asnumpy(), d_np) @@ -810,8 +810,8 @@ def do_atomic_add(A): s = tvm.te.create_schedule(C.op) f = tvm.build(s, [A], target="nvptx") - ctx = tvm.gpu() - a = tvm.nd.array(np.zeros((size,)).astype(A.dtype), ctx) + dev = tvm.gpu() + a = tvm.nd.array(np.zeros((size,)).astype(A.dtype), dev) f(a) ref = np.zeros((size,)).astype(A.dtype) ref[0] = size diff --git a/tests/python/unittest/test_target_codegen_opencl.py b/tests/python/unittest/test_target_codegen_opencl.py index 8a070da89641..98340f0e6ac5 100644 --- a/tests/python/unittest/test_target_codegen_opencl.py +++ b/tests/python/unittest/test_target_codegen_opencl.py @@ -24,7 +24,7 @@ @tvm.testing.requires_gpu @tvm.testing.requires_opencl def test_opencl_ternary_expression(): - def check_if_then_else(ctx, n, dtype): + def check_if_then_else(dev, n, dtype): A = te.placeholder((n,), name="A", dtype=dtype) true_value = tvm.tir.const(1, dtype=dtype) false_value = tvm.tir.const(3, dtype=dtype) @@ -35,12 +35,12 @@ def check_if_then_else(ctx, n, dtype): s[C].bind(s[C].op.axis[0], te.thread_axis("threadIdx.x")) fun = tvm.build(s, [A, C], target) - a = tvm.nd.empty((n,), A.dtype, ctx) - c = tvm.nd.empty((n,), A.dtype, ctx) + a = tvm.nd.empty((n,), A.dtype, dev) + c = tvm.nd.empty((n,), A.dtype, dev) # Only need to test compiling here fun(a, c) - def check_select(ctx, n, dtype): + def check_select(dev, n, dtype): A = te.placeholder((n,), name="A", dtype=dtype) true_value = tvm.tir.const(1, dtype=dtype) false_value = tvm.tir.const(3, dtype=dtype) @@ -51,52 +51,52 @@ def check_select(ctx, n, dtype): s[C].bind(s[C].op.axis[0], te.thread_axis("threadIdx.x")) fun = tvm.build(s, [A, C], target) - a = tvm.nd.empty((n,), A.dtype, ctx) - c = tvm.nd.empty((n,), A.dtype, ctx) + a = tvm.nd.empty((n,), A.dtype, dev) + c = tvm.nd.empty((n,), A.dtype, dev) # Only need to test compiling here fun(a, c) - ctx = tvm.context(target, 0) + dev = tvm.device(target, 0) - check_if_then_else(ctx, 1, "int8") - check_if_then_else(ctx, 1, "uint8") - check_if_then_else(ctx, 1, "int16") - check_if_then_else(ctx, 1, "uint16") - check_select(ctx, 1, "int8") - check_select(ctx, 1, "uint8") - check_select(ctx, 1, "int16") - check_select(ctx, 1, "uint16") + check_if_then_else(dev, 1, "int8") + check_if_then_else(dev, 1, "uint8") + check_if_then_else(dev, 1, "int16") + check_if_then_else(dev, 1, "uint16") + check_select(dev, 1, "int8") + check_select(dev, 1, "uint8") + check_select(dev, 1, "int16") + check_select(dev, 1, "uint16") @tvm.testing.requires_gpu @tvm.testing.requires_opencl def test_opencl_inf_nan(): - def check_inf_nan(ctx, n, value, dtype): + def check_inf_nan(dev, n, value, dtype): A = te.placeholder((n,), name="A", dtype=dtype) inf_value = tvm.tir.const(value, dtype=dtype) C = te.compute((n,), lambda i: inf_value, name="C") s = te.create_schedule(C.op) s[C].bind(s[C].op.axis[0], te.thread_axis("threadIdx.x")) fun = tvm.build(s, [A, C], target) - a = tvm.nd.empty((n,), A.dtype, ctx) - c = tvm.nd.empty((n,), A.dtype, ctx) + a = tvm.nd.empty((n,), A.dtype, dev) + c = tvm.nd.empty((n,), A.dtype, dev) # Only need to 
test compiling here fun(a, c) - ctx = tvm.context(target, 0) + dev = tvm.device(target, 0) - check_inf_nan(ctx, 1, -float("inf"), "float32") - check_inf_nan(ctx, 1, -float("inf"), "float64") - check_inf_nan(ctx, 1, float("inf"), "float32") - check_inf_nan(ctx, 1, float("inf"), "float64") - check_inf_nan(ctx, 1, float("nan"), "float32") - check_inf_nan(ctx, 1, float("nan"), "float64") + check_inf_nan(dev, 1, -float("inf"), "float32") + check_inf_nan(dev, 1, -float("inf"), "float64") + check_inf_nan(dev, 1, float("inf"), "float32") + check_inf_nan(dev, 1, float("inf"), "float64") + check_inf_nan(dev, 1, float("nan"), "float32") + check_inf_nan(dev, 1, float("nan"), "float64") @tvm.testing.requires_gpu @tvm.testing.requires_opencl def test_opencl_max(): - def check_max(ctx, n, dtype): + def check_max(dev, n, dtype): A = te.placeholder((n,), name="A", dtype=dtype) max_lhs = A[0] + tvm.tir.const(1, dtype=dtype) max_rhs = tvm.tir.const(0, dtype=dtype) @@ -105,19 +105,19 @@ def check_max(ctx, n, dtype): s[C].bind(s[C].op.axis[0], te.thread_axis("threadIdx.x")) fun = tvm.build(s, [A, C], target) - a = tvm.nd.empty((n,), A.dtype, ctx) - c = tvm.nd.empty((n,), A.dtype, ctx) + a = tvm.nd.empty((n,), A.dtype, dev) + c = tvm.nd.empty((n,), A.dtype, dev) # Only need to test compiling here fun(a, c) - ctx = tvm.context(target, 0) + dev = tvm.device(target, 0) - check_max(ctx, 1, "int8") - check_max(ctx, 1, "uint8") - check_max(ctx, 1, "int16") - check_max(ctx, 1, "uint16") - check_max(ctx, 1, "float32") - check_max(ctx, 1, "float64") + check_max(dev, 1, "int8") + check_max(dev, 1, "uint8") + check_max(dev, 1, "int16") + check_max(dev, 1, "uint16") + check_max(dev, 1, "float32") + check_max(dev, 1, "float64") if __name__ == "__main__": diff --git a/tests/python/unittest/test_target_codegen_rocm.py b/tests/python/unittest/test_target_codegen_rocm.py index 36a659b07f9b..9eb0b5cf938d 100644 --- a/tests/python/unittest/test_target_codegen_rocm.py +++ b/tests/python/unittest/test_target_codegen_rocm.py @@ -46,35 +46,35 @@ def test_rocm_cross_thread_reduction(): frocm = tvm.build(s, [A, B], "rocm") nn = 128 - ctx = tvm.rocm(0) - a = tvm.nd.array(np.random.uniform(size=(nn, nn)).astype(A.dtype), ctx) - b = tvm.nd.array(np.zeros(nn, dtype=B.dtype), ctx) + dev = tvm.rocm(0) + a = tvm.nd.array(np.random.uniform(size=(nn, nn)).astype(A.dtype), dev) + b = tvm.nd.array(np.zeros(nn, dtype=B.dtype), dev) frocm(a, b) tvm.testing.assert_allclose(b.asnumpy(), np.sum(a.asnumpy(), axis=1), rtol=1e-4) @tvm.testing.requires_rocm def test_rocm_inf_nan(): - def check_inf_nan(ctx, n, value, dtype): + def check_inf_nan(dev, n, value, dtype): A = te.placeholder((n,), name="A", dtype=dtype) inf_value = tvm.tir.const(value, dtype=dtype) C = te.compute((n,), lambda i: inf_value, name="C") s = te.create_schedule(C.op) s[C].bind(s[C].op.axis[0], tx) fun = tvm.build(s, [A, C], "rocm") - a = tvm.nd.empty((n,), A.dtype, ctx) - c = tvm.nd.empty((n,), A.dtype, ctx) + a = tvm.nd.empty((n,), A.dtype, dev) + c = tvm.nd.empty((n,), A.dtype, dev) # Only need to test compiling here fun(a, c) - ctx = tvm.rocm(0) + dev = tvm.rocm(0) - check_inf_nan(ctx, 1, -float("inf"), "float32") - check_inf_nan(ctx, 1, -float("inf"), "float64") - check_inf_nan(ctx, 1, float("inf"), "float32") - check_inf_nan(ctx, 1, float("inf"), "float64") - check_inf_nan(ctx, 1, float("nan"), "float32") - check_inf_nan(ctx, 1, float("nan"), "float64") + check_inf_nan(dev, 1, -float("inf"), "float32") + check_inf_nan(dev, 1, -float("inf"), "float64") + check_inf_nan(dev, 1, 
float("inf"), "float32") + check_inf_nan(dev, 1, float("inf"), "float64") + check_inf_nan(dev, 1, float("nan"), "float32") + check_inf_nan(dev, 1, float("nan"), "float64") @tvm.testing.requires_rocm @@ -94,9 +94,9 @@ def test_rocm_reduction_binding(): def test_rocm_copy(): def check_rocm(dtype, n): A = te.placeholder((n,), name="A", dtype=dtype) - ctx = tvm.rocm(0) + dev = tvm.rocm(0) a_np = np.random.uniform(size=(n,)).astype(A.dtype) - a = tvm.nd.empty((n,), A.dtype, ctx).copyfrom(a_np) + a = tvm.nd.empty((n,), A.dtype, dev).copyfrom(a_np) b_np = a.asnumpy() tvm.testing.assert_allclose(a_np, b_np) tvm.testing.assert_allclose(a_np, a.asnumpy()) @@ -120,9 +120,9 @@ def check_rocm(dtype, n, lanes): s[B].bind(xo, bx) s[B].bind(xi, tx) fun = tvm.build(s, [A, B], "rocm") - ctx = tvm.rocm(0) - a = tvm.nd.empty((n,), A.dtype, ctx).copyfrom(np.random.uniform(size=(n, lanes))) - c = tvm.nd.empty((n,), B.dtype, ctx) + dev = tvm.rocm(0) + a = tvm.nd.empty((n,), A.dtype, dev).copyfrom(np.random.uniform(size=(n, lanes))) + c = tvm.nd.empty((n,), B.dtype, dev) fun(a, c) tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + 1) diff --git a/tests/python/unittest/test_target_codegen_spirv.py b/tests/python/unittest/test_target_codegen_spirv.py index 68be5c480358..df42eeb721ab 100644 --- a/tests/python/unittest/test_target_codegen_spirv.py +++ b/tests/python/unittest/test_target_codegen_spirv.py @@ -62,27 +62,28 @@ def do_copy(A, B, n): with tvm.transform.PassContext(opt_level=3): func = tvm.build(s, [A, B], target) - ctx = tvm.context(target, 0) + dev = tvm.device(target, 0) a_np = np.random.uniform(size=n) > 0.5 b_np = np.zeros((n,), dtype="int32") - a = tvm.nd.array(a_np, ctx) - b = tvm.nd.array(b_np, ctx) + a = tvm.nd.array(a_np, dev) + b = tvm.nd.array(b_np, dev) func(a, b) ref = a_np.astype(np.int32) tvm.testing.assert_allclose(b.asnumpy(), ref) +def check_mod(mod, x_np, res_np): + target = "vulkan" + dev = tvm.device(target, 0) + ex = relay.create_executor("vm", mod=mod, device=dev, target=target) + res = ex.evaluate()(x_np).asnumpy() + tvm.testing.assert_allclose(res, res_np, atol=1e-5) + + def test_pushconstants(): if not tvm.testing.device_enabled("vulkan"): return - def check_mod(mod, x_np, res_np): - target = "vulkan" - ctx = tvm.context(target, 0) - ex = relay.create_executor("vm", mod=mod, ctx=ctx, target=target) - res = ex.evaluate()(x_np).asnumpy() - tvm.testing.assert_allclose(res, res_np, atol=1e-5) - # Three 32 bit pushconstants: any_dim, stride, stride dtype = "float32" x = relay.var("x", shape=(relay.Any(),), dtype=dtype) @@ -104,6 +105,21 @@ def check_mod(mod, x_np, res_np): check_mod(mod, x_np, res_np) +def test_unique(): + if not tvm.testing.device_enabled("vulkan"): + return + + dtype = "int32" + x = relay.var("x", shape=(relay.Any(),), dtype=dtype) + mod = tvm.IRModule() + [unique, _, num_unique] = relay.unique(x, is_sorted=True) + mod["main"] = relay.Function([x], relay.op.strided_slice(unique, begin=[0], end=num_unique)) + x_np = np.random.randint(0, high=10, size=(10,)).astype(dtype) + res_np = np.unique(x_np) + check_mod(mod, x_np, res_np) + + if __name__ == "__main__": test_bool_load() test_pushconstants() + test_unique() diff --git a/tests/python/unittest/test_target_codegen_vulkan.py b/tests/python/unittest/test_target_codegen_vulkan.py index d53045613bee..e68996df531f 100644 --- a/tests/python/unittest/test_target_codegen_vulkan.py +++ b/tests/python/unittest/test_target_codegen_vulkan.py @@ -65,9 +65,9 @@ def check_correct_assembly(dtype): def test_vulkan_copy(): def 
check_vulkan(dtype, n): A = te.placeholder((n,), name="A", dtype=dtype) - ctx = tvm.vulkan(0) + dev = tvm.vulkan(0) a_np = np.random.uniform(size=(n,)).astype(A.dtype) - a = tvm.nd.empty((n,), A.dtype, ctx).copyfrom(a_np) + a = tvm.nd.empty((n,), A.dtype, dev).copyfrom(a_np) b_np = a.asnumpy() tvm.testing.assert_allclose(a_np, b_np) tvm.testing.assert_allclose(a_np, a.asnumpy()) @@ -91,9 +91,9 @@ def check_vulkan(dtype, n, lanes): s[B].bind(xo, bx) s[B].bind(xi, tx) fun = tvm.build(s, [A, B], "vulkan") - ctx = tvm.vulkan(0) - a = tvm.nd.empty((n,), A.dtype, ctx).copyfrom(np.random.uniform(size=(n, lanes))) - c = tvm.nd.empty((n,), B.dtype, ctx) + dev = tvm.vulkan(0) + a = tvm.nd.empty((n,), A.dtype, dev).copyfrom(np.random.uniform(size=(n, lanes))) + c = tvm.nd.empty((n,), B.dtype, dev) fun(a, c) tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + 1) @@ -139,10 +139,10 @@ def build_f(f_ref): fs = [ build_f(random.choice(functions)) for _ in range(np.random.randint(low=1, high=10)) ] - ctx = tvm.vulkan(0) - a = tvm.nd.empty((n,), A.dtype, ctx).copyfrom(np.random.uniform(size=(n,))) - b = tvm.nd.empty((n,), B.dtype, ctx).copyfrom(np.random.uniform(size=(n,))) - cs = [tvm.nd.empty((n,), A.dtype, ctx) for _ in fs] + dev = tvm.vulkan(0) + a = tvm.nd.empty((n,), A.dtype, dev).copyfrom(np.random.uniform(size=(n,))) + b = tvm.nd.empty((n,), B.dtype, dev).copyfrom(np.random.uniform(size=(n,))) + cs = [tvm.nd.empty((n,), A.dtype, dev) for _ in fs] for ((f, _), c) in zip(fs, cs): f(a, b, c) diff --git a/tests/python/unittest/test_te_autodiff.py b/tests/python/unittest/test_te_autodiff.py index b2f26471d267..59b20bd11e75 100644 --- a/tests/python/unittest/test_te_autodiff.py +++ b/tests/python/unittest/test_te_autodiff.py @@ -31,7 +31,7 @@ def check_grad( inputs = inputs if isinstance(inputs, list) else [inputs] def check_device(device, host="llvm"): - ctx = tvm.context(device, 0) + dev = tvm.device(device, 0) if not tvm.testing.device_enabled(host): return diff --git a/tests/python/unittest/test_te_hybrid_script.py b/tests/python/unittest/test_te_hybrid_script.py index be9956529dcc..f3091c7b71a6 100644 --- a/tests/python/unittest/test_te_hybrid_script.py +++ b/tests/python/unittest/test_te_hybrid_script.py @@ -32,7 +32,7 @@ def tvm_val_2_py_val(val): assert isinstance(val, (tvm.tir.IntImm,)) return val.value - ctx = tvm.context(target, 0) + dev = tvm.device(target, 0) op = None if sch is None: @@ -50,7 +50,7 @@ def tvm_val_2_py_val(val): if isinstance(i, te.tensor.Tensor): shape = [tvm_val_2_py_val(j) for j in i.shape] emu_args.append(numpy.random.randn(*shape).astype(i.dtype)) - nd_args.append(tvm.nd.array(emu_args[-1], ctx)) + nd_args.append(tvm.nd.array(emu_args[-1], dev)) elif isinstance(i, tvm.tir.Var): emu_args.append(tvm_val_2_py_val(i)) nd_args.append(emu_args[-1]) @@ -68,7 +68,7 @@ def tvm_val_2_py_val(val): for i in range(op.num_outputs): output = op.output(i) shape = [tvm_val_2_py_val(j) for j in output.shape] - nd_args.append(tvm.nd.array(numpy.zeros(shape).astype(output.dtype), ctx)) + nd_args.append(tvm.nd.array(numpy.zeros(shape).astype(output.dtype), dev)) out_tensors.append(nd_args[-1]) ref_data = func(*emu_args) diff --git a/tests/python/unittest/test_te_schedule_postproc_rewrite_for_tensor_core.py b/tests/python/unittest/test_te_schedule_postproc_rewrite_for_tensor_core.py index 88cf66e6f03b..e7a8469a8311 100644 --- a/tests/python/unittest/test_te_schedule_postproc_rewrite_for_tensor_core.py +++ b/tests/python/unittest/test_te_schedule_postproc_rewrite_for_tensor_core.py @@ 
-100,15 +100,15 @@ def tensor_core_matmul(warp_tile_m=16, m=64, n=32, l=96): func = tvm.build(s, [A, B, C], "cuda") - ctx = tvm.gpu(0) + dev = tvm.gpu(0) a_np = np.random.uniform(size=(n, l)).astype(A.dtype) b_np = np.random.uniform(size=(l, m)).astype(B.dtype) c_np = np.zeros((n, m), dtype=np.float32) - a = tvm.nd.array(a_np, ctx) - b = tvm.nd.array(b_np, ctx) - c = tvm.nd.array(np.zeros((n, m), dtype=C.dtype), ctx) + a = tvm.nd.array(a_np, dev) + b = tvm.nd.array(b_np, dev) + c = tvm.nd.array(np.zeros((n, m), dtype=C.dtype), dev) func(a, b, c) - evaluator = func.time_evaluator(func.entry_name, ctx, number=3) + evaluator = func.time_evaluator(func.entry_name, dev, number=3) print("gemm m=%d n=%d k=%d: %f ms" % (m, n, l, evaluator(a, b, c).mean * 1e3)) c_np = np.dot(a_np, b_np) @@ -195,15 +195,15 @@ def tensor_core_batch_matmul(warp_tile_m=16, m=64, n=32, l=96, batch=2): func = tvm.build(s, [A, B, C], "cuda") - ctx = tvm.gpu(0) + dev = tvm.gpu(0) a_np = np.random.uniform(size=(batch, n, l)).astype(A.dtype) b_np = np.random.uniform(size=(batch, l, m)).astype(B.dtype) c_np = np.zeros((batch, n, m), dtype=np.float32) - a = tvm.nd.array(a_np, ctx) - b = tvm.nd.array(b_np, ctx) - c = tvm.nd.array(np.zeros((batch, n, m), dtype=C.dtype), ctx) + a = tvm.nd.array(a_np, dev) + b = tvm.nd.array(b_np, dev) + c = tvm.nd.array(np.zeros((batch, n, m), dtype=C.dtype), dev) func(a, b, c) - evaluator = func.time_evaluator(func.entry_name, ctx, number=3) + evaluator = func.time_evaluator(func.entry_name, dev, number=3) print( "batch gemm m=%d n=%d k=%d batch=%d: %f ms" % (m, n, l, batch, evaluator(a, b, c).mean * 1e3) diff --git a/tests/python/unittest/test_te_schedule_tensor_core.py b/tests/python/unittest/test_te_schedule_tensor_core.py index 01da1a1a0db2..9491425b3866 100644 --- a/tests/python/unittest/test_te_schedule_tensor_core.py +++ b/tests/python/unittest/test_te_schedule_tensor_core.py @@ -256,14 +256,14 @@ def test_tensor_core_batch_matmal(): func = tvm.build(s, [A, B, C], "cuda") - ctx = tvm.gpu(0) + dev = tvm.gpu(0) a_np = np.random.uniform(size=(batch_size, nn, ll, 32, 16)).astype(A.dtype) b_np = np.random.uniform(size=(batch_size, ll, mm, 16, 8)).astype(B.dtype) - a = tvm.nd.array(a_np, ctx) - b = tvm.nd.array(b_np, ctx) - c = tvm.nd.array(np.zeros((batch_size, nn, mm, 32, 8), dtype=C.dtype), ctx) + a = tvm.nd.array(a_np, dev) + b = tvm.nd.array(b_np, dev) + c = tvm.nd.array(np.zeros((batch_size, nn, mm, 32, 8), dtype=C.dtype), dev) func(a, b, c) - evaluator = func.time_evaluator(func.entry_name, ctx, number=3) + evaluator = func.time_evaluator(func.entry_name, dev, number=3) print("gemm with tensor core: %f ms" % (evaluator(a, b, c).mean * 1e3)) if VERIFY: @@ -432,13 +432,13 @@ def test_tensor_core_batch_conv(): func = tvm.build(s, [A, W, Conv], "cuda") - ctx = tvm.gpu(0) + dev = tvm.gpu(0) a_np = np.random.uniform(size=data_shape).astype(A.dtype) w_np = np.random.uniform(size=kernel_shape).astype(W.dtype) - a = tvm.nd.array(a_np, ctx) - w = tvm.nd.array(w_np, ctx) - c = tvm.nd.array(np.zeros(output_shape, dtype=Conv.dtype), ctx) - evaluator = func.time_evaluator(func.entry_name, ctx, number=3) + a = tvm.nd.array(a_np, dev) + w = tvm.nd.array(w_np, dev) + c = tvm.nd.array(np.zeros(output_shape, dtype=Conv.dtype), dev) + evaluator = func.time_evaluator(func.entry_name, dev, number=3) print("conv2d with tensor core: %f ms" % (evaluator(a, w, c).mean * 1e3)) if VERIFY: diff --git a/tests/python/unittest/test_te_tensor_overload.py b/tests/python/unittest/test_te_tensor_overload.py index 
3126d8484e35..33dc19a19be9 100644 --- a/tests/python/unittest/test_te_tensor_overload.py +++ b/tests/python/unittest/test_te_tensor_overload.py @@ -77,12 +77,12 @@ def test_combination(): D = k + A - B * C + x s = te.create_schedule(D.op) foo = tvm.build(s, [x, A, B, C, D], "llvm") - ctx = tvm.cpu(0) + dev = tvm.cpu(0) x = 2 - a = tvm.nd.array(np.random.uniform(size=(n, m)).astype(A.dtype), ctx) - b = tvm.nd.array(np.random.uniform(size=(n, m)).astype(B.dtype), ctx) - c = tvm.nd.array(np.random.uniform(size=(n, m)).astype(C.dtype), ctx) - d = tvm.nd.array(np.zeros((n, m), dtype=D.dtype), ctx) + a = tvm.nd.array(np.random.uniform(size=(n, m)).astype(A.dtype), dev) + b = tvm.nd.array(np.random.uniform(size=(n, m)).astype(B.dtype), dev) + c = tvm.nd.array(np.random.uniform(size=(n, m)).astype(C.dtype), dev) + d = tvm.nd.array(np.zeros((n, m), dtype=D.dtype), dev) foo(x, a, b, c, d) tvm.testing.assert_allclose(d.asnumpy(), k + a.asnumpy() - b.asnumpy() * c.asnumpy() + x) @@ -107,7 +107,7 @@ def check_device(device): if not tvm.testing.device_enabled(device): print("Skip because %s is not enabled" % device) return - ctx = tvm.context(device, 0) + dev = tvm.device(device, 0) print("Running on target: %s" % device) with tvm.target.Target(device): s = tvm.topi.testing.get_elemwise_schedule(device)(B) @@ -126,8 +126,8 @@ def check_device(device): else: raise NotImplementedError() - a_nd = tvm.nd.array(a_npy, ctx) - b_nd = tvm.nd.array(np.empty(b_npy.shape).astype(B.dtype), ctx) + a_nd = tvm.nd.array(a_npy, dev) + b_nd = tvm.nd.array(np.empty(b_npy.shape).astype(B.dtype), dev) foo(a_nd, b_nd, k_, *shape) tvm.testing.assert_allclose(b_nd.asnumpy(), b_npy, rtol=1e-5) @@ -150,7 +150,7 @@ def verify_broadcast_bop(lhs_shape, rhs_shape, typ="add"): raise NotImplementedError() def check_device(device): - ctx = tvm.context(device, 0) + dev = tvm.device(device, 0) if not tvm.testing.device_enabled(device): print("Skip because %s is not enabled" % device) return @@ -173,9 +173,9 @@ def check_device(device): else: raise NotImplementedError() - lhs_nd = tvm.nd.array(lhs_npy, ctx) - rhs_nd = tvm.nd.array(rhs_npy, ctx) - out_nd = tvm.nd.array(np.empty(out_npy.shape).astype(B.dtype), ctx) + lhs_nd = tvm.nd.array(lhs_npy, dev) + rhs_nd = tvm.nd.array(rhs_npy, dev) + out_nd = tvm.nd.array(np.empty(out_npy.shape).astype(B.dtype), dev) for _ in range(1): foo(lhs_nd, rhs_nd, out_nd) tvm.testing.assert_allclose(out_nd.asnumpy(), out_npy, rtol=1e-4, atol=1e-4) @@ -189,7 +189,7 @@ def verify_conv2d_scalar_bop( batch, in_size, in_channel, num_filter, kernel, stride, padding, typ="add" ): def check_device(device): - ctx = tvm.context(device, 0) + dev = tvm.device(device, 0) if not tvm.testing.device_enabled(device): print("Skip because %s is not enabled" % device) return @@ -232,10 +232,10 @@ def check_device(device): else: raise NotImplementedError() - a_nd = tvm.nd.array(a_npy, ctx) - w_nd = tvm.nd.array(w_npy, ctx) - b_nd = tvm.nd.array(np.empty(b_npy.shape).astype(B.dtype), ctx) - c_nd = tvm.nd.array(np.empty(c_npy.shape).astype(C.dtype), ctx) + a_nd = tvm.nd.array(a_npy, dev) + w_nd = tvm.nd.array(w_npy, dev) + b_nd = tvm.nd.array(np.empty(b_npy.shape).astype(B.dtype), dev) + c_nd = tvm.nd.array(np.empty(c_npy.shape).astype(C.dtype), dev) foo(a_nd, w_nd, b_nd, c_nd) tvm.testing.assert_allclose(c_nd.asnumpy(), c_npy, rtol=1e-4, atol=1e-4) diff --git a/tests/python/unittest/test_tir_analysis_get_block_access_region.py b/tests/python/unittest/test_tir_analysis_get_block_access_region.py new file mode 100644 index 
000000000000..7e4d7d87c1e1
--- /dev/null
+++ b/tests/python/unittest/test_tir_analysis_get_block_access_region.py
@@ -0,0 +1,57 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+import tvm
+from tvm import tir, script
+from tvm.ir import Range
+
+
+@tvm.script.tir
+def func() -> None:
+    A = tir.alloc_buffer((128, 128), "float32")
+    B = tir.alloc_buffer((128, 128), "float32")
+    C = tir.alloc_buffer((128, 128), "float32")
+    D = tir.alloc_buffer((128, 128), "float32")
+    with tir.block([]):
+        # Need to add the read/write regions manually to avoid triggering the block access region detector
+        tir.reads([B[0, 0], C[0:16, 0:16], A[4:12, 4:12]])
+        tir.writes([A[0:12, 0:12]])
+        for i, j in tir.grid(8, 8):
+            A[i, j] = B[0, 0] + C[0, 0]
+        with tir.block([2, 2]) as [vi, vj]:
+            tir.reads([A[vi * 4 + 4 : vi * 4 + 8, vj * 4 + 4 : vj * 4 + 8], C[12:16, 12:16]])
+            tir.writes([A[vi * 4 + 4 : vi * 4 + 8, vj * 4 + 4 : vj * 4 + 8]])
+            for i, j in tir.grid(4, 4):
+                A[vi * 4 + 4 + i, vj * 4 + 4 + j] += C[i + 12, j + 12]
+    tir.evaluate(D.data)
+
+
+def test_block_access_region_detector():
+    block = func.body.block.body.block
+    alloc_buffers = func.body.block.alloc_buffers
+    buffer_var_map = {buf.data: buf for buf in alloc_buffers}
+    ret = tir.analysis.get_block_access_region(block, buffer_var_map)
+
+    tvm.ir.assert_structural_equal(block.reads, ret[0])
+    tvm.ir.assert_structural_equal(block.writes, ret[1])
+    D = alloc_buffers[-1]
+    tvm.ir.assert_structural_equal(
+        [tvm.tir.BufferRegion(D, [Range(0, 128), Range(0, 128)])], ret[2]
+    )
+
+
+if __name__ == "__main__":
+    test_block_access_region_detector()
diff --git a/tests/python/unittest/test_tir_buffer.py b/tests/python/unittest/test_tir_buffer.py
index fcbab046dab3..de03cddfb50c 100644
--- a/tests/python/unittest/test_tir_buffer.py
+++ b/tests/python/unittest/test_tir_buffer.py
@@ -149,10 +149,10 @@ def test_buffer_broadcast():
     def check():
         fadd = tvm.build(s, [A, B, C], target="llvm", name="bcast_add", binds={A: Ab, B: Bb})
-        ctx = tvm.cpu(0)
-        a = tvm.nd.array(np.random.uniform(size=(2, 4, 3)).astype(A.dtype), ctx)
-        b = tvm.nd.array(np.random.uniform(size=(2, 1, 1)).astype(B.dtype), ctx)
-        c = tvm.nd.array(np.zeros((2, 4, 3), dtype=C.dtype), ctx)
+        dev = tvm.cpu(0)
+        a = tvm.nd.array(np.random.uniform(size=(2, 4, 3)).astype(A.dtype), dev)
+        b = tvm.nd.array(np.random.uniform(size=(2, 1, 1)).astype(B.dtype), dev)
+        c = tvm.nd.array(np.zeros((2, 4, 3), dtype=C.dtype), dev)
         fadd(a, b, c)
         tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy())
@@ -178,10 +178,10 @@ def check_stride():
         fadd = tvm.build(
             s, [A, B, C, o1, x], target="llvm", name="bcast_add", binds={A: Ab, B: Bb, C: Cc}
         )
-        ctx = tvm.cpu(0)
-        a = tvm.nd.array(np.random.uniform(size=(2, 4)).astype(A.dtype), ctx)
-        b = tvm.nd.array(np.random.uniform(size=(2, 
4)).astype(B.dtype), ctx) - c = tvm.nd.array(np.zeros((2, 4), dtype=C.dtype), ctx) + dev = tvm.cpu(0) + a = tvm.nd.array(np.random.uniform(size=(2, 4)).astype(A.dtype), dev) + b = tvm.nd.array(np.random.uniform(size=(2, 4)).astype(B.dtype), dev) + c = tvm.nd.array(np.zeros((2, 4), dtype=C.dtype), dev) fadd(a, b, c, 4, 1) tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy()) @@ -189,20 +189,20 @@ def check_no_stride(): fadd = tvm.build( s, [A, B, C, o1, x], target="llvm", name="bcast_add", binds={A: Ab, B: Bb, C: Cc} ) - ctx = tvm.cpu(0) - a = tvm.nd.array(np.random.uniform(size=(1, 4)).astype(A.dtype), ctx) - b = tvm.nd.array(np.random.uniform(size=(2, 4)).astype(B.dtype), ctx) - c = tvm.nd.array(np.zeros((2, 4), dtype=C.dtype), ctx) + dev = tvm.cpu(0) + a = tvm.nd.array(np.random.uniform(size=(1, 4)).astype(A.dtype), dev) + b = tvm.nd.array(np.random.uniform(size=(2, 4)).astype(B.dtype), dev) + c = tvm.nd.array(np.zeros((2, 4), dtype=C.dtype), dev) fadd(a, b, c, 4, 1) tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy()) def check_auto_bind(): # Let build bind buffers fadd = tvm.build(s, [A, B, C, o1, x], target="llvm", name="bcast_add") - ctx = tvm.cpu(0) - a = tvm.nd.array(np.random.uniform(size=(1, 4)).astype(A.dtype), ctx) - b = tvm.nd.array(np.random.uniform(size=(2, 4)).astype(B.dtype), ctx) - c = tvm.nd.array(np.zeros((2, 4), dtype=C.dtype), ctx) + dev = tvm.cpu(0) + a = tvm.nd.array(np.random.uniform(size=(1, 4)).astype(A.dtype), dev) + b = tvm.nd.array(np.random.uniform(size=(2, 4)).astype(B.dtype), dev) + c = tvm.nd.array(np.zeros((2, 4), dtype=C.dtype), dev) fadd(a, b, c, 4, 1) tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy()) diff --git a/tests/python/unittest/test_tir_intrin.py b/tests/python/unittest/test_tir_intrin.py index 76390dace757..755ffdf213b3 100644 --- a/tests/python/unittest/test_tir_intrin.py +++ b/tests/python/unittest/test_tir_intrin.py @@ -32,10 +32,10 @@ def test_nearbyint(): A_rounded = te.compute((m,), lambda *i: tvm.tir.nearbyint(A(*i)), name="A") s = te.create_schedule(A_rounded.op) f = tvm.build(s, [A, A_rounded], "llvm") - ctx = tvm.cpu(0) + dev = tvm.cpu(0) n = 10 - a = tvm.nd.array(np.random.uniform(high=100, size=n).astype(A.dtype), ctx) - a_rounded = tvm.nd.array(np.random.uniform(size=n).astype(A_rounded.dtype), ctx) + a = tvm.nd.array(np.random.uniform(high=100, size=n).astype(A.dtype), dev) + a_rounded = tvm.nd.array(np.random.uniform(size=n).astype(A_rounded.dtype), dev) f(a, a_rounded) # Note that numpys rint rounds to nearest integer with # ties to halfway is broken by rounding to even. 
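# The comment above refers to numpy's "round half to even" rule; a minimal
# standalone sketch of that behavior (illustrative values, independent of
# this patch):
import numpy as np
print(np.rint(np.array([0.5, 1.5, 2.5, 3.5])))  # [0. 2. 2. 4.] -- ties round to the even integer
# tvm.tir.nearbyint follows the same default rounding mode, which keeps the
# comparison against np.rint in this test consistent.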
@@ -80,10 +80,10 @@ def run_test(tvm_intrin, np_func): B = te.compute((m,), lambda *i: tvm_intrin(A(*i)), name="B") s = te.create_schedule(B.op) f = tvm.build(s, [A, B], "llvm") - ctx = tvm.cpu(0) + dev = tvm.cpu(0) n = 10 - a = tvm.nd.array(np.random.uniform(0.1, 0.5, size=n).astype(A.dtype), ctx) - b = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx) + a = tvm.nd.array(np.random.uniform(0.1, 0.5, size=n).astype(A.dtype), dev) + b = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev) f(a, b) tvm.testing.assert_allclose(b.asnumpy(), np_func(a.asnumpy()), atol=1e-5, rtol=1e-5) @@ -108,11 +108,11 @@ def run_test(tvm_intrin, np_func): C = te.compute((m,), lambda *i: tvm_intrin(A(*i), B(*i)), name="C") s = te.create_schedule(C.op) f = tvm.build(s, [A, B, C], "llvm") - ctx = tvm.cpu(0) + dev = tvm.cpu(0) n = 10 - a = tvm.nd.array(np.random.uniform(0, 1, size=n).astype(A.dtype), ctx) - b = tvm.nd.array(np.random.uniform(0, 1, size=n).astype(B.dtype), ctx) - c = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx) + a = tvm.nd.array(np.random.uniform(0, 1, size=n).astype(A.dtype), dev) + b = tvm.nd.array(np.random.uniform(0, 1, size=n).astype(B.dtype), dev) + c = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev) f(a, b, c) tvm.testing.assert_allclose( c.asnumpy(), np_func(a.asnumpy(), b.asnumpy()), atol=1e-5, rtol=1e-5 @@ -131,11 +131,11 @@ def test_ldexp(): C = te.compute((m,), lambda *i: tvm.tir.ldexp(A(*i), B(*i)), name="C") s = te.create_schedule(C.op) f = tvm.build(s, [A, B, C], "llvm") - ctx = tvm.cpu(0) + dev = tvm.cpu(0) n = 10 - a = tvm.nd.array(np.random.uniform(0, 1, size=n).astype(A.dtype), ctx) - b = tvm.nd.array(np.random.randint(0, 5, size=n).astype(B.dtype), ctx) - c = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx) + a = tvm.nd.array(np.random.uniform(0, 1, size=n).astype(A.dtype), dev) + b = tvm.nd.array(np.random.randint(0, 5, size=n).astype(B.dtype), dev) + c = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev) f(a, b, c) tvm.testing.assert_allclose( c.asnumpy(), np.ldexp(a.asnumpy(), b.asnumpy()), atol=1e-5, rtol=1e-5 diff --git a/tests/python/unittest/test_tir_ir_builder.py b/tests/python/unittest/test_tir_ir_builder.py index 8ad5cb63924e..0b05c1093bc6 100644 --- a/tests/python/unittest/test_tir_ir_builder.py +++ b/tests/python/unittest/test_tir_ir_builder.py @@ -108,11 +108,11 @@ def check_target(target): return # build and invoke the kernel. fadd = tvm.build(s, [A, B, C], target) - ctx = tvm.context(target, 0) + dev = tvm.device(target, 0) # launch the kernel. - a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx) - b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx) - c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx) + a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev) + b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), dev) + c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev) fadd(a, b, c) tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy()) @@ -161,11 +161,11 @@ def check_target(target): return # build and invoke the kernel. fadd = tvm.build(s, [A, B, C], target) - ctx = tvm.context(target, 0) + dev = tvm.device(target, 0) # launch the kernel. 
-        a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
-        b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx)
-        c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)
+        a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev)
+        b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), dev)
+        c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev)
         fadd(a, b, c)
         tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy())
@@ -215,12 +215,12 @@ def check_target(target, ir):
         with tvm.transform.PassContext(opt_level=3):
             func = tvm.build(s, [A, B, C], target)
-        ctx = tvm.context(target, 0)
+        dev = tvm.device(target, 0)
         a_np = np.random.uniform(size=n).astype(A.dtype)
         b_np = np.random.uniform(size=n).astype(B.dtype)
-        a = tvm.nd.array(a_np, ctx)
-        b = tvm.nd.array(b_np, ctx)
-        c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)
+        a = tvm.nd.array(a_np, dev)
+        b = tvm.nd.array(b_np, dev)
+        c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev)
         func(a, b, c)
         ref = num_iter * (a_np + b_np)
         tvm.testing.assert_allclose(c.asnumpy(), ref, rtol=1e-5, atol=1e-5)
@@ -283,8 +283,8 @@ def check_target(target, ir):
         with tvm.transform.PassContext(opt_level=3):
             func = tvm.build(s, [C], target)
-        ctx = tvm.context(target, 0)
-        c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)
+        dev = tvm.device(target, 0)
+        c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev)
         func(c)
         ref = np.array([collatz_ref(i) for i in range(n)])
         tvm.testing.assert_allclose(c.asnumpy(), ref)
@@ -397,8 +397,8 @@ def check_target(target, ir):
         with tvm.transform.PassContext(opt_level=3):
             func = tvm.build(s, [C], target)
-        ctx = tvm.context(target, 0)
-        c = tvm.nd.array(np.zeros(shape, dtype=C.dtype), ctx)
+        dev = tvm.device(target, 0)
+        c = tvm.nd.array(np.zeros(shape, dtype=C.dtype), dev)
         func(c)
         tvm.testing.assert_allclose(c.asnumpy(), ref, rtol=1e-5, atol=1e-5)
@@ -480,13 +480,13 @@ def check_target(target, ir):
         with tvm.transform.PassContext(opt_level=3):
             func = tvm.build(s, [A, B, C], target)
-        ctx = tvm.context(target, 0)
+        dev = tvm.device(target, 0)
         a_np = np.random.uniform(size=n).astype(A.dtype)
         b_np = np.random.uniform(size=n).astype(B.dtype)
         a_np = np.sort(a_np)
-        a = tvm.nd.array(a_np, ctx)
-        b = tvm.nd.array(b_np, ctx)
-        c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)
+        a = tvm.nd.array(a_np, dev)
+        b = tvm.nd.array(b_np, dev)
+        c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev)
         func(a, b, c)
         ref = np.searchsorted(a_np, b_np)
         tvm.testing.assert_allclose(c.asnumpy(), ref)
diff --git a/tests/python/unittest/test_tir_nodes.py b/tests/python/unittest/test_tir_nodes.py
index 6e338d64a61c..19e7bc89122e 100644
--- a/tests/python/unittest/test_tir_nodes.py
+++ b/tests/python/unittest/test_tir_nodes.py
@@ -440,9 +440,20 @@ def test_block_blockrealize():
     assert block_realize.predicate == tvm.tir.const(True, "bool")
     assert block_realize.block == block
-    # make sure we can print
+    # make sure we can print using ReprPrinter
     str(block)
     str(block_realize)
+    # make sure we can print using TIRTextPrinter
+    func = tvm.tir.PrimFunc([], block_realize)
+    output = func.astext()
+    assert output.find("meta[tir.BlockRealize]") == -1
+    assert output.find("bind") != -1
+    assert output.find("reads") != -1
+    assert output.find("writes") != -1
+    assert output.find("alloc_buffer") != -1
+    assert output.find("match_buffer_region") != -1
+    assert output.find("attr") != -1
+    assert output.find("with init()") != -1
 if __name__ == "__main__":
diff --git a/tests/python/unittest/test_tir_transform_hoist_if.py 
b/tests/python/unittest/test_tir_transform_hoist_if.py index 77ab38d0490d..7d02e4f12c1d 100644 --- a/tests/python/unittest/test_tir_transform_hoist_if.py +++ b/tests/python/unittest/test_tir_transform_hoist_if.py @@ -762,15 +762,15 @@ def test_hoisting_op_conv(): kernel = np.random.uniform(-scale, scale, size=kshape).astype(dtype) params = {"w": tvm.nd.array(kernel)} - for target, ctx in enabled_targets(): + for target, dev in enabled_targets(): with tvm.transform.PassContext(opt_level=3): lib = relay.build_module.build(mod, target=target, params=params) - m = tvm.contrib.graph_runtime.GraphModule(lib["default"](ctx)) + m = tvm.contrib.graph_executor.GraphModule(lib["default"](dev)) x = np.random.uniform(size=dshape) data_tvm = tvm.nd.array(data) m.set_input("x", data_tvm) m.run() - e = m.module.time_evaluator("run", ctx, number=300, repeat=3) + e = m.module.time_evaluator("run", dev, number=300, repeat=3) t1 = e(data_tvm).results t1 = np.array(t1) * 1000 print("{} ms".format(t1.mean())) @@ -779,13 +779,13 @@ def test_hoisting_op_conv(): opt_level=3, config={"tir.HoistIfThenElse": {"support_block_scope_hosting": True}} ): lib = relay.build_module.build(mod, target=target, params=params) - m = tvm.contrib.graph_runtime.GraphModule(lib["default"](ctx)) + m = tvm.contrib.graph_executor.GraphModule(lib["default"](dev)) x = np.random.uniform(size=dshape) data_tvm = tvm.nd.array(data) m.set_input("x", data_tvm) m.set_input(**params) m.run() - e = m.module.time_evaluator("run", ctx, number=300, repeat=3) + e = m.module.time_evaluator("run", dev, number=300, repeat=3) t2 = e(data_tvm).results t2 = np.array(t2) * 1000 diff --git a/tests/python/unittest/test_tir_transform_instrument_bound_checkers.py b/tests/python/unittest/test_tir_transform_instrument_bound_checkers.py index bee8bfb60764..c035fd063dba 100644 --- a/tests/python/unittest/test_tir_transform_instrument_bound_checkers.py +++ b/tests/python/unittest/test_tir_transform_instrument_bound_checkers.py @@ -39,11 +39,12 @@ def test_out_of_bounds_llvm(index_a, index_b): tgt_host = "llvm" stmt = tvm.lower(s, [A, B, C], simple_mode=True) print(stmt) - fadd = tvm.build(s, [A, B, C], tvm.target.Target(tgt, tgt_host), name="myadd") - ctx = tvm.context(tgt, 0) - a = tvm.nd.array(np.random.uniform(size=1024).astype(A.dtype), ctx) - b = tvm.nd.array(np.random.uniform(size=1024).astype(B.dtype), ctx) - c = tvm.nd.array(np.zeros(1024, dtype=C.dtype), ctx) + tgt = tvm.target.Target(tgt, tgt_host) + fadd = tvm.build(s, [A, B, C], target=tgt, name="myadd") + dev = tvm.device(tgt.kind.name, 0) + a = tvm.nd.array(np.random.uniform(size=1024).astype(A.dtype), dev) + b = tvm.nd.array(np.random.uniform(size=1024).astype(B.dtype), dev) + c = tvm.nd.array(np.zeros(1024, dtype=C.dtype), dev) fadd(a, b, c) @@ -57,11 +58,12 @@ def test_in_bounds_llvm(): tgt = "llvm" tgt_host = "llvm" stmt = tvm.lower(s, [A, B, C], simple_mode=True) - fadd = tvm.build(s, [A, B, C], tvm.target.Target(tgt, tgt_host), name="myadd") - ctx = tvm.context(tgt, 0) - a = tvm.nd.array(np.random.uniform(size=1024).astype(A.dtype), ctx) - b = tvm.nd.array(np.random.uniform(size=1024).astype(B.dtype), ctx) - c = tvm.nd.array(np.zeros(1024, dtype=C.dtype), ctx) + tgt = tvm.target.Target(tgt, tgt_host) + fadd = tvm.build(s, [A, B, C], target=tgt, name="myadd") + dev = tvm.device(tgt.kind.name, 0) + a = tvm.nd.array(np.random.uniform(size=1024).astype(A.dtype), dev) + b = tvm.nd.array(np.random.uniform(size=1024).astype(B.dtype), dev) + c = tvm.nd.array(np.zeros(1024, dtype=C.dtype), dev) fadd(a, b, 
c) @@ -79,12 +81,13 @@ def test_out_of_bounds_vectorize_llvm(nn, index_a, index_b): tgt = "llvm" tgt_host = "llvm" stmt = tvm.lower(s, [a, b, c], simple_mode=True) - f = tvm.build(s, [a, b, c], tgt, tvm.target.Target(tgt, tgt_host), name="myaddvec") - ctx = tvm.cpu(0) + tgt = tvm.target.Target(tgt, tgt_host) + f = tvm.build(s, [a, b, c], target=tgt, name="myaddvec") + dev = tvm.cpu(0) n = nn - a = tvm.nd.array(np.random.uniform(size=(n)).astype(a.dtype), ctx) - b = tvm.nd.array(np.random.uniform(size=(n)).astype(a.dtype), ctx) - c = tvm.nd.array(np.zeros(n, dtype=c.dtype), ctx) + a = tvm.nd.array(np.random.uniform(size=(n)).astype(a.dtype), dev) + b = tvm.nd.array(np.random.uniform(size=(n)).astype(a.dtype), dev) + c = tvm.nd.array(np.zeros(n, dtype=c.dtype), dev) f(a, b, c) @@ -106,10 +109,10 @@ def test_in_bounds_vectorize_llvm(): # build and invoke the kernel. lowered_func = tvm.lower(s, [A, C], "llvm", simple_mode=False) f = tvm.build(s, [A, C], "llvm") - ctx = tvm.cpu(0) + dev = tvm.cpu(0) # launch the kernel. a = tvm.nd.empty((n,), A.dtype).copyfrom(np.random.uniform(size=(n, lanes))) - c = tvm.nd.empty((n,), C.dtype, ctx) + c = tvm.nd.empty((n,), C.dtype, dev) f(a, c) tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + 1) @@ -124,12 +127,12 @@ def test_in_bounds_loop_partition_basic_llvm(): s = te.create_schedule(T.op) xo, xi = s[T].split(T.op.axis[0], factor=4) lowered_func = tvm.lower(s, [A, B, T], "llvm", simple_mode=False) - ctx = tvm.cpu(0) + dev = tvm.cpu(0) f = tvm.build(s, [A, B, T], "llvm") - a = tvm.nd.array(np.random.uniform(size=(32,)).astype(A.dtype), ctx) - b = tvm.nd.array(np.random.uniform(size=(32,)).astype(B.dtype), ctx) - t = tvm.nd.empty((32,), T.dtype, ctx) + a = tvm.nd.array(np.random.uniform(size=(32,)).astype(A.dtype), dev) + b = tvm.nd.array(np.random.uniform(size=(32,)).astype(B.dtype), dev) + t = tvm.nd.empty((32,), T.dtype, dev) f(a, b, t) @@ -144,12 +147,12 @@ def test_out_of_bounds_loop_partition_basic_llvm(index_a, index_b): s = te.create_schedule(T.op) xo, xi = s[T].split(T.op.axis[0], factor=4) lowered_func = tvm.lower(s, [A, B, T], "llvm", simple_mode=False) - ctx = tvm.cpu(0) + dev = tvm.cpu(0) f = tvm.build(s, [A, B, T], "llvm") - a = tvm.nd.array(np.random.uniform(size=(32,)).astype(A.dtype), ctx) - b = tvm.nd.array(np.random.uniform(size=(32,)).astype(B.dtype), ctx) - t = tvm.nd.empty((32,), T.dtype, ctx) + a = tvm.nd.array(np.random.uniform(size=(32,)).astype(A.dtype), dev) + b = tvm.nd.array(np.random.uniform(size=(32,)).astype(B.dtype), dev) + t = tvm.nd.empty((32,), T.dtype, dev) f(a, b, t) @@ -221,12 +224,12 @@ def test_in_bounds_const_loop_partition_llvm(): s = te.create_schedule(T.op) xo, xi = s[T].split(T.op.axis[0], factor=4) lowered_func = tvm.lower(s, [A, B, T], "llvm", simple_mode=False) - ctx = tvm.cpu(0) + dev = tvm.cpu(0) f = tvm.build(s, [A, B, T], "llvm") - a = tvm.nd.array(np.random.uniform(size=(n,)).astype(A.dtype), ctx) - b = tvm.nd.array(np.random.uniform(size=(n,)).astype(B.dtype), ctx) - t = tvm.nd.empty((n,), T.dtype, ctx) + a = tvm.nd.array(np.random.uniform(size=(n,)).astype(A.dtype), dev) + b = tvm.nd.array(np.random.uniform(size=(n,)).astype(B.dtype), dev) + t = tvm.nd.empty((n,), T.dtype, dev) f(a, b, t) @@ -247,12 +250,12 @@ def test_out_of_bounds_const_loop_partition_llvm(index_a, index_b): s = te.create_schedule(T.op) xo, xi = s[T].split(T.op.axis[0], factor=4) lowered_func = tvm.lower(s, [A, B, T], "llvm", simple_mode=False) - ctx = tvm.cpu(0) + dev = tvm.cpu(0) f = tvm.build(s, [A, B, T], "llvm") - a = 
tvm.nd.array(np.random.uniform(size=(n,)).astype(A.dtype), ctx) - b = tvm.nd.array(np.random.uniform(size=(n,)).astype(B.dtype), ctx) - t = tvm.nd.empty((n,), T.dtype, ctx) + a = tvm.nd.array(np.random.uniform(size=(n,)).astype(A.dtype), dev) + b = tvm.nd.array(np.random.uniform(size=(n,)).astype(B.dtype), dev) + t = tvm.nd.empty((n,), T.dtype, dev) f(a, b, t) @@ -283,19 +286,19 @@ def test_in_bounds_conv_llvm(loop_tiling=False): if loop_tiling: oho, owo, ohi, owi = s[conv].tile(oh, ow, 16, 16) lowered_func = tvm.lower(s, [data, kernel, conv], simple_mode=True) - ctx = tvm.cpu(0) + dev = tvm.cpu(0) f = tvm.build(s, [data, kernel, conv], "llvm") data_input = tvm.nd.array( - np.random.uniform(size=(batch_size, in_channel, in_height, in_width)).astype("float32"), ctx + np.random.uniform(size=(batch_size, in_channel, in_height, in_width)).astype("float32"), dev ) kernel_input = tvm.nd.array( np.random.uniform(size=(kernel_height, kernel_width, in_channel, out_channel)).astype( "float32" ), - ctx, + dev, ) - conv_out = tvm.nd.empty((batch_size, out_channel, out_height, out_width), "float32", ctx) + conv_out = tvm.nd.empty((batch_size, out_channel, out_height, out_width), "float32", dev) f(data_input, kernel_input, conv_out) @@ -339,19 +342,19 @@ def test_out_of_bounds_conv_llvm(data_offsets, kernel_offsets, loop_tiling=False if loop_tiling: oho, owo, ohi, owi = s[conv].tile(oh, ow, 16, 16) lowered_func = tvm.lower(s, [data, kernel, conv], simple_mode=True) - ctx = tvm.cpu(0) + dev = tvm.cpu(0) f = tvm.build(s, [data, kernel, conv], "llvm") data_input = tvm.nd.array( - np.random.uniform(size=(batch_size, in_channel, in_height, in_width)).astype("float32"), ctx + np.random.uniform(size=(batch_size, in_channel, in_height, in_width)).astype("float32"), dev ) kernel_input = tvm.nd.array( np.random.uniform(size=(kernel_height, kernel_width, in_channel, out_channel)).astype( "float32" ), - ctx, + dev, ) - conv_out = tvm.nd.empty((batch_size, out_channel, out_height, out_width), "float32", ctx) + conv_out = tvm.nd.empty((batch_size, out_channel, out_height, out_width), "float32", dev) f(data_input, kernel_input, conv_out) @@ -366,12 +369,12 @@ def test_in_bounds_tensors_with_same_shapes1D_llvm(): T = te.compute((m,), lambda i: A[i] * B[i]) s = te.create_schedule(T.op) lowered_func = tvm.lower(s, [A, B, T], "llvm", simple_mode=False) - ctx = tvm.cpu(0) + dev = tvm.cpu(0) f = tvm.build(s, [A, B, T], "llvm") - a = tvm.nd.array(np.random.uniform(size=(32,)).astype(A.dtype), ctx) - b = tvm.nd.array(np.random.uniform(size=(32,)).astype(B.dtype), ctx) - t = tvm.nd.empty((32,), T.dtype, ctx) + a = tvm.nd.array(np.random.uniform(size=(32,)).astype(A.dtype), dev) + b = tvm.nd.array(np.random.uniform(size=(32,)).astype(B.dtype), dev) + t = tvm.nd.empty((32,), T.dtype, dev) f(a, b, t) @@ -387,12 +390,12 @@ def test_out_of_bounds_tensors_with_diff_shapes1D_llvm(a_shape, b_shape, c_shape T = te.compute((m,), lambda i: A[i] * B[i]) s = te.create_schedule(T.op) lowered_func = tvm.lower(s, [A, B, T], "llvm", simple_mode=False) - ctx = tvm.cpu(0) + dev = tvm.cpu(0) f = tvm.build(s, [A, B, T], "llvm") - a = tvm.nd.array(np.random.uniform(size=(a_shape,)).astype(A.dtype), ctx) - b = tvm.nd.array(np.random.uniform(size=(b_shape,)).astype(B.dtype), ctx) - t = tvm.nd.empty((c_shape,), T.dtype, ctx) + a = tvm.nd.array(np.random.uniform(size=(a_shape,)).astype(A.dtype), dev) + b = tvm.nd.array(np.random.uniform(size=(b_shape,)).astype(B.dtype), dev) + t = tvm.nd.empty((c_shape,), T.dtype, dev) f(a, b, t) @@ -407,12 +410,12 @@ 
def test_in_bounds_tensors_with_same_shapes2D_llvm(): T = te.compute((m, m), lambda i, j: A[i][j] * B[i][j]) s = te.create_schedule(T.op) lowered_func = tvm.lower(s, [A, B, T], "llvm", simple_mode=False) - ctx = tvm.cpu(0) + dev = tvm.cpu(0) f = tvm.build(s, [A, B, T], "llvm") - a = tvm.nd.array(np.random.uniform(size=(32, 32)).astype(A.dtype), ctx) - b = tvm.nd.array(np.random.uniform(size=(32, 32)).astype(B.dtype), ctx) - t = tvm.nd.empty((32, 32), T.dtype, ctx) + a = tvm.nd.array(np.random.uniform(size=(32, 32)).astype(A.dtype), dev) + b = tvm.nd.array(np.random.uniform(size=(32, 32)).astype(B.dtype), dev) + t = tvm.nd.empty((32, 32), T.dtype, dev) f(a, b, t) @@ -428,12 +431,12 @@ def test_out_of_bounds_tensors_with_diff_shapes2D_llvm(a_shape, b_shape, c_shape T = te.compute((m, m), lambda i, j: A[i][j] * B[i][j]) s = te.create_schedule(T.op) lowered_func = tvm.lower(s, [A, B, T], "llvm", simple_mode=False) - ctx = tvm.cpu(0) + dev = tvm.cpu(0) f = tvm.build(s, [A, B, T], "llvm") - a = tvm.nd.array(np.random.uniform(size=(a_shape[0], a_shape[1])).astype(A.dtype), ctx) - b = tvm.nd.array(np.random.uniform(size=(b_shape[0], b_shape[1])).astype(B.dtype), ctx) - t = tvm.nd.empty((c_shape[0], c_shape[1]), T.dtype, ctx) + a = tvm.nd.array(np.random.uniform(size=(a_shape[0], a_shape[1])).astype(A.dtype), dev) + b = tvm.nd.array(np.random.uniform(size=(b_shape[0], b_shape[1])).astype(B.dtype), dev) + t = tvm.nd.empty((c_shape[0], c_shape[1]), T.dtype, dev) f(a, b, t) @@ -449,12 +452,12 @@ def test_in_bounds_tensors_with_same_shapes3D_llvm(): s = te.create_schedule(T.op) lowered_func = tvm.lower(s, [A, B, T], "llvm", simple_mode=False) - ctx = tvm.cpu(0) + dev = tvm.cpu(0) f = tvm.build(s, [A, B, T], "llvm") - a = tvm.nd.array(np.random.uniform(size=(32, 32, 32)).astype(A.dtype), ctx) - b = tvm.nd.array(np.random.uniform(size=(32, 32, 32)).astype(B.dtype), ctx) - t = tvm.nd.empty((32, 32, 32), T.dtype, ctx) + a = tvm.nd.array(np.random.uniform(size=(32, 32, 32)).astype(A.dtype), dev) + b = tvm.nd.array(np.random.uniform(size=(32, 32, 32)).astype(B.dtype), dev) + t = tvm.nd.empty((32, 32, 32), T.dtype, dev) f(a, b, t) @@ -471,16 +474,16 @@ def test_out_of_bounds_tensors_with_diff_shapes3D_llvm(a_shape, b_shape, c_shape s = te.create_schedule(T.op) lowered_func = tvm.lower(s, [A, B, T], "llvm", simple_mode=False) - ctx = tvm.cpu(0) + dev = tvm.cpu(0) f = tvm.build(s, [A, B, T], "llvm") a = tvm.nd.array( - np.random.uniform(size=(a_shape[0], a_shape[1], c_shape[2])).astype(A.dtype), ctx + np.random.uniform(size=(a_shape[0], a_shape[1], c_shape[2])).astype(A.dtype), dev ) b = tvm.nd.array( - np.random.uniform(size=(b_shape[0], b_shape[1], b_shape[2])).astype(B.dtype), ctx + np.random.uniform(size=(b_shape[0], b_shape[1], b_shape[2])).astype(B.dtype), dev ) - t = tvm.nd.empty((c_shape[0], c_shape[1], c_shape[2]), T.dtype, ctx) + t = tvm.nd.empty((c_shape[0], c_shape[1], c_shape[2]), T.dtype, dev) f(a, b, t) @@ -498,11 +501,11 @@ def test_out_of_bounds_tensors_with_zero_shape_op_with_not_zero_shape_llvm(): # build and invoke the kernel. f = tvm.build(s, [A, scale, D], "llvm") - ctx = tvm.cpu(0) + dev = tvm.cpu(0) # launch the kernel. 
- a = tvm.nd.array(np.random.randint(0, 2, size=(n,)).astype(A.dtype), ctx) - sc = tvm.nd.array(np.random.randint(0, 2, size=()).astype(scale.dtype), ctx) - d = tvm.nd.empty((), D.dtype, ctx) + a = tvm.nd.array(np.random.randint(0, 2, size=(n,)).astype(A.dtype), dev) + sc = tvm.nd.array(np.random.randint(0, 2, size=()).astype(scale.dtype), dev) + d = tvm.nd.empty((), D.dtype, dev) f(a, sc, d) d_np = np.sum(a.asnumpy()) * sc.asnumpy() + 1 tvm.testing.assert_allclose(d.asnumpy(), d_np) diff --git a/tests/python/unittest/test_tir_transform_loop_partition.py b/tests/python/unittest/test_tir_transform_loop_partition.py index ecaff319441d..f5a5e4ca6563 100644 --- a/tests/python/unittest/test_tir_transform_loop_partition.py +++ b/tests/python/unittest/test_tir_transform_loop_partition.py @@ -487,24 +487,24 @@ def test_double_splitting_with_indivisible_factors(): assert not any(collect_visit(top_produce, lambda x: isinstance(x, tvm.tir.IfThenElse))) # check functional correctness of generated code - ctx = tvm.context(target, 0) + dev = tvm.device(target, 0) a = tvm.nd.array( numpy.ones( m, ).astype(dtype), - ctx, + dev, ) c = tvm.nd.array( numpy.zeros( m, ).astype(dtype), - ctx, + dev, ) d = tvm.nd.array( numpy.zeros( m, ).astype(dtype), - ctx, + dev, ) func(a, c, d) tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy(), rtol=1e-5) diff --git a/tests/python/unittest/test_tir_transform_lower_warp_memory.py b/tests/python/unittest/test_tir_transform_lower_warp_memory.py index 28179c2d7b40..ac7204368c1d 100644 --- a/tests/python/unittest/test_tir_transform_lower_warp_memory.py +++ b/tests/python/unittest/test_tir_transform_lower_warp_memory.py @@ -114,7 +114,7 @@ def check_cuda(dtype): xo, xi = s[AA].split(s[AA].op.axis[0], 32) s[AA].bind(xi, tx) - ctx = tvm.gpu(0) + dev = tvm.gpu(0) func = tvm.build(s, [A, B], "cuda") A_np = np.array(list(range(m)), dtype=dtype) B_np = np.array( @@ -128,8 +128,8 @@ def check_cuda(dtype): + [96], dtype=dtype, ) - A_nd = tvm.nd.array(A_np, ctx) - B_nd = tvm.nd.array(np.zeros(B_np.shape, dtype=B_np.dtype), ctx) + A_nd = tvm.nd.array(A_np, dev) + B_nd = tvm.nd.array(np.zeros(B_np.shape, dtype=B_np.dtype), dev) func(A_nd, B_nd) tvm.testing.assert_allclose(B_nd.asnumpy(), B_np, rtol=1e-3) @@ -181,12 +181,12 @@ def check_cuda(dtype): _, x = AA.op.axis s[AA].bind(x, tx) - ctx = tvm.gpu(0) + dev = tvm.gpu(0) func = tvm.build(s, [A, B], "cuda") A_np = np.array([list(range(i, m + i)) for i in range(n)], dtype=dtype) B_np = np.array([list(range(1 + i, m + i)) + [i] for i in range(n)], dtype=dtype) - A_nd = tvm.nd.array(A_np, ctx) - B_nd = tvm.nd.array(np.zeros(B_np.shape, dtype=B_np.dtype), ctx) + A_nd = tvm.nd.array(A_np, dev) + B_nd = tvm.nd.array(np.zeros(B_np.shape, dtype=B_np.dtype), dev) func(A_nd, B_nd) tvm.testing.assert_allclose(B_nd.asnumpy(), B_np, rtol=1e-3) @@ -228,13 +228,13 @@ def check_cuda(dtype): s[BB].bind(xo, bx) s[BB].bind(xi, tx) - ctx = tvm.gpu(0) + dev = tvm.gpu(0) func = tvm.build(s, [A, B, C], "cuda") AB_np = np.array(list(range(m)), dtype=dtype) C_np = np.array(list(range(1, m)) + [0], dtype=dtype) * 2 - A_nd = tvm.nd.array(AB_np, ctx) - B_nd = tvm.nd.array(AB_np, ctx) - C_nd = tvm.nd.array(np.zeros(C_np.shape, dtype=C_np.dtype), ctx) + A_nd = tvm.nd.array(AB_np, dev) + B_nd = tvm.nd.array(AB_np, dev) + C_nd = tvm.nd.array(np.zeros(C_np.shape, dtype=C_np.dtype), dev) func(A_nd, B_nd, C_nd) tvm.testing.assert_allclose(C_nd.asnumpy(), C_np, rtol=1e-3) @@ -260,12 +260,12 @@ def check(device, m): s[AA].bind(yi, tx) s[AA].compute_at(s[B], xo) - ctx = 
tvm.context(device, 0) + dev = tvm.device(device, 0) func = tvm.build(s, [A, B], device) A_np = np.random.uniform(size=(m,)).astype(A.dtype) B_np = np.zeros(shape=(m,)).astype(B.dtype) - A_nd = tvm.nd.array(A_np, ctx) - B_nd = tvm.nd.array(B_np, ctx) + A_nd = tvm.nd.array(A_np, dev) + B_nd = tvm.nd.array(B_np, dev) func(A_nd, B_nd) B_np = A_np + 1 tvm.testing.assert_allclose(B_nd.asnumpy(), B_np) diff --git a/tests/python/unittest/test_tvmscript_complete.py b/tests/python/unittest/test_tvmscript_complete.py new file mode 100644 index 000000000000..012ccc4b8628 --- /dev/null +++ b/tests/python/unittest/test_tvmscript_complete.py @@ -0,0 +1,174 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import tvm +from tvm import tir +from tvm.ir import Range +from tvm.script import ty, from_source +from tvm.ir.diagnostics import override_renderer + + +@tvm.script.tir +def matmul(a: ty.handle, b: ty.handle, c: ty.handle) -> None: + A = tir.match_buffer(a, [128, 128]) + B = tir.match_buffer(b, [128, 128]) + C = tir.match_buffer(c, [128, 128]) + + with tir.block([128, 128, tir.reduce_axis(0, 128)], "update") as [vi, vj, vk]: + with tir.init(): + C[vi, vj] = tir.float32(0) + C[vi, vj] = C[vi, vj] + A[vi, vk] * B[vj, vk] + + +@tvm.script.tir +def matmul_original(a: ty.handle, b: ty.handle, c: ty.handle) -> None: + A = tir.match_buffer(a, [128, 128]) + B = tir.match_buffer(b, [128, 128]) + C = tir.match_buffer(c, [128, 128]) + + for i, j in tir.grid(32, 32): + with tir.block([32, 32], "init") as [vi, vj]: + for ii, jj in tir.grid(4, 4): + C[vi * 4 + ii, vj * 4 + jj] = tir.float32(0) + + for k in range(0, 32): + with tir.block([128, 128, tir.reduce_axis(0, 128)], "update") as [vi, vj, vk]: + for ii, jj, kk in tir.grid(4, 4, 4): + C[vi * 4 + ii, vj * 4 + jj] = ( + C[vi * 4 + ii, vj * 4 + jj] + + A[vi * 4 + ii, vk * 4 + kk] * B[vj * 4 + jj, vk * 4 + kk] + ) + + +@tvm.script.tir +def elementwise_with_root(a: ty.handle, b: ty.handle, c: ty.handle) -> None: + A = tir.match_buffer(a, [128, 128]) + B = tir.match_buffer(b, [128, 128]) + C = tir.match_buffer(c, [128, 128]) + + with tir.block([]) as []: + with tir.block([128, 128]) as [vi, vj]: + B[vi, vj] = A[vi, vj] + tir.float32(1) + + with tir.block([128, 128]) as [vi, vj]: + C[vi, vj] = B[vi, vj] + tir.float32(1) + + +def func_with_opaque_block(a: ty.handle, b: ty.handle, c: ty.handle) -> None: + A = tir.match_buffer(a, [128, 128]) + B = tir.match_buffer(b, [128, 128]) + C = tir.match_buffer(c, [128, 128]) + + with tir.block([]) as []: + with tir.block([]) as []: + B[0, 0] = A[0, 0] + tir.float32(1) + + with tir.block([128, 128]) as [vi, vj]: + C[vi, vj] = B[vi, vj] + tir.float32(1) + + +def test_complete_matmul(): + func = matmul + A, B, C = [func.buffer_map[x] for x in func.params] + + block = 
func.body.block.body.body.body.body.block + assert isinstance(block, tvm.tir.Block) + vi, vj, vk = [x.var for x in block.iter_vars] + access_A = tir.BufferRegion(A, [Range.from_min_extent(vi, 1), Range.from_min_extent(vk, 1)]) + access_B = tir.BufferRegion(B, [Range.from_min_extent(vj, 1), Range.from_min_extent(vk, 1)]) + access_C = tir.BufferRegion(C, [Range.from_min_extent(vi, 1), Range.from_min_extent(vj, 1)]) + tvm.ir.assert_structural_equal(block.reads, [access_C, access_A, access_B]) + tvm.ir.assert_structural_equal(block.writes, [access_C]) + + +def test_complete_matmul_original(): + func = matmul_original + A, B, C = [func.buffer_map[x] for x in func.params] + + block1 = func.body.block.body.body.body[0].block + assert isinstance(block1, tvm.tir.Block) + vi, vj = [x.var for x in block1.iter_vars] + access_C = tir.BufferRegion( + C, [Range.from_min_extent(vi * 4, 4), Range.from_min_extent(vj * 4, 4)] + ) + tvm.ir.assert_structural_equal(block1.reads, []) + tvm.ir.assert_structural_equal(block1.writes, [access_C]) + + block2 = func.body.block.body.body.body[1].body.block + assert isinstance(block2, tvm.tir.Block) + vi, vj, vk = [x.var for x in block2.iter_vars] + access_A = tir.BufferRegion( + A, [Range.from_min_extent(vi * 4, 4), Range.from_min_extent(vk * 4, 4)] + ) + access_B = tir.BufferRegion( + B, [Range.from_min_extent(vj * 4, 4), Range.from_min_extent(vk * 4, 4)] + ) + access_C = tir.BufferRegion( + C, [Range.from_min_extent(vi * 4, 4), Range.from_min_extent(vj * 4, 4)] + ) + tvm.ir.assert_structural_equal(block2.reads, [access_C, access_A, access_B]) + tvm.ir.assert_structural_equal(block2.writes, [access_C]) + + +def test_complete_with_root(): + func = elementwise_with_root + A, B, C = [func.buffer_map[x] for x in func.params] + + block1 = func.body.block.body[0].body.body.block + assert isinstance(block1, tvm.tir.Block) + vi, vj = [x.var for x in block1.iter_vars] + + tvm.ir.assert_structural_equal( + block1.reads, + [tir.BufferRegion(A, [Range.from_min_extent(vi, 1), Range.from_min_extent(vj, 1)])], + ) + tvm.ir.assert_structural_equal( + block1.writes, + [tir.BufferRegion(B, [Range.from_min_extent(vi, 1), Range.from_min_extent(vj, 1)])], + ) + + block2 = func.body.block.body[1].body.body.block + assert isinstance(block2, tvm.tir.Block) + vi, vj = [x.var for x in block2.iter_vars] + tvm.ir.assert_structural_equal( + block2.reads, + [tir.BufferRegion(B, [Range.from_min_extent(vi, 1), Range.from_min_extent(vj, 1)])], + ) + tvm.ir.assert_structural_equal( + block2.writes, + [tir.BufferRegion(C, [Range.from_min_extent(vi, 1), Range.from_min_extent(vj, 1)])], + ) + + +def test_complete_opaque_block_error(): + def render(e): + pass + + override_renderer(render) + + try: + from_source(func_with_opaque_block) + except tvm.error.DiagnosticError: + return + assert False + + +if __name__ == "__main__": + test_complete_matmul() + test_complete_matmul_original() + test_complete_with_root() + test_complete_opaque_block_error() diff --git a/tests/python/unittest/test_tvmscript_error_report.py b/tests/python/unittest/test_tvmscript_error_report.py index 048a9544d6df..052217b32cb5 100644 --- a/tests/python/unittest/test_tvmscript_error_report.py +++ b/tests/python/unittest/test_tvmscript_error_report.py @@ -144,6 +144,197 @@ def test_no_body(): check_error(no_body, 3) +def allocate_with_buffers() -> None: + with tir.allocate([1], "float32", "") as [A, B]: # error + tir.evaluate(1.0) + + +def test_allocate_with_buffers(): + check_error(allocate_with_buffers, 2) + + +def 
inconsistent_binding() -> None: + with tir.block([128, 128]) as [vi]: # error + tir.evaluate(1.0) + + +def test_inconsistent_binding(): + check_error(inconsistent_binding, 2) + + +def invalid_block_axes(a: ty.handle) -> None: + A = tir.match_buffer(a, (16, 16), "float32") + with tir.block([A]) as [vi]: # error + tir.evaluate(1.0) + + +def test_invalid_block_axes(): + check_error(invalid_block_axes, 3) + + +def miss_block_bind() -> None: + with tir.block([16, 16]) as [vi, vj]: # error + tir.bind(vi, 1) + tir.evaluate(1.0) + + +def test_miss_block_bind(): + check_error(miss_block_bind, 2) + + +def invalid_loop_var() -> None: + for i, j in range(0, 16): # error + tir.evaluate(1.0) + + +def test_invalid_loop_var(): + check_error(invalid_loop_var, 2) + + +def inconsistent_grid() -> None: + for i in tir.grid(16, 16): # error + tir.evaluate(1.0) + + +def test_inconsistent_grid(): + check_error(inconsistent_grid, 2) + + +def invalid_match_buffer_region() -> None: + with tir.block([16, 16]) as [vi, vj]: + A = tir.match_buffer_region(vi) # error + tir.evaluate(1.0) + + +def test_invalid_match_buffer_region(): + check_error(invalid_match_buffer_region, 3) + + +def duplicate_buffer() -> None: + A = tir.alloc_buffer((128, 128), "float32") + with tir.block([16, 16]) as [vi, vj]: + A = tir.alloc_buffer((128, 128), "float32") # error + tir.evaluate(1.0) + + +def test_duplicate_buffer(): + check_error(duplicate_buffer, 4) + + +def duplicate_reads() -> None: + A = tir.alloc_buffer((128, 128), "float32") + with tir.block([16, 16]) as [vi, vj]: + tir.reads(A[0:8, 0:8]) + tir.reads(A[0:16, 0:16]) # error + tir.evaluate(1.0) + + +def duplicate_writes() -> None: + A = tir.alloc_buffer((128, 128), "float32") + with tir.block([16, 16]) as [vi, vj]: + tir.writes(A[0:8, 0:8]) + tir.writes(A[0:16, 0:16]) # error + tir.evaluate(1.0) + + +def duplicate_predicate() -> None: + with tir.block([16, 16]) as [vi, vj]: + tir.where(1) + tir.where(0) # error + + +def duplicate_annotations() -> None: + with tir.block([16, 16]) as [vi, vj]: + tir.block_attr({}) + tir.block_attr({}) # error + + +def duplicate_init() -> None: + with tir.block([16, 16]) as [vi, vj]: + with tir.init(): + tir.evaluate(1.0) + with tir.init(): # error + tir.evaluate(1.0) + + +def test_duplicate_block_signature(): + check_error(duplicate_reads, 5) + check_error(duplicate_writes, 5) + check_error(duplicate_predicate, 4) + check_error(duplicate_annotations, 4) + check_error(duplicate_init, 5) + + +def opaque_access_during_complete(a: ty.handle) -> None: # error + A = tir.match_buffer(a, (16, 16), "float32") + with tir.block([16, 16]) as [vi, vj]: + tir.evaluate(tir.load("float32", A.data, vi * 16 + vj)) + + +def test_opaque_access_during_complete(): + check_error(opaque_access_during_complete, 1) + + +def convert_slice_to_bufferload() -> None: + A = tir.alloc_buffer((128, 128), "float32") + with tir.block([16, 16]) as [vi, vj]: + A[vi, vj] = A[vi : vi + 2, vj] + 1 # error + + +def test_convert_slice_to_bufferload(): + check_error(convert_slice_to_bufferload, 4) + + +def error_index_type() -> None: + A = tir.alloc_buffer((128, 128), "float32") + with tir.block([16, 16]) as [vi, vj]: + A[vi, vj] = A[vi, 0.0] + 1 # error + + +def test_error_index_type(): + check_error(error_index_type, 4) + + +def mismatch_args() -> None: + A = tir.alloc_buffer((128, 128), "float32") + with tir.block([16, 16]) as [vi, vj]: + tir.reads(A[0, 0], A[1, 1]) # error + tir.evaluate(1.0) + + +def test_mismatch_args(): + check_error(mismatch_args, 4) + + +def special_stmt_except() -> 
None: + A = tir.alloc_buffer("(128, 128)", "float32") # error + with tir.block([16, 16]) as [vi, vj]: + tir.evaluate(1.0) + + +def scope_handler_except() -> None: + for i in tir.serial("1", "1"): # error + tir.evaluate(1) + + +def intrin_except_unassign(a: ty.handle) -> None: + A = tir.match_buffer(a, (16, 16), "float32") + tir.evaluate(A) # error + + +def intrin_except_assign(a: ty.handle) -> None: + A = tir.match_buffer(a, (16, 16), "float32") + A[0, 0] = tir.load(A, A, A) # error + + +def test_tvm_exception_catch(): + # test catching c++ side exception + check_error(special_stmt_except, 2) + check_error(scope_handler_except, 2) + check_error(intrin_except_unassign, 3) + check_error(intrin_except_assign, 3) + + def check_error(module, rel_lineno): # Override the default renderer to accumulate errors _, start_line = inspect.getsourcelines(module) @@ -180,3 +371,17 @@ def render(e): test_return_not_allowed() test_tir_assert() test_no_body() + test_allocate_with_buffers() + test_inconsistent_binding() + test_invalid_block_axes() + test_miss_block_bind() + test_invalid_loop_var() + test_inconsistent_grid() + test_invalid_match_buffer_region() + test_duplicate_buffer() + test_duplicate_block_signature() + test_opaque_access_during_complete() + test_convert_slice_to_bufferload() + test_error_index_type() + test_mismatch_args() + test_tvm_exception_catch() diff --git a/tests/python/unittest/test_tvmscript_roundtrip.py b/tests/python/unittest/test_tvmscript_roundtrip.py index c7a38cccda49..bd36b79d7f4e 100644 --- a/tests/python/unittest/test_tvmscript_roundtrip.py +++ b/tests/python/unittest/test_tvmscript_roundtrip.py @@ -2662,6 +2662,172 @@ def test_opt_conv_tensorcore_mod_host(): tvm.ir.assert_structural_equal(mod, rt_mod, True) +@tvm.script.tir +def matmul(a: ty.handle, b: ty.handle, c: ty.handle) -> None: + A = tir.match_buffer(a, [128, 128]) + B = tir.match_buffer(b, [128, 128]) + C = tir.match_buffer(c, [128, 128]) + + with tir.block([128, 128, tir.reduce_axis(0, 128)], "update") as [vi, vj, vk]: + with tir.init(): + C[vi, vj] = tir.float32(0) + C[vi, vj] = C[vi, vj] + A[vi, vk] * B[vj, vk] + + +@tvm.script.tir +def matmul_original(a: ty.handle, b: ty.handle, c: ty.handle) -> None: + A = tir.match_buffer(a, [128, 128]) + B = tir.match_buffer(b, [128, 128]) + C = tir.match_buffer(c, [128, 128]) + + for i, j in tir.grid(128, 128): + with tir.block([128, 128], "init") as [vi, vj]: + C[vi, vj] = tir.float32(0) + + for k in range(0, 128): + with tir.block([128, 128, tir.reduce_axis(0, 128)], "update") as [vi, vj, vk]: + C[vi, vj] = C[vi, vj] + A[vi, vk] * B[vj, vk] + + +@tvm.script.tir +def element_wise(a: ty.handle, c: ty.handle) -> None: + A = tir.match_buffer(a, (128, 128), "float32") + C = tir.match_buffer(c, (128, 128), "float32") + B = tir.alloc_buffer((128, 128), "float32") + + with tir.block([128, 128], "B") as [vi, vj]: + B[vi, vj] = A[vi, vj] * tir.float32(2) + + with tir.block([128, 128], "C") as [vi, vj]: + C[vi, vj] = B[vi, vj] + tir.float32(1) + + +@tvm.script.tir +def predicate(b: ty.handle, c: ty.handle) -> None: + B = tir.match_buffer(b, (16, 16), "float32") + C = tir.match_buffer(c, (16, 16), "float32") + + for i, jo, ji in tir.grid(16, 4, 5): + with tir.block([16, 16], "update") as [vi, vj]: + tir.bind(vi, i) + tir.bind(vj, jo * 4 + ji) + tir.where(jo * 4 + ji < 16) + C[vi, vj] = B[vi, vj] + tir.float32(1) + + +def test_module_define(): + func1 = tvm.script.create_module({"matmul": matmul})["matmul"] + func2 = tvm.script.create_module({"element_wise": 
element_wise})["element_wise"] + func3 = tvm.script.create_module({"predicate": predicate})["predicate"] + mod1 = tvm.script.create_module({"func1": func1, "func2": func2, "func3": func3}) + mod2 = tvm.script.create_module({"func1": matmul, "func2": element_wise, "func3": predicate}) + tvm.ir.assert_structural_equal(mod1, mod2) + + +def test_matmul(): + func = matmul + rt_func = tvm.script.from_source(tvm.script.asscript(func, True)) + tvm.ir.assert_structural_equal(func, rt_func) + + +def test_matmul_original(): + func = matmul_original + rt_func = tvm.script.from_source(tvm.script.asscript(func, True)) + tvm.ir.assert_structural_equal(func, rt_func) + + assert isinstance(rt_func.body.block, tir.stmt.Block) + assert isinstance(rt_func.body.block.body, tir.stmt.For) + assert isinstance(rt_func.body.block.body.body, tir.stmt.For) + assert isinstance(rt_func.body.block.body.body.body, tir.stmt.SeqStmt) + assert isinstance(rt_func.body.block.body.body.body[0].block, tir.stmt.Block) + assert isinstance(rt_func.body.block.body.body.body[1], tir.stmt.For) + assert isinstance(rt_func.body.block.body.body.body[1].body.block, tir.stmt.Block) + + +def test_element_wise(): + func = element_wise + rt_func = tvm.script.from_source(tvm.script.asscript(func, True)) + tvm.ir.assert_structural_equal(func, rt_func) + + assert isinstance(rt_func.body.block, tir.stmt.Block) + assert isinstance(rt_func.body.block.body, tir.stmt.SeqStmt) + assert isinstance(rt_func.body.block.body[0], tir.stmt.For) + assert isinstance(rt_func.body.block.body[0].body, tir.stmt.For) + assert isinstance(rt_func.body.block.body[0].body.body.block, tir.stmt.Block) + + assert isinstance(rt_func.body.block.body[1], tir.stmt.For) + assert isinstance(rt_func.body.block.body[1].body, tir.stmt.For) + assert isinstance(rt_func.body.block.body[1].body.body.block, tir.stmt.Block) + + +def test_predicate(): + func = predicate + rt_func = tvm.script.from_source(tvm.script.asscript(func, True)) + tvm.ir.assert_structural_equal(func, rt_func) + + assert isinstance(rt_func.body.block, tir.stmt.Block) + assert isinstance(rt_func.body.block.body, tir.stmt.For) + assert isinstance(rt_func.body.block.body.body, tir.stmt.For) + assert isinstance(rt_func.body.block.body.body.body, tir.stmt.For) + assert isinstance(rt_func.body.block.body.body.body.body.block, tir.stmt.Block) + + +@tvm.script.tir +def for_thread_binding(a: ty.handle, b: ty.handle) -> None: + A = tir.match_buffer(a, (16, 16), "float32") + B = tir.match_buffer(b, (16, 16), "float32") + + for i in tir.thread_binding(0, 16, thread="threadIdx.x"): + for j in tir.thread_binding(0, 16, thread="threadIdx.y"): + A[i, j] = B[i, j] + tir.float32(1) + + +def test_for_thread_binding(): + func = for_thread_binding + rt_func = tvm.script.from_source(tvm.script.asscript(func, True)) + tvm.ir.assert_structural_equal(func, rt_func) + + assert isinstance(rt_func.body, tir.stmt.For) + assert rt_func.body.kind == 4 + assert rt_func.body.thread_binding.thread_tag == "threadIdx.x" + assert isinstance(rt_func.body.body, tir.stmt.For) + assert rt_func.body.body.kind == 4 + assert rt_func.body.body.thread_binding.thread_tag == "threadIdx.y" + + +@tvm.script.tir +def block_elements(a: ty.handle, b: ty.handle) -> None: + A = tir.match_buffer(a, (16, 16), "float32") + B = tir.match_buffer(b, (1, 1), "float32") + + with tir.block([1], "update") as [vi]: + tir.bind(vi, 0) + tir.where(True) + tir.reads(A[0:16, 0:16]) + tir.writes(B[0, 0]) + tir.block_attr({"attr_key": "attr_value"}) + C = tir.alloc_buffer((4, 4), 
dtype="float32") + D = tir.match_buffer_region(A[0:4, 0]) + with tir.init(): + B[0, 0] = tir.float32(0) + B[0, 0] = A[0, 0] + B[0, 0] + C[1, 1] + D[2, 0] + + +def test_block_elements(): + func = block_elements + rt_func = tvm.script.from_source(tvm.script.asscript(func, True)) + tvm.ir.assert_structural_equal(func, rt_func) + + assert isinstance(rt_func.body.block, tir.stmt.Block) + assert isinstance(rt_func.body.block.body, tir.stmt.BlockRealize) + assert isinstance(rt_func.body.block.body.block, tir.stmt.Block) + block = rt_func.body.block.body.block + assert isinstance(block.body, tir.stmt.BufferStore) + assert isinstance(block.init, tir.stmt.BufferStore) + assert len(block.annotations) == 1 + assert block.annotations["attr_key"] == "attr_value" + + if __name__ == "__main__": test_opt_gemm_normalize() test_opt_gemm_mod_host() @@ -2669,3 +2835,10 @@ def test_opt_conv_tensorcore_mod_host(): test_opt_conv_tensorcore_normalize() test_opt_conv_tensorcore_lower() test_opt_conv_tensorcore_mod_host() + test_module_define() + test_matmul() + test_matmul_original() + test_element_wise() + test_predicate() + test_for_thread_binding() + test_block_elements() diff --git a/tests/scripts/task_build.sh b/tests/scripts/task_build.sh index d8e35ebd4de3..845b7153ae20 100755 --- a/tests/scripts/task_build.sh +++ b/tests/scripts/task_build.sh @@ -16,4 +16,4 @@ # specific language governing permissions and limitations # under the License. export VTA_HW_PATH=`pwd`/3rdparty/vta-hw -cd $1 && cmake .. && make $2 && cd .. +cd $1 && cmake .. -DCMAKE_BUILD_TYPE=RelWithDebInfo && make $2 && cd .. diff --git a/tests/scripts/task_ci_python_setup.sh b/tests/scripts/task_ci_python_setup.sh index f48ed49a2266..b880cb9d6457 100755 --- a/tests/scripts/task_ci_python_setup.sh +++ b/tests/scripts/task_ci_python_setup.sh @@ -30,4 +30,4 @@ set -o pipefail # echo "Addtiional setup in" ${CI_IMAGE_NAME} -python3 -m pip install --user tlcpack-sphinx-addon==0.1.4 synr==0.2.1 +python3 -m pip install --user tlcpack-sphinx-addon==0.1.4 synr==0.3.0 diff --git a/tests/scripts/task_ci_setup.sh b/tests/scripts/task_ci_setup.sh index 17838c58a83c..9dda54e10523 100755 --- a/tests/scripts/task_ci_setup.sh +++ b/tests/scripts/task_ci_setup.sh @@ -30,7 +30,7 @@ set -o pipefail # echo "Addtiional setup in" ${CI_IMAGE_NAME} -python3 -m pip install --user tlcpack-sphinx-addon==0.1.4 synr==0.2.1 +python3 -m pip install --user tlcpack-sphinx-addon==0.1.4 synr==0.3.0 # Rebuild standalone_crt in build/ tree. This file is not currently archived by pack_lib() in # Jenkinsfile. We expect config.cmake to be present from pack_lib(). diff --git a/tests/scripts/task_config_build_arm.sh b/tests/scripts/task_config_build_arm.sh index 80527466c71e..b3a084aef371 100755 --- a/tests/scripts/task_config_build_arm.sh +++ b/tests/scripts/task_config_build_arm.sh @@ -25,10 +25,9 @@ cp ../cmake/config.cmake . 
echo set\(USE_SORT ON\) >> config.cmake echo set\(USE_RPC ON\) >> config.cmake -echo set\(USE_GRAPH_RUNTIME_DEBUG ON\) >> config.cmake echo set\(USE_MICRO ON\) >> config.cmake echo set\(USE_MICRO_STANDALONE_RUNTIME ON\) >> config.cmake -echo set\(USE_VM_PROFILER ON\) >> config.cmake +echo set\(USE_PROFILER ON\) >> config.cmake echo set\(USE_LLVM llvm-config-8\) >> config.cmake echo set\(CMAKE_CXX_COMPILER g++\) >> config.cmake echo set\(CMAKE_CXX_FLAGS -Werror\) >> config.cmake diff --git a/tests/scripts/task_config_build_cpu.sh b/tests/scripts/task_config_build_cpu.sh index db636063b9e3..2af91d7c6b8e 100755 --- a/tests/scripts/task_config_build_cpu.sh +++ b/tests/scripts/task_config_build_cpu.sh @@ -26,8 +26,7 @@ cp ../cmake/config.cmake . echo set\(USE_SORT ON\) >> config.cmake echo set\(USE_MICRO ON\) >> config.cmake echo set\(USE_MICRO_STANDALONE_RUNTIME ON\) >> config.cmake -echo set\(USE_GRAPH_RUNTIME_DEBUG ON\) >> config.cmake -echo set\(USE_VM_PROFILER ON\) >> config.cmake +echo set\(USE_PROFILER ON\) >> config.cmake echo set\(USE_DNNL_CODEGEN ON\) >> config.cmake echo set\(USE_ARM_COMPUTE_LIB ON\) >> config.cmake echo set\(USE_LLVM llvm-config-11\) >> config.cmake @@ -46,3 +45,4 @@ echo set\(USE_ETHOSN /opt/arm/ethosn-driver\) >> config.cmake echo set\(USE_ETHOSN_HW OFF\) >> config.cmake echo set\(USE_VITIS_AI ON\) >> config.cmake echo set\(USE_VERILATOR ON\) >> config.cmake +echo set\(USE_LIBBACKTRACE ON\) >> config.cmake diff --git a/tests/scripts/task_config_build_gpu.sh b/tests/scripts/task_config_build_gpu.sh index 155bac80533f..609325c9962b 100755 --- a/tests/scripts/task_config_build_gpu.sh +++ b/tests/scripts/task_config_build_gpu.sh @@ -34,10 +34,9 @@ echo set\(USE_NNPACK ON\) >> config.cmake echo set\(NNPACK_PATH /NNPACK/build/\) >> config.cmake echo set\(USE_RPC ON\) >> config.cmake echo set\(USE_SORT ON\) >> config.cmake -echo set\(USE_GRAPH_RUNTIME ON\) >> config.cmake +echo set\(USE_GRAPH_EXECUTOR ON\) >> config.cmake echo set\(USE_STACKVM_RUNTIME ON\) >> config.cmake -echo set\(USE_GRAPH_RUNTIME_DEBUG ON\) >> config.cmake -echo set\(USE_VM_PROFILER ON\) >> config.cmake +echo set\(USE_PROFILER ON\) >> config.cmake echo set\(USE_ANTLR ON\) >> config.cmake echo set\(USE_VTA_TSIM ON\) >> config.cmake echo set\(USE_VTA_FSIM ON\) >> config.cmake @@ -45,3 +44,4 @@ echo set\(USE_BLAS openblas\) >> config.cmake echo set\(CMAKE_CXX_COMPILER g++\) >> config.cmake echo set\(CMAKE_CXX_FLAGS -Werror\) >> config.cmake echo set\(USE_TENSORRT_CODEGEN ON\) >> config.cmake +echo set\(USE_LIBBACKTRACE AUTO\) >> config.cmake diff --git a/tests/scripts/task_config_build_gpu_vulkan.sh b/tests/scripts/task_config_build_gpu_vulkan.sh index 74096b1a9760..f12d0f99cb37 100755 --- a/tests/scripts/task_config_build_gpu_vulkan.sh +++ b/tests/scripts/task_config_build_gpu_vulkan.sh @@ -27,7 +27,7 @@ echo set\(USE_OPENCL ON\) >> config.cmake echo set\(USE_ROCM ON\) >> config.cmake echo set\(USE_VULKAN ON\) >> config.cmake echo set\(USE_MICRO ON\) >> config.cmake -echo set\(USE_GRAPH_RUNTIME_DEBUG ON\) >> config.cmake -echo set\(USE_VM_PROFILER ON\) >> config.cmake +echo set\(USE_PROFILER ON\) >> config.cmake +echo set\(USE_LIBBACKTRACE OFF\) >> config.cmake echo set\(CMAKE_CXX_COMPILER clang-7\) >> config.cmake echo set\(CMAKE_CXX_FLAGS -Werror\) >> config.cmake diff --git a/tests/scripts/task_config_build_i386.sh b/tests/scripts/task_config_build_i386.sh index 68e61c6a039c..05acbb022124 100755 --- a/tests/scripts/task_config_build_i386.sh +++ b/tests/scripts/task_config_build_i386.sh @@ -25,10 
+25,9 @@ cp ../cmake/config.cmake . echo set\(USE_SORT ON\) >> config.cmake echo set\(USE_RPC ON\) >> config.cmake -echo set\(USE_GRAPH_RUNTIME_DEBUG ON\) >> config.cmake echo set\(USE_MICRO ON\) >> config.cmake echo set\(USE_MICRO_STANDALONE_RUNTIME ON\) >> config.cmake -echo set\(USE_VM_PROFILER ON\) >> config.cmake +echo set\(USE_PROFILER ON\) >> config.cmake echo set\(USE_LLVM llvm-config-4.0\) >> config.cmake echo set\(CMAKE_CXX_COMPILER g++\) >> config.cmake echo set\(CMAKE_CXX_FLAGS -Werror\) >> config.cmake diff --git a/tests/scripts/task_config_build_wasm.sh b/tests/scripts/task_config_build_wasm.sh index c37a119b0590..78dc7550028b 100755 --- a/tests/scripts/task_config_build_wasm.sh +++ b/tests/scripts/task_config_build_wasm.sh @@ -26,8 +26,7 @@ cp ../cmake/config.cmake . echo set\(USE_SORT ON\) >> config.cmake echo set\(USE_MICRO ON\) >> config.cmake echo set\(USE_MICRO_STANDALONE_RUNTIME ON\) >> config.cmake -echo set\(USE_GRAPH_RUNTIME_DEBUG ON\) >> config.cmake -echo set\(USE_VM_PROFILER ON\) >> config.cmake +echo set\(USE_PROFILER ON\) >> config.cmake echo set\(USE_LLVM llvm-config-11\) >> config.cmake echo set\(USE_ANTLR ON\) >> config.cmake echo set\(CMAKE_CXX_COMPILER g++\) >> config.cmake diff --git a/tests/scripts/task_java_unittest.sh b/tests/scripts/task_java_unittest.sh index 7ab4afae3c2e..7818d7d458d6 100755 --- a/tests/scripts/task_java_unittest.sh +++ b/tests/scripts/task_java_unittest.sh @@ -32,7 +32,7 @@ TEMP_DIR=$(mktemp -d) python3 $SCRIPT_DIR/test_add_cpu.py $TEMP_DIR python3 $SCRIPT_DIR/test_add_gpu.py $TEMP_DIR -python3 $SCRIPT_DIR/test_graph_runtime.py $TEMP_DIR +python3 $SCRIPT_DIR/test_graph_executor.py $TEMP_DIR # start rpc proxy server PORT=$(( ( RANDOM % 1000 ) + 9000 )) diff --git a/tests/scripts/task_python_docs.sh b/tests/scripts/task_python_docs.sh index 459b680daeb1..1eb75be830c3 100755 --- a/tests/scripts/task_python_docs.sh +++ b/tests/scripts/task_python_docs.sh @@ -74,7 +74,8 @@ cd .. # Rust doc cd rust -cargo doc --workspace --no-deps +# Temp disable rust doc build +# cargo doc --workspace --no-deps cd .. # Prepare the doc dir @@ -84,7 +85,7 @@ rm -f _docs/.buildinfo mkdir -p _docs/api mv docs/doxygen/html _docs/api/doxygen mv jvm/core/target/site/apidocs _docs/api/javadoc -mv rust/target/doc _docs/api/rust +# mv rust/target/doc _docs/api/rust mv web/dist/docs _docs/api/typedoc echo "Start creating the docs tarball.." diff --git a/tests/scripts/task_python_microtvm.sh b/tests/scripts/task_python_microtvm.sh index 2e06932ba536..1c202c0ea40c 100755 --- a/tests/scripts/task_python_microtvm.sh +++ b/tests/scripts/task_python_microtvm.sh @@ -26,4 +26,4 @@ source tests/scripts/setup-pytest-env.sh find . -type f -path "*.pyc" | xargs rm -f make cython3 -run_pytest ctypes python-microtvm-qemu tests/micro/qemu +run_pytest ctypes python-microtvm-zephyr tests/micro/zephyr diff --git a/tests/scripts/task_rust.sh b/tests/scripts/task_rust.sh index 2c87cceec8bb..c40585b62b47 100755 --- a/tests/scripts/task_rust.sh +++ b/tests/scripts/task_rust.sh @@ -58,14 +58,14 @@ cd $RUST_DIR/tvm-rt cargo build cargo test --tests -# Next we test the graph runtime crate. +# Next we test the graph executor crate. cd $RUST_DIR/tvm-graph-rt # We first we compile a model using the Python bindings then run the tests. python3 tests/build_model.py cargo test --tests -# Run some more tests involving the graph runtime API. +# Run some more tests involving the graph executor API. 
cd tests/test_tvm_basic cargo run cd - diff --git a/tutorials/auto_scheduler/tune_conv2d_layer_cuda.py b/tutorials/auto_scheduler/tune_conv2d_layer_cuda.py index 396bdb0d55aa..41fdcbbdbc82 100644 --- a/tutorials/auto_scheduler/tune_conv2d_layer_cuda.py +++ b/tutorials/auto_scheduler/tune_conv2d_layer_cuda.py @@ -145,18 +145,18 @@ def conv2d_layer(N, H, W, CO, CI, KH, KW, stride, padding): conv_np = conv2d_nchw_python(data_np, weight_np, strides, padding) out_np = np.maximum(conv_np + bias_np, 0.0) -ctx = tvm.gpu() -data_tvm = tvm.nd.array(data_np, ctx=ctx) -weight_tvm = tvm.nd.array(weight_np, ctx=ctx) -bias_tvm = tvm.nd.array(bias_np, ctx=ctx) -out_tvm = tvm.nd.empty(out_np.shape, ctx=ctx) +dev = tvm.gpu() +data_tvm = tvm.nd.array(data_np, device=dev) +weight_tvm = tvm.nd.array(weight_np, device=dev) +bias_tvm = tvm.nd.array(bias_np, device=dev) +out_tvm = tvm.nd.empty(out_np.shape, device=dev) func(data_tvm, weight_tvm, bias_tvm, out_tvm) # Check results np.testing.assert_allclose(out_np, out_tvm.asnumpy(), rtol=1e-3) # Evaluate execution time -evaluator = func.time_evaluator(func.entry_name, ctx, min_repeat_ms=500) +evaluator = func.time_evaluator(func.entry_name, dev, min_repeat_ms=500) print( "Execution time of this operator: %.3f ms" % (np.median(evaluator(data_tvm, weight_tvm, bias_tvm, out_tvm).results) * 1000) diff --git a/tutorials/auto_scheduler/tune_network_arm.py b/tutorials/auto_scheduler/tune_network_arm.py index c4add79450e9..153143dd4e94 100644 --- a/tutorials/auto_scheduler/tune_network_arm.py +++ b/tutorials/auto_scheduler/tune_network_arm.py @@ -49,7 +49,7 @@ import tvm from tvm import relay, auto_scheduler import tvm.relay.testing -from tvm.contrib import graph_runtime +from tvm.contrib import graph_executor from tvm.contrib.utils import tempdir ################################################################# @@ -319,15 +319,15 @@ def tune_and_evaluate(): remote.upload(tmp.relpath(filename)) rlib = remote.load_module(filename) - # Create graph runtime - ctx = remote.cpu() - module = graph_runtime.GraphModule(rlib["default"](ctx)) + # Create graph executor + dev = remote.cpu() + module = graph_executor.GraphModule(rlib["default"](dev)) data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype)) module.set_input("data", data_tvm) # Evaluate print("Evaluate inference time cost...") - ftimer = module.module.time_evaluator("run", ctx, repeat=3, min_repeat_ms=500) + ftimer = module.module.time_evaluator("run", dev, repeat=3, min_repeat_ms=500) prof_res = np.array(ftimer().results) * 1e3 # convert to millisecond print( "Mean inference time (std dev): %.2f ms (%.2f ms)" % (np.mean(prof_res), np.std(prof_res)) diff --git a/tutorials/auto_scheduler/tune_network_cuda.py b/tutorials/auto_scheduler/tune_network_cuda.py index 5ed3ceef5ba0..7b5619c671be 100644 --- a/tutorials/auto_scheduler/tune_network_cuda.py +++ b/tutorials/auto_scheduler/tune_network_cuda.py @@ -49,7 +49,7 @@ import tvm from tvm import relay, auto_scheduler import tvm.relay.testing -from tvm.contrib import graph_runtime +from tvm.contrib import graph_executor ################################################################# # Define a Network @@ -252,7 +252,7 @@ def run_tuning(): # The last line also prints the total number of measurement trials, # total time spent on auto-tuning and the id of the next task to tune. 
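[Note: the network tutorials in this series all converge on the same post-rename pattern: build with relay.build, wrap the factory module in tvm.contrib.graph_executor.GraphModule, and time it with time_evaluator against a tvm.device handle. A minimal local sketch of that pattern; the MLP workload and the "data" input name are illustrative choices, not part of the patch, and "llvm" keeps the sketch CPU-only.]

    import numpy as np
    import tvm
    from tvm import relay
    import tvm.relay.testing
    from tvm.contrib import graph_executor

    # Any small Relay workload works here; mlp ships with tvm.relay.testing.
    mod, params = tvm.relay.testing.mlp.get_workload(batch_size=1)
    with tvm.transform.PassContext(opt_level=3):
        lib = relay.build(mod, target="llvm", params=params)

    dev = tvm.device("llvm", 0)  # replaces the removed tvm.context(str(target), 0)
    module = graph_executor.GraphModule(lib["default"](dev))
    module.set_input(
        "data", tvm.nd.array(np.random.uniform(size=(1, 1, 28, 28)).astype("float32"))
    )

    # min_repeat_ms amortizes launch overhead; results come back in seconds.
    ftimer = module.module.time_evaluator("run", dev, repeat=3, min_repeat_ms=500)
    prof_res = np.array(ftimer().results) * 1e3  # convert to milliseconds
    print("Mean inference time: %.2f ms" % np.mean(prof_res))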
# -# There will also be some "dmlc::Error"s and CUDA errors, because the +# There will also be some "tvm::Error"s and CUDA errors, because the # auto-scheduler will try some invalid schedules. # You can safely ignore them if the tuning can continue, because these # errors are isolated from the main process. @@ -280,15 +280,15 @@ def run_tuning(): with tvm.transform.PassContext(opt_level=3, config={"relay.backend.use_auto_scheduler": True}): lib = relay.build(mod, target=target, params=params) -# Create graph runtime -ctx = tvm.context(str(target), 0) -module = graph_runtime.GraphModule(lib["default"](ctx)) +# Create graph executor +dev = tvm.device(str(target), 0) +module = graph_executor.GraphModule(lib["default"](dev)) data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype)) module.set_input("data", data_tvm) # Evaluate print("Evaluate inference time cost...") -ftimer = module.module.time_evaluator("run", ctx, repeat=3, min_repeat_ms=500) +ftimer = module.module.time_evaluator("run", dev, repeat=3, min_repeat_ms=500) prof_res = np.array(ftimer().results) * 1e3 # convert to millisecond print("Mean inference time (std dev): %.2f ms (%.2f ms)" % (np.mean(prof_res), np.std(prof_res))) diff --git a/tutorials/auto_scheduler/tune_network_mali.py b/tutorials/auto_scheduler/tune_network_mali.py index 0af429b994a7..35751fa11f17 100644 --- a/tutorials/auto_scheduler/tune_network_mali.py +++ b/tutorials/auto_scheduler/tune_network_mali.py @@ -49,7 +49,7 @@ import tvm from tvm import relay, auto_scheduler import tvm.relay.testing -from tvm.contrib import graph_runtime +from tvm.contrib import graph_executor import os ################################################################# @@ -181,14 +181,14 @@ def get_network(name, batch_size, layout="NHWC", dtype="float32"): # # from tvm.auto_scheduler.utils import request_remote # remote = request_remote(device_key, "0.0.0.0", 9190) -# ctx = remote.cl() -# max_shared_memory_per_block = ctx.max_shared_memory_per_block +# dev = remote.cl() +# max_shared_memory_per_block = dev.max_shared_memory_per_block # # There is no explicit local memory limition # # so we can use INT32_MAX to disalbe the check on local_memory. 
# max_local_memory_per_block = 2147483647 # INT32_MAX -# max_threads_per_block = ctx.max_threads_per_block -# max_vthread_extent = int(ctx.warp_size / 4) if int(ctx.warp_size / 4) > 1 else ctx.warp_size -# warp_size = ctx.warp_size +# max_threads_per_block = dev.max_threads_per_block +# max_vthread_extent = int(dev.warp_size / 4) if int(dev.warp_size / 4) > 1 else dev.warp_size +# warp_size = dev.warp_size # hardware_params = auto_scheduler.HardwareParams(-1, 16, 64, # max_shared_memory_per_block, max_local_memory_per_block, # max_threads_per_block, max_vthread_extent, warp_size) @@ -243,12 +243,12 @@ def tune_and_evaluate(): ): lib = relay.build(mod, target, params=params) - # Create graph runtime + # Create graph executor print("=============== Request Remote ===============") from tvm.auto_scheduler.utils import request_remote remote = request_remote(device_key, "0.0.0.0", 9190) - ctx = remote.cl() + dev = remote.cl() from tvm.contrib import utils, ndk temp = utils.tempdir() @@ -257,14 +257,14 @@ def tune_and_evaluate(): lib.export_library(path_lib, ndk.create_shared) remote.upload(path_lib) loaded_lib = remote.load_module(filename) - module = graph_runtime.GraphModule(loaded_lib["default"](ctx)) + module = graph_executor.GraphModule(loaded_lib["default"](dev)) data = (np.random.uniform(size=input_shape)).astype(dtype) data_tvm = tvm.nd.array(data) module.set_input("data", data_tvm) # Evaluate print("Evaluate inference time cost...") - ftimer = module.module.time_evaluator("run", ctx, repeat=3, min_repeat_ms=500) + ftimer = module.module.time_evaluator("run", dev, repeat=3, min_repeat_ms=500) prof_res = np.array(ftimer().results) * 1e3 # convert to millisecond print( "Mean inference time (std dev): %.2f ms (%.2f ms)" % (np.mean(prof_res), np.std(prof_res)) @@ -330,7 +330,7 @@ def tune_and_evaluate(): # The last line also prints the total number of measurement trials, # total time spent on auto-tuning and the id of the next task to tune. # -# There will also be some "dmlc::Error"s errors, because the +# There will also be some "tvm::Error"s errors, because the # auto-scheduler will try some invalid schedules. # You can safely ignore them if the tuning can continue, because these # errors are isolated from the main process. diff --git a/tutorials/auto_scheduler/tune_network_x86.py b/tutorials/auto_scheduler/tune_network_x86.py index 8526abbbe6ca..91dc64eec20e 100644 --- a/tutorials/auto_scheduler/tune_network_x86.py +++ b/tutorials/auto_scheduler/tune_network_x86.py @@ -49,7 +49,7 @@ import tvm from tvm import relay, auto_scheduler import tvm.relay.testing -from tvm.contrib import graph_runtime +from tvm.contrib import graph_executor ################################################################# # Define a Network @@ -251,7 +251,7 @@ def run_tuning(): # The last line also prints the total number of measurement trials, # total time spent on auto-tuning and the id of the next task to tune. # -# There will also be some "dmlc::Error"s errors, because the +# There will also be some "tvm::Error"s errors, because the # auto-scheduler will try some invalid schedules. # You can safely ignore them if the tuning can continue, because these # errors are isolated from the main process. 
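[Note: the commented block in the Mali tutorial above derives auto_scheduler.HardwareParams from device attributes queried over RPC. The same attributes exist on a local device handle; below is a sketch assuming a locally visible OpenCL device, with the constructor arguments kept in the positional form the tutorial uses.]

    import tvm
    from tvm import auto_scheduler

    dev = tvm.device("opencl", 0)  # remote.cl() returns the same kind of handle
    assert dev.exist, "no OpenCL device visible"

    max_shared_memory_per_block = dev.max_shared_memory_per_block
    max_local_memory_per_block = 2147483647  # INT32_MAX disables the local-memory check
    max_threads_per_block = dev.max_threads_per_block
    max_vthread_extent = (
        dev.warp_size // 4 if dev.warp_size // 4 > 1 else dev.warp_size
    )

    hardware_params = auto_scheduler.HardwareParams(
        -1, 16, 64,
        max_shared_memory_per_block, max_local_memory_per_block,
        max_threads_per_block, max_vthread_extent, dev.warp_size,
    )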
@@ -279,15 +279,15 @@ def run_tuning(): with tvm.transform.PassContext(opt_level=3, config={"relay.backend.use_auto_scheduler": True}): lib = relay.build(mod, target=target, params=params) -# Create graph runtime -ctx = tvm.context(str(target), 0) -module = graph_runtime.GraphModule(lib["default"](ctx)) +# Create graph executor +dev = tvm.device(str(target), 0) +module = graph_executor.GraphModule(lib["default"](dev)) data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype)) module.set_input("data", data_tvm) # Evaluate print("Evaluate inference time cost...") -ftimer = module.module.time_evaluator("run", ctx, repeat=3, min_repeat_ms=500) +ftimer = module.module.time_evaluator("run", dev, repeat=3, min_repeat_ms=500) prof_res = np.array(ftimer().results) * 1e3 # convert to millisecond print("Mean inference time (std dev): %.2f ms (%.2f ms)" % (np.mean(prof_res), np.std(prof_res))) diff --git a/tutorials/auto_scheduler/tune_sparse_x86.py b/tutorials/auto_scheduler/tune_sparse_x86.py index ced416f6c500..3df372048a14 100644 --- a/tutorials/auto_scheduler/tune_sparse_x86.py +++ b/tutorials/auto_scheduler/tune_sparse_x86.py @@ -274,14 +274,14 @@ def apply_func(search_policy, state, stage_id): func = tvm.build(sch, args, target) -ctx = tvm.cpu() +dev = tvm.cpu() -X_tvm = tvm.nd.array(X_np, ctx=ctx) -W_data_tvm = tvm.nd.array(W_sp_np.data, ctx=ctx) -W_indices_tvm = tvm.nd.array(W_sp_np.indices, ctx=ctx) -W_indptr_tvm = tvm.nd.array(W_sp_np.indptr, ctx=ctx) -B_tvm = tvm.nd.array(B_np, ctx=ctx) -Y_tvm = tvm.nd.empty(Y_np.shape, ctx=ctx) +X_tvm = tvm.nd.array(X_np, device=dev) +W_data_tvm = tvm.nd.array(W_sp_np.data, device=dev) +W_indices_tvm = tvm.nd.array(W_sp_np.indices, device=dev) +W_indptr_tvm = tvm.nd.array(W_sp_np.indptr, device=dev) +B_tvm = tvm.nd.array(B_np, device=dev) +Y_tvm = tvm.nd.empty(Y_np.shape, device=dev) func(X_tvm, W_data_tvm, W_indices_tvm, W_indptr_tvm, B_tvm, Y_tvm) @@ -289,7 +289,7 @@ def apply_func(search_policy, state, stage_id): tvm.testing.assert_allclose(Y_np, Y_tvm.asnumpy(), atol=1e-4, rtol=1e-4) # Evaluate execution time. -evaluator = func.time_evaluator(func.entry_name, ctx, min_repeat_ms=500) +evaluator = func.time_evaluator(func.entry_name, dev, min_repeat_ms=500) print( "Execution time of this operator: %.3f ms" % ( diff --git a/tutorials/autotvm/tune_conv2d_cuda.py b/tutorials/autotvm/tune_conv2d_cuda.py index dc8e6e522249..d14f9c33320c 100644 --- a/tutorials/autotvm/tune_conv2d_cuda.py +++ b/tutorials/autotvm/tune_conv2d_cuda.py @@ -230,15 +230,15 @@ def conv2d_no_batching(N, H, W, CO, CI, KH, KW, stride, padding): w_np = np.random.uniform(size=(CO, CI, KH, KW)).astype(np.float32) c_np = conv2d_nchw_python(a_np, w_np, strides, padding) -ctx = tvm.gpu() -a_tvm = tvm.nd.array(a_np, ctx=ctx) -w_tvm = tvm.nd.array(w_np, ctx=ctx) -c_tvm = tvm.nd.empty(c_np.shape, ctx=ctx) +dev = tvm.gpu() +a_tvm = tvm.nd.array(a_np, device=dev) +w_tvm = tvm.nd.array(w_np, device=dev) +c_tvm = tvm.nd.empty(c_np.shape, device=dev) func(a_tvm, w_tvm, c_tvm) tvm.testing.assert_allclose(c_np, c_tvm.asnumpy(), rtol=1e-2) # Evaluate running time. Here we choose a large repeat number (400) to reduce the noise # and the overhead of kernel launch. You can also use nvprof to validate the result. 
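[Note: for reference on the evaluator knobs appearing in these tutorials: number sets how many runs are averaged into a single measurement, repeat sets how many such measurements are taken, and min_repeat_ms raises number dynamically until one measurement lasts at least that long. A self-contained sketch; the add kernel and sizes are illustrative.]

    import numpy as np
    import tvm
    from tvm import te

    # Build a trivial elementwise kernel so the timing call has work to measure.
    n = 1 << 20
    A = te.placeholder((n,), name="A")
    B = te.compute((n,), lambda i: A[i] + 1.0, name="B")
    s = te.create_schedule(B.op)
    func = tvm.build(s, [A, B], "llvm")

    dev = tvm.device("llvm", 0)
    a = tvm.nd.array(np.random.uniform(size=n).astype("float32"), dev)
    b = tvm.nd.empty((n,), "float32", dev)

    # Three measurements, each averaging 400 runs of the kernel.
    evaluator = func.time_evaluator(func.entry_name, dev, number=400, repeat=3)
    res = evaluator(a, b)
    print("mean %.6f s over %d measurements" % (res.mean, len(res.results)))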
-evaluator = func.time_evaluator(func.entry_name, ctx, number=400) +evaluator = func.time_evaluator(func.entry_name, dev, number=400) print("Time cost of this operator: %f" % evaluator(a_tvm, w_tvm, c_tvm).mean) diff --git a/tutorials/autotvm/tune_relay_arm.py b/tutorials/autotvm/tune_relay_arm.py index 2b389235e4fe..9223eb30cd9d 100644 --- a/tutorials/autotvm/tune_relay_arm.py +++ b/tutorials/autotvm/tune_relay_arm.py @@ -70,7 +70,7 @@ import tvm.relay.testing from tvm.autotvm.tuner import XGBTuner, GATuner, RandomTuner, GridSearchTuner from tvm.contrib.utils import tempdir -import tvm.contrib.graph_runtime as runtime +import tvm.contrib.graph_executor as runtime ################################################################# # Define network @@ -348,14 +348,14 @@ def tune_and_evaluate(tuning_opt): rlib = remote.load_module(filename) # upload parameters to device - ctx = remote.context(str(target), 0) - module = runtime.GraphModule(rlib["default"](ctx)) + dev = remote.device(str(target), 0) + module = runtime.GraphModule(rlib["default"](dev)) data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype)) module.set_input("data", data_tvm) # evaluate print("Evaluate inference time cost...") - ftimer = module.module.time_evaluator("run", ctx, number=1, repeat=10) + ftimer = module.module.time_evaluator("run", dev, number=1, repeat=10) prof_res = np.array(ftimer().results) * 1000 # convert to millisecond print( "Mean inference time (std dev): %.2f ms (%.2f ms)" diff --git a/tutorials/autotvm/tune_relay_cuda.py b/tutorials/autotvm/tune_relay_cuda.py index 148ebbf43dc1..50485c4d7ff2 100644 --- a/tutorials/autotvm/tune_relay_cuda.py +++ b/tutorials/autotvm/tune_relay_cuda.py @@ -67,7 +67,7 @@ from tvm import relay, autotvm import tvm.relay.testing from tvm.autotvm.tuner import XGBTuner, GATuner, RandomTuner, GridSearchTuner -import tvm.contrib.graph_runtime as runtime +import tvm.contrib.graph_executor as runtime ################################################################# # Define Network @@ -237,14 +237,14 @@ def tune_and_evaluate(tuning_opt): lib = relay.build_module.build(mod, target=target, params=params) # load parameters - ctx = tvm.context(str(target), 0) - module = runtime.GraphModule(lib["default"](ctx)) + dev = tvm.device(str(target), 0) + module = runtime.GraphModule(lib["default"](dev)) data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype)) module.set_input("data", data_tvm) # evaluate print("Evaluate inference time cost...") - ftimer = module.module.time_evaluator("run", ctx, number=1, repeat=600) + ftimer = module.module.time_evaluator("run", dev, number=1, repeat=600) prof_res = np.array(ftimer().results) * 1000 # convert to millisecond print( "Mean inference time (std dev): %.2f ms (%.2f ms)" diff --git a/tutorials/autotvm/tune_relay_mobile_gpu.py b/tutorials/autotvm/tune_relay_mobile_gpu.py index 8ba47c4eaabf..2b109873c750 100644 --- a/tutorials/autotvm/tune_relay_mobile_gpu.py +++ b/tutorials/autotvm/tune_relay_mobile_gpu.py @@ -69,7 +69,7 @@ import tvm.relay.testing from tvm.autotvm.tuner import XGBTuner, GATuner, RandomTuner, GridSearchTuner from tvm.contrib.utils import tempdir -import tvm.contrib.graph_runtime as runtime +import tvm.contrib.graph_executor as runtime ################################################################# # Define network @@ -345,14 +345,14 @@ def tune_and_evaluate(tuning_opt): rlib = remote.load_module(filename) # upload parameters to device - ctx = remote.context(str(target), 0) - module = 
runtime.GraphModule(rlib["default"](ctx)) + dev = remote.device(str(target), 0) + module = runtime.GraphModule(rlib["default"](dev)) data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype)) module.set_input("data", data_tvm) # evaluate print("Evaluate inference time cost...") - ftimer = module.module.time_evaluator("run", ctx, number=1, repeat=30) + ftimer = module.module.time_evaluator("run", dev, number=1, repeat=30) prof_res = np.array(ftimer().results) * 1000 # convert to millisecond print( "Mean inference time (std dev): %.2f ms (%.2f ms)" diff --git a/tutorials/autotvm/tune_relay_x86.py b/tutorials/autotvm/tune_relay_x86.py index 30e62efe0d9d..dd5d4057c211 100644 --- a/tutorials/autotvm/tune_relay_x86.py +++ b/tutorials/autotvm/tune_relay_x86.py @@ -36,7 +36,7 @@ from tvm.relay import testing from tvm.autotvm.tuner import XGBTuner, GATuner, RandomTuner, GridSearchTuner from tvm.autotvm.graph_tuner import DPTuner, PBQPTuner -import tvm.contrib.graph_runtime as runtime +import tvm.contrib.graph_executor as runtime ################################################################# # Define network @@ -213,14 +213,14 @@ def tune_and_evaluate(tuning_opt): lib = relay.build_module.build(mod, target=target, params=params) # upload parameters to device - ctx = tvm.cpu() + dev = tvm.cpu() data_tvm = tvm.nd.array((np.random.uniform(size=data_shape)).astype(dtype)) - module = runtime.GraphModule(lib["default"](ctx)) + module = runtime.GraphModule(lib["default"](dev)) module.set_input(input_name, data_tvm) # evaluate print("Evaluate inference time cost...") - ftimer = module.module.time_evaluator("run", ctx, number=100, repeat=3) + ftimer = module.module.time_evaluator("run", dev, number=100, repeat=3) prof_res = np.array(ftimer().results) * 1000 # convert to millisecond print( "Mean inference time (std dev): %.2f ms (%.2f ms)" diff --git a/tutorials/autotvm/tune_simple_template.py b/tutorials/autotvm/tune_simple_template.py deleted file mode 100644 index bd2dcf3cfd1e..000000000000 --- a/tutorials/autotvm/tune_simple_template.py +++ /dev/null @@ -1,336 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -""" -Writing Tunable Templates and Using the Auto-tuner -================================================== -**Author**: `Lianmin Zheng `_ - -This is an introduction tutorial to the auto-tuning module in TVM. - -There are two steps in auto-tuning. -The first step is defining a search space. -The second step is running a search algorithm to explore through this space. -In this tutorial, you can learn how to perform these two steps in TVM. -The whole workflow is illustrated by a matrix multiplication example. - -Note that this tutorial will not run on Windows or recent versions of macOS. 
To -get it to run, you will need to wrap the body of this tutorial in a :code:`if -__name__ == "__main__":` block. -""" - -###################################################################### -# Install dependencies -# -------------------- -# To use autotvm package in TVM, we need to install some extra dependencies. -# This step (installing xgboost) can be skipped as it doesn't need XGBoost -# (change "3" to "2" if you use python2): -# -# .. code-block:: bash -# -# pip3 install --user psutil xgboost cloudpickle -# -# To make TVM run faster in tuning, it is recommended to use cython -# as FFI of TVM. In the root directory of TVM, execute -# (change "3" to "2" if you use python2): -# -# .. code-block:: bash -# -# pip3 install --user cython -# sudo make cython3 -# -# Now return to python code. Import packages. - -import logging -import sys - -import numpy as np -import tvm -from tvm import te -import tvm.testing - -# the module is called `autotvm` -from tvm import autotvm - -###################################################################### -# Step 1: Define the search space -# -------------------------------- -# In this section, we will rewrite a deterministic TVM schedule code to a -# tunable schedule template. You can regard the process of search space definition -# as the parameterization of our existing schedule code. -# -# To begin with, here is how we implement a blocked matrix multiplication in TVM. - -# Matmul V0: Constant tiling factor -def matmul_v0(N, L, M, dtype): - A = te.placeholder((N, L), name="A", dtype=dtype) - B = te.placeholder((L, M), name="B", dtype=dtype) - - k = te.reduce_axis((0, L), name="k") - C = te.compute((N, M), lambda i, j: te.sum(A[i, k] * B[k, j], axis=k), name="C") - s = te.create_schedule(C.op) - - # schedule - y, x = s[C].op.axis - k = s[C].op.reduce_axis[0] - - yo, yi = s[C].split(y, 8) - xo, xi = s[C].split(x, 8) - - s[C].reorder(yo, xo, k, yi, xi) - - return s, [A, B, C] - - -##################################################################### -# Parametrize the schedule -# ^^^^^^^^^^^^^^^^^^^^^^^^ -# In the previous schedule code, we use a constant "8" as tiling factor. -# However, it might not be the best one because the best tiling factor depends -# on real hardware environment and input shape. -# -# If you want the schedule code to be portable across a wider range of input shapes -# and target hardware, it is better to define a set of candidate values and -# pick the best one according to the measurement results on target hardware. -# -# In autotvm, we can define a tunable parameter, or a "knob" for such kind of value. - -# Matmul V1: List candidate values -@autotvm.template("tutorial/matmul_v1") # 1. use a decorator -def matmul_v1(N, L, M, dtype): - A = te.placeholder((N, L), name="A", dtype=dtype) - B = te.placeholder((L, M), name="B", dtype=dtype) - - k = te.reduce_axis((0, L), name="k") - C = te.compute((N, M), lambda i, j: te.sum(A[i, k] * B[k, j], axis=k), name="C") - s = te.create_schedule(C.op) - - # schedule - y, x = s[C].op.axis - k = s[C].op.reduce_axis[0] - - # 2. get the config object - cfg = autotvm.get_config() - - # 3. define search space - cfg.define_knob("tile_y", [1, 2, 4, 8, 16]) - cfg.define_knob("tile_x", [1, 2, 4, 8, 16]) - - # 4. 
schedule according to config - yo, yi = s[C].split(y, cfg["tile_y"].val) - xo, xi = s[C].split(x, cfg["tile_x"].val) - - s[C].reorder(yo, xo, k, yi, xi) - - return s, [A, B, C] - - -############################################################################### -# Here we make four modifications to the previous schedule code and get -# a tunable "template". We can explain the modifications one by one. -# -# 1. Use a decorator to mark this function as a simple template. -# 2. Get a config object: -# You can regard this :code:`cfg` as an argument of this function but -# we obtain it in a different way. With this argument, this function is no longer -# a deterministic schedule code. Instead, we can pass different configurations to -# this function and get different schedules, so this function is a "template". -# -# To make the template function more compact, we do two things in a single function. -# (1) define a search space and (2) schedule according to an entity in this space. -# To achieve this, we make :code:`cfg` be either -# a :any:`ConfigSpace` or a :any:`ConfigEntity` object. -# -# When it is a :any:`ConfigSpace`, it will collect all tunable knobs in this function and -# build the search space. -# When it is a :any:`ConfigEntity`, it will ignore all space definition API -# (namely, :code:`cfg.define_XXXXX(...)`). Instead, it stores deterministic values for -# all tunable knobs, and we schedule according to these values. -# -# During auto-tuning, we will first call this template with a :any:`ConfigSpace` -# object to build the search space. Then we call this template with different :any:`ConfigEntity` -# in the built space to get different schedules. Finally we will measure the code generated by -# different schedules and pick the best one. -# -# 3. Define two tunable knobs. The first one is :code:`tile_y` with -# 5 possible values. The second one is :code:`tile_x` with a same -# list of possible values. These two knobs are independent, so they -# span a search space with size = 5x5 = 25 -# 4. Schedule according to the deterministic values in :code:`cfg` -# - -##################################################################### -# Use better space definition API -# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -# In the previous template, we manually list all possible values for a knob. -# This is the lowest level API to define the space. -# However, we also provide another set of API to make the space definition -# easier and smarter. It is recommended to use this set of high level API. -# -# In the following example, we use :any:`ConfigSpace.define_split` to define a split -# knob. It will enumerate all the possible ways to split an axis and construct -# the space. -# -# We also have :any:`ConfigSpace.define_reorder` for reorder knob and -# :any:`ConfigSpace.define_annotate` for annotation like unroll, vectorization, -# thread binding. -# When the high level API cannot meet your requirement, you can always fall -# back to use low level API. 
- - -@autotvm.template("tutorial/matmul") -def matmul(N, L, M, dtype): - A = te.placeholder((N, L), name="A", dtype=dtype) - B = te.placeholder((L, M), name="B", dtype=dtype) - - k = te.reduce_axis((0, L), name="k") - C = te.compute((N, M), lambda i, j: te.sum(A[i, k] * B[k, j], axis=k), name="C") - s = te.create_schedule(C.op) - - # schedule - y, x = s[C].op.axis - k = s[C].op.reduce_axis[0] - - ##### define space begin ##### - cfg = autotvm.get_config() - cfg.define_split("tile_y", y, num_outputs=2) - cfg.define_split("tile_x", x, num_outputs=2) - ##### define space end ##### - - # schedule according to config - yo, yi = cfg["tile_y"].apply(s, C, y) - xo, xi = cfg["tile_x"].apply(s, C, x) - - s[C].reorder(yo, xo, k, yi, xi) - - return s, [A, B, C] - - -###################################################################### -# .. note:: More Explanation on :code:`cfg.defile_split` -# -# In this template, :code:`cfg.define_split("tile_y", y, num_outputs=2)` will enumerate -# all possible combinations that can split axis y into two axes with factors of the length of y. -# For example, if the length of y is 32 and we want to split it into two axes -# using factors of 32, then there are 6 possible values for -# (length of outer axis, length of inner axis) pair, namely -# (32, 1), (16, 2), (8, 4), (4, 8), (2, 16) or (1, 32). -# They are just the 6 possible values of `tile_y`. -# -# During schedule, :code:`cfg["tile_y"]` is a :code:`SplitEntity` object. -# We stores the lengths of outer axes and inner axes in :code:`cfg['tile_y'].size` -# (a tuple with two elements). -# In this template, we apply it by using :code:`yo, yi = cfg['tile_y'].apply(s, C, y)`. -# Actually, this is equivalent to -# :code:`yo, yi = s[C].split(y, cfg["tile_y"].size[1])` -# or :code:`yo, yi = s[C].split(y, nparts=cfg['tile_y"].size[0])` -# -# The advantage of using cfg.apply API is that it makes multi-level split -# (when num_outputs >= 3) easier. - -###################################################################### -# Step 2: Search through the space -# --------------------------------- -# In step 1, we build the search space by extending our old schedule code -# into a template. The next step is to pick a tuner and explore in this space. -# -# Auto-tuners in TVM -# ^^^^^^^^^^^^^^^^^^ -# The job for a tuner can be described by following pseudo code -# -# .. code-block:: c -# -# ct = 0 -# while ct < max_number_of_trials: -# propose a batch of configs -# measure this batch of configs on real hardware and get results -# ct += batch_size -# -# When proposing the next batch of configs, the tuner can take different strategies. We -# provide four tuners with different strategies in autotvm. -# -# * :any:`RandomTuner`: Enumerate the space in a random order -# * :any:`GridSearchTuner`: Enumerate the space in a grid search order -# * :any:`GATuner`: Using genetic algorithm to search through the space -# * :any:`XGBTuner`: Uses a model based method. Train a XGBoost model to predict the speed of lowered IR and pick the next batch according to the prediction. -# -# You can choose the tuner according to the size of your space, your time budget and other factors. -# For example, if your space is very small (less than 1000), a gridsearch tuner or a -# random tuner is good enough. If your space is at the level of 10^9 (this is the space -# size of a conv2d operator on CUDA GPU), XGBoostTuner can explore more efficiently -# and find better configs. 
- -################################################################ -# Begin tuning -# ^^^^^^^^^^^^ -# Here we continue our matrix multiplication example. -# First we should create a tuning task. -# We can also inspect the initialized search space. -# In this case, for a 512x512 square matrix multiplication, the space size -# is 10x10=100 -N, L, M = 512, 512, 512 -task = autotvm.task.create("tutorial/matmul", args=(N, L, M, "float32"), target="llvm") -print(task.config_space) - -################################################################ -# Then we need to define how to measure the generated code and pick a tuner. -# Since our space is small, a random tuner is just okay. -# -# We only make 10 trials in this tutorial for demonstration. In practice, -# you can do more trials according to your time budget. -# We will log the tuning results into a log file. This file can be -# used to get the best config later. - -# logging config (for printing tuning log to the screen) -logging.getLogger("autotvm").setLevel(logging.DEBUG) -logging.getLogger("autotvm").addHandler(logging.StreamHandler(sys.stdout)) - -# There are two steps for measuring a config: build and run. -# By default, we use all CPU cores to compile program. Then measure them sequentially. -# We measure 5 times and take average to reduce variance. -measure_option = autotvm.measure_option(builder="local", runner=autotvm.LocalRunner(number=5)) - -# Begin tuning with RandomTuner, log records to file `matmul.log` -# You can use alternatives like XGBTuner. -tuner = autotvm.tuner.RandomTuner(task) -tuner.tune( - n_trial=10, - measure_option=measure_option, - callbacks=[autotvm.callback.log_to_file("matmul.log")], -) - -######################################################################### -# Finally we apply history best from the cache file and check its correctness. -# We can call the function :code:`matmul` directly under the -# :any:`autotvm.apply_history_best` context. When we call this function, -# it will query the dispatch context with its argument and get the best config -# with the same argument. 
- -# apply history best from log file -with autotvm.apply_history_best("matmul.log"): - with tvm.target.Target("llvm"): - s, arg_bufs = matmul(N, L, M, "float32") - func = tvm.build(s, arg_bufs) - -# check correctness -a_np = np.random.uniform(size=(N, L)).astype(np.float32) -b_np = np.random.uniform(size=(L, M)).astype(np.float32) -c_np = a_np.dot(b_np) - -c_tvm = tvm.nd.empty(c_np.shape) -func(tvm.nd.array(a_np), tvm.nd.array(b_np), c_tvm) - -tvm.testing.assert_allclose(c_np, c_tvm.asnumpy(), rtol=1e-2) diff --git a/tutorials/frontend/build_gcn.py b/tutorials/frontend/build_gcn.py index b832d18f9c3a..e73dc2dca287 100644 --- a/tutorials/frontend/build_gcn.py +++ b/tutorials/frontend/build_gcn.py @@ -175,7 +175,7 @@ def evaluate(data, logits): # = ((H * W)^t * A^t)^t # = ((W^t * H^t) * A^t)^t from tvm import relay -from tvm.contrib import graph_runtime +from tvm.contrib import graph_executor import tvm from tvm import te @@ -335,9 +335,9 @@ def prepare_params(g, data): with tvm.transform.PassContext(opt_level=0): # Currently only support opt_level=0 lib = relay.build(mod, target, params=params) -# Generate graph runtime -ctx = tvm.context(target, 0) -m = graph_runtime.GraphModule(lib["default"](ctx)) +# Generate graph executor +dev = tvm.device(target, 0) +m = graph_executor.GraphModule(lib["default"](dev)) ###################################################################### # Run the TVM model, test for accuracy and verify with DGL diff --git a/tutorials/frontend/deploy_model_on_android.py b/tutorials/frontend/deploy_model_on_android.py index aa9726b6e27a..158280fe9447 100644 --- a/tutorials/frontend/deploy_model_on_android.py +++ b/tutorials/frontend/deploy_model_on_android.py @@ -34,7 +34,7 @@ from tvm import te import tvm.relay as relay from tvm import rpc -from tvm.contrib import utils, ndk, graph_runtime as runtime +from tvm.contrib import utils, ndk, graph_executor as runtime from tvm.contrib.download import download_testdata @@ -71,7 +71,7 @@ # -DUSE_RPC=ON \ # -DUSE_SORT=ON \ # -DUSE_VULKAN=ON \ -# -DUSE_GRAPH_RUNTIME=ON \ +# -DUSE_GRAPH_EXECUTOR=ON \ # .. 
# make -j10 # @@ -301,20 +301,20 @@ def transform_image(image): remote = tracker.request(key, priority=0, session_timeout=60) if local_demo: - ctx = remote.cpu(0) + dev = remote.cpu(0) elif test_target == "opencl": - ctx = remote.cl(0) + dev = remote.cl(0) elif test_target == "vulkan": - ctx = remote.vulkan(0) + dev = remote.vulkan(0) else: - ctx = remote.cpu(0) + dev = remote.cpu(0) # upload the library to remote device and load it remote.upload(lib_fname) rlib = remote.load_module("net.so") # create the remote runtime module -module = runtime.GraphModule(rlib["default"](ctx)) +module = runtime.GraphModule(rlib["default"](dev)) ###################################################################### # Execute on TVM @@ -332,7 +332,7 @@ def transform_image(image): print("TVM prediction top-1: {}".format(synset[top1])) print("Evaluate inference time cost...") -ftimer = module.module.time_evaluator("run", ctx, number=1, repeat=10) +ftimer = module.module.time_evaluator("run", dev, number=1, repeat=10) prof_res = np.array(ftimer().results) * 1000 # convert to millisecond print("Mean inference time (std dev): %.2f ms (%.2f ms)" % (np.mean(prof_res), np.std(prof_res))) diff --git a/tutorials/frontend/deploy_model_on_rasp.py b/tutorials/frontend/deploy_model_on_rasp.py index cae9d905898b..a59665f62f1c 100644 --- a/tutorials/frontend/deploy_model_on_rasp.py +++ b/tutorials/frontend/deploy_model_on_rasp.py @@ -30,7 +30,7 @@ from tvm import te import tvm.relay as relay from tvm import rpc -from tvm.contrib import utils, graph_runtime as runtime +from tvm.contrib import utils, graph_executor as runtime from tvm.contrib.download import download_testdata ###################################################################### @@ -217,8 +217,8 @@ def transform_image(image): rlib = remote.load_module("net.tar") # create the remote runtime module -ctx = remote.cpu(0) -module = runtime.GraphModule(rlib["default"](ctx)) +dev = remote.cpu(0) +module = runtime.GraphModule(rlib["default"](dev)) # set input data module.set_input("data", tvm.nd.array(x.astype("float32"))) # run diff --git a/tutorials/frontend/deploy_object_detection_pytorch.py b/tutorials/frontend/deploy_object_detection_pytorch.py index 2852dd3ad99d..6b88b617be18 100644 --- a/tutorials/frontend/deploy_object_detection_pytorch.py +++ b/tutorials/frontend/deploy_object_detection_pytorch.py @@ -134,8 +134,8 @@ def forward(self, inp): ###################################################################### # Inference with Relay VM # ----------------------- -ctx = tvm.cpu() -vm = VirtualMachine(vm_exec, ctx) +dev = tvm.cpu() +vm = VirtualMachine(vm_exec, dev) vm.set_input("main", **{input_name: img}) tvm_res = vm.run() diff --git a/tutorials/frontend/deploy_prequantized.py b/tutorials/frontend/deploy_prequantized.py index beba332a8a26..308027a4a193 100644 --- a/tutorials/frontend/deploy_prequantized.py +++ b/tutorials/frontend/deploy_prequantized.py @@ -90,7 +90,7 @@ def run_tvm_model(mod, params, input_name, inp, target="llvm"): with tvm.transform.PassContext(opt_level=3): lib = relay.build(mod, target=target, params=params) - runtime = tvm.contrib.graph_runtime.GraphModule(lib["default"](tvm.context(target, 0))) + runtime = tvm.contrib.graph_executor.GraphModule(lib["default"](tvm.device(target, 0))) runtime.set_input(input_name, inp) runtime.run() @@ -198,8 +198,8 @@ def quantize_model(model, inp): # ------------------------- # Here we give an example of how to measure performance of TVM compiled models. 
n_repeat = 100 # should be bigger to make the measurement more accurate -ctx = tvm.cpu(0) -ftimer = rt_mod.module.time_evaluator("run", ctx, number=1, repeat=n_repeat) +dev = tvm.cpu(0) +ftimer = rt_mod.module.time_evaluator("run", dev, number=1, repeat=n_repeat) prof_res = np.array(ftimer().results) * 1e3 print("Elapsed average ms:", np.mean(prof_res)) diff --git a/tutorials/frontend/deploy_prequantized_tflite.py b/tutorials/frontend/deploy_prequantized_tflite.py index 121ad9dda151..e0f9a6b2ebde 100644 --- a/tutorials/frontend/deploy_prequantized_tflite.py +++ b/tutorials/frontend/deploy_prequantized_tflite.py @@ -168,9 +168,9 @@ def run_tflite_model(tflite_model_buf, input_data): ############################################################################### # Lets run TVM compiled pre-quantized model inference and get the TVM prediction. def run_tvm(lib): - from tvm.contrib import graph_runtime + from tvm.contrib import graph_executor - rt_mod = graph_runtime.GraphModule(lib["default"](tvm.cpu(0))) + rt_mod = graph_executor.GraphModule(lib["default"](tvm.cpu(0))) rt_mod.set_input("input", data) rt_mod.run() tvm_res = rt_mod.get_output(0).asnumpy() @@ -231,8 +231,8 @@ def run_tvm(lib): # ------------------- # Here we give an example of how to measure performance of TVM compiled models. n_repeat = 100 # should be bigger to make the measurement more accurate -ctx = tvm.cpu(0) -ftimer = rt_mod.module.time_evaluator("run", ctx, number=1, repeat=n_repeat) +dev = tvm.cpu(0) +ftimer = rt_mod.module.time_evaluator("run", dev, number=1, repeat=n_repeat) prof_res = np.array(ftimer().results) * 1e3 print("Elapsed average ms:", np.mean(prof_res)) diff --git a/tutorials/frontend/deploy_quantized.py b/tutorials/frontend/deploy_quantized.py index e75f6e92a6f1..b2210b8ab69b 100644 --- a/tutorials/frontend/deploy_quantized.py +++ b/tutorials/frontend/deploy_quantized.py @@ -39,7 +39,7 @@ batch_size = 1 model_name = "resnet18_v1" target = "cuda" -ctx = tvm.context(target) +dev = tvm.device(target) ############################################################################### # Prepare the Dataset @@ -146,7 +146,7 @@ def quantize(mod, params, data_aware): # ------------- # We create a Relay VM to build and execute the model. def run_inference(mod): - executor = relay.create_executor("vm", mod, ctx, target) + executor = relay.create_executor("vm", mod, dev, target) val_data, batch_fn = get_val_data() for i, batch in enumerate(val_data): data, label = batch_fn(batch) diff --git a/tutorials/frontend/deploy_sparse.py b/tutorials/frontend/deploy_sparse.py index 98004a93c74f..1fcb1b3246da 100644 --- a/tutorials/frontend/deploy_sparse.py +++ b/tutorials/frontend/deploy_sparse.py @@ -82,7 +82,7 @@ import numpy as np import tensorflow as tf from tvm import relay, runtime -from tvm.contrib import graph_runtime +from tvm.contrib import graph_executor from tvm.relay import data_dep_optimization as ddo from tensorflow.python.framework.convert_to_constants import ( convert_variables_to_constants_v2, @@ -106,7 +106,7 @@ # appropriately for your specific machine. CUDA and ROCm are also supported. target = "llvm" # Which device to run on. Should be one of tvm.cpu() or tvm.gpu(). -ctx = tvm.cpu() +dev = tvm.cpu() # If true, then a sparse variant of the network will be run and # benchmarked. 
measure_sparse = True @@ -208,18 +208,18 @@ def import_graphdef( # the weights are sparse, we won't see any speedup because we are using # regular dense matrix multiplications on these dense (but mostly zero) # tensors instead of sparse aware kernels. -def run_relay_graph(mod, params, shape_dict, target, ctx): +def run_relay_graph(mod, params, shape_dict, target, dev): with relay.build_config(opt_level=3): lib = relay.build(mod, target=target, params=params) input_shape = shape_dict["input_1"] dummy_data = np.random.uniform(size=input_shape, low=0, high=input_shape[1]).astype("int32") - m = graph_runtime.GraphModule(lib["default"](ctx)) + m = graph_executor.GraphModule(lib["default"](dev)) m.set_input(0, dummy_data) m.run() tvm_output = m.get_output(0) - ftimer = m.module.time_evaluator("run", ctx, repeat=5, number=5) + ftimer = m.module.time_evaluator("run", dev, repeat=5, number=5) prof_res = np.array(ftimer().results) * 1000 print( "%-20s %-19s (%s)" @@ -228,9 +228,9 @@ def run_relay_graph(mod, params, shape_dict, target, ctx): return tvm_output -def run_dense(mod, params, shape_dict, target, ctx): +def run_dense(mod, params, shape_dict, target, dev): print("Dense Model Benchmark:") - return run_relay_graph(mod, params, shape_dict, target, ctx) + return run_relay_graph(mod, params, shape_dict, target, dev) ############################################################################### @@ -295,13 +295,13 @@ def deepcopy(param_dic): return new_params -def run_sparse(mod, params, shape_dict, target, ctx, bs_r, sparsity, gen_weights): +def run_sparse(mod, params, shape_dict, target, dev, bs_r, sparsity, gen_weights): mod, params = ddo.simplify_fc_transpose.convert(mod["main"], params) if gen_weights: params = random_sparse_bert_params(mod, params, BS_R=bs_r, BS_C=1, density=1 - sparsity) mod, params = ddo.bsr_dense.convert(mod, params, (bs_r, 1), sparsity_threshold=0.8) print("Block Sparse Model with {blocksize}x1 blocks:".format(blocksize=bs_r)) - return run_relay_graph(mod, params, shape_dict, target, ctx) + return run_relay_graph(mod, params, shape_dict, target, dev) ############################################################################### @@ -312,10 +312,10 @@ def run_sparse(mod, params, shape_dict, target, ctx, bs_r, sparsity, gen_weights # you'll need to uncomment the last line first. def benchmark(): mod, params, shape_dict = import_graphdef(name, batch_size, seq_len) - run_dense(mod, params, shape_dict, target, ctx) + run_dense(mod, params, shape_dict, target, dev) if measure_sparse: gen_weights = "prune" not in name - run_sparse(mod, params, shape_dict, target, ctx, bs_r, sparsity, gen_weights) + run_sparse(mod, params, shape_dict, target, dev, bs_r, sparsity, gen_weights) # benchmark() diff --git a/tutorials/frontend/deploy_ssd_gluoncv.py b/tutorials/frontend/deploy_ssd_gluoncv.py index 478aff255e0c..40b40ce1f441 100644 --- a/tutorials/frontend/deploy_ssd_gluoncv.py +++ b/tutorials/frontend/deploy_ssd_gluoncv.py @@ -28,7 +28,7 @@ from matplotlib import pyplot as plt from tvm import relay -from tvm.contrib import graph_runtime +from tvm.contrib import graph_executor from tvm.contrib.download import download_testdata from gluoncv import model_zoo, data, utils @@ -100,10 +100,10 @@ def build(target): # enabled thrust during cmake by -DUSE_THRUST=ON. 
-def run(lib, ctx): +def run(lib, dev): # Build TVM runtime - m = graph_runtime.GraphModule(lib["default"](ctx)) - tvm_input = tvm.nd.array(x.asnumpy(), ctx=ctx) + m = graph_executor.GraphModule(lib["default"](dev)) + tvm_input = tvm.nd.array(x.asnumpy(), device=dev) m.set_input("data", tvm_input) # execute m.run() @@ -113,10 +113,10 @@ def run(lib, ctx): for target in ["llvm", "cuda"]: - ctx = tvm.context(target, 0) - if ctx.exist: + dev = tvm.device(target, 0) + if dev.exist: lib = build(target) - class_IDs, scores, bounding_boxs = run(lib, ctx) + class_IDs, scores, bounding_boxs = run(lib, dev) ###################################################################### # Display result diff --git a/tutorials/frontend/from_caffe2.py b/tutorials/frontend/from_caffe2.py index 34581c60db3c..a3378de8b0e3 100644 --- a/tutorials/frontend/from_caffe2.py +++ b/tutorials/frontend/from_caffe2.py @@ -105,12 +105,12 @@ def transform_image(image): # The process is no different from other examples. import tvm from tvm import te -from tvm.contrib import graph_runtime +from tvm.contrib import graph_executor # context x86 CPU, use tvm.gpu(0) if you run on GPU -ctx = tvm.cpu(0) +dev = tvm.cpu(0) # create a runtime executor module -m = graph_runtime.GraphModule(lib["default"](ctx)) +m = graph_executor.GraphModule(lib["default"](dev)) # set inputs m.set_input(input_name, tvm.nd.array(data.astype("float32"))) # execute diff --git a/tutorials/frontend/from_coreml.py b/tutorials/frontend/from_coreml.py index c868a7fe2899..ea8817d3a0a8 100644 --- a/tutorials/frontend/from_coreml.py +++ b/tutorials/frontend/from_coreml.py @@ -81,11 +81,11 @@ # Execute on TVM # ------------------- # The process is no different from other example -from tvm.contrib import graph_runtime +from tvm.contrib import graph_executor -ctx = tvm.cpu(0) +dev = tvm.cpu(0) dtype = "float32" -m = graph_runtime.GraphModule(lib["default"](ctx)) +m = graph_executor.GraphModule(lib["default"](dev)) # set inputs m.set_input("image", tvm.nd.array(x.astype(dtype))) # execute diff --git a/tutorials/frontend/from_darknet.py b/tutorials/frontend/from_darknet.py index 76205b526e85..b29ed3d962c7 100644 --- a/tutorials/frontend/from_darknet.py +++ b/tutorials/frontend/from_darknet.py @@ -95,7 +95,7 @@ # ------------------------- # compile the model target = tvm.target.Target("llvm", host="llvm") -ctx = tvm.cpu(0) +dev = tvm.cpu(0) data = np.empty([batch_size, net.c, net.h, net.w], dtype) shape = {"data": data.shape} print("Compiling the model...") @@ -116,9 +116,9 @@ # Execute on TVM Runtime # ---------------------- # The process is no different from other examples. 
-from tvm.contrib import graph_runtime +from tvm.contrib import graph_executor -m = graph_runtime.GraphModule(lib["default"](ctx)) +m = graph_executor.GraphModule(lib["default"](dev)) # set inputs m.set_input("data", tvm.nd.array(data.astype(dtype))) diff --git a/tutorials/frontend/from_keras.py b/tutorials/frontend/from_keras.py index 25a1e5c9d1fa..5f39a24c9b14 100644 --- a/tutorials/frontend/from_keras.py +++ b/tutorials/frontend/from_keras.py @@ -96,9 +96,9 @@ mod, params = relay.frontend.from_keras(keras_resnet50, shape_dict) # compile the model target = "cuda" -ctx = tvm.gpu(0) +dev = tvm.gpu(0) with tvm.transform.PassContext(opt_level=3): - executor = relay.build_module.create_executor("graph", mod, ctx, target) + executor = relay.build_module.create_executor("graph", mod, dev, target) ###################################################################### # Execute on TVM diff --git a/tutorials/frontend/from_mxnet.py b/tutorials/frontend/from_mxnet.py index d103d17e5d24..bfaac2c6c98e 100644 --- a/tutorials/frontend/from_mxnet.py +++ b/tutorials/frontend/from_mxnet.py @@ -104,11 +104,11 @@ def transform_image(image): # Execute the portable graph on TVM # --------------------------------- # Now, we would like to reproduce the same forward computation using TVM. -from tvm.contrib import graph_runtime +from tvm.contrib import graph_executor -ctx = tvm.gpu(0) +dev = tvm.gpu(0) dtype = "float32" -m = graph_runtime.GraphModule(lib["default"](ctx)) +m = graph_executor.GraphModule(lib["default"](dev)) # set inputs m.set_input("data", tvm.nd.array(x.astype(dtype))) # execute diff --git a/tutorials/frontend/from_pytorch.py b/tutorials/frontend/from_pytorch.py index 069626407e63..5f515e656bc8 100644 --- a/tutorials/frontend/from_pytorch.py +++ b/tutorials/frontend/from_pytorch.py @@ -101,7 +101,7 @@ # ----------- # Compile the graph to llvm target with given input specification. target = tvm.target.Target("llvm", host="llvm") -ctx = tvm.cpu(0) +dev = tvm.cpu(0) with tvm.transform.PassContext(opt_level=3): lib = relay.build(mod, target=target, params=params) @@ -109,10 +109,10 @@ # Execute the portable graph on TVM # --------------------------------- # Now we can try deploying the compiled model on target. -from tvm.contrib import graph_runtime +from tvm.contrib import graph_executor dtype = "float32" -m = graph_runtime.GraphModule(lib["default"](ctx)) +m = graph_executor.GraphModule(lib["default"](dev)) # Set inputs m.set_input(input_name, tvm.nd.array(img.astype(dtype))) # Execute diff --git a/tutorials/frontend/from_tensorflow.py b/tutorials/frontend/from_tensorflow.py index 3d01c6b0e407..9c8d0f65878c 100644 --- a/tutorials/frontend/from_tensorflow.py +++ b/tutorials/frontend/from_tensorflow.py @@ -72,10 +72,10 @@ # Use these commented settings to build for cuda. # target = tvm.target.Target("cuda", host="llvm") # layout = "NCHW" -# ctx = tvm.gpu(0) +# dev = tvm.gpu(0) target = tvm.target.Target("llvm", host="llvm") layout = None -ctx = tvm.cpu(0) +dev = tvm.cpu(0) ###################################################################### # Download required files @@ -150,10 +150,10 @@ # --------------------------------- # Now we can try deploying the compiled model on target. 
-from tvm.contrib import graph_runtime +from tvm.contrib import graph_executor dtype = "uint8" -m = graph_runtime.GraphModule(lib["default"](ctx)) +m = graph_executor.GraphModule(lib["default"](dev)) # set inputs m.set_input("DecodeJpeg/contents", tvm.nd.array(x.astype(dtype))) # execute diff --git a/tutorials/frontend/from_tflite.py b/tutorials/frontend/from_tflite.py index f7e8422c37b6..a85cfcea913c 100644 --- a/tutorials/frontend/from_tflite.py +++ b/tutorials/frontend/from_tflite.py @@ -148,7 +148,7 @@ def extract(path): # -------------- import tvm from tvm import te -from tvm.contrib import graph_runtime as runtime +from tvm.contrib import graph_executor as runtime # Create a runtime executor module module = runtime.GraphModule(lib["default"](tvm.cpu())) diff --git a/tutorials/frontend/using_external_lib.py b/tutorials/frontend/using_external_lib.py index 8e7fcd70e3e9..232f618bb28a 100644 --- a/tutorials/frontend/using_external_lib.py +++ b/tutorials/frontend/using_external_lib.py @@ -34,7 +34,7 @@ import tvm from tvm import te import numpy as np -from tvm.contrib import graph_runtime as runtime +from tvm.contrib import graph_executor as runtime from tvm import relay from tvm.relay import testing import tvm.testing @@ -77,9 +77,9 @@ target = "cuda" lib = relay.build_module.build(net, target, params=params) -ctx = tvm.context(target, 0) +dev = tvm.device(target, 0) data = np.random.uniform(-1, 1, size=data_shape).astype("float32") -module = runtime.GraphModule(lib["default"](ctx)) +module = runtime.GraphModule(lib["default"](dev)) module.set_input("data", data) module.run() out_shape = (batch_size, out_channels, 224, 224) @@ -498,9 +498,9 @@ target = "cuda -libs=cudnn" # use cudnn for convolution lib = relay.build_module.build(net, target, params=params) -ctx = tvm.context(target, 0) +dev = tvm.device(target, 0) data = np.random.uniform(-1, 1, size=data_shape).astype("float32") -module = runtime.GraphModule(lib["default"](ctx)) +module = runtime.GraphModule(lib["default"](dev)) module.set_input("data", data) module.run() out_shape = (batch_size, out_channels, 224, 224) diff --git a/tutorials/get_started/README.txt b/tutorials/get_started/README.txt index 41b1b44055f1..aa6c559c1b38 100644 --- a/tutorials/get_started/README.txt +++ b/tutorials/get_started/README.txt @@ -1,2 +1,2 @@ -Get Started Tutorials ---------------------- +Getting Started With TVM +------------------------ diff --git a/tutorials/get_started/autotvm_matmul.py b/tutorials/get_started/autotvm_matmul.py new file mode 100644 index 000000000000..930e003fe450 --- /dev/null +++ b/tutorials/get_started/autotvm_matmul.py @@ -0,0 +1,376 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+""" +Optimizing Operators with Templates and AutoTVM +=============================================== +**Authors**: +`Lianmin Zheng `_, +`Chris Hoge `_ + +In this tutorial, we will now show how the TVM Template Extension (TE) language +can be used to write scheduling templates that can be searched by AutoTVM to +find optimal configurations of scheduling variables. This process is called +Auto-Tuning, and builds on TE to help automate the process of optimizing +operations. + +This tutorial builds on the previous `tutorial on how to write a matrix +multiplication using TE `. + +There are two steps in auto-tuning. + +- The first step is defining a search space. +- The second step is running a search algorithm to explore through this space. + +In this tutorial, you can learn how to perform these two steps in TVM. The whole +workflow is illustrated by a matrix multiplication example. + +.. note:: + Note that this tutorial will not run on Windows or recent versions of macOS. + To get it to run, you will need to wrap the body of this tutorial in a + :code:`if __name__ == "__main__":` block. +""" + +################################################################################ +# Install dependencies +# -------------------- +# To use autotvm package in TVM, we need to install some extra dependencies. +# +# .. code-block:: bash +# +# pip3 install --user psutil xgboost cloudpickle +# +# To make TVM run faster in tuning, it is recommended to use cython as FFI of +# TVM. In the root directory of TVM, execute: +# +# .. code-block:: bash +# +# pip3 install --user cython +# sudo make cython3 +# +# Now return to python code. Begin by importing the required packages. + +import logging +import sys + +import numpy as np +import tvm +from tvm import te +import tvm.testing + +# the module is called `autotvm` +from tvm import autotvm + +################################################################################ +# Basic Matrix Multiplication with TE +# ----------------------------------- +# Recall the basic implementation of matrix multiplication using TE. We write +# it down here with a few changes. We will wrap the multiplication in a python +# function definition. For simplicity, we will focus our attention on a split +# optimization, using a fixed value that defines the block size of the +# reordering. + + +def matmul_basic(N, L, M, dtype): + + a = te.placeholder((n, l), name="a", dtype=dtype) + B = te.placeholder((L, M), name="B", dtype=dtype) + + k = te.reduce_axis((0, L), name="k") + C = te.compute((N, M), lambda i, j: te.sum(A[i, k] * B[k, j], axis=k), name="C") + s = te.create_schedule(C.op) + + # schedule + y, x = s[C].op.axis + k = s[C].op.reduce_axis[0] + + yo, yi = s[C].split(y, 8) + xo, xi = s[C].split(x, 8) + + s[C].reorder(yo, xo, k, yi, xi) + + return s, [A, B, C] + + +################################################################################ +# Matrix Multiplication with AutoTVM +# ---------------------------------- +# In the previous schedule code, we use a constant "8" as the tiling factor. +# However, it might not be the best one because the best tiling factor depends +# on real hardware environment and input shape. +# +# If you want the schedule code to be portable across a wider range of input +# shapes and target hardware, it is better to define a set of candidate values +# and pick the best one according to the measurement results on target +# hardware. +# +# In autotvm, we can define a tunable parameter, or a "knob" for such kind of +# value. 
+
+################################################################################
+# A Basic Matrix Multiplication Template
+# --------------------------------------
+# We begin with an example of how to create a tunable parameter set for the
+# block size of the `split` scheduling operation.
+
+# Matmul V1: List candidate values
+@autotvm.template("tutorial/matmul_v1")  # 1. use a decorator
+def matmul_v1(N, L, M, dtype):
+    A = te.placeholder((N, L), name="A", dtype=dtype)
+    B = te.placeholder((L, M), name="B", dtype=dtype)
+
+    k = te.reduce_axis((0, L), name="k")
+    C = te.compute((N, M), lambda i, j: te.sum(A[i, k] * B[k, j], axis=k), name="C")
+    s = te.create_schedule(C.op)
+
+    # schedule
+    y, x = s[C].op.axis
+    k = s[C].op.reduce_axis[0]
+
+    # 2. get the config object
+    cfg = autotvm.get_config()
+
+    # 3. define search space
+    cfg.define_knob("tile_y", [1, 2, 4, 8, 16])
+    cfg.define_knob("tile_x", [1, 2, 4, 8, 16])
+
+    # 4. schedule according to config
+    yo, yi = s[C].split(y, cfg["tile_y"].val)
+    xo, xi = s[C].split(x, cfg["tile_x"].val)
+
+    s[C].reorder(yo, xo, k, yi, xi)
+
+    return s, [A, B, C]
+
+
+################################################################################
+# Here we make four modifications to the previous schedule code and get a
+# tunable "template". We can explain the modifications one by one.
+#
+# 1. Use a decorator to mark this function as a simple template.
+# 2. Get a config object: You can regard this :code:`cfg` as an argument of
+#    this function but we obtain it in a different way. With this argument, this
+#    function is no longer a deterministic schedule. Instead, we can pass
+#    different configurations to this function and get different schedules. A
+#    function that uses a configuration object like this is called a "template".
+#
+#    To make the template function more compact, we can do two things to define
+#    the parameter search space within a single function.
+#
+#    1. Define a search space across a set of values. This is done by making
+#       :code:`cfg` a :any:`ConfigSpace` object. It will collect all of the
+#       tunable knobs in this function and build a search space from it.
+#    2. Schedule according to an entity in this space. This is done by making
+#       :code:`cfg` a :any:`ConfigEntity` object. When it is a
+#       :any:`ConfigEntity`, it will ignore all space definition API (namely,
+#       :code:`cfg.define_XXXXX(...)`). Instead, it will store deterministic
+#       values for all tunable knobs, and we schedule according to these values.
+#
+#    During auto-tuning, we will first call this template with a
+#    :any:`ConfigSpace` object to build the search space. Then we call this
+#    template with different :any:`ConfigEntity` in the built space to get
+#    different schedules. Finally we will measure the code generated by
+#    different schedules and pick the best one (see the short sketch after
+#    this list).
+#
+# 3. Define two tunable knobs. The first one is :code:`tile_y` with 5 possible
+#    values. The second one is :code:`tile_x` with the same list of possible
+#    values. These two knobs are independent, so they span a search space with
+#    size 25 = 5x5.
+# 4. The configuration knobs are passed to the :code:`split` schedule
+#    operation, allowing us to schedule according to the 5x5 deterministic values
+#    we previously defined in :code:`cfg`.
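+
+################################################################################
+# To make the :any:`ConfigSpace` role concrete: creating a task from this
+# template builds the space, which can then be inspected. A minimal sketch
+# (the 512x512x512 shape is an arbitrary choice):
+#
+# .. code-block:: python
+#
+#     task = autotvm.task.create(
+#         "tutorial/matmul_v1", args=(512, 512, 512, "float32"), target="llvm"
+#     )
+#     print(task.config_space)  # 5 x 5 = 25 candidate configurations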
+
+################################################################################
+# A Matrix Multiplication Template with the Advanced Parameter API
+# ----------------------------------------------------------------
+# In the previous template, we manually listed all of the possible values for a
+# knob. This is the lowest level API to define the space, and gives an explicit
+# enumeration of the parameter space to search. However, we also provide
+# another set of APIs that can make the definition of the search space easier
+# and smarter. Where possible, we recommend you use this higher-level API.
+#
+# In the following example, we use :any:`ConfigSpace.define_split` to define a
+# split knob. It will enumerate all the possible ways to split an axis and
+# construct the space.
+#
+# We also have :any:`ConfigSpace.define_reorder` for the reorder knob and
+# :any:`ConfigSpace.define_annotate` for annotations like unroll, vectorization,
+# thread binding. When the high level API cannot meet your requirements, you
+# can always fall back to using the low level API.
+
+
+@autotvm.template("tutorial/matmul")
+def matmul(N, L, M, dtype):
+    A = te.placeholder((N, L), name="A", dtype=dtype)
+    B = te.placeholder((L, M), name="B", dtype=dtype)
+
+    k = te.reduce_axis((0, L), name="k")
+    C = te.compute((N, M), lambda i, j: te.sum(A[i, k] * B[k, j], axis=k), name="C")
+    s = te.create_schedule(C.op)
+
+    # schedule
+    y, x = s[C].op.axis
+    k = s[C].op.reduce_axis[0]
+
+    ##### define space begin #####
+    cfg = autotvm.get_config()
+    cfg.define_split("tile_y", y, num_outputs=2)
+    cfg.define_split("tile_x", x, num_outputs=2)
+    ##### define space end #####
+
+    # schedule according to config
+    yo, yi = cfg["tile_y"].apply(s, C, y)
+    xo, xi = cfg["tile_x"].apply(s, C, x)
+
+    s[C].reorder(yo, xo, k, yi, xi)
+
+    return s, [A, B, C]
+
+
+################################################################################
+# .. note:: More Explanation on :code:`cfg.define_split`
+#
+#   In this template, :code:`cfg.define_split("tile_y", y, num_outputs=2)` will
+#   enumerate all possible combinations that can split axis y into two axes with
+#   factors of the length of y. For example, if the length of y is 32 and we
+#   want to split it into two axes using factors of 32, then there are 6
+#   possible values for the (length of outer axis, length of inner axis) pair,
+#   namely (32, 1), (16, 2), (8, 4), (4, 8), (2, 16) or (1, 32). These are all 6
+#   possible values of `tile_y`.
+#
+#   During scheduling, :code:`cfg["tile_y"]` is a :code:`SplitEntity` object.
+#   We store the lengths of outer axes and inner axes in
+#   :code:`cfg['tile_y'].size` (a tuple with two elements). In this template,
+#   we apply it by using :code:`yo, yi = cfg['tile_y'].apply(s, C, y)`.
+#   Actually, this is equivalent to :code:`yo, yi = s[C].split(y,
+#   cfg["tile_y"].size[1])` or :code:`yo, yi = s[C].split(y,
+#   nparts=cfg["tile_y"].size[0])`.
+#
+#   The advantage of using the cfg.apply API is that it makes multi-level
+#   splits (that is, when num_outputs >= 3) easier.
+
+################################################################################
+# Step 2: Use AutoTVM to Optimize the Matrix Multiplication
+# ---------------------------------------------------------
+# In Step 1, we wrote a matrix multiplication template that allowed us to
+# parameterize the block size used in the `split` schedule. We can now conduct
+# a search over this parameter space. The next step is to pick a tuner to guide
+# the exploration of this space.
+#
+# Auto-tuners in TVM
+# ~~~~~~~~~~~~~~~~~~
+# The job for a tuner can be described by the following pseudo code
+#
+# .. code-block:: c
+#
+#    ct = 0
+#    while ct < max_number_of_trials:
+#        propose a batch of configs
+#        measure this batch of configs on real hardware and get results
+#        ct += batch_size
+#
+# When proposing the next batch of configs, the tuner can take different
+# strategies. Some of the tuner strategies provided by TVM include:
+#
+# * :any:`RandomTuner`: Enumerate the space in a random order
+# * :any:`GridSearchTuner`: Enumerate the space in a grid search order
+# * :any:`GATuner`: Uses a genetic algorithm to search through the space
+# * :any:`XGBTuner`: Uses a model based method. Trains an XGBoost model to
+#   predict the speed of lowered IR and picks the next batch according to the
+#   prediction.
+#
+# You can choose the tuner according to the size of your space, your time
+# budget and other factors. For example, if your space is very small (less
+# than 1000), a gridsearch tuner or a random tuner is good enough. If your
+# space is at the level of 10^9 (this is the space size of a conv2d operator on
+# CUDA GPU), :any:`XGBTuner` can explore more efficiently and find better
+# configs.
+
+################################################################################
+# Begin tuning
+# ~~~~~~~~~~~~
+# Here we continue our matrix multiplication example. First we create a tuning
+# task. We can also inspect the initialized search space. In this case, for a
+# 512x512 square matrix multiplication, the space size is 10x10=100. Note that
+# the task and search space are independent of the tuner picked.
+
+N, L, M = 512, 512, 512
+task = autotvm.task.create("tutorial/matmul", args=(N, L, M, "float32"), target="llvm")
+print(task.config_space)
+
+################################################################################
+# Then we need to define how to measure the generated code and pick a tuner.
+# Since our space is small, a random tuner is just okay.
+#
+# We only make 10 trials in this tutorial for demonstration. In practice, you
+# can do more trials according to your time budget. We will log the tuning
+# results into a log file. This file can be used to choose the best
+# configuration discovered by the tuner later.
+
+# logging config (for printing tuning log to the screen)
+logging.getLogger("autotvm").setLevel(logging.DEBUG)
+logging.getLogger("autotvm").addHandler(logging.StreamHandler(sys.stdout))
+
+################################################################################
+# There are two steps for measuring a config: build and run. By default, we use
+# all CPU cores to compile the program. We then measure the compiled programs
+# sequentially. To help reduce variance, we take 5 measurements and average
+# them.
+measure_option = autotvm.measure_option(builder="local", runner=autotvm.LocalRunner(number=5))

+# Begin tuning with RandomTuner, log records to file `matmul.log`
+# You can use alternatives like XGBTuner (see the sketch below).
+tuner = autotvm.tuner.RandomTuner(task)
+tuner.tune(
+    n_trial=10,
+    measure_option=measure_option,
+    callbacks=[autotvm.callback.log_to_file("matmul.log")],
+)
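+
+################################################################################
+# As the comment above notes, other tuners are drop-in replacements; only the
+# construction line changes. A minimal sketch of the same loop driven by
+# :any:`XGBTuner` (the trial count and log file name are kept from the run
+# above):
+#
+# .. code-block:: python
+#
+#     tuner = autotvm.tuner.XGBTuner(task)
+#     tuner.tune(
+#         n_trial=10,
+#         measure_option=measure_option,
+#         callbacks=[autotvm.callback.log_to_file("matmul.log")],
+#     )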
+
+################################################################################
+# With tuning completed, we can choose the configuration from the log file that
+# has the best measured performance and compile the schedule with the
+# corresponding parameters. We also do a quick verification that the schedule
+# is producing correct answers. We can call the function :code:`matmul` directly
+# under the :any:`autotvm.apply_history_best` context. When we call this
+# function, it will query the dispatch context with its argument and get the
+# best config with the same argument.
+
+# apply history best from log file
+with autotvm.apply_history_best("matmul.log"):
+    with tvm.target.Target("llvm"):
+        s, arg_bufs = matmul(N, L, M, "float32")
+        func = tvm.build(s, arg_bufs)
+
+# check correctness
+a_np = np.random.uniform(size=(N, L)).astype(np.float32)
+b_np = np.random.uniform(size=(L, M)).astype(np.float32)
+c_np = a_np.dot(b_np)
+
+c_tvm = tvm.nd.empty(c_np.shape)
+func(tvm.nd.array(a_np), tvm.nd.array(b_np), c_tvm)
+
+tvm.testing.assert_allclose(c_np, c_tvm.asnumpy(), rtol=1e-4)
+
+################################################################################
+# Final Notes and Summary
+# -----------------------
+# In this tutorial, we have shown how to build operator templates that allow
+# TVM to search a parameter space and choose optimized schedule configurations.
+# To gain a deeper understanding of how this works, we recommend expanding on
+# this example by adding new search parameters to the schedule based on
+# schedule operations demonstrated in the `Getting Started With Tensor
+# Expressions _` tutorial. In the upcoming sections, we
+# will demonstrate the AutoScheduler, a method for TVM to optimize common
+# operators without the need for the user to provide a user-defined template.
diff --git a/tutorials/get_started/cross_compilation_and_rpc.py b/tutorials/get_started/cross_compilation_and_rpc.py
index cc5b9cc5bc2c..3c23c4956262 100644
--- a/tutorials/get_started/cross_compilation_and_rpc.py
+++ b/tutorials/get_started/cross_compilation_and_rpc.py
@@ -180,9 +180,9 @@
 func = remote.load_module("lib.tar")
 
 # create arrays on the remote device
-ctx = remote.cpu()
-a = tvm.nd.array(np.random.uniform(size=1024).astype(A.dtype), ctx)
-b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx)
+dev = remote.cpu()
+a = tvm.nd.array(np.random.uniform(size=1024).astype(A.dtype), dev)
+b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), dev)
 # the function will run on the remote device
 func(a, b)
 np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1)
@@ -194,7 +194,7 @@
 # function over number times, measures the cost per run on the remote
 # device and returns the measured cost. Network overhead is excluded.
 
-time_f = func.time_evaluator(func.entry_name, ctx, number=10)
+time_f = func.time_evaluator(func.entry_name, dev, number=10)
 cost = time_f(a, b).mean
 print("%g secs/op" % cost)
 
@@ -245,9 +245,9 @@ def run_opencl():
     func = remote.load_module("lib_cl.tar")
 
     # run
-    ctx = remote.cl()
-    a = tvm.nd.array(np.random.uniform(size=1024).astype(A.dtype), ctx)
-    b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx)
+    dev = remote.cl()
+    a = tvm.nd.array(np.random.uniform(size=1024).astype(A.dtype), dev)
+    b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), dev)
     func(a, b)
     np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1)
     print("OpenCL test passed!")
diff --git a/tutorials/get_started/install.py b/tutorials/get_started/install.py
new file mode 100644
index 000000000000..a64d3cfd3b8c
--- /dev/null
+++ b/tutorials/get_started/install.py
@@ -0,0 +1,49 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""
+Installing TVM
+==============
+**Authors**:
+`Jocelyn Shiue `_,
+`Chris Hoge `_
+
+Depending on your needs and your working environment, there are a few different
+methods for installing TVM. These include:
+
+* Installing from source
+* Installing from third-party binary packages
+"""
+
+################################################################################
+# Installing from Source
+# ----------------------
+# Installing from source is the recommended method for installing TVM. It will
+# allow you to enable specific features such as GPU support, microcontroller
+# support (uTVM), and a debugging runtime, among other features. You will also
+# want to install from source if you want to actively contribute to the TVM
+# project. The full instructions are on the `Install TVM From Source
+# `_ page.
+
+################################################################################
+# Installing From Binary Packages
+# -------------------------------
+# You may install convenient third-party binary package distributions to
+# quickly try things out. TLCPack is a third-party volunteer community that
+# builds binary packages from TVM source. It offers a support matrix with
+# instructions for installing on different platforms, with different features.
+# Check out `TLCPack `_ to learn more. Note that these
+# third-party binary packages may contain additional licensing terms for the
+# hardware drivers that are bundled with them.
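+
+################################################################################
+# Whichever method you choose, a quick way to confirm that the installation
+# works is to import the package and print its version from the command line
+# (a minimal check; it assumes ``tvm`` is importable from your Python
+# environment):
+#
+# .. code-block:: bash
+#
+#     python -c "import tvm; print(tvm.__version__)"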
diff --git a/tutorials/get_started/introduction.py b/tutorials/get_started/introduction.py
new file mode 100644
index 000000000000..0ee79d334c03
--- /dev/null
+++ b/tutorials/get_started/introduction.py
@@ -0,0 +1,132 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""
+Introduction
+============
+**Authors**:
+`Jocelyn Shiue `_,
+`Chris Hoge `_
+
+Apache TVM is an open source machine learning compiler framework for CPUs,
+GPUs, and machine learning accelerators. It aims to enable machine learning
+engineers to optimize and run computations efficiently on any hardware backend.
+The purpose of this tutorial is to take a guided tour through all of the major
+features of TVM by defining and demonstrating key concepts. A new user should
+be able to work through the tutorial from start to finish and be able to
+operate TVM for automatic model optimization, while having a basic
+understanding of the TVM architecture and how it works.
+
+Contents
+--------
+
+#. :doc:`Introduction `
+#. :doc:`Installing TVM `
+#. :doc:`Compiling and Optimizing a Model with TVMC `
+#. :doc:`Compiling and Optimizing a Model with the Python AutoScheduler `
+#. :doc:`Working with Operators Using Tensor Expressions `
+#. :doc:`Optimizing Operators with Templates and AutoTVM `
+#. :doc:`Optimizing Operators with AutoScheduling `
+#. :doc:`Cross Compilation and Remote Procedure Calls (RPC) `
+#. :doc:`Compiling Deep Learning Models for GPUs `
+"""
+
+################################################################################
+# An Overview of TVM and Model Optimization
+# =========================================
+#
+# The diagram below illustrates the steps a machine learning model takes as it
+# is transformed with the TVM optimizing compiler framework.
+#
+# .. image:: https://raw.githubusercontent.com/hogepodge/web-data/c339ebbbae41f3762873147c1e920a53a08963dd/images/getting_started/overview.png
+#   :width: 100%
+#   :alt: A High Level View of TVM
+#
+# 1. Import the model from a framework like *Tensorflow*, *Pytorch*, or *Onnx*.
+#    The importer layer is where TVM can ingest models from other frameworks, like
+#    ONNX, Tensorflow, or PyTorch. The level of support that TVM offers for each
+#    frontend varies as we are constantly improving the open source project. If
+#    you're having issues importing your model into TVM, you may want to try
+#    converting it to ONNX.
+#
+# 2. Translate to *Relay*, TVM's high level model language.
+#    A model that has been imported into TVM is represented in Relay. Relay is a
+#    functional language and intermediate representation (IR) for neural networks.
+#    It has support for:
+#
+#    - Traditional data flow-style representations
+#    - Functional-style scoping and let-binding, which make it a fully featured
+#      differentiable language
+#    - The ability to allow the user to mix the two programming styles
+#
+#    Relay applies several high-level optimizations to the model, after which
+#    it runs the Relay Fusion Pass. To aid in the process of converting to
+#    Relay, TVM includes a Tensor Operator Inventory (TOPI) that has pre-defined
+#    templates of common computations.
+#
+# 3. Lower to *Tensor Expression* (TE) representation. Lowering is when a
+#    higher-level representation is transformed into a lower-level
+#    representation. In the Relay Fusion Pass, the model is lowered from the
+#    higher-level Relay representation into a smaller set of subgraphs, where
+#    each node is a task. A task is a collection of computation templates,
+#    expressed in TE, where the parameters of the template control how the
+#    computation is carried out on hardware. The specific ordering of computation,
+#    defined by parameters to the TE template, is called a schedule.
+#
+# 4. Search for an optimized schedule using *AutoTVM* or *AutoScheduler* for
+#    each task through tuning. Tuning is the process of searching the TE
+#    parameter space for a schedule that is optimized for the target hardware.
+#    There are a couple of optimization options available, each requiring
+#    varying levels of user interaction. The optimization options include:
+#
+#    - **AutoTVM**: The user specifies a search template for the schedule of a
+#      TE task, or TE subgraph.
+# AutoTVM directs the search of the parameter space defined by the
+# template to produce an optimized configuration. AutoTVM requires users to
+# manually define templates for each operator as part of the TOPI.
+# - **Ansor/AutoScheduler**: Using a TVM Operator Inventory (TOPI) of operations,
+# Ansor can automatically search an optimization space with much less
+# intervention and guidance from the end user. Ansor generates the search
+# space from the TE computation definition itself, rather than from manual
+# schedule templates.
+#
+# 5. Choose the optimal configuration for the model. After tuning, an optimal
+# schedule for each task is chosen. Regardless of whether AutoTVM or
+# AutoScheduler is used, schedule records are produced in JSON format and are
+# referred to by this step to build an optimized model.
+#
+# 6. Lower to a hardware-specific compiler. After selecting an optimized
+# configuration based on the tuning step, the model is then lowered to a
+# representation expected by the target compiler for the hardware platform.
+# This is the final code generation phase, with the intention of producing an
+# optimized model that can be deployed into production. TVM supports a number
+# of different compiler backends including:
+#
+# - LLVM, which can target arbitrary microprocessor architectures including
+# standard x86 and ARM processors, AMDGPU and NVPTX code generation, and any
+# other platform supported by LLVM.
+# - Specialized compilers, such as NVCC, NVIDIA's compiler.
+# - Embedded and specialized targets, which are implemented through TVM's
+# Bring Your Own Codegen (BYOC) framework.
+#
+# 7. Compile down to machine code. At the end of this process, the
+# compiler-specific generated code can be lowered to machine code.
+#
+# TVM can compile models down to a linkable object module, which can then be
+# run with a lightweight TVM runtime that provides C APIs to dynamically
+# load the model, and entry points for other languages such as Python and
+# Rust. TVM can also build a bundled deployment in which the runtime is
+# combined with the model in a single package.
+#
+# The remainder of the tutorial will cover these aspects of TVM in more detail.
diff --git a/tutorials/get_started/relay_quick_start.py b/tutorials/get_started/relay_quick_start.py
index 444b915ca7c8..fa9207604bac 100644
--- a/tutorials/get_started/relay_quick_start.py
+++ b/tutorials/get_started/relay_quick_start.py
@@ -43,7 +43,7 @@
 from tvm.relay import testing
 import tvm
 from tvm import te
-from tvm.contrib import graph_runtime
+from tvm.contrib import graph_executor
 import tvm.testing
######################################################################
@@ -104,13 +104,13 @@
#####################################################################
# Run the generate library
# ------------------------
-# Now we can create graph runtime and run the module on Nvidia GPU.
+# Now we can create the graph executor and run the module on an NVIDIA GPU.
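+# The ``GraphModule`` wrapper created below loads the compiled functions onto
+# the chosen device and exposes ``set_input``, ``run``, and ``get_output``
+# calls for inference.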
# create random input -ctx = tvm.gpu() +dev = tvm.gpu() data = np.random.uniform(-1, 1, size=data_shape).astype("float32") # create module -module = graph_runtime.GraphModule(lib["default"](ctx)) +module = graph_executor.GraphModule(lib["default"](dev)) # set input and parameters module.set_input("data", data) # run @@ -143,7 +143,7 @@ loaded_lib = tvm.runtime.load_module(path_lib) input_data = tvm.nd.array(np.random.uniform(size=data_shape).astype("float32")) -module = graph_runtime.GraphModule(loaded_lib["default"](ctx)) +module = graph_executor.GraphModule(loaded_lib["default"](dev)) module.run(data=input_data) out_deploy = module.get_output(0).asnumpy() diff --git a/tutorials/get_started/tensor_expr_get_started.py b/tutorials/get_started/tensor_expr_get_started.py index 83c328560b4d..c63a068360f2 100644 --- a/tutorials/get_started/tensor_expr_get_started.py +++ b/tutorials/get_started/tensor_expr_get_started.py @@ -148,17 +148,17 @@ # We provide a minimal array API in python to aid quick testing and prototyping. # The array API is based on the `DLPack `_ standard. # -# - We first create a GPU context. +# - We first create a GPU device. # - Then tvm.nd.array copies the data to the GPU. # - fadd runs the actual computation. # - asnumpy() copies the GPU array back to the CPU and we can use this to verify correctness # -ctx = tvm.context(tgt.kind.name, 0) +dev = tvm.device(tgt.kind.name, 0) n = 1024 -a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx) -b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx) -c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx) +a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev) +b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), dev) +c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev) fadd(a, b, c) tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy()) @@ -293,11 +293,11 @@ fadd_cl = tvm.build(s, [A, B, C], tgt, name="myadd") print("------opencl code------") print(fadd_cl.imported_modules[0].get_source()) - ctx = tvm.cl(0) + dev = tvm.cl(0) n = 1024 - a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx) - b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx) - c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx) + a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev) + b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), dev) + c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev) fadd_cl(a, b, c) tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy()) diff --git a/tutorials/auto_scheduler/tune_matmul_x86.py b/tutorials/get_started/tune_matmul_x86.py similarity index 52% rename from tutorials/auto_scheduler/tune_matmul_x86.py rename to tutorials/get_started/tune_matmul_x86.py index 084f5ae67518..931f877595f5 100644 --- a/tutorials/auto_scheduler/tune_matmul_x86.py +++ b/tutorials/get_started/tune_matmul_x86.py @@ -15,24 +15,27 @@ # specific language governing permissions and limitations # under the License. """ -Auto-scheduling Matrix Multiplication for CPU -============================================= +Optimizing Operators with Auto-scheduling +========================================= **Author**: `Lianmin Zheng `_, \ `Chengfan Jia `_ -This is a tutorial on how to use the auto-scheduler for CPUs. +In this tutorial, we will show how TVM's Auto Scheduling feature can find +optimal schedules without the need for writing a custom template. 
-Different from the template-based :ref:`autotvm ` which relies on
-manual templates to define the search space, the auto-scheduler does not require any templates.
-Users only need to write the computation declaration without any schedule commands or templates.
-The auto-scheduler can automatically generate a large search space and
-find a good schedule in the space.
+Different from the template-based :ref:`autotvm ` which relies on
+manual templates to define the search space, the auto-scheduler does not
+require any templates. Users only need to write the computation declaration
+without any schedule commands or templates. The auto-scheduler can
+automatically generate a large search space and find a good schedule in the
+space.
 
 We use matrix multiplication as an example in this tutorial.
 
-Note that this tutorial will not run on Windows or recent versions of macOS. To
-get it to run, you will need to wrap the body of this tutorial in a :code:`if
-__name__ == "__main__":` block.
+.. note::
+  Note that this tutorial will not run on Windows or recent versions of macOS. To
+  get it to run, you will need to wrap the body of this tutorial in a :code:`if
+  __name__ == "__main__":` block.
 """
 
 import os
@@ -41,15 +44,18 @@
 import tvm
 from tvm import te, auto_scheduler
 
-######################################################################
-# Define the computation
-# ^^^^^^^^^^^^^^^^^^^^^^
-# To begin with, let us define the computation of a matmul with bias add.
-# The function should return the list of input/output tensors.
-# From these tensors, the auto-scheduler can get the whole computational graph.
+################################################################################
+# Defining the Matrix Multiplication
+# ----------------------------------
+# To start, we define a matrix multiplication with a bias addition. Note that
+# this uses standard operations available in TVM's Tensor Expression language.
+# The major difference is the use of the `auto_scheduler` decorator at the top
+# of the function definition. The function should return a list of
+# input/output tensors. From these tensors, the auto-scheduler can get the
+# whole computational graph.
 
-@auto_scheduler.register_workload
+@auto_scheduler.register_workload  # Note the auto_scheduler decorator
 def matmul_add(N, L, M, dtype):
     A = te.placeholder((N, L), name="A", dtype=dtype)
     B = te.placeholder((L, M), name="B", dtype=dtype)
@@ -67,12 +73,17 @@ def matmul_add(N, L, M, dtype):
     return [A, B, C, out]
 
-######################################################################
+################################################################################
 # Create the search task
-# ^^^^^^^^^^^^^^^^^^^^^^
-# We then create a search task with N=L=M=1024 and dtype="float32"
-# If your machine supports avx instructions, you can
+# ----------------------
+# With the function defined, we can now create the task for the auto_scheduler
+# to search against. We specify the particular parameters for this matrix
+# multiplication, in this case a multiplication of two square matrices of size
+# 1024x1024. We then create a search task with N=L=M=1024 and dtype="float32".
 #
+# .. note:: Improve performance with custom targets
+#   In order for TVM to take full advantage of specific hardware platforms,
+#   you will want to manually specify your CPU capabilities.
For example: # - replace "llvm" below with "llvm -mcpu=core-avx2" to enable AVX2 # - replace "llvm" below with "llvm -mcpu=skylake-avx512" to enable AVX-512 @@ -84,15 +95,18 @@ def matmul_add(N, L, M, dtype): print("Computational DAG:") print(task.compute_dag) -###################################################################### +################################################################################ +# Set Parameters for Auto-Scheduler +# --------------------------------- # Next, we set parameters for the auto-scheduler. # -# * :code:`num_measure_trials` is the number of measurement trials we can use during the search. -# We only make 10 trials in this tutorial for a fast demonstration. In practice, 1000 is a -# good value for the search to converge. You can do more trials according to your time budget. -# * In addition, we use :code:`RecordToFile` to dump measurement records into a file `matmul.json`. -# The measurement records can be used to query the history best, resume the search, -# and do more analyses later. +# * :code:`num_measure_trials` is the number of measurement trials we can use +# during the search. We only make 10 trials in this tutorial for a fast +# demonstration. In practice, 1000 is a good value for the search to converge. +# You can do more trials according to your time budget. +# * In addition, we use :code:`RecordToFile` to log measurement records into a +# file `matmul.json`. The measurement records can be used to query the history +# best, resume the search, and do more analyses later. # * see :any:`auto_scheduler.TuningOptions` for more parameters log_file = "matmul.json" @@ -102,30 +116,32 @@ def matmul_add(N, L, M, dtype): verbose=2, ) -###################################################################### +################################################################################ # Run the search -# ^^^^^^^^^^^^^^ -# Now we get all inputs ready. Pretty simple, isn't it? -# We can kick off the search and let the auto-scheduler do its magic. -# After some measurement trials, we can load the best schedule from the log -# file and apply it. +# -------------- +# Now we get all inputs ready. Pretty simple, isn't it? We can kick off the +# search and let the auto-scheduler do its magic. After some measurement +# trials, we can load the best schedule from the log file and apply it. # Run auto-tuning (search) task.tune(tune_option) # Apply the best schedule sch, args = task.apply_best(log_file) -###################################################################### -# We can lower the schedule to see the IR after auto-scheduling. -# The auto-scheduler correctly performs optimizations including multi-level tiling, -# layout transformation, parallelization, vectorization, unrolling, and operator fusion. +################################################################################ +# Inspecting the Optimized Schedule +# --------------------------------- +# We can lower the schedule to see the IR after auto-scheduling. The +# auto-scheduler correctly performs optimizations including multi-level tiling, +# layout transformation, parallelization, vectorization, unrolling, and +# operator fusion. 
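+# In the code below, ``tvm.lower`` returns the scheduled computation as TIR;
+# ``simple_mode=True`` requests a compact statement form that is easier to read.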
print("Lowered TIR:") print(tvm.lower(sch, args, simple_mode=True)) -###################################################################### +################################################################################ # Check correctness and evaluate performance -# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +# ------------------------------------------ # We build the binary and check its correctness and performance. func = tvm.build(sch, args, target) @@ -134,44 +150,43 @@ def matmul_add(N, L, M, dtype): c_np = np.random.uniform(size=(N, M)).astype(np.float32) out_np = a_np.dot(b_np) + c_np -ctx = tvm.cpu() -a_tvm = tvm.nd.array(a_np, ctx=ctx) -b_tvm = tvm.nd.array(b_np, ctx=ctx) -c_tvm = tvm.nd.array(c_np, ctx=ctx) -out_tvm = tvm.nd.empty(out_np.shape, ctx=ctx) +dev = tvm.cpu() +a_tvm = tvm.nd.array(a_np, device=dev) +b_tvm = tvm.nd.array(b_np, device=dev) +c_tvm = tvm.nd.array(c_np, device=dev) +out_tvm = tvm.nd.empty(out_np.shape, device=dev) func(a_tvm, b_tvm, c_tvm, out_tvm) # Check results np.testing.assert_allclose(out_np, out_tvm.asnumpy(), rtol=1e-3) # Evaluate execution time. -evaluator = func.time_evaluator(func.entry_name, ctx, min_repeat_ms=500) +evaluator = func.time_evaluator(func.entry_name, dev, min_repeat_ms=500) print( "Execution time of this operator: %.3f ms" % (np.median(evaluator(a_tvm, b_tvm, c_tvm, out_tvm).results) * 1000) ) -###################################################################### +################################################################################ # Using the record file -# ^^^^^^^^^^^^^^^^^^^^^ -# During the search, all measurement records are dumped into the record -# file "matmul.json". The measurement records can be used to re-apply search results, -# resume the search, and perform other analyses. - -###################################################################### -# Here is an example where we load the best schedule from a file, -# and print the equivalent python schedule API. This can be used for -# debugging and learning the behavior of the auto-scheduler. +# --------------------- +# During the search, all measurement records are logged into the record file +# "matmul.json". The measurement records can be used to re-apply search +# results, resume the search, and perform other analyses. +# +# Here is an example where we load the best schedule from a file, and print the +# equivalent python schedule API. This can be used for debugging and learning +# the behavior of the auto-scheduler. print("Equivalent python schedule:") print(task.print_best(log_file)) -###################################################################### -# A more complicated example is to resume the search. -# In this case, we need to create the search policy and cost model by ourselves -# and resume the status of search policy and cost model with the log file. -# In the example below we resume the status and do more 5 trials. +################################################################################ +# A more complicated example is to resume the search. In this case, we need to +# create the search policy and cost model by ourselves and resume the status of +# search policy and cost model with the log file. In the example below we +# resume the status and do more 5 trials. 
def resume_search(task, log_file):
@@ -188,3 +203,12 @@ def resume_search(task, log_file):
 
 resume_search(task, log_file)
+
+################################################################################
+# Final Notes and Summary
+# -----------------------
+# In this tutorial, we have shown how to use the TVM Auto-Scheduler to
+# automatically optimize a matrix multiplication, without the need to specify a
+# search template. It concludes a series of examples, beginning with the Tensor
+# Expression (TE) language, that demonstrate how TVM can optimize computational
+# operations.
diff --git a/tutorials/get_started/tvmc_command_line_driver.py b/tutorials/get_started/tvmc_command_line_driver.py
index bcdf03e56875..fffbfbf0356f 100644
--- a/tutorials/get_started/tvmc_command_line_driver.py
+++ b/tutorials/get_started/tvmc_command_line_driver.py
@@ -15,31 +15,33 @@
 # specific language governing permissions and limitations
 # under the License.
 """
-Getting Started with TVM command line driver - TVMC
-===================================================
+Compiling and Optimizing a Model with TVMC
+==========================================
 **Authors**:
 `Leandro Nunes `_,
-`Matthew Barrett `_
-
-This tutorial is an introduction to working with TVMC, the TVM command
-line driver. TVMC is a tool that exposes TVM features such as
-auto-tuning, compiling, profiling and execution of models, via a
-command line interface.
-
-In this tutorial we are going to use TVMC to compile, run and tune a
-ResNet-50 on a x86 CPU.
-
-We are going to start by downloading ResNet 50 V2. Then, we are going
-to use TVMC to compile this model into a TVM module, and use the
-compiled module to generate predictions. Finally, we are going to experiment
-with the auto-tuning options, that can be used to help the compiler to
-improve network performance.
-
-The final goal is to give an overview of TVMC's capabilities and also
-some guidance on where to look for more information.
+`Matthew Barrett `_,
+`Chris Hoge `_
+
+In this section, we will work with TVMC, the TVM command line driver. TVMC is a
+tool that exposes TVM features such as auto-tuning, compiling, profiling and
+execution of models through a command line interface.
+
+Upon completion of this section, we will have used TVMC to accomplish the
+following tasks:
+
+* Compile a pre-trained ResNet 50 v2 model for the TVM runtime.
+* Run a real image through the compiled model, and interpret the output and
+  model performance.
+* Tune the model on a CPU using TVM.
+* Re-compile an optimized model using the tuning data collected by TVM.
+* Run the image through the optimized model, and compare the output and model
+  performance.
+
+The goal of this section is to give you an overview of TVM and TVMC's
+capabilities, and set the stage for understanding how TVM works.
 """
 
-######################################################################
+################################################################################
 # Using TVMC
 # ----------
 #
@@ -61,32 +63,35 @@
 #
 #   tvmc --help
 #
-#
-# As you can see in the help page, the main features are
-# accessible via the subcommands ``tune``, ``compile`` and ``run``.
-# To read about specific options under a given subcommand, use
-# ``tvmc --help``.
-#
-# In the following sections we will use TVMC to tune, compile and
-# run a model. But first, we need a model.
+# The main features of TVM available through ``tvmc`` are the ``compile``,
+# ``run``, and ``tune`` subcommands.
+# To read about specific options under a given subcommand, use
+# ``tvmc --help``. We will cover each of these commands in this tutorial, but
+# first we need to download a pre-trained model to work with.
#
 
-######################################################################
-# Obtaining the model
+################################################################################
+# Obtaining the Model
 # -------------------
 #
-# We are going to use ResNet-50 V2 as an example to experiment with TVMC.
-# The version below is in ONNX format. To download the file, you can use
-# the command below:
+# For this tutorial, we will be working with ResNet-50 v2. ResNet-50 is a
+# convolutional neural network that is 50 layers deep and designed to classify
+# images. The model we will be using has been pre-trained on more than a
+# million images with 1000 different classifications. The network has an input
+# image size of 224x224. If you are interested in exploring more of how the
+# ResNet-50 model is structured, we recommend downloading `Netron
+# `, a freely available ML model viewer.
+#
+# For this tutorial we will be using the model in ONNX format.
 #
 # .. code-block:: bash
 #
 #   wget https://github.com/onnx/models/raw/master/vision/classification/resnet/model/resnet50-v2-7.onnx
 #
-#
-######################################################################
+
+################################################################################
 # .. note:: Supported model formats
 #
 #   TVMC supports models created with Keras, ONNX, TensorFlow, TFLite
 #
@@ -96,241 +101,398 @@
 #
 
-######################################################################
-# Compiling the model
-# -------------------
+################################################################################
+# Compiling an ONNX Model to the TVM Runtime
+# ------------------------------------------
 #
-# The next step once we've downloaded ResNet-50, is to compile it,
-# To accomplish that, we are going to use ``tvmc compile``. The
-# output we get from the compilation process is a TAR package,
-# that can be used to run our model on the target device.
+# Once we've downloaded the ResNet-50 model, the next step is to compile it. To
+# accomplish that, we are going to use ``tvmc compile``. The output we get from
+# the compilation process is a TAR package of the model compiled to a dynamic
+# library for our target platform. We can run that model on our target device
+# using the TVM runtime.
 #
 # .. code-block:: bash
 #
 #   tvmc compile \
-#   --target "llvm" \
-#   --output compiled_module.tar \
-#   resnet50-v2-7.onnx
+#   --target "llvm" \
+#   --output resnet50-v2-7-tvm.tar \
+#   resnet50-v2-7.onnx
 #
-# Once compilation finishes, the output ``compiled_module.tar`` will be created. This
-# can be directly loaded by your application and run via the TVM runtime APIs.
+# Let's take a look at the files that ``tvmc compile`` creates in the module:
 #
+# .. code-block:: bash
+#
+#   mkdir model
+#   tar -xvf resnet50-v2-7-tvm.tar -C model
+#   ls model
+#
+# You will see three files listed.
+#
+# * ``mod.so`` is the model, represented as a C++ library, that can be loaded
+#   by the TVM runtime.
+# * ``mod.json`` is a text representation of the TVM Relay computation graph.
+# * ``mod.params`` is a file containing the parameters for the pre-trained
+#   model.
+#
+# This module can be directly loaded by your application, and the model can be
+# run via the TVM runtime APIs.
 
-######################################################################
-# .. note:: Defining the Correct Target
note:: Defining the correct target +################################################################################ +# .. note:: Defining the Correct Target # # Specifying the correct target (option ``--target``) can have a huge # impact on the performance of the compiled module, as it can take # advantage of hardware features available on the target. For more # information, please refer to `Auto-tuning a convolutional network # for x86 CPU `_. +# We recommend identifying which CPU you are running, along with optional features, +# and set the target appropriately. # - -###################################################################### -# -# In the next step, we are going to use the compiled module, providing it -# with some inputs, to generate some predictions. -# - - -###################################################################### -# Input pre-processing -# -------------------- +################################################################################ +# Running the Model from The Compiled Module with TVMC +# ---------------------------------------------------- # -# In order to generate predictions, we will need two things: +# Now that we've compiled the model to this module, we can use the TVM runtime +# to make predictions with it. TVMC has the TVM runtime built in to it, +# allowing you to run compiled TVM models. To use TVMC to run the model and +# make predictions, we need two things: # -# - the compiled module, which we just produced; -# - a valid input to the model +# - The compiled module, which we just produced. +# - Valid input to the model to make predictions on. # -# Each model is particular when it comes to expected tensor shapes, formats and data -# types. For this reason, most models require some pre and -# post processing, to ensure the input(s) is valid and to interpret the output(s). +# Each model is particular when it comes to expected tensor shapes, formats and +# data types. For this reason, most models require some pre and +# post-processing, to ensure the input is valid and to interpret the output. +# TVMC has adopted NumPy's ``.npz`` format for both input and output data. This +# is a well-supported NumPy format to serialize multiple arrays into a file # -# In TVMC, we adopted NumPy's ``.npz`` format for both input and output data. -# This is a well-supported NumPy format to serialize multiple arrays into a file. -# -# We will use the usual cat image, similar to other TVM tutorials: +# As input for this tutorial, we will use the image of a cat, but you can feel +# free to substitute image for any of your choosing. # # .. image:: https://s3.amazonaws.com/model-server/inputs/kitten.jpg # :height: 224px # :width: 224px # :align: center + + +################################################################################ +# Input pre-processing +# ~~~~~~~~~~~~~~~~~~~~ # # For our ResNet 50 V2 model, the input is expected to be in ImageNet format. # Here is an example of a script to pre-process an image for ResNet 50 V2. 
# -from tvm.contrib.download import download_testdata -from PIL import Image -import numpy as np - -img_url = "https://s3.amazonaws.com/model-server/inputs/kitten.jpg" -img_path = download_testdata(img_url, "imagenet_cat.png", module="data") - -# Resize it to 224x224 -resized_image = Image.open(img_path).resize((224, 224)) -img_data = np.asarray(resized_image).astype("float32") - -# ONNX expects NCHW input, so convert the array -img_data = np.transpose(img_data, (2, 0, 1)) - -# Normalize according to ImageNet -imagenet_mean = np.array([0.485, 0.456, 0.406]) -imagenet_stddev = np.array([0.229, 0.224, 0.225]) -norm_img_data = np.zeros(img_data.shape).astype("float32") -for i in range(img_data.shape[0]): - norm_img_data[i, :, :] = (img_data[i, :, :] / 255 - imagenet_mean[i]) / imagenet_stddev[i] - -# Add batch dimension -img_data = np.expand_dims(norm_img_data, axis=0) - -# Save to .npz (outputs imagenet_cat.npz) -np.savez("imagenet_cat", data=img_data) - +# .. code-block:: python +# :caption: preprocess.py +# :name: preprocess.py +# +# #!python ./preprocess.py +# from tvm.contrib.download import download_testdata +# from PIL import Image +# import numpy as np +# +# img_url = "https://s3.amazonaws.com/model-server/inputs/kitten.jpg" +# img_path = download_testdata(img_url, "imagenet_cat.png", module="data") +# +# # Resize it to 224x224 +# resized_image = Image.open(img_path).resize((224, 224)) +# img_data = np.asarray(resized_image).astype("float32") +# +# # ONNX expects NCHW input, so convert the array +# img_data = np.transpose(img_data, (2, 0, 1)) +# +# # Normalize according to ImageNet +# imagenet_mean = np.array([0.485, 0.456, 0.406]) +# imagenet_stddev = np.array([0.229, 0.224, 0.225]) +# norm_img_data = np.zeros(img_data.shape).astype("float32") +# for i in range(img_data.shape[0]): +# norm_img_data[i, :, :] = (img_data[i, :, :] / 255 - imagenet_mean[i]) / imagenet_stddev[i] +# +# # Add batch dimension +# img_data = np.expand_dims(norm_img_data, axis=0) +# +# # Save to .npz (outputs imagenet_cat.npz) +# np.savez("imagenet_cat", data=img_data) +# -###################################################################### -# Running the compiled module -# --------------------------- +################################################################################ +# Running the Compiled Module +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~ # -# With both the compiled module and input file in hand, we can run it by -# invoking ``tvmc run``. +# With both the model and input data in hand, we can now run TVMC to make a +# prediction: # # .. code-block:: bash # -# tvmc run \ -# --inputs imagenet_cat.npz \ -# --output predictions.npz \ -# compiled_module.tar +# tvmc run \ +# --inputs imagenet_cat.npz \ +# --output predictions.npz \ +# resnet50-v2-7-tvm.tar # -# When running the above command, a new file ``predictions.npz`` should -# be produced. It contains the output tensors. +# Recall that the `.tar` model file includes a C++ library, a description of +# the Relay model, and the parameters for the model. TVMC includes the TVM +# runtime, which can load the model and make predictions against input. When +# running the above command, TVMC outputs a new file, ``predictions.npz``, that +# contains the model output tensors in NumPy format. # # In this example, we are running the model on the same machine that we used -# for compilation. In some cases we might want to run it remotely via -# an RPC Tracker. To read more about these options please check ``tvmc -# run --help``. -# +# for compilation. 
In some cases we might want to run it remotely via an RPC +# Tracker. To read more about these options please check ``tvmc run --help``. -###################################################################### -# Output post-processing -# ---------------------- +################################################################################ +# Output Post-Processing +# ~~~~~~~~~~~~~~~~~~~~~~ # -# As previously mentioned, each model will have its own particular way -# of providing output tensors. +# As previously mentioned, each model will have its own particular way of +# providing output tensors. # -# In our case, we need to run some post-processing to render the -# outputs from ResNet 50 V2 into a more human-readable form. +# In our case, we need to run some post-processing to render the outputs from +# ResNet 50 V2 into a more human-readable form, using the lookup-table provided +# for the model. # -# The script below shows an example of the post-processing to extract -# labels from the output of our compiled module. +# The script below shows an example of the post-processing to extract labels +# from the output of our compiled module. # -import os.path -import numpy as np - -from scipy.special import softmax - -from tvm.contrib.download import download_testdata - -# Download a list of labels -labels_url = "https://s3.amazonaws.com/onnx-model-zoo/synset.txt" -labels_path = download_testdata(labels_url, "synset.txt", module="data") - -with open(labels_path, "r") as f: - labels = [l.rstrip() for l in f] - -output_file = "predictions.npz" - -# Open the output and read the output tensor -if os.path.exists(output_file): - with np.load(output_file) as data: - scores = softmax(data["output_0"]) - scores = np.squeeze(scores) - ranks = np.argsort(scores)[::-1] - - for rank in ranks[0:5]: - print("class='%s' with probability=%f" % (labels[rank], scores[rank])) - - -######################################################################## -# When running the script, a list of predictions should be printed similar -# the the example below. +# .. code-block:: python +# :caption: postprocess.py +# :name: postprocess.py +# +# #!python ./postprocess.py +# import os.path +# import numpy as np +# +# from scipy.special import softmax +# +# from tvm.contrib.download import download_testdata +# +# # Download a list of labels +# labels_url = "https://s3.amazonaws.com/onnx-model-zoo/synset.txt" +# labels_path = download_testdata(labels_url, "synset.txt", module="data") +# +# with open(labels_path, "r") as f: +# labels = [l.rstrip() for l in f] +# +# output_file = "predictions.npz" +# +# # Open the output and read the output tensor +# if os.path.exists(output_file): +# with np.load(output_file) as data: +# scores = softmax(data["output_0"]) +# scores = np.squeeze(scores) +# ranks = np.argsort(scores)[::-1] +# +# for rank in ranks[0:5]: +# print("class='%s' with probability=%f" % (labels[rank], scores[rank])) +# +# Running this script should produce the following output: # # .. 
code-block:: bash
 #
-#   $ python post_processing.py
-#   class=n02123045 tabby, tabby cat ; probability=446.000000
-#   class=n02123159 tiger cat ; probability=675.000000
-#   class=n02124075 Egyptian cat ; probability=836.000000
-#   class=n02129604 tiger, Panthera tigris ; probability=917.000000
-#   class=n04040759 radiator ; probability=213.000000
+#   python postprocess.py
 #
+#   # class='n02123045 tabby, tabby cat' with probability=0.610553
+#   # class='n02123159 tiger cat' with probability=0.367179
+#   # class='n02124075 Egyptian cat' with probability=0.019365
+#   # class='n02129604 tiger, Panthera tigris' with probability=0.001273
+#   # class='n04040759 radiator' with probability=0.000261
+#
+# Try replacing the cat image with other images, and see what sort of
+# predictions the ResNet model makes.
 
-######################################################################
-# Tuning the model
-# ----------------
+################################################################################
+# Automatically Tuning the ResNet Model
+# -------------------------------------
+#
+# The previous model was compiled to work on the TVM runtime, but did not
+# include any platform-specific optimizations. In this section, we will show
+# you how to build an optimized model using TVMC to target your working
+# platform.
 #
 # In some cases, we might not get the expected performance when running
-# inferences using our compiled module. In cases like this, we can make use
-# of the auto-tuner, to find a better configuration for our model and
-# get a boost in performance.
-#
-# Tuning in TVM refers to the process by which a model is optimized
-# to run faster on a given target. This differs from training or
-# fine-tuning in that it does not affect the accuracy of the model,
-# but only the runtime performance.
-#
-# As part of the tuning process, TVM will try running many different
-# operator implementation variants to see which perform best. The
-# results of these runs are stored in a tuning records file, which is
+# inferences using our compiled module. In cases like this, we can make use of
+# the auto-tuner to find a better configuration for our model and get a boost
+# in performance. Tuning in TVM refers to the process by which a model is
+# optimized to run faster on a given target. This differs from training or
+# fine-tuning in that it does not affect the accuracy of the model, but only
+# the runtime performance. As part of the tuning process, TVM will try running
+# many different operator implementation variants to see which perform best.
+# The results of these runs are stored in a tuning records file, which is
# ultimately the output of the ``tune`` subcommand.
 #
 # In the simplest form, tuning requires you to provide three things:
 #
-# - the target specification of the device you intend to run this model on;
-# - the path to an output file in which the tuning records will be stored, and finally,
+# - the target specification of the device you intend to run this model on
+# - the path to an output file in which the tuning records will be stored, and
+#   finally
 # - a path to the model to be tuned.
 #
-#
 # The example below demonstrates how that works in practice:
 #
 # .. code-block:: bash
 #
-#   tvmc tune \
+#   tvmc tune \
 #   --target "llvm" \
-#   --output autotuner_records.json \
+#   --output resnet50-v2-7-autotuner_records.json \
 #   resnet50-v2-7.onnx
 #
+# In this example, you will see better results if you indicate a more specific
+# target for the `--target` flag. For example, on an Intel i7 processor you
+# could use `--target llvm -mcpu=skylake`.
For example, on an Intel i7 processor you +# could use `--target llvm -mcpu=skylake`. For this tuning example, we are +# tuning locally on the CPU using LLVM as the compiler for the specified +# achitecture. +# +# TVMC will perform a search against the parameter space for the model, trying +# out different configurations for operators and choosing the one that runs +# fastest on your platform. Although this is a guided search based on the CPU +# and model operations, it can still take several hours to complete the search. +# The output of this search will be saved to the +# `resnet50-v2-7-autotuner_records.json` file, which will later be used to +# compile an optimized model. +# +# .. note:: Defining the Tuning Search Algorithm +# +# By default this search is guided using an `XGBoost Grid` algorithm. +# Depending on your model complexity and amount of time avilable, you might +# want to choose a different algorithm. A full list is available by +# consulting ``tvmc tune --help``. +# +# The output will look something like this for a consumer-level Skylake CPU: +# +# .. code-block:: bash +# +# tvmc tune --target "llvm -mcpu=broadwell" --output resnet50-v2-7-autotuner_records.json resnet50-v2-7.onnx +# # [Task 1/24] Current/Best: 9.65/ 23.16 GFLOPS | Progress: (60/1000) | 130.74 s Done. +# # [Task 1/24] Current/Best: 3.56/ 23.16 GFLOPS | Progress: (192/1000) | 381.32 s Done. +# # [Task 2/24] Current/Best: 13.13/ 58.61 GFLOPS | Progress: (960/1000) | 1190.59 s Done. +# # [Task 3/24] Current/Best: 31.93/ 59.52 GFLOPS | Progress: (800/1000) | 727.85 s Done. +# # [Task 4/24] Current/Best: 16.42/ 57.80 GFLOPS | Progress: (960/1000) | 559.74 s Done. +# # [Task 5/24] Current/Best: 12.42/ 57.92 GFLOPS | Progress: (800/1000) | 766.63 s Done. +# # [Task 6/24] Current/Best: 20.66/ 59.25 GFLOPS | Progress: (1000/1000) | 673.61 s Done. +# # [Task 7/24] Current/Best: 15.48/ 59.60 GFLOPS | Progress: (1000/1000) | 953.04 s Done. +# # [Task 8/24] Current/Best: 31.97/ 59.33 GFLOPS | Progress: (972/1000) | 559.57 s Done. +# # [Task 9/24] Current/Best: 34.14/ 60.09 GFLOPS | Progress: (1000/1000) | 479.32 s Done. +# # [Task 10/24] Current/Best: 12.53/ 58.97 GFLOPS | Progress: (972/1000) | 642.34 s Done. +# # [Task 11/24] Current/Best: 30.94/ 58.47 GFLOPS | Progress: (1000/1000) | 648.26 s Done. +# # [Task 12/24] Current/Best: 23.66/ 58.63 GFLOPS | Progress: (1000/1000) | 851.59 s Done. +# # [Task 13/24] Current/Best: 25.44/ 59.76 GFLOPS | Progress: (1000/1000) | 534.58 s Done. +# # [Task 14/24] Current/Best: 26.83/ 58.51 GFLOPS | Progress: (1000/1000) | 491.67 s Done. +# # [Task 15/24] Current/Best: 33.64/ 58.55 GFLOPS | Progress: (1000/1000) | 529.85 s Done. +# # [Task 16/24] Current/Best: 14.93/ 57.94 GFLOPS | Progress: (1000/1000) | 645.55 s Done. +# # [Task 17/24] Current/Best: 28.70/ 58.19 GFLOPS | Progress: (1000/1000) | 756.88 s Done. +# # [Task 18/24] Current/Best: 19.01/ 60.43 GFLOPS | Progress: (980/1000) | 514.69 s Done. +# # [Task 19/24] Current/Best: 14.61/ 57.30 GFLOPS | Progress: (1000/1000) | 614.44 s Done. +# # [Task 20/24] Current/Best: 10.47/ 57.68 GFLOPS | Progress: (980/1000) | 479.80 s Done. +# # [Task 21/24] Current/Best: 34.37/ 58.28 GFLOPS | Progress: (308/1000) | 225.37 s Done. +# # [Task 22/24] Current/Best: 15.75/ 57.71 GFLOPS | Progress: (1000/1000) | 1024.05 s Done. +# # [Task 23/24] Current/Best: 23.23/ 58.92 GFLOPS | Progress: (1000/1000) | 999.34 s Done. +# # [Task 24/24] Current/Best: 17.27/ 55.25 GFLOPS | Progress: (1000/1000) | 1428.74 s Done. 
+#
+# Tuning sessions can take a long time, so ``tvmc tune`` offers many options to
+# customize your tuning process, in terms of the number of repetitions
+# (``--repeat`` and ``--number``, for example), the tuning algorithm to be
+# used, and so on. Check ``tvmc tune --help`` for more information.
+#

+################################################################################
+# Compiling an Optimized Model with Tuning Data
+# ----------------------------------------------
+#
+# As an output of the tuning process above, we obtained the tuning records
+# stored in ``resnet50-v2-7-autotuner_records.json``. This file can be used in
+# two ways:
+#
+# - As input to further tuning (via ``tvmc tune --tuning-records``).
+# - As input to the compiler.
+#
+# The compiler will use the results to generate high performance code for the
+# model on your specified target. To do that we can use ``tvmc compile
+# --tuning-records``. Check ``tvmc compile --help`` for more information.
+#
+# Now that tuning data for the model has been collected, we can re-compile the
+# model using optimized operators to speed up our computations.
+#
+# .. code-block:: bash
+#
+#   tvmc compile \
+#   --target "llvm" \
+#   --tuning-records resnet50-v2-7-autotuner_records.json \
+#   --output resnet50-v2-7-tvm_autotuned.tar \
+#   resnet50-v2-7.onnx
+#
+# Verify that the optimized model runs and produces the same results:
+#
+# .. code-block:: bash
+#
+#   tvmc run \
+#   --inputs imagenet_cat.npz \
+#   --output predictions.npz \
+#   resnet50-v2-7-tvm_autotuned.tar
+#
+#   python postprocess.py
+#
+# Verifying that the predictions are the same:
+#
+# .. code-block:: bash
+#
+#   # class='n02123045 tabby, tabby cat' with probability=0.610550
+#   # class='n02123159 tiger cat' with probability=0.367181
+#   # class='n02124075 Egyptian cat' with probability=0.019365
+#   # class='n02129604 tiger, Panthera tigris' with probability=0.001273
+#   # class='n04040759 radiator' with probability=0.000261

+################################################################################
+# Comparing the Tuned and Untuned Models
+# --------------------------------------
+#
+# TVMC gives you tools for basic performance benchmarking between the models.
+# You can specify the number of repetitions and have TVMC report on the model
+# run time (independent of runtime startup). This gives us a rough idea of how
+# much tuning has improved the model performance. For example, on a test Intel
+# i7 system, we see that the tuned model runs 47% faster than the untuned
+# model:
+#
+# .. code-block:: bash
#
-# Tuning sessions can take a long time, so ``tvmc tune`` offers many options to
-# customize your tuning process, in terms of number of repetitions (``--repeat`` and
-# ``--number``, for example), the tuning algorithm to be use, and so on.
-# Check ``tvmc tune --help`` for more information.
+#   tvmc run \
+#   --inputs imagenet_cat.npz \
+#   --output predictions.npz \
+#   --print-time \
+#   --repeat 100 \
+#   resnet50-v2-7-tvm_autotuned.tar
+#
This file can be used in two ways: +# # Execution time summary: +# # mean (s) max (s) min (s) std (s) +# # 0.09219 0.11573 0.08985 0.00315 # -# - as an input to further tuning (via ``tvmc tune --tuning-records``), or -# - as an input to the compiler +# tvmc run \ +# --inputs imagenet_cat.npz \ +# --output predictions.npz \ +# --print-time \ +# --repeat 100 \ +# resnet50-v2-7-tvm.tar # -# The compiler will use the results to generate high performance code for the model -# on your specified target. To do that we can use ``tvmc compile --tuning-records``. -# Check ``tvmc compile --help`` for more information. +# # Execution time summary: +# # mean (s) max (s) min (s) std (s) +# # 0.19332 0.21997 0.18504 0.00711 # -###################################################################### +################################################################################ # Final Remarks # ------------- # -# In this tutorial, we presented TVMC, a command line driver for TVM. -# We demonstrated how to compile, run and tune a model, as well -# as discussed the need for pre and post processing of inputs and outputs. +# In this tutorial, we presented TVMC, a command line driver for TVM. We +# demonstrated how to compile, run, and tune a model. We also discussed the +# need for pre and post-processing of inputs and outputs. After the tuning +# process, we demonstrated how to compare the performance of the unoptimized +# and optimize models. # # Here we presented a simple example using ResNet 50 V2 locally. However, TVMC # supports many more features including cross-compilation, remote execution and # profiling/benchmarking. # -# To see what other options are available, please have a look at ``tvmc --help``. +# To see what other options are available, please have a look at ``tvmc +# --help``. # +# In the next tutorial, `Compiling and Optimizing a Model with the Python +# AutoScheduler `_, we will cover the same compilation +# and optimization steps using the Python interface. diff --git a/tutorials/language/extern_op.py b/tutorials/language/extern_op.py index 794101a4fb56..277af712d90b 100644 --- a/tutorials/language/extern_op.py +++ b/tutorials/language/extern_op.py @@ -77,11 +77,11 @@ # ----------------- # We can verify that the result matches what we expected. 
# -ctx = tvm.cpu(0) +dev = tvm.cpu(0) f = tvm.build(s, [A, B, D, bias], "llvm") -a = tvm.nd.array(np.random.uniform(size=(n, l)).astype(A.dtype), ctx) -b = tvm.nd.array(np.random.uniform(size=(l, m)).astype(B.dtype), ctx) -d = tvm.nd.array(np.zeros((n, m), dtype=D.dtype), ctx) +a = tvm.nd.array(np.random.uniform(size=(n, l)).astype(A.dtype), dev) +b = tvm.nd.array(np.random.uniform(size=(l, m)).astype(B.dtype), dev) +d = tvm.nd.array(np.zeros((n, m), dtype=D.dtype), dev) bb = 10.0 f(a, b, d, bb) tvm.testing.assert_allclose(d.asnumpy(), np.dot(a.asnumpy(), b.asnumpy()) + 10, rtol=1e-5) @@ -125,8 +125,8 @@ def my_tvm_addone(x, y): ) s = te.create_schedule(B.op) f = tvm.build(s, [A, B], "llvm") -a = tvm.nd.array(np.random.uniform(size=(n,)).astype(A.dtype), ctx) -b = tvm.nd.array(np.random.uniform(size=(n,)).astype(B.dtype), ctx) +a = tvm.nd.array(np.random.uniform(size=(n,)).astype(A.dtype), dev) +b = tvm.nd.array(np.random.uniform(size=(n,)).astype(B.dtype), dev) f(a, b) tvm.testing.assert_allclose(b.asnumpy(), a.asnumpy() + 1, rtol=1e-5) diff --git a/tutorials/language/reduction.py b/tutorials/language/reduction.py index cffa10e6cbb2..f782ac6ca280 100644 --- a/tutorials/language/reduction.py +++ b/tutorials/language/reduction.py @@ -137,9 +137,9 @@ # Verify the correctness of result kernel by comparing it to numpy. # nn = 128 -ctx = tvm.gpu(0) -a = tvm.nd.array(np.random.uniform(size=(nn, nn)).astype(A.dtype), ctx) -b = tvm.nd.array(np.zeros(nn, dtype=B.dtype), ctx) +dev = tvm.gpu(0) +a = tvm.nd.array(np.random.uniform(size=(nn, nn)).astype(A.dtype), dev) +b = tvm.nd.array(np.zeros(nn, dtype=B.dtype), dev) fcuda(a, b) tvm.testing.assert_allclose(b.asnumpy(), np.sum(a.asnumpy(), axis=1), rtol=1e-4) diff --git a/tutorials/language/scan.py b/tutorials/language/scan.py index 5f513208d56d..8124b567177f 100644 --- a/tutorials/language/scan.py +++ b/tutorials/language/scan.py @@ -83,12 +83,12 @@ # numpy to verify the correctness of the result. 
# fscan = tvm.build(s, [X, s_scan], "cuda", name="myscan") -ctx = tvm.gpu(0) +dev = tvm.gpu(0) n = 1024 m = 10 a_np = np.random.uniform(size=(m, n)).astype(s_scan.dtype) -a = tvm.nd.array(a_np, ctx) -b = tvm.nd.array(np.zeros((m, n), dtype=s_scan.dtype), ctx) +a = tvm.nd.array(a_np, dev) +b = tvm.nd.array(np.zeros((m, n), dtype=s_scan.dtype), dev) fscan(a, b) tvm.testing.assert_allclose(b.asnumpy(), np.cumsum(a_np, axis=0)) diff --git a/tutorials/language/tensorize.py b/tutorials/language/tensorize.py index a75b78b65ca4..a1575fe832c3 100644 --- a/tutorials/language/tensorize.py +++ b/tutorials/language/tensorize.py @@ -186,11 +186,11 @@ def gemv_impl(): from tvm.topi.utils import get_const_tuple dtype = A.dtype -ctx = tvm.context("cpu", 0) +dev = tvm.device("cpu", 0) a = np.random.uniform(size=get_const_tuple(A.shape)).astype(dtype) b = np.random.uniform(size=get_const_tuple(B.shape)).astype(dtype) -c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=dtype), ctx) -func(tvm.nd.array(a, ctx), tvm.nd.array(b, ctx), c) +c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=dtype), dev) +func(tvm.nd.array(a, dev), tvm.nd.array(b, dev), c) tvm.testing.assert_allclose(c.asnumpy(), np.dot(a, b.T), rtol=1e-3) ###################################################################### @@ -300,8 +300,8 @@ def _reduce_update(): func = tvm.build(s, [A, B, C], target="llvm", name="gemv") a = np.random.uniform(size=get_const_tuple(A.shape)).astype(dtype) b = np.random.uniform(size=get_const_tuple(B.shape)).astype(dtype) -c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=dtype), ctx) -func(tvm.nd.array(a, ctx), tvm.nd.array(b, ctx), c) +c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=dtype), dev) +func(tvm.nd.array(a, dev), tvm.nd.array(b, dev), c) tvm.testing.assert_allclose(c.asnumpy(), np.dot(a, b.T), rtol=1e-3) ###################################################################### diff --git a/tutorials/micro/micro_tflite.py b/tutorials/micro/micro_tflite.py index 6ad0da5aecba..f59b1c3723a8 100644 --- a/tutorials/micro/micro_tflite.py +++ b/tutorials/micro/micro_tflite.py @@ -127,7 +127,7 @@ import tvm import tvm.micro as micro from tvm.contrib.download import download_testdata -from tvm.contrib import graph_runtime, utils +from tvm.contrib import graph_executor, utils from tvm import relay model_url = "https://people.linaro.org/~tom.gall/sine_model.tflite" @@ -177,7 +177,7 @@ # Now we create a build config for relay. turning off two options # and then calling relay.build which will result in a C source # file. When running on a simulated target, choose "host" below: -TARGET = tvm.target.target.micro("host") +# TARGET = tvm.target.target.micro("host") # %% # Compiling for physical hardware @@ -190,8 +190,9 @@ # # .. code-block:: python # -# TARGET = tvm.target.target.micro("stm32f746xx") -# BOARD = "nucleo_f746zg" # or "stm32f746g_disco" +TARGET = tvm.target.target.micro("host") +# BOARD = "nucleo_f746zg" # or "stm32f746g_disco" +BOARD = "qemu_x86" ###################################################################### # Now, compile the model for the target: @@ -256,8 +257,8 @@ flasher = compiler.flasher() with tvm.micro.Session(binary=micro_binary, flasher=flasher) as session: - graph_mod = tvm.micro.create_local_graph_runtime( - graph, session.get_system_lib(), session.context + graph_mod = tvm.micro.create_local_graph_executor( + graph, session.get_system_lib(), session.device ) # Set the model parameters using the lowered parameters produced by `relay.build`. 
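+    # A minimal sketch of the inference calls that typically follow inside this
+    # session; ``c_params`` and ``input_data`` are assumptions carried over
+    # from earlier steps of this tutorial:
+    #
+    #   graph_mod.set_input(**c_params)
+    #   graph_mod.run(tvm.nd.array(input_data))
+    #   tvm_output = graph_mod.get_output(0).asnumpy()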
diff --git a/tutorials/optimize/opt_conv_cuda.py b/tutorials/optimize/opt_conv_cuda.py index 9cb29b573514..0cecc82aa8ea 100644 --- a/tutorials/optimize/opt_conv_cuda.py +++ b/tutorials/optimize/opt_conv_cuda.py @@ -238,12 +238,12 @@ # func = tvm.build(s, [A, W, B], "cuda") -ctx = tvm.gpu(0) +dev = tvm.gpu(0) a_np = np.random.uniform(size=(in_size, in_size, in_channel, batch)).astype(A.dtype) w_np = np.random.uniform(size=(kernel, kernel, in_channel, out_channel)).astype(W.dtype) -a = tvm.nd.array(a_np, ctx) -w = tvm.nd.array(w_np, ctx) -b = tvm.nd.array(np.zeros((out_size, out_size, out_channel, batch), dtype=B.dtype), ctx) +a = tvm.nd.array(a_np, dev) +w = tvm.nd.array(w_np, dev) +b = tvm.nd.array(np.zeros((out_size, out_size, out_channel, batch), dtype=B.dtype), dev) func(a, w, b) -evaluator = func.time_evaluator(func.entry_name, ctx, number=1) +evaluator = func.time_evaluator(func.entry_name, dev, number=1) print("Convolution: %f ms" % (evaluator(a, w, b).mean * 1e3)) diff --git a/tutorials/optimize/opt_conv_tensorcore.py b/tutorials/optimize/opt_conv_tensorcore.py index 0cbcf7e03342..0a7798d1b9e1 100644 --- a/tutorials/optimize/opt_conv_tensorcore.py +++ b/tutorials/optimize/opt_conv_tensorcore.py @@ -392,16 +392,16 @@ def intrin_func(ins, outs): # Since TensorCores are only supported in NVIDIA GPU with Compute Capability 7.0 or higher, it may not # be able to run on our build server -ctx = tvm.gpu(0) -if nvcc.have_tensorcore(ctx.compute_version): +dev = tvm.gpu(0) +if nvcc.have_tensorcore(dev.compute_version): with tvm.transform.PassContext(config={"tir.UnrollLoop": {"auto_max_step": 16}}): func = tvm.build(s, [A, W, Conv], "cuda") a_np = np.random.uniform(size=data_shape).astype(A.dtype) w_np = np.random.uniform(size=kernel_shape).astype(W.dtype) - a = tvm.nd.array(a_np, ctx) - w = tvm.nd.array(w_np, ctx) - c = tvm.nd.array(np.zeros(output_shape, dtype=Conv.dtype), ctx) - evaluator = func.time_evaluator(func.entry_name, ctx, number=10) + a = tvm.nd.array(a_np, dev) + w = tvm.nd.array(w_np, dev) + c = tvm.nd.array(np.zeros(output_shape, dtype=Conv.dtype), dev) + evaluator = func.time_evaluator(func.entry_name, dev, number=10) print("conv2d with tensor core: %f ms" % (evaluator(a, w, c).mean * 1e3)) ############################################################################### diff --git a/tutorials/optimize/opt_gemm.py b/tutorials/optimize/opt_gemm.py index 971269d8c11e..72a8b0a0701e 100644 --- a/tutorials/optimize/opt_gemm.py +++ b/tutorials/optimize/opt_gemm.py @@ -75,11 +75,11 @@ # To get the best performance, please change the following line # to llvm -mcpu=core-avx2, or specific type of CPU you use target = "llvm" -ctx = tvm.context(target, 0) +dev = tvm.device(target, 0) # Random generated tensor for testing -a = tvm.nd.array(numpy.random.rand(M, K).astype(dtype), ctx) -b = tvm.nd.array(numpy.random.rand(K, N).astype(dtype), ctx) +a = tvm.nd.array(numpy.random.rand(M, K).astype(dtype), dev) +b = tvm.nd.array(numpy.random.rand(K, N).astype(dtype), dev) np_repeat = 100 np_runing_time = timeit.timeit( @@ -108,11 +108,11 @@ func = tvm.build(s, [A, B, C], target=target, name="mmult") assert func -c = tvm.nd.array(numpy.zeros((M, N), dtype=dtype), ctx) +c = tvm.nd.array(numpy.zeros((M, N), dtype=dtype), dev) func(a, b, c) tvm.testing.assert_allclose(c.asnumpy(), answer, rtol=1e-5) -evaluator = func.time_evaluator(func.entry_name, ctx, number=1) +evaluator = func.time_evaluator(func.entry_name, dev, number=1) print("Baseline: %f" % evaluator(a, b, c).mean) 
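+# The ``time_evaluator`` used above returns a callable that runs the compiled
+# function ``number`` times per measurement and reports the mean wall-clock
+# time in seconds, keeping the baseline and optimized measurements comparable.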
################################################################################################ @@ -143,13 +143,13 @@ func = tvm.build(s, [A, B, C], target=target, name="mmult") assert func -c = tvm.nd.array(numpy.zeros((M, N), dtype=dtype), ctx) +c = tvm.nd.array(numpy.zeros((M, N), dtype=dtype), dev) func(a, b, c) tvm.testing.assert_allclose(c.asnumpy(), answer, rtol=1e-5) # By simply tiling the loop 32x32, and hoisting ko, ki outside the blocking loops, # we can see big speedup compared with the baseline. -evaluator = func.time_evaluator(func.entry_name, ctx, number=10) +evaluator = func.time_evaluator(func.entry_name, dev, number=10) print("Opt1: %f" % evaluator(a, b, c).mean) ################################################################################################ @@ -179,11 +179,11 @@ func = tvm.build(s, [A, B, C], target=target, name="mmult") assert func -c = tvm.nd.array(numpy.zeros((M, N), dtype=dtype), ctx) +c = tvm.nd.array(numpy.zeros((M, N), dtype=dtype), dev) func(a, b, c) tvm.testing.assert_allclose(c.asnumpy(), answer, rtol=1e-5) -evaluator = func.time_evaluator(func.entry_name, ctx, number=10) +evaluator = func.time_evaluator(func.entry_name, dev, number=10) print("Opt2: %f" % evaluator(a, b, c).mean) ################################################################################################ @@ -212,11 +212,11 @@ func = tvm.build(s, [A, B, C], target=target, name="mmult") assert func -c = tvm.nd.array(numpy.zeros((M, N), dtype=dtype), ctx) +c = tvm.nd.array(numpy.zeros((M, N), dtype=dtype), dev) func(a, b, c) tvm.testing.assert_allclose(c.asnumpy(), answer, rtol=1e-5) -evaluator = func.time_evaluator(func.entry_name, ctx, number=10) +evaluator = func.time_evaluator(func.entry_name, dev, number=10) print("Opt3: %f" % evaluator(a, b, c).mean) ################################################################################################ @@ -268,11 +268,11 @@ func = tvm.build(s, [A, B, C], target=target, name="mmult") assert func -c = tvm.nd.array(numpy.zeros((M, N), dtype=dtype), ctx) +c = tvm.nd.array(numpy.zeros((M, N), dtype=dtype), dev) func(a, b, c) tvm.testing.assert_allclose(c.asnumpy(), answer, rtol=1e-5) -evaluator = func.time_evaluator(func.entry_name, ctx, number=10) +evaluator = func.time_evaluator(func.entry_name, dev, number=10) print("Opt4: %f" % evaluator(a, b, c).mean) ################################################################################################ @@ -314,11 +314,11 @@ func = tvm.build(s, [A, B, C], target=target, name="mmult") assert func -c = tvm.nd.array(numpy.zeros((M, N), dtype=dtype), ctx) +c = tvm.nd.array(numpy.zeros((M, N), dtype=dtype), dev) func(a, b, c) tvm.testing.assert_allclose(c.asnumpy(), answer, rtol=1e-5) -evaluator = func.time_evaluator(func.entry_name, ctx, number=10) +evaluator = func.time_evaluator(func.entry_name, dev, number=10) print("Opt5: %f" % evaluator(a, b, c).mean) ################################################################################################ @@ -357,11 +357,11 @@ func = tvm.build(s, [A, B, C], target=target, name="mmult") assert func -c = tvm.nd.array(numpy.zeros((M, N), dtype=dtype), ctx) +c = tvm.nd.array(numpy.zeros((M, N), dtype=dtype), dev) func(a, b, c) tvm.testing.assert_allclose(c.asnumpy(), answer, rtol=1e-5) -evaluator = func.time_evaluator(func.entry_name, ctx, number=50) +evaluator = func.time_evaluator(func.entry_name, dev, number=50) opt6_time = evaluator(a, b, c).mean print("Opt6: %f" % opt6_time) diff --git a/tutorials/optimize/opt_matmul_auto_tensorcore.py 
b/tutorials/optimize/opt_matmul_auto_tensorcore.py index f5450b9524c6..03682a05e86f 100644 --- a/tutorials/optimize/opt_matmul_auto_tensorcore.py +++ b/tutorials/optimize/opt_matmul_auto_tensorcore.py @@ -252,8 +252,8 @@ def test_gemm(N, L, M, dtype, layout): if not tvm.gpu(0).exist or not tvm.runtime.enabled("cuda"): raise Exception("skip building this tutorial because cuda is not enabled..") -ctx = tvm.gpu() -if not nvcc.have_tensorcore(ctx.compute_version): +dev = tvm.gpu() +if not nvcc.have_tensorcore(dev.compute_version): raise Exception("the gpu has no tensorcore, skipping...") M, N, L = 512, 32, 512 @@ -385,14 +385,14 @@ def tune_and_evaluate(M, N, L, dtype, layout): for k in range(32): b_np[i, j] = b_np[i, j] | ((b_np_int[i, j * 32 + k] & 0xF) << (31 - k)) - c_tvm = tvm.nd.array(np.zeros(c_np.shape, dtype=c_np_type), ctx=ctx) - a_tvm = tvm.nd.array(a_np, ctx=ctx) - b_tvm = tvm.nd.array(b_np, ctx=ctx) + c_tvm = tvm.nd.array(np.zeros(c_np.shape, dtype=c_np_type), device=dev) + a_tvm = tvm.nd.array(a_np, device=dev) + b_tvm = tvm.nd.array(b_np, device=dev) func(a_tvm, b_tvm, c_tvm) tvm.testing.assert_allclose(c_np, c_tvm.asnumpy(), rtol=1e-3) - evaluator = func.time_evaluator(func.entry_name, ctx, number=100) + evaluator = func.time_evaluator(func.entry_name, dev, number=100) print("Time cost of this operator: %f" % evaluator(a_tvm, b_tvm, c_tvm).mean) diff --git a/tutorials/topi/intro_topi.py b/tutorials/topi/intro_topi.py index 2459cf249506..1fefae585a2f 100644 --- a/tutorials/topi/intro_topi.py +++ b/tutorials/topi/intro_topi.py @@ -99,13 +99,13 @@ # We can test the correctness by comparing with :code:`numpy` result as follows # func = tvm.build(sg, [a, b, g], "cuda") -ctx = tvm.gpu(0) +dev = tvm.gpu(0) a_np = np.random.uniform(size=(x, y, y)).astype(a.dtype) b_np = np.random.uniform(size=(y, y)).astype(b.dtype) g_np = np.sum(np.add(a_np + b_np, a_np * b_np) / 2.0) -a_nd = tvm.nd.array(a_np, ctx) -b_nd = tvm.nd.array(b_np, ctx) -g_nd = tvm.nd.array(np.zeros(g_np.shape, dtype=g_np.dtype), ctx) +a_nd = tvm.nd.array(a_np, dev) +b_nd = tvm.nd.array(b_np, dev) +g_nd = tvm.nd.array(np.zeros(g_np.shape, dtype=g_np.dtype), dev) func(a_nd, b_nd, g_nd) tvm.testing.assert_allclose(g_nd.asnumpy(), g_np, rtol=1e-5) diff --git a/vta/python/vta/testing/simulator.py b/vta/python/vta/testing/simulator.py index 7f2471c61532..05f37c182815 100644 --- a/vta/python/vta/testing/simulator.py +++ b/vta/python/vta/testing/simulator.py @@ -17,6 +17,7 @@ """Utilities to start simulator.""" import ctypes import json +import warnings import tvm from ..environment import get_env from ..libinfo import find_libvta @@ -27,25 +28,30 @@ def _load_sw(): env = get_env() lib_driver_name = "libvta_tsim" if env.TARGET == "tsim" else "libvta_fsim" + require_sim = env.TARGET in ("sim", "tsim") + libs = [] # Load driver library - lib_driver = find_libvta(lib_driver_name, optional=True) - assert lib_driver + lib_driver = find_libvta(lib_driver_name, optional=(not require_sim)) + + if not lib_driver: + return [] + try: libs = [ctypes.CDLL(lib_driver[0], ctypes.RTLD_GLOBAL)] - except OSError: + except OSError as err: + if require_sim: + raise err + warnings.warn("Error when loading VTA driver {}: {}".format(lib_driver[0], err)) return [] if env.TARGET == "tsim": lib_hw = find_libvta("libvta_hw", optional=True) assert lib_hw # make sure to make in ${VTA_HW_PATH}/hardware/chisel - try: - f = tvm.get_global_func("vta.tsim.init") - m = tvm.runtime.load_module(lib_hw[0], "vta-tsim") - f(m) - return lib_hw - except OSError: - return 
[] + f = tvm.get_global_func("vta.tsim.init") + m = tvm.runtime.load_module(lib_hw[0], "vta-tsim") + f(m) + return lib_hw return libs diff --git a/vta/python/vta/top/graphpack.py b/vta/python/vta/top/graphpack.py index 99e67481e4d9..8998f5712381 100644 --- a/vta/python/vta/top/graphpack.py +++ b/vta/python/vta/top/graphpack.py @@ -201,8 +201,8 @@ class ExprDeviceAnnot(ExprMutator): """ def __init__(self, start=-1, end=-1): - self.ext_ctx = tvm.context("ext_dev") - self.cpu_ctx = tvm.context("cpu") + self.ext_dev = tvm.device("ext_dev") + self.cpu_dev = tvm.device("cpu") self.cast = op.op.get("cast") self.counter = -1 self.start = start @@ -217,12 +217,12 @@ def visit_call(self, call): self.counter += 1 if self.counter == self.start: ret = relay.Call(call.op, args, call.attrs) - ret = relay.annotation.on_device(ret, self.ext_ctx) + ret = relay.annotation.on_device(ret, self.ext_dev) return ret if self.counter == self.end: ret = relay.Call(call.op, args, call.attrs) - ret = relay.annotation.on_device(ret, self.cpu_ctx) + ret = relay.annotation.on_device(ret, self.cpu_dev) return ret if self.counter > self.start and self.counter < self.end: @@ -232,7 +232,7 @@ def visit_call(self, call): if self.is_float_op(call): return ret - return relay.annotation.on_device(ret, self.ext_ctx) + return relay.annotation.on_device(ret, self.ext_dev) return relay.Call(self.visit(call.op), args, call.attrs) diff --git a/vta/runtime/device_api.cc b/vta/runtime/device_api.cc index 0fea7ba5e364..b021ed103933 100644 --- a/vta/runtime/device_api.cc +++ b/vta/runtime/device_api.cc @@ -33,38 +33,38 @@ namespace runtime { class VTADeviceAPI final : public DeviceAPI { public: - void SetDevice(TVMContext ctx) final {} + void SetDevice(Device dev) final {} - void GetAttr(TVMContext ctx, DeviceAttrKind kind, TVMRetValue* rv) final { + void GetAttr(Device dev, DeviceAttrKind kind, TVMRetValue* rv) final { if (kind == kExist) { *rv = 1; } } - void* AllocDataSpace(TVMContext ctx, size_t size, size_t alignment, DLDataType type_hint) final { + void* AllocDataSpace(Device dev, size_t size, size_t alignment, DLDataType type_hint) final { return VTABufferAlloc(size); } - void FreeDataSpace(TVMContext ctx, void* ptr) final { VTABufferFree(ptr); } + void FreeDataSpace(Device dev, void* ptr) final { VTABufferFree(ptr); } void CopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset, size_t size, - TVMContext ctx_from, TVMContext ctx_to, DLDataType type_hint, + Device dev_from, Device dev_to, DLDataType type_hint, TVMStreamHandle stream) final { int kind_mask = 0; - if (ctx_from.device_type != kDLCPU) { + if (dev_from.device_type != kDLCPU) { kind_mask |= 2; } - if (ctx_to.device_type != kDLCPU) { + if (dev_to.device_type != kDLCPU) { kind_mask |= 1; } VTABufferCopy(from, from_offset, to, to_offset, size, kind_mask); } - void StreamSync(TVMContext ctx, TVMStreamHandle stream) final {} + void StreamSync(Device dev, TVMStreamHandle stream) final {} - void* AllocWorkspace(TVMContext ctx, size_t size, DLDataType type_hint) final; + void* AllocWorkspace(Device dev, size_t size, DLDataType type_hint) final; - void FreeWorkspace(TVMContext ctx, void* data) final; + void FreeWorkspace(Device dev, void* data) final; static VTADeviceAPI* Global() { static VTADeviceAPI* inst = new VTADeviceAPI(); @@ -76,12 +76,12 @@ struct VTAWorkspacePool : public WorkspacePool { VTAWorkspacePool() : WorkspacePool(kDLExtDev, VTADeviceAPI::Global()) {} }; -void* VTADeviceAPI::AllocWorkspace(TVMContext ctx, size_t size, DLDataType 
type_hint) { - return dmlc::ThreadLocalStore::Get()->AllocWorkspace(ctx, size); +void* VTADeviceAPI::AllocWorkspace(Device dev, size_t size, DLDataType type_hint) { + return dmlc::ThreadLocalStore::Get()->AllocWorkspace(dev, size); } -void VTADeviceAPI::FreeWorkspace(TVMContext ctx, void* data) { - dmlc::ThreadLocalStore::Get()->FreeWorkspace(ctx, data); +void VTADeviceAPI::FreeWorkspace(Device dev, void* data) { + dmlc::ThreadLocalStore::Get()->FreeWorkspace(dev, data); } // Register device api with override. diff --git a/vta/scripts/tune_resnet.py b/vta/scripts/tune_resnet.py index a10d1de8c46b..dfb74b129718 100644 --- a/vta/scripts/tune_resnet.py +++ b/vta/scripts/tune_resnet.py @@ -28,8 +28,8 @@ from tvm import rpc, autotvm, relay from tvm.autotvm.measure.measure_methods import request_remote from tvm.autotvm.tuner import XGBTuner, GATuner, RandomTuner, GridSearchTuner -from tvm.contrib import graph_runtime, utils, download -from tvm.contrib.debugger import debug_runtime +from tvm.contrib import graph_executor, utils, download +from tvm.contrib.debugger import debug_executor import vta from vta.testing import simulator from vta.top import graph_pack @@ -325,9 +325,9 @@ def tune_tasks( # If detailed runtime info is needed build with debug runtime if opt.debug_profile: - m = debug_runtime.create(graph, lib, ctx) + m = debug_executor.create(graph, lib, ctx) else: - m = graph_runtime.create(graph, lib, ctx) + m = graph_executor.create(graph, lib, ctx) # Set the network parameters and synthetic input image = tvm.nd.array((np.random.uniform(size=(1, 3, 224, 224))).astype("float32")) diff --git a/vta/tests/python/integration/test_benchmark_gemm.py b/vta/tests/python/integration/test_benchmark_gemm.py index 824aed6efa02..f69766ceaaad 100644 --- a/vta/tests/python/integration/test_benchmark_gemm.py +++ b/vta/tests/python/integration/test_benchmark_gemm.py @@ -66,7 +66,7 @@ def verify(s): remote.upload(temp.relpath("gemm.o")) f = remote.load_module("gemm.o") # verify - ctx = remote.ext_dev(0) + dev = remote.ext_dev(0) # Data in original format data_orig = np.random.randint(-128, 128, size=(batch_size, channel)).astype(data.dtype) weight_orig = np.random.randint(-128, 128, size=(channel, channel)).astype(weight.dtype) @@ -77,9 +77,9 @@ def verify(s): channel // env.BLOCK_OUT, env.BLOCK_OUT, channel // env.BLOCK_IN, env.BLOCK_IN ).transpose((0, 2, 1, 3)) res_np = np.zeros(res_shape).astype(res.dtype) - data_arr = tvm.nd.array(data_packed, ctx) - weight_arr = tvm.nd.array(weight_packed, ctx) - res_arr = tvm.nd.array(res_np, ctx) + data_arr = tvm.nd.array(data_packed, dev) + weight_arr = tvm.nd.array(weight_packed, dev) + res_arr = tvm.nd.array(res_np, dev) res_ref = np.zeros(res_shape).astype(env.acc_dtype) for b in range(batch_size // env.BATCH): for i in range(channel // env.BLOCK_OUT): @@ -90,7 +90,7 @@ def verify(s): ) res_ref = np.right_shift(res_ref, 8) res_ref = np.clip(res_ref, 0, (1 << (env.INP_WIDTH - 1)) - 1).astype(res.dtype) - time_f = f.time_evaluator("gemm", ctx, number=20) + time_f = f.time_evaluator("gemm", dev, number=20) if env.TARGET in ["sim", "tsim"]: simulator.clear_stats() cost = time_f(data_arr, weight_arr, res_arr) diff --git a/vta/tests/python/integration/test_benchmark_topi_conv2d.py b/vta/tests/python/integration/test_benchmark_topi_conv2d.py index cad560c208b6..b82c3a90c9d0 100644 --- a/vta/tests/python/integration/test_benchmark_topi_conv2d.py +++ b/vta/tests/python/integration/test_benchmark_topi_conv2d.py @@ -228,14 +228,14 @@ def get_ref_data(): 
mod.save(temp.relpath("conv2d.o")) remote.upload(temp.relpath("conv2d.o")) f = remote.load_module("conv2d.o") - ctx = remote.context(str(target)) + dev = remote.device(str(target)) res_np = np.zeros(topi.utils.get_const_tuple(res.shape)).astype(res.dtype) - data_arr = tvm.nd.array(data_np, ctx) - kernel_arr = tvm.nd.array(kernel_np, ctx) - bias_arr = tvm.nd.array(bias_np, ctx) - res_arr = tvm.nd.array(res_np, ctx) - time_f = f.time_evaluator("conv2d", ctx, number=samples) + data_arr = tvm.nd.array(data_np, dev) + kernel_arr = tvm.nd.array(kernel_np, dev) + bias_arr = tvm.nd.array(bias_np, dev) + res_arr = tvm.nd.array(res_np, dev) + time_f = f.time_evaluator("conv2d", dev, number=samples) # In vta sim mode, collect simulator runtime statistics stats = {} diff --git a/vta/tests/python/integration/test_benchmark_topi_conv2d_transpose.py b/vta/tests/python/integration/test_benchmark_topi_conv2d_transpose.py index f750225ed8f7..d2516faac00b 100644 --- a/vta/tests/python/integration/test_benchmark_topi_conv2d_transpose.py +++ b/vta/tests/python/integration/test_benchmark_topi_conv2d_transpose.py @@ -224,13 +224,13 @@ def get_ref_data(): mod.save(temp.relpath("conv2d_transpose.o")) remote.upload(temp.relpath("conv2d_transpose.o")) f = remote.load_module("conv2d_transpose.o") - ctx = remote.context(str(target)) + dev = remote.device(str(target)) res_np = np.zeros(topi.utils.get_const_tuple(res.shape)).astype(res.dtype) - data_arr = tvm.nd.array(data_np, ctx) - kernel_arr = tvm.nd.array(kernel_np, ctx) - res_arr = tvm.nd.array(res_np, ctx) - time_f = f.time_evaluator("conv2d_transpose", ctx, number=samples) + data_arr = tvm.nd.array(data_np, dev) + kernel_arr = tvm.nd.array(kernel_np, dev) + res_arr = tvm.nd.array(res_np, dev) + time_f = f.time_evaluator("conv2d_transpose", dev, number=samples) # In vta sim mode, collect simulator runtime statistics stats = {} diff --git a/vta/tests/python/integration/test_benchmark_topi_dense.py b/vta/tests/python/integration/test_benchmark_topi_dense.py index 0b604108a35f..ceeed1ce8ddb 100644 --- a/vta/tests/python/integration/test_benchmark_topi_dense.py +++ b/vta/tests/python/integration/test_benchmark_topi_dense.py @@ -135,13 +135,13 @@ def get_ref_data(): mod.save(temp.relpath("dense.o")) remote.upload(temp.relpath("dense.o")) f = remote.load_module("dense.o") - ctx = remote.context(str(target)) + dev = remote.device(str(target)) res_np = np.zeros(topi.utils.get_const_tuple(res.shape)).astype(res.dtype) - data_arr = tvm.nd.array(data_np, ctx) - kernel_arr = tvm.nd.array(kernel_np, ctx) - res_arr = tvm.nd.array(res_np, ctx) - time_f = f.time_evaluator("dense", ctx, number=samples) + data_arr = tvm.nd.array(data_np, dev) + kernel_arr = tvm.nd.array(kernel_np, dev) + res_arr = tvm.nd.array(res_np, dev) + time_f = f.time_evaluator("dense", dev, number=samples) # In vta sim mode, collect simulator runtime statistics stats = {} diff --git a/vta/tests/python/integration/test_benchmark_topi_group_conv2d.py b/vta/tests/python/integration/test_benchmark_topi_group_conv2d.py index da6ba5b8fb94..b7c7b0aa0a8d 100644 --- a/vta/tests/python/integration/test_benchmark_topi_group_conv2d.py +++ b/vta/tests/python/integration/test_benchmark_topi_group_conv2d.py @@ -222,14 +222,14 @@ def get_ref_data(): mod.save(temp.relpath("conv2d.o")) remote.upload(temp.relpath("conv2d.o")) f = remote.load_module("conv2d.o") - ctx = remote.context(str(target)) + dev = remote.device(str(target)) res_np = np.zeros(topi.utils.get_const_tuple(res.shape)).astype(res.dtype) - data_arr = 
tvm.nd.array(data_np, ctx) - kernel_arr = tvm.nd.array(kernel_np, ctx) - bias_arr = tvm.nd.array(bias_np, ctx) - res_arr = tvm.nd.array(res_np, ctx) - time_f = f.time_evaluator("conv2d", ctx, number=samples) + data_arr = tvm.nd.array(data_np, dev) + kernel_arr = tvm.nd.array(kernel_np, dev) + bias_arr = tvm.nd.array(bias_np, dev) + res_arr = tvm.nd.array(res_np, dev) + time_f = f.time_evaluator("conv2d", dev, number=samples) # In vta sim mode, collect simulator runtime statistics stats = {} diff --git a/vta/tests/python/unittest/test_vta_insn.py b/vta/tests/python/unittest/test_vta_insn.py index b83510f4a9dc..2817ef01b5fa 100644 --- a/vta/tests/python/unittest/test_vta_insn.py +++ b/vta/tests/python/unittest/test_vta_insn.py @@ -59,11 +59,11 @@ def _run(env, remote): remote.upload(temp.relpath("load_act.o")) f = remote.load_module("load_act.o") # verify - ctx = remote.ext_dev(0) + dev = remote.ext_dev(0) x_np = np.random.randint(1, 10, size=(n, n, env.BATCH, env.BLOCK_OUT)).astype(x.dtype) y_np = x_np.astype(y.dtype) - x_nd = tvm.nd.array(x_np, ctx) - y_nd = tvm.nd.empty(y_np.shape, ctx=ctx, dtype=y_np.dtype) + x_nd = tvm.nd.array(x_np, dev) + y_nd = tvm.nd.empty(y_np.shape, device=dev, dtype=y_np.dtype) if env.TARGET in ["sim", "tsim"]: simulator.clear_stats() @@ -130,7 +130,7 @@ def check_padded_load(pad_before, pad_after, test_name=None): remote.upload(temp.relpath("padded_load.o")) f = remote.load_module("padded_load.o") # verify - ctx = remote.ext_dev(0) + dev = remote.ext_dev(0) x_np = np.random.randint(0, 10, size=(n, m, env.BATCH, env.BLOCK_OUT)).astype(x.dtype) y_np = np.zeros( ( @@ -141,8 +141,8 @@ def check_padded_load(pad_before, pad_after, test_name=None): ) ).astype(y.dtype) y_np[pad_before[0] : pad_before[0] + n, pad_before[1] : pad_before[1] + m, :] = x_np - x_nd = tvm.nd.array(x_np, ctx) - y_nd = tvm.nd.empty(y_np.shape, ctx=ctx, dtype=y_np.dtype) + x_nd = tvm.nd.array(x_np, dev) + y_nd = tvm.nd.empty(y_np.shape, device=dev, dtype=y_np.dtype) if env.TARGET in ["sim", "tsim"]: simulator.clear_stats() @@ -214,7 +214,7 @@ def verify(s, name=None): remote.upload(temp.relpath("gemm.o")) f = remote.load_module("gemm.o") # verify - ctx = remote.ext_dev(0) + dev = remote.ext_dev(0) x_np = np.random.randint(-128, 128, size=(o, n, env.BATCH, env.BLOCK_IN)).astype( x.dtype ) @@ -222,9 +222,9 @@ def verify(s, name=None): w.dtype ) y_np = np.zeros((o, m, env.BATCH, env.BLOCK_OUT)).astype(y.dtype) - x_nd = tvm.nd.array(x_np, ctx) - w_nd = tvm.nd.array(w_np, ctx) - y_nd = tvm.nd.array(y_np, ctx) + x_nd = tvm.nd.array(x_np, dev) + w_nd = tvm.nd.array(w_np, dev) + y_nd = tvm.nd.array(y_np, dev) y_np = y_np.astype(env.acc_dtype) for b in range(o): for i in range(m): @@ -376,7 +376,7 @@ def check_alu(tvm_op, np_op=None, use_imm=False, test_name=None): remote.upload(temp.relpath("load_act.o")) f = remote.load_module("load_act.o") # verify - ctx = remote.ext_dev(0) + dev = remote.ext_dev(0) a_np = np.random.randint(-16, 16, size=(m, n, env.BATCH, env.BLOCK_OUT)).astype(a.dtype) if use_imm: res_np = np_op(a_np, imm) if np_op else tvm_op(a_np, imm) @@ -386,8 +386,8 @@ def check_alu(tvm_op, np_op=None, use_imm=False, test_name=None): ) res_np = np_op(a_np, b_np) if np_op else tvm_op(a_np, b_np) res_np = res_np.astype(res.dtype) - a_nd = tvm.nd.array(a_np, ctx) - res_nd = tvm.nd.array(np.zeros((m, n, env.BATCH, env.BLOCK_OUT)).astype(res.dtype), ctx) + a_nd = tvm.nd.array(a_np, dev) + res_nd = tvm.nd.array(np.zeros((m, n, env.BATCH, env.BLOCK_OUT)).astype(res.dtype), dev) if env.TARGET in ["sim", 
"tsim"]: simulator.clear_stats() @@ -395,7 +395,7 @@ def check_alu(tvm_op, np_op=None, use_imm=False, test_name=None): if use_imm: f(a_nd, res_nd) else: - b_nd = tvm.nd.array(b_np, ctx) + b_nd = tvm.nd.array(b_np, dev) f(a_nd, b_nd, res_nd) np.testing.assert_equal(res_np, res_nd.asnumpy()) @@ -459,11 +459,11 @@ def _run(env, remote): remote.upload(temp.relpath("load_act.o")) f = remote.load_module("load_act.o") # verify - ctx = remote.ext_dev(0) + dev = remote.ext_dev(0) a_np = np.random.randint(-256, 256, size=(m, n, env.BATCH, env.BLOCK_OUT)).astype(a.dtype) res_np = np.clip(a_np, 0, (1 << (env.INP_WIDTH - 1)) - 1).astype(res.dtype) - a_nd = tvm.nd.array(a_np, ctx) - res_nd = tvm.nd.array(np.zeros((m, n, env.BATCH, env.BLOCK_OUT)).astype(res.dtype), ctx) + a_nd = tvm.nd.array(a_np, dev) + res_nd = tvm.nd.array(np.zeros((m, n, env.BATCH, env.BLOCK_OUT)).astype(res.dtype), dev) if env.TARGET in ["sim", "tsim"]: simulator.clear_stats() @@ -521,12 +521,12 @@ def _run(env, remote): remote.upload(temp.relpath("load_act.o")) f = remote.load_module("load_act.o") # verify - ctx = remote.ext_dev(0) + dev = remote.ext_dev(0) a_np = np.random.randint(-10, 10, size=(m, n, env.BATCH, env.BLOCK_OUT)).astype(a.dtype) res_np = np.right_shift((a_np + imm_shift), imm_scale) res_np = res_np.astype(res.dtype) - a_nd = tvm.nd.array(a_np, ctx) - res_nd = tvm.nd.array(np.zeros((m, n, env.BATCH, env.BLOCK_OUT)).astype(res.dtype), ctx) + a_nd = tvm.nd.array(a_np, dev) + res_nd = tvm.nd.array(np.zeros((m, n, env.BATCH, env.BLOCK_OUT)).astype(res.dtype), dev) if env.TARGET in ["sim", "tsim"]: simulator.clear_stats() @@ -547,9 +547,9 @@ def _run(env, remote): def test_runtime_array(): def _run(env, remote): n = 100 - ctx = remote.ext_dev(0) + dev = remote.ext_dev(0) x_np = np.random.randint(1, 10, size=(n, n, env.BATCH, env.BLOCK_OUT)).astype("int8") - x_nd = tvm.nd.array(x_np, ctx) + x_nd = tvm.nd.array(x_np, dev) np.testing.assert_equal(x_np, x_nd.asnumpy()) vta.testing.run(_run) diff --git a/vta/tutorials/autotvm/tune_relay_vta.py b/vta/tutorials/autotvm/tune_relay_vta.py index ed2671c75ae8..7deb7408479a 100644 --- a/vta/tutorials/autotvm/tune_relay_vta.py +++ b/vta/tutorials/autotvm/tune_relay_vta.py @@ -62,7 +62,7 @@ import tvm from tvm import te from tvm import rpc, autotvm, relay -from tvm.contrib import graph_runtime, utils, download +from tvm.contrib import graph_executor, utils, download from tvm.autotvm.measure.measure_methods import request_remote from tvm.autotvm.tuner import XGBTuner, GATuner, RandomTuner, GridSearchTuner @@ -431,9 +431,9 @@ def tune_and_evaluate(tuning_opt): remote.upload(temp.relpath("graphlib.tar")) lib = remote.load_module("graphlib.tar") - # Generate the graph runtime + # Generate the graph executor ctx = remote.ext_dev(0) if device == "vta" else remote.cpu(0) - m = graph_runtime.GraphModule(lib["default"](ctx)) + m = graph_executor.GraphModule(lib["default"](ctx)) # upload parameters to device image = tvm.nd.array((np.random.uniform(size=(1, 3, 224, 224))).astype("float32")) diff --git a/vta/tutorials/frontend/deploy_classification.py b/vta/tutorials/frontend/deploy_classification.py index 1bf4161a3340..f9db824eafa3 100644 --- a/vta/tutorials/frontend/deploy_classification.py +++ b/vta/tutorials/frontend/deploy_classification.py @@ -52,8 +52,8 @@ import tvm from tvm import te from tvm import rpc, autotvm, relay -from tvm.contrib import graph_runtime, utils, download -from tvm.contrib.debugger import debug_runtime +from tvm.contrib import graph_executor, utils, download +from 
tvm.contrib.debugger import debug_executor from tvm.relay import transform import vta @@ -135,7 +135,7 @@ ctx = remote.ext_dev(0) if device == "vta" else remote.cpu(0) ###################################################################### -# Build the inference graph runtime +# Build the inference graph executor # --------------------------------- # Grab vision model from Gluon model zoo and compile with Relay. # The compilation steps are: @@ -147,7 +147,7 @@ # 4. Perform constant folding to reduce number of operators (e.g. eliminate batch norm multiply). # 5. Perform relay build to object file. # 6. Load the object file onto remote (FPGA device). -# 7. Generate graph runtime, `m`. +# 7. Generate graph executor, `m`. # # Load pre-configured AutoTVM schedules @@ -209,8 +209,8 @@ remote.upload(temp.relpath("graphlib.tar")) lib = remote.load_module("graphlib.tar") - # Graph runtime - m = graph_runtime.GraphModule(lib["default"](ctx)) + # Graph executor + m = graph_executor.GraphModule(lib["default"](ctx)) ###################################################################### # Perform image classification inference diff --git a/vta/tutorials/frontend/legacy/deploy_detection.py b/vta/tutorials/frontend/legacy/deploy_detection.py index cbb320e98f13..696d0508b956 100644 --- a/vta/tutorials/frontend/legacy/deploy_detection.py +++ b/vta/tutorials/frontend/legacy/deploy_detection.py @@ -58,7 +58,7 @@ from tvm import rpc, autotvm, relay from tvm.relay.testing import yolo_detection, darknet from tvm.relay.testing.darknet import __darknetffi__ -from tvm.contrib import graph_runtime, utils +from tvm.contrib import graph_executor, utils from tvm.contrib.download import download_testdata from vta.testing import simulator from vta.top import graph_pack @@ -178,7 +178,7 @@ ctx = remote.ext_dev(0) if device == "vta" else remote.cpu(0) #################################### -# Build the inference graph runtime. +# Build the inference graph executor. # ---------------------------------- # Using Darknet library load downloaded vision model and compile with Relay. # The compilation steps are: @@ -190,7 +190,7 @@ # 4. Perform constant folding to reduce number of operators (e.g. eliminate batch norm multiply). # 5. Perform relay build to object file. # 6. Load the object file onto remote (FPGA device). -# 7. Generate graph runtime, `m`. +# 7. Generate graph executor, `m`. # # Load pre-configured AutoTVM schedules @@ -246,8 +246,8 @@ remote.upload(temp.relpath("graphlib.tar")) lib = remote.load_module("graphlib.tar") - # Graph runtime - m = graph_runtime.GraphModule(lib["default"](ctx)) + # Graph executor + m = graph_executor.GraphModule(lib["default"](ctx)) #################################### # Perform image detection inference. diff --git a/web/apps/node/example.js b/web/apps/node/example.js index f81a9c903e5d..cff76d8a067e 100644 --- a/web/apps/node/example.js +++ b/web/apps/node/example.js @@ -31,7 +31,8 @@ const wasmSource = fs.readFileSync(path.join(wasmPath, "tvmjs_runtime.wasm")); // the async version of the API. tvmjs.instantiate(wasmSource, new EmccWASI()) .then((tvm) => { + const log_info = tvm.getGlobalFunc("testing.log_info_str"); + log_info("hello world"); // List all the global functions from the runtime. 
console.log("Runtime functions using EmccWASI\n", tvm.listGlobalFuncNames()); }); - diff --git a/web/emcc/tvmjs_support.cc b/web/emcc/tvmjs_support.cc index b72caad1e3df..77ce6be66e63 100644 --- a/web/emcc/tvmjs_support.cc +++ b/web/emcc/tvmjs_support.cc @@ -24,12 +24,11 @@ * We do not need to link this file in standalone wasm. */ -// configurations for the dmlc log. -#define DMLC_LOG_CUSTOMIZE 0 -#define DMLC_LOG_STACK_TRACE 0 -#define DMLC_LOG_DEBUG 0 -#define DMLC_LOG_NODATE 1 -#define DMLC_LOG_FATAL_THROW 0 +// configurations for tvm logging +#define TVM_LOG_STACK_TRACE 0 +#define TVM_LOG_DEBUG 0 +#define TVM_LOG_CUSTOMIZE 1 +#define DMLC_USE_LOGGING_LIBRARY #include #include @@ -182,14 +181,14 @@ class AsyncLocalSession : public LocalSession { try { DLTensor local_from; local_from.data = local_from_bytes; - local_from.ctx = TVMContext{kDLCPU, 0}; + local_from.device = Device{kDLCPU, 0}; local_from.ndim = remote_to->ndim; local_from.shape = remote_to->shape; local_from.dtype = remote_to->dtype; local_from.strides = nullptr; local_from.byte_offset = 0; - this->GetDeviceAPI(remote_to->ctx)->CopyDataFromTo(&local_from, remote_to, nullptr); - this->AsyncStreamWait(remote_to->ctx, nullptr, on_complete); + this->GetDeviceAPI(remote_to->device)->CopyDataFromTo(&local_from, remote_to, nullptr); + this->AsyncStreamWait(remote_to->device, nullptr, on_complete); } catch (const std::runtime_error& e) { this->SendException(on_complete, e.what()); } @@ -200,27 +199,27 @@ class AsyncLocalSession : public LocalSession { try { DLTensor local_to; local_to.data = local_to_bytes; - local_to.ctx = TVMContext{kDLCPU, 0}; + local_to.device = Device{kDLCPU, 0}; local_to.ndim = remote_from->ndim; local_to.shape = remote_from->shape; local_to.dtype = remote_from->dtype; local_to.strides = nullptr; local_to.byte_offset = 0; - this->GetDeviceAPI(remote_from->ctx)->CopyDataFromTo(&local_to, remote_from, nullptr); - this->AsyncStreamWait(remote_from->ctx, nullptr, on_complete); + this->GetDeviceAPI(remote_from->device)->CopyDataFromTo(&local_to, remote_from, nullptr); + this->AsyncStreamWait(remote_from->device, nullptr, on_complete); } catch (const std::runtime_error& e) { this->SendException(on_complete, e.what()); } } - void AsyncStreamWait(TVMContext ctx, TVMStreamHandle stream, FAsyncCallback on_complete) final { - if (ctx.device_type == kDLCPU) { + void AsyncStreamWait(Device dev, TVMStreamHandle stream, FAsyncCallback on_complete) final { + if (dev.device_type == kDLCPU) { TVMValue value; int32_t tcode = kTVMNullptr; value.v_handle = nullptr; on_complete(RPCCode::kReturn, TVMArgs(&value, &tcode, 1)); } else { - CHECK(ctx.device_type == static_cast(kDLWebGPU)); + CHECK(dev.device_type == static_cast(kDLWebGPU)); if (async_wait_ == nullptr) { async_wait_ = tvm::runtime::Registry::Get("__async.wasm.WebGPUWaitForTasks"); } @@ -244,25 +243,25 @@ class AsyncLocalSession : public LocalSession { // time evaluator PackedFunc GetTimeEvaluator(Optional opt_mod, std::string name, int device_type, int device_id, int number, int repeat, int min_repeat_ms) { - TVMContext ctx; - ctx.device_type = static_cast(device_type); - ctx.device_id = device_id; + Device dev; + dev.device_type = static_cast(device_type); + dev.device_id = device_id; if (opt_mod.defined()) { Module m = opt_mod.value(); std::string tkey = m->type_key(); - return WrapWasmTimeEvaluator(m.GetFunction(name, false), ctx, number, repeat, min_repeat_ms); + return WrapWasmTimeEvaluator(m.GetFunction(name, false), dev, number, repeat, min_repeat_ms); } else { 
auto* pf = runtime::Registry::Get(name); CHECK(pf != nullptr) << "Cannot find " << name << " in the global function"; - return WrapWasmTimeEvaluator(*pf, ctx, number, repeat, min_repeat_ms); + return WrapWasmTimeEvaluator(*pf, dev, number, repeat, min_repeat_ms); } } // time evaluator - PackedFunc WrapWasmTimeEvaluator(PackedFunc pf, TVMContext ctx, int number, int repeat, + PackedFunc WrapWasmTimeEvaluator(PackedFunc pf, Device dev, int number, int repeat, int min_repeat_ms) { - auto ftimer = [pf, ctx, number, repeat, min_repeat_ms](TVMArgs args, TVMRetValue* rv) { + auto ftimer = [pf, dev, number, repeat, min_repeat_ms](TVMArgs args, TVMRetValue* rv) { // the function is a async function. PackedFunc on_complete = args[args.size() - 1]; // keep argument alive in finvoke so that they @@ -279,7 +278,7 @@ class AsyncLocalSession : public LocalSession { }; auto* time_exec = runtime::Registry::Get("__async.wasm.TimeExecution"); CHECK(time_exec != nullptr) << "Cannot find wasm.GetTimer in the global function"; - (*time_exec)(TypedPackedFunc(finvoke), ctx, number, repeat, min_repeat_ms, + (*time_exec)(TypedPackedFunc(finvoke), dev, number, repeat, min_repeat_ms, on_complete); }; return PackedFunc(ftimer); diff --git a/web/emcc/wasm_runtime.cc b/web/emcc/wasm_runtime.cc index 214c1883f874..bcad656678e5 100644 --- a/web/emcc/wasm_runtime.cc +++ b/web/emcc/wasm_runtime.cc @@ -22,24 +22,25 @@ * \brief TVM wasm runtime library pack. */ -// configurations for the dmlc log. -#define DMLC_LOG_CUSTOMIZE 0 -#define DMLC_LOG_STACK_TRACE 0 -#define DMLC_LOG_DEBUG 0 -#define DMLC_LOG_NODATE 1 -#define DMLC_LOG_FATAL_THROW 0 - -#include +// configurations for tvm logging +#define TVM_LOG_STACK_TRACE 0 +#define TVM_LOG_DEBUG 0 +#define TVM_LOG_CUSTOMIZE 1 +#define DMLC_USE_LOGGING_LIBRARY + #include +#include #include "src/runtime/c_runtime_api.cc" #include "src/runtime/cpu_device_api.cc" #include "src/runtime/file_utils.cc" -#include "src/runtime/graph/graph_runtime.cc" +#include "src/runtime/graph_executor/graph_executor.cc" #include "src/runtime/library_module.cc" +#include "src/runtime/logging.cc" #include "src/runtime/module.cc" #include "src/runtime/ndarray.cc" #include "src/runtime/object.cc" +#include "src/runtime/profiling.cc" #include "src/runtime/registry.cc" #include "src/runtime/rpc/rpc_channel.cc" #include "src/runtime/rpc/rpc_endpoint.cc" @@ -64,11 +65,31 @@ int TVMBackendParallelBarrier(int task_id, TVMParallelGroupEnv* penv) { return 0 // --- Environment PackedFuncs for testing --- namespace tvm { namespace runtime { +namespace detail { +// Override logging mechanism +void LogFatalImpl(const std::string& file, int lineno, const std::string& message) { + std::cerr << "[FATAL] " << file << ":" << lineno << ": " << message << std::endl; + abort(); +} + +void LogMessageImpl(const std::string& file, int lineno, const std::string& message) { + std::cout << "[INFO] " << file << ":" << lineno << ": " << message << std::endl; +} + +} // namespace detail TVM_REGISTER_GLOBAL("testing.echo").set_body([](TVMArgs args, TVMRetValue* ret) { *ret = args[0]; }); +TVM_REGISTER_GLOBAL("testing.log_info_str").set_body([](TVMArgs args, TVMRetValue* ret) { + LOG(INFO) << args[0].operator String(); +}); + +TVM_REGISTER_GLOBAL("testing.log_fatal_str").set_body([](TVMArgs args, TVMRetValue* ret) { + LOG(FATAL) << args[0].operator String(); +}); + TVM_REGISTER_GLOBAL("testing.add_one").set_body_typed([](int x) { return x + 1; }); TVM_REGISTER_GLOBAL("testing.wrap_callback").set_body([](TVMArgs args, TVMRetValue* ret) { 
diff --git a/web/emcc/webgpu_runtime.cc b/web/emcc/webgpu_runtime.cc index 62b87af01774..073c613bd2c2 100644 --- a/web/emcc/webgpu_runtime.cc +++ b/web/emcc/webgpu_runtime.cc @@ -22,12 +22,11 @@ * \brief WebGPU runtime based on the TVM JS. */ -// configurations for the dmlc log. -#define DMLC_LOG_CUSTOMIZE 0 -#define DMLC_LOG_STACK_TRACE 0 -#define DMLC_LOG_DEBUG 0 -#define DMLC_LOG_NODATE 1 -#define DMLC_LOG_FATAL_THROW 0 +// configurations for tvm logging +#define TVM_LOG_STACK_TRACE 0 +#define TVM_LOG_DEBUG 0 +#define TVM_LOG_CUSTOMIZE 1 +#define DMLC_USE_LOGGING_LIBRARY #include #include @@ -35,6 +34,9 @@ #include #include +#include +#include + #include "../../src/runtime/meta_data.h" #include "../../src/runtime/vulkan/vulkan_shader.h" #include "../../src/runtime/workspace_pool.h" @@ -67,35 +69,34 @@ class WebGPUDeviceAPI : public DeviceAPI { copy_within_gpu_ = getter("deviceCopyWithinGPU"); } - void SetDevice(TVMContext ctx) final {} - void GetAttr(TVMContext ctx, DeviceAttrKind kind, TVMRetValue* rv) final { + void SetDevice(Device dev) final {} + void GetAttr(Device dev, DeviceAttrKind kind, TVMRetValue* rv) final { if (kind == kExist) { *rv = 1; } } - void* AllocDataSpace(TVMContext ctx, size_t nbytes, size_t alignment, - DLDataType type_hint) final { + void* AllocDataSpace(Device dev, size_t nbytes, size_t alignment, DLDataType type_hint) final { double ptr_number = alloc_space_(nbytes); return reinterpret_cast(static_cast(ptr_number)); } - void FreeDataSpace(TVMContext ctx, void* ptr) final { return free_space_(ptr); } + void FreeDataSpace(Device dev, void* ptr) final { return free_space_(ptr); } protected: void CopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset, size_t size, - TVMContext ctx_from, TVMContext ctx_to, DLDataType type_hint, + Device dev_from, Device dev_to, DLDataType type_hint, TVMStreamHandle stream) final { - if (static_cast(ctx_from.device_type) == kDLWebGPU && - static_cast(ctx_to.device_type) == kDLWebGPU) { - CHECK_EQ(ctx_from.device_id, ctx_to.device_id); + if (static_cast(dev_from.device_type) == kDLWebGPU && + static_cast(dev_to.device_type) == kDLWebGPU) { + CHECK_EQ(dev_from.device_id, dev_to.device_id); copy_within_gpu_(const_cast(from), from_offset, to, to_offset, size); - } else if (static_cast(ctx_from.device_type) == kDLWebGPU && - ctx_to.device_type == kDLCPU) { + } else if (static_cast(dev_from.device_type) == kDLWebGPU && + dev_to.device_type == kDLCPU) { void* to_ptr = static_cast(to) + to_offset; copy_from_gpu_(const_cast(from), from_offset, to_ptr, size); - } else if (ctx_from.device_type == kDLCPU && - static_cast(ctx_to.device_type) == kDLWebGPU) { + } else if (dev_from.device_type == kDLCPU && + static_cast(dev_to.device_type) == kDLWebGPU) { void* from_ptr = static_cast(const_cast(from)) + from_offset; copy_to_gpu_(from_ptr, to, to_offset, size); } else { @@ -104,34 +105,34 @@ class WebGPUDeviceAPI : public DeviceAPI { } public: - TVMStreamHandle CreateStream(TVMContext ctx) final { + TVMStreamHandle CreateStream(Device dev) final { LOG(FATAL) << "Not implemented"; return nullptr; } - void FreeStream(TVMContext ctx, TVMStreamHandle stream) final { + void FreeStream(Device dev, TVMStreamHandle stream) final { LOG(FATAL) << "Not implemented"; return; } - void SyncStreamFromTo(TVMContext ctx, TVMStreamHandle event_src, TVMStreamHandle event_dst) { + void SyncStreamFromTo(Device dev, TVMStreamHandle event_src, TVMStreamHandle event_dst) { LOG(FATAL) << "Not implemented"; return; } - void StreamSync(TVMContext ctx, 
TVMStreamHandle stream) final { LOG(FATAL) << "Not implemented"; } + void StreamSync(Device dev, TVMStreamHandle stream) final { LOG(FATAL) << "Not implemented"; } - void SetStream(TVMContext ctx, TVMStreamHandle stream) final { + void SetStream(Device dev, TVMStreamHandle stream) final { LOG(FATAL) << "Not implemented"; return; } - void* AllocWorkspace(TVMContext ctx, size_t size, DLDataType type_hint) final { - return WebGPUThreadEntry::ThreadLocal()->pool.AllocWorkspace(ctx, size); + void* AllocWorkspace(Device dev, size_t size, DLDataType type_hint) final { + return WebGPUThreadEntry::ThreadLocal()->pool.AllocWorkspace(dev, size); } - void FreeWorkspace(TVMContext ctx, void* data) final { - WebGPUThreadEntry::ThreadLocal()->pool.FreeWorkspace(ctx, data); + void FreeWorkspace(Device dev, void* data) final { + WebGPUThreadEntry::ThreadLocal()->pool.FreeWorkspace(dev, data); } static WebGPUDeviceAPI* Global() { diff --git a/web/src/ctypes.ts b/web/src/ctypes.ts index 66c46fe7ed91..4a6d25ae6270 100644 --- a/web/src/ctypes.ts +++ b/web/src/ctypes.ts @@ -204,7 +204,7 @@ export const enum SizeOf { F64 = 8, TVMValue = 8, DLDataType = I32, - DLContext = I32 + I32, + DLDevice = I32 + I32, } /** @@ -217,7 +217,7 @@ export const enum ArgTypeCode { TVMOpaqueHandle = 3, Null = 4, TVMDataType = 5, - TVMContext = 6, + DLDevice = 6, TVMDLTensorHandle = 7, TVMObjectHandle = 8, TVMModuleHandle = 9, diff --git a/web/src/index.ts b/web/src/index.ts index 2d99fc9106cc..ed84ce7fbea1 100644 --- a/web/src/index.ts +++ b/web/src/index.ts @@ -18,7 +18,7 @@ */ export { - Scalar, DLContext, DLDataType, + Scalar, DLDevice, DLDataType, PackedFunc, Module, NDArray, Instance, instantiate } from "./runtime"; diff --git a/web/src/runtime.ts b/web/src/runtime.ts index 80e7d71f06ad..a76096ebba4d 100644 --- a/web/src/runtime.ts +++ b/web/src/runtime.ts @@ -194,8 +194,8 @@ const DeviceStrToEnum: Record = { /** * Represent a runtime context where a NDArray can reside. */ -export class DLContext { - /** The device type code of the context. */ +export class DLDevice { + /** The device type code of the device. */ deviceType: number; /** The device index. */ deviceId: number; @@ -219,7 +219,7 @@ export class DLContext { } /** - * Synchronize the context + * Synchronize the device */ async sync(): Promise { if (this.deviceType == DeviceStrToEnum.webgpu) { @@ -294,8 +294,8 @@ export class NDArray implements Disposable { dtype: string; /** Shape of the array. */ shape: Array; - /** Context of the array. */ - context: DLContext; + /** Device of the array. */ + device: DLDevice; /** Whether it is a temporary view that can become invalid after the call. 
*/ private isView: boolean; private byteOffset: number; @@ -319,7 +319,7 @@ export class NDArray implements Disposable { const arrayOffsetContext = arrayOffsetData + this.lib.sizeofPtr(); const arrayOffsetDevType = arrayOffsetContext; const arrayOffsetDevId = arrayOffsetContext + SizeOf.I32; - const arrayOffsetNdim = arrayOffsetContext + SizeOf.DLContext; + const arrayOffsetNdim = arrayOffsetContext + SizeOf.DLDevice; const arrayOffsetDtype = arrayOffsetNdim + SizeOf.I32; const arrayOffsetDtypeCode = arrayOffsetDtype; const arrayOffsetDtypeBits = arrayOffsetDtype + SizeOf.U8; @@ -344,10 +344,10 @@ export class NDArray implements Disposable { this.dlDataType = new DLDataType(code, bits, lanes); this.dtype = this.dlDataType.toString(); - // ctx + // device const deviceType = lib.memory.loadI32(this.dltensor + arrayOffsetDevType); const deviceId = lib.memory.loadI32(this.dltensor + arrayOffsetDevId); - this.context = new DLContext(deviceType, deviceId, lib); + this.device = new DLDevice(deviceType, deviceId, lib); // byte_offset this.byteOffset = lib.memory.loadI64(this.dltensor + arrayOffsetByteOffset); @@ -442,7 +442,7 @@ export class NDArray implements Disposable { * @returns The result array. */ toRawBytes(): Uint8Array { - if (this.context.deviceType != DeviceStrToEnum.cpu) { + if (this.device.deviceType != DeviceStrToEnum.cpu) { throw new Error("Can only synchronize copy for GPU array, use copyfrom instead."); } const size = this.shape.reduce((a, b) => { @@ -570,13 +570,13 @@ export class Module implements Disposable { } /** - * Graph runtime. + * Graph executor. * * This is a thin wrapper of the underlying TVM module. * you can also directly call set_input, run, and get_output * of underlying module functions */ -class GraphRuntime implements Disposable { +class GraphExecutor implements Disposable { module: Module; private packedSetInput: PackedFunc; private packedRun: PackedFunc; @@ -648,22 +648,22 @@ class GraphRuntime implements Disposable { /** * Benchmark stable execution of the graph(without data copy). - * @params ctx The context to sync during each run. + * @params dev The device to sync during each run. * @number The number of times to compute the average. * @repeat The number of times to repeat the run. */ - async benchmarkRuns(ctx: DLContext, number=10, repeat=4): Promise { + async benchmarkRuns(dev: DLDevice, number=10, repeat=4): Promise { // Skip first run as it can involve GPU warmup and module loading time. const perf = compact.getPeformance(); const results = []; this.run(); - await ctx.sync(); + await dev.sync(); for (let k = 0; k < repeat; ++k) { const tstart = perf.now(); for (let i = 0; i < number; ++i) { this.run(); } - await ctx.sync(); + await dev.sync(); const tend = perf.now(); results.push((tend - tstart) / number); } @@ -917,29 +917,29 @@ export class Instance implements Disposable { } /** - * Create a new {@link DLContext} + * Create a new {@link DLDevice} * @param deviceType The device type. * @param deviceId The device index. - * @returns The created context. + * @returns The created device. */ - context(deviceType: number | string, deviceId = 0): DLContext { - return new DLContext(deviceType, deviceId, this.lib); + device(deviceType: number | string, deviceId = 0): DLDevice { + return new DLDevice(deviceType, deviceId, this.lib); } /** - * Create a new cpu {@link DLContext} + * Create a new cpu {@link DLDevice} * @param deviceId The device index. 
*/ - cpu(deviceId = 0): DLContext { - return this.context("cpu", deviceId); + cpu(deviceId = 0): DLDevice { + return this.device("cpu", deviceId); } /** - * Create a new webgpu {@link DLContext} + * Create a new webgpu {@link DLDevice} * @param deviceId The device index. */ - webgpu(deviceId = 0): DLContext { - return this.context("webgpu", deviceId); + webgpu(deviceId = 0): DLDevice { + return this.device("webgpu", deviceId); } /** @@ -947,13 +947,13 @@ export class Instance implements Disposable { * * @param shape The shape of the array. * @param dtype The data type of the array. - * @param ctx The context of the ndarray. + * @param dev The device of the ndarray. * @returns The created ndarray. */ empty( shape: Array | number, dtype: string | DLDataType = "float32", - ctx: DLContext = this.context("cpu", 0) + dev: DLDevice = this.device("cpu", 0) ): NDArray { dtype = this.toDLDataType(dtype); shape = typeof shape == "number" ? [shape] : shape; @@ -975,8 +975,8 @@ export class Instance implements Disposable { dtype.code, dtype.bits, dtype.lanes, - ctx.deviceType, - ctx.deviceId, + dev.deviceType, + dev.deviceId, outPtr ) ); @@ -986,24 +986,20 @@ export class Instance implements Disposable { } /** - * Create a new graph runtime. + * Create a new graph executor. * - * @param graphJson The graph runtime json file. + * @param graphJson The graph executor json file. * @param lib The underlying library. - * @param ctx The execution context of the graph. + * @param dev The execution device of the graph. */ - createGraphRuntime( - graphJson: string, - lib: Module, - ctx: DLContext - ): GraphRuntime { - const fcreate = this.getGlobalFunc("tvm.graph_runtime.create"); + createGraphExecutor(graphJson: string, lib: Module, dev: DLDevice): GraphExecutor { + const fcreate = this.getGlobalFunc('tvm.graph_executor.create'); const module = fcreate( graphJson, lib, - this.scalar(ctx.deviceType, "int32"), - this.scalar(ctx.deviceId, "int32")) as Module; - return new GraphRuntime(module); + this.scalar(dev.deviceType, "int32"), + this.scalar(dev.deviceId, "int32")) as Module; + return new GraphExecutor(module); } @@ -1059,13 +1055,13 @@ export class Instance implements Disposable { // Helper function to time the finvoke const timeExecution = async ( finvoke: PackedFunc, - ctx: DLContext, + dev: DLDevice, nstep: number, repeat: number, minRepeatMs: number ): Promise => { finvoke(this.scalar(1, "int32")); - await ctx.sync(); + await dev.sync(); const result = []; let setupNumber: number = nstep; @@ -1079,7 +1075,7 @@ export class Instance implements Disposable { } const tstart: number = perf.now(); finvoke(this.scalar(setupNumber, "int32")); - await ctx.sync(); + await dev.sync(); const tend: number = perf.now(); durationMs = tend - tstart; @@ -1162,10 +1158,10 @@ export class Instance implements Disposable { stack.storePtr(valueOffset, val.value); stack.storeI32(codeOffset, ArgTypeCode.TVMOpaqueHandle); } - } else if (val instanceof DLContext) { + } else if (val instanceof DLDevice) { stack.storeI32(valueOffset, val.deviceType); stack.storeI32(valueOffset + SizeOf.I32, val.deviceType); - stack.storeI32(codeOffset, ArgTypeCode.TVMContext); + stack.storeI32(codeOffset, ArgTypeCode.DLDevice); } else if (tp == "number") { stack.storeF64(valueOffset, val); stack.storeI32(codeOffset, ArgTypeCode.Float); @@ -1328,10 +1324,10 @@ export class Instance implements Disposable { ); } case ArgTypeCode.Null: return undefined; - case ArgTypeCode.TVMContext: { + case ArgTypeCode.DLDevice: { const deviceType = 
this.memory.loadI32(rvaluePtr); const deviceId = this.memory.loadI32(rvaluePtr + SizeOf.I32); - return this.context(deviceType, deviceId); + return this.device(deviceType, deviceId); } case ArgTypeCode.TVMStr: { const ret = this.memory.loadCString(this.memory.loadPointer(rvaluePtr)); diff --git a/web/tests/node/test_ndarray.js b/web/tests/node/test_ndarray.js index eb0a8f446d4c..9e50557e2a13 100644 --- a/web/tests/node/test_ndarray.js +++ b/web/tests/node/test_ndarray.js @@ -36,7 +36,7 @@ function testArrayCopy(dtype, arrayType) { let data = [1, 2, 3, 4, 5, 6]; let a = tvm.empty([2, 3], dtype).copyFrom(data); - assert(a.context.toString() == "cpu(0)"); + assert(a.device.toString() == "cpu(0)"); assert(a.shape[0] == 2 && a.shape[1] == 3); let ret = a.toArray(); diff --git a/web/tests/node/test_packed_func.js b/web/tests/node/test_packed_func.js index 87b48df3d67a..6e0546f39df1 100644 --- a/web/tests/node/test_packed_func.js +++ b/web/tests/node/test_packed_func.js @@ -122,3 +122,9 @@ test("NDArrayCbArg", () => { fcheck(x); assert(use_count(x) == 1); }); + +test("Logging", () => { + const log_info = tvm.getGlobalFunc("testing.log_info_str"); + log_info("hello world"); + log_info.dispose(); +}); diff --git a/web/tests/python/webgpu_rpc_test.py b/web/tests/python/webgpu_rpc_test.py index 5efc85cf5e32..80e358b56498 100644 --- a/web/tests/python/webgpu_rpc_test.py +++ b/web/tests/python/webgpu_rpc_test.py @@ -65,10 +65,10 @@ def test_rpc(): def check(remote): # basic function checks. - ctx = remote.webgpu(0) + dev = remote.webgpu(0) adata = np.random.uniform(size=n).astype(A.dtype) - a = tvm.nd.array(adata, ctx) - b = tvm.nd.array(np.zeros(n, dtype=A.dtype), ctx) + a = tvm.nd.array(adata, dev) + b = tvm.nd.array(np.zeros(n, dtype=A.dtype), dev) np.testing.assert_equal(a.asnumpy(), adata) f1 = remote.system_lib() diff --git a/web/tests/python/websock_rpc_test.py b/web/tests/python/websock_rpc_test.py index 48603e86b7f7..9c1876a182e3 100644 --- a/web/tests/python/websock_rpc_test.py +++ b/web/tests/python/websock_rpc_test.py @@ -70,15 +70,15 @@ def check(remote): # run the generated library. f1 = remote.system_lib() - ctx = remote.cpu(0) - a = tvm.nd.array(np.random.uniform(size=1024).astype(A.dtype), ctx) - b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx) + dev = remote.cpu(0) + a = tvm.nd.array(np.random.uniform(size=1024).astype(A.dtype), dev) + b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), dev) # invoke the function addone = f1.get_function("addone") addone(a, b) # time evaluator - time_f = f1.time_evaluator("addone", ctx, number=100, repeat=10) + time_f = f1.time_evaluator("addone", dev, number=100, repeat=10) time_f(a, b) cost = time_f(a, b).mean print("%g secs/op" % cost) From 38042698e5b356a7a73ab1a81f6257d3051b4a5c Mon Sep 17 00:00:00 2001 From: Xiyou Zhou Date: Tue, 30 Mar 2021 10:10:42 -0700 Subject: [PATCH 64/69] Fix target.h format --- include/tvm/target/target.h | 39 ++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 20 deletions(-) diff --git a/include/tvm/target/target.h b/include/tvm/target/target.h index c2e1e295323f..78b68a601da2 100644 --- a/include/tvm/target/target.h +++ b/include/tvm/target/target.h @@ -29,7 +29,6 @@ #include #include #include -#include #include #include @@ -161,7 +160,7 @@ class Target : public ObjectRef { * \param target The current Target typed object target, with or without host field. * \param host The given Target typed object target host * \return The new Target object with the given target and host field of given host. 
- */ + */ static Target WithHost(const Target&, const Target&); private: @@ -181,28 +180,28 @@ TVM_DLL void ExitWithScope(); }; /*! - * \brief Check and update host field of the given legacy target and target host pair. - * Note that this function is for legacy target api compatibility issue only, not - * recommended for other use. - * \param target The pointer to a Target typed object with host field to be updated - * \param host The pointer to a Target typed object for target host to be updated -*/ + * \brief Check and update host field of the given legacy target and target host pair. + * Note that this function is for legacy target api compatibility issue only, not + * recommended for other use. + * \param target The pointer to a Target typed object with host field to be updated + * \param host The pointer to a Target typed object for target host to be updated + */ void CheckAndUpdateHostConsistency(Target*, Target*); /*! - * \brief Check and update host field of the given legacy heterogeneous targets and - * target host.Note that this function is for legacy target api compatibility issue only, - * not recommended for other use. - * \param target The pointer to a Map objects with values being Target objects - * \param host The Target typed object for target host to be updated -*/ + * \brief Check and update host field of the given legacy heterogeneous targets and + * target host. Note that this function is for legacy target api compatibility issue only, + * not recommended for other use. + * \param target The pointer to a Map object with values being Target objects + * \param host The Target typed object for target host to be updated + */ void CheckAndUpdateHostConsistency(Map*, Target*); /*! - * \brief Check and update host field of the given legacy heterogeneous targets and - * target host.Note that this function is for legacy target api compatibility issue only, - * not recommended for other use. - * \param target The pointer to a Map objects with keys being Target objects - * \param host The Target typed object for target host to be updated -*/ + * \brief Check and update host field of the given legacy heterogeneous targets and + * target host. Note that this function is for legacy target api compatibility issue only, + * not recommended for other use. 
+ * \param target The pointer to a Map object with keys being Target objects + * \param host The Target typed object for target host to be updated + */ void CheckAndUpdateHostConsistency(Map*, Target*); } // namespace tvm #endif // TVM_TARGET_TARGET_H_ From dd3787cb19fccd23e1760ca3db9699a40e9aa388 Mon Sep 17 00:00:00 2001 From: Xiyou Zhou Date: Tue, 30 Mar 2021 10:28:31 -0700 Subject: [PATCH 65/69] Remove redundant import --- python/tvm/autotvm/task/task.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/tvm/autotvm/task/task.py b/python/tvm/autotvm/task/task.py index 720990dfc70f..876b29c30300 100644 --- a/python/tvm/autotvm/task/task.py +++ b/python/tvm/autotvm/task/task.py @@ -27,7 +27,6 @@ from tvm.ir import container from tvm.target import Target from tvm.te import placeholder, tensor -from tvm.target import Target from tvm.tir import expr From 6e114ca2b6f9f8f6953be83712808e1f95d5f480 Mon Sep 17 00:00:00 2001 From: Xiyou Zhou Date: Tue, 30 Mar 2021 15:54:09 -0700 Subject: [PATCH 66/69] Fix function name --- python/tvm/auto_scheduler/measure.py | 4 ++-- python/tvm/auto_scheduler/relay_integration.py | 2 +- python/tvm/auto_scheduler/search_task.py | 6 +++--- python/tvm/autotvm/graph_tuner/base_graph_tuner.py | 4 +--- python/tvm/autotvm/measure/measure_methods.py | 2 +- python/tvm/autotvm/task/relay_integration.py | 4 ++-- python/tvm/autotvm/task/task.py | 6 +++--- python/tvm/contrib/peak.py | 10 +++++----- python/tvm/driver/build_module.py | 6 +++--- python/tvm/driver/tvmc/autotuner.py | 6 +++--- python/tvm/driver/tvmc/compiler.py | 2 +- python/tvm/exec/measure_peak.py | 2 +- python/tvm/relay/backend/_backend.py | 2 +- python/tvm/relay/backend/vm.py | 6 +++--- python/tvm/relay/build_module.py | 6 +++--- python/tvm/target/target.py | 4 ++-- 16 files changed, 35 insertions(+), 37 deletions(-) diff --git a/python/tvm/auto_scheduler/measure.py b/python/tvm/auto_scheduler/measure.py index 3031fce146ad..f23cab48d904 100644 --- a/python/tvm/auto_scheduler/measure.py +++ b/python/tvm/auto_scheduler/measure.py @@ -223,7 +223,7 @@ def recover_measure_input(inp, rebuild_state=False): from .search_task import SearchTask # lazily import to avoid recursive dependency task = inp.task - task.target, task.target_host = Target.check_and_update_host_consistency( + task.target, task.target_host = Target.check_and_update_host_consist( task.target, task.target_host ) new_task = SearchTask( @@ -606,7 +606,7 @@ def _timed_func(inp_serialized, build_func, verbose): tic = time.time() inp = MeasureInput.deserialize(inp_serialized) task = inp.task - task.target, task.target_host = Target.check_and_update_host_consistency( + task.target, task.target_host = Target.check_and_update_host_consist( task.target, task.target_host ) diff --git a/python/tvm/auto_scheduler/relay_integration.py b/python/tvm/auto_scheduler/relay_integration.py index 5bd910802926..a791f29e2616 100644 --- a/python/tvm/auto_scheduler/relay_integration.py +++ b/python/tvm/auto_scheduler/relay_integration.py @@ -110,7 +110,7 @@ def extract_tasks( """ # pylint: disable=import-outside-toplevel - target, target_host = Target.check_and_update_host_consistency(target, target_host) + target, target_host = Target.check_and_update_host_consist(target, target_host) # Run the compiler to collect all TOPI calls during compilation. 
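# The renamed helper above follows one convention everywhere in this series: merge the legacy
# (target, target_host) pair into a single Target whose `host` field carries the host, then read
# the host back from the merged target so both names stay consistent. A minimal sketch of that
# behavior, assuming a CUDA target with an LLVM host (illustrative values, not part of the patch):
#
#     target, target_host = Target.check_and_update_host_consist("cuda", "llvm")
#     assert target.kind.name == "cuda"
#     assert target.host.kind.name == "llvm"  # target_host is now read from the merged target.host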
env = TracingEnvironment( diff --git a/python/tvm/auto_scheduler/search_task.py b/python/tvm/auto_scheduler/search_task.py index dd7e89c69184..0c1c23130b8d 100644 --- a/python/tvm/auto_scheduler/search_task.py +++ b/python/tvm/auto_scheduler/search_task.py @@ -394,7 +394,7 @@ def __init__( assert target is not None, "Must specify a target." - target, target_host = Target.check_and_update_host_consistency(target, target_host) + target, target_host = Target.check_and_update_host_consist(target, target_host) if layout_rewrite_option is None: layout_rewrite_option = LayoutRewriteOption.get_target_default(target) @@ -504,7 +504,7 @@ def print_best(self, log_file, print_mode="schedule"): raise ValueError("Invalid print_mode: %s" % print_mode) def __getstate__(self): - self.target, self.target_host = Target.check_and_update_host_consistency( + self.target, self.target_host = Target.check_and_update_host_consist( self.target, self.target_host ) return { @@ -531,7 +531,7 @@ def __setstate__(self, state): if workload[0] not in WORKLOAD_FUNC_REGISTRY: register_workload_tensors(state["workload_key"], state["compute_dag"].tensors) - state["target"], state["target_host"] = Target.check_and_update_host_consistency( + state["target"], state["target_host"] = Target.check_and_update_host_consist( state["target"], state["target_host"] ) self.__init_handle_by_constructor__( diff --git a/python/tvm/autotvm/graph_tuner/base_graph_tuner.py b/python/tvm/autotvm/graph_tuner/base_graph_tuner.py index 6605ba733aac..b307130780a7 100644 --- a/python/tvm/autotvm/graph_tuner/base_graph_tuner.py +++ b/python/tvm/autotvm/graph_tuner/base_graph_tuner.py @@ -440,9 +440,7 @@ def benchmark_layout_transform( This might bring performance loss comparing to benchmarking layout transformation. 
""" self._logger.info("Start to benchmark layout transformation...") - self._target, target_host = Target.check_and_update_host_consistency( - self._target, target_host - ) + self._target, target_host = Target.check_and_update_host_consist(self._target, target_host) if layout_records is None and infer_layout: raise RuntimeError("Requires some records to infer layout transformation time.") diff --git a/python/tvm/autotvm/measure/measure_methods.py b/python/tvm/autotvm/measure/measure_methods.py index aaa08ce91af9..d212e5f26f20 100644 --- a/python/tvm/autotvm/measure/measure_methods.py +++ b/python/tvm/autotvm/measure/measure_methods.py @@ -419,7 +419,7 @@ def set_task(self, task): def _build_func_common(measure_input, check_gpu=None, cuda_arch=None, build_option=None): """Common part for building a configuration""" target, task, config = measure_input - target, task.target_host = Target.check_and_update_host_consistency(target, task.target_host) + target, task.target_host = Target.check_and_update_host_consist(target, task.target_host) with target: s, args = task.instantiate(config) diff --git a/python/tvm/autotvm/task/relay_integration.py b/python/tvm/autotvm/task/relay_integration.py index 0e7a08c9f808..9117ce398d49 100644 --- a/python/tvm/autotvm/task/relay_integration.py +++ b/python/tvm/autotvm/task/relay_integration.py @@ -90,7 +90,7 @@ def extract_from_program(mod, params, target, target_host=None, ops=None): task: Array of autotvm.task.Task collected tasks """ - target, target_host = Target.check_and_update_host_consistency(target, target_host) + target, target_host = Target.check_and_update_host_consist(target, target_host) return extract_from_multiple_program([mod], [params], target, ops=ops) @@ -125,7 +125,7 @@ def extract_from_multiple_program(mods, params, target, target_host=None, ops=No env = TaskExtractEnv.get() # merge target and target host - target, target_host = Target.check_and_update_host_consistency(target, target_host) + target, target_host = Target.check_and_update_host_consist(target, target_host) # run compiler to collect all TOPI calls during compilation env.reset(ops) diff --git a/python/tvm/autotvm/task/task.py b/python/tvm/autotvm/task/task.py index 876b29c30300..0d60ca929d7b 100644 --- a/python/tvm/autotvm/task/task.py +++ b/python/tvm/autotvm/task/task.py @@ -175,7 +175,7 @@ def __getstate__(self): # and restore the function by name when unpickling it. 
         import cloudpickle  # pylint: disable=import-outside-toplevel
 
-        self.target, self.target_host = Target.check_and_update_host_consistency(
+        self.target, self.target_host = Target.check_and_update_host_consist(
             self.target, self.target_host
         )
         return {
@@ -198,7 +198,7 @@ def __setstate__(self, state):
         self.config_space = state["config_space"]
         self.func = cloudpickle.loads(state["func"])
         self.flop = state["flop"]
-        self.target, self.target_host = Target.check_and_update_host_consistency(
+        self.target, self.target_host = Target.check_and_update_host_consist(
             state["target"], state["target_host"]
         )
 
@@ -452,7 +452,7 @@ def create(task_name, args, target, target_host=None):
 
     if isinstance(target, str):
         target = Target(target)
-    target, target_host = Target.check_and_update_host_consistency(target, target_host)
+    target, target_host = Target.check_and_update_host_consist(target, target_host)
 
     # init config space
     ret.config_space = ConfigSpace()
diff --git a/python/tvm/contrib/peak.py b/python/tvm/contrib/peak.py
index ea9795254a09..8e8e158b0740 100644
--- a/python/tvm/contrib/peak.py
+++ b/python/tvm/contrib/peak.py
@@ -87,7 +87,7 @@ def measure_bandwidth_sum(
     GBPS: float
         gigabyte per second
     """
-    target, target_host = Target.check_and_update_host_consistency(target, target_host)
+    target, target_host = Target.check_and_update_host_consist(target, target_host)
 
     n, m = total_item, item_per_thread
     n //= lanes
@@ -154,7 +154,7 @@ def measure_bandwidth_all_types(
     result: list
         a list of (type_name, GBPS) pairs
     """
-    target, target_host = Target.check_and_update_host_consistency(target, target_host)
+    target, target_host = Target.check_and_update_host_consist(target, target_host)
     max_threads = target.max_num_threads
 
     result = []
@@ -225,7 +225,7 @@ def measure_compute_mad(
     GOPS: float
         giga operation per second
    """
-    target, target_host = Target.check_and_update_host_consistency(target, target_host)
+    target, target_host = Target.check_and_update_host_consist(target, target_host)
 
     n = total_item
 
@@ -318,7 +318,7 @@ def measure_compute_all_types(
     result: list
         a list of (type_name, GFLOPS/GIOPS) pairs
     """
-    target, target_host = Target.check_and_update_host_consistency(target, target_host)
+    target, target_host = Target.check_and_update_host_consist(target, target_host)
 
     result = []
     for base_type in ["float", "int"]:
@@ -364,7 +364,7 @@ def measure_peak_all(target, target_host, host, port):
     port: int
     """
 
-    target, target_host = Target.check_and_update_host_consistency(target, target_host)
+    target, target_host = Target.check_and_update_host_consist(target, target_host)
     remote = rpc.connect(host, port)
     n_times = 20
diff --git a/python/tvm/driver/build_module.py b/python/tvm/driver/build_module.py
index 684dba263648..9f56a9b82a7e 100644
--- a/python/tvm/driver/build_module.py
+++ b/python/tvm/driver/build_module.py
@@ -231,7 +231,7 @@ def _build_for_device(input_mod, target, target_host):
     mdev : tvm.module
         A module that contains device code.
""" - target, target_host = Target.check_and_update_host_consistency(target, target_host) + target, target_host = Target.check_and_update_host_consist(target, target_host) device_type = ndarray.device(target.kind.name, 0).device_type mod_mixed = input_mod @@ -398,7 +398,7 @@ def build(inputs, args=None, target=None, target_host=None, name="default_functi if not isinstance(mod, tvm.IRModule): raise ValueError("inputs must be Schedule, IRModule," "or dict of str to IRModule.") - target_input_mod, target_host = Target.check_and_update_host_consistency( + target_input_mod, target_host = Target.check_and_update_host_consist( target_input_mod, target_host ) @@ -412,7 +412,7 @@ def build(inputs, args=None, target=None, target_host=None, name="default_functi if not target_host: target_host = "llvm" if tvm.runtime.enabled("llvm") else "stackvm" - target_input_mod, target_host = Target.check_and_update_host_consistency( + target_input_mod, target_host = Target.check_and_update_host_consist( target_input_mod, target_host ) diff --git a/python/tvm/driver/tvmc/autotuner.py b/python/tvm/driver/tvmc/autotuner.py index df3c6728596c..99ed11789364 100644 --- a/python/tvm/driver/tvmc/autotuner.py +++ b/python/tvm/driver/tvmc/autotuner.py @@ -244,7 +244,7 @@ def drive_tune(args): target, extra_targets = common.target_from_cli(args.target) target_host = args.target_host - target, target_host = Target.check_and_update_host_consistency(target, target_host) + target, target_host = Target.check_and_update_host_consist(target, target_host) mod, params = frontends.load_model(args.FILE, args.model_format, shape_dict=args.input_shapes) for codegen_from_cli in extra_targets: @@ -363,7 +363,7 @@ def autotvm_get_tuning_tasks(mod, params, target, target_host=None, alter_layout tasks : list of autotvm.Tasks list of tasks to be tuned """ - target, target_host = Target.check_and_update_host_consistency(target, target_host) + target, target_host = Target.check_and_update_host_consist(target, target_host) if alter_layout: mod = common.convert_graph_layout(mod, alter_layout) @@ -412,7 +412,7 @@ def autoscheduler_get_tuning_tasks( weights : List[int] the weight (i.e. 
the number of appearance) of extracted tasks """ - target, target_host = Target.check_and_update_host_consistency(target, target_host) + target, target_host = Target.check_and_update_host_consist(target, target_host) if alter_layout: mod = common.convert_graph_layout(mod, alter_layout) diff --git a/python/tvm/driver/tvmc/compiler.py b/python/tvm/driver/tvmc/compiler.py index b8e88532efec..f484290bb5d0 100644 --- a/python/tvm/driver/tvmc/compiler.py +++ b/python/tvm/driver/tvmc/compiler.py @@ -193,7 +193,7 @@ def compile_model( tvm_target, extra_targets = common.target_from_cli(target) target_host = tvm_target if not target_host else target_host - tvm_target, target_host = Target.check_and_update_host_consistency(tvm_target, target_host) + tvm_target, target_host = Target.check_and_update_host_consist(tvm_target, target_host) for codegen_from_cli in extra_targets: codegen = composite_target.get_codegen_by_target(codegen_from_cli["name"]) diff --git a/python/tvm/exec/measure_peak.py b/python/tvm/exec/measure_peak.py index 207e7875da16..d8840fadd802 100644 --- a/python/tvm/exec/measure_peak.py +++ b/python/tvm/exec/measure_peak.py @@ -44,7 +44,7 @@ def main(): args = parser.parse_args() logging.basicConfig(level=logging.INFO) - args.target, args.target_host = Target.check_and_update_host_consistency( + args.target, args.target_host = Target.check_and_update_host_consist( args.target, args.target_host ) measure_peak_all(args.target, args.target_host, args.rpc_host, args.rpc_port) diff --git a/python/tvm/relay/backend/_backend.py b/python/tvm/relay/backend/_backend.py index 1d6d0e0ffb8b..6df83559645d 100644 --- a/python/tvm/relay/backend/_backend.py +++ b/python/tvm/relay/backend/_backend.py @@ -80,7 +80,7 @@ def build(mod, target, target_host=None): The runtime module. """ target_host = None if target_host == "" else target_host - target, target_host = Target.check_and_update_host_consistency(target, target_host) + target, target_host = Target.check_and_update_host_consist(target, target_host) return tvm.driver.build(mod, target=target) diff --git a/python/tvm/relay/backend/vm.py b/python/tvm/relay/backend/vm.py index a86b29250052..0e65355a4da2 100644 --- a/python/tvm/relay/backend/vm.py +++ b/python/tvm/relay/backend/vm.py @@ -63,7 +63,7 @@ def compile(mod, target=None, target_host=None, params=None): exec : tvm.runtime.vm.Executable The VM executable that contains both library code and bytecode. 
""" - target, target_host = Target.check_and_update_host_consistency( + target, target_host = Target.check_and_update_host_consist( target, target_host, target_is_dict_key=False ) compiler = VMCompiler() @@ -134,7 +134,7 @@ def lower(self, mod, target=None, target_host=None): """ target = self._update_target(target) target_host = self._update_target_host(target, target_host) - target, target_host = Target.check_and_update_host_consistency( + target, target_host = Target.check_and_update_host_consist( target, target_host, target_is_dict_key=False ) @@ -175,7 +175,7 @@ def optimize(self, mod, target=None, target_host=None, params=None): """ target = self._update_target(target) target_host = self._update_target_host(target, target_host) - target, target_host = Target.check_and_update_host_consistency( + target, target_host = Target.check_and_update_host_consist( target, target_host, target_is_dict_key=False ) diff --git a/python/tvm/relay/build_module.py b/python/tvm/relay/build_module.py index 174e89bf69a4..ed59ad9bdc8f 100644 --- a/python/tvm/relay/build_module.py +++ b/python/tvm/relay/build_module.py @@ -115,7 +115,7 @@ def build(self, mod, target=None, target_host=None, params=None): The runtime factory for the TVM graph executor. """ target = _update_target(target) - target, target_host = Target.check_and_update_host_consistency( + target, target_host = Target.check_and_update_host_consist( target, target_host, target_is_dict_key=False ) @@ -209,7 +209,7 @@ def _build_module_no_factory(mod, target=None, target_host=None, params=None, mo This wrapper is suitable to be used from other programming languages as the runtime::Module can be freely passed between language boundaries. """ - target, target_host = Target.check_and_update_host_consistency(target, target_host) + target, target_host = Target.check_and_update_host_consist(target, target_host) return build(mod, target, params=params, mod_name=mod_name).module @@ -274,7 +274,7 @@ def build(ir_mod, target=None, target_host=None, params=None, mod_name="default" elif target_host: raise ValueError("target host must be the type of str, " + "tvm.target.Target, or None") - target, target_host = Target.check_and_update_host_consistency( + target, target_host = Target.check_and_update_host_consist( target, target_host, target_is_dict_key=False ) diff --git a/python/tvm/target/target.py b/python/tvm/target/target.py index dfbcd281e439..6d0a0635221e 100644 --- a/python/tvm/target/target.py +++ b/python/tvm/target/target.py @@ -168,7 +168,7 @@ def list_kinds(): return list(_ffi_api.ListTargetKinds()) @staticmethod - def check_and_update_host_consistency(target, host=None, target_is_dict_key=True): + def check_and_update_host_consist(target, host=None, target_is_dict_key=True): """A helper function that merges a legacy "target, target_host" pair, then returns the merged target and its host field. The function is for legacy target and target host pair only, and should not be used in the new target system. 
@@ -188,7 +188,7 @@ def check_and_update_host_consistency(target, host=None, target_is_dict_key=True
                 if not target_is_dict_key:
                     tgt, mod = mod, tgt
                 if isinstance(tgt, (dict, str, Target)):
-                    tgt, host = Target.check_and_update_host_consistency(tgt, host)
+                    tgt, host = Target.check_and_update_host_consist(tgt, host)
                 if not target_is_dict_key:
                     tgt, mod = mod, tgt
                 new_target[tgt] = mod

From adec87fbd3558b75ae71f78afaf9a43663370dcb Mon Sep 17 00:00:00 2001
From: Xiyou Zhou
Date: Tue, 30 Mar 2021 16:15:20 -0700
Subject: [PATCH 67/69] Add parameter name

---
 include/tvm/target/target.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/include/tvm/target/target.h b/include/tvm/target/target.h
index 78b68a601da2..9c1fe55749e4 100644
--- a/include/tvm/target/target.h
+++ b/include/tvm/target/target.h
@@ -161,7 +161,7 @@ class Target : public ObjectRef {
   * \param host The given Target typed object target host
   * \return The new Target object with the given target and host field of given host.
   */
-  static Target WithHost(const Target&, const Target&);
+  static Target WithHost(const Target& target, const Target& host);
 
  private:
   // enable with syntax.
@@ -186,7 +186,7 @@ class Target : public ObjectRef {
  * \param target The pointer to a Target typed object with host field to be updated
  * \param host The pointer to a Target typed object for target host to be updated
  */
-void CheckAndUpdateHostConsistency(Target*, Target*);
+void CheckAndUpdateHostConsistency(Target* target, Target* host);
 /*!
  * \brief Check and update host field of the given legacy heterogeneous targets and
  *  target host. Note that this function is for legacy target api compatibility issue only,
  * \param target The pointer to a Map objects with values being Target objects
  * \param host The Target typed object for target host to be updated
  */
-void CheckAndUpdateHostConsistency(Map<String, Target>*, Target*);
+void CheckAndUpdateHostConsistency(Map<String, Target>* target, Target* host);
 /*!
  * \brief Check and update host field of the given legacy heterogeneous targets and
  *  target host. Note that this function is for legacy target api compatibility issue only,
  * \param target The pointer to a Map objects with keys being Target objects
  * \param host The Target typed object for target host to be updated
  */
-void CheckAndUpdateHostConsistency(Map<Target, IRModule>*, Target*);
+void CheckAndUpdateHostConsistency(Map<Target, IRModule>* target, Target* host);
 }  // namespace tvm
 #endif  // TVM_TARGET_TARGET_H_

From b71bd1ae3a88d10ed89624c1864bcb88207081d6 Mon Sep 17 00:00:00 2001
From: Xiyou Zhou
Date: Tue, 30 Mar 2021 22:51:10 -0700
Subject: [PATCH 68/69] Fix new code bug

---
 tests/python/relay/test_vm.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/tests/python/relay/test_vm.py b/tests/python/relay/test_vm.py
index c1bdc3ff9fd0..7ca06c5c97e0 100644
--- a/tests/python/relay/test_vm.py
+++ b/tests/python/relay/test_vm.py
@@ -808,8 +808,7 @@ def test_vm_rpc():
     upload it to a remote machine using RPC and then execute it
     on the other machine.
     """
-    target = "llvm"
-    target_host = "llvm"
+    target = tvm.target.Target("llvm --host=llvm")
 
     # Build a IRModule.
     x = relay.var("x", shape=(10, 1))
     f = relay.Function([x], x + x)
     mod = IRModule.from_expr(f)
 
     # Compile to VMExecutable.
-    vm_exec = vm.compile(mod, target=target, target_host=target_host)
+    vm_exec = vm.compile(mod, target=target)
 
     # Export to Disk
     temp = utils.tempdir()

From 3a8080eeae17d200b2459a97b31a98c4dbca29d0 Mon Sep 17 00:00:00 2001
From: Xiyou Zhou
Date: Wed, 31 Mar 2021 00:07:39 -0700
Subject: [PATCH 69/69] Fix bug in lowering

---
 python/tvm/relay/backend/vm.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/python/tvm/relay/backend/vm.py b/python/tvm/relay/backend/vm.py
index 0e65355a4da2..0b6d1372d050 100644
--- a/python/tvm/relay/backend/vm.py
+++ b/python/tvm/relay/backend/vm.py
@@ -218,6 +218,9 @@ def _update_target_host(self, target, target_host):
         """Update target host."""
         target_host = None if target_host == "" else target_host
         if not target_host:
+            for _, tgt in target.items():
+                if tgt.host is not None:
+                    return tgt.host
             for device_type, tgt in target.items():
                 if device_type.value == tvm.nd.cpu(0).device_type:
                     target_host = tgt
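
For a concrete picture of the API this series converges on, a minimal sketch of the two calling styles that coexist after these patches follows. It is illustrative only and not part of any patch above: it assumes a TVM build that contains this series, and the asserted behavior follows from the docstring of check_and_update_host_consist shown earlier.

    import tvm
    from tvm.target import Target

    # Legacy two-argument style: merge a (target, target_host) pair and
    # get back the merged target together with its host field.
    target, target_host = Target.check_and_update_host_consist("cuda", "llvm")
    assert target.host.kind.name == "llvm"
    assert target_host.kind.name == "llvm"

    # Single-object style: the host rides along on the target itself, so
    # APIs such as vm.compile(mod, target=target) no longer need a
    # separate target_host argument.
    tgt = tvm.target.Target("llvm --host=llvm")
    assert tgt.host.kind.name == "llvm"

Keeping the host on the target object is also what lets patch 69 recover an explicit host inside _update_target_host before falling back to the device-type heuristic.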