From d1399f378e37e9e4d9bfadc5cdae57fdc5bcaf7f Mon Sep 17 00:00:00 2001
From: Xuxue1 <1915998056@qq.com>
Date: Tue, 29 Dec 2020 22:08:17 +0800
Subject: [PATCH 001/357] [Torch] Support hard_swish op (#7174)

* imp_hardswish

* format

* fix

* hard_swish_inplace test case
---
 python/tvm/relay/frontend/pytorch.py          | 11 +++++++++++
 tests/python/frontend/pytorch/test_forward.py | 14 +++++++++++---
 2 files changed, 22 insertions(+), 3 deletions(-)

diff --git a/python/tvm/relay/frontend/pytorch.py b/python/tvm/relay/frontend/pytorch.py
index 94ee9282e4fa..8e69739544e5 100644
--- a/python/tvm/relay/frontend/pytorch.py
+++ b/python/tvm/relay/frontend/pytorch.py
@@ -790,6 +790,15 @@ def log_sigmoid(self, inputs, input_types):
         data = inputs[0]
         return _op.log(_op.tensor.sigmoid(data))
 
+    def hard_swish(self, inputs, input_types):
+        data = inputs[0]
+        dtype = input_types[0]
+
+        def _relu6(input_tensor):
+            return _op.tensor.clip(input_tensor, 0.0, 6.0)
+
+        return data * _relu6(data + _expr.const(3.0, dtype=dtype)) / _expr.const(6.0, dtype=dtype)
+
     def adaptive_avg_pool_2d(self, inputs, input_types):
         data = inputs[0]
         output_size = inputs[1]
@@ -2266,6 +2275,8 @@ def create_convert_map(self):
             "aten::bincount": self.bincount,
             "aten::scatter_add": self.scatter_add,
             "aten::__not__": self.logical_not,
+            "aten::hardswish_": self.hard_swish,
+            "aten::hardswish": self.hard_swish,
         }
 
     def update_convert_map(self, custom_map):
diff --git a/tests/python/frontend/pytorch/test_forward.py b/tests/python/frontend/pytorch/test_forward.py
index 04f08b903bf1..f76c697a2c81 100644
--- a/tests/python/frontend/pytorch/test_forward.py
+++ b/tests/python/frontend/pytorch/test_forward.py
@@ -181,14 +181,14 @@ def verify_model(model_name, input_data=[], custom_convert_map={}, rtol=1e-5, at
         baseline_input = [inp.cuda() for inp in baseline_input]
 
     with torch.no_grad():
-        baseline_outputs = baseline_model(*baseline_input)
+        baseline_outputs = baseline_model(*[input.clone() for input in baseline_input])
 
     if isinstance(baseline_outputs, tuple):
         baseline_outputs = tuple(out.cpu().numpy() for out in baseline_outputs)
     else:
         baseline_outputs = (baseline_outputs.cpu().numpy(),)
 
-    trace = torch.jit.trace(baseline_model, baseline_input)
+    trace = torch.jit.trace(baseline_model, [input.clone() for input in baseline_input])
     if isinstance(baseline_model, torch.nn.Module):
         trace = trace.float().eval()
 
@@ -200,7 +200,7 @@ def verify_model(model_name, input_data=[], custom_convert_map={}, rtol=1e-5, at
     input_names = ["input{}".format(idx) for idx, inp in enumerate(baseline_input)]
     input_shapes = list(zip(input_names, [inp.shape for inp in baseline_input]))
     mod, params = relay.frontend.from_pytorch(trace, input_shapes, custom_convert_map)
-    compiled_input = dict(zip(input_names, [inp.cpu().numpy() for inp in baseline_input]))
+    compiled_input = dict(zip(input_names, [inp.clone().cpu().numpy() for inp in baseline_input]))
 
     with tvm.transform.PassContext(opt_level=3):
         for target, ctx in tvm.testing.enabled_targets():
@@ -3437,6 +3437,13 @@ def test_fn(x, weights=None):
     verify_trace_model(test_fn, [inp, weights], targets)
 
 
+def test_hard_swish():
+    examples = [torch.rand(8).float(), torch.rand(8, 10).float(), torch.rand(1, 1, 10).float()]
+    for input in examples:
+        verify_model(torch.nn.Hardswish().eval(), input_data=input)
+        verify_model(torch.nn.Hardswish(inplace=True).eval(), input_data=input)
+
+
 if __name__ == "__main__":
     # some structural tests
     test_forward_traced_function()
@@ -3603,3 +3610,4 @@ def test_fn(x, weights=None):
 
     # Test convert torch script(jit) with specific inputs' types
     test_convert_torch_script_with_input_types()
+    test_hard_swish()

From 59699a7b35c8458f3e45a0cc84240631363a485c Mon Sep 17 00:00:00 2001
From: Animesh Jain <anijain@umich.edu>
Date: Tue, 29 Dec 2020 14:01:24 -0800
Subject: [PATCH 002/357] [TFLite] Reshape - support different qnn params for
 input and output (#7159)

---
 python/tvm/relay/frontend/tflite.py          |  30 ++++-
 tests/python/frontend/tflite/test_forward.py | 131 ++++++++++++++-----
 2 files changed, 130 insertions(+), 31 deletions(-)

diff --git a/python/tvm/relay/frontend/tflite.py b/python/tvm/relay/frontend/tflite.py
index 7a9adf7b1126..94e9e0cccc5b 100644
--- a/python/tvm/relay/frontend/tflite.py
+++ b/python/tvm/relay/frontend/tflite.py
@@ -511,13 +511,30 @@ def convert_reshape(self, op):
         in_expr = self.get_expr(input_tensor_idx)
 
         # If the tensors are quantized, ensure that input/output qnn params are same.
-        if input_tensor.qnn_params:
+
+        input_tensor_type_str = self.get_tensor_type_str(input_tensor.tensor.Type())
+        if input_tensor.qnn_params and input_tensor_type_str == "int8":
+            # TFLite 2.x quantization spec requires qnn params to be same and dtype to be int8.
+            # For TFLite 1.x, dtype can be uint8 and qnn params can be different
             output_tensor = output_tensors[0]
             assert self.has_same_qnn_params(
                 input_tensor, output_tensor
             ), "TFLite reshape requires input and output scale and zero points to be equal"
 
         out = _op.reshape(in_expr, newshape=target_shape)
+        if input_tensor.qnn_params and input_tensor_type_str == "uint8":
+            output_tensor = output_tensors[0]
+            if not self.has_same_qnn_params(input_tensor, output_tensor):
+                output_tensor_type_str = self.get_tensor_type_str(output_tensor.tensor.Type())
+                out = _qnn.op.requantize(
+                    out,
+                    input_scale=input_tensor.qnn_params["scale"],
+                    input_zero_point=input_tensor.qnn_params["zero_point"],
+                    output_scale=output_tensor.qnn_params["scale"],
+                    output_zero_point=output_tensor.qnn_params["zero_point"],
+                    out_dtype=output_tensor_type_str,
+                )
+
         return out
 
     def _convert_resize(self, method, op):
@@ -2527,6 +2544,17 @@ def convert_pack(self, op):
         output_tensors = self.get_output_tensors(op)
         assert len(output_tensors) == 1, "output tensors length should be 1"
 
+        if input_tensors[0].qnn_params:
+            output_tensor = output_tensors[0]
+            assert self.has_same_qnn_params(
+                input_tensors[0], output_tensor
+            ), "TFLite pack requires input and output scale and zero points to be equal"
+
+            for input_tensor in input_tensors:
+                assert self.has_same_qnn_params(
+                    input_tensors[0], input_tensor
+                ), "TFLite pack requires all input tensors to have same scale and zero point"
+
         assert op.BuiltinOptionsType() == BuiltinOptions.PackOptions
         op_options = op.BuiltinOptions()
         pack_options = PackOptions()
diff --git a/tests/python/frontend/tflite/test_forward.py b/tests/python/frontend/tflite/test_forward.py
index 6cedc65678c5..52dde38703d1 100644
--- a/tests/python/frontend/tflite/test_forward.py
+++ b/tests/python/frontend/tflite/test_forward.py
@@ -1251,30 +1251,61 @@ def test_forward_transpose_conv():
 # -------
 
 
-def _test_reshape(data, out_shape, wrap_shape):
+def _test_reshape(data, out_shape, wrap_shape, quantized=False):
     """ One iteration of reshape operation with given data and out shape """
-    with tf.Graph().as_default():
-        in_data = array_ops.placeholder(shape=data.shape, dtype=data.dtype)
+    if quantized:
+        with tf.Graph().as_default():
+            in_data = array_ops.placeholder(shape=data.shape, dtype="float32", name="in")
+            inq_data = tf.quantization.fake_quant_with_min_max_args(
+                in_data, min=-100, max=100, name="inq_0"
+            )
 
-        out_shape = out_shape if not wrap_shape else np.array(out_shape, dtype=np.int32)
+            input_range = {"inq_0": (-100, 100)}
+            out_shape = out_shape if not wrap_shape else np.array(out_shape, dtype=np.int32)
 
-        in_shape = (
-            out_shape
-            if not wrap_shape
-            else array_ops.placeholder(
-                shape=out_shape.shape, dtype=out_shape.dtype, name="Newshape"
+            in_shape = (
+                out_shape
+                if not wrap_shape
+                else array_ops.placeholder(
+                    shape=out_shape.shape, dtype=out_shape.dtype, name="Newshape"
+                )
             )
-        )
 
-        out = array_ops.reshape(in_data, in_shape)
+            out = array_ops.reshape(inq_data, in_shape)
+            out = tf.quantization.fake_quant_with_min_max_args(out, min=-200, max=200, name="out")
+            compare_tflite_with_tvm(
+                [data, out_shape] if wrap_shape else [data],
+                ["inq_0:0", "Newshape:0"] if wrap_shape else ["inq_0:0"],
+                [inq_data, in_shape] if wrap_shape else [inq_data],
+                [out],
+                quantized=True,
+                input_range=input_range,
+                mode="vm",
+            )
+    else:
+        # Test with tensor and constant
+        with tf.Graph().as_default():
+            in_data = array_ops.placeholder(shape=data.shape, dtype=data.dtype)
+
+            out_shape = out_shape if not wrap_shape else np.array(out_shape, dtype=np.int32)
 
-        compare_tflite_with_tvm(
-            [data, out_shape] if wrap_shape else [data],
-            ["Placeholder:0", "Newshape:0"] if wrap_shape else ["Placeholder:0"],
-            [in_data, in_shape] if wrap_shape else [in_data],
-            [out],
-            mode="vm",
-        )
+            in_shape = (
+                out_shape
+                if not wrap_shape
+                else array_ops.placeholder(
+                    shape=out_shape.shape, dtype=out_shape.dtype, name="Newshape"
+                )
+            )
+
+            out = array_ops.reshape(in_data, in_shape)
+
+            compare_tflite_with_tvm(
+                [data, out_shape] if wrap_shape else [data],
+                ["Placeholder:0", "Newshape:0"] if wrap_shape else ["Placeholder:0"],
+                [in_data, in_shape] if wrap_shape else [in_data],
+                [out],
+                mode="vm",
+            )
 
 
 def test_forward_reshape():
@@ -1284,6 +1315,9 @@ def test_forward_reshape():
         _test_reshape(np.arange(6), [3, -1], wrap)
         _test_reshape(np.arange(6), [-1], wrap)
 
+    _test_reshape(np.arange(6, dtype=np.uint8), [2, 3], False, True)
+    _test_reshape(np.arange(6, dtype=np.uint8), [-1, 2], False, True)
+
 
 #######################################################################
 # Resize
@@ -2750,25 +2784,51 @@ def test_forward_one_hot():
 # ----
 
 
-def _test_pack(data, is_var, axis):
+def _test_pack(data, is_var, axis, quantized=False):
     """ One iteration of pack """
 
     assert len(data) >= 1
     assert len(data) == len(is_var)
+    if quantized:
+        with tf.Graph().as_default():
+            in_data = [
+                array_ops.placeholder(shape=d.shape, dtype="float32", name="in_" + str(idx))
+                if is_var[idx]
+                else constant_op.constant(
+                    d, shape=d.shape, dtype="float32", name="in_constant_" + str(idx)
+                )
+                for idx, d in enumerate(data)
+            ]
+            inq_data = [
+                tf.quantization.fake_quant_with_min_max_args(
+                    i_data, min=-100, max=100, name="inq_{}".format(idx)
+                )
+                for idx, i_data in enumerate(in_data)
+            ]
+            input_range = {}
+            for i in range(len(data)):
+                input_range["inq_{}".format(i)] = (-100, 100)
 
-    with tf.Graph().as_default():
-        in_data = [
-            array_ops.placeholder(shape=d.shape, dtype=d.dtype, name="in_" + str(idx))
-            if is_var[idx]
-            else constant_op.constant(
-                d, shape=d.shape, dtype=d.dtype, name="in_constant_" + str(idx)
+            out = array_ops.pack(inq_data, axis=axis)
+            out = tf.quantization.fake_quant_with_min_max_args(out, min=-100, max=100, name="out")
+            name = ["inq_{}:0".format(idx) for idx in range(len(data))]
+            compare_tflite_with_tvm(
+                data, name, inq_data, [out], quantized=True, input_range=input_range
             )
-            for idx, d in enumerate(data)
-        ]
+    else:
+        with tf.Graph().as_default():
+            in_data = [
+                array_ops.placeholder(shape=d.shape, dtype=d.dtype, name="in_" + str(idx))
+                if is_var[idx]
+                else constant_op.constant(
+                    d, shape=d.shape, dtype=d.dtype, name="in_constant_" + str(idx)
+                )
+                for idx, d in enumerate(data)
+            ]
 
-        out = array_ops.pack(in_data, axis=axis)
-        name = [_.name for _ in in_data]
-        compare_tflite_with_tvm(data, name, in_data, [out], experimental_new_converter=True)
+            out = array_ops.pack(in_data, axis=axis)
+            name = [_.name for _ in in_data]
+            compare_tflite_with_tvm(data, name, in_data, [out], experimental_new_converter=True)
 
 
 def test_forward_pack():
@@ -2791,6 +2851,17 @@ def test_forward_pack():
         1,
     )
 
+    _test_pack(
+        [
+            np.arange(6, dtype=np.uint8).reshape((2, 1, 1, 3)),
+            np.arange(6, dtype=np.uint8).reshape((2, 1, 1, 3)),
+            np.arange(6, dtype=np.uint8).reshape((2, 1, 1, 3)),
+        ],
+        [True, True, True],
+        1,
+        quantized=True,
+    )
+
 
 #######################################################################
 # Unpack

From cfdbf0eaa52504cc68cad62fd3966bd88e279061 Mon Sep 17 00:00:00 2001
From: Wheest <Wheest@users.noreply.github.com>
Date: Tue, 29 Dec 2020 23:09:47 +0000
Subject: [PATCH 003/357] Asymmetric padding and dilation in conv2d workload
 (#7142)

* added asymmetric padding to conv2d workload

* fixed depthwise conv2d padding

* Added fix to include dilation in workload output width calculation

* Added missing dilation to arm_cpu/conv2d_int8.py workload

* Fixed dilation for x86 conv2d

* Improved dilation workload integration in x86

* Fixed x86 conv2d_alter_op to add dilation

* Local linting not always producing same output as CI, probably my fault

* Fixed bug, tested locally

* Abusing CI until I can figure out how to reproduce the same behaviour of running integration tests locally.

* Ammeded conv2d_int8 test

* Updated workload, improved unit tests

* Added depthwise conv2d workload test
---
 python/tvm/topi/arm_cpu/conv2d_int8.py        |  7 +--
 python/tvm/topi/cuda/conv2d_int8.py           |  7 +--
 python/tvm/topi/generic/conv2d.py             | 15 ++++---
 python/tvm/topi/nn/conv2d.py                  | 43 +++++++++++++++----
 python/tvm/topi/nn/depthwise_conv2d.py        | 33 +++++++++-----
 .../topi/testing/depthwise_conv2d_python.py   |  2 +-
 python/tvm/topi/x86/conv2d.py                 | 16 ++++---
 python/tvm/topi/x86/conv2d_alter_op.py        | 30 +++++++++++--
 python/tvm/topi/x86/conv2d_avx_1x1.py         | 11 +++--
 python/tvm/topi/x86/conv2d_avx_common.py      | 14 +++---
 python/tvm/topi/x86/conv2d_int8.py            | 14 +++---
 python/tvm/topi/x86/depthwise_conv2d.py       |  9 ++--
 .../topi/python/test_topi_conv2d_int8.py      | 23 +++++++++-
 .../topi/python/test_topi_conv2d_nchw.py      | 17 ++++++++
 .../topi/python/test_topi_depthwise_conv2d.py | 23 +++++++++-
 15 files changed, 201 insertions(+), 63 deletions(-)

diff --git a/python/tvm/topi/arm_cpu/conv2d_int8.py b/python/tvm/topi/arm_cpu/conv2d_int8.py
index 445b9ec0c113..fc7e4036341a 100644
--- a/python/tvm/topi/arm_cpu/conv2d_int8.py
+++ b/python/tvm/topi/arm_cpu/conv2d_int8.py
@@ -32,12 +32,12 @@
 from .arm_utils import get_tiling_B_interleaved_t
 
 
-def _get_default_config(cfg, data, kernel, strides, padding, out_dtype):
+def _get_default_config(cfg, data, kernel, strides, padding, dilation, out_dtype):
     """
     Get default int8 schedule config for the workload
     """
-    wkl = _get_conv2d_workload(data, kernel, strides, padding, out_dtype)
-    is_kernel_1x1 = wkl.hkernel == 1 and wkl.wkernel == 1
+    wkl = _get_conv2d_workload(data, kernel, strides, padding, dilation, out_dtype)
+    is_kernel_1x1 = wkl.kernel_h == 1 and wkl.kernel_w == 1
     if is_kernel_1x1:
         conv2d_generic.fallback_schedule_cpu_1x1_int8(cfg, wkl, int32_lanes=2, num_int8_elements=4)
     else:
@@ -65,6 +65,7 @@ def conv2d_NCHWc_int8(cfg, data, kernel, strides, padding, dilation, layout, out
             te.placeholder((num_filter, in_channel, kh, kw), dtype=kernel.dtype),
             strides,
             padding,
+            dilation,
             out_dtype,
         )
     return nn.conv2d_NCHWc_int8_compute(
diff --git a/python/tvm/topi/cuda/conv2d_int8.py b/python/tvm/topi/cuda/conv2d_int8.py
index 50a0e8b71661..001411d6e4c9 100644
--- a/python/tvm/topi/cuda/conv2d_int8.py
+++ b/python/tvm/topi/cuda/conv2d_int8.py
@@ -142,9 +142,10 @@ def conv2d_NCHWc_int8(cfg, data, kernel, stride, padding, dilation, layout, out_
     pad_data = pad(packed_data, pad_before, pad_after, name="pad_data")
 
     # compute the output shape
-    out_height = (in_height - (kernel_h - 1) * dilation_h - 1 + pad_top + pad_down) // stride_h + 1
-    out_width = (in_width - (kernel_w - 1) * dilation_w - 1 + pad_left + pad_right) // stride_w + 1
-
+    dilated_kernel_h = (kernel_h - 1) * dilation_h + 1
+    dilated_kernel_w = (kernel_w - 1) * dilation_w + 1
+    out_height = (in_height - dilated_kernel_h + pad_top + pad_down) // stride_h + 1
+    out_width = (in_width - dilated_kernel_w + pad_left + pad_right) // stride_w + 1
     oshape = (batch, oc_chunk, out_height, out_width, oc_block)
 
     icc = te.reduce_axis((0, ic_chunk), name="ic_chunk")
diff --git a/python/tvm/topi/generic/conv2d.py b/python/tvm/topi/generic/conv2d.py
index 7dd9aed7545d..4daa84c29528 100644
--- a/python/tvm/topi/generic/conv2d.py
+++ b/python/tvm/topi/generic/conv2d.py
@@ -38,9 +38,10 @@ def fallback_schedule_cpu_common_int8(cfg, wkl, int32_lanes, num_int8_elements):
         How many numbers of input int32/uint32 will be multiplied and reduced.
         This is related to input channel.
     """
-    HPAD, WPAD = wkl.hpad, wkl.wpad
-    HSTR, WSTR = wkl.hstride, wkl.wstride
-    out_width = (wkl.width + 2 * WPAD - wkl.wkernel) // WSTR + 1
+    pt, pl, pb, pr = wkl.padt, wkl.padl, wkl.padb, wkl.padr
+    HSTR, WSTR = wkl.stride_h, wkl.stride_w
+    dilated_kernel_w = (wkl.kernel_w - 1) * wkl.dilation_w + 1
+    out_width = (wkl.width + pl + pr - dilated_kernel_w) // WSTR + 1
 
     assert wkl.out_filter % int32_lanes == 0, "wkl.out_filter=%d, int32_lanes=%d" % (
         wkl.out_filter,
@@ -85,10 +86,10 @@ def fallback_schedule_cpu_1x1_int8(cfg, wkl, int32_lanes, num_int8_elements):
         How many numbers of input int32/uint32 will be multiplied and reduced.
         This is related to input channel.
     """
-    HPAD, WPAD = wkl.hpad, wkl.wpad
-    HSTR, WSTR = wkl.hstride, wkl.wstride
-    out_height = (wkl.height + 2 * HPAD - wkl.hkernel) // HSTR + 1
-    out_width = (wkl.width + 2 * WPAD - wkl.wkernel) // WSTR + 1
+    pt, pl, pb, pr = wkl.padt, wkl.padl, wkl.padb, wkl.padr
+    HSTR, WSTR = wkl.stride_h, wkl.stride_w
+    out_height = (wkl.height + pt + pb - wkl.kernel_h) // HSTR + 1
+    out_width = (wkl.width + pl + pr - wkl.kernel_w) // WSTR + 1
 
     assert wkl.out_filter % int32_lanes == 0, "wkl.out_filter=%d, int32_lanes=%d" % (
         wkl.out_filter,
diff --git a/python/tvm/topi/nn/conv2d.py b/python/tvm/topi/nn/conv2d.py
index 886470bb3b9d..80f87f86736c 100644
--- a/python/tvm/topi/nn/conv2d.py
+++ b/python/tvm/topi/nn/conv2d.py
@@ -38,12 +38,16 @@
         "in_filter",
         "groups",
         "out_filter",
-        "hkernel",
-        "wkernel",
-        "hpad",
-        "wpad",
-        "hstride",
-        "wstride",
+        "kernel_h",
+        "kernel_w",
+        "padt",
+        "padl",
+        "padb",
+        "padr",
+        "dilation_h",
+        "dilation_w",
+        "stride_h",
+        "stride_w",
     ],
 )
 
@@ -154,7 +158,7 @@ def conv2d_infer_layout(workload, cfg):
     raise ValueError("missing register for topi.nn.conv2d_infer_layout")
 
 
-def _get_workload(data, kernel, stride, padding, out_dtype, data_layout="NCHW"):
+def _get_workload(data, kernel, stride, padding, dilation, out_dtype, data_layout="NCHW"):
     """ Get the workload structure. """
     if data_layout == "NCHW":
         _, CI, IH, IW = get_const_tuple(data.shape)
@@ -170,7 +174,10 @@ def _get_workload(data, kernel, stride, padding, out_dtype, data_layout="NCHW"):
     else:
         KH, KW, CIG, CO = get_const_tuple(kernel.shape)
 
-    HPAD, WPAD, _, _ = get_pad_tuple(padding, (get_const_int(KH), get_const_int(KW)))
+    pt, pl, pb, pr = get_pad_tuple(padding, (get_const_int(KH), get_const_int(KW)))
+    dilation_h, dilation_w = (
+        dilation if isinstance(dilation, (tuple, list)) else (dilation, dilation)
+    )
     GRPS = CI // CIG
     if isinstance(stride, (tuple, list)):
         HSTR, WSTR = stride
@@ -182,7 +189,25 @@ def _get_workload(data, kernel, stride, padding, out_dtype, data_layout="NCHW"):
         '{} vs. {}".format(
         data.dtype, kernel.dtype
     )
-    return Workload(data.dtype, out_dtype, IH, IW, CI, GRPS, CO, KH, KW, HPAD, WPAD, HSTR, WSTR)
+    return Workload(
+        data.dtype,
+        out_dtype,
+        IH,
+        IW,
+        CI,
+        GRPS,
+        CO,
+        KH,
+        KW,
+        pt,
+        pl,
+        pb,
+        pr,
+        dilation_h,
+        dilation_w,
+        HSTR,
+        WSTR,
+    )
 
 
 def conv2d_nchw(Input, Filter, stride, padding, dilation, out_dtype=None):
diff --git a/python/tvm/topi/nn/depthwise_conv2d.py b/python/tvm/topi/nn/depthwise_conv2d.py
index 72356821770d..052ab8b88d1c 100644
--- a/python/tvm/topi/nn/depthwise_conv2d.py
+++ b/python/tvm/topi/nn/depthwise_conv2d.py
@@ -36,22 +36,28 @@
         "width",
         "in_filter",
         "out_filter",
-        "hkernel",
-        "wkernel",
-        "hpad",
-        "wpad",
-        "hstride",
-        "wstride",
+        "kernel_h",
+        "kernel_w",
+        "padt",
+        "padl",
+        "padb",
+        "padr",
+        "dilation_h",
+        "dilation_w",
+        "stride_h",
+        "stride_w",
     ],
 )
 
 
-def _get_workload(data, kernel, stride, padding, out_dtype):
+def _get_workload(data, kernel, stride, padding, dilation, out_dtype):
     """ Get the workload structure. """
     _, in_channel, height, width = [x.value for x in data.shape]
     channel, channel_multiplier, kh, kw = [x.value for x in kernel.shape]
     out_channel = channel * channel_multiplier
-    HPAD, WPAD, _, _ = get_pad_tuple(padding, kernel)
+    dilation_h, dilation_w = (
+        dilation if isinstance(dilation, (tuple, list)) else (dilation, dilation)
+    )
     if isinstance(stride, (tuple, list)):
         HSTR, WSTR = stride
     else:
@@ -62,6 +68,9 @@ def _get_workload(data, kernel, stride, padding, out_dtype):
         '{} vs. {}".format(
         data.dtype, kernel.dtype
     )
+    dilated_kernel_h = (kh - 1) * dilation_h + 1
+    dilated_kernel_w = (kw - 1) * dilation_w + 1
+    pt, pl, pb, pr = get_pad_tuple(padding, (dilated_kernel_h, dilated_kernel_w))
     return Workload(
         data.dtype,
         out_dtype,
@@ -71,8 +80,12 @@ def _get_workload(data, kernel, stride, padding, out_dtype):
         out_channel,
         kh,
         kw,
-        HPAD,
-        WPAD,
+        pt,
+        pl,
+        pb,
+        pr,
+        dilation_h,
+        dilation_w,
         HSTR,
         WSTR,
     )
diff --git a/python/tvm/topi/testing/depthwise_conv2d_python.py b/python/tvm/topi/testing/depthwise_conv2d_python.py
index 06f26ab3a2e4..2239c56134f5 100644
--- a/python/tvm/topi/testing/depthwise_conv2d_python.py
+++ b/python/tvm/topi/testing/depthwise_conv2d_python.py
@@ -65,7 +65,7 @@ def depthwise_conv2d_python_nchw(input_np, filter_np, stride, padding):
                     0 : (in_height - filter_height + 1) : stride_h,
                     0 : (in_width - filter_width + 1) : stride_w,
                 ]
-    if padding == "SAME":
+    elif padding == "SAME":
         out_channel = in_channel * channel_multiplier
         out_height = np.int(np.ceil(float(in_height) / float(stride_h)))
         out_width = np.int(np.ceil(float(in_width) / float(stride_w)))
diff --git a/python/tvm/topi/x86/conv2d.py b/python/tvm/topi/x86/conv2d.py
index a3b7e473415e..182454acf3a6 100644
--- a/python/tvm/topi/x86/conv2d.py
+++ b/python/tvm/topi/x86/conv2d.py
@@ -35,7 +35,7 @@
 
 
 def _get_default_config(
-    cfg, data, kernel, strides, padding, out_dtype, is_depthwise=False, layout="NCHW"
+    cfg, data, kernel, strides, padding, dilation, out_dtype, is_depthwise=False, layout="NCHW"
 ):
     """
     Get default schedule config for the workload
@@ -48,13 +48,13 @@ def _get_default_config(
             static_data_shape.append(dim)
     data = te.placeholder(static_data_shape, dtype=data.dtype)
     if is_depthwise:
-        wkl = _get_depthwise_conv2d_workload(data, kernel, strides, padding, out_dtype)
+        wkl = _get_depthwise_conv2d_workload(data, kernel, strides, padding, dilation, out_dtype)
         from .depthwise_conv2d import _fallback_schedule
 
         _fallback_schedule(cfg, wkl)
     else:
-        wkl = _get_conv2d_workload(data, kernel, strides, padding, out_dtype, layout)
-        is_kernel_1x1 = wkl.hkernel == 1 and wkl.wkernel == 1
+        wkl = _get_conv2d_workload(data, kernel, strides, padding, dilation, out_dtype, layout)
+        is_kernel_1x1 = wkl.kernel_h == 1 and wkl.kernel_w == 1
         if is_kernel_1x1:
             conv2d_avx_1x1._fallback_schedule(cfg, wkl)
         else:
@@ -69,8 +69,11 @@ def _conv2d_infer_layout(workload, cfg):
     idxdiv = tvm.tir.indexdiv
 
     pt, pl, pb, pr = get_pad_tuple(padding, (k_height, k_width))
-    out_height = idxdiv(in_height + pt + pb - k_height, strides[0]) + 1
-    out_width = idxdiv(in_width + pl + pr - k_width, strides[1]) + 1
+    hdilation, wdilation = dilation if isinstance(dilation, (tuple, list)) else (dilation, dilation)
+    dilated_kernel_h = (k_height - 1) * hdilation + 1
+    dilated_kernel_w = (k_width - 1) * wdilation + 1
+    out_height = idxdiv(in_height + pt + pb - dilated_kernel_h, strides[0]) + 1
+    out_width = idxdiv(in_width + pl + pr - dilated_kernel_w, strides[1]) + 1
     tile_ic, tile_oc = cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1]
     in_shape = (batch_size, idxdiv(in_channel, tile_ic), in_height, in_width, tile_ic)
     in_layout = "NCHW%dc" % tile_ic
@@ -208,6 +211,7 @@ def conv2d_NCHWc(cfg, data, kernel, strides, padding, dilation, layout, out_layo
             ),
             strides,
             padding,
+            dilation,
             out_dtype,
         )
 
diff --git a/python/tvm/topi/x86/conv2d_alter_op.py b/python/tvm/topi/x86/conv2d_alter_op.py
index 979dc5ab5702..f05bac82ff0c 100644
--- a/python/tvm/topi/x86/conv2d_alter_op.py
+++ b/python/tvm/topi/x86/conv2d_alter_op.py
@@ -97,7 +97,15 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type):
         if data_layout == "NCHW" and kernel_layout == "OIHW":
             if cfg.is_fallback:
                 _get_default_config(
-                    cfg, data_tensor, kernel_tensor, strides, padding, out_dtype, False, data_layout
+                    cfg,
+                    data_tensor,
+                    kernel_tensor,
+                    strides,
+                    padding,
+                    dilation,
+                    out_dtype,
+                    False,
+                    data_layout,
                 )
             batch_size, in_channel, height, width = get_const_tuple(data_tensor.shape)
             out_channel, _, kh, kw = get_const_tuple(kernel_tensor.shape)
@@ -142,7 +150,15 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type):
         assert data_layout == "NCHW" and kernel_layout == "OIHW"
         if cfg.is_fallback:
             _get_default_config_int8(
-                cfg, data_tensor, kernel_tensor, strides, padding, out_dtype, False, data_layout
+                cfg,
+                data_tensor,
+                kernel_tensor,
+                strides,
+                padding,
+                dilation,
+                out_dtype,
+                False,
+                data_layout,
             )
 
         batch_size, in_channel, height, width = get_const_tuple(data_tensor.shape)
@@ -198,7 +214,15 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type):
         if data_layout == "NCHW" and kernel_layout == "OIHW":
             if cfg.is_fallback:
                 _get_default_config(
-                    cfg, data_tensor, kernel_tensor, strides, padding, out_dtype, True, data_layout
+                    cfg,
+                    data_tensor,
+                    kernel_tensor,
+                    strides,
+                    padding,
+                    dilation,
+                    out_dtype,
+                    True,
+                    data_layout,
                 )
 
             batch_size, in_channel, height, width = get_const_tuple(data_tensor.shape)
diff --git a/python/tvm/topi/x86/conv2d_avx_1x1.py b/python/tvm/topi/x86/conv2d_avx_1x1.py
index 3e5a12bc43b2..afee03a9f6a0 100644
--- a/python/tvm/topi/x86/conv2d_avx_1x1.py
+++ b/python/tvm/topi/x86/conv2d_avx_1x1.py
@@ -31,10 +31,13 @@
 
 def _fallback_schedule(cfg, wkl):
     simd_width = get_fp32_len()
-    HPAD, WPAD = wkl.hpad, wkl.wpad
-    HSTR, WSTR = wkl.hstride, wkl.wstride
-    out_height = (wkl.height + 2 * HPAD - wkl.hkernel) // HSTR + 1
-    out_width = (wkl.width + 2 * WPAD - wkl.wkernel) // WSTR + 1
+    pt, pl, pb, pr = wkl.padt, wkl.padl, wkl.padb, wkl.padr
+    HSTR, WSTR = wkl.stride_h, wkl.stride_w
+    dilated_kernel_h = (wkl.kernel_h - 1) * wkl.dilation_h + 1
+    dilated_kernel_w = (wkl.kernel_w - 1) * wkl.dilation_w + 1
+
+    out_height = (wkl.height + pt + pb - dilated_kernel_h) // HSTR + 1
+    out_width = (wkl.width + pl + pr - dilated_kernel_w) // WSTR + 1
 
     oc_bn = 1
     for bn in range(simd_width, 0, -1):
diff --git a/python/tvm/topi/x86/conv2d_avx_common.py b/python/tvm/topi/x86/conv2d_avx_common.py
index 8d707445be05..5e63de329bba 100644
--- a/python/tvm/topi/x86/conv2d_avx_common.py
+++ b/python/tvm/topi/x86/conv2d_avx_common.py
@@ -27,9 +27,11 @@
 
 def _fallback_schedule(cfg, wkl):
     simd_width = get_fp32_len()
-    HPAD, WPAD = wkl.hpad, wkl.wpad
-    HSTR, WSTR = wkl.hstride, wkl.wstride
-    out_width = (wkl.width + 2 * WPAD - wkl.wkernel) // WSTR + 1
+    pt, pl, pb, pr = wkl.padt, wkl.padl, wkl.padb, wkl.padr
+    HSTR, WSTR = wkl.stride_h, wkl.stride_w
+    dilated_kernel_w = (wkl.kernel_w - 1) * wkl.dilation_w + 1
+
+    out_width = (wkl.width + pl + pr - dilated_kernel_w) // WSTR + 1
 
     oc_bn = 1
     for bn in range(simd_width, 0, -1):
@@ -56,9 +58,9 @@ def _fallback_schedule(cfg, wkl):
 
 
 def _fallback_schedule_int8(cfg, wkl):
-    HPAD, WPAD = wkl.hpad, wkl.wpad
-    HSTR, WSTR = wkl.hstride, wkl.wstride
-    out_width = (wkl.width + 2 * WPAD - wkl.wkernel) // WSTR + 1
+    pt, pl, pb, pr = wkl.padt, wkl.padl, wkl.padb, wkl.padr
+    HSTR, WSTR = wkl.stride_h, wkl.stride_w
+    out_width = (wkl.width + pl + pr - wkl.kernel_w) // WSTR + 1
 
     oc_bn = 16
     assert wkl.out_filter % oc_bn == 0
diff --git a/python/tvm/topi/x86/conv2d_int8.py b/python/tvm/topi/x86/conv2d_int8.py
index 905ada68f277..ca0d0b8b223c 100644
--- a/python/tvm/topi/x86/conv2d_int8.py
+++ b/python/tvm/topi/x86/conv2d_int8.py
@@ -33,7 +33,7 @@
 
 
 def _get_default_config_int8(
-    cfg, data, kernel, strides, padding, out_dtype, is_depthwise=False, layout="NCHW"
+    cfg, data, kernel, strides, padding, dilation, out_dtype, is_depthwise=False, layout="NCHW"
 ):
     """
     Get default schedule config for the workload
@@ -45,8 +45,8 @@ def _get_default_config_int8(
 
         _fallback_schedule(cfg, wkl)
     else:
-        wkl = _get_conv2d_workload(data, kernel, strides, padding, out_dtype, layout)
-        is_kernel_1x1 = wkl.hkernel == 1 and wkl.wkernel == 1
+        wkl = _get_conv2d_workload(data, kernel, strides, padding, dilation, out_dtype, layout)
+        is_kernel_1x1 = wkl.kernel_h == 1 and wkl.kernel_w == 1
         if is_kernel_1x1:
             conv2d_generic.fallback_schedule_cpu_1x1_int8(
                 cfg, wkl, int32_lanes=16, num_int8_elements=4
@@ -138,8 +138,11 @@ def conv2d_NCHWc_int8(cfg, data, kernel, strides, padding, dilation, layout, out
     is_kernel_1x1 = kernel_height == 1 and kernel_width == 1
     pt, pl, pb, pr = get_pad_tuple(padding, (kernel_height, kernel_width))
     sh, sw = strides if isinstance(strides, (tuple, list)) else (strides, strides)
-    oh = (ih - kernel_height + pt + pb) // sh + 1
-    ow = (iw - kernel_width + pl + pr) // sw + 1
+    dh, dw = dilation if isinstance(dilation, (tuple, list)) else (dilation, dilation)
+    dilated_kernel_h = (kernel_height - 1) * dh + 1
+    dilated_kernel_w = (kernel_width - 1) * dw + 1
+    oh = (ih - dilated_kernel_h + pt + pb) // sh + 1
+    ow = (iw - dilated_kernel_w + pl + pr) // sw + 1
 
     cfg.define_split("tile_ic", in_channel, num_outputs=2, filter=lambda y: y.size[-1] % 4 == 0)
     cfg.define_split("tile_oc", num_filter, num_outputs=2, filter=lambda y: y.size[-1] % 16 == 0)
@@ -159,6 +162,7 @@ def conv2d_NCHWc_int8(cfg, data, kernel, strides, padding, dilation, layout, out
             ),
             strides,
             padding,
+            dilation,
             out_dtype,
         )
 
diff --git a/python/tvm/topi/x86/depthwise_conv2d.py b/python/tvm/topi/x86/depthwise_conv2d.py
index badba1a248e9..a0225ef9e147 100644
--- a/python/tvm/topi/x86/depthwise_conv2d.py
+++ b/python/tvm/topi/x86/depthwise_conv2d.py
@@ -42,9 +42,11 @@ def _fallback_schedule(cfg, wkl):
     """
     simd_width = get_fp32_len()
 
-    HPAD, WPAD = wkl.hpad, wkl.wpad
-    HSTR, WSTR = wkl.hstride, wkl.wstride
-    out_width = (wkl.width + 2 * WPAD - wkl.wkernel) // WSTR + 1
+    pt, pl, pb, pr = wkl.padt, wkl.padl, wkl.padb, wkl.padr
+    HSTR, WSTR = wkl.stride_h, wkl.stride_w
+    dilated_kernel_w = (wkl.kernel_w - 1) * wkl.dilation_w + 1
+
+    out_width = (wkl.width - dilated_kernel_w + pl + pr) // WSTR + 1
 
     oc_bn = 1
     for bn in range(simd_width, 0, -1):
@@ -165,6 +167,7 @@ def depthwise_conv2d_NCHWc(
         ),
         strides,
         (pad_top, pad_down),
+        dilation,
         out_dtype,
     )
     if cfg.is_fallback:
diff --git a/tests/python/topi/python/test_topi_conv2d_int8.py b/tests/python/topi/python/test_topi_conv2d_int8.py
index 1bf83eba53ac..a934e3ef2fd2 100644
--- a/tests/python/topi/python/test_topi_conv2d_int8.py
+++ b/tests/python/topi/python/test_topi_conv2d_int8.py
@@ -27,6 +27,8 @@
 from tvm.topi.nn.utils import get_pad_tuple
 from tvm.topi.utils import get_const_tuple
 from tvm.topi.arm_cpu.conv2d_gemm import is_aarch64_arm
+from tvm.topi.nn.conv2d import _get_workload
+from tvm.topi.generic.conv2d import fallback_schedule_cpu_common_int8
 
 from common import Int8Fallback
 import tvm.testing
@@ -112,7 +114,7 @@ def compile_conv2d_NHWC_gemm_int8_arm(
                 s,
                 [A, W, bias, C],
                 device,
-                name="relu_%d_%d_%d_%d_%d_%d_%d_%d"
+                name="relu_%dnnn_%d_%d_%d_%d_%d_%d_%d"
                 % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation),
             )
         else:
@@ -385,6 +387,22 @@ def get_ref_data():
 
     a_np, w_np, b_np, c_np = get_ref_data()
 
+    def verify_workload_padding():
+        _, _, out_height, out_width = get_const_tuple(c_np.shape)
+        wkl = _get_workload(A, W, (stride, stride), padding, dilation, dtype)
+
+        # for testing functionality,
+        # we choose arbitrary int32_lanes and num_int8_elements can divide the channel,
+        # regardless of the performance.
+        int32_lanes, num_int8_elements = num_filter, in_channel
+
+        # check if tile_ow candidates are the factors of the right output weight.
+        cfg = autotvm.get_config()
+        fallback_schedule_cpu_common_int8(cfg, wkl, int32_lanes, num_int8_elements)
+        ow_tile = np.prod(cfg["tile_ow"].size)
+
+        tvm.testing.assert_allclose(ow_tile, out_width)
+
     def check_device(device):
         ctx = tvm.context(device, 0)
         if not tvm.testing.device_enabled(device):
@@ -436,6 +454,8 @@ def check_device(device):
             func(a, w, c)
         tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5)
 
+    verify_workload_padding()
+
     for device in ["cuda"]:
         check_device(device)
 
@@ -547,6 +567,7 @@ def test_conv2d_nchw():
         verify_conv2d_nchw_int8(1, 32, 149, 32, 3, 1, 0)
         verify_conv2d_nchw_int8(7, 32, 149, 32, 3, 1, 0)
         verify_conv2d_nchw_int8(1, 32, 35, 64, 7, 2, (0, 0, 1, 1))
+        verify_conv2d_nchw_int8(1, 32, 35, 64, 7, 2, (0, 0, 2, 2))
 
 
 def test_conv2d_nhwc():
diff --git a/tests/python/topi/python/test_topi_conv2d_nchw.py b/tests/python/topi/python/test_topi_conv2d_nchw.py
index 1b7575211dac..07ad45c971df 100644
--- a/tests/python/topi/python/test_topi_conv2d_nchw.py
+++ b/tests/python/topi/python/test_topi_conv2d_nchw.py
@@ -25,6 +25,8 @@
 from tvm.contrib.pickle_memoize import memoize
 from tvm.topi.nn.utils import get_pad_tuple
 from tvm.topi.utils import get_const_tuple
+from tvm.topi.nn.conv2d import _get_workload
+from tvm.topi.x86.conv2d_avx_common import _fallback_schedule
 
 import tvm.testing
 
@@ -76,6 +78,17 @@ def get_ref_data():
 
     a_np, w_np, b_np, c_np = get_ref_data()
 
+    def verify_workload_padding():
+        _, _, out_height, out_width = get_const_tuple(c_np.shape)
+        wkl = _get_workload(A, W, (stride, stride), padding, dilation, dtype)
+
+        # check if tile_ow candidates are the factors of the right output weight.
+        cfg = autotvm.get_config()
+        _fallback_schedule(cfg, wkl)
+        ow_tile = np.prod(cfg["tile_ow"].size)
+
+        tvm.testing.assert_allclose(ow_tile, out_width)
+
     def check_device(device):
         ctx = tvm.context(device, 0)
         if not tvm.testing.device_enabled(device):
@@ -101,6 +114,9 @@ def check_device(device):
                 C = topi.nn.relu(C)
             s = fschedule([C])
 
+            if "llvm" in device:
+                verify_workload_padding()
+
         a = tvm.nd.array(a_np, ctx)
         w = tvm.nd.array(w_np, ctx)
         b = tvm.nd.array(b_np, ctx)
@@ -242,6 +258,7 @@ def test_conv2d_nchw():
     verify_conv2d_nchw(1, 64, 8, 64, 5, 2, (1, 3), add_bias=True)
     verify_conv2d_nchw(1, 64, 8, 64, 3, 1, "VALID", add_bias=True, add_relu=True)
     verify_conv2d_nchw(1, 64, 8, 64, 24, 1, "SAME", add_bias=True, add_relu=True)
+    verify_conv2d_nchw(1, 32, 35, 64, 7, 2, (0, 0, 2, 2))
 
 
 if __name__ == "__main__":
diff --git a/tests/python/topi/python/test_topi_depthwise_conv2d.py b/tests/python/topi/python/test_topi_depthwise_conv2d.py
index 55d2fe0c4e52..804c486d27d7 100644
--- a/tests/python/topi/python/test_topi_depthwise_conv2d.py
+++ b/tests/python/topi/python/test_topi_depthwise_conv2d.py
@@ -23,6 +23,8 @@
 from tvm.topi.utils import get_const_tuple
 from tvm.topi.nn.utils import get_pad_tuple
 from tvm.contrib.pickle_memoize import memoize
+from tvm.topi.nn.depthwise_conv2d import _get_workload
+from tvm.topi.x86.depthwise_conv2d import _fallback_schedule
 
 import tvm.testing
 
@@ -116,8 +118,8 @@ def depthwise_conv2d_with_workload_nchw(
     if dilation == 1:
         # here we transform the padding argument from 'str' to  'tuple' ,
         # because we need this to match the "workload" tuple to the records in TopHub
-        pad_h, pad_w, _, _ = get_pad_tuple(padding, (filter_height, filter_width))
-        padding_args = (pad_h, pad_w)
+        padt, padl, padb, padr = get_pad_tuple(padding, (filter_height, filter_width))
+        padding_args = (padt, padl, padb, padr)
     else:
         padding_args = padding
 
@@ -205,6 +207,23 @@ def get_ref_data():
                 relu_scipy,
             ) = get_ref_data()
 
+            def verify_workload_padding():
+                _, _, out_height, out_width = get_const_tuple(depthwise_conv2d_scipy.shape)
+                wkl = _get_workload(
+                    Input, Filter, (stride_h, stride_w), padding_args, dilation, dtype
+                )
+
+                # check if tile_ow candidates are the factors of the right output weight.
+                with tvm.target.Target(device):
+                    cfg = autotvm.get_config()
+                    _fallback_schedule(cfg, wkl)
+                    ow_tile = np.prod(cfg["tile_ow"].size)
+
+                    tvm.testing.assert_allclose(ow_tile, out_width)
+
+            if "llvm" in device:
+                verify_workload_padding()
+
             input_tvm = tvm.nd.array(input_np, ctx)
             filter_tvm = tvm.nd.array(filter_np, ctx)
             scale_tvm = tvm.nd.array(scale_np, ctx)

From 466383a232097b5c17733e347c2cb4a8ba14d972 Mon Sep 17 00:00:00 2001
From: "Steven S. Lyubomirsky" <sslyu@cs.washington.edu>
Date: Tue, 29 Dec 2020 20:43:12 -0500
Subject: [PATCH 004/357] [Relay][fix] Stack should take exprs that evaluate to
 tuples (#7130)

* Fix stack to take Relay exprs that evaluate to tuples

* Doc tweak

* Linting fix
---
 python/tvm/relay/op/tensor.py        |  9 +++--
 tests/python/relay/test_op_level3.py | 60 +++++++++++++++++++++-------
 2 files changed, 50 insertions(+), 19 deletions(-)

diff --git a/python/tvm/relay/op/tensor.py b/python/tvm/relay/op/tensor.py
index 453a9b7a7759..75e298786ddd 100644
--- a/python/tvm/relay/op/tensor.py
+++ b/python/tvm/relay/op/tensor.py
@@ -1105,8 +1105,8 @@ def stack(data, axis):
 
     Parameters
     ----------
-    data : Union(List[relay.Expr], Tuple(relay.Expr))
-        A list of tensors.
+    data : Union(List[relay.Expr], relay.Expr)
+        A list of tensors or a Relay expression that evaluates to a tuple of tensors.
 
     axis : int
         The axis in the result array along which the input arrays are stacked.
@@ -1116,12 +1116,13 @@ def stack(data, axis):
     ret : relay.Expr
         The stacked tensor.
     """
-    data = list(data)
     if not data:
         raise ValueError("relay.stack requires data to be non-empty.")
     if not isinstance(axis, int):
         raise ValueError("For now, we only support integer axis")
-    return _make.stack(Tuple(data), axis)
+    if not isinstance(data, Expr):
+        data = Tuple(list(data))
+    return _make.stack(data, axis)
 
 
 def copy(data):
diff --git a/tests/python/relay/test_op_level3.py b/tests/python/relay/test_op_level3.py
index 668285dfb882..5e44170b6428 100644
--- a/tests/python/relay/test_op_level3.py
+++ b/tests/python/relay/test_op_level3.py
@@ -787,28 +787,58 @@ def verify_repeat(dshape, repeats, axis):
 
 @tvm.testing.uses_gpu
 def test_stack():
-    def verify_stack(dshapes, axis):
-        y = []
-        for shape in dshapes:
-            y.append(relay.var("input", relay.TensorType(shape, "float32")))
-        x = relay.Tuple(y)
-        z = relay.stack(x, axis=axis)
+    def produce_input_tuple(dshapes):
+        y = [relay.var("input", relay.TensorType(shape, "float32")) for shape in dshapes]
+        return relay.Tuple(y)
 
-        func = relay.Function(y, z)
-        x_data = [np.random.normal(size=shape).astype("float32") for shape in dshapes]
-        ref_res = np.stack(x_data, axis=axis)
+    def ref_stack(inputs, axis):
+        return np.stack(inputs, axis=axis)
+
+    def verify_stack(input_expr, relay_args, ref_res, axis):
+        z = relay.stack(input_expr, axis=axis)
+        inp_vars = relay.analysis.free_vars(z)
+        func = relay.Function(inp_vars, z)
 
         for target, ctx in tvm.testing.enabled_targets():
             for kind in ["graph", "debug"]:
                 intrp = relay.create_executor(kind, ctx=ctx, target=target)
-                op_res = intrp.evaluate(func)(*x_data)
+                op_res = intrp.evaluate(func)(*relay_args)
                 tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5)
 
-    verify_stack([(2,), (2,), (2,)], -1)
-    verify_stack([(2,), (2,), (2,)], 0)
-    verify_stack([(2, 2, 4), (2, 2, 4), (2, 2, 4)], 1)
-    verify_stack([(2, 2, 3, 4), (2, 2, 3, 4), (2, 2, 3, 4), (2, 2, 3, 4)], -1)
-    verify_stack([(2, 2, 3, 4), (2, 2, 3, 4), (2, 2, 3, 4), (2, 2, 3, 4)], 4)
+    def verify_tup_lit_stack(dshapes, axis):
+        input_tuple = produce_input_tuple(dshapes)
+        input_data = [np.random.normal(size=shape).astype("float32") for shape in dshapes]
+        ref_res = ref_stack(input_data, axis)
+        verify_stack(input_tuple, input_data, ref_res, axis)
+
+    def verify_list_lit_stack(dshapes, axis):
+        input_list = produce_input_tuple(dshapes).fields
+        input_data = [np.random.normal(size=shape).astype("float32") for shape in dshapes]
+        ref_res = ref_stack(input_data, axis)
+        verify_stack(input_list, input_data, ref_res, axis)
+
+    def verify_tup_expr_stack(dshapes, axis):
+        input_data = [np.random.normal(size=shape).astype("float32") for shape in dshapes]
+        ref_res = ref_stack(input_data, axis)
+
+        # expression that evaluates to a tuple
+        # but is not a tuple literal
+        x = relay.Var("x")
+        input_expr = relay.Let(x, relay.Tuple([relay.const(inp) for inp in input_data]), x)
+        verify_stack(input_expr, [], ref_res, axis)
+
+    dshape_axis_combos = [
+        ([(2,), (2,), (2,)], -1),
+        ([(2,), (2,), (2,)], 0),
+        ([(2, 2, 4), (2, 2, 4), (2, 2, 4)], 1),
+        ([(2, 2, 3, 4), (2, 2, 3, 4), (2, 2, 3, 4), (2, 2, 3, 4)], -1),
+        ([(2, 2, 3, 4), (2, 2, 3, 4), (2, 2, 3, 4), (2, 2, 3, 4)], 4),
+    ]
+
+    for dshapes, axis in dshape_axis_combos:
+        verify_tup_lit_stack(dshapes, axis)
+        verify_list_lit_stack(dshapes, axis)
+        verify_tup_expr_stack(dshapes, axis)
 
 
 @tvm.testing.uses_gpu

From f2ab977de0ac543cae77d3bef76af1b56dd61eed Mon Sep 17 00:00:00 2001
From: roger-zhao <roger-zhao@users.noreply.github.com>
Date: Wed, 30 Dec 2020 15:45:52 +0800
Subject: [PATCH 005/357] [AutoTVM-FIX] avoid unexpected value(1) of search
 space when get length for uninitiated search space (#7175)

* [AutoTVM-FIX] avoid unexpected value(1) of search space when get length for uninitiated search space

* Update python/tvm/autotvm/task/space.py

Co-authored-by: Cody Yu <comaniac0422@gmail.com>

Co-authored-by: ZhaoYanjie <roger.zhao@montage-tech.com>
Co-authored-by: Cody Yu <comaniac0422@gmail.com>
---
 python/tvm/autotvm/task/space.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/python/tvm/autotvm/task/space.py b/python/tvm/autotvm/task/space.py
index cf9cd809aa8d..b24ab415c60a 100644
--- a/python/tvm/autotvm/task/space.py
+++ b/python/tvm/autotvm/task/space.py
@@ -836,6 +836,8 @@ def _add_new_transform(self, space_class, name, axes, policy, **kwargs):
         return [Axis(None, i) for i in range(space_class.get_num_output(axes, policy, **kwargs))]
 
     def __len__(self):
+        if not self.space_map:
+            return 0
         if self._length is None:
             self._length = int(np.prod([len(x) for x in self.space_map.values()]))
         return self._length

From 66e123ff7ce4f5524b3f51ccd95bd4010b7af2c6 Mon Sep 17 00:00:00 2001
From: masahi <masahi129@gmail.com>
Date: Wed, 30 Dec 2020 18:00:22 +0900
Subject: [PATCH 006/357] [TOPI] Parallelize GPU NMS inner loop (#7172)

* make NMS inner loop parallel

* use one block two avoid global sync issue

* temp disable write by only thread 0

* leave a TODO on write by only one thread

* add some comments, remove check the check on negative class id

* minor improvement when topk is available

* fix write by a single thread
---
 python/tvm/topi/cuda/nms.py | 50 ++++++++++++++++++++++++-------------
 1 file changed, 32 insertions(+), 18 deletions(-)

diff --git a/python/tvm/topi/cuda/nms.py b/python/tvm/topi/cuda/nms.py
index 020cf9b5bc63..dd9d3f8a1d0e 100644
--- a/python/tvm/topi/cuda/nms.py
+++ b/python/tvm/topi/cuda/nms.py
@@ -512,26 +512,44 @@ def calculate_overlap(out_tensor, box_a_idx, box_b_idx):
 
     with ib.new_scope():
         nthread_by = batch_size
+        nthread_tx = max_threads
+
         by = te.thread_axis("blockIdx.y")
+        tx = te.thread_axis("threadIdx.x")
         ib.scope_attr(by, "thread_extent", nthread_by)
+        ib.scope_attr(tx, "thread_extent", nthread_tx)
+
         i = by
+
         base_idx = i * num_anchors * box_data_length
         num_valid_boxes_local = ib.allocate(
             "int32", (1,), name="num_valid_boxes_local", scope="local"
         )
         num_valid_boxes_local[0] = 0
+        nkeep = if_then_else(tvm.tir.all(top_k > 0, top_k < valid_count[i]), top_k, valid_count[i])
 
         def nms_inner_loop(ib, j):
+            # The box j is valid, invalidate other boxes that overlap with j above iou_threshold
+
+            # When return_indices is False, no need to populate box_indices
+            if return_indices:
+                with ib.if_scope(tx + 0 == 0):
+                    orig_idx = sorted_index[i * num_anchors + j]
+                    box_indices[i, num_valid_boxes_local[0]] = indices[i, orig_idx]
+
+            num_valid_boxes_local[0] += 1
+
             offset_j = j * box_data_length
+            num_iter_per_thread = ceil_div(nkeep - (j + 1), nthread_tx)
 
-            with ib.for_range(0, j) as k:
+            with ib.for_range(0, num_iter_per_thread) as _k:
+                k = j + 1 + _k * nthread_tx + tx
                 offset_k = k * box_data_length
 
                 with ib.if_scope(
                     tvm.tir.all(
-                        out[base_idx + offset_j + score_index] > -1.0,  # if already surpressed
-                        out[base_idx + offset_k + score_index] > 0,
-                        tvm.tir.any(id_index < 0, out[base_idx + offset_k + id_index] >= 0),
+                        k < nkeep,
+                        out[base_idx + offset_k + score_index] > 0,  # is the box k still valid?
                         tvm.tir.any(
                             force_suppress > 0,
                             id_index < 0,
@@ -546,27 +564,22 @@ def nms_inner_loop(ib, j):
                         base_idx + offset_k + coord_start,
                     )
                     with ib.if_scope(iou >= iou_threshold):
-                        out[base_idx + offset_j + score_index] = -1.0
+                        # invalidate the box k
+                        out[base_idx + offset_k + score_index] = -1.0
                         with ib.if_scope(id_index >= 0):
-                            out[base_idx + offset_j + id_index] = -1.0
+                            out[base_idx + offset_k + id_index] = -1.0
 
-            # Has the box j survived IOU tests?
-            with ib.if_scope(out[base_idx + offset_j + score_index] > -1.0):
-                # When return_indices is False, no need to populate box_indices
-                if return_indices:
-                    orig_idx = sorted_index[i * num_anchors + j]
-                    box_indices[i, num_valid_boxes_local[0]] = indices[i, orig_idx]
-                num_valid_boxes_local[0] += 1
+                # Make sure to do the next loop in a lock step
+                ib.emit(tvm.tir.Call(None, "tir.tvm_storage_sync", tvm.runtime.convert(["shared"])))
 
         if isinstance(max_output_size, int):
             max_output_size = tvm.tir.const(max_output_size)
 
         with ib.if_scope(tvm.tir.all(iou_threshold > 0, valid_count[i] > 0)):
             # Apply nms
-            with ib.for_range(0, valid_count[i]) as j:
-                with ib.if_scope(
-                    tvm.tir.any(id_index < 0, out[base_idx + j * box_data_length + id_index] >= 0)
-                ):
+            with ib.for_range(0, nkeep) as j:
+                # Proceed to the inner loop if the box j is still valid
+                with ib.if_scope(out[base_idx + (j * box_data_length) + score_index] > -1.0):
                     with ib.if_scope(max_output_size > 0):
                         # No need to do more iteration if we already reach max_output_size boxes
                         with ib.if_scope(num_valid_boxes_local[0] < max_output_size):
@@ -574,7 +587,8 @@ def nms_inner_loop(ib, j):
                     with ib.else_scope():
                         nms_inner_loop(ib, j)
 
-            num_valid_boxes[i] = num_valid_boxes_local[0]
+            with ib.if_scope(tx + 0 == 0):
+                num_valid_boxes[i] = num_valid_boxes_local[0]
 
         with ib.else_scope():
             num_valid_boxes[i] = 0

From 6a4c51e187b2cea0ec14996580c89bb2f7176be0 Mon Sep 17 00:00:00 2001
From: Cody Yu <comaniac0422@gmail.com>
Date: Wed, 30 Dec 2020 03:00:37 -0800
Subject: [PATCH 007/357] [AutoScheduler] Use VM to extract tasks for dynamic
 models (#7173)

* use VM for dynamic shape

* make it work

* add test

* finalize

* finalize

* format

* address comment

* comment

* improve task extraction
---
 .../tvm/auto_scheduler/relay_integration.py   | 58 ++++++++++----
 python/tvm/auto_scheduler/utils.py            | 13 ++-
 src/auto_scheduler/compute_dag.cc             |  2 +-
 src/relay/backend/compile_engine.cc           |  3 +-
 .../test_auto_scheduler_task_extraction.py    | 80 +++++++++++--------
 5 files changed, 103 insertions(+), 53 deletions(-)

diff --git a/python/tvm/auto_scheduler/relay_integration.py b/python/tvm/auto_scheduler/relay_integration.py
index 3287f3d4a1e5..eecf88bac9d8 100644
--- a/python/tvm/auto_scheduler/relay_integration.py
+++ b/python/tvm/auto_scheduler/relay_integration.py
@@ -22,8 +22,8 @@
 2. Provide auto-scheduling for all TOPI compute functions
 """
 
-import logging
 import json
+import logging
 import threading
 
 import tvm
@@ -31,11 +31,14 @@
 from tvm.ir.transform import PassContext
 from tvm.runtime import convert_to_object
 from tvm.te.tensor import ComputeOp, PlaceholderOp, Tensor
+from tvm.tir import Reduce
 from tvm.tir import expr as _expr
+
 from . import _ffi_api
 from .compute_dag import ComputeDAG, LayoutRewriteOption
 from .dispatcher import DispatchContext
 from .search_task import SearchTask
+from .utils import get_const_tuple
 from .workload_registry import register_workload_tensors
 
 logger = logging.getLogger("auto_scheduler")
@@ -56,9 +59,20 @@ def call_all_topi_funcs(mod, params, target):
         config={"relay.backend.use_auto_scheduler": True},
         disabled_pass={"AutoSchedulerLayoutRewrite"},
     ):
-        opt_mod, _ = relay.optimize(mod, target, params)
-        grc = graph_runtime_codegen.GraphRuntimeCodegen(None, target)
-        grc.codegen(opt_mod["main"])
+        try:
+            opt_mod, _ = relay.optimize(mod, target, params)
+            grc = graph_runtime_codegen.GraphRuntimeCodegen(None, target)
+            grc.codegen(opt_mod["main"])
+        except tvm.TVMError:
+            print(
+                "Get errors with GraphRuntimeCodegen for task extraction. "
+                "Fallback to VMCompiler."
+            )
+            compiler = relay.vm.VMCompiler()
+            if params:
+                compiler.set_params(params)
+            mod = tvm.IRModule.from_expr(mod) if isinstance(mod, relay.Function) else mod
+            compiler.lower(mod, target)
 
     autotvm.GLOBAL_SCOPE.silent = old_autotvm_silent
 
@@ -192,7 +206,8 @@ def exit_layout_rewrite():
 
 
 def traverse_to_get_io_tensors(outs):
-    """Traverse from a list of output tensors to get both input and output tensors
+    """Traverse from a list of output tensors to get input/output tensors and
+    other useful information.
 
     Parameters
     ----------
@@ -202,36 +217,50 @@ def traverse_to_get_io_tensors(outs):
     Returns
     -------
     io_tensors: List[Tensor]
-        The input and output tensors
+        The input and output tensors with static shape
     has_layout_free: bool
         Whether the compute DAG has layout_free placeholders
+    has_complex_op: bool
+        Whether the topi compute function includes at least one complex (reduce) op
     """
     layout_free_ops = []
     inputs = []
 
+    has_complex_op = False
     visited = set()
 
     def traverse(t):
-        if t in visited:
+        nonlocal has_complex_op
+
+        # We cannot directly add tensors to the set, because the comparison of
+        # two tensors with ndim=0 is ambiguous.
+        assert t.handle is not None
+        if t.handle.value in visited:
             return
         if isinstance(t.op, PlaceholderOp):
             inputs.append(t)
         elif isinstance(t.op, ComputeOp):
+            has_complex_op = has_complex_op or any([isinstance(e, Reduce) for e in t.op.body])
             if "layout_free_placeholders" in t.op.attrs:
                 layout_free_ops.append(t.op)
             for x in t.op.input_tensors:
                 traverse(x)
-        visited.add(t)
+        visited.add(t.handle.value)
 
     for t in outs:
         traverse(t)
 
-    has_layout_free = len(layout_free_ops) > 0
-    return inputs + list(outs), has_layout_free
+    io_tensors = inputs + list(outs)
+    for tensor in io_tensors:
+        # Reject the compute if any of its I/O tensors has dynamic shape.
+        if any([not isinstance(v, int) for v in get_const_tuple(tensor.shape)]):
+            return ([], False, False)
+
+    return (io_tensors, len(layout_free_ops) > 0, has_complex_op)
 
 
 @tvm._ffi.register_func("auto_scheduler.relay_integration.auto_schedule_topi_compute")
-def auto_schedule_topi(outs, has_complex_op):
+def auto_schedule_topi(outs):
     """Use auto-scheduler to schedule any topi compute function.
 
     Note: This is used internally for relay integration. Do
@@ -241,8 +270,6 @@ def auto_schedule_topi(outs, has_complex_op):
     ----------
     outs: List[Tensor]
         The output tensors of topi compute functions
-    has_complex_op: bool
-        Whether the topi compute function includes at least one complex op.
 
     Returns
     -------
@@ -253,7 +280,10 @@ def auto_schedule_topi(outs, has_complex_op):
     # pylint: disable=import-outside-toplevel
     from tvm import relay
 
-    io_tensors, has_layout_free = traverse_to_get_io_tensors(outs)
+    io_tensors, has_layout_free, has_complex_op = traverse_to_get_io_tensors(outs)
+    if not io_tensors:  # The compute includes dynamic shapes which are not supported yet.
+        return None
+
     try:
         dag = ComputeDAG(io_tensors)
     except tvm.error.TVMError as err:
diff --git a/python/tvm/auto_scheduler/utils.py b/python/tvm/auto_scheduler/utils.py
index f3698fa7fd6a..334acaf02238 100644
--- a/python/tvm/auto_scheduler/utils.py
+++ b/python/tvm/auto_scheduler/utils.py
@@ -34,6 +34,7 @@
 except ImportError:
     psutil = None
 
+import tvm
 from tvm import rpc
 from tvm.tir import expr
 from tvm.tir.transform import Simplify
@@ -90,10 +91,16 @@ def get_const_tuple(in_tuple):
 
     Returns
     -------
-    out_tuple : Tuple[int]
-        The output.
+    out_tuple : Tuple[Union[int,tvm.tir.Var,tvm.tir.Any]]
+        The output tuple of int. The dynamic shape variables (Var or Any) will be preserved.
     """
-    return tuple(get_const_int(x) for x in in_tuple)
+    ret = []
+    for elem in in_tuple:
+        if isinstance(elem, (tvm.tir.Var, tvm.tir.expr.Any)):
+            ret.append(elem)
+        else:
+            ret.append(get_const_int(elem))
+    return tuple(ret)
 
 
 def list_to_tuple(x):
diff --git a/src/auto_scheduler/compute_dag.cc b/src/auto_scheduler/compute_dag.cc
index b65878225f5a..6ce7349c2e61 100755
--- a/src/auto_scheduler/compute_dag.cc
+++ b/src/auto_scheduler/compute_dag.cc
@@ -1409,7 +1409,7 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable)
                    << select->false_value << ")= " << '(' << preduce->source[0] << ','
                    << preduce->source[1] << ")\n";
               } else {
-                LOG(FATAL) << "Unsupported reduction operator" << combiner;
+                ss << "reduce" << combiner << "\n";
               }
             } else {
               ss << " = " << pop->body[k] << "\n";
diff --git a/src/relay/backend/compile_engine.cc b/src/relay/backend/compile_engine.cc
index 98d913662953..789f39de22d1 100644
--- a/src/relay/backend/compile_engine.cc
+++ b/src/relay/backend/compile_engine.cc
@@ -157,8 +157,7 @@ class ScheduleGetter : public backend::MemoizedExprTranslator<Array<te::Tensor>>
             runtime::Registry::Get("auto_scheduler.relay_integration.auto_schedule_topi_compute");
         ICHECK(fauto_schedule != nullptr)
             << "auto_scheduler.relay_integration.auto_schedule_topi_compute is not registered";
-        bool has_complex_op = anchor_op_pattern_ >= kCommReduce;
-        ObjectRef obj = (*fauto_schedule)(tensor_outs, has_complex_op);
+        ObjectRef obj = (*fauto_schedule)(tensor_outs);
         if (obj.defined()) {
           schedule = Downcast<te::Schedule>(obj);
         }
diff --git a/tests/python/relay/test_auto_scheduler_task_extraction.py b/tests/python/relay/test_auto_scheduler_task_extraction.py
index 531d0412c97d..cfbca40cf379 100644
--- a/tests/python/relay/test_auto_scheduler_task_extraction.py
+++ b/tests/python/relay/test_auto_scheduler_task_extraction.py
@@ -132,6 +132,15 @@ def test_task_extraction():
     dtype = "float32"
     target = tvm.target.Target("llvm")
 
+    def verify_task_extraction(func, expected_task, include_simple_tasks=False):
+        mod = tvm.IRModule.from_expr(func)
+        tasks, task_weights = auto_scheduler.extract_tasks(
+            mod["main"], None, target, include_simple_tasks=include_simple_tasks
+        )
+
+        assert len(tasks) == expected_task
+        assert len(task_weights) == expected_task
+
     def get_func():
         data = relay.var("data", shape=(ishape), dtype=dtype)
         weight1 = relay.var("weight1", shape=(w1shape), dtype=dtype)
@@ -161,6 +170,29 @@ def get_simple_func():
         out = relay.image.affine_grid(data, (150, 150))
         return relay.Function([data], out)
 
+    def get_shape_of_func():
+        data = relay.var("data", shape=(relay.Any(), 28, 28), dtype="float32")
+        out = relay.shape_of(data)
+        return relay.Function([data], out)
+
+    def get_func_with_dynamic_shape():
+        data = relay.var("data", shape=(relay.Any(), 32), dtype="float32")
+        out = relay.max(data)
+        return relay.Function(relay.analysis.free_vars(out), out)
+
+    def get_func_with_control_flow():
+        data = relay.var("data", shape=(1, 3, 224, 224))
+        weight = relay.var("weight", shape=(32, 3, 3, 3))
+        eq1 = relay.var("e1", shape=[], dtype="float32")
+        eq2 = relay.var("e2", shape=[], dtype="float32")
+        eq = relay.equal(eq1, eq2)
+
+        true_branch = relay.zeros(shape=(1, 32, 222, 222), dtype="float32")
+        false_branch = relay.nn.conv2d(data, weight, kernel_size=(3, 3), channels=32)
+        ife = relay.If(eq, true_branch, false_branch)
+        out = relay.erf(ife)
+        return relay.Function([data, weight, eq1, eq2], out)
+
     def get_func_with_unsupported_op():
         def get_postproc_func():
             data = relay.var("data", shape=((1, 3, 6)), dtype=dtype)
@@ -180,48 +212,30 @@ def get_postproc_func():
         out = relay.Call(get_postproc_func(), [nms])
         return relay.Function([cls_prob, loc_pred, anchors], out)
 
-    func = get_func()
-    mod = tvm.IRModule.from_expr(func)
-    tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], None, target)
-
     # Relay FuseOps puts two conv2ds to separate functions and results in two tasks.
-    assert len(tasks) == 2
-    assert len(task_weights) == 2
-
-    func = get_fused_func()
-    mod = tvm.IRModule.from_expr(func)
-    tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], None, target)
+    verify_task_extraction(get_func(), 2)
 
     # By setting the function to primitive, Relay FuseOps will not break it and result in one task.
-    assert len(tasks) == 1
-    assert len(task_weights) == 1
-
-    func = get_simple_func()
-    mod = tvm.IRModule.from_expr(func)
-    tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], None, target)
+    verify_task_extraction(get_fused_func(), 1)
 
     # The Relay function without complex ops will not form a task by default.
-    assert len(tasks) == 0
-    assert len(task_weights) == 0
-
-    tasks, task_weights = auto_scheduler.extract_tasks(
-        mod["main"], None, target, include_simple_tasks=True
-    )
+    verify_task_extraction(get_simple_func(), 0)
 
     # Every Relay function becomes a task regardless what ops in its body.
-    assert len(tasks) == 1
-    assert len(task_weights) == 1
+    verify_task_extraction(get_simple_func(), 1, True)
 
-    # Func1 (with NMS) -> Func2 (injective).
-    func = get_func_with_unsupported_op()
-    mod = tvm.IRModule.from_expr(func)
-    tasks, task_weights = auto_scheduler.extract_tasks(
-        mod["main"], None, target, include_simple_tasks=True
-    )
+    # The Relay function without any reduce op is considered as a simple task.
+    verify_task_extraction(get_shape_of_func(), 0)
+    verify_task_extraction(get_shape_of_func(), 1, True)
 
-    # The function with NMS should fail, but the other function with ReLU should be a task.
-    assert len(tasks) == 1
-    assert len(task_weights) == 1
+    # The Relay function with dynamic shape inputs/outputs will not be extracted.
+    verify_task_extraction(get_func_with_dynamic_shape(), 0)
+
+    # The Conv2D in the Relay function with control flow could still be a task.
+    verify_task_extraction(get_func_with_control_flow(), 1)
+
+    # Func1 (with NMS) -> Func2 (injective).
+    verify_task_extraction(get_func_with_unsupported_op(), 1, True)
 
 
 if __name__ == "__main__":

From f904d4fe95b16044a8d46edeea0b7b7792e0ef3c Mon Sep 17 00:00:00 2001
From: Lianmin Zheng <lianminzheng@gmail.com>
Date: Wed, 30 Dec 2020 04:18:41 -0800
Subject: [PATCH 008/357] [AutoScheduler] Fix policy for zero-rank output
 (#7180)

---
 .../search_policy/sketch_policy.cc            | 15 ++--
 .../search_policy/sketch_policy.h             |  7 ++
 src/auto_scheduler/search_policy/utils.h      |  7 +-
 src/auto_scheduler/transform_step.cc          | 35 ++++++--
 .../unittest/test_auto_scheduler_common.py    | 17 ++++
 .../test_auto_scheduler_search_policy.py      | 83 ++++++++++++++-----
 .../test_auto_scheduler_sketch_generation.py  | 16 ++++
 7 files changed, 136 insertions(+), 44 deletions(-)

diff --git a/src/auto_scheduler/search_policy/sketch_policy.cc b/src/auto_scheduler/search_policy/sketch_policy.cc
index e2678373ef8b..1e20b0fff6ea 100644
--- a/src/auto_scheduler/search_policy/sketch_policy.cc
+++ b/src/auto_scheduler/search_policy/sketch_policy.cc
@@ -78,6 +78,8 @@ SketchPolicy::SketchPolicy(SearchTask task, CostModel program_cost_model,
   node->rand_gen = std::mt19937(seed);
   node->params = std::move(params);
   node->verbose = verbose;
+  node->sample_init_min_pop_ =
+      GetIntParam(node->params, SketchParamKey::SampleInitPopulation::min_population);
 
   if (init_search_callbacks) {
     PrintTitle("Call init-search callbacks", verbose);
@@ -382,8 +384,6 @@ Array<State> SketchPolicyNode::GenerateSketches() {
 Array<State> SketchPolicyNode::SampleInitPopulation(const Array<State>& sketches) {
   // Use this population as the parallel degree to do sampling
   int population = GetIntParam(params, SketchParamKey::EvolutionarySearch::population);
-  // At least we should sample this number of valid programs
-  int min_population = GetIntParam(params, SketchParamKey::SampleInitPopulation::min_population);
 
   auto tic_begin = std::chrono::high_resolution_clock::now();
 
@@ -397,9 +397,8 @@ Array<State> SketchPolicyNode::SampleInitPopulation(const Array<State>& sketches
 
   std::unordered_set<std::string> explored_state_strs;
   size_t iter = 1;
-  size_t target_size = min_population;
   size_t unchange_cnt = 0;
-  while (out_states.size() < target_size) {
+  while (static_cast<int>(out_states.size()) < sample_init_min_pop_) {
     std::vector<State> temp_states(population);
 
     // Sample a batch of states randomly
@@ -458,7 +457,7 @@ Array<State> SketchPolicyNode::SampleInitPopulation(const Array<State>& sketches
                             std::chrono::high_resolution_clock::now() - tic_begin)
                             .count();
       StdCout(verbose) << "Sample Iter: " << iter << std::fixed << std::setprecision(4)
-                       << "\t#Pop: " << out_states.size() << "\t#Target: " << target_size
+                       << "\t#Pop: " << out_states.size() << "\t#Target: " << sample_init_min_pop_
                        << "\tfail_ct: " << fail_ct << "\tTime elapsed: " << std::fixed
                        << std::setprecision(2) << duration << std::endl;
     }
@@ -466,9 +465,9 @@ Array<State> SketchPolicyNode::SampleInitPopulation(const Array<State>& sketches
     if (unchange_cnt == 5) {
       // Reduce the target size to avoid too-long time in this phase if no valid state was found
       // in the past iterations
-      if (target_size > 1) {
-        target_size /= 2;
-        StdCout(verbose) << "#Target has been reduced to " << target_size
+      if (sample_init_min_pop_ > 1) {
+        sample_init_min_pop_ /= 2;
+        StdCout(verbose) << "#Target has been reduced to " << sample_init_min_pop_
                          << " due to too many failures or duplications" << std::endl;
       }
       unchange_cnt = 0;
diff --git a/src/auto_scheduler/search_policy/sketch_policy.h b/src/auto_scheduler/search_policy/sketch_policy.h
index 3d135d1bda94..488634902a87 100644
--- a/src/auto_scheduler/search_policy/sketch_policy.h
+++ b/src/auto_scheduler/search_policy/sketch_policy.h
@@ -87,6 +87,8 @@ struct SketchParamKey {
   static constexpr const char* disable_change_compute_location = "disable_change_compute_location";
 };
 
+class SketchPolicy;
+
 /*!
  * \brief The search policy that searches in a hierarchical search space defined by sketches.
  * The policy randomly samples programs from the space defined by sketches
@@ -166,6 +168,11 @@ class SketchPolicyNode : public SearchPolicyNode {
 
   /*! \brief The cached sketches */
   Array<State> sketch_cache_;
+
+  /*! \brief The minimul output population of SampleInitPopulation */
+  int sample_init_min_pop_;
+
+  friend class SketchPolicy;
 };
 
 /*!
diff --git a/src/auto_scheduler/search_policy/utils.h b/src/auto_scheduler/search_policy/utils.h
index d59a6ca220ca..eb2cd69c9209 100644
--- a/src/auto_scheduler/search_policy/utils.h
+++ b/src/auto_scheduler/search_policy/utils.h
@@ -609,12 +609,11 @@ inline State FuseAllOuterSpaceIterators(const State& state, int stage_id, Iterat
     to_fuse.push_back(it);
   }
 
-  ICHECK(!to_fuse.empty());
   State tmp_s = state;
-  if (to_fuse.size() > 1) {
-    *fused_iter = tmp_s.fuse(stage_id, to_fuse);
-  } else {
+  if (to_fuse.size() == 1) {
     *fused_iter = to_fuse[0];
+  } else {
+    *fused_iter = tmp_s.fuse(stage_id, to_fuse);
   }
   return tmp_s;
 }
diff --git a/src/auto_scheduler/transform_step.cc b/src/auto_scheduler/transform_step.cc
index 5560907dcffa..5ba3eee07098 100755
--- a/src/auto_scheduler/transform_step.cc
+++ b/src/auto_scheduler/transform_step.cc
@@ -538,15 +538,25 @@ Iterator FuseStepNode::ApplyToState(State* state) const {
   Iterator new_it =
       Iterator(new_name, range, new_iter_kind, IteratorAnnotation::kNone, &orig_iters);
   Array<Iterator> new_iters;
-  new_iters.insert(new_iters.end(), stage->iters.begin(), stage->iters.begin() + fused_ids.front());
-  new_iters.push_back(new_it);
-  new_iters.insert(new_iters.end(), stage->iters.begin() + fused_ids.back() + 1,
-                   stage->iters.end());
+
+  if (fused_ids.empty()) {
+    new_iters.push_back(new_it);
+  } else {
+    new_iters.insert(new_iters.end(), stage->iters.begin(),
+                     stage->iters.begin() + fused_ids.front());
+    new_iters.push_back(new_it);
+    new_iters.insert(new_iters.end(), stage->iters.begin() + fused_ids.back() + 1,
+                     stage->iters.end());
+  }
 
   StateNode* pstate = state->CopyOnWrite();
   pstate->stages.Set(stage_id,
                      Stage(stage->op, stage->op_type, new_iters, stage->compute_at, stage->attrs));
 
+  if (fused_ids.empty()) {
+    return new_it;
+  }
+
   // Two vectors are used to represent the iterator relation before and after fuse
   // The original iterators in AttachMap will be updated with the new iterators
   std::vector<IterKey> from_iters;
@@ -583,9 +593,13 @@ IterVar FuseStepNode::ApplyToSchedule(Array<te::Stage>* stages,
   stage.fuse(to_fuse, &fused_axis);
 
   Array<IterVar> new_axes;
-  new_axes.insert(new_axes.end(), axes.begin(), axes.begin() + fused_ids.front());
-  new_axes.push_back(fused_axis);
-  new_axes.insert(new_axes.end(), axes.begin() + fused_ids.back() + 1, axes.end());
+  if (fused_ids.empty()) {
+    new_axes.push_back(fused_axis);
+  } else {
+    new_axes.insert(new_axes.end(), axes.begin(), axes.begin() + fused_ids.front());
+    new_axes.push_back(fused_axis);
+    new_axes.insert(new_axes.end(), axes.begin() + fused_ids.back() + 1, axes.end());
+  }
 
   stage_to_axes->Set(stage, std::move(new_axes));
   stages->Set(stage_id, std::move(stage));
@@ -683,9 +697,12 @@ void PragmaStepNode::ApplyToSchedule(Array<te::Stage>* stages,
     }
     ICHECK_LT(pos, pragma_type.size()) << "max step value not found.";
     int value = atoi(pragma_type.c_str() + pos + 1);
-    stage.pragma(axes[iter_id], "auto_unroll_max_step", value);
-    stage.pragma(axes[iter_id], "unroll_explicit", true);
+    if (iter_id < static_cast<int>(axes.size())) {
+      stage.pragma(axes[iter_id], "auto_unroll_max_step", value);
+      stage.pragma(axes[iter_id], "unroll_explicit", true);
+    }
   } else {
+    ICHECK_LT(iter_id, axes.size());
     stage.pragma(axes[iter_id], pragma_type);
   }
   stages->Set(stage_id, std::move(stage));
diff --git a/tests/python/unittest/test_auto_scheduler_common.py b/tests/python/unittest/test_auto_scheduler_common.py
index a037b680e2e1..2f9423104a68 100644
--- a/tests/python/unittest/test_auto_scheduler_common.py
+++ b/tests/python/unittest/test_auto_scheduler_common.py
@@ -145,6 +145,23 @@ def invalid_compute_definition():
     return [A, B]
 
 
+@auto_scheduler.register_workload
+def zero_rank_reduce_auto_scheduler_test(N):
+    A = tvm.te.placeholder((N,), name="A")
+    k = tvm.te.reduce_axis((0, N), name="k")
+    B = tvm.te.compute((), lambda: tvm.te.sum(A[k], k), name="B")
+
+    return [A, B]
+
+
+@auto_scheduler.register_workload
+def zero_rank_compute_auto_scheduler_test(N):
+    A = tvm.te.placeholder((N,), name="A")
+    B = tvm.te.compute((), lambda: A[0], name="B")
+
+    return [A, B]
+
+
 @auto_scheduler.register_workload
 def conv2d_winograd_nhwc_auto_scheduler_test(
     N, H, W, CI, CO, kernel_size=3, stride=1, padding=0, dilation=1
diff --git a/tests/python/unittest/test_auto_scheduler_search_policy.py b/tests/python/unittest/test_auto_scheduler_search_policy.py
index 73ce0a1685bf..c96dc63fec29 100644
--- a/tests/python/unittest/test_auto_scheduler_search_policy.py
+++ b/tests/python/unittest/test_auto_scheduler_search_policy.py
@@ -25,8 +25,13 @@
 import tvm
 import tvm.testing
 from tvm import auto_scheduler
+from tvm.auto_scheduler.utils import get_const_tuple
 
-from test_auto_scheduler_common import matmul_auto_scheduler_test
+from test_auto_scheduler_common import (
+    matmul_auto_scheduler_test,
+    zero_rank_compute_auto_scheduler_test,
+    zero_rank_reduce_auto_scheduler_test,
+)
 import multiprocessing
 
 
@@ -41,21 +46,21 @@ def callback(self, policy, inputs, results):
 
 
 def search_common(
-    workload=matmul_auto_scheduler_test,
+    task=None,
     target="llvm",
     search_policy="sketch",
-    seed=0,
     runner="local",
     num_measure_trials=100,
     cost_model=auto_scheduler.RandomModel(),
     init_search_callbacks=None,
 ):
-    print("Test search policy '%s' for '%s'" % (search_policy, target))
+    if task is None:
+        task = auto_scheduler.SearchTask(
+            func=matmul_auto_scheduler_test, args=(64, 64, 64), target=target
+        )
+    target = task.target
 
-    random.seed(seed)
-    N = 128
-    target = tvm.target.Target(target)
-    task = auto_scheduler.SearchTask(func=workload, args=(N, N, N), target=target)
+    print("Test search policy '%s' for '%s'" % (search_policy, target))
 
     with tempfile.NamedTemporaryFile() as fp:
         log_file = fp.name
@@ -72,6 +77,7 @@ def search_common(
         else:
             raise ValueError("Invalid policy: " + search_policy)
 
+        # Tune
         tuning_options = auto_scheduler.TuningOptions(
             num_measure_trials=num_measure_trials,
             num_measures_per_round=2,
@@ -80,33 +86,47 @@ def search_common(
             measure_callbacks=[auto_scheduler.RecordToFile(log_file), CustomMeasureCallback()],
         )
         task.tune(tuning_options=tuning_options, search_policy=search_policy)
+
+        # Compile with the best schedule
         sch, args = task.apply_best(log_file)
+        mod = tvm.build(sch, args, target)
 
-        try:
-            mod = tvm.build(sch, args, target)
+        # Compile with naive schedule for correctness check
+        sch, args = task.compute_dag.apply_steps_from_state(task.compute_dag.init_state)
+        mod_ref = tvm.build(sch, args, "llvm")
 
-            ctx = tvm.context(str(target), 0)
-            dtype = task.compute_dag.tensors[0].dtype
-            a = tvm.nd.array(np.random.uniform(size=(N, N)).astype(dtype), ctx)
-            b = tvm.nd.array(np.random.uniform(size=(N, N)).astype(dtype), ctx)
-            c = tvm.nd.array(np.zeros((N, N), dtype=dtype), ctx)
-            mod(a, b, c)
-            tvm.testing.assert_allclose(c.asnumpy(), np.dot(a.asnumpy(), b.asnumpy()), rtol=1e-5)
-        except Exception:
-            raise Exception("Error encountered with seed: %d" % (seed))
+        ctx = tvm.context(str(target), 0)
+        np_arrays = [np.random.uniform(size=get_const_tuple(x.shape)).astype(x.dtype) for x in args]
+
+        tvm_arrays = [tvm.nd.array(x, ctx) for x in np_arrays]
+        mod(*tvm_arrays)
+        actual = [x.asnumpy() for x in tvm_arrays]
+
+        tvm_arrays = [tvm.nd.array(x) for x in np_arrays]
+        mod_ref(*tvm_arrays)
+        expected = [x.asnumpy() for x in tvm_arrays]
+
+        for x, y in zip(actual, expected):
+            tvm.testing.assert_allclose(x, y, rtol=1e-5)
 
 
 @tvm.testing.requires_llvm
-def test_workload_registry_search_basic():
+def test_workload_registry_empty_policy():
     search_common(search_policy="empty", num_measure_trials=2)
 
+    N = 64
+    target = "llvm"
     search_common(
-        workload="matmul_auto_scheduler_test",
+        task=auto_scheduler.SearchTask(
+            func="matmul_auto_scheduler_test", args=(N, N, N), target=target
+        ),
         num_measure_trials=2,
         search_policy="empty",
     )
     search_common(
-        workload="matmul_auto_scheduler_test_rename_1",
+        task=auto_scheduler.SearchTask(
+            func="matmul_auto_scheduler_test_rename_1", args=(N, N, N), target=target
+        ),
         num_measure_trials=2,
         search_policy="empty",
     )
@@ -147,10 +167,27 @@ def test_sketch_search_policy_cuda_xgbmodel_rpc_runner():
     search_common(target="cuda", runner=measure_ctx.runner, cost_model=auto_scheduler.XGBModel())
 
 
+@tvm.testing.requires_llvm
+@tvm.testing.requires_cuda
+def test_sketch_search_policy_zero_rank():
+    measure_ctx = auto_scheduler.LocalRPCMeasureContext()
+    for target in ["llvm", "cuda"]:
+        task = auto_scheduler.SearchTask(
+            func=zero_rank_compute_auto_scheduler_test, args=(10,), target=target
+        )
+        search_common(task, runner=measure_ctx.runner)
+
+        task = auto_scheduler.SearchTask(
+            func=zero_rank_reduce_auto_scheduler_test, args=(10,), target=target
+        )
+        search_common(task, runner=measure_ctx.runner)
+
+
 if __name__ == "__main__":
-    test_workload_registry_search_basic()
+    test_workload_registry_empty_policy()
     test_sketch_search_policy_basic()
     test_sketch_search_policy_basic_spawn()
     test_sketch_search_policy_xgbmodel()
     test_sketch_search_policy_cuda_rpc_runner()
     test_sketch_search_policy_cuda_xgbmodel_rpc_runner()
+    test_sketch_search_policy_zero_rank()
diff --git a/tests/python/unittest/test_auto_scheduler_sketch_generation.py b/tests/python/unittest/test_auto_scheduler_sketch_generation.py
index 74d5729e4887..ddff6dd1a8d6 100644
--- a/tests/python/unittest/test_auto_scheduler_sketch_generation.py
+++ b/tests/python/unittest/test_auto_scheduler_sketch_generation.py
@@ -32,6 +32,7 @@
     softmax_nm_auto_scheduler_test,
     softmax_abcd_auto_scheduler_test,
     conv2d_winograd_nhwc_auto_scheduler_test,
+    zero_rank_reduce_auto_scheduler_test,
 )
 
 
@@ -252,6 +253,12 @@ def test_cpu_conv2d_winograd_sketch():
     assert sketches[1] != sketches[2]
 
 
+def test_cpu_zero_rank_sketch():
+    sketches = generate_sketches(zero_rank_reduce_auto_scheduler_test, (128,), "llvm")
+    """ 2 rfactor sketches + 1 multi-level tiling sketches """
+    assert len(sketches) == 3
+
+
 @tvm.testing.requires_cuda
 def test_cuda_matmul_sketch():
     sketches = generate_sketches(matmul_auto_scheduler_test, (512, 512, 512), "cuda")
@@ -385,6 +392,13 @@ def test_cuda_conv2d_winograd_sketch():
     assert_is_not_tiled(sketches[0].stages[12])
 
 
+@tvm.testing.requires_cuda
+def test_cuda_zero_rank_sketch():
+    sketches = generate_sketches(zero_rank_reduce_auto_scheduler_test, (128,), "cuda")
+    """ 1 cross thread reuction sketch + 1 multi-level tiling sketch """
+    assert len(sketches) == 2
+
+
 if __name__ == "__main__":
     test_cpu_matmul_sketch()
     test_cpu_conv2d_bn_relu_sketch()
@@ -392,9 +406,11 @@ def test_cuda_conv2d_winograd_sketch():
     test_cpu_min_sketch()
     test_cpu_softmax_sketch()
     test_cpu_conv2d_winograd_sketch()
+    test_cpu_zero_rank_sketch()
     test_cuda_matmul_sketch()
     test_cuda_conv2d_bn_relu_sketch()
     test_cuda_max_pool2d_sketch()
     test_cuda_min_sketch()
     test_cuda_softmax_sketch()
     test_cuda_conv2d_winograd_sketch()
+    test_cuda_zero_rank_sketch()

From 712b4a553fb417b743407be9194c0c4a545978d9 Mon Sep 17 00:00:00 2001
From: leowang1225 <810916296@qq.com>
Date: Wed, 30 Dec 2020 20:32:43 +0800
Subject: [PATCH 009/357] [Auto Scheduler][fix] Add dense strategy for mali
 (#7181)

Signed-off-by: leowang1225 <810916296@qq.com>
---
 python/tvm/relay/op/strategy/mali.py | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/python/tvm/relay/op/strategy/mali.py b/python/tvm/relay/op/strategy/mali.py
index c4cb4a135e8e..fc47bd65a8f7 100644
--- a/python/tvm/relay/op/strategy/mali.py
+++ b/python/tvm/relay/op/strategy/mali.py
@@ -171,9 +171,16 @@ def conv2d_winograd_without_weight_transfrom_strategy_mali(attrs, inputs, out_ty
 def dense_strategy_mali(attrs, inputs, out_type, target):
     """dense mali strategy"""
     strategy = _op.OpStrategy()
-    strategy.add_implementation(
-        wrap_compute_dense(topi.mali.dense),
-        wrap_topi_schedule(topi.mali.schedule_dense),
-        name="dense.mali",
-    )
+    if not is_auto_scheduler_enabled():
+        strategy.add_implementation(
+            wrap_compute_dense(topi.mali.dense),
+            wrap_topi_schedule(topi.mali.schedule_dense),
+            name="dense.mali",
+        )
+    else:
+        strategy.add_implementation(
+            wrap_compute_dense(topi.nn.dense, need_auto_scheduler_layout=True),
+            naive_schedule,
+            name="dense.mali",
+        )
     return strategy

From c6b766a4cea4e59384c2606deecdc5321ac3d41c Mon Sep 17 00:00:00 2001
From: Josh Fromm <jwfromm@uw.edu>
Date: Wed, 30 Dec 2020 17:29:06 -0800
Subject: [PATCH 010/357] [Relay][Op] Remove reverse attribute from reshape and
 reverse_reshape operators. (#7086)

---
 include/tvm/relay/attrs/transform.h           |  4 -
 src/relay/op/dyn/tensor/transform.cc          |  1 -
 src/relay/op/tensor/transform.cc              | 76 ++++++++++++++-----
 src/relay/op/tensor/transform.h               |  2 +-
 .../test_arm_compute_lib/test_reshape.py      |  1 -
 5 files changed, 59 insertions(+), 25 deletions(-)

diff --git a/include/tvm/relay/attrs/transform.h b/include/tvm/relay/attrs/transform.h
index cbe989f93558..efa44e026c51 100644
--- a/include/tvm/relay/attrs/transform.h
+++ b/include/tvm/relay/attrs/transform.h
@@ -83,13 +83,9 @@ struct TransposeAttrs : public tvm::AttrsNode<TransposeAttrs> {
 /*! \brief Attributes used in reshape operators */
 struct ReshapeAttrs : public tvm::AttrsNode<ReshapeAttrs> {
   Array<Integer> newshape;
-  bool reverse;
   TVM_DECLARE_ATTRS(ReshapeAttrs, "relay.attrs.ReshapeAttrs") {
     TVM_ATTR_FIELD(newshape).describe(
         "The new shape. Should be compatible with the original shape.");
-    TVM_ATTR_FIELD(reverse)
-        .describe("Infer the special values from right to left if true")
-        .set_default(false);
   }
 };  // struct ReshapeAttrs
 
diff --git a/src/relay/op/dyn/tensor/transform.cc b/src/relay/op/dyn/tensor/transform.cc
index 815f24b6bda9..e4e81e3612fb 100644
--- a/src/relay/op/dyn/tensor/transform.cc
+++ b/src/relay/op/dyn/tensor/transform.cc
@@ -90,7 +90,6 @@ Array<te::Tensor> ReshapeCompute(const Attrs& attrs, const Array<te::Tensor>& in
 
 Expr MakeReshape(Expr data, Expr newshape) {
   auto attrs = make_object<ReshapeAttrs>();
-  attrs->reverse = false;
   static const Op& op = Op::Get("dyn.reshape");
   return Call(op, {data, newshape}, Attrs(attrs), {});
 }
diff --git a/src/relay/op/tensor/transform.cc b/src/relay/op/tensor/transform.cc
index 6819ea93f249..19ca6129ecbe 100644
--- a/src/relay/op/tensor/transform.cc
+++ b/src/relay/op/tensor/transform.cc
@@ -455,13 +455,14 @@ RELAY_REGISTER_OP("transpose")
 TVM_REGISTER_NODE_TYPE(ReshapeAttrs);
 TVM_REGISTER_NODE_TYPE(ReshapeLikeAttrs);
 
-Array<IndexExpr> infer_newshape(const Array<IndexExpr>& data_shape, const Attrs& attrs) {
+Array<IndexExpr> InferNewShape(const Array<IndexExpr>& data_shape, const Attrs& attrs,
+                               bool reverse) {
   const auto* param = attrs.as<ReshapeAttrs>();
   Array<IndexExpr> oshape;
   Array<IndexExpr> ishape;
   Array<Integer> newshape;
 
-  if (param->reverse) {
+  if (reverse) {
     ishape.Assign(data_shape.rbegin(), data_shape.rend());
     newshape.Assign(param->newshape.rbegin(), param->newshape.rend());
   } else {
@@ -584,7 +585,6 @@ Array<IndexExpr> infer_newshape(const Array<IndexExpr>& data_shape, const Attrs&
 
 bool ReshapeRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
                 const TypeReporter& reporter) {
-  const auto* param = attrs.as<ReshapeAttrs>();
   // types: [data, result]
   ICHECK_EQ(types.size(), 2);
   const auto* data = types[0].as<TensorTypeNode>();
@@ -594,16 +594,12 @@ bool ReshapeRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
     return false;
   }
 
-  const auto& oshape = infer_newshape(data->shape, attrs);
+  const auto& oshape = InferNewShape(data->shape, attrs, false);
 
   // Verify that the sum of dimensions in the output shape is the sum of
   // dimensions in the input shape
   Array<IndexExpr> data_shape;
-  if (param->reverse) {
-    data_shape.Assign(data->shape.rbegin(), data->shape.rend());
-  } else {
-    data_shape = data->shape;
-  }
+  data_shape = data->shape;
 
   bool found_dynamic = false;
   int64_t oshape_sum = 1;
@@ -633,12 +629,58 @@ bool ReshapeRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
         << "Input tensor shape and reshaped shape are not compatible";
   }
 
-  if (param->reverse) {
-    reporter->Assign(types[1],
-                     TensorType(Array<IndexExpr>(oshape.rbegin(), oshape.rend()), data->dtype));
-  } else {
-    reporter->Assign(types[1], TensorType(oshape, data->dtype));
+  reporter->Assign(types[1], TensorType(oshape, data->dtype));
+  return true;
+}
+
+bool ReverseReshapeRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
+                       const TypeReporter& reporter) {
+  // types: [data, result]
+  ICHECK_EQ(types.size(), 2);
+  const auto* data = types[0].as<TensorTypeNode>();
+  if (data == nullptr) {
+    ICHECK(types[0].as<IncompleteTypeNode>())
+        << "reshape: expect input type to be TensorType but get " << types[0];
+    return false;
+  }
+
+  const auto& oshape = InferNewShape(data->shape, attrs, true);
+
+  // Verify that the sum of dimensions in the output shape is the sum of
+  // dimensions in the input shape
+  Array<IndexExpr> data_shape;
+  data_shape.Assign(data->shape.rbegin(), data->shape.rend());
+
+  bool found_dynamic = false;
+  int64_t oshape_sum = 1;
+  for (auto& x : oshape) {
+    // Check if we have a dynamic shape. If we do, we can't verify if the
+    // reshape is valid. Dynamic shapes are marker by using Any, but can also
+    // occur from SizeVar's. In the case of SizeVar, the shape expression can
+    // be an AST. We can't easily check if we have an AST because of a ShapeVar
+    // or some other reason, so our check for dynamic shape is just if we can
+    // convert the shape to in integer or not.
+    if (!x->IsInstance<tvm::Integer::ContainerType>()) {
+      found_dynamic = true;
+      break;
+    }
+    oshape_sum *= Downcast<tvm::Integer>(x)->value;
   }
+  int64_t data_shape_sum = 1;
+  for (auto& x : data_shape) {
+    if (!x->IsInstance<tvm::Integer::ContainerType>()) {
+      found_dynamic = true;
+      break;
+    }
+    data_shape_sum *= Downcast<tvm::Integer>(x)->value;
+  }
+  if (!found_dynamic) {
+    ICHECK_EQ(oshape_sum, data_shape_sum)
+        << "Input tensor shape and reshaped shape are not compatible";
+  }
+
+  reporter->Assign(types[1],
+                   TensorType(Array<IndexExpr>(oshape.rbegin(), oshape.rend()), data->dtype));
   return true;
 }
 
@@ -701,7 +743,7 @@ Array<te::Tensor> ReshapeCompute(const Attrs& attrs, const Array<te::Tensor>& in
   }
 
   if (newshape_has_any) {
-    newshape = infer_newshape(inputs[0]->shape, attrs);
+    newshape = InferNewShape(inputs[0]->shape, attrs, false);
   }
   return {topi::reshape(inputs[0], newshape)};
 }
@@ -709,7 +751,6 @@ Array<te::Tensor> ReshapeCompute(const Attrs& attrs, const Array<te::Tensor>& in
 Expr MakeReshape(Expr data, Array<Integer> newshape) {
   auto attrs = make_object<ReshapeAttrs>();
   attrs->newshape = std::move(newshape);
-  attrs->reverse = false;
   static const Op& op = Op::Get("reshape");
   return Call(op, {data}, Attrs(attrs), {});
 }
@@ -2871,7 +2912,6 @@ RELAY_REGISTER_OP("auto_scheduler_layout_transform")
 Expr MakeReverseReshape(Expr data, Array<Integer> newshape) {
   auto attrs = make_object<ReshapeAttrs>();
   attrs->newshape = std::move(newshape);
-  attrs->reverse = true;
   static const Op& op = Op::Get("contrib_reverse_reshape");
   return Call(op, {data}, Attrs(attrs), {});
 }
@@ -2896,7 +2936,7 @@ example below::
     .set_attrs_type<ReshapeAttrs>()
     .add_argument("data", "Tensor", "The input tensor.")
     .set_support_level(10)
-    .add_type_rel("Reshape", ReshapeRel)
+    .add_type_rel("ReverseReshape", ReverseReshapeRel)
     .set_attr<FTVMCompute>("FTVMCompute", ReshapeCompute)
     .set_attr<TOpPattern>("TOpPattern", kInjective);
 
diff --git a/src/relay/op/tensor/transform.h b/src/relay/op/tensor/transform.h
index 34aaf4689a59..a3770ff9cd8d 100644
--- a/src/relay/op/tensor/transform.h
+++ b/src/relay/op/tensor/transform.h
@@ -195,7 +195,7 @@ static inline Array<Array<Layout>> ConcatenateLayout(const Attrs& attrs,
  * \param attrs The attributes.
  * \return Output shape.
  */
-Array<IndexExpr> infer_newshape(const Array<IndexExpr>& data_shape, const Attrs& attrs);
+Array<IndexExpr> InferNewShape(const Array<IndexExpr>& data_shape, const Attrs& attrs);
 
 }  // namespace relay
 }  // namespace tvm
diff --git a/tests/python/contrib/test_arm_compute_lib/test_reshape.py b/tests/python/contrib/test_arm_compute_lib/test_reshape.py
index 9364c6b1a478..94942727416a 100644
--- a/tests/python/contrib/test_arm_compute_lib/test_reshape.py
+++ b/tests/python/contrib/test_arm_compute_lib/test_reshape.py
@@ -50,7 +50,6 @@ def _get_expected_codegen(input_shape, output_shape, dtype):
             "newshape": [[str(s) for s in output_shape]],
             "shape": [[list(output_shape)]],
             "dtype": [[dtype]],
-            "reverse": [["0"]],
         },
     }
 

From c02c9c528f91f9be3967b7d9ef9f1847f533590b Mon Sep 17 00:00:00 2001
From: Matthew Brookhart <mbrookhart@octoml.ai>
Date: Wed, 30 Dec 2020 23:04:40 -0700
Subject: [PATCH 011/357] Parallelize cumsum in get_valid_counts (#7123)

* Parallelize cumsum in get_valid_counts

* make the scan loop exclusive

* switch to directly using exclusive scan

* perform inner loop of final writes on anchor threads

* fix flaky test

fix lint

* remove final cuda kernel

Co-authored-by: masa <masa@pop-os.localdomain>
---
 python/tvm/topi/cuda/nms.py                  | 134 ++++++++++++++-----
 python/tvm/topi/vision/nms.py                |   2 +-
 tests/python/relay/test_op_level5.py         |   4 +-
 tests/python/topi/python/test_topi_vision.py |  21 +--
 4 files changed, 109 insertions(+), 52 deletions(-)

diff --git a/python/tvm/topi/cuda/nms.py b/python/tvm/topi/cuda/nms.py
index dd9d3f8a1d0e..2dc177a0fae8 100644
--- a/python/tvm/topi/cuda/nms.py
+++ b/python/tvm/topi/cuda/nms.py
@@ -95,7 +95,7 @@ def get_valid_boxes_ir(data, valid_boxes, score_threshold, id_index, score_index
     max_threads = int(tvm.target.Target.current(allow_none=False).max_num_threads)
     with ib.new_scope():
         nthread_tx = max_threads
-        nthread_bx = num_anchors // max_threads + 1
+        nthread_bx = ceil_div(num_anchors, max_threads)
         nthread_by = batch_size
         tx = te.thread_axis("threadIdx.x")
         bx = te.thread_axis("blockIdx.x")
@@ -151,31 +151,103 @@ def get_valid_indices_ir(valid_boxes, valid_count, valid_indices):
     valid_indices = ib.buffer_ptr(valid_indices)
 
     max_threads = int(tvm.target.Target.current(allow_none=False).max_num_threads)
+
+    # Copy boxes to valid_indices
     with ib.new_scope():
         nthread_tx = max_threads
-        nthread_bx = batch_size // max_threads + 1
+        nthread_bx = ceil_div(num_anchors, max_threads)
+        nthread_by = batch_size
         tx = te.thread_axis("threadIdx.x")
         bx = te.thread_axis("blockIdx.x")
+        by = te.thread_axis("blockIdx.y")
         ib.scope_attr(tx, "thread_extent", nthread_tx)
         ib.scope_attr(bx, "thread_extent", nthread_bx)
-        tid = bx * max_threads + tx
-        # TODO(mbrookhart): Parallelize the sum and cumsum here
-        current_index = ib.allocate("int32", (1,), name="current_index", scope="local")
-        with ib.if_scope(tid < batch_size):
-            current_index[0] = 0
-            valid_count[tid] = 0
-            with ib.for_range(0, num_anchors) as j:
-                idx = tid * num_anchors + j
-                valid_count[tid] = valid_count[tid] + valid_boxes[idx]
-                with ib.if_scope(valid_boxes[idx] == 1):
-                    valid_indices[idx] = current_index[0]
-                    current_index[0] = current_index[0] + 1
-                with ib.else_scope():
-                    valid_indices[idx] = -1
+        ib.scope_attr(by, "thread_extent", nthread_by)
+        tid = bx * nthread_tx + tx
+        with ib.if_scope(tid < num_anchors):
+            valid_indices[by, tid] = valid_boxes[by, tid]
+
+    nthread_tx = max_threads
+    nthread_bx = ceil_div(num_anchors, max_threads)
+    nthread_by = batch_size
+
+    ## The following algorithm performs parallel exclusive scan to get
+    ## a tensor that can later be used to select valid indices
+    # Up Sweep of exclusive scan
+    lim = tvm.tir.generic.cast(
+        tvm.tir.ceil(tvm.tir.log2(tvm.tir.generic.cast(num_anchors, "float64"))), "int64"
+    )
+    with ib.for_range(0, lim, dtype="int64") as l2_width:
+        width = 2 << l2_width
+
+        with ib.new_scope():
+            tx = te.thread_axis("threadIdx.x")
+            bx = te.thread_axis("blockIdx.x")
+            ib.scope_attr(tx, "thread_extent", nthread_tx)
+            ib.scope_attr(
+                bx,
+                "thread_extent",
+                tvm.tir.generic.cast(ceil_div(num_anchors, max_threads * width), "int32"),
+            )
+            tid = bx * nthread_tx + tx
+
+            by = te.thread_axis("blockIdx.y")
+            ib.scope_attr(by, "thread_extent", nthread_by)
+            start = ib.allocate("int64", (1,), name="start", scope="local")
+            middle = ib.allocate("int64", (1,), name="middle", scope="local")
+            end = ib.allocate("int64", (1,), name="end", scope="local")
+            start[0] = width * tid
+            with ib.if_scope(start[0] < num_anchors):
+                middle[0] = start[0] + tvm.tir.indexdiv(width, 2)
+                end[0] = tvm.te.min(start[0] + width, num_anchors)
+                with ib.if_scope(middle[0] < num_anchors):
+                    valid_indices[by * num_anchors + end[0] - 1] += valid_indices[
+                        by * num_anchors + middle[0] - 1
+                    ]
+
+    # Down Sweep of exclusive scan
+    with ib.new_scope():
+        bx = te.thread_axis("blockIdx.x")
+        ib.scope_attr(bx, "thread_extent", batch_size)
+        with ib.if_scope(bx < batch_size):
+            valid_count[bx] = valid_indices[(bx + 1) * num_anchors - 1]
+            valid_indices[(bx + 1) * num_anchors - 1] = 0
+
+    with ib.for_range(0, lim, dtype="int64") as l2_width:
+        width = 2 << (lim - l2_width - 1)
+
+        with ib.new_scope():
+            tx = te.thread_axis("threadIdx.x")
+            bx = te.thread_axis("blockIdx.x")
+            ib.scope_attr(tx, "thread_extent", nthread_tx)
+            ib.scope_attr(
+                bx,
+                "thread_extent",
+                tvm.tir.generic.cast(ceil_div(num_anchors, max_threads * width), "int32"),
+            )
+            tid = bx * nthread_tx + tx
+
+            by = te.thread_axis("blockIdx.y")
+            ib.scope_attr(by, "thread_extent", nthread_by)
+            start = ib.allocate("int64", (1,), name="start", scope="local")
+            middle = ib.allocate("int64", (1,), name="middle", scope="local")
+            end = ib.allocate("int64", (1,), name="end", scope="local")
+            tmp = ib.allocate("int32", (1,), name="end", scope="local")
+            start[0] = width * tid
+            with ib.if_scope(tvm.tir.all(start[0] < num_anchors)):
+                middle[0] = start[0] + tvm.tir.indexdiv(width, 2)
+                end[0] = tvm.tir.min(start[0] + width, num_anchors)
+                with ib.if_scope(middle[0] < num_anchors):
+                    tmp[0] = valid_indices[by * num_anchors + middle[0] - 1]
+                    valid_indices[by * num_anchors + middle[0] - 1] = valid_indices[
+                        by * num_anchors + end[0] - 1
+                    ]
+                    valid_indices[by * num_anchors + end[0] - 1] += tmp[0]
+
     return ib.get()
 
 
-def get_valid_counts_ir(data, valid_indices, out, out_indices):
+def get_valid_counts_ir(data, valid_indices, valid_boxes, out, out_indices):
     """Low level IR to get valid count of bounding boxes
     given a score threshold. Also prepares to move valid boxes to the
     top of input data.
@@ -203,8 +275,9 @@ def get_valid_counts_ir(data, valid_indices, out, out_indices):
     ib = tvm.tir.ir_builder.create()
 
     data = ib.buffer_ptr(data)
-
     valid_indices = ib.buffer_ptr(valid_indices)
+    valid_boxes = ib.buffer_ptr(valid_boxes)
+
     out = ib.buffer_ptr(out)
     out_indices = ib.buffer_ptr(out_indices)
     one = tvm.tir.const(1, dtype=out.dtype)
@@ -213,41 +286,36 @@ def get_valid_counts_ir(data, valid_indices, out, out_indices):
     nthread_tx = max_threads
     nthread_bx = num_anchors // max_threads + 1
     nthread_by = batch_size
-    nthread_bz = elem_length
     with ib.new_scope():
         tx = te.thread_axis("threadIdx.x")
         bx = te.thread_axis("blockIdx.x")
         by = te.thread_axis("blockIdx.y")
-        bz = te.thread_axis("blockIdx.z")
         ib.scope_attr(tx, "thread_extent", nthread_tx)
         ib.scope_attr(bx, "thread_extent", nthread_bx)
         ib.scope_attr(by, "thread_extent", nthread_by)
-        ib.scope_attr(bz, "thread_extent", nthread_bz)
         tid = bx * max_threads + tx
         with ib.if_scope(tid < num_anchors):
             i = by
             j = tid
-            k = bz
-            out[(i * num_anchors + j) * elem_length + k] = -one
+            with ib.for_range(0, elem_length) as k:
+                out[(i * num_anchors + j) * elem_length + k] = -one
             out_indices[i * num_anchors + j] = -1
     with ib.new_scope():
         tx = te.thread_axis("threadIdx.x")
         bx = te.thread_axis("blockIdx.x")
         by = te.thread_axis("blockIdx.y")
-        bz = te.thread_axis("blockIdx.z")
         ib.scope_attr(tx, "thread_extent", nthread_tx)
         ib.scope_attr(bx, "thread_extent", nthread_bx)
         ib.scope_attr(by, "thread_extent", nthread_by)
-        ib.scope_attr(bz, "thread_extent", nthread_bz)
         tid = bx * max_threads + tx
         with ib.if_scope(tid < num_anchors):
             i = by
             j = tid
-            k = bz
-            with ib.if_scope(valid_indices[i, tid] >= 0):
-                out[(i * num_anchors + valid_indices[i, tid]) * elem_length + k] = data[
-                    (i * num_anchors + j) * elem_length + k
-                ]
+            with ib.if_scope(valid_boxes[i, tid] > 0):
+                with ib.for_range(0, elem_length) as k:
+                    out[(i * num_anchors + valid_indices[i, tid]) * elem_length + k] = data[
+                        (i * num_anchors + j) * elem_length + k
+                    ]
                 out_indices[i * num_anchors + valid_indices[i, tid]] = j
     return ib.get()
 
@@ -321,10 +389,10 @@ def get_valid_counts(data, score_threshold=0, id_index=0, score_index=1):
 
     out, out_indices = te.extern(
         [data.shape, (batch_size, num_anchors)],
-        [data, valid_indices],
-        lambda ins, outs: get_valid_counts_ir(ins[0], ins[1], outs[0], outs[1]),
+        [data, valid_indices, valid_boxes],
+        lambda ins, outs: get_valid_counts_ir(ins[0], ins[1], ins[2], outs[0], outs[1]),
         dtype=["int32", data.dtype],
-        in_buffers=[data_buf, valid_indices_buf],
+        in_buffers=[data_buf, valid_indices_buf, valid_boxes_buf],
         out_buffers=[out_buf, out_indices_buf],
         name="get_valid_counts",
         tag="get_valid_counts_gpu",
diff --git a/python/tvm/topi/vision/nms.py b/python/tvm/topi/vision/nms.py
index 035d19f25ec7..cbf136a5552c 100644
--- a/python/tvm/topi/vision/nms.py
+++ b/python/tvm/topi/vision/nms.py
@@ -213,7 +213,7 @@ def get_valid_counts(data, score_threshold=0, id_index=0, score_index=1):
     out_indices: tvm.te.Tensor or numpy NDArray
         Related index in input data.
     """
-    if isinstance(score_threshold, float):
+    if isinstance(score_threshold, (float, int)):
         score_threshold = tvm.tir.const(score_threshold, dtype=data.dtype)
     id_index_const = tvm.tir.const(id_index, "int32")
     score_index_const = tvm.tir.const(score_index, "int32")
diff --git a/tests/python/relay/test_op_level5.py b/tests/python/relay/test_op_level5.py
index 1ce8a182f034..cdf3b240507b 100644
--- a/tests/python/relay/test_op_level5.py
+++ b/tests/python/relay/test_op_level5.py
@@ -313,10 +313,8 @@ def verify_get_valid_counts(dshape, score_threshold, id_index, score_index):
         for target, ctx in tvm.testing.enabled_targets():
             intrp = relay.create_executor("debug", ctx=ctx, target=target)
             out = intrp.evaluate(func)(np_data)
+
             tvm.testing.assert_allclose(out[0].asnumpy(), np_out1, rtol=1e-3, atol=1e-04)
-            # get_valid_count for opencl doesn't do data rearrangement
-            if target in ["opencl"]:
-                return
             tvm.testing.assert_allclose(out[1].asnumpy(), np_out2, rtol=1e-3, atol=1e-04)
             tvm.testing.assert_allclose(out[2].asnumpy(), np_out3, rtol=1e-3, atol=1e-04)
 
diff --git a/tests/python/topi/python/test_topi_vision.py b/tests/python/topi/python/test_topi_vision.py
index 778843be37de..697ef8a24f67 100644
--- a/tests/python/topi/python/test_topi_vision.py
+++ b/tests/python/topi/python/test_topi_vision.py
@@ -105,27 +105,18 @@ def check_device(device):
         tvm_out1 = tvm.nd.array(np.zeros(np_out1.shape, dtype="int32"), ctx)
         tvm_out2 = tvm.nd.array(np.zeros(np_out2.shape, dtype=dtype), ctx)
         tvm_out3 = tvm.nd.array(np.zeros(np_out3.shape, dtype="int32"), ctx)
-        if device == "llvm":
-            f = tvm.build(s, [data, outs[0], outs[1], outs[2]], device)
-            f(tvm_input_data, tvm_out1, tvm_out2, tvm_out3)
-            tvm.testing.assert_allclose(tvm_out1.asnumpy(), np_out1, rtol=1e-3)
-            tvm.testing.assert_allclose(tvm_out2.asnumpy(), np_out2, rtol=1e-3)
-            tvm.testing.assert_allclose(tvm_out3.asnumpy(), np_out3, rtol=1e-3)
-        else:
-            f = tvm.build(s, [data, outs[0], outs[1]], device)
-            f(tvm_input_data, tvm_out1, tvm_out2)
-            tvm.testing.assert_allclose(tvm_out1.asnumpy(), np_out1, rtol=1e-3)
-            tvm.testing.assert_allclose(tvm_out2.asnumpy(), np_out2, rtol=1e-3)
+
+        f = tvm.build(s, [data, outs[0], outs[1], outs[2]], device)
+        f(tvm_input_data, tvm_out1, tvm_out2, tvm_out3)
+        tvm.testing.assert_allclose(tvm_out1.asnumpy(), np_out1, rtol=1e-3)
+        tvm.testing.assert_allclose(tvm_out2.asnumpy(), np_out2, rtol=1e-3)
+        tvm.testing.assert_allclose(tvm_out3.asnumpy(), np_out3, rtol=1e-3)
 
     for device in ["llvm", "cuda", "opencl"]:
         check_device(device)
 
 
 @tvm.testing.uses_gpu
-@pytest.mark.skip(
-    "Skip this test as it is intermittent."
-    "See https://github.com/apache/tvm/pull/4901#issuecomment-595040094"
-)
 def test_get_valid_counts():
     verify_get_valid_counts((1, 1000, 5), 0.5, -1, 0)
     verify_get_valid_counts((1, 2500, 6), 0, 0, 1)

From 6258fae6d1e9ab77b8065d4ffb81a5033665e0cc Mon Sep 17 00:00:00 2001
From: Leyuan Wang <laurawly@gmail.com>
Date: Fri, 1 Jan 2021 16:07:41 -0800
Subject: [PATCH 012/357] [Fix] Tensor core type issue for dense (#7187)

* fix tc type issue for dense

* fix lint

* rm float 32

Co-authored-by: Leyuan Wang <ziyu.guo@bytedance.com>
---
 python/tvm/relay/op/strategy/cuda.py | 23 ++++++++++++++++++++---
 1 file changed, 20 insertions(+), 3 deletions(-)

diff --git a/python/tvm/relay/op/strategy/cuda.py b/python/tvm/relay/op/strategy/cuda.py
index 9d8420c69610..37946c01cb46 100644
--- a/python/tvm/relay/op/strategy/cuda.py
+++ b/python/tvm/relay/op/strategy/cuda.py
@@ -678,9 +678,26 @@ def dense_strategy_cuda(attrs, inputs, out_type, target):
         if target.kind.name == "cuda":
             if nvcc.have_tensorcore(target=target):
                 if (
-                    (i % 16 == 0 and b % 16 == 0 and o % 16 == 0)
-                    or (i % 16 == 0 and b % 8 == 0 and o % 32 == 0)
-                    or (i % 16 == 0 and b % 32 == 0 and o % 8 == 0)
+                    (
+                        data.dtype in ["float16", "int8", "uint8"]
+                        and (
+                            (i % 16 == 0 and b % 16 == 0 and o % 16 == 0)
+                            or (i % 16 == 0 and b % 8 == 0 and o % 32 == 0)
+                            or (i % 16 == 0 and b % 32 == 0 and o % 8 == 0)
+                        )
+                    )
+                    or (
+                        data.dtype in ["int4", "uint4"]
+                        and i % 32 == 0
+                        and b % 8 == 0
+                        and o % 8 == 0
+                    )
+                    or (
+                        data.dtype in ["int1", "uint1"]
+                        and i % 128 == 0
+                        and b % 8 == 0
+                        and o % 8 == 0
+                    )
                 ):
                     strategy.add_implementation(
                         wrap_compute_dense(topi.cuda.dense_tensorcore),

From 76a98252ec94bcfa373097952afc420a701c0cd8 Mon Sep 17 00:00:00 2001
From: insop <insop.song@gmail.com>
Date: Sun, 3 Jan 2021 02:46:22 -0800
Subject: [PATCH 013/357] Remove seemingly invalid SoftPlus (#7189)

- `Softplus` is added in 12/10/2020 from this https://github.com/apache/tvm/pull/7089
- However, I see that there were `SoftPlus` (not the P is in capital) was already in.
According to [Onnx spec](https://github.com/onnx/onnx/blob/master/docs/Operators.md), it is `Softplus` not `SoftPlus`.
---
 python/tvm/relay/frontend/onnx.py          | 9 ---------
 tests/python/frontend/onnx/test_forward.py | 1 -
 2 files changed, 10 deletions(-)

diff --git a/python/tvm/relay/frontend/onnx.py b/python/tvm/relay/frontend/onnx.py
index 6122c81d321a..1c544d309717 100644
--- a/python/tvm/relay/frontend/onnx.py
+++ b/python/tvm/relay/frontend/onnx.py
@@ -932,14 +932,6 @@ def _impl_v1(cls, inputs, attr, params):
         return _op.tanh(_expr.const(beta) * inputs[0]) * _expr.const(alpha)
 
 
-class SoftPlus(OnnxOpConverter):
-    """Operator converter for SoftPlus."""
-
-    @classmethod
-    def _impl_v1(cls, inputs, attr, params):
-        return _op.log(_op.exp(inputs[0]) + _expr.const(1.0))
-
-
 class Softsign(OnnxOpConverter):
     """Operator converter for Softsign."""
 
@@ -2661,7 +2653,6 @@ def _get_convert_map(opset):
         "OneHot": OneHot.get_converter(opset),
         # 'Hardmax'
         "Softsign": Softsign.get_converter(opset),
-        "SoftPlus": SoftPlus.get_converter(opset),
         "Gemm": Gemm.get_converter(opset),
         "MatMul": MatMul.get_converter(opset),
         "Mod": Mod.get_converter(opset),
diff --git a/tests/python/frontend/onnx/test_forward.py b/tests/python/frontend/onnx/test_forward.py
index 33dd048896b6..3d95a9a83ee3 100644
--- a/tests/python/frontend/onnx/test_forward.py
+++ b/tests/python/frontend/onnx/test_forward.py
@@ -1983,7 +1983,6 @@ def verify_single_ops(op, x, out_np, rtol=1e-5, atol=1e-5):
     verify_single_ops("Tanh", x, np.tanh(x))
     verify_single_ops("Sigmoid", x, 1 / (1 + np.exp(-x)))
     verify_single_ops("Softsign", x, x / (1 + np.abs(x)))
-    verify_single_ops("SoftPlus", x, np.log(1 + np.exp(x)))
 
 
 @tvm.testing.uses_gpu

From 86a8504a7ccb956591ffd1a529f625df7d20b520 Mon Sep 17 00:00:00 2001
From: insop <insop.song@gmail.com>
Date: Sun, 3 Jan 2021 02:46:55 -0800
Subject: [PATCH 014/357] [Frontend][MXNet] add _npi_subtract_scalar (#7191)

* [Frontend][MXNet] add _npi_subtract_scalar

- add mxnet numpy operator, subtract
- https://github.com/apache/tvm/issues/7186
- https://mxnet.apache.org/versions/master/api/python/docs/api/np/generated/mxnet.np.subtract.html

* Fix python style using black
---
 3rdparty/vta-hw                             |  2 +-
 python/tvm/relay/frontend/mxnet.py          |  2 ++
 tests/python/frontend/mxnet/test_forward.py | 20 ++++++++++++++++----
 3 files changed, 19 insertions(+), 5 deletions(-)

diff --git a/3rdparty/vta-hw b/3rdparty/vta-hw
index 57db5a718c74..87ce9acfae55 160000
--- a/3rdparty/vta-hw
+++ b/3rdparty/vta-hw
@@ -1 +1 @@
-Subproject commit 57db5a718c74a788c98120ebbe1230797be698c8
+Subproject commit 87ce9acfae550d1a487746e9d06c2e250076e54c
diff --git a/python/tvm/relay/frontend/mxnet.py b/python/tvm/relay/frontend/mxnet.py
index f2330c72e1f4..1085e904c386 100644
--- a/python/tvm/relay/frontend/mxnet.py
+++ b/python/tvm/relay/frontend/mxnet.py
@@ -2693,6 +2693,8 @@ def _mx_npi_where_rscalar(inputs, attrs):
     "_npi_multiply_scalar": _binop_scalar(_op.multiply),
     "_npi_add": _rename(_op.add),
     "_npi_add_scalar": _binop_scalar(_op.add),
+    "_npi_subtract": _rename(_op.subtract),
+    "_npi_subtract_scalar": _binop_scalar(_op.subtract),
     "_npi_where_rscalar": _mx_npi_where_rscalar,
     "_npi_less": _rename(_op.less),
     "_npi_less_equal": _mx_compare(_op.less_equal, _rename),
diff --git a/tests/python/frontend/mxnet/test_forward.py b/tests/python/frontend/mxnet/test_forward.py
index f076a27755ad..d3be8c0506ba 100644
--- a/tests/python/frontend/mxnet/test_forward.py
+++ b/tests/python/frontend/mxnet/test_forward.py
@@ -2062,8 +2062,14 @@ def test_forward_npx_reshape(data_shape, out_shape, dtype, target, reverse, ctx,
 @tvm.testing.parametrize_targets
 @pytest.mark.parametrize("kind", ["graph", "vm", "debug"])
 def test_forward_npi_binary(data_shape, dtype, target, ctx, kind):
-    ref_ops = [mx.np.power, mx.np.multiply, mx.np.add, mx.np.less]
-    mx_ops = [mx.sym.np.power, mx.sym.np.multiply, mx.sym.np.add, mx.sym.np.less]
+    ref_ops = [mx.np.power, mx.np.multiply, mx.np.add, mx.np.subtract, mx.np.less]
+    mx_ops = [
+        mx.sym.np.power,
+        mx.sym.np.multiply,
+        mx.sym.np.add,
+        mx.sym.np.subtract,
+        mx.sym.np.less,
+    ]
     for i in range(len(ref_ops)):
         ref_op = ref_ops[i]
         mx_op = mx_ops[i]
@@ -2092,8 +2098,14 @@ def test_forward_npi_binary(data_shape, dtype, target, ctx, kind):
 @pytest.mark.parametrize("scalar", [1.0, 2.0, 3.0, 4.0])
 @pytest.mark.parametrize("kind", ["graph", "vm", "debug"])
 def test_forward_npi_binary_scalar(data_shape, dtype, scalar, target, ctx, kind):
-    ref_ops = [mx.np.power, mx.np.multiply, mx.np.add, mx.np.true_divide]
-    mx_ops = [mx.sym.np.power, mx.sym.np.multiply, mx.sym.np.add, mx.sym.np.true_divide]
+    ref_ops = [mx.np.power, mx.np.multiply, mx.np.add, mx.np.subtract, mx.np.true_divide]
+    mx_ops = [
+        mx.sym.np.power,
+        mx.sym.np.multiply,
+        mx.sym.np.add,
+        mx.sym.np.subtract,
+        mx.sym.np.true_divide,
+    ]
     for i in range(len(ref_ops)):
         ref_op = ref_ops[i]
         mx_op = mx_ops[i]

From 25f0252ba20824e6a4ed55bed490bc0f0bdcb5d0 Mon Sep 17 00:00:00 2001
From: cxcxcxcx <cxcxcxcx@gmail.com>
Date: Mon, 4 Jan 2021 00:34:28 -0800
Subject: [PATCH 015/357] Makes sure g_last_error is null terminated. (#7190)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This addresses GCC 10 error:

```
"src/runtime/crt/common/crt_runtime_api.c"
include/tvm/runtime/c_runtime_api.h: 在函数‘TVMAPISetLastError’中:
src/runtime/crt/common/crt_runtime_api.c:42:3: 错误：‘strncpy’ specified
bound 1024 equals destination size [-Werror=stringop-truncation]
   42 |   strncpy(g_last_error, msg, sizeof(g_last_error));
      |   ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
cc1：所有的警告都被当作是错误
```
---
 src/runtime/crt/common/crt_runtime_api.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/runtime/crt/common/crt_runtime_api.c b/src/runtime/crt/common/crt_runtime_api.c
index fcfb51f9ef4c..960f844652a9 100644
--- a/src/runtime/crt/common/crt_runtime_api.c
+++ b/src/runtime/crt/common/crt_runtime_api.c
@@ -38,7 +38,10 @@
 
 static char g_last_error[1024];
 
-void TVMAPISetLastError(const char* msg) { strncpy(g_last_error, msg, sizeof(g_last_error)); }
+void TVMAPISetLastError(const char* msg) {
+  strncpy(g_last_error, msg, sizeof(g_last_error) - 1);
+  g_last_error[sizeof(g_last_error) - 1] = 0;
+}
 
 __attribute__((format(printf, 1, 2))) int TVMAPIErrorf(const char* msg, ...) {
   va_list args;

From eb64e259546574372c8bb88eee3a4b83130b8b7d Mon Sep 17 00:00:00 2001
From: Ritwik Das <ritwikdas54@gmail.com>
Date: Mon, 4 Jan 2021 01:22:43 -0800
Subject: [PATCH 016/357] Fix ICHECK_NOTNULL in logging.g (#7193)

---
 include/tvm/support/logging.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/include/tvm/support/logging.h b/include/tvm/support/logging.h
index d98363ea1c1b..ced1902a1bd1 100644
--- a/include/tvm/support/logging.h
+++ b/include/tvm/support/logging.h
@@ -139,10 +139,10 @@ constexpr const char* kTVM_INTERNAL_ERROR_MESSAGE =
 #define ICHECK_GE(x, y) ICHECK_BINARY_OP(_GE, >=, x, y)
 #define ICHECK_EQ(x, y) ICHECK_BINARY_OP(_EQ, ==, x, y)
 #define ICHECK_NE(x, y) ICHECK_BINARY_OP(_NE, !=, x, y)
-#define ICHECK_NOTNULL(x)                                                                        \
-  ((x) == nullptr ? dmlc::LogMessageFatal(__FILE__, __LINE__).stream()                           \
-                        << tvm::kTVM_INTERNAL_ERROR_MESSAGE << __INDENT << "Check not null: " #x \
-                        << ' ',                                                                  \
+#define ICHECK_NOTNULL(x)                                                    \
+  ((x) == nullptr ? dmlc::LogMessageFatal(__FILE__, __LINE__).stream()       \
+                        << tvm::kTVM_INTERNAL_ERROR_MESSAGE << ICHECK_INDENT \
+                        << "Check not null: " #x << ' ',                     \
    (x) : (x))  // NOLINT(*)
 
 /*! \brief The diagnostic level, controls the printing of the message. */

From 705323592b49e8971c70e46d604e85635438f16d Mon Sep 17 00:00:00 2001
From: Gaetano <mendola@gmail.com>
Date: Mon, 4 Jan 2021 15:39:54 +0100
Subject: [PATCH 017/357] Fixed temporary lock_guard instances. (#7199)

---
 src/target/generic_func.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/target/generic_func.cc b/src/target/generic_func.cc
index 16e5a5f9cdc6..5dbceec32ed7 100644
--- a/src/target/generic_func.cc
+++ b/src/target/generic_func.cc
@@ -51,7 +51,7 @@ struct GenericFunc::Manager {
 
 GenericFunc GenericFunc::Get(const std::string& name) {
   Manager* m = Manager::Global();
-  std::lock_guard<std::mutex>(m->mutex);
+  std::lock_guard<std::mutex> lock(m->mutex);
   auto it = m->fmap.find(name);
   if (it == m->fmap.end()) {
     auto f = make_object<GenericFuncNode>();
@@ -66,7 +66,7 @@ GenericFunc GenericFunc::Get(const std::string& name) {
 
 void GenericFunc::RegisterGenericFunc(GenericFunc func, const std::string& name) {
   Manager* m = Manager::Global();
-  std::lock_guard<std::mutex>(m->mutex);
+  std::lock_guard<std::mutex> lock(m->mutex);
   auto it = m->fmap.find(name);
   ICHECK(it == m->fmap.end()) << "GenericFunc already registered " << name;
   func->name_ = name;

From 361f508769a523072c0932bd40212d6a65d9789f Mon Sep 17 00:00:00 2001
From: masahi <masahi129@gmail.com>
Date: Tue, 5 Jan 2021 06:04:55 +0900
Subject: [PATCH 018/357] [CUBLAS, CUDNN] Support dynamic batch size (#7194)

* support cudnn and cublas on dynamic batch size

* added test for cublas

* add comment on algo choice
---
 python/tvm/contrib/cudnn.py    | 81 +++++++++++++++++++++-------------
 python/tvm/topi/cuda/conv2d.py | 24 +++++-----
 python/tvm/topi/cuda/conv3d.py | 26 ++++++-----
 python/tvm/topi/cuda/dense.py  |  3 +-
 tests/python/relay/test_any.py | 50 +++++++++++++++++----
 5 files changed, 121 insertions(+), 63 deletions(-)

diff --git a/python/tvm/contrib/cudnn.py b/python/tvm/contrib/cudnn.py
index 6dc04c9f58fd..0e22e0c09274 100644
--- a/python/tvm/contrib/cudnn.py
+++ b/python/tvm/contrib/cudnn.py
@@ -342,36 +342,57 @@ def conv_forward(x, w, pad, stride, dilation, conv_mode, tensor_format, algo, co
     conv_dtype = x.dtype if conv_dtype is None else conv_dtype
     pad, stride, dilation, _, _ = _prepare_global_func_params(dims - 2, pad, stride, dilation)
 
-    oshape = conv_output_shape(
-        tensor_format,
-        pad,
-        stride,
-        dilation,
-        list(x.shape),
-        list(w.shape),
-        x.dtype,
-        conv_dtype,
-        groups,
-    )
-    if algo == -1:
-        # For now if we try to call `cudnnFindConvolutionForwardAlgorithm` when
-        # using INT8 data type, CuDNN will crash down.
-        # On the other hand, CuDNN only support IMPLICIT_PRECOMP_GEMM at NHWC format
-        if tensor_format == 1 and conv_dtype == "int32":
-            algo = 1
-        else:
-            algo = conv_find_algo(
-                tensor_format,
-                pad,
-                stride,
-                dilation,
-                list(x.shape),
-                list(w.shape),
-                oshape,
-                x.dtype,
-                conv_dtype,
-                groups,
-            )
+    x_shape = list(x.shape)
+
+    if isinstance(x.shape[0], tvm.tir.expr.IntImm):
+        oshape = conv_output_shape(
+            tensor_format,
+            pad,
+            stride,
+            dilation,
+            x_shape,
+            list(w.shape),
+            x.dtype,
+            conv_dtype,
+            groups,
+        )
+        if algo == -1:
+            # For now if we try to call `cudnnFindConvolutionForwardAlgorithm` when
+            # using INT8 data type, CuDNN will crash down.
+            # On the other hand, CuDNN only support IMPLICIT_PRECOMP_GEMM at NHWC format
+            if tensor_format == 1 and conv_dtype == "int32":
+                algo = 1
+            else:
+                algo = conv_find_algo(
+                    tensor_format,
+                    pad,
+                    stride,
+                    dilation,
+                    list(x.shape),
+                    list(w.shape),
+                    oshape,
+                    x.dtype,
+                    conv_dtype,
+                    groups,
+                )
+    else:
+        # The dynamic batch size case, pretend this is a single batch
+        x_shape[0] = 1
+        oshape = conv_output_shape(
+            tensor_format,
+            pad,
+            stride,
+            dilation,
+            x_shape,
+            list(w.shape),
+            x.dtype,
+            conv_dtype,
+            groups,
+        )
+        oshape[0] = x.shape[0]
+        # This picks CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM
+        # It seems this is the fastest among algorithms that are always applicable
+        algo = 1
 
     if dims == 4:
         return te.extern(
diff --git a/python/tvm/topi/cuda/conv2d.py b/python/tvm/topi/cuda/conv2d.py
index ce9cebc3c963..63c7c9308284 100644
--- a/python/tvm/topi/cuda/conv2d.py
+++ b/python/tvm/topi/cuda/conv2d.py
@@ -96,17 +96,19 @@ def conv2d_cudnn(
     pt, pl, pb, pr = get_pad_tuple(padding, (KH, KW))
     OH = (H + pt + pb - KH) // stride_h + 1
     OW = (W + pl + pr - KW) // stride_w + 1
-    cfg.add_flop(
-        groups
-        * 2
-        * N
-        * OH
-        * OW
-        * CO
-        * CI
-        * ((KH - 1) * dilation_h + 1)
-        * ((KW - 1) * dilation_w + 1)
-    )
+
+    if isinstance(N, int):
+        cfg.add_flop(
+            groups
+            * 2
+            * N
+            * OH
+            * OW
+            * CO
+            * CI
+            * ((KH - 1) * dilation_h + 1)
+            * ((KW - 1) * dilation_w + 1)
+        )
 
     if data.dtype == "int8" or kernel.dtype == "int8":
         if layout == "NCHW":
diff --git a/python/tvm/topi/cuda/conv3d.py b/python/tvm/topi/cuda/conv3d.py
index e5a3a53a89ff..530df31ed3dc 100644
--- a/python/tvm/topi/cuda/conv3d.py
+++ b/python/tvm/topi/cuda/conv3d.py
@@ -206,18 +206,20 @@ def conv3d_cudnn(
     OD = (D + 2 * pad_d - KD) // stride_d + 1
     OH = (H + 2 * pad_h - KH) // stride_h + 1
     OW = (W + 2 * pad_w - KW) // stride_w + 1
-    cfg.add_flop(
-        2
-        * N
-        * OD
-        * OH
-        * OW
-        * CO
-        * CI
-        * ((KD - 1) * dilation_d + 1)
-        * ((KH - 1) * dilation_h + 1)
-        * ((KW - 1) * dilation_w + 1)
-    )
+
+    if isinstance(N, int):
+        cfg.add_flop(
+            2
+            * N
+            * OD
+            * OH
+            * OW
+            * CO
+            * CI
+            * ((KD - 1) * dilation_d + 1)
+            * ((KH - 1) * dilation_h + 1)
+            * ((KW - 1) * dilation_w + 1)
+        )
 
     return cudnn.conv_forward(
         data,
diff --git a/python/tvm/topi/cuda/dense.py b/python/tvm/topi/cuda/dense.py
index 47b9db4f390a..85b9b19bdb02 100644
--- a/python/tvm/topi/cuda/dense.py
+++ b/python/tvm/topi/cuda/dense.py
@@ -42,7 +42,8 @@ def dense_cublas(cfg, data, weight, bias=None, out_dtype=None):
     batch, in_dim = data.shape
     out_dim, _ = weight.shape
     matmul = cublas.matmul(data, weight, False, True)
-    cfg.add_flop(batch * in_dim * out_dim * 2)
+    if isinstance(batch, int):
+        cfg.add_flop(batch * in_dim * out_dim * 2)
     if bias is not None:
         matmul = te.compute(
             (batch, out_dim), lambda i, j: matmul[i, j] + bias[j], tag=tag.BROADCAST
diff --git a/tests/python/relay/test_any.py b/tests/python/relay/test_any.py
index e6812aa3bbfa..cb3b5d42e553 100644
--- a/tests/python/relay/test_any.py
+++ b/tests/python/relay/test_any.py
@@ -72,12 +72,11 @@ def check_result(
                         str(e),
                         str(r),
                     )
-                    return
-
-                if flatten:
-                    r = r.flatten()
-                    e = e.flatten()
-                tvm.testing.assert_allclose(r, e, atol=2e-6)
+                else:
+                    if flatten:
+                        r = r.flatten()
+                        e = e.flatten()
+                    tvm.testing.assert_allclose(r, e, atol=2e-6)
 
 
 def verify_any_broadcast(x_shape, y_shape, x_np_shape, y_np_shape, op, np_op):
@@ -454,6 +453,7 @@ def verify_any_conv2d(
     dilation,
     static_data_shape,
     ref_out_shape,
+    use_cudnn=False,
 ):
     mod = tvm.IRModule()
     dtype = "float32"
@@ -463,7 +463,12 @@ def verify_any_conv2d(
     mod["main"] = relay.Function([data, kernel], y)
     data_np = np.random.uniform(size=static_data_shape).astype(dtype)
     kernel_np = np.random.uniform(size=kernel_shape).astype(dtype)
-    check_result([data_np, kernel_np], mod, ref_out_shape, assert_shape=True)
+
+    targets = None
+    if use_cudnn and tvm.get_global_func("tvm.contrib.cudnn.conv.output_shape", True):
+        targets = [("cuda -libs=cudnn", tvm.gpu(0))]
+
+    check_result([data_np, kernel_np], mod, ref_out_shape, assert_shape=True, targets=targets)
 
 
 # TODO(@kevinthesun): Support dynamic input height and width.
@@ -487,6 +492,16 @@ def test_any_conv2d():
         (2, 64, 224, 224),
         (2, 64, 222, 222),
     )
+    verify_any_conv2d(
+        (relay.Any(), 64, 224, 224),
+        (64, 64, 3, 3),
+        (1, 1),
+        (1, 1),
+        (1, 1),
+        (1, 64, 224, 224),
+        (1, 64, 224, 224),
+        use_cudnn=True,
+    )
 
 
 def verify_any_conv2d_NCHWc(
@@ -724,7 +739,13 @@ def test_any_batch_flatten():
 
 
 def verify_any_dense(
-    data_shape, weight_shape, units, static_data_shape, static_weight_shape, ref_out_shape
+    data_shape,
+    weight_shape,
+    units,
+    static_data_shape,
+    static_weight_shape,
+    ref_out_shape,
+    use_cublas=False,
 ):
     mod = tvm.IRModule()
     dtype = "float32"
@@ -734,7 +755,12 @@ def verify_any_dense(
     mod["main"] = relay.Function([data, weight], y)
     data_np = np.random.uniform(size=static_data_shape).astype(dtype)
     weight_np = np.random.uniform(size=static_weight_shape).astype(dtype)
-    check_result([data_np, weight_np], mod, ref_out_shape, assert_shape=True)
+
+    targets = None
+    if use_cublas and tvm.get_global_func("tvm.contrib.cublas.matmul", True):
+        targets = [("cuda -libs=cublas", tvm.gpu(0))]
+
+    check_result([data_np, weight_np], mod, ref_out_shape, assert_shape=True, targets=targets)
 
 
 # TODO(tvm-team) Fix dense schedule
@@ -744,6 +770,12 @@ def test_any_dense():
     verify_any_dense(any_dims(2), (50, relay.Any()), 50, (4, 40), (50, 40), (4, 50))
 
 
+@tvm.testing.uses_gpu
+def test_any_dense_dynamic_batch():
+    verify_any_dense((relay.Any(), 40), (50, 40), 50, (4, 40), (50, 40), (4, 50))
+    verify_any_dense((relay.Any(), 40), (50, 40), 50, (4, 40), (50, 40), (4, 50), use_cublas=True)
+
+
 @tvm.testing.uses_gpu
 def verify_any_pad(data_shape, pad_width, static_data_shape):
     mod = tvm.IRModule()

From 7163b5c02fd25326e7c68ccc3b41d30f4a912952 Mon Sep 17 00:00:00 2001
From: Trevor Morris <trevmorr@amazon.com>
Date: Mon, 4 Jan 2021 14:01:53 -0800
Subject: [PATCH 019/357] ReshapeAttrs no longer has reverse (#7205)

---
 src/runtime/contrib/tensorrt/tensorrt_ops.cc | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/runtime/contrib/tensorrt/tensorrt_ops.cc b/src/runtime/contrib/tensorrt/tensorrt_ops.cc
index 1e6867b83cff..69bb1dccfb62 100644
--- a/src/runtime/contrib/tensorrt/tensorrt_ops.cc
+++ b/src/runtime/contrib/tensorrt/tensorrt_ops.cc
@@ -921,7 +921,6 @@ class ReshapeOpConverter : public TensorRTOpConverter {
 
   void Convert(TensorRTOpConverterParams* params) const {
     auto input = params->inputs.at(0).tensor;
-    ICHECK_EQ(std::stoi(params->node.GetAttr<std::vector<std::string>>("reverse")[0]), false);
     auto str_newshape = params->node.GetAttr<std::vector<std::string>>("newshape");
     std::vector<int> new_shape;
     const int start_index = TRT_HAS_IMPLICIT_BATCH(params) ? 1 : 0;

From d05275298d9e630af6d8ff958753fd010759935c Mon Sep 17 00:00:00 2001
From: Cody Yu <comaniac0422@gmail.com>
Date: Mon, 4 Jan 2021 17:17:53 -0800
Subject: [PATCH 020/357] [ConvertLayout] slice_like support (#7184)

---
 src/relay/op/tensor/transform.cc              | 41 +++++++++++
 .../relay/test_pass_convert_op_layout.py      | 70 +++++++++++++++++++
 2 files changed, 111 insertions(+)

diff --git a/src/relay/op/tensor/transform.cc b/src/relay/op/tensor/transform.cc
index 19ca6129ecbe..1ff428ce333c 100644
--- a/src/relay/op/tensor/transform.cc
+++ b/src/relay/op/tensor/transform.cc
@@ -2752,6 +2752,46 @@ Expr MakeSliceLike(Expr data, Expr shape_like, Array<Integer> axes) {
   return Call(op, {data, shape_like}, Attrs(attrs), {});
 }
 
+Array<Array<Layout>> SliceLikeInferCorrectLayout(const Attrs& attrs,
+                                                 const Array<Layout>& new_in_layouts,
+                                                 const Array<Layout>& old_in_layouts,
+                                                 const Array<tvm::relay::Type>& old_in_types) {
+  Array<Integer> new_axes;
+  if (old_in_layouts.defined() && new_in_layouts.defined()) {
+    ICHECK_EQ(new_in_layouts.size(), 2);
+    ICHECK_EQ(new_in_layouts[0]->name, new_in_layouts[1]->name);
+    ICHECK_EQ(old_in_layouts.size(), 2);
+    ICHECK_EQ(old_in_layouts[0]->name, old_in_layouts[1]->name);
+
+    auto old_layout = old_in_layouts[0];
+    auto new_layout = new_in_layouts[0];
+
+    // Discard "const" qualifier.
+    auto* params = const_cast<SliceLikeAttrs*>(attrs.as<SliceLikeAttrs>());
+    ICHECK(params != nullptr);
+
+    for (auto axis : params->axes) {
+      auto new_axis = new_layout.IndexOf(old_layout[axis->value]);
+      // Cannot find the target axis in the new layout.
+      if (new_axis == -1) {
+        new_axes.clear();
+        break;
+      }
+      new_axes.push_back(new_axis);
+    }
+    if (!new_axes.empty()) {
+      params->axes = std::move(new_axes);
+      return Array<Array<Layout>>({{new_layout, new_layout}, {new_layout}});
+    }
+  }
+
+  if (old_in_layouts.defined()) {
+    ICHECK_EQ(old_in_layouts.size(), 2);
+    return {{old_in_layouts[0], old_in_layouts[1]}, {old_in_layouts[1]}};
+  }
+  return Array<Array<Layout>>({{Layout::Undef(), Layout::Undef()}, {Layout::Undef()}});
+}
+
 Array<te::Tensor> SliceLikeCompute(const Attrs& attrs, const Array<te::Tensor>& inputs,
                                    const Type& out_type) {
   const auto* param = attrs.as<SliceLikeAttrs>();
@@ -2801,6 +2841,7 @@ RELAY_REGISTER_OP("slice_like")
     .set_support_level(10)
     .add_type_rel("SliceLike", SliceLikeRel)
     .set_attr<FTVMCompute>("FTVMCompute", SliceLikeCompute)
+    .set_attr<FInferCorrectLayout>("FInferCorrectLayout", SliceLikeInferCorrectLayout)
     .set_attr<TOpPattern>("TOpPattern", kInjective);
 
 // relay.layout_transform
diff --git a/tests/python/relay/test_pass_convert_op_layout.py b/tests/python/relay/test_pass_convert_op_layout.py
index 6765d1f69b00..4c4bb9dee937 100644
--- a/tests/python/relay/test_pass_convert_op_layout.py
+++ b/tests/python/relay/test_pass_convert_op_layout.py
@@ -499,6 +499,75 @@ def before():
     assert len(has_lt) == 1
 
 
+def test_slice_like_convert_layout():
+    def verify_slice_like(after, expected_axes):
+        # Verify if the slice_like after the convert layout has the expected axes.
+        has_expected = list()
+        checker = lambda x: has_expected.append(
+            isinstance(x, tvm.relay.expr.Call)
+            and x.op.name == "slice_like"
+            and str(x.attrs.axes) == str(expected_axes)
+        )
+        relay.analysis.post_order_visit(after, checker)
+        assert any(has_expected)
+
+    def func_nhwc():
+        x = relay.var("x", shape=(1, 56, 56, 64))
+        weight1 = relay.var("weight1", shape=(3, 3, 64, 32))
+        y = relay.nn.conv2d(
+            x,
+            weight1,
+            channels=32,
+            kernel_size=(3, 3),
+            padding=(1, 1),
+            data_layout="NHWC",
+            kernel_layout="HWIO",
+        )
+        out = relay.slice_like(y, y, axes=[1, 2])
+        return relay.Function(analysis.free_vars(out), out)
+
+    after = run_opt_pass(func_nhwc(), transform.ConvertLayout({"nn.conv2d": ["NCHW", "default"]}))
+    verify_slice_like(after, [2, 3])
+
+    def func_nchw():
+        x = relay.var("x", shape=(1, 64, 56, 56))
+        weight1 = relay.var("weight1", shape=(32, 64, 3, 3))
+        y = relay.nn.conv2d(
+            x,
+            weight1,
+            channels=32,
+            kernel_size=(3, 3),
+            padding=(1, 1),
+            data_layout="NCHW",
+            kernel_layout="OIHW",
+        )
+        out = relay.slice_like(y, y, axes=[2, 3])
+        return relay.Function(analysis.free_vars(out), out)
+
+    after = run_opt_pass(func_nchw(), transform.ConvertLayout({"nn.conv2d": ["NHWC", "default"]}))
+    verify_slice_like(after, [1, 2])
+
+    def func_vars():
+        x = relay.var("x", shape=(1, 56, 56, 64))
+        weight1 = relay.var("weight1", shape=(3, 3, 64, 32))
+        y = relay.nn.conv2d(
+            x,
+            weight1,
+            channels=32,
+            kernel_size=(3, 3),
+            padding=(1, 1),
+            data_layout="NHWC",
+            kernel_layout="HWIO",
+        )
+        # z has no layout information so convert layout won't happen.
+        z = relay.var("y", shape=(1, 56, 56, 32))
+        out = relay.slice_like(y, z, axes=[1, 2])
+        return relay.Function(analysis.free_vars(out), out)
+
+    after = run_opt_pass(func_vars(), transform.ConvertLayout({"nn.conv2d": ["NCHW", "default"]}))
+    verify_slice_like(after, [1, 2])
+
+
 def test_resnet_convert_layout():
     def before():
         x = relay.var("x", shape=(1, 56, 56, 64))
@@ -1412,6 +1481,7 @@ def expected():
     test_conv_concat_convert_layout()
     test_dual_path_convert_layout()
     test_bn_convert_layout()
+    test_slice_like_convert_layout()
     test_resnet_convert_layout()
     test_scalar_convert_layout()
     test_conv_bn_convert_layout()

From 23bd82596f57c66adbd01bcb88f19209995d586c Mon Sep 17 00:00:00 2001
From: leowang1225 <810916296@qq.com>
Date: Tue, 5 Jan 2021 14:28:58 +0800
Subject: [PATCH 021/357] [AutoScheduler] Add custom build function (#7185)

* [AutoScheduler] Add custom build function

Signed-off-by: leowang1225 <810916296@qq.com>

* [AutoScheduler] Add custom build function

Signed-off-by: leowang1225 <810916296@qq.com>

* cheduler] Add custom build function

* [AutoScheduler] Add custom build function

Signed-off-by: leowang1225 <810916296@qq.com>

* [AutoScheduler] Add custom build function

Signed-off-by: leowang1225 <810916296@qq.com>

* [AutoScheduler] Add custom build function

Signed-off-by: leowang1225 <810916296@qq.com>
---
 python/tvm/auto_scheduler/measure.py | 44 ++++++++++++++++++++++------
 1 file changed, 35 insertions(+), 9 deletions(-)

diff --git a/python/tvm/auto_scheduler/measure.py b/python/tvm/auto_scheduler/measure.py
index 2f177a242835..47ffde4327c4 100644
--- a/python/tvm/auto_scheduler/measure.py
+++ b/python/tvm/auto_scheduler/measure.py
@@ -64,6 +64,18 @@
 MAX_FLOAT = 1e10
 
 
+class BuildFunc:
+    """store build_func name and callable to class variable.
+    name: str = "default"
+        The name of registered build function.
+    build_func: callable = tar.tar
+        The callable of registered build function.
+    """
+
+    name = "default"
+    build_func = tar.tar
+
+
 @tvm._ffi.register_object("auto_scheduler.MeasureCallback")
 class MeasureCallback(Object):
     """ The base class of measurement callback functions. """
@@ -303,12 +315,28 @@ class LocalBuilder(ProgramBuilder):
         This is used in a wrapper of the multiprocessing.Process.join().
     n_parallel : int = multiprocessing.cpu_count()
         Number of threads used to build in parallel.
-    build_func : str = 'default'
-        The name of registered build function.
+    build_func: callable or str = "default"
+        If is 'default', use default build function
+        If is 'ndk', use function for android ndk
+        If is callable, use it as custom build function, expect lib_format field.
     """
 
     def __init__(self, timeout=15, n_parallel=multiprocessing.cpu_count(), build_func="default"):
-        self.__init_handle_by_constructor__(_ffi_api.LocalBuilder, timeout, n_parallel, build_func)
+        if build_func == "default":
+            BuildFunc.name = "default"
+            BuildFunc.build_func = tar.tar
+        elif build_func == "ndk":
+            BuildFunc.name = "ndk"
+            BuildFunc.build_func = ndk.create_shared
+        elif callable(build_func):
+            BuildFunc.name = "custom"
+            BuildFunc.build_func = build_func
+        else:
+            raise ValueError("Invalid build_func" + build_func)
+
+        self.__init_handle_by_constructor__(
+            _ffi_api.LocalBuilder, timeout, n_parallel, BuildFunc.name
+        )
 
 
 @tvm._ffi.register_object("auto_scheduler.LocalRunner")
@@ -624,12 +652,10 @@ def local_build_worker(args):
         The build result of this Builder thread.
     """
     inp, build_func, timeout, verbose = args
-    if build_func == "default":
-        build_func = tar.tar
-    elif build_func == "ndk":
-        build_func = ndk.create_shared
-    else:
-        raise ValueError("Invalid build_func" + build_func)
+    assert build_func == BuildFunc.name, (
+        "BuildFunc.name: " + BuildFunc.name + ", but args is: " + build_func
+    )
+    build_func = BuildFunc.build_func
 
     res = call_func_with_timeout(timeout, _timed_func, args=(inp, build_func, verbose))
     if isinstance(res, TimeoutError):

From 18120609cc40a73f0cc4a821df858989d9426280 Mon Sep 17 00:00:00 2001
From: Josh Fromm <jwfromm@uw.edu>
Date: Tue, 5 Jan 2021 00:25:48 -0800
Subject: [PATCH 022/357] Fix prelu bug in onnx frontend. (#7208)

---
 python/tvm/relay/frontend/onnx.py          | 12 +++++-------
 tests/python/frontend/onnx/test_forward.py |  5 ++++-
 2 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/python/tvm/relay/frontend/onnx.py b/python/tvm/relay/frontend/onnx.py
index 1c544d309717..62396d839dc9 100644
--- a/python/tvm/relay/frontend/onnx.py
+++ b/python/tvm/relay/frontend/onnx.py
@@ -822,13 +822,11 @@ class Prelu(OnnxOpConverter):
     @classmethod
     def _impl_v1(cls, inputs, attr, params):
         assert len(inputs) == 2, "Prelu need 2 inputs, {} given".format(len(inputs))
-        input_channels = infer_shape(inputs[0])[1]
-        alpha_shape = infer_shape(inputs[1])
-        if len(alpha_shape) != 1:
-            alpha = _op.reshape(inputs[1], (-1,))
-        else:
-            alpha = inputs[1]
-        return _op.nn.prelu(inputs[0], _op.broadcast_to(alpha, [input_channels]))
+        input_shape = _op.shape_of(inputs[0])
+        alpha = _op.broadcast_to_like(inputs[1], inputs[0])
+        alpha = _op.reshape(alpha, [-1])
+        output = _op.nn.prelu(_op.reshape(inputs[0], [-1]), alpha, axis=0)
+        return _op.reshape(output, input_shape)
 
 
 class Reciprocal(OnnxOpConverter):
diff --git a/tests/python/frontend/onnx/test_forward.py b/tests/python/frontend/onnx/test_forward.py
index 3d95a9a83ee3..0f7fda7301cd 100644
--- a/tests/python/frontend/onnx/test_forward.py
+++ b/tests/python/frontend/onnx/test_forward.py
@@ -2037,12 +2037,15 @@ def verify_prelu(x_shape, a_shape):
 
         model = helper.make_model(graph, producer_name="prelu_test")
 
-        verify_with_ort(model, [x_shape, a_shape], list(x_shape))
+        verify_with_ort(
+            model, [x_shape, a_shape], list(x_shape), use_vm=True, convert_to_static=True
+        )
 
     verify_prelu([3, 4, 5, 6], [1, 4, 1, 1])
     verify_prelu([1, 8, 5, 6], [1, 8, 1, 1])
     verify_prelu([2, 12, 16, 16], [1, 12, 1, 1])
     verify_prelu([2, 12, 16, 16], [1])  # Test alpha broadcasting.
+    verify_prelu([3, 1], [3, 1])  # Test non NCHW workload.
 
 
 @tvm.testing.uses_gpu

From d3bb7622f69f2e774950ce2c6ff6bb354d0f5167 Mon Sep 17 00:00:00 2001
From: Matthew Brookhart <mbrookhart@octoml.ai>
Date: Tue, 5 Jan 2021 14:14:20 -0700
Subject: [PATCH 023/357] [PatternLang] Add Syntatic Sugar to the C++ pattern
 API and support DataType Attribute Matching (#7120)

* Add Syntatic Sugar for C++ Pattern API, Support DataType Attribute match

* add missing tests

* fix lint

* fix license edit

* fix bad rebase
---
 include/tvm/relay/dataflow_pattern.h          |  56 +++--
 python/tvm/relay/dataflow_pattern/__init__.py |   4 +-
 src/relay/ir/dataflow_matcher.cc              |  14 +-
 src/relay/ir/dataflow_pattern.cc              |  67 ++++--
 src/relay/transforms/simplify_expr.cc         |   9 +-
 tests/cpp/dataflow_pattern_test.cc            | 200 ++++++++++++++++++
 tests/python/relay/test_dataflow_pattern.py   |   6 +
 7 files changed, 319 insertions(+), 37 deletions(-)
 create mode 100644 tests/cpp/dataflow_pattern_test.cc

diff --git a/include/tvm/relay/dataflow_pattern.h b/include/tvm/relay/dataflow_pattern.h
index 909a4fe44eb1..5b2734f52ede 100644
--- a/include/tvm/relay/dataflow_pattern.h
+++ b/include/tvm/relay/dataflow_pattern.h
@@ -27,6 +27,9 @@
 #include <tvm/relay/expr.h>
 #include <tvm/relay/type.h>
 
+#include <string>
+#include <vector>
+
 namespace tvm {
 namespace relay {
 
@@ -46,6 +49,29 @@ class DFPatternNode : public Object {
  */
 class DFPattern : public ObjectRef {
  public:
+  /*! \brief Syntatic Sugar for creating a CallPattern */
+  DFPattern operator()(const std::vector<DFPattern>& args);
+  /*! \brief Syntatic Sugar for creating a CallPattern with an "add" op */
+  DFPattern operator+(const DFPattern& other);
+  /*! \brief Syntatic Sugar for creating a CallPattern with a "subtract" op */
+  DFPattern operator-(const DFPattern& other);
+  /*! \brief Syntatic Sugar for creating a CallPattern with a "multiply" op */
+  DFPattern operator*(const DFPattern& other);
+  /*! \brief Syntatic Sugar for creating a CallPattern with a "divide" op */
+  DFPattern operator/(const DFPattern& other);
+  /*! \brief Syntatic Sugar for creating an AltPattern */
+  DFPattern operator||(const DFPattern& other);
+  /*! \brief Syntatic Sugar for creating an AttrPattern */
+  DFPattern HasAttr(const Map<String, ObjectRef>& attrs);
+  /*! \brief Syntatic Sugar for creating a TypePattern */
+  DFPattern HasType(const Type& type);
+  /*! \brief Syntatic Sugar for creating a DataTypePattern with a DataType */
+  DFPattern HasDtype(const DataType& dtype);
+  /*! \brief Syntatic Sugar for creating a DataTypePattern with a data type's name */
+  DFPattern HasDtype(const std::string& dtype);
+  /*! \brief Syntatic Sugar for creating a ShapePattern */
+  DFPattern HasShape(const Array<PrimExpr> shape);
+
   TVM_DEFINE_OBJECT_REF_METHODS(DFPattern, ObjectRef, DFPatternNode);
 };
 
@@ -86,20 +112,11 @@ class VarPatternNode : public DFPatternNode {
    * \brief The name of the Var (optional).
    */
   String name;
-  /*!
-   * \brief type annotation of the variable.
-   * This field records user provided type annotation of the Var.
-   * This field is optional and can be None.
-   */
-  Type type_annotation;
 
   /*! \return The name hint of the variable */
   const String& name_hint() const { return name; }
 
-  void VisitAttrs(tvm::AttrVisitor* v) {
-    v->Visit("name", &name);
-    v->Visit("type_annotation", &type_annotation);
-  }
+  void VisitAttrs(tvm::AttrVisitor* v) { v->Visit("name", &name); }
 
   static constexpr const char* _type_key = "relay.dataflow_pattern.VarPattern";
   TVM_DECLARE_FINAL_OBJECT_INFO(VarPatternNode, DFPatternNode);
@@ -107,7 +124,7 @@ class VarPatternNode : public DFPatternNode {
 
 class VarPattern : public DFPattern {
  public:
-  TVM_DLL VarPattern(String name_hint, Type type_annotation);
+  TVM_DLL VarPattern(String name_hint);
   TVM_DEFINE_OBJECT_REF_METHODS(VarPattern, DFPattern, VarPatternNode);
 };
 
@@ -393,7 +410,7 @@ class AttrPatternNode : public DFPatternNode {
   /*! \brief The pattern. */
   DFPattern pattern;
   /*! \brief The attribute to match */
-  Attrs attrs;
+  DictAttrs attrs;
 
   void VisitAttrs(tvm::AttrVisitor* v) {
     v->Visit("pattern", &pattern);
@@ -409,7 +426,7 @@ class AttrPatternNode : public DFPatternNode {
  */
 class AttrPattern : public DFPattern {
  public:
-  TVM_DLL AttrPattern(DFPattern pattern, Attrs attrs);
+  TVM_DLL AttrPattern(DFPattern pattern, DictAttrs attrs);
   TVM_DEFINE_OBJECT_REF_METHODS(AttrPattern, DFPattern, AttrPatternNode);
 };
 
@@ -447,6 +464,19 @@ class DominatorPattern : public DFPattern {
   TVM_DEFINE_OBJECT_REF_METHODS(DominatorPattern, DFPattern, DominatorPatternNode);
 };
 
+/*! \brief Syntatic Sugar for creating a VarPattern with a name */
+DFPattern IsVar(const String& name);
+/*! \brief Syntatic Sugar for creating a ConstantPattern */
+DFPattern IsConstant();
+/*! \brief Syntatic Sugar for creating a ExprPattern */
+DFPattern IsExpr(const Expr& expr);
+/*! \brief Syntatic Sugar for creating a ExprPattern base on an Op*/
+DFPattern IsOp(const String& op_name);
+/*! \brief Syntatic Sugar for creating a TuplePattern*/
+DFPattern IsTuple(const Array<DFPattern>& fields);
+/*! \brief Syntatic Sugar for creating a TupleGetItemPattern*/
+DFPattern IsTupleGetItem(const DFPattern tuple, int index = -1);
+
 }  // namespace relay
 }  // namespace tvm
 #endif  // TVM_RELAY_DATAFLOW_PATTERN_H_
diff --git a/python/tvm/relay/dataflow_pattern/__init__.py b/python/tvm/relay/dataflow_pattern/__init__.py
index 233c696fd716..f5161ad0bfa7 100644
--- a/python/tvm/relay/dataflow_pattern/__init__.py
+++ b/python/tvm/relay/dataflow_pattern/__init__.py
@@ -480,8 +480,8 @@ class VarPattern(DFPattern):
         The type annotation on the variable.
     """
 
-    def __init__(self, name_hint: str = "", type_annotation: Optional[tvm.ir.type.Type] = None):
-        self.__init_handle_by_constructor__(ffi.VarPattern, name_hint, type_annotation)
+    def __init__(self, name_hint: str = ""):
+        self.__init_handle_by_constructor__(ffi.VarPattern, name_hint)
 
 
 @register_df_node
diff --git a/src/relay/ir/dataflow_matcher.cc b/src/relay/ir/dataflow_matcher.cc
index c5cc3dd17429..e4c0c7fa1c94 100644
--- a/src/relay/ir/dataflow_matcher.cc
+++ b/src/relay/ir/dataflow_matcher.cc
@@ -124,6 +124,13 @@ bool MatchRetValue(const ObjectRef& lhs, const TVMRetValue& rhs) {
         return val->data == rhs.operator std::string();
       }
       break;
+    case kTVMDataType:
+      if (auto* val = lhs.as<tir::StringImmNode>()) {
+        return rhs.operator std::string() == val->value;
+      } else if (auto* val = lhs.as<StringObj>()) {
+        return rhs.operator std::string() == val->data;
+      }
+      break;
     case kTVMObjectHandle:
       if (rhs.IsObjectRef<String>()) {
         if (auto* val = lhs.as<tir::StringImmNode>()) {
@@ -140,7 +147,10 @@ bool MatchRetValue(const ObjectRef& lhs, const TVMRetValue& rhs) {
 }
 
 bool DFPatternMatcher::VisitDFPattern_(const AttrPatternNode* attr_pattern, const Expr& expr) {
-  bool matches = false;
+  bool matches = VisitDFPattern(attr_pattern->pattern, expr);
+  if (!matches) {
+    return matches;
+  }
   auto attributes = attr_pattern->attrs.as<DictAttrsNode>()->dict;
   if (const auto* op_node = expr.as<OpNode>()) {
     Op op = GetRef<Op>(op_node);
@@ -179,7 +189,7 @@ bool DFPatternMatcher::VisitDFPattern_(const AttrPatternNode* attr_pattern, cons
       }
     }
   }
-  return matches && VisitDFPattern(attr_pattern->pattern, expr);
+  return matches;
 }
 
 Array<DFPattern> reverse(const Array<DFPattern>& args) {
diff --git a/src/relay/ir/dataflow_pattern.cc b/src/relay/ir/dataflow_pattern.cc
index 46c53c8bd96c..086c3852b13f 100644
--- a/src/relay/ir/dataflow_pattern.cc
+++ b/src/relay/ir/dataflow_pattern.cc
@@ -22,6 +22,7 @@
  * \brief The dataflow pattern language for Relay.
  */
 #include <tvm/relay/dataflow_pattern.h>
+#include <tvm/runtime/data_type.h>
 
 namespace tvm {
 namespace relay {
@@ -44,29 +45,22 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable)
       p->Print(node->expr);
     });
 
-VarPattern::VarPattern(String name_hint, Type type_annotation) {
+VarPattern::VarPattern(String name_hint) {
   ObjectPtr<VarPatternNode> n = make_object<VarPatternNode>();
   n->name = std::move(name_hint);
-  n->type_annotation = std::move(type_annotation);
   data_ = std::move(n);
 }
 
 TVM_REGISTER_NODE_TYPE(VarPatternNode);
 
-TVM_REGISTER_GLOBAL("relay.dataflow_pattern.VarPattern")
-    .set_body_typed([](String name_hint, Type type_annotation) {
-      return VarPattern(name_hint, type_annotation);
-    });
+TVM_REGISTER_GLOBAL("relay.dataflow_pattern.VarPattern").set_body_typed([](String name_hint) {
+  return VarPattern(name_hint);
+});
 
 TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable)
     .set_dispatch<VarPatternNode>([](const ObjectRef& ref, ReprPrinter* p) {
       auto* node = static_cast<const VarPatternNode*>(ref.get());
-      p->stream << "VarPattern(" << node->name_hint();
-      if (node->type_annotation.defined()) {
-        p->stream << ", ty=";
-        p->Print(node->type_annotation);
-      }
-      p->stream << ")";
+      p->stream << "VarPattern(" << node->name_hint() << ")";
     });
 
 TVM_REGISTER_NODE_TYPE(ConstantPatternNode);
@@ -241,7 +235,7 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable)
       p->stream << "TypePattern(" << node->pattern << " has dtype " << node->dtype << ")";
     });
 
-AttrPattern::AttrPattern(DFPattern pattern, Attrs attrs) {
+AttrPattern::AttrPattern(DFPattern pattern, DictAttrs attrs) {
   ObjectPtr<AttrPatternNode> n = make_object<AttrPatternNode>();
   n->pattern = std::move(pattern);
   n->attrs = std::move(attrs);
@@ -251,7 +245,7 @@ AttrPattern::AttrPattern(DFPattern pattern, Attrs attrs) {
 TVM_REGISTER_NODE_TYPE(AttrPatternNode);
 
 TVM_REGISTER_GLOBAL("relay.dataflow_pattern.AttrPattern")
-    .set_body_typed([](DFPattern pattern, Attrs attrs) { return AttrPattern(pattern, attrs); });
+    .set_body_typed([](DFPattern pattern, DictAttrs attrs) { return AttrPattern(pattern, attrs); });
 
 TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable)
     .set_dispatch<AttrPatternNode>([](const ObjectRef& ref, ReprPrinter* p) {
@@ -263,6 +257,7 @@ DominatorPattern::DominatorPattern(DFPattern parent, DFPattern path, DFPattern c
   ObjectPtr<DominatorPatternNode> n = make_object<DominatorPatternNode>();
   n->parent = std::move(parent);
   n->path = std::move(path);
+
   n->child = std::move(child);
   data_ = std::move(n);
 }
@@ -281,5 +276,49 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable)
                 << ")";
     });
 
+// Syntatic Sugar
+DFPattern DFPattern::operator()(const std::vector<DFPattern>& args) {
+  return CallPattern(GetRef<DFPattern>(this->get()), Array<DFPattern>(args));
+}
+DFPattern DFPattern::operator+(const DFPattern& other) {
+  return IsOp("add")({GetRef<DFPattern>(this->get()), other});
+}
+DFPattern DFPattern::operator-(const DFPattern& other) {
+  return IsOp("subtract")({GetRef<DFPattern>(this->get()), other});
+}
+DFPattern DFPattern::operator*(const DFPattern& other) {
+  return IsOp("multiply")({GetRef<DFPattern>(this->get()), other});
+}
+DFPattern DFPattern::operator/(const DFPattern& other) {
+  return IsOp("divide")({GetRef<DFPattern>(this->get()), other});
+}
+DFPattern DFPattern::operator||(const DFPattern& other) {
+  return AltPattern(GetRef<DFPattern>(this->get()), other);
+}
+
+DFPattern DFPattern::HasAttr(const Map<String, ObjectRef>& attrs) {
+  return AttrPattern(GetRef<DFPattern>(this->get()), DictAttrs(attrs));
+}
+DFPattern DFPattern::HasType(const Type& type) {
+  return TypePattern(GetRef<DFPattern>(this->get()), type);
+}
+DFPattern DFPattern::HasDtype(const DataType& dtype) {
+  return DataTypePattern(GetRef<DFPattern>(this->get()), dtype);
+}
+DFPattern DFPattern::HasDtype(const std::string& dtype) {
+  return HasDtype(DataType(runtime::String2DLDataType(dtype)));
+}
+DFPattern DFPattern::HasShape(const Array<PrimExpr> shape) {
+  return ShapePattern(GetRef<DFPattern>(this->get()), shape);
+}
+DFPattern IsVar(const String& name) { return VarPattern(name); }
+DFPattern IsConstant() { return ConstantPattern(make_object<ConstantPatternNode>()); }
+DFPattern IsExpr(const Expr& expr) { return ExprPattern(expr); }
+DFPattern IsOp(const String& op_name) { return IsExpr(Op::Get(op_name)); }
+DFPattern IsTuple(const Array<DFPattern>& fields) { return TuplePattern(fields); }
+DFPattern IsTupleGetItem(const DFPattern tuple, int index) {
+  return TupleGetItemPattern(tuple, index);
+}
+
 }  // namespace relay
 }  // namespace tvm
diff --git a/src/relay/transforms/simplify_expr.cc b/src/relay/transforms/simplify_expr.cc
index cb42ab09aae4..0f78c260378c 100644
--- a/src/relay/transforms/simplify_expr.cc
+++ b/src/relay/transforms/simplify_expr.cc
@@ -33,9 +33,6 @@
 namespace tvm {
 namespace relay {
 
-static Op reshape_op = Op::Get("reshape");
-static Op reverse_reshape_op = Op::Get("contrib_reverse_reshape");
-
 /*!
  * \brief SimplifyReshape matches the pattern of consecutive reshape or reverse_reshape ops,
  *   and merges into one reshape op.
@@ -44,9 +41,9 @@ class SimplifyReshape {
  public:
   SimplifyReshape() {
     x_ = WildcardPattern(make_object<WildcardPatternNode>());
-    auto reshape1 = AltPattern(ExprPattern(reshape_op), ExprPattern(reverse_reshape_op));
-    auto reshape2 = AltPattern(ExprPattern(reshape_op), ExprPattern(reverse_reshape_op));
-    pattern_ = CallPattern(reshape1, {CallPattern(reshape2, {x_})});
+    auto reshape1 = IsOp("reshape") || IsOp("contrib_reverse_reshape");
+    auto reshape2 = IsOp("reshape") || IsOp("contrib_reverse_reshape");
+    pattern_ = reshape1({reshape2({x_})});
   }
 
   Expr callback(const Expr& pre, const Expr& post, const Map<DFPattern, Array<Expr>>& node_map) {
diff --git a/tests/cpp/dataflow_pattern_test.cc b/tests/cpp/dataflow_pattern_test.cc
new file mode 100644
index 000000000000..bdccaaa2e6ba
--- /dev/null
+++ b/tests/cpp/dataflow_pattern_test.cc
@@ -0,0 +1,200 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include <gtest/gtest.h>
+#include <tvm/relay/dataflow_pattern.h>
+#include <tvm/tir/analysis.h>
+
+TEST(DFPattern, IsVar) {
+  using namespace tvm;
+  using namespace tvm::relay;
+  auto pattern = IsVar("add");
+  auto* node = pattern.as<VarPatternNode>();
+  ICHECK(node);
+  ICHECK(node->name == String("add"));
+}
+
+TEST(DFPattern, IsConstant) {
+  using namespace tvm;
+  using namespace tvm::relay;
+  auto pattern = IsConstant();
+  auto* node = pattern.as<ConstantPatternNode>();
+  ICHECK(node);
+}
+
+TEST(DFPattern, IsOp) {
+  using namespace tvm;
+  using namespace tvm::relay;
+  auto pattern = IsOp("add");
+  auto* node = pattern.as<ExprPatternNode>();
+  ICHECK(node);
+  ICHECK(node->expr == Op::Get("add"));
+}
+
+TEST(DFPattern, IsTuple) {
+  using namespace tvm;
+  using namespace tvm::relay;
+  auto a = WildcardPattern();
+  auto b = WildcardPattern();
+  auto pattern = IsTuple({a, b});
+  auto* node = pattern.as<TuplePatternNode>();
+  ICHECK(node);
+  ICHECK(node->fields[0] == a);
+  ICHECK(node->fields[1] == b);
+}
+
+TEST(DFPattern, IsTupleGetItem) {
+  using namespace tvm;
+  using namespace tvm::relay;
+  auto a = WildcardPattern();
+  auto b = WildcardPattern();
+  auto tuple = IsTuple({a, b});
+  auto pattern = IsTupleGetItem(tuple, 1);
+  auto* node = pattern.as<TupleGetItemPatternNode>();
+  ICHECK(node);
+  ICHECK(node->tuple == tuple);
+  ICHECK(node->index == 1);
+}
+
+TEST(DFPattern, ADD) {
+  using namespace tvm;
+  using namespace tvm::relay;
+  auto a = WildcardPattern();
+  auto b = WildcardPattern();
+  auto pattern = a + b;
+  auto* node = pattern.as<CallPatternNode>();
+  ICHECK(node);
+  ICHECK(node->args[0] == a);
+  ICHECK(node->args[1] == b);
+  auto* expr_pattern = node->op.as<ExprPatternNode>();
+  ICHECK(expr_pattern);
+  ICHECK(expr_pattern->expr == Op::Get("add"));
+}
+
+TEST(DFPattern, SUB) {
+  using namespace tvm;
+  using namespace tvm::relay;
+  auto a = WildcardPattern();
+  auto b = WildcardPattern();
+  auto pattern = a - b;
+  auto* node = pattern.as<CallPatternNode>();
+  ICHECK(node);
+  ICHECK(node->args[0] == a);
+  ICHECK(node->args[1] == b);
+  auto* expr_pattern = node->op.as<ExprPatternNode>();
+  ICHECK(expr_pattern);
+  ICHECK(expr_pattern->expr == Op::Get("subtract"));
+}
+
+TEST(DFPattern, MUL) {
+  using namespace tvm;
+  using namespace tvm::relay;
+  auto a = WildcardPattern();
+  auto b = WildcardPattern();
+  auto pattern = a * b;
+  auto* node = pattern.as<CallPatternNode>();
+  ICHECK(node);
+  ICHECK(node->args[0] == a);
+  ICHECK(node->args[1] == b);
+  auto* expr_pattern = node->op.as<ExprPatternNode>();
+  ICHECK(expr_pattern);
+  ICHECK(expr_pattern->expr == Op::Get("multiply"));
+}
+
+TEST(DFPattern, DIV) {
+  using namespace tvm;
+  using namespace tvm::relay;
+  auto a = WildcardPattern();
+  auto b = WildcardPattern();
+  auto pattern = a / b;
+  auto* node = pattern.as<CallPatternNode>();
+  ICHECK(node);
+  ICHECK(node->args[0] == a);
+  ICHECK(node->args[1] == b);
+  auto* expr_pattern = node->op.as<ExprPatternNode>();
+  ICHECK(expr_pattern);
+  ICHECK(expr_pattern->expr == Op::Get("divide"));
+}
+
+TEST(DFPattern, OR) {
+  using namespace tvm;
+  using namespace tvm::relay;
+  auto a = WildcardPattern();
+  auto b = WildcardPattern();
+  auto pattern = a || b;
+  auto* node = pattern.as<AltPatternNode>();
+  ICHECK(node);
+  ICHECK(node->left == a);
+  ICHECK(node->right == b);
+}
+
+TEST(DFPattern, HasAttr) {
+  using namespace tvm;
+  using namespace tvm::relay;
+  auto a = WildcardPattern();
+  Map<String, ObjectRef> attrs;
+  auto b = String("b");
+  attrs.Set("a", b);
+  auto pattern = a.HasAttr(attrs);
+  auto* node = pattern.as<AttrPatternNode>();
+  ICHECK(node);
+  ICHECK(node->pattern == a);
+  ICHECK(node->attrs->dict.at("a") == b);
+}
+
+TEST(DFPattern, HasType) {
+  using namespace tvm;
+  using namespace tvm::relay;
+  auto a = WildcardPattern();
+  TensorType type({1, 2, 3}, DataType(runtime::String2DLDataType("float32")));
+  auto pattern = a.HasType(type);
+  auto* node = pattern.as<TypePatternNode>();
+  ICHECK(node);
+  ICHECK(node->pattern == a);
+  ICHECK(node->type == type);
+}
+
+TEST(DFPattern, HasDtype) {
+  using namespace tvm;
+  using namespace tvm::relay;
+  auto a = WildcardPattern();
+  auto pattern = a.HasDtype("float32");
+  auto* node = pattern.as<DataTypePatternNode>();
+  ICHECK(node);
+  ICHECK(node->pattern == a);
+  ICHECK(runtime::DLDataType2String(node->dtype.operator DLDataType()) == "float32");
+}
+
+TEST(DFPattern, HasShape) {
+  using namespace tvm;
+  using namespace tvm::relay;
+  auto a = WildcardPattern();
+  Array<PrimExpr> shape{1, 2, 3};
+  auto pattern = a.HasShape(shape);
+  auto* node = pattern.as<ShapePatternNode>();
+  ICHECK(node);
+  ICHECK(node->pattern == a);
+  ICHECK(node->shape == shape);
+}
+
+int main(int argc, char** argv) {
+  testing::InitGoogleTest(&argc, argv);
+  testing::FLAGS_gtest_death_test_style = "threadsafe";
+  return RUN_ALL_TESTS();
+}
diff --git a/tests/python/relay/test_dataflow_pattern.py b/tests/python/relay/test_dataflow_pattern.py
index d99e55b7c33f..f30a4e747c33 100644
--- a/tests/python/relay/test_dataflow_pattern.py
+++ b/tests/python/relay/test_dataflow_pattern.py
@@ -401,6 +401,12 @@ def test_no_match_call_attr():
     assert not is_conv2d.match(relay.op.nn.conv2d(x, y))
 
 
+def test_match_call_attr_dtype():
+    is_cast = is_op("cast")(wildcard()).has_attr({"dtype": "float32"})
+    x = relay.var("x")
+    assert is_cast.match(relay.op.cast(x, "float32"))
+
+
 def test_match_diamond():
     # Pattern
     is_conv2d = is_op("nn.conv2d")(wildcard(), wildcard())

From 8b447411b1948bea3785059ffae4daa890b5a971 Mon Sep 17 00:00:00 2001
From: Andrew Reusch <areusch@octoml.ai>
Date: Tue, 5 Jan 2021 13:16:31 -0800
Subject: [PATCH 024/357] =?UTF-8?q?[=C2=B5TVM]=20Raise=20a=20better=20erro?=
 =?UTF-8?q?r=20when=20project=5Fdir=20does=20not=20exist=20(#7165)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 python/tvm/micro/contrib/zephyr.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/python/tvm/micro/contrib/zephyr.py b/python/tvm/micro/contrib/zephyr.py
index 66254987cb8b..61aec2b771e0 100644
--- a/python/tvm/micro/contrib/zephyr.py
+++ b/python/tvm/micro/contrib/zephyr.py
@@ -58,6 +58,10 @@ def run(self, cmd, **kw):
         return subprocess.check_output(cmd, env=env, **kw)
 
 
+class ProjectNotFoundError(Exception):
+    """Raised when the project_dir supplied to ZephyrCompiler does not exist."""
+
+
 class FlashRunnerNotSupported(Exception):
     """Raised when the FLASH_RUNNER for a project isn't supported by this Zephyr adapter."""
 
@@ -95,6 +99,13 @@ def __init__(
             If given, additional environment variables present when invoking west, cmake, or make.
         """
         self._project_dir = project_dir
+        if not os.path.exists(project_dir):
+            # Raise this error instead of a potentially-more-cryptic compiler error due to a missing
+            # prj.conf.
+            raise ProjectNotFoundError(
+                f"project_dir supplied to ZephyrCompiler does not exist: {project_dir}"
+            )
+
         self._board = board
         if west_cmd is None:
             self._west_cmd = [sys.executable, "-mwest.app.main"]

From 197594bfbf9f4c7ef80bd05c75cc1e1615c3609f Mon Sep 17 00:00:00 2001
From: Josh Fromm <jwfromm@uw.edu>
Date: Tue, 5 Jan 2021 15:30:20 -0800
Subject: [PATCH 025/357] Allow condition in if op to be an array. (#7215)

---
 python/tvm/relay/frontend/onnx.py          |  3 +++
 tests/python/frontend/onnx/test_forward.py | 15 ++++++++++++---
 2 files changed, 15 insertions(+), 3 deletions(-)

diff --git a/python/tvm/relay/frontend/onnx.py b/python/tvm/relay/frontend/onnx.py
index 62396d839dc9..4c9996bc855a 100644
--- a/python/tvm/relay/frontend/onnx.py
+++ b/python/tvm/relay/frontend/onnx.py
@@ -2266,6 +2266,9 @@ class If(OnnxOpConverter):
     @classmethod
     def _impl_v1(cls, inputs, attr, params):
         cond = inputs[0]
+        # Convert array to bool if needed.
+        if len(infer_shape(cond)) > 0:
+            cond = _op.take(cond, _expr.const(0, dtype="int64"))
         then_branch = attr.get("then_branch", None)
         else_branch = attr.get("else_branch", None)
         assert then_branch is not None and else_branch is not None
diff --git a/tests/python/frontend/onnx/test_forward.py b/tests/python/frontend/onnx/test_forward.py
index 0f7fda7301cd..df35a7e9bb56 100644
--- a/tests/python/frontend/onnx/test_forward.py
+++ b/tests/python/frontend/onnx/test_forward.py
@@ -3969,8 +3969,7 @@ def test_loop():
     verify_count_loop()
 
 
-@tvm.testing.uses_gpu
-def test_if():
+def verify_if(cond_array):
     # Given a bool scalar input cond.
     # return constant tensor x if cond is True, otherwise return constant tensor y.
     then_out = onnx.helper.make_tensor_value_info("then_out", onnx.TensorProto.FLOAT, [5])
@@ -4007,7 +4006,10 @@ def test_if():
     )
 
     if_model = onnx.helper.make_model(if_graph)
-    cond = np.array(1).astype("bool")
+    if cond_array:
+        cond = np.array([1]).astype("bool")
+    else:
+        cond = np.array(1).astype("bool")
     correct_out = x if cond else y
 
     for target, ctx in tvm.testing.enabled_targets():
@@ -4016,6 +4018,13 @@ def test_if():
             tvm.testing.assert_allclose(correct_out[i], tvm_out[i], rtol=1e-05, atol=1e-05)
 
 
+@tvm.testing.uses_gpu
+def test_if():
+    # Confirm that if works with cond as an array or scalar.
+    verify_if(cond_array=False)
+    verify_if(cond_array=True)
+
+
 @tvm.testing.uses_gpu
 def test_size():
     def verify_size(indata):

From 3f0dc420ff9b891a79f55181886b536ecc337796 Mon Sep 17 00:00:00 2001
From: insop <insop.song@gmail.com>
Date: Tue, 5 Jan 2021 17:17:02 -0800
Subject: [PATCH 026/357] [Frontend][MXNet] add _npi_stack, issue #7186 (#7209)

- https://github.com/apache/tvm/issues/7186
- add MxNet stack, `_npi_stack`
- https://mxnet.apache.org/versions/master/api/python/docs/api/np/generated/mxnet.np.stack.html?highlight=stack
---
 python/tvm/relay/frontend/mxnet.py          |  9 +++++++
 tests/python/frontend/mxnet/test_forward.py | 28 +++++++++++++++++++++
 2 files changed, 37 insertions(+)

diff --git a/python/tvm/relay/frontend/mxnet.py b/python/tvm/relay/frontend/mxnet.py
index 1085e904c386..b272ead9737d 100644
--- a/python/tvm/relay/frontend/mxnet.py
+++ b/python/tvm/relay/frontend/mxnet.py
@@ -2335,6 +2335,14 @@ def _mx_npi_concatenate(inputs, attrs):
         return _op.concatenate(tuple(inputs), axis=int(axis))
 
 
+def _mx_npi_stack(inputs, attrs):
+    axis = attrs.get_str("axis", "0")
+    if axis == "None":
+        return _op.reshape(_op.stack(tuple(inputs), axis=0), (-1,))
+    else:
+        return _op.stack(tuple(inputs), axis=int(axis))
+
+
 def _mx_npx_reshape(inputs, attrs):
     shape = attrs.get_int_tuple("newshape")
     reverse = attrs.get_bool("reverse", False)
@@ -2700,6 +2708,7 @@ def _mx_npi_where_rscalar(inputs, attrs):
     "_npi_less_equal": _mx_compare(_op.less_equal, _rename),
     "_npi_tanh": _rename(_op.tanh),
     "_npi_true_divide_scalar": _binop_scalar(_op.divide),
+    "_npi_stack": _mx_npi_stack,
 }
 
 # set identity list
diff --git a/tests/python/frontend/mxnet/test_forward.py b/tests/python/frontend/mxnet/test_forward.py
index d3be8c0506ba..537349e073e1 100644
--- a/tests/python/frontend/mxnet/test_forward.py
+++ b/tests/python/frontend/mxnet/test_forward.py
@@ -2012,6 +2012,34 @@ def test_forward_npi_concatenate(data_shape1, data_shape2, axis, dtype, target,
     tvm.testing.assert_allclose(op_res.asnumpy(), ref_res.asnumpy(), rtol=1e-5)
 
 
+@pytest.mark.parametrize(
+    "data_shape1, data_shape2, axis",
+    [
+        ((3,), (3,), 0),
+        ((3,), (3,), -1),
+        ((1, 3, 2), (1, 3, 2), 2),
+        ((1, 3, 3), (1, 3, 3), 1),
+        ((1, 3), (1, 3), 0),
+    ],
+)
+@pytest.mark.parametrize("dtype", ["float64", "float32", "int64", "int32"])
+@tvm.testing.parametrize_targets
+@pytest.mark.parametrize("kind", ["graph", "vm", "debug"])
+def test_forward_npi_stack(data_shape1, data_shape2, axis, dtype, target, ctx, kind):
+    data_np1 = np.random.uniform(size=data_shape1).astype(dtype)
+    data_np2 = np.random.uniform(size=data_shape2).astype(dtype)
+    data1 = mx.sym.var("data1")
+    data2 = mx.sym.var("data2")
+    ref_res = mx.np.stack([mx.np.array(data_np1), mx.np.array(data_np2)], axis=axis)
+    mx_sym = mx.sym.np.stack([data1.as_np_ndarray(), data2.as_np_ndarray()], axis=axis)
+    mod, _ = relay.frontend.from_mxnet(
+        mx_sym, shape={"data1": data_shape1, "data2": data_shape2}, dtype=dtype
+    )
+    intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target)
+    op_res = intrp.evaluate()(data_np1, data_np2)
+    tvm.testing.assert_allclose(op_res.asnumpy(), ref_res.asnumpy(), rtol=1e-5)
+
+
 @pytest.mark.parametrize("data_shape", [(2, 2, 2), (2, 7, 2), (2, 2, 2, 1, 2, 3, 1), (1, 8)])
 @pytest.mark.parametrize("dtype", ["float64", "float32", "int64", "int32", "bool"])
 @tvm.testing.parametrize_targets

From 040afb0245526e1cc71dc0ada6c3c5787394a5c6 Mon Sep 17 00:00:00 2001
From: Chenfan <chengfan.jcf@alibaba-inc.com>
Date: Wed, 6 Jan 2021 10:04:34 +0800
Subject: [PATCH 027/357] [Fix][Autoscheduler] Costmodel enhancement & bug fix
 for graph debug runtime (#7197)

* Enhancement for autoscheduler cost model

* Bug fix for graph_runtime_debug

* Update

* Lint fix

* Update

* Update

* Add file exist check for cost model load

* Update

* Update

* Lint fix

* Update

* Bug fix
---
 .../auto_scheduler/cost_model/xgb_model.py    | 25 ++++++++++++++++++-
 python/tvm/auto_scheduler/task_scheduler.py   | 13 ++++++++--
 src/auto_scheduler/feature.cc                 | 18 ++++++++-----
 3 files changed, 47 insertions(+), 9 deletions(-)

diff --git a/python/tvm/auto_scheduler/cost_model/xgb_model.py b/python/tvm/auto_scheduler/cost_model/xgb_model.py
index eb14dff0815c..f42648288bfa 100644
--- a/python/tvm/auto_scheduler/cost_model/xgb_model.py
+++ b/python/tvm/auto_scheduler/cost_model/xgb_model.py
@@ -88,7 +88,14 @@ class XGBModel(PythonBasedModel):
     their predictions.
     """
 
-    def __init__(self, verbose_eval=25, num_warmup_sample=100, seed=None):
+    def __init__(
+        self,
+        verbose_eval=25,
+        num_warmup_sample=100,
+        seed=None,
+        model_file=None,
+        adapative_training=False,
+    ):
         global xgb
         try:
             if xgb is None:
@@ -116,12 +123,15 @@ def __init__(self, verbose_eval=25, num_warmup_sample=100, seed=None):
         self.plan_size = 32
         self.num_warmup_sample = num_warmup_sample
         self.verbose_eval = verbose_eval
+        self.model_file = model_file
+        self.adapative_training = adapative_training
 
         super().__init__()
 
         # cache measurement input/result pairs and extracted features
         self.inputs = []
         self.results = []
+        self.last_train_length = 0
         self.inputs_feature_cache = []
 
     def update(self, inputs, results):
@@ -141,6 +151,15 @@ def update(self, inputs, results):
         self.inputs.extend(inputs)
         self.results.extend(results)
 
+        if (
+            self.adapative_training
+            and len(self.inputs) - self.last_train_length < self.last_train_length / 5
+        ):
+            # Set a training threshold related to `last_train_length` to reduce the training
+            # overhead when there're too many logs
+            return
+        self.last_train_length = len(self.inputs)
+
         # extract feature
         n_cached = len(self.inputs_feature_cache)
         features, normalized_throughputs, task_ids = get_per_store_features_from_measure_pairs(
@@ -176,6 +195,10 @@ def update(self, inputs, results):
             ],
         )
 
+        # Update the model file if it has been set
+        if self.model_file:
+            self.save(self.model_file)
+
     def predict(self, task, states):
         """Predict the scores of states
         Parameters
diff --git a/python/tvm/auto_scheduler/task_scheduler.py b/python/tvm/auto_scheduler/task_scheduler.py
index ab83ff40c461..975306f7be54 100644
--- a/python/tvm/auto_scheduler/task_scheduler.py
+++ b/python/tvm/auto_scheduler/task_scheduler.py
@@ -47,6 +47,7 @@ def make_search_policies(
     verbose,
     load_model_file=None,
     load_log_file=None,
+    adapative_training=False,
 ):
     """Make a list of search policies for a list of search tasks.
     It creates one policy per task.
@@ -70,6 +71,9 @@ def make_search_policies(
     load_log_file: Optional[str]
         Load measurement records from this file. If it is not None, the status of the
         task scheduler, search policies and cost models will be restored according to this file.
+    adapative_training: bool = False
+        Option used for XGBModel, which will reduce the model training frequency when there're too
+        many logs.
 
     Returns
     -------
@@ -82,11 +86,16 @@ def make_search_policies(
     if isinstance(search_policy, str):
         policy_type, model_type = search_policy.split(".")
         if model_type == "xgb":
-            cost_model = XGBModel(num_warmup_sample=len(tasks) * num_measures_per_round)
-            if load_model_file:
+            cost_model = XGBModel(
+                num_warmup_sample=len(tasks) * num_measures_per_round,
+                model_file=load_model_file,
+                adapative_training=adapative_training,
+            )
+            if load_model_file and os.path.isfile(load_model_file):
                 logger.info("TaskScheduler: Load pretrained model...")
                 cost_model.load(load_model_file)
             elif load_log_file:
+                logger.info("TaskScheduler: Reload measured states and train the model...")
                 cost_model.update_from_file(load_log_file)
         elif model_type == "random":
             cost_model = RandomModel()
diff --git a/src/auto_scheduler/feature.cc b/src/auto_scheduler/feature.cc
index 47b9fb60aab4..a5d4958af769 100755
--- a/src/auto_scheduler/feature.cc
+++ b/src/auto_scheduler/feature.cc
@@ -1462,12 +1462,18 @@ void GetPerStoreFeaturesFromMeasurePairs(const Array<MeasureInput>& inputs,
     if (find_res == task_cache.end()) {
       if (inputs[i]->task->compute_dag.defined()) {  // the measure input is complete
         task = inputs[i]->task;
-      } else {  // the measure input is incomplete
-        // rebuild task for incomplete measure pairs read from file
-        Array<te::Tensor> tensors = (*workload_key_to_tensors)(workload_key);
-        task = SearchTask(ComputeDAG(tensors), workload_key, inputs[i]->task->target,
-                          inputs[i]->task->target_host, inputs[i]->task->hardware_params,
-                          inputs[i]->task->layout_rewrite_option);
+      } else {
+        // The measure input is incomplete, rebuild task for incomplete measure pairs read from file
+        try {
+          Array<te::Tensor> tensors = (*workload_key_to_tensors)(workload_key);
+          task = SearchTask(ComputeDAG(tensors), workload_key, inputs[i]->task->target,
+                            inputs[i]->task->target_host, inputs[i]->task->hardware_params,
+                            inputs[i]->task->layout_rewrite_option);
+        } catch (std::exception& e) {
+          // Cannot build ComputeDAG from workload key, the task may have not been registered in
+          // this search round
+          continue;
+        }
       }
       task_id = task_cache.size();
 

From 1f931e35ffa1f55b28debc55002b89e0888aac88 Mon Sep 17 00:00:00 2001
From: mbaret <55580676+mbaret@users.noreply.github.com>
Date: Wed, 6 Jan 2021 18:03:13 +0000
Subject: [PATCH 028/357] [RELAY] Fix reshape header file (#7218)

The header file definition of InferNewShape was
incorrect, this patch fixes it.

Change-Id: Id24b8eccb52323692fe88bdda46cc49cba54588c
---
 src/relay/op/tensor/transform.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/relay/op/tensor/transform.h b/src/relay/op/tensor/transform.h
index a3770ff9cd8d..95a83a905908 100644
--- a/src/relay/op/tensor/transform.h
+++ b/src/relay/op/tensor/transform.h
@@ -193,9 +193,11 @@ static inline Array<Array<Layout>> ConcatenateLayout(const Attrs& attrs,
  *
  * \param data_shape The input data shape.
  * \param attrs The attributes.
+ * \param reverse Whether to reverse the indices.
  * \return Output shape.
  */
-Array<IndexExpr> InferNewShape(const Array<IndexExpr>& data_shape, const Attrs& attrs);
+Array<IndexExpr> InferNewShape(const Array<IndexExpr>& data_shape, const Attrs& attrs,
+                               bool reverse);
 
 }  // namespace relay
 }  // namespace tvm

From 2dd5f8d365daa79fb8313dc1ae159aba2134331f Mon Sep 17 00:00:00 2001
From: Andrew Reusch <areusch@octoml.ai>
Date: Wed, 6 Jan 2021 14:53:53 -0800
Subject: [PATCH 029/357] =?UTF-8?q?[=C2=B5TVM]=20Add=20documentation=20(#7?=
 =?UTF-8?q?164)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../reference-vm/zephyr/pyproject.toml        |   4 +-
 docs/dev/index.rst                            |   8 +
 docs/dev/microtvm_design.rst                  | 348 ++++++++++++++++++
 docs/index.rst                                |   1 +
 docs/microtvm/index.rst                       |  72 ++++
 tutorials/micro/README.txt                    |   4 +-
 tutorials/micro/micro_reference_vm.py         |   8 +-
 tutorials/micro/micro_tflite.py               | 175 +++++----
 8 files changed, 551 insertions(+), 69 deletions(-)
 create mode 100644 docs/dev/microtvm_design.rst
 create mode 100644 docs/microtvm/index.rst

diff --git a/apps/microtvm/reference-vm/zephyr/pyproject.toml b/apps/microtvm/reference-vm/zephyr/pyproject.toml
index ed8182584e36..f21c272731c4 100644
--- a/apps/microtvm/reference-vm/zephyr/pyproject.toml
+++ b/apps/microtvm/reference-vm/zephyr/pyproject.toml
@@ -117,13 +117,13 @@ importer-keras = ["tensorflow", "tensorflow-estimator"]
 importer-onnx = ["onnx", "onnxruntime", "torch", "torchvision", "future"]
 importer-pytorch = ["torch", "torchvision", "future"]
 importer-tensorflow = ["tensorflow", "tensorflow-estimator"]
-importer-tflite = ["tlfite", "tensorflow", "tensorflow-estimator"]
+importer-tflite = ["tflite", "tensorflow", "tensorflow-estimator"]
 
 [tool.poetry.dev-dependencies]
 autodocsumm = "^0.1"
 black = "^19.10b0"
 sphinx = "^3.0"
-sphinx-gallery = "^0.4"
+sphinx-gallery = "^0.8"
 sphinx-rtd-theme = "^0.4"
 matplotlib = "^3.2"
 Image = "^1.5"
diff --git a/docs/dev/index.rst b/docs/dev/index.rst
index 71ae5d4ec68d..e9ec767fd018 100644
--- a/docs/dev/index.rst
+++ b/docs/dev/index.rst
@@ -396,3 +396,11 @@ Security
    :maxdepth: 1
 
    security
+
+
+microTVM
+--------
+.. toctree::
+   :maxdepth: 1
+
+   microtvm_design
diff --git a/docs/dev/microtvm_design.rst b/docs/dev/microtvm_design.rst
new file mode 100644
index 000000000000..0251144511a0
--- /dev/null
+++ b/docs/dev/microtvm_design.rst
@@ -0,0 +1,348 @@
+..  Licensed to the Apache Software Foundation (ASF) under one
+    or more contributor license agreements.  See the NOTICE file
+    distributed with this work for additional information
+    regarding copyright ownership.  The ASF licenses this file
+    to you under the Apache License, Version 2.0 (the
+    "License"); you may not use this file except in compliance
+    with the License.  You may obtain a copy of the License at
+..    http://www.apache.org/licenses/LICENSE-2.0
+..  Unless required by applicable law or agreed to in writing,
+    software distributed under the License is distributed on an
+    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+    KIND, either express or implied.  See the License for the
+    specific language governing permissions and limitations
+    under the License.
+
+**************************
+microTVM Design Document
+**************************
+
+.. contents:: Table of Contents
+    :depth: 3
+
+Background
+===========
+
+TVM is a model deployment framework that has demonstrated good performance across a wide range of
+models on traditional operating systems. Given TVM's layered approach to compilation, it is a
+natural extension to target bare metal devices. While most of the compilation flow does not need to
+change for a proof-of-concept implementation on such devices, the runtime cannot depend on:
+
+* **Virtual Memory**, and by extension any system-provided ``malloc``. Additionally, bare metal
+  devices typically have very limited memory (measured in KB). Because of this, libraries designed
+  for such platforms typically need to be more judicious in using memory, and need to release
+  memory when it is not in use.
+* Traditional OS abstractions, such as **files**, **libraries**, and **kernel functions**. Some
+  projects implement support for these, but they are by no means standard.
+* Support for programming languages other than **C**.
+
+Such changes require a different appraoch from the TVM C++ runtime typically used on traditional
+Operating Systems.
+
+Typical Use
+===========
+
+This section discusses our vision of the "typical" microTVM use case. Each component used to achieve
+this typical use case is intended to be designed for flexibility, but this unifying vision serves to
+motivate the inclusion of each part of the design.
+
+.. figure:: https://raw.githubusercontent.com/tvmai/web-data/main/images/dev/microtvm_workflow.svg
+   :align: center
+   :width: 85%
+
+The parts of this process are described below:
+
+#. **Model Import**. The user imports an existing model or describes a new model to TVM, producing a
+   *Relay module*.
+
+#. **Model Transformations**. The user can apply transformations, such as quantization, to the
+   model. After each transformation, the user should still have a Relay module.
+
+#. **Compilation** (Scheduling and Code Generation). TVM implements each operator into Tensor IR by
+   assigning a schedule and schedule configuration to each Relay operator. Then, code (C source or
+   compiled object) is generated for each operator.
+
+#. **Integration**. The generated code is integrated along with the TVM C Runtime library into a
+   user-supplied binary project. In some cases (such as when the project is standardized across
+   multiple SoC/development boards), this process is handled automatically.
+
+#. **Deployment**. The project is built and the residual firmware binary is flashed onto the device.
+   Model inference is driven either by TVM using an on-device RPC server, or on the device using the
+   on-device Graph Runtime.
+
+Design Goals
+============
+
+microTVM aims to achieve these design goals:
+
+1. **Portable Code**. microTVM can translate any Relay model into C code that can compile with only
+   a C standard library.
+2. **Minimal Overhead**. microTVM generates target-specific, highly optimized code. As much overhead
+   from the runtime should be removed.
+3. **Accessible Code**. microTVM considers C source code as a first-class output mechanism so that
+   it is easier for a firmware engineer to understand and tweak.
+
+Overview
+========
+
+microTVM requires changes at all levels of the TVM compiler stack. The following sub-sections enumerate
+these changes at a high level, and follow-on sections discuss the specifics in more detail.
+
+Modeling Target Platforms
+-------------------------
+
+TVM's search-based optimization approach allows it to largely avoid system-level modeling of targets
+in favor of experimental results. However, some modelling is necessary in order to ensure TVM is
+comparing apples-to-apples search results, and to avoid wasting time during the search by attempting
+to compile invalid code for a target.
+
+microTVM models these parts of the target:
+
+* The CPU used, through the ``-mcpu`` and ``-march`` target flags.
+* The presence or absence of accelerators, through the device components of the target (Currently
+  only the absence of accelerators can be expressed, but this mechanism should extend well).
+
+microTVM aims to model these parts of the target in the future:
+
+* Memory, modeled as a set of disjoint memory spaces, each with a label and size and prefetch/flush
+  behavior. Some memory may be shared with accelerators.
+* Target runtime configuration (i.e. clock tree configuration, clock speed, etc). This is intended
+  only to contribute to the AutoTVM schedule key and not for any other use.
+
+At this time, TVM does not intend to model:
+
+* Size, type, or relationship of caches, with the exception of prefetching or cache flushing.
+
+
+TVM Targets for microTVM
+-------------------------
+
+A central data structure in the compilation process is the ``tvm::target::Target`` class. TVM uses
+Target to decide which TIR schedules to enable and how to configure the code generator. The Target
+class should also uniquely identify the generated code for a particular operator, as autotuning
+logs use it to rank measured performance (but see Future Work).
+
+Targets are currently represented as strings structured similarly to command-line arguments. An
+example target is shown below:
+
+    ``c -keys=arm_cpu -mcpu=cortex-m7 -link-params -model=stm32f746xx -runtime=c -system-lib=1``
+
+The relevant parts to microTVM are:
+
+ * Code generator (``llvm`` or ``c``)
+ * ``-mcpu=cortex-m7``: used by TOPI to enable Cortex-M schedules, and, when the C source code
+   generator is selected, included in the output as a comment to help identify the code and
+   configure the downstream C compiler.
+ * ``-link-params``: include parameters as global constants to load from flash.
+ * ``-runtime=c``: build glue code to allow operators to work with the C runtime
+ * ``-system-lib=1``: emit a system library (i.e. which can be loaded by calling the PackedFunc
+   ``runtime.SystemLib``.
+
+Writing Schedules for microTVM
+------------------------------
+
+For operations scheduled on the CPU, microTVM initially plans to make use of specialized
+instructions and extern (i.e. hand-optimized) functions to achieve good performance. In TVM, this
+appraoch is generally accomplished through tensorization, in which TVM breaks a computation into
+small pieces, and a TIR extern function accelerates each small piece.
+
+TVM currently accomodates both approaches using ``tir.call_extern``. First, a pragma is attached to
+the schedule defining the extern function in portable C.
+
+    ``sched[output].pragma(n, "import_c", "void call_asm(int32_t* a, int32_t* b) { /* ... */ }")``
+
+Next, ``tensorize`` is used to split the computation.
+
+    ``sched[output].tensorize(owi, gemm)``
+
+There are a couple of caveats to this approach, all which could be resolved by linking generated
+code against external libraries:
+
+* Inline assembly is compiler-specific. While Clang and GCC have standardized on one syntax, this
+  may not be portable to other compilers. SDKs solve this by conditionally including a header file
+  depending on the compiler being used. However, taking this approach means that the generated code
+  needs additional compiler flags (i.e. ``-Isystempath/to/header``).
+* It may be helpful to reference helper functions from the generated code (e.g. to inline common
+  sequences of hand-optimized assembly).
+* Finally, the extern function invoked may be wholly written in an external library. If those
+  functions can be wholly inlined, this caveat is the same as the previous. If not, then additional
+  C code needs to be compiled and linked against the operator.
+
+At present, microTVM presumes that all eligible schedules can be compiled. This means that the user-
+supplied project (see next section) must include all libraries that are used by the generated code.
+When not using autotuning, TVM randomly chooses a fallback schedule, so all libraries would need to
+be supported. When using autotuning, TVM selects the best-performing schedule, so only that library
+is needed. There isn't currently a way to force TVM to pick a particular schedule outside of
+autotuning logs, but that would be a good addition.
+
+Finally, when using the ``llvm`` backend, the process is similar except that LLVM bitcode is included
+in the generated code (with an ``import_llvm`` pragma). LLVM bitcode provides a portable way to call
+inline assembly. However, it may be more complex to call external C functions, and helper functions
+are of course not easy to use from LLVM bitcode.
+
+Executing Models
+----------------
+
+The TVM compiler traditionally outputs 3 pieces:
+1. Model operator implementations, as discussed above.
+2. A model execution graph, encoded as JSON
+3. Simplified parameters
+
+To correctly execute the model, a Graph Runtime needs to reconstruct the graph in memory, load the
+parameters, and then invoke the operator implementations in the correct order.
+
+microTVM supports two ways to do this:
+
+1. **Host-Driven**. The Graph Runtime can run on the host and carry out execution by issuing
+   commands to the device using an RPC link with a UART-like transport.
+2. **Standalone**. A C Graph Runtime is available to be compiled on-device, but it is not
+   particularly memory efficient. This way enables standalone execution without any attached host.
+
+Host-Driven is designed for experimenting with models on-device and, like AutoTVM, uses the RPC server to
+drive computation on-device. Standalone is intended for deployment.
+
+Host-Driven Execution
+^^^^^^^^^^^^^^^^^^^^^
+
+In Host-Driven execution, the firmware binary is the following:
+
+1. Generated operator implementations from TVM
+2. The TVM C runtime
+3. SoC-specific initialization.
+4. The TVM RPC server.
+5. (optional) Simplified Parameters
+
+This firmware image is flashed onto the device and a GraphRuntime instance is created on the host.
+The GraphRuntime drives execution by sending RPC commands over a UART:
+
+.. figure:: https://raw.githubusercontent.com/tvmai/web-data/main/images/dev/microtvm_host_driven.svg
+   :align: center
+   :width: 85%
+
+Standalone Execution
+^^^^^^^^^^^^^^^^^^^^
+
+In Standalone execution, the GraphRuntime is instantiated on device:
+
+.. figure:: https://raw.githubusercontent.com/tvmai/web-data/main/images/dev/microtvm_standalone.svg
+   :align: center
+   :width: 85%
+
+microTVM Firmware
+------------------
+
+We can now discuss how microTVM firmware should behave. An important task common to both model
+execution strategies is configuring the SoC to match the way it performs in production. microTVM
+considers this task project- and SoC-dependent. Whether for AutoTVM, host-driven model inference, or
+in standalone deployment, the user is expected to supply a project whose main() does the following:
+
+1. Configure the SoC to match deployment performance.
+2. Initialize the TVM C Runtime.
+
+When configuring for host-driven inference or AutoTVM, the remaining tasks are well-defined:
+
+3. Initialize a transport (i.e. a UART) for use with the TVM RPC server.
+4. Launch the TVM RPC Server.
+
+When configuring for standalone deployment, the firmware needs to:
+
+1. Instantiate the system library by calling the ``runtime.SystemLib`` PackedFunc.
+2. Instantiate a GraphRuntime passing the system library module.
+3. Configure parameters and inputs as needed.
+4. Run the model.
+
+Parts of a microTVM Binary
+--------------------------
+
+To summarize, a microTVM firwmare binary image must contain these parts:
+
+1. Operator implementations, produced by TVM.
+2. The TVM C runtime library, supplied by TVM as a static library.
+3. SoC Initialization, supplied by the user.
+
+For Host-driven model execution, firmware also needs:
+
+4. The TVM RPC Server library.
+
+For Standalone model execution, firmware also needs:
+
+4. The TVM C GraphRuntime library, supplied by TVM as a static library.
+5. The remaining compiler outputs (Simplified Parameters and Graph JSON).
+
+The Automated Build Flow
+-------------------------
+
+Once code generation is complete, ``tvm.relay.build`` returns a ``tvm.runtime.Module`` and the
+user can save the generated C source or binary object to a ``.c`` or ``.o`` file. From this point, TVM
+can theoretically step back and the user can compile and run the code separately.
+
+However, for AutoTVM, TVM needs some automated flow to handle the following tasks:
+
+1. Integrate operator implementations, the TVM C Runtime library, and the TVM RPC Server library into the
+   firmware project containing user-supplied SoC Initialization.
+2. Build the resulting project.
+3. Program the built firmware onto a (specific) attached device.
+4. Identify the serial port or other transport to be used by TVM to drive remote execution.
+
+At present, TVM expects the user to supply an implementation of the ``tvm.micro.Compiler``,
+``tvm.micro.Flasher``, and ``tvm.micro.Transport`` interfaces. TVM then:
+
+1. Builds each piece separately as a library
+2. Builds the libraries into a binary firmware image.
+3. Programs the firmware image onto an attached device.
+4. Opens a serial port to serve as the RPC server transport.
+
+This design was chosen to reduce build times for microTVM (the common libraries need to be build
+only once per candidate operator implemmentation). In practice, these projects are extremely small
+and compile relatively quickly. Compared with the added complexity of this tighter build integration
+with TVM, the performance gains are likely not worth it. A future design will consolidate the build
+tasks into a single step and narrow the interface to provide a better integration.
+
+Measuring operator performance
+------------------------------
+
+The TVM C runtime depends on user-supplied functions to measure time on-device. Users should implement
+``TVMPlatformTimerStart`` and ``TVMPlatformTimerStop``. These functions should measure wall clock time, so there
+are some pitfalls in implementing this function:
+
+1. If the CPU could halt or sleep during a computation (i.e. if it is being done on an accelerator),
+   a cycle counter should likely not be used as these tend to stop counting while the CPU is asleep.
+2. The granularity of these functions can be relaxed as needed to extend the range of the timer
+   device. However, if granularity is too coarse, a sub-optimal schedule may be used.
+3. An error should be raised if the timer overflows.
+4. The timer should not interrupt computation unless absolutely necessary. Doing so may affect the
+   accuracy of the results.
+5. Calibrating the output against a wall clock is ideal, but it will likely be too cumbersome. A
+   future PR could enable some characterization of the platform timer by e.g. measuring the internal
+   oscillator against a reference such as an external crystal.
+
+Future Work
+===========
+
+Ahead-of-Time Runtime
+----------------------
+
+A limitation of the Graph Runtime is the amount of memory overhead required in parsing the JSON.
+The current implementation contributes significantly to the dynamic memory usage of microTVM,
+limiting its utility. An ahead-of-time runtime can avoid the need for any Graph JSON parsing and
+improve inference speed by generating C code to call the generated operator implementations directly
+rather than relying on a data-driven approach with the Graph Runtime.
+
+Memory Planning
+----------------
+
+The current memory planner attempts to limit the number of ``TVMBackendDeviceAlloc()`` calls
+issued for intermediate tensors only. Because scratchpads can vary widely, and because the planner
+coalesces memory allocations within 16x of each other, this strategy typically results in high
+peak memory usage.
+
+Heterogeneous Execution
+-----------------------
+
+Newer Cortex-M SoC can contain multiple CPUs and onboard ML accelerators.
+
+
+Autotuning Target
+-----------------
+
+As discussed previously,
diff --git a/docs/index.rst b/docs/index.rst
index f407fa2d4f29..3131be5381fc 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -44,6 +44,7 @@ For Developers
    contribute/index
    deploy/index
    dev/how_to
+   microtvm/index
 
 .. toctree::
    :maxdepth: 1
diff --git a/docs/microtvm/index.rst b/docs/microtvm/index.rst
new file mode 100644
index 000000000000..68583fed31f4
--- /dev/null
+++ b/docs/microtvm/index.rst
@@ -0,0 +1,72 @@
+..  Licensed to the Apache Software Foundation (ASF) under one
+    or more contributor license agreements.  See the NOTICE file
+    distributed with this work for additional information
+    regarding copyright ownership.  The ASF licenses this file
+    to you under the Apache License, Version 2.0 (the
+    "License"); you may not use this file except in compliance
+    with the License.  You may obtain a copy of the License at
+
+..    http://www.apache.org/licenses/LICENSE-2.0
+
+..  Unless required by applicable law or agreed to in writing,
+    software distributed under the License is distributed on an
+    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+    KIND, either express or implied.  See the License for the
+    specific language governing permissions and limitations
+    under the License.
+
+.. _microtvm-index:
+
+microTVM: TVM on bare-metal
+===========================
+
+microTVM runs TVM models on bare-metal (i.e. IoT) devices. microTVM depends only on the C standard
+library, and doesn't require an operating system to execute. microTVM is currently under heavy
+development.
+
+.. figure:: https://raw.githubusercontent.com/tvmai/web-data/main/images/dev/microtvm_workflow.svg
+   :align: center
+   :width: 85%
+
+microTVM is:
+
+* an extension to TVM's compiler to allow it to target microcontrollers
+* a way to run the TVM RPC server on-device, to allow autotuning
+* a minimal C runtime that supports standalone model inference on bare metal devices.
+
+Supported Hardware
+~~~~~~~~~~~~~~~~~~
+
+microTVM currently tests against Cortex-M microcontrollers with the Zephyr RTOS; however, it is
+flexible and portable to other processors such as RISC-V and does not require Zephyr. The current
+demos run against QEMU and the following hardware:
+
+* `STM Nucleo-F746ZG <https://www.st.com/en/evaluation-tools/nucleo-f746zg.html>`_
+* `nRF 5340 Preview Development Kit <https://www.nordicsemi.com/Software-and-tools/Development-Kits/nRF5340-PDK>`_
+
+
+Getting Started with microTVM
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Before working with microTVM, we recommend you have a supported development board. Then, follow these
+tutorials to get started with microTVM:
+
+1. :ref:`Start the microTVM Reference VM <tutorial-micro-reference-vm>`. The microTVM tutorials
+   depend on Zephyr and on a compiler toolchain for your hardware. The reference VM is a convenient
+   way to install those dependencies.
+2. Try the :doc:`microTVM with TFLite Tutorial </tutorials/micro/micro_tflite>`.
+3. Try running a more complex `CIFAR10-CNN model <https://github.com/areusch/microtvm-blogpost-eval>`_.
+
+
+How microTVM Works
+~~~~~~~~~~~~~~~~~~
+
+
+You can read more about the design of these pieces at the :doc:`microTVM Design Document </dev/microtvm_design>`.
+
+
+Help and Discussion
+~~~~~~~~~~~~~~~~~~~
+
+The `TVM Discuss Forum <https://discuss.tvm.ai>`_ is a great place to collaborate on microTVM tasks,
+and maintains a searchable history of past problems.
diff --git a/tutorials/micro/README.txt b/tutorials/micro/README.txt
index 0654353e3426..70a5e580ecd1 100644
--- a/tutorials/micro/README.txt
+++ b/tutorials/micro/README.txt
@@ -1,4 +1,4 @@
 .. _tutorial-micro:
 
-Micro TVM 
----------
+microTVM
+--------
diff --git a/tutorials/micro/micro_reference_vm.py b/tutorials/micro/micro_reference_vm.py
index 4b449a0e7e14..bcef6a0d2c64 100644
--- a/tutorials/micro/micro_reference_vm.py
+++ b/tutorials/micro/micro_reference_vm.py
@@ -15,6 +15,8 @@
 # specific language governing permissions and limitations
 # under the License.
 """
+.. _tutorial-micro-reference-vm:
+
 ===================================
 microTVM Reference Virtual Machines
 ===================================
@@ -90,6 +92,8 @@
 
 .. _microTVM base box: https://app.vagrantup.com/tlcpack/boxes/microtvm
 
+Connect Hardware to the VM
+--------------------------
 
 Next, you need to configure USB passthrough to attach your physical development board to the virtual
 machine (rather than directly to your laptop's host OS).
@@ -102,8 +106,8 @@
  * `Parallels <https://kb.parallels.com/122993>`__
  * `VMWare Workstation <https://docs.vmware.com/en/VMware-Workstation-Pro/15.0/com.vmware.ws.using.doc/GUID-E003456F-EB94-4B53-9082-293D9617CB5A.html>`__
 
-Future use
-----------
+Rebuilding TVM inside the Reference VM
+--------------------------------------
 
 After the first boot, you'll need to ensure you keep the build, in ``$TVM_HOME/build-microtvm``,
 up-to-date when you modify the C++ runtime or checkout a different revision. You can either
diff --git a/tutorials/micro/micro_tflite.py b/tutorials/micro/micro_tflite.py
index 7ec5506aa9b5..feabcf71ae2c 100644
--- a/tutorials/micro/micro_tflite.py
+++ b/tutorials/micro/micro_tflite.py
@@ -15,83 +15,110 @@
 # specific language governing permissions and limitations
 # under the License.
 """
-Micro TVM with TFLite Models
-============================
+microTVM with TFLite Models
+===========================
 **Author**: `Tom Gall <https://github.com/tom-gall>`_
 
-This tutorial is an introduction to working with MicroTVM and a TFLite
+This tutorial is an introduction to working with microTVM and a TFLite
 model with Relay.
 """
 
-# %%
+######################################################################
+# .. note::
+#     If you want to run this tutorial on the microTVM Reference VM, download the Jupyter
+#     notebook using the link at the bottom of this page and save it into the TVM directory. Then:
+#
+#     #. Login to the reference VM with a modified ``vagrant ssh`` command:
+#
+#         ``$ vagrant ssh -- -L8888:localhost:8888``
+#
+#     #. Install jupyter:  ``pip install jupyterlab``
+#     #. ``cd`` to the TVM directory.
+#     #. Install tflite: poetry install -E importer-tflite
+#     #. Launch Jupyter Notebook: ``jupyter notebook``
+#     #. Copy the localhost URL displayed, and paste it into your browser.
+#     #. Navigate to saved Jupyter Notebook (``.ipynb`` file).
+#
+#
 # Setup
 # -----
 #
-# To get started, TFLite package needs to be installed as prerequisite.
+# Install TFLite
+# ^^^^^^^^^^^^^^
+#
+# To get started, TFLite package needs to be installed as prerequisite. You can do this in two ways:
 #
-# install tflite
+# 1. Install tflite with ``pip``
 #
-# .. code-block:: bash
+#     .. code-block:: bash
 #
-#   pip install tflite=2.1.0 --user
+#       pip install tflite=2.1.0 --user
 #
-# or you could generate TFLite package yourself. The steps are the following:
+# 2. Generate the TFLite package yourself. The steps are the following:
 #
-#   Get the flatc compiler.
-#   Please refer to https://github.com/google/flatbuffers for details
-#   and make sure it is properly installed.
+#     Get the flatc compiler.
+#     Please refer to https://github.com/google/flatbuffers for details
+#     and make sure it is properly installed.
 #
-# .. code-block:: bash
+#     .. code-block:: bash
 #
-#   flatc --version
+#       flatc --version
 #
-# Get the TFLite schema.
+#     Get the TFLite schema.
 #
-# .. code-block:: bash
+#     .. code-block:: bash
 #
-#   wget https://raw.githubusercontent.com/tensorflow/tensorflow/r1.13/tensorflow/lite/schema/schema.fbs
+#       wget https://raw.githubusercontent.com/tensorflow/tensorflow/r1.13/tensorflow/lite/schema/schema.fbs
 #
-# Generate TFLite package.
+#     Generate TFLite package.
 #
-# .. code-block:: bash
+#     .. code-block:: bash
 #
-#   flatc --python schema.fbs
+#       flatc --python schema.fbs
 #
-# Add the current folder (which contains generated tflite module) to PYTHONPATH.
+#     Add the current folder (which contains generated tflite module) to PYTHONPATH.
 #
-# .. code-block:: bash
+#     .. code-block:: bash
 #
-#   export PYTHONPATH=${PYTHONPATH:+$PYTHONPATH:}$(pwd)
+#       export PYTHONPATH=${PYTHONPATH:+$PYTHONPATH:}$(pwd)
 #
 # To validate that the TFLite package was installed successfully, ``python -c "import tflite"``
 #
-# CMSIS needs to be downloaded and the CMSIS_ST_PATH environment variable setup
-# This tutorial only supports the STM32F7xx series of boards.
-# Download from : https://www.st.com/en/embedded-software/stm32cubef7.html
-# After you've expanded the zip file
+# Install Zephyr (physical hardware only)
+# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 #
-# .. code-block:: bash
+# When running this tutorial with a host simulation (the default), you can use the host ``gcc`` to
+# build a firmware image that simulates the device. When compiling to run on physical hardware, you
+# need to install a *toolchain* plus some target-specific dependencies. microTVM allows you to
+# supply any compiler and runtime that can launch the TVM RPC server, but to get started, this
+# tutorial relies on the Zephyr RTOS to provide these pieces.
 #
-#   export CMSIS_ST_PATH=/path/to/STM32Cube_FW_F7_V1.16.0/Drivers/CMSIS
-
-# %%
-# Recreating your own Pre-Trained TFLite model
-# --------------------------------------------
+# You can install Zephyr by following the
+# `Installation Instructions <https://docs.zephyrproject.org/latest/getting_started/index.html>`_.
+#
+# Aside: Recreating your own Pre-Trained TFLite model
+#  The tutorial downloads a pretrained TFLite model. When working with microcontrollers
+#  you need to be mindful these are highly resource constrained devices as such standard
+#  models like MobileNet may not fit into their modest memory.
 #
-# The tutorial downloads a pretrained TFLite model. When working with microcontrollers
-# you need to be mindful these are highly resource constrained devices as such standard
-# models like MobileNet may not fit into their modest memory.
+#  For this tutorial, we'll make use of one of the TF Micro example models.
 #
-# For this tutorial, we'll make use of one of the TF Micro example models.
+#  If you wish to replicate the training steps see:
+#  https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/micro/examples/hello_world/train
 #
-# If you wish to replicate the training steps see:
-# https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/micro/examples/hello_world/train
+#    .. note::
 #
-#   .. note::
+#      If you accidentally download the example pretrained model from:
 #
-#     If you accidentally download the example pretrained model from:
-#     wget https://storage.googleapis.com/download.tensorflow.org/models/tflite/micro/hello_world_2020_04_13.zip
-#     this will fail due to an unimplemented opcode (114)
+#      ``wget https://storage.googleapis.com/download.tensorflow.org/models/tflite/micro/hello_world_2020_04_13.zip``
+#
+#      this will fail due to an unimplemented opcode (114)
+#
+# Load and prepare the Pre-Trained Model
+# --------------------------------------
+#
+# Load the pretrained TFLite model from a file in your current
+# directory into a buffer
 
 import os
 import numpy as np
@@ -101,13 +128,6 @@
 from tvm.contrib import graph_runtime, utils
 from tvm import relay
 
-# %%
-# Load and prepare the Pre-Trained Model
-# --------------------------------------
-#
-# Load the pretrained TFLite model from a file in your current
-# directory into a buffer
-
 model_url = "https://people.linaro.org/~tom.gall/sine_model.tflite"
 model_file = "sine_model.tflite"
 model_path = download_testdata(model_url, model_file, module="data")
@@ -137,8 +157,8 @@
 # is contained in the model.
 #
 # If you are unsure what that might be, this can be discovered by using
-# the visualize.py script within the Tensorflow project.
-# See : How do I inspect a .tflite file? `<https://www.tensorflow.org/lite/guide/faq>`_
+# the ``visualize.py`` script within the Tensorflow project.
+# See `How do I inspect a .tflite file? <https://www.tensorflow.org/lite/guide/faq>`_
 
 input_tensor = "dense_4_input"
 input_shape = (1,)
@@ -149,13 +169,26 @@
 )
 
 ######################################################################
+# Defining the target
+# -------------------
+#
 # Now we create a build config for relay. turning off two options
 # and then calling relay.build which will result in a C source
-# file.
+# file. When running on a simulated target, choose "host" below:
+TARGET = tvm.target.target.micro("host")
+
+# %%
+# Compiling for physical hardware
+#  When running on physical hardware, choose a target that describes
+#  the hardware. The STM32F746 Nucleo target is chosen in this commented
+#  code:
 #
-# .. code-block:: python
+#  .. code-block:: python
 #
-TARGET = tvm.target.target.micro("host")
+#     TARGET = tvm.target.target.micro("stm32f746xx")
+
+######################################################################
+# Now, compile the model for the target:
 
 with tvm.transform.PassContext(
     opt_level=3, config={"tir.disable_vectorize": True}, disabled_pass=["FuseOps"]
@@ -164,16 +197,34 @@
 
 
 # %%
-# Running on simulated device
-# ----------------------------------------------
+# Compiling for a simulated device
+# --------------------------------
 #
 # First, compile a static microTVM runtime for the targeted device. In this case, the host simulated
 # device is used.
-workspace = tvm.micro.Workspace()
-
 compiler = tvm.micro.DefaultCompiler(target=TARGET)
 opts = tvm.micro.default_options(os.path.join(tvm.micro.CRT_ROOT_DIR, "host"))
 
+# %%
+# Compiling for physical hardware
+#  For physical hardware, comment out the previous section and use this compiler definition instead.
+#
+#  .. code-block:: python
+#
+#     import subprocess
+#     from tvm.micro.contrib import zephyr
+#
+#     repo_root = subprocess.check_output(["git", "rev-parse", "--show-toplevel"], encoding='utf-8').strip()
+#     project_dir = f"{repo_root}/tests/micro/qemu/zephyr-runtime"
+#     compiler = zephyr.ZephyrCompiler(
+#            project_dir=project_dir,
+#            board="nucleo_f746zg" if "stm32f746" in str(TARGET) else "qemu_x86",
+#            zephyr_toolchain_variant="zephyr",
+#      )
+#
+#     opts = tvm.micro.default_options(os.path.join(tvm.micro.CRT_ROOT_DIR, "host"))
+
+workspace = tvm.micro.Workspace()
 micro_binary = tvm.micro.build_static_runtime(
     # the x86 compiler *expects* you to give the exact same dictionary for both
     # lib_opts and bin_opts. so the library compiler is mutating lib_opts and
@@ -182,7 +233,7 @@
     workspace,
     compiler,
     c_mod,
-    lib_opts=opts["bin_opts"],
+    lib_opts=opts["lib_opts"],
     bin_opts=opts["bin_opts"],
     # Use the microTVM memory manager. If, in your main.cc, you change TVMPlatformMemoryAllocate and
     # TVMPlatformMemoryFree to use e.g. malloc() and free(), you can omit this extra library.
@@ -195,9 +246,7 @@
 # computation. The `with session` line would typically flash an attached
 # microcontroller, but in this tutorial, it simply launches a subprocess
 # to stand in for an attached microcontroller.
-#
-# .. code-block:: python
-#
+
 flasher = compiler.flasher()
 with tvm.micro.Session(binary=micro_binary, flasher=flasher) as session:
     graph_mod = tvm.micro.create_local_graph_runtime(

From d777e7c612cf7a9aae4d8433c36f031c6b6f985c Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Wed, 6 Jan 2021 19:59:12 -0500
Subject: [PATCH 030/357] [TIR][REFACTOR] Enforce allocate to use the correct
 var pointer hint. (#7216)

* [TIR][REFACTOR] Enforce allocate to only accept buffer_var with correct PtrType.

This is a refactoring step to cleanup legacy issue of opaque buffer
var without ptr type information. Now all the allocation comes with the right
pointer data type. Places touched:

- TVMScript Parser: add the right info to get the correct pointer type.
- Cross thread all reduce: set the right pointer type.
- Storage rewrite: setup the right pointer type.
- Custom dtype: remap the variables with new pointer type.

x

* Address comments

Co-authored-by: Tristan Konolige <tristan.konolige@gmail.com>

Co-authored-by: Tristan Konolige <tristan.konolige@gmail.com>
---
 include/tvm/tir/op.h                          |   2 +-
 python/tvm/script/parser.py                   |  25 ++-
 python/tvm/script/scope_handler.py            |  13 +-
 python/tvm/tir/buffer.py                      |   5 +-
 src/driver/driver_api.cc                      |   3 +-
 src/target/source/codegen_cuda.cc             |   6 +-
 src/te/operation/cross_thread_reduction.cc    |   6 +-
 src/tir/ir/buffer.cc                          |  14 +-
 src/tir/ir/stmt.cc                            |   9 +-
 src/tir/ir/stmt_functor.cc                    |  14 +-
 src/tir/transforms/lower_custom_datatypes.cc  | 147 ++++++++++++------
 src/tir/transforms/lower_thread_allreduce.cc  |  16 +-
 src/tir/transforms/storage_rewrite.cc         |  34 ++--
 tests/cpp/ir_functor_test.cc                  |  10 +-
 tests/python/unittest/test_tir_constructor.py |   1 +
 15 files changed, 209 insertions(+), 96 deletions(-)

diff --git a/include/tvm/tir/op.h b/include/tvm/tir/op.h
index 61481d931763..4a907fca951d 100644
--- a/include/tvm/tir/op.h
+++ b/include/tvm/tir/op.h
@@ -1241,7 +1241,7 @@ inline void DivAmbiguityError(const TA& a) {
                 "please call div, indexdiv/indexmod, "
                 "floordiv/floormod or truncdiv/truncmod directly "
                 "to avoid ambiguity in the code. "
-                "Checkout these functions in expr_operator.h.");
+                "Checkout these functions in tir/op.h.");
 }
 
 // The following code are not intended to be used in the codebase.
diff --git a/python/tvm/script/parser.py b/python/tvm/script/parser.py
index db976d0ee677..33b0bab0d7e7 100644
--- a/python/tvm/script/parser.py
+++ b/python/tvm/script/parser.py
@@ -230,6 +230,19 @@ def parse_arg_list(self, func, node_call):
         """Match the arguments of a function call in the AST to the required
         arguments of the function. This handles positional arguments,
         positional arguments specified by name, keyword arguments, and varargs.
+
+        Parameters
+        ----------
+        func : Function
+            The function that provides the signature
+
+        node_call: ast.Call
+            The AST call node that calls into the function.
+
+        Returns
+        -------
+        arg_list : list
+            The parsed positional argument.
         """
         assert isinstance(node_call, ast.Call)
         # collect arguments
@@ -435,8 +448,8 @@ def transform_Assign(self, node):
                         node.rhs.span,
                     )
                 # Pattern 4
-                func.enter_scope(node, self.context)
                 arg_list = self.parse_arg_list(func, node.rhs)
+                func.enter_scope(node, self.context, arg_list, node.rhs.func_name.span)
                 func.body = self.parse_body(node)
                 return func.exit_scope(node, self.context, arg_list, node.rhs.func_name.span)
             elif isinstance(func, SpecialStmt):
@@ -532,9 +545,9 @@ def transform_For(self, node):
         self.current_col_offset = node.span.start_column
         self.context.new_scope(nodes=node.body.stmts)
         # for scope handler process the scope
-        func.enter_scope(node, self.context)
-        func.body = self.parse_body(node)
         arg_list = self.parse_arg_list(func, node.rhs)
+        func.enter_scope(node, self.context, arg_list, node.rhs.func_name.span)
+        func.body = self.parse_body(node)
         res = func.exit_scope(node, self.context, arg_list, node.rhs.func_name.span)
         # exit the scope
         self.context.pop_scope()
@@ -571,9 +584,9 @@ def transform_With(self, node):
         self.current_col_offset = node.body.span.start_column
         self.context.new_scope(nodes=node.body.stmts)
         # with scope handler process the scope
-        func.enter_scope(node, self.context)
-        func.body = self.parse_body(node)
         arg_list = self.parse_arg_list(func, node.rhs)
+        func.enter_scope(node, self.context, arg_list, node.rhs.func_name.span)
+        func.body = self.parse_body(node)
         res = func.exit_scope(node, self.context, arg_list, node.rhs.func_name.span)
         # exit the scope
         self.context.pop_scope()
@@ -689,7 +702,7 @@ def f():
         if isinstance(func, Intrin) and func.stmt:
             return func.handle(arg_list, node.call.func_name.span)
         elif isinstance(func, WithScopeHandler) and func.concise_scope and not func.def_symbol:
-            func.enter_scope(node, self.context)
+            func.enter_scope(node, self.context, arg_list, node.call.func_name.span)
             func.body = self.parse_body(node)
             return func.exit_scope(node, self.context, arg_list, node.call.func_name.span)
         elif isinstance(func, SpecialStmt) and not func.def_symbol:
diff --git a/python/tvm/script/scope_handler.py b/python/tvm/script/scope_handler.py
index 7f252e3e381d..21ed7f6e4682 100644
--- a/python/tvm/script/scope_handler.py
+++ b/python/tvm/script/scope_handler.py
@@ -35,7 +35,7 @@ def __init__(self, func):
     def signature(self):
         return "tir." + self.func.__name__, get_param_list(self.func)
 
-    def enter_scope(self, node, context):
+    def enter_scope(self, node, context, arg_list, span):
         pass
 
     def exit_scope(self, node, context, arg_list, span):
@@ -86,7 +86,7 @@ def allocate(extents, dtype, scope, condition=True, span=None):
         super().__init__(allocate, concise_scope=True, def_symbol=True)
         self.buffer_var = None
 
-    def enter_scope(self, node, context):
+    def enter_scope(self, node, context, arg_list, span):
         # define buffer vars in symbol table
         if isinstance(node, ast.With):
             names = WithScopeHandler.get_optional_var_names(node, context)
@@ -98,7 +98,12 @@ def enter_scope(self, node, context):
         else:
             raise Exception("Internal Bug")
 
-        self.buffer_var = tvm.te.var(name, "handle", span=from_synr_span(node.lhs.id.span))
+        def setup_buffer_var(extents, dtype, scope, condition=True, span=None):
+            """Setup buffer var for a given type."""
+            buffer_ptr_type = tvm.ir.PointerType(tvm.ir.PrimType(dtype))
+            self.buffer_var = tvm.tir.Var(name, buffer_ptr_type, span)
+
+        setup_buffer_var(*arg_list, span=from_synr_span(node.lhs.id.span))
         context.update_symbol(name, self.buffer_var)
 
 
@@ -187,7 +192,7 @@ def __init__(self, func):
         super().__init__(func)
         self.loop_vars = None
 
-    def enter_scope(self, node, context):
+    def enter_scope(self, node, context, arg_list, span):
         assert isinstance(node, ast.For)
 
         loop_var_names = list()
diff --git a/python/tvm/tir/buffer.py b/python/tvm/tir/buffer.py
index 2f50aa8e50a1..95966a5050e1 100644
--- a/python/tvm/tir/buffer.py
+++ b/python/tvm/tir/buffer.py
@@ -247,7 +247,10 @@ def decl_buffer(
         shape_dtype = shape[0].dtype if hasattr(shape[0], "dtype") else "int32"
         elem_offset = Var("%s_elem_offset" % name, shape_dtype)
     if data is None:
-        data = Var(name, PointerType(PrimType(dtype)), span)
+        # Bool is represented as uint1 in the IR, but stored as int8
+        storage_type = PrimType(dtype)
+        storage_type = PrimType("int8") if storage_type.dtype == "bool" else storage_type
+        data = Var(name, PointerType(storage_type), span)
     return _ffi_api.Buffer(
         data,
         dtype,
diff --git a/src/driver/driver_api.cc b/src/driver/driver_api.cc
index f88b6215f927..bbbb7e3f9eb5 100644
--- a/src/driver/driver_api.cc
+++ b/src/driver/driver_api.cc
@@ -69,7 +69,8 @@ Target DefaultTargetHost(Target target) {
 
 tir::Buffer BufferWithOffsetAlignment(Array<PrimExpr> shape, DataType dtype, std::string name,
                                       int data_alignment, int offset_factor, bool compact) {
-  auto data = tir::Var(name, PointerType(PrimType(dtype)));
+  DataType storage_dtype = (dtype == DataType::Bool() ? DataType::Int(8) : dtype);
+  auto data = tir::Var(name, PointerType(PrimType(storage_dtype)));
   bool has_any = false;
   if (!compact) {
     for (const auto& it : shape) {
diff --git a/src/target/source/codegen_cuda.cc b/src/target/source/codegen_cuda.cc
index c0fb39f0a4f6..6c73716edc18 100644
--- a/src/target/source/codegen_cuda.cc
+++ b/src/target/source/codegen_cuda.cc
@@ -581,7 +581,11 @@ void CodeGenCUDA::VisitStmt_(const AllocateNode* op) {
   int32_t constant_size = op->constant_allocation_size();
   ICHECK_GT(constant_size, 0) << "Can only handle constant size stack allocation for now";
   const VarNode* buffer = op->buffer_var.as<VarNode>();
-  std::string scope = alloc_storage_scope_.at(buffer);
+  auto it = alloc_storage_scope_.find(buffer);
+  ICHECK(it != alloc_storage_scope_.end())
+      << "Buffer " << op->buffer_var << " is missing an AttrStmt with a \"storage_scope\" key";
+
+  std::string scope = it->second;
   if (scope.find("wmma.") == 0) {
     if (scope == "wmma.matrix_a" || scope == "wmma.matrix_b") {
       ICHECK(op->dtype == DataType::Float(16) || op->dtype == DataType::Int(8) ||
diff --git a/src/te/operation/cross_thread_reduction.cc b/src/te/operation/cross_thread_reduction.cc
index b0fb9b667558..da20dd875ba5 100644
--- a/src/te/operation/cross_thread_reduction.cc
+++ b/src/te/operation/cross_thread_reduction.cc
@@ -145,7 +145,8 @@ Stmt MakeCrossThreadReduction(const ComputeOpNode* self, const Stage& stage,
     Array<PrimExpr> lhs;
     for (size_t i = 0; i < size; ++i) {
       DataType t = reduces[i]->dtype;
-      normal_res_handles.emplace_back("normal_reduce_temp" + std::to_string(i), DataType::Handle());
+      normal_res_handles.emplace_back("normal_reduce_temp" + std::to_string(i),
+                                      PointerType(PrimType(t)));
       lhs.push_back(Load(t, normal_res_handles[i], 0, const_true(t.lanes())));
     }
     Array<PrimExpr> init_value = combiner->identity_element;
@@ -175,7 +176,8 @@ Stmt MakeCrossThreadReduction(const ComputeOpNode* self, const Stage& stage,
   freduce_args.push_back(const_true(1));
   std::vector<Var> res_handles(size);
   for (size_t idx = 0; idx < size; ++idx) {
-    res_handles[idx] = Var("reduce_temp" + std::to_string(idx), DataType::Handle());
+    DataType dtype = reduces[idx]->dtype;
+    res_handles[idx] = Var("reduce_temp" + std::to_string(idx), PointerType(PrimType(dtype)));
     freduce_args.push_back(res_handles[idx]);
   }
 
diff --git a/src/tir/ir/buffer.cc b/src/tir/ir/buffer.cc
index 23a2b3a3b3c7..1667eb7d1fbd 100644
--- a/src/tir/ir/buffer.cc
+++ b/src/tir/ir/buffer.cc
@@ -46,8 +46,9 @@ Array<PrimExpr> SimplifyArray(arith::Analyzer* ana, Array<PrimExpr> array) {
 }
 
 Buffer decl_buffer(Array<PrimExpr> shape, DataType dtype, String name, Span span) {
-  return Buffer(Var(name, PointerType(PrimType(dtype)), span), dtype, shape, Array<PrimExpr>(),
-                PrimExpr(), name, "", 0, 0, kDefault, span);
+  DataType storage_dtype = (dtype == DataType::Bool() ? DataType::Int(8) : dtype);
+  return Buffer(Var(name, PointerType(PrimType(storage_dtype)), span), dtype, shape,
+                Array<PrimExpr>(), PrimExpr(), name, "", 0, 0, kDefault, span);
 }
 
 // Split the given expression w.r.t the add operator
@@ -384,9 +385,14 @@ PrimExpr Buffer::access_ptr(int access_mask, DataType ptr_type, int content_lane
 Buffer::Buffer(Var data, DataType dtype, Array<PrimExpr> shape, Array<PrimExpr> strides,
                PrimExpr elem_offset, String name, String scope, int data_alignment,
                int offset_factor, BufferType buffer_type, Span span) {
-  ICHECK(IsPointerType(data->type_annotation, dtype))
+  DataType storage_dtype = dtype;
+  // specially handle bool
+  if (storage_dtype == DataType::Bool()) {
+    storage_dtype = DataType::Int(8);
+  }
+  ICHECK(IsPointerType(data->type_annotation, storage_dtype))
       << "Buffer data field expect to have the right pointer type annotation"
-      << " annotation=" << data->type_annotation << ", dtype=" << dtype;
+      << " annotation=" << data->type_annotation << ", storage_dtype=" << storage_dtype;
 
   auto n = make_object<BufferNode>();
   n->data = std::move(data);
diff --git a/src/tir/ir/stmt.cc b/src/tir/ir/stmt.cc
index 86960d9bd999..fd03046376f8 100644
--- a/src/tir/ir/stmt.cc
+++ b/src/tir/ir/stmt.cc
@@ -274,9 +274,12 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable)
 // Allocate
 Allocate::Allocate(Var buffer_var, DataType dtype, Array<PrimExpr> extents, PrimExpr condition,
                    Stmt body, Span span) {
-  // TODO(tvm-team): Add invariant check to make sure
-  // IsPointerPType(buffer_var->type_annotation, dtype)
-  // once we fix the allocate tvm script printing.
+  CHECK(IsPointerType(buffer_var->type_annotation, dtype))
+      << "The allocated data type (" << dtype
+      << ") does not match the type annotation of the buffer " << buffer_var << " ("
+      << buffer_var->type_annotation
+      << "). The data type should be an element of the pointer type.";
+
   for (size_t i = 0; i < extents.size(); ++i) {
     ICHECK(extents[i].defined());
     ICHECK(extents[i].dtype().is_scalar());
diff --git a/src/tir/ir/stmt_functor.cc b/src/tir/ir/stmt_functor.cc
index 529380bf9d59..e0ccb49fc454 100644
--- a/src/tir/ir/stmt_functor.cc
+++ b/src/tir/ir/stmt_functor.cc
@@ -480,7 +480,6 @@ class IRSubstitue : public StmtExprMutator {
   }
 
   PrimExpr VisitExpr_(const LoadNode* op) final {
-    // NOTE: we do not explicit recursivly mutate op->buffer_var
     PrimExpr ret = StmtExprMutator::VisitExpr_(op);
     op = ret.as<LoadNode>();
     if (auto mapped_var = vmap_(op->buffer_var)) {
@@ -491,7 +490,6 @@ class IRSubstitue : public StmtExprMutator {
   }
 
   Stmt VisitStmt_(const StoreNode* op) final {
-    // NOTE: we do not explicit recursivly mutate op->buffer_var
     Stmt ret = StmtExprMutator::VisitStmt_(op);
     op = ret.as<StoreNode>();
     if (auto mapped_var = vmap_(op->buffer_var)) {
@@ -501,6 +499,18 @@ class IRSubstitue : public StmtExprMutator {
     }
   }
 
+  Stmt VisitStmt_(const AttrStmtNode* op) final {
+    Stmt ret = StmtExprMutator::VisitStmt_(op);
+    op = ret.as<AttrStmtNode>();
+    // remap var node in attr
+    if (const auto* var_node = op->node.as<VarNode>()) {
+      if (auto mapped_var = vmap_(GetRef<Var>(var_node))) {
+        return AttrStmt(mapped_var, op->attr_key, op->value, op->body);
+      }
+    }
+    return ret;
+  }
+
  private:
   std::function<Optional<PrimExpr>(const Var&)> vmap_;
 };
diff --git a/src/tir/transforms/lower_custom_datatypes.cc b/src/tir/transforms/lower_custom_datatypes.cc
index a3e5a920a0b2..21f1b18d523b 100644
--- a/src/tir/transforms/lower_custom_datatypes.cc
+++ b/src/tir/transforms/lower_custom_datatypes.cc
@@ -44,14 +44,14 @@ class CustomDatatypesLowerer : public StmtExprMutator {
  public:
   explicit CustomDatatypesLowerer(const std::string& target) : target_(target) {}
 
-  inline PrimExpr VisitExpr_(const CastNode* op) final {
+  PrimExpr VisitExpr_(const CastNode* op) final {
     auto type_code = op->dtype.code();
     auto src_type_code = op->value.dtype().code();
     // If either datatype is a registered custom datatype, we must lower.
-    bool toBeLowered = datatype::Registry::Global()->GetTypeRegistered(type_code) ||
-                       datatype::Registry::Global()->GetTypeRegistered(src_type_code);
+    bool to_be_lowered = datatype::Registry::Global()->GetTypeRegistered(type_code) ||
+                         datatype::Registry::Global()->GetTypeRegistered(src_type_code);
     PrimExpr expr = StmtExprMutator::VisitExpr_(op);
-    if (toBeLowered) {
+    if (to_be_lowered) {
       auto lower = datatype::GetCastLowerFunc(target_, type_code, src_type_code);
       ICHECK(lower) << "Cast lowering function for target " << target_ << " destination type "
                     << static_cast<unsigned>(type_code) << " source type "
@@ -61,7 +61,7 @@ class CustomDatatypesLowerer : public StmtExprMutator {
     return expr;
   }
 
-  inline PrimExpr VisitExpr_(const FloatImmNode* imm) final {
+  PrimExpr VisitExpr_(const FloatImmNode* imm) final {
     auto type_code = imm->dtype.code();
     auto e = GetRef<PrimExpr>(imm);
     if (datatype::Registry::Global()->GetTypeRegistered(type_code)) {
@@ -73,35 +73,86 @@ class CustomDatatypesLowerer : public StmtExprMutator {
     return e;
   }
 
-  inline Stmt VisitStmt_(const AllocateNode* allocate) final {
-    bool toBeLowered = datatype::Registry::Global()->GetTypeRegistered(allocate->dtype.code());
-    Stmt stmt = StmtExprMutator::VisitStmt_(allocate);
-    allocate = stmt.as<AllocateNode>();
+  PrimExpr VisitExpr_(const VarNode* op) final {
+    Var var = GetRef<Var>(op);
 
-    if (toBeLowered) {
+    auto itr = var_remap_.find(var);
+    if (itr != var_remap_.end()) {
+      return itr->second;
+    } else {
+      return std::move(var);
+    }
+  }
+
+  Stmt VisitStmt_(const AllocateNode* allocate) final {
+    bool to_be_lowered = datatype::Registry::Global()->GetTypeRegistered(allocate->dtype.code());
+
+    if (to_be_lowered) {
       auto new_allocate_type = DataType::UInt(allocate->dtype.bits(), allocate->dtype.lanes());
-      return Allocate(allocate->buffer_var, new_allocate_type, allocate->extents,
-                      allocate->condition, allocate->body);
+      auto new_buffer_var =
+          Var(allocate->buffer_var->name_hint, PointerType(PrimType(new_allocate_type)));
+      var_remap_[allocate->buffer_var] = new_buffer_var;
+
+      Stmt stmt = StmtExprMutator::VisitStmt_(allocate);
+      allocate = stmt.as<AllocateNode>();
+
+      return Allocate(new_buffer_var, new_allocate_type, allocate->extents, allocate->condition,
+                      allocate->body);
+    } else {
+      return StmtExprMutator::VisitStmt_(allocate);
     }
-    return stmt;
   }
 
-  inline PrimExpr VisitExpr_(const LoadNode* load) final {
-    bool toBeLowered = datatype::Registry::Global()->GetTypeRegistered(load->dtype.code());
+  PrimExpr VisitExpr_(const LoadNode* load) final {
+    bool to_be_lowered = datatype::Registry::Global()->GetTypeRegistered(load->dtype.code());
     PrimExpr expr = StmtExprMutator::VisitExpr_(load);
     load = expr.as<LoadNode>();
-    if (toBeLowered) {
+    if (to_be_lowered) {
       auto new_load_type = DataType::UInt(load->dtype.bits());
-      return Load(new_load_type, load->buffer_var, load->index, load->predicate);
+      auto buffer_var = load->buffer_var;
+      auto it = var_remap_.find(buffer_var);
+      if (it != var_remap_.end()) {
+        buffer_var = it->second;
+      }
+      return Load(new_load_type, buffer_var, load->index, load->predicate);
     }
     return expr;
   }
 
-  inline PrimExpr VisitExpr_(const CallNode* call) final {
-    bool toBeLowered = datatype::Registry::Global()->GetTypeRegistered(call->dtype.code());
+  Stmt VisitStmt_(const StoreNode* op) final {
+    Stmt ret = StmtExprMutator::VisitStmt_(op);
+    op = ret.as<StoreNode>();
+
+    auto it = var_remap_.find(op->buffer_var);
+    if (it != var_remap_.end()) {
+      return Store(it->second, op->value, op->index, op->predicate);
+    } else {
+      return ret;
+    }
+  }
+
+  Stmt VisitStmt_(const AttrStmtNode* op) final {
+    Stmt ret = StmtExprMutator::VisitStmt_(op);
+    op = ret.as<AttrStmtNode>();
+    // Due to legacy reasons, some attr node can contain
+    // information(e.g. alignment) of buffer variables.
+    // remap these vars when needed
+    // TODO(tvm-team): remove the rewriting once the buffer var
+    // attrs are being refactored into the corresponding definition node
+    if (const auto* var_node = op->node.as<VarNode>()) {
+      auto it = var_remap_.find(GetRef<Var>(var_node));
+      if (it != var_remap_.end()) {
+        return AttrStmt(it->second, op->attr_key, op->value, op->body);
+      }
+    }
+    return ret;
+  }
+
+  PrimExpr VisitExpr_(const CallNode* call) final {
+    bool to_be_lowered = datatype::Registry::Global()->GetTypeRegistered(call->dtype.code());
     PrimExpr expr = StmtExprMutator::VisitExpr_(call);
     call = expr.as<CallNode>();
-    if (toBeLowered) {
+    if (to_be_lowered) {
       auto op = call->op.as<OpNode>();
       ICHECK(op != nullptr) << "Lowering non-intrinsic Calls not implemented";
       auto lower = datatype::GetIntrinLowerFunc(target_, op->name, call->dtype.code());
@@ -113,38 +164,42 @@ class CustomDatatypesLowerer : public StmtExprMutator {
     return expr;
   }
 
-#define DEFINE_MUTATE(OP, NodeName)                                                \
-  inline PrimExpr VisitExpr_(const NodeName* op) final {                           \
-    auto type_code = op->dtype.code();                                             \
-    bool toBeLowered = datatype::Registry::Global()->GetTypeRegistered(type_code); \
-    PrimExpr expr = StmtExprMutator::VisitExpr_(op);                               \
-    op = expr.as<NodeName>();                                                      \
-    if (toBeLowered) {                                                             \
-      auto lower = datatype::Get##OP##LowerFunc(target_, type_code);               \
-      ICHECK(lower) << #OP " lowering function for target " << target_ << " type " \
-                    << static_cast<unsigned>(type_code) << " not found";           \
-      return (*lower)(expr);                                                       \
-    }                                                                              \
-    return expr;                                                                   \
+#define TVM_DEFINE_MUTATE_CUSTOM_DTYPE(OP, NodeName)                                 \
+  PrimExpr VisitExpr_(const NodeName* op) final {                                    \
+    auto type_code = op->dtype.code();                                               \
+    bool to_be_lowered = datatype::Registry::Global()->GetTypeRegistered(type_code); \
+    PrimExpr expr = StmtExprMutator::VisitExpr_(op);                                 \
+    op = expr.as<NodeName>();                                                        \
+    if (to_be_lowered) {                                                             \
+      auto lower = datatype::Get##OP##LowerFunc(target_, type_code);                 \
+      ICHECK(lower) << #OP " lowering function for target " << target_ << " type "   \
+                    << static_cast<unsigned>(type_code) << " not found";             \
+      return (*lower)(expr);                                                         \
+    }                                                                                \
+    return expr;                                                                     \
   }
 
-  DEFINE_MUTATE(Add, AddNode);
-  DEFINE_MUTATE(Sub, SubNode);
-  DEFINE_MUTATE(Mul, MulNode);
-  DEFINE_MUTATE(Div, DivNode);
-  DEFINE_MUTATE(Mod, ModNode);
-  DEFINE_MUTATE(Min, MinNode);
-  DEFINE_MUTATE(Max, MaxNode);
-  DEFINE_MUTATE(EQ, EQNode);
-  DEFINE_MUTATE(NE, NENode);
-  DEFINE_MUTATE(LT, LTNode);
-  DEFINE_MUTATE(LE, LENode);
-  DEFINE_MUTATE(GT, GTNode);
-  DEFINE_MUTATE(GE, GENode);
+  TVM_DEFINE_MUTATE_CUSTOM_DTYPE(Add, AddNode);
+  TVM_DEFINE_MUTATE_CUSTOM_DTYPE(Sub, SubNode);
+  TVM_DEFINE_MUTATE_CUSTOM_DTYPE(Mul, MulNode);
+  TVM_DEFINE_MUTATE_CUSTOM_DTYPE(Div, DivNode);
+  TVM_DEFINE_MUTATE_CUSTOM_DTYPE(Mod, ModNode);
+  TVM_DEFINE_MUTATE_CUSTOM_DTYPE(Min, MinNode);
+  TVM_DEFINE_MUTATE_CUSTOM_DTYPE(Max, MaxNode);
+  TVM_DEFINE_MUTATE_CUSTOM_DTYPE(EQ, EQNode);
+  TVM_DEFINE_MUTATE_CUSTOM_DTYPE(NE, NENode);
+  TVM_DEFINE_MUTATE_CUSTOM_DTYPE(LT, LTNode);
+  TVM_DEFINE_MUTATE_CUSTOM_DTYPE(LE, LENode);
+  TVM_DEFINE_MUTATE_CUSTOM_DTYPE(GT, GTNode);
+  TVM_DEFINE_MUTATE_CUSTOM_DTYPE(GE, GENode);
   // Later changes may need to add more mutate functions as we support workloads with more ops.
 
+#undef TVM_DEFINE_MUTATE_CUSTOM_DTYPE
+
  private:
   std::string target_;
+  // remap buffer vars
+  std::unordered_map<Var, Var, ObjectPtrHash, ObjectPtrEqual> var_remap_;
 };
 
 namespace transform {
diff --git a/src/tir/transforms/lower_thread_allreduce.cc b/src/tir/transforms/lower_thread_allreduce.cc
index c24e26b58db0..f6cb096720da 100644
--- a/src/tir/transforms/lower_thread_allreduce.cc
+++ b/src/tir/transforms/lower_thread_allreduce.cc
@@ -224,14 +224,15 @@ class ThreadAllreduceBuilder final : public StmtExprMutator {
       PrimExpr index(0);
 
       for (size_t idx = 0; idx < size; ++idx) {
-        shared_bufs[idx] = Var("red_buf" + std::to_string(idx), DataType::Handle());
+        Type ptr_type = PointerType(PrimType(types[idx]));
+        shared_bufs[idx] = Var("red_buf" + std::to_string(idx), ptr_type);
         PrimExpr pred = const_true(types[idx].lanes());
         seq.emplace_back(Store(shared_bufs[idx], values[idx], index, pred));
 
         // Uses a local variable to store the shuffled data.
         // Later on, this allocation will be properly attached to this statement.
-        Var var("t" + std::to_string(idx), types[idx]);
-        Stmt s = Allocate(var, var.dtype(), {PrimExpr(1)}, pred, Evaluate(0));
+        Var var("t" + std::to_string(idx), ptr_type);
+        Stmt s = Allocate(var, types[idx], {PrimExpr(1)}, pred, Evaluate(0));
         local_vars.push_back(s);
       }
 
@@ -239,14 +240,15 @@ class ThreadAllreduceBuilder final : public StmtExprMutator {
       // a divergent control flow. Here it uses a variable to cache the current
       // active channels.
       //
-      Var mask_var("mask", DataType::UInt(32));
+      DataType mask_dtype = DataType::UInt(32);
+      Var mask_var("mask", PointerType(PrimType(mask_dtype)));
       {
         PrimExpr pred = const_true(1);
-        PrimExpr mask = Call(DataType::UInt(32), builtin::tvm_warp_activemask(), {});
+        PrimExpr mask = Call(mask_dtype, builtin::tvm_warp_activemask(), {});
         seq.emplace_back(Store(mask_var, mask, index, pred));
         // Push allocation with an empty body. Later this will be fixed
         // when the entire body is ready.
-        auto stmt = Allocate(mask_var, mask_var->dtype, {PrimExpr(1)}, pred, Evaluate(0));
+        auto stmt = Allocate(mask_var, mask_dtype, {PrimExpr(1)}, pred, Evaluate(0));
         local_vars.push_back(stmt);
       }
 
@@ -338,7 +340,7 @@ class ThreadAllreduceBuilder final : public StmtExprMutator {
       // previous iteration on the same buffer.
       seq.emplace_back(SyncThread("shared"));
       for (size_t idx = 0; idx < size; ++idx) {
-        shared_bufs[idx] = Var("red_buf" + std::to_string(idx), DataType::Handle());
+        shared_bufs[idx] = Var("red_buf" + std::to_string(idx), PointerType(PrimType(types[idx])));
         PrimExpr pred = const_true(types[idx].lanes());
         seq.emplace_back(Store(shared_bufs[idx], values[idx],
                                BufIndex(reduce_index, group_index, reduce_extent), pred));
diff --git a/src/tir/transforms/storage_rewrite.cc b/src/tir/transforms/storage_rewrite.cc
index 78c5ca7460ad..d4c5ca09650b 100644
--- a/src/tir/transforms/storage_rewrite.cc
+++ b/src/tir/transforms/storage_rewrite.cc
@@ -23,6 +23,7 @@
  *  Re-write data access to enable memory sharing when possible.
  */
 #include <tvm/arith/analyzer.h>
+#include <tvm/ir/type.h>
 #include <tvm/runtime/registry.h>
 #include <tvm/target/target_info.h>
 #include <tvm/tir/analysis.h>
@@ -934,7 +935,12 @@ class VectorAllocRewriter : public StmtExprMutator {
       if (me->base % factor == 0 && me->coeff % factor == 0) {
         extents.Set(extents.size() - 1,
                     extents[extents.size() - 1] / make_const(extents[0].dtype(), factor));
-        return Allocate(op->buffer_var, tvec[0], extents, op->condition, op->body);
+        // create a new buffer var
+        DataType new_dtype = tvec[0];
+        Var new_buffer_var(op->buffer_var->name_hint, PointerType(PrimType(new_dtype)));
+        // update the remap req.
+        var_remap_.Set(op->buffer_var, new_buffer_var);
+        return Allocate(new_buffer_var, new_dtype, extents, op->condition, op->body);
       }
     }
     return stmt;
@@ -949,23 +955,21 @@ class VectorAllocRewriter : public StmtExprMutator {
 
   // Internal access map
   std::unordered_map<const VarNode*, std::vector<DataType> > acc_map_;
+  // Variables to remap
+  Map<tir::Var, PrimExpr> var_remap_;
   // internal analyzer
   arith::Analyzer analyzer_;
 };
 
-Stmt StorageRewrite(Stmt stmt) {
-  stmt = StoragePlanRewriter().Rewrite(std::move(stmt), true);
-  return VectorAllocRewriter()(std::move(stmt));
-}
-
 PrimFunc PointerValueTypeRewrite(PrimFunc f) {
   auto* n = f.CopyOnWrite();
   VectorAllocRewriter rewriter;
-  n->body = rewriter(n->body);
+  n->body = rewriter(std::move(n->body));
 
+  Map<tir::Var, PrimExpr> var_remap = std::move(rewriter.var_remap_);
   Array<tir::Var> args;
-  Map<tir::Var, PrimExpr> remap_vars;
 
+  // rewrite paramters if needed.
   for (Var var : f->params) {
     if (var.dtype().is_handle()) {
       const auto& tvec = rewriter.acc_map_[var.get()];
@@ -973,15 +977,14 @@ PrimFunc PointerValueTypeRewrite(PrimFunc f) {
       if (tvec.size() == 1) {
         tir::Var new_var(var->name_hint, PointerType(PrimType(tvec[0])));
         args.push_back(new_var);
-        remap_vars.Set(var, new_var);
-
+        var_remap.Set(var, new_var);
       } else {
         // always set data type to be non vectorized so
         // load/store can still work via scalarization
         if (tvec.size() != 0 && !var->type_annotation.defined()) {
           tir::Var new_var(var->name_hint, PointerType(PrimType(tvec[0].with_lanes(1))));
           args.push_back(new_var);
-          remap_vars.Set(var, new_var);
+          var_remap.Set(var, new_var);
         } else {
           args.push_back(var);
         }
@@ -991,9 +994,13 @@ PrimFunc PointerValueTypeRewrite(PrimFunc f) {
     }
   }
 
+  // no variable remap is needed.
+  if (var_remap.size() == 0) return f;
+
+  // remap the variables.
   ICHECK_EQ(args.size(), n->params.size());
   n->params = args;
-  n->body = Substitute(n->body, remap_vars);
+  n->body = Substitute(n->body, var_remap);
   return f;
 }
 
@@ -1003,8 +1010,7 @@ Pass StorageRewrite() {
   auto pass_func = [](PrimFunc f, IRModule m, PassContext ctx) {
     auto* n = f.CopyOnWrite();
     n->body = StoragePlanRewriter().Rewrite(std::move(n->body), true);
-    n->body = VectorAllocRewriter()(std::move(n->body));
-    return f;
+    return PointerValueTypeRewrite(std::move(f));
   };
   return CreatePrimFuncPass(pass_func, 0, "tir.StorageRewrite", {});
 }
diff --git a/tests/cpp/ir_functor_test.cc b/tests/cpp/ir_functor_test.cc
index 683caaa7c5de..9be83987ba57 100644
--- a/tests/cpp/ir_functor_test.cc
+++ b/tests/cpp/ir_functor_test.cc
@@ -114,8 +114,9 @@ TEST(IRF, StmtVisitor) {
   auto fmaketest = [&]() {
     auto z = x + 1;
     Stmt body = Evaluate(z);
-    Var buffer("b", DataType::Handle());
-    return Allocate(buffer, DataType::Float(32), {z, z}, const_true(), body);
+    DataType dtype = DataType::Float(32);
+    Var buffer("b", PointerType(PrimType(dtype)));
+    return Allocate(buffer, dtype, {z, z}, const_true(), body);
   };
   v(fmaketest());
   ICHECK_EQ(v.count, 3);
@@ -140,8 +141,9 @@ TEST(IRF, StmtMutator) {
   auto fmakealloc = [&]() {
     auto z = x + 1;
     Stmt body = Evaluate(z);
-    Var buffer("b", DataType::Handle());
-    return Allocate(buffer, DataType::Float(32), {1, z}, const_true(), body);
+    DataType dtype = DataType::Float(32);
+    Var buffer("b", PointerType(PrimType(dtype)));
+    return Allocate(buffer, dtype, {1, z}, const_true(), body);
   };
 
   auto fmakeif = [&]() {
diff --git a/tests/python/unittest/test_tir_constructor.py b/tests/python/unittest/test_tir_constructor.py
index 3cde5d7ad650..2bf4ba51937e 100644
--- a/tests/python/unittest/test_tir_constructor.py
+++ b/tests/python/unittest/test_tir_constructor.py
@@ -154,6 +154,7 @@ def test_stmt_constructor():
     assert x.index.value == 10
     assert x.value.value == 1
 
+    buffer_var = tvm.tir.Var("buf", tvm.ir.PointerType(tvm.ir.PrimType("float32")))
     x = tvm.tir.Allocate(buffer_var, "float32", [10], tvm.tir.const(1, "uint1"), nop)
     assert isinstance(x, tvm.tir.Allocate)
     assert x.dtype == "float32"

From 93d79bafcf854a928d248aab92782da36eec3b4a Mon Sep 17 00:00:00 2001
From: Cody Yu <comaniac0422@gmail.com>
Date: Wed, 6 Jan 2021 17:50:21 -0800
Subject: [PATCH 031/357] [AutoScheduler][Relay] Control compile engine cache
 via PassContext (#7220)

* [AutoScheduler][Relay] Control compile engine cache via PassContext

* lint

* lint
---
 .../tvm/auto_scheduler/relay_integration.py   | 35 +++++++------------
 src/relay/backend/compile_engine.cc           |  5 ++-
 src/relay/backend/utils.h                     |  9 +++++
 3 files changed, 25 insertions(+), 24 deletions(-)

diff --git a/python/tvm/auto_scheduler/relay_integration.py b/python/tvm/auto_scheduler/relay_integration.py
index eecf88bac9d8..ea1a8cc39373 100644
--- a/python/tvm/auto_scheduler/relay_integration.py
+++ b/python/tvm/auto_scheduler/relay_integration.py
@@ -56,7 +56,10 @@ def call_all_topi_funcs(mod, params, target):
 
     with transform.PassContext(
         opt_level=3,
-        config={"relay.backend.use_auto_scheduler": True},
+        config={
+            "relay.backend.use_auto_scheduler": True,
+            "relay.backend.disable_compile_engine_cache": True,
+        },
         disabled_pass={"AutoSchedulerLayoutRewrite"},
     ):
         try:
@@ -105,7 +108,6 @@ def extract_tasks(
         The weight (i.e. the number of appearance) of extracted tasks
     """
     # pylint: disable=import-outside-toplevel
-    from tvm import relay
 
     if isinstance(target, str):
         target = tvm.target.Target(target)
@@ -123,17 +125,10 @@ def extract_tasks(
         build_thread.start()
         build_thread.join()
 
-    # query the compile engine to get the number of occurrence of all tasks
-    engine = relay.backend.compile_engine.get()
-    use_count_dict = {}
-    for k, v in engine.items():
-        use_count_dict[k] = v.use_count
-
     # create search tasks
     tasks = []
     weights = []
-    for wkl_key, ccache_key in env.wkl_key_to_ccache_key.items():
-        dag = ComputeDAG(wkl_key)
+    for wkl_key, weight in env.wkl_key_to_weight.items():
         tasks.append(
             SearchTask(
                 workload_key=wkl_key,
@@ -145,10 +140,7 @@ def extract_tasks(
                 layout_rewrite_option=LayoutRewriteOption.get_target_default(target, True),
             )
         )
-        weights.append(use_count_dict[ccache_key] + 1)
-
-    # clean the cached lowering results
-    engine.clear()
+        weights.append(weight)
 
     return tasks, weights
 
@@ -169,7 +161,7 @@ class TracingEnvironment:
     def __init__(self, tracing_mode):
         self.tracing_mode = tracing_mode
         self.relay_disable_build_cache = "false"
-        self.wkl_key_to_ccache_key = {}
+        self.wkl_key_to_weight = {}
 
     def __enter__(self):
         TracingEnvironment.current = self
@@ -178,17 +170,17 @@ def __enter__(self):
     def __exit__(self, exc_type, exc_val, exc_tb):
         TracingEnvironment.current = None
 
-    def add_workload_key(self, workload_key, ccache_key):
+    def add_workload_key(self, workload_key):
         """Add the workload key of a search task
 
         Parameters
         ----------
         workload_key: str
             The workload key of a task
-        ccache_key: CCacheKey
-            The corresponding ccache_key of the task
         """
-        self.wkl_key_to_ccache_key[workload_key] = ccache_key
+        if workload_key not in self.wkl_key_to_weight:
+            self.wkl_key_to_weight[workload_key] = 0
+        self.wkl_key_to_weight[workload_key] += 1
 
 
 @tvm._ffi.register_func("auto_scheduler.enter_layout_rewrite")
@@ -278,7 +270,6 @@ def auto_schedule_topi(outs):
         An initial schdule in the tracing mode.
     """
     # pylint: disable=import-outside-toplevel
-    from tvm import relay
 
     io_tensors, has_layout_free, has_complex_op = traverse_to_get_io_tensors(outs)
     if not io_tensors:  # The compute includes dynamic shapes which are not supported yet.
@@ -305,9 +296,7 @@ def auto_schedule_topi(outs):
     elif env.tracing_mode in [TracingMode.EXTRACT_TASK, TracingMode.EXTRACT_COMPLEX_TASK_ONLY]:
         # in the task extraction mode
         if has_complex_op or env.tracing_mode == TracingMode.EXTRACT_TASK:
-            engine = relay.backend.compile_engine.get()
-            ccache_key = engine.get_current_ccache_key()
-            env.add_workload_key(key, ccache_key)
+            env.add_workload_key(key)
         schedule = te.create_schedule([x.op for x in outs])
     elif env.tracing_mode == TracingMode.PREPARE_LAYOUT_REWRITE:
         # in prepare_layout_rewrite mode
diff --git a/src/relay/backend/compile_engine.cc b/src/relay/backend/compile_engine.cc
index 789f39de22d1..c969c3ba7f06 100644
--- a/src/relay/backend/compile_engine.cc
+++ b/src/relay/backend/compile_engine.cc
@@ -701,7 +701,9 @@ class CompileEngineImpl : public CompileEngineNode {
     } else {
       value = CCacheValue(make_object<CCacheValueNode>());
       value->use_count = 0;
-      cache_[key] = value;
+      if (!backend::IsCompileEngineCacheDisabled()) {
+        cache_[key] = value;
+      }
     }
     cur_ccache_key_ = key;
 
@@ -832,6 +834,7 @@ CompileEngine& CompileEngine::Global() {
 }
 
 TVM_REGISTER_PASS_CONFIG_OPTION("relay.backend.use_auto_scheduler", Bool);
+TVM_REGISTER_PASS_CONFIG_OPTION("relay.backend.disable_compile_engine_cache", Bool);
 
 TVM_REGISTER_GLOBAL("relay.backend._make_LoweredOutput")
     .set_body_typed([](tvm::Array<te::Tensor> outputs, OpImplementation impl) {
diff --git a/src/relay/backend/utils.h b/src/relay/backend/utils.h
index e1677205ffa1..6908ca85f582 100644
--- a/src/relay/backend/utils.h
+++ b/src/relay/backend/utils.h
@@ -303,6 +303,15 @@ inline bool IsAutoSchedulerEnabled() {
       .value();
 }
 
+/*!
+ * \brief Return whether the compile engine cache is disabled in the pass context.
+ */
+inline bool IsCompileEngineCacheDisabled() {
+  return transform::PassContext::Current()
+      ->GetConfig<Bool>("relay.backend.disable_compile_engine_cache", Bool(false))
+      .value();
+}
+
 }  // namespace backend
 }  // namespace relay
 }  // namespace tvm

From 9815ae2d9e17eece1a1009eb6436c80f931c734e Mon Sep 17 00:00:00 2001
From: Haozheng Fan <hzfan@apache.org>
Date: Fri, 8 Jan 2021 00:24:31 +0800
Subject: [PATCH 032/357] [Arith] Simplify cast (#7045)

---
 src/arith/canonical_simplify.cc               | 161 ++++++++++++++++++
 .../unittest/test_arith_canonical_simplify.py |  41 +++++
 2 files changed, 202 insertions(+)

diff --git a/src/arith/canonical_simplify.cc b/src/arith/canonical_simplify.cc
index d0a0702a0fb0..ba549959ac98 100644
--- a/src/arith/canonical_simplify.cc
+++ b/src/arith/canonical_simplify.cc
@@ -77,6 +77,27 @@ inline PrimExpr DivImpl(PrimExpr a, PrimExpr b, DivMode mode) {
   }
 }
 
+/*!
+ * \brief check if value fits in dtype
+ * \param value The value to be analyzed
+ * \param dtype The target dtype
+ * \param analyzer The analyzer
+ * \return whether value fits in dtype
+ */
+bool CastIsSafe(DataType dtype, PrimExpr value, Analyzer* analyzer) {
+  if (!IsIndexType(dtype)) {
+    return false;
+  }
+  ConstIntBound bound = analyzer->const_int_bound(value);
+  int64_t ubound = Downcast<IntImm>(max_value(dtype))->value;
+  int64_t lbound = Downcast<IntImm>(min_value(dtype))->value;
+  if (value.dtype().bits() <= dtype.bits() ||  // upcast is safe
+      (bound->max_value <= ubound && bound->min_value >= lbound)) {
+    return true;
+  }
+  return false;
+}
+
 /*!
  * \brief Internal "Split normal form" of expression.
  *
@@ -128,6 +149,58 @@ class SplitExprNode : public CanonicalExprNode {
 
   void MulToSelf(int64_t scale) { this->scale *= scale; }
 
+  /*!
+   * \brief check if cast can be pushed to sub-expressions
+   * \param dtype The target datatype
+   * \param analyzer The analyzer
+   * \return whether the cast can be safely pushed to children
+   */
+  bool CanPushCastToChildren(DataType dtype, Analyzer* analyzer) const {
+    // cast(dtype, index % upper_factor / lower_factor * scale) ==
+    // cast(dtype, index) % upper_factor / lower_factor * scale
+    // iff it is an upcast (dtype.bits >= self.dtype.bits) or all of
+    // its intermediate results fit in the range of dtype
+    if (dtype.bits() >= this->dtype.bits()) {
+      return true;  // upcast is safe
+    }
+    PrimExpr res = this->index;
+    if (this->scale == 0) {
+      return true;
+    }
+    if (!CastIsSafe(dtype, res, analyzer)) {
+      return false;
+    }
+    if (this->upper_factor != SplitExprNode::kPosInf) {
+      res = ModImpl(res, make_const(this->dtype, this->upper_factor), div_mode);
+      if (!CastIsSafe(dtype, res, analyzer)) {
+        return false;
+      }
+    }
+    if (this->lower_factor != 1) {
+      res = DivImpl(res, make_const(this->dtype, this->lower_factor), div_mode);
+      if (!CastIsSafe(dtype, res, analyzer)) {
+        return false;
+      }
+    }
+    if (this->scale != 1) {
+      ICHECK(!this->dtype.is_uint() || this->scale > 0);
+      res = res * make_const(this->dtype, this->scale);
+      if (!CastIsSafe(dtype, res, analyzer)) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  /*!
+   * \brief self = cast(dtype, self)
+   * \param dtype The target datatype
+   */
+  void PushCastToChildren(DataType dtype) {
+    this->index = cast(dtype, this->index);
+    this->dtype = dtype;
+  }
+
   inline bool IndexEqual(const SplitExpr& other) const;
   inline bool DivModeCompatibleTo(DivMode mode) const;
 
@@ -255,6 +328,69 @@ class SumExprNode : public CanonicalExprNode {
 
   void AddToSelf(const SumExpr& other, int64_t scale);
 
+  /*!
+   * \brief check if cast can be pushed to sub-expressions
+   * \param dtype The target datatype
+   * \param analyzer The analyzer
+   * \return whether the cast can be safely pushed to children
+   */
+  bool CanPushCastToChildren(DataType dtype, Analyzer* analyzer) const {
+    // cast(dtype, arg_1 + arg_2 + ... arg_n) ==
+    // cast(dtype, arg_1) + ... + cast(dtype, arg_n)
+    // iff it is an upcast (dtype.bits >= self.dtype.bits) or all of
+    // its intermediate results fit in the range of dtype
+    if (dtype.bits() >= this->dtype.bits()) {
+      return true;  // upcast is safe
+    }
+    PrimExpr res = make_const(dtype, 0);
+    for (size_t i = 0; i < args.size(); ++i) {
+      if (args[i]->scale > 0) {
+        res = res + args[i]->Normalize();
+        if (!CastIsSafe(dtype, res, analyzer)) {
+          return false;
+        }
+      }
+    }
+    if (base > 0) {
+      res = res + make_const(dtype, base);
+      if (!CastIsSafe(dtype, res, analyzer)) {
+        return false;
+      }
+    }
+    // negative scales follows using sub.
+    for (size_t i = 0; i < args.size(); ++i) {
+      if (args[i]->scale < 0) {
+        res = res - args[i]->NormalizeWithScale(-1);
+        if (!CastIsSafe(dtype, res, analyzer)) {
+          return false;
+        }
+      }
+    }
+    if (base < 0) {
+      res = res - make_const(dtype, -base);
+      if (!CastIsSafe(dtype, res, analyzer)) {
+        return false;
+      }
+    }
+    for (const auto& arg : args) {
+      if (!arg->CanPushCastToChildren(dtype, analyzer)) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  /*!
+   * \brief self = cast(dtype, self)
+   * \param dtype The target datatype
+   */
+  void PushCastToChildren(DataType dtype) {
+    for (auto& arg : args) {
+      arg.CopyOnWrite()->PushCastToChildren(dtype);
+    }
+    this->dtype = dtype;
+  }
+
   static constexpr const char* _type_key = "arith.SumExpr";
   TVM_DECLARE_FINAL_OBJECT_INFO(SumExprNode, CanonicalExprNode);
 
@@ -430,6 +566,7 @@ class CanonicalSimplifier::Impl : public RewriteSimplifier::Impl {
   PrimExpr VisitExpr_(const FloorDivNode* op) final;
   PrimExpr VisitExpr_(const FloorModNode* op) final;
   PrimExpr VisitExpr_(const ReduceNode* op) final;
+  PrimExpr VisitExpr_(const CastNode* op) final;
 
  private:
   /*!
@@ -1071,6 +1208,30 @@ PrimExpr CanonicalSimplifier::Impl::VisitExpr_(const ReduceNode* op) {
   return ret;
 }
 
+PrimExpr CanonicalSimplifier::Impl::VisitExpr_(const CastNode* op) {
+  if (!IsIndexType(op->dtype)) {
+    return Rewriter::VisitExpr_(op);
+  }
+  // normalize
+  PrimExpr value = this->CanonicalMutate(op->value);
+  // PushCastToChildren
+  if (value.as<SumExprNode>()) {
+    SumExpr se = Downcast<SumExpr>(value);
+    if (se->CanPushCastToChildren(op->dtype, analyzer_)) {
+      se.CopyOnWrite()->PushCastToChildren(op->dtype);
+      return std::move(se);
+    }
+  }
+  if (value.as<SplitExprNode>()) {
+    SplitExpr se = Downcast<SplitExpr>(value);
+    if (se->CanPushCastToChildren(op->dtype, analyzer_)) {
+      se.CopyOnWrite()->PushCastToChildren(op->dtype);
+      return std::move(se);
+    }
+  }
+  return Rewriter::VisitExpr_(op);
+}
+
 PrimExpr CanonicalSimplifier::operator()(const PrimExpr& expr) {
   return impl_->CanonicalSimplify(expr);
 }
diff --git a/tests/python/unittest/test_arith_canonical_simplify.py b/tests/python/unittest/test_arith_canonical_simplify.py
index 65c8ec3dfe02..c241b81da986 100644
--- a/tests/python/unittest/test_arith_canonical_simplify.py
+++ b/tests/python/unittest/test_arith_canonical_simplify.py
@@ -310,6 +310,46 @@ def test_complex_cases():
     ck.verify(res3, tdiv((x * 1024) + y, 256) - tdiv(y, 256) - (x * 4))
 
 
+def test_simplify_cast():
+    ck = CanonicalChecker()
+    tcast = tvm.tir.Cast
+    fld = tvm.te.floordiv
+    flm = tvm.te.floormod
+    # cast(i64, i + j + 1) - cast(i64, i)
+    i = te.var("i", dtype="int32")
+    j = te.var("j", dtype="int32")
+    res = tcast("int64", i + j + 1) - tcast("int64", i)
+    ck.verify(res, tcast("int64", j) + tvm.tir.const(1, "int64"))
+    # cast(i32, i + j + 1) - cast(i32, i)
+    i = te.var("i", dtype="int64")
+    j = te.var("j", dtype="int64")
+    ck.analyzer.update(i, tvm.arith.ConstIntBound(0, 10))
+    ck.analyzer.update(j, tvm.arith.ConstIntBound(0, 10))
+    res = tcast("int32", i + j + 1) - tcast("int32", i)
+    ck.verify(res, tcast("int32", j) + 1)
+    # cast(i32, i + j - 100)
+    i = te.var("i", dtype="int64")
+    j = te.var("j", dtype="int64")
+    ck.analyzer.update(i, tvm.arith.ConstIntBound(0, 2 ** 31 - 1))
+    ck.analyzer.update(j, tvm.arith.ConstIntBound(0, 10))
+    res = tcast("int32", i + j - 100)
+    ck.verify(res, res)
+    # cast(i32, flm(axis, 7i64) * 2i64 + 1i64) + 1i32
+    # - cast(i32, flm(axis, 7i64) * 2i64)
+    axis = te.var("axis", dtype="int64")
+    ck.analyzer.update(axis, tvm.arith.ConstIntBound(0, 42))
+    res = (
+        tcast(
+            "int32",
+            flm(axis, tvm.tir.const(7, "int64")) * tvm.tir.const(2, "int64")
+            + tvm.tir.const(1, "int64"),
+        )
+        + tvm.tir.const(1, "int32")
+        - tcast("int32", flm(axis, tvm.tir.const(7, "int64")) * tvm.tir.const(2, "int64"))
+    )
+    ck.verify(res, 2)
+
+
 if __name__ == "__main__":
     test_floormod_simplify()
     test_mul_sum_simplify()
@@ -321,3 +361,4 @@ def test_complex_cases():
     test_split_index_simplify()
     test_canonical_mixed()
     test_complex_cases()
+    test_simplify_cast()

From 8910f72a38288f09d9e12163095f1675a9ccee83 Mon Sep 17 00:00:00 2001
From: Cody Yu <comaniac0422@gmail.com>
Date: Thu, 7 Jan 2021 15:02:19 -0800
Subject: [PATCH 033/357] [ConvertLayout] Support transpose (#7214)

* [ConvertLayout] Support transpose

* format

* fix ci

* fix axes missing

* fix

* fix NCHW[x]c

* Update src/relay/op/tensor/transform.cc

* fix negative

* fix
---
 src/relay/op/tensor/transform.cc              | 75 ++++++++++++++++
 .../relay/test_pass_convert_op_layout.py      | 85 +++++++++++++++++++
 2 files changed, 160 insertions(+)

diff --git a/src/relay/op/tensor/transform.cc b/src/relay/op/tensor/transform.cc
index 1ff428ce333c..ecfde359d11d 100644
--- a/src/relay/op/tensor/transform.cc
+++ b/src/relay/op/tensor/transform.cc
@@ -419,6 +419,80 @@ bool TransposeRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
   return true;
 }
 
+Array<Array<Layout>> TransposeInferCorrectLayout(const Attrs& attrs,
+                                                 const Array<Layout>& new_in_layouts,
+                                                 const Array<Layout>& old_in_layouts,
+                                                 const Array<tvm::relay::Type>& old_in_types) {
+  // Discard "const" qualifier.
+  auto* params = const_cast<TransposeAttrs*>(attrs.as<TransposeAttrs>());
+  ICHECK(params != nullptr);
+
+  std::string in_layout_str = "";
+  std::string out_layout_str = "";
+
+  // Infer the input layout string and update the axes.
+  if (old_in_layouts.defined() && old_in_layouts[0].defined()) {
+    ICHECK_EQ(old_in_layouts.size(), 1);
+    auto old_layout = old_in_layouts[0];
+    Array<Integer> old_axes = params->axes;
+
+    // Deal with default axes and negative axes.
+    if (!old_axes.defined() || old_axes.size() == 0) {
+      for (int i = old_layout.ndim() - 1; i >= 0; --i) {
+        old_axes.push_back(i);
+      }
+    }
+    for (size_t i = 0; i < old_axes.size(); ++i) {
+      int axis = static_cast<int>(old_axes[i]->value);
+      if (axis < 0) {
+        int pos_axis = static_cast<int>(old_layout.ndim()) + axis;
+        old_axes.Set(i, pos_axis);
+      }
+    }
+
+    if (new_in_layouts.defined() && new_in_layouts[0].defined()) {
+      ICHECK_EQ(new_in_layouts.size(), 1);
+      auto new_layout = new_in_layouts[0];
+
+      // Update the axes based on the new layout.
+      Array<Integer> new_axes = Array<Integer>();
+      for (auto axis : old_axes) {
+        auto new_axis = new_layout.IndexOf(old_layout[axis->value]);
+        if (new_axis == -1) {  // Cannot find the target axis in the new layout.
+          new_axes.clear();
+          break;
+        }
+        new_axes.push_back(new_axis);
+      }
+      if (new_axes.defined() && new_axes.size() == new_layout.ndim()) {
+        params->axes = std::move(new_axes);
+        in_layout_str = new_layout.name();
+      }
+    }
+
+    // If the input layout string cannot be determined, propagate the old layout.
+    if (in_layout_str == "") {
+      params->axes = std::move(old_axes);
+      in_layout_str = old_layout.name();
+    }
+  }
+
+  // Infer the output layout string based on the input layout and the axes.
+  if (in_layout_str != "") {
+    for (auto axis : params->axes) {
+      ICHECK_LT(axis->value, in_layout_str.length());
+      out_layout_str += in_layout_str[axis->value];
+    }
+    try {
+      return Array<Array<Layout>>({{Layout(in_layout_str)}, {Layout(out_layout_str)}});
+    } catch (const dmlc::Error& e) {
+      // If the layout string is invalid for any reason, give up.
+      return Array<Array<Layout>>({{Layout::Undef()}, {Layout::Undef()}});
+    }
+  }
+  return Array<Array<Layout>>({{Layout::Undef()}, {Layout::Undef()}});
+}
+
 Array<te::Tensor> TransposeCompute(const Attrs& attrs, const Array<te::Tensor>& inputs,
                                    const Type& out_type) {
   const auto* param = attrs.as<TransposeAttrs>();
@@ -449,6 +523,7 @@ RELAY_REGISTER_OP("transpose")
     .set_support_level(3)
     .add_type_rel("Transpose", TransposeRel)
     .set_attr<FTVMCompute>("FTVMCompute", TransposeCompute)
+    .set_attr<FInferCorrectLayout>("FInferCorrectLayout", TransposeInferCorrectLayout)
     .set_attr<TOpPattern>("TOpPattern", kInjective);
 
 /* relay.reshape */
diff --git a/tests/python/relay/test_pass_convert_op_layout.py b/tests/python/relay/test_pass_convert_op_layout.py
index 4c4bb9dee937..ca2469ea0a4c 100644
--- a/tests/python/relay/test_pass_convert_op_layout.py
+++ b/tests/python/relay/test_pass_convert_op_layout.py
@@ -568,6 +568,90 @@ def func_vars():
     verify_slice_like(after, [1, 2])
 
 
+def test_transpose_convert_layout():
+    def verify_transpose(after, expected_axes, expected_transform_cnt):
+        # Verify if the transpose after the convert layout has the expected axes.
+        has_expected = list()
+        checker = lambda x: has_expected.append(
+            isinstance(x, tvm.relay.expr.Call)
+            and x.op.name == "transpose"
+            and str(x.attrs.axes) == str(expected_axes)
+        )
+        relay.analysis.post_order_visit(after, checker)
+        assert any(has_expected), after
+
+        is_transform = list()
+        checker = lambda x: is_transform.append(
+            1 if isinstance(x, tvm.relay.expr.Call) and x.op.name == "layout_transform" else 0
+        )
+        relay.analysis.post_order_visit(after, checker)
+        assert (
+            sum(is_transform) == expected_transform_cnt
+        ), "Expected %s layout_transform, but get\n%s" % (expected_transform_cnt, after)
+
+    def nhwc_to_nchw():
+        x = relay.var("x", shape=(1, 56, 56, 64))
+        weight1 = relay.var("weight1", shape=(3, 3, 64, 32))
+        y = relay.nn.conv2d(
+            x,
+            weight1,
+            channels=32,
+            kernel_size=(3, 3),
+            padding=(1, 1),
+            data_layout="NHWC",
+            kernel_layout="HWIO",
+        )
+        z = relay.var("z", shape=(56, 56, 32))
+        out = relay.add(y, z)
+        out = relay.transpose(out, axes=[0, 3, 1, 2])
+        out = relay.nn.batch_flatten(out)
+        func = relay.Function(analysis.free_vars(out), out)
+        return run_opt_pass(func, transform.ConvertLayout({"nn.conv2d": ["NCHW", "default"]}))
+
+    verify_transpose(nhwc_to_nchw(), [0, 1, 2, 3], 3)
+
+    def nchw_to_nhwc():
+        x = relay.var("x", shape=(1, 64, 56, 56))
+        weight1 = relay.var("weight1", shape=(32, 64, 3, 3))
+        y = relay.nn.conv2d(
+            x,
+            weight1,
+            channels=32,
+            kernel_size=(3, 3),
+            padding=(1, 1),
+            data_layout="NCHW",
+            kernel_layout="OIHW",
+        )
+        z = relay.var("z", shape=(32, 56, 56))
+        out = relay.add(y, z)
+        out = relay.transpose(out, axes=[0, 2, -1, 1])  # Also test a negative axis.
+        out = relay.nn.batch_flatten(out)
+        func = relay.Function(analysis.free_vars(out), out)
+        return run_opt_pass(func, transform.ConvertLayout({"nn.conv2d": ["NHWC", "default"]}))
+
+    verify_transpose(nchw_to_nhwc(), [0, 1, 2, 3], 3)
+
+    def default_axes():
+        x = relay.var("x", shape=(1, 64, 56, 56))
+        weight1 = relay.var("weight1", shape=(32, 64, 3, 3))
+        y = relay.nn.conv2d(
+            x,
+            weight1,
+            channels=32,
+            kernel_size=(3, 3),
+            padding=(1, 1),
+            data_layout="NCHW",
+            kernel_layout="OIHW",
+        )
+        z = relay.var("z", shape=(32, 56, 56))
+        out = relay.add(y, z)
+        out = relay.transpose(out)  # No axes provided, will use the reversed axes.
+        func = relay.Function(analysis.free_vars(out), out)
+        return run_opt_pass(func, transform.ConvertLayout({"nn.conv2d": ["NHWC", "default"]}))
+
+    verify_transpose(default_axes(), [2, 1, 3, 0], 3)
+
+
 def test_resnet_convert_layout():
     def before():
         x = relay.var("x", shape=(1, 56, 56, 64))
@@ -1482,6 +1566,7 @@ def expected():
     test_dual_path_convert_layout()
     test_bn_convert_layout()
     test_slice_like_convert_layout()
+    test_transpose_convert_layout()
     test_resnet_convert_layout()
     test_scalar_convert_layout()
     test_conv_bn_convert_layout()

From 4911a08a1776dd63d1638c516d1c4097cb23da75 Mon Sep 17 00:00:00 2001
From: Andrew Reusch <areusch@octoml.ai>
Date: Thu, 7 Jan 2021 17:23:36 -0800
Subject: [PATCH 034/357] [BUGFIX] Change debug_runtime to represent times in
 seconds internally (#7227)

* Add FrontendTestModule, a Module which can have Python functions.

* fix units and use of scientific notation in debug_runtime variable names

* remaining updates to formalize debug_runtime returns time in sec

* Add test for debug runtime output

* black format

* git-clang-format

* pylint
---
 python/tvm/contrib/debugger/debug_result.py   |  6 +--
 python/tvm/contrib/debugger/debug_runtime.py  |  2 +-
 python/tvm/support.py                         | 25 +++++++++
 .../graph/debug/graph_runtime_debug.cc        | 20 +++----
 src/support/ffi_testing.cc                    | 42 +++++++++++++++
 .../unittest/test_runtime_graph_debug.py      | 54 +++++++++++++++++--
 6 files changed, 130 insertions(+), 19 deletions(-)

diff --git a/python/tvm/contrib/debugger/debug_result.py b/python/tvm/contrib/debugger/debug_result.py
index 0b9810e74bb1..3159ab34397a 100644
--- a/python/tvm/contrib/debugger/debug_result.py
+++ b/python/tvm/contrib/debugger/debug_result.py
@@ -212,7 +212,7 @@ def get_debug_result(self, sort_by_time=True):
                     continue
                 name = node["name"]
                 shape = str(self._output_tensor_list[eid].shape)
-                time_us = round(time[0] * 1000000, 3)
+                time_us = round(time[0] * 1e6, 3)
                 time_percent = round(((time[0] / total_time) * 100), 3)
                 inputs = str(node["attrs"]["num_inputs"])
                 outputs = str(node["attrs"]["num_outputs"])
@@ -224,8 +224,8 @@ def get_debug_result(self, sort_by_time=True):
             # Sort on the basis of execution time. Prints the most expensive ops in the start.
             data = sorted(data, key=lambda x: x[2], reverse=True)
             # Insert a row for total time at the end.
-            rounded_total_time = round(total_time * 1000000, 3)
-            data.append(["Total_time", "-", rounded_total_time, "-", "-", "-", "-", "-"])
+            rounded_total_time_us = round(total_time * 1e6, 3)
+            data.append(["Total_time", "-", rounded_total_time_us, "-", "-", "-", "-", "-"])
 
         fmt = ""
         for i, _ in enumerate(header):
diff --git a/python/tvm/contrib/debugger/debug_runtime.py b/python/tvm/contrib/debugger/debug_runtime.py
index 4d2fab4358ba..289ac4c467e0 100644
--- a/python/tvm/contrib/debugger/debug_runtime.py
+++ b/python/tvm/contrib/debugger/debug_runtime.py
@@ -175,7 +175,7 @@ def _run_debug(self):
         Time consumed for each execution will be set as debug output.
 
         """
-        self.debug_datum._time_list = [[float(t) * 1e-6] for t in self.run_individual(10, 1, 1)]
+        self.debug_datum._time_list = [[float(t)] for t in self.run_individual(10, 1, 1)]
         for i, node in enumerate(self.debug_datum.get_graph_nodes()):
             num_outputs = self.debug_datum.get_graph_node_output_num(node)
             for j in range(num_outputs):
diff --git a/python/tvm/support.py b/python/tvm/support.py
index e0d688abb9e8..800bfe4e2546 100644
--- a/python/tvm/support.py
+++ b/python/tvm/support.py
@@ -15,7 +15,10 @@
 # specific language governing permissions and limitations
 # under the License.
 """Support infra of TVM."""
+import ctypes
 import tvm._ffi
+from .runtime.module import Module
+from . import get_global_func
 
 
 def libinfo():
@@ -29,4 +32,26 @@ def libinfo():
     return {k: v for k, v in GetLibInfo().items()}  # pylint: disable=unnecessary-comprehension
 
 
+class FrontendTestModule(Module):
+    """A tvm.runtime.Module whose member functions are PackedFunc."""
+
+    def __init__(self, entry_name=None):
+        underlying_mod = get_global_func("testing.FrontendTestModule")()
+        handle = underlying_mod.handle
+
+        # Set handle to NULL to avoid cleanup in c++ runtime, transferring ownership.
+        # Both cython and ctypes FFI use c_void_p, so this is safe to assign here.
+        underlying_mod.handle = ctypes.c_void_p(0)
+
+        super(FrontendTestModule, self).__init__(handle)
+        if entry_name is not None:
+            self.entry_name = entry_name
+
+    def add_function(self, name, func):
+        self.get_function("__add_function")(name, func)
+
+    def __setitem__(self, key, value):
+        self.add_function(key, value)
+
+
 tvm._ffi._init_api("support", __name__)
diff --git a/src/runtime/graph/debug/graph_runtime_debug.cc b/src/runtime/graph/debug/graph_runtime_debug.cc
index 3353c117318b..0b8f39dd9f94 100644
--- a/src/runtime/graph/debug/graph_runtime_debug.cc
+++ b/src/runtime/graph/debug/graph_runtime_debug.cc
@@ -59,11 +59,11 @@ class GraphRuntimeDebug : public GraphRuntime {
     // warmup run
     GraphRuntime::Run();
     std::string tkey = module_->type_key();
-    std::vector<double> time_per_op(op_execs_.size(), 0);
+    std::vector<double> time_sec_per_op(op_execs_.size(), 0);
     if (tkey == "rpc") {
       // RPC modules rely on remote timing which implements the logic from the else branch.
       for (size_t index = 0; index < op_execs_.size(); ++index) {
-        time_per_op[index] += RunOpRPC(index, number, repeat, min_repeat_ms);
+        time_sec_per_op[index] += RunOpRPC(index, number, repeat, min_repeat_ms);
       }
     } else {
       for (int i = 0; i < repeat; ++i) {
@@ -71,7 +71,7 @@ class GraphRuntimeDebug : public GraphRuntime {
             tbegin, tend;
         double duration_ms = 0.0;
         do {
-          std::fill(time_per_op.begin(), time_per_op.end(), 0);
+          std::fill(time_sec_per_op.begin(), time_sec_per_op.end(), 0);
           if (duration_ms > 0.0) {
             number = static_cast<int>(std::max((min_repeat_ms / (duration_ms / number) + 1),
                                                number * 1.618));  // 1.618 is chosen by random
@@ -80,7 +80,7 @@ class GraphRuntimeDebug : public GraphRuntime {
           for (int k = 0; k < number; k++) {
             for (size_t index = 0; index < op_execs_.size(); ++index) {
               if (op_execs_[index]) {
-                time_per_op[index] += RunOpHost(index);
+                time_sec_per_op[index] += RunOpHost(index);
               }
             }
           }
@@ -92,19 +92,19 @@ class GraphRuntimeDebug : public GraphRuntime {
 
         LOG(INFO) << "Iteration: " << i;
         int op = 0;
-        for (size_t index = 0; index < time_per_op.size(); index++) {
+        for (size_t index = 0; index < time_sec_per_op.size(); index++) {
           if (op_execs_[index]) {
-            time_per_op[index] /= number;
-            LOG(INFO) << "Op #" << op++ << " " << GetNodeName(index) << ": " << time_per_op[index]
-                      << " us/iter";
+            time_sec_per_op[index] /= number;
+            LOG(INFO) << "Op #" << op++ << " " << GetNodeName(index) << ": "
+                      << time_sec_per_op[index] * 1e6 << " us/iter";
           }
         }
       }
     }
 
     std::ostringstream os;
-    for (size_t index = 0; index < time_per_op.size(); index++) {
-      os << time_per_op[index] << ",";
+    for (size_t index = 0; index < time_sec_per_op.size(); index++) {
+      os << time_sec_per_op[index] << ",";
     }
     return os.str();
   }
diff --git a/src/support/ffi_testing.cc b/src/support/ffi_testing.cc
index 839f52968b82..b06a8bb461be 100644
--- a/src/support/ffi_testing.cc
+++ b/src/support/ffi_testing.cc
@@ -23,6 +23,7 @@
  */
 #include <tvm/ir/attrs.h>
 #include <tvm/ir/env_func.h>
+#include <tvm/runtime/module.h>
 #include <tvm/runtime/registry.h>
 #include <tvm/te/tensor.h>
 #include <tvm/tir/expr.h>
@@ -99,4 +100,45 @@ TVM_REGISTER_GLOBAL("testing.object_use_count").set_body([](TVMArgs args, TVMRet
   // and get another value.
   *ret = (obj.use_count() - 1);
 });
+
+class FrontendTestModuleNode : public runtime::ModuleNode {
+ public:
+  virtual const char* type_key() const { return "frontend_test"; }
+
+  static constexpr const char* kAddFunctionName = "__add_function";
+
+  virtual PackedFunc GetFunction(const std::string& name, const ObjectPtr<Object>& sptr_to_self);
+
+ private:
+  std::unordered_map<std::string, PackedFunc> functions_;
+};
+
+constexpr const char* FrontendTestModuleNode::kAddFunctionName;
+
+PackedFunc FrontendTestModuleNode::GetFunction(const std::string& name,
+                                               const ObjectPtr<Object>& sptr_to_self) {
+  if (name == kAddFunctionName) {
+    return TypedPackedFunc<void(std::string, PackedFunc)>(
+        [this, sptr_to_self](std::string func_name, PackedFunc pf) {
+          CHECK_NE(func_name, kAddFunctionName)
+              << "func_name: cannot be special function " << kAddFunctionName;
+          functions_[func_name] = pf;
+        });
+  }
+
+  auto it = functions_.find(name);
+  if (it == functions_.end()) {
+    return PackedFunc();
+  }
+
+  return it->second;
+}
+
+runtime::Module NewFrontendTestModule() {
+  auto n = make_object<FrontendTestModuleNode>();
+  return runtime::Module(n);
+}
+
+TVM_REGISTER_GLOBAL("testing.FrontendTestModule").set_body_typed(NewFrontendTestModule);
+
 }  // namespace tvm
diff --git a/tests/python/unittest/test_runtime_graph_debug.py b/tests/python/unittest/test_runtime_graph_debug.py
index 8aeaf1a1a23b..996d426efaa9 100644
--- a/tests/python/unittest/test_runtime_graph_debug.py
+++ b/tests/python/unittest/test_runtime_graph_debug.py
@@ -16,13 +16,19 @@
 # under the License.
 import json
 import os
+import re
+import sys
+import time
+
+import pytest
+
 import tvm
 import tvm.testing
 from tvm import te
 import numpy as np
 from tvm import rpc
 from tvm.contrib import utils
-from tvm.contrib.debugger import debug_runtime as graph_runtime
+from tvm.contrib.debugger import debug_runtime
 
 
 @tvm.testing.requires_llvm
@@ -60,8 +66,16 @@ def test_graph_simple():
 
     def check_verify():
         mlib = tvm.build(s, [A, B], "llvm", name="myadd")
+
+        def myadd(*args):
+            to_return = mlib["myadd"](*args)
+            time.sleep(0.25)
+            return to_return
+
+        mlib_proxy = tvm.support.FrontendTestModule()
+        mlib_proxy["myadd"] = myadd
         try:
-            mod = graph_runtime.create(graph, mlib, tvm.cpu(0))
+            mod = debug_runtime.create(graph, mlib_proxy, tvm.cpu(0))
         except ValueError:
             return
 
@@ -92,6 +106,36 @@ def check_verify():
         # Verify the tensors are dumped
         assert len(os.listdir(directory)) > 1
 
+        debug_lines = mod.debug_datum.get_debug_result().split("\n")
+
+        def split_debug_line(i):
+            to_return = re.split(r"  [ ]*", debug_lines[i])
+            assert to_return[-1] == ""
+            to_return = to_return[:-1]  # strip empty trailing part
+            return to_return
+
+        assert split_debug_line(0) == [
+            "Node Name",
+            "Ops",
+            "Time(us)",
+            "Time(%)",
+            "Shape",
+            "Inputs",
+            "Outputs",
+        ]
+        myadd_lines = split_debug_line(2)
+        assert myadd_lines[0] == "add"
+        assert myadd_lines[1] == "myadd"
+        runtime_sec = float(myadd_lines[2]) / 1e6  # printed in us
+
+        # Ensure runtime is at least the sleep time and less than a unit prefix order of magnitude.
+        # Here we just care that the prefix is correct.
+        assert runtime_sec > 0.25 and runtime_sec < 0.25 * 1000
+
+        total_lines = split_debug_line(3)
+        assert total_lines[0] == "Total_time"
+        assert total_lines[2] == myadd_lines[2]
+
         CHROME_TRACE_FILE_NAME = "_tvmdbg_execution_trace.json"
         assert os.path.exists(os.path.join(directory, CHROME_TRACE_FILE_NAME))
 
@@ -127,9 +171,9 @@ def check_remote():
         remote.upload(path_dso)
         mlib = remote.load_module("dev_lib.so")
         try:
-            mod = graph_runtime.create(graph, mlib, remote.cpu(0))
+            mod = debug_runtime.create(graph, mlib, remote.cpu(0))
         except ValueError:
-            print("Skip because debug graph_runtime not enabled")
+            print("Skip because debug runtime not enabled")
             return
         a = np.random.uniform(size=(n,)).astype(A.dtype)
         mod.run(x=tvm.nd.array(a, ctx))
@@ -142,4 +186,4 @@ def check_remote():
 
 
 if __name__ == "__main__":
-    test_graph_simple()
+    sys.exit(pytest.main([__file__] + sys.argv[1:]))

From 4e8cc4fc26e931e38017d198d29f45cba04f5a60 Mon Sep 17 00:00:00 2001
From: Matthew Brookhart <mbrookhart@octoml.ai>
Date: Thu, 7 Jan 2021 21:37:33 -0700
Subject: [PATCH 035/357] Fix Get Valid Counts when the number of boxes is zero
 (#7229)

---
 python/tvm/topi/cuda/nms.py | 172 +++++++++++++++++++-----------------
 1 file changed, 89 insertions(+), 83 deletions(-)

diff --git a/python/tvm/topi/cuda/nms.py b/python/tvm/topi/cuda/nms.py
index 2dc177a0fae8..8946446f3cdc 100644
--- a/python/tvm/topi/cuda/nms.py
+++ b/python/tvm/topi/cuda/nms.py
@@ -151,98 +151,104 @@ def get_valid_indices_ir(valid_boxes, valid_count, valid_indices):
     valid_indices = ib.buffer_ptr(valid_indices)
 
     max_threads = int(tvm.target.Target.current(allow_none=False).max_num_threads)
-
-    # Copy boxes to valid_indices
-    with ib.new_scope():
-        nthread_tx = max_threads
-        nthread_bx = ceil_div(num_anchors, max_threads)
-        nthread_by = batch_size
-        tx = te.thread_axis("threadIdx.x")
-        bx = te.thread_axis("blockIdx.x")
-        by = te.thread_axis("blockIdx.y")
-        ib.scope_attr(tx, "thread_extent", nthread_tx)
-        ib.scope_attr(bx, "thread_extent", nthread_bx)
-        ib.scope_attr(by, "thread_extent", nthread_by)
-        tid = bx * nthread_tx + tx
-        with ib.if_scope(tid < num_anchors):
-            valid_indices[by, tid] = valid_boxes[by, tid]
-
-    nthread_tx = max_threads
-    nthread_bx = ceil_div(num_anchors, max_threads)
-    nthread_by = batch_size
-
-    ## The following algorithm performs parallel exclusive scan to get
-    ## a tensor that can later be used to select valid indices
-    # Up Sweep of exclusive scan
-    lim = tvm.tir.generic.cast(
-        tvm.tir.ceil(tvm.tir.log2(tvm.tir.generic.cast(num_anchors, "float64"))), "int64"
-    )
-    with ib.for_range(0, lim, dtype="int64") as l2_width:
-        width = 2 << l2_width
-
+    with ib.if_scope(num_anchors > 0):
+        # Copy boxes to valid_indices
         with ib.new_scope():
+            nthread_tx = max_threads
+            nthread_bx = ceil_div(num_anchors, max_threads)
+            nthread_by = batch_size
             tx = te.thread_axis("threadIdx.x")
             bx = te.thread_axis("blockIdx.x")
-            ib.scope_attr(tx, "thread_extent", nthread_tx)
-            ib.scope_attr(
-                bx,
-                "thread_extent",
-                tvm.tir.generic.cast(ceil_div(num_anchors, max_threads * width), "int32"),
-            )
-            tid = bx * nthread_tx + tx
-
             by = te.thread_axis("blockIdx.y")
+            ib.scope_attr(tx, "thread_extent", nthread_tx)
+            ib.scope_attr(bx, "thread_extent", nthread_bx)
             ib.scope_attr(by, "thread_extent", nthread_by)
-            start = ib.allocate("int64", (1,), name="start", scope="local")
-            middle = ib.allocate("int64", (1,), name="middle", scope="local")
-            end = ib.allocate("int64", (1,), name="end", scope="local")
-            start[0] = width * tid
-            with ib.if_scope(start[0] < num_anchors):
-                middle[0] = start[0] + tvm.tir.indexdiv(width, 2)
-                end[0] = tvm.te.min(start[0] + width, num_anchors)
-                with ib.if_scope(middle[0] < num_anchors):
-                    valid_indices[by * num_anchors + end[0] - 1] += valid_indices[
-                        by * num_anchors + middle[0] - 1
-                    ]
-
-    # Down Sweep of exclusive scan
-    with ib.new_scope():
-        bx = te.thread_axis("blockIdx.x")
-        ib.scope_attr(bx, "thread_extent", batch_size)
-        with ib.if_scope(bx < batch_size):
-            valid_count[bx] = valid_indices[(bx + 1) * num_anchors - 1]
-            valid_indices[(bx + 1) * num_anchors - 1] = 0
+            tid = bx * nthread_tx + tx
+            with ib.if_scope(tid < num_anchors):
+                valid_indices[by, tid] = valid_boxes[by, tid]
 
-    with ib.for_range(0, lim, dtype="int64") as l2_width:
-        width = 2 << (lim - l2_width - 1)
+        nthread_tx = max_threads
+        nthread_bx = ceil_div(num_anchors, max_threads)
+        nthread_by = batch_size
 
+        ## The following algorithm performs parallel exclusive scan to get
+        ## a tensor that can later be used to select valid indices
+        # Up Sweep of exclusive scan
+        lim = tvm.tir.generic.cast(
+            tvm.tir.ceil(tvm.tir.log2(tvm.tir.generic.cast(num_anchors, "float64"))), "int64"
+        )
+        with ib.for_range(0, lim, dtype="int64") as l2_width:
+            width = 2 << l2_width
+
+            with ib.new_scope():
+                tx = te.thread_axis("threadIdx.x")
+                bx = te.thread_axis("blockIdx.x")
+                ib.scope_attr(tx, "thread_extent", nthread_tx)
+                ib.scope_attr(
+                    bx,
+                    "thread_extent",
+                    tvm.tir.generic.cast(ceil_div(num_anchors, max_threads * width), "int32"),
+                )
+                tid = bx * nthread_tx + tx
+
+                by = te.thread_axis("blockIdx.y")
+                ib.scope_attr(by, "thread_extent", nthread_by)
+                start = ib.allocate("int64", (1,), name="start", scope="local")
+                middle = ib.allocate("int64", (1,), name="middle", scope="local")
+                end = ib.allocate("int64", (1,), name="end", scope="local")
+                start[0] = width * tid
+                with ib.if_scope(start[0] < num_anchors):
+                    middle[0] = start[0] + tvm.tir.indexdiv(width, 2)
+                    end[0] = tvm.te.min(start[0] + width, num_anchors)
+                    with ib.if_scope(middle[0] < num_anchors):
+                        valid_indices[by * num_anchors + end[0] - 1] += valid_indices[
+                            by * num_anchors + middle[0] - 1
+                        ]
+
+        # Down Sweep of exclusive scan
         with ib.new_scope():
-            tx = te.thread_axis("threadIdx.x")
             bx = te.thread_axis("blockIdx.x")
-            ib.scope_attr(tx, "thread_extent", nthread_tx)
-            ib.scope_attr(
-                bx,
-                "thread_extent",
-                tvm.tir.generic.cast(ceil_div(num_anchors, max_threads * width), "int32"),
-            )
-            tid = bx * nthread_tx + tx
-
-            by = te.thread_axis("blockIdx.y")
-            ib.scope_attr(by, "thread_extent", nthread_by)
-            start = ib.allocate("int64", (1,), name="start", scope="local")
-            middle = ib.allocate("int64", (1,), name="middle", scope="local")
-            end = ib.allocate("int64", (1,), name="end", scope="local")
-            tmp = ib.allocate("int32", (1,), name="end", scope="local")
-            start[0] = width * tid
-            with ib.if_scope(tvm.tir.all(start[0] < num_anchors)):
-                middle[0] = start[0] + tvm.tir.indexdiv(width, 2)
-                end[0] = tvm.tir.min(start[0] + width, num_anchors)
-                with ib.if_scope(middle[0] < num_anchors):
-                    tmp[0] = valid_indices[by * num_anchors + middle[0] - 1]
-                    valid_indices[by * num_anchors + middle[0] - 1] = valid_indices[
-                        by * num_anchors + end[0] - 1
-                    ]
-                    valid_indices[by * num_anchors + end[0] - 1] += tmp[0]
+            ib.scope_attr(bx, "thread_extent", batch_size)
+            with ib.if_scope(bx < batch_size):
+                valid_count[bx] = valid_indices[(bx + 1) * num_anchors - 1]
+                valid_indices[(bx + 1) * num_anchors - 1] = 0
+
+        with ib.for_range(0, lim, dtype="int64") as l2_width:
+            width = 2 << (lim - l2_width - 1)
+
+            with ib.new_scope():
+                tx = te.thread_axis("threadIdx.x")
+                bx = te.thread_axis("blockIdx.x")
+                ib.scope_attr(tx, "thread_extent", nthread_tx)
+                ib.scope_attr(
+                    bx,
+                    "thread_extent",
+                    tvm.tir.generic.cast(ceil_div(num_anchors, max_threads * width), "int32"),
+                )
+                tid = bx * nthread_tx + tx
+
+                by = te.thread_axis("blockIdx.y")
+                ib.scope_attr(by, "thread_extent", nthread_by)
+                start = ib.allocate("int64", (1,), name="start", scope="local")
+                middle = ib.allocate("int64", (1,), name="middle", scope="local")
+                end = ib.allocate("int64", (1,), name="end", scope="local")
+                tmp = ib.allocate("int32", (1,), name="end", scope="local")
+                start[0] = width * tid
+                with ib.if_scope(tvm.tir.all(start[0] < num_anchors)):
+                    middle[0] = start[0] + tvm.tir.indexdiv(width, 2)
+                    end[0] = tvm.tir.min(start[0] + width, num_anchors)
+                    with ib.if_scope(middle[0] < num_anchors):
+                        tmp[0] = valid_indices[by * num_anchors + middle[0] - 1]
+                        valid_indices[by * num_anchors + middle[0] - 1] = valid_indices[
+                            by * num_anchors + end[0] - 1
+                        ]
+                        valid_indices[by * num_anchors + end[0] - 1] += tmp[0]
+    with ib.else_scope():
+        with ib.new_scope():
+            bx = te.thread_axis("blockIdx.x")
+            ib.scope_attr(bx, "thread_extent", batch_size)
+            with ib.if_scope(bx < batch_size):
+                valid_count[bx] = 0
 
     return ib.get()
 

From 29da763ed373983982e11246a5882e0fab89a5b3 Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Fri, 8 Jan 2021 08:17:38 -0500
Subject: [PATCH 036/357] [CI] make sure submodule checkout in clean state
 (#7228)

---
 Jenkinsfile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Jenkinsfile b/Jenkinsfile
index 81439e95be16..67a41cd51430 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -80,7 +80,7 @@ def init_git() {
   checkout scm
   retry(5) {
     timeout(time: 2, unit: 'MINUTES') {
-      sh 'git submodule update --init'
+      sh 'git submodule update --init -f'
     }
   }
 }
@@ -89,7 +89,7 @@ def init_git_win() {
     checkout scm
     retry(5) {
         timeout(time: 2, unit: 'MINUTES') {
-            bat 'git submodule update --init'
+            bat 'git submodule update --init -f'
         }
     }
 }

From 54c995dbf7c96c1184c2baf64de87bc9566fe33a Mon Sep 17 00:00:00 2001
From: Cody Yu <comaniac0422@gmail.com>
Date: Fri, 8 Jan 2021 09:31:10 -0800
Subject: [PATCH 037/357] [AutoScheduler] Do not return naive schedule in
 tracing mode (#7226)

* [AutoScheduler] Do not return naive schedule in tracing mode

* lint

* fix
---
 python/tvm/auto_scheduler/relay_integration.py | 13 ++++++-------
 python/tvm/relay/op/strategy/generic.py        |  2 +-
 2 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/python/tvm/auto_scheduler/relay_integration.py b/python/tvm/auto_scheduler/relay_integration.py
index ea1a8cc39373..fb60da19fe44 100644
--- a/python/tvm/auto_scheduler/relay_integration.py
+++ b/python/tvm/auto_scheduler/relay_integration.py
@@ -27,7 +27,7 @@
 import threading
 
 import tvm
-from tvm import autotvm, te, transform
+from tvm import autotvm, transform
 from tvm.ir.transform import PassContext
 from tvm.runtime import convert_to_object
 from tvm.te.tensor import ComputeOp, PlaceholderOp, Tensor
@@ -267,7 +267,7 @@ def auto_schedule_topi(outs):
     -------
     sch: Optional[te.Schedule]
         A tuned schedule or none (if not tuned) in the final build mode;
-        An initial schdule in the tracing mode.
+        None in the tracing mode so that the fallback topi schedule will be used.
     """
     # pylint: disable=import-outside-toplevel
 
@@ -282,7 +282,6 @@ def auto_schedule_topi(outs):
         return None
 
     key = register_workload_tensors(dag.hash_key(), io_tensors)
-
     target = tvm.target.Target.current()
 
     env = TracingEnvironment.current
@@ -293,11 +292,12 @@ def auto_schedule_topi(outs):
             return None
 
         schedule, _ = dag.apply_steps_from_state(state)
-    elif env.tracing_mode in [TracingMode.EXTRACT_TASK, TracingMode.EXTRACT_COMPLEX_TASK_ONLY]:
+        return schedule
+
+    if env.tracing_mode in [TracingMode.EXTRACT_TASK, TracingMode.EXTRACT_COMPLEX_TASK_ONLY]:
         # in the task extraction mode
         if has_complex_op or env.tracing_mode == TracingMode.EXTRACT_TASK:
             env.add_workload_key(key)
-        schedule = te.create_schedule([x.op for x in outs])
     elif env.tracing_mode == TracingMode.PREPARE_LAYOUT_REWRITE:
         # in prepare_layout_rewrite mode
         if (
@@ -315,11 +315,10 @@ def auto_schedule_topi(outs):
             new_key = json.dumps((new_dag.hash_key(),))
             if new_key != key:
                 dispatch_ctx.update(target, new_key, state)
-        return te.create_schedule([x.op for x in outs])
     else:
         raise ValueError("Invalid tracing mode: " + env.tracing_mode)
 
-    return schedule
+    return None
 
 
 def tensor_no_check_call(self, *indices):
diff --git a/python/tvm/relay/op/strategy/generic.py b/python/tvm/relay/op/strategy/generic.py
index ea572ba05cd1..9599e727b62b 100644
--- a/python/tvm/relay/op/strategy/generic.py
+++ b/python/tvm/relay/op/strategy/generic.py
@@ -35,7 +35,7 @@ def naive_schedule(_, outs, target):
     if "gpu" in target.keys:
         # For GPU, we at least need thread binding to make a valid schedule.
         # So the naive schedule cannot be compiled.
-        raise RuntimeError(
+        logger.debug(
             "Cannot compile for GPU targets if no tuned schedule is found. "
             "Please see the warning messages above for more information about the failed workloads."
         )

From 701bcc2c5faf0d8f589478000a92d3ad577e8df6 Mon Sep 17 00:00:00 2001
From: Tristan Konolige <tristan.konolige@gmail.com>
Date: Fri, 8 Jan 2021 18:00:24 -0800
Subject: [PATCH 038/357] [RELAY,TOPI] Threefry PRNG: splittable and stateless
 (#7083)

* [RELAY,TOPI] Threefry PRNG: splittable and stateless

* Fix sphinx?

* Lint fixes

* sphinx fixes round 2

* fix inputs for tests

* reorganize to random, fix uninitialized memory bug

* silence linter

* silence linter even further

* s

* strengthen Threefry key type checking, add tests

* replace static variable with function for Threefry key type

* lint fix

* Remove old todos, improve assert messages

* describe how random number is generated

* add tests for incorrect output size. also vary test sizes

Co-authored-by: Altan Haan <ahaan@octoml.ai>
---
 include/tvm/relay/attrs/random.h           |  42 +++
 python/tvm/relay/__init__.py               |   1 +
 python/tvm/relay/op/__init__.py            |   1 +
 python/tvm/relay/op/algorithm.py           |   2 +-
 python/tvm/relay/op/op_attrs.py            |   5 +
 python/tvm/relay/op/random/__init__.py     |  20 +
 python/tvm/relay/op/random/_kernel.py      |  29 ++
 python/tvm/relay/op/random/_make.py        |  20 +
 python/tvm/relay/op/random/kernel.py       | 134 +++++++
 python/tvm/relay/op/strategy/generic.py    |  44 +++
 python/tvm/topi/__init__.py                |   1 +
 python/tvm/topi/random/__init__.py         |  22 ++
 python/tvm/topi/random/kernel.py           | 410 +++++++++++++++++++++
 src/relay/op/random/kernel.cc              |  89 +++++
 tests/python/relay/test_prng.py            | 142 +++++++
 tests/python/topi/python/test_topi_prng.py | 116 ++++++
 16 files changed, 1077 insertions(+), 1 deletion(-)
 create mode 100644 include/tvm/relay/attrs/random.h
 create mode 100644 python/tvm/relay/op/random/__init__.py
 create mode 100644 python/tvm/relay/op/random/_kernel.py
 create mode 100644 python/tvm/relay/op/random/_make.py
 create mode 100644 python/tvm/relay/op/random/kernel.py
 create mode 100644 python/tvm/topi/random/__init__.py
 create mode 100644 python/tvm/topi/random/kernel.py
 create mode 100644 src/relay/op/random/kernel.cc
 create mode 100644 tests/python/relay/test_prng.py
 create mode 100644 tests/python/topi/python/test_topi_prng.py

diff --git a/include/tvm/relay/attrs/random.h b/include/tvm/relay/attrs/random.h
new file mode 100644
index 000000000000..8238f102dab8
--- /dev/null
+++ b/include/tvm/relay/attrs/random.h
@@ -0,0 +1,42 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file tvm/relay/attrs/vision.h
+ * \brief Auxiliary attributes for random operators.
+ */
+#ifndef TVM_RELAY_ATTRS_RANDOM_H_
+#define TVM_RELAY_ATTRS_RANDOM_H_
+
+#include <tvm/ir/attrs.h>
+
+namespace tvm {
+namespace relay {
+
+struct ThreefryGenerateAttrs : public tvm::AttrsNode<ThreefryGenerateAttrs> {
+  Array<Integer> out_shape;
+
+  TVM_DECLARE_ATTRS(ThreefryGenerateAttrs, "relay.attrs.ThreefryGenerateAttrs") {
+    TVM_ATTR_FIELD(out_shape).describe("Shape of random numbers to generate");
+  }
+};
+
+}  // namespace relay
+}  // namespace tvm
+#endif  // TVM_RELAY_ATTRS_RANDOM_H_
diff --git a/python/tvm/relay/__init__.py b/python/tvm/relay/__init__.py
index cd96ecc7ee33..97f6d1cb60c0 100644
--- a/python/tvm/relay/__init__.py
+++ b/python/tvm/relay/__init__.py
@@ -45,6 +45,7 @@
 from .op import vision
 from .op import contrib
 from .op import dyn
+from .op import random
 from .op.reduce import *
 from .op.tensor import *
 from .op.transform import *
diff --git a/python/tvm/relay/op/__init__.py b/python/tvm/relay/op/__init__.py
index f6afa443d280..1f267abedc1a 100644
--- a/python/tvm/relay/op/__init__.py
+++ b/python/tvm/relay/op/__init__.py
@@ -43,6 +43,7 @@
 from . import image
 from . import vision
 from . import op_attrs
+from . import random
 
 
 # operator registry
diff --git a/python/tvm/relay/op/algorithm.py b/python/tvm/relay/op/algorithm.py
index 99140fcb3e11..6fd5c0645eed 100644
--- a/python/tvm/relay/op/algorithm.py
+++ b/python/tvm/relay/op/algorithm.py
@@ -17,9 +17,9 @@
 """Classic algorithm operation"""
 from __future__ import absolute_import as _abs
 
+from ..expr import Constant, Expr, TupleWrapper
 from . import _make
 from .dyn import _make as _dyn_make
-from ..expr import TupleWrapper, Expr, Constant
 
 
 def sort(data, axis=-1, is_ascend=1):
diff --git a/python/tvm/relay/op/op_attrs.py b/python/tvm/relay/op/op_attrs.py
index cb837b192a6c..41076817b374 100644
--- a/python/tvm/relay/op/op_attrs.py
+++ b/python/tvm/relay/op/op_attrs.py
@@ -552,3 +552,8 @@ class SpaceToBatchNDAttrs(Attrs):
 @tvm._ffi.register_object("relay.attrs.BatchToSpaceNDAttrs")
 class BatchToSpaceNDAttrs(Attrs):
     """Attributes used in BatchToSpaceNDAttrs operators"""
+
+
+@tvm._ffi.register_object("relay.attrs.ThreefryGenerateAttrs")
+class ThreefryGenerateAttrs(Attrs):
+    """Attributes used in ThreefryGenerateAttrs operators"""
diff --git a/python/tvm/relay/op/random/__init__.py b/python/tvm/relay/op/random/__init__.py
new file mode 100644
index 000000000000..8366f4a06dac
--- /dev/null
+++ b/python/tvm/relay/op/random/__init__.py
@@ -0,0 +1,20 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=wildcard-import
+"""PRNG related operators."""
+from .kernel import *
+from . import _kernel
diff --git a/python/tvm/relay/op/random/_kernel.py b/python/tvm/relay/op/random/_kernel.py
new file mode 100644
index 000000000000..8be3397008d5
--- /dev/null
+++ b/python/tvm/relay/op/random/_kernel.py
@@ -0,0 +1,29 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Splittable and parallelizable PRNG kernels."""
+# pylint: disable=invalid-name,unused-argument
+from __future__ import absolute_import
+
+from .. import strategy
+from ..op import register_strategy, register_pattern, OpPattern
+
+
+# Threefry
+register_strategy("random.threefry_generate", strategy.threefry_generate_strategy)
+register_pattern("random.threefry_generate", OpPattern.OPAQUE)
+register_strategy("random.threefry_split", strategy.threefry_split_strategy)
+register_pattern("random.threefry_split", OpPattern.OPAQUE)
diff --git a/python/tvm/relay/op/random/_make.py b/python/tvm/relay/op/random/_make.py
new file mode 100644
index 000000000000..51a8a6aa9339
--- /dev/null
+++ b/python/tvm/relay/op/random/_make.py
@@ -0,0 +1,20 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Constructor APIs"""
+import tvm._ffi
+
+tvm._ffi._init_api("relay.op.random._make", __name__)
diff --git a/python/tvm/relay/op/random/kernel.py b/python/tvm/relay/op/random/kernel.py
new file mode 100644
index 000000000000..96634943128d
--- /dev/null
+++ b/python/tvm/relay/op/random/kernel.py
@@ -0,0 +1,134 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Splittable and parallelizable PRNG kernels."""
+# pylint: disable=invalid-name,unused-argument
+from __future__ import absolute_import
+
+import sys
+import numpy as np
+
+from ...expr import Constant
+from .... import nd
+from . import _make
+
+
+def threefry_key(seed):
+    """Create a new Threefry random number generator key.
+
+    Example
+    -------
+
+    .. code-block:: python
+
+        gen = threefry_key(0)
+        _, random_number = threefry_generate(gen, (4,))
+
+    Parameters
+    ----------
+    seed : int
+        Starting seed for the key
+
+    Returns
+    -------
+    key : relay.Expr
+        New key to pass to future uses of :py:func:`threefry_split` or
+        :py:func:`threefry_generate`.
+    """
+    s = np.frombuffer(seed.to_bytes(32, sys.byteorder), dtype="uint64")
+    a = np.concatenate((s, np.array([0, 0, 0, 0, 1 << 63, 0], dtype="uint64")))
+    return Constant(nd.array(a))
+
+
+def threefry_generate(key, shape):
+    """Generate an array of random bits (`uint64`) using the Threefry algorithm
+
+    Example
+    -------
+
+    .. code-block:: python
+
+        key = threefry_key(0)
+        new_key, random1 = threefry_generate(key, (4,))
+        _, random2 = threefry_generate(new_key, (4,))
+        # random1 and random2 are different random numbers
+
+    Parameters
+    ----------
+    key : relay.Expr
+        key that uniquely determines the random values. Multiple uses with the
+        same key will generate the same random values. This key should be
+        treated as an opaque pointer. You can create one from calling
+        :py:func:`threefry_key`, :py:func:`threefry_split`, or
+        :py:func:`threefry_generate`. **Do not use this key again after calling
+        this function.**
+
+    shape : Sequence[int]
+        Desired outputs shape of random numbers. **Currently the total
+        number of elements must be a multiple of 4.**
+
+    Returns
+    -------
+    new_key : relay.Expr
+        New key to pass to future uses of :py:func:`threefry_split` or
+        :py:func:`threefry_generate`.
+
+    random_array : relay.Expr
+        Array of random numbers. Has shape `shape`.
+    """
+    return _make.threefry_generate(key, shape)
+
+
+def threefry_split(key):
+    """Split an existing Threefry key into two new ones.
+
+    This is useful if you have to subsequent calls which each need their own
+    independent random number generation.
+
+    Example
+    -------
+
+    .. code-block:: python
+
+        def foo(key):
+            new_key, num = threefry_generate(key, (4,))
+            return num
+
+        key = threefry_key(0)
+        key1, key2 = threefry_split(key)
+        assert foo(key1) != foo(key2)
+
+    Parameters
+    ----------
+    key : relay.Expr
+        key that uniquely determines the random values. Multiple uses with the
+        same generator will generate the same random values. This generator should be
+        treated as an opaque pointer. You can create one from calling
+        :py:func:`threefry_key`, :py:func:`threefry_split`, or
+        :py:func:`threefry_generate`. **Do not use this generator again after calling
+        this function.**
+
+    Returns
+    -------
+    new_key_1 : relay.Expr
+        New key to pass to future uses of :py:func:`threefry_split` or
+        :py:func:`threefry_generate`.
+
+    new_key_2 : relay.Expr
+        New key to pass to future uses of :py:func:`threefry_split` or
+        :py:func:`threefry_generate`.
+    """
+    return _make.threefry_split(key)
diff --git a/python/tvm/relay/op/strategy/generic.py b/python/tvm/relay/op/strategy/generic.py
index 9599e727b62b..363832ef8b2f 100644
--- a/python/tvm/relay/op/strategy/generic.py
+++ b/python/tvm/relay/op/strategy/generic.py
@@ -1317,3 +1317,47 @@ def argwhere_strategy(attrs, inputs, out_type, target):
         name="argwhere.generic",
     )
     return strategy
+
+
+# threefry_generate
+def wrap_compute_threefry_generate(topi_compute):
+    """Wrap threefry_generate topi compute"""
+
+    def _compute_threefry_generate(attrs, inputs, _):
+        return topi_compute(inputs[0], attrs.out_shape)
+
+    return _compute_threefry_generate
+
+
+@override_native_generic_func("threefry_generate_strategy")
+def threefry_generate_strategy(attrs, inputs, out_type, target):
+    """threefry_generate generic strategy"""
+    strategy = _op.OpStrategy()
+    strategy.add_implementation(
+        wrap_compute_threefry_generate(topi.random.threefry_generate),
+        wrap_topi_schedule(topi.generic.schedule_extern),
+        name="threefry_generate.generic",
+    )
+    return strategy
+
+
+# threefry_split
+def wrap_compute_threefry_split(topi_compute):
+    """Wrap threefry_split topi compute"""
+
+    def _compute_threefry_split(attrs, inputs, _):
+        return topi_compute(inputs[0])
+
+    return _compute_threefry_split
+
+
+@override_native_generic_func("threefry_split_strategy")
+def threefry_split_strategy(attrs, inputs, out_type, target):
+    """threefry_split generic strategy"""
+    strategy = _op.OpStrategy()
+    strategy.add_implementation(
+        wrap_compute_threefry_split(topi.random.threefry_split),
+        wrap_topi_schedule(topi.generic.schedule_extern),
+        name="threefry_split.generic",
+    )
+    return strategy
diff --git a/python/tvm/topi/__init__.py b/python/tvm/topi/__init__.py
index 97951d941f64..cb94b5b86c9e 100644
--- a/python/tvm/topi/__init__.py
+++ b/python/tvm/topi/__init__.py
@@ -54,6 +54,7 @@
 from . import image
 from . import sparse
 from . import hls
+from . import random
 
 # error reporting
 from .utils import InvalidShapeError
diff --git a/python/tvm/topi/random/__init__.py b/python/tvm/topi/random/__init__.py
new file mode 100644
index 000000000000..ee8d1d6385b7
--- /dev/null
+++ b/python/tvm/topi/random/__init__.py
@@ -0,0 +1,22 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# pylint: disable=wildcard-import
+"""Pseudorandom generator kernels and operators."""
+from __future__ import absolute_import
+
+from .kernel import *
diff --git a/python/tvm/topi/random/kernel.py b/python/tvm/topi/random/kernel.py
new file mode 100644
index 000000000000..576fd9254a79
--- /dev/null
+++ b/python/tvm/topi/random/kernel.py
@@ -0,0 +1,410 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Pseudorandom number kernels."""
+import tvm
+import tvm.topi
+from ... import tir
+from ...tir import ir_builder
+
+
+# Threefry PRNG with splitting based on
+# - J. K. Salmon, M. A. Moraes, R. O. Dror and D. E. Shaw, "Parallel random numbers: As easy as 1,
+#   2, 3," SC '11: Proceedings of 2011 International Conference for High Performance Computing,
+#   Networking, Storage and Analysis, Seattle, WA, 2011, pp. 1-12, doi: 10.1145/2063384.2063405.
+# - Claessen, K. ; Palka, M. (2013) "Splittable Pseudorandom Number Generators using Cryptographic
+#   Hashing". Proceedings of Haskell Symposium 2013 pp. 47-58.  MLA
+# - Ferguson, Niels, et al. "The Skein hash function family." Submission to NIST (round 3) 7.7.5
+#   (2010): 3.
+
+
+# Threefry is a counter based PRNG: given a unique input, it generates a unique random number. As
+# there is no state to maintain, we can apply it to a sequence of numbers (0..N) to generate a
+# sequence of random numbers in parallel. In order to make the PRNG splittable (that is we can
+# generate a sequence of random numbers in one place, and another sequence in another), we add a
+# path and key in addition to the counter. The path allows us to encode a sequence of splits (a 0 in
+# the path indicates the left result of a split, a 1 indicates the right). To avoid continuously
+# growing the path, we can compress an existing path into the key portion of the generator by
+# hashing the current key, path, and counter to create the new key (this same technique is used if
+# we run out of room for the counter). They key is initialized with a unique initial state.
+#
+# Random numbers are generated by applying the Threefry hash to the current key, path, and counter.
+
+# This module use encoding e4 from the appendix of "Splittable Pseudorandom Number Generators using
+# Cryptographic Hashing" (confusingly, the definition in the paper uses e3 to define the encoding
+# function). This encoding uses a 10 element uint64 tensor where each byte means the following:
+
+# .. code-block:
+
+#     gen:
+#     words: 0 1 2 3 | 4 5  | 6 7     | 8 9
+#     usage: key     | path | counter | position of next step in path encoded in binary
+#                                       ex: 0b00010 -> next path entry goes one from the right
+
+# Right now, counter only uses the rightmost word.
+
+# Threefry rotation constants from the Skein paper ("The Skein Hash Function Family"
+# https://www.schneier.com/wp-content/uploads/2015/01/skein.pdf)
+_ROTATIONS = {
+    4: [[14, 16], [52, 57], [23, 40], [5, 37], [25, 33], [46, 12], [58, 22], [32, 32]],
+    8: [
+        [46, 36, 19, 37],
+        [33, 27, 14, 42],
+        [17, 49, 36, 39],
+        [44, 9, 54, 56],
+        [39, 30, 34, 24],
+        [13, 50, 10, 17],
+        [25, 29, 39, 43],
+        [8, 35, 56, 22],
+    ],
+    16: [
+        [24, 13, 8, 47, 8, 17, 22, 37],
+        [38, 19, 10, 55, 49, 18, 23, 52],
+        [33, 4, 51, 13, 34, 41, 59, 17],
+        [5, 20, 48, 41, 47, 28, 16, 25],
+        [41, 9, 37, 31, 12, 47, 44, 30],
+        [16, 34, 56, 51, 4, 53, 42, 41],
+        [31, 44, 47, 46, 19, 42, 44, 25],
+        [9, 48, 35, 52, 23, 31, 37, 20],
+    ],
+}
+
+# Threefry permutation constants from the Skein paper ("The Skein Hash Function Family"
+# https://www.schneier.com/wp-content/uploads/2015/01/skein.pdf)
+_PERMUTATIONS = {
+    4: [0, 3, 2, 1],
+    8: [2, 1, 4, 7, 6, 5, 0, 3],
+    16: [0, 9, 2, 13, 6, 11, 4, 15, 10, 7, 12, 3, 14, 5, 8, 1],
+}
+
+
+def _threefry(
+    irb, key_buf, key_offset, counter_buf, counter_offset, out_buf, out_offset, out_shape
+):
+    """IRBuilder code for running Threefry
+
+    Parameters
+    ----------
+    irb: IRBuilder
+        IRBuilder that this code will be generated for.
+
+    key_buf: BufferVar
+        Buffer to read the key from.
+
+    key_offset: number
+        Threefry will write to :code:`key_buf[key_offset:key_offset+4]`
+
+    counter_buf: BufferVar
+        Buffer to read the counter from.
+
+    counter_offset: number
+        Threefry will write to :code:`counter_buf[counter_offset:counter_offset+4]`
+
+    out_buf: BufferVar
+        Buffer to read the counter from.
+
+    out_offset: number
+        Threefry will write to :code:`out_buf[out_offset:out_offset+4*product(out_shape)]`
+
+    out_shape: number
+        Determines the number of ouput states to generate. :code:`state[i]` will correspond to
+        counter+i.
+    """
+    nrounds = 20
+    nwords = 4
+    iwidth = 64
+    assert nrounds % 4 == 0
+    assert nwords in [4, 8, 16]
+
+    # The paper has constants for 32 bit threefry, but we keep the implementation simple by only
+    # using 64-bit words.
+    assert key_buf.dtype == "uint64", "threefry only supports 64-bit keys"
+    assert key_buf.dtype == counter_buf.dtype, "threefry key and counter must be the same dtype"
+
+    def mix(a, b, rotation):
+        x = a + b  # TODO should be wrapping
+        y = x ^ ((b << rotation) | (b >> (iwidth - rotation)))
+        return [x, y]
+
+    # temporary buffer for holding the results of _PERMUTATIONS
+    tmp = irb.allocate(out_buf.dtype, out_shape, name="tmp", scope="global")
+    tmp_offset = 0
+
+    # Initialize entire key. It is composed of the original key with one
+    # element appended. The appended element is the xor of all key words plus a
+    # constant.
+    full_key = irb.allocate("uint64", nwords + 1, name="full_key", scope="global")
+    for i in range(nwords):
+        full_key[i] = key_buf[key_offset + i]
+    # initial key constant, full_key[nwords] is equivalent to k_{N_W} in the Skein paper.
+    full_key[nwords] = tvm.tir.const(0x1BD11BDAA9FC1A22, dtype="uint64")
+    for i in range(nwords):
+        full_key[nwords] ^= key_buf[key_offset + i]
+
+    with irb.for_range(0, out_shape, dtype="uint64", name="i") as i:
+        for j in range(nwords):
+            out_buf[out_offset + i * nwords + j] = counter_buf[counter_offset + j] + i
+
+    def key_schedule(s, i):
+        # Threefry uses no tweak, so the key schedule is simple
+        if i == nwords - 1:
+            return full_key[(s + i) % (nwords + 1)] + tvm.tir.const(s, dtype="uint64")
+        return full_key[(s + i) % (nwords + 1)]
+
+    with irb.for_range(0, out_shape, name="l") as l:  # pylint: disable=invalid-name
+        for i in range(nrounds // 4):
+            for j in range(nwords):
+                out_buf[out_offset + l * nwords + j] += key_schedule(i, j)  # TODO wrapping
+            for k in range(4):
+                for j in range(nwords // 2):
+                    (
+                        out_buf[out_offset + l * nwords + j * 2 + 0],
+                        out_buf[out_offset + l * nwords + j * 2 + 1],
+                    ) = mix(
+                        out_buf[out_offset + l * nwords + j * 2 + 0],
+                        out_buf[out_offset + l * nwords + j * 2 + 1],
+                        _ROTATIONS[nwords][(i * 4 + k) % 8][j],
+                    )
+                for j in range(nwords):
+                    tmp[tmp_offset + l * nwords + j] = out_buf[
+                        out_offset + l * nwords + _PERMUTATIONS[nwords][j]
+                    ]
+                # number of rounds is even, so out always contains the result
+                (out_buf, tmp) = (tmp, out_buf)
+                (out_offset, tmp_offset) = (tmp_offset, out_offset)
+
+
+def threefry_generate(gen, out_shape):
+    """Generate a series of random values
+
+    Notes
+    -----
+    This function uses the counter portion of the generator state to generate a series of random
+    numbers in parallel. Random number `i` is generated by applying Threefry to the current
+    generator state with the counter portion incremented by `i`. This means that each random number
+    is generated independently from each other random number, so we can compute them in parallel.
+
+    If there is not enough room left in the counter to generate the desired shape of random values,
+    then a new generator is created by applying Threefry to the current key, path, and counter.
+    This new generator will have a reset counter.
+
+    Parameters
+    ----------
+    gen : Tensor[10, uint64]
+        Generator state. Can be create with :py:func:`tvm.relay.threefry_key`. This should not be
+        reused in another function, otherwise random numbers will be repeated.
+
+    out_shape : Sequence[int]
+        Output shape of the random numbers. Product of all dimensions must be a multiple of 4.
+
+    Returns
+    -------
+    new_gen : Tensor[10, uint64]
+        The new generator state to be used in subsequent calls.
+
+    rand : Tensor[out_shape, uint64]
+        Tensor of random numbers with shape `out_shape`.
+    """
+    out_len = tir.const(1)
+    for s in out_shape:
+        out_len *= s
+    assert (
+        out_len.value % 4 == 0
+    ), f"Threefry can only generate arrays who's size is a multiple of 4 ({out_len} was provided)."
+    assert (
+        out_len.value <= 2 ** 64 - 1
+    ), f"Can only generate up to 2^64 random numbers, but {out_len} were requested."
+
+    def gen_ir(gen_ptr, out_gen_ptr, out_array_ptr):
+        irb = ir_builder.create()
+        gen = irb.buffer_ptr(gen_ptr)
+        out_gen = irb.buffer_ptr(out_gen_ptr)
+        out_array = irb.buffer_ptr(out_array_ptr)
+
+        # Create a temporary array to hold the generator state we will use to create the random
+        # numbers. We cannot use gen because we may need to update the key + path if there is not
+        # enough room in the counter.
+        tmp = irb.allocate(gen.dtype, 10, name="tmp", scope="global")
+
+        # TODO(tkonolige): for now we only use the last word of the counter for counting. It is too
+        # much work to figure out how to do 128 bit addition.
+
+        # Max value for counter should be 2**64-2 because we need to reserve a special value to
+        # indicate the counter is used up.
+        with irb.if_scope(gen[7] < tir.const(2 ** 64 - 1, dtype=gen.dtype) - out_len):
+            for i in range(10):
+                tmp[i] = gen[i]
+        with irb.else_scope():
+            # no room left in the counter, we have to change the path or key
+            with irb.if_scope(gen[8] == 0 and gen[9] == 0):
+                # out of room in the path, have to generate new key
+
+                # The paper says the counter that we will be hashing should be a special value of
+                # all ones. We need to allocate some space for it because we cannot overwrite gen.
+                tmp_counter = irb.allocate(gen.dtype, 2, name="tmp_counter", scope="global")
+                tmp_counter[0] = tir.const(0xFFFFFFFFFFFFFFFF, dtype=gen.dtype)
+                tmp_counter[1] = tir.const(0xFFFFFFFFFFFFFFFF, dtype=gen.dtype)
+                _threefry(irb, gen, 0, tmp_counter, 0, tmp, 0, 1)
+                tmp[4] = tir.const(0, dtype=gen.dtype)  # zero path, i.e. no path
+                tmp[5] = tir.const(0, dtype=gen.dtype)
+                tmp[6] = tir.const(0, dtype=gen.dtype)  # zero counter
+                tmp[7] = tir.const(0, dtype=gen.dtype)
+                tmp[8] = tir.const(1 << 63, dtype=gen.dtype)  # one in the leftmost position
+                tmp[9] = tir.const(0, dtype=gen.dtype)
+            with irb.else_scope():
+                tmp[0] = gen[0]
+                tmp[1] = gen[1]
+                tmp[2] = gen[2]
+                tmp[3] = gen[3]
+                tmp[4] = gen[4] | gen[8]  # add a 1 to the path
+                tmp[5] = gen[5] | gen[9]
+                tmp[6] = tir.const(0, dtype=gen.dtype)  # zero counter
+                tmp[7] = tir.const(0, dtype=gen.dtype)
+                _shift_right(irb, gen[8], gen[9], tmp, 8, tmp, 9)
+
+        # Compute random values
+        _threefry(irb, tmp, 0, tmp, 4, out_array, 0, out_len // 4)
+
+        # Update generator state
+        out_gen[0] = tmp[0]  # key stays the same
+        out_gen[1] = tmp[1]
+        out_gen[2] = tmp[2]
+        out_gen[3] = tmp[3]
+        out_gen[4] = tmp[4]  # path stays the same
+        out_gen[5] = tmp[5]
+        out_gen[6] = tir.const(0, dtype=gen.dtype)  # unused, leave it as 0
+        out_gen[7] = tmp[7] + tir.Cast(gen.dtype, out_len)  # increment counter
+        out_gen[8] = tmp[8]  # path unchanged, so no update here
+        out_gen[9] = tmp[9]
+
+        return irb.get()
+
+    out_gen = tvm.tir.decl_buffer((10,), name="out_gen", dtype="uint64")
+    out_array = tvm.tir.decl_buffer(out_shape, name="out_array", dtype="uint64")
+    return tvm.te.extern(
+        [out_gen.shape, out_array.shape],
+        [gen],
+        lambda ins, outs: gen_ir(ins[0], outs[0], outs[1]),
+        out_buffers=[out_gen, out_array],
+        name="threefry_generate",
+        tag="threefry_generate",
+    )
+
+
+def _shift_right(irb, a, b, out_a, a_off, out_b, b_off):
+    """Binary shift a 128bit number composed of two 64 bit words right by one."""
+    with irb.if_scope(a == 1):
+        out_a[a_off] = tir.const(0, dtype=a.dtype)
+        out_b[b_off] = tir.const(0x8000000000000000, dtype=a.dtype)
+    with irb.else_scope():
+        with irb.if_scope(a == 0):
+            out_a[a_off] = tir.const(0, dtype=a.dtype)
+            out_b[b_off] = b >> 1
+        with irb.else_scope():
+            out_a[a_off] = a >> 1
+            out_b[b_off] = tir.const(0, dtype=a.dtype)
+
+
+def threefry_split(gen):
+    """Split a single generator state into two new ones
+
+    Notes
+    -----
+    The new generator is created by appending a one (for the right output) or a zero (for the left
+    output) to the end of the path portion of the generator If there is no longer and room in the
+    path, then we create a new key portion of the generator by applying Threefry to the old state,
+    path, and counter. i.e. :code:`new_key = threefry(old_key, [old_path, old_counter])`. This
+    resets the path portion of the new generator.
+
+    Parameters
+    ----------
+    gen : Tensor[10, uint64]
+        Generator state. Can be create with :py:func:`tvm.relay.threefry_key`. This should not be
+        reused in another function, otherwise random numbers will be repeated.
+
+    Returns
+    -------
+    out_gen_left : Tensor[10, uint64]
+        New generator state that is distinct from `out_gen_right`.
+
+    out_gen_right : Tensor[10, uint64]
+        New generator state that is distinct from `out_gen_left`.
+    """
+
+    def gen_ir(gen_ptr, out_left_ptr, out_right_ptr):
+        irb = ir_builder.create()
+        gen = irb.buffer_ptr(gen_ptr)
+        out_left = irb.buffer_ptr(out_left_ptr)
+        out_right = irb.buffer_ptr(out_right_ptr)
+
+        with irb.if_scope(gen[8] == 0 and gen[9] == 0):
+            # Generate new key because we have run out of room to extend the path
+            _threefry(irb, gen, 0, gen, 4, out_left, 0, 1)
+            out_left[4] = tir.const(0, dtype=gen.dtype)
+            out_left[5] = tir.const(0, dtype=gen.dtype)
+            out_left[6] = tir.const(0, dtype=gen.dtype)  # counter gets zeroed
+            out_left[7] = tir.const(0, dtype=gen.dtype)  # counter gets zeroed
+            out_left[8] = tir.const(
+                1 << 62, dtype=gen.dtype
+            )  # one in the second from the leftmost position
+            out_left[9] = tir.const(0, dtype=gen.dtype)
+
+            out_right[0] = out_left[0]
+            out_right[1] = out_left[1]
+            out_right[2] = out_left[2]
+            out_right[3] = out_left[3]
+            out_right[4] = tir.const(1 << 63, dtype=gen.dtype)  # one in the leftmost position
+            out_right[5] = tir.const(0, dtype=gen.dtype)
+            out_right[6] = tir.const(0, dtype=gen.dtype)
+            out_right[7] = tir.const(0, dtype=gen.dtype)
+            out_right[8] = tir.const(
+                1 << 62, dtype=gen.dtype
+            )  # one in the second from the leftmost position
+            out_right[9] = tir.const(0, dtype=gen.dtype)
+        with irb.else_scope():
+            out_left[0] = gen[0]
+            out_left[1] = gen[1]
+            out_left[2] = gen[2]
+            out_left[3] = gen[3]
+            out_left[4] = gen[4]  # adding a zero here, but its already zero padded
+            out_left[5] = gen[5]
+            out_left[6] = gen[6]
+            out_left[7] = gen[7]
+            # move path position over one bit
+            _shift_right(irb, gen[8], gen[9], out_left, 8, out_left, 9)
+
+            out_right[0] = gen[0]
+            out_right[1] = gen[1]
+            out_right[2] = gen[2]
+            out_right[3] = gen[3]
+            out_right[4] = gen[4] | gen[8]  # add a one to the path
+            out_right[5] = gen[5] | gen[9]
+            out_right[6] = gen[6]
+            out_right[7] = gen[7]
+            _shift_right(irb, gen[8], gen[9], out_right, 8, out_right, 9)
+
+        return irb.get()
+
+    out_left = tvm.tir.decl_buffer((10,), name="out_left", dtype="uint64")
+    out_right = tvm.tir.decl_buffer((10,), name="out_right", dtype="uint64")
+    return tvm.te.extern(
+        [out_left.shape, out_right.shape],
+        [gen],
+        lambda ins, outs: gen_ir(ins[0], outs[0], outs[1]),
+        out_buffers=[out_left, out_right],
+        name="threefry_split",
+        tag="threefry_split",
+    )
diff --git a/src/relay/op/random/kernel.cc b/src/relay/op/random/kernel.cc
new file mode 100644
index 000000000000..ec092a7e05f2
--- /dev/null
+++ b/src/relay/op/random/kernel.cc
@@ -0,0 +1,89 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include <tvm/relay/attrs/random.h>
+#include <tvm/relay/op.h>
+
+namespace tvm {
+namespace relay {
+
+TVM_REGISTER_NODE_TYPE(ThreefryGenerateAttrs);
+
+static TensorType ThreefryKeyType() { return TensorType({10}, tvm::DataType::UInt(64)); }
+
+bool ThreefryGenerateRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
+                         const TypeReporter& reporter) {
+  const ThreefryGenerateAttrs* param = attrs.as<ThreefryGenerateAttrs>();
+  ICHECK_EQ(types.size(), 2) << "ThreefryGenerate should have one input and one output";
+
+  reporter->Assign(types[0], ThreefryKeyType());
+
+  std::vector<IndexExpr> oshape;
+  for (auto& x : param->out_shape) {
+    oshape.push_back(x);
+  }
+  // generate returns the next key and an array of random values
+  // TODO(@tkonolige, @altanh): support other output dtypes?
+  reporter->Assign(types[1],
+                   TupleType({ThreefryKeyType(), TensorType(oshape, tvm::DataType::UInt(64))}));
+  return true;
+}
+
+Expr MakeThreefryGenerate(Expr key, Array<Integer> out_shape) {
+  auto attrs = make_object<ThreefryGenerateAttrs>();
+  attrs->out_shape = out_shape;
+  static const Op& op = Op::Get("random.threefry_generate");
+  return Call(op, {key}, Attrs(attrs), {});
+}
+
+TVM_REGISTER_GLOBAL("relay.op.random._make.threefry_generate").set_body_typed(MakeThreefryGenerate);
+
+RELAY_REGISTER_OP("random.threefry_generate")
+    .describe(
+        R"doc(Generate an array of random numbers using the Threefry algorithm.)doc" TVM_ADD_FILELINE)
+    .set_num_inputs(1)
+    .set_attrs_type<ThreefryGenerateAttrs>()
+    .add_argument("key", "Tensor", "Input Threefry key")
+    .add_type_rel("ThreefryGenerate", ThreefryGenerateRel);
+
+bool ThreefrySplitRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
+                      const TypeReporter& reporter) {
+  ICHECK_EQ(types.size(), 2) << "ThreefrySplit should have one input and one output";
+
+  reporter->Assign(types[0], ThreefryKeyType());
+  reporter->Assign(types[1], TupleType({ThreefryKeyType(), ThreefryKeyType()}));
+
+  return true;
+}
+
+Expr MakeThreefrySplit(Expr key) {
+  static const Op& op = Op::Get("random.threefry_split");
+  return Call(op, {key}, Attrs(), {});
+}
+
+TVM_REGISTER_GLOBAL("relay.op.random._make.threefry_split").set_body_typed(MakeThreefrySplit);
+
+RELAY_REGISTER_OP("random.threefry_split")
+    .describe(R"doc(Split the input Threefry key into two new ones.)doc" TVM_ADD_FILELINE)
+    .set_num_inputs(1)
+    .add_argument("key", "Tensor", "Input Threefry key")
+    .add_type_rel("ThreefrySplit", ThreefrySplitRel);
+
+}  // namespace relay
+}  // namespace tvm
diff --git a/tests/python/relay/test_prng.py b/tests/python/relay/test_prng.py
new file mode 100644
index 000000000000..2109d3b30a82
--- /dev/null
+++ b/tests/python/relay/test_prng.py
@@ -0,0 +1,142 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+import pytest
+import tvm
+import tvm.relay
+import tvm.testing
+from tvm.relay.testing import run_infer_type
+
+
+@tvm.testing.parametrize_targets
+def test_threefry_repeatability(target, ctx):
+    target, ctx = "llvm", tvm.cpu(0)
+    key1 = tvm.relay.random.threefry_key(1)
+    rand1 = tvm.relay.random.threefry_generate(key1, (12,))
+    out_key1, out1 = tvm.relay.create_executor(
+        "vm", tvm.IRModule.from_expr(tvm.relay.Function([], rand1)), target=target, ctx=ctx
+    ).evaluate()()
+
+    key2 = tvm.relay.random.threefry_key(1)
+    rand2 = tvm.relay.random.threefry_generate(key2, (12,))
+    out_key2, out2 = tvm.relay.create_executor(
+        "vm", tvm.IRModule.from_expr(tvm.relay.Function([], rand2)), target=target, ctx=ctx
+    ).evaluate()()
+
+    assert (
+        out1.asnumpy() == out2.asnumpy()
+    ).all(), "Generate on same seed should have the same output random numbers"
+
+    assert (
+        out_key1.asnumpy() == out_key2.asnumpy()
+    ).all(), "Generate on same seed should have the same next keys"
+
+
+@tvm.testing.parametrize_targets
+def test_threefry_split(target, ctx):
+    key = tvm.relay.random.threefry_key(1)
+    left, right = tvm.relay.TupleWrapper(tvm.relay.random.threefry_split(key), 2)
+    _, rand1 = tvm.relay.TupleWrapper(tvm.relay.random.threefry_generate(left, (16,)), 2)
+    _, rand2 = tvm.relay.TupleWrapper(tvm.relay.random.threefry_generate(right, (16,)), 2)
+    out1, out2 = tvm.relay.create_executor(
+        "vm",
+        tvm.IRModule.from_expr(tvm.relay.Function([], tvm.relay.Tuple((rand1, rand2)))),
+        target=target,
+        ctx=ctx,
+    ).evaluate()()
+
+    assert (
+        out1.asnumpy() != out2.asnumpy()
+    ).any(), "Generate after split should not have the same output"
+
+
+@tvm.testing.parametrize_targets
+def test_threefry_sequential_generate(target, ctx):
+    key = tvm.relay.random.threefry_key(1)
+    key, rand1 = tvm.relay.TupleWrapper(tvm.relay.random.threefry_generate(key, (4,)), 2)
+    _, rand2 = tvm.relay.TupleWrapper(tvm.relay.random.threefry_generate(key, (4,)), 2)
+    out1, out2 = tvm.relay.create_executor(
+        "vm",
+        tvm.IRModule.from_expr(tvm.relay.Function([], tvm.relay.Tuple((rand1, rand2)))),
+        target=target,
+        ctx=ctx,
+    ).evaluate()()
+
+    assert (
+        out1.asnumpy() != out2.asnumpy()
+    ).any(), "Sequential generates should not have the same output"
+
+
+def test_threefry_generate_infer():
+    oshape = (12,)
+    key_type = tvm.relay.TensorType([10], dtype="uint64")
+    gen_type = tvm.relay.TensorType(oshape, dtype="uint64")
+    expected_type = tvm.relay.TupleType([key_type, gen_type])
+
+    key = tvm.relay.random.threefry_key(1)
+    rand1 = tvm.relay.random.threefry_generate(key, oshape)
+    f = tvm.relay.Function([], rand1)
+    f = run_infer_type(f)
+    assert tvm.ir.structural_equal(f.ret_type, expected_type)
+
+
+def test_threefry_split_infer():
+    key_type = tvm.relay.TensorType([10], dtype="uint64")
+    expected_type = tvm.relay.TupleType([key_type, key_type])
+
+    key = tvm.relay.random.threefry_key(1)
+    out_keys = tvm.relay.random.threefry_split(key)
+    f = tvm.relay.Function([], out_keys)
+    f = run_infer_type(f)
+    assert tvm.ir.structural_equal(f.ret_type, expected_type)
+
+
+@pytest.mark.xfail(raises=tvm.error.TVMError)
+def test_threefry_generate_infer_fail():
+    # xfail: key size should be 10
+    fake_key = tvm.relay.const([1, 2, 3, 4, 5, 6, 7, 8, 9], dtype="uint64")
+    rand1 = tvm.relay.random.threefry_generate(fake_key, (12,))
+    f = tvm.relay.Function([], rand1)
+    f = run_infer_type(f)
+
+
+@pytest.mark.xfail(raises=tvm.error.TVMError)
+def test_threefry_split_infer_fail():
+    # xfail: key size should be 10
+    fake_key = tvm.relay.const([1, 2, 3, 4, 5, 6, 7, 8, 9], dtype="uint64")
+    out_keys = tvm.relay.random.threefry_split(fake_key)
+    f = tvm.relay.Function([], out_keys)
+    f = run_infer_type(f)
+
+
+@tvm.testing.requires_llvm
+@pytest.mark.xfail(raises=tvm.error.TVMError)
+def test_threefry_generate_incorrect_out_size():
+    key = tvm.relay.random.threefry_key(1)
+    # xfail: output size should be multiple of 4
+    key, rand1 = tvm.relay.TupleWrapper(tvm.relay.random.threefry_generate(key, (5,)), 2)
+    out1, out2 = tvm.relay.create_executor(
+        "vm",
+        tvm.IRModule.from_expr(tvm.relay.Function([], rand1)),
+        target=tvm.target.Target("llvm"),
+        ctx=tvm.context("cpu"),
+    ).evaluate()()
+
+
+if __name__ == "__main__":
+    test_threefry_repeatability(tvm.target.Target("llvm"), tvm.context("cpu"))
+    test_threefry_split(tvm.target.Target("llvm"), tvm.context("cpu"))
+    test_threefry_sequential_generate(tvm.target.Target("llvm"), tvm.context("cpu"))
diff --git a/tests/python/topi/python/test_topi_prng.py b/tests/python/topi/python/test_topi_prng.py
new file mode 100644
index 000000000000..43b0494ee6f5
--- /dev/null
+++ b/tests/python/topi/python/test_topi_prng.py
@@ -0,0 +1,116 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+import tvm
+import tvm.relay
+import tvm.testing
+import tvm.topi
+import numpy as np
+
+
+def threefry_split(target, ctx, gen):
+    gen_placeholder = tvm.te.placeholder(gen.shape, name="gen", dtype="uint64")
+    left_placeholder, right_placeholder = tvm.topi.random.threefry_split(gen_placeholder)
+    s = tvm.topi.generic.schedule_extern([left_placeholder, right_placeholder])
+    f = tvm.build(s, [gen_placeholder, left_placeholder, right_placeholder])
+    left = tvm.nd.array(np.zeros(gen.shape, dtype="uint64"))
+    right = tvm.nd.array(np.zeros(gen.shape, dtype="uint64"))
+    f(tvm.nd.array(gen), left, right)
+    return left.asnumpy(), right.asnumpy()
+
+
+def threefry_generate(target, ctx, gen, size):
+    gen_placeholder = tvm.te.placeholder(gen.shape, name="gen", dtype="uint64")
+    left_placeholder, right_placeholder = tvm.topi.random.threefry_generate(gen_placeholder, size)
+    s = tvm.topi.generic.schedule_extern([left_placeholder, right_placeholder])
+    f = tvm.build(s, [gen_placeholder, left_placeholder, right_placeholder])
+    out_gen = tvm.nd.array(np.zeros(gen.shape, dtype="uint64"))
+    rands = tvm.nd.array(np.zeros(size, dtype="uint64"))
+    f(tvm.nd.array(gen), out_gen, rands)
+    return out_gen.asnumpy(), rands.asnumpy()
+
+
+@tvm.testing.parametrize_targets
+def test_threefry_split(target, ctx):
+    # test that results of split do not equal eachother or the input
+    gen = tvm.relay.random.threefry_key(0).data.asnumpy()
+    a, b = threefry_split(target, ctx, gen)
+    assert (a != b).any() and (
+        a != gen
+    ).any(), "Splitting a gen should result in different output gens"
+    # unittest some split inputs
+    assert (a == np.array([0, 0, 0, 0, 0, 0, 0, 0, 1 << 62, 0], dtype="uint64")).all()
+    assert (b == np.array([0, 0, 0, 0, 1 << 63, 0, 0, 0, 1 << 62, 0], dtype="uint64")).all()
+
+    # test enough splits to go over path length
+    for i in range(129):
+        a, b = threefry_split(target, ctx, b)
+    assert (a[0:4] == b[0:4]).all(), "State part of split should be the same"
+    assert (b[0:4] != np.zeros(4, dtype="uint64")).any()
+
+    # check that split then generate does not generate the same for both sides
+    a, a_rands = threefry_generate(target, ctx, a, (100,))
+    b, b_rands = threefry_generate(target, ctx, b, (100,))
+    assert (
+        a_rands != b_rands
+    ).all(), "Numbers generated from different initial states should be different"
+
+    # check repeatability
+    _, rands1 = threefry_generate(target, ctx, a, (100,))
+    _, rands2 = threefry_generate(target, ctx, a, (100,))
+    assert (
+        rands1 == rands2
+    ).all(), "Numbers generated from the same initial state should be the same"
+
+    a1, b1 = threefry_split(target, ctx, a)
+    a2, b2 = threefry_split(target, ctx, a)
+    assert (a1 == a2).all() and (
+        b1 == b2
+    ).all(), "Split called on the same input should return the same result"
+
+
+@tvm.testing.parametrize_targets
+def test_threefry_generate(target, ctx):
+    gen = tvm.relay.random.threefry_key(0).data.asnumpy()
+
+    # check that we can generate some data
+    a, rands = threefry_generate(target, ctx, gen, (100,))
+    assert (
+        rands.shape[0] == 100 and len(rands.shape) == 1
+    ), "Output shape should match requested shape"
+
+    # check that gen out does not equal input
+    assert (a != gen).any(), "Output generator should be different from input generator"
+
+    # test enough generates to go over generate limit
+    gen = np.array(
+        [0, 0, 0, 0, 0, 0, 0, 2 ** 64 - 2, 1 << 63, 0], dtype="uint64"
+    )  # make counter large
+    a, rands = threefry_generate(target, ctx, gen, (100,))
+    assert gen[4] != a[4], "Overflow of counter should trigger path change"
+    assert a[7] == 100, "Overflow of counter should still update counter"
+
+    # check generate with path at length limit
+    gen = np.array([0, 0, 0, 0, 0, 0, 0, 2 ** 64 - 2, 0, 0], dtype="uint64")  # make counter large
+    a, rands = threefry_generate(target, ctx, gen, (100,))
+    assert (
+        gen[0:4] != a[0:4]
+    ).any(), "Overflowing counter with no space left in path should change state"
+
+
+if __name__ == "__main__":
+    test_threefry_split(tvm.target.Target("llvm"), tvm.context("cpu"))
+    test_threefry_generate(tvm.target.Target("llvm"), tvm.context("cpu"))

From 02ef6e6243dbe525dc7e0a2f10704add0d7c24d7 Mon Sep 17 00:00:00 2001
From: Cody Yu <comaniac0422@gmail.com>
Date: Fri, 8 Jan 2021 19:49:06 -0800
Subject: [PATCH 039/357] [TOPI] Treat undefined elements as constants in Array
 (#7232)

* [TOPI] Treat undefined elements as constants in Array

* Add a checker

* fix

* add test case
---
 include/tvm/topi/detail/constant_utils.h        | 5 +++--
 include/tvm/topi/transform.h                    | 1 +
 tests/python/topi/python/test_topi_transform.py | 1 +
 3 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/include/tvm/topi/detail/constant_utils.h b/include/tvm/topi/detail/constant_utils.h
index 49ce21b5732e..92ff3a4e3804 100644
--- a/include/tvm/topi/detail/constant_utils.h
+++ b/include/tvm/topi/detail/constant_utils.h
@@ -48,7 +48,8 @@ using namespace tvm::te;
 inline bool IsConstInt(PrimExpr expr) { return expr->IsInstance<tvm::tir::IntImmNode>(); }
 
 /*!
- * \brief Test whether the given Array has every element as constant integer
+ * \brief Test whether the given Array has every element as constant integer.
+ * Undefined elements are also treat as constants.
  *
  * \param array the array to query
  *
@@ -57,7 +58,7 @@ inline bool IsConstInt(PrimExpr expr) { return expr->IsInstance<tvm::tir::IntImm
 inline bool IsConstIntArray(Array<PrimExpr> array) {
   bool is_const_int = true;
   for (auto const& elem : array) {
-    is_const_int &= elem->IsInstance<tvm::tir::IntImmNode>();
+    is_const_int &= !elem.defined() || elem->IsInstance<tvm::tir::IntImmNode>();
   }
   return is_const_int;
 }
diff --git a/include/tvm/topi/transform.h b/include/tvm/topi/transform.h
index a04762f28feb..261fdf9970a3 100644
--- a/include/tvm/topi/transform.h
+++ b/include/tvm/topi/transform.h
@@ -612,6 +612,7 @@ inline Tensor strided_slice(const Tensor& x, const Array<PrimExpr>& begin,
 
   Array<PrimExpr> out_shape;
   if (!is_static) {
+    ICHECK_EQ(strides.size(), src_tensor_dim);
     for (size_t i = 0; i < src_tensor_dim; ++i) {
       out_shape.push_back(indexdiv(end[i] - begin[i], strides[i]));
     }
diff --git a/tests/python/topi/python/test_topi_transform.py b/tests/python/topi/python/test_topi_transform.py
index 30434f6fd266..e0018ba0c0d3 100644
--- a/tests/python/topi/python/test_topi_transform.py
+++ b/tests/python/topi/python/test_topi_transform.py
@@ -817,6 +817,7 @@ def test_strided_slice():
     verify_strided_slice((3, 4, 3), [1, -1, 0], [2, -3, 3], [1, -1, 1])
     verify_strided_slice((3, 4, 3), [1, 1, 0], [4, 4, 3])
     verify_strided_slice((3, 4, 3), [0, 2, 0], [1, 2, 3])
+    verify_strided_slice((3, 4, 3), [0, 0, 0], [None, None, None])
 
 
 @tvm.testing.uses_gpu

From d949d153a59448e14f594ae4ccb815de3817bef7 Mon Sep 17 00:00:00 2001
From: Lianmin Zheng <lianminzheng@gmail.com>
Date: Sat, 9 Jan 2021 09:17:20 -0800
Subject: [PATCH 040/357] Revert "[AutoTVM-FIX] avoid unexpected value(1) of
 search space when get length for uninitiated search space (#7175)" (#7236)

This reverts commit f2ab977de0ac543cae77d3bef76af1b56dd61eed.
---
 python/tvm/autotvm/task/space.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/python/tvm/autotvm/task/space.py b/python/tvm/autotvm/task/space.py
index b24ab415c60a..cf9cd809aa8d 100644
--- a/python/tvm/autotvm/task/space.py
+++ b/python/tvm/autotvm/task/space.py
@@ -836,8 +836,6 @@ def _add_new_transform(self, space_class, name, axes, policy, **kwargs):
         return [Axis(None, i) for i in range(space_class.get_num_output(axes, policy, **kwargs))]
 
     def __len__(self):
-        if not self.space_map:
-            return 0
         if self._length is None:
             self._length = int(np.prod([len(x) for x in self.space_map.values()]))
         return self._length

From 69a06283a7f5bd3ae9c9390cf74055360039ffc0 Mon Sep 17 00:00:00 2001
From: Yanming Wang <yanmingwang01@gmail.com>
Date: Sun, 10 Jan 2021 17:45:27 -0800
Subject: [PATCH 041/357] [AutoTVM] Add index boundary check in
 ConfigSpace.get() (#7234)

* [AutoTVM] Add index boundary check in ConfigSpace.get()

* Fix unit test

Co-authored-by: Yanming Wang <yanmwang@amazon.com>
---
 python/tvm/autotvm/task/space.py             | 8 +++++---
 tests/python/unittest/test_autotvm_common.py | 2 +-
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/python/tvm/autotvm/task/space.py b/python/tvm/autotvm/task/space.py
index cf9cd809aa8d..afbfb4c03988 100644
--- a/python/tvm/autotvm/task/space.py
+++ b/python/tvm/autotvm/task/space.py
@@ -19,7 +19,7 @@
 """
 Template configuration space.
 
-Each template function can be parametrized by a ConfigSpace.
+Each template function can be parameterized by a ConfigSpace.
 The space is declared when we invoke the template function with ConfigSpace.
 During evaluation, we pass in a ConfigEntity, which contains a specific
 entity in the space. This entity contains deterministic parameters.
@@ -63,7 +63,7 @@ class TransformSpace(object):
         Each operator has some tunable parameters (e.g. the split factor).
         Then the tuning process is just to find good parameters of these op.
 
-    So the all the combinations of the parameters of these op forms our search space.
+    So all the combinations of the parameters of these op form our search space.
 
     Naming convention:
     We call the set of all possible values as XXXSpace. (XXX can be Split, Reorder, Config ...)
@@ -797,7 +797,7 @@ def add_flop(self, flop):
 
     def raise_error(self, msg):
         """register error in config
-        Using this to actively detect error when scheudling.
+        Using this to actively detect error when scheduling.
         Otherwise these error will occur during runtime, which
         will cost more time.
 
@@ -848,6 +848,8 @@ def get(self, index):
         index: int
             index in the space
         """
+        if index < 0 or index >= len(self):
+            raise IndexError("Index out of range: size {}, got index {}".format(len(self), index))
         entities = OrderedDict()
         t = index
         for name, space in self.space_map.items():
diff --git a/tests/python/unittest/test_autotvm_common.py b/tests/python/unittest/test_autotvm_common.py
index 917036fc24a1..60f7d8bafb1b 100644
--- a/tests/python/unittest/test_autotvm_common.py
+++ b/tests/python/unittest/test_autotvm_common.py
@@ -101,6 +101,6 @@ def get_sample_records(n):
 
     inps, ress = [], []
     for i in range(n):
-        inps.append(MeasureInput(target, tsk, tsk.config_space.get(i)))
+        inps.append(MeasureInput(target, tsk, tsk.config_space.get(i % len(tsk.config_space))))
         ress.append(MeasureResult((i + 1,), 0, i, time.time()))
     return list(zip(inps, ress))

From 89e3688137b9d8dd0e431cbdadb84d42dae9eee3 Mon Sep 17 00:00:00 2001
From: Meteorix <liuxin.ai@bytedance.com>
Date: Mon, 11 Jan 2021 09:51:31 +0800
Subject: [PATCH 042/357] [CUDA]batch_matmul tensorcore schedule (#7146)

* add batch_matmul_tensorcore

* add bmm cublas autotune

* add bmm tests

* out_shape for bmm_tensorcore

* fix comments

* code format

* add todos for tensorcore datatype checking

* fix lint

* fix have_tensorcore

* add dtype check for batch_matmul_tensorcore
---
 python/tvm/relay/op/strategy/cuda.py          |  16 +
 python/tvm/topi/cuda/__init__.py              |   1 +
 python/tvm/topi/cuda/batch_matmul.py          |  14 +-
 .../tvm/topi/cuda/batch_matmul_tensorcore.py  | 315 ++++++++++++++++++
 .../tvm/topi/cuda/conv2d_nhwc_tensorcore.py   |   1 +
 .../tvm/topi/cuda/conv3d_ndhwc_tensorcore.py  |   1 +
 python/tvm/topi/cuda/dense_tensorcore.py      |   1 +
 .../test_topi_batch_matmul_tensorcore.py      |  75 +++++
 8 files changed, 422 insertions(+), 2 deletions(-)
 create mode 100644 python/tvm/topi/cuda/batch_matmul_tensorcore.py
 create mode 100644 tests/python/topi/python/test_topi_batch_matmul_tensorcore.py

diff --git a/python/tvm/relay/op/strategy/cuda.py b/python/tvm/relay/op/strategy/cuda.py
index 37946c01cb46..04c16ddd344c 100644
--- a/python/tvm/relay/op/strategy/cuda.py
+++ b/python/tvm/relay/op/strategy/cuda.py
@@ -732,6 +732,22 @@ def batch_matmul_strategy_cuda(attrs, inputs, out_type, target):
             name="batch_matmul_cublas.cuda",
             plevel=15,
         )
+    if target.kind.name == "cuda" and nvcc.have_tensorcore(target=target):
+        x, y = inputs
+        _, M, K = get_const_tuple(x.shape)
+        _, N, K = get_const_tuple(y.shape)
+        if x.dtype in ["float16", "int8", "uint8"] and (
+            (M % 8 == 0 and K % 16 == 0 and N % 32 == 0)
+            or (M % 16 == 0 and K % 16 == 0 and N % 16 == 0)
+            or (M % 32 == 0 and K % 16 == 0 and N % 8 == 0)
+        ):
+            strategy.add_implementation(
+                wrap_compute_batch_matmul(topi.cuda.batch_matmul_tensorcore),
+                wrap_topi_schedule(topi.cuda.schedule_batch_matmul_tensorcore),
+                name="batch_matmul_tensorcore.cuda",
+                plevel=20,
+            )
+
     return strategy
 
 
diff --git a/python/tvm/topi/cuda/__init__.py b/python/tvm/topi/cuda/__init__.py
index 23c625ae7ff7..42bf980bec4c 100644
--- a/python/tvm/topi/cuda/__init__.py
+++ b/python/tvm/topi/cuda/__init__.py
@@ -42,6 +42,7 @@
 from .pooling import *
 from .nn import schedule_lrn
 from .batch_matmul import *
+from .batch_matmul_tensorcore import *
 from .vision import *
 from .ssd import *
 from .nms import get_valid_counts, non_max_suppression
diff --git a/python/tvm/topi/cuda/batch_matmul.py b/python/tvm/topi/cuda/batch_matmul.py
index 8d34b2996593..006b866d6bad 100644
--- a/python/tvm/topi/cuda/batch_matmul.py
+++ b/python/tvm/topi/cuda/batch_matmul.py
@@ -21,7 +21,7 @@
 from tvm import te
 from tvm.contrib import cublas
 from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity
-from .. import nn
+from .. import nn, generic
 from ..utils import traverse_inline, get_const_tuple, get_max_power2_factor
 
 
@@ -138,7 +138,8 @@ def _callback(op):
     return s
 
 
-def batch_matmul_cublas(x, y, out_shape=None):
+@autotvm.register_topi_compute("batch_matmul_cublas.cuda")
+def batch_matmul_cublas(cfg, x, y, out_shape=None):
     """Computes batch matrix multiplication of `x` and `y` when `x` and `y` are
     data in batch.
 
@@ -158,4 +159,13 @@ def batch_matmul_cublas(x, y, out_shape=None):
     output : tvm.te.Tensor
         3-D with shape [batch, M, N]
     """
+    b, m, k = x.shape
+    b, n, k = y.shape
+    cfg.add_flop(b * m * k * n * 2)
     return cublas.batch_matmul(x, y, False, True)
+
+
+@autotvm.register_topi_schedule("batch_matmul_cublas.cuda")
+def schedule_batch_matmul_cublas(_, outs):
+    """Schedule batch_matmul operator using CUBLAS"""
+    return generic.schedule_extern(outs)
diff --git a/python/tvm/topi/cuda/batch_matmul_tensorcore.py b/python/tvm/topi/cuda/batch_matmul_tensorcore.py
new file mode 100644
index 000000000000..59b92ec9e623
--- /dev/null
+++ b/python/tvm/topi/cuda/batch_matmul_tensorcore.py
@@ -0,0 +1,315 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=invalid-name,too-many-locals,unused-variable,unused-argument
+"""cuda batch_matmul operators"""
+import tvm
+from tvm import autotvm
+from tvm import te
+from ..utils import traverse_inline, get_const_tuple
+from .tensor_intrin import (
+    intrin_wmma_load_matrix_A,
+    intrin_wmma_load_matrix_W,
+    intrin_wmma_store_matrix,
+    intrin_wmma_gemm,
+)
+
+
+@autotvm.register_topi_compute("batch_matmul_tensorcore.cuda")
+def batch_matmul_tensorcore(cfg, x, y, out_shape=None):
+    """batch matmul tensorcore operator on cuda"""
+    # todo: deal with out_shape for broadcast, liuxin.ai
+    return batch_matmul_tensorcore_cuda(x, y)
+
+
+@autotvm.register_topi_schedule("batch_matmul_tensorcore.cuda")
+def schedule_batch_matmul_tensorcore(cfg, outs):
+    """Schedule for batch_matmul operator using Tensorcore
+
+    Parameters
+    ----------
+    outs: Array of Tensor
+          The computation graph description of batch_matmul
+          in the format of an array of tensors.
+
+    Returns
+    -------
+    s: Schedule
+        The computation schedule for the op.
+    """
+    outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
+    s = te.create_schedule([x.op for x in outs])
+
+    def _schedule(cfg, s, C):
+        A, B = s[C].op.input_tensors
+        batch, m_dim, k_dim = get_const_tuple(A.shape)
+        batch, n_dim, k_dim = get_const_tuple(B.shape)
+        out_dtype = C.dtype
+        # inline astype fp16
+        s[A].compute_inline()
+        s[B].compute_inline()
+
+        # Explicit memory access
+        AS = s.cache_read(A, "shared", [C])
+        BS = s.cache_read(B, "shared", [C])
+        AF = s.cache_read(AS, "wmma.matrix_a", [C])
+        BF = s.cache_read(BS, "wmma.matrix_b", [C])
+        CF = s.cache_write(C, "wmma.accumulator")
+        CS = s.cache_read(CF, "shared", [C])
+
+        # fallback support
+        target = tvm.target.Target.current()
+        if cfg.is_fallback:
+            ref_log = autotvm.tophub.load_reference_log(
+                target.kind.name, target.model, "batch_matmul_tensorcore.cuda"
+            )
+            cfg.fallback_with_reference_log(ref_log)
+
+        # Deal with op fusion, such as bias/relu and slice after padding
+        if C.op not in s.outputs and "injective" in s.outputs[0].tag:
+            s[C].compute_inline()
+            C = s.outputs[0].output(0)
+
+        # create tuning space
+        cfg.define_knob("block_row_warps", [1, 2, 4])
+        cfg.define_knob("block_col_warps", [1, 2, 4])
+        cfg.define_knob("warp_row_tiles", [1, 2, 4])
+        cfg.define_knob("warp_col_tiles", [1, 2, 4])
+        cfg.define_knob("chunk", [1, 2, 4, 8])
+        cfg.define_knob("offset", [0, 8])
+        cfg.define_knob("offsetCS", [0, 8])
+        cfg.define_knob("vec", [1, 2, 4, 8])
+
+        # Ensure that the default parameters are applicable when autotvm is not in use
+        if m_dim % 32 == 0 and n_dim % 8 == 0:
+            cfg.define_knob("wmma_m", [32, 16, 8])
+        elif m_dim % 16 == 0 and n_dim % 16 == 0:
+            cfg.define_knob("wmma_m", [16, 8, 32])
+        elif m_dim % 8 == 0 and n_dim % 32 == 0:
+            cfg.define_knob("wmma_m", [8, 16, 32])
+
+        warp_size = 32
+        wmma_k = 16
+        block_row_warps = cfg["block_row_warps"].val
+        block_col_warps = cfg["block_col_warps"].val
+        warp_row_tiles = cfg["warp_row_tiles"].val
+        warp_col_tiles = cfg["warp_col_tiles"].val
+        chunk = cfg["chunk"].val
+        offset = cfg["offset"].val
+        offsetCS = cfg["offsetCS"].val
+        wmma_m = cfg["wmma_m"].val
+        vec = cfg["vec"].val
+
+        if wmma_m == 16:
+            wmma_n = 16
+        elif wmma_m == 8:
+            wmma_n = 32
+        elif wmma_m == 32:
+            wmma_n = 8
+
+        # Define the stride of intrin functions
+        AS_align = chunk * wmma_k + offset
+        BS_align = chunk * wmma_k + offset
+        CS_align = warp_col_tiles * block_col_warps * wmma_n + offsetCS
+        AS_stride = [AS_align, 1]
+        BS_stride = [BS_align, 1]
+        AF_stride = [wmma_k, 1]
+        BF_stride = [wmma_k, 1]
+        CF_stride = [warp_col_tiles * wmma_n, 1]
+        CS_stride = [CS_align, 1]
+
+        block_x = te.thread_axis("blockIdx.x")
+        block_y = te.thread_axis("blockIdx.y")
+        block_z = te.thread_axis("blockIdx.z")
+        thread_x = te.thread_axis("threadIdx.x")
+        thread_y = te.thread_axis("threadIdx.y")
+        thread_z = te.thread_axis("threadIdx.z")
+
+        # Schedule for dense computation
+        block_factor_m = wmma_m * warp_row_tiles * block_row_warps
+        block_factor_n = wmma_n * warp_col_tiles * block_col_warps
+        b, m, n = C.op.axis
+        block_i, bc = s[C].split(m, factor=block_factor_m)
+        block_j, oc = s[C].split(n, factor=block_factor_n)
+        s[C].reorder(b, block_i, block_j, bc, oc)
+        t = s[C].fuse(bc, oc)
+        t, vi = s[C].split(t, factor=vec)
+        t, tx = s[C].split(t, factor=warp_size)
+        t, ty = s[C].split(t, factor=block_row_warps)
+        t, tz = s[C].split(t, factor=block_col_warps)
+        s[C].bind(block_i, block_x)
+        s[C].bind(block_j, block_y)
+        s[C].bind(b, block_z)
+        s[C].bind(tz, thread_z)
+        s[C].bind(ty, thread_y)
+        s[C].bind(tx, thread_x)
+        s[C].vectorize(vi)
+
+        # Schedule for wmma store
+        s[CS].compute_at(s[C], block_j)
+        bs, bb, oo = CS.op.axis
+        s[CS].storage_align(bb, CS_align - 1, CS_align)
+        bb, bbi = s[CS].split(bb, factor=wmma_m)
+        oo, ooi = s[CS].split(oo, factor=wmma_n)
+        bb, bbii = s[CS].split(bb, factor=warp_row_tiles)
+        oo, ooii = s[CS].split(oo, factor=warp_col_tiles)
+        s[CS].reorder(bs, bb, oo, bbii, ooii, bbi, ooi)
+
+        # Schedule for wmma computation
+        s[CF].compute_at(s[CS], oo)
+        bs, warp_i, warp_j = CF.op.axis
+        warp_i, _ii = s[CF].split(warp_i, factor=wmma_m)
+        warp_j, _jj = s[CF].split(warp_j, factor=wmma_n)
+        (k,) = CF.op.reduce_axis
+        k, _k = s[CF].split(k, factor=wmma_k)
+        ko, ki = s[CF].split(k, factor=chunk)
+        s[CF].reorder(bs, ko, ki, warp_i, warp_j, _ii, _jj, _k)
+
+        # Schedule for  wmma_matrix_a load
+        s[AF].compute_at(s[CF], ki)
+        bs, b, i = AF.op.axis
+        b, b_ii = s[AF].split(b, factor=wmma_m)
+        i, i_jj = s[AF].split(i, factor=wmma_k)
+        s[AF].reorder(bs, b, i, b_ii, i_jj)
+
+        # Schedule for  wmma_matrix_b load
+        s[BF].compute_at(s[CF], ki)
+        bs, o, i = BF.op.axis
+        o, o_ii = s[BF].split(o, factor=wmma_n)
+        i, i_ii = s[BF].split(i, factor=wmma_k)
+        s[BF].reorder(bs, o, i, o_ii, i_ii)
+
+        # Schedule for A's(B's) shared memory load
+        def shared_shedule(stage, strides):
+            s[stage].compute_at(s[CF], ko)
+            bs, xo, yo = stage.op.axis
+            s[stage].storage_align(xo, strides - 1, strides)
+            t = s[stage].fuse(xo, yo)
+            t, vi = s[stage].split(t, factor=vec)
+            t, tx = s[stage].split(t, factor=warp_size)
+            t, ty = s[stage].split(t, factor=block_row_warps)
+            _, tz = s[stage].split(t, factor=block_col_warps)
+            s[stage].bind(ty, thread_y)
+            s[stage].bind(tz, thread_z)
+            s[stage].bind(tx, thread_x)
+            s[stage].vectorize(vi)
+
+        shared_shedule(AS, AS_align)
+        shared_shedule(BS, BS_align)
+
+        shape = (wmma_m, wmma_n, wmma_k)
+        # TODO: add checking here, datatype casting may cause precision loss
+        in_dtype = "float16"
+        AL_gemm = te.placeholder((wmma_m, wmma_k), name="AL_gemm", dtype=in_dtype)
+        BL_gemm = te.placeholder((wmma_n, wmma_k), name="BL_gemm", dtype=in_dtype)
+        k_gemm = te.reduce_axis((0, wmma_k), name="k_gemm")
+        CL_compute = te.compute(
+            (wmma_m, wmma_n),
+            lambda ii, jj: te.sum(
+                AL_gemm[ii, k_gemm].astype(out_dtype) * BL_gemm[jj, k_gemm].astype(out_dtype),
+                axis=k_gemm,
+            ),
+            name="CL_compute",
+        )
+
+        # lower the computation loops down to TensorCore hardware intrinsics
+        # by mapping the dense tensorcore to tensor intrinsics
+        s[AF].tensorize(
+            b_ii,
+            intrin_wmma_load_matrix_A(
+                AF_stride,
+                AS_stride,
+                shape,
+                "row_major",
+                (wmma_m, wmma_k),
+                (wmma_m, wmma_k),
+                "float16",
+            ),
+        )
+        s[BF].tensorize(
+            o_ii,
+            intrin_wmma_load_matrix_W(
+                BF_stride,
+                BS_stride,
+                shape,
+                "col_major",
+                (wmma_n, wmma_k),
+                (wmma_n, wmma_k),
+                "float16",
+            ),
+        )
+        s[CF].tensorize(
+            _ii,
+            intrin_wmma_gemm(AL_gemm, BL_gemm, CL_compute, AF_stride, BF_stride, CF_stride, shape),
+        )
+        s[CS].tensorize(
+            bbi,
+            intrin_wmma_store_matrix(
+                CS_stride, CF_stride, shape, out_dtype, (wmma_m, wmma_n), (wmma_m, wmma_n)
+            ),
+        )
+
+    def _callback(op):
+        if "batch_matmul_tensorcore" in op.tag:
+            _schedule(cfg, s, op.output(0))
+
+    traverse_inline(s, outs[0].op, _callback)
+    return s
+
+
+def batch_matmul_tensorcore_cuda(x, y):
+    """Computes batch matrix multiplication of `x` and `y` when `x` and `y` are
+    data in batch.
+
+    Parameters
+    ----------
+    x : tvm.te.Tensor
+        3-D with shape [batch, M, K]
+
+    y : tvm.te.Tensor
+        3-D with shape [batch, N, K]
+
+    Returns
+    -------
+    output : tvm.te.Tensor
+        3-D with shape [batch, M, N]
+    """
+    assert len(x.shape) == 3 and len(y.shape) == 3, "only support 3-dim batch_matmul"
+    x_shape = get_const_tuple(x.shape)
+    y_shape = get_const_tuple(y.shape)
+    assert x_shape[0] == y_shape[0], "batch dimension doesn't match"
+    assert x_shape[2] == y_shape[2], "shapes of x and y is inconsistant"
+    batch, M, K = x.shape
+    N = y.shape[1]
+    out_dtype = x.dtype
+
+    assert (
+        (M % 8 == 0 and K % 16 == 0 and N % 32 == 0)
+        or (M % 16 == 0 and K % 16 == 0 and N % 16 == 0)
+        or (M % 32 == 0 and K % 16 == 0 and N % 8 == 0)
+    ), "The shape of (M, K, N) must be multiple of (16, 16, 16) or (32, 16, 8) or (8, 16, 32)"
+
+    x_16 = te.compute((batch, M, K), lambda b, i, k: x[b, i, k].astype("float16"))
+    y_16 = te.compute((batch, N, K), lambda b, j, k: y[b, j, k].astype("float16"))
+
+    k = te.reduce_axis((0, K), name="k")
+    return te.compute(
+        (batch, M, N),
+        lambda b, i, j: te.sum(
+            x_16[b, i, k].astype(out_dtype) * y_16[b, j, k].astype(out_dtype), axis=k
+        ),
+        tag="batch_matmul_tensorcore",
+    )
diff --git a/python/tvm/topi/cuda/conv2d_nhwc_tensorcore.py b/python/tvm/topi/cuda/conv2d_nhwc_tensorcore.py
index f665cc779dc5..76f082f07b44 100644
--- a/python/tvm/topi/cuda/conv2d_nhwc_tensorcore.py
+++ b/python/tvm/topi/cuda/conv2d_nhwc_tensorcore.py
@@ -72,6 +72,7 @@ def nhwc_tensorcore_cuda(cfg, Input, Filter, stride, padding, dilation, out_dtyp
     ry = te.reduce_axis((0, kernel_h), name="ry")
     rx = te.reduce_axis((0, kernel_w), name="rx")
     # convert data type of input feature maps and weights
+    # TODO: add checking here, datatype casting may cause precision loss
     TransPaddedInput = te.compute(
         PaddedInput.shape, lambda n, h, w, c: PaddedInput[n, h, w, c].astype("float16")
     )
diff --git a/python/tvm/topi/cuda/conv3d_ndhwc_tensorcore.py b/python/tvm/topi/cuda/conv3d_ndhwc_tensorcore.py
index a5c4e81a4dc3..efb25744b802 100644
--- a/python/tvm/topi/cuda/conv3d_ndhwc_tensorcore.py
+++ b/python/tvm/topi/cuda/conv3d_ndhwc_tensorcore.py
@@ -75,6 +75,7 @@ def ndhwc_tensorcore_cuda(cfg, Input, Filter, stride, padding, dilation, out_dty
     ry = te.reduce_axis((0, kernel_h), name="ry")
     rx = te.reduce_axis((0, kernel_w), name="rx")
     # convert data type of input feature maps and weights
+    # TODO: add checking here, datatype casting may cause precision loss
     TransPaddedInput = te.compute(
         PaddedInput.shape, lambda n, d, h, w, c: PaddedInput[n, d, h, w, c].astype("float16")
     )
diff --git a/python/tvm/topi/cuda/dense_tensorcore.py b/python/tvm/topi/cuda/dense_tensorcore.py
index a59ebd7347bb..430f8044528c 100644
--- a/python/tvm/topi/cuda/dense_tensorcore.py
+++ b/python/tvm/topi/cuda/dense_tensorcore.py
@@ -245,6 +245,7 @@ def shared_shedule(stage, strides):
     shared_shedule(BS, BS_align)
 
     shape = (wmma_m, wmma_n, wmma_k)
+    # TODO: add checking here, datatype casting may cause precision loss
     in_dtype = "float16"
     AL_gemm = te.placeholder((wmma_m, wmma_k), name="AL_gemm", dtype=in_dtype)
     BL_gemm = te.placeholder((wmma_n, wmma_k), name="BL_gemm", dtype=in_dtype)
diff --git a/tests/python/topi/python/test_topi_batch_matmul_tensorcore.py b/tests/python/topi/python/test_topi_batch_matmul_tensorcore.py
new file mode 100644
index 000000000000..60f4bef3a855
--- /dev/null
+++ b/tests/python/topi/python/test_topi_batch_matmul_tensorcore.py
@@ -0,0 +1,75 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Test code for batch_matmul operator"""
+import numpy as np
+import tvm
+from tvm import te
+from tvm import topi
+import tvm.topi.testing
+from tvm.topi.utils import get_const_tuple
+from tvm.contrib.pickle_memoize import memoize
+
+import tvm.testing
+
+_batch_matmul_implement = {
+    "gpu": (topi.cuda.batch_matmul_tensorcore, topi.cuda.schedule_batch_matmul_tensorcore),
+}
+
+
+def verify_batch_matmul(x_batch, y_batch, M, N, K):
+    x = te.placeholder((x_batch, M, K), name="x")
+    y = te.placeholder((y_batch, N, K), name="y")
+    dtype = x.dtype
+
+    # use memoize to pickle the test data for next time use
+    @memoize("topi.tests.test_topi_batch_matmul_tensorcore")
+    def get_ref_data():
+        a_np = np.random.uniform(size=(x_batch, M, K)).astype(dtype)
+        b_np = np.random.uniform(size=(y_batch, N, K)).astype(dtype)
+        c_np = tvm.topi.testing.batch_matmul(a_np, b_np)
+        return (a_np, b_np, c_np)
+
+    # get the test data
+    a_np, b_np, c_np = get_ref_data()
+
+    def check_device(device):
+        ctx = tvm.context(device, 0)
+        print("Running on target: %s" % device)
+        with tvm.target.Target(device):
+            fcompute, fschedule = tvm.topi.testing.dispatch(device, _batch_matmul_implement)
+            out = fcompute(x, y)
+            s = fschedule([out])
+        a = tvm.nd.array(a_np, ctx)
+        b = tvm.nd.array(b_np, ctx)
+        c = tvm.nd.array(np.zeros(get_const_tuple(out.shape), dtype=dtype), ctx)
+        f = tvm.build(s, [x, y, out], device, name="dense")
+        f(a, b, c)
+        tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-3)
+
+    check_device("cuda")
+
+
+@tvm.testing.uses_gpu
+def test_batch_matmul():
+    verify_batch_matmul(1, 1, 16, 16, 32)
+    verify_batch_matmul(5, 5, 16, 16, 32)
+    verify_batch_matmul(5, 5, 16, 32, 32)
+    verify_batch_matmul(30, 30, 16, 32, 32)
+
+
+if __name__ == "__main__":
+    test_batch_matmul()

From 7f1476957cefcaeb61d17781c4e663d876d9fe0f Mon Sep 17 00:00:00 2001
From: Dmitriy Smirnov <dmitriy.smirnov@arm.com>
Date: Mon, 11 Jan 2021 14:24:42 +0000
Subject: [PATCH 043/357] [TFLite] Quantized version of unit test for Dense
 (#7113)

Added quantized version of unit test for FullyConnected/Dense
Added check for -1 in case if bias not supplied
---
 python/tvm/relay/frontend/tflite.py          | 19 ++---
 tests/python/frontend/tflite/test_forward.py | 83 +++++++++++++++-----
 2 files changed, 73 insertions(+), 29 deletions(-)

diff --git a/python/tvm/relay/frontend/tflite.py b/python/tvm/relay/frontend/tflite.py
index 94e9e0cccc5b..7a2aada4b22e 100644
--- a/python/tvm/relay/frontend/tflite.py
+++ b/python/tvm/relay/frontend/tflite.py
@@ -982,7 +982,7 @@ def convert_concatenation(self, op):
 
         input_tensors = self.get_input_tensors(op)
         assert len(input_tensors) >= 1, "input tensors should greater than 1"
-        in_exprs = [self.get_expr(input_tensor.tensor_idx) for input_tensor in input_tensors]
+        in_exprs = [self.get_tensor_expr(_) for _ in input_tensors]
 
         output_tensors = self.get_output_tensors(op)
         assert len(output_tensors) == 1, "output tensors length should be 1"
@@ -1830,14 +1830,15 @@ def convert_fully_connected(self, op):
         # if we have bias
         if len(input_tensors) == 3:
             bias_tensor = input_tensors[2]
-            bias_tensor_type = bias_tensor.tensor.Type()
-            # bias tensor type should be INT32 (quantization) or FLOAT32
-            assert bias_tensor_type in (TensorType.INT32, TensorType.FLOAT32)
-            bias_tensor_type_str = self.get_tensor_type_str(bias_tensor_type)
-            bias_expr = self.exp_tab.new_const(
-                self.get_tensor_value(bias_tensor), dtype=bias_tensor_type_str
-            )
-            out = _op.nn.bias_add(out, bias_expr)
+            if bias_tensor.tensor_idx != -1:
+                bias_tensor_type = bias_tensor.tensor.Type()
+                # bias tensor type should be INT32 (quantization) or FLOAT32
+                assert bias_tensor_type in (TensorType.INT32, TensorType.FLOAT32)
+                bias_tensor_type_str = self.get_tensor_type_str(bias_tensor_type)
+                bias_expr = self.exp_tab.new_const(
+                    self.get_tensor_value(bias_tensor), dtype=bias_tensor_type_str
+                )
+                out = _op.nn.bias_add(out, bias_expr)
 
         # Finally if the dense is quantized. Add a requantize at the end.
         if output_tensor.qnn_params:
diff --git a/tests/python/frontend/tflite/test_forward.py b/tests/python/frontend/tflite/test_forward.py
index 52dde38703d1..c8bd0947776f 100644
--- a/tests/python/frontend/tflite/test_forward.py
+++ b/tests/python/frontend/tflite/test_forward.py
@@ -3342,9 +3342,9 @@ def test_forward_sparse_to_dense():
 #######################################################################
 # Fully Connected
 # ---------------
-
-
-def _test_fully_connected(tensor_in_sizes, const_input, filter_in_sizes, bias_in_size=None):
+def _test_fully_connected(
+    tensor_in_sizes, const_input, filter_in_sizes, bias_in_size=None, quantized=False
+):
     """ One iteration of fully connected """
 
     total_size_1 = np.prod(tensor_in_sizes)
@@ -3356,11 +3356,11 @@ def _test_fully_connected(tensor_in_sizes, const_input, filter_in_sizes, bias_in
 
     # Initializes the input tensor with array containing incrementing
     # numbers from 1.
-    data_array = np.arange(1, total_size_1 + 1, dtype=np.float32)
-    filter_array = np.arange(1, total_size_2 + 1, dtype=np.float32)
+    data_array = np.arange(1, total_size_1 + 1, dtype=np.uint8 if quantized else np.float32)
+    filter_array = np.arange(1, total_size_2 + 1, dtype=np.uint8 if quantized else np.float32)
+    in_name = "input"
 
     with tf.Graph().as_default():
-        in_name = "input"
         in_data = (
             constant_op.constant(data_array, shape=tensor_in_sizes, dtype=np.float32, name=in_name)
             if const_input
@@ -3368,30 +3368,73 @@ def _test_fully_connected(tensor_in_sizes, const_input, filter_in_sizes, bias_in
         )
 
         in_filter = constant_op.constant(filter_array, shape=filter_in_sizes, dtype=np.float32)
-
-        # reshape N H W C into N H*W*C
-        in_data_reshape = array_ops.reshape(in_data, [tensor_in_sizes[0], -1])
-
-        out = math_ops.mat_mul(in_data_reshape, in_filter)
+        data_array = np.reshape(data_array, tensor_in_sizes)
 
         # if we have bias
         if bias_in_size:
             assert bias_in_size[0] == filter_in_sizes[1], "bias and filter size are mismatched"
-            bias_array = np.arange(1, bias_in_size[0] + 1, dtype=np.float32)
+            bias_array = np.arange(
+                1, bias_in_size[0] + 1, dtype=np.uint8 if quantized else np.float32
+            )
             in_bias = constant_op.constant(bias_array, shape=bias_in_size, dtype=np.float32)
-            out = nn_ops.bias_add(out, in_bias)
 
-        data_array = np.reshape(data_array, tensor_in_sizes).astype(np.float32)
-        compare_tflite_with_tvm(data_array, [] if const_input else in_data.name, [in_data], [out])
+        if quantized:
+            inq_data = tf.quantization.fake_quant_with_min_max_args(
+                in_data, min=-100, max=100, name="inq_0"
+            )
+            input_range = {"inq_0": (-100, 100)}
+            inq_filter = tf.quantization.fake_quant_with_min_max_args(
+                in_filter, min=-100, max=100, name="inq_1"
+            )
+            input_range = {"inq_0": (-100, 100), "inq_1": (-100, 100)}
+            # reshape N H W C into N H*W*C
+            inq_data_reshape = array_ops.reshape(inq_data, [tensor_in_sizes[0], -1])
+            out = math_ops.mat_mul(inq_data_reshape, inq_filter)
+            out = tf.quantization.fake_quant_with_min_max_args(out, min=-100, max=100, name="out")
+
+            # if we have bias
+            if bias_in_size:
+                out = nn_ops.bias_add(out, in_bias)
+
+            compare_tflite_with_tvm(
+                data_array,
+                inq_data.name,
+                [inq_data],
+                [out],
+                quantized=True,
+                input_range=input_range,
+                experimental_new_converter=True,
+            )
+        else:
+            # reshape N H W C into N H*W*C
+            in_data_reshape = array_ops.reshape(in_data, [tensor_in_sizes[0], -1])
+            out = math_ops.mat_mul(in_data_reshape, in_filter)
+
+            # if we have bias
+            if bias_in_size:
+                out = nn_ops.bias_add(out, in_bias)
+
+            compare_tflite_with_tvm(
+                data_array, in_data.name, [in_data], [out], experimental_new_converter=True
+            )
 
 
 def test_forward_fully_connected():
     """ Fully Connected """
-    for const_input in [False, True]:
-        _test_fully_connected([1, 1, 1, 150], const_input, [150, 100])
-        _test_fully_connected([1, 1, 1, 150], const_input, [150, 100], [100])
-        _test_fully_connected([5, 1, 1, 150], const_input, [150, 100])
-        _test_fully_connected([5, 1, 1, 150], const_input, [150, 100], [100])
+    for input_shape, weight_shape, bias_shape in [
+        ([1, 4], [4, 4], None),
+        ([1, 4], [4, 4], [4]),
+        ([1, 1, 1, 5], [5, 5], None),
+        ([1, 1, 10], [10, 103], None),
+        ([1, 1, 1, 150], [150, 100], None),
+        ([1, 1, 1, 150], [150, 100], None),
+        ([1, 1, 1, 150], [150, 100], [100]),
+        ([5, 1, 1, 150], [150, 100], None),
+        ([5, 1, 1, 150], [150, 100], [100]),
+    ]:
+        for const_input in [False, True]:
+            for quantized in [False, True]:
+                _test_fully_connected(input_shape, const_input, weight_shape, bias_shape, quantized)
 
 
 #######################################################################

From 77e4fd16bfb8175c83638870af9646d1027f0de7 Mon Sep 17 00:00:00 2001
From: Dmitriy Smirnov <dmitriy.smirnov@arm.com>
Date: Mon, 11 Jan 2021 14:26:05 +0000
Subject: [PATCH 044/357] [BYOC][ACL] Depthwise convolution support (#7206)

* [BYOC][ACL] Depthwise convolution support

Added support for depthwise convolution. ACL only supports depth-wise convolution when kernel size is 3x3 and 5x5 and strides are (1, 1) or (2, 2), if this is not the case then fallback to TVM.

Also rework tests to remove non-deterministic trials.

*Compute Library for the Arm Architecture (ACL).
*All credits to Luke Hutton @lhutton1

Change-Id: Ida1f5802a65377b84325edf14a0149242c1af857

* linter

* CHECK -> ICHECK

Co-authored-by: Luke Hutton <luke.hutton@arm.com>
---
 docs/deploy/arm_compute_lib.rst               |   8 +-
 .../tvm/relay/op/contrib/arm_compute_lib.py   | 109 ++++++-
 python/tvm/relay/testing/__init__.py          |   6 +-
 .../contrib/arm_compute_lib/codegen.cc        |  48 ++-
 .../contrib/arm_compute_lib/acl_runtime.cc    |  69 ++++-
 .../contrib/arm_compute_lib/acl_utils.cc      |  10 +
 .../contrib/arm_compute_lib/acl_utils.h       |   9 +
 .../test_arm_compute_lib/infrastructure.py    |  42 ---
 .../test_arm_compute_lib/test_conv2d.py       | 283 +++++++++++-------
 .../test_arm_compute_lib/test_dense.py        |  83 +++--
 .../test_arm_compute_lib/test_network.py      |   4 +-
 11 files changed, 450 insertions(+), 221 deletions(-)

diff --git a/docs/deploy/arm_compute_lib.rst b/docs/deploy/arm_compute_lib.rst
index a2eaa5fb5662..5d11241c1a34 100644
--- a/docs/deploy/arm_compute_lib.rst
+++ b/docs/deploy/arm_compute_lib.rst
@@ -15,7 +15,7 @@
     specific language governing permissions and limitations
     under the License.
 
-Relay Arm :sup:`®` Compute Library Integration
+Relay Arm:sup:`®` Compute Library Integration
 ==============================================
 **Author**: `Luke Hutton <https://github.com/lhutton1>`_
 
@@ -195,12 +195,14 @@ Operator support
 |                      |   Simple: nn.conv2d                                                     |
 |                      |   Composite: nn.pad?, nn.conv2d, nn.bias_add?, nn.relu?                 |
 |                      |                                                                         |
-|                      | (only groups = 1 supported)                                             |
+|                      | Normal and depth-wise (when kernel is 3x3 or 5x5 and strides are 1x1    |
+|                      | or 2x2) convolution supported. Grouped convolution is not supported.    |
 +----------------------+-------------------------------------------------------------------------+
 | qnn.conv2d           | uint8:                                                                  |
 |                      |   Composite: nn.pad?, nn.conv2d, nn.bias_add?, nn.relu?, qnn.requantize |
 |                      |                                                                         |
-|                      | (only groups = 1 supported)                                             |
+|                      | Normal and depth-wise (when kernel is 3x3 or 5x5 and strides are 1x1    |
+|                      | or 2x2) convolution supported. Grouped convolution is not supported.    |
 +----------------------+-------------------------------------------------------------------------+
 | nn.dense             | fp32:                                                                   |
 |                      |   Simple: nn.dense                                                      |
diff --git a/python/tvm/relay/op/contrib/arm_compute_lib.py b/python/tvm/relay/op/contrib/arm_compute_lib.py
index a78ad294b770..8a03cb173612 100644
--- a/python/tvm/relay/op/contrib/arm_compute_lib.py
+++ b/python/tvm/relay/op/contrib/arm_compute_lib.py
@@ -19,12 +19,15 @@
 import numpy as np
 import tvm
 
+from tvm._ffi import register_func
 from tvm.relay.expr import const
 from tvm.relay import transform
 from tvm.relay.build_module import bind_params_by_name
+from tvm.relay.testing.temp_op_attr import TempOpAttr
 
 from ...dataflow_pattern import wildcard, is_op, is_constant, is_expr
 from .register import register_pattern_table
+from ..strategy.generic import is_depthwise_conv2d
 
 
 def is_arm_compute_runtime_enabled():
@@ -71,6 +74,61 @@ def partition_for_arm_compute_lib(mod, params=None):
     return seq(mod)
 
 
+@register_func("relay.ext.arm_compute_lib.optimize")
+def preprocess_module(mod):
+    """
+    Pre-process a module containing functions ready for ACL codegen. For now we enforce OHWI
+    kernel layout and fold the transforms away.
+
+    Parameters
+    ----------
+    mod : Module
+        The module to run passes on.
+
+    Returns
+    -------
+    preprocessed_mod : The processed module.
+    """
+
+    def convert_layout_conv2d(conv2d_function):
+        def convert_conv(attrs, inputs, tinfos, desired_layouts):
+            new_attrs = dict(attrs)
+            data_info = tinfos[0]
+            weight_info = tinfos[1]
+            desired_data_layout, desired_kernel_layout = map(str, desired_layouts)
+            new_attrs["data_layout"] = desired_data_layout
+            new_attrs["kernel_layout"] = desired_kernel_layout
+
+            if is_depthwise_conv2d(
+                data_info.shape,
+                attrs["data_layout"],
+                weight_info.shape,
+                attrs["kernel_layout"],
+                attrs["groups"],
+            ):
+                dkl = desired_kernel_layout
+                new_attrs["kernel_layout"] = dkl[3] + dkl[1:3] + dkl[0]
+            return conv2d_function(*inputs, **new_attrs)
+
+        return convert_conv
+
+    with TempOpAttr(
+        "nn.conv2d", "FTVMConvertOpLayout", convert_layout_conv2d(tvm.relay.nn.conv2d)
+    ), TempOpAttr(
+        "qnn.conv2d", "FTVMConvertOpLayout", convert_layout_conv2d(tvm.relay.qnn.op.conv2d)
+    ):
+        seq = tvm.transform.Sequential(
+            [
+                transform.ConvertLayout(
+                    {"nn.conv2d": ["NHWC", "OHWI"], "qnn.conv2d": ["NHWC", "OHWI"]}
+                ),
+                transform.FoldConstant(),
+            ]
+        )
+        preprocessed_mod = seq(mod)
+    return preprocessed_mod
+
+
 @register_pattern_table("arm_compute_lib")
 def arm_compute_lib_pattern_table():
     """Get the ACL pattern table."""
@@ -236,8 +294,6 @@ def _func_wrapper(expr):
 def conv2d(expr):
     """Check if the external ACL codegen for conv2d should be used."""
     attrs, args = expr.attrs, expr.args
-    if attrs.groups != 1:
-        return False
     if attrs.data_layout != "NHWC":
         return False
     if attrs.out_dtype != "float32" and attrs.out_dtype != "":
@@ -248,14 +304,25 @@ def conv2d(expr):
     kernel_typ = args[1].checked_type
     if len(kernel_typ.shape) != 4 or kernel_typ.dtype != "float32":
         return False
+    is_depthwise = is_depthwise_conv2d(
+        data_typ.shape,
+        attrs["data_layout"],
+        kernel_typ.shape,
+        attrs["kernel_layout"],
+        attrs["groups"],
+    )
+    if is_depthwise:
+        return depthwise_conv2d(attrs, args)
+    # ACL doesn't support grouped convolution
+    if attrs.groups != 1 and not is_depthwise:
+        return False
     return True
 
 
 def qnn_conv2d(expr):
     """Check if the external ACL codegen for qnn.conv2d should be used."""
     attrs, args = expr.attrs, expr.args
-    if attrs.groups != 1:
-        return False
+
     if attrs.data_layout != "NHWC":
         return False
     if attrs.out_dtype != "int32" and attrs.out_dtype != "":
@@ -266,6 +333,40 @@ def qnn_conv2d(expr):
     kernel_typ = args[1].checked_type
     if len(kernel_typ.shape) != 4 or kernel_typ.dtype != "uint8":
         return False
+    is_depthwise = is_depthwise_conv2d(
+        data_typ.shape,
+        attrs["data_layout"],
+        kernel_typ.shape,
+        attrs["kernel_layout"],
+        attrs["groups"],
+    )
+    if is_depthwise:
+        return depthwise_conv2d(attrs, args)
+    # ACL doesn't support grouped convolution
+    if attrs.groups != 1 and not is_depthwise:
+        return False
+    return True
+
+
+def depthwise_conv2d(attrs, args):
+    """Check if the external ACL codegen for depthwise convolution should be used.
+
+    Note
+    ----
+    Relay does not have a depthwise conv2d operator whilst ACL does. We simply
+    separate the checks for depthwise for clarity.
+    """
+    kernel_typ = args[1].checked_type
+    # Only supports 3x3, 5x5 depthwise
+    if (
+        kernel_typ.shape[0] not in [3, 5]
+        or kernel_typ.shape[1] not in [3, 5]
+        or kernel_typ.shape[0] != kernel_typ.shape[1]
+    ):
+        return False
+    # Stride must be (1, 1) or (2, 2)
+    if (attrs.strides[0], attrs.strides[1]) not in [(1, 1), (2, 2)]:
+        return False
     return True
 
 
diff --git a/python/tvm/relay/testing/__init__.py b/python/tvm/relay/testing/__init__.py
index 0b81cb9c7ec6..f0c79bed1218 100644
--- a/python/tvm/relay/testing/__init__.py
+++ b/python/tvm/relay/testing/__init__.py
@@ -22,9 +22,9 @@
 
 import tvm
 from tvm import te
-import tvm.relay as relay
-import tvm.relay.op as op
-from tvm.relay import Prelude
+from tvm import relay
+from tvm.relay import op
+from tvm.relay.prelude import Prelude
 from tvm.testing import enabled_targets
 
 from . import mlp
diff --git a/src/relay/backend/contrib/arm_compute_lib/codegen.cc b/src/relay/backend/contrib/arm_compute_lib/codegen.cc
index a963242f82d5..e0669ae64bdb 100644
--- a/src/relay/backend/contrib/arm_compute_lib/codegen.cc
+++ b/src/relay/backend/contrib/arm_compute_lib/codegen.cc
@@ -24,6 +24,7 @@
 #include <tvm/ir/module.h>
 #include <tvm/relay/attrs/nn.h>
 #include <tvm/relay/type.h>
+#include <tvm/tir/analysis.h>
 
 #include <memory>
 #include <string>
@@ -126,7 +127,7 @@ class ACLJSONSerializer : public backend::contrib::JSONSerializer {
       nodes.activation = current_call;
       current_call = current_call->args[0].as<CallNode>();
     }
-    if (backend::IsOp(current_call, "nn.bias_add")) {
+    if (backend::IsOp(current_call, "add")) {
       nodes.bias = current_call;
       current_call = current_call->args[0].as<CallNode>();
     }
@@ -154,19 +155,32 @@ class ACLJSONSerializer : public backend::contrib::JSONSerializer {
    */
   std::shared_ptr<JSONGraphNode> CreateCompositeConvJSONNode(const CallNode* cn) {
     CompositeConvNode nodes = UnpackCompositeConvolution(cn);
-    std::string name = "nn.conv2d";
 
     const auto* conv_attr = nodes.conv->attrs.as<Conv2DAttrs>();
     ICHECK(conv_attr);
-    ICHECK(conv_attr->kernel_layout == "OHWI")
-        << "Kernel layout must be OHWI, has the module been pre-processed correctly?";
+
+    std::string name;
+    std::string name_prefix = "nn";
+
+    // Distinguish between normal and depth-wise convolution
+    if (conv_attr->channels.defined() &&
+        tvm::tir::ExprDeepEqual()(conv_attr->channels, conv_attr->groups) &&
+        conv_attr->groups != 1) {
+      name = "depthwise_conv2d";
+      ICHECK(conv_attr->kernel_layout == "IHWO")
+          << "Kernel layout must be IHWO, has the module been pre-processed correctly?";
+    } else {
+      name = "conv2d";
+      ICHECK(conv_attr->kernel_layout == "OHWI")
+          << "Kernel layout must be OHWI, has the module been pre-processed correctly?";
+    }
 
     // Inputs must be added in the same order they appear in the relay graph.
     std::vector<JSONGraphNodeEntry> inputs;
     inputs.push_back(VisitExpr(cn->args[0])[0]);
     inputs.push_back(VisitExpr(nodes.conv->args[1])[0]);
     if (nodes.requantize) {
-      name = "qnn.conv2d";
+      name_prefix = "qnn";
       inputs.push_back(VisitExpr(nodes.conv->args[2])[0]);  // input zero-point
       inputs.push_back(VisitExpr(nodes.conv->args[3])[0]);  // kernel zero-point
       inputs.push_back(VisitExpr(nodes.conv->args[4])[0]);  // input scale
@@ -180,7 +194,7 @@ class ACLJSONSerializer : public backend::contrib::JSONSerializer {
       inputs.push_back(VisitExpr(nodes.requantize->args[4])[0]);  // output zero-point
     }
 
-    auto json_node = std::make_shared<JSONGraphNode>(name, "kernel", inputs, 1);
+    auto json_node = std::make_shared<JSONGraphNode>(name_prefix + "." + name, "kernel", inputs, 1);
     SetCallNodeAttribute(json_node, nodes.conv);
 
     // Override attributes
@@ -224,10 +238,11 @@ class ACLJSONSerializer : public backend::contrib::JSONSerializer {
       nodes.requantize = current_call;
       current_call = current_call->args[0].as<CallNode>();
     }
-    if (backend::IsOp(current_call, "nn.bias_add")) {
+    if (backend::IsOp(current_call, "add")) {
       nodes.bias = current_call;
       current_call = current_call->args[0].as<CallNode>();
     }
+
     // Enforce a dense node exists at this point during traversal
     if (nodes.requantize) {
       ICHECK(backend::IsOp(current_call, "qnn.dense"));
@@ -329,25 +344,6 @@ class ACLJSONSerializer : public backend::contrib::JSONSerializer {
   }
 };
 
-/*!
- * \brief Pre-process a module containing functions ready for ACL codegen.
- *
- * For now we enforce OHWI kernel layout and fold the transforms away.
- *
- * \param mod The module to be pre-processed.
- * \return The processed module.
- */
-IRModule PreProcessModule(const IRModule& mod) {
-  IRModule preprocessed_module;
-  tvm::Map<String, Array<String>> desired_layouts = {{"nn.conv2d", {"NHWC", "OHWI"}},
-                                                     {"qnn.conv2d", {"NHWC", "OHWI"}}};
-  preprocessed_module = transform::ConvertLayout(desired_layouts)(mod);
-  preprocessed_module = transform::FoldConstant()(preprocessed_module);
-  return preprocessed_module;
-}
-
-TVM_REGISTER_GLOBAL("relay.ext.arm_compute_lib.optimize").set_body_typed(PreProcessModule);
-
 /*!
  * \brief Create a runtime module for ACL.
  *
diff --git a/src/runtime/contrib/arm_compute_lib/acl_runtime.cc b/src/runtime/contrib/arm_compute_lib/acl_runtime.cc
index 09879bdc6e95..ed8f6adbd083 100644
--- a/src/runtime/contrib/arm_compute_lib/acl_runtime.cc
+++ b/src/runtime/contrib/arm_compute_lib/acl_runtime.cc
@@ -32,6 +32,7 @@
 #include <arm_compute/core/Types.h>
 #include <arm_compute/runtime/NEON/functions/NEArithmeticAddition.h>
 #include <arm_compute/runtime/NEON/functions/NEConvolutionLayer.h>
+#include <arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h>
 #include <arm_compute/runtime/NEON/functions/NEElementwiseOperations.h>
 #include <arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h>
 #include <arm_compute/runtime/NEON/functions/NEPoolingLayer.h>
@@ -131,6 +132,9 @@ class ACLRuntime : public JSONRuntimeBase {
         if ("nn.conv2d" == op_name || "qnn.conv2d" == op_name) {
           CreateConvolution2DLayer(&layer_, node, mm);
           num_pools++;
+        } else if ("nn.depthwise_conv2d" == op_name || "qnn.depthwise_conv2d" == op_name) {
+          CreateDepthwiseConvolution2DLayer(&layer_, node, mm);
+          num_pools++;
         } else if ("nn.dense" == op_name || "qnn.dense" == op_name) {
           CreateFullyConnectedLayer(&layer_, node, mm);
           num_pools++;
@@ -227,12 +231,7 @@ class ACLRuntime : public JSONRuntimeBase {
     arm_compute::ActivationLayerInfo act_info;
     if (node.HasAttr("activation_type")) {
       std::string activation_type = node.GetAttr<std::vector<std::string>>("activation_type")[0];
-      if (activation_type == "relu") {
-        act_info = arm_compute::ActivationLayerInfo(
-            arm_compute::ActivationLayerInfo::ActivationFunction::RELU);
-      } else {
-        LOG(FATAL) << "Unsupported activation function";
-      }
+      act_info = MakeACLActivationInfo(activation_type);
     }
 
     arm_compute::Size2D dilation_2d(std::stoi(dilation[0]), std::stoi(dilation[1]));
@@ -269,6 +268,64 @@ class ACLRuntime : public JSONRuntimeBase {
     layer->function = function;
   }
 
+  /*!
+   * \brief Create a 2D depthwise convolution layer.
+   *
+   * \param layer The ACL layer to build. Containing inputs, outputs and the ACL function.
+   * \param node The JSON representation of the operator.
+   * \param mm The ACL conv2d layer can request auxiliary memory from TVM.
+   */
+  void CreateDepthwiseConvolution2DLayer(
+      CachedLayer* layer, const JSONGraphNode& node,
+      const std::shared_ptr<arm_compute::MemoryManagerOnDemand>& mm) {
+    std::vector<std::string> padding = node.GetAttr<std::vector<std::string>>("padding");
+    std::vector<std::string> strides = node.GetAttr<std::vector<std::string>>("strides");
+    std::vector<std::string> dilation = node.GetAttr<std::vector<std::string>>("dilation");
+    arm_compute::PadStrideInfo pad_stride_info = MakeACLPadStride(padding, strides);
+
+    arm_compute::ActivationLayerInfo act_info;
+    if (node.HasAttr("activation_type")) {
+      std::string activation_type = node.GetAttr<std::vector<std::string>>("activation_type")[0];
+      act_info = MakeACLActivationInfo(activation_type);
+    }
+
+    arm_compute::Size2D dilation_2d(std::stoi(dilation[0]), std::stoi(dilation[1]));
+
+    // Collect inputs and outputs, handling both nn.conv2d and qnn.conv2d cases.
+    std::vector<JSONGraphNodeEntry> inputs = node.GetInputs();
+    size_t num_inputs = inputs.size();
+    bool has_bias;
+    if (node.GetOpName() == "qnn.depthwise_conv2d") {
+      ICHECK(num_inputs >= 8U && num_inputs <= 9U)
+          << "Quantized convolution requires 9 inputs with a bias, 8 inputs without.";
+      has_bias = num_inputs == 9;
+      layer->inputs.push_back(MakeACLTensorFromJSONEntry(inputs[0], &inputs[4], &inputs[2]));
+      layer->inputs.push_back(MakeACLTensorFromJSONEntry(inputs[1], &inputs[5], &inputs[3]));
+      if (has_bias) {
+        layer->inputs.push_back(MakeACLTensorFromJSONEntry(inputs[6]));
+      }
+      layer->outputs.push_back(
+          MakeACLTensorFromJSONNode(node, &inputs[6 + has_bias], &inputs[7 + has_bias]));
+    } else {
+      ICHECK(num_inputs >= 2U && num_inputs <= 3U)
+          << "Convolution requires 3 inputs with a bias, 2 inputs without.";
+      has_bias = num_inputs == 3;
+      for (const auto& i : inputs) {
+        layer->inputs.push_back(MakeACLTensorFromJSONEntry(i));
+      }
+      layer->outputs.push_back(MakeACLTensorFromJSONNode(node));
+    }
+
+    // Depth multiplier is the final dimension in acl weights tensor (IWH*M*)
+    int depth_multiplier = layer->inputs[1].info()->tensor_shape()[3];
+
+    auto function = std::make_shared<arm_compute::NEDepthwiseConvolutionLayer>(mm);
+    function->configure(&layer->inputs[0], &layer->inputs[1],
+                        has_bias ? &layer->inputs[2] : nullptr, &layer->outputs[0], pad_stride_info,
+                        depth_multiplier, act_info, dilation_2d);
+    layer->function = function;
+  }
+
   /*!
    * \brief Create a fully connected (dense) layer.
    *
diff --git a/src/runtime/contrib/arm_compute_lib/acl_utils.cc b/src/runtime/contrib/arm_compute_lib/acl_utils.cc
index 604c619bf49c..3b2620987ab0 100644
--- a/src/runtime/contrib/arm_compute_lib/acl_utils.cc
+++ b/src/runtime/contrib/arm_compute_lib/acl_utils.cc
@@ -134,6 +134,16 @@ arm_compute::DataType MakeACLDataType(const DLDataType& data_type) {
   }
 }
 
+arm_compute::ActivationLayerInfo MakeACLActivationInfo(const std::string& activation_type) {
+  auto act_func = arm_compute::ActivationLayerInfo::ActivationFunction::IDENTITY;
+  if (activation_type == "relu") {
+    act_func = arm_compute::ActivationLayerInfo::ActivationFunction::RELU;
+  } else {
+    LOG(FATAL) << "Activation " << activation_type << " unsupported by ACL runtime";
+  }
+  return {act_func};
+}
+
 template <typename T>
 std::vector<T> GetVectorFromDLTensor(const DLTensor* tensor) {
   ICHECK(tensor) << "Cannot convert a nullptr";
diff --git a/src/runtime/contrib/arm_compute_lib/acl_utils.h b/src/runtime/contrib/arm_compute_lib/acl_utils.h
index 576ed916ff60..dbb006fbb347 100644
--- a/src/runtime/contrib/arm_compute_lib/acl_utils.h
+++ b/src/runtime/contrib/arm_compute_lib/acl_utils.h
@@ -108,6 +108,15 @@ arm_compute::PadStrideInfo MakeACLPadStride(const std::vector<std::string>& pad,
  */
 arm_compute::DataType MakeACLDataType(const DLDataType& data_type);
 
+/*!
+ * \brief Convert string to arm_compute::ActivationLayerInfo
+ *
+ * \param activation_type A string representing activation function.
+ * Currently supports the following options: "relu".
+ * \return arm_compute::ActivationLayerInfo.
+ */
+arm_compute::ActivationLayerInfo MakeACLActivationInfo(const std::string& activation_type);
+
 /*!
  * \brief Get a vector from DLTensor data.
  * \note Performs a copy of data.
diff --git a/tests/python/contrib/test_arm_compute_lib/infrastructure.py b/tests/python/contrib/test_arm_compute_lib/infrastructure.py
index c5d711d7afa3..80cd5847440e 100644
--- a/tests/python/contrib/test_arm_compute_lib/infrastructure.py
+++ b/tests/python/contrib/test_arm_compute_lib/infrastructure.py
@@ -303,45 +303,3 @@ def verify_codegen(
             f"Actual={codegen_str} \n"
             f"Expected={known_good_codegen_str}"
         )
-
-
-def generate_trials(space, r_factor=3):
-    """Generates a series of trials.
-
-    This algorithm generates a series of non-deterministic trials given a
-    space of options to test. A trial is generated by pulling a value from
-    each option in the space. On some occasions the values are shuffled to
-    ensure a different trial on each r_factor iteration. The algorithm ensures
-    that each value from an option is used at least once. The total number of
-    trials is determined by the r_factor * the option with the largest number
-    of values.
-
-    Parameters
-    ----------
-    space: List[List[Any]]
-        A list of different options with varying values to test.
-    r_factor: (optional) int
-        The repeat factor.
-
-    Returns
-    -------
-    A list of trials specifying values for each option.
-
-    """
-    np.random.seed(0)
-    max_len = 1
-    for option in space:
-        max_len = max(max_len, len(option))
-
-    num_trials = r_factor * max_len
-    trials = []
-    for i in range(num_trials):
-        trial = []
-        for option in space:
-            if i % len(option) == 0:
-                np.random.shuffle(option)
-            trial.append(option[i % len(option)])
-
-        trials.append(trial)
-
-    return trials
diff --git a/tests/python/contrib/test_arm_compute_lib/test_conv2d.py b/tests/python/contrib/test_arm_compute_lib/test_conv2d.py
index 4496a2a1afa9..cc5bbfec7c69 100644
--- a/tests/python/contrib/test_arm_compute_lib/test_conv2d.py
+++ b/tests/python/contrib/test_arm_compute_lib/test_conv2d.py
@@ -21,15 +21,14 @@
 import tvm
 from tvm import relay
 
-from .infrastructure import (
+from test_arm_compute_lib.infrastructure import (
     skip_runtime_test,
     skip_codegen_test,
     build_and_run,
     verify,
     verify_codegen,
-    generate_trials,
 )
-from .infrastructure import Device
+from test_arm_compute_lib.infrastructure import Device
 
 
 def _get_model(
@@ -57,7 +56,12 @@ def _get_model(
         if len(padding) == 2:
             padding = (padding[0], padding[1], padding[0], padding[1])
         shape = (shape[0], shape[1] + padding[0] * 2, shape[2] + padding[1] * 2, shape[3])
-    weight_shape = (kernel_h, kernel_w, shape[3] // groups, channels)
+    is_depthwise = shape[3] == channels == groups
+    weight_format = "HWOI" if is_depthwise else "HWIO"
+    if weight_format == "HWIO":
+        weight_shape = (kernel_h, kernel_w, shape[3] // groups, channels)
+    else:
+        weight_shape = (kernel_h, kernel_w, channels, shape[3] // groups)
     w = tvm.nd.array(np.random.uniform(-128, 127, weight_shape).astype(dtype))
     weights = relay.const(w, dtype)
     out = relay.nn.conv2d(
@@ -65,7 +69,7 @@ def _get_model(
         weights,
         kernel_size=(kernel_h, kernel_w),
         data_layout="NHWC",
-        kernel_layout="HWIO",
+        kernel_layout=weight_format,
         dilation=dilation,
         strides=strides,
         padding=padding,
@@ -75,7 +79,8 @@ def _get_model(
     )
     params = {"w": w}
     if has_bias:
-        b = tvm.nd.array(np.random.uniform(-128, 127, weight_shape[3]).astype(dtype))
+        bias_shape = weight_shape[2] if is_depthwise else weight_shape[3]
+        b = tvm.nd.array(np.random.uniform(-128, 127, bias_shape).astype(dtype))
         biasc = relay.const(b, dtype)
         out = relay.nn.bias_add(out, biasc, axis=3)
         params["b"] = b
@@ -134,7 +139,12 @@ def _get_qnn_model(
         if len(padding) == 2:
             padding = (padding[0], padding[1], padding[0], padding[1])
         shape = (shape[0], shape[1] + padding[0] * 2, shape[2] + padding[1] * 2, shape[3])
-    weight_shape = (kernel_h, kernel_w, shape[3] // groups, channels)
+    is_depthwise = shape[3] == channels == groups
+    weight_format = "HWOI" if is_depthwise else "HWIO"
+    if weight_format == "HWIO":
+        weight_shape = (kernel_h, kernel_w, shape[3] // groups, channels)
+    else:
+        weight_shape = (kernel_h, kernel_w, channels, shape[3] // groups)
     w = tvm.nd.array(np.random.uniform(0, 255, weight_shape).astype(dtype))
     weights = relay.const(w, dtype)
     out = relay.qnn.op.conv2d(
@@ -146,7 +156,7 @@ def _get_qnn_model(
         kernel_scale=relay.const(kernel_sc, "float32"),
         kernel_size=(kernel_h, kernel_w),
         data_layout="NHWC",
-        kernel_layout="HWIO",
+        kernel_layout=weight_format,
         dilation=dilation,
         strides=strides,
         padding=padding,
@@ -156,7 +166,8 @@ def _get_qnn_model(
     )
     params = {"w": w}
     if has_bias:
-        b = tvm.nd.array(np.random.uniform(0, 255, weight_shape[3]).astype("int32"))
+        bias_shape = weight_shape[2] if is_depthwise else weight_shape[3]
+        b = tvm.nd.array(np.random.uniform(-128, 127, bias_shape).astype("int32"))
         biasc = relay.const(b, "int32")
         out = relay.nn.bias_add(out, biasc, axis=3)
         params["b"] = b
@@ -188,21 +199,30 @@ def _get_expected_codegen(
 ):
     if len(padding) == 2:
         padding = (padding[0], padding[1], padding[0], padding[1])
-    weight_shape = (channels, kernel_h, kernel_w, shape[3] // groups)
     output_height = ((shape[1] - kernel_h + padding[0] + padding[2]) / strides[0]) + 1
     output_width = ((shape[2] - kernel_w + padding[1] + padding[3]) / strides[1]) + 1
     output_shape = (1, int(output_height), int(output_width), channels)
     out_dtype = "int32" if dtype == "uint8" else "float32"
+    is_depthwise = shape[3] == channels == groups
+    weight_format = "IHWO" if is_depthwise else "OHWI"
+    if weight_format == "IHWO":
+        weight_shape = (shape[3] // groups, kernel_h, kernel_w, channels)
+    else:
+        weight_shape = (channels, kernel_h, kernel_w, shape[3] // groups)
+    if is_depthwise:
+        name = "nn.depthwise_conv2d"
+    else:
+        name = "nn.conv2d"
 
     node = {
         "op": "kernel",
-        "name": "nn.conv2d",
+        "name": name,
         "inputs": [],
         "attrs": {
-            "groups": [["1"]],
+            "groups": [[str(groups)]],
             "num_outputs": "1",
             "data_layout": [["NHWC"]],
-            "kernel_layout": [["OHWI"]],
+            "kernel_layout": [[weight_format]],
             "channels": [[str(channels)]],
             "dilation": [[str(dilation[0]), str(dilation[1])]],
             "out_layout": [[""]],
@@ -229,7 +249,7 @@ def _get_expected_codegen(
 
     # qnn.conv2d params, input and kernel
     if dtype == "uint8":
-        node["name"] = "qnn.conv2d"
+        node["name"] = "qnn." + node["name"].split(".")[1]
         for param_dtype in ["int32", "float32"]:
             for _ in range(2):
                 inputs.append(
@@ -246,7 +266,10 @@ def _get_expected_codegen(
             {
                 "op": "const",
                 "name": "",
-                "attrs": {"shape": [[[weight_shape[0]]]], "dtype": [[bias_dtype]]},
+                "attrs": {
+                    "shape": [[[1, 1, 1, weight_shape[3] if is_depthwise else weight_shape[0]]]],
+                    "dtype": [[bias_dtype]],
+                },
             }
         )
 
@@ -275,29 +298,43 @@ def test_conv2d():
     device = Device()
     np.random.seed(0)
 
-    kernel_hs = [1, 2, 3, 5]
-    kernel_ws = [1, 2, 3, 5]
-    pad = [(1, 1), (2, 2), (2, 1)]
-    strides = [(1, 1), (2, 2)]
-    dilation = [(1, 1)]
-    out_channels = [4, 7, 16]
-    input_shapes = [(10, 10, 14), (12, 15, 16), (20, 20, 20)]
-    # composite operator (pad, bias, activation)
-    composite = [
-        (False, False, False),
-        (False, True, False),
-        (False, False, True),
-        (False, True, True),
-        (True, False, False),
-    ]
     dtype = "float32"
-    trials = generate_trials(
-        [kernel_hs, kernel_ws, pad, strides, dilation, out_channels, input_shapes, composite], 3
-    )
+    trials = [
+        # Normal convolution
+        [2, 2, (1, 1), (1, 1), (1, 1), 4, (10, 10, 14), (False, False, False), False],
+        [2, 1, (2, 2), (1, 1), (1, 1), 7, (12, 15, 16), (False, False, True), False],
+        [3, 3, (2, 1), (1, 1), (1, 1), 4, (10, 10, 14), (False, True, False), False],
+        [3, 3, (1, 1), (1, 1), (1, 1), 16, (12, 15, 16), (False, False, False), False],
+        [5, 5, (1, 1), (2, 2), (1, 1), 4, (10, 10, 14), (True, False, False), False],
+        [1, 3, (1, 1), (1, 1), (1, 1), 7, (20, 20, 20), (False, False, True), False],
+        [2, 2, (2, 2), (1, 1), (1, 1), 4, (20, 20, 20), (False, True, False), False],
+        [5, 5, (1, 1), (2, 2), (1, 1), 4, (10, 10, 14), (True, False, False), False],
+        [3, 3, (2, 1), (1, 1), (1, 1), 7, (20, 20, 20), (False, False, False), False],
+        [3, 3, (1, 1), (2, 2), (1, 1), 16, (10, 10, 14), (False, True, True), False],
+        # Depth-wise convolution
+        [3, 3, (1, 1), (1, 1), (1, 1), 20, (20, 20, 20), (False, False, True), True],
+        [5, 5, (2, 2), (1, 1), (1, 1), 20, (20, 20, 20), (False, True, False), True],
+        [3, 3, (2, 2), (2, 2), (1, 1), 14, (10, 10, 14), (True, False, False), True],
+        [5, 5, (0, 0), (1, 1), (1, 1), 20, (20, 20, 20), (False, False, False), True],
+        [3, 3, (1, 1), (2, 2), (1, 1), 14, (10, 10, 14), (False, True, True), True],
+    ]
 
-    for kernel_h, kernel_w, pad, stride, dilation, out_channels, input_shapes, composite in trials:
-        groups = 1
-        shape = (1, *input_shapes)
+    for (
+        kernel_h,
+        kernel_w,
+        pad,
+        stride,
+        dilation,
+        out_channels,
+        shape,
+        composite,
+        is_depthwise,
+    ) in trials:
+        shape = (1, *shape)
+        if is_depthwise:
+            groups = shape[3]
+        else:
+            groups = 1
         outputs = []
         inputs = {
             "a": tvm.nd.array(np.random.uniform(-128, 127, shape).astype(dtype)),
@@ -338,31 +375,43 @@ def test_codegen_conv2d():
     if skip_codegen_test():
         return
 
-    np.random.seed(0)
-
-    kernel_hs = [1, 2, 3, 5]
-    kernel_ws = [1, 2, 3, 5]
-    pad = [(1, 1), (2, 2), (2, 1)]
-    strides = [(1, 1), (2, 2)]
-    dilation = [(1, 1)]
-    out_channels = [4, 7, 16]
-    input_shapes = [(10, 10, 14), (12, 15, 16), (20, 20, 20)]
-    # composite operator (pad, bias, activation)
-    composite = [
-        (False, False, False),
-        (False, True, False),
-        (False, False, True),
-        (False, True, True),
-        (True, False, False),
-    ]
     dtype = "float32"
-    trials = generate_trials(
-        [kernel_hs, kernel_ws, pad, strides, dilation, out_channels, input_shapes, composite], 3
-    )
+    trials = [
+        # Normal convolution
+        [2, 2, (1, 1), (1, 1), (1, 1), 4, (10, 10, 14), (False, False, False), False],
+        [2, 1, (2, 2), (1, 1), (1, 1), 7, (12, 15, 16), (False, False, True), False],
+        [3, 3, (2, 1), (1, 1), (1, 1), 4, (10, 10, 14), (False, True, False), False],
+        [3, 3, (1, 1), (1, 1), (1, 1), 16, (12, 15, 16), (False, False, False), False],
+        [5, 5, (1, 1), (2, 2), (1, 1), 4, (10, 10, 14), (True, False, False), False],
+        [1, 3, (1, 1), (1, 1), (1, 1), 7, (20, 20, 20), (False, False, True), False],
+        [2, 2, (2, 2), (1, 1), (1, 1), 4, (20, 20, 20), (False, True, False), False],
+        [5, 5, (1, 1), (2, 2), (1, 1), 4, (10, 10, 14), (True, False, False), False],
+        [3, 3, (2, 1), (1, 1), (1, 1), 7, (20, 20, 20), (False, False, False), False],
+        [3, 3, (1, 1), (2, 2), (1, 1), 16, (10, 10, 14), (False, True, True), False],
+        # Depth-wise convolution
+        [3, 3, (1, 1), (1, 1), (1, 1), 20, (20, 20, 20), (False, False, True), True],
+        [5, 5, (2, 2), (1, 1), (1, 1), 20, (20, 20, 20), (False, True, False), True],
+        [3, 3, (2, 2), (2, 2), (1, 1), 14, (10, 10, 14), (True, False, False), True],
+        [5, 5, (0, 0), (1, 1), (1, 1), 20, (20, 20, 20), (False, False, False), True],
+        [3, 3, (1, 1), (2, 2), (1, 1), 14, (10, 10, 14), (False, True, True), True],
+    ]
 
-    for kernel_h, kernel_w, pad, stride, dilation, out_channels, input_shapes, composite in trials:
-        groups = 1
-        shape = (1, *input_shapes)
+    for (
+        kernel_h,
+        kernel_w,
+        pad,
+        stride,
+        dilation,
+        out_channels,
+        shape,
+        composite,
+        is_depthwise,
+    ) in trials:
+        shape = (1, *shape)
+        if is_depthwise:
+            groups = shape[3]
+        else:
+            groups = 1
         inputs = {"a"}
 
         args = (shape, kernel_h, kernel_w, pad, stride, dilation, groups, dtype, out_channels)
@@ -389,29 +438,43 @@ def test_qnn_conv2d():
     device = Device()
     np.random.seed(0)
 
-    kernel_hs = [1, 2, 3, 5]
-    kernel_ws = [1, 2, 3, 5]
-    pad = [(1, 1), (2, 2)]
-    strides = [(1, 1), (2, 2)]
-    dilation = [(1, 1)]
-    out_channels = [4, 7, 16]
-    input_shapes = [(10, 10, 14), (12, 15, 16), (20, 20, 20)]
-    # composite operator (pad, bias, activation)
-    composite = [
-        (False, False, False),
-        (False, True, False),
-        (False, False, True),
-        (False, True, True),
-        (True, False, False),
-    ]
     dtype = "uint8"
-    trials = generate_trials(
-        [kernel_hs, kernel_ws, pad, strides, dilation, out_channels, input_shapes, composite], 3
-    )
+    trials = [
+        # Normal convolution
+        [2, 2, (1, 1), (1, 1), (1, 1), 4, (10, 10, 14), (False, False, False), False],
+        [2, 1, (2, 2), (1, 1), (1, 1), 7, (12, 15, 16), (False, False, True), False],
+        [3, 3, (2, 1), (1, 1), (1, 1), 4, (10, 10, 14), (False, True, False), False],
+        [3, 3, (1, 1), (1, 1), (1, 1), 16, (12, 15, 16), (False, False, False), False],
+        [5, 5, (1, 1), (2, 2), (1, 1), 4, (10, 10, 14), (True, False, False), False],
+        [1, 3, (1, 1), (1, 1), (1, 1), 7, (20, 20, 20), (False, False, True), False],
+        [2, 2, (2, 2), (1, 1), (1, 1), 4, (20, 20, 20), (False, True, False), False],
+        [5, 5, (1, 1), (2, 2), (1, 1), 4, (10, 10, 14), (True, False, False), False],
+        [3, 3, (2, 1), (1, 1), (1, 1), 7, (20, 20, 20), (False, False, False), False],
+        [3, 3, (1, 1), (2, 2), (1, 1), 16, (10, 10, 14), (False, True, True), False],
+        # Depth-wise convolution
+        [3, 3, (1, 1), (1, 1), (1, 1), 20, (20, 20, 20), (False, False, True), True],
+        [5, 5, (2, 2), (1, 1), (1, 1), 20, (20, 20, 20), (False, True, False), True],
+        [3, 3, (2, 2), (2, 2), (1, 1), 14, (10, 10, 14), (True, False, False), True],
+        [5, 5, (0, 0), (1, 1), (1, 1), 20, (20, 20, 20), (False, False, False), True],
+        [3, 3, (1, 1), (2, 2), (1, 1), 14, (10, 10, 14), (False, True, True), True],
+    ]
 
-    for kernel_h, kernel_w, pad, stride, dilation, out_channels, input_shapes, composite in trials:
-        groups = 1
-        shape = (1, *input_shapes)
+    for (
+        kernel_h,
+        kernel_w,
+        pad,
+        stride,
+        dilation,
+        out_channels,
+        shape,
+        composite,
+        is_depthwise,
+    ) in trials:
+        shape = (1, *shape)
+        if is_depthwise:
+            groups = shape[3]
+        else:
+            groups = 1
         outputs = []
         inputs = {"a": tvm.nd.array(np.random.uniform(0, 255, shape).astype(dtype))}
 
@@ -463,36 +526,52 @@ def test_qnn_conv2d():
             "output scale": output_sc,
             "output zero point": output_zp,
         }
-        verify(outputs, atol=1, rtol=0, config=config, verify_saturation=True)
+
+        atol = 2 if is_depthwise else 1
+        verify(outputs, atol=atol, rtol=0, config=config, verify_saturation=True)
 
 
 def test_codegen_qnn_conv2d():
     if skip_codegen_test():
         return
 
-    kernel_hs = [1, 2, 3, 5]
-    kernel_ws = [1, 2, 3, 5]
-    pad = [(1, 1), (2, 2), (2, 1)]
-    strides = [(1, 1), (2, 2)]
-    dilation = [(1, 1)]
-    out_channels = [4, 7, 16]
-    input_shapes = [(10, 10, 14), (12, 15, 16), (20, 20, 20)]
-    # composite operator (pad, bias, activation)
-    composite = [
-        (False, False, False),
-        (False, True, False),
-        (False, False, True),
-        (False, True, True),
-        (True, False, False),
-    ]
     dtype = "uint8"
-    trials = generate_trials(
-        [kernel_hs, kernel_ws, pad, strides, dilation, out_channels, input_shapes, composite], 3
-    )
+    trials = [
+        # Normal convolution
+        [2, 2, (1, 1), (1, 1), (1, 1), 4, (10, 10, 14), (False, False, False), False],
+        [2, 1, (2, 2), (1, 1), (1, 1), 7, (12, 15, 16), (False, False, True), False],
+        [3, 3, (2, 1), (1, 1), (1, 1), 4, (10, 10, 14), (False, True, False), False],
+        [3, 3, (1, 1), (1, 1), (1, 1), 16, (12, 15, 16), (False, False, False), False],
+        [5, 5, (1, 1), (2, 2), (1, 1), 4, (10, 10, 14), (True, False, False), False],
+        [1, 3, (1, 1), (1, 1), (1, 1), 7, (20, 20, 20), (False, False, True), False],
+        [2, 2, (2, 2), (1, 1), (1, 1), 4, (20, 20, 20), (False, True, False), False],
+        [5, 5, (1, 1), (2, 2), (1, 1), 4, (10, 10, 14), (True, False, False), False],
+        [3, 3, (2, 1), (1, 1), (1, 1), 7, (20, 20, 20), (False, False, False), False],
+        [3, 3, (1, 1), (2, 2), (1, 1), 16, (10, 10, 14), (False, True, True), False],
+        # Depth-wise convolution
+        [3, 3, (1, 1), (1, 1), (1, 1), 20, (20, 20, 20), (False, False, True), True],
+        [5, 5, (2, 2), (1, 1), (1, 1), 20, (20, 20, 20), (False, True, False), True],
+        [3, 3, (2, 2), (2, 2), (1, 1), 14, (10, 10, 14), (True, False, False), True],
+        [5, 5, (0, 0), (1, 1), (1, 1), 20, (20, 20, 20), (False, False, False), True],
+        [3, 3, (1, 1), (2, 2), (1, 1), 14, (10, 10, 14), (False, True, True), True],
+    ]
 
-    for kernel_h, kernel_w, pad, stride, dilation, out_channels, input_shapes, composite in trials:
-        groups = 1
-        shape = (1, *input_shapes)
+    for (
+        kernel_h,
+        kernel_w,
+        pad,
+        stride,
+        dilation,
+        out_channels,
+        shape,
+        composite,
+        is_depthwise,
+    ) in trials:
+        shape = (1, *shape)
+        if is_depthwise:
+            groups = shape[3]
+        else:
+            groups = 1
         inputs = {"a"}
 
         input_zp = 100
diff --git a/tests/python/contrib/test_arm_compute_lib/test_dense.py b/tests/python/contrib/test_arm_compute_lib/test_dense.py
index 0279aa72eaf7..dba7be67a012 100644
--- a/tests/python/contrib/test_arm_compute_lib/test_dense.py
+++ b/tests/python/contrib/test_arm_compute_lib/test_dense.py
@@ -28,7 +28,6 @@
     build_and_run,
     verify,
     verify_codegen,
-    generate_trials,
 )
 
 
@@ -184,17 +183,19 @@ def test_dense():
     device = Device()
     np.random.seed(0)
 
-    dtype = ["float32"]
-    shape = [
-        (1, (1, 128), (16, 128), 16),
-        (1, (32, 32), (32, 32), 32),
-        (0, (1, 64), (1, 64), 1),
-        (0, (11, 2), (2, 2), 2),
+    dtype = "float32"
+    trials = [
+        [(1, 128), (16, 128), 16, True, 1],
+        [(1, 128), (16, 128), 16, False, 1],
+        [(32, 32), (32, 32), 32, True, 1],
+        [(32, 32), (32, 32), 32, False, 1],
+        [(1, 64), (1, 64), 1, True, 0],
+        [(1, 64), (1, 64), 1, False, 0],
+        [(11, 2), (2, 2), 2, True, 0],
+        [(11, 2), (2, 2), 2, False, 0],
     ]
-    composite = [False, True]
-    trials = generate_trials([dtype, shape, composite], 3)
 
-    for dtype, (acl_partitions, shape, weight_shape, units), composite in trials:
+    for shape, weight_shape, units, composite, acl_partitions in trials:
         outputs = []
         inputs = {"a": tvm.nd.array(np.random.uniform(-128, 127, shape).astype(dtype))}
         func, params = _get_model(
@@ -230,19 +231,26 @@ def test_codegen_dense():
 
     np.random.seed(0)
 
-    dtype = ["float32"]
-    shape = [(1, (1, 128), (16, 128), 16), (1, (32, 32), (32, 32), 32), (0, (1, 64), (1, 64), 1)]
-    composite = [False, True]
-    trials = generate_trials([dtype, shape, composite], 3)
+    dtype = "float32"
+    trials = [
+        [(1, 128), (16, 128), 16, True, 1],
+        [(1, 128), (16, 128), 16, False, 1],
+        [(32, 32), (32, 32), 32, True, 1],
+        [(32, 32), (32, 32), 32, False, 1],
+        [(1, 64), (1, 64), 1, True, 0],
+        [(1, 64), (1, 64), 1, False, 0],
+    ]
 
-    for dtype, (acl_partitions, shape, weight_shape, units), composite in trials:
+    for shape, weight_shape, units, composite, acl_partitions in trials:
         inputs = {"a"}
 
         args = (shape, weight_shape, units, dtype)
 
         func, params = _get_model(*args, var_names=iter(inputs), has_bias=composite)
         exp_codegen = _get_expected_codegen(*args, has_bias=composite)
-        verify_codegen(func, exp_codegen, acl_partitions, 1 - acl_partitions)
+        verify_codegen(
+            func, exp_codegen, acl_partitions, (1 - acl_partitions) * (2 - int(not composite))
+        )
 
 
 def test_qnn_dense():
@@ -254,19 +262,21 @@ def test_qnn_dense():
     device = Device()
     np.random.seed(0)
 
-    dtype = ["uint8"]
-    shape = [
-        (0, (4, 4), (4, 4), 4),
-        (1, (16, 16), (4, 16), 4),
-        (1, (1, 128), (16, 128), 16),
-        (1, (32, 32), (32, 32), 32),
-        (0, (1, 64), (1, 64), 1),
+    dtype = "uint8"
+    trials = [
+        [(4, 4), (4, 4), 4, True, 0],
+        [(4, 4), (4, 4), 4, False, 0],
+        [(16, 16), (4, 16), 4, True, 1],
+        [(16, 16), (4, 16), 4, False, 1],
+        [(1, 128), (16, 128), 16, True, 1],
+        [(1, 128), (16, 128), 16, False, 1],
+        [(32, 32), (32, 32), 32, True, 1],
+        [(32, 32), (32, 32), 32, False, 1],
+        [(1, 64), (1, 64), 1, True, 0],
+        [(1, 64), (1, 64), 1, False, 0],
     ]
 
-    composite = [False, True]
-    trials = generate_trials([dtype, shape, composite], 3)
-
-    for dtype, (acl_partitions, shape, weight_shape, units), composite in trials:
+    for shape, weight_shape, units, composite, acl_partitions in trials:
         outputs = []
         inputs = {"a": tvm.nd.array(np.random.uniform(0, 255, shape).astype(dtype))}
         input_zp = 100
@@ -328,12 +338,17 @@ def test_codegen_qnn_dense():
 
     np.random.seed(0)
 
-    dtype = ["uint8"]
-    shape = [(1, (1, 128), (16, 128), 16), (1, (32, 32), (32, 32), 32), (0, (1, 64), (1, 64), 1)]
-    composite = [False, True]
-    trials = generate_trials([dtype, shape, composite], 3)
+    dtype = "uint8"
+    trials = [
+        [(1, 128), (16, 128), 16, True, 1],
+        [(1, 128), (16, 128), 16, False, 1],
+        [(32, 32), (32, 32), 32, True, 1],
+        [(32, 32), (32, 32), 32, False, 1],
+        [(1, 64), (1, 64), 1, True, 0],
+        [(1, 64), (1, 64), 1, False, 0],
+    ]
 
-    for dtype, (acl_partitions, shape, weight_shape, units), composite in trials:
+    for shape, weight_shape, units, composite, acl_partitions in trials:
         inputs = {"a"}
         args = (shape, weight_shape, units, dtype)
 
@@ -357,7 +372,9 @@ def test_codegen_qnn_dense():
             has_bias=composite,
         )
         exp_codegen = _get_expected_codegen(*args, has_bias=composite)
-        verify_codegen(func, exp_codegen, acl_partitions, 2 - 2 * acl_partitions)
+        verify_codegen(
+            func, exp_codegen, acl_partitions, (1 - acl_partitions) * (3 - int(not composite))
+        )
 
 
 if __name__ == "__main__":
diff --git a/tests/python/contrib/test_arm_compute_lib/test_network.py b/tests/python/contrib/test_arm_compute_lib/test_network.py
index 898446b32ed9..462df143b447 100644
--- a/tests/python/contrib/test_arm_compute_lib/test_network.py
+++ b/tests/python/contrib/test_arm_compute_lib/test_network.py
@@ -123,7 +123,7 @@ def get_model():
         return mod, params, inputs
 
     _build_and_run_network(
-        *get_model(), device=device, tvm_ops=73, acl_partitions=18, atol=0.002, rtol=0.01
+        *get_model(), device=device, tvm_ops=56, acl_partitions=31, atol=0.002, rtol=0.01
     )
 
 
@@ -148,7 +148,7 @@ def get_model():
         return mod, params, inputs
 
     _build_and_run_network(
-        *get_model(), device=device, tvm_ops=42, acl_partitions=17, atol=8, rtol=0
+        *get_model(), device=device, tvm_ops=3, acl_partitions=30, atol=9, rtol=0
     )
 
 

From 72c9a51687c5882c5bd13d718a69892d45b5cc4b Mon Sep 17 00:00:00 2001
From: Tristan Konolige <tristan.konolige@gmail.com>
Date: Mon, 11 Jan 2021 14:00:16 -0800
Subject: [PATCH 045/357] [FIX,TUTORIALS] Import tvm.testing in tutorials that
 use it (#7248)

---
 tutorials/autotvm/tune_conv2d_cuda.py            | 1 +
 tutorials/autotvm/tune_simple_template.py        | 3 ++-
 tutorials/frontend/using_external_lib.py         | 1 +
 tutorials/get_started/relay_quick_start.py       | 1 +
 tutorials/language/extern_op.py                  | 1 +
 tutorials/language/tensorize.py                  | 1 +
 tutorials/optimize/opt_matmul_auto_tensorcore.py | 1 +
 7 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/tutorials/autotvm/tune_conv2d_cuda.py b/tutorials/autotvm/tune_conv2d_cuda.py
index c32049567679..dc8e6e522249 100644
--- a/tutorials/autotvm/tune_conv2d_cuda.py
+++ b/tutorials/autotvm/tune_conv2d_cuda.py
@@ -55,6 +55,7 @@
 import tvm
 from tvm import te, topi, testing
 from tvm.topi.testing import conv2d_nchw_python
+import tvm.testing
 
 from tvm import autotvm
 
diff --git a/tutorials/autotvm/tune_simple_template.py b/tutorials/autotvm/tune_simple_template.py
index d7d43c794cda..bd2dcf3cfd1e 100644
--- a/tutorials/autotvm/tune_simple_template.py
+++ b/tutorials/autotvm/tune_simple_template.py
@@ -59,7 +59,8 @@
 
 import numpy as np
 import tvm
-from tvm import te, testing
+from tvm import te
+import tvm.testing
 
 # the module is called `autotvm`
 from tvm import autotvm
diff --git a/tutorials/frontend/using_external_lib.py b/tutorials/frontend/using_external_lib.py
index a150b683a531..8e7fcd70e3e9 100644
--- a/tutorials/frontend/using_external_lib.py
+++ b/tutorials/frontend/using_external_lib.py
@@ -37,6 +37,7 @@
 from tvm.contrib import graph_runtime as runtime
 from tvm import relay
 from tvm.relay import testing
+import tvm.testing
 
 ######################################################################
 # Create a simple network
diff --git a/tutorials/get_started/relay_quick_start.py b/tutorials/get_started/relay_quick_start.py
index 6da62f5ced4b..444b915ca7c8 100644
--- a/tutorials/get_started/relay_quick_start.py
+++ b/tutorials/get_started/relay_quick_start.py
@@ -44,6 +44,7 @@
 import tvm
 from tvm import te
 from tvm.contrib import graph_runtime
+import tvm.testing
 
 ######################################################################
 # Define Neural Network in Relay
diff --git a/tutorials/language/extern_op.py b/tutorials/language/extern_op.py
index 454237a33783..794101a4fb56 100644
--- a/tutorials/language/extern_op.py
+++ b/tutorials/language/extern_op.py
@@ -35,6 +35,7 @@
 from tvm import te
 import numpy as np
 from tvm.contrib import cblas
+import tvm.testing
 
 if not tvm.get_global_func("tvm.contrib.cblas.matmul", allow_missing=True):
     raise Exception("Not compiled with cblas support; can't build this tutorial")
diff --git a/tutorials/language/tensorize.py b/tutorials/language/tensorize.py
index e91cfe43ab46..a75b78b65ca4 100644
--- a/tutorials/language/tensorize.py
+++ b/tutorials/language/tensorize.py
@@ -36,6 +36,7 @@
 
 import tvm
 from tvm import te
+import tvm.testing
 import numpy as np
 
 ######################################################################
diff --git a/tutorials/optimize/opt_matmul_auto_tensorcore.py b/tutorials/optimize/opt_matmul_auto_tensorcore.py
index d81eca56210e..f5450b9524c6 100644
--- a/tutorials/optimize/opt_matmul_auto_tensorcore.py
+++ b/tutorials/optimize/opt_matmul_auto_tensorcore.py
@@ -50,6 +50,7 @@
 
 from tvm import autotvm
 from tvm.contrib import nvcc
+import tvm.testing
 
 
 def matmul_nn(A, B, L, dtype="float16", layout="NN"):

From 10b79290e936145600e334255104d5ad866cf92c Mon Sep 17 00:00:00 2001
From: Tianming Xu <tianmingxu.tmxu@gmail.com>
Date: Tue, 12 Jan 2021 18:09:48 +0800
Subject: [PATCH 046/357] add default value for leaky relu alpha (#7259)

---
 python/tvm/relay/op/nn/nn.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/tvm/relay/op/nn/nn.py b/python/tvm/relay/op/nn/nn.py
index fef82e7c1fd3..562cee5f53bb 100644
--- a/python/tvm/relay/op/nn/nn.py
+++ b/python/tvm/relay/op/nn/nn.py
@@ -1488,7 +1488,7 @@ def relu(data):
     return _make.relu(data)
 
 
-def leaky_relu(data, alpha):
+def leaky_relu(data, alpha=0.01):
     """This operator takes data as input and does Leaky version
     of a Rectified Linear Unit.
 

From b84eb1650aad18622d572b5141c91e80f6d80a16 Mon Sep 17 00:00:00 2001
From: lixiaoquan <radioheads@163.com>
Date: Tue, 12 Jan 2021 18:10:53 +0800
Subject: [PATCH 047/357] [ONNX] Fix issues for Clip and RoiAlign (#7237)

---
 python/tvm/relay/frontend/onnx.py          |  8 ++++++--
 tests/python/frontend/onnx/test_forward.py | 24 ++++++++++++++++++++--
 2 files changed, 28 insertions(+), 4 deletions(-)

diff --git a/python/tvm/relay/frontend/onnx.py b/python/tvm/relay/frontend/onnx.py
index 4c9996bc855a..45457fd6c58c 100644
--- a/python/tvm/relay/frontend/onnx.py
+++ b/python/tvm/relay/frontend/onnx.py
@@ -2046,7 +2046,7 @@ def _impl_v1(cls, inputs, attr, params):
         x = inputs[0]
         rois = inputs[1]
         batch_indices = inputs[2]
-        mode = attr.get("mode", "avg")
+        mode = attr.get("mode", b"avg")
         if mode != b"avg":
             raise ValueError("RoiAlign in Relay only uses avg mode")
         output_height = attr.get("output_height", 1)
@@ -2056,7 +2056,7 @@ def _impl_v1(cls, inputs, attr, params):
         spatial_scale = attr.get("spatial_scale", 1.0)
 
         batch_indices = _op.expand_dims(batch_indices, axis=1, num_newaxis=1)
-        batch_indices = _op.cast(batch_indices, infer_type(rois).type_annotation.dtype)
+        batch_indices = _op.cast(batch_indices, infer_type(rois).checked_type.dtype)
         rois = _op.concatenate([batch_indices, rois], 1)
 
         return _vision.roi_align(
@@ -2074,6 +2074,10 @@ def convert_attributes(inputs, attr, params):
 
     @classmethod
     def _impl_v1(cls, inputs, attr, params):
+        if "min" not in attr:
+            attr["min"] = -np.inf
+        if "max" not in attr:
+            attr["max"] = np.inf
         return Clip.convert_attributes(inputs, attr, params)
 
     @classmethod
diff --git a/tests/python/frontend/onnx/test_forward.py b/tests/python/frontend/onnx/test_forward.py
index df35a7e9bb56..96be6fba113a 100644
--- a/tests/python/frontend/onnx/test_forward.py
+++ b/tests/python/frontend/onnx/test_forward.py
@@ -840,7 +840,7 @@ def test_slice():
     )
 
 
-def _test_onnx_op_elementwise(inshape, outfunc, npargs, dtype, opname, kwargs):
+def _test_onnx_op_elementwise(inshape, outfunc, npargs, dtype, opname, kwargs, opset=None):
     indata = np.random.uniform(-1, 1, size=inshape).astype(dtype)
     outdata = outfunc(indata, **npargs)
 
@@ -856,7 +856,7 @@ def _test_onnx_op_elementwise(inshape, outfunc, npargs, dtype, opname, kwargs):
     model = helper.make_model(graph, producer_name=opname + "_test")
 
     for target, ctx in tvm.testing.enabled_targets():
-        tvm_out = get_tvm_output(model, indata, target, ctx, outdata.shape, dtype)
+        tvm_out = get_tvm_output(model, indata, target, ctx, outdata.shape, dtype, opset=opset)
         tvm.testing.assert_allclose(outdata, tvm_out)
 
 
@@ -881,6 +881,26 @@ def test_clip():
         {"min": -1.0, "max": 1.0},
     )
 
+    _test_onnx_op_elementwise(
+        (2, 4, 5, 6),
+        np.clip,
+        {"a_min": -np.inf, "a_max": 1.0},
+        "float32",
+        "Clip",
+        {"max": 1.0},
+        opset=1,
+    )
+
+    _test_onnx_op_elementwise(
+        (2, 4, 5, 6),
+        np.clip,
+        {"a_min": -1.0, "a_max": np.inf},
+        "float32",
+        "Clip",
+        {"min": -1.0},
+        opset=1,
+    )
+
 
 @tvm.testing.uses_gpu
 def test_clip_min_max_as_inputs():

From e3b2984ac2e5274a1d3fe1cdcb86d5dffe04066b Mon Sep 17 00:00:00 2001
From: Przemyslaw Tredak <ptredak@nvidia.com>
Date: Tue, 12 Jan 2021 05:46:17 -0800
Subject: [PATCH 048/357] Do not use ICHECK in nnvm (#7255)

---
 nnvm/include/nnvm/graph.h         |  4 ++--
 nnvm/include/nnvm/layout.h        | 40 +++++++++++++++----------------
 nnvm/include/nnvm/op.h            | 12 +++++-----
 nnvm/include/nnvm/tuple.h         |  4 ++--
 nnvm/src/core/graph.cc            | 10 ++++----
 nnvm/src/core/op.cc               |  2 +-
 nnvm/src/core/pass.cc             |  2 +-
 nnvm/src/core/symbolic.cc         | 22 ++++++++---------
 nnvm/src/pass/correct_layout.cc   | 12 +++++-----
 nnvm/src/pass/gradient.cc         | 16 ++++++-------
 nnvm/src/pass/graph_algorithm.h   | 10 ++++----
 nnvm/src/pass/infer_shape_type.cc | 24 +++++++++----------
 nnvm/src/pass/place_device.cc     | 12 +++++-----
 nnvm/src/pass/plan_memory.cc      |  4 ++--
 nnvm/src/pass/print_graph_ir.cc   |  2 +-
 nnvm/src/pass/saveload_json.cc    | 18 +++++++-------
 nnvm/tests/cpp/op_test.cc         |  2 +-
 nnvm/tests/cpp/tuple_test.cc      |  8 +++----
 18 files changed, 102 insertions(+), 102 deletions(-)

diff --git a/nnvm/include/nnvm/graph.h b/nnvm/include/nnvm/graph.h
index 6f624b758fa9..475494e62c4d 100644
--- a/nnvm/include/nnvm/graph.h
+++ b/nnvm/include/nnvm/graph.h
@@ -229,7 +229,7 @@ inline void DFSVisit(const std::vector<NodeEntry>& heads, FVisit fvisit);
 template <typename T>
 inline const T& Graph::GetAttr(const std::string& attr_name) const {
   auto it = attrs.find(attr_name);
-  ICHECK(it != attrs.end()) << "Cannot find attribute " << attr_name << " in the graph";
+  CHECK(it != attrs.end()) << "Cannot find attribute " << attr_name << " in the graph";
   return nnvm::unsafe_get<T>(*it->second);
 }
 
@@ -241,7 +241,7 @@ inline bool Graph::HasAttr(const std::string& attr_name) const {
 template <typename T>
 inline T Graph::MoveCopyAttr(const std::string& attr_name) {
   auto it = attrs.find(attr_name);
-  ICHECK(it != attrs.end()) << "Cannot find attribute " << attr_name << " in the graph";
+  CHECK(it != attrs.end()) << "Cannot find attribute " << attr_name << " in the graph";
   std::shared_ptr<any> sptr = it->second;
   attrs.erase(it);
   if (sptr.unique()) {
diff --git a/nnvm/include/nnvm/layout.h b/nnvm/include/nnvm/layout.h
index 6c46f9de9e0f..e2e99784c99e 100644
--- a/nnvm/include/nnvm/layout.h
+++ b/nnvm/include/nnvm/layout.h
@@ -220,7 +220,7 @@ class Layout {
     for (size_t i = pos; i < pos + len; ++i) {
       if (is_subdim(layout_simplified_[i])) {
         auto block_size = this->subsizeof(layout_simplified_[i]);
-        ICHECK_GT(block_size, 0);
+        CHECK_GT(block_size, 0);
         new_layout << block_size;
       }
       new_layout << layout_simplified_[i];
@@ -235,7 +235,7 @@ class Layout {
     for (int64_t i = this->ndim() - 1; i >= 0; --i) {
       if (is_subdim(layout_simplified_[i])) {
         auto block_size = this->subsizeof(layout_simplified_[i]);
-        ICHECK_GT(block_size, 0);
+        CHECK_GT(block_size, 0);
         new_layout << block_size;
       }
       new_layout << layout_simplified_[i];
@@ -251,13 +251,13 @@ class Layout {
    * \return A newly constructed Layout object.
    */
   inline Layout split(LayoutDim dim, size_t target_pos, uint32_t size) const {
-    ICHECK(target_pos <= this->ndim())
+    CHECK(target_pos <= this->ndim())
         << "Invalid split position " << target_pos << " for layout " << name_;
-    ICHECK(is_superdim(dim)) << "Cannot split a sub-dimension " << dim;
-    ICHECK(this->contains(dim)) << "Axis " << dim << " does not exist in " << name_;
-    ICHECK(!this->contains(to_subdim(dim)))
+    CHECK(is_superdim(dim)) << "Cannot split a sub-dimension " << dim;
+    CHECK(this->contains(dim)) << "Axis " << dim << " does not exist in " << name_;
+    CHECK(!this->contains(to_subdim(dim)))
         << "Dimension " << dim << " has already been split in " << name_;
-    ICHECK(size > 0) << "Invalid split size " << size;
+    CHECK(size > 0) << "Invalid split size " << size;
     std::ostringstream new_layout;
     for (size_t i = 0; i <= this->ndim(); ++i) {
       if (i == target_pos) {
@@ -293,11 +293,11 @@ class Layout {
    * \return the description of the dimension.
    */
   inline std::string at(size_t i) const {
-    ICHECK_LT(i, this->ndim()) << "position " << i << " exceeds ndim=" << this->ndim();
+    CHECK_LT(i, this->ndim()) << "position " << i << " exceeds ndim=" << this->ndim();
     std::ostringstream repr;
     if (is_subdim(layout_simplified_[i])) {
       auto factor = subsizeof(layout_simplified_[i]);
-      ICHECK_GT(factor, 0);
+      CHECK_GT(factor, 0);
       repr << factor;
     }
     repr << layout_simplified_[i];
@@ -328,7 +328,7 @@ class Layout {
    *         Return -1 if \p dim is not in the layout or the layout is undefined.
    */
   inline int64_t subsizeof(LayoutDim dim) const {
-    ICHECK(is_superdim(dim) || is_subdim(dim)) << "Invalid dim " << dim;
+    CHECK(is_superdim(dim) || is_subdim(dim)) << "Invalid dim " << dim;
     if (!this->defined() || !this->contains(to_subdim(dim))) {
       return -1;
     }
@@ -409,34 +409,34 @@ class Layout {
       const LayoutDim c = layout.at(i);
       if (is_superdim(c)) {
         int pos = c - 'A';
-        ICHECK_EQ(factor, 0) << "Invalid layout " << layout << ": invalid factor size " << factor
-                             << " before dimension " << c;
-        ICHECK_EQ(superdim_pos_[pos], -1)
+        CHECK_EQ(factor, 0) << "Invalid layout " << layout << ": invalid factor size " << factor
+                            << " before dimension " << c;
+        CHECK_EQ(superdim_pos_[pos], -1)
             << "Invalid layout " << layout << ": duplicate dimension " << c;
         superdim_pos_[pos] = curr++;
         layout_simplified_.push_back(c);
       } else if (is_subdim(c)) {
         int pos = c - 'a';
-        ICHECK_GT(factor, 0) << "Invalid layout " << layout << ": invalid factor size " << factor
-                             << " for dimension " << c;
-        ICHECK_EQ(subdim_pos_[pos], -1)
+        CHECK_GT(factor, 0) << "Invalid layout " << layout << ": invalid factor size " << factor
+                            << " for dimension " << c;
+        CHECK_EQ(subdim_pos_[pos], -1)
             << "Invalid layout " << layout << ": duplicate dimension " << c;
-        ICHECK_EQ(subdim_size_[pos], -1)
+        CHECK_EQ(subdim_size_[pos], -1)
             << "Invalid layout " << layout << ": duplicate dimension " << c;
         subdim_pos_[pos] = curr++;
         subdim_size_[pos] = factor;
         layout_simplified_.push_back(c);
         factor = 0;
       } else if (c >= '0' && c <= '9') {
-        ICHECK(factor >= 0) << "Invalid layout " << layout << ": _ is adjacent to a number.";
+        CHECK(factor >= 0) << "Invalid layout " << layout << ": _ is adjacent to a number.";
         factor = factor * 10 + c - '0';
       } else {
         LOG(FATAL) << "Invalid layout " << layout;
       }
     }
-    ICHECK(!layout_simplified_.empty()) << "Invalid layout " << layout;
+    CHECK(!layout_simplified_.empty()) << "Invalid layout " << layout;
     for (LayoutDim dim : layout_simplified_) {
-      ICHECK(is_superdim(dim) || superdim_pos_[dim - 'a'] >= 0)
+      CHECK(is_superdim(dim) || superdim_pos_[dim - 'a'] >= 0)
           << "Invalid layout " << layout << ": missing axis " << static_cast<char>(dim - 'a' + 'A');
     }
   }
diff --git a/nnvm/include/nnvm/op.h b/nnvm/include/nnvm/op.h
index be52b08ebe62..f53e0f25ee37 100644
--- a/nnvm/include/nnvm/op.h
+++ b/nnvm/include/nnvm/op.h
@@ -452,7 +452,7 @@ inline const OpMap<ValueType>& Op::GetAttr(const std::string& key) {
 template <typename ValueType>
 inline Op& Op::set_attr(  // NOLINT(*)
     const std::string& attr_name, const ValueType& value, int plevel) {
-  ICHECK_GT(plevel, 0) << "plevel in set_attr must be greater than 0";
+  CHECK_GT(plevel, 0) << "plevel in set_attr must be greater than 0";
   // update the attribute map of the key by creating new empty if needed.
   UpdateAttrMap(attr_name, [this, attr_name, value, plevel](any* pmap) {
     // the callback is in lockscope so is threadsafe.
@@ -461,7 +461,7 @@ inline Op& Op::set_attr(  // NOLINT(*)
       pm.attr_name_ = attr_name;
       *pmap = std::move(pm);
     }
-    ICHECK(pmap->type() == typeid(OpMap<ValueType>))
+    CHECK(pmap->type() == typeid(OpMap<ValueType>))
         << "Attribute " << attr_name << " of operator " << this->name
         << " is registered as inconsistent types"
         << " previously " << pmap->type().name() << " current " << typeid(OpMap<ValueType>).name();
@@ -471,8 +471,8 @@ inline Op& Op::set_attr(  // NOLINT(*)
       vec.resize(index_ + 1, std::make_pair(ValueType(), 0));
     }
     std::pair<ValueType, int>& p = vec[index_];
-    ICHECK(p.second != plevel) << "Attribute " << attr_name << " of operator " << this->name
-                               << " is already registered with same plevel=" << plevel;
+    CHECK(p.second != plevel) << "Attribute " << attr_name << " of operator " << this->name
+                              << " is already registered with same plevel=" << plevel;
     if (p.second < plevel) {
       vec[index_] = std::make_pair(value, plevel);
     }
@@ -547,9 +547,9 @@ inline bool OpMap<ValueType>::contains(const Op* op) const {
 
 template <typename ValueType>
 inline const ValueType& OpMap<ValueType>::operator[](const Op* op) const {
-  ICHECK(op != nullptr);
+  CHECK(op != nullptr);
   const uint32_t idx = op->index_;
-  ICHECK(idx < data_.size() && data_[idx].second)
+  CHECK(idx < data_.size() && data_[idx].second)
       << "Attribute " << attr_name_ << " has not been registered for Operator " << op->name;
   return data_[idx].first;
 }
diff --git a/nnvm/include/nnvm/tuple.h b/nnvm/include/nnvm/tuple.h
index af800e77dd07..c6d6125aa194 100644
--- a/nnvm/include/nnvm/tuple.h
+++ b/nnvm/include/nnvm/tuple.h
@@ -435,7 +435,7 @@ class TShape : public Tuple<dim_t> {
    */
   template <int dim>
   inline mshadow::Shape<dim> get() const {
-    ICHECK_EQ(dim, static_cast<int>(ndim()))
+    CHECK_EQ(dim, static_cast<int>(ndim()))
         << "dimension do not match target dimension " << dim << " vs " << ndim();
     const dim_t* d = this->data();
     mshadow::Shape<dim> s;
@@ -467,7 +467,7 @@ class TShape : public Tuple<dim_t> {
    * \return the flat 3d shape
    */
   inline mshadow::Shape<3> FlatTo3D(size_t axis_begin, size_t axis_end) const {
-    ICHECK(axis_end >= axis_begin);
+    CHECK(axis_end >= axis_begin);
     mshadow::Shape<3> s;
     if (ndim() == 0) return mshadow::Shape3(0, 0, 0);
     const dim_t* d = this->data();
diff --git a/nnvm/src/core/graph.cc b/nnvm/src/core/graph.cc
index 81dc9bc35992..e5042802906c 100644
--- a/nnvm/src/core/graph.cc
+++ b/nnvm/src/core/graph.cc
@@ -54,7 +54,7 @@ static void SubgraphSanityCheck(const std::vector<std::shared_ptr<Symbol>>& subg
         nnvm::Node* node = n.get();
         // if the node is visited, but on a different level, then check failed
         // if check failed here or before, we stop doing anything, but raise an error
-        ICHECK(!node2level.count(node) || node2level[node] == level)
+        CHECK(!node2level.count(node) || node2level[node] == level)
             << "A subgraph should not depend on the outputs of nodes on higher levels";
         // otherwise, this node belongs to the current level
         node2level[node] = level;
@@ -76,9 +76,9 @@ IndexedGraph::IndexedGraph(const Graph& g) {
   DFSVisit(g.outputs, [this, &inputs_rptr, &control_rptr, &subgraphs](const ObjectPtr& n) {
     const auto& is_ghost = Op::GetAttr<TIsGhost>("TIsGhost");
     if (!n->is_variable() && is_ghost.get(n->op(), false)) return;
-    ICHECK_LT(nodes_.size(), std::numeric_limits<uint32_t>::max());
+    CHECK_LT(nodes_.size(), std::numeric_limits<uint32_t>::max());
     uint32_t nid = static_cast<uint32_t>(nodes_.size());
-    ICHECK(n);
+    CHECK(n);
     for (const auto& subgraph : n->attrs.subgraphs) subgraphs.push_back(subgraph);
     // nodes_
     IndexedGraph::Node new_node;
@@ -96,7 +96,7 @@ IndexedGraph::IndexedGraph(const Graph& g) {
     // input entries
     for (const auto& e : n->inputs) {
       auto it = node2index_.find(e.node.get());
-      ICHECK(it != node2index_.end() && it->first == e.node.get());
+      CHECK(it != node2index_.end() && it->first == e.node.get());
       input_entries_.emplace_back(NodeEntry{it->second, e.index, e.version});
     }
     inputs_rptr.push_back(input_entries_.size());
@@ -104,7 +104,7 @@ IndexedGraph::IndexedGraph(const Graph& g) {
     for (const auto& nptr : n->control_deps) {
       if (!nptr->is_variable() && is_ghost.get(nptr->op(), false)) continue;
       auto it = node2index_.find(nptr.get());
-      ICHECK(it != node2index_.end()) << "control dep not found in graph";
+      CHECK(it != node2index_.end()) << "control dep not found in graph";
       control_deps_.push_back(it->second);
     }
     control_rptr.push_back(control_deps_.size());
diff --git a/nnvm/src/core/op.cc b/nnvm/src/core/op.cc
index 7f5d1999780d..08a11dff9a02 100644
--- a/nnvm/src/core/op.cc
+++ b/nnvm/src/core/op.cc
@@ -70,7 +70,7 @@ Op& Op::add_alias(const std::string& alias) {  // NOLINT(*)
 // find operator by name
 const Op* Op::Get(const std::string& name) {
   const Op* op = dmlc::Registry<Op>::Find(name);
-  ICHECK(op != nullptr) << "Operator " << name << " is not registered";
+  CHECK(op != nullptr) << "Operator " << name << " is not registered";
   return op;
 }
 
diff --git a/nnvm/src/core/pass.cc b/nnvm/src/core/pass.cc
index 9966d3d42300..974cd2b35918 100644
--- a/nnvm/src/core/pass.cc
+++ b/nnvm/src/core/pass.cc
@@ -45,7 +45,7 @@ Graph ApplyPasses(Graph g, const std::vector<std::string>& pass) {
   std::vector<const PassFunctionReg*> fpass;
   for (auto& name : pass) {
     auto* reg = dmlc::Registry<PassFunctionReg>::Find(name);
-    ICHECK(reg != nullptr) << "Cannot find pass " << name << " in the registry";
+    CHECK(reg != nullptr) << "Cannot find pass " << name << " in the registry";
     fpass.push_back(reg);
   }
 
diff --git a/nnvm/src/core/symbolic.cc b/nnvm/src/core/symbolic.cc
index 18d31dd3a937..12b8675d0bd7 100644
--- a/nnvm/src/core/symbolic.cc
+++ b/nnvm/src/core/symbolic.cc
@@ -58,7 +58,7 @@ inline void UpdateNodeVersion(Node* n) {
   if (fmutate_inputs.count(n->op()) != 0) {
     for (uint32_t i : fmutate_inputs[n->op()](n->attrs)) {
       NodeEntry& e = n->inputs[i];
-      ICHECK(e.node->is_variable()) << "Mutation target can only be Variable";
+      CHECK(e.node->is_variable()) << "Mutation target can only be Variable";
       // increase the version of the variable.
       e.version = ++nnvm::get<VariableParam>(e.node->attrs.parsed).version;
     }
@@ -186,7 +186,7 @@ void Symbol::Print(std::ostream& os) const {
 
 Symbol Symbol::operator[](size_t index) const {
   size_t nreturn = outputs.size();
-  ICHECK_LT(index, nreturn) << "Symbol only accept nonnegative index";
+  CHECK_LT(index, nreturn) << "Symbol only accept nonnegative index";
   if (nreturn == 1) {
     return *this;
   } else {
@@ -298,13 +298,13 @@ void Symbol::Compose(const array_view<const Symbol*>& args,
   for (size_t i = 0; i < args.size(); ++i) {
     // If the argument isn't a graph, it should have only one output.
     if (garg_idx.empty() || std::find(garg_idx.begin(), garg_idx.end(), i) == garg_idx.end())
-      ICHECK_EQ(args[i]->outputs.size(), 1U)
+      CHECK_EQ(args[i]->outputs.size(), 1U)
           << "Argument " << i << " is a tuple, single value is required";
   }
   for (const auto& kv : kwargs) {
     if (garg_names.empty() ||
         std::find(garg_names.begin(), garg_names.end(), kv.first) == garg_names.end())
-      ICHECK_EQ(kv.second->outputs.size(), 1U)
+      CHECK_EQ(kv.second->outputs.size(), 1U)
           << "Keyword Argument " << kv.first << " is a tuple, single value is required";
   }
   // assign new name
@@ -325,7 +325,7 @@ void Symbol::Compose(const array_view<const Symbol*>& args,
           sym = arg_vec[idx];
         } else {
           auto it = kwarg_map.find(arg_names[idx]);
-          ICHECK(it != kwarg_map.end());
+          CHECK(it != kwarg_map.end());
           sym = it->second;
           kwarg_map.erase(it);
         }
@@ -346,7 +346,7 @@ void Symbol::Compose(const array_view<const Symbol*>& args,
 
     if (n_req != kVarg) {
       n->inputs.resize(n_req);
-      ICHECK_LE(arg_vec.size(), n_req)
+      CHECK_LE(arg_vec.size(), n_req)
           << "Incorrect number of arguments, requires " << n_req << ", provided " << arg_vec.size();
       for (size_t i = 0; i < arg_vec.size(); ++i) {
         n->inputs[i] = arg_vec[i]->outputs[0];
@@ -378,7 +378,7 @@ void Symbol::Compose(const array_view<const Symbol*>& args,
         }
       }
     } else {
-      ICHECK_EQ(kwarg_map.size(), 0U) << "Variable length function do not accept kwargs";
+      CHECK_EQ(kwarg_map.size(), 0U) << "Variable length function do not accept kwargs";
       n->inputs.reserve(arg_vec.size());
       for (const Symbol* s : arg_vec) {
         n->inputs.push_back(s->outputs[0]);
@@ -396,7 +396,7 @@ void Symbol::Compose(const array_view<const Symbol*>& args,
     }
   } else {
     // general composition
-    ICHECK_EQ(args.size(), 0U) << "General composition only support kwargs for now";
+    CHECK_EQ(args.size(), 0U) << "General composition only support kwargs for now";
     size_t nmatched = 0;
     size_t arg_counter = 0;
     std::unordered_map<Node*, const NodeEntry*> replace_map;
@@ -456,7 +456,7 @@ void Symbol::Compose(const array_view<const Symbol*>& args,
     // update outputs in case the composed variable is part of outputs.
     for (size_t i = 0; i < outputs.size(); ++i) {
       if (outputs[i].node->is_variable()) {
-        ICHECK_EQ(args.size(), 0) << "Variable composition only supports keyword arguments";
+        CHECK_EQ(args.size(), 0) << "Variable composition only supports keyword arguments";
         const auto it = kwargs.find(outputs[i].node->attrs.name);
         if (it != kwargs.end()) outputs[i] = it->second->outputs[0];
       }
@@ -473,7 +473,7 @@ Symbol Symbol::operator()(const array_view<const Symbol*>& args,
 }
 
 void Symbol::AddControlDeps(const Symbol& src) {
-  ICHECK_EQ(outputs.size(), 1U) << "AddControlDeps only works for nongrouped symbol";
+  CHECK_EQ(outputs.size(), 1U) << "AddControlDeps only works for nongrouped symbol";
   Node* n = outputs[0].node.get();
   for (const NodeEntry& sp : src.outputs) {
     n->control_deps.push_back(sp.node);
@@ -517,7 +517,7 @@ Symbol Symbol::GetChildren() const {
 void Symbol::SetAttrs(const std::vector<std::pair<std::string, std::string> >& attrs) {
   Node* node = outputs[0].node.get();
   for (const NodeEntry& e : outputs) {
-    ICHECK(node == e.node.get()) << "Symbol.SetAttrs only works for non-grouped symbol";
+    CHECK(node == e.node.get()) << "Symbol.SetAttrs only works for non-grouped symbol";
   }
   for (const auto& kv : attrs) {
     if (kv.first == "name") {
diff --git a/nnvm/src/pass/correct_layout.cc b/nnvm/src/pass/correct_layout.cc
index 3a8cc16511ff..b9024a56d143 100644
--- a/nnvm/src/pass/correct_layout.cc
+++ b/nnvm/src/pass/correct_layout.cc
@@ -64,7 +64,7 @@ nnvm::Graph CorrectLayout(nnvm::Graph src) {
     if (new_node->is_variable()) {
       // Variable node. No operator. Only one output entry.
       auto input_iter = std::find(idx.input_nodes().cbegin(), idx.input_nodes().cend(), nid);
-      ICHECK(input_iter != idx.input_nodes().cend());
+      CHECK(input_iter != idx.input_nodes().cend());
       int64_t input_id = std::distance(idx.input_nodes().cbegin(), input_iter);
       if (src.HasAttr("layout_inputs")) {
         new_layouts[new_node.get()] = {
@@ -83,11 +83,11 @@ nnvm::Graph CorrectLayout(nnvm::Graph src) {
     for (size_t i = 0; i < num_inputs; ++i) {
       const IndexedGraph::NodeEntry& input_entry = inode.inputs[i];
       const ObjectPtr& new_input_node = mirror_vec[input_entry.node_id];
-      ICHECK(new_input_node != nullptr);
+      CHECK(new_input_node != nullptr);
 
       // fill inputs by previous node (DFS order) inferred layouts.
       const auto& layouts_iter = new_layouts.find(new_input_node.get());
-      ICHECK(layouts_iter != new_layouts.end());
+      CHECK(layouts_iter != new_layouts.end());
       request_ilayouts[i] = layouts_iter->second[input_entry.index];
     }
     // layouts produced by previous node.
@@ -108,10 +108,10 @@ nnvm::Graph CorrectLayout(nnvm::Graph src) {
 
     if (op_correct_layout.count(new_node->op())) {
       const auto& flayout = op_correct_layout[new_node->op()];
-      ICHECK(flayout(new_node->attrs, &request_ilayouts, &last_request_ilayouts, &produce_olayouts))
+      CHECK(flayout(new_node->attrs, &request_ilayouts, &last_request_ilayouts, &produce_olayouts))
           << "Layout infer fail";
-      ICHECK_EQ(request_ilayouts.size(), num_inputs);
-      ICHECK_EQ(produce_olayouts.size(), num_outputs);
+      CHECK_EQ(request_ilayouts.size(), num_inputs);
+      CHECK_EQ(produce_olayouts.size(), num_outputs);
     }
 
     // update new layouts
diff --git a/nnvm/src/pass/gradient.cc b/nnvm/src/pass/gradient.cc
index 902a968b102d..1df3af7ffaaf 100644
--- a/nnvm/src/pass/gradient.cc
+++ b/nnvm/src/pass/gradient.cc
@@ -85,10 +85,10 @@ Graph Gradient(Graph src) {
   using MirrorFun = std::function<int(const Node& node)>;
   using AttrHintFun = std::function<NodeEntry(const NodeEntry& src, const NodeEntry& like)>;
 
-  ICHECK_NE(src.attrs.count("grad_ys"), 0U) << "Gradient require grad_ys to be presented.";
-  ICHECK_NE(src.attrs.count("grad_ys_out_grad"), 0U)
+  CHECK_NE(src.attrs.count("grad_ys"), 0U) << "Gradient require grad_ys to be presented.";
+  CHECK_NE(src.attrs.count("grad_ys_out_grad"), 0U)
       << "Gradient require grad_ys_out_grad to be presented.";
-  ICHECK_NE(src.attrs.count("grad_xs"), 0U) << "Gradient require grad_xs to be presented.";
+  CHECK_NE(src.attrs.count("grad_xs"), 0U) << "Gradient require grad_xs to be presented.";
   const std::vector<NodeEntry>& ys = src.GetAttr<std::vector<NodeEntry> >("grad_ys");
   const std::vector<NodeEntry>& ys_out_grad =
       src.GetAttr<std::vector<NodeEntry> >("grad_ys_out_grad");
@@ -124,7 +124,7 @@ Graph Gradient(Graph src) {
     topo_order.push_back(node);
   });
 
-  ICHECK_EQ(ys.size(), ys_out_grad.size());
+  CHECK_EQ(ys.size(), ys_out_grad.size());
   for (size_t i = 0; i < ys.size(); ++i) {
     NodeEntry ograd = ys_out_grad[i];
     output_grads[ys[i].node.get()][ys[i].index].grads = {ograd};
@@ -132,7 +132,7 @@ Graph Gradient(Graph src) {
 
   // Check that all xs are reachable from ys
   for (size_t i = 0; i < xs.size(); ++i) {
-    ICHECK(output_grads.find(xs[i].node.get()) != output_grads.end())
+    CHECK(output_grads.find(xs[i].node.get()) != output_grads.end())
         << "Cannot differentiate with respect to the " << i + 1 << "-th variable "
         << "because it is unreachable from the outputs.";
   }
@@ -182,7 +182,7 @@ Graph Gradient(Graph src) {
       // Check for FGradient
       if (grad_fun_map.contains(ptr->op())) {
         input_grads = grad_fun_map[ptr->op()](fwd_node, out_agg_grads);
-        ICHECK_EQ((*rit)->inputs.size(), input_grads.size())
+        CHECK_EQ((*rit)->inputs.size(), input_grads.size())
             << "Gradient function not returning enough gradient";
       } else if (CheckGradAllZero(out_agg_grads, zero_ops)) {
         for (size_t i = 0; i < fwd_node->num_inputs(); ++i) {
@@ -206,9 +206,9 @@ Graph Gradient(Graph src) {
         LOG(FATAL) << "Operator " << fwd_node->op()->name << " is non-differentiable "
                    << "because it didn't register FGradient attribute.";
       }
-      for (const auto& nodeEntry : input_grads) ICHECK(nodeEntry.node);
+      for (const auto& nodeEntry : input_grads) CHECK(nodeEntry.node);
       auto git = input_grads.begin();
-      ICHECK((*rit)->inputs.size() <= input_grads.size());
+      CHECK((*rit)->inputs.size() <= input_grads.size());
       for (auto it = (*rit)->inputs.begin(); it != (*rit)->inputs.end(); ++it, ++git) {
         auto& output_grad_entry = output_grads[it->node.get()][it->index];
         // if any of the backward op can do shape inference, the hint is not necessary.
diff --git a/nnvm/src/pass/graph_algorithm.h b/nnvm/src/pass/graph_algorithm.h
index 4620079a0ab2..b305c08bc05f 100644
--- a/nnvm/src/pass/graph_algorithm.h
+++ b/nnvm/src/pass/graph_algorithm.h
@@ -45,7 +45,7 @@ namespace pass {
 inline uint32_t FindBestPath(const IndexedGraph& graph, const std::vector<uint32_t>& node_reward,
                              std::vector<uint32_t>* path) {
   const uint32_t num_nodes = static_cast<uint32_t>(graph.num_nodes());
-  ICHECK_EQ(num_nodes, node_reward.size());
+  CHECK_EQ(num_nodes, node_reward.size());
 
   std::vector<uint32_t> best_reward(node_reward.size(), 0);
   std::vector<uint32_t> next_node(node_reward.size(), num_nodes);
@@ -73,7 +73,7 @@ inline uint32_t FindBestPath(const IndexedGraph& graph, const std::vector<uint32
     path->push_back(nid);
     reward += node_reward[nid];
   }
-  ICHECK_EQ(reward, best_solution);
+  CHECK_EQ(reward, best_solution);
   return best_solution;
 }
 
@@ -90,8 +90,8 @@ inline uint32_t FindBestPath(const IndexedGraph& graph, const std::vector<uint32
  */
 inline uint32_t ColorNodeGroup(const IndexedGraph& graph, std::vector<uint32_t> node_importance,
                                uint32_t max_ncolor, std::vector<uint32_t>* color) {
-  ICHECK_NE(max_ncolor, 0U);
-  ICHECK_EQ(graph.num_nodes(), node_importance.size());
+  CHECK_NE(max_ncolor, 0U);
+  CHECK_EQ(graph.num_nodes(), node_importance.size());
 
   color->clear();
   color->resize(graph.num_nodes(), max_ncolor);
@@ -105,7 +105,7 @@ inline uint32_t ColorNodeGroup(const IndexedGraph& graph, std::vector<uint32_t>
     if (reward == 0) break;
     for (uint32_t nid : path) {
       if (node_importance[nid] != 0) {
-        ICHECK_EQ(color->at(nid), max_ncolor);
+        CHECK_EQ(color->at(nid), max_ncolor);
         color->at(nid) = cindex;
         // make the importance 0 after color is decided.
         node_importance[nid] = 0;
diff --git a/nnvm/src/pass/infer_shape_type.cc b/nnvm/src/pass/infer_shape_type.cc
index 859c5b385c4a..fde1691ee96a 100644
--- a/nnvm/src/pass/infer_shape_type.cc
+++ b/nnvm/src/pass/infer_shape_type.cc
@@ -49,7 +49,7 @@ Graph InferAttr(Graph&& ret, const AttrType empty_val, const char* infer_name,
 
   if (ret.attrs.count(input_name) != 0) {
     const AttrVector& shape_args = ret.GetAttr<AttrVector>(input_name);
-    ICHECK_LE(shape_args.size(), idx.input_nodes().size())
+    CHECK_LE(shape_args.size(), idx.input_nodes().size())
         << "More provided shapes than number of arguments.";
     for (size_t i = 0; i < shape_args.size(); ++i) {
       rshape[idx.entry_id(idx.input_nodes()[i], 0)] = shape_args[i];
@@ -88,22 +88,22 @@ Graph InferAttr(Graph&& ret, const AttrType empty_val, const char* infer_name,
     const uint32_t num_outputs = inode.source->num_outputs();
     if (inode.source->is_variable()) {
       // Variable node. No operator. Only one output entry.
-      ICHECK(inode.source->op() == nullptr);
-      ICHECK_EQ(num_outputs, 1U);
+      CHECK(inode.source->op() == nullptr);
+      CHECK_EQ(num_outputs, 1U);
       const uint32_t out_ent_id = idx.entry_id(nid, 0);
       if (shape_attr_key.length() != 0 && fis_none(rshape[out_ent_id])) {
         auto it = inode.source->attrs.dict.find(shape_attr_key);
         if (it != inode.source->attrs.dict.end()) {
           std::istringstream is(it->second);
-          ICHECK(is >> rshape[out_ent_id]) << "Invalid attribute";
+          CHECK(is >> rshape[out_ent_id]) << "Invalid attribute";
         }
       }
     } else if (is_backward.get(inode.source->op(), false) && inode.control_deps.size()) {
-      ICHECK_GE(inode.control_deps.size(), 1U)
+      CHECK_GE(inode.control_deps.size(), 1U)
           << "BackwardOp need to have control_deps to its forward op";
       const IndexedGraph::Node& fnode = idx[inode.control_deps[0]];
       ObjectPtr fwd_ptr = inode.source->control_deps[0];
-      ICHECK(fwd_ptr->op() != nullptr) << "Forward op cannot be a variable";
+      CHECK(fwd_ptr->op() != nullptr) << "Forward op cannot be a variable";
       // use gradient function to find out the correspondence.
       std::vector<NodeEntry> ograd(fwd_ptr->num_outputs());
       for (size_t i = 0; i < ograd.size(); ++i) {
@@ -119,18 +119,18 @@ Graph InferAttr(Graph&& ret, const AttrType empty_val, const char* infer_name,
           if (fis_none(rshape[eid])) {
             rshape[eid] = rshape[idx.entry_id(fnode.inputs[i])];
           } else if (!fis_none(rshape[idx.entry_id(fnode.inputs[i])])) {
-            ICHECK_EQ(rshape[eid], rshape[idx.entry_id(fnode.inputs[i])])
+            CHECK_EQ(rshape[eid], rshape[idx.entry_id(fnode.inputs[i])])
                 << "Backward shape inconsistent with the forward shape";
           }
           if (igrad_node == nullptr) {
             igrad_node = igrad[i].node.get();
           } else {
-            ICHECK(igrad_node == igrad[i].node.get());
+            CHECK(igrad_node == igrad[i].node.get());
           }
         }
       }
       // out grad entries
-      ICHECK(igrad_node != nullptr)
+      CHECK(igrad_node != nullptr)
           << "Cannot find matching backward op for " << inode.source->attrs.name;
       for (size_t i = 0; i < igrad_node->inputs.size(); ++i) {
         const NodeEntry& e = igrad_node->inputs[i];
@@ -164,9 +164,9 @@ Graph InferAttr(Graph&& ret, const AttrType empty_val, const char* infer_name,
             throw dmlc::Error("Error in operator " + inode.source->attrs.name + ": " + e.what());
           }
         } else {
-          ICHECK(!last_iter) << "Attribute " << infer_name << " is not registered by op "
-                             << inode.source->op()->name
-                             << " we are not able to complete the inference because of this";
+          CHECK(!last_iter) << "Attribute " << infer_name << " is not registered by op "
+                            << inode.source->op()->name
+                            << " we are not able to complete the inference because of this";
         }
       }
       // Save to the result map.
diff --git a/nnvm/src/pass/place_device.cc b/nnvm/src/pass/place_device.cc
index 4a9d93465de8..d45658ae24ab 100644
--- a/nnvm/src/pass/place_device.cc
+++ b/nnvm/src/pass/place_device.cc
@@ -33,11 +33,11 @@ namespace {
 // simply logic to place device according to device_group hint
 // insert copy node when there is
 Graph PlaceDevice(Graph src) {
-  ICHECK(src.attrs.count("device_group_attr_key"))
+  CHECK(src.attrs.count("device_group_attr_key"))
       << "Need graph attribute \"device_group_attr_key\" in PlaceDevice";
-  ICHECK(src.attrs.count("device_assign_map"))
+  CHECK(src.attrs.count("device_assign_map"))
       << "Need graph attribute \"device_assign_map\" in PlaceDevice";
-  ICHECK(src.attrs.count("device_copy_op"))
+  CHECK(src.attrs.count("device_copy_op"))
       << "Need graph attribute \"device_copy_op\" in PlaceDevice";
   std::string device_group_attr_key = src.GetAttr<std::string>("device_group_attr_key");
   const Op* copy_op = Op::Get(src.GetAttr<std::string>("device_copy_op"));
@@ -48,7 +48,7 @@ Graph PlaceDevice(Graph src) {
   // copy on write semanatics
   if (src.attrs.count("device") != 0) {
     device = src.MoveCopyAttr<DeviceVector>("device");
-    ICHECK_EQ(device.size(), idx.num_nodes());
+    CHECK_EQ(device.size(), idx.num_nodes());
   } else {
     device.resize(idx.num_nodes(), -1);
   }
@@ -60,7 +60,7 @@ Graph PlaceDevice(Graph src) {
     if (it != inode.source->attrs.dict.end()) {
       const std::string& device_group = it->second;
       auto dit = device_assign_map.find(device_group);
-      ICHECK(dit != device_assign_map.end())
+      CHECK(dit != device_assign_map.end())
           << "The device assignment not found for group " << device_group;
       device[nid] = dit->second;
     } else {
@@ -139,7 +139,7 @@ Graph PlaceDevice(Graph src) {
       }
     }
     if (inode.source->is_variable()) {
-      ICHECK(!need_mutate) << "consistency check";
+      CHECK(!need_mutate) << "consistency check";
     }
     if (need_mutate) {
       ObjectPtr new_node = Node::Create();
diff --git a/nnvm/src/pass/plan_memory.cc b/nnvm/src/pass/plan_memory.cc
index 42c54e366039..2c36cd2eef5a 100644
--- a/nnvm/src/pass/plan_memory.cc
+++ b/nnvm/src/pass/plan_memory.cc
@@ -112,7 +112,7 @@ class GraphAllocator {
   }
   // release a memory space.
   void Release(StorageID id, uint32_t node_id) {
-    ICHECK_NE(id, kBadStorageID);
+    CHECK_NE(id, kBadStorageID);
     if (id == kExternalStorageID || id == kDynamicStorageID) return;
     StorageEntry* e = data_[id].get();
     e->released_by_node = node_id;
@@ -219,7 +219,7 @@ size_t AllocMemory(const Graph& ret, const IndexedGraph& idx,
       std::vector<bool> identity;
       if (finplace_identity.count(inode.source->op()) != 0) {
         identity = finplace_identity[inode.source->op()](inode.source->attrs);
-        ICHECK_EQ(identity.size(), inplace_pairs.size())
+        CHECK_EQ(identity.size(), inplace_pairs.size())
             << "FInplaceOption and FInplaceIdentity returned vectors of different "
             << "size for operator " << inode.source->op()->name;
       } else {
diff --git a/nnvm/src/pass/print_graph_ir.cc b/nnvm/src/pass/print_graph_ir.cc
index 6604d810f288..4fe92e665961 100644
--- a/nnvm/src/pass/print_graph_ir.cc
+++ b/nnvm/src/pass/print_graph_ir.cc
@@ -41,7 +41,7 @@ AttrPrinter GetVectorPrinter_(const T& vec) {
 
 AttrPrinter GetVectorPrinter(const Graph& graph, const std::string& key) {
   auto it = graph.attrs.find(key);
-  ICHECK(it != graph.attrs.end()) << "Cannot find " << key << " in graph attr";
+  CHECK(it != graph.attrs.end()) << "Cannot find " << key << " in graph attr";
   const any& value = *(it->second);
   if (value.type() == typeid(std::vector<TShape>)) {
     return GetVectorPrinter_(nnvm::get<std::vector<TShape> >(value));
diff --git a/nnvm/src/pass/saveload_json.cc b/nnvm/src/pass/saveload_json.cc
index dbd8ee0f83d4..3916da43618d 100644
--- a/nnvm/src/pass/saveload_json.cc
+++ b/nnvm/src/pass/saveload_json.cc
@@ -72,13 +72,13 @@ struct JSONNode {
     }
     void Load(dmlc::JSONReader* reader) {
       reader->BeginArray();
-      ICHECK(reader->NextArrayItem()) << "invalid json format";
+      CHECK(reader->NextArrayItem()) << "invalid json format";
       reader->Read(&node_id);
-      ICHECK(reader->NextArrayItem()) << "invalid json format";
+      CHECK(reader->NextArrayItem()) << "invalid json format";
       reader->Read(&index);
       if (reader->NextArrayItem()) {
         reader->Read(&version);
-        ICHECK(!reader->NextArrayItem()) << "invalid json format";
+        CHECK(!reader->NextArrayItem()) << "invalid json format";
       } else {
         version = 0;
       }
@@ -226,12 +226,12 @@ std::shared_ptr<Symbol> JSONGraph2Symbol(const JSONGraph& jgraph, bool no_parse)
   for (const JSONNode& n : jgraph.nodes) {
     n.node->inputs.reserve(n.inputs.size());
     for (const JSONNode::Entry& e : n.inputs) {
-      ICHECK(e.node_id < jgraph.nodes.size());
+      CHECK(e.node_id < jgraph.nodes.size());
       n.node->inputs.emplace_back(NodeEntry{jgraph.nodes[e.node_id].node, e.index, e.version});
     }
     n.node->control_deps.reserve(n.control_deps.size());
     for (uint32_t nid : n.control_deps) {
-      ICHECK(nid < jgraph.nodes.size());
+      CHECK(nid < jgraph.nodes.size());
       n.node->control_deps.push_back(jgraph.nodes[nid].node);
     }
     for (const JSONGraph& subgraph : n.subgraphs) {
@@ -252,13 +252,13 @@ std::shared_ptr<Symbol> JSONGraph2Symbol(const JSONGraph& jgraph, bool no_parse)
   }
   // consistency check
   for (uint32_t nid : jgraph.arg_nodes) {
-    ICHECK(nid < jgraph.nodes.size());
-    ICHECK(jgraph.nodes[nid].node->is_variable());
+    CHECK(nid < jgraph.nodes.size());
+    CHECK(jgraph.nodes[nid].node->is_variable());
   }
   std::shared_ptr<Symbol> symbol = std::make_shared<Symbol>();
   symbol->outputs.reserve(jgraph.heads.size());
   for (const JSONNode::Entry& e : jgraph.heads) {
-    ICHECK(e.node_id < jgraph.nodes.size());
+    CHECK(e.node_id < jgraph.nodes.size());
     symbol->outputs.emplace_back(NodeEntry{jgraph.nodes[e.node_id].node, e.index, e.version});
   }
   return symbol;
@@ -266,7 +266,7 @@ std::shared_ptr<Symbol> JSONGraph2Symbol(const JSONGraph& jgraph, bool no_parse)
 
 // Load a graph from JSON file.
 Graph LoadJSON(Graph src) {
-  ICHECK_NE(src.attrs.count("json"), 0U) << "Load JSON require json to be presented.";
+  CHECK_NE(src.attrs.count("json"), 0U) << "Load JSON require json to be presented.";
   const std::string& json_str = nnvm::get<std::string>(*src.attrs.at("json"));
   bool no_parse = false;
   if (src.attrs.count("load_json_no_parse")) {
diff --git a/nnvm/tests/cpp/op_test.cc b/nnvm/tests/cpp/op_test.cc
index 39a998a4eebe..2ebd14688f46 100644
--- a/nnvm/tests/cpp/op_test.cc
+++ b/nnvm/tests/cpp/op_test.cc
@@ -35,7 +35,7 @@ TEST(Op, GetAttr) {
   auto add = Op::Get("add");
   auto nick = Op::GetAttr<std::string>("nick_name");
 
-  ICHECK_EQ(nick[add], "plus");
+  CHECK_EQ(nick[add], "plus");
 }
 
 int main(int argc, char** argv) {
diff --git a/nnvm/tests/cpp/tuple_test.cc b/nnvm/tests/cpp/tuple_test.cc
index e28ecd89f6fa..2c2c307aadce 100644
--- a/nnvm/tests/cpp/tuple_test.cc
+++ b/nnvm/tests/cpp/tuple_test.cc
@@ -28,18 +28,18 @@ TEST(Tuple, Basic) {
   Tuple<int> y{1, 2, 3, 5, 6};
   x = std::move(y);
 
-  ICHECK_EQ(x.ndim(), 5);
+  CHECK_EQ(x.ndim(), 5);
   Tuple<int> z{1, 2, 3, 5, 6};
   std::ostringstream os;
   os << z;
-  ICHECK_EQ(os.str(), "[1,2,3,5,6]");
+  CHECK_EQ(os.str(), "[1,2,3,5,6]");
   std::istringstream is(os.str());
   is >> y;
-  ICHECK_EQ(x, y);
+  CHECK_EQ(x, y);
   Tuple<nnvm::dim_t> ss{1, 2, 3};
   TShape s = ss;
   s = std::move(ss);
-  ICHECK((s == TShape{1, 2, 3}));
+  CHECK((s == TShape{1, 2, 3}));
 }
 
 int main(int argc, char** argv) {

From ac684f919fd5ea13545dcab863d786ac38b113c5 Mon Sep 17 00:00:00 2001
From: Trevor Morris <trevmorr@amazon.com>
Date: Tue, 12 Jan 2021 15:24:06 -0800
Subject: [PATCH 049/357] Fix TRT weight conversion when first dim of weight
 shape is 1 (#7253)

---
 python/tvm/relay/op/contrib/tensorrt.py        |  6 +++++-
 .../contrib/tensorrt/tensorrt_builder.cc       | 18 ++++++++++++------
 tests/python/contrib/test_tensorrt.py          |  2 ++
 3 files changed, 19 insertions(+), 7 deletions(-)

diff --git a/python/tvm/relay/op/contrib/tensorrt.py b/python/tvm/relay/op/contrib/tensorrt.py
index bda71468d9e2..db9684d02ac9 100644
--- a/python/tvm/relay/op/contrib/tensorrt.py
+++ b/python/tvm/relay/op/contrib/tensorrt.py
@@ -140,7 +140,11 @@ def partition_for_tensorrt(
             RemoveDropoutPass(),
             transform.RemoveUnusedFunctions(),
             transform.ConvertLayout(
-                {"nn.conv2d": ["NCHW", "default"], "nn.conv3d": ["NCDHW", "default"]}
+                {
+                    "nn.conv2d": ["NCHW", "default"],
+                    "nn.conv3d": ["NCDHW", "default"],
+                    "nn.conv2d_transpose": ["NCHW", "default"],
+                }
             ),
             transform.FoldConstant(),
             transform.AnnotateTarget("tensorrt"),
diff --git a/src/runtime/contrib/tensorrt/tensorrt_builder.cc b/src/runtime/contrib/tensorrt/tensorrt_builder.cc
index 4060b240cf8e..ee47e67001f3 100644
--- a/src/runtime/contrib/tensorrt/tensorrt_builder.cc
+++ b/src/runtime/contrib/tensorrt/tensorrt_builder.cc
@@ -91,10 +91,6 @@ void TensorRTBuilder::AddInput(int nid, uint32_t entry_id, const JSONGraphNode&
 void TensorRTBuilder::AddConstant(int nid, const DLTensor* data) {
   nvinfer1::Weights weight = GetDLTensorAsWeights(data, kDLCPU);
   std::vector<int> shape(data->shape, data->shape + data->ndim);
-  // Remove batch dim when not in explicit batch mode.
-  if (use_implicit_batch_ && shape.size() > 1 && shape[0] == 1) {
-    shape.erase(shape.begin());
-  }
   node_output_map_[nid] = {TensorRTOpInput(weight, shape)};
 }
 
@@ -212,8 +208,18 @@ nvinfer1::Weights TensorRTBuilder::GetDLTensorAsWeights(const DLTensor* dptr,
 
 nvinfer1::ITensor* TensorRTBuilder::GetInputAsTensor(const TensorRTOpInput& input) {
   if (input.type == kTensor) return input.tensor;
-  auto dims = VectorToTrtDims(input.weight_shape);
-  return network_->addConstant(dims, input.weight)->getOutput(0);
+  auto shape = input.weight_shape;
+  // Remove batch dim when not in explicit batch mode.
+  // Example:
+  // x = Relay dims (1, 32, 224, 224) which becomes TRT Dims (32, 224, 224)
+  // y = Relay dims (1, 32)
+  // z = add(x, y)
+  // y needs to have TRT dims (32,), otherwise broadcasting will result in z having
+  // TRT Dims(1, 32, 224, 224) when it should be (32, 224, 224).
+  if (use_implicit_batch_ && shape.size() > 1 && shape[0] == 1) {
+    shape.erase(shape.begin());
+  }
+  return network_->addConstant(VectorToTrtDims(shape), input.weight)->getOutput(0);
 }
 
 void TensorRTBuilder::CleanUp() {
diff --git a/tests/python/contrib/test_tensorrt.py b/tests/python/contrib/test_tensorrt.py
index 9b62ee2c4087..bd8d92eedb4c 100644
--- a/tests/python/contrib/test_tensorrt.py
+++ b/tests/python/contrib/test_tensorrt.py
@@ -385,6 +385,7 @@ def get_graph(
     run_and_verify_func(
         get_graph((1, 3, 16, 16), (3, 8, 7, 7), 3, [2, 2, 3, 3], [2, 2], [1, 1], 24)
     )
+    run_and_verify_func(get_graph((1, 3, 16, 16), (1, 3, 1, 1), channels=1))
 
 
 def test_conv2d_nhwc():
@@ -456,6 +457,7 @@ def get_graph(x_shape=(1, 16), k_shape=(32, 16)):
         return f, {"x": x_shape, "kernel": k_shape}, ["kernel"]
 
     run_and_verify_func(get_graph())
+    run_and_verify_func(get_graph(k_shape=(1, 16)))
 
 
 def test_bias_add():

From 4364386b92ad513277c9d8ad8c48ee9bfd520db1 Mon Sep 17 00:00:00 2001
From: Luyao Ren <375833274@qq.com>
Date: Wed, 13 Jan 2021 07:46:51 +0800
Subject: [PATCH 050/357] Add op_name in error message for Pool (#7243)

* add op_name in error message for Pool

* fix tiny issue for arguments

* fix tiny issue for LpPool

Co-authored-by: luyaor <luyaor@luyaordeMacBook-Pro.local>
---
 python/tvm/relay/frontend/onnx.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/python/tvm/relay/frontend/onnx.py b/python/tvm/relay/frontend/onnx.py
index 45457fd6c58c..9405bc532702 100644
--- a/python/tvm/relay/frontend/onnx.py
+++ b/python/tvm/relay/frontend/onnx.py
@@ -167,7 +167,7 @@ def get_pad_pair(input1d, kernel1d, stride1d):
     return [pad_before, pad_after]
 
 
-def onnx_default_layout(dims):
+def onnx_default_layout(dims, op_name):
     if dims == 1:
         return "NCW"
     if dims == 2:
@@ -175,11 +175,11 @@ def onnx_default_layout(dims):
     if dims == 3:
         return "NCDHW"
 
-    msg = "Only 1D, 2D and 3D layouts are currently supported"
+    msg = "Only 1D, 2D and 3D layouts are currently supported for operator {}."
     raise tvm.error.OpAttributeInvalid(msg.format(op_name))
 
 
-def onnx_storage_order2layout(storage_order, dims=2):
+def onnx_storage_order2layout(storage_order, dims, op_name):
     """converter of onnx storage order parameter to tvm storage order format"""
     if storage_order not in (0, 1):
         raise tvm.error.OpAttributeInvalid("Mode of storage_order must be either 0 or 1")
@@ -191,7 +191,7 @@ def onnx_storage_order2layout(storage_order, dims=2):
     if dims == 3:
         return "NCDHW" if storage_order == 0 else "NDHWC"
 
-    msg = "Only 1D, 2D and 3D layouts are currently supported"
+    msg = "Only 1D, 2D and 3D layouts are currently supported for operator {}."
     raise tvm.error.OpAttributeInvalid(msg.format(op_name))
 
 
@@ -300,10 +300,10 @@ def _impl_v1(cls, inputs, attr, params):
 
         if "storage_order" in attr:
             attr["layout"] = onnx_storage_order2layout(
-                attr["storage_order"], dims=(len(input_shape) - 2)
+                attr["storage_order"], dims=(len(input_shape) - 2), op_name=cls.name
             )
         else:
-            attr["layout"] = onnx_default_layout(dims=(len(input_shape) - 2))
+            attr["layout"] = onnx_default_layout(dims=(len(input_shape) - 2), op_name=cls.name)
 
         return AttrCvt(
             op_name=dimension_picker(cls.name),
@@ -709,10 +709,10 @@ def _impl_v1(cls, inputs, attr, params):
 
         if "storage_order" in attr:
             attr["layout"] = onnx_storage_order2layout(
-                attr["storage_order"], dims=(len(input_shape) - 2)
+                attr["storage_order"], dims=(len(input_shape) - 2), op_name="LpPool"
             )
         else:
-            attr["layout"] = onnx_default_layout(dims=(len(input_shape) - 2))
+            attr["layout"] = onnx_default_layout(dims=(len(input_shape) - 2), op_name="LpPool")
 
         p = _expr.const(attr["p"], dtype)
         reci_p = _expr.const(1.0 / attr["p"], dtype)

From b5a7de879e67aca80aa25bf9ea9c46315dccb026 Mon Sep 17 00:00:00 2001
From: Andrew Reusch <areusch@octoml.ai>
Date: Tue, 12 Jan 2021 16:04:30 -0800
Subject: [PATCH 051/357] Remove check_correctness in AutoTVM, which is busted
 (#7250)

---
 python/tvm/autotvm/measure/measure_methods.py | 60 ++++---------------
 tests/python/unittest/test_autotvm_measure.py | 28 ---------
 vta/scripts/tune_conv2d.py                    |  2 +-
 vta/scripts/tune_conv2d_transpose.py          |  2 +-
 vta/scripts/tune_dense.py                     |  2 +-
 vta/scripts/tune_group_conv2d.py              |  2 +-
 vta/scripts/tune_resnet.py                    |  2 +-
 .../python/integration/test_benchmark_gemm.py | 23 ++++---
 vta/tutorials/autotvm/tune_relay_vta.py       |  2 +-
 9 files changed, 26 insertions(+), 97 deletions(-)

diff --git a/python/tvm/autotvm/measure/measure_methods.py b/python/tvm/autotvm/measure/measure_methods.py
index 4d6c5daad378..f6f0e429a22a 100644
--- a/python/tvm/autotvm/measure/measure_methods.py
+++ b/python/tvm/autotvm/measure/measure_methods.py
@@ -36,7 +36,6 @@
 import tvm._ffi
 import tvm.ir.transform
 from tvm import nd, rpc as _rpc
-from tvm.target import Target
 from tvm.error import TVMError
 from tvm.driver import build
 from tvm.contrib import nvcc, ndk, tar
@@ -195,10 +194,6 @@ class RPCRunner(Runner):
         will be automatically increased.
     cooldown_interval: float, optional
         The cool down interval between two measurements.
-    check_correctness: bool, optional
-        Whether check correctness after measurement. This will use llvm cpu target to
-        call your template and get the reference output.
-        This can work for TOPI templates, but may not work for your custom template.
     enable_cpu_cache_flush: bool
         Whether to flush cache on CPU between repeated measurements.
         Flushing cache can make the measured latency of one operator closer to
@@ -219,7 +214,6 @@ def __init__(
         repeat=3,
         min_repeat_ms=0,
         cooldown_interval=0.1,
-        check_correctness=False,
         enable_cpu_cache_flush=False,
     ):
         super(RPCRunner, self).__init__(timeout, n_parallel)
@@ -234,10 +228,7 @@ def __init__(
         self.repeat = repeat
         self.min_repeat_ms = min_repeat_ms
 
-        self.ref_input = None
-        self.ref_output = None
         self.enable_cpu_cache_flush = enable_cpu_cache_flush
-        self.check_correctness = check_correctness
         self.cooldown_interval = cooldown_interval
 
         self.executor = LocalExecutor(timeout=timeout * (self.n_parallel + 1))
@@ -255,19 +246,6 @@ def set_task(self, task):
                 "and make sure you have free devices on the queue status."
             )
 
-        if self.check_correctness:
-            # use llvm cpu to generate a reference input/output
-            # this option works for tuning topi, but might not work for you custom op
-            with Target("llvm"):
-                s, arg_bufs = task.instantiate(task.config_space.get(0))
-            self.ref_input = [
-                np.random.uniform(size=get_const_tuple(x.shape)).astype(x.dtype) for x in arg_bufs
-            ]
-            func = build(s, arg_bufs, "llvm")
-            tvm_buf = [nd.array(x) for x in self.ref_input]
-            func(*tvm_buf)
-            self.ref_output = [x.asnumpy() for x in tvm_buf]
-
     def get_build_kwargs(self):
         kwargs = {}
         if (
@@ -312,8 +290,6 @@ def run(self, measure_inputs, build_results):
                     self.min_repeat_ms,
                     self.cooldown_interval,
                     remote_args,
-                    self.ref_input,
-                    self.ref_output,
                     self.enable_cpu_cache_flush,
                 )
                 futures.append(ret)
@@ -357,10 +333,6 @@ class LocalRunner(RPCRunner):
         will be automatically increased.
     cooldown_interval: float, optional
         The cool down interval between two measurements.
-    check_correctness: bool, optional
-        Whether check correctness after measurement. This will use llvm cpu target to
-        call your template and get the reference output.
-        This can work for TOPI templates, but may not work for your custom template.
     enable_cpu_cache_flush: bool
         Whether to flush cache on CPU between repeated measurements.
         Flushing cache can make the measured latency of one operator closer to
@@ -380,7 +352,6 @@ def __init__(
         repeat=3,
         min_repeat_ms=0,
         cooldown_interval=0.1,
-        check_correctness=False,
         enable_cpu_cache_flush=False,
     ):
         super(LocalRunner, self).__init__(
@@ -394,7 +365,6 @@ def __init__(
             repeat=repeat,
             min_repeat_ms=min_repeat_ms,
             cooldown_interval=cooldown_interval,
-            check_correctness=check_correctness,
             enable_cpu_cache_flush=enable_cpu_cache_flush,
         )
         self.tracker = None
@@ -512,8 +482,6 @@ def run_through_rpc(
     min_repeat_ms,
     cooldown_interval,
     remote_args,
-    ref_input=None,
-    ref_output=None,
     enable_cpu_cache_flush=False,
 ):
     """Run a generated library through rpc
@@ -544,10 +512,6 @@ def run_through_rpc(
         The cool down interval between two measurements
     remote_args: Tuple
         The argument for request_remote
-    ref_input: List of np.ndarray
-        The reference input used for checking correctness
-    ref_output: List of np.ndarray
-        The reference output used for checking correctness
     enable_cpu_cache_flush: bool
         Whether to flush cache on CPU between repeated measurements.
         Flushing cache can make the measured latency of one operator closer to
@@ -592,20 +556,16 @@ def run_through_rpc(
             f_preproc=f_prepare,
         )
 
-        # set input
-        if ref_input:
-            args = [nd.array(x, ctx=ctx) for x in ref_input]
-        else:
-            try:
-                random_fill = remote.get_function("tvm.contrib.random.random_fill")
-            except AttributeError:
-                raise AttributeError(
-                    "Please make sure USE_RANDOM is ON in the config.cmake " "on the remote devices"
-                )
-            args = [nd.empty(x[0], dtype=x[1], ctx=ctx) for x in build_result.arg_info]
-            for arg in args:
-                random_fill(arg)
-            ctx.sync()
+        try:
+            random_fill = remote.get_function("tvm.contrib.random.random_fill")
+        except AttributeError:
+            raise AttributeError(
+                "Please make sure USE_RANDOM is ON in the config.cmake " "on the remote devices"
+            )
+        args = [nd.empty(x[0], dtype=x[1], ctx=ctx) for x in build_result.arg_info]
+        for arg in args:
+            random_fill(arg)
+        ctx.sync()
 
         costs = time_f(*args).results
 
diff --git a/tests/python/unittest/test_autotvm_measure.py b/tests/python/unittest/test_autotvm_measure.py
index 1a18d6122bf0..9db9f18fa377 100644
--- a/tests/python/unittest/test_autotvm_measure.py
+++ b/tests/python/unittest/test_autotvm_measure.py
@@ -60,36 +60,8 @@ def test_task_tuner_without_measurement_spawn():
     p.join()
 
 
-def test_check_correctness():
-    task, target = get_sample_task()
-
-    measure_option = autotvm.measure_option(
-        builder=autotvm.LocalBuilder(), runner=autotvm.LocalRunner(check_correctness=True)
-    )
-
-    def _callback_correct(tuner, measure_inputs, measure_results):
-        for _, res in zip(measure_inputs, measure_results):
-            assert res.error_no == 0
-
-    tuner = autotvm.tuner.RandomTuner(task)
-    tuner.tune(n_trial=2, measure_option=measure_option, callbacks=[_callback_correct])
-
-    # a bad template
-    n = 128
-    target = tvm.target.Target("llvm -device=bad_device")
-    task = autotvm.task.create("testing/bad_matmul", args=(n, n, n, "float32"), target=target)
-
-    def _callback_wrong(tuner, measure_inputs, measure_results):
-        for _, res in zip(measure_inputs, measure_results):
-            assert res.error_no == MeasureErrorNo.WRONG_ANSWER
-
-    tuner = autotvm.tuner.RandomTuner(task)
-    tuner.tune(n_trial=2, measure_option=measure_option, callbacks=[_callback_wrong])
-
-
 if __name__ == "__main__":
     logging.basicConfig(level=logging.INFO)
 
     test_task_tuner_without_measurement()
     test_task_tuner_without_measurement_spawn()
-    test_check_correctness()
diff --git a/vta/scripts/tune_conv2d.py b/vta/scripts/tune_conv2d.py
index 2a1331f9f94b..6333ac245a95 100644
--- a/vta/scripts/tune_conv2d.py
+++ b/vta/scripts/tune_conv2d.py
@@ -159,7 +159,7 @@ def conv2d(N, CI, H, W, CO, KH, KW, strides, padding, dilation):
                 port=int(tracker_port),
                 number=5,
                 timeout=60,
-                check_correctness=True,
+                # check_correctness=True, # TODO: re-enable when check_correctness works again.
             ),
         )
 
diff --git a/vta/scripts/tune_conv2d_transpose.py b/vta/scripts/tune_conv2d_transpose.py
index ebfe7eb54e5c..e8721539ec77 100644
--- a/vta/scripts/tune_conv2d_transpose.py
+++ b/vta/scripts/tune_conv2d_transpose.py
@@ -151,7 +151,7 @@ def conv2d_transpose(N, CI, H, W, CO, KH, KW, strides, padding, opadding):
                 port=int(tracker_port),
                 number=5,
                 timeout=60,
-                check_correctness=True,
+                # check_correctness=True, # TODO: re-enable when check_correctness works again.
             ),
         )
 
diff --git a/vta/scripts/tune_dense.py b/vta/scripts/tune_dense.py
index 7e3aec86094b..6d600c4c322f 100644
--- a/vta/scripts/tune_dense.py
+++ b/vta/scripts/tune_dense.py
@@ -116,7 +116,7 @@ def dense(N, CI, CO):
                 port=int(tracket_port),
                 number=5,
                 timeout=60,
-                check_correctness=True,
+                # check_correctness=True, # TODO: re-enable when check_correctness works again.
             ),
         )
 
diff --git a/vta/scripts/tune_group_conv2d.py b/vta/scripts/tune_group_conv2d.py
index bfac4996e6ef..ebb7db88845f 100644
--- a/vta/scripts/tune_group_conv2d.py
+++ b/vta/scripts/tune_group_conv2d.py
@@ -154,7 +154,7 @@ def group_conv2d(N, CI, H, W, CO, KH, KW, strides, padding, dilation, group):
                 port=int(tracker_port),
                 number=5,
                 timeout=60,
-                check_correctness=True,
+                # check_correctness=True, # TODO: re-enable when check_correctness works again.
             ),
         )
 
diff --git a/vta/scripts/tune_resnet.py b/vta/scripts/tune_resnet.py
index 04f430ef8624..a10d1de8c46b 100644
--- a/vta/scripts/tune_resnet.py
+++ b/vta/scripts/tune_resnet.py
@@ -295,7 +295,7 @@ def tune_tasks(
                 min_repeat_ms=150,
                 repeat=opt.measurements,
                 timeout=60,
-                check_correctness=True,
+                # check_correctness=True, # TODO: re-enable when check_correctness works again.
             ),
         ),
     }
diff --git a/vta/tests/python/integration/test_benchmark_gemm.py b/vta/tests/python/integration/test_benchmark_gemm.py
index 3ce2d9c9e4a9..824aed6efa02 100644
--- a/vta/tests/python/integration/test_benchmark_gemm.py
+++ b/vta/tests/python/integration/test_benchmark_gemm.py
@@ -59,7 +59,7 @@ def run_gemm_packed(env, remote, batch_size, channel, block):
         )  # relu
         res = te.compute(res_shape, lambda *i: res_min(*i).astype(env.inp_dtype), name="res")
 
-        def verify(s, check_correctness=True):
+        def verify(s):
             mod = vta.build(s, [data, weight, res], "ext_dev", env.target_host, name="gemm")
             temp = utils.tempdir()
             mod.save(temp.relpath("gemm.o"))
@@ -102,11 +102,9 @@ def verify(s, check_correctness=True):
             res_unpack = res_arr.asnumpy().reshape(
                 batch_size // env.BATCH, channel // env.BLOCK_OUT, env.BATCH, env.BLOCK_OUT
             )
-            if check_correctness:
-                tvm.testing.assert_allclose(res_unpack, res_ref)
             return cost
 
-        def run_schedule(load_inp, load_wgt, gemm, alu, store_out, print_ir, check_correctness):
+        def run_schedule(load_inp, load_wgt, gemm, alu, store_out, print_ir):
             s = te.create_schedule(res.op)
             s[data_buf].set_scope(env.inp_scope)
             s[weight_buf].set_scope(env.wgt_scope)
@@ -156,13 +154,13 @@ def run_schedule(load_inp, load_wgt, gemm, alu, store_out, print_ir, check_corre
 
             if print_ir:
                 print(tvm.lower(s, [data, weight, res], simple_mode=True))
-            return verify(s, check_correctness)
+            return verify(s)
 
         def gemm_normal(print_ir):
             mock = env.mock
             print("----- GEMM GOPS End-to-End Test-------")
 
-            def run_test(header, print_ir, check_correctness):
+            def run_test(header, print_ir):
                 cost = run_schedule(
                     env.dma_copy,
                     env.dma_copy,
@@ -170,14 +168,13 @@ def run_test(header, print_ir, check_correctness):
                     env.alu,
                     env.dma_copy,
                     print_ir,
-                    check_correctness,
                 )
                 gops = (num_ops / cost.mean) / float(10 ** 9)
                 print(header)
                 print("\tTime cost = %g sec/op, %g GOPS" % (cost.mean, gops))
 
             with vta.build_config():
-                run_test("NORMAL", print_ir, True)
+                run_test("NORMAL", print_ir)
 
         def gemm_unittest(print_ir):
             mock = env.mock
@@ -185,7 +182,7 @@ def gemm_unittest(print_ir):
 
             def run_test(header, print_ir):
                 cost = run_schedule(
-                    mock.dma_copy, mock.dma_copy, env.gemm, mock.alu, mock.dma_copy, print_ir, False
+                    mock.dma_copy, mock.dma_copy, env.gemm, mock.alu, mock.dma_copy, print_ir
                 )
                 gops = (num_ops / cost.mean) / float(10 ** 9)
                 print(header)
@@ -200,7 +197,7 @@ def alu_unittest(print_ir):
 
             def run_test(header, print_ir):
                 cost = run_schedule(
-                    mock.dma_copy, mock.dma_copy, mock.gemm, env.alu, mock.dma_copy, print_ir, False
+                    mock.dma_copy, mock.dma_copy, mock.gemm, env.alu, mock.dma_copy, print_ir
                 )
                 gops = (num_ops / cost.mean) / float(10 ** 9)
                 print(header)
@@ -216,7 +213,7 @@ def load_inp_unittest(print_ir):
 
             def run_test(header, print_ir):
                 cost = run_schedule(
-                    env.dma_copy, mock.dma_copy, mock.gemm, mock.alu, mock.dma_copy, print_ir, False
+                    env.dma_copy, mock.dma_copy, mock.gemm, mock.alu, mock.dma_copy, print_ir
                 )
                 gops = (num_ops / cost.mean) / float(10 ** 9)
                 bandwith = (batch_size * channel * env.INP_WIDTH / cost.mean) / float(10 ** 9)
@@ -236,7 +233,7 @@ def load_wgt_unittest(print_ir):
 
             def run_test(header, print_ir):
                 cost = run_schedule(
-                    mock.dma_copy, env.dma_copy, mock.gemm, mock.alu, mock.dma_copy, print_ir, False
+                    mock.dma_copy, env.dma_copy, mock.gemm, mock.alu, mock.dma_copy, print_ir
                 )
                 gops = (num_ops / cost.mean) / float(10 ** 9)
                 bandwith = (channel * channel * env.WGT_WIDTH / cost.mean) / float(10 ** 9)
@@ -256,7 +253,7 @@ def store_out_unittest(print_ir):
 
             def run_test(header, print_ir):
                 cost = run_schedule(
-                    mock.dma_copy, mock.dma_copy, mock.gemm, mock.alu, env.dma_copy, print_ir, False
+                    mock.dma_copy, mock.dma_copy, mock.gemm, mock.alu, env.dma_copy, print_ir
                 )
                 gops = (num_ops / cost.mean) / float(10 ** 9)
                 bandwith = (batch_size * channel * env.OUT_WIDTH / cost.mean) / float(10 ** 9)
diff --git a/vta/tutorials/autotvm/tune_relay_vta.py b/vta/tutorials/autotvm/tune_relay_vta.py
index 273f0af4af03..c5885b65c0f3 100644
--- a/vta/tutorials/autotvm/tune_relay_vta.py
+++ b/vta/tutorials/autotvm/tune_relay_vta.py
@@ -215,7 +215,7 @@ def compile_network(env, target, model, start_pack, stop_pack):
             port=tracker_port,
             number=5,
             timeout=60,
-            check_correctness=True,
+            # check_correctness=True, # TODO: re-enable when check_correctness works again.
         ),
     ),
 }

From 86479badd125125c9109595e9cb4fed3c099e061 Mon Sep 17 00:00:00 2001
From: masahi <masahi129@gmail.com>
Date: Wed, 13 Jan 2021 10:34:24 +0900
Subject: [PATCH 052/357] [Torch] Restore class-aware NMS for detection models
 by graph rewrite (#7154)

* add a pattern to rewrite nms to batched nms

* update object detection test to add rewrite

* updated tutorial

* add doc

* fixed coord_start

* test fixed by setting force_surpress=False

* revert tutorial change

* add some comment to explain the pattern

* update NMS pattern following frontend change
---
 python/tvm/relay/frontend/pytorch.py          |  14 +-
 python/tvm/relay/frontend/pytorch_utils.py    | 153 +++++++++++++++++-
 .../frontend/pytorch/test_object_detection.py |  20 ++-
 3 files changed, 176 insertions(+), 11 deletions(-)

diff --git a/python/tvm/relay/frontend/pytorch.py b/python/tvm/relay/frontend/pytorch.py
index 8e69739544e5..ca05954227f8 100644
--- a/python/tvm/relay/frontend/pytorch.py
+++ b/python/tvm/relay/frontend/pytorch.py
@@ -1866,18 +1866,18 @@ def nms(self, inputs, input_types):
         scores = inputs[1]
         iou_threshold = inputs[2]
 
-        num_boxes = _op.shape_of(scores)
-
         # TVM NMS assumes score > 0
         scores = scores - _op.min(scores) + _op.const(1.0)
+
+        num_boxes = _op.shape_of(scores)
+        # PyTorch NMS doesn't have score_threshold, so no need to run get_valid_count
+        indices = _op.transform.arange(_op.squeeze(num_boxes), dtype="int32")
+        indices = _op.expand_dims(indices, 0, 1)
+
         # Generate data with shape (1, num_anchors, 5)
         scores = AttrCvt(op_name="expand_dims", extras={"axis": -1, "num_newaxis": 1})([scores], {})
         data = _op.concatenate([scores, boxes], -1)
         data = _op.expand_dims(data, 0, 1)
-        # PyTorch NMS doesn't have score_threshold, so no need to run get_valid_count
-        indices = _op.transform.arange(_op.squeeze(num_boxes), dtype="int32")
-        indices = _op.expand_dims(indices, 0, 1)
-        ct = num_boxes
 
         # Perform Non-Maximum Suppression,
         # PyTorch NMS doesn't have parameter top_k and max_output_size
@@ -1885,7 +1885,7 @@ def nms(self, inputs, input_types):
         top_k = max_out_size = -1
         nms_ret = get_relay_op("non_max_suppression")(
             data=data,
-            valid_count=ct,
+            valid_count=num_boxes,
             indices=indices,
             max_output_size=max_out_size,
             iou_threshold=iou_threshold,
diff --git a/python/tvm/relay/frontend/pytorch_utils.py b/python/tvm/relay/frontend/pytorch_utils.py
index d0f0b9b4b019..6fc5a6af4a36 100644
--- a/python/tvm/relay/frontend/pytorch_utils.py
+++ b/python/tvm/relay/frontend/pytorch_utils.py
@@ -14,8 +14,17 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
-# pylint: disable=import-outside-toplevel
+# pylint: disable=import-outside-toplevel, unused-argument, invalid-name
 """ Common utilities used by PyTorch frontend """
+from .. import op
+from ..dataflow_pattern import (
+    is_constant,
+    is_op,
+    rewrite,
+    is_tuple,
+    wildcard,
+    DFPatternCallback,
+)
 
 
 def is_version_greater_than(ver):
@@ -25,3 +34,145 @@ def is_version_greater_than(ver):
     return "".join(re.findall(r"(\d+\.)(\d+\.)(\d)", torch.__version__)[0]) > "".join(
         re.findall(r"(\d+\.)(\d+\.)(\d)", ver)[0]
     )
+
+
+def batched_nms_pattern(boxes, scores, idxs, iou_threshold, num_boxes, indices):
+    """A pattern to detect batched_nms function in torchvision
+
+    The inputs to this function, boxes, scores, idxs, iou_threshold are wildcard
+    patterns which can be used later in the rewriting to extract matched Relay fragments.
+
+    We want to detect the following PyTorch code snippet:
+
+    def batched_nms(boxes, scores, idxs, iou_threshold):
+        max_coordinate = boxes.max()
+        offsets = idxs.to(boxes) * (max_coordinate + torch.tensor(1).to(boxes))
+        boxes_for_nms = boxes + offsets[:, None]
+        keep = nms(boxes_for_nms, scores, iou_threshold)
+        return keep
+
+    Here is how PyTorch frontend lowers above PyTorch code. For simplicity, Relay ops for
+    dealing with dynamic strided_slice are omitted. %num_boxes, %indices are complex
+    expressions, but since we can use the wildcard part for them, we do not need to construct
+    their patterns.
+
+    %2 = expand_dims(%scores, axis=-1);
+    %3 = cast(%idxs, dtype="float32");
+    %4 = max(%boxes);
+    %5 = add(%4, 1f);
+    %6 = multiply(%3, %5);
+    %7 = strided_slice(%6, begin=[0], end=[4507], strides=[1]);
+    %8 = expand_dims(%7, axis=1);
+    %9 = add(%boxes, %8);
+    %10 = (%2, %9);
+    %11 = concatenate(%10, axis=-1);
+    %12 = expand_dims(%11, axis=0);
+    ...
+    ...
+    %17 = vision.non_max_suppression(%12, %num_boxes, %indices, -1, 0.7f, ...);
+
+    """
+    one = is_constant()
+    zero = is_constant()
+
+    # Equivelent PyTorch code from above snippet
+    # offsets = idxs.to(boxes) * (max_coordinate + torch.tensor(1).to(boxes))
+    cast = is_op("cast")(idxs)
+    mx = is_op("max")(boxes)
+    add = is_op("add")(mx, one)
+    mul = is_op("multiply")(cast, add)
+
+    # The following doesn't appear in the above Relay snippet. It is required for dynamic
+    # stride_slice handling
+    cast_like = is_op("cast_like")(zero, is_constant())
+    less = is_op("less")(is_constant(), cast_like)
+    shape_of = is_op("shape_of")(mul)
+    cast_like = is_op("cast_like")(shape_of, is_constant())
+    add = is_op("add")(is_constant(), cast_like)
+    where = is_op("where")(less, add, is_constant())
+    shape_of = is_op("shape_of")(mul)
+    cast = is_op("cast")(shape_of)
+
+    # This corresponds to offsets[:, None], where offsets is the result of multiplication
+    dyn_strided_slice = is_op("dyn.strided_slice")(mul, where, cast, is_constant())
+
+    # Add offsets to the boxes
+    expand_dims = is_op("expand_dims")(dyn_strided_slice)
+    add = is_op("add")(boxes, expand_dims)
+
+    # The rest of patterns correspond to the PyTorch frontend conversion
+    # function for torchvision::nms
+    score_expand_dims = is_op("expand_dims")(scores)
+    tup = is_tuple([score_expand_dims, add])
+    concat = is_op("concatenate")(tup)
+    data = is_op("expand_dims")(concat)
+
+    return is_op("vision.non_max_suppression")(
+        data, num_boxes, indices, is_constant(), iou_threshold
+    )
+
+
+class NMSRewrite(DFPatternCallback):
+    """A callback to rewrite nms and restore batched nms"""
+
+    def __init__(self):
+        super().__init__()
+        # exprs to extract
+        self.boxes = wildcard()
+        self.scores = wildcard()
+        self.idxs = wildcard()
+        self.iou_threshold = wildcard()
+        self.num_boxes = wildcard()
+        self.indices = wildcard()
+
+        self.pattern = batched_nms_pattern(
+            self.boxes,
+            self.scores,
+            self.idxs,
+            self.iou_threshold,
+            self.num_boxes,
+            self.indices,
+        )
+
+    def convert_batched_nms(self, boxes, scores, idxs, iou_thres, num_boxes, indices):
+        """Restore class-aware NMS using extracted class indices"""
+        scores = op.expand_dims(scores, axis=-1, num_newaxis=1)
+        idxs = op.expand_dims(idxs, axis=-1, num_newaxis=1)
+        idxs = op.cast(idxs, "float32")
+        data = op.concatenate([idxs, scores, boxes], -1)
+        data = op.expand_dims(data, 0, 1)
+
+        top_k = max_out_size = -1
+        out = op.vision.non_max_suppression(
+            data=data,
+            valid_count=num_boxes,
+            indices=indices,
+            max_output_size=max_out_size,
+            iou_threshold=iou_thres,
+            force_suppress=False,
+            top_k=top_k,
+            coord_start=2,
+            score_index=1,
+            id_index=0,
+            return_indices=True,
+            invalid_to_bottom=False,
+        )
+        return out.tuple_value
+
+    def callback(self, pre, post, node_map):
+        boxes = node_map[self.boxes][0]
+        scores = node_map[self.scores][0]
+        idxs = node_map[self.idxs][0]
+        iou_thres = node_map[self.iou_threshold][0]
+        num_boxes = node_map[self.num_boxes][0]
+        indices = node_map[self.indices][0]
+        return self.convert_batched_nms(boxes, scores, idxs, iou_thres, num_boxes, indices)
+
+
+def rewrite_nms_to_batched_nms(mod):
+    """Rewrite the input graph to replace non maximum surpression
+    in torchvision that does not take class id into account with the one
+    that avoids IOU tests between different classes.
+    """
+    mod["main"] = rewrite(NMSRewrite(), mod["main"])
+    return mod
diff --git a/tests/python/frontend/pytorch/test_object_detection.py b/tests/python/frontend/pytorch/test_object_detection.py
index e4545ec4ef5e..2c323776f087 100644
--- a/tests/python/frontend/pytorch/test_object_detection.py
+++ b/tests/python/frontend/pytorch/test_object_detection.py
@@ -26,6 +26,7 @@
 import tvm.testing
 from tvm import relay
 from tvm.runtime.vm import VirtualMachine
+from tvm.relay.frontend.pytorch_utils import rewrite_nms_to_batched_nms
 from tvm.contrib.download import download
 
 
@@ -108,15 +109,17 @@ def test_detection_models():
     with torch.no_grad():
         pt_res = scripted_model(data)
 
-    for target in ["llvm", "cuda"]:
+    def compile_and_run_vm(mod, params, data_np, target):
         with tvm.transform.PassContext(opt_level=3):
             vm_exec = relay.vm.compile(mod, target=target, params=params)
 
         ctx = tvm.context(target, 0)
         vm = VirtualMachine(vm_exec, ctx)
-
         vm.set_input("main", **{input_name: data_np})
-        tvm_res = vm.run()
+        return vm.run()
+
+    for target in ["cuda", "llvm"]:
+        tvm_res = compile_and_run_vm(mod, params, data_np, target)
 
         # Bounding boxes
         tvm.testing.assert_allclose(
@@ -132,3 +135,14 @@ def test_detection_models():
         score_threshold = 0.9
         print("Num boxes:", pt_res[0].cpu().numpy().shape[0])
         print("Num valid boxes:", np.sum(pt_res[1].cpu().numpy() >= score_threshold))
+
+    before = mod["main"]
+    mod = rewrite_nms_to_batched_nms(mod)
+    after = mod["main"]
+    assert not tvm.ir.structural_equal(after, before)
+
+    tvm_res_after_rewrite = compile_and_run_vm(mod, params, data_np, "llvm")
+
+    # Results should be equivalent after rewriting
+    for res1, res2 in zip(tvm_res, tvm_res_after_rewrite):
+        tvm.testing.assert_allclose(res1.asnumpy(), res2.asnumpy())

From 1d07f1a0f4e70872c2a52531b6bd8580d64c7538 Mon Sep 17 00:00:00 2001
From: masahi <masahi129@gmail.com>
Date: Wed, 13 Jan 2021 15:42:09 +0900
Subject: [PATCH 053/357] [THRUST] Faster multi dimensional argsort by
 segmented sort (#7195)

* remove sort nms

* add segmented sort by key impl

* bug fix, test pass

* updated fast path condition to work for all dims
---
 python/tvm/topi/cuda/nms.py          |   6 +-
 python/tvm/topi/cuda/sort.py         |  73 +----------------
 src/runtime/contrib/thrust/thrust.cu | 117 ++++++++++++++++-----------
 3 files changed, 72 insertions(+), 124 deletions(-)

diff --git a/python/tvm/topi/cuda/nms.py b/python/tvm/topi/cuda/nms.py
index 8946446f3cdc..a4080e585a45 100644
--- a/python/tvm/topi/cuda/nms.py
+++ b/python/tvm/topi/cuda/nms.py
@@ -819,11 +819,9 @@ def non_max_suppression(
     if (
         target
         and target.kind.name == "cuda"
-        and tvm.get_global_func("tvm.contrib.thrust.sort_nms", allow_missing=True)
+        and tvm.get_global_func("tvm.contrib.thrust.sort", allow_missing=True)
     ):
-        sort_tensor = argsort_thrust(
-            score_tensor, valid_count=None, axis=1, is_ascend=False, dtype=valid_count_dtype
-        )
+        sort_tensor = argsort_thrust(score_tensor, axis=1, is_ascend=False, dtype=valid_count_dtype)
     else:
         sort_tensor = argsort(score_tensor, axis=1, is_ascend=False, dtype=valid_count_dtype)
 
diff --git a/python/tvm/topi/cuda/sort.py b/python/tvm/topi/cuda/sort.py
index 18872a242160..9b6a18a8b06b 100644
--- a/python/tvm/topi/cuda/sort.py
+++ b/python/tvm/topi/cuda/sort.py
@@ -409,68 +409,6 @@ def sort_by_key_ir(
     )
 
 
-def argsort_nms_thrust(data, valid_count, axis=-1, is_ascend=1, dtype="float32"):
-    """Performs sorting along the given axis and returns an array of indicies
-    having same shape as an input array that index data in sorted order.
-
-    Parameters
-    ----------
-    data: tvm.te.Tensor
-        The input array.
-
-    valid_count : tvm.te.Tensor, optional
-        The number of valid elements to be sorted.
-
-    axis : int, optional
-        Axis long which to sort the input tensor.
-
-    is_ascend : boolean, optional
-        Whether to sort in ascending or descending order.
-
-    dtype : string, optional
-        DType of the output indices.
-
-    Returns
-    -------
-    out : tvm.te.Tensor
-        The output of this function.
-    """
-    ndim = len(data.shape)
-    if axis < 0:
-        axis = ndim + axis
-    if axis != ndim - 1:
-        # Prepare for sorting along axis -1.
-        axes = swap(list(range(ndim)), axis)
-        data = transpose(data, axes)
-
-    data_buf = tvm.tir.decl_buffer(data.shape, data.dtype, "data_buf", data_alignment=8)
-    valid_count_buf = tvm.tir.decl_buffer(
-        valid_count.shape, valid_count.dtype, "valid_count_buf", data_alignment=4
-    )
-    out_bufs = [
-        tvm.tir.decl_buffer(data.shape, data.dtype, "value_buf", data_alignment=8),
-        tvm.tir.decl_buffer(data.shape, "int32", "indices_buf", data_alignment=8),
-    ]
-    out = te.extern(
-        [data.shape, data.shape],
-        [data, valid_count],
-        lambda ins, outs: tvm.tir.call_packed(
-            "tvm.contrib.thrust.sort_nms", ins[0], ins[1], outs[0], outs[1], is_ascend
-        ),
-        in_buffers=[data_buf, valid_count_buf],
-        out_buffers=out_bufs,
-        dtype=[data.dtype, "int32"],
-        name="nms_argsort_gpu",
-        tag="nms_argsort_gpu",
-    )
-
-    if axis != ndim - 1:
-        axes = swap(list(range(ndim)), axis)
-        out = [transpose(o, axes) for o in out]
-
-    return out[1]
-
-
 def sort(data, axis=-1, is_ascend=1):
     """Performs sorting along the given axis and returns an array of
     sorted values with the same shape as the input data.
@@ -602,7 +540,7 @@ def argsort(data, axis=-1, is_ascend=1, dtype="float32"):
     return out
 
 
-def argsort_thrust(data, valid_count=None, axis=-1, is_ascend=1, dtype="float32"):
+def argsort_thrust(data, axis=-1, is_ascend=1, dtype="float32"):
     """Performs sorting along the given axis and returns an array of indicies
     having same shape as an input array that index data in sorted order.
 
@@ -611,9 +549,6 @@ def argsort_thrust(data, valid_count=None, axis=-1, is_ascend=1, dtype="float32"
     data: tvm.te.Tensor
         The input array.
 
-    valid_count : tvm.te.Tensor, optional
-        The number of valid elements to be sorted.
-
     axis : int, optional
         Axis long which to sort the input tensor.
 
@@ -628,11 +563,7 @@ def argsort_thrust(data, valid_count=None, axis=-1, is_ascend=1, dtype="float32"
     out : tvm.te.Tensor
         The output of this function.
     """
-    if valid_count is not None:
-        out = argsort_nms_thrust(data, valid_count, axis, is_ascend, dtype)
-    else:
-        out = topk_thrust(data, 0, axis, "indices", is_ascend, dtype)
-    return out
+    return topk_thrust(data, 0, axis, "indices", is_ascend, dtype)
 
 
 def schedule_sort(outs):
diff --git a/src/runtime/contrib/thrust/thrust.cu b/src/runtime/contrib/thrust/thrust.cu
index dddbb043fddc..6a48f1ad876a 100644
--- a/src/runtime/contrib/thrust/thrust.cu
+++ b/src/runtime/contrib/thrust/thrust.cu
@@ -22,7 +22,9 @@
  */
 
 #include <thrust/device_ptr.h>
+#include <thrust/device_vector.h>
 #include <thrust/sort.h>
+#include <thrust/gather.h>
 
 #include <tvm/runtime/registry.h>
 #include <dlpack/dlpack.h>
@@ -41,21 +43,19 @@ void thrust_sort(DLTensor* input,
                  DLTensor* out_values,
                  DLTensor* out_indices,
                  bool is_ascend,
-                 const std::function<int(int)> &get_sort_len) {
+                 int n_values) {
   thrust::device_ptr<DataType> data_ptr(static_cast<DataType *>(input->data));
   thrust::device_ptr<DataType> values_ptr(static_cast<DataType *>(out_values->data));
   thrust::device_ptr<IndicesType> indices_ptr(static_cast<IndicesType *>(out_indices->data));
 
-  int n_values = input->shape[input->ndim - 1];
-  int n_iter = 1;
-  for (int i = 0; i < input->ndim - 1; ++i) {
-    n_iter *= input->shape[i];
+  size_t size = 1;
+  for (int i = 0; i < input->ndim; ++i) {
+    size *= input->shape[i];
   }
+  thrust::copy(data_ptr, data_ptr + size, values_ptr);
 
-  thrust::copy(data_ptr, data_ptr + n_iter * n_values, values_ptr);
-
-  for (int i = 0 ; i < n_iter; ++i) {
-    n_values = get_sort_len(i);
+  if (size == static_cast<size_t>(input->shape[input->ndim - 1])) {
+    // A fast path for single segment case
     thrust::sequence(indices_ptr, indices_ptr + n_values);
     if (is_ascend) {
       thrust::sort_by_key(values_ptr, values_ptr + n_values, indices_ptr);
@@ -63,8 +63,47 @@ void thrust_sort(DLTensor* input,
       thrust::sort_by_key(values_ptr, values_ptr + n_values, indices_ptr,
                           thrust::greater<DataType>());
     }
-    values_ptr += n_values;
-    indices_ptr += n_values;
+  } else {
+    // segmented sort by key
+    // Follow the back-to-back stable_sort_by_key strategy explained below
+    // https://groups.google.com/g/thrust-users/c/BoLsxO6b4FY
+    thrust::device_vector<int64_t> argsort_order(size);
+    thrust::sequence(argsort_order.begin(), argsort_order.end());
+
+    // First, sort values and store the sorted order in argsort_order.
+    if (is_ascend) {
+      thrust::stable_sort_by_key(values_ptr, values_ptr + size, argsort_order.begin());
+    } else {
+      thrust::stable_sort_by_key(values_ptr, values_ptr + size, argsort_order.begin(),
+                                 thrust::greater<DataType>());
+    }
+
+    // The following is to create the indices array 0, 1, 2, 0, 1, 2 ... 0, 1, 2
+    // without materializing it
+    auto counting_iter = thrust::counting_iterator<int64_t>(0);
+    auto linear_index_to_sort_axis_index = [n_values] __host__ __device__(int64_t i) {
+      return i % n_values;
+    }; // NOLINT(*)
+    auto init_indices_iter = thrust::make_transform_iterator(counting_iter,
+                                                             linear_index_to_sort_axis_index);
+
+    // This will reorder indices 0, 1, 2 ... in the sorted order of values_ptr
+    thrust::gather(argsort_order.begin(), argsort_order.end(), init_indices_iter, indices_ptr);
+
+    thrust::device_vector<int> segment_ids(size);
+    auto linear_index_to_segment_id = [n_values] __host__ __device__(int64_t i) {
+      return i / n_values;
+    }; // NOLINT(*)
+    // We also reorder segment indices 0, 0, 0, 1, 1, 1 ... in the order of values_ptr
+    thrust::transform(argsort_order.begin(), argsort_order.end(), segment_ids.begin(),
+                      linear_index_to_segment_id);
+
+    // The second sort key-ed by segment_ids would bring segment_ids back to 0, 0, 0, 1, 1, 1 ...
+    // values_ptr and indices_ptr will also be sorted in the order of segmend_ids above
+    // Since sorting has been done in a stable way, relative orderings of values and indices
+    // in the segment do not change and hence they remain sorted.
+    auto key_val_zip = thrust::make_zip_iterator(thrust::make_tuple(values_ptr, indices_ptr));
+    thrust::stable_sort_by_key(segment_ids.begin(), segment_ids.end(), key_val_zip);
   }
 }
 
@@ -72,54 +111,54 @@ void thrust_sort_common(DLTensor* input,
                         DLTensor* values_out,
                         DLTensor* indices_out,
                         bool is_ascend,
-                        const std::function<int(int)> &get_sort_len,
+                        int sort_len,
                         std::string data_dtype,
                         std::string out_dtype) {
   if (data_dtype == "float32") {
     if (out_dtype == "int32") {
-      thrust_sort<float, int32_t>(input, values_out, indices_out, is_ascend, get_sort_len);
+      thrust_sort<float, int32_t>(input, values_out, indices_out, is_ascend, sort_len);
     } else if (out_dtype == "int64") {
-      thrust_sort<float, int64_t>(input, values_out, indices_out, is_ascend, get_sort_len);
+      thrust_sort<float, int64_t>(input, values_out, indices_out, is_ascend, sort_len);
     } else if (out_dtype == "float32") {
-      thrust_sort<float, float>(input, values_out, indices_out, is_ascend, get_sort_len);
+      thrust_sort<float, float>(input, values_out, indices_out, is_ascend, sort_len);
     } else if (out_dtype == "float64") {
-      thrust_sort<float, double>(input, values_out, indices_out, is_ascend, get_sort_len);
+      thrust_sort<float, double>(input, values_out, indices_out, is_ascend, sort_len);
     } else {
       LOG(FATAL) << "Unsupported output dtype: " << out_dtype;
     }
   } else if (data_dtype == "float64") {
     if (out_dtype == "int32") {
-      thrust_sort<double, int32_t>(input, values_out, indices_out, is_ascend, get_sort_len);
+      thrust_sort<double, int32_t>(input, values_out, indices_out, is_ascend, sort_len);
     } else if (out_dtype == "int64") {
-      thrust_sort<double, int64_t>(input, values_out, indices_out, is_ascend, get_sort_len);
+      thrust_sort<double, int64_t>(input, values_out, indices_out, is_ascend, sort_len);
     } else if (out_dtype == "float32") {
-      thrust_sort<double, float>(input, values_out, indices_out, is_ascend, get_sort_len);
+      thrust_sort<double, float>(input, values_out, indices_out, is_ascend, sort_len);
     } else if (out_dtype == "float64") {
-      thrust_sort<double, double>(input, values_out, indices_out, is_ascend, get_sort_len);
+      thrust_sort<double, double>(input, values_out, indices_out, is_ascend, sort_len);
     } else {
       LOG(FATAL) << "Unsupported output dtype: " << out_dtype;
     }
   } else if (data_dtype == "int32") {
     if (out_dtype == "int32") {
-      thrust_sort<int32_t, int32_t>(input, values_out, indices_out, is_ascend, get_sort_len);
+      thrust_sort<int32_t, int32_t>(input, values_out, indices_out, is_ascend, sort_len);
     } else if (out_dtype == "int64") {
-      thrust_sort<int32_t, int64_t>(input, values_out, indices_out, is_ascend, get_sort_len);
+      thrust_sort<int32_t, int64_t>(input, values_out, indices_out, is_ascend, sort_len);
     } else if (out_dtype == "float32") {
-      thrust_sort<int32_t, float>(input, values_out, indices_out, is_ascend, get_sort_len);
+      thrust_sort<int32_t, float>(input, values_out, indices_out, is_ascend, sort_len);
     } else if (out_dtype == "float64") {
-      thrust_sort<int32_t, double>(input, values_out, indices_out, is_ascend, get_sort_len);
+      thrust_sort<int32_t, double>(input, values_out, indices_out, is_ascend, sort_len);
     } else {
       LOG(FATAL) << "Unsupported output dtype: " << out_dtype;
     }
   }  else if (data_dtype == "int64") {
     if (out_dtype == "int32") {
-      thrust_sort<int64_t, int32_t>(input, values_out, indices_out, is_ascend, get_sort_len);
+      thrust_sort<int64_t, int32_t>(input, values_out, indices_out, is_ascend, sort_len);
     } else if (out_dtype == "int64") {
-      thrust_sort<int64_t, int64_t>(input, values_out, indices_out, is_ascend, get_sort_len);
+      thrust_sort<int64_t, int64_t>(input, values_out, indices_out, is_ascend, sort_len);
     } else if (out_dtype == "float32") {
-      thrust_sort<int64_t, float>(input, values_out, indices_out, is_ascend, get_sort_len);
+      thrust_sort<int64_t, float>(input, values_out, indices_out, is_ascend, sort_len);
     } else if (out_dtype == "float64") {
-      thrust_sort<int64_t, double>(input, values_out, indices_out, is_ascend, get_sort_len);
+      thrust_sort<int64_t, double>(input, values_out, indices_out, is_ascend, sort_len);
     } else {
       LOG(FATAL) << "Unsupported output dtype: " << out_dtype;
     }
@@ -128,25 +167,6 @@ void thrust_sort_common(DLTensor* input,
   }
 }
 
-TVM_REGISTER_GLOBAL("tvm.contrib.thrust.sort_nms")
-.set_body([](TVMArgs args, TVMRetValue* ret) {
-  ICHECK_GE(args.num_args, 5);
-  DLTensor* input = args[0];
-  DLTensor* valid_count = args[1];
-  DLTensor* values_out = args[2];
-  DLTensor* indices_out = args[3];
-  bool is_ascend = args[4];
-
-  auto data_dtype = DLDataType2String(input->dtype);
-  auto out_dtype = DLDataType2String(indices_out->dtype);
-
-  thrust::device_ptr<int> valid_count_ptr(static_cast<int *>(valid_count->data));
-  auto get_sort_len = [&valid_count_ptr](int i) { return valid_count_ptr[i]; };
-  thrust_sort_common(input, values_out, indices_out, is_ascend, get_sort_len,
-                     data_dtype, out_dtype);
-});
-
-
 TVM_REGISTER_GLOBAL("tvm.contrib.thrust.sort")
 .set_body([](TVMArgs args, TVMRetValue* ret) {
   ICHECK_GE(args.num_args, 4);
@@ -159,8 +179,7 @@ TVM_REGISTER_GLOBAL("tvm.contrib.thrust.sort")
   auto out_dtype = DLDataType2String(indices_out->dtype);
 
   int n_values = input->shape[input->ndim - 1];
-  auto get_sort_len = [=](int i) { return n_values; };
-  thrust_sort_common(input, values_out, indices_out, is_ascend, get_sort_len,
+  thrust_sort_common(input, values_out, indices_out, is_ascend, n_values,
                      data_dtype, out_dtype);
 });
 

From 1f2b40fe371d22aaadc27fc1cc77778b59201f0f Mon Sep 17 00:00:00 2001
From: masahi <masahi129@gmail.com>
Date: Wed, 13 Jan 2021 19:18:37 +0900
Subject: [PATCH 054/357] Unpack NMS inputs into bbox, scores and class ids
 (#7257)

commit fe8fda81774c2e1a4d434179f62e3a299e084cb7
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Wed Dec 30 20:31:29 2020 +0900

    fix write by a single thread

commit 0c21e36d58f81adeedec1749aeb04ed4e93a7f36
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Tue Dec 29 04:32:18 2020 +0900

    minor improvement when topk is available

commit 68c686617c818a81f31c6696c99c5dae68405bec
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Tue Dec 29 04:10:24 2020 +0900

    finish concat output

commit 37d7a198010a7bfef85158bbc22b6673e43b2973
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Tue Dec 29 03:59:28 2020 +0900

    fixed topk handling

commit 1913f9764dc5987deb2c6228112c18b98533831c
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Mon Dec 28 21:34:24 2020 +0900

    more refactoring

commit 70c65f099da7cf8a18ffbaadadbd6dc814a804fe
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Mon Dec 28 21:27:15 2020 +0900

    unpack input data

commit 3a273975b1456991fd3f70e055cd5f7c2cdd79fe
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Mon Dec 28 21:22:16 2020 +0900

    slight change to initialization

commit 9b42008b42004f5f05cdaa51e2f6feeadf99abb1
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Mon Dec 28 19:50:36 2020 +0900

    add some comments, remove check the check on negative class id

commit 0aa375d67ad14cae8431958e17d1901dd94d1f6b
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Mon Dec 28 19:39:49 2020 +0900

    leave a TODO on write by only one thread

commit d75ee0a62b8e2fb8912ff226ea8bedb8ed78764d
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Mon Dec 28 19:13:04 2020 +0900

    temp disable write by only thread 0

commit 20b563031adf56f93a7bcfe5b853c477175f4f80
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Sat Dec 26 10:06:43 2020 +0900

    use one block two avoid global sync issue

commit dd1e23068f6fdadc5cb3c3a1872c3fff42f4e2ea
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Sat Dec 26 07:59:19 2020 +0900

    make NMS inner loop parallel
fix write by a single thread
---
 python/tvm/topi/cuda/nms.py | 326 +++++++++++++++++++++++++-----------
 1 file changed, 227 insertions(+), 99 deletions(-)

diff --git a/python/tvm/topi/cuda/nms.py b/python/tvm/topi/cuda/nms.py
index a4080e585a45..6f3ed789ffc1 100644
--- a/python/tvm/topi/cuda/nms.py
+++ b/python/tvm/topi/cuda/nms.py
@@ -21,7 +21,7 @@
 from tvm import te
 
 from tvm.tir import if_then_else
-from .sort import argsort, argsort_thrust
+from .sort import argsort, argsort_thrust, is_thrust_available
 
 
 def cuda_atomic_add_rule(op):
@@ -412,7 +412,9 @@ def nms_ir(
     sorted_index,
     valid_count,
     indices,
-    out,
+    out_bboxes,
+    out_scores,
+    out_class_ids,
     box_indices,
     num_valid_boxes,
     max_output_size,
@@ -444,8 +446,14 @@ def nms_ir(
         dimension are like the output of arange(num_anchors) if get_valid_counts
         is not used before non_max_suppression.
 
-    out : Buffer
-        Output buffer, to be filled with sorted boxes.
+    out_bboxes : Buffer
+        Output buffer, to be filled with sorted box coordinates.
+
+    out_scores : Buffer
+        Output buffer, to be filled with sorted scores.
+
+    out_class_ids : Buffer
+        Output buffer, to be filled with sorted class ids.
 
     box_indices : Buffer
         A indices tensor mapping sorted indices to original indices
@@ -532,9 +540,13 @@ def calculate_overlap(out_tensor, box_a_idx, box_b_idx):
     sorted_index = ib.buffer_ptr(sorted_index)
     valid_count = ib.buffer_ptr(valid_count)
     indices = ib.buffer_ptr(indices)
-    num_valid_boxes = ib.buffer_ptr(num_valid_boxes)
-    out = ib.buffer_ptr(out)
+
+    # outputs
+    out_bboxes = ib.buffer_ptr(out_bboxes)
+    out_scores = ib.buffer_ptr(out_scores)
+    out_class_ids = ib.buffer_ptr(out_class_ids)
     box_indices = ib.buffer_ptr(box_indices)
+    num_valid_boxes = ib.buffer_ptr(num_valid_boxes)
 
     if isinstance(iou_threshold, float):
         iou_threshold = tvm.tir.FloatImm("float32", iou_threshold)
@@ -557,31 +569,53 @@ def calculate_overlap(out_tensor, box_a_idx, box_b_idx):
         ib.scope_attr(tx, "thread_extent", nthread_tx)
         ib.scope_attr(bx, "thread_extent", nthread_bx)
         i = by
-        base_idx = i * num_anchors * box_data_length
+        base_src_idx = i * num_anchors * box_data_length
+        base_bbox_idx = i * num_anchors * 4
+
         with ib.if_scope(tvm.tir.all(iou_threshold > 0, valid_count[i] > 0)):
             # Reorder output
             nkeep = if_then_else(
                 tvm.tir.all(top_k > 0, top_k < valid_count[i]), top_k, valid_count[i]
             )
             j = bx * max_threads + tx
-            with ib.if_scope(j < num_anchors):
-                box_indices[i * num_anchors + j] = -1
             with ib.if_scope(j < nkeep):
-                # Fill in out with sorted boxes
-                with ib.for_range(0, box_data_length) as k:
-                    out[(base_idx + j * box_data_length + k)] = data[
-                        (base_idx + sorted_index[i * num_anchors + j] * box_data_length + k)
-                    ]
+                src_idx = base_src_idx + sorted_index[i * num_anchors + j] * box_data_length
+                with ib.for_range(0, 4, for_type="unroll") as k:
+                    out_bboxes[(base_bbox_idx + j * 4 + k)] = data[src_idx + coord_start + k]
+
+                out_scores[i * num_anchors + j] = data[src_idx + score_index]
+
+                if id_index >= 0:
+                    out_class_ids[i * num_anchors + j] = data[src_idx + id_index]
+
             with ib.else_scope():
                 # Indices > nkeep are discarded
+                # Only needed for return_indices = False case
+                if return_indices is False:
+                    with ib.if_scope(j < num_anchors):
+                        with ib.for_range(0, 4, for_type="unroll") as k:
+                            out_bboxes[(base_bbox_idx + j * 4 + k)] = -1.0
+
+                        out_scores[i, j] = -1.0
+
+                        if id_index >= 0:
+                            out_class_ids[i, j] = -1.0
+
+            if return_indices:
                 with ib.if_scope(j < num_anchors):
-                    with ib.for_range(0, box_data_length) as k:
-                        out[(base_idx + j * box_data_length + k)] = -1.0
+                    box_indices[i * num_anchors + j] = -1
+
         with ib.else_scope():
             with ib.if_scope(j < valid_count[i]):
-                with ib.for_range(0, box_data_length) as k:
-                    offset = base_idx + j * box_data_length + k
-                    out[offset] = data[offset]
+                src_offset = base_src_idx + j * box_data_length
+
+                with ib.for_range(0, 4, for_type="unroll") as k:
+                    out_bboxes[base_bbox_idx + j * 4 + k] = data[src_offset + coord_start + k]
+                out_scores[i * num_anchors + j] = data[src_offset + score_index]
+
+                if id_index >= 0:
+                    out_class_ids[i * num_anchors + j] = data[src_offset + id_index]
+
                 box_indices[i * num_anchors + j] = j
 
     with ib.new_scope():
@@ -595,7 +629,7 @@ def calculate_overlap(out_tensor, box_a_idx, box_b_idx):
 
         i = by
 
-        base_idx = i * num_anchors * box_data_length
+        base_bbox_idx = i * num_anchors * 4
         num_valid_boxes_local = ib.allocate(
             "int32", (1,), name="num_valid_boxes_local", scope="local"
         )
@@ -613,37 +647,36 @@ def nms_inner_loop(ib, j):
 
             num_valid_boxes_local[0] += 1
 
-            offset_j = j * box_data_length
+            offset_j = j * 4
             num_iter_per_thread = ceil_div(nkeep - (j + 1), nthread_tx)
 
             with ib.for_range(0, num_iter_per_thread) as _k:
                 k = j + 1 + _k * nthread_tx + tx
-                offset_k = k * box_data_length
+                offset_k = k * 4
 
                 with ib.if_scope(
                     tvm.tir.all(
                         k < nkeep,
-                        out[base_idx + offset_k + score_index] > 0,  # is the box k still valid?
+                        out_scores[i, k] > 0,  # is the box k still valid?
                         tvm.tir.any(
                             force_suppress > 0,
                             id_index < 0,
-                            out[base_idx + offset_k + id_index]
-                            == out[base_idx + offset_j + id_index],
+                            out_class_ids[i, k] == out_class_ids[i, j],
                         ),
                     )
                 ):
                     iou = calculate_overlap(
-                        out,
-                        base_idx + offset_j + coord_start,
-                        base_idx + offset_k + coord_start,
+                        out_bboxes,
+                        base_bbox_idx + offset_j,
+                        base_bbox_idx + offset_k,
                     )
                     with ib.if_scope(iou >= iou_threshold):
                         # invalidate the box k
-                        out[base_idx + offset_k + score_index] = -1.0
-                        with ib.if_scope(id_index >= 0):
-                            out[base_idx + offset_k + id_index] = -1.0
+                        out_scores[i, k] = -1.0
+
+                        if return_indices is False and id_index >= 0:
+                            out_class_ids[i, k] = -1.0
 
-                # Make sure to do the next loop in a lock step
                 ib.emit(tvm.tir.Call(None, "tir.tvm_storage_sync", tvm.runtime.convert(["shared"])))
 
         if isinstance(max_output_size, int):
@@ -653,9 +686,11 @@ def nms_inner_loop(ib, j):
             # Apply nms
             with ib.for_range(0, nkeep) as j:
                 # Proceed to the inner loop if the box j is still valid
-                with ib.if_scope(out[base_idx + (j * box_data_length) + score_index] > -1.0):
+                with ib.if_scope(out_scores[i, j] > -1.0):
                     with ib.if_scope(max_output_size > 0):
-                        # No need to do more iteration if we already reach max_output_size boxes
+                        # No need to do more iteration if we have already reached max_output_size
+                        # boxes
+                        # TODO(masahi): Add TIR while loop to realize early exit from the outer loop
                         with ib.if_scope(num_valid_boxes_local[0] < max_output_size):
                             nms_inner_loop(ib, j)
                     with ib.else_scope():
@@ -699,6 +734,145 @@ def _fetch_score_ir(data, score, axis):
     return ib.get()
 
 
+def _get_sorted_indices(data, data_buf, score_index, score_shape):
+    """Extract a 1D score tensor from the packed input and do argsort on it."""
+    score_buf = tvm.tir.decl_buffer(score_shape, data.dtype, "score_buf", data_alignment=8)
+    score_tensor = te.extern(
+        [score_shape],
+        [data],
+        lambda ins, outs: _fetch_score_ir(
+            ins[0],
+            outs[0],
+            score_index,
+        ),
+        dtype=[data.dtype],
+        in_buffers=[data_buf],
+        out_buffers=[score_buf],
+        name="fetch_score",
+        tag="fetch_score",
+    )
+
+    if is_thrust_available():
+        sort_tensor = argsort_thrust(score_tensor, axis=1, is_ascend=False, dtype="int32")
+    else:
+        sort_tensor = argsort(score_tensor, axis=1, is_ascend=False, dtype="int32")
+
+    return sort_tensor
+
+
+def _run_nms(
+    data,
+    data_buf,
+    sort_tensor,
+    valid_count,
+    indices,
+    max_output_size,
+    iou_threshold,
+    force_suppress,
+    top_k,
+    coord_start,
+    id_index,
+    score_index,
+    return_indices,
+):
+    """Run NMS using sorted scores."""
+    sort_tensor_buf = tvm.tir.decl_buffer(
+        sort_tensor.shape, sort_tensor.dtype, "sort_tensor_buf", data_alignment=8
+    )
+
+    valid_count_dtype = "int32"
+    valid_count_buf = tvm.tir.decl_buffer(
+        valid_count.shape, valid_count_dtype, "valid_count_buf", data_alignment=4
+    )
+    indices_buf = tvm.tir.decl_buffer(indices.shape, indices.dtype, "indices_buf", data_alignment=8)
+
+    batch_size = data.shape[0]
+    num_anchors = data.shape[1]
+
+    # output shapes
+    bbox_shape = (batch_size, num_anchors, 4)
+    score_shape = (batch_size, num_anchors)
+    class_id_shape = score_shape
+    box_indices_shape = score_shape
+    num_valid_boxes_shape = (batch_size, 1)
+
+    return te.extern(
+        [bbox_shape, score_shape, class_id_shape, box_indices_shape, num_valid_boxes_shape],
+        [data, sort_tensor, valid_count, indices],
+        lambda ins, outs: nms_ir(
+            ins[0],
+            ins[1],
+            ins[2],
+            ins[3],
+            outs[0],  # sorted bbox
+            outs[1],  # sorted scores
+            outs[2],  # sorted class ids
+            outs[3],  # box_indices
+            outs[4],  # num_valid_boxes
+            max_output_size,
+            iou_threshold,
+            force_suppress,
+            top_k,
+            coord_start,
+            id_index,
+            score_index,
+            return_indices,
+        ),
+        dtype=[data.dtype, "float32", "float32", "int32", "int32"],
+        in_buffers=[data_buf, sort_tensor_buf, valid_count_buf, indices_buf],
+        name="nms",
+        tag="nms",
+    )
+
+
+def _concatenate_outputs(
+    out_bboxes, out_scores, out_class_ids, out_shape, coord_start, score_index, id_index
+):
+    """Pack the results from NMS into a single 5D or 6D tensor."""
+    batch_size = out_bboxes.shape[0]
+    num_anchors = out_bboxes.shape[1]
+
+    def ir(out_bboxes, out_scores, out_class_ids, out):
+        ib = tvm.tir.ir_builder.create()
+
+        out_bboxes = ib.buffer_ptr(out_bboxes)
+        out_scores = ib.buffer_ptr(out_scores)
+        out_class_ids = ib.buffer_ptr(out_class_ids)
+        out = ib.buffer_ptr(out)
+
+        with ib.if_scope(num_anchors > 0):
+            max_threads = int(tvm.target.Target.current(allow_none=False).max_num_threads)
+            nthread_tx = max_threads
+            nthread_bx = ceil_div(num_anchors, nthread_tx)
+            tx = te.thread_axis("threadIdx.x")
+            bx = te.thread_axis("blockIdx.x")
+            by = te.thread_axis("blockIdx.y")
+            ib.scope_attr(tx, "thread_extent", nthread_tx)
+            ib.scope_attr(bx, "thread_extent", nthread_bx)
+            ib.scope_attr(by, "thread_extent", batch_size)
+
+            tid = bx * nthread_tx + tx
+            i = by
+
+            with ib.if_scope(tid < num_anchors):
+                with ib.for_range(0, 4, for_type="unroll") as j:
+                    out[i, tid, coord_start + j] = out_bboxes[i, tid, j]
+                out[i, tid, score_index] = out_scores[i, tid]
+                if id_index >= 0:
+                    out[i, tid, id_index] = out_class_ids[i, tid]
+
+        return ib.get()
+
+    return te.extern(
+        [out_shape],
+        [out_bboxes, out_scores, out_class_ids],
+        lambda ins, outs: ir(ins[0], ins[1], ins[2], outs[0]),
+        dtype=["float32"],
+        name="nms_output_concat",
+        tag="nms_output_concat",
+    )
+
+
 def non_max_suppression(
     data,
     valid_count,
@@ -790,75 +964,29 @@ def non_max_suppression(
         tvm_out = tvm.nd.array(np.zeros(dshape, dtype=data.dtype), ctx)
         f(tvm_data, tvm_valid_count, tvm_out)
     """
-    batch_size = data.shape[0]
-    num_anchors = data.shape[1]
-
-    valid_count_dtype = "int32"
-    valid_count_buf = tvm.tir.decl_buffer(
-        valid_count.shape, valid_count_dtype, "valid_count_buf", data_alignment=4
-    )
-    score_axis = score_index
-    score_shape = (batch_size, num_anchors)
-    data_buf = tvm.tir.decl_buffer(data.shape, data.dtype, "data_buf", data_alignment=8)
-    score_buf = tvm.tir.decl_buffer(score_shape, data.dtype, "score_buf", data_alignment=8)
-    score_tensor = te.extern(
-        [score_shape],
-        [data],
-        lambda ins, outs: _fetch_score_ir(
-            ins[0],
-            outs[0],
-            score_axis,
-        ),
-        dtype=[data.dtype],
-        in_buffers=[data_buf],
-        out_buffers=[score_buf],
-        name="fetch_score",
-        tag="fetch_score",
-    )
-    target = tvm.target.Target.current()
-    if (
-        target
-        and target.kind.name == "cuda"
-        and tvm.get_global_func("tvm.contrib.thrust.sort", allow_missing=True)
-    ):
-        sort_tensor = argsort_thrust(score_tensor, axis=1, is_ascend=False, dtype=valid_count_dtype)
-    else:
-        sort_tensor = argsort(score_tensor, axis=1, is_ascend=False, dtype=valid_count_dtype)
-
-    sort_tensor_buf = tvm.tir.decl_buffer(
-        sort_tensor.shape, sort_tensor.dtype, "sort_tensor_buf", data_alignment=8
-    )
-
     data_buf = tvm.tir.decl_buffer(data.shape, data.dtype, "data_buf", data_alignment=8)
-    indices_buf = tvm.tir.decl_buffer(indices.shape, indices.dtype, "indices_buf", data_alignment=8)
 
-    out, box_indices, num_valid_boxes = te.extern(
-        [data.shape, score_shape, [batch_size, 1]],
-        [data, sort_tensor, valid_count, indices],
-        lambda ins, outs: nms_ir(
-            ins[0],
-            ins[1],
-            ins[2],
-            ins[3],
-            outs[0],
-            outs[1],
-            outs[2],
-            max_output_size,
-            iou_threshold,
-            force_suppress,
-            top_k,
-            coord_start,
-            id_index,
-            score_index,
-            return_indices,
-        ),
-        dtype=[data.dtype, "int32", "int32"],
-        in_buffers=[data_buf, sort_tensor_buf, valid_count_buf, indices_buf],
-        name="nms",
-        tag="nms",
+    sort_tensor = _get_sorted_indices(data, data_buf, score_index, (data.shape[0], data.shape[1]))
+
+    out_bboxes, out_scores, out_class_ids, box_indices, num_valid_boxes = _run_nms(
+        data,
+        data_buf,
+        sort_tensor,
+        valid_count,
+        indices,
+        max_output_size,
+        iou_threshold,
+        force_suppress,
+        top_k,
+        coord_start,
+        id_index,
+        score_index,
+        return_indices,
     )
 
     if return_indices:
         return [box_indices, num_valid_boxes]
 
-    return out
+    return _concatenate_outputs(
+        out_bboxes, out_scores, out_class_ids, data.shape, coord_start, score_index, id_index
+    )

From 39ee8d59ce75804e425db0faa53987d0a12f33bb Mon Sep 17 00:00:00 2001
From: Gustavo Romero <gromero@users.noreply.github.com>
Date: Wed, 13 Jan 2021 11:01:56 -0300
Subject: [PATCH 055/357] =?UTF-8?q?[=C2=B5TVM]=20Avoid=20listing=20links?=
 =?UTF-8?q?=20when=20probing=20serial=20ports=20(#7265)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

SerialTransport.open() probes automatically the device name based upon a
grep regex if a device name is not provided. The code expects to find only
a single device. Currently when it probes for the available serial ports it
includes in the list the device names that are also symbolic links.

Since _find_openocd_serial_port() always returns a serial number for a
given serial port (not the device name path) the available device names
are always probed when the openocd flash runner is used.

It's not uncommon that device drivers create symbolic links for certain
kinds of serial devices, specially those that provide a serial port plus
an additional endpoint to program the device attached, like a ST-Link
interface, etc.

As a consequence the current code fails to select the correct device name
when symbolic links exist and the openocd flash runner is used.

That commit changes the probe behavior to avoid listing symbolic links when
probing the device name for the target serial port.

Signed-off-by: Gustavo Romero <gustavo.romero@linaro.org>
---
 python/tvm/micro/transport/serial.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/tvm/micro/transport/serial.py b/python/tvm/micro/transport/serial.py
index 6640bb5a8a0c..b72dee1397b1 100644
--- a/python/tvm/micro/transport/serial.py
+++ b/python/tvm/micro/transport/serial.py
@@ -67,7 +67,7 @@ def open(self):
         if self._port_path is not None:
             port_path = self._port_path
         else:
-            ports = list(serial.tools.list_ports.grep(self._grep, include_links=True))
+            ports = list(serial.tools.list_ports.grep(self._grep))
             if len(ports) != 1:
                 raise SerialPortNotFoundError(
                     f"grep expression should find 1 serial port; found {ports!r}"

From 006b9b53ab97e677933011d8b36a98a5a4ac7723 Mon Sep 17 00:00:00 2001
From: ANSHUMAN TRIPATHY <anshuman.t@huawei.com>
Date: Wed, 13 Jan 2021 19:50:02 +0530
Subject: [PATCH 056/357] [Frontend][TFLite] Densify Op added (#7048)

* [Frontend][TFLite] Densify Op added

* [1] Review comments handled

* TODO added for sparse_to_dense Op usage

* stale comments removed
---
 python/tvm/relay/frontend/tflite.py          | 215 ++++++++++++++++++-
 tests/python/frontend/tflite/test_forward.py |  48 +++++
 2 files changed, 253 insertions(+), 10 deletions(-)

diff --git a/python/tvm/relay/frontend/tflite.py b/python/tvm/relay/frontend/tflite.py
index 7a2aada4b22e..525fb41407d3 100644
--- a/python/tvm/relay/frontend/tflite.py
+++ b/python/tvm/relay/frontend/tflite.py
@@ -65,6 +65,7 @@ def __init__(self, model, subgraph, exp_tab):
         self.builtin_op_code = build_str_map(BuiltinOperator())
         self.activation_fn_type = build_str_map(ActivationFunctionType())
         self.builtin_options = build_str_map(BuiltinOptions())
+        self.prefetched_nodes = {}
 
         # Add more operators
         self.convert_map = {
@@ -80,6 +81,7 @@ def __init__(self, model, subgraph, exp_tab):
             "CONCATENATION": self.convert_concatenation,
             "CONV_2D": self.convert_conv2d,
             "COS": self.convert_cos,
+            "DENSIFY": self.convert_densify,
             "DEPTH_TO_SPACE": self.convert_depth_to_space,
             "DEPTHWISE_CONV_2D": self.convert_depthwise_conv2d,
             "DEQUANTIZE": self.convert_dequantize,
@@ -200,6 +202,10 @@ def convert_op_to_relay(self):
             assert isinstance(op, Operator)
             ret = self.convert_map[op_code_str](op)
 
+            # In case the Op can be prefetched, the output can be optimized out
+            if ret is None:
+                continue
+
             if len(output_tensors) == 1:
                 tensor_idx = output_tensors[0].tensor_idx
                 self.exp_tab.set_expr(get_tensor_name(self.subgraph, tensor_idx), ret)
@@ -338,7 +344,8 @@ def get_tensor_type_as_numpy(self, tensor_wrapper):
                 "Tensor type '{}' currently not supported".format(tensor_wrapper.tensor.Type())
             )
 
-    def get_tensor_value(self, tensor_wrapper):
+    # pylint: disable=no-else-return
+    def get_tensor_value(self, tensor_wrapper, is_sparse=False):
         """Get tensor buffer value from given tensor wrapper"""
         assert isinstance(tensor_wrapper, TensorWrapper)
 
@@ -350,7 +357,10 @@ def get_tensor_value(self, tensor_wrapper):
         else:
             shape = []
 
-        return np.frombuffer(data, dtype=dtype).reshape(shape)
+        if is_sparse:
+            return np.frombuffer(data, dtype=dtype)
+        else:
+            return np.frombuffer(data, dtype=dtype).reshape(shape)
 
     def get_tensor_type_str(self, tensor_type):
         """Get tensor type string representation when given TFLite tensor type"""
@@ -1662,11 +1672,15 @@ def _convert_reduce(self, relay_op, op):
         axis = tuple(axis_value) if len(axis_value.shape) > 0 else tuple((axis_value.item(),))
 
         # Options - keep_dims (bool)
-        assert op.BuiltinOptionsType() == BuiltinOptions.ReducerOptions
-        reduce_options = ReducerOptions()
-        op_options = op.BuiltinOptions()
-        reduce_options.Init(op_options.Bytes, op_options.Pos)
-        keep_dims = reduce_options.KeepDims()
+        # In case Options are not present, set keep_dims to False(default)
+        if op.BuiltinOptionsType():
+            assert op.BuiltinOptionsType() == BuiltinOptions.ReducerOptions
+            reduce_options = ReducerOptions()
+            op_options = op.BuiltinOptions()
+            reduce_options.Init(op_options.Bytes, op_options.Pos)
+            keep_dims = reduce_options.KeepDims()
+        else:
+            keep_dims = False
 
         if input_tensor.qnn_params:
             in_expr = _op.cast(in_expr, "int32")
@@ -2026,7 +2040,11 @@ def convert_conv(self, op, conv_type):
             else:
                 weight_expr = _op.transpose(weight_expr, axes=(1, 2, 3, 0))
         else:
-            weight_value = self.get_tensor_value(weight_tensor)
+            if self.is_prefetched(weight_tensor.tensor_idx):
+                weight_value = self.get_prefetched_node(weight_tensor.tensor_idx)
+            else:
+                weight_value = self.get_tensor_value(weight_tensor)
+
             # TFLite kernel layout:
             # convolution:
             # OC KH KW IC, we require KH KW IC OC (HWIO)
@@ -3196,22 +3214,199 @@ def convert_matrix_diag(self, op):
         out = _op.matrix_set_diag(input_expr, diagonal_expr)
         return out
 
+    def convert_densify(self, op):
+        """Convert TFLite DENSIFY"""
+        input_tensors = self.get_input_tensors(op)
+        assert len(input_tensors) == 1, "input tensors length should be 1"
+
+        output_tensors = self.get_output_tensors(op)
+        assert len(output_tensors) == 1, "output tensors length should be 1"
+        output_tensor = output_tensors[0]
+
+        sparse_weight_tensor = input_tensors[0]
+        sparse_weight_tensor_type_str = self.get_tensor_type_str(sparse_weight_tensor.tensor.Type())
+
+        # NOTE: With current implementation in TFLite, Densify Op does not need to be present
+        # in runtime.
+        # TODO(ANSHUMAN87): we need to use the sparse_indices output
+        # from below function and use that in sparse_to_dense Op.
+        # Once the stack corruption issue is resolved in sparse_to_dense Op.
+        _, dense_weight = prepare_dense_matrix_from_sparse(
+            sparse_weight_tensor.tensor,
+            self.get_tensor_value(sparse_weight_tensor, is_sparse=True),
+            sparse_weight_tensor_type_str,
+        )
+
+        self.set_prefetched_node(output_tensor.tensor_idx, dense_weight)
+
     def get_expr(self, input_tensor_idx):
         return self.exp_tab.get_expr(get_tensor_name(self.subgraph, input_tensor_idx))
 
     def has_expr(self, input_tensor_idx):
         return self.exp_tab.has_expr(get_tensor_name(self.subgraph, input_tensor_idx))
 
-    def get_tensor_expr(self, tensor):
+    def is_prefetched(self, input_tensor_idx):
+        return (
+            self.prefetched_nodes.get(get_tensor_name(self.subgraph, input_tensor_idx)) is not None
+        )
+
+    def set_prefetched_node(self, input_tensor_idx, value):
+        self.prefetched_nodes[get_tensor_name(self.subgraph, input_tensor_idx)] = value
+
+    def get_prefetched_node(self, input_tensor_idx):
+        return self.prefetched_nodes[get_tensor_name(self.subgraph, input_tensor_idx)]
+
+    def get_tensor_expr(self, tensor, is_sparse=False):
         """ Return the Relay expr for tensor. """
         if self.has_expr(tensor.tensor_idx):
             expr = self.get_expr(tensor.tensor_idx)
         else:
             type_str = self.get_tensor_type_str(tensor.tensor.Type())
-            expr = self.exp_tab.new_const(self.get_tensor_value(tensor), dtype=type_str)
+            expr = self.exp_tab.new_const(self.get_tensor_value(tensor, is_sparse), dtype=type_str)
         return expr
 
 
+# pylint: disable=no-else-return
+def prepare_dense_matrix_from_sparse(sparse_tensor, sparse_tensor_value, sparse_tensor_type):
+    """ Prepare sparse indices and dense matrix from TFLite sparse parameters. """
+    # The function is implemented based on TFLite sparse parameter specifications
+    # Please refer
+    # https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/schema/schema.fbs#L89
+    # for details about each parameters
+    sparsity = sparse_tensor.Sparsity()
+    dense_shape = sparse_tensor.ShapeAsNumpy()
+    orig_rank = len(dense_shape)
+
+    # The traversal order of the dimensions defined in the `shape` field of the to be dense tensor.
+    traversal_order = sparsity.TraversalOrderAsNumpy()
+
+    # For an n-dimensional tensor with a k-dimensional block (0 <= k <= n),
+    # stores how a block dimension in (dn, ..., dn+k-1) maps to the original
+    # tensor dimension in (d0, ..., dn). It's stored in the order of (dn, ..., dn+k-1).
+    # If not block-sparse, this field is NULL.
+    block_map = sparsity.BlockMapAsNumpy()
+
+    total_rank = sparsity.TraversalOrderLength()
+    dense_mat = np.full(shape=dense_shape, fill_value=0, dtype=sparse_tensor_type).flatten()
+
+    from enum import Enum
+
+    # NOTE: Here the Vector term is borrowed from TFLite spec.
+    class VectorType(Enum):
+        Empty = 0
+        Int32 = 1
+        Uint16 = 2
+        Uint8 = 3
+
+    def _get_vector_flag(v_type):
+        if VectorType(v_type) == VectorType.Int32:
+            return N.Int32Flags
+        elif VectorType(v_type) == VectorType.Uint16:
+            return N.Uint16Flags
+        elif VectorType(v_type) == VectorType.Uint8:
+            return N.Uint8Flags
+        else:
+            raise tvm.error.OpNotImplemented("The provided type {} is not supported".format(v_type))
+
+    def _get_flattened_index(indices, shape):
+        index = 0
+        sub_elements = 1
+        for i in reversed(range(0, len(dense_shape))):
+            index += indices[i] * sub_elements
+            sub_elements *= shape[i]
+        return index
+
+    # DimensionMetadata per dimension: the metadata needed for
+    #     each dimension to locate the non-zero values in the original dense tensor
+    #     inline with traversal order parameter.
+    #
+    # sp_format has 2 possible values: {DENSE = 0, SPARSE_CSR = 1}
+    # If format = DENSE{0} : DenseSize represents size of that dimension
+    # If format = SPARSE_CSR{1} : array_segments represents how to segment the indices array,
+    #      each segment corresponds to one element in the previous dimension. array_indices
+    #      represents the index of the non-zero elements within this dimension
+    #      (as those in the CSR matrix format, where the first array is row pointers
+    #       and the second array is column indices).
+    sp_format = np.zeros(sparsity.DimMetadataLength())
+    dim_metadata = [None] * (2 * sparsity.DimMetadataLength())
+
+    # Below loop will fetch all meta data per dimension based on format type
+    # Dense or Sparse and will put it in an agnostic array for easy access
+    # while preparing dense buffer or indices.
+    for i in range(sparsity.DimMetadataLength()):
+        sp_format[i] = sparsity.DimMetadata(i).Format()
+        if sp_format[i] == 0:
+            dim_metadata[2 * i] = [sparsity.DimMetadata(i).DenseSize()]
+        else:
+            from flatbuffers import number_types as N
+
+            dim_metadata[2 * i] = (
+                sparsity.DimMetadata(i)
+                .ArraySegments()
+                .GetVectorAsNumpy(
+                    flags=_get_vector_flag(sparsity.DimMetadata(i).ArraySegmentsType()), off=4
+                )
+            )
+            dim_metadata[2 * i + 1] = (
+                sparsity.DimMetadata(i)
+                .ArrayIndices()
+                .GetVectorAsNumpy(
+                    flags=_get_vector_flag(sparsity.DimMetadata(i).ArrayIndicesType()), off=4
+                )
+            )
+
+    block_dim = 0
+    block_size = np.zeros(sparsity.BlockMapLength())
+
+    # Block size parameter if encoded in BSR format
+    for i in range(orig_rank):
+        if block_dim < sparsity.BlockMapLength() and block_map[block_dim] == i:
+            orig_dim = traversal_order[orig_rank + block_dim]
+            block_size[block_dim] = sparsity.DimMetadata(orig_dim).DenseSize()
+            block_dim += 1
+
+    indices_list = []
+
+    # Below function iterates through each applicable indices per dimension
+    # based on format type specified and finaly produce the dense matrix and the NZ indices.
+    def _def_prepare_dense_matrix_from_sparse(indices, level, prev_idx):
+        if level == len(indices):
+            start_pos = 0
+            orig_idx = np.zeros(orig_rank, dtype="int32")
+            while start_pos < orig_rank:
+                orig_idx[traversal_order[start_pos]] = indices[start_pos]
+                start_pos += 1
+            while start_pos < len(indices):
+                block_idx = traversal_order[start_pos] - orig_rank
+                orig_dim = block_map[block_idx]
+                orig_idx[orig_dim] = orig_idx[orig_dim] * block_size[block_idx] + indices[start_pos]
+                start_pos += 1
+            indices_list.append(orig_idx)
+            nonlocal value_idx
+            dense_mat[_get_flattened_index(orig_idx, dense_shape)] = sparse_tensor_value[value_idx]
+            value_idx += 1
+        else:
+            metadata_idx = 2 * level
+            if sp_format[level] == 0:
+                shape_of_level = dim_metadata[metadata_idx][0]
+                for idx in range(shape_of_level):
+                    indices[level] = idx
+                    _def_prepare_dense_matrix_from_sparse(
+                        indices, level + 1, prev_idx * shape_of_level + idx
+                    )
+            else:
+                array_segments = dim_metadata[metadata_idx]
+                array_indices = dim_metadata[metadata_idx + 1]
+                for idx in range(array_segments[prev_idx], array_segments[prev_idx + 1]):
+                    indices[level] = array_indices[idx]
+                    _def_prepare_dense_matrix_from_sparse(indices, level + 1, idx)
+
+    indices = np.zeros(total_rank)
+    value_idx = 0
+    _def_prepare_dense_matrix_from_sparse(indices, 0, 0)
+    return np.array(indices_list, dtype="int32"), dense_mat.reshape(dense_shape)
+
+
 def get_scalar_from_constant(expr):
     """ Returns scalar value from Relay constant scalar. """
     assert (
diff --git a/tests/python/frontend/tflite/test_forward.py b/tests/python/frontend/tflite/test_forward.py
index c8bd0947776f..f3653014be00 100644
--- a/tests/python/frontend/tflite/test_forward.py
+++ b/tests/python/frontend/tflite/test_forward.py
@@ -3691,6 +3691,50 @@ def test_forward_mobilenet_v3():
     )
 
 
+#######################################################################
+# Mobilenet V1 Sparse
+# -----------------
+
+
+def test_forward_sparse_mobilenet_v1():
+    """Test the Sparse version of Mobilenet V1 TF Lite model."""
+    # MobilenetV1
+    tflite_model_file = download_testdata(
+        "https://storage.googleapis.com/fast-convnets/tflite-models/mbv1_140_90_12b4_720.tflite",
+        "mbv1_140_90_12b4_720.tflite",
+    )
+    with open(tflite_model_file, "rb") as f:
+        tflite_model_buf = f.read()
+    data = np.random.uniform(size=(1, 224, 224, 3)).astype("float32")
+    tflite_output = run_tflite_graph(tflite_model_buf, data)
+    tvm_output = run_tvm_graph(tflite_model_buf, data, "float_image_input")
+    tvm.testing.assert_allclose(
+        np.squeeze(tvm_output[0]), np.squeeze(tflite_output[0]), rtol=1e-5, atol=1e-5
+    )
+
+
+#######################################################################
+# Mobilenet V2 Sparse
+# -----------------
+
+
+def test_forward_sparse_mobilenet_v2():
+    """Test the Sparse version of Mobilenet V2 TF Lite model."""
+    # MobilenetV1
+    tflite_model_file = download_testdata(
+        "https://storage.googleapis.com/fast-convnets/tflite-models/mbv2_200_85_11-16b2_744.tflite",
+        "mbv2_200_85_11-16b2_744.tflite",
+    )
+    with open(tflite_model_file, "rb") as f:
+        tflite_model_buf = f.read()
+    data = np.random.uniform(size=(1, 224, 224, 3)).astype("float32")
+    tflite_output = run_tflite_graph(tflite_model_buf, data)
+    tvm_output = run_tvm_graph(tflite_model_buf, data, "float_image_input")
+    tvm.testing.assert_allclose(
+        np.squeeze(tvm_output[0]), np.squeeze(tflite_output[0]), rtol=1e-5, atol=1e-5
+    )
+
+
 #######################################################################
 # Inception
 # ---------
@@ -4197,6 +4241,10 @@ def test_forward_mediapipe_hand_landmark():
     test_forward_coco_ssd_mobilenet_v1()
     test_forward_mediapipe_hand_landmark()
 
+    # End to End Sparse models
+    test_forward_sparse_mobilenet_v1()
+    test_forward_sparse_mobilenet_v2()
+
     # End to End quantized
     test_forward_qnn_inception_v1_net()
     test_forward_qnn_mobilenet_v1_net()

From 1410e6820c70d0fdbc9b5e711291be0854837b01 Mon Sep 17 00:00:00 2001
From: Robert Kimball <bobkimball@gmail.com>
Date: Wed, 13 Jan 2021 08:09:09 -0800
Subject: [PATCH 057/357] Change the all #pragma once to ifdef include guard
 (#7264)

---
 src/runtime/contrib/cblas/gemm_common.h | 5 ++++-
 src/runtime/vulkan/vulkan_common.h      | 5 ++++-
 src/runtime/vulkan/vulkan_module.h      | 5 ++++-
 src/runtime/vulkan/vulkan_shader.h      | 5 ++++-
 src/runtime/vulkan/vulkan_stream.h      | 5 ++++-
 5 files changed, 20 insertions(+), 5 deletions(-)

diff --git a/src/runtime/contrib/cblas/gemm_common.h b/src/runtime/contrib/cblas/gemm_common.h
index 6c31fbdd06a3..9ccfa5183cd6 100644
--- a/src/runtime/contrib/cblas/gemm_common.h
+++ b/src/runtime/contrib/cblas/gemm_common.h
@@ -21,7 +21,9 @@
  * \file tvm/contrib/gemm.h
  * \brief Shared implementation of gemm
  */
-#pragma once
+
+#ifndef TVM_RUNTIME_CONTRIB_CBLAS_GEMM_COMMON_H_
+#define TVM_RUNTIME_CONTRIB_CBLAS_GEMM_COMMON_H_
 
 #include <tvm/runtime/data_type.h>
 #include <tvm/runtime/registry.h>
@@ -215,3 +217,4 @@ inline void CallBatchGemm(TVMArgs args, TVMRetValue* ret, TBatchGemmOp op) {
 
 }  // namespace contrib
 }  // namespace tvm
+#endif  // TVM_RUNTIME_CONTRIB_CBLAS_GEMM_COMMON_H_
diff --git a/src/runtime/vulkan/vulkan_common.h b/src/runtime/vulkan/vulkan_common.h
index da604f6fa792..9cd1f257f091 100644
--- a/src/runtime/vulkan/vulkan_common.h
+++ b/src/runtime/vulkan/vulkan_common.h
@@ -16,7 +16,9 @@
  * specific language governing permissions and limitations
  * under the License.
  */
-#pragma once
+
+#ifndef TVM_RUNTIME_VULKAN_VULKAN_COMMON_H_
+#define TVM_RUNTIME_VULKAN_VULKAN_COMMON_H_
 
 #include <tvm/runtime/c_runtime_api.h>
 #include <tvm/runtime/device_api.h>
@@ -143,3 +145,4 @@ struct VulkanContext {
 }  // namespace vulkan
 }  // namespace runtime
 }  // namespace tvm
+#endif  // TVM_RUNTIME_VULKAN_VULKAN_COMMON_H_
diff --git a/src/runtime/vulkan/vulkan_module.h b/src/runtime/vulkan/vulkan_module.h
index 15c9ec313d63..c75a077a361d 100644
--- a/src/runtime/vulkan/vulkan_module.h
+++ b/src/runtime/vulkan/vulkan_module.h
@@ -16,7 +16,9 @@
  * specific language governing permissions and limitations
  * under the License.
  */
-#pragma once
+
+#ifndef TVM_RUNTIME_VULKAN_VULKAN_MODULE_H_
+#define TVM_RUNTIME_VULKAN_VULKAN_MODULE_H_
 
 #include <string>
 #include <unordered_map>
@@ -35,3 +37,4 @@ Module VulkanModuleCreate(std::unordered_map<std::string, VulkanShader> smap,
 using vulkan::VulkanModuleCreate;
 }  // namespace runtime
 }  // namespace tvm
+#endif  // TVM_RUNTIME_VULKAN_VULKAN_MODULE_H_
diff --git a/src/runtime/vulkan/vulkan_shader.h b/src/runtime/vulkan/vulkan_shader.h
index 7558a95ee45e..c9fbb13e938d 100644
--- a/src/runtime/vulkan/vulkan_shader.h
+++ b/src/runtime/vulkan/vulkan_shader.h
@@ -16,7 +16,9 @@
  * specific language governing permissions and limitations
  * under the License.
  */
-#pragma once
+
+#ifndef TVM_RUNTIME_VULKAN_VULKAN_SHADER_H_
+#define TVM_RUNTIME_VULKAN_VULKAN_SHADER_H_
 
 #include <tvm/runtime/c_runtime_api.h>
 #include <tvm/runtime/device_api.h>
@@ -55,3 +57,4 @@ using vulkan::VulkanShader;
 namespace dmlc {
 DMLC_DECLARE_TRAITS(has_saveload, ::tvm::runtime::vulkan::VulkanShader, true);
 }  // namespace dmlc
+#endif  // TVM_RUNTIME_VULKAN_VULKAN_SHADER_H_
diff --git a/src/runtime/vulkan/vulkan_stream.h b/src/runtime/vulkan/vulkan_stream.h
index c5094bdf28db..d096a644a1f0 100644
--- a/src/runtime/vulkan/vulkan_stream.h
+++ b/src/runtime/vulkan/vulkan_stream.h
@@ -16,7 +16,9 @@
  * specific language governing permissions and limitations
  * under the License.
  */
-#pragma once
+
+#ifndef TVM_RUNTIME_VULKAN_VULKAN_STREAM_H_
+#define TVM_RUNTIME_VULKAN_VULKAN_STREAM_H_
 
 #include <functional>
 #include <memory>
@@ -184,3 +186,4 @@ class VulkanStream {
 }  // namespace vulkan
 }  // namespace runtime
 }  // namespace tvm
+#endif  // TVM_RUNTIME_VULKAN_VULKAN_STREAM_H_

From 384714b58ed374cb1e385142b5dc4128041c945c Mon Sep 17 00:00:00 2001
From: Matthew Brookhart <mbrookhart@octoml.ai>
Date: Wed, 13 Jan 2021 11:12:38 -0700
Subject: [PATCH 058/357] Reorder dynamic to static and simplify inference,
 lower DynamicToStatic Opt Level (#7213)

* reorder dynamic to static and simplify inference, add a dropout unit test

* lower dynamic to static opt level

* autoformat test

* raise DynamicToStatic to opt level 2 to match Constant Folding
---
 src/relay/backend/build_module.cc         |  3 ++-
 src/relay/transforms/dynamic_to_static.cc |  2 +-
 tests/python/relay/test_op_level1.py      | 10 ++++++++++
 3 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/src/relay/backend/build_module.cc b/src/relay/backend/build_module.cc
index e17d9c0e1ca6..08846925bede 100644
--- a/src/relay/backend/build_module.cc
+++ b/src/relay/backend/build_module.cc
@@ -278,10 +278,11 @@ class RelayBuildModule : public runtime::ModuleNode {
       pass_seqs.push_back(transform::Legalize());
     }
 
+    pass_seqs.push_back(transform::SimplifyInference());
+
     // Convert Dynamic ops to static versions
     pass_seqs.push_back(transform::DynamicToStatic());
 
-    pass_seqs.push_back(transform::SimplifyInference());
     PackedFunc fskip = PackedFunc([](TVMArgs args, TVMRetValue* rv) {
       Expr expr = args[0];
       *rv = false;
diff --git a/src/relay/transforms/dynamic_to_static.cc b/src/relay/transforms/dynamic_to_static.cc
index f78d05bd9d2c..c580f60c2a68 100644
--- a/src/relay/transforms/dynamic_to_static.cc
+++ b/src/relay/transforms/dynamic_to_static.cc
@@ -260,7 +260,7 @@ Pass DynamicToStatic() {
       [=](Function f, IRModule m, PassContext pc) {
         return Downcast<Function>(DynamicToStatic(f, m));
       };
-  return CreateFunctionPass(pass_func, 3, "DynamicToStatic", {});
+  return CreateFunctionPass(pass_func, 2, "DynamicToStatic", {});
 }
 
 TVM_REGISTER_GLOBAL("relay._transform.DynamicToStatic").set_body_typed([]() {
diff --git a/tests/python/relay/test_op_level1.py b/tests/python/relay/test_op_level1.py
index 37a59c30f410..54d04da5e092 100644
--- a/tests/python/relay/test_op_level1.py
+++ b/tests/python/relay/test_op_level1.py
@@ -322,6 +322,16 @@ def test_dropout():
         yy = run_infer_type(y)
         assert yy.checked_type == input_ty
 
+    in_np = np.random.random([4, 5, 6]).astype("float32")
+    x = relay.const(in_np)
+    y = relay.nn.dropout(x, rate=0.5)
+    func = relay.Function([], y)
+    for target, ctx in tvm.testing.enabled_targets():
+        for backend in ["debug", "graph"]:
+            intrp = relay.create_executor("debug", ctx=ctx, target=target)
+            op_res = intrp.evaluate(func)()
+            tvm.testing.assert_allclose(op_res.asnumpy(), in_np, rtol=0.01)
+
 
 def test_batch_norm():
     for dtype in ["float16", "float32"]:

From 188c715f8a9ad4942033d810853d52684853747c Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Wed, 13 Jan 2021 14:26:27 -0500
Subject: [PATCH 059/357] [DOCS] Fix figure links (#7268)

---
 docs/dev/index.rst | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/dev/index.rst b/docs/dev/index.rst
index e9ec767fd018..a098df12f1c1 100644
--- a/docs/dev/index.rst
+++ b/docs/dev/index.rst
@@ -49,7 +49,7 @@ In this guide, we will study an example compilation flow in the compiler. The fi
 - Runtime Execution: the user loads back a `runtime.Module` and runs the compiled functions in the supported runtime environment.
 
 
-.. figure:: https://raw.githubusercontent.com/tlcpack/web-data/main/images/design/tvm_dyn_workflow.svg
+.. figure:: https://raw.githubusercontent.com/tlc-pack/web-data/main/images/design/tvm_dyn_workflow.svg
    :align: center
    :width: 85%
 
@@ -201,7 +201,7 @@ except that the data structure of interest changes from the numpy.ndarray to tvm
 Logical Architecture Components
 -------------------------------
 
-.. figure:: https://raw.githubusercontent.com/tlcpack/web-data/main/images/design/tvm_static_overview.svg
+.. figure:: https://raw.githubusercontent.com/tlc-pack/web-data/main/images/design/tvm_static_overview.svg
    :align: center
    :width: 85%
 

From 35dabd6f8df0293319098429cec54dbacaeb3a83 Mon Sep 17 00:00:00 2001
From: Gustavo Romero <gromero@users.noreply.github.com>
Date: Wed, 13 Jan 2021 18:59:21 -0300
Subject: [PATCH 060/357] =?UTF-8?q?[=C2=B5TVM]=20Fix=20two=20warnings=20wh?=
 =?UTF-8?q?en=20deprecated=20forms=20are=20used=20(#7269)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* [µTVM] Specify loader for yaml.load

Specify the loader to be used by yaml.load as the current form used without
specifying explicitly a loader is deprecated since PyYAML 5.1 and will
throw a noisy warning.

For details, please see:
https://github.com/yaml/pyyaml/wiki/PyYAML-yaml.load(input)-Deprecation

Signed-off-by: Gustavo Romero <gustavo.romero@linaro.org>

* [µTVM] Avoid using tvm.target.create

Avoid using tvm.target.create as it's deprecated and use
tvm.target.Target directly instead.

Signed-off-by: Gustavo Romero <gustavo.romero@linaro.org>
---
 python/tvm/micro/compiler.py       | 2 +-
 python/tvm/micro/contrib/zephyr.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/tvm/micro/compiler.py b/python/tvm/micro/compiler.py
index 3b62e9347c7f..8d76555d7c31 100644
--- a/python/tvm/micro/compiler.py
+++ b/python/tvm/micro/compiler.py
@@ -96,7 +96,7 @@ def _target_from_sources(cls, sources):
             )
 
         target_str = next(iter(target_strs))
-        return tvm.target.create(target_str)
+        return tvm.target.Target(target_str)
 
     # Maps regexes identifying CPUs to the default toolchain prefix for that CPU.
     TOOLCHAIN_PREFIX_BY_CPU_REGEX = {
diff --git a/python/tvm/micro/contrib/zephyr.py b/python/tvm/micro/contrib/zephyr.py
index 61aec2b771e0..2451eac3cb61 100644
--- a/python/tvm/micro/contrib/zephyr.py
+++ b/python/tvm/micro/contrib/zephyr.py
@@ -387,7 +387,7 @@ def _get_flash_runner(cls, cmake_entries):
             return flash_runner
 
         with open(cmake_entries["ZEPHYR_RUNNERS_YAML"]) as f:
-            doc = yaml.load(f)
+            doc = yaml.load(f, Loader=yaml.FullLoader)
         return doc["flash-runner"]
 
     def _get_device_args(self, cmake_entries):

From 259652be27babc15ca3490be13ef75f074667252 Mon Sep 17 00:00:00 2001
From: Ritwik Das <ritwikdas54@gmail.com>
Date: Wed, 13 Jan 2021 13:59:42 -0800
Subject: [PATCH 061/357] Adding aten::unsqueeze_ to PT Frontend (#7231)

* Added Ops

* Regular

* Remove copy

* Remove copy

* Tests

* Black

Co-authored-by: Ubuntu <ubuntu@ip-172-31-27-149.us-east-2.compute.internal>
Co-authored-by: Ubuntu <ubuntu@ip-172-31-19-34.us-east-2.compute.internal>
---
 python/tvm/relay/frontend/pytorch.py          | 1 +
 tests/python/frontend/pytorch/test_forward.py | 8 ++++++++
 2 files changed, 9 insertions(+)

diff --git a/python/tvm/relay/frontend/pytorch.py b/python/tvm/relay/frontend/pytorch.py
index ca05954227f8..991e3a8a0032 100644
--- a/python/tvm/relay/frontend/pytorch.py
+++ b/python/tvm/relay/frontend/pytorch.py
@@ -2117,6 +2117,7 @@ def create_convert_map(self):
             "aten::to": self.to,
             "aten::squeeze": self.squeeze,
             "aten::unsqueeze": self.unsqueeze,
+            "aten::unsqueeze_": self.unsqueeze,
             "aten::cat": self.concatenate,
             "aten::slice": self.slice,
             "aten::split": self.split,
diff --git a/tests/python/frontend/pytorch/test_forward.py b/tests/python/frontend/pytorch/test_forward.py
index f76c697a2c81..7cdd450448ca 100644
--- a/tests/python/frontend/pytorch/test_forward.py
+++ b/tests/python/frontend/pytorch/test_forward.py
@@ -447,8 +447,16 @@ class Unsqueeze1(Module):
         def forward(self, *args):
             return args[0].unsqueeze(2)
 
+    class Unsqueeze2(Module):
+        def forward(self, *args):
+            _ = args[0].unsqueeze_(2)
+            # Check whether operations after inplace unsqueeze works as expected
+            y = args[0].squeeze(2)
+            return torch.add(y, y)
+
     input_data = torch.rand(input_shape).float()
     verify_model(Unsqueeze1().float().eval(), input_data=input_data)
+    verify_model(Unsqueeze2().float().eval(), input_data=input_data)
 
 
 @tvm.testing.uses_gpu

From af716e5d98a58f4e3e19c269e7837d9fc19adc64 Mon Sep 17 00:00:00 2001
From: Luis Vega <vegaluisjose@users.noreply.github.com>
Date: Wed, 13 Jan 2021 19:11:34 -0800
Subject: [PATCH 062/357] update vta-hw version (#7271)

---
 3rdparty/vta-hw | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/3rdparty/vta-hw b/3rdparty/vta-hw
index 87ce9acfae55..57db5a718c74 160000
--- a/3rdparty/vta-hw
+++ b/3rdparty/vta-hw
@@ -1 +1 @@
-Subproject commit 87ce9acfae550d1a487746e9d06c2e250076e54c
+Subproject commit 57db5a718c74a788c98120ebbe1230797be698c8

From c11959d3395e850e43a04427b4e4ba203dd2e647 Mon Sep 17 00:00:00 2001
From: Tristan Konolige <tristan.konolige@gmail.com>
Date: Wed, 13 Jan 2021 21:02:35 -0800
Subject: [PATCH 063/357] [FIX] Remove leftovers from check_correctness (#7272)

* [FIX] Remove leftovers from check_correctness

* remove unused numpy import
---
 python/tvm/autotvm/measure/measure_methods.py | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/python/tvm/autotvm/measure/measure_methods.py b/python/tvm/autotvm/measure/measure_methods.py
index f6f0e429a22a..cb801ba72872 100644
--- a/python/tvm/autotvm/measure/measure_methods.py
+++ b/python/tvm/autotvm/measure/measure_methods.py
@@ -31,8 +31,6 @@
 from collections import namedtuple
 import tempfile
 
-import numpy as np
-
 import tvm._ffi
 import tvm.ir.transform
 from tvm import nd, rpc as _rpc
@@ -578,13 +576,6 @@ def run_through_rpc(
             costs = list(costs)
             costs.sort()
             costs = tuple(costs[1:-1])
-
-        # check correctness of output
-        if ref_output:
-            for expected, real in zip(ref_output, args):
-                if not np.allclose(expected, real.asnumpy(), rtol=1e-4):
-                    logger.warning("Wrong Answer!")
-                    errno = MeasureErrorNo.WRONG_ANSWER
     except TVMError as exc:
         msg = str(exc)
         if "Stack trace returned" in msg:

From 8d3c0e79a4cd449e894ae873fa8244f29b6b13a3 Mon Sep 17 00:00:00 2001
From: Animesh Jain <anijain@umich.edu>
Date: Wed, 13 Jan 2021 23:07:01 -0800
Subject: [PATCH 064/357] [CUDA] [Codegen] Ensuring atleast one thread block
 for dynamism (#7273)

---
 src/runtime/thread_storage_scope.h | 6 +++++-
 tests/python/relay/test_any.py     | 9 ++++++---
 2 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/src/runtime/thread_storage_scope.h b/src/runtime/thread_storage_scope.h
index 1917096bb24c..c0393600b60c 100644
--- a/src/runtime/thread_storage_scope.h
+++ b/src/runtime/thread_storage_scope.h
@@ -215,7 +215,11 @@ class ThreadAxisConfig {
     ThreadWorkLoad w;
     std::fill(w.work_size, w.work_size + 6, 1);
     for (size_t i = 0; i < arg_index_map_.size(); ++i) {
-      w.work_size[arg_index_map_[i]] = static_cast<size_t>(x.values[base_ + i].v_int64);
+      // Dynamic shapes can result in 0 dim size. Guard to ensure that the dim size is atleast 1.
+      size_t size = static_cast<size_t>(x.values[base_ + i].v_int64);
+      if (size > 0) {
+        w.work_size[arg_index_map_[i]] = size;
+      }
     }
     return w;
   }
diff --git a/tests/python/relay/test_any.py b/tests/python/relay/test_any.py
index cb3b5d42e553..d30e7873dae7 100644
--- a/tests/python/relay/test_any.py
+++ b/tests/python/relay/test_any.py
@@ -845,7 +845,7 @@ def test_any_softmax():
     verify_any_softmax(any_dims(4), 2, (13, 11, 3, 1), (13, 11, 3, 1))
 
 
-def verify_any_topk(data_shape, kval, np_dshape, dtype, const_k=False):
+def verify_any_topk(data_shape, kval, np_dshape, dtype, ret_type="indices", const_k=False):
     mod = tvm.IRModule()
     data = relay.var("data", shape=data_shape, dtype=dtype)
     np_data = np.random.uniform(size=np_dshape).astype(dtype)
@@ -857,7 +857,9 @@ def verify_any_topk(data_shape, kval, np_dshape, dtype, const_k=False):
         k = relay.var("k", shape=(), dtype="int32")
         args = [data, k]
         in_vals = [np_data, kval]
-    out = relay.topk(data, k, ret_type="indices")
+    out = relay.topk(data, k, ret_type=ret_type)
+    if ret_type == "both":
+        out = out[0]
     mod["main"] = relay.Function(args, out)
 
     sorted = np.argsort(-np_data)
@@ -873,7 +875,8 @@ def verify_any_topk(data_shape, kval, np_dshape, dtype, const_k=False):
 def test_any_topk():
     verify_any_topk(any_dims(1), 5, (10,), "float32")
     verify_any_topk(any_dims(2), 2, (6, 3), "int32")
-    verify_any_topk(any_dims(2), 3, (6, 3), "float32", True)
+    verify_any_topk(any_dims(2), 3, (6, 3), "float32", const_k=True)
+    verify_any_topk(any_dims(1), 0, (0,), "float32", ret_type="both")
 
 
 @tvm.testing.uses_gpu

From 7f4aa247814885c43428b9a7069243974f474709 Mon Sep 17 00:00:00 2001
From: Lianmin Zheng <lianminzheng@gmail.com>
Date: Thu, 14 Jan 2021 06:19:56 -0800
Subject: [PATCH 065/357] [AutoScheduler] Fix layout rewrite for axis with
 extent=1 (#7279)

---
 src/auto_scheduler/compute_dag.cc                |  9 ++++++++-
 .../test_auto_scheduler_layout_rewrite.py        | 16 ++++++++++++++++
 2 files changed, 24 insertions(+), 1 deletion(-)

diff --git a/src/auto_scheduler/compute_dag.cc b/src/auto_scheduler/compute_dag.cc
index 6ce7349c2e61..735f0442b402 100755
--- a/src/auto_scheduler/compute_dag.cc
+++ b/src/auto_scheduler/compute_dag.cc
@@ -873,7 +873,14 @@ std::string GetNewLayout(const State& state, const int stage_id, const Stage& st
       ori_iter_name = new_axis_names[i];
     }
     if (placeholder_axis_names.count(ori_iter_name)) {
-      os << iter->range->extent << ori_iter_name;
+      PrimExpr extent;
+      if (iter->range.defined()) {
+        extent = iter->range->extent;
+      } else {
+        // This iter is simplified by InferBound, so it must have a length of one.
+        extent = 1;
+      }
+      os << extent << ori_iter_name;
       new_names.push_back(ori_iter_name);
     }
   }
diff --git a/tests/python/unittest/test_auto_scheduler_layout_rewrite.py b/tests/python/unittest/test_auto_scheduler_layout_rewrite.py
index 6ca56bde7c60..2fae7b838143 100644
--- a/tests/python/unittest/test_auto_scheduler_layout_rewrite.py
+++ b/tests/python/unittest/test_auto_scheduler_layout_rewrite.py
@@ -49,6 +49,21 @@ def test_apply_steps_with_layout_rewrite():
     assert bufs[1].shape[1] == 512
 
 
+def test_apply_steps_with_layout_rewrite_corner_case():
+    A, B, C = matmul_auto_scheduler_test(1, 1, 1)
+    dag = auto_scheduler.ComputeDAG([A, B, C])
+
+    s = dag.get_init_state()
+
+    s.compute_root(C)
+    i_j_fused = s.fuse(C, [s[C].iters[0], s[C].iters[1]])
+    s.parallel(C, i_j_fused)
+
+    _, bufs = dag.apply_steps_from_state(
+        s, layout_rewrite=auto_scheduler.LayoutRewriteOption.REWRITE_FOR_PRE_TRANSFORMED
+    )
+
+
 @tvm.testing.requires_llvm
 def test_correctness_layout_rewrite_rewrite_for_preTransformed():
     N = 128
@@ -169,5 +184,6 @@ def test_correctness_layout_rewrite_insert_transform_stage():
 
 if __name__ == "__main__":
     test_apply_steps_with_layout_rewrite()
+    test_apply_steps_with_layout_rewrite_corner_case()
     test_correctness_layout_rewrite_rewrite_for_preTransformed()
     test_correctness_layout_rewrite_insert_transform_stage()

From 51a2d664187ab8f0cdbc32ca87a3caa97a1bb049 Mon Sep 17 00:00:00 2001
From: Lianmin Zheng <lianminzheng@gmail.com>
Date: Thu, 14 Jan 2021 08:11:19 -0800
Subject: [PATCH 066/357] [AutoScheduler] Fix typos in feature extraction and
 cost model (#7280)

---
 .../tvm/auto_scheduler/cost_model/cost_model.py   |  4 ++--
 python/tvm/auto_scheduler/cost_model/xgb_model.py | 15 +++++++++++++++
 python/tvm/auto_scheduler/feature.py              | 10 +++-------
 python/tvm/autotvm/tuner/xgboost_tuner.py         |  2 +-
 src/auto_scheduler/feature.cc                     |  2 +-
 5 files changed, 22 insertions(+), 11 deletions(-)

diff --git a/python/tvm/auto_scheduler/cost_model/cost_model.py b/python/tvm/auto_scheduler/cost_model/cost_model.py
index 32e276b31c6a..9ef4bcac7a99 100644
--- a/python/tvm/auto_scheduler/cost_model/cost_model.py
+++ b/python/tvm/auto_scheduler/cost_model/cost_model.py
@@ -15,7 +15,7 @@
 # specific language governing permissions and limitations
 # under the License.
 
-""" Cost model that estimates the performance of programs """
+""" Cost models that estimate the performance of programs """
 import ctypes
 import numpy as np
 
@@ -31,7 +31,7 @@ class CostModel(Object):
 
 @tvm._ffi.register_object("auto_scheduler.RandomModel")
 class RandomModel(CostModel):
-    """A model returns random estimation for all inputs"""
+    """A model that returns random estimation for all inputs"""
 
     def __init__(self):
         self.__init_handle_by_constructor__(_ffi_api.RandomModel)
diff --git a/python/tvm/auto_scheduler/cost_model/xgb_model.py b/python/tvm/auto_scheduler/cost_model/xgb_model.py
index f42648288bfa..aab36c175c3c 100644
--- a/python/tvm/auto_scheduler/cost_model/xgb_model.py
+++ b/python/tvm/auto_scheduler/cost_model/xgb_model.py
@@ -86,6 +86,21 @@ class XGBModel(PythonBasedModel):
     of several samples, so we implemented a custom loss function and call it pack-sum-rmse.
     It is called "pack-sum" because we combine several samples into a "pack" and sum up
     their predictions.
+
+    Parameters
+    ----------
+    verbose_eval: int = 25
+        Print training log every `verbose_eval` iterations.
+    num_warmup_sample: int = 100
+        The minimum number of samples to start to use the trained model.
+        If the number of samples is less than this number, the model outputs random predictions.
+    seed: Optional[int]
+        The random seed
+    model_file: Optional[str]
+        If is not None, save model to this file after every update.
+    adapative_training: bool = False
+        Whether to use adapatie training, which reduces the training frequency when there are
+        too many logs.
     """
 
     def __init__(
diff --git a/python/tvm/auto_scheduler/feature.py b/python/tvm/auto_scheduler/feature.py
index 4c1883ad263f..bd6526187581 100644
--- a/python/tvm/auto_scheduler/feature.py
+++ b/python/tvm/auto_scheduler/feature.py
@@ -80,7 +80,7 @@ def unpack_feature(byte_arr: bytearray) -> Tuple[np.ndarray, np.ndarray, np.ndar
       ... // until i == n - 1
 
       float throughputs[sizes[n]];  // The normalized throughputs for n records
-      int   task_ids[size[n+1];   // The task ids for n records
+      int   task_ids[size[n+1]];    // The task ids for n records
 
     }
     To implement this format, we also store int as float, so we can store all numbers
@@ -135,7 +135,7 @@ def unpack_feature(byte_arr: bytearray) -> Tuple[np.ndarray, np.ndarray, np.ndar
     # unpack normalized_throughputs
     m = sizes[-2]
     normalized_throughputs = struct.unpack_from("%df" % m, byte_arr, offset=offset)
-    offset += m * SIZE_OF_INT32
+    offset += m * SIZE_OF_FLOAT32
 
     # unpack task_ids
     m = sizes[-1]
@@ -211,7 +211,7 @@ def get_per_store_features_from_measure_pairs(
 
 def get_per_store_features_from_states(
     states: List[Union[State, StateObject]], task: "SearchTask", max_n_bufs: Optional[int] = None
-) -> List[np.ndarray]:
+) -> np.ndarray:
     """Get per-store features from measurement input/result pairs
 
     Parameters
@@ -227,10 +227,6 @@ def get_per_store_features_from_states(
     -------
     features: np.ndarray
         Feature vectors
-    normalized_throughputs: np.ndarray
-        Normalized throughputs
-    task_ids: np.ndarray
-        Task ids
     """
     if isinstance(states[0], State):
         state_objects = [s.state_object for s in states]
diff --git a/python/tvm/autotvm/tuner/xgboost_tuner.py b/python/tvm/autotvm/tuner/xgboost_tuner.py
index 8f8ddfe7bd4e..2f4d0ee88ce9 100644
--- a/python/tvm/autotvm/tuner/xgboost_tuner.py
+++ b/python/tvm/autotvm/tuner/xgboost_tuner.py
@@ -64,7 +64,7 @@ class XGBTuner(ModelBasedTuner):
         top-(plan_size * diversity_filter_ratio) candidates according to the cost model
         and then pick batch_size of them according to the diversity metric.
 
-    log_interval: int, optional
+    log_interval: int = 50
         The verbose level.
         If is 0, output nothing.
         Otherwise, output debug information every `verbose` iterations.
diff --git a/src/auto_scheduler/feature.cc b/src/auto_scheduler/feature.cc
index a5d4958af769..1b10cd5f2601 100755
--- a/src/auto_scheduler/feature.cc
+++ b/src/auto_scheduler/feature.cc
@@ -1518,7 +1518,7 @@ void GetPerStoreFeaturesFromMeasurePairs(const Array<MeasureInput>& inputs,
  *   ... // until i == n - 1
  *
  *   float throughputs[sizes[n]];  // The normalized throughputs for n records
- *   int   task_ids[size[n+1];   // The task ids for n records
+ *   int   task_ids[size[n+1]];   // The task ids for n records
  *
  * }
  * To implement this format, we also store int as float, so we can store all numbers

From 1677bb22e6220f4fd2b77b83afd581e9357751cd Mon Sep 17 00:00:00 2001
From: "Steven S. Lyubomirsky" <sslyu@cs.washington.edu>
Date: Thu, 14 Jan 2021 11:29:55 -0500
Subject: [PATCH 067/357] [PatternLang][Bugfix] Ensure CallNode attrs are not
 undefined before checking (#7278)

* Correct handling of call node attrs to handle non-operator calls (attrs may be undefined)

* Linting fix
---
 src/relay/ir/dataflow_matcher.cc            |  6 ++++-
 tests/python/relay/test_dataflow_pattern.py | 29 +++++++++++++++++++++
 2 files changed, 34 insertions(+), 1 deletion(-)

diff --git a/src/relay/ir/dataflow_matcher.cc b/src/relay/ir/dataflow_matcher.cc
index e4c0c7fa1c94..f4ea02a40d52 100644
--- a/src/relay/ir/dataflow_matcher.cc
+++ b/src/relay/ir/dataflow_matcher.cc
@@ -168,7 +168,11 @@ bool DFPatternMatcher::VisitDFPattern_(const AttrPatternNode* attr_pattern, cons
     // and replace the whole thing with a Visitor-based approach
     ReflectionVTable* reflection = ReflectionVTable::Global();
     auto attrs_node = const_cast<BaseAttrsNode*>(op->attrs.get());
-    auto attr_names = reflection->ListAttrNames(attrs_node);
+    // attrs may be undefined on non-op calls so we check first
+    std::vector<std::string> attr_names;
+    if (attrs_node) {
+      attr_names = reflection->ListAttrNames(attrs_node);
+    }
     for (auto kv : attributes) {
       std::string attr = kv.first;
       if (matches && std::find(attr_names.begin(), attr_names.end(), attr) != attr_names.end()) {
diff --git a/tests/python/relay/test_dataflow_pattern.py b/tests/python/relay/test_dataflow_pattern.py
index f30a4e747c33..8a2407adb303 100644
--- a/tests/python/relay/test_dataflow_pattern.py
+++ b/tests/python/relay/test_dataflow_pattern.py
@@ -389,6 +389,20 @@ def test_match_call_attr():
     y = relay.var("y")
     assert is_conv2d.match(relay.op.nn.conv2d(x, y))
 
+    # non-operator call
+    attr_dict = {"call_attr": "attr"}
+    call_has_attr = wildcard()(wildcard()).has_attr(attr_dict)
+    call_attr = tvm.ir.make_node("DictAttrs", **attr_dict)
+    a = relay.Var("a")
+    b = relay.Var("b")
+    assert call_has_attr.match(relay.Call(a, [b], attrs=call_attr))
+
+    # empty attrs should match anything
+    empty_attrs = tvm.ir.make_node("DictAttrs", **{})
+    call_has_empty_attrs = wildcard()(wildcard()).has_attr({})
+    assert call_has_empty_attrs.match(relay.Call(a, [b], attrs=empty_attrs))
+    assert call_has_empty_attrs.match(relay.Call(a, [b], attrs=call_attr))
+
 
 def test_no_match_call_attr():
     x = relay.var("x")
@@ -400,6 +414,21 @@ def test_no_match_call_attr():
     is_conv2d = is_op("nn.conv2d")(wildcard(), wildcard()).has_attr({"RandomAttr": "NCHW"})
     assert not is_conv2d.match(relay.op.nn.conv2d(x, y))
 
+    # non-operator calls
+    call_has_attr = wildcard()(wildcard()).has_attr({"call_attr": "attr"})
+    wrong_key = tvm.ir.make_node("DictAttrs", **{"wrong": "attr"})
+    wrong_value = tvm.ir.make_node("DictAttrs", **{"call_attr": "wrong"})
+    empty_attrs = tvm.ir.make_node("DictAttrs", **{})
+
+    a = relay.Var("a")
+    b = relay.Var("b")
+    # attrs left undefined
+    assert not call_has_attr.match(relay.Call(a, [b]))
+    # wrong attrs
+    assert not call_has_attr.match(relay.Call(a, [b], attrs=wrong_key))
+    assert not call_has_attr.match(relay.Call(a, [b], attrs=wrong_value))
+    assert not call_has_attr.match(relay.Call(a, [b], attrs=empty_attrs))
+
 
 def test_match_call_attr_dtype():
     is_cast = is_op("cast")(wildcard()).has_attr({"dtype": "float32"})

From bb6c26beb213810cdc3f9c6837af890142aaaa9e Mon Sep 17 00:00:00 2001
From: Andrew Reusch <areusch@octoml.ai>
Date: Thu, 14 Jan 2021 10:41:03 -0800
Subject: [PATCH 068/357] switch to more portable bash pipeline syntax (#7274)

---
 tests/scripts/task_sphinx_precheck.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/scripts/task_sphinx_precheck.sh b/tests/scripts/task_sphinx_precheck.sh
index fd67b0ab539b..894f7471bde4 100755
--- a/tests/scripts/task_sphinx_precheck.sh
+++ b/tests/scripts/task_sphinx_precheck.sh
@@ -36,7 +36,7 @@ make cython3
 echo "PreCheck sphinx doc generation WARNINGS.."
 cd docs
 make clean
-TVM_TUTORIAL_EXEC_PATTERN=none make html |& tee /tmp/$$.log.txt
+TVM_TUTORIAL_EXEC_PATTERN=none make html 2>1 | tee /tmp/$$.log.txt
 
 grep -v -E "__mro__|UserWarning|FutureWarning|tensorflow|Keras|pytorch|TensorFlow|403" < /tmp/$$.log.txt > /tmp/$$.logclean.txt || true
 echo "---------Sphinx Log----------"

From ac29624bdc918ae1c926173a5b55cd35839e05f1 Mon Sep 17 00:00:00 2001
From: Tom Gall <tom.gall@linaro.org>
Date: Thu, 14 Jan 2021 12:41:45 -0600
Subject: [PATCH 069/357] Add MicroTVM support for the STM32F746 Discovery
 board (#7225)

* Add MicroTVM support for the STM32F746 Discovery board

Signed-off-by: Tom Gall <tom.gall@linaro.org>

* Add reference to the discovery board in the docs

Signed-off-by: Tom Gall <tom.gall@linaro.org>
---
 docs/microtvm/index.rst            | 1 +
 python/tvm/micro/contrib/zephyr.py | 1 +
 2 files changed, 2 insertions(+)

diff --git a/docs/microtvm/index.rst b/docs/microtvm/index.rst
index 68583fed31f4..2371219af27f 100644
--- a/docs/microtvm/index.rst
+++ b/docs/microtvm/index.rst
@@ -42,6 +42,7 @@ flexible and portable to other processors such as RISC-V and does not require Ze
 demos run against QEMU and the following hardware:
 
 * `STM Nucleo-F746ZG <https://www.st.com/en/evaluation-tools/nucleo-f746zg.html>`_
+* `STM STM32F746 Discovery <https://www.st.com/en/evaluation-tools/32f746gdiscovery.html>`_
 * `nRF 5340 Preview Development Kit <https://www.nordicsemi.com/Software-and-tools/Development-Kits/nRF5340-PDK>`_
 
 
diff --git a/python/tvm/micro/contrib/zephyr.py b/python/tvm/micro/contrib/zephyr.py
index 2451eac3cb61..fa032e20c930 100644
--- a/python/tvm/micro/contrib/zephyr.py
+++ b/python/tvm/micro/contrib/zephyr.py
@@ -352,6 +352,7 @@ def _get_nrf_device_args(self):
     # kwargs passed to usb.core.find to find attached boards for the openocd flash runner.
     BOARD_USB_FIND_KW = {
         "nucleo_f746zg": {"idVendor": 0x0483, "idProduct": 0x374B},
+        "stm32f746g_disco": {"idVendor": 0x0483, "idProduct": 0x374B},
     }
 
     def openocd_serial(self, cmake_entries):

From f503d826be14091097b0525f89d6b1231a00cab9 Mon Sep 17 00:00:00 2001
From: Andrew Reusch <areusch@octoml.ai>
Date: Thu, 14 Jan 2021 11:11:22 -0800
Subject: [PATCH 070/357] fix mcpu on os x (#7276)

---
 python/tvm/micro/compiler.py              | 6 ++++++
 python/tvm/target/target.py               | 2 +-
 tests/python/unittest/test_link_params.py | 2 +-
 3 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/python/tvm/micro/compiler.py b/python/tvm/micro/compiler.py
index 8d76555d7c31..f59ac8dbc4a0 100644
--- a/python/tvm/micro/compiler.py
+++ b/python/tvm/micro/compiler.py
@@ -106,6 +106,12 @@ def _target_from_sources(cls, sources):
     }
 
     def _autodetect_toolchain_prefix(self, target):
+        # Treat absence of -mcpu as if -mcpu=native is specified. The gcc shipped with OS X
+        # complains if -mcpu=native is given, so this approach allows model targets to avoid
+        # specifying this flag e.g. for tutorials.
+        if "mcpu" not in target.attrs:
+            return self.TOOLCHAIN_PREFIX_BY_CPU_REGEX["native"]
+
         matches = []
         for regex, prefix in self.TOOLCHAIN_PREFIX_BY_CPU_REGEX.items():
             if re.match(regex, target.attrs["mcpu"]):
diff --git a/python/tvm/target/target.py b/python/tvm/target/target.py
index edbb0fa3792a..161cd549fade 100644
--- a/python/tvm/target/target.py
+++ b/python/tvm/target/target.py
@@ -232,7 +232,7 @@ def micro(model="unknown", options=None):
         Additional options
     """
     trans_table = {
-        "host": ["-mcpu=native"],
+        "host": [],
         "stm32f746xx": ["-mcpu=cortex-m7", "-march=armv7e-m"],
     }
     opts = _merge_opts(
diff --git a/tests/python/unittest/test_link_params.py b/tests/python/unittest/test_link_params.py
index da87a3177c7c..52d7a27838d7 100644
--- a/tests/python/unittest/test_link_params.py
+++ b/tests/python/unittest/test_link_params.py
@@ -347,7 +347,7 @@ def test_crt_link_params():
         mod, param_init = _make_mod_and_params(dtype)
         rand_input = _make_random_tensor(dtype, INPUT_SHAPE)
         main_func = mod["main"]
-        target = "c -mcpu=native --system-lib --runtime=c --link-params"
+        target = "c --system-lib --runtime=c --link-params"
         with tvm.transform.PassContext(opt_level=3, config={"tir.disable_vectorize": True}):
             graph_json, lib, params = tvm.relay.build(mod, target, params=param_init)
             assert set(params.keys()) == {"p0", "p1"}  # NOTE: op folded

From d7a9a7c5bc5952e43db2ac0113fd40cc77453fbe Mon Sep 17 00:00:00 2001
From: masahi <masahi129@gmail.com>
Date: Fri, 15 Jan 2021 06:15:00 +0900
Subject: [PATCH 071/357] [PatternLang] Add If pattern (#7282)

* Add if pattern

commit 1ee052fd494a5bdd881c242c3ea0c95cf2a613e5
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Sat Dec 26 22:19:17 2020 +0900

    add comment

commit c846a6999e9c9e48fbc019780e705a990f46cb22
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Sat Dec 26 21:14:20 2020 +0900

    max_out_size rewrite added to the test

commit 2c7c7fbd0e6563aba694e7fb6baa7bda8e4fadca
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Sat Dec 26 20:57:55 2020 +0900

    max_out_size rewrite working

commit 319e930acb8162c1ec4a5d4fb71d134580a68f13
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Sat Dec 26 20:43:16 2020 +0900

    refactor dyn strided slice pattern

commit fb6917b703440748800bde624bc20efaf5798b8a
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Sat Dec 26 11:21:33 2020 +0900

    update NMS pattern following frontend change

commit 255a98f1da8f300d4fe417cce3587c0d71e38ed3
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Thu Dec 24 05:19:31 2020 +0900

    add some comment to explain the pattern

commit 52cea1cc2bff533ca60acfc2416477fc8b058428
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Wed Dec 23 08:35:14 2020 +0900

    revert tutorial change

commit d3e0e0d7e2427c40067d6ad2680ec5b3f0076223
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Wed Dec 23 08:02:29 2020 +0900

    test fixed by setting force_surpress=False

commit 2fa1a574f932001be2d8f601338a342dab92f79c
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Wed Dec 23 07:22:32 2020 +0900

    fixed coord_start

commit 6ba88f27dec1bdb0b0ba746c268591a59264088e
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Wed Dec 23 06:50:46 2020 +0900

    add doc

commit 8d386b6a1c92ce4fe3349ff20e320199a1b5b310
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Wed Dec 23 05:27:26 2020 +0900

    updated tutorial

commit 3206b49ecfdd874e0ff8feb0fa586c4c4282f705
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Wed Dec 23 05:04:44 2020 +0900

    update object detection test to add rewrite

commit 74bebb2f4376aeb67d8c4aad395f9f2661fe6b3e
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Wed Dec 23 05:02:15 2020 +0900

    add a pattern to rewrite nms to batched nms

commit f410e6dde0ed949b90312c5a7ddbb6c234f9acc1
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Sat Dec 26 22:20:16 2020 +0900

    add comment

commit f1e078b0724bd22e7be0a812055e1c7c650d94da
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Sat Dec 26 19:54:22 2020 +0900

    Add if pattern

* add doc

* add test

* doc formatting

* cpplint fix
---
 docs/langref/relay_pattern.rst                | 16 +++++++
 include/tvm/relay/dataflow_pattern.h          | 20 +++++++++
 include/tvm/relay/dataflow_pattern_functor.h  |  3 ++
 python/tvm/relay/dataflow_pattern/__init__.py | 43 +++++++++++++++++++
 src/relay/ir/dataflow_matcher.cc              | 12 ++++++
 src/relay/ir/dataflow_pattern.cc              | 22 ++++++++++
 src/relay/ir/dataflow_pattern_functor.cc      |  6 +++
 src/relay/ir/indexed_graph.cc                 |  6 +++
 tests/python/relay/test_dataflow_pattern.py   | 38 ++++++++++++++++
 9 files changed, 166 insertions(+)

diff --git a/docs/langref/relay_pattern.rst b/docs/langref/relay_pattern.rst
index ff02e50eb5fb..992954c9a5b1 100644
--- a/docs/langref/relay_pattern.rst
+++ b/docs/langref/relay_pattern.rst
@@ -230,6 +230,21 @@ The next example is matching function nodes with a specific attribute:
         f = relay.Function([x, y], x + y).with_attr("Composite", "add")
         assert pattern.match(f)
 
+A Relay ``If`` expression can be matched if all of its condition, true branch and false branch
+are matched:
+
+.. code-block:: python
+
+    def test_match_if():
+        x = is_var("x")
+        y = is_var("y")
+        pat = is_if(is_op("less")(x, y), x, y)
+
+        x = relay.var("x")
+        y = relay.var("y")
+        cond = x < y
+
+        assert pat.match(relay.expr.If(cond, x, y))
 
 Matching Diamonds and Post-Dominator Graphs
 *******************************************
@@ -294,6 +309,7 @@ The high level design is to introduce a language of patterns for now we propose
             | is_op(op_name)
             | is_tuple()
             | is_tuple_get_item(pattern, index = None)
+            | is_if(cond, tru, fls)
             | pattern1 `|` pattern2
             | dominates(parent_pattern, path_pattern, child_pattern)
             | FunctionPattern(params, body)
diff --git a/include/tvm/relay/dataflow_pattern.h b/include/tvm/relay/dataflow_pattern.h
index 5b2734f52ede..1b0c0aca7ff6 100644
--- a/include/tvm/relay/dataflow_pattern.h
+++ b/include/tvm/relay/dataflow_pattern.h
@@ -260,6 +260,26 @@ class TupleGetItemPatternNode : public DFPatternNode {
   TVM_DECLARE_FINAL_OBJECT_INFO(TupleGetItemPatternNode, DFPatternNode);
 };
 
+class IfPatternNode : public DFPatternNode {
+ public:
+  DFPattern cond, true_branch, false_branch;
+
+  void VisitAttrs(tvm::AttrVisitor* v) {
+    v->Visit("cond", &cond);
+    v->Visit("true_branch", &true_branch);
+    v->Visit("false_branch", &false_branch);
+  }
+
+  static constexpr const char* _type_key = "relay.dataflow_pattern.IfPattern";
+  TVM_DECLARE_FINAL_OBJECT_INFO(IfPatternNode, DFPatternNode);
+};
+
+class IfPattern : public DFPattern {
+ public:
+  TVM_DLL IfPattern(DFPattern cond, DFPattern then_clause, DFPattern else_clause);
+  TVM_DEFINE_OBJECT_REF_METHODS(IfPattern, DFPattern, IfPatternNode);
+};
+
 class TupleGetItemPattern : public DFPattern {
  public:
   TVM_DLL TupleGetItemPattern(DFPattern tuple, int index);
diff --git a/include/tvm/relay/dataflow_pattern_functor.h b/include/tvm/relay/dataflow_pattern_functor.h
index f04977b86ccb..bff9e23ef046 100644
--- a/include/tvm/relay/dataflow_pattern_functor.h
+++ b/include/tvm/relay/dataflow_pattern_functor.h
@@ -91,6 +91,7 @@ class DFPatternFunctor<R(const DFPattern& n, Args...)> {
   virtual R VisitDFPattern_(const ShapePatternNode* op, Args... args) DFPATTERN_FUNCTOR_DEFAULT;
   virtual R VisitDFPattern_(const TupleGetItemPatternNode* op,
                             Args... args) DFPATTERN_FUNCTOR_DEFAULT;
+  virtual R VisitDFPattern_(const IfPatternNode* op, Args... args) DFPATTERN_FUNCTOR_DEFAULT;
   virtual R VisitDFPattern_(const TuplePatternNode* op, Args... args) DFPATTERN_FUNCTOR_DEFAULT;
   virtual R VisitDFPattern_(const TypePatternNode* op, Args... args) DFPATTERN_FUNCTOR_DEFAULT;
   virtual R VisitDFPattern_(const VarPatternNode* op, Args... args) DFPATTERN_FUNCTOR_DEFAULT;
@@ -116,6 +117,7 @@ class DFPatternFunctor<R(const DFPattern& n, Args...)> {
     RELAY_DFPATTERN_FUNCTOR_DISPATCH(FunctionPatternNode);
     RELAY_DFPATTERN_FUNCTOR_DISPATCH(ShapePatternNode);
     RELAY_DFPATTERN_FUNCTOR_DISPATCH(TupleGetItemPatternNode);
+    RELAY_DFPATTERN_FUNCTOR_DISPATCH(IfPatternNode);
     RELAY_DFPATTERN_FUNCTOR_DISPATCH(TuplePatternNode);
     RELAY_DFPATTERN_FUNCTOR_DISPATCH(TypePatternNode);
     RELAY_DFPATTERN_FUNCTOR_DISPATCH(VarPatternNode);
@@ -144,6 +146,7 @@ class DFPatternVisitor : public DFPatternFunctor<void(const DFPattern&)> {
   void VisitDFPattern_(const ShapePatternNode* op) override;
   void VisitDFPattern_(const TupleGetItemPatternNode* op) override;
   void VisitDFPattern_(const TuplePatternNode* op) override;
+  void VisitDFPattern_(const IfPatternNode* op) override;
   void VisitDFPattern_(const TypePatternNode* op) override;
   void VisitDFPattern_(const VarPatternNode* op) override;
   void VisitDFPattern_(const WildcardPatternNode* op) override;
diff --git a/python/tvm/relay/dataflow_pattern/__init__.py b/python/tvm/relay/dataflow_pattern/__init__.py
index f5161ad0bfa7..6f764e1651da 100644
--- a/python/tvm/relay/dataflow_pattern/__init__.py
+++ b/python/tvm/relay/dataflow_pattern/__init__.py
@@ -314,6 +314,29 @@ def is_tuple_get_item(tuple_value: "DFPattern", index: Optional[int] = None) ->
     return TupleGetItemPattern(tuple_value, index)
 
 
+def is_if(cond, true_branch, false_branch):
+    """
+    Syntatic sugar for creating an IfPattern.
+
+    Parameters
+    ----------
+    cond: tvm.relay.dataflow_pattern.DFPattern
+        The pattern describing the condition of If.
+
+    true_branch: tvm.relay.dataflow_pattern.DFPattern
+        The pattern describing the true branch of If.
+
+    false_branch: tvm.relay.dataflow_pattern.DFPattern
+        The pattern describing the false branch of If.
+
+    Returns
+    -------
+    result: tvm.relay.dataflow_pattern.DFPattern
+        The resulting pattern.
+    """
+    return IfPattern(cond, true_branch, false_branch)
+
+
 def wildcard() -> "DFPattern":
     """
     Syntatic sugar for creating a WildcardPattern.
@@ -536,6 +559,26 @@ def __init__(
         self.__init_handle_by_constructor__(ffi.FunctionPattern, params, body)
 
 
+@register_df_node
+class IfPattern(DFPattern):
+    """A patern matching a Relay If.
+
+    Parameters
+    ----------
+    cond: tvm.relay.dataflow_pattern.DFPattern
+        The pattern describing the condition of If.
+
+    true_branch: tvm.relay.dataflow_pattern.DFPattern
+        The pattern describing the true branch of If.
+
+    false_branch: tvm.relay.dataflow_pattern.DFPattern
+        The pattern describing the false branch of If.
+    """
+
+    def __init__(self, cond: "DFPattern", true_branch: "DFPattern", false_branch: "DFPattern"):
+        self.__init_handle_by_constructor__(ffi.IfPattern, cond, true_branch, false_branch)
+
+
 @register_df_node
 class TuplePattern(DFPattern):
     """A patern matching a Relay Tuple.
diff --git a/src/relay/ir/dataflow_matcher.cc b/src/relay/ir/dataflow_matcher.cc
index f4ea02a40d52..459694b8f679 100644
--- a/src/relay/ir/dataflow_matcher.cc
+++ b/src/relay/ir/dataflow_matcher.cc
@@ -58,6 +58,7 @@ class DFPatternMatcher : public DFPatternFunctor<bool(const DFPattern&, const Ex
   bool VisitDFPattern_(const ShapePatternNode* op, const Expr& expr) override;
   bool VisitDFPattern_(const TupleGetItemPatternNode* op, const Expr& expr) override;
   bool VisitDFPattern_(const TuplePatternNode* op, const Expr& expr) override;
+  bool VisitDFPattern_(const IfPatternNode* op, const Expr& expr) override;
   bool VisitDFPattern_(const TypePatternNode* op, const Expr& expr) override;
   bool VisitDFPattern_(const VarPatternNode* op, const Expr& expr) override;
   bool VisitDFPattern_(const WildcardPatternNode* op, const Expr& expr) override;
@@ -411,6 +412,17 @@ bool DFPatternMatcher::VisitDFPattern_(const TuplePatternNode* op, const Expr& e
   return matches;
 }
 
+bool DFPatternMatcher::VisitDFPattern_(const IfPatternNode* op, const Expr& expr) {
+  if (const auto* if_node = expr.as<IfNode>()) {
+    auto cond = if_node->cond;
+    auto true_branch = if_node->true_branch;
+    auto false_branch = if_node->false_branch;
+    return VisitDFPattern(op->cond, cond) && VisitDFPattern(op->true_branch, true_branch) &&
+           VisitDFPattern(op->false_branch, false_branch);
+  }
+  return false;
+}
+
 Expr InferType(const Expr& expr) {
   auto mod = IRModule::FromExpr(expr);
   mod = transform::InferType()(mod);
diff --git a/src/relay/ir/dataflow_pattern.cc b/src/relay/ir/dataflow_pattern.cc
index 086c3852b13f..1e268fb00d97 100644
--- a/src/relay/ir/dataflow_pattern.cc
+++ b/src/relay/ir/dataflow_pattern.cc
@@ -112,6 +112,28 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable)
       p->stream << "FunctionPatternNode(" << node->params << ", " << node->body << ")";
     });
 
+IfPattern::IfPattern(DFPattern cond, DFPattern true_branch, DFPattern false_branch) {
+  ObjectPtr<IfPatternNode> n = make_object<IfPatternNode>();
+  n->cond = std::move(cond);
+  n->true_branch = std::move(true_branch);
+  n->false_branch = std::move(false_branch);
+  data_ = std::move(n);
+}
+
+TVM_REGISTER_NODE_TYPE(IfPatternNode);
+
+TVM_REGISTER_GLOBAL("relay.dataflow_pattern.IfPattern")
+    .set_body_typed([](DFPattern cond, DFPattern true_branch, DFPattern false_branch) {
+      return IfPattern(cond, true_branch, false_branch);
+    });
+
+TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable)
+    .set_dispatch<IfPatternNode>([](const ObjectRef& ref, ReprPrinter* p) {
+      auto* node = static_cast<const IfPatternNode*>(ref.get());
+      p->stream << "IfPattern(" << node->cond << ", " << node->true_branch << ", "
+                << node->false_branch << ")";
+    });
+
 TuplePattern::TuplePattern(tvm::Array<DFPattern> fields) {
   ObjectPtr<TuplePatternNode> n = make_object<TuplePatternNode>();
   n->fields = std::move(fields);
diff --git a/src/relay/ir/dataflow_pattern_functor.cc b/src/relay/ir/dataflow_pattern_functor.cc
index aaa4f84b3254..25b247306229 100644
--- a/src/relay/ir/dataflow_pattern_functor.cc
+++ b/src/relay/ir/dataflow_pattern_functor.cc
@@ -81,6 +81,12 @@ void DFPatternVisitor::VisitDFPattern_(const TuplePatternNode* op) {
   }
 }
 
+void DFPatternVisitor::VisitDFPattern_(const IfPatternNode* op) {
+  VisitDFPattern(op->cond);
+  VisitDFPattern(op->true_branch);
+  VisitDFPattern(op->false_branch);
+}
+
 void DFPatternVisitor::VisitDFPattern_(const TypePatternNode* op) { VisitDFPattern(op->pattern); }
 
 void DFPatternVisitor::VisitDFPattern_(const VarPatternNode* op) {}
diff --git a/src/relay/ir/indexed_graph.cc b/src/relay/ir/indexed_graph.cc
index 4ba053c429de..9ee5c9cf6b85 100644
--- a/src/relay/ir/indexed_graph.cc
+++ b/src/relay/ir/indexed_graph.cc
@@ -282,6 +282,12 @@ IndexedGraph<DFPattern> CreateIndexedGraph(const DFPattern& pattern) {
       }
     }
 
+    void VisitDFPattern_(const IfPatternNode* op, NodePtr parent) override {
+      VisitDFPattern(op->cond, graph_.node_map_[GetRef<DFPattern>(op)]);
+      VisitDFPattern(op->true_branch, graph_.node_map_[GetRef<DFPattern>(op)]);
+      VisitDFPattern(op->false_branch, graph_.node_map_[GetRef<DFPattern>(op)]);
+    }
+
     void VisitDFPattern_(const TypePatternNode* op, NodePtr parent) override {
       VisitDFPattern(op->pattern, graph_.node_map_[GetRef<DFPattern>(op)]);
     }
diff --git a/tests/python/relay/test_dataflow_pattern.py b/tests/python/relay/test_dataflow_pattern.py
index 8a2407adb303..934ebf462b95 100644
--- a/tests/python/relay/test_dataflow_pattern.py
+++ b/tests/python/relay/test_dataflow_pattern.py
@@ -127,6 +127,17 @@ def test_AttrPattern():
     assert op.attrs["TOpPattern"] == K_ELEMWISE
 
 
+def test_IfPattern():
+    x = is_var("x")
+    y = is_var("y")
+    pat = is_if(is_op("less")(x, y), x, y)
+
+    assert isinstance(pat, IfPattern)
+    assert isinstance(pat.cond, CallPattern)
+    assert isinstance(pat.true_branch, VarPattern)
+    assert isinstance(pat.false_branch, VarPattern)
+
+
 ## MATCHER TESTS
 
 
@@ -198,6 +209,30 @@ def test_no_match_func():
     assert not func_pattern.match(relay.Function([x, y], x - y))
 
 
+def test_match_if():
+    x = is_var("x")
+    y = is_var("y")
+    pat = is_if(is_op("less")(x, y), x, y)
+
+    x = relay.var("x")
+    y = relay.var("y")
+    cond = x < y
+
+    assert pat.match(relay.expr.If(cond, x, y))
+
+
+def test_no_match_if():
+    x = is_var("x")
+    y = is_var("y")
+    pat = is_if(is_op("less")(x, y), x, y)
+
+    x = relay.var("x")
+    y = relay.var("y")
+
+    assert not pat.match(relay.expr.If(x > y, x, y))
+    assert not pat.match(relay.expr.If(x < y, y, x))
+
+
 def test_match_option():
     x = relay.var("x")
     w = relay.var("w")
@@ -1541,3 +1576,6 @@ def test_partition_constant_embedding():
     test_partition_option()
     test_match_match()
     test_partition_constant_embedding()
+    test_IfPattern()
+    test_match_if()
+    test_no_match_if()

From c9474639dd3761b78a457ab274603d87a3dcf9b8 Mon Sep 17 00:00:00 2001
From: ANSHUMAN TRIPATHY <anshuman.t@huawei.com>
Date: Fri, 15 Jan 2021 07:59:28 +0530
Subject: [PATCH 072/357] [Frontend][Tensorflow] Sparse_Dense Op CSR scheduling
 issue resolved for Cuda & X86 (#7148)

* [Frontend][Tensorflow] Sparse_Dense Op CSR scheduling issue resolved for both cuda & x86

* [1] Review comments handled

* [2] Review comments handled

* [3] Review comments handled
---
 python/tvm/topi/cuda/sparse.py                | 45 ++++++++++++++++---
 python/tvm/topi/nn/sparse.py                  | 36 +++++++--------
 python/tvm/topi/x86/sparse.py                 | 18 ++++----
 .../frontend/tensorflow/test_forward.py       |  3 +-
 tests/python/topi/python/test_topi_sparse.py  | 13 ++++--
 5 files changed, 76 insertions(+), 39 deletions(-)

diff --git a/python/tvm/topi/cuda/sparse.py b/python/tvm/topi/cuda/sparse.py
index c59e6887d47e..f2cecacbc618 100644
--- a/python/tvm/topi/cuda/sparse.py
+++ b/python/tvm/topi/cuda/sparse.py
@@ -23,10 +23,10 @@
 from tvm import relay, te
 
 from .. import nn
-from ..utils import traverse_inline
+from ..utils import traverse_inline, get_const_tuple, prod, get_const_int
 
 
-def sparse_dense(data, weight_data, weight_indices, weight_indptr):
+def sparse_dense(data, weight_data, weight_indices, weight_indptr, sparse_lhs=False):
     """
     Computes sparse-dense matrix multiplication of `data` and
     `(weight_data, weight_indices, weight_indptr).T`
@@ -57,7 +57,7 @@ def sparse_dense(data, weight_data, weight_indices, weight_indptr):
         2-D with shape [M, N]
     """
     # pylint:disable=unused-argument
-    return nn.sparse_dense(data, weight_data, weight_indices, weight_indptr)
+    return nn.sparse_dense(data, weight_data, weight_indices, weight_indptr, sparse_lhs)
 
 
 def schedule_sparse_dense(outs):
@@ -65,11 +65,13 @@ def schedule_sparse_dense(outs):
     # pylint:disable=invalid-name
     s = te.create_schedule([x.op for x in outs])
 
-    # TODO(ANSHUMAN87): Add for sparse_dense_bsrmm_v1 also
     def _callback(op):
-        if op.tag == "sparse_dense_bsrmm_v2":
+        if op.tag == "sparse_dense_sp_rhs_bsrmm" or op.tag == "sparse_dense_sp_lhs_bsrmm":
             y_bsrmm = op.input_tensors[0]
-            assert y_bsrmm.op.tag == "sparse_dense_bsrmm_block_v2"
+            assert (
+                y_bsrmm.op.tag == "sparse_dense_sp_rhs_bsrmm_block"
+                or y_bsrmm.op.tag == "sparse_dense_sp_lhs_bsrmm_block"
+            )
             out = s.outputs[0].output(0)
 
             if op not in s.outputs:
@@ -91,6 +93,13 @@ def _callback(op):
             s[y_bsrmm_factored].compute_at(s[y_bsrmm], tx)
             s[y_bsrmm].set_store_predicate(thread_x.var.equal(0))
             s[out].set_store_predicate(thread_x.var.equal(0))
+        elif op.tag == "sparse_dense_sp_lhs_csrmm" or op.tag == "sparse_dense_sp_rhs_csrmm":
+            out = op.output(0)
+            const_size = get_const_int(prod(out.shape))
+            fused = s[out].fuse(*s[out].op.axis)
+            bx, tx = s[out].split(fused, factor=const_size)
+            s[out].bind(tx, te.thread_axis("threadIdx.x"))
+            s[out].bind(bx, te.thread_axis("blockIdx.x"))
 
     traverse_inline(s, outs[0].op, _callback)
     return s
@@ -279,7 +288,26 @@ def gen_ir(data, w_data, w_indices, w_indptr, out):
     return out
 
 
-def sparse_dense_padded(data, weight_data, weight_indices, weight_indptr):
+def is_valid_for_sparse_dense_padded(data, weight_data):
+    """
+    Check whether input is applicable for sparse_dense_padded op.
+    If not we should fall back to default scheduling.
+    """
+    # pylint:disable=invalid-name
+    warp_size = int(tvm.target.Target.current(allow_none=False).thread_warp_size)
+    m = get_const_tuple(data.checked_type.shape)[1]
+    if len(weight_data.shape) == 1:
+        bs_m = 1
+    else:
+        bs_m = weight_data.shape[1]
+
+    mb = m // bs_m
+    if mb >= warp_size:
+        return True
+    return False
+
+
+def sparse_dense_padded(data, weight_data, weight_indices, weight_indptr, sparse_lhs=False):
     """
     Computes sparse-dense matrix multiplication of `data` and
     `(weight_data, weight_indices, weight_indptr).T`
@@ -311,6 +339,8 @@ def sparse_dense_padded(data, weight_data, weight_indices, weight_indptr):
     output : tvm.te.Tensor
         2-D with shape [M, N]
     """
+    # TODO(ANSHUMAN87): Handle for sparse_lhs case too
+    assert not sparse_lhs, "Currently only sparse weight is supported."
     return sparse_dense_tir(data, weight_data, weight_indices, weight_indptr)
 
 
@@ -368,6 +398,7 @@ def _alter_sparse_dense_layout(_attrs, inputs, _tinfos, _out_type):
         isinstance(inputs[1], relay.Constant)
         and isinstance(inputs[2], relay.Constant)
         and isinstance(inputs[3], relay.Constant)
+        and is_valid_for_sparse_dense_padded(inputs[0], inputs[1].data.asnumpy())
     ):
         if len(inputs[1].data.asnumpy().shape) == 1:
             sparse_matrix = sp.csr_matrix(
diff --git a/python/tvm/topi/nn/sparse.py b/python/tvm/topi/nn/sparse.py
index 94d6d9a16330..cdccc80bb5f8 100644
--- a/python/tvm/topi/nn/sparse.py
+++ b/python/tvm/topi/nn/sparse.py
@@ -23,7 +23,7 @@
 from ..utils import get_const_tuple
 
 
-def sparse_dense_v2(data, weight_data, weight_indices, weight_indptr):
+def sparse_dense_sp_rhs(data, weight_data, weight_indices, weight_indptr):
     """
     Computes sparse-dense matrix multiplication of `data` and
     `(weight_data, weight_indices, weight_indptr).T`
@@ -52,13 +52,13 @@ def sparse_dense_v2(data, weight_data, weight_indices, weight_indptr):
     """
     assert len(weight_data.shape) in (1, 3)
     if len(weight_data.shape) == 1:
-        func = _sparse_dense_csrmm_v2
+        func = _sparse_dense_sp_rhs_csrmm
     if len(weight_data.shape) == 3:
-        func = _sparse_dense_bsrmm_v2
+        func = _sparse_dense_sp_rhs_bsrmm
     return func(data, weight_data, weight_indices, weight_indptr)
 
 
-def sparse_dense_v1(data_data, data_indices, data_indptr, weight):
+def sparse_dense_sp_lhs(data_data, data_indices, data_indptr, weight):
     """
     Computes sparse-dense matrix multiplication of
     `(data_data, data_indices, data_indptr)` and `weight.T`
@@ -87,9 +87,9 @@ def sparse_dense_v1(data_data, data_indices, data_indptr, weight):
     """
     assert len(data_data.shape) in (1, 3)
     if len(data_data.shape) == 1:
-        func = _sparse_dense_csrmm_v1
+        func = _sparse_dense_sp_lhs_csrmm
     if len(data_data.shape) == 3:
-        func = _sparse_dense_bsrmm_v1
+        func = _sparse_dense_sp_lhs_bsrmm
     return func(data_data, data_indices, data_indptr, weight)
 
 
@@ -128,12 +128,12 @@ def sparse_dense(dense_data, sparse_data, sparse_indices, sparse_indptr, sparse_
         2-D with shape [M, N]
     """
     if sparse_lhs:
-        return sparse_dense_v1(sparse_data, sparse_indices, sparse_indptr, dense_data)
+        return sparse_dense_sp_lhs(sparse_data, sparse_indices, sparse_indptr, dense_data)
     else:
-        return sparse_dense_v2(dense_data, sparse_data, sparse_indices, sparse_indptr)
+        return sparse_dense_sp_rhs(dense_data, sparse_data, sparse_indices, sparse_indptr)
 
 
-def _sparse_dense_csrmm_v1(data_data, data_indices, data_indptr, weight):
+def _sparse_dense_sp_lhs_csrmm(data_data, data_indices, data_indptr, weight):
     oshape = (get_const_tuple(data_indptr.shape)[0] - 1, get_const_tuple(weight.shape)[0])
 
     def f(row, i):
@@ -146,10 +146,10 @@ def f(row, i):
         weight_val = weight[i, data_indices[elem]]
         return te.sum(a_val * weight_val, axis=elem_idx)
 
-    return te.compute(oshape, f, tag="sparse_dense_csrmm_v1")
+    return te.compute(oshape, f, tag="sparse_dense_sp_lhs_csrmm")
 
 
-def _sparse_dense_csrmm_v2(data, weight_data, weight_indices, weight_indptr):
+def _sparse_dense_sp_rhs_csrmm(data, weight_data, weight_indices, weight_indptr):
     oshape = (get_const_tuple(data.shape)[0], get_const_tuple(weight_indptr.shape)[0] - 1)
 
     def f(i, row):
@@ -162,10 +162,10 @@ def f(i, row):
         weight_val = data[i, weight_indices[elem]]
         return te.sum(a_val * weight_val, axis=elem_idx)
 
-    return te.compute(oshape, f, tag="sparse_dense_csrmm_v2")
+    return te.compute(oshape, f, tag="sparse_dense_sp_rhs_csrmm")
 
 
-def _sparse_dense_bsrmm_v1(data_data, data_indices, data_indptr, weight):
+def _sparse_dense_sp_lhs_bsrmm(data_data, data_indices, data_indptr, weight):
     (m, _) = get_const_tuple(weight.shape)
     (_, bs_r, bs_c) = get_const_tuple(data_data.shape)
     (num_blocks_plus_1,) = get_const_tuple(data_indptr.shape)
@@ -187,16 +187,16 @@ def _compute_block(nb_j, j, i):
     idxm = tvm.tir.indexmod
 
     bsrmm_block = te.compute(
-        (num_blocks, bs_r, m), _compute_block, tag="sparse_dense_bsrmm_block_v1"
+        (num_blocks, bs_r, m), _compute_block, tag="sparse_dense_sp_lhs_bsrmm_block"
     )
     return te.compute(
         (num_blocks * bs_r, m),
         lambda m, n: bsrmm_block[idxd(m, bs_r), idxm(m, bs_r), n],
-        tag="sparse_dense_bsrmm_v1",
+        tag="sparse_dense_sp_lhs_bsrmm",
     )
 
 
-def _sparse_dense_bsrmm_v2(data, weight_data, weight_indices, weight_indptr):
+def _sparse_dense_sp_rhs_bsrmm(data, weight_data, weight_indices, weight_indptr):
     (m, _) = get_const_tuple(data.shape)
     (_, bs_r, bs_c) = get_const_tuple(weight_data.shape)
     (num_blocks_plus_1,) = get_const_tuple(weight_indptr.shape)
@@ -218,12 +218,12 @@ def _compute_block(i, nb_j, j):
     idxm = tvm.tir.indexmod
 
     bsrmm_block = te.compute(
-        (m, num_blocks, bs_r), _compute_block, tag="sparse_dense_bsrmm_block_v2"
+        (m, num_blocks, bs_r), _compute_block, tag="sparse_dense_sp_rhs_bsrmm_block"
     )
     return te.compute(
         (m, num_blocks * bs_r),
         lambda m, n: bsrmm_block[m, idxd(n, bs_r), idxm(n, bs_r)],
-        tag="sparse_dense_bsrmm_v2",
+        tag="sparse_dense_sp_rhs_bsrmm",
     )
 
 
diff --git a/python/tvm/topi/x86/sparse.py b/python/tvm/topi/x86/sparse.py
index b6291083c8c1..c6300f6701e0 100644
--- a/python/tvm/topi/x86/sparse.py
+++ b/python/tvm/topi/x86/sparse.py
@@ -28,15 +28,17 @@ def schedule_sparse_dense(outs):
 
     def _callback(op):
         simd_width = get_fp32_len()
-        if op.tag == "sparse_dense_csrmm" and op != outs[0].op:
-            (_, v_i) = s[op].op.axis
-            s[op].vectorize(v_i)
-            (y_o, y_i) = s[outs[0].op].split(s[outs[0].op].op.axis[1], 2 * simd_width)
-            s[op].compute_at(s[outs[0]], y_o)
-            s[outs[0].op].vectorize(y_i)
-        if op.tag == "sparse_dense_bsrmm":
+        if op.tag == "sparse_dense_sp_lhs_csrmm" or op.tag == "sparse_dense_sp_lhs_csrmm":
+            (y_o, y_i) = s[op].split(s[op].op.axis[1], 2)
+            fused = s[op].fuse(s[op].op.axis[0], y_o)
+            s[op].parallel(fused)
+            s[op].vectorize(y_i)
+        elif op.tag == "sparse_dense_sp_rhs_bsrmm" or op.tag == "sparse_dense_sp_rhs_bsrmm":
             y_bsrmm = op.input_tensors[0]
-            assert y_bsrmm.op.tag == "sparse_dense_bsrmm_block"
+            assert (
+                y_bsrmm.op.tag == "sparse_dense_sp_rhs_bsrmm_block"
+                or y_bsrmm.op.tag == "sparse_dense_sp_lhs_bsrmm_block"
+            )
             y_reshape = op
             (m, num_blocks, b_r) = s[y_bsrmm].op.axis
             bs_r = get_const_int(b_r.dom.extent)
diff --git a/tests/python/frontend/tensorflow/test_forward.py b/tests/python/frontend/tensorflow/test_forward.py
index 22ed6c5b2edf..d71405796ede 100644
--- a/tests/python/frontend/tensorflow/test_forward.py
+++ b/tests/python/frontend/tensorflow/test_forward.py
@@ -1779,8 +1779,7 @@ def _test_sparse_dense_matmul(indices, values, A_shape, B_shape, dtype, flip=Fal
 
                 B_np = np.random.uniform(high=5.0, size=B_shape).astype(dtype)
 
-                # TODO(ANSHUMAN87): There is an issue in cuda scheduling for csr, work in progress
-                compare_tf_with_tvm([B_np], [B.name], result.name, no_gpu=True)
+                compare_tf_with_tvm([B_np], [B.name], result.name)
 
 
 def test_forward_sparse_dense_matmul():
diff --git a/tests/python/topi/python/test_topi_sparse.py b/tests/python/topi/python/test_topi_sparse.py
index e47bfddbf7fc..d5bd7aa1a21e 100644
--- a/tests/python/topi/python/test_topi_sparse.py
+++ b/tests/python/topi/python/test_topi_sparse.py
@@ -507,19 +507,24 @@ def test_sparse_dense_padded_alter_op():
         K = 128
         X_np = np.random.randn(M, K).astype("float32")
         W_sp_np = random_bsr_matrix(N, K, 2, 2, density=0.01, dtype="float32")
+        x = relay.var("x", relay.TensorType(X_np.shape, "float32"))
         mult = relay.op.nn.sparse_dense(
-            relay.Constant(tvm.nd.array(X_np)),
+            x,
             (
                 relay.Constant(tvm.nd.array(W_sp_np.data)),
                 relay.Constant(tvm.nd.array(W_sp_np.indices)),
                 relay.Constant(tvm.nd.array(W_sp_np.indptr)),
             ),
         )
-        f = relay.Function([], mult)
-        f = relay.transform.InferType()(tvm.IRModule.from_expr(f))
-        f_ = relay.transform.AlterOpLayout()(f)
+        f = relay.Function([x], mult)
+        f_ = relay.transform.InferType()(tvm.IRModule.from_expr(f))
+        f_ = relay.transform.AlterOpLayout()(f_)
         assert f_["main"].body.op.name == "nn.internal.sparse_dense_padded"
 
+        # build with cuda and AlterOpLayout to ensure that sparse_dense_padded is in action
+        with tvm.transform.PassContext(opt_level=3, required_pass="AlterOpLayout"):
+            x = relay.build(tvm.IRModule.from_expr(f), target=tvm.target.Target("cuda"))
+
 
 if __name__ == "__main__":
     test_csrmv()

From 4c5c086e2a259adeb486878c76c53896f3377fe8 Mon Sep 17 00:00:00 2001
From: "Steven S. Lyubomirsky" <sslyu@cs.washington.edu>
Date: Fri, 15 Jan 2021 09:05:42 -0500
Subject: [PATCH 073/357] [BYOC][bugfix] Handle empty tuples in annotation pass
 (#7288)

---
 src/relay/transforms/annotate_target.cc       |  5 ++--
 .../python/relay/test_pass_annotate_target.py | 26 +++++++++++++++++--
 2 files changed, 27 insertions(+), 4 deletions(-)

diff --git a/src/relay/transforms/annotate_target.cc b/src/relay/transforms/annotate_target.cc
index 76585cf1272f..e365dca3860f 100644
--- a/src/relay/transforms/annotate_target.cc
+++ b/src/relay/transforms/annotate_target.cc
@@ -144,11 +144,12 @@ class AnnotateTargetRewriter : public ExprRewriter {
      */
     Expr new_expr = expr;
     const CallNode* call = expr.as<CallNode>();
+    const TupleNode* tup = expr.as<TupleNode>();
     if (op_expr_to_target_.find(expr) != op_expr_to_target_.end()) {
       // Check whether expr has args, if not - do not insert compiler_end.
       if (expr->IsInstance<RefWriteNode>() || expr->IsInstance<RefCreateNode>() ||
-          expr->IsInstance<RefReadNode>() || expr->IsInstance<TupleNode>() ||
-          expr->IsInstance<TupleGetItemNode>() || (call && !call->args.empty())) {
+          expr->IsInstance<RefReadNode>() || expr->IsInstance<TupleGetItemNode>() ||
+          (call && !call->args.empty()) || (tup && !tup->fields.empty())) {
         std::string target = op_expr_to_target_[new_expr];
         new_expr = InsertAnnotation(new_expr, target, make_end_op);
         op_expr_to_target_[new_expr] = target;
diff --git a/tests/python/relay/test_pass_annotate_target.py b/tests/python/relay/test_pass_annotate_target.py
index 4f35066a8384..ce86cc603d6d 100644
--- a/tests/python/relay/test_pass_annotate_target.py
+++ b/tests/python/relay/test_pass_annotate_target.py
@@ -738,8 +738,8 @@ def after():
         mod = tvm.IRModule.from_expr(func)
         return mod
 
-    for annotate_non_call_ops in [True, False, True]:
-        result = transform.AnnotateTarget(target)(before())
+    for annotate_non_call_ops in [True, False]:
+        result = transform.AnnotateTarget(target, annotate_non_call_ops)(before())
         expected = transform.InferType()(after())
         assert tvm.ir.structural_equal(expected, result)
 
@@ -764,6 +764,27 @@ def after():
     assert tvm.ir.structural_equal(expected, result)
 
 
+def test_empty_tuple():
+    target = "test_empty_tuple"
+
+    """An empty tuple should behave just like a call with no args (see above test)."""
+
+    def before():
+        func = relay.Function([], relay.Tuple([]))
+        mod = tvm.IRModule.from_expr(func)
+        return mod
+
+    def after():
+        func = relay.Function([], relay.Tuple([]))
+        mod = tvm.IRModule.from_expr(func)
+        return mod
+
+    for annotate_non_call_ops in [True, False]:
+        result = transform.AnnotateTarget(target, annotate_non_call_ops)(before())
+        expected = transform.InferType()(after())
+        assert tvm.ir.structural_equal(expected, result)
+
+
 if __name__ == "__main__":
     test_extern_dnnl()
     test_composite_function()
@@ -780,3 +801,4 @@ def after():
     test_double_target()
     test_ends_with_tuple()
     test_ref_create_read_write()
+    test_empty_tuple()

From c3f50ff720afb301d21efcb1bd70f27d4608d113 Mon Sep 17 00:00:00 2001
From: Gustavo Romero <gromero@users.noreply.github.com>
Date: Fri, 15 Jan 2021 14:55:57 -0300
Subject: [PATCH 074/357] =?UTF-8?q?[=C2=B5TVM]=20Add=20ST=20STM32F746=20di?=
 =?UTF-8?q?sco=20board=20to=20tflite=20tutorial=20script=20(#7254)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Currently tutorial script 'micro_tflite.py' assumes that all boards with
target STM32F746 are Nucleo boards. As a consequence once that target is
selected the script automatically defaults to the Nucleo board. However,
the STM32F746 is also used on Discovery Kit boards (aka disco) which are
quite similar but have some differences, so Nucleo config and final image
don't work on the disco boards.

That commit adds a way to select a different dev board and adds comments
accordingly, informing how to use the script with STM32F746 disco boards.

Signed-off-by: Gustavo Romero <gustavo.romero@linaro.org>
---
 tutorials/micro/micro_tflite.py | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/tutorials/micro/micro_tflite.py b/tutorials/micro/micro_tflite.py
index feabcf71ae2c..c28918380265 100644
--- a/tutorials/micro/micro_tflite.py
+++ b/tutorials/micro/micro_tflite.py
@@ -179,13 +179,17 @@
 
 # %%
 # Compiling for physical hardware
-#  When running on physical hardware, choose a target that describes
-#  the hardware. The STM32F746 Nucleo target is chosen in this commented
-#  code:
+#  When running on physical hardware, choose a target and a board that
+#  describe the hardware. The STM32F746 Nucleo target and board is chosen in
+#  this commented code. Another option would be to choose the same target but
+#  the STM32F746 Discovery board instead. The disco board has the same
+#  microcontroller as the Nucleo board but a couple of wirings and configs
+#  differ, so it's necessary to select the "stm32f746g_disco" board below.
 #
 #  .. code-block:: python
 #
 #     TARGET = tvm.target.target.micro("stm32f746xx")
+#     BOARD = "nucleo_f746zg" # or "stm32f746g_disco"
 
 ######################################################################
 # Now, compile the model for the target:
@@ -217,12 +221,12 @@
 #     repo_root = subprocess.check_output(["git", "rev-parse", "--show-toplevel"], encoding='utf-8').strip()
 #     project_dir = f"{repo_root}/tests/micro/qemu/zephyr-runtime"
 #     compiler = zephyr.ZephyrCompiler(
-#            project_dir=project_dir,
-#            board="nucleo_f746zg" if "stm32f746" in str(TARGET) else "qemu_x86",
-#            zephyr_toolchain_variant="zephyr",
-#      )
+#         project_dir=project_dir,
+#         board=BOARD if "stm32f746" in str(TARGET) else "qemu_x86",
+#         zephyr_toolchain_variant="zephyr",
+#     )
 #
-#     opts = tvm.micro.default_options(os.path.join(tvm.micro.CRT_ROOT_DIR, "host"))
+#     opts = tvm.micro.default_options(f"{project_dir}/crt")
 
 workspace = tvm.micro.Workspace()
 micro_binary = tvm.micro.build_static_runtime(

From 4f1f5913693b8f16cdf4d87a36d9f3cff56c19fa Mon Sep 17 00:00:00 2001
From: Andrew Reusch <areusch@octoml.ai>
Date: Fri, 15 Jan 2021 12:46:55 -0800
Subject: [PATCH 075/357] Bring back numbered lists to TVM docs. (#7290)

* Upstream fix in https://github.com/tlc-pack/tlcpack-sphinx-addon/commit/995178d81e6e38eabbc28da2b285b68583c88769
---
 tests/scripts/task_ci_python_setup.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/scripts/task_ci_python_setup.sh b/tests/scripts/task_ci_python_setup.sh
index 5ae1478fadc6..f48ed49a2266 100755
--- a/tests/scripts/task_ci_python_setup.sh
+++ b/tests/scripts/task_ci_python_setup.sh
@@ -30,4 +30,4 @@ set -o pipefail
 #
 echo "Addtiional setup in" ${CI_IMAGE_NAME}
 
-python3 -m pip install --user tlcpack-sphinx-addon==0.1.3 synr==0.2.1
+python3 -m pip install --user tlcpack-sphinx-addon==0.1.4 synr==0.2.1

From 2992e9bfaa0ade5d5e0bcb9fe8a567ece600e205 Mon Sep 17 00:00:00 2001
From: masahi <masahi129@gmail.com>
Date: Sat, 16 Jan 2021 06:22:43 +0900
Subject: [PATCH 076/357] [VM] Per-input, data dependence specification for
 shape func (#7210)

* made TShapeDataDependant array

* add stub

* dyn strided slice working

* reshape also working

* remove log

* works on maskrcnn

* lint fix

* fix cpp test

* remove stale pop back

* add more doc

* dependant -> dependent

* remove redundant check

* remove data_dependent_
---
 include/tvm/relay/op_attr_types.h     |  4 +--
 python/tvm/relay/op/dyn/_transform.py | 29 ++++++++----------
 python/tvm/relay/op/op.py             | 12 +++++---
 src/relay/analysis/util.cc            | 13 +++++---
 src/relay/backend/compile_engine.cc   | 44 ++++++++++++++++-----------
 src/relay/transforms/fuse_ops.cc      |  2 +-
 src/relay/transforms/pass_utils.h     |  6 ++--
 tests/cpp/relay_build_module_test.cc  |  4 ++-
 8 files changed, 65 insertions(+), 49 deletions(-)

diff --git a/include/tvm/relay/op_attr_types.h b/include/tvm/relay/op_attr_types.h
index 1e9b86d9e0bc..f916dbeb713f 100644
--- a/include/tvm/relay/op_attr_types.h
+++ b/include/tvm/relay/op_attr_types.h
@@ -83,9 +83,9 @@ using TOpIsStateful = bool;
 using TNonComputational = bool;
 
 /*!
- * \brief Mark the operator whether output shape is data dependant.
+ * \brief Mark the operator whether output shape is data dependent.
  */
-using TShapeDataDependant = bool;
+using TShapeDataDependent = Array<Integer>;
 
 /*!
  * \brief Computation description interface.
diff --git a/python/tvm/relay/op/dyn/_transform.py b/python/tvm/relay/op/dyn/_transform.py
index b61d4f9655f6..a36b56214bc4 100644
--- a/python/tvm/relay/op/dyn/_transform.py
+++ b/python/tvm/relay/op/dyn/_transform.py
@@ -32,11 +32,8 @@
 
 
 @script
-def _reshape_shape_func_input_data(data, newshape, ndim):
+def _reshape_shape_func_input_data(data_shape, newshape, ndim):
     out = output_tensor((ndim,), "int64")
-    data_shape = allocate((len(data.shape),), "int64")
-    for x in const_range(len(data.shape)):
-        data_shape[x] = int64(data.shape[x])
     src_idx = 0
     dst_idx = 0
     infer_idx = -1
@@ -87,7 +84,7 @@ def _reshape_shape_func_input_data(data, newshape, ndim):
     return out
 
 
-@_reg.register_shape_func("dyn.reshape", True)
+@_reg.register_shape_func("dyn.reshape", [False, True])
 def dynamic_reshape_shape_func(attrs, inputs, out_ndims):
     return [_reshape_shape_func_input_data(*inputs, out_ndims[0])]
 
@@ -150,36 +147,36 @@ def one_hot_shape_func(attrs, inputs, _):
 
 
 @script
-def _strided_slice_shape_func_input_data(data, begin, end, strides, slice_mode):
-    ndim = len(data.shape)
+def _strided_slice_shape_func_input_data(data_shape, begin, end, strides, slice_mode):
+    ndim = len(data_shape)
     out = output_tensor((ndim,), "int64")
     for i in const_range(ndim):
         cbegin = int64(0)
-        cend = int64(data.shape[i])
+        cend = int64(data_shape[i])
         cstride = int64(1)
         if strides.shape[0] > i:
             cstride = int64(strides[i])
         if begin.shape[0] > i:
             cbegin = int64(begin[i])
             if cbegin < 0:
-                cbegin += int64(data.shape[i])
+                cbegin += int64(data_shape[i])
         if end.shape[0] <= i:
-            cend = int64(data.shape[i])
+            cend = int64(data_shape[i])
         elif slice_mode != 0:
             cstride = int64(1)
             if end[i] < 0:
-                cend = int64(data.shape[i])
+                cend = int64(data_shape[i])
             else:
                 cend = cbegin + int64(end[i])
         else:
-            if end[i] > data.shape[i]:
-                cend = int64(data.shape[i])
-            elif end[i] < -data.shape[i]:
+            if end[i] > data_shape[i]:
+                cend = int64(data_shape[i])
+            elif end[i] < -data_shape[i]:
                 cend = int64(-1)
             else:
                 cend = int64(end[i])
                 if cend < 0:
-                    cend += int64(data.shape[i])
+                    cend += int64(data_shape[i])
         assert cstride != 0, "Strides can't be zero."
         if cstride < 0:
             slice_range = cbegin - cend
@@ -192,7 +189,7 @@ def _strided_slice_shape_func_input_data(data, begin, end, strides, slice_mode):
     return out
 
 
-@_reg.register_shape_func("dyn.strided_slice", True)
+@_reg.register_shape_func("dyn.strided_slice", [False, True, True, True])
 def strided_slice_shape_func(attrs, inputs, _):
     """
     Shape func for strided_slice
diff --git a/python/tvm/relay/op/op.py b/python/tvm/relay/op/op.py
index d4d20b3ebc4a..5882027fb1d8 100644
--- a/python/tvm/relay/op/op.py
+++ b/python/tvm/relay/op/op.py
@@ -356,7 +356,7 @@ def register_gradient(op_name, fgradient=None, level=10):
     return tvm.ir.register_op_attr(op_name, "FPrimalGradient", fgradient, level)
 
 
-def register_shape_func(op_name, data_dependant, shape_func=None, level=10):
+def register_shape_func(op_name, data_dependent, shape_func=None, level=10):
     """Register operator shape function for an op.
 
     Parameters
@@ -364,8 +364,10 @@ def register_shape_func(op_name, data_dependant, shape_func=None, level=10):
     op_name : str
         The name of the op.
 
-    data_dependant : bool
-        Whether the shape function depends on input data.
+    data_dependent : bool or list of bool
+        Whether the shape function depends on input data. If this is a list of bool,
+        the length of the list must be the same as the number of arguments of this op.
+        The list specifies per-input data dependence of the op.
 
     shape_func : function (attrs: Attrs, inputs: List[Tensor], out_ndims: List[IndexExpr])
                  -> shape_tensors: List<Tensor>
@@ -374,7 +376,9 @@ def register_shape_func(op_name, data_dependant, shape_func=None, level=10):
     level : int
         The priority level
     """
-    get(op_name).set_attr("TShapeDataDependant", data_dependant, level)
+    if not isinstance(data_dependent, list):
+        data_dependent = [data_dependent]
+    get(op_name).set_attr("TShapeDataDependent", data_dependent, level)
     return tvm.ir.register_op_attr(op_name, "FShapeFunc", shape_func, level)
 
 
diff --git a/src/relay/analysis/util.cc b/src/relay/analysis/util.cc
index bcfbc83da514..abb9e6b034c2 100644
--- a/src/relay/analysis/util.cc
+++ b/src/relay/analysis/util.cc
@@ -473,24 +473,27 @@ bool IsDynamic(const Type& ty) {
 
 TVM_REGISTER_GLOBAL("relay.ir.IsDynamic").set_body_typed(IsDynamic);
 
-bool IsDataDependant(const CallNode* call) {
-  static auto tshape_data_dependant = Op::GetAttrMap<TShapeDataDependant>("TShapeDataDependant");
+bool IsDataDependent(const CallNode* call) {
+  static auto tshape_data_dependent = Op::GetAttrMap<TShapeDataDependent>("TShapeDataDependent");
   Op op = Downcast<Op>(call->op);
 
-  if (!tshape_data_dependant.count(op)) {
+  if (!tshape_data_dependent.count(op)) {
     return false;
   }
 
   if (op->name == "strided_slice") {
     if (const auto* attrs = call->attrs.as<StridedSliceAttrs>()) {
       if (attrs->begin && attrs->end && attrs->strides) {
-        // not data dependant if begin, end and strides exist
+        // not data dependent if begin, end and strides exist
         return false;
       }
     }
   }
 
-  return tshape_data_dependant[op];
+  for (auto req : tshape_data_dependent[op]) {
+    if (req->value != 0) return true;
+  }
+  return false;
 }
 }  // namespace relay
 }  // namespace tvm
diff --git a/src/relay/backend/compile_engine.cc b/src/relay/backend/compile_engine.cc
index c969c3ba7f06..a66ae0a7e2c0 100644
--- a/src/relay/backend/compile_engine.cc
+++ b/src/relay/backend/compile_engine.cc
@@ -435,9 +435,9 @@ class MakeShapeFunc : public backend::MemoizedExprTranslator<Array<te::Tensor>>
       LOG(FATAL) << "Free variable " << var->name_hint();
       return {};
     } else {
-      ICHECK(data_dependants_.size());
-      bool data_dependant = data_dependants_.back();
-      if (data_dependant) {
+      ICHECK(data_dependents_per_input_.size());
+      auto data_dependent = data_dependents_per_input_.back();
+      if (data_dependent) {
         param_states_[var] |= kNeedInputData;
         return param_data_[var];
       } else {
@@ -449,12 +449,12 @@ class MakeShapeFunc : public backend::MemoizedExprTranslator<Array<te::Tensor>>
 
   Array<te::Tensor> VisitExpr_(const ConstantNode* op) final {
     using tir::make_const;
-    ICHECK(data_dependants_.size());
-    bool data_dependant = data_dependants_.back();
+    ICHECK(data_dependents_per_input_.size());
+    bool data_dependent = data_dependents_per_input_.back();
     if (!op->is_scalar()) {
       // This is a constant weight, extract the shape of the weight tensor.
       // This can not be data dependent.
-      CHECK(!data_dependant);
+      CHECK(!data_dependent);
       auto ttype = op->checked_type().as<TensorTypeNode>();
       int ndim = static_cast<int>(ttype->shape.size());
       Array<PrimExpr> out_shape{ndim};
@@ -472,7 +472,7 @@ class MakeShapeFunc : public backend::MemoizedExprTranslator<Array<te::Tensor>>
       scalars_.push_back(value);
       return {value};
     }
-    if (data_dependant) {
+    if (data_dependent) {
       void* data = op->data->data;
       DataType dtype = DataType(op->data->dtype);
       auto value = tvm::te::compute(
@@ -507,27 +507,38 @@ class MakeShapeFunc : public backend::MemoizedExprTranslator<Array<te::Tensor>>
 
   Array<te::Tensor> VisitExpr_(const CallNode* call_node) final {
     static auto fshape_func = Op::GetAttrMap<FShapeFunc>("FShapeFunc");
-    static auto tshape_data_dependant = Op::GetAttrMap<TShapeDataDependant>("TShapeDataDependant");
+    static auto tshape_data_dependent = Op::GetAttrMap<TShapeDataDependent>("TShapeDataDependent");
     ICHECK(call_node->op.as<OpNode>()) << "Primitive function only allows call into primitive ops";
     Op op = Downcast<Op>(call_node->op);
-    ICHECK(data_dependants_.empty() || !data_dependants_.back())
+    ICHECK(data_dependents_per_input_.empty() || !data_dependents_per_input_.back())
         << "Error in op fusion: output of the shape func is fed to a "
-        << "data-dependant shape func";
+        << "data-dependent shape func";
     ICHECK_GT(fshape_func.count(op), 0) << "Internal error, cannot find ShapeFunc for " << op->name;
-    ICHECK_GT(tshape_data_dependant.count(op), 0)
-        << "Internal error, cannot find TShapeDataDependant for " << op->name;
+    ICHECK_GT(tshape_data_dependent.count(op), 0)
+        << "Internal error, cannot find TShapeDataDependent for " << op->name;
+
+    Array<Integer> dep_spec = tshape_data_dependent[op];
+    if (dep_spec.size() == 1) {
+      // This is for cases when data dependence is specified per op
+      // Replicate 0 or 1 flag to all arguments
+      for (size_t i = 1; i < call_node->args.size(); ++i) {
+        dep_spec.push_back(dep_spec[0]);
+      }
+    }
 
-    data_dependants_.push_back(IsDataDependant(call_node));
     // Visit all inputs
     Array<te::Tensor> inputs;
     int count_tuple = 0;
-    for (Expr arg : call_node->args) {
+    for (size_t i = 0; i < call_node->args.size(); ++i) {
+      Expr arg = call_node->args[i];
       if (arg->checked_type().as<TupleTypeNode>()) {
         ++count_tuple;
       }
+      data_dependents_per_input_.push_back(dep_spec[i]->value != 0);
       for (te::Tensor tensor : VisitExpr(arg)) {
         inputs.push_back(tensor);
       }
+      data_dependents_per_input_.pop_back();
     }
     if (count_tuple) {
       ICHECK_EQ(call_node->args.size(), 1U) << "Only allow function with a single tuple input";
@@ -549,7 +560,6 @@ class MakeShapeFunc : public backend::MemoizedExprTranslator<Array<te::Tensor>>
     }
     // Call shape function
     auto outputs = fshape_func[op](call_node->attrs, inputs, out_ndims);
-    data_dependants_.pop_back();
     readable_name_stream_ << "_" << op->name;
     return outputs;
   }
@@ -593,8 +603,8 @@ class MakeShapeFunc : public backend::MemoizedExprTranslator<Array<te::Tensor>>
   std::unordered_map<Expr, Array<te::Tensor>, ObjectPtrHash, ObjectPtrEqual> param_data_;
   /*! \brief Map from parameter to list of shape placeholder */
   std::unordered_map<Expr, Array<te::Tensor>, ObjectPtrHash, ObjectPtrEqual> param_shapes_;
-  /*! \brief Stack of data dependencies for shape function */
-  std::vector<bool> data_dependants_;
+  /*! \brief Stack of data dependencies for shape function, specified per each op input */
+  std::vector<bool> data_dependents_per_input_;
   /*! \brief Scalars used in the shape function */
   Array<te::Tensor> scalars_;
 };
diff --git a/src/relay/transforms/fuse_ops.cc b/src/relay/transforms/fuse_ops.cc
index 29f3bfa0a17e..1b28980a0a2f 100644
--- a/src/relay/transforms/fuse_ops.cc
+++ b/src/relay/transforms/fuse_ops.cc
@@ -241,7 +241,7 @@ class IndexedForwardGraph::Creator : private ExprVisitor {
     OpPatternKind op_pattern = kOpaque;
     if (const OpNode* opnode = call->op.as<OpNode>()) {
       auto op = GetRef<Op>(opnode);
-      if (IsDynamic(call->checked_type()) && IsDataDependant(call)) {
+      if (IsDynamic(call->checked_type()) && IsDataDependent(call)) {
         // output of a shape func can't be fed to a data-dependent shape func
         op_pattern = kOpaque;
       } else {
diff --git a/src/relay/transforms/pass_utils.h b/src/relay/transforms/pass_utils.h
index a2f22cbbf106..bb2f268a23d7 100644
--- a/src/relay/transforms/pass_utils.h
+++ b/src/relay/transforms/pass_utils.h
@@ -90,11 +90,11 @@ Expr TypeSubst(const Expr& expr, const tvm::Map<TypeVar, Type>& subst_map);
 bool IsDynamic(const Type& ty);
 
 /*!
- * \brief Check if call is data dependant.
+ * \brief Check if call is data dependent.
  * \param call The call to be checked.
- * \return Whether the call is data dependant.
+ * \return Whether the call is data dependent.
  */
-bool IsDataDependant(const CallNode* call);
+bool IsDataDependent(const CallNode* call);
 
 /*!
  * \brief Make arbitrary transformation preserve the out most function.
diff --git a/tests/cpp/relay_build_module_test.cc b/tests/cpp/relay_build_module_test.cc
index 3212f9079619..a15cdcd3926b 100644
--- a/tests/cpp/relay_build_module_test.cc
+++ b/tests/cpp/relay_build_module_test.cc
@@ -105,7 +105,9 @@ TEST(Relay, BuildModule) {
   }
   auto fgeneric = GenericFunc::Get("test.strategy_generic").set_default(*fs);
   (*reg)("add", "FTVMStrategy", fgeneric, 10);
-  (*reg)("add", "TShapeDataDependant", false, 10);
+  Array<Integer> dep;
+  dep.push_back(0);
+  (*reg)("add", "TShapeDataDependent", dep, 10);
   // build
   auto pfb = tvm::runtime::Registry::Get("relay.build_module._BuildModule");
   tvm::runtime::Module build_mod = (*pfb)();

From b52267e703cd6077fe1e9af357f8c918dfdd520e Mon Sep 17 00:00:00 2001
From: manupa-arm <manupa.karunaratne@arm.com>
Date: Fri, 15 Jan 2021 23:47:27 +0000
Subject: [PATCH 077/357] [uTVM] Initial BYOC support with c-source module
 (#6950)

This commit mainly introduces a byoc c-source module
example to uTVM. Moreover, it carries certain modifications
to the example codegen_c external module generator code
to generate utvm friendly c-source.

Change-Id: I09f3a42017d518dd5b6c89e3fe0a0332b80088b0
---
 .../backend/contrib/codegen_c/codegen.cc      |  27 ++--
 .../backend/contrib/codegen_c/codegen_c.h     | 104 ++++++++++---
 tests/micro/qemu/test_zephyr.py               | 140 ++++++++++++++++++
 3 files changed, 242 insertions(+), 29 deletions(-)

diff --git a/src/relay/backend/contrib/codegen_c/codegen.cc b/src/relay/backend/contrib/codegen_c/codegen.cc
index 998393d450c2..550afb3159fc 100644
--- a/src/relay/backend/contrib/codegen_c/codegen.cc
+++ b/src/relay/backend/contrib/codegen_c/codegen.cc
@@ -157,8 +157,7 @@ class CodegenC : public MemoizedExprTranslator<std::vector<Output>>, public Code
     for (size_t i = 0; i < out_shape.size(); ++i) {
       out_size *= out_shape[i];
     }
-    buf_stream << dtype << "* " << out << " = (" << dtype << "*)std::malloc(4 * " << out_size
-               << ");";
+    buf_stream << dtype << "* " << out << " = (" << dtype << "*)malloc(4 * " << out_size << ");";
     buf_decl_.push_back(buf_stream.str());
 
     decl_stream << ", " << out << ");";
@@ -229,25 +228,33 @@ class CSourceCodegen : public CSourceModuleCodegenBase {
     String func_name = std::get<1>(res);
 
     // Create headers
-    code_stream_ << "#include <cstring>\n";
-    code_stream_ << "#include <vector>\n";
+    code_stream_ << "#include <stdio.h>\n";
+    code_stream_ << "#include <stdlib.h>\n";
+    code_stream_ << "#include <string.h>\n";
     code_stream_ << "#include <tvm/runtime/c_runtime_api.h>\n";
-    code_stream_ << "#include <tvm/runtime/container.h>\n";
-    code_stream_ << "#include <tvm/runtime/packed_func.h>\n";
-    code_stream_ << "#include <dlpack/dlpack.h>\n";
-    code_stream_ << "using namespace tvm::runtime;\n";
+    code_stream_ << "#include <tvm/runtime/c_backend_api.h>\n";
+    if (!variables.empty()) {
+      // This segment would be generated in C++ because of the usage
+      // of tvm::runtime::Array. This is not ideal, but this to demonstrate
+      // constant copying process used packed imports in other external
+      // codegen. Moreover, in uTVM we dont expect this part to be generated.
+      code_stream_ << "#ifdef __cplusplus\n";
+      code_stream_ << "#include <tvm/runtime/ndarray.h>\n";
+      code_stream_ << "#include <tvm/runtime/packed_func.h>\n";
+      code_stream_ << "#endif\n";
+    }
 
     // Append some common macro for operator definition.
     const char* operator_macro = R"op_macro(
     #define CSOURCE_BINARY_OP_1D(p_ID_, p_OP_, p_DIM1_, p_DTYPE)       \
-      extern "C" void p_ID_(p_DTYPE* a, p_DTYPE* b, p_DTYPE* out) {    \
+      void p_ID_(p_DTYPE* a, p_DTYPE* b, p_DTYPE* out) {    \
         for (int64_t i = 0; i < p_DIM1_; ++i) {                        \
           out[i] = a[i] p_OP_ b[i];                                    \
         }                                                              \
       }
 
     #define CSOURCE_BINARY_OP_2D(p_ID_, p_OP_, p_DIM1_, p_DIM2_, p_DTYPE)  \
-      extern "C" void p_ID_(p_DTYPE* a, p_DTYPE* b, p_DTYPE* out) {        \
+      void p_ID_(p_DTYPE* a, p_DTYPE* b, p_DTYPE* out) {        \
         for (int64_t i = 0; i < p_DIM1_; ++i) {                            \
           for (int64_t j = 0; j < p_DIM2_; ++j) {                          \
             int64_t k = i * p_DIM2_ + j;                                   \
diff --git a/src/relay/backend/contrib/codegen_c/codegen_c.h b/src/relay/backend/contrib/codegen_c/codegen_c.h
index 9448b4d0738d..af835cfca02e 100644
--- a/src/relay/backend/contrib/codegen_c/codegen_c.h
+++ b/src/relay/backend/contrib/codegen_c/codegen_c.h
@@ -89,6 +89,40 @@ class CodegenCBase {
     indent_ -= 2;
   }
 
+  /*!
+   * \brief Creates a runtime function header
+   */
+  void PrintRuntimeFunctionHeader(std::string func_name) {
+    code_stream_ << "#ifdef __cplusplus\n";
+    code_stream_ << "extern \"C\" {\n";
+    code_stream_ << "#endif\n";
+    code_stream_ << "TVM_DLL int32_t ";
+    code_stream_ << func_name << "(";
+    code_stream_ << "TVMValue* args, ";
+    code_stream_ << "int* type_code, ";
+    code_stream_ << "int num_args, ";
+    code_stream_ << "TVMValue* out_value, ";
+    code_stream_ << "int* out_type_code) {\n";
+  }
+
+  /*!
+   * \brief Adds a line to convert TVMValue args to DLTensors
+   */
+  void PrintArgToData(int idx) {
+    PrintIndents();
+    code_stream_ << "DLTensor* arg" << idx << " = ";
+    code_stream_ << "(DLTensor*)(((TVMValue*)args)[" << idx << "].v_handle);\n";
+  }
+
+  /*!
+   * \brief Adds a line to convert TVMValue rets to DLTensors
+   */
+  void PrintRetToData(int idx) {
+    PrintIndents();
+    code_stream_ << "DLTensor* ret" << idx << " = ";
+    code_stream_ << "(DLTensor*)(((TVMValue*)args)[" << idx << "].v_handle);\n";
+  }
+
   /*!
    * \brief Gerenate C code for the external function.
    *
@@ -100,12 +134,12 @@ class CodegenCBase {
    * Array<NDArray> foo_consts;
    *
    * // An example code for the generated C function.
-   * extern "C" int foo_wrapper_(DLTensor* arg0,
+   * int foo_wrapper_(DLTensor* arg0,
    *                              DLTensor* arg1,
    *                              DLTensor* out) {
-   *   foo_(static_cast<float*>(arg0->data),
-   *        static_cast<float*>(arg1->data),
-   *        static_cast<float*>(out->data));
+   *   foo_((float*)(arg0->data),
+   *        (float*)(arg1->data),
+   *        (float*)(out->data));
    *   return 0;
    * }
    *
@@ -124,7 +158,8 @@ class CodegenCBase {
                             const std::string& const_arr_name, const std::vector<Output>& outs) {
     // Print signature
     code_stream_ << "\n";
-    code_stream_ << "extern \"C\" int " << func_name << "_wrapper_(";
+
+    code_stream_ << "int " << func_name << "_wrapper_(";
     for (size_t i = 0; i < args.size(); i++) {
       code_stream_ << "DLTensor* arg" << i << ",\n";
       code_stream_ << "\t";
@@ -142,26 +177,54 @@ class CodegenCBase {
     code_stream_ << func_name << "_(";
     for (size_t i = 0; i < args.size(); i++) {
       const auto& dtype_str = GetDtypeString(args[i]);
-      code_stream_ << "static_cast<" << dtype_str << "*>(arg" << i << "->data),\n";
+      code_stream_ << "(" << dtype_str << "*)(arg" << i << "->data),\n";
       PrintIndents();
     }
     for (size_t i = 0; i < outs.size() - 1; i++) {
-      code_stream_ << "static_cast<" << outs[i].dtype << "*>(out" << i << "->data),\n";
+      code_stream_ << "(" << outs[i].dtype << "*)(out" << i << "->data),\n";
       PrintIndents();
     }
-    code_stream_ << "static_cast<" << outs.back().dtype << "*>(out" << outs.size() - 1
-                 << "->data));\n";
+    code_stream_ << "(" << outs.back().dtype << "*)(out" << outs.size() - 1 << "->data));\n";
     PrintIndents();
     code_stream_ << "return 0;\n";
     ExitScope();
     code_stream_ << "}\n\n";
 
-    // Generate the macro
-    code_stream_ << "TVM_DLL_EXPORT_TYPED_FUNC(" << func_name << ", " << func_name
-                 << "_wrapper_);\n\n";
+    // Create the external function
+    PrintRuntimeFunctionHeader(func_name);
+    EnterScope();
+    for (size_t i = 0; i < args.size(); i++) {
+      PrintArgToData(i);
+    }
+    for (size_t i = 0; i < outs.size(); i++) {
+      PrintRetToData(args.size() + i);
+    }
+    PrintIndents();
+    code_stream_ << func_name << "_wrapper_(";
+    for (size_t i = 0; i < args.size(); i++) {
+      code_stream_ << "arg" << i << ",";
+    }
+    for (size_t i = 0; i < outs.size() - 1; i++) {
+      code_stream_ << "ret" << args.size() + i << ",";
+    }
+    code_stream_ << "ret" << args.size() + outs.size() - 1 << ");\n";
+    PrintIndents();
+    code_stream_ << "return 0;\n";
+    ExitScope();
+    code_stream_ << "}\n";
+    code_stream_ << "#ifdef __cplusplus\n";
+    code_stream_ << "}\n";
+    code_stream_ << "#endif\n";
 
     if (!const_arr_name.empty()) {
-      code_stream_ << "int " << func_name << "_init_wrapper_(Array<NDArray> arr) {\n";
+      // If there are constants, insert the __init_ and the wrapper
+      // This segment would be generated in C++ because of the usage
+      // of tvm::runtime::Array. This is not ideal, but this to demonstrate
+      // constant copying process used packed imports in other external
+      // codegen. Moreover, in uTVM we dont expect this part to be generated.
+      code_stream_ << "#ifdef __cplusplus\n";
+      code_stream_ << "int " << func_name
+                   << "_init_wrapper_(tvm::runtime::Array<tvm::runtime::NDArray> arr) {\n";
       EnterScope();
       PrintIndents();
       code_stream_ << func_name << "_consts = arr;\n";
@@ -170,6 +233,7 @@ class CodegenCBase {
       code_stream_ << "}\n\n";
       code_stream_ << "TVM_DLL_EXPORT_TYPED_FUNC(__init_" << func_name << ", " << func_name
                    << "_init_wrapper_);\n\n";
+      code_stream_ << "#endif\n";
     }
   }
 
@@ -202,11 +266,13 @@ class CodegenCBase {
                       const std::vector<Output>& outs) {
     // Create a declaration for global ndarrays that contain constant data.
     if (!const_arr_name.empty()) {
+      code_stream_ << "#ifdef __cplusplus\n";
       code_stream_ << const_arr_name << "\n\n";
+      code_stream_ << "#endif\n";
     }
     // Create the signature. For example, it could be:
-    // extern "C" void dnnl_0_(float* in0, float* in1, float* out0, float* out1) {}
-    code_stream_ << "extern \"C\" void " << ext_func_id << "_(";
+    // void dnnl_0_(float* in0, float* in1, float* out0, float* out1) {}
+    code_stream_ << "void " << ext_func_id << "_(";
 
     for (const auto& arg : args) {
       const auto& dtype_str = GetDtypeString(arg);
@@ -235,14 +301,14 @@ class CodegenCBase {
         continue;
       }
       this->PrintIndents();
-      code_stream_ << "std::memcpy(out" << i << ", " << outs[i].name << ", 4 * " << outs[i].size
+      code_stream_ << "memcpy(out" << i << ", " << outs[i].name << ", 4 * " << outs[i].size
                    << ");\n";
     }
 
     // Free buffers
     for (size_t i = 0; i < buf_decl.size(); i++) {
       this->PrintIndents();
-      code_stream_ << "std::free(buf_" << i << ");\n";
+      code_stream_ << "free(buf_" << i << ");\n";
     }
 
     this->ExitScope();
@@ -310,7 +376,7 @@ class CodegenCBase {
    * \return The created declaration
    */
   std::string CreateNDArrayPool(const std::string& symbol) const {
-    return "Array<NDArray> " + symbol + "_consts;";
+    return "tvm::runtime::Array<tvm::runtime::NDArray> " + symbol + "_consts;";
   }
 
   /*!
@@ -322,7 +388,7 @@ class CodegenCBase {
    * \return The created reference
    */
   std::string CreateDataReference(const std::string& symbol, int const_id) const {
-    return "static_cast<float*>(" + symbol + "_consts[" + std::to_string(const_id) + "]->data)";
+    return "(float*)(" + symbol + "_consts[" + std::to_string(const_id) + "]->data)";
   }
 
   /*!
diff --git a/tests/micro/qemu/test_zephyr.py b/tests/micro/qemu/test_zephyr.py
index 1c38c2dcd187..ab3a25d36543 100644
--- a/tests/micro/qemu/test_zephyr.py
+++ b/tests/micro/qemu/test_zephyr.py
@@ -33,6 +33,8 @@
 
 from tvm.micro.contrib import zephyr
 from tvm.contrib import utils
+from tvm.relay.expr_functor import ExprMutator
+from tvm.relay.op.annotation import compiler_begin, compiler_end
 
 BUILD = True
 DEBUG = False
@@ -198,5 +200,143 @@ def test_relay(platform):
         tvm.testing.assert_allclose(result, x_in * x_in + 1)
 
 
+class CcompilerAnnotator(ExprMutator):
+    """
+    This is used to create external functions for ccompiler.
+    A simple annotator that creates the following program:
+           |
+      -- begin --
+           |
+          add
+           |
+        subtract
+           |
+        multiply
+           |
+       -- end --
+           |
+    """
+
+    def __init__(self):
+        super(CcompilerAnnotator, self).__init__()
+        self.in_compiler = 0
+
+    def visit_call(self, call):
+        if call.op.name == "add":  # Annotate begin at args
+            if self.in_compiler == 1:
+                lhs = compiler_begin(super().visit(call.args[0]), "ccompiler")
+                rhs = compiler_begin(super().visit(call.args[1]), "ccompiler")
+                op = relay.add(lhs, rhs)
+                self.in_compiler = 2
+                return op
+        elif call.op.name == "subtract":
+            if self.in_compiler == 1:
+                lhs = super().visit(call.args[0])
+                rhs = super().visit(call.args[1])
+                if isinstance(lhs, relay.expr.Var):
+                    lhs = compiler_begin(lhs, "ccompiler")
+                if isinstance(rhs, relay.expr.Var):
+                    rhs = compiler_begin(rhs, "ccompiler")
+                return relay.subtract(lhs, rhs)
+        elif call.op.name == "multiply":  # Annotate end at output
+            self.in_compiler = 1
+            lhs = super().visit(call.args[0])
+            rhs = super().visit(call.args[1])
+            if isinstance(lhs, relay.expr.Var):
+                lhs = compiler_begin(lhs, "ccompiler")
+            if isinstance(rhs, relay.expr.Var):
+                rhs = compiler_begin(rhs, "ccompiler")
+            op = relay.multiply(lhs, rhs)
+            if self.in_compiler == 2:
+                op = compiler_end(op, "ccompiler")
+            self.in_compiler = 0
+            return op
+        return super().visit_call(call)
+
+
+def check_result(relay_mod, model, zephyr_board, map_inputs, out_shape, result):
+    """Helper function to verify results"""
+    TOL = 1e-5
+    target = tvm.target.target.micro(model)
+    with tvm.transform.PassContext(opt_level=3, config={"tir.disable_vectorize": True}):
+        graph, mod, params = tvm.relay.build(relay_mod, target=target)
+
+    with _make_session(model, target, zephyr_board, mod) as session:
+        rt_mod = tvm.micro.create_local_graph_runtime(
+            graph, session.get_system_lib(), session.context
+        )
+        rt_mod.set_input(**params)
+        for name, data in map_inputs.items():
+            rt_mod.set_input(name, data)
+        rt_mod.set_input(**params)
+        rt_mod.run()
+
+        out_shapes = out_shape if isinstance(out_shape, list) else [out_shape]
+        results = result if isinstance(result, list) else [result]
+
+        for idx, shape in enumerate(out_shapes):
+            out = tvm.nd.empty(shape, ctx=session.context)
+            out = rt_mod.get_output(idx, out)
+            tvm.testing.assert_allclose(out.asnumpy(), results[idx], rtol=TOL, atol=TOL)
+
+
+def test_byoc_utvm(platform):
+    """This is a simple test case to check BYOC capabilities of uTVM"""
+    model, zephyr_board = PLATFORMS[platform]
+    x = relay.var("x", shape=(10, 10))
+    w0 = relay.var("w0", shape=(10, 10))
+    w1 = relay.var("w1", shape=(10, 10))
+    w2 = relay.var("w2", shape=(10, 10))
+    w3 = relay.var("w3", shape=(10, 10))
+    w4 = relay.var("w4", shape=(10, 10))
+    w5 = relay.var("w5", shape=(10, 10))
+    w6 = relay.var("w6", shape=(10, 10))
+    w7 = relay.var("w7", shape=(10, 10))
+
+    # C compiler
+    z0 = relay.add(x, w0)
+    p0 = relay.subtract(z0, w1)
+    q0 = relay.multiply(p0, w2)
+
+    z1 = relay.add(x, w3)
+    p1 = relay.subtract(z1, w4)
+    q1 = relay.multiply(p1, w5)
+
+    # Other parts on TVM
+    z2 = relay.add(x, w6)
+    q2 = relay.subtract(z2, w7)
+
+    r = relay.concatenate((q0, q1, q2), axis=0)
+    f = relay.Function([x, w0, w1, w2, w3, w4, w5, w6, w7], r)
+    mod = tvm.IRModule()
+    ann = CcompilerAnnotator()
+    mod["main"] = ann.visit(f)
+    mod = tvm.relay.transform.PartitionGraph()(mod)
+    mod = tvm.relay.transform.InferType()(mod)
+
+    x_data = np.random.rand(10, 10).astype("float32")
+    w_data = []
+    for _ in range(8):
+        w_data.append(np.random.rand(10, 10).astype("float32"))
+
+    map_inputs = {"w{}".format(i): w_data[i] for i in range(8)}
+    map_inputs["x"] = x_data
+    check_result(
+        relay_mod=mod,
+        map_inputs=map_inputs,
+        out_shape=(30, 10),
+        result=np.concatenate(
+            (
+                ((x_data + w_data[0]) - w_data[1]) * w_data[2],
+                ((x_data + w_data[3]) - w_data[4]) * w_data[5],
+                x_data + w_data[6] - w_data[7],
+            ),
+            axis=0,
+        ),
+        model=model,
+        zephyr_board=zephyr_board,
+    )
+
+
 if __name__ == "__main__":
     sys.exit(pytest.main([os.path.dirname(__file__)] + sys.argv[1:]))

From 637c9da6316d0808e430c197e72359db672d2753 Mon Sep 17 00:00:00 2001
From: "Matt Welsh (OctoML)" <63477620+mdw-octoml@users.noreply.github.com>
Date: Fri, 15 Jan 2021 15:49:57 -0800
Subject: [PATCH 078/357] A few typo fixes in the uTVM design doc. (#7291)

---
 docs/dev/microtvm_design.rst | 35 ++++++++++++++++++-----------------
 1 file changed, 18 insertions(+), 17 deletions(-)

diff --git a/docs/dev/microtvm_design.rst b/docs/dev/microtvm_design.rst
index 0251144511a0..2c3eeb2faea3 100644
--- a/docs/dev/microtvm_design.rst
+++ b/docs/dev/microtvm_design.rst
@@ -36,7 +36,7 @@ change for a proof-of-concept implementation on such devices, the runtime cannot
   projects implement support for these, but they are by no means standard.
 * Support for programming languages other than **C**.
 
-Such changes require a different appraoch from the TVM C++ runtime typically used on traditional
+Such changes require a different approach from the TVM C++ runtime typically used on traditional
 Operating Systems.
 
 Typical Use
@@ -92,7 +92,7 @@ Modeling Target Platforms
 -------------------------
 
 TVM's search-based optimization approach allows it to largely avoid system-level modeling of targets
-in favor of experimental results. However, some modelling is necessary in order to ensure TVM is
+in favor of experimental results. However, some modeling is necessary in order to ensure TVM is
 comparing apples-to-apples search results, and to avoid wasting time during the search by attempting
 to compile invalid code for a target.
 
@@ -143,10 +143,10 @@ Writing Schedules for microTVM
 
 For operations scheduled on the CPU, microTVM initially plans to make use of specialized
 instructions and extern (i.e. hand-optimized) functions to achieve good performance. In TVM, this
-appraoch is generally accomplished through tensorization, in which TVM breaks a computation into
+approach is generally accomplished through tensorization, in which TVM breaks a computation into
 small pieces, and a TIR extern function accelerates each small piece.
 
-TVM currently accomodates both approaches using ``tir.call_extern``. First, a pragma is attached to
+TVM currently accommodates both approaches using ``tir.call_extern``. First, a pragma is attached to
 the schedule defining the extern function in portable C.
 
     ``sched[output].pragma(n, "import_c", "void call_asm(int32_t* a, int32_t* b) { /* ... */ }")``
@@ -183,10 +183,11 @@ are of course not easy to use from LLVM bitcode.
 Executing Models
 ----------------
 
-The TVM compiler traditionally outputs 3 pieces:
-1. Model operator implementations, as discussed above.
-2. A model execution graph, encoded as JSON
-3. Simplified parameters
+The TVM compiler traditionally outputs three pieces:
+
+1. Model operator implementations, as discussed above;
+2. A model execution graph, encoded as JSON; and
+3. Simplified parameters.
 
 To correctly execute the model, a Graph Runtime needs to reconstruct the graph in memory, load the
 parameters, and then invoke the operator implementations in the correct order.
@@ -206,11 +207,11 @@ Host-Driven Execution
 
 In Host-Driven execution, the firmware binary is the following:
 
-1. Generated operator implementations from TVM
-2. The TVM C runtime
+1. Generated operator implementations from TVM.
+2. The TVM C runtime.
 3. SoC-specific initialization.
 4. The TVM RPC server.
-5. (optional) Simplified Parameters
+5. (optional) Simplified Parameters.
 
 This firmware image is flashed onto the device and a GraphRuntime instance is created on the host.
 The GraphRuntime drives execution by sending RPC commands over a UART:
@@ -270,7 +271,7 @@ For Standalone model execution, firmware also needs:
 5. The remaining compiler outputs (Simplified Parameters and Graph JSON).
 
 The Automated Build Flow
--------------------------
+------------------------
 
 Once code generation is complete, ``tvm.relay.build`` returns a ``tvm.runtime.Module`` and the
 user can save the generated C source or binary object to a ``.c`` or ``.o`` file. From this point, TVM
@@ -287,12 +288,12 @@ However, for AutoTVM, TVM needs some automated flow to handle the following task
 At present, TVM expects the user to supply an implementation of the ``tvm.micro.Compiler``,
 ``tvm.micro.Flasher``, and ``tvm.micro.Transport`` interfaces. TVM then:
 
-1. Builds each piece separately as a library
+1. Builds each piece separately as a library.
 2. Builds the libraries into a binary firmware image.
 3. Programs the firmware image onto an attached device.
 4. Opens a serial port to serve as the RPC server transport.
 
-This design was chosen to reduce build times for microTVM (the common libraries need to be build
+This design was chosen to reduce build times for microTVM (the common libraries need to be built
 only once per candidate operator implemmentation). In practice, these projects are extremely small
 and compile relatively quickly. Compared with the added complexity of this tighter build integration
 with TVM, the performance gains are likely not worth it. A future design will consolidate the build
@@ -303,7 +304,7 @@ Measuring operator performance
 
 The TVM C runtime depends on user-supplied functions to measure time on-device. Users should implement
 ``TVMPlatformTimerStart`` and ``TVMPlatformTimerStop``. These functions should measure wall clock time, so there
-are some pitfalls in implementing this function:
+are some pitfalls in implementing these functions:
 
 1. If the CPU could halt or sleep during a computation (i.e. if it is being done on an accelerator),
    a cycle counter should likely not be used as these tend to stop counting while the CPU is asleep.
@@ -313,7 +314,7 @@ are some pitfalls in implementing this function:
 4. The timer should not interrupt computation unless absolutely necessary. Doing so may affect the
    accuracy of the results.
 5. Calibrating the output against a wall clock is ideal, but it will likely be too cumbersome. A
-   future PR could enable some characterization of the platform timer by e.g. measuring the internal
+   future PR could enable some characterization of the platform timer by, e.g., measuring the internal
    oscillator against a reference such as an external crystal.
 
 Future Work
@@ -339,7 +340,7 @@ peak memory usage.
 Heterogeneous Execution
 -----------------------
 
-Newer Cortex-M SoC can contain multiple CPUs and onboard ML accelerators.
+Newer Cortex-M SoCs can contain multiple CPUs and onboard ML accelerators.
 
 
 Autotuning Target

From ccccac2e2af239be62761f5618a2210a93326e8e Mon Sep 17 00:00:00 2001
From: Robert Kimball <bobkimball@gmail.com>
Date: Fri, 15 Jan 2021 15:52:58 -0800
Subject: [PATCH 079/357] Change const to used dtype if it is passed in (#7285)

* Add fix and unit test for const autoconvert dtype.

* formatting

* Address review comment, casting input value to int32

* Fix failing test

* Augment unit test
---
 python/tvm/relay/expr.py         |  8 +++---
 tests/python/relay/test_const.py | 44 ++++++++++++++++++++++++++++++++
 2 files changed, 48 insertions(+), 4 deletions(-)
 create mode 100644 tests/python/relay/test_const.py

diff --git a/python/tvm/relay/expr.py b/python/tvm/relay/expr.py
index 7b6e4b4ccf80..8d73a090ed6f 100644
--- a/python/tvm/relay/expr.py
+++ b/python/tvm/relay/expr.py
@@ -488,7 +488,7 @@ def const(value, dtype=None):
         The constant value.
 
     dtype: str, optional
-        The data type of the value.
+        The data type of the resulting constant.
 
     Note
     ----
@@ -504,13 +504,13 @@ def const(value, dtype=None):
 
     if not dtype:
         # when dtype is None: int maps to "int32", float maps to "float32"
-        map_dtype = {_np.dtype("int64"): _np.int32, _np.dtype("float64"): _np.float32}.get(
+        dtype = {_np.dtype("int64"): _np.int32, _np.dtype("float64"): _np.float32}.get(
             value.dtype, None
         )
-        if map_dtype:
-            value = value.astype(map_dtype)
 
     if isinstance(value, (_np.ndarray, _np.generic)):
+        if dtype is not None:
+            value = value.astype(dtype)
         value = _nd.array(value)
 
     if not isinstance(value, _nd.NDArray):
diff --git a/tests/python/relay/test_const.py b/tests/python/relay/test_const.py
new file mode 100644
index 000000000000..14fff0f7e65e
--- /dev/null
+++ b/tests/python/relay/test_const.py
@@ -0,0 +1,44 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import tvm
+import numpy as np
+from tvm import relay
+from tvm.relay.frontend.common import infer_type
+from tvm.relay import op as _op
+
+
+def test_const_dtype():
+    strides = (1, 1)
+    np_array = np.array(strides).astype("int32")
+    strides = _op.const(np_array, dtype="int64")
+
+    # strides needs to be autoconverted to int64 on Windows
+    assert infer_type(strides).checked_type.dtype == np.dtype(np.int64)
+
+    a = tvm.nd.array(np.random.randint(0, high=255, size=(2, 3), dtype="uint8"))
+    a = _op.const(a, dtype="uint8")
+    aa = a.data.asnumpy()
+    assert aa.dtype == np.dtype(np.uint8)
+
+    b = _op.const(1, dtype="int8")
+    bb = b.data.asnumpy()
+    assert bb.dtype == np.dtype(np.int8)
+
+    kshape = (3, 10, 3, 3)
+    w = relay.const(np.zeros(kshape, dtype="float32"))
+    assert w.data.asnumpy().dtype == np.dtype(np.float32)

From 3f15d062e335b83c5170de5ee9ae19b90dcb681c Mon Sep 17 00:00:00 2001
From: Leandro Nunes <leandron85@gmail.com>
Date: Sat, 16 Jan 2021 14:48:08 +0000
Subject: [PATCH 080/357] [TEST] Fix
 test_topi_batch_matmul_tensorcore.py:test_batch_matmul requirement (#7294)

* this test current sets a requirement to "uses_gpu", which
   causes it to fail in cpu-only machine
 * this patch changes it to be "requires_tensorcore", as per discussion
   on issue #7277
---
 tests/python/topi/python/test_topi_batch_matmul_tensorcore.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/python/topi/python/test_topi_batch_matmul_tensorcore.py b/tests/python/topi/python/test_topi_batch_matmul_tensorcore.py
index 60f4bef3a855..77df5be0a491 100644
--- a/tests/python/topi/python/test_topi_batch_matmul_tensorcore.py
+++ b/tests/python/topi/python/test_topi_batch_matmul_tensorcore.py
@@ -63,7 +63,7 @@ def check_device(device):
     check_device("cuda")
 
 
-@tvm.testing.uses_gpu
+@tvm.testing.requires_tensorcore
 def test_batch_matmul():
     verify_batch_matmul(1, 1, 16, 16, 32)
     verify_batch_matmul(5, 5, 16, 16, 32)

From 052ad3d92d20abdf221600005c2ccb130e39b6b4 Mon Sep 17 00:00:00 2001
From: ziheng <ziheng@apache.org>
Date: Sat, 16 Jan 2021 09:51:04 -0800
Subject: [PATCH 081/357] [TIR] Support Return in TIR (#7084)

---
 include/tvm/tir/builtin.h              |  4 ++
 include/tvm/tir/op.h                   |  9 ++++
 include/tvm/tir/op_attr_types.h        |  6 ++-
 python/tvm/tir/__init__.py             |  2 +-
 python/tvm/tir/op.py                   | 28 ++++++++---
 src/target/llvm/codegen_llvm.cc        | 12 +++++
 src/tir/op/builtin.cc                  |  4 ++
 src/tir/op/op.cc                       |  4 ++
 src/tir/transforms/make_packed_api.cc  | 66 +++++++++++++++++++++++++-
 tests/python/unittest/test_tir_base.py | 60 +++++++++++++++++++++++
 10 files changed, 185 insertions(+), 10 deletions(-)
 create mode 100644 tests/python/unittest/test_tir_base.py

diff --git a/include/tvm/tir/builtin.h b/include/tvm/tir/builtin.h
index a150595ab551..6a40d86b8984 100644
--- a/include/tvm/tir/builtin.h
+++ b/include/tvm/tir/builtin.h
@@ -41,6 +41,10 @@ namespace tir {
 
 /*! \brief Collection of builtin intrinsics as ops */
 namespace builtin {
+/*!
+ * \brief Return value.
+ */
+TVM_DLL const Op& ret();
 /*!
  * \brief Reinterpret the value using the target type.
  */
diff --git a/include/tvm/tir/op.h b/include/tvm/tir/op.h
index 4a907fca951d..b5a62c907ed6 100644
--- a/include/tvm/tir/op.h
+++ b/include/tvm/tir/op.h
@@ -70,6 +70,15 @@ TVM_DLL Type GetType(const PrimExpr& expr);
  */
 TVM_DLL runtime::DataType GetRuntimeDataType(const Type& type);
 
+/*!
+ * \brief Return the value.
+ *
+ * \param value The returned value.
+ * \param span The location of this operation in the source.
+ * \return The return expression.
+ */
+TVM_DLL PrimExpr ret(PrimExpr value, Span span = Span());
+
 /*!
  * Query the maximum possible value of dtype.
  * \param dtype The data type.
diff --git a/include/tvm/tir/op_attr_types.h b/include/tvm/tir/op_attr_types.h
index ec7fc172cde8..3dcc4b943a79 100644
--- a/include/tvm/tir/op_attr_types.h
+++ b/include/tvm/tir/op_attr_types.h
@@ -74,7 +74,11 @@ enum class CallEffectKind : int {
   /*!
    * \brief Embed opaque information in the Expr, cannot be codegen.
    */
-  kEmbedInfo = 5
+  kEmbedInfo = 5,
+  /*!
+   * \brief Function that changes control flow
+   */
+  kControlJump = 6,
 };
 
 /*! \brief Use integer to record the kind. */
diff --git a/python/tvm/tir/__init__.py b/python/tvm/tir/__init__.py
index 1aac55fa9920..901c89ed9106 100644
--- a/python/tvm/tir/__init__.py
+++ b/python/tvm/tir/__init__.py
@@ -35,7 +35,7 @@
 from .function import PrimFunc
 
 from .op import call_packed, call_intrin, call_pure_extern, call_extern
-from .op import call_llvm_intrin, call_llvm_pure_intrin, all, any, min_value, max_value, trace
+from .op import call_llvm_intrin, call_llvm_pure_intrin, ret, all, any, min_value, max_value, trace
 from .op import exp, exp2, exp10, log, log2, log10, log1p, ldexp
 from .op import sin, sinh, asin, asinh
 from .op import cos, cosh, acos, acosh
diff --git a/python/tvm/tir/op.py b/python/tvm/tir/op.py
index ca61be4fcd83..182264f0db92 100644
--- a/python/tvm/tir/op.py
+++ b/python/tvm/tir/op.py
@@ -221,6 +221,22 @@ def call_llvm_pure_intrin(dtype, name, *args, span=None):
     )
 
 
+def ret(val):
+    """Create a tir return expression
+
+    Parameters
+    ----------
+    val : Expr
+        The returned tir expression, whose data type is int, float or void pointer.
+
+    Returns
+    -------
+    ret : PrimExpr
+        The return expression
+    """
+    return call_intrin(val.dtype, "tir.ret", val)
+
+
 def any(*args, span=None):
     """Create a new experssion of the union of all conditions in the arguments
 
@@ -241,10 +257,10 @@ def any(*args, span=None):
         raise ValueError("Any must take at least 1 argument")
     if len(args) == 1:
         return args[0]
-    ret = _ffi_api._OpOr(args[0], args[1], span)
+    val = _ffi_api._OpOr(args[0], args[1], span)
     for i in range(2, len(args)):
-        ret = _ffi_api._OpOr(ret, args[i], span)
-    return ret
+        val = _ffi_api._OpOr(val, args[i], span)
+    return val
 
 
 def all(*args, span=None):
@@ -268,10 +284,10 @@ def all(*args, span=None):
         raise ValueError("Any must take at least 1 argument")
     if len(args) == 1:
         return args[0]
-    ret = _ffi_api._OpAnd(args[0], args[1], span)
+    val = _ffi_api._OpAnd(args[0], args[1], span)
     for i in range(2, len(args)):
-        ret = _ffi_api._OpAnd(ret, args[i], span)
-    return ret
+        val = _ffi_api._OpAnd(val, args[i], span)
+    return val
 
 
 @tvm._ffi.register_func("tvm.default_trace_action")
diff --git a/src/target/llvm/codegen_llvm.cc b/src/target/llvm/codegen_llvm.cc
index 70f094a186e7..34f3897cce88 100644
--- a/src/target/llvm/codegen_llvm.cc
+++ b/src/target/llvm/codegen_llvm.cc
@@ -927,6 +927,18 @@ llvm::Value* CodeGenLLVM::CreateIntrinsic(const CallNode* op) {
     value->addIncoming(then_value, then_value_block);
     value->addIncoming(else_value, else_value_block);
     return value;
+  } else if (op->op.same_as(builtin::ret())) {
+    auto const* val = op->args[0].as<IntImmNode>();
+    ICHECK(val) << "the tir.ret should be transformed to return zero "
+                << "before the llvm code generation.";
+    ICHECK_EQ(val->value, 0) << "the tir.ret should be transformed to "
+                             << "return zero before the llvm code generation.";
+    builder_->CreateRet(ConstInt32(0));
+    // LLVM allows exactly one terminator in a single basic block
+    // append a new dummy basic block to avoid error.
+    llvm::BasicBlock* ret_dummy = llvm::BasicBlock::Create(*ctx_, "ret_dummy", function_);
+    builder_->SetInsertPoint(ret_dummy);
+    return ret_dummy;
   } else if (op->op.same_as(builtin::reinterpret())) {
     llvm::Type* target = DTypeToLLVMType(op->dtype);
     return builder_->CreateBitCast(MakeValue(op->args[0]), target);
diff --git a/src/tir/op/builtin.cc b/src/tir/op/builtin.cc
index 796b113a4054..1117571c8b75 100644
--- a/src/tir/op/builtin.cc
+++ b/src/tir/op/builtin.cc
@@ -42,6 +42,10 @@ TIR_DEFINE_BUILTIN_FUNC(reinterpret)
     .set_attr<TCallEffectKind>("TCallEffectKind", Integer(CallEffectKind::kPure))
     .set_num_inputs(1);
 
+TIR_DEFINE_BUILTIN_FUNC(ret)
+    .set_attr<TCallEffectKind>("TCallEffectKind", Integer(CallEffectKind::kControlJump))
+    .set_num_inputs(1);
+
 TIR_DEFINE_BUILTIN_FUNC(likely)
     .set_num_inputs(1)
     .set_attr<TCallEffectKind>("TCallEffectKind", Integer(CallEffectKind::kExprAnnotation))
diff --git a/src/tir/op/op.cc b/src/tir/op/op.cc
index b576fe4faee8..9fcb07149d19 100644
--- a/src/tir/op/op.cc
+++ b/src/tir/op/op.cc
@@ -145,6 +145,10 @@ void BinaryOpMatchTypes(PrimExpr& lhs, PrimExpr& rhs, Span span) {  // NOLINT(*)
   }
 }
 
+PrimExpr ret(PrimExpr value, Span span) {
+  return tir::Call(value.dtype(), tir::builtin::ret(), {value}, span);
+}
+
 // maximum and min limits
 PrimExpr max_value(const DataType& dtype, Span span) {
   using namespace tir;
diff --git a/src/tir/transforms/make_packed_api.cc b/src/tir/transforms/make_packed_api.cc
index 7c4a8ef92724..adbe78a6d627 100644
--- a/src/tir/transforms/make_packed_api.cc
+++ b/src/tir/transforms/make_packed_api.cc
@@ -41,6 +41,67 @@
 namespace tvm {
 namespace tir {
 
+class ReturnRewriter : public StmtMutator {
+ public:
+  explicit ReturnRewriter(Var ret_var, Var ret_tcode) : ret_var_(ret_var), ret_tcode_(ret_tcode) {}
+
+  Stmt VisitStmt_(const ForNode* node) override {
+    if (node->for_type == ForType::Parallel) in_parallel_ += 1;
+    Stmt ret = StmtMutator::VisitStmt_(node);
+    if (node->for_type == ForType::Parallel) in_parallel_ -= 1;
+    return ret;
+  }
+
+  Stmt VisitStmt_(const EvaluateNode* node) override {
+    Stmt ret = StmtMutator::VisitStmt_(node);
+    const EvaluateNode* eval = ret.as<EvaluateNode>();
+    ICHECK(eval);
+    if (const CallNode* call = eval->value.as<CallNode>()) {
+      if (call->op.same_as(builtin::ret())) {
+        ICHECK_EQ(in_parallel_, 0) << "tir.ret cannot be used in parallel scope.";
+        ICHECK_EQ(call->args.size(), 1) << "tir.ret expect a single argument.";
+        ret = WriteToOut(call->args[0], ret_var_, ret_tcode_);
+      }
+    }
+    return ret;
+  }
+
+ private:
+  std::pair<int, PrimExpr> ConvertForFFI(PrimExpr val) {
+    // convert val's data type to FFI data type, return type code
+    DataType dtype = val.dtype();
+    if (dtype.is_int() || dtype.is_uint()) {
+      return {kTVMArgInt, Cast(DataType::Int(64), val)};
+    } else if (dtype.is_float()) {
+      return {kTVMArgFloat, Cast(DataType::Float(64), val)};
+    } else if (dtype.is_void()) {
+      return {kTVMNullptr, val};
+    } else {
+      LOG(FATAL) << "data type " << dtype << " not supported yet";
+    }
+    return {kTVMNullptr, val};
+  }
+
+  Stmt WriteToOut(PrimExpr val, Var ret_var, Var ret_tcode) {
+    auto p = ConvertForFFI(val);
+    int tcode = p.first;
+    val = p.second;
+    Stmt store_val = Store(ret_var_, val, 0, const_true());
+    Stmt store_tcode = Store(ret_tcode_, tcode, 0, const_true());
+    Stmt ret_zero = Evaluate(tvm::ret(0));
+    return SeqStmt({store_val, store_tcode, ret_zero});
+  }
+
+  Var ret_var_;
+  Var ret_tcode_;
+  int in_parallel_{0};
+};
+
+Stmt RewriteReturn(Stmt body, Var ret_var, Var ret_tcode) {
+  ReturnRewriter rewriter(ret_var, ret_tcode);
+  return rewriter(body);
+}
+
 inline Stmt MakeAssertEQ(PrimExpr lhs, PrimExpr rhs, std::string msg) {
   return AssertStmt(lhs == rhs, tvm::tir::StringImm(msg), Evaluate(0));
 }
@@ -182,8 +243,9 @@ PrimFunc MakePackedAPI(PrimFunc&& func, int num_unpacked_args) {
     func = WithAttr(std::move(func), tvm::attr::kCallingConv, Integer(CallingConv::kCPackedFunc));
   }
 
-  Stmt body = AttrStmt(make_zero(DataType::Int(32)), attr::compute_scope,
-                       StringImm(name_hint + "_compute_"), func_ptr->body);
+  Stmt body = RewriteReturn(func_ptr->body, v_out_ret_value, v_out_ret_tcode);
+  body = AttrStmt(make_zero(DataType::Int(32)), attr::compute_scope,
+                  StringImm(name_hint + "_compute_"), body);
   // Set device context
   if (vmap.count(device_id.get())) {
     PrimExpr node = StringImm("default");
diff --git a/tests/python/unittest/test_tir_base.py b/tests/python/unittest/test_tir_base.py
new file mode 100644
index 000000000000..6e081a179059
--- /dev/null
+++ b/tests/python/unittest/test_tir_base.py
@@ -0,0 +1,60 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+import tvm
+from tvm import tir
+from tvm.ir.transform import PassContext
+
+
+def build_tir_func(func):
+    func = func.with_attr("global_symbol", "main")
+    pass_ctx = PassContext.current()
+    if pass_ctx.config.get("tir.noalias", True):
+        func = func.with_attr("tir.noalias", True)
+    mod = tvm.IRModule({"main": func})
+    func = tvm.build(mod)
+    return func
+
+
+def test_scalar_add():
+    a = tir.Var("a", "float32")
+    b = tir.Var("b", "float32")
+    c = a + b
+    c = tir.ret(c)
+    c = tir.Evaluate(c)
+    func = tir.PrimFunc([a, b], c)
+    func = build_tir_func(func)
+    out = func(1.0, 2.0)
+    assert out == 3.0
+
+
+def test_control_flow_jump():
+    ib = tvm.tir.ir_builder.create()
+    a = tir.Var("a", "float32")
+    b = tir.Var("b", "float32")
+    with ib.if_scope(True):
+        ib.emit(tir.Evaluate(tir.ret(a)))
+    ib.emit(tir.Evaluate(tir.ret(b)))
+    stmt = ib.get()
+    func = tir.PrimFunc([a, b], stmt)
+    func = build_tir_func(func)
+    out = func(1.0, 2.0)
+    assert out == 1.0
+
+
+if __name__ == "__main__":
+    test_scalar_add()
+    test_control_flow_jump()

From 09bb60ac096a143f681a21f18e6693f7c00e6731 Mon Sep 17 00:00:00 2001
From: "Matt Welsh (OctoML)" <63477620+mdw-octoml@users.noreply.github.com>
Date: Mon, 18 Jan 2021 06:01:09 -0800
Subject: [PATCH 082/357] Add QEMU setup to uTVM tutorial. (#7296)

---
 docker/install/ubuntu_install_qemu.sh |  0
 tutorials/micro/micro_reference_vm.py | 15 ++++++++++++++-
 2 files changed, 14 insertions(+), 1 deletion(-)
 mode change 100644 => 100755 docker/install/ubuntu_install_qemu.sh

diff --git a/docker/install/ubuntu_install_qemu.sh b/docker/install/ubuntu_install_qemu.sh
old mode 100644
new mode 100755
diff --git a/tutorials/micro/micro_reference_vm.py b/tutorials/micro/micro_reference_vm.py
index bcef6a0d2c64..07d29401c0e8 100644
--- a/tutorials/micro/micro_reference_vm.py
+++ b/tutorials/micro/micro_reference_vm.py
@@ -140,6 +140,19 @@
 
 .. code-block:: bash
 
-    $ poetry run python3 tests/micro/qemu/test_zephyr.py --microtvm-platforms=stm32f746xx
+    $ cd apps/microtvm/reference-vm/zephyr
+    $ poetry run python3 ../../../../tests/micro/qemu/test_zephyr.py --microtvm-platforms=stm32f746xx
+
+If you do not have physical hardware attached, but wish to run the tests using the
+local QEMU emulator running within the VM, run the following commands instead:
+
+.. code-block:: bash
+
+    $ cd /Users/yourusername/path/to/tvm
+    $ sudo ./docker/install/ubuntu_install_qemu.sh
+    $ cd apps/microtvm/reference-vm/zephyr/
+    $ poetry run pytest ../../../../tests/micro/qemu/test_zephyr.py --microtvm-platforms=host
+
+
 
 """

From 6fb10197787f7d70287856f2127d1859276803b4 Mon Sep 17 00:00:00 2001
From: Tristan Konolige <tristan.konolige@gmail.com>
Date: Mon, 18 Jan 2021 10:27:44 -0800
Subject: [PATCH 083/357] [TUTORIAL] Add gpu instructions and results to
 deploy_sparse (#7298)

---
 tutorials/frontend/deploy_sparse.py | 20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

diff --git a/tutorials/frontend/deploy_sparse.py b/tutorials/frontend/deploy_sparse.py
index dcf2fc4fe31d..9641fb8fd14c 100644
--- a/tutorials/frontend/deploy_sparse.py
+++ b/tutorials/frontend/deploy_sparse.py
@@ -102,10 +102,8 @@
 batch_size = 1
 # The length of each input sequence.
 seq_len = 128
-# TVM platform identifier. Although cuda is also supported, it requires
-# tuning that is outside the scope of this tutorial. Note that best
-# cpu performance can be achieved by setting -mcpu appropriately for
-# your specific machine.
+# TVM platform identifier. Note that best cpu performance can be achieved by setting -mcpu
+# appropriately for your specific machine. CUDA and ROCm are also supported.
 target = "llvm"
 # Which device to run on. Should be one of tvm.cpu() or tvm.gpu().
 ctx = tvm.cpu()
@@ -339,3 +337,17 @@ def benchmark():
 # Runtime:             165.26 ms           (12.83 ms)
 # Block Sparse Model with 1x1 blocks:
 # Runtime:             67.75 ms            (8.83 ms)
+
+# Here is the output of this script on a GPU (GTX 1070) with the target "cuda -libs=cublas".
+#
+# Dense Model Benchmark:
+# Cannot find config for target=cuda -keys=cuda,gpu -libs=cublas -max_num_threads=1024 -thread_warp_size=32, workload=('dense_cublas.cuda', ('TENSOR', (1, 768), 'float32'), ('TENSOR', (2, 768), 'float32'), None, 'float32'). A fallback configuration is used, which may bring great performance regression.
+# Cannot find config for target=cuda -keys=cuda,gpu -libs=cublas -max_num_threads=1024 -thread_warp_size=32, workload=('dense_cublas.cuda', ('TENSOR', (1, 768), 'float32'), ('TENSOR', (768, 768), 'float32'), None, 'float32'). A fallback configuration is used, which may bring great performance regression.
+# Cannot find config for target=cuda -keys=cuda,gpu -libs=cublas -max_num_threads=1024 -thread_warp_size=32, workload=('dense_cublas.cuda', ('TENSOR', (128, 3072), 'float32'), ('TENSOR', (768, 3072), 'float32'), None, 'float32'). A fallback configuration is used, which may bring great performance regression.
+# Cannot find config for target=cuda -keys=cuda,gpu -libs=cublas -max_num_threads=1024 -thread_warp_size=32, workload=('dense_cublas.cuda', ('TENSOR', (128, 768), 'float32'), ('TENSOR', (3072, 768), 'float32'), None, 'float32'). A fallback configuration is used, which may bring great performance regression.
+# Cannot find config for target=cuda -keys=cuda,gpu -libs=cublas -max_num_threads=1024 -thread_warp_size=32, workload=('dense_cublas.cuda', ('TENSOR', (128, 768), 'float32'), ('TENSOR', (768, 768), 'float32'), None, 'float32'). A fallback configuration is used, which may bring great performance regression.
+# Cannot find config for target=cuda -keys=cuda,gpu -libs=cublas -max_num_threads=1024 -thread_warp_size=32, workload=('batch_matmul_cublas.cuda', ('TENSOR', (12, 128, 128), 'float32'), ('TENSOR', (12, 64, 128), 'float32'), (12, 128, 64)). A fallback configuration is used, which may bring great performance regression.
+# Cannot find config for target=cuda -keys=cuda,gpu -libs=cublas -max_num_threads=1024 -thread_warp_size=32, workload=('batch_matmul_cublas.cuda', ('TENSOR', (12, 128, 64), 'float32'), ('TENSOR', (12, 128, 64), 'float32'), (12, 128, 128)). A fallback configuration is used, which may bring great performance regression.
+# Runtime:             10.64 ms            (0.29 ms)
+# Block Sparse Model with 1x1 blocks:
+# Runtime:             6.46 ms             (0.05 ms)

From 5e92eed4c4f02fc5e08f7d863ecb6e6b29c11c9a Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Mon, 18 Jan 2021 22:23:44 -0500
Subject: [PATCH 084/357] [COMMUNITY] @Laurawly => PMC (#7307)

---
 CONTRIBUTORS.md | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index 909bdb700722..36743a345d21 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -42,38 +42,38 @@ We do encourage everyone to work anything they are interested in.
 - [Aditya Atluri](https://github.com/adityaatluri): @adityaatluri - rocm
 - [Matthew Barrett](https://github.com/mbaret): @mbaret - byoc, arm
 - [Matthew Brookhart](https://github.com/mbrookhart): @mbrookhart - relay, frontends
-- [Tianqi Chen](https://github.com/tqchen) (PPMC): @tqchen - topi, compiler, relay, docs
+- [Tianqi Chen](https://github.com/tqchen) (PMC): @tqchen - topi, compiler, relay, docs
 - [Liangfu Chen](https://github.com/liangfu): @liangfu - vta, chisel, intel FPGA, c runtime
 - [Wei Chen](https://github.com/wweic): @wweic - runtime, relay, vm
-- [Zhi Chen](https://github.com/zhiics) (PPMC): @zhiics - relay, quantization, pass manager
+- [Zhi Chen](https://github.com/zhiics) (PMC): @zhiics - relay, quantization, pass manager
 - [Chenfan](https://github.com/jcf94): @jcf94 - autoscheduling
 - [Yuwei Hu](https://github.com/Huyuwei): @Huyuwei - topi, frontends
 - [Nick Hynes](https://github.com/nhynes): @nhynes: - sgx, rust
 - [Animesh Jain](https://github.com/anijain2305): @anijain2305 - quantization, relay
-- [Ziheng Jiang](https://github.com/ZihengJiang) (PPMC): @ZihengJiang - relay, compiler
+- [Ziheng Jiang](https://github.com/ZihengJiang) (PMC): @ZihengJiang - relay, compiler
 - [Marisa Kirisame](https://github.com/MarisaKirisame): @MarisaKirisame - relay
 - [Wuwei Lin](https://github.com/vinx13): @vinx13 - relay, topi
-- [Yizhi Liu](https://github.com/yzhliu) (PPMC): @yzhliu - jvm, topi, relay
+- [Yizhi Liu](https://github.com/yzhliu) (PMC): @yzhliu - jvm, topi, relay
 - [Hao Lu](https://github.com/hlu1): @hlu1 - nnpack, frontends
-- [Masahiro Masuda](https://github.com/masahi) (PPMC): @masahi - topi, relay
-- [Thierry Moreau](https://github.com/tmoreau89) (PPMC): @tmoreau89 - vta
+- [Masahiro Masuda](https://github.com/masahi) (PMC): @masahi - topi, relay
+- [Thierry Moreau](https://github.com/tmoreau89) (PMC): @tmoreau89 - vta
 - [Kazutaka Morita](https://github.com/kazum): @kazum - frontends, opencl
 - [Krzysztof Parzyszek](https://github.com/kparzysz-quic): @kparzysz-quic - hexagon, llvm
-- [Jared Roesch](https://github.com/jroesch) (PPMC): @jroesch - relay
+- [Jared Roesch](https://github.com/jroesch) (PMC): @jroesch - relay
 - [Siju Samuel](https://github.com/siju-samuel): @siju-samuel - frontends
 - [Siva](https://github.com/srkreddy1238): @srkreddy1238 - frontends, golang
 - [Junru Shao](https://github.com/junrushao1994) @junrushao1994 - relay, compiler
-- [Haichen Shen](https://github.com/icemelon9) (PPMC): @icemelon9 - relay, topi
+- [Haichen Shen](https://github.com/icemelon9) (PMC): @icemelon9 - relay, topi
 - [Zhixun Tan](https://github.com/phisiart): @phisiart - opengl, web
 - [Andrew Tulloch](https://github.com/ajtulloch): @ajtulloch - topi, compiler, runtime
 - [Luis Vega](https://github.com/vegaluisjose): @vegaluisjose - vta, chisel
-- [Leyuan Wang](https://github.com/Laurawly): @Laurawly: - topi
+- [Leyuan Wang](https://github.com/Laurawly) (PMC): @Laurawly: - topi
 - [Yao Wang](https://github.com/kevinthesun): @kevinthesun: - topi, vision
 - [Jian Weng](https://github.com/were): @were: - hybrid script
 - [Zhao Wu](https://github.com/FrozenGene): @FrozenGene - runtime, topi, frontends
-- [Eddie Yan](https://github.com/eqy) (PPMC): @eqy - runtime, autotvm, rpc, topi
+- [Eddie Yan](https://github.com/eqy) (PMC): @eqy - runtime, autotvm, rpc, topi
 - [Hao Yu](https://github.com/comaniac): @comaniac - relay, byoc, ansor
-- [Lianmin Zheng](https://github.com/merrymercy) (PPMC): @merrymercy - autotvm, topi, relay
+- [Lianmin Zheng](https://github.com/merrymercy) (PMC): @merrymercy - autotvm, topi, relay
 
 ## Reviewers
 

From 5d95105a553bccb75c0cd428025d7904d876da0d Mon Sep 17 00:00:00 2001
From: Chenfan <chengfan.jcf@alibaba-inc.com>
Date: Tue, 19 Jan 2021 14:18:19 +0800
Subject: [PATCH 085/357] [AutoScheduler] Bug fix & Custom sketch support
 (#7260)

---
 python/tvm/auto_scheduler/__init__.py         |  7 ++-
 python/tvm/auto_scheduler/search_policy.py    | 35 ++++++++++++++-
 python/tvm/auto_scheduler/search_task.py      |  5 ++-
 python/tvm/auto_scheduler/task_scheduler.py   | 14 +++++-
 .../search_policy/sketch_policy.cc            | 25 +++++++++++
 .../search_policy/sketch_policy.h             | 34 ++++++++++++++
 .../search_policy/sketch_policy_rules.cc      | 27 +++++++++++
 .../search_policy/sketch_policy_rules.h       | 23 ++++++++++
 .../test_auto_scheduler_search_policy.py      | 27 +++++++++++
 .../test_auto_scheduler_sketch_generation.py  | 45 ++++++++++++++++++-
 10 files changed, 234 insertions(+), 8 deletions(-)

diff --git a/python/tvm/auto_scheduler/__init__.py b/python/tvm/auto_scheduler/__init__.py
index a03e156cc10f..57e58309525c 100644
--- a/python/tvm/auto_scheduler/__init__.py
+++ b/python/tvm/auto_scheduler/__init__.py
@@ -50,6 +50,11 @@
     is_auto_scheduler_enabled,
 )
 from .search_task import SearchTask, TuningOptions, HardwareParams, create_task, auto_schedule
-from .search_policy import EmptyPolicy, SketchPolicy, PreloadMeasuredStates
+from .search_policy import (
+    EmptyPolicy,
+    SketchPolicy,
+    PreloadMeasuredStates,
+    PreloadCustomSketchRule,
+)
 from .task_scheduler import TaskScheduler
 from .workload_registry import register_workload, make_workload_key
diff --git a/python/tvm/auto_scheduler/search_policy.py b/python/tvm/auto_scheduler/search_policy.py
index 5b15a48943d2..f0388a886c5f 100644
--- a/python/tvm/auto_scheduler/search_policy.py
+++ b/python/tvm/auto_scheduler/search_policy.py
@@ -61,6 +61,39 @@ def __init__(self, filename):
         self.__init_handle_by_constructor__(_ffi_api.PreloadMeasuredStates, filename)
 
 
+@tvm._ffi.register_object("auto_scheduler.PreloadCustomSketchRule")
+class PreloadCustomSketchRule(SearchCallback):
+    """
+    A SearchCallback for SketchSearchPolicy that allows users to add
+    custom sketch rule.
+
+    Notes
+    -----
+    This is an advanced feature. Make sure you're clear how it works and this should only be used
+    in SketchSearchPolicy.
+
+    Parameters
+    ----------
+    meet_condition_func: Callable
+        A function with `(policy, state, stage_id) -> int`. Should return one of the result
+        enumeration.
+    apply_func: Callable
+        A function with `(policy, state, stage_id) -> [[State, int], ...]`.
+    rule_name: str = "CustomSketchRule"
+        The name of this custom sketch rule.
+    """
+
+    # Result enumeration of the condition function.
+    PASS = 0  # Skip this rule and continue to try the next rules.
+    APPLY = 1  # Apply this rule and continue to try the next rules.
+    APPLY_AND_SKIP_REST = 2  # Apply this rule and skip the rest rules.
+
+    def __init__(self, meet_condition_func, apply_func, rule_name="CustomSketchRule"):
+        self.__init_handle_by_constructor__(
+            _ffi_api.PreloadCustomSketchRule, meet_condition_func, apply_func, rule_name
+        )
+
+
 @tvm._ffi.register_object("auto_scheduler.SearchPolicy")
 class SearchPolicy(Object):
     """ The base class of search policies. """
@@ -141,8 +174,6 @@ class SketchPolicy(SearchPolicy):
 
           - auto_scheduler.PreloadMeasuredStates
           - auto_scheduler.PreloadCustomSketchRule
-
-        TODO(jcf94): Add these search callback implementations.
     """
 
     DEFAULT_PARAMS = {
diff --git a/python/tvm/auto_scheduler/search_task.py b/python/tvm/auto_scheduler/search_task.py
index bfa596a1dc61..d985ed1341f5 100644
--- a/python/tvm/auto_scheduler/search_task.py
+++ b/python/tvm/auto_scheduler/search_task.py
@@ -228,6 +228,9 @@ def __init__(
         if isinstance(target_host, str):
             target_host = Target(target_host)
 
+        if layout_rewrite_option is None:
+            layout_rewrite_option = LayoutRewriteOption.get_target_default(target)
+
         self.__init_handle_by_constructor__(
             _ffi_api.SearchTask,
             compute_dag,
@@ -235,7 +238,7 @@ def __init__(
             target,
             target_host,
             hardware_params,
-            layout_rewrite_option or LayoutRewriteOption.get_target_default(target),
+            layout_rewrite_option,
         )
 
     def tune(self, tuning_options, search_policy=None):
diff --git a/python/tvm/auto_scheduler/task_scheduler.py b/python/tvm/auto_scheduler/task_scheduler.py
index 975306f7be54..420b5f765a97 100644
--- a/python/tvm/auto_scheduler/task_scheduler.py
+++ b/python/tvm/auto_scheduler/task_scheduler.py
@@ -72,7 +72,7 @@ def make_search_policies(
         Load measurement records from this file. If it is not None, the status of the
         task scheduler, search policies and cost models will be restored according to this file.
     adapative_training: bool = False
-        Option used for XGBModel, which will reduce the model training frequency when there're too
+        Option used by XGBModel to reduce the model training frequency when there're too
         many logs.
 
     Returns
@@ -275,7 +275,13 @@ def __init__(
                 self.group_task_ids.append([])
             self.group_task_ids[self.tag_to_group_id[tag]].append(i)
 
-    def tune(self, tune_option, search_policy="default", search_policy_params=None):
+    def tune(
+        self,
+        tune_option,
+        search_policy="default",
+        search_policy_params=None,
+        adapative_training=False,
+    ):
         """Tune a batch of tasks together.
 
         Parameters
@@ -290,6 +296,9 @@ def tune(self, tune_option, search_policy="default", search_policy_params=None):
             "sketch.random" for SketchPolicy + RandomModel.
         search_policy_params : Optional[Dict[str, Any]]
             The parameters of the search policy
+        adapative_training : bool = False
+            Option used by XGBModel to reduce the model training frequency when there're
+            too many logs.
         """
         # init members
         self.tune_option = tune_option
@@ -324,6 +333,7 @@ def tune(self, tune_option, search_policy="default", search_policy_params=None):
             tune_option.verbose,
             self.load_model_file,
             self.load_log_file,
+            adapative_training,
         )
 
         # do a round robin first to warm up
diff --git a/src/auto_scheduler/search_policy/sketch_policy.cc b/src/auto_scheduler/search_policy/sketch_policy.cc
index 1e20b0fff6ea..91721afdba74 100644
--- a/src/auto_scheduler/search_policy/sketch_policy.cc
+++ b/src/auto_scheduler/search_policy/sketch_policy.cc
@@ -671,6 +671,26 @@ Array<MeasureInput> SketchPolicyNode::PickStatesWithEpsGreedy(const Array<State>
   return inputs;
 }
 
+/********** PreloadCustomSketchRule **********/
+TVM_REGISTER_OBJECT_TYPE(PreloadCustomSketchRuleNode);
+
+PreloadCustomSketchRule::PreloadCustomSketchRule(PackedFunc meet_condition_func,
+                                                 PackedFunc apply_func, String rule_name) {
+  auto node = make_object<PreloadCustomSketchRuleNode>();
+  node->meet_condition_func = std::move(meet_condition_func);
+  node->apply_func = std::move(apply_func);
+  node->rule_name = std::move(rule_name);
+  data_ = std::move(node);
+}
+
+void PreloadCustomSketchRuleNode::Callback(SearchPolicyNode* policy) {
+  CHECK(policy->IsInstance<SketchPolicyNode>());
+  auto sketch_policy = dynamic_cast<SketchPolicyNode*>(policy);
+  sketch_policy->sketch_rules.push_back(
+      new RuleCustomSketch(meet_condition_func, apply_func, rule_name));
+  StdCout(policy->verbose) << "Custom sketch rule \"" << rule_name << "\" added." << std::endl;
+}
+
 TVM_REGISTER_GLOBAL("auto_scheduler.SketchPolicy")
     .set_body_typed([](SearchTask task, CostModel program_cost_model, Map<String, ObjectRef> params,
                        int seed, int verbose,
@@ -699,5 +719,10 @@ TVM_REGISTER_GLOBAL("auto_scheduler.PrintTitle").set_body_typed([](std::string t
   PrintTitle(title, 1);
 });
 
+TVM_REGISTER_GLOBAL("auto_scheduler.PreloadCustomSketchRule")
+    .set_body_typed([](PackedFunc meet_condition_func, PackedFunc apply_func, String rule_name) {
+      return PreloadCustomSketchRule(meet_condition_func, apply_func, rule_name);
+    });
+
 }  // namespace auto_scheduler
 }  // namespace tvm
diff --git a/src/auto_scheduler/search_policy/sketch_policy.h b/src/auto_scheduler/search_policy/sketch_policy.h
index 488634902a87..faf058b45b19 100644
--- a/src/auto_scheduler/search_policy/sketch_policy.h
+++ b/src/auto_scheduler/search_policy/sketch_policy.h
@@ -197,6 +197,40 @@ class SketchPolicy : public SearchPolicy {
   TVM_DEFINE_MUTABLE_OBJECT_REF_METHODS(SketchPolicy, SearchPolicy, SketchPolicyNode);
 };
 
+/*! \brief Pre-search callback function to load custom rules for sketch generation */
+class PreloadCustomSketchRuleNode : public SearchCallbackNode {
+ public:
+  /*! \brief The condition check function of this rule. */
+  PackedFunc meet_condition_func;
+  /*! \brief The apply function of this rule. */
+  PackedFunc apply_func;
+  /*! \brief The name of this rule. */
+  String rule_name;
+
+  void Callback(SearchPolicyNode* policy) final;
+
+  static constexpr const char* _type_key = "auto_scheduler.PreloadCustomSketchRule";
+  TVM_DECLARE_FINAL_OBJECT_INFO(PreloadCustomSketchRuleNode, SearchCallbackNode);
+};
+
+/*!
+ * \brief Managed reference to PreloadCustomSketchRuleNode.
+ * \sa PreloadCustomSketchRuleNode
+ */
+class PreloadCustomSketchRule : public SearchCallback {
+ public:
+  /*!
+   * \brief The constructor.
+   * \param meet_condition_func The condition check function of this rule.
+   * \param apply_func The apply function of this rule.
+   * \param rule_name The name of this rule.
+   */
+  PreloadCustomSketchRule(PackedFunc meet_condition_func, PackedFunc apply_func, String rule_name);
+
+  TVM_DEFINE_MUTABLE_OBJECT_REF_METHODS(PreloadCustomSketchRule, SearchCallback,
+                                        PreloadCustomSketchRuleNode);
+};
+
 }  // namespace auto_scheduler
 }  // namespace tvm
 
diff --git a/src/auto_scheduler/search_policy/sketch_policy_rules.cc b/src/auto_scheduler/search_policy/sketch_policy_rules.cc
index f704fe9e82d5..110be6bd6f68 100644
--- a/src/auto_scheduler/search_policy/sketch_policy_rules.cc
+++ b/src/auto_scheduler/search_policy/sketch_policy_rules.cc
@@ -461,6 +461,33 @@ std::vector<std::pair<State, int>> RuleSpecialComputeLocationGPU::Apply(
   return {std::make_pair(std::move(tmp_s), stage_id - 1)};
 }
 
+/********** RuleCustomSketch **********/
+
+SketchGenerationRule::ConditionKind RuleCustomSketch::MeetCondition(const SketchPolicyNode& policy,
+                                                                    const State& state,
+                                                                    int stage_id) const {
+  auto ret = meet_condition_func_(tvm::runtime::GetRef<SketchPolicy>(&policy), state, stage_id);
+  if (ret.type_code() == 0) {
+    return ConditionKind(static_cast<int>(ret));
+  } else {
+    LOG(WARNING) << "Wrong rule condition value. Apply the rule and skip the rest";
+    return ConditionKind::kApplyAndSkipRest;
+  }
+}
+
+std::vector<std::pair<State, int>> RuleCustomSketch::Apply(const SketchPolicyNode& policy,
+                                                           const State& state, int stage_id) const {
+  Array<Array<ObjectRef>> apply_ret =
+      apply_func_(tvm::runtime::GetRef<SketchPolicy>(&policy), state, stage_id);
+  std::vector<std::pair<State, int>> ret;
+  for (const auto& item : apply_ret) {
+    CHECK_EQ(item.size(), 2);
+    auto next = item[1].as<IntImmNode>();
+    ret.emplace_back(Downcast<State>(item[0]), next->value);
+  }
+  return ret;
+}
+
 /********** Init Population **********/
 
 PopulationGenerationRule::ResultKind InitFillTileSize::Apply(SketchPolicyNode* policy, State* state,
diff --git a/src/auto_scheduler/search_policy/sketch_policy_rules.h b/src/auto_scheduler/search_policy/sketch_policy_rules.h
index 046f036d59d9..fc1916b8c67d 100644
--- a/src/auto_scheduler/search_policy/sketch_policy_rules.h
+++ b/src/auto_scheduler/search_policy/sketch_policy_rules.h
@@ -131,6 +131,29 @@ DEFINE_SKETCH_GENERATION_RULE(RuleCrossThreadReduction);
  * location of the producers of compute ops that perform "fake reduction" with const tensors. */
 DEFINE_SKETCH_GENERATION_RULE(RuleSpecialComputeLocationGPU);
 
+/*! \brief The rule that allows users to generate custom sketches. */
+class RuleCustomSketch : public SketchGenerationRule {
+ public:
+  RuleCustomSketch(PackedFunc meet_condition_func, PackedFunc apply_func,
+                   String rule_name = "CustomSketchRule")
+      : meet_condition_func_(std::move(meet_condition_func)),
+        apply_func_(std::move(apply_func)),
+        rule_name_(std::move(rule_name)) {}
+
+  ConditionKind MeetCondition(const SketchPolicyNode& policy, const State& state,
+                              int stage_id) const final;
+
+  std::vector<std::pair<State, int>> Apply(const SketchPolicyNode& policy, const State& state,
+                                           int stage_id) const final;
+
+  std::string GetRuleName() const final { return rule_name_; }
+
+ private:
+  PackedFunc meet_condition_func_;
+  PackedFunc apply_func_;
+  String rule_name_;
+};
+
 /********** Init Population **********/
 
 /*! \brief The base class for rules used to annotate the sketches to get the initial population. */
diff --git a/tests/python/unittest/test_auto_scheduler_search_policy.py b/tests/python/unittest/test_auto_scheduler_search_policy.py
index c96dc63fec29..30aafbd22390 100644
--- a/tests/python/unittest/test_auto_scheduler_search_policy.py
+++ b/tests/python/unittest/test_auto_scheduler_search_policy.py
@@ -183,6 +183,32 @@ def test_sketch_search_policy_zero_rank():
         search_common(task, runner=measure_ctx.runner)
 
 
+@tvm.testing.requires_llvm
+def test_sketch_search_policy_custom_sketch():
+    def meet_condition_func(search_policy, state, stage_id):
+        return auto_scheduler.PreloadCustomSketchRule.APPLY_AND_SKIP_REST
+
+    def apply_func(search_policy, state, stage_id):
+        ret = []
+        state = auto_scheduler.loop_state.State(state, search_policy.search_task.compute_dag)
+        C = state.stage_ops[2]
+
+        ret.append([state.state_object, -1])
+
+        s1 = state.copy()
+        i, _, _ = s1[C].iters
+        s1.split(C, i, [8])
+        ret.append([s1.state_object, -1])
+        return ret
+
+    search_common(
+        cost_model=auto_scheduler.XGBModel(),
+        init_search_callbacks=[
+            auto_scheduler.PreloadCustomSketchRule(meet_condition_func, apply_func)
+        ],
+    )
+
+
 if __name__ == "__main__":
     test_workload_registry_empty_policy()
     test_sketch_search_policy_basic()
@@ -191,3 +217,4 @@ def test_sketch_search_policy_zero_rank():
     test_sketch_search_policy_cuda_rpc_runner()
     test_sketch_search_policy_cuda_xgbmodel_rpc_runner()
     test_sketch_search_policy_zero_rank()
+    test_sketch_search_policy_custom_sketch()
diff --git a/tests/python/unittest/test_auto_scheduler_sketch_generation.py b/tests/python/unittest/test_auto_scheduler_sketch_generation.py
index ddff6dd1a8d6..f3be6c0bc518 100644
--- a/tests/python/unittest/test_auto_scheduler_sketch_generation.py
+++ b/tests/python/unittest/test_auto_scheduler_sketch_generation.py
@@ -36,9 +36,13 @@
 )
 
 
-def generate_sketches(workload_func, args, target, print_for_debug=False):
+def generate_sketches(
+    workload_func, args, target, print_for_debug=False, init_search_callbacks=None
+):
     task = auto_scheduler.SearchTask(func=workload_func, args=args, target=target)
-    policy = auto_scheduler.SketchPolicy(task, verbose=0)
+    policy = auto_scheduler.SketchPolicy(
+        task, verbose=0, init_search_callbacks=init_search_callbacks
+    )
     return policy.generate_sketches(print_for_debug)
 
 
@@ -259,6 +263,42 @@ def test_cpu_zero_rank_sketch():
     assert len(sketches) == 3
 
 
+def test_cpu_custom_sketch():
+    def meet_condition_func(search_policy, state, stage_id):
+        return auto_scheduler.PreloadCustomSketchRule.APPLY_AND_SKIP_REST
+
+    def apply_func(search_policy, state, stage_id):
+        ret = []
+        state = auto_scheduler.loop_state.State(state, search_policy.search_task.compute_dag)
+        C = state.stage_ops[2]
+
+        ret.append([state.state_object, -1])
+
+        s1 = state.copy()
+        i, _, _ = s1[C].iters
+        s1.split(C, i, [8, 2])
+        ret.append([s1.state_object, -1])
+        return ret
+
+    sketches = generate_sketches(
+        matmul_auto_scheduler_test,
+        (512, 512, 512),
+        "llvm",
+        init_search_callbacks=[
+            auto_scheduler.PreloadCustomSketchRule(meet_condition_func, apply_func)
+        ],
+    )
+    assert len(sketches) == 2
+    assert sketches[0].stages[2].iters[0].range.extent == 512
+    assert sketches[0].stages[2].iters[1].range.extent == 512
+    assert sketches[0].stages[2].iters[2].range.extent == 512
+    assert sketches[1].stages[2].iters[0].range.extent == 32
+    assert sketches[1].stages[2].iters[1].range.extent == 8
+    assert sketches[1].stages[2].iters[2].range.extent == 2
+    assert sketches[1].stages[2].iters[3].range.extent == 512
+    assert sketches[1].stages[2].iters[4].range.extent == 512
+
+
 @tvm.testing.requires_cuda
 def test_cuda_matmul_sketch():
     sketches = generate_sketches(matmul_auto_scheduler_test, (512, 512, 512), "cuda")
@@ -407,6 +447,7 @@ def test_cuda_zero_rank_sketch():
     test_cpu_softmax_sketch()
     test_cpu_conv2d_winograd_sketch()
     test_cpu_zero_rank_sketch()
+    test_cpu_custom_sketch()
     test_cuda_matmul_sketch()
     test_cuda_conv2d_bn_relu_sketch()
     test_cuda_max_pool2d_sketch()

From 7340c02d0efe0f5eb5692fb9f4cc7573c5d056cb Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Tue, 19 Jan 2021 02:00:11 -0500
Subject: [PATCH 086/357] [TIR][REFACTOR] ForNode introduce thread binding and
 remove legacy field (#7306)

[TIR][REFACTOR] ForNode update

- Remove deprecated device_api.
- Add ThreadBinding for_type.
- Add additional annotations.

More style consistency refactor to make the ForNode
to be consistent with rest of the codebase.

- ForType => ForKind
- Add constant prefix k to enum consts per Google C style
- Introduce ForKind to the python side.
---
 include/tvm/tir/stmt.h                        | 85 ++++++++++++-------
 python/tvm/script/scope_handler.py            |  8 +-
 python/tvm/te/hybrid/calls.py                 | 20 ++---
 python/tvm/te/hybrid/parser.py                | 14 +--
 python/tvm/tir/__init__.py                    |  2 +-
 python/tvm/tir/ir_builder.py                  | 24 +++---
 python/tvm/tir/stmt.py                        | 58 ++++++++++---
 python/tvm/topi/cuda/nms.py                   |  8 +-
 python/tvm/topi/cuda/rcnn/proposal.py         |  6 +-
 python/tvm/topi/cuda/sparse.py                | 22 ++---
 python/tvm/topi/nn/sparse.py                  | 12 +--
 python/tvm/topi/sparse/csrmm.py               |  4 +-
 python/tvm/topi/sparse/csrmv.py               |  2 +-
 python/tvm/topi/sparse/dense.py               |  8 +-
 python/tvm/topi/vision/rcnn/proposal.py       |  6 +-
 python/tvm/topi/x86/scatter.py                |  2 +-
 src/auto_scheduler/feature.cc                 | 14 +--
 src/autotvm/feature_visitor.cc                | 14 +--
 src/printer/tir_text_printer.cc               | 19 +++--
 src/printer/tvmscript_printer.cc              | 21 +++--
 src/target/llvm/codegen_cpu.cc                | 11 +--
 src/target/llvm/codegen_llvm.cc               |  4 +-
 src/target/source/codegen_cuda.cc             |  2 +-
 src/target/spirv/codegen_spirv.cc             |  2 +-
 src/te/operation/hybrid_op.cc                 | 27 +++---
 src/te/operation/op_utils.cc                  | 36 ++++----
 src/te/operation/op_utils.h                   | 10 +--
 ...hedule_postproc_rewrite_for_tensor_core.cc |  3 +-
 src/tir/ir/stmt.cc                            | 36 ++++----
 src/tir/transforms/combine_context_call.cc    |  2 +-
 src/tir/transforms/inject_double_buffer.cc    |  3 +-
 src/tir/transforms/inject_prefetch.cc         |  4 +-
 src/tir/transforms/inject_virtual_thread.cc   |  7 +-
 src/tir/transforms/ir_utils.cc                |  3 +-
 src/tir/transforms/loop_partition.cc          |  4 +-
 src/tir/transforms/make_packed_api.cc         |  4 +-
 src/tir/transforms/narrow_datatype.cc         |  4 +-
 src/tir/transforms/storage_flatten.cc         |  4 +-
 src/tir/transforms/storage_rewrite.cc         |  8 +-
 src/tir/transforms/unroll_loop.cc             | 10 +--
 src/tir/transforms/vectorize_loop.cc          | 13 +--
 .../unittest/test_arith_domain_touched.py     |  6 +-
 .../test_runtime_module_based_interface.py    |  3 +-
 .../unittest/test_runtime_module_load.py      |  6 +-
 .../unittest/test_target_codegen_cuda.py      |  2 +-
 .../unittest/test_target_codegen_llvm.py      |  2 +-
 .../test_target_codegen_static_init.py        |  2 +-
 .../unittest/test_target_codegen_vm_basic.py  |  2 +-
 .../python/unittest/test_te_hybrid_script.py  |  6 +-
 tests/python/unittest/test_tir_constructor.py |  2 +-
 tests/python/unittest/test_tir_nodes.py       |  2 +-
 .../test_tir_transform_remove_no_op.py        | 11 +--
 .../test_tir_transform_storage_rewrite.py     |  4 +-
 .../test_tir_transform_unroll_loop.py         |  8 +-
 .../unittest/test_tir_transform_vectorize.py  | 16 ++--
 tutorials/dev/low_level_custom_pass.py        |  4 +-
 vta/python/vta/transform.py                   | 12 ++-
 57 files changed, 359 insertions(+), 275 deletions(-)

diff --git a/include/tvm/tir/stmt.h b/include/tvm/tir/stmt.h
index 2b7f1e67bda5..093d49ca2dd4 100644
--- a/include/tvm/tir/stmt.h
+++ b/include/tvm/tir/stmt.h
@@ -752,23 +752,34 @@ class Evaluate : public Stmt {
   TVM_DEFINE_OBJECT_REF_METHODS(Evaluate, Stmt, EvaluateNode);
 };
 
-/*! \brief Additional annotation of for loop. */
-enum class ForType : int {
-  /*! \brief serial execution. */
-  Serial = 0,
-  /*! \brief parallel execution on CPU. */
-  Parallel = 1,
-  /*! \brief Vector SIMD loop annotaion. */
-  Vectorized = 2,
-  /*! \brief Unroll annotation. */
-  Unrolled = 3
+/*!
+ * \brief The kind of the loop.
+ *
+ *  ForKind can change the control flow semantics
+ *  of the loop. So the kind field needs to be considered
+ *  in all TIR passes.
+ */
+enum class ForKind : int {
+  /*! \brief default semantics -- serial execution. */
+  kSerial = 0,
+  /*! \brief Parallel execution on CPU. */
+  kParallel = 1,
+  /*!
+   * \brief Vector SIMD loop.
+   *  The loop body will be vectorized.
+   */
+  kVectorized = 2,
+  /*! \brief The loop body must be unrolled. */
+  kUnrolled = 3,
+  /*!
+   * \brief The loop variable is bound to a thread in
+   * an environment. In the final stage of lowering,
+   * the loop is simply removed and the loop variable is
+   * mapped to the corresponding context thread.
+   */
+  kThreadBinding = 4
 };
 
-// Kevice api of for loop
-// kept for backward compatibility
-// consider refactor and remove later.
-enum class DeviceAPI : int { None = 0 };
-
 /*!
  * \brief A for loop, with poissible type annotations.
  *
@@ -787,39 +798,50 @@ class ForNode : public StmtNode {
   PrimExpr min;
   /*! \brief The extent of the iteration. */
   PrimExpr extent;
-  /*! \brief The type of the for loop. */
-  ForType for_type;
-  /*!
-   * \brief Deprecated, reserved for backward compatibility.
-   *  Consider refactor and remove later.
-   */
-  DeviceAPI device_api;
+  /*! \brief The kind of the for loop. */
+  ForKind kind;
   /*! \brief The body of the for loop. */
   Stmt body;
+  /*!
+   * \brief Only valid when kind == ForKind::kThreadBinding
+   * The context thread that this loop variable bounds to.
+   */
+  Optional<IterVar> thread_binding;
+  /*!
+   * \brief Additional annotations about the loop.
+   *
+   *  These annotations can be used as auxiliary hint
+   *  to future transformations. An annotation should
+   *  not change the control flow semantics of the loop
+   *  and can be ignored in most passes.
+   */
+  Map<String, ObjectRef> annotations;
 
   void VisitAttrs(AttrVisitor* v) {
     v->Visit("loop_var", &loop_var);
     v->Visit("min", &min);
     v->Visit("extent", &extent);
-    v->Visit("for_type", &for_type);
-    v->Visit("device_api", &device_api);
+    v->Visit("kind", &kind);
     v->Visit("body", &body);
+    v->Visit("thread_binding", &thread_binding);
+    v->Visit("annotations", &annotations);
     v->Visit("span", &span);
   }
 
   bool SEqualReduce(const ForNode* other, SEqualReducer equal) const {
     return equal.DefEqual(loop_var, other->loop_var) && equal(min, other->min) &&
-           equal(extent, other->extent) && equal(for_type, other->for_type) &&
-           equal(device_api, other->device_api) && equal(body, other->body);
+           equal(extent, other->extent) && equal(kind, other->kind) && equal(body, other->body) &&
+           equal(thread_binding, other->thread_binding) && equal(annotations, other->annotations);
   }
 
   void SHashReduce(SHashReducer hash_reduce) const {
     hash_reduce.DefHash(loop_var);
     hash_reduce(min);
     hash_reduce(extent);
-    hash_reduce(for_type);
-    hash_reduce(device_api);
+    hash_reduce(kind);
     hash_reduce(body);
+    hash_reduce(thread_binding);
+    hash_reduce(annotations);
   }
 
   static constexpr const char* _type_key = "tir.For";
@@ -832,8 +854,9 @@ class ForNode : public StmtNode {
  */
 class For : public Stmt {
  public:
-  TVM_DLL For(Var loop_var, PrimExpr min, PrimExpr extent, ForType for_type, DeviceAPI device_api,
-              Stmt body, Span span = Span());
+  TVM_DLL For(Var loop_var, PrimExpr min, PrimExpr extent, ForKind kind, Stmt body,
+              Optional<IterVar> thread_binding = NullOpt,
+              Map<String, ObjectRef> annotations = Map<String, ObjectRef>(), Span span = Span());
 
   TVM_DEFINE_OBJECT_REF_METHODS(For, Stmt, ForNode);
 };
@@ -1015,7 +1038,7 @@ inline bool IsPragmaKey(const std::string& attr_key) {
 TVM_DLL PrimExpr TypeAnnotation(DataType dtype, Span span = Span());
 
 // overload printing of for type.
-TVM_DLL std::ostream& operator<<(std::ostream& os, ForType for_type);
+TVM_DLL std::ostream& operator<<(std::ostream& os, ForKind kind);
 
 }  // namespace tir
 }  // namespace tvm
diff --git a/python/tvm/script/scope_handler.py b/python/tvm/script/scope_handler.py
index 21ed7f6e4682..9449cbdc156c 100644
--- a/python/tvm/script/scope_handler.py
+++ b/python/tvm/script/scope_handler.py
@@ -226,7 +226,7 @@ def serial(begin, end, span):
                 self.context.report_error("Expect exact 1 loop var", span)
             ana = tvm.arith.Analyzer()
             extent = end if begin == 0 else ana.simplify(end - begin)
-            return tvm.tir.For(self.loop_vars[0], begin, extent, 0, 0, self.body, span=span)
+            return tvm.tir.For(self.loop_vars[0], begin, extent, 0, self.body, span=span)
 
         super().__init__(serial)
 
@@ -241,7 +241,7 @@ def parallel(begin, end, span):
                 self.context.report_error("Expect exact 1 loop var")
             ana = tvm.arith.Analyzer()
             extent = end if begin == 0 else ana.simplify(end - begin)
-            return tvm.tir.For(self.loop_vars[0], begin, extent, 1, 0, self.body, span=span)
+            return tvm.tir.For(self.loop_vars[0], begin, extent, 1, self.body, span=span)
 
         super().__init__(parallel)
 
@@ -256,7 +256,7 @@ def vectorized(begin, end, span):
                 self.context.report_error("Expect exact 1 loop var")
             ana = tvm.arith.Analyzer()
             extent = end if begin == 0 else ana.simplify(end - begin)
-            return tvm.tir.For(self.loop_vars[0], begin, extent, 2, 0, self.body, span=span)
+            return tvm.tir.For(self.loop_vars[0], begin, extent, 2, self.body, span=span)
 
         super().__init__(vectorized)
 
@@ -271,6 +271,6 @@ def unroll(begin, end, span):
                 self.context.report_error("Expect exact 1 loop var")
             ana = tvm.arith.Analyzer()
             extent = end if begin == 0 else ana.simplify(end - begin)
-            return tvm.tir.For(self.loop_vars[0], begin, extent, 3, 0, self.body, span=span)
+            return tvm.tir.For(self.loop_vars[0], begin, extent, 3, self.body, span=span)
 
         super().__init__(unroll)
diff --git a/python/tvm/te/hybrid/calls.py b/python/tvm/te/hybrid/calls.py
index 761189115050..6785457c3bd7 100644
--- a/python/tvm/te/hybrid/calls.py
+++ b/python/tvm/te/hybrid/calls.py
@@ -23,18 +23,18 @@
 from tvm.target import Target
 from tvm.tir import expr as _expr
 from tvm.tir import call_intrin
-from tvm.tir.stmt import For
+from tvm.tir.stmt import ForKind
 
 from .utils import _internal_assert
 
 # pylint: disable=redefined-builtin,invalid-name
 
 LOOP_INTRIN = {
-    "range": For.Serial,
-    "unroll": For.Unrolled,
-    "parallel": For.Parallel,
-    "vectorize": For.Vectorized,
-    "const_range": (For.Unrolled,),
+    "range": ForKind.SERIAL,
+    "unroll": ForKind.UNROLLED,
+    "parallel": ForKind.PARALLEL,
+    "vectorize": ForKind.VECTORIZED,
+    "const_range": (ForKind.UNROLLED,),
 }
 
 
@@ -48,9 +48,9 @@ def _range(annotation, args):
         low, ext = args[0], args[1]
     if not tvm.tir.analysis.expr_deep_equal(low, const(0, dtype="int32")):
         ext = ext - low
-    for_type = LOOP_INTRIN[annotation]
+    kind = LOOP_INTRIN[annotation]
     iter_var = None
-    return iter_var, low, ext, for_type
+    return iter_var, low, ext, kind
 
 
 range = unroll = vectorize = parallel = const_range = _range  # pylint: disable=invalid-name
@@ -63,8 +63,8 @@ def bind(func_id, args):
     _internal_assert(isinstance(args[0], str), "A loop bind's first argument should be a string!")
     low, ext = const(0, "int32"), args[1]
     iter_var = tvm.te.thread_axis((low, ext), args[0])
-    for_type = None
-    return iter_var, low, ext, for_type
+    kind = None
+    return iter_var, low, ext, kind
 
 
 def _math_intrin(func_id, args):
diff --git a/python/tvm/te/hybrid/parser.py b/python/tvm/te/hybrid/parser.py
index d47b2ee879fc..7bb85e3da83c 100644
--- a/python/tvm/te/hybrid/parser.py
+++ b/python/tvm/te/hybrid/parser.py
@@ -480,14 +480,14 @@ def visit_Call(self, node):
         return op
 
     def visit_For(self, node):
-        iter_var, low, ext, for_type = self.visit(node.iter)
+        iter_var, low, ext, kind = self.visit(node.iter)
         _internal_assert(
             isinstance(node.target, ast.Name), "The loop iterator should be a variable!"
         )
 
         _name = node.target.id
 
-        if isinstance(for_type, tuple):
+        if isinstance(kind, tuple):
             low = self.analyzer.simplify(low)
             ext = self.analyzer.simplify(ext)
             _internal_assert(
@@ -511,14 +511,14 @@ def visit_For(self, node):
             return concat_list_to_block(bodies)
 
         if iter_var is None:
-            _internal_assert(for_type is not None, "The loop iterating function parse error!")
+            _internal_assert(kind is not None, "The loop iterating function parse error!")
             offset = iter_var = tvm.te.var(_name)
             if not tvm.tir.analysis.expr_deep_equal(low, tvm.runtime.const(0, "int32")):
                 offset = iter_var + low
             self.add_symbol(_name, Symbol.LoopVar, offset)
             _body = visit_list_to_block(self.visit, node.body)
         else:
-            _internal_assert(for_type is None, "The loop bind function parse error!")
+            _internal_assert(kind is None, "The loop bind function parse error!")
             self.add_symbol(_name, Symbol.ThreadBind, iter_var)
             self.device += 1
             _body = visit_list_to_block(self.visit, node.body)
@@ -526,13 +526,13 @@ def visit_For(self, node):
 
         _body = self.wrap_up_realize(node, _body)
 
-        if for_type is None:
+        if kind is None:
             res = _body
         else:
             _internal_assert(
-                not isinstance(for_type, tuple), "Micro expansion should be handled before!"
+                not isinstance(kind, tuple), "Micro expansion should be handled before!"
             )
-            res = tvm.tir.For(iter_var, tvm.runtime.const(0, "int32"), ext, for_type, 0, _body)
+            res = tvm.tir.For(iter_var, tvm.runtime.const(0, "int32"), ext, kind, _body)
 
         self.symbols.pop(_name)
         return res
diff --git a/python/tvm/tir/__init__.py b/python/tvm/tir/__init__.py
index 901c89ed9106..324c4daf19ba 100644
--- a/python/tvm/tir/__init__.py
+++ b/python/tvm/tir/__init__.py
@@ -27,7 +27,7 @@
 from .expr import Select, BufferLoad, ProducerLoad, Load, Ramp, Broadcast, Shuffle
 from .expr import Call, CallEffectKind, Let, IterVar, Any
 
-from .stmt import Stmt, LetStmt, AssertStmt, For
+from .stmt import Stmt, LetStmt, AssertStmt, ForKind, For
 from .stmt import BufferStore, BufferRealize, Store, ProducerStore, Allocate, AttrStmt
 from .stmt import ProducerRealize, SeqStmt
 from .stmt import IfThenElse, Evaluate, Prefetch, stmt_seq, stmt_list
diff --git a/python/tvm/tir/ir_builder.py b/python/tvm/tir/ir_builder.py
index 6dcc8580a221..437e8f6610f4 100644
--- a/python/tvm/tir/ir_builder.py
+++ b/python/tvm/tir/ir_builder.py
@@ -206,7 +206,7 @@ def scope_attr(self, node, attr_key, value):
             value = op.max(1, value)
         self.emit(lambda x: _stmt.AttrStmt(node, attr_key, value, x))
 
-    def for_range(self, begin, end, name="i", dtype="int32", for_type="serial"):
+    def for_range(self, begin, end, name="i", dtype="int32", kind="serial"):
         """Create a for iteration scope.
 
         Parameters
@@ -224,7 +224,7 @@ def for_range(self, begin, end, name="i", dtype="int32", for_type="serial"):
         dtype : str, optional
             The data type of iteration variable.
 
-        for_type : str, optional
+        kind : str, optional
             The special tag on the for loop.
 
         Returns
@@ -249,17 +249,17 @@ def for_range(self, begin, end, name="i", dtype="int32", for_type="serial"):
         extent = end if begin == 0 else (end - begin)
 
         def _exit_cb():
-            if for_type == "serial":
-                for_type_id = 0
-            elif for_type == "parallel":
-                for_type_id = 1
-            elif for_type == "vectorize":
-                for_type_id = 2
-            elif for_type == "unroll":
-                for_type_id = 3
+            if kind == "serial":
+                kind_id = _stmt.ForKind.SERIAL
+            elif kind == "parallel":
+                kind_id = _stmt.ForKind.PARALLEL
+            elif kind == "vectorize":
+                kind_id = _stmt.ForKind.VECTORIZED
+            elif kind == "unroll":
+                kind_id = _stmt.ForKind.UNROLLED
             else:
-                raise ValueError("Unknown for_type")
-            self.emit(_stmt.For(loop_var, begin, extent, for_type_id, 0, self._pop_seq()))
+                raise ValueError("Unknown kind")
+            self.emit(_stmt.For(loop_var, begin, extent, kind_id, self._pop_seq()))
 
         return WithScope(loop_var, _exit_cb)
 
diff --git a/python/tvm/tir/stmt.py b/python/tvm/tir/stmt.py
index 6857b68c261d..9e1ef56cca58 100644
--- a/python/tvm/tir/stmt.py
+++ b/python/tvm/tir/stmt.py
@@ -26,6 +26,7 @@
     assert isinstance(st, tvm.tir.stmt.Store)
     assert(st.buffer_var == a)
 """
+from enum import IntEnum
 import tvm._ffi
 
 from tvm.runtime import Object
@@ -82,6 +83,22 @@ def __init__(self, condition, message, body, span=None):
         self.__init_handle_by_constructor__(_ffi_api.AssertStmt, condition, message, body, span)
 
 
+class ForKind(IntEnum):
+    """The kind of the for loop.
+
+    note
+    ----
+    ForKind can change the control flow semantics
+    of the loop and need to be considered in all TIR passes.
+    """
+
+    SERIAL = 0
+    PARALLEL = 1
+    VECTORIZED = 2
+    UNROLLED = 3
+    THREAD_BINDING = 4
+
+
 @tvm._ffi.register_object("tir.For")
 class For(Stmt):
     """For node.
@@ -97,27 +114,44 @@ class For(Stmt):
     extent : PrimExpr
         The length of the loop.
 
-    for_type : int
-        The for type.
-
-    device_api : int
-        The device api type.
+    kind : ForKind
+        The type of the for.
 
     body : Stmt
         The body statement.
 
+    thread_binding: Optional[tir.IterVar]
+        The thread this loop binds to. Only valid
+        if kind is ThreadBinding
+
+    annotations: tvm.ir.Map
+        Additional annotation hints.
+
     span : Optional[Span]
         The location of this itervar in the source code.
     """
 
-    Serial = 0
-    Parallel = 1
-    Vectorized = 2
-    Unrolled = 3
-
-    def __init__(self, loop_var, min_val, extent, for_type, device_api, body, span=None):
+    def __init__(
+        self,
+        loop_var,
+        min_val,
+        extent,
+        kind,
+        body,
+        thread_binding=None,
+        annotations=None,
+        span=None,
+    ):
         self.__init_handle_by_constructor__(
-            _ffi_api.For, loop_var, min_val, extent, for_type, device_api, body, span
+            _ffi_api.For,
+            loop_var,
+            min_val,
+            extent,
+            kind,
+            body,
+            thread_binding,
+            annotations,
+            span,
         )
 
 
diff --git a/python/tvm/topi/cuda/nms.py b/python/tvm/topi/cuda/nms.py
index 6f3ed789ffc1..0c01cc9fbbdf 100644
--- a/python/tvm/topi/cuda/nms.py
+++ b/python/tvm/topi/cuda/nms.py
@@ -580,7 +580,7 @@ def calculate_overlap(out_tensor, box_a_idx, box_b_idx):
             j = bx * max_threads + tx
             with ib.if_scope(j < nkeep):
                 src_idx = base_src_idx + sorted_index[i * num_anchors + j] * box_data_length
-                with ib.for_range(0, 4, for_type="unroll") as k:
+                with ib.for_range(0, 4, kind="unroll") as k:
                     out_bboxes[(base_bbox_idx + j * 4 + k)] = data[src_idx + coord_start + k]
 
                 out_scores[i * num_anchors + j] = data[src_idx + score_index]
@@ -593,7 +593,7 @@ def calculate_overlap(out_tensor, box_a_idx, box_b_idx):
                 # Only needed for return_indices = False case
                 if return_indices is False:
                     with ib.if_scope(j < num_anchors):
-                        with ib.for_range(0, 4, for_type="unroll") as k:
+                        with ib.for_range(0, 4, kind="unroll") as k:
                             out_bboxes[(base_bbox_idx + j * 4 + k)] = -1.0
 
                         out_scores[i, j] = -1.0
@@ -609,7 +609,7 @@ def calculate_overlap(out_tensor, box_a_idx, box_b_idx):
             with ib.if_scope(j < valid_count[i]):
                 src_offset = base_src_idx + j * box_data_length
 
-                with ib.for_range(0, 4, for_type="unroll") as k:
+                with ib.for_range(0, 4, kind="unroll") as k:
                     out_bboxes[base_bbox_idx + j * 4 + k] = data[src_offset + coord_start + k]
                 out_scores[i * num_anchors + j] = data[src_offset + score_index]
 
@@ -855,7 +855,7 @@ def ir(out_bboxes, out_scores, out_class_ids, out):
             i = by
 
             with ib.if_scope(tid < num_anchors):
-                with ib.for_range(0, 4, for_type="unroll") as j:
+                with ib.for_range(0, 4, kind="unroll") as j:
                     out[i, tid, coord_start + j] = out_bboxes[i, tid, j]
                 out[i, tid, score_index] = out_scores[i, tid]
                 if id_index >= 0:
diff --git a/python/tvm/topi/cuda/rcnn/proposal.py b/python/tvm/topi/cuda/rcnn/proposal.py
index 5b7884c7363b..e5e83b4911a3 100644
--- a/python/tvm/topi/cuda/rcnn/proposal.py
+++ b/python/tvm/topi/cuda/rcnn/proposal.py
@@ -181,7 +181,7 @@ def argsort_ir(data_buf, out_index_buf):
 
     idxm = tvm.tir.indexmod
 
-    with ib.for_range(0, batch, for_type="unroll") as b:
+    with ib.for_range(0, batch, kind="unroll") as b:
         start = b * num_bbox
         for i in range(2):
             bbox_id = tid * 2 + i
@@ -259,7 +259,7 @@ def calculate_overlap(out_tensor, box_a_idx, box_b_idx):
     ib.scope_attr(tx, "thread_extent", nthread_tx)
     ib.scope_attr(bx, "thread_extent", nthread_bx)
     i = bx * max_threads + tx
-    with ib.for_range(0, batch, for_type="unroll", name="n") as b:
+    with ib.for_range(0, batch, kind="unroll", name="n") as b:
         base_idx = b * num_bbox
         with ib.if_scope(i < num_bbox):
             p_out[base_idx + i] = False
@@ -323,7 +323,7 @@ def prepare_output_ir(sorted_bbox_buf, remove_mask_buf, out_buf):
                     tvm.tir.all(i[0] < rpn_post_nms_top_n, p_remove[(b * num_bbox + j)] == False)
                 ):
                     p_out[offset_i] = tvm.tir.Cast("float32", b)
-                    with ib.for_range(0, 4, for_type="unroll") as k:
+                    with ib.for_range(0, 4, kind="unroll") as k:
                         p_out[offset_i + k + 1] = p_sorted_bbox[offset_j + k]
                     i[0] = i[0] + 1
 
diff --git a/python/tvm/topi/cuda/sparse.py b/python/tvm/topi/cuda/sparse.py
index f2cecacbc618..cb61d9686919 100644
--- a/python/tvm/topi/cuda/sparse.py
+++ b/python/tvm/topi/cuda/sparse.py
@@ -228,8 +228,8 @@ def gen_ir(data, w_data, w_indices, w_indptr, out):
         )
 
         # zero block
-        with ib.for_range(0, bs_m, name="x", for_type="unroll") as x:
-            with ib.for_range(0, bs_n, name="y", for_type="unroll") as y:
+        with ib.for_range(0, bs_m, name="x", kind="unroll") as x:
+            with ib.for_range(0, bs_n, name="y", kind="unroll") as y:
                 block[x, y] = 0.0
         # compute into thread local storage using warp_size chunks
         with ib.for_range(0, rowlength_bo, name="bb") as bb:
@@ -240,26 +240,26 @@ def gen_ir(data, w_data, w_indices, w_indptr, out):
             # each thread has a row
             # TODO: ideally we could vectorize this
             with ib.for_range(0, rowlength_bi, name="bi") as bi:
-                with ib.for_range(0, bs_m, name="x", for_type="unroll") as x:
-                    with ib.for_range(0, bs_k, name="z", for_type="unroll") as z:
+                with ib.for_range(0, bs_m, name="x", kind="unroll") as x:
+                    with ib.for_range(0, bs_k, name="z", kind="unroll") as z:
                         # This memory acces should be out of bounds when
                         # m_index >= mb (which occurs when the dense matrix
                         # rows % 32 != 0), but it seems to work just fine...
                         data_cache[bi, x, z] = data_ptr[indices[bi] * bs_k + z, m_index * bs_m + x]
             # cache w_data
             elem_idx = bb * rowlength_bi + tx
-            with ib.for_range(0, bs_n, name="y", for_type="unroll") as y:
-                with ib.for_range(0, bs_k, name="z", for_type="unroll") as z:
+            with ib.for_range(0, bs_n, name="y", kind="unroll") as y:
+                with ib.for_range(0, bs_k, name="z", kind="unroll") as z:
                     w_data_cache[tx, y, z] = w_data_ptr[row_start + elem_idx, y, z]
             with ib.for_range(0, mi, name="i") as i:
                 # thread local block matmul
-                with ib.for_range(0, bs_m, name="x", for_type="unroll") as x:
-                    with ib.for_range(0, bs_n, name="y", for_type="unroll") as y:
-                        with ib.for_range(0, bs_k, name="z", for_type="unroll") as z:
+                with ib.for_range(0, bs_m, name="x", kind="unroll") as x:
+                    with ib.for_range(0, bs_n, name="y", kind="unroll") as y:
+                        with ib.for_range(0, bs_k, name="z", kind="unroll") as z:
                             block[x, y] += data_cache[i, x, z] * w_data_cache[i, y, z]
         # store results
-        with ib.for_range(0, bs_m, name="x", for_type="unroll") as x:
-            with ib.for_range(0, bs_n, name="y", for_type="unroll") as y:
+        with ib.for_range(0, bs_m, name="x", kind="unroll") as x:
+            with ib.for_range(0, bs_n, name="y", kind="unroll") as y:
                 with ib.if_scope(m_index < mb):
                     with ib.if_scope(n_index < nb):
                         # It doesn't seem like we would be getting coelesced
diff --git a/python/tvm/topi/nn/sparse.py b/python/tvm/topi/nn/sparse.py
index cdccc80bb5f8..8145ed80af47 100644
--- a/python/tvm/topi/nn/sparse.py
+++ b/python/tvm/topi/nn/sparse.py
@@ -294,26 +294,26 @@ def _csr_transpose_ir(data, indices, indptr, out_data, out_indices, out_indptr):
     n = get_const_tuple(indptr.shape)[0] - 1
     nnz = get_const_tuple(data.shape)[0]
 
-    with irb.for_range(0, n, for_type="parallel", name="col") as col:
+    with irb.for_range(0, n, kind="parallel", name="col") as col:
         out_indptr_ptr[col] = 0
 
-    with irb.for_range(0, nnz, for_type="serial", name="nz_idx") as nz_idx:
+    with irb.for_range(0, nnz, kind="serial", name="nz_idx") as nz_idx:
         out_indptr_ptr[indices_ptr[nz_idx]] += 1
 
     cumsum = irb.allocate("int32", (1,), name="cumsum", scope="local")
     temp = irb.allocate("int32", (1,), name="temp", scope="local")
     cumsum[0] = 0
-    with irb.for_range(0, n, for_type="serial", name="col") as col:
+    with irb.for_range(0, n, kind="serial", name="col") as col:
         temp[0] = out_indptr_ptr[col]
         out_indptr_ptr[col] = cumsum[0]
         cumsum[0] += temp[0]
 
     out_indptr_ptr[n] = nnz
 
-    with irb.for_range(0, n, for_type="serial", name="row") as row:
+    with irb.for_range(0, n, kind="serial", name="row") as row:
         offset = indptr_ptr[row]
         diff = indptr_ptr[row + 1] - indptr_ptr[row]
-        with irb.for_range(0, diff, for_type="serial", name="idx") as idx:
+        with irb.for_range(0, diff, kind="serial", name="idx") as idx:
             real_idx = offset + idx
             col = indices_ptr[real_idx]
             dest = out_indptr_ptr[col]
@@ -325,7 +325,7 @@ def _csr_transpose_ir(data, indices, indptr, out_data, out_indices, out_indptr):
     last = irb.allocate("int32", (1,), name="last", scope="local")
     temp2 = irb.allocate("int32", (1,), name="temp2", scope="local")
     last[0] = 0
-    with irb.for_range(0, n, for_type="serial", name="col") as col:
+    with irb.for_range(0, n, kind="serial", name="col") as col:
         temp2[0] = out_indptr_ptr[col]
         out_indptr_ptr[col] = last[0]
         last[0] = temp2[0]
diff --git a/python/tvm/topi/sparse/csrmm.py b/python/tvm/topi/sparse/csrmm.py
index f578e6001351..39ba3332fc72 100644
--- a/python/tvm/topi/sparse/csrmm.py
+++ b/python/tvm/topi/sparse/csrmm.py
@@ -72,8 +72,8 @@ def csrmm_default_ir(data, indices, indptr, weight, out):
         out_ptr = irb.buffer_ptr(out)
         M = simplify(indptr.shape[0] - 1)
         _, N = weight.shape
-        with irb.for_range(0, N, for_type="vectorize", name="n") as n:
-            with irb.for_range(0, M, for_type="parallel", name="row") as row:
+        with irb.for_range(0, N, kind="vectorize", name="n") as n:
+            with irb.for_range(0, M, kind="parallel", name="row") as row:
                 dot = irb.allocate("float32", (1,), name="dot", scope="local")
                 out_ptr[row * N + n] = 0.0
                 dot[0] = 0.0
diff --git a/python/tvm/topi/sparse/csrmv.py b/python/tvm/topi/sparse/csrmv.py
index afe3bc76d121..a2d22afe01e0 100644
--- a/python/tvm/topi/sparse/csrmv.py
+++ b/python/tvm/topi/sparse/csrmv.py
@@ -63,7 +63,7 @@ def csrmv_default_ir(data, indices, indptr, weight, out):
         weight_ptr = irb.buffer_ptr(weight)
         out_ptr = irb.buffer_ptr(out)
         num_rows = indptr.shape[0] - 1
-        with irb.for_range(0, num_rows, for_type="parallel", name="row") as row:
+        with irb.for_range(0, num_rows, kind="parallel", name="row") as row:
             dot = irb.allocate("float32", (1,), name="dot", scope="local")
             out_ptr[row] = 0.0
             dot[0] = 0.0
diff --git a/python/tvm/topi/sparse/dense.py b/python/tvm/topi/sparse/dense.py
index d1516d0c20fc..5c63e44f691a 100644
--- a/python/tvm/topi/sparse/dense.py
+++ b/python/tvm/topi/sparse/dense.py
@@ -74,8 +74,8 @@ def dense_default_ir(data, indices, indptr, weight, out):
         out_ptr = irb.buffer_ptr(out)
         M = simplify(indptr.shape[0] - 1)
         N, K = weight.shape
-        with irb.for_range(0, N, for_type="vectorize", name="n") as n:
-            with irb.for_range(0, M, for_type="parallel", name="m") as m:
+        with irb.for_range(0, N, kind="vectorize", name="n") as n:
+            with irb.for_range(0, M, kind="parallel", name="m") as m:
                 dot = irb.allocate(dtype, (1,), name="dot", scope="local")
                 out_ptr[m * N + n] = tvm.tir.const(0, dtype)
                 dot[0] = tvm.tir.const(0, dtype)
@@ -153,8 +153,8 @@ def dense_default_ir(data, w_data, w_indices, w_indptr, out):
         out_ptr = irb.buffer_ptr(out)
         M, K = data.shape
         N = simplify(w_indptr.shape[0] - 1)
-        with irb.for_range(0, M, for_type="vectorize", name="m") as m:
-            with irb.for_range(0, N, for_type="parallel", name="n") as n:
+        with irb.for_range(0, M, kind="vectorize", name="m") as m:
+            with irb.for_range(0, N, kind="parallel", name="n") as n:
                 dot = irb.allocate(dtype, (1,), name="dot", scope="local")
                 out_ptr[m * N + n] = tvm.tir.const(0, dtype)
                 dot[0] = tvm.tir.const(0, dtype)
diff --git a/python/tvm/topi/vision/rcnn/proposal.py b/python/tvm/topi/vision/rcnn/proposal.py
index 89726efd5d0e..e15ba8cd27c7 100644
--- a/python/tvm/topi/vision/rcnn/proposal.py
+++ b/python/tvm/topi/vision/rcnn/proposal.py
@@ -208,7 +208,7 @@ def argsort_ir(data_buf, out_index_buf):
     temp_data = ib.allocate("float32", (1,), name="temp_data", scope="local")
     temp_index = ib.allocate("int32", (1,), name="temp_index", scope="local")
     idxm = tvm.tir.indexmod
-    with ib.for_range(0, batch, for_type="unroll") as b:
+    with ib.for_range(0, batch, kind="unroll") as b:
         start = b * num_bbox
         for i in range(2):
             with ib.for_range(0, (num_bbox + 1) // 2) as tid:
@@ -279,7 +279,7 @@ def calculate_overlap(out_tensor, box_a_idx, box_b_idx):
     ib = tvm.tir.ir_builder.create()
     p_data = ib.buffer_ptr(sorted_bbox_buf)
     p_out = ib.buffer_ptr(out_buf)
-    with ib.for_range(0, batch, for_type="unroll", name="n") as b:
+    with ib.for_range(0, batch, kind="unroll", name="n") as b:
         base_idx = b * num_bbox
         for i in range(num_bbox):
             p_out[base_idx + i] = False
@@ -345,7 +345,7 @@ def prepare_output_ir(sorted_bbox_buf, remove_mask_buf, out_buf):
                         )
                     ):
                         p_out[offset_i] = tvm.tir.Cast("float32", b)
-                        with ib.for_range(0, 4, for_type="unroll") as k:
+                        with ib.for_range(0, 4, kind="unroll") as k:
                             p_out[offset_i + k + 1] = p_sorted_bbox[offset_j + k]
                         i[b] = i[b] + 1
 
diff --git a/python/tvm/topi/x86/scatter.py b/python/tvm/topi/x86/scatter.py
index 8147d3a00135..8bb3f57e82e4 100644
--- a/python/tvm/topi/x86/scatter.py
+++ b/python/tvm/topi/x86/scatter.py
@@ -84,7 +84,7 @@ def gen_ir(data_ptr, indices_ptr, out_ptr):
             out[i] = tvm.tir.Cast(data_ptr.dtype, 0)
 
         with ib.for_range(0, fused_indices_dimension) as i:
-            with ib.for_range(0, fused_data_dimension, for_type="parallel") as j:
+            with ib.for_range(0, fused_data_dimension, kind="parallel") as j:
                 offset = fused_data_dimension
                 index = j  # This is x_M, .. x_{N-1} part of the index into out.
                 # Build up the indices[0, y_0, .. y_{K-1}], .. indices[M-1, y_0, .. y_{K-1}] part
diff --git a/src/auto_scheduler/feature.cc b/src/auto_scheduler/feature.cc
index 1b10cd5f2601..cf516d8452e2 100755
--- a/src/auto_scheduler/feature.cc
+++ b/src/auto_scheduler/feature.cc
@@ -618,7 +618,7 @@ class PerStoreFeatureExtractor : public StmtExprVisitor {
       is_gpu_ = true;
 
       // make a fake for node for blockIdx.x or threadIdx.x
-      Stmt fake_for_node = For(var, 0, extent, ForType::Parallel, DeviceAPI::None, node->body);
+      Stmt fake_for_node = For(var, 0, extent, ForKind::kParallel, node->body);
 
       outer_loop_prod_ *= extent;
       for_loop_stack_.push_back(fake_for_node.as<ForNode>());
@@ -642,11 +642,11 @@ class PerStoreFeatureExtractor : public StmtExprVisitor {
   void VisitStmt_(const ForNode* node) final {
     int64_t loop_extent = GetLoopExtent(node);
 
-    if (node->for_type == ForType::Vectorized) {
+    if (node->kind == ForKind::kVectorized) {
       vec_for_stack_.push_back(node);
-    } else if (node->for_type == ForType::Unrolled) {
+    } else if (node->kind == ForKind::kUnrolled) {
       unroll_for_stack_.push_back(node);
-    } else if (node->for_type == ForType::Parallel) {
+    } else if (node->kind == ForKind::kParallel) {
       parallel_for_stack_.push_back(node);
     }
 
@@ -656,11 +656,11 @@ class PerStoreFeatureExtractor : public StmtExprVisitor {
     for_loop_stack_.pop_back();
     outer_loop_prod_ /= loop_extent;
 
-    if (node->for_type == ForType::Vectorized) {
+    if (node->kind == ForKind::kVectorized) {
       vec_for_stack_.pop_back();
-    } else if (node->for_type == ForType::Unrolled) {
+    } else if (node->kind == ForKind::kUnrolled) {
       unroll_for_stack_.pop_back();
-    } else if (node->for_type == ForType::Parallel) {
+    } else if (node->kind == ForKind::kParallel) {
       parallel_for_stack_.pop_back();
     }
   }
diff --git a/src/autotvm/feature_visitor.cc b/src/autotvm/feature_visitor.cc
index 15e09755cee2..59cac9cc9827 100644
--- a/src/autotvm/feature_visitor.cc
+++ b/src/autotvm/feature_visitor.cc
@@ -34,19 +34,23 @@ void FeatureVisitor::VisitStmt_(const ForNode* op) {
   int64_t loop_extent = -1;
   if (extent != nullptr) loop_extent = extent->value;
   AnnotationType ann = kSerial;
-  switch (op->for_type) {
-    case ForType ::Parallel:
+  switch (op->kind) {
+    case ForKind ::kParallel:
       ann = kParallel;
       break;
-    case ForType::Unrolled:
+    case ForKind::kUnrolled:
       ann = kUnrolled;
       break;
-    case ForType::Vectorized:
+    case ForKind::kVectorized:
       ann = kVectorized;
       break;
-    case ForType::Serial:
+    case ForKind::kSerial:
       ann = kSerial;
       break;
+    case ForKind::kThreadBinding:
+      LOG(FATAL) << "Loop ThreadBinding is reserved for future used and "
+                 << "not yet supported in TIR";
+      break;
   }
 
   if (EnterItervar_(op->loop_var, loop_extent, ann)) {
diff --git a/src/printer/tir_text_printer.cc b/src/printer/tir_text_printer.cc
index 107817db29b3..4b0871ae2ce6 100644
--- a/src/printer/tir_text_printer.cc
+++ b/src/printer/tir_text_printer.cc
@@ -465,18 +465,21 @@ Doc TIRTextPrinter::VisitStmt_(const EvaluateNode* op) {
   return doc;
 }
 
-inline const char* ForType2String(ForType t) {
+inline const char* ForKind2String(ForKind t) {
   switch (t) {
-    case ForType::Serial:
+    case ForKind::kSerial:
       return "serial";
-    case ForType::Parallel:
+    case ForKind::kParallel:
       return "parallel";
-    case ForType::Vectorized:
+    case ForKind::kVectorized:
       return "vectorized";
-    case ForType::Unrolled:
+    case ForKind::kUnrolled:
       return "unroll";
+    case ForKind::kThreadBinding:
+      LOG(FATAL) << "Loop ThreadBinding is reserved for future used and "
+                 << "not yet supported in TIR";
   }
-  LOG(FATAL) << "Unknown ForType";
+  LOG(FATAL) << "Unknown ForKind";
   return "Unknown";
 }
 
@@ -484,8 +487,8 @@ Doc TIRTextPrinter::VisitStmt_(const ForNode* op) {
   Doc doc;
   doc << "for (" << Print(op->loop_var) << ", " << Print(op->min) << ", "
       << Print(op->min + op->extent) << ")";
-  if (op->for_type != ForType::Serial) {
-    doc << " " << Doc::StrLiteral(ForType2String(op->for_type));
+  if (op->kind != ForKind::kSerial) {
+    doc << " " << Doc::StrLiteral(ForKind2String(op->kind));
   }
   doc << PrintBody(op->body);
   return doc;
diff --git a/src/printer/tvmscript_printer.cc b/src/printer/tvmscript_printer.cc
index 09f95e44b6d8..86b175e1676c 100644
--- a/src/printer/tvmscript_printer.cc
+++ b/src/printer/tvmscript_printer.cc
@@ -649,27 +649,30 @@ Doc TVMScriptPrinter::VisitStmt_(const EvaluateNode* op) {
   return doc;
 }
 
-inline const char* ForType2String(ForType t) {
+inline const char* ForKind2String(ForKind t) {
   switch (t) {
-    case ForType::Serial:
+    case ForKind::kSerial:
       return "serial";
-    case ForType::Parallel:
+    case ForKind::kParallel:
       return "parallel";
-    case ForType::Vectorized:
+    case ForKind::kVectorized:
       return "vectorized";
-    case ForType::Unrolled:
+    case ForKind::kUnrolled:
       return "unroll";
+    case ForKind::kThreadBinding:
+      LOG(FATAL) << "Loop ThreadBinding is reserved for future used and "
+                 << "not yet supported in TIR";
+      return "threadbinding";
   }
-  LOG(FATAL) << "Unknown ForType";
+  LOG(FATAL) << "Unknown ForKind";
   return "Unknown";
 }
 
 Doc TVMScriptPrinter::VisitStmt_(const ForNode* op) {
   Doc doc;
   var_not_in_headers.insert(op->loop_var.get());
-  doc << "for " << Print(op->loop_var)
-      << " in tir." + std::string(ForType2String(op->for_type)) + "(" << Print(op->min) << ", "
-      << Print(op->min + op->extent)
+  doc << "for " << Print(op->loop_var) << " in tir." + std::string(ForKind2String(op->kind)) + "("
+      << Print(op->min) << ", " << Print(op->min + op->extent)
       << "):" << Doc::Indent(4, Doc::NewLine() << PrintBody(op->body));
   return doc;
 }
diff --git a/src/target/llvm/codegen_cpu.cc b/src/target/llvm/codegen_cpu.cc
index 6143e7050495..e2a8553199f0 100644
--- a/src/target/llvm/codegen_cpu.cc
+++ b/src/target/llvm/codegen_cpu.cc
@@ -976,12 +976,13 @@ void CodeGenCPU::VisitStmt_(const AttrStmtNode* op) {
 
 void CodeGenCPU::VisitStmt_(const ForNode* op) {
   ICHECK(is_zero(op->min));
-  if (op->for_type == ForType::Serial || op->for_type == ForType::Unrolled) {
+  if (op->kind == ForKind::kSerial || op->kind == ForKind::kUnrolled) {
     CodeGenLLVM::VisitStmt_(op);
-  } else if (op->for_type == ForType::Parallel) {
+  } else if (op->kind == ForKind::kParallel) {
     if (parallel_env_.penv == nullptr) {
-      CreateParallelLaunch(
-          For(op->loop_var, op->min, op->extent, op->for_type, op->device_api, op->body), 0);
+      CreateParallelLaunch(For(op->loop_var, op->min, op->extent, op->kind, op->body,
+                               op->thread_binding, op->annotations),
+                           0);
     } else {
       // already in parallel env.
       ICHECK(parallel_env_.task_id.defined());
@@ -1007,7 +1008,7 @@ void CodeGenCPU::VisitStmt_(const ForNode* op) {
       ++parallel_env_.parallel_loop_count;
     }
   } else {
-    LOG(FATAL) << "cannot handle for type " << op->for_type;
+    LOG(FATAL) << "cannot handle for type " << op->kind;
   }
 }
 
diff --git a/src/target/llvm/codegen_llvm.cc b/src/target/llvm/codegen_llvm.cc
index 34f3897cce88..1dd76f6b9d51 100644
--- a/src/target/llvm/codegen_llvm.cc
+++ b/src/target/llvm/codegen_llvm.cc
@@ -1318,11 +1318,11 @@ void CodeGenLLVM::VisitStmt_(const StoreNode* op) {
 void CodeGenLLVM::VisitStmt_(const ForNode* op) {
   ICHECK(is_zero(op->min));
   analyzer_->Bind(op->loop_var, Range::FromMinExtent(op->min, op->extent));
-  if (op->for_type == ForType::Unrolled) {
+  if (op->kind == ForKind::kUnrolled) {
     LOG(WARNING) << "Unroll hint get ignore at CodeGenLLVM backend, "
                  << " consider set unroll_explicit=True";
   } else {
-    ICHECK(op->for_type == ForType::Serial);
+    ICHECK(op->kind == ForKind::kSerial);
   }
   CreateSerialFor(MakeValue(op->min), MakeValue(op->extent),
                   llvm::ConstantInt::getSigned(GetLLVMType(op->extent), 1), op->loop_var, op->body);
diff --git a/src/target/source/codegen_cuda.cc b/src/target/source/codegen_cuda.cc
index 6c73716edc18..e5547315613f 100644
--- a/src/target/source/codegen_cuda.cc
+++ b/src/target/source/codegen_cuda.cc
@@ -84,7 +84,7 @@ std::string CodeGenCUDA::Finish() {
 
 void CodeGenCUDA::VisitStmt_(const tir::ForNode* op) {
   ICHECK(is_const_int(op->min, 0));
-  if (op->for_type == tir::ForType::Unrolled) {
+  if (op->kind == tir::ForKind::kUnrolled) {
     PrintIndent();
     stream << "#pragma unroll\n";
   }
diff --git a/src/target/spirv/codegen_spirv.cc b/src/target/spirv/codegen_spirv.cc
index c3b12ab943c6..51d136d5510e 100644
--- a/src/target/spirv/codegen_spirv.cc
+++ b/src/target/spirv/codegen_spirv.cc
@@ -492,7 +492,7 @@ void CodeGenSPIRV::VisitStmt_(const ForNode* op) {
   loop_var.SetIncoming(0, init_value, init_label);
   spirv::Value loop_cond = builder_->LT(loop_var, extent_value);
   uint32_t control =
-      (op->for_type == ForType::Unrolled ? spv::LoopControlUnrollMask : spv::LoopControlMaskNone);
+      (op->kind == ForKind::kUnrolled ? spv::LoopControlUnrollMask : spv::LoopControlMaskNone);
   builder_->MakeInst(spv::OpLoopMerge, merge_label, continue_label, control);
   builder_->MakeInst(spv::OpBranchConditional, loop_cond, body_label, merge_label,
                      weight_likely_branch_, 1);
diff --git a/src/te/operation/hybrid_op.cc b/src/te/operation/hybrid_op.cc
index 94e06d206ddb..65b8660ca1fb 100644
--- a/src/te/operation/hybrid_op.cc
+++ b/src/te/operation/hybrid_op.cc
@@ -234,9 +234,9 @@ Stmt ApplyLoopShapes(const Stage& stage, const std::unordered_map<IterVar, Range
         PrimExpr cond = likely(outer * factor < (op->extent - inner));
         ret = IfThenElse(cond, ret);
         ret = For(inner->var, PrimExpr(0), inner->dom->extent,
-                  IterVarTypeToForType(inner->iter_type), op->device_api, ret);
+                  IterVarTypeToForKind(inner->iter_type), ret);
         ret = For(outer->var, PrimExpr(0), outer->dom->extent,
-                  IterVarTypeToForType(outer->iter_type), op->device_api, ret);
+                  IterVarTypeToForKind(outer->iter_type), ret);
         splitted = true;
         return ret;
       }
@@ -277,8 +277,8 @@ Stmt ApplyLoopShapes(const Stage& stage, const std::unordered_map<IterVar, Range
         rmap[op->loop_var.get()] = indexdiv(parent, extent);
         body = tir::Substitute(body, rmap);
         under_outer = false;
-        return For(parent->var, PrimExpr(0), extent * op->extent, op->for_type, op->device_api,
-                   body);
+        return For(parent->var, PrimExpr(0), extent * op->extent, op->kind, body,
+                   op->thread_binding, op->annotations);
       } else if (under_outer) {
         Stmt body = this->VisitStmt(op->body);
         std::unordered_map<const VarNode*, PrimExpr> rmap;
@@ -331,8 +331,8 @@ Stmt ApplyLoopAnnotations(const Stage& stage, const std::unordered_map<IterVar,
           Stmt body = tir::Substitute(op->body, rmap);
           return AttrStmt(iter_var, "thread_extent", op->extent, body);
         } else {
-          return For(op->loop_var, op->min, op->extent, IterVarTypeToForType(attr->iter_type),
-                     op->device_api, op->body);
+          return For(op->loop_var, op->min, op->extent, IterVarTypeToForKind(attr->iter_type),
+                     op->body, op->thread_binding, op->annotations);
         }
       }
       return StmtMutator::VisitStmt_(op);
@@ -345,18 +345,18 @@ Stmt ApplyLoopAnnotations(const Stage& stage, const std::unordered_map<IterVar,
 
     const IterVar& actual = rebased.count(iter_var) ? rebased.find(iter_var)->second : iter_var;
     const VarNode* var = actual->var.get();
-    ForType expected = IterVarTypeToForType(iter_var->iter_type);
+    ForKind expected = IterVarTypeToForKind(iter_var->iter_type);
     IterVarAttr attr;
     if (stage->iter_var_attrs.count(iter_var)) {
       attr = stage->iter_var_attrs[iter_var];
-      expected = IterVarTypeToForType(attr->iter_type);
+      expected = IterVarTypeToForKind(attr->iter_type);
     }
 
     PostOrderVisit(stmt, [&found, &var, &attr, &expected, &need_change](const ObjectRef& node) {
       if (const ForNode* op = node.as<ForNode>()) {
         if (op->loop_var.get() == var) {
           ++found;
-          need_change = expected != op->for_type || (attr.defined() && attr->bind_thread.defined());
+          need_change = expected != op->kind || (attr.defined() && attr->bind_thread.defined());
         }
       }
     });
@@ -409,12 +409,13 @@ Stmt ApplyLoopOrder(const Stage& stage, const std::unordered_map<IterVar, Range>
       if (body_.same_as(op->body) && op->loop_var.get() == target->var.get())
         return GetRef<Stmt>(op);
       const Stmt& body = op->body.same_as(body_) ? op->body : body_;
-      ForType for_type = IterVarTypeToForType(target->iter_type);
+      ForKind kind = IterVarTypeToForKind(target->iter_type);
       if (stage->iter_var_attrs.count(target)) {
-        for_type = IterVarTypeToForType(stage->iter_var_attrs[target]->iter_type);
+        kind = IterVarTypeToForKind(stage->iter_var_attrs[target]->iter_type);
       }
       const Range& range = target->dom.defined() ? target->dom : dom_map.find(target)->second;
-      return For(target->var, range->min, range->extent, for_type, DeviceAPI::None, body);
+      return For(target->var, range->min, range->extent, kind, body, op->thread_binding,
+                 op->annotations);
     }
   };
 
@@ -448,7 +449,7 @@ std::vector<IterVar> GatherLoopVars(Stmt stmt) {
     if (const ForNode* op = node.as<ForNode>()) {
       Var loop_var(op->loop_var);
       Range dom = Range::FromMinExtent(op->min, op->extent);
-      res_.push_back(IterVar(dom, loop_var, ForTypeToIterVarType(op->for_type)));
+      res_.push_back(IterVar(dom, loop_var, ForKindToIterVarType(op->kind)));
     }
   });
   std::reverse(res_.begin(), res_.end());
diff --git a/src/te/operation/op_utils.cc b/src/te/operation/op_utils.cc
index f1991c181e67..32ffccbbec1f 100644
--- a/src/te/operation/op_utils.cc
+++ b/src/te/operation/op_utils.cc
@@ -77,7 +77,7 @@ std::vector<std::vector<Stmt> > MakeLoopNest(const Stage& stage,
         var = Var(iv->var->name_hint + ".init", bind_iv->var.dtype());
       }
 
-      ForType for_type = ForType::Serial;
+      ForKind kind = ForKind::kSerial;
       IterVarAttr it_attr;
       if (stage->iter_var_attrs.count(iv)) {
         it_attr = stage->iter_var_attrs[iv];
@@ -85,13 +85,13 @@ std::vector<std::vector<Stmt> > MakeLoopNest(const Stage& stage,
       if (it_attr.defined()) {
         switch (it_attr->iter_type) {
           case kUnrolled:
-            for_type = ForType::Unrolled;
+            kind = ForKind::kUnrolled;
             break;
           case kVectorized:
-            for_type = ForType::Vectorized;
+            kind = ForKind::kVectorized;
             break;
           case kParallelized:
-            for_type = ForType::Parallel;
+            kind = ForKind::kParallel;
             break;
           case kDataPar:
             break;
@@ -115,11 +115,11 @@ std::vector<std::vector<Stmt> > MakeLoopNest(const Stage& stage,
         nest[i + 1].emplace_back(LetStmt(var, cast(var.dtype(), dom->min), no_op));
         value_map[iv] = cast(var.dtype(), dom->min);
       } else if (is_zero(dom->min)) {
-        nest[i + 1].emplace_back(For(var, 0, dom->extent, for_type, DeviceAPI::None, no_op));
+        nest[i + 1].emplace_back(For(var, 0, dom->extent, kind, no_op));
         value_map[iv] = var;
       } else {
         Var idx(bind_iv->var->name_hint + ".idx", bind_iv->var.dtype());
-        nest[i + 1].emplace_back(For(idx, 0, dom->extent, for_type, DeviceAPI::None, no_op));
+        nest[i + 1].emplace_back(For(idx, 0, dom->extent, kind, no_op));
         PrimExpr new_value = dom->min + idx;
         value_map[iv] = new_value;
         nest[i + 1].emplace_back(LetStmt(var, new_value, no_op));
@@ -243,33 +243,33 @@ Stmt Substitute(Stmt s, const std::unordered_map<IterVar, PrimExpr>& value_map)
   return tir::Substitute(s, init);
 }
 
-IterVarType ForTypeToIterVarType(tir::ForType for_type) {
-  switch (for_type) {
-    case ForType::Serial:
+IterVarType ForKindToIterVarType(tir::ForKind kind) {
+  switch (kind) {
+    case ForKind::kSerial:
       return kDataPar;
-    case ForType::Parallel:
+    case ForKind::kParallel:
       return kParallelized;
-    case ForType::Vectorized:
+    case ForKind::kVectorized:
       return kVectorized;
-    case ForType::Unrolled:
+    case ForKind::kUnrolled:
       return kUnrolled;
     default:
       return kDataPar;
   }
 }
 
-tir::ForType IterVarTypeToForType(IterVarType iter_type) {
+tir::ForKind IterVarTypeToForKind(IterVarType iter_type) {
   switch (iter_type) {
     case kDataPar:
-      return ForType::Serial;
+      return ForKind::kSerial;
     case kParallelized:
-      return ForType::Parallel;
+      return ForKind::kParallel;
     case kVectorized:
-      return ForType::Vectorized;
+      return ForKind::kVectorized;
     case kUnrolled:
-      return ForType::Unrolled;
+      return ForKind::kUnrolled;
     default:
-      return ForType::Serial;
+      return ForKind::kSerial;
   }
 }
 
diff --git a/src/te/operation/op_utils.h b/src/te/operation/op_utils.h
index 16f7d96cfa77..e6bf2caae6e0 100644
--- a/src/te/operation/op_utils.h
+++ b/src/te/operation/op_utils.h
@@ -88,16 +88,16 @@ PrimExpr ReplaceTensor(PrimExpr expr, const std::unordered_map<Tensor, Tensor>&
 Stmt Substitute(Stmt stmt, const std::unordered_map<IterVar, PrimExpr>& value_map);
 
 /*!
- * \brief Converts Halide ForType to its corresponding IterVarType
- * \param for_type The ForType to be converted
+ * \brief Converts Halide ForKind to its corresponding IterVarType
+ * \param kind The ForKind to be converted
  */
-IterVarType ForTypeToIterVarType(tir::ForType for_type);
+IterVarType ForKindToIterVarType(tir::ForKind kind);
 
 /*!
- * \brief Converts IterVarType to its corresponding Halide ForType
+ * \brief Converts IterVarType to its corresponding Halide ForKind
  * \param iter_type The IterVarType to be converted
  */
-tir::ForType IterVarTypeToForType(IterVarType iter_type);
+tir::ForKind IterVarTypeToForKind(IterVarType iter_type);
 
 }  // namespace te
 }  // namespace tvm
diff --git a/src/te/schedule/schedule_postproc_rewrite_for_tensor_core.cc b/src/te/schedule/schedule_postproc_rewrite_for_tensor_core.cc
index f81d72e0fe02..74d1a19d2cfe 100644
--- a/src/te/schedule/schedule_postproc_rewrite_for_tensor_core.cc
+++ b/src/te/schedule/schedule_postproc_rewrite_for_tensor_core.cc
@@ -968,7 +968,8 @@ class TensorCoreIRMutator : public StmtExprMutator {
           scaled_extent_value = ori_extent_value / scale_factor;
         }
         PrimExpr scaled_extent = make_const(op->extent.dtype(), scaled_extent_value);
-        stmt = For(op->loop_var, op->min, scaled_extent, op->for_type, op->device_api, op->body);
+        stmt = For(op->loop_var, op->min, scaled_extent, op->kind, op->body, op->thread_binding,
+                   op->annotations);
       }
     }
     return stmt;
diff --git a/src/tir/ir/stmt.cc b/src/tir/ir/stmt.cc
index fd03046376f8..92dc38797544 100644
--- a/src/tir/ir/stmt.cc
+++ b/src/tir/ir/stmt.cc
@@ -128,8 +128,8 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable)
     });
 
 // For
-For::For(Var loop_var, PrimExpr min, PrimExpr extent, ForType for_type, DeviceAPI device_api,
-         Stmt body, Span span) {
+For::For(Var loop_var, PrimExpr min, PrimExpr extent, ForKind kind, Stmt body,
+         Optional<IterVar> thread_binding, Map<String, ObjectRef> annotations, Span span) {
   ICHECK(min.defined());
   ICHECK(extent.defined());
   ICHECK(min.dtype().is_scalar());
@@ -141,36 +141,40 @@ For::For(Var loop_var, PrimExpr min, PrimExpr extent, ForType for_type, DeviceAP
   node->loop_var = std::move(loop_var);
   node->min = std::move(min);
   node->extent = std::move(extent);
-  node->for_type = for_type;
-  node->device_api = device_api;
+  node->kind = kind;
   node->body = std::move(body);
+  node->thread_binding = std::move(thread_binding);
+  node->annotations = std::move(annotations);
   node->span = std::move(span);
   data_ = std::move(node);
 }
 
-TVM_REGISTER_GLOBAL("tir.For").set_body_typed([](Var loop_var, PrimExpr min, PrimExpr extent,
-                                                 int for_type, int device_api, Stmt body,
-                                                 Span span) {
-  return For(loop_var, min, extent, static_cast<ForType>(for_type),
-             static_cast<DeviceAPI>(device_api), body, span);
-});
+TVM_REGISTER_GLOBAL("tir.For").set_body_typed(
+    [](Var loop_var, PrimExpr min, PrimExpr extent, int kind, Stmt body,
+       Optional<IterVar> thread_binding, Optional<Map<String, ObjectRef>> annotations, Span span) {
+      return For(loop_var, min, extent, static_cast<ForKind>(kind), body, thread_binding,
+                 annotations.value_or(Map<String, ObjectRef>()), span);
+    });
 
 TVM_REGISTER_NODE_TYPE(ForNode);
 
-std::ostream& operator<<(std::ostream& out, ForType type) {  // NOLINT(*)
+std::ostream& operator<<(std::ostream& out, ForKind type) {  // NOLINT(*)
   switch (type) {
-    case ForType::Serial:
+    case ForKind::kSerial:
       out << "for";
       break;
-    case ForType::Parallel:
+    case ForKind::kParallel:
       out << "parallel";
       break;
-    case ForType::Unrolled:
+    case ForKind::kUnrolled:
       out << "unrolled";
       break;
-    case ForType::Vectorized:
+    case ForKind::kVectorized:
       out << "vectorized";
       break;
+    case ForKind::kThreadBinding:
+      out << "launch_thread";
+      break;
   }
   return out;
 }
@@ -179,7 +183,7 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable)
     .set_dispatch<ForNode>([](const ObjectRef& node, ReprPrinter* p) {
       auto* op = static_cast<const ForNode*>(node.get());
       p->PrintIndent();
-      p->stream << op->for_type << " (" << op->loop_var << ", ";
+      p->stream << op->kind << " (" << op->loop_var << ", ";
       p->Print(op->min);
       p->stream << ", ";
       p->Print(op->extent);
diff --git a/src/tir/transforms/combine_context_call.cc b/src/tir/transforms/combine_context_call.cc
index 03a0d5e751cf..4a3986460b15 100644
--- a/src/tir/transforms/combine_context_call.cc
+++ b/src/tir/transforms/combine_context_call.cc
@@ -72,7 +72,7 @@ class ContextCallCombiner final : public StmtExprMutator {
   }
 
   Stmt VisitStmt_(const ForNode* op) final {
-    if (op->for_type == ForType::Parallel) {
+    if (op->kind == ForKind::kParallel) {
       // Map of comparison expression to variable
       std::unordered_map<PrimExpr, Var, StructuralHash, StructuralEqual> temp;
       std::swap(temp, ctx_map_);
diff --git a/src/tir/transforms/inject_double_buffer.cc b/src/tir/transforms/inject_double_buffer.cc
index 22a6ca23c24c..7a16c06d8058 100644
--- a/src/tir/transforms/inject_double_buffer.cc
+++ b/src/tir/transforms/inject_double_buffer.cc
@@ -158,8 +158,7 @@ class DoubleBufferInjector : public StmtExprMutator {
           vmap[old_loop->loop_var.get()] = outer_var * factor + make_const(factor.dtype(), i);
           loop_seq.emplace_back(Substitute(old_loop->body, vmap));
         }
-        Stmt loop = For(outer_var, zero, outer_ext, old_loop->for_type, old_loop->device_api,
-                        SeqStmt::Flatten(loop_seq));
+        Stmt loop = For(outer_var, zero, outer_ext, old_loop->kind, SeqStmt::Flatten(loop_seq));
         // tail
         std::vector<Stmt> tail_seq;
         Stmt tail_body = StripDoubleBufferWrite()(old_loop->body);
diff --git a/src/tir/transforms/inject_prefetch.cc b/src/tir/transforms/inject_prefetch.cc
index b5c4cf5ec582..4ce9c7639b77 100644
--- a/src/tir/transforms/inject_prefetch.cc
+++ b/src/tir/transforms/inject_prefetch.cc
@@ -71,11 +71,11 @@ class PrefetchInjector : public StmtMutator {
   Stmt VisitStmt_(const ForNode* op) final {
     auto& var = op->loop_var;
     loop_nest_.push_back(var);
-    if (op->for_type == ForType::Vectorized) {
+    if (op->kind == ForKind::kVectorized) {
       vectorized_[var.get()] = IntSet::Interval(op->min, (op->min + op->extent) - 1);
     }
     Stmt ret = StmtMutator::VisitStmt_(op);
-    if (op->for_type == ForType::Vectorized) {
+    if (op->kind == ForKind::kVectorized) {
       vectorized_.erase(var.get());
     }
     loop_nest_.pop_back();
diff --git a/src/tir/transforms/inject_virtual_thread.cc b/src/tir/transforms/inject_virtual_thread.cc
index 5622d140a625..b24a0e95cd53 100644
--- a/src/tir/transforms/inject_virtual_thread.cc
+++ b/src/tir/transforms/inject_virtual_thread.cc
@@ -303,7 +303,10 @@ class VTInjector : public StmtExprMutator {
     if (extent.same_as(op->extent) && body.same_as(op->body)) {
       return GetRef<Stmt>(op);
     } else {
-      return For(op->loop_var, op->min, extent, op->for_type, op->device_api, body);
+      auto n = CopyOnWrite(op);
+      n->extent = std::move(extent);
+      n->body = std::move(body);
+      return Stmt(n);
     }
   }
   // IfThenElse
@@ -417,7 +420,7 @@ class VTInjector : public StmtExprMutator {
       Map<Var, PrimExpr> values{{var_, idx}};
       stmt = Substitute(stmt, values);
       return For(idx, make_zero(idx.dtype()), make_const(idx.dtype(), num_threads_),
-                 ForType::Serial, DeviceAPI::None, stmt);
+                 ForKind::kSerial, stmt);
     }
   }
 
diff --git a/src/tir/transforms/ir_utils.cc b/src/tir/transforms/ir_utils.cc
index 033a2e093a2a..cbae3f95ec68 100644
--- a/src/tir/transforms/ir_utils.cc
+++ b/src/tir/transforms/ir_utils.cc
@@ -149,7 +149,8 @@ class IRConvertSSA final : public StmtExprMutator {
       Stmt stmt = StmtExprMutator::VisitStmt_(op);
       scope_[v.get()].pop_back();
       op = stmt.as<ForNode>();
-      return For(new_var, op->min, op->extent, op->for_type, op->device_api, op->body);
+      return For(new_var, op->min, op->extent, op->kind, op->body, op->thread_binding,
+                 op->annotations);
     } else {
       defined_.insert(v.get());
       return StmtExprMutator::VisitStmt_(op);
diff --git a/src/tir/transforms/loop_partition.cc b/src/tir/transforms/loop_partition.cc
index a104dbb029eb..f1d816f0baef 100644
--- a/src/tir/transforms/loop_partition.cc
+++ b/src/tir/transforms/loop_partition.cc
@@ -607,8 +607,8 @@ inline Stmt LoopPartitioner::MakeFor(const Object* node, PrimExpr extent, Stmt b
     // If the loop extent is 1, do not create the loop anymore
     return Substitute(body, {{Var{for_node->loop_var}, make_const(DataType::Int(32), 0)}});
   } else {
-    return For(for_node->loop_var, IntImm(for_node->min.dtype(), 0), extent, for_node->for_type,
-               for_node->device_api, body);
+    ICHECK(for_node->kind != ForKind::kThreadBinding);
+    return For(for_node->loop_var, IntImm(for_node->min.dtype(), 0), extent, for_node->kind, body);
   }
 }
 
diff --git a/src/tir/transforms/make_packed_api.cc b/src/tir/transforms/make_packed_api.cc
index adbe78a6d627..0946af6f640a 100644
--- a/src/tir/transforms/make_packed_api.cc
+++ b/src/tir/transforms/make_packed_api.cc
@@ -46,9 +46,9 @@ class ReturnRewriter : public StmtMutator {
   explicit ReturnRewriter(Var ret_var, Var ret_tcode) : ret_var_(ret_var), ret_tcode_(ret_tcode) {}
 
   Stmt VisitStmt_(const ForNode* node) override {
-    if (node->for_type == ForType::Parallel) in_parallel_ += 1;
+    if (node->kind == ForKind::kParallel) in_parallel_ += 1;
     Stmt ret = StmtMutator::VisitStmt_(node);
-    if (node->for_type == ForType::Parallel) in_parallel_ -= 1;
+    if (node->kind == ForKind::kParallel) in_parallel_ -= 1;
     return ret;
   }
 
diff --git a/src/tir/transforms/narrow_datatype.cc b/src/tir/transforms/narrow_datatype.cc
index 0b248959ec6e..dc34626205a1 100644
--- a/src/tir/transforms/narrow_datatype.cc
+++ b/src/tir/transforms/narrow_datatype.cc
@@ -220,8 +220,8 @@ class DataTypeRewriter : public StmtExprMutator {
                           << ", but get " << s->GetTypeKey();
     PrimExpr e = VisitExpr(op->loop_var);
     Var var = Downcast<Var>(e);
-    return For(var, cast(var.dtype(), op->min), cast(var.dtype(), op->extent), op->for_type,
-               op->device_api, op->body);
+    return For(var, cast(var.dtype(), op->min), cast(var.dtype(), op->extent), op->kind, op->body,
+               op->thread_binding, op->annotations);
   }
 
   Stmt VisitStmt_(const AttrStmtNode* op) final {
diff --git a/src/tir/transforms/storage_flatten.cc b/src/tir/transforms/storage_flatten.cc
index d392866b3694..43fc1f1ec53f 100644
--- a/src/tir/transforms/storage_flatten.cc
+++ b/src/tir/transforms/storage_flatten.cc
@@ -318,14 +318,14 @@ class StorageFlattener : public StmtExprMutator {
     }
     for (int i = starts; i >= 0; --i) {
       if (i < starts) {
-        stmt = For(vars[i], 0, op->bounds[i]->extent, ForType::Serial, DeviceAPI::None, stmt);
+        stmt = For(vars[i], 0, op->bounds[i]->extent, ForKind::kSerial, stmt);
       } else {
         PrimExpr load = e.buffer.vload(e.RelIndex(args), e.buffer->dtype);
         PrimExpr address = Call(DataType::Handle(), builtin::address_of(), {load});
         PrimExpr prefetch = Call(op->buffer->dtype, builtin::prefetch(), {address, 0, 3, 1});
         stmt = Evaluate(prefetch);
         PrimExpr extent = (op->bounds[i]->extent - 1) / stride + 1;
-        stmt = For(vars[i], 0, extent, ForType::Serial, DeviceAPI::None, stmt);
+        stmt = For(vars[i], 0, extent, ForKind::kSerial, stmt);
       }
     }
     return stmt;
diff --git a/src/tir/transforms/storage_rewrite.cc b/src/tir/transforms/storage_rewrite.cc
index d4c5ca09650b..0b1429ca7efa 100644
--- a/src/tir/transforms/storage_rewrite.cc
+++ b/src/tir/transforms/storage_rewrite.cc
@@ -438,14 +438,14 @@ class StoragePlanRewriter : public StmtExprMutator {
     }
   }
   Stmt VisitStmt_(const ForNode* op) final {
-    ICHECK(op->for_type != ForType::Vectorized) << "VectorizeLoop before LiftStorageAlloc";
+    ICHECK(op->kind != ForKind::kVectorized) << "VectorizeLoop before LiftStorageAlloc";
     // remake all the allocation at the attach scope.
     if (attach_map_.count(op)) {
       auto& svec = attach_map_[op];
       Stmt stmt = StmtExprMutator::VisitStmt_(op);
       op = stmt.as<ForNode>();
-      return For(op->loop_var, op->min, op->extent, op->for_type, op->device_api,
-                 MakeAttach(svec, op->body));
+      return For(op->loop_var, op->min, op->extent, op->kind, MakeAttach(svec, op->body),
+                 op->thread_binding, op->annotations);
     } else {
       return StmtExprMutator::VisitStmt_(op);
     }
@@ -765,7 +765,7 @@ class StoragePlanRewriter : public StmtExprMutator {
         }
       } else if (s.stmt->IsInstance<ForNode>()) {
         const auto* op = static_cast<const ForNode*>(s.stmt);
-        if (op->for_type == ForType::Parallel) {
+        if (op->kind == ForKind::kParallel) {
           if (thread_scope_ == nullptr || thread_scope_ == op) {
             PlanNewScope(op);
           }
diff --git a/src/tir/transforms/unroll_loop.cc b/src/tir/transforms/unroll_loop.cc
index 71ad899273a6..c6e0b5c5f41e 100644
--- a/src/tir/transforms/unroll_loop.cc
+++ b/src/tir/transforms/unroll_loop.cc
@@ -100,13 +100,13 @@ class LoopUnroller : public StmtExprMutator {
     op = stmt.as<ForNode>();
     int value = GetExtent(op);
     // condition for auto unroll
-    bool auto_unroll = (op->for_type == ForType::Serial && value >= 0 && normal_loop_depth_ == 0 &&
+    bool auto_unroll = (op->kind == ForKind::kSerial && value >= 0 && normal_loop_depth_ == 0 &&
                         unroll_depth_ <= auto_max_depth_);
 
     auto_unroll =
         auto_unroll && (value * step_count_ <= auto_max_step_ || value <= auto_max_extent_);
 
-    if (op->for_type == ForType::Unrolled) {
+    if (op->kind == ForKind::kUnrolled) {
       ICHECK_GE(value, 0) << "Cannot unroll non-constant loop";
       auto_unroll = true;
     }
@@ -124,9 +124,9 @@ class LoopUnroller : public StmtExprMutator {
       return Unroll(op);
     } else {
       if (auto_unroll) {
-        if (op->for_type != ForType::Unrolled) {
-          return For(op->loop_var, op->min, op->extent, ForType::Unrolled, op->device_api,
-                     op->body);
+        if (op->kind != ForKind::kUnrolled) {
+          return For(op->loop_var, op->min, op->extent, ForKind::kUnrolled, op->body,
+                     op->thread_binding, op->annotations);
         }
       }
       return stmt;
diff --git a/src/tir/transforms/vectorize_loop.cc b/src/tir/transforms/vectorize_loop.cc
index 239f42266b83..66f4ae329f69 100644
--- a/src/tir/transforms/vectorize_loop.cc
+++ b/src/tir/transforms/vectorize_loop.cc
@@ -352,7 +352,7 @@ class Vectorizer : public StmtMutator, public ExprFunctor<PrimExpr(const PrimExp
   }
   // For
   Stmt VisitStmt_(const ForNode* op) final {
-    if (op->for_type == ForType::Vectorized) {
+    if (op->kind == ForKind::kVectorized) {
       LOG(WARNING) << "Detect vectorize inside vectorized loop, ignoring...";
     }
     ICHECK(is_zero(op->min));
@@ -365,7 +365,8 @@ class Vectorizer : public StmtMutator, public ExprFunctor<PrimExpr(const PrimExp
     if (extent.same_as(op->extent) && body.same_as(op->body)) {
       return GetRef<Stmt>(op);
     } else {
-      return For(op->loop_var, op->min, extent, op->for_type, op->device_api, body);
+      return For(op->loop_var, op->min, extent, op->kind, body, op->thread_binding,
+                 op->annotations);
     }
   }
   // IfThenElse
@@ -436,7 +437,7 @@ class Vectorizer : public StmtMutator, public ExprFunctor<PrimExpr(const PrimExp
     Var idx(var_->name_hint + ".s", var_->dtype);
     Map<Var, PrimExpr> values{{var_, idx}};
     stmt = Substitute(stmt, values);
-    return For(idx, 0, var_lanes_, ForType::Serial, DeviceAPI::None, stmt);
+    return For(idx, 0, var_lanes_, ForKind::kSerial, stmt);
   }
   // ProducerStore
   Stmt VisitStmt_(const ProducerStoreNode* op) final {
@@ -525,7 +526,7 @@ class Vectorizer : public StmtMutator, public ExprFunctor<PrimExpr(const PrimExp
 class LoopVectorizer : public StmtMutator {
  public:
   Stmt VisitStmt_(const ForNode* op) final {
-    if (op->for_type == ForType::Vectorized) {
+    if (op->kind == ForKind::kVectorized) {
       ICHECK(is_zero(op->min));
       auto* extent_as_int = op->extent.as<IntImmNode>();
       if (!extent_as_int || extent_as_int->value < 1) {
@@ -545,8 +546,8 @@ class VectorizeSkipper : public StmtMutator {
   Stmt VisitStmt_(const ForNode* op) final {
     Stmt stmt = StmtMutator::VisitStmt_(op);
     op = stmt.as<ForNode>();
-    if (op->for_type == ForType::Vectorized) {
-      return For(op->loop_var, op->min, op->extent, ForType::Serial, op->device_api, op->body);
+    if (op->kind == ForKind::kVectorized) {
+      return For(op->loop_var, op->min, op->extent, ForKind::kSerial, op->body);
     } else {
       return stmt;
     }
diff --git a/tests/python/unittest/test_arith_domain_touched.py b/tests/python/unittest/test_arith_domain_touched.py
index ca5df4af6a71..af06a038e1f7 100644
--- a/tests/python/unittest/test_arith_domain_touched.py
+++ b/tests/python/unittest/test_arith_domain_touched.py
@@ -31,14 +31,12 @@ def test_domain_touched():
         i,
         0,
         n,
-        0,
-        0,
+        tvm.tir.ForKind.SERIAL,
         tvm.tir.For(
             j,
             0,
             m,
-            0,
-            0,
+            tvm.tir.ForKind.SERIAL,
             tvm.tir.BufferStore(
                 a,
                 tvm.tir.BufferLoad(b, [i - 1, j + 1]) + tvm.tir.BufferLoad(a, [i - 1, j - 1]),
diff --git a/tests/python/unittest/test_runtime_module_based_interface.py b/tests/python/unittest/test_runtime_module_based_interface.py
index 64f87fb3c561..51a587242ae3 100644
--- a/tests/python/unittest/test_runtime_module_based_interface.py
+++ b/tests/python/unittest/test_runtime_module_based_interface.py
@@ -547,8 +547,7 @@ def make_func(symbol):
             i,
             0,
             n - 1,
-            0,
-            0,
+            tvm.tir.ForKind.SERIAL,
             tvm.tir.Store(Ab.data, tvm.tir.Load("float32", Ab.data, i) + 1, i + 1),
         )
         return tvm.tir.PrimFunc([Ab], stmt).with_attr("global_symbol", symbol)
diff --git a/tests/python/unittest/test_runtime_module_load.py b/tests/python/unittest/test_runtime_module_load.py
index 7befed3bbcdd..38800e8de6ad 100644
--- a/tests/python/unittest/test_runtime_module_load.py
+++ b/tests/python/unittest/test_runtime_module_load.py
@@ -55,7 +55,11 @@ def save_object(names):
         i = te.var("i")
         # for i in 0 to n-1:
         stmt = tvm.tir.For(
-            i, 0, n - 1, 0, 0, tvm.tir.Store(Ab.data, tvm.tir.Load(dtype, Ab.data, i) + 1, i + 1)
+            i,
+            0,
+            n - 1,
+            tvm.tir.ForKind.SERIAL,
+            tvm.tir.Store(Ab.data, tvm.tir.Load(dtype, Ab.data, i) + 1, i + 1),
         )
         mod = tvm.IRModule.from_expr(
             tvm.tir.PrimFunc([Ab], stmt).with_attr("global_symbol", "main")
diff --git a/tests/python/unittest/test_target_codegen_cuda.py b/tests/python/unittest/test_target_codegen_cuda.py
index e87767475ab2..a22fe10c1321 100644
--- a/tests/python/unittest/test_target_codegen_cuda.py
+++ b/tests/python/unittest/test_target_codegen_cuda.py
@@ -200,7 +200,7 @@ def test_cuda_shuffle():
 
     def MyVectorize():
         def vectorizer(op):
-            if op.for_type == tvm.tir.For.Vectorized:
+            if op.kind == tvm.tir.ForKind.VECTORIZED:
                 four = tvm.tir.const(4, "int32")
                 idx = tvm.tir.Ramp(thrx.var * four, tvm.tir.const(1, "int32"), 4)
                 all_ones = tvm.tir.const(1, "int32x4")
diff --git a/tests/python/unittest/test_target_codegen_llvm.py b/tests/python/unittest/test_target_codegen_llvm.py
index 4b67752367db..67c1f6bff429 100644
--- a/tests/python/unittest/test_target_codegen_llvm.py
+++ b/tests/python/unittest/test_target_codegen_llvm.py
@@ -761,7 +761,7 @@ def do_atomic_add(A):
         atomic_add_return = ib.allocate(A.dtype, (1,), name="atomic_add_return", scope="local")
         one = tvm.tir.const(1, A.dtype)
         A_ptr = ib.buffer_ptr(A)
-        with ib.for_range(0, n, name="i", for_type="parallel") as i:
+        with ib.for_range(0, n, name="i", kind="parallel") as i:
             atomic_add_return[0] = atomic_add(
                 tvm.tir.call_intrin("handle", "tir.address_of", A_ptr[0]), one
             )
diff --git a/tests/python/unittest/test_target_codegen_static_init.py b/tests/python/unittest/test_target_codegen_static_init.py
index 179e302984cc..b0c19dfcffeb 100644
--- a/tests/python/unittest/test_target_codegen_static_init.py
+++ b/tests/python/unittest/test_target_codegen_static_init.py
@@ -30,7 +30,7 @@ def test_static_callback():
     cp = te.thread_axis((0, 1), "cop")
     finit = tvm.tir.StringImm("TVMBackendRunOnce")
     ib.scope_attr(cp, "coproc_uop_scope", finit)
-    with ib.for_range(0, n, "i", for_type="parallel") as i:
+    with ib.for_range(0, n, "i", kind="parallel") as i:
         A[i] = A[i] + 1
     stmt = ib.get()
 
diff --git a/tests/python/unittest/test_target_codegen_vm_basic.py b/tests/python/unittest/test_target_codegen_vm_basic.py
index 26f1493c4ec1..9bbee76e2736 100644
--- a/tests/python/unittest/test_target_codegen_vm_basic.py
+++ b/tests/python/unittest/test_target_codegen_vm_basic.py
@@ -109,7 +109,7 @@ def test_vm_parallel():
     i = te.size_var("i")
     ib = tvm.tir.ir_builder.create()
     A = ib.buffer_ptr(Ab)
-    with ib.for_range(0, n, "i", for_type="parallel") as i:
+    with ib.for_range(0, n, "i", kind="parallel") as i:
         A[i] = A[i] + 1
     stmt = ib.get()
     mod = tvm.IRModule.from_expr(tvm.tir.PrimFunc([Ab], stmt).with_attr("global_symbol", "test"))
diff --git a/tests/python/unittest/test_te_hybrid_script.py b/tests/python/unittest/test_te_hybrid_script.py
index 06d409933f1f..be9956529dcc 100644
--- a/tests/python/unittest/test_te_hybrid_script.py
+++ b/tests/python/unittest/test_te_hybrid_script.py
@@ -267,9 +267,9 @@ def looptype(a, b, c):
     iloop = ir[0]
     jloop = ir[1]
     kloop = ir[2]
-    assert iloop.for_type == tvm.tir.For.Parallel
-    assert jloop.for_type == tvm.tir.For.Vectorized
-    assert kloop.for_type == tvm.tir.For.Unrolled
+    assert iloop.kind == tvm.tir.ForKind.PARALLEL
+    assert jloop.kind == tvm.tir.ForKind.VECTORIZED
+    assert kloop.kind == tvm.tir.ForKind.UNROLLED
 
     func, ins, outs = run_and_check(looptype, [a, b, c])
     run_and_check(func, ins, outs=outs)
diff --git a/tests/python/unittest/test_tir_constructor.py b/tests/python/unittest/test_tir_constructor.py
index 2bf4ba51937e..2cc21dbce91d 100644
--- a/tests/python/unittest/test_tir_constructor.py
+++ b/tests/python/unittest/test_tir_constructor.py
@@ -142,7 +142,7 @@ def test_stmt_constructor():
     assert isinstance(x, tvm.tir.AssertStmt)
     assert x.body == nop
 
-    x = tvm.tir.For(te.var("x"), 0, 10, 0, 0, nop)
+    x = tvm.tir.For(te.var("x"), 0, 10, tvm.tir.ForKind.SERIAL, nop)
     assert isinstance(x, tvm.tir.For)
     assert x.min.value == 0
     assert x.extent.value == 10
diff --git a/tests/python/unittest/test_tir_nodes.py b/tests/python/unittest/test_tir_nodes.py
index 4d57ed8ec366..bff60f70f53b 100644
--- a/tests/python/unittest/test_tir_nodes.py
+++ b/tests/python/unittest/test_tir_nodes.py
@@ -129,7 +129,7 @@ def test_basic():
 
 def test_stmt():
     x = tvm.tir.Evaluate(0)
-    tvm.tir.For(te.var("i"), 0, 1, tvm.tir.For.Serial, 0, x)
+    tvm.tir.For(te.var("i"), 0, 1, tvm.tir.ForKind.SERIAL, x)
 
 
 def test_dir():
diff --git a/tests/python/unittest/test_tir_transform_remove_no_op.py b/tests/python/unittest/test_tir_transform_remove_no_op.py
index 2edb8cf980c2..8b7a16952af9 100644
--- a/tests/python/unittest/test_tir_transform_remove_no_op.py
+++ b/tests/python/unittest/test_tir_transform_remove_no_op.py
@@ -34,20 +34,17 @@ def test_remove_no_op():
         i,
         0,
         4,
-        0,
-        0,
+        tvm.tir.ForKind.SERIAL,
         tvm.tir.For(
             j,
             0,
             n,
-            0,
-            0,
+            tvm.tir.ForKind.SERIAL,
             tvm.tir.For(
                 k,
                 0,
                 m,
-                0,
-                0,
+                tvm.tir.ForKind.SERIAL,
                 tvm.tir.IfThenElse((i * m + j + k < n), tvm.tir.Evaluate(m), tvm.tir.Evaluate(n)),
             ),
         ),
@@ -65,7 +62,7 @@ def test_remove_no_op():
     assert ret == store
 
     # remove zero extent loop
-    stmt3 = tvm.tir.For(i, 0, 0, 0, 0, store)
+    stmt3 = tvm.tir.For(i, 0, 0, tvm.tir.ForKind.SERIAL, store)
     mod = tvm.IRModule.from_expr(tvm.tir.PrimFunc([Ab], stmt3))
     ret = tvm.tir.transform.RemoveNoOp()(mod)["main"].body
     assert isinstance(ret, tvm.tir.Evaluate)
diff --git a/tests/python/unittest/test_tir_transform_storage_rewrite.py b/tests/python/unittest/test_tir_transform_storage_rewrite.py
index cc2b4273a5e3..49adcfb568a7 100644
--- a/tests/python/unittest/test_tir_transform_storage_rewrite.py
+++ b/tests/python/unittest/test_tir_transform_storage_rewrite.py
@@ -269,7 +269,7 @@ def verify(n):
 def test_parallel_alloc():
     ib = tvm.tir.ir_builder.create()
     n = te.var("n")
-    with ib.for_range(0, n, name="i", for_type="parallel") as i:
+    with ib.for_range(0, n, name="i", kind="parallel") as i:
         with ib.for_range(0, 10, name="j") as j:
             A = ib.allocate("float32", n, name="A", scope="global")
             A[j] = A[j] + 2
@@ -286,7 +286,7 @@ def test_parallel_alloc():
         ib.scope_attr(
             tvm.tir.const(1, "int32"), "pragma_scope", tvm.tir.StringImm("parallel_launch_point")
         )
-        with ib.for_range(0, n, name="i", for_type="parallel") as i:
+        with ib.for_range(0, n, name="i", kind="parallel") as i:
             with ib.for_range(0, 10, name="j") as j:
                 A = ib.allocate("float32", n, name="A", scope="global")
                 A[j] = A[j] + 2
diff --git a/tests/python/unittest/test_tir_transform_unroll_loop.py b/tests/python/unittest/test_tir_transform_unroll_loop.py
index 57b7810198c0..b511118f8b52 100644
--- a/tests/python/unittest/test_tir_transform_unroll_loop.py
+++ b/tests/python/unittest/test_tir_transform_unroll_loop.py
@@ -27,7 +27,7 @@ def test_unroll_loop():
     Aptr = ib.buffer_ptr(Ab)
     # for i in 0 to n-1:
     with ib.for_range(n, n + 2, name="i") as i:
-        with ib.for_range(0, 8, name="i", for_type="unroll") as j:
+        with ib.for_range(0, 8, name="i", kind="unroll") as j:
             Aptr[j + 1] = Aptr[i] + 1
 
     stmt = ib.get()
@@ -48,7 +48,7 @@ def test_unroll_loop():
     ):
         ret = tvm.tir.transform.UnrollLoop()(mod)["main"].body
         assert isinstance(ret, tvm.tir.For)
-        assert ret.for_type == tvm.tir.For.Unrolled
+        assert ret.kind == tvm.tir.ForKind.UNROLLED
 
     ib = tvm.tir.ir_builder.create()
     ib.scope_attr(tvm.tir.const(0, "int32"), "pragma_auto_unroll_max_step", 16)
@@ -63,9 +63,9 @@ def test_unroll_loop():
     ):
         ret = tvm.tir.transform.UnrollLoop()(mod)["main"].body
         assert isinstance(ret[0], tvm.tir.For)
-        assert ret[0].for_type == tvm.tir.For.Unrolled
+        assert ret[0].kind == tvm.tir.ForKind.UNROLLED
         assert isinstance(ret[1], tvm.tir.For)
-        assert ret[1].for_type != tvm.tir.For.Unrolled
+        assert ret[1].kind != tvm.tir.ForKind.UNROLLED
 
 
 def test_unroll_fake_loop():
diff --git a/tests/python/unittest/test_tir_transform_vectorize.py b/tests/python/unittest/test_tir_transform_vectorize.py
index 204e26feb6a9..5ae47e01f681 100644
--- a/tests/python/unittest/test_tir_transform_vectorize.py
+++ b/tests/python/unittest/test_tir_transform_vectorize.py
@@ -24,7 +24,7 @@ def test_vectorize_loop():
     ib = tvm.tir.ir_builder.create()
     A = ib.pointer("float32", name="A")
     with ib.for_range(0, n) as i:
-        with ib.for_range(0, 4, for_type="vectorize") as j:
+        with ib.for_range(0, 4, kind="vectorize") as j:
             A[j] = tvm.tir.const(1, A.dtype)
     stmt = ib.get()
 
@@ -45,7 +45,7 @@ def test_vectorize_vector():
     ib = tvm.tir.ir_builder.create()
     A = ib.pointer("float32x4", name="A")
     with ib.for_range(0, n) as i:
-        with ib.for_range(0, 4, for_type="vectorize") as j:
+        with ib.for_range(0, 4, kind="vectorize") as j:
             A[j] = tvm.tir.const(1, A.dtype)
     stmt = ib.get()
     assert isinstance(stmt.body, tvm.tir.For)
@@ -64,7 +64,7 @@ def test_vectorize_with_if():
     x = te.var("x")
     ib = tvm.tir.ir_builder.create()
     A = ib.pointer("float32", name="A")
-    with ib.for_range(0, 4, for_type="vectorize") as i:
+    with ib.for_range(0, 4, kind="vectorize") as i:
         with ib.if_scope(x < n):
             A[i] = A[i] + 1
         with ib.else_scope():
@@ -86,7 +86,7 @@ def test_vectorize_let():
     v = tvm.tir.Var("v", "float32")
     ib = tvm.tir.ir_builder.create()
     A = ib.pointer("float32", name="A")
-    with ib.for_range(0, 4, for_type="vectorize") as i:
+    with ib.for_range(0, 4, kind="vectorize") as i:
         ib.emit(lambda body: tvm.tir.LetStmt(v, A[i] + 1, body))
         A[i] = v + 2
 
@@ -100,7 +100,7 @@ def test_vectorize_with_le_cond():
     n = te.var("n")
     ib = tvm.tir.ir_builder.create()
     A = ib.pointer("float32", name="A")
-    with ib.for_range(0, 4, for_type="vectorize") as i:
+    with ib.for_range(0, 4, kind="vectorize") as i:
         with ib.if_scope(i <= n):
             A[i] = A[i] + 1
     stmt = ib.get()
@@ -115,7 +115,7 @@ def test_vectorize_with_ge_cond():
     n = te.var("n")
     ib = tvm.tir.ir_builder.create()
     A = ib.pointer("float32", name="A")
-    with ib.for_range(0, 4, for_type="vectorize") as i:
+    with ib.for_range(0, 4, kind="vectorize") as i:
         with ib.if_scope(i >= n):
             A[i] = A[i] + 1
     stmt = ib.get()
@@ -131,7 +131,7 @@ def test_vectorize_if_then_else():
     x = te.var("x")
     ib = tvm.tir.ir_builder.create()
     A = ib.pointer("float32", name="A")
-    with ib.for_range(0, 4, for_type="vectorize") as i:
+    with ib.for_range(0, 4, kind="vectorize") as i:
         A[i] = tvm.tir.call_intrin("float32", "tir.if_then_else", i > 0, A[i] + 1, A[i])
     stmt = ib.get()
 
@@ -143,7 +143,7 @@ def test_vectorize_if_then_else():
     ib = tvm.tir.ir_builder.create()
     A = ib.pointer("float32", name="A")
     with ib.for_range(0, n) as k:
-        with ib.for_range(0, 4, for_type="vectorize") as i:
+        with ib.for_range(0, 4, kind="vectorize") as i:
             A[k * 4 + i] = tvm.tir.call_intrin(
                 "float32", "tir.if_then_else", k > 0, A[k * 4 + i], 0
             )
diff --git a/tutorials/dev/low_level_custom_pass.py b/tutorials/dev/low_level_custom_pass.py
index 44fe59f99201..0bd656dd81dd 100644
--- a/tutorials/dev/low_level_custom_pass.py
+++ b/tutorials/dev/low_level_custom_pass.py
@@ -116,8 +116,8 @@ def vectorize8(op):
         name = op.loop_var.name
         lo, li = te.var(name + ".outer"), te.var(name + ".inner")
         body = tvm.tir.stmt_functor.substitute(op.body, {op.loop_var: lo * 8 + li})
-        body = tvm.tir.For(li, 0, 8, tvm.tir.For.Vectorized, 0, body)
-        body = tvm.tir.For(lo, 0, extent // 8, tvm.tir.For.Serial, 0, body)
+        body = tvm.tir.For(li, 0, 8, tvm.tir.ForKind.VECTORIZED, body)
+        body = tvm.tir.For(lo, 0, extent // 8, tvm.tir.ForKind.SERIAL, body)
         return body
     return None
 
diff --git a/vta/python/vta/transform.py b/vta/python/vta/transform.py
index a485d2cfb7b8..9770857fb0b9 100644
--- a/vta/python/vta/transform.py
+++ b/vta/python/vta/transform.py
@@ -231,7 +231,13 @@ def _merge_block(slist, body):
                     body = tvm.tir.AttrStmt(op.node, op.attr_key, op.value, body)
                 elif isinstance(op, tvm.tir.For):
                     body = tvm.tir.For(
-                        op.loop_var, op.min, op.extent, op.for_type, op.device_api, body
+                        op.loop_var,
+                        op.min,
+                        op.extent,
+                        op.kind,
+                        body,
+                        op.thread_binding,
+                        op.annotations,
                     )
                 else:
                     raise RuntimeError("unexpected op")
@@ -314,7 +320,9 @@ def _do_fold(stmt):
             if _match_pragma(stmt, "trim_loop"):
                 op = stmt.body
                 assert isinstance(op, tvm.tir.For)
-                return tvm.tir.For(op.loop_var, op.min, 2, op.for_type, op.device_api, op.body)
+                return tvm.tir.For(
+                    op.loop_var, op.min, 2, op.kind, op.body, op.thread_binding, op.annotations
+                )
             return None
 
         return f.with_body(

From f91b51d638874973a2d9ccbcb4d49cf7c668f516 Mon Sep 17 00:00:00 2001
From: Josh Fromm <jwfromm@octoml.ai>
Date: Tue, 19 Jan 2021 06:32:42 -0800
Subject: [PATCH 087/357] [Relay][Frontend][Onnx] Compare against onnxruntime
 more consistently during testing (#7300)

Co-authored-by: Josh Fromm <jwfromm@uw.edu>
---
 python/tvm/relay/frontend/onnx.py          |  73 +-
 tests/python/frontend/onnx/test_forward.py | 801 +++++++--------------
 2 files changed, 302 insertions(+), 572 deletions(-)

diff --git a/python/tvm/relay/frontend/onnx.py b/python/tvm/relay/frontend/onnx.py
index 9405bc532702..7a3b168fc8fd 100644
--- a/python/tvm/relay/frontend/onnx.py
+++ b/python/tvm/relay/frontend/onnx.py
@@ -17,6 +17,7 @@
 # pylint: disable=invalid-name, import-self, len-as-condition, unused-argument, too-many-lines
 # pylint: disable=import-outside-toplevel
 """ONNX: Open Neural Network Exchange frontend for Relay."""
+import copy
 import warnings
 import numpy as np
 import tvm
@@ -1028,10 +1029,6 @@ def _impl_v9(cls, inputs, attr, params):
                 'Value {} in attribute "mode" of operator Upsample is not valid.'.format(mode)
             )
 
-        if method == "nearest_neighbor":
-            align_corners = False
-        else:
-            align_corners = True
         # in 3d case, we use the purely static op
         if dims == 5:
             if isinstance(scales, _expr.Call):
@@ -1065,7 +1062,7 @@ def _impl_v9(cls, inputs, attr, params):
                 scale_w,
                 layout=layout,
                 method=method,
-                align_corners=align_corners,
+                align_corners=False,
             )
         return out
 
@@ -1111,17 +1108,22 @@ class Split(OnnxOpConverter):
 
     @classmethod
     def _impl_v1(cls, inputs, attr, params):
-        splits = attr.get("split", False)
-        if splits:
+        splits = attr.get("split", None)
+        if splits is not None:
+            indices = []
             attr["indices_or_sections"] = []
             index = 0
             for i in splits[:-1]:
                 index += i
-                attr["indices_or_sections"].append(index)
+                indices.append(index)
         # When splits isnt specified divide evenly over axis.
         else:
-            attr["indices_or_sections"] = attr["tvm_custom"]["num_outputs"]
-        return AttrCvt("split", ignores=["split"])(inputs, attr, params)
+            indices = attr["tvm_custom"]["num_outputs"]
+        output = _op.split(inputs[0], indices, attr.get("axis", 0))
+        # If the output of split is a single value, unpack if from the TupleWrapper
+        if len(output) == 1:
+            output = output[0]
+        return output
 
 
 class Slice(OnnxOpConverter):
@@ -1227,7 +1229,9 @@ class GatherND(OnnxOpConverter):
 
     @classmethod
     def _impl_v1(cls, inputs, attr, params):
-        return _op.gather_nd(inputs[0], inputs[1])
+        indices_dims = len(infer_shape(inputs[1]))
+        indices = _op.transpose(inputs[1], axes=[-1] + list(range(indices_dims - 1)))
+        return _op.gather_nd(inputs[0], indices)
 
 
 class Scatter(OnnxOpConverter):
@@ -1538,15 +1542,6 @@ def _impl_v1(cls, inputs, attr, params):
 class Tile(Elemwise):
     """Operator converter for Tile"""
 
-    @classmethod
-    def _impl_v1(cls, inputs, attr, params):
-        if "repeats" not in attr:
-            raise tvm.error.OpAttributeInvalid(
-                'Attribute "repeats" should be set ' "for operator Tile."
-            )
-        reps = attr.pop("repeats")  # The number of times repeating the tensor data.
-        return _op.tile(inputs[0], reps)
-
     @classmethod
     def _impl_v6(cls, inputs, attr, params):
         return _op.tile(inputs[0], inputs[1])
@@ -2113,7 +2108,9 @@ def _impl_v11(cls, inputs, attr, params):
         cond = inputs[1]
         loop_deps = inputs[2:]
         num_deps = len(loop_deps)
-        body = attr["body"]
+        # Create a copy of the body function to prevent the original
+        # from being modified.
+        body = copy.copy(attr["body"])
         iter_dtype = infer_type(max_loop_count).checked_type.dtype
 
         # Determine what condition mode we're in.
@@ -2150,6 +2147,8 @@ def get_var(name, val, scan=False):
             checked_type = infer_type(val)
             if hasattr(checked_type, "type_annotation"):
                 checked_type = checked_type.type_annotation
+            if hasattr(checked_type, "checked_type"):
+                checked_type = checked_type.checked_type
             shape = get_const_tuple(checked_type.shape)
             actual_shape = []
             for dim in shape:
@@ -2185,8 +2184,14 @@ def get_var(name, val, scan=False):
         scan_output_init = []
         for i in range(num_scan_outputs):
             name, shape, dtype, _ = get_info(body.output[i + 1 + num_deps])
-            scan_output_vars.append(_expr.var(name, shape=([_ty.Any()] + shape), dtype=dtype))
-            scan_output_init.append(_op.reshape(_expr.const([]), [0] + shape))
+            if dtype == "float":
+                dtype = "float32"
+            scan_output_vars.append(
+                _expr.var(name, shape=([_ty.Any()] * (len(shape) + 1)), dtype=dtype)
+            )
+            scan_output_init.append(
+                _op.reshape(_expr.const(np.array([]).astype(dtype)), [0] + [1] * len(shape))
+            )
 
         # Now we can remove loop iter variables from our inner loop's inputs.
         # This is kind of a hack since we have graph inputs that we don't
@@ -2219,11 +2224,6 @@ def body_fn(*loop_inputs):
             new_loop_vars = [loop_outputs[i] for i in range(1, 1 + num_deps)]
             new_scan_outputs = [loop_outputs[i] for i in range(1 + num_deps, len(loop_outputs))]
 
-            # Increment counter.
-            if max_loop_count is not None:
-                incr = _expr.const(1, dtype=iter_dtype)
-                loop_count = loop_count + incr
-
             # Add new scan outputs to tracking
             combined_scan_outputs = []
             for i, scan in enumerate(scan_outputs):
@@ -2231,6 +2231,11 @@ def body_fn(*loop_inputs):
                 combined_scan = _op.concatenate([scan, new_scan], axis=0)
                 combined_scan_outputs.append(combined_scan)
 
+            # Increment counter.
+            if max_loop_count is not None:
+                incr = _expr.const(1, dtype=iter_dtype)
+                loop_count = loop_count + incr
+
             # Pack loop outputs for next iteration
             # [iter_count, cond, loop_deps, loop_scans]
             return [loop_count, max_count, new_cond] + new_loop_vars + combined_scan_outputs
@@ -2630,12 +2635,12 @@ def _get_convert_map(opset):
         "Greater": Greater.get_converter(opset),
         "Less": Less.get_converter(opset),
         "Log": Renamer("log"),
-        "ACos": Renamer("acos"),
-        "ACosh": Renamer("acosh"),
-        "ASin": Renamer("asin"),
-        "ASinh": Renamer("asinh"),
-        "ATan": Renamer("atan"),
-        "ATanh": Renamer("atanh"),
+        "Acos": Renamer("acos"),
+        "Acosh": Renamer("acosh"),
+        "Asin": Renamer("asin"),
+        "Asinh": Renamer("asinh"),
+        "Atan": Renamer("atan"),
+        "Atanh": Renamer("atanh"),
         "Cos": Renamer("cos"),
         "Cosh": Renamer("cosh"),
         "Sin": Renamer("sin"),
diff --git a/tests/python/frontend/onnx/test_forward.py b/tests/python/frontend/onnx/test_forward.py
index 96be6fba113a..20937d2060c5 100644
--- a/tests/python/frontend/onnx/test_forward.py
+++ b/tests/python/frontend/onnx/test_forward.py
@@ -15,7 +15,6 @@
 # specific language governing permissions and limitations
 # under the License.
 import numpy as np
-import math
 import onnx
 from onnx import helper, TensorProto, mapping, numpy_helper
 import torch
@@ -94,7 +93,7 @@ def get_tvm_output(
     # execute
     m.run()
     # get outputs
-    if isinstance(output_shape, list) and isinstance(output_dtype, list):
+    if isinstance(output_shape, list):
         tvm_output_list = []
         for i, _ in enumerate(output_shape):
             tvm_output = m.get_output(i)
@@ -105,17 +104,19 @@ def get_tvm_output(
         return tvm_output.asnumpy()
 
 
-def get_onnxruntime_output(model, inputs, dtype="float32"):
+def get_onnxruntime_output(model, inputs):
     import onnxruntime.backend
 
     rep = onnxruntime.backend.prepare(model, "CPU")
-    if isinstance(inputs, list) and len(inputs) > 1:
-        return rep.run(inputs)
-    elif isinstance(inputs, list) and len(inputs) == 1:
+    if isinstance(inputs, list) and len(inputs) == 1:
         inp = inputs[0]
     else:
         inp = inputs
-    return rep.run(inp.astype(dtype))[0]
+    output = rep.run(inp)
+    # Unpack output if there's only a single value.
+    if len(output) == 1:
+        output = output[0]
+    return output
 
 
 def verify_with_ort_with_inputs(
@@ -130,15 +131,11 @@ def verify_with_ort_with_inputs(
     dtype="float32",
     rtol=1e-5,
     atol=1e-5,
+    apply_softmax=False,
 ):
-    def flatten(out):
-        if isinstance(out, list) and len(out) == 1:
-            out = out[0]
-        if isinstance(out, np.ndarray):
-            return out.flatten()
-        return out
-
-    ort_out = get_onnxruntime_output(model, inputs, dtype)
+    if opset is not None:
+        model.opset_import[0].version = opset
+    ort_out = get_onnxruntime_output(model, inputs)
 
     if targets is None:
         targets = [tgt for (tgt, _) in tvm.testing.enabled_targets()]
@@ -157,8 +154,15 @@ def flatten(out):
             )
         else:
             tvm_out = get_tvm_output(model, inputs, target, ctx, out_shape, dtype, opset=opset)
-
-        tvm.testing.assert_allclose(flatten(ort_out), flatten(tvm_out), rtol=rtol, atol=atol)
+        if not isinstance(tvm_out, list):
+            tvm_out = [tvm_out]
+        if not isinstance(ort_out, list):
+            ort_out = [ort_out]
+        for tvm_val, ort_val in zip(tvm_out, ort_out):
+            if apply_softmax:
+                ort_val = scipy.special.softmax(ort_val)
+                tvm_val = scipy.special.softmax(tvm_val)
+            tvm.testing.assert_allclose(ort_val, tvm_val, rtol=rtol, atol=atol)
 
 
 def verify_with_ort(
@@ -342,7 +346,7 @@ def verify_depth_to_space(inshape, outshape, mode, blockSize):
 
     model = helper.make_model(graph, producer_name="depth_to_space_test")
 
-    verify_with_ort(model, [inshape], outshape)
+    verify_with_ort(model, [inshape], [outshape])
 
 
 @tvm.testing.uses_gpu
@@ -365,7 +369,7 @@ def verify_space_to_depth(inshape, outshape, blockSize):
 
     model = helper.make_model(graph, producer_name="space_to_depth_test")
 
-    verify_with_ort(model, [inshape], outshape)
+    verify_with_ort(model, [inshape], [outshape])
 
 
 @tvm.testing.uses_gpu
@@ -494,11 +498,8 @@ def test_squeeze():
     )
 
     model = helper.make_model(graph, producer_name="squeeze_test")
-
-    for target, ctx in tvm.testing.enabled_targets():
-        x = np.random.uniform(size=in_shape).astype("float32")
-        tvm_out = get_tvm_output(model, x, target, ctx, out_shape, "float32")
-        tvm.testing.assert_allclose(out_shape, tvm_out.shape)
+    x = np.random.uniform(size=in_shape).astype("float32")
+    verify_with_ort_with_inputs(model, [x], [out_shape])
 
 
 @tvm.testing.uses_gpu
@@ -518,11 +519,7 @@ def test_flatten():
     )
 
     model = helper.make_model(graph, producer_name="flatten_test")
-
-    for target, ctx in tvm.testing.enabled_targets():
-        x = np.random.uniform(size=in_shape).astype("int32")
-        tvm_out = get_tvm_output(model, x, target, ctx, ref_shape, "float32")
-        tvm.testing.assert_allclose(ref_shape, tvm_out.shape)
+    verify_with_ort(model, [in_shape])
 
 
 @tvm.testing.uses_gpu
@@ -540,16 +537,12 @@ def test_unsqueeze():
     )
 
     model = helper.make_model(graph, producer_name="squeeze_test")
-
-    for target, ctx in tvm.testing.enabled_targets():
-        x = np.random.uniform(size=in_shape).astype("float32")
-        tvm_out = get_tvm_output(model, x, target, ctx, out_shape, "float32")
-        tvm.testing.assert_allclose(out_shape, tvm_out.shape)
+    verify_with_ort(model, [in_shape])
 
 
 def verify_gather(in_shape, indices, axis, dtype):
     x = np.random.uniform(size=in_shape).astype(dtype)
-    indices = np.array(indices, dtype="int32")
+    indices = np.array(indices, dtype="int64")
     out_np = np.take(x, indices, axis=axis)
 
     y = helper.make_node("Gather", ["in", "indices"], ["out"], axis=axis)
@@ -558,16 +551,19 @@ def verify_gather(in_shape, indices, axis, dtype):
         [y],
         "gather_test",
         inputs=[
-            helper.make_tensor_value_info("in", TensorProto.FLOAT, list(in_shape)),
-            helper.make_tensor_value_info("indices", TensorProto.INT32, list(indices.shape)),
+            helper.make_tensor_value_info(
+                "in", mapping.NP_TYPE_TO_TENSOR_TYPE[np.dtype(dtype)], list(in_shape)
+            ),
+            helper.make_tensor_value_info("indices", TensorProto.INT64, list(indices.shape)),
+        ],
+        outputs=[
+            helper.make_tensor_value_info(
+                "out", mapping.NP_TYPE_TO_TENSOR_TYPE[np.dtype(dtype)], list(out_np.shape)
+            )
         ],
-        outputs=[helper.make_tensor_value_info("out", TensorProto.FLOAT, list(out_np.shape))],
     )
     model = helper.make_model(graph, producer_name="gather_test")
-
-    for target, ctx in tvm.testing.enabled_targets():
-        tvm_out = get_tvm_output(model, [x, indices], target, ctx, out_np.shape)
-        tvm.testing.assert_allclose(out_np, tvm_out)
+    verify_with_ort_with_inputs(model, [x, indices], dtype=dtype)
 
 
 @tvm.testing.uses_gpu
@@ -660,10 +656,7 @@ def _test_slice_iteration_v1(indata, outdata, starts, ends, axes=None):
     )
 
     model = helper.make_model(graph, producer_name="slice_test")
-
-    for target, ctx in tvm.testing.enabled_targets():
-        tvm_out = get_tvm_output(model, indata, target, ctx, outdata.shape, "float32", opset=1)
-        tvm.testing.assert_allclose(outdata, tvm_out)
+    verify_with_ort_with_inputs(model, [indata], [outdata.shape], opset=1)
 
 
 def _test_slice_iteration_v10(indata, outdata, **attrs):
@@ -738,14 +731,14 @@ def add_noop_to_input_attr(attr_name, attr):
 
     if axes:
         axes = np.asarray(axes)
-        inputs.append(helper.make_tensor_value_info("axes", TensorProto.INT32, list(axes.shape)))
-        initializer.append(helper.make_tensor("axes", TensorProto.INT32, list(axes.shape), axes))
+        inputs.append(helper.make_tensor_value_info("axes", TensorProto.INT64, list(axes.shape)))
+        initializer.append(helper.make_tensor("axes", TensorProto.INT64, list(axes.shape), axes))
 
     if steps:
         assert axes is not None and len(axes) == len(steps)
         steps = np.asarray(steps)
-        inputs.append(helper.make_tensor_value_info("steps", TensorProto.INT32, list(axes.shape)))
-        initializer.append(helper.make_tensor("steps", TensorProto.INT32, list(steps.shape), steps))
+        inputs.append(helper.make_tensor_value_info("steps", TensorProto.INT64, list(axes.shape)))
+        initializer.append(helper.make_tensor("steps", TensorProto.INT64, list(steps.shape), steps))
 
     y = helper.make_node("Slice", ["data", *slice_inputs], ["out"])
 
@@ -758,10 +751,7 @@ def add_noop_to_input_attr(attr_name, attr):
         initializer=initializer,
     )
     model = helper.make_model(graph, producer_name="slice_test")
-
-    for target, ctx in tvm.testing.enabled_targets():
-        tvm_out = get_tvm_output_with_vm(model, indata, target, ctx, opset=10, freeze_params=True)
-        tvm.testing.assert_allclose(outdata, tvm_out)
+    verify_with_ort_with_inputs(model, [indata], opset=10, freeze_params=True, use_vm=True)
 
 
 # TODO(mbrookhart): enable once VM supports heterogenous execution
@@ -854,10 +844,7 @@ def _test_onnx_op_elementwise(inshape, outfunc, npargs, dtype, opname, kwargs, o
     )
 
     model = helper.make_model(graph, producer_name=opname + "_test")
-
-    for target, ctx in tvm.testing.enabled_targets():
-        tvm_out = get_tvm_output(model, indata, target, ctx, outdata.shape, dtype, opset=opset)
-        tvm.testing.assert_allclose(outdata, tvm_out)
+    verify_with_ort_with_inputs(model, [indata], [outdata.shape], opset=opset, dtype=dtype)
 
 
 @tvm.testing.uses_gpu
@@ -879,6 +866,7 @@ def test_clip():
         "float32",
         "Clip",
         {"min": -1.0, "max": 1.0},
+        opset=6,
     )
 
     _test_onnx_op_elementwise(
@@ -888,7 +876,7 @@ def test_clip():
         "float32",
         "Clip",
         {"max": 1.0},
-        opset=1,
+        opset=6,
     )
 
     _test_onnx_op_elementwise(
@@ -898,7 +886,7 @@ def test_clip():
         "float32",
         "Clip",
         {"min": -1.0},
-        opset=1,
+        opset=6,
     )
 
 
@@ -919,7 +907,7 @@ def test_clip_min_max_as_inputs():
     )
     model = helper.make_model(graph, producer_name="clip_test")
 
-    verify_with_ort(model, [input_shape], input_shape)
+    verify_with_ort(model, [input_shape], out_shape=[input_shape])
 
 
 @tvm.testing.uses_gpu
@@ -941,10 +929,7 @@ def _test_finite_ops(inshape, outfunc, npargs, dtype, opname, kwargs):
     )
 
     model = helper.make_model(graph, producer_name=opname + "_test")
-
-    for target, ctx in tvm.testing.enabled_targets():
-        tvm_out = get_tvm_output(model, indata, target, ctx, outdata.shape, dtype)
-        tvm.testing.assert_allclose(outdata, tvm_out)
+    verify_with_ort_with_inputs(model, [indata], [outdata.shape], dtype=dtype)
 
 
 @tvm.testing.uses_gpu
@@ -957,10 +942,9 @@ def test_isnan():
     _test_finite_ops((2, 4, 5, 6), np.isnan, {}, "float32", "IsNaN", {})
 
 
-def verify_gather_nd(in_shape, indices, dtype):
+def verify_gather_nd(in_shape, indices, out_shape, dtype="float32"):
     x = np.random.uniform(size=in_shape).astype(dtype)
-    indices = np.array(indices, dtype="int32")
-    out_np = tvm.topi.testing.gather_nd_python(x, indices)
+    indices = np.array(indices, dtype="int64")
 
     y = helper.make_node("GatherND", ["in", "indices"], ["out"])
 
@@ -968,23 +952,27 @@ def verify_gather_nd(in_shape, indices, dtype):
         [y],
         "gather_test",
         inputs=[
-            helper.make_tensor_value_info("in", TensorProto.FLOAT, list(in_shape)),
-            helper.make_tensor_value_info("indices", TensorProto.INT32, list(indices.shape)),
+            helper.make_tensor_value_info(
+                "in", mapping.NP_TYPE_TO_TENSOR_TYPE[np.dtype(dtype)], list(in_shape)
+            ),
+            helper.make_tensor_value_info("indices", TensorProto.INT64, list(indices.shape)),
+        ],
+        outputs=[
+            helper.make_tensor_value_info(
+                "out", mapping.NP_TYPE_TO_TENSOR_TYPE[np.dtype(dtype)], list(out_shape)
+            )
         ],
-        outputs=[helper.make_tensor_value_info("out", TensorProto.FLOAT, list(out_np.shape))],
     )
     model = helper.make_model(graph, producer_name="gather_test")
-
-    for target, ctx in tvm.testing.enabled_targets():
-        tvm_out = get_tvm_output(model, [x, indices], target, ctx, out_np.shape)
-        tvm.testing.assert_allclose(out_np, tvm_out)
+    verify_with_ort_with_inputs(model, [x, indices], [out_shape])
 
 
 @tvm.testing.uses_gpu
 def test_gather_nd():
-    verify_gather_nd((2, 2), [[0, 0], [1, 1]], "int32")
-    verify_gather_nd((3, 3, 3), [[0, 1], [1, 0]], "float32")
-    verify_gather_nd((4, 3, 5, 6), [[2, 1, 0, 0]], "float32")
+    verify_gather_nd([2, 2], [[0, 0], [1, 1]], [2], "int32")
+    verify_gather_nd([2, 2], [[1], [0]], [2, 2])
+    verify_gather_nd([2, 2, 2], [[0, 1], [1, 0]], [2, 2])
+    verify_gather_nd([2, 2, 2], [[[0, 1]], [[1, 0]]], [2, 1, 2])
 
 
 # TODO(mbrookhart): enable once VM supports heterogenous execution
@@ -1011,6 +999,7 @@ def test_onehot():
 
     model = helper.make_model(graph, producer_name="onehot_test")
 
+    # TODO(jwfromm): Replace test against np with test against onnxrt once we update versions.
     for target, ctx in tvm.testing.enabled_targets():
         tvm_out = get_tvm_output_with_vm(
             model, [indices_array, np.array([depth]).astype("int32"), values], target, ctx
@@ -1022,10 +1011,10 @@ def test_onehot():
 def test_matmul():
     a_shape = (4, 3)
     b_shape = (3, 4)
+    out_shape = [a_shape[0], b_shape[1]]
 
     a_array = np.random.uniform(size=a_shape).astype("float32")
     b_array = np.random.uniform(size=b_shape).astype("float32")
-    out_np = np.matmul(a_array, b_array)
 
     mul_node = helper.make_node("MatMul", ["a", "b"], ["out"])
 
@@ -1036,14 +1025,11 @@ def test_matmul():
             helper.make_tensor_value_info("a", TensorProto.FLOAT, list(a_shape)),
             helper.make_tensor_value_info("b", TensorProto.FLOAT, list(b_shape)),
         ],
-        outputs=[helper.make_tensor_value_info("out", TensorProto.FLOAT, list(out_np.shape))],
+        outputs=[helper.make_tensor_value_info("out", TensorProto.FLOAT, list(out_shape))],
     )
 
     model = helper.make_model(graph, producer_name="matmul_test")
-
-    for target, ctx in tvm.testing.enabled_targets():
-        tvm_out = get_tvm_output(model, [a_array, b_array], target, ctx, out_np.shape)
-        tvm.testing.assert_allclose(out_np, tvm_out, rtol=1e-5, atol=1e-5)
+    verify_with_ort_with_inputs(model, [a_array, b_array])
 
 
 def verify_batch_matmul(a_shape, b_shape, out_shape, target, ctx):
@@ -1063,10 +1049,7 @@ def verify_batch_matmul(a_shape, b_shape, out_shape, target, ctx):
     )
 
     model = helper.make_model(graph, producer_name="matmul_test")
-    onnx_out = get_onnxruntime_output(model, [a_array, b_array], "float32")[0]
-
-    tvm_out = get_tvm_output_with_vm(model, [a_array, b_array], target, ctx)
-    tvm.testing.assert_allclose(onnx_out, tvm_out, rtol=1e-5, atol=1e-5)
+    verify_with_ort_with_inputs(model, [a_array, b_array], use_vm=True, targets=[target])
 
 
 # TODO(mbrookhart): enable cuda once VM supports heterogenous execution
@@ -1152,29 +1135,7 @@ def verify_lrn(shape, nsize, dtype, alpha=None, beta=None, bias=None):
         outputs=[helper.make_tensor_value_info("out", TensorProto.FLOAT, list(shape))],
     )
     model = helper.make_model(graph, producer_name="lrn_test")
-
-    def _get_python_lrn():
-        square_sum = np.zeros(shape).astype(dtype)
-        for n, c, h, w in np.ndindex(in_array.shape):
-            square_sum[n, c, h, w] = sum(
-                in_array[
-                    n,
-                    max(0, c - int(math.floor((nsize - 1) / 2))) : min(
-                        5, c + int(math.ceil((nsize - 1) / 2)) + 1
-                    ),
-                    h,
-                    w,
-                ]
-                ** 2
-            )
-        py_out = in_array / ((bias + (alpha / nsize) * square_sum) ** beta)
-        return py_out
-
-    for target, ctx in tvm.testing.enabled_targets():
-        input_name = model.graph.input[0].name
-        py_out = _get_python_lrn()
-        tvm_out = get_tvm_output(model, in_array, target, ctx, py_out.shape, "float32")
-        tvm.testing.assert_allclose(py_out, tvm_out, rtol=1e-5, atol=1e-5)
+    verify_with_ort_with_inputs(model, [in_array])
 
 
 @tvm.testing.uses_gpu
@@ -1184,21 +1145,10 @@ def test_lrn():
 
 
 def verify_instance_norm(shape, axis=1):
-    def _get_python_instance_norm(x, gamma, beta, epsilon=1e-5):
-        dims_x = len(x.shape)
-        axis = tuple(range(2, dims_x))
-        mean = np.mean(x, axis=axis, keepdims=True)
-        var = np.var(x, axis=axis, keepdims=True)
-        dim_ones = (1,) * (dims_x - 2)
-        gamma = gamma.reshape(-1, *dim_ones)
-        beta = beta.reshape(-1, *dim_ones)
-        return gamma * (x - mean) / np.sqrt(var + epsilon) + beta
-
     x = np.random.randn(*shape).astype(np.float32)
     gamma = np.random.randn(shape[1]).astype(np.float32)
     beta = np.random.randn(shape[1]).astype(np.float32)
     epsilon = 1e-5
-    y = _get_python_instance_norm(x, gamma, beta, epsilon).astype(np.float32)
 
     node = onnx.helper.make_node(
         "InstanceNormalization",
@@ -1217,9 +1167,7 @@ def _get_python_instance_norm(x, gamma, beta, epsilon=1e-5):
         outputs=[helper.make_tensor_value_info("y", TensorProto.FLOAT, list(shape))],
     )
     model = helper.make_model(graph, producer_name="instance_norm_test")
-    for target, ctx in tvm.testing.enabled_targets():
-        tvm_out = get_tvm_output(model, [x, gamma, beta], target, ctx, shape, "float32")
-        tvm.testing.assert_allclose(y, tvm_out, rtol=1e-5, atol=1e-5)
+    verify_with_ort_with_inputs(model, [x, gamma, beta], out_shape=[shape])
 
 
 @tvm.testing.uses_gpu
@@ -1230,14 +1178,13 @@ def test_instance_norm():
     verify_instance_norm((8, 7, 6, 5, 4))
 
 
-def _test_upsample_nearest():
+def verify_upsample_nearest():
     scale = 2
     in_shape = (1, 1, 3, 3)
     out_shape = (1, 1, 3 * scale, 3 * scale)
     y = helper.make_node("Upsample", ["in"], ["out"], mode="nearest", scales=[1.0, 1.0, 2.0, 2.0])
 
     in_array = np.random.uniform(size=in_shape).astype(np.float32)
-    out_array = tvm.topi.testing.upsampling_python(in_array, (scale, scale), "NCHW")
 
     graph = helper.make_graph(
         [y],
@@ -1247,13 +1194,10 @@ def _test_upsample_nearest():
     )
 
     model = helper.make_model(graph, producer_name="upsample_nearest_test")
+    verify_with_ort_with_inputs(model, [in_array], [out_shape], opset=7)
 
-    for target, ctx in tvm.testing.enabled_targets():
-        tvm_out = get_tvm_output(model, in_array, target, ctx, out_shape, "float32")
-        tvm.testing.assert_allclose(out_array, tvm_out)
 
-
-def _test_upsample3d_nearest():
+def verify_upsample3d_nearest():
     scale = 2
     in_shape = (1, 1, 3, 3, 3)
     out_shape = (1, 1, 3 * scale, 3 * scale, 3 * scale)
@@ -1262,7 +1206,6 @@ def _test_upsample3d_nearest():
     )
 
     in_array = np.random.uniform(size=in_shape).astype(np.float32)
-    out_array = tvm.topi.testing.upsampling3d_python(in_array, (scale, scale, scale), "NCDHW")
 
     graph = helper.make_graph(
         [y],
@@ -1272,20 +1215,17 @@ def _test_upsample3d_nearest():
     )
 
     model = helper.make_model(graph, producer_name="upsample_nearest_test")
-
-    for target, ctx in tvm.testing.enabled_targets():
-        tvm_out = get_tvm_output(model, in_array, target, ctx, out_shape, "float32")
-        tvm.testing.assert_allclose(out_array, tvm_out)
+    # Upsample is deprecated after opset 9
+    verify_with_ort_with_inputs(model, [in_array], [out_shape], opset=7)
 
 
-def _test_upsample_bilinear():
+def verify_upsample_bilinear():
     scale = 2
     in_shape = (1, 1, 3, 3)
     out_shape = (1, 1, 3 * scale, 3 * scale)
     y = helper.make_node("Upsample", ["in"], ["out"], mode="linear", scales=[1.0, 1.0, 2.0, 2.0])
 
     in_array = np.random.uniform(size=in_shape).astype(np.float32)
-    out_array = tvm.topi.testing.bilinear_resize_python(in_array, (3 * scale, 3 * scale), "NCHW")
 
     graph = helper.make_graph(
         [y],
@@ -1295,51 +1235,10 @@ def _test_upsample_bilinear():
     )
 
     model = helper.make_model(graph, producer_name="upsample_bilinear_test")
-
-    for target, ctx in tvm.testing.enabled_targets():
-        tvm_out = get_tvm_output(model, in_array, target, ctx, out_shape, "float32")
-        tvm.testing.assert_allclose(out_array, tvm_out, rtol=1e-5, atol=1e-5)
+    verify_with_ort_with_inputs(model, [in_array], [out_shape], opset=7)
 
 
-def _test_upsample_bilinear_opset9():
-    scale = 2
-    in_shape = (1, 1, 3, 3)
-    out_shape = (1, 1, 3 * scale, 3 * scale)
-    y = helper.make_node("Upsample", ["in", "scales"], ["out"], mode="linear")
-    scales = [1, 1, 2, 2]
-    in_array = np.random.uniform(size=in_shape).astype(np.float32)
-    out_array = tvm.topi.testing.bilinear_resize_python(in_array, (3 * scale, 3 * scale), "NCHW")
-
-    ref_node = helper.make_node(
-        "Constant",
-        inputs=[],
-        outputs=["const"],
-        value=onnx.helper.make_tensor(
-            name="const_tensor",
-            data_type=TensorProto.FLOAT,
-            dims=scales,
-            vals=np.random.random(scales).flatten().astype(float),
-        ),
-    )
-
-    shape_node = helper.make_node("Shape", ["const"], ["scales"])
-
-    graph = helper.make_graph(
-        [ref_node, shape_node, y],
-        "upsample_bilinear_opset9_test",
-        inputs=[helper.make_tensor_value_info("in", TensorProto.FLOAT, list(in_shape))],
-        outputs=[helper.make_tensor_value_info("out", TensorProto.FLOAT, list(out_shape))],
-    )
-
-    model = helper.make_model(graph, producer_name="upsample_bilinear_opset9_test")
-
-    for target, ctx in tvm.testing.enabled_targets():
-        tvm_out = get_tvm_output_with_vm(
-            model, [in_array], target, ctx, opset=9, freeze_params=True
-        )
-
-
-def _test_upsample3d_trilinear():
+def verify_upsample3d_trilinear():
     scale = 2
     in_shape = (1, 1, 3, 3, 3)
     out_shape = (1, 1, 3 * scale, 3 * scale, 3 * scale)
@@ -1374,7 +1273,8 @@ def _test_upsample3d_trilinear():
     )
 
     model = helper.make_model(graph, producer_name="upsample_trilinear_test")
-
+    # TODO(jwfromm): Trilinear upsampling not supported in 1.0.0 onnxruntime.
+    # Replace topi comparison with verify_with_ort once we update.
     for target, ctx in tvm.testing.enabled_targets():
         tvm_out = get_tvm_output(model, in_array, target, ctx, out_shape, "float32")
         tvm.testing.assert_allclose(out_array, tvm_out, rtol=1e-5, atol=1e-5)
@@ -1383,41 +1283,36 @@ def _test_upsample3d_trilinear():
 # TODO(mbrookhart): enable once VM supports heterogenous execution
 # @tvm.testing.uses_gpu
 def test_upsample():
-    _test_upsample_nearest()
-    _test_upsample_bilinear()
-    _test_upsample_bilinear_opset9()
-    _test_upsample3d_nearest()
-    _test_upsample3d_trilinear()
+    verify_upsample_nearest()
+    verify_upsample_bilinear()
+    verify_upsample3d_nearest()
+    verify_upsample3d_trilinear()
 
 
-def _test_softmax(inshape, axis):
+def verify_softmax(inshape, axis):
     opname = "Softmax"
     indata = np.random.uniform(size=inshape).astype(np.float32)
     outshape = inshape
-    outdata = tvm.topi.testing.softmax_python(indata)
-    if isinstance(axis, int):
-        y = helper.make_node(opname, ["in"], ["out"], axis=axis)
-    elif axis is None:
-        y = helper.make_node(opname, ["in"], ["out"])
+    y = helper.make_node(opname, ["in"], ["out"])
+    if axis is not None:
+        axis_attr = helper.make_attribute("axis", axis)
+        y.attribute.append(axis_attr)
 
     graph = helper.make_graph(
         [y],
         opname + "_test",
         inputs=[helper.make_tensor_value_info("in", TensorProto.FLOAT, list(indata.shape))],
-        outputs=[helper.make_tensor_value_info("out", TensorProto.FLOAT, list(outdata.shape))],
+        outputs=[helper.make_tensor_value_info("out", TensorProto.FLOAT, list(outshape))],
     )
 
     model = helper.make_model(graph, producer_name=opname + "_test")
-
-    for target, ctx in tvm.testing.enabled_targets():
-        tvm_out = get_tvm_output(model, indata, target, ctx, outshape, "float32")
-        tvm.testing.assert_allclose(outdata, tvm_out, rtol=1e-5, atol=1e-5)
+    verify_with_ort_with_inputs(model, [indata])
 
 
 @tvm.testing.uses_gpu
 def test_softmax():
-    _test_softmax((1, 10), None)
-    _test_softmax((1, 10), 1)
+    verify_softmax((1, 10), None)
+    verify_softmax((1, 10), 1)
 
 
 def verify_min(input_dim):
@@ -1427,8 +1322,6 @@ def verify_min(input_dim):
     a_np2 = np.random.uniform(size=input_dim).astype(dtype)
     a_np3 = np.random.uniform(size=input_dim).astype(dtype)
 
-    b_np = np.min((a_np1, a_np2, a_np3), axis=0)
-
     min_node = helper.make_node("Min", ["a_np1", "a_np2", "a_np3"], ["out"])
 
     graph = helper.make_graph(
@@ -1439,14 +1332,11 @@ def verify_min(input_dim):
             helper.make_tensor_value_info("a_np2", TensorProto.FLOAT, list(input_dim)),
             helper.make_tensor_value_info("a_np3", TensorProto.FLOAT, list(input_dim)),
         ],
-        outputs=[helper.make_tensor_value_info("out", TensorProto.FLOAT, list(b_np.shape))],
+        outputs=[helper.make_tensor_value_info("out", TensorProto.FLOAT, list(input_dim))],
     )
 
     model = helper.make_model(graph, producer_name="Min_test")
-
-    for target, ctx in tvm.testing.enabled_targets():
-        tvm_out = get_tvm_output(model, [a_np1, a_np2, a_np3], target, ctx, b_np.shape)
-        tvm.testing.assert_allclose(b_np, tvm_out, rtol=1e-5, atol=1e-5)
+    verify_with_ort_with_inputs(model, [a_np1, a_np2, a_np3])
 
 
 @tvm.testing.uses_gpu
@@ -1462,8 +1352,6 @@ def verify_max(input_dim):
     a_np2 = np.random.uniform(size=input_dim).astype(dtype)
     a_np3 = np.random.uniform(size=input_dim).astype(dtype)
 
-    b_np = np.max((a_np1, a_np2, a_np3), axis=0)
-
     max_node = helper.make_node("Max", ["a_np1", "a_np2", "a_np3"], ["out"])
 
     graph = helper.make_graph(
@@ -1474,14 +1362,11 @@ def verify_max(input_dim):
             helper.make_tensor_value_info("a_np2", TensorProto.FLOAT, list(input_dim)),
             helper.make_tensor_value_info("a_np3", TensorProto.FLOAT, list(input_dim)),
         ],
-        outputs=[helper.make_tensor_value_info("out", TensorProto.FLOAT, list(b_np.shape))],
+        outputs=[helper.make_tensor_value_info("out", TensorProto.FLOAT, list(input_dim))],
     )
 
     model = helper.make_model(graph, producer_name="Max_test")
-
-    for target, ctx in tvm.testing.enabled_targets():
-        tvm_out = get_tvm_output(model, [a_np1, a_np2, a_np3], target, ctx, b_np.shape)
-        tvm.testing.assert_allclose(b_np, tvm_out, rtol=1e-5, atol=1e-5)
+    verify_with_ort_with_inputs(model, [a_np1, a_np2, a_np3])
 
 
 @tvm.testing.uses_gpu
@@ -1497,8 +1382,6 @@ def verify_mean(input_dim):
     a_np2 = np.random.uniform(size=input_dim).astype(dtype)
     a_np3 = np.random.uniform(size=input_dim).astype(dtype)
 
-    b_np = np.mean((a_np1, a_np2, a_np3), axis=0)
-
     mean_node = helper.make_node("Mean", ["a_np1", "a_np2", "a_np3"], ["out"])
 
     graph = helper.make_graph(
@@ -1509,14 +1392,11 @@ def verify_mean(input_dim):
             helper.make_tensor_value_info("a_np2", TensorProto.FLOAT, list(input_dim)),
             helper.make_tensor_value_info("a_np3", TensorProto.FLOAT, list(input_dim)),
         ],
-        outputs=[helper.make_tensor_value_info("out", TensorProto.FLOAT, list(b_np.shape))],
+        outputs=[helper.make_tensor_value_info("out", TensorProto.FLOAT, list(input_dim))],
     )
 
     model = helper.make_model(graph, producer_name="Mean_test")
-
-    for target, ctx in tvm.testing.enabled_targets():
-        tvm_out = get_tvm_output(model, [a_np1, a_np2, a_np3], target, ctx, b_np.shape)
-        tvm.testing.assert_allclose(b_np, tvm_out, rtol=1e-5, atol=1e-5)
+    verify_with_ort_with_inputs(model, [a_np1, a_np2, a_np3])
 
 
 @tvm.testing.uses_gpu
@@ -1530,22 +1410,17 @@ def verify_hardsigmoid(input_dim, alpha, beta):
 
     a_np1 = np.random.uniform(size=input_dim).astype(dtype)
 
-    b_np = np.clip(a_np1 * alpha + beta, 0, 1)
-
     hardsigmoid_node = helper.make_node("HardSigmoid", ["a_np1"], ["out"], alpha=alpha, beta=beta)
 
     graph = helper.make_graph(
         [hardsigmoid_node],
         "HardSigmoid_test",
         inputs=[helper.make_tensor_value_info("a_np1", TensorProto.FLOAT, list(input_dim))],
-        outputs=[helper.make_tensor_value_info("out", TensorProto.FLOAT, list(b_np.shape))],
+        outputs=[helper.make_tensor_value_info("out", TensorProto.FLOAT, list(input_dim))],
     )
 
     model = helper.make_model(graph, producer_name="HardSigmoid_test")
-
-    for target, ctx in tvm.testing.enabled_targets():
-        tvm_out = get_tvm_output(model, [a_np1], target, ctx, b_np.shape)
-        tvm.testing.assert_allclose(b_np, tvm_out, rtol=1e-5, atol=1e-5)
+    verify_with_ort_with_inputs(model, [a_np1])
 
 
 @tvm.testing.uses_gpu
@@ -1554,98 +1429,51 @@ def test_forward_hardsigmoid():
     verify_hardsigmoid((20, 20), 0.3, 0.4)
 
 
-def verify_argmin(input_dim, axis=None, keepdims=None):
-    def _argmin_numpy(data, axis=0, keepdims=True):
-        result = np.argmin(data, axis=axis)
-        if keepdims == 1:
-            result = np.expand_dims(result, axis)
-        return result.astype(data.dtype)
-
+def verify_argreduce(input_dim, op_name, axis=None, keepdims=None):
     a_np1 = np.random.uniform(-10, 10, input_dim).astype(np.int32)
-    if keepdims is None and axis is None:
-        b_np = _argmin_numpy(a_np1)
-        node = onnx.helper.make_node("ArgMin", inputs=["a_np1"], outputs=["out"])
-    elif axis is None:
-        b_np = _argmin_numpy(a_np1, keepdims=keepdims)
-        node = onnx.helper.make_node("ArgMin", inputs=["a_np1"], outputs=["out"], keepdims=keepdims)
-    elif keepdims is None:
-        b_np = _argmin_numpy(a_np1, axis=axis)
-        node = onnx.helper.make_node("ArgMin", inputs=["a_np1"], outputs=["out"], axis=axis)
+    out_shape = list(a_np1.shape)
+    def_axis = axis if axis is not None else 0
+    if keepdims == 1 or keepdims == None:
+        out_shape[def_axis] = 1
     else:
-        b_np = _argmin_numpy(a_np1, axis=axis, keepdims=keepdims)
-        node = onnx.helper.make_node(
-            "ArgMin", inputs=["a_np1"], outputs=["out"], axis=axis, keepdims=keepdims
-        )
-    graph = helper.make_graph(
-        [node],
-        "argmin_test",
-        inputs=[helper.make_tensor_value_info("a_np1", TensorProto.INT32, list(a_np1.shape))],
-        outputs=[helper.make_tensor_value_info("out", TensorProto.INT32, list(b_np.shape))],
-    )
-
-    model = helper.make_model(graph, producer_name="argmin_test")
-
-    for target, ctx in tvm.testing.enabled_targets():
-        tvm_out = get_tvm_output(model, [a_np1], target, ctx, b_np.shape, b_np.dtype)
-        tvm.testing.assert_allclose(b_np, tvm_out, rtol=1e-5, atol=1e-5)
-
+        out_shape.pop(def_axis)
 
-def verify_argmax(input_dim, axis=None, keepdims=None):
-    def _argmax_numpy(data, axis=0, keepdims=True):
-        result = np.argmax(data, axis=axis)
-        if keepdims == 1:
-            result = np.expand_dims(result, axis)
-        return result.astype(data.dtype)
+    node = onnx.helper.make_node(op_name, inputs=["a_np1"], outputs=["out"])
 
-    a_np1 = np.random.uniform(-10, 10, input_dim).astype(np.int32)
-    if keepdims is None and axis is None:
-        b_np = _argmax_numpy(a_np1)
-        node = onnx.helper.make_node("ArgMax", inputs=["a_np1"], outputs=["out"])
-    elif axis is None:
-        b_np = _argmax_numpy(a_np1, keepdims=keepdims)
-        node = onnx.helper.make_node("ArgMax", inputs=["a_np1"], outputs=["out"], keepdims=keepdims)
-    elif keepdims is None:
-        b_np = _argmax_numpy(a_np1, axis=axis)
-        node = onnx.helper.make_node("ArgMax", inputs=["a_np1"], outputs=["out"], axis=axis)
-    else:
-        b_np = _argmax_numpy(a_np1, axis=axis, keepdims=keepdims)
-        node = onnx.helper.make_node(
-            "ArgMax", inputs=["a_np1"], outputs=["out"], axis=axis, keepdims=keepdims
-        )
+    if keepdims is not None:
+        keepdims_attr = helper.make_attribute("keepdims", keepdims)
+        node.attribute.append(keepdims_attr)
+    if axis is not None:
+        axis_attr = helper.make_attribute("axis", axis)
+        node.attribute.append(axis_attr)
 
     graph = helper.make_graph(
         [node],
-        "argmax_test",
+        "argreduce_test",
         inputs=[helper.make_tensor_value_info("a_np1", TensorProto.INT32, list(a_np1.shape))],
-        outputs=[helper.make_tensor_value_info("out", TensorProto.INT32, list(b_np.shape))],
+        outputs=[helper.make_tensor_value_info("out", TensorProto.INT64, list(out_shape))],
     )
 
-    model = helper.make_model(graph, producer_name="argmax_test")
-
-    for target, ctx in tvm.testing.enabled_targets():
-        tvm_out = get_tvm_output(model, [a_np1], target, ctx, b_np.shape, b_np.dtype)
-        tvm.testing.assert_allclose(b_np, tvm_out, rtol=1e-5, atol=1e-5)
+    model = helper.make_model(graph, producer_name="argreduce_test")
+    verify_with_ort_with_inputs(model, [a_np1])
 
 
 @tvm.testing.uses_gpu
 def test_forward_arg_min_max():
     """Verify argmin and argmax"""
-    verify_argmin([3, 4, 4])
-    verify_argmax([3, 4, 4])
-    verify_argmin([3, 4, 4], axis=1)
-    verify_argmax([3, 4, 4], axis=0)
-    verify_argmin([3, 4, 4], keepdims=0)
-    verify_argmax([3, 4, 4], keepdims=1)
+    verify_argreduce([3, 4, 4], "ArgMin")
+    verify_argreduce([3, 4, 4], "ArgMax")
+    verify_argreduce([3, 4, 4], "ArgMin", axis=1)
+    verify_argreduce([3, 4, 4], "ArgMax", axis=0)
+    verify_argreduce([3, 4, 4], "ArgMin", keepdims=0)
+    verify_argreduce([3, 4, 4], "ArgMax", keepdims=1)
     for axis in [None, 0, 1, 2]:
         for keepdims in [None, True, False]:
-            verify_argmin([3, 4, 4], axis, keepdims)
-            verify_argmax([3, 4, 4], axis, keepdims)
+            verify_argreduce([3, 4, 4], "ArgMin", axis, keepdims)
+            verify_argreduce([3, 4, 4], "ArgMax", axis, keepdims)
 
 
 def verify_constantofshape(input_dim, value, dtype):
-    out = np.empty(shape=input_dim, dtype=dtype)
-    out.fill(value)
-
     fill_node = helper.make_node(
         "ConstantOfShape",
         ["input"],
@@ -1655,22 +1483,22 @@ def verify_constantofshape(input_dim, value, dtype):
         ),
     )
 
-    inputs = [helper.make_tensor_value_info("input", TensorProto.FLOAT, input_dim)]
+    inputs = [helper.make_tensor_value_info("input", TensorProto.INT64, [len(input_dim)])]
 
     graph = helper.make_graph(
         [fill_node],
         "fill_test",
         inputs,
-        outputs=[helper.make_tensor_value_info("output", TensorProto.FLOAT, list(out.shape))],
+        outputs=[
+            helper.make_tensor_value_info(
+                "output", mapping.NP_TYPE_TO_TENSOR_TYPE[np.dtype(dtype)], input_dim
+            )
+        ],
     )
 
     model = helper.make_model(graph, producer_name="fill_test")
-
-    for target, ctx in tvm.testing.enabled_targets():
-        input_np = np.array(input_dim).astype("float32")
-        tvm_out = get_tvm_output_with_vm(model, [input_np], target, ctx)
-
-        tvm.testing.assert_allclose(out, tvm_out, rtol=1e-5, atol=1e-5)
+    input_np = np.array(input_dim).astype("int64")
+    verify_with_ort_with_inputs(model, [input_np], use_vm=True)
 
 
 # TODO(mbrookhart): enable once VM supports heterogenous execution
@@ -1708,10 +1536,7 @@ def verify_pad(indata, pads, mode="constant", value=0.0):
         outputs=[helper.make_tensor_value_info("output", TensorProto.FLOAT, list(outdata.shape))],
     )
     model = helper.make_model(graph, producer_name="pad_test")
-    #  tvm result
-    for target, ctx in tvm.testing.enabled_targets():
-        tvm_out = get_tvm_output(model, indata, target, ctx, outdata.shape, "float32", opset=2)
-    tvm.testing.assert_allclose(outdata, tvm_out, rtol=1e-5, atol=1e-5)
+    verify_with_ort_with_inputs(model, [indata], [outdata.shape], dtype="float32", opset=2)
 
 
 def verify_pad_v11(indata, pads, mode="constant", value=0.0):
@@ -1760,10 +1585,7 @@ def verify_pad_v11(indata, pads, mode="constant", value=0.0):
             ],
         )
     model = helper.make_model(graph, producer_name="pad_test")
-    #  tvm result
-    for target, ctx in tvm.testing.enabled_targets():
-        tvm_out = get_tvm_output_with_vm(model, inputs, target, ctx, opset=11, freeze_params=False)
-    tvm.testing.assert_allclose(outdata, tvm_out, rtol=1e-5, atol=1e-5)
+    verify_with_ort_with_inputs(model, inputs, opset=11, use_vm=True)
 
 
 # TODO(mbrookhart): enable once VM supports heterogenous execution
@@ -1804,7 +1626,7 @@ def verify_reduce_func(func, data, axis, keepdims):
 
     model = helper.make_model(graph, producer_name="reduce_test")
 
-    verify_with_ort_with_inputs(model, [data], outshape)
+    verify_with_ort_with_inputs(model, [data], [outshape])
 
 
 @tvm.testing.uses_gpu
@@ -1849,32 +1671,45 @@ def test_all_reduce_funcs():
             )
 
 
-def verify_split(indata, outdatas, split, axis=0, pass_split=True):
+def verify_split(indata, outdatas, split, axis=0, pass_split=True, opset=11):
     indata = np.array(indata).astype(np.float32)
     outdatas = [np.array(o).astype(np.float32) for o in outdatas]
+    inputs = [helper.make_tensor_value_info("input", TensorProto.FLOAT, list(indata.shape))]
+    input_names = ["input"]
+    initializer = []
+
     if split:
         split_index = range(len(split))
     else:
         split_index = range(len(outdatas))
+
     if pass_split:
-        node = helper.make_node(
-            "Split",
-            inputs=["input"],
-            outputs=["output_{}".format(i) for i in range(len(split_index))],
-            axis=axis,
-            split=split,
-        )
-    else:
-        node = helper.make_node(
-            "Split",
-            inputs=["input"],
-            outputs=["output_{}".format(i) for i in range(len(split_index))],
-            axis=axis,
-        )
+        if opset >= 13:
+            input_names.append("split")
+            np_split = np.array(split).astype(np.int64)
+            inputs.append(
+                helper.make_tensor_value_info("split", TensorProto.INT64, list(np_split.shape))
+            )
+            indata = [indata, np_split]
+            initializer.append(
+                helper.make_tensor("split", TensorProto.INT64, list(np_split.shape), np_split)
+            )
+    node = helper.make_node(
+        "Split",
+        inputs=input_names,
+        outputs=["output_{}".format(i) for i in range(len(split_index))],
+        axis=axis,
+    )
+
+    if pass_split and opset < 13:
+        split_attr = helper.make_attribute("split", split)
+        node.attribute.append(split_attr)
+
     graph = helper.make_graph(
         [node],
         "split_test",
-        inputs=[helper.make_tensor_value_info("input", TensorProto.FLOAT, list(indata.shape))],
+        inputs=inputs,
+        initializer=initializer,
         outputs=[
             helper.make_tensor_value_info(
                 "output_{}".format(i), TensorProto.FLOAT, list(outdatas[i].shape)
@@ -1883,18 +1718,7 @@ def verify_split(indata, outdatas, split, axis=0, pass_split=True):
         ],
     )
     model = helper.make_model(graph, producer_name="split_test")
-
-    import onnxruntime.backend
-
-    rep = onnxruntime.backend.prepare(model, "CPU")
-    onnx_out = rep.run(indata)
-
-    for target, ctx in tvm.testing.enabled_targets():
-        output_shape = [o.shape for o in outdatas]
-        output_type = ["float32", "float32", "float32"]
-        tvm_out = get_tvm_output(model, indata, target, ctx, output_shape, output_type)
-        for o, t in zip(onnx_out, tvm_out):
-            tvm.testing.assert_allclose(o, t)
+    verify_with_ort_with_inputs(model, indata, out_shape=list(range(len(split_index))), opset=opset)
 
 
 @tvm.testing.uses_gpu
@@ -1914,6 +1738,8 @@ def test_split():
     )
     # Split evenly (unstack)
     verify_split([1, 2, 3], [[1], [2], [3]], False, 0, False)
+    # Split a single value to a single value
+    verify_split([1], [[1]], [1], pass_split=True)
 
 
 @tvm.testing.uses_gpu
@@ -1922,50 +1748,52 @@ def test_binary_ops():
     dtype = "float32"
     out_shape = in_shape
 
-    def verify_binary_ops(op, x, y, out_np, x_name="in1", y_name="in2", broadcast=None):
-        if broadcast is None:
-            z = helper.make_node(op, [x_name, y_name], ["out"])
-        else:
-            z = helper.make_node(op, [x_name, y_name], ["out"], broadcast=1)
+    def verify_binary_ops(op, x, y, out_type="float32"):
+        z = helper.make_node(op, ["in1", "in2"], ["out"])
         graph = helper.make_graph(
             [z],
             "_test",
             inputs=[
-                helper.make_tensor_value_info(x_name, TensorProto.FLOAT, list(in_shape)),
-                helper.make_tensor_value_info(y_name, TensorProto.FLOAT, list(in_shape)),
+                helper.make_tensor_value_info("in1", TensorProto.FLOAT, x.shape),
+                helper.make_tensor_value_info("in2", TensorProto.FLOAT, y.shape),
+            ],
+            outputs=[
+                helper.make_tensor_value_info(
+                    "out", mapping.NP_TYPE_TO_TENSOR_TYPE[np.dtype(out_type)], list(out_shape)
+                )
             ],
-            outputs=[helper.make_tensor_value_info("out", TensorProto.FLOAT, list(out_shape))],
         )
         model = helper.make_model(graph, producer_name="_test")
-        for target, ctx in tvm.testing.enabled_targets():
-            tvm_out = get_tvm_output(model, [x, y], target, ctx)
-            tvm.testing.assert_allclose(out_np, tvm_out, rtol=1e-5, atol=1e-5)
+        verify_with_ort_with_inputs(model, [x, y])
 
     x = np.random.uniform(size=in_shape).astype(dtype)
     y = np.random.uniform(size=in_shape).astype(dtype)
     z = np.random.uniform(size=(3,)).astype(dtype)
-    verify_binary_ops("Add", x, y, x + y, broadcast=None)
-    verify_binary_ops("Add", x, z, x + z, broadcast=True)
-    verify_binary_ops("Sub", x, y, x - y, broadcast=None)
-    verify_binary_ops("Sub", x, z, x - z, broadcast=True)
-    verify_binary_ops("Mul", x, y, x * y, broadcast=None)
-    verify_binary_ops("Mul", x, z, x * z, broadcast=True)
-    verify_binary_ops("Mul", x, x, x * x, x_name="in1", y_name="in1", broadcast=None)
-    verify_binary_ops("Div", x, y, x / y, broadcast=None)
-    verify_binary_ops("Div", x, z, x / z, broadcast=True)
-    verify_binary_ops("Sum", x, y, x + y, broadcast=None)
-    verify_binary_ops("Greater", x, y, x > y, broadcast=True)
-    verify_binary_ops("Less", x, y, x < y, broadcast=True)
-    verify_binary_ops("Equal", x, y, x == y, broadcast=True)
-
-
-@tvm.testing.uses_gpu
-def test_single_ops():
+    verify_binary_ops("Add", x, y)
+    verify_binary_ops("Add", x, z)
+    verify_binary_ops("Sub", x, y)
+    verify_binary_ops("Sub", x, z)
+    verify_binary_ops("Mul", x, y)
+    verify_binary_ops("Mul", x, z)
+    verify_binary_ops("Div", x, y)
+    verify_binary_ops("Div", x, z)
+    verify_binary_ops("Sum", x, y)
+    verify_binary_ops("Sum", x, z)
+    verify_binary_ops("Greater", x, y, "bool")
+    verify_binary_ops("Greater", x, z, "bool")
+    verify_binary_ops("Less", x, y, "bool")
+    verify_binary_ops("Less", x, z, "bool")
+    verify_binary_ops("Equal", x, y, "bool")
+    verify_binary_ops("Equal", x, z, "bool")
+
+
+@tvm.testing.uses_gpu
+def test_unary_ops():
     in_shape = (1, 2, 3, 3)
     dtype = "float32"
     out_shape = in_shape
 
-    def verify_single_ops(op, x, out_np, rtol=1e-5, atol=1e-5):
+    def verify_unary_ops(op, x, rtol=1e-5, atol=1e-5):
         z = helper.make_node(op, ["in1"], ["out"])
         graph = helper.make_graph(
             [z],
@@ -1976,33 +1804,31 @@ def verify_single_ops(op, x, out_np, rtol=1e-5, atol=1e-5):
             outputs=[helper.make_tensor_value_info("out", TensorProto.FLOAT, list(out_shape))],
         )
         model = helper.make_model(graph, producer_name="_test")
-        for target, ctx in tvm.testing.enabled_targets():
-            tvm_out = get_tvm_output(model, [x], target, ctx)
-            tvm.testing.assert_allclose(out_np, tvm_out, rtol=rtol, atol=atol)
+        verify_with_ort_with_inputs(model, [x], rtol=rtol, atol=atol)
 
     x = np.random.uniform(size=in_shape).astype(dtype)
-    verify_single_ops("Neg", x, -x)
-    verify_single_ops("Abs", x, np.abs(x))
-    verify_single_ops("Reciprocal", x, 1 / x)
-    verify_single_ops("Sqrt", x, np.sqrt(x))
-    verify_single_ops("Relu", x, np.maximum(x, 0))
-    verify_single_ops("Exp", x, np.exp(x))
-    verify_single_ops("Log", x, np.log(x))
-    verify_single_ops("Log", x, np.log(x))
-    verify_single_ops("ACos", x, np.arccos(x))
-    verify_single_ops("ACosh", x, np.arccosh(x))
-    verify_single_ops("ASin", x, np.arcsin(x))
-    verify_single_ops("ASinh", x, np.arcsinh(x))
-    verify_single_ops("ATan", x, np.arctan(x))
-    verify_single_ops("ATanh", x, np.arctanh(x))
-    verify_single_ops("Cos", x, np.cos(x))
-    verify_single_ops("Cosh", x, np.cosh(x))
-    verify_single_ops("Sin", x, np.sin(x))
-    verify_single_ops("Sinh", x, np.sinh(x))
-    verify_single_ops("Tan", x, np.tan(x))
-    verify_single_ops("Tanh", x, np.tanh(x))
-    verify_single_ops("Sigmoid", x, 1 / (1 + np.exp(-x)))
-    verify_single_ops("Softsign", x, x / (1 + np.abs(x)))
+    verify_unary_ops("Neg", x)
+    verify_unary_ops("Abs", x)
+    verify_unary_ops("Reciprocal", x)
+    verify_unary_ops("Sqrt", x)
+    verify_unary_ops("Relu", x)
+    verify_unary_ops("Exp", x)
+    verify_unary_ops("Log", x)
+    verify_unary_ops("Log", x)
+    verify_unary_ops("Acos", x)
+    verify_unary_ops("Acosh", x)
+    verify_unary_ops("Asin", x)
+    verify_unary_ops("Asinh", x)
+    verify_unary_ops("Atan", x)
+    verify_unary_ops("Atanh", x)
+    verify_unary_ops("Cos", x)
+    verify_unary_ops("Cosh", x)
+    verify_unary_ops("Sin", x)
+    verify_unary_ops("Sinh", x)
+    verify_unary_ops("Tan", x)
+    verify_unary_ops("Tanh", x)
+    verify_unary_ops("Sigmoid", x)
+    verify_unary_ops("Softsign", x)
 
 
 @tvm.testing.uses_gpu
@@ -2058,7 +1884,11 @@ def verify_prelu(x_shape, a_shape):
         model = helper.make_model(graph, producer_name="prelu_test")
 
         verify_with_ort(
-            model, [x_shape, a_shape], list(x_shape), use_vm=True, convert_to_static=True
+            model,
+            [x_shape, a_shape],
+            out_shape=[list(x_shape)],
+            use_vm=True,
+            convert_to_static=True,
         )
 
     verify_prelu([3, 4, 5, 6], [1, 4, 1, 1])
@@ -2085,46 +1915,6 @@ def ThresholdedRelu_x(x, alpha):
     )
 
 
-@tvm.testing.uses_gpu
-def test_ScaledTanh():
-    def ScaledTanh_x(x, alpha, beta):
-        return alpha * np.tanh(beta * x)
-
-    _test_onnx_op_elementwise(
-        (2, 4, 5, 6),
-        ScaledTanh_x,
-        {"alpha": 0.25, "beta": 0.3},
-        "float32",
-        "ScaledTanh",
-        {"alpha": 0.25, "beta": 0.3},
-    )
-
-
-@tvm.testing.uses_gpu
-def test_ParametricSoftplus():
-    def ParametricSoftplus_x(x, alpha, beta):
-        return alpha * np.log(np.exp(beta * x) + 1)
-
-    _test_onnx_op_elementwise(
-        (2, 4, 5, 6),
-        ParametricSoftplus_x,
-        {"alpha": 0.25, "beta": 0.3},
-        "float32",
-        "ParametricSoftplus",
-        {"alpha": 0.25, "beta": 0.3},
-    )
-
-
-@tvm.testing.uses_gpu
-def test_Scale():
-    def Scale_x(x, scale):
-        return scale * x
-
-    _test_onnx_op_elementwise(
-        (2, 4, 5, 6), Scale_x, {"scale": 0.25}, "float32", "Scale", {"scale": 0.25}
-    )
-
-
 @tvm.testing.uses_gpu
 def test_LogSoftmax():
     _test_onnx_op_elementwise(
@@ -2138,8 +1928,8 @@ def check_torch_conversion(model, input_size):
     # Set verbose=True for more output
     torch.onnx.export(model(), dummy_input, file_name, export_params=True, verbose=False)
     onnx_model = onnx.load(file_name)
-    input_data = np.random.uniform(size=input_size).astype("int32")
-    verify_with_ort_with_inputs(onnx_model, [input_data])
+    input_data = np.random.uniform(size=input_size).astype("float32")
+    verify_with_ort_with_inputs(onnx_model, [input_data], apply_softmax=True)
 
 
 @tvm.testing.uses_gpu
@@ -2191,7 +1981,6 @@ def Sign_x(x):
 
 def verify_not(indata, dtype):
     x = indata.astype(dtype)
-    outdata = np.logical_not(x)
 
     node = helper.make_node(
         "Not",
@@ -2203,14 +1992,11 @@ def verify_not(indata, dtype):
         [node],
         "not_test",
         inputs=[helper.make_tensor_value_info("in", TensorProto.BOOL, list(x.shape))],
-        outputs=[helper.make_tensor_value_info("out", TensorProto.BOOL, list(outdata.shape))],
+        outputs=[helper.make_tensor_value_info("out", TensorProto.BOOL, list(x.shape))],
     )
 
     model = helper.make_model(graph, producer_name="not_test")
-
-    for target, ctx in tvm.testing.enabled_targets():
-        tvm_out = get_tvm_output(model, [x], target, ctx, outdata.shape)
-        tvm.testing.assert_allclose(outdata, tvm_out)
+    verify_with_ort_with_inputs(model, [x])
 
 
 @tvm.testing.uses_gpu
@@ -2245,10 +2031,7 @@ def verify_and(indata, dtype):
     )
 
     model = helper.make_model(graph, producer_name="and_test")
-
-    for target, ctx in tvm.testing.enabled_targets():
-        tvm_out = get_tvm_output(model, [x, y], target, ctx, outdata.shape)
-        tvm.testing.assert_allclose(outdata, tvm_out)
+    verify_with_ort_with_inputs(model, [x, y], [outdata.shape])
 
 
 @tvm.testing.uses_gpu
@@ -2279,22 +2062,6 @@ def test_and():
     verify_and(indata=[x, y], dtype=bool)
 
 
-def verify_tile_v1(indata, outdata, **kwargs):
-    node = helper.make_node("Tile", inputs=["in"], outputs=["out"], **kwargs)
-    graph = helper.make_graph(
-        [node],
-        "tile_test",
-        inputs=[helper.make_tensor_value_info("in", TensorProto.FLOAT, list(indata.shape))],
-        outputs=[helper.make_tensor_value_info("out", TensorProto.FLOAT, list(outdata.shape))],
-    )
-
-    model = helper.make_model(graph, producer_name="tile_test")
-
-    for target, ctx in tvm.testing.enabled_targets():
-        tvm_out = get_tvm_output(model, [indata], target, ctx, outdata.shape, opset=1)
-        tvm.testing.assert_allclose(outdata, tvm_out)
-
-
 def verify_tile_v6(indata, repeats, outdata):
     node = helper.make_node("Tile", inputs=["input", "repeats"], outputs=["out"])
     graph = helper.make_graph(
@@ -2308,10 +2075,7 @@ def verify_tile_v6(indata, repeats, outdata):
     )
 
     model = helper.make_model(graph, producer_name="tile_test")
-
-    for target, ctx in tvm.testing.enabled_targets():
-        tvm_out = get_tvm_output_with_vm(model, [indata, repeats], target, ctx, opset=6)
-        tvm.testing.assert_allclose(outdata, tvm_out)
+    verify_with_ort_with_inputs(model, [indata, repeats], use_vm=True, opset=6)
 
 
 # TODO(mbrookhart): enable once VM supports heterogenous execution
@@ -2320,7 +2084,6 @@ def test_tile():
     x = np.random.rand(2, 3, 4, 5).astype(np.float32)
     repeats = np.random.randint(low=1, high=10, size=(np.ndim(x),)).astype(np.int64)
     z = np.tile(x, repeats)
-    verify_tile_v1(x, z, repeats=repeats)
     verify_tile_v6(x, repeats, z)
 
 
@@ -2333,10 +2096,7 @@ def verify_erf(indata, outdata):
         outputs=[helper.make_tensor_value_info("out", TensorProto.FLOAT, list(outdata.shape))],
     )
     model = helper.make_model(graph, producer_name="erf_test")
-
-    for target, ctx in tvm.testing.enabled_targets():
-        tvm_out = get_tvm_output(model, [indata], target, ctx, outdata.shape)
-        tvm.testing.assert_allclose(outdata, tvm_out)
+    verify_with_ort_with_inputs(model, [indata], [outdata.shape])
 
 
 @tvm.testing.uses_gpu
@@ -2359,10 +2119,7 @@ def verify_where(condition, x, y, dtype, outdata):
         outputs=[helper.make_tensor_value_info("out", dtype, list(outdata.shape))],
     )
     model = helper.make_model(graph, producer_name="where_test")
-
-    for target, ctx in tvm.testing.enabled_targets():
-        tvm_out = get_tvm_output(model, [condition, x, y], target, ctx, outdata.shape)
-        tvm.testing.assert_allclose(outdata, tvm_out)
+    verify_with_ort_with_inputs(model, [condition, x, y], [outdata.shape])
 
 
 @tvm.testing.uses_gpu
@@ -2422,10 +2179,7 @@ def verify_or(indata, dtype):
     )
 
     model = helper.make_model(graph, producer_name="or_test")
-
-    for target, ctx in tvm.testing.enabled_targets():
-        tvm_out = get_tvm_output(model, [x, y], target, ctx, outdata.shape)
-        tvm.testing.assert_allclose(outdata, tvm_out)
+    verify_with_ort_with_inputs(model, [x, y], [outdata.shape])
 
 
 @tvm.testing.uses_gpu
@@ -2479,7 +2233,7 @@ def verify_batch_norm(in_shape):
         model = helper.make_model(graph, producer_name="batchnorm_test")
         # X, scale, b, mean, var
         inshapes = [in_shape, in_shape[1], in_shape[1], in_shape[1], in_shape[1]]
-        verify_with_ort(model, inshapes, in_shape)
+        verify_with_ort(model, inshapes, out_shape=[in_shape])
 
     verify_batch_norm([1, 3, 224, 224])
     verify_batch_norm([1, 3, 24, 24])
@@ -2517,7 +2271,7 @@ def verify_batch_norm_dynamic_subgraph(in_shape, o_shape):
 
         # X, inp, scale, b, mean, var
         inshapes = [in_shape, o_shape, in_shape[1], in_shape[1], in_shape[1], in_shape[1]]
-        verify_with_ort(model, inshapes, in_shape, use_vm=True)
+        verify_with_ort(model, inshapes, out_shape=[in_shape], use_vm=True)
 
     verify_batch_norm_dynamic_subgraph([16, 16, 10, 10], [160, 160])
 
@@ -2581,7 +2335,7 @@ def verify_conv(
 
     model = helper.make_model(graph, producer_name="conv_test")
 
-    verify_with_ort(model, [x_shape, w_shape], y_shape, use_vm=True, convert_to_static=True)
+    verify_with_ort(model, [x_shape, w_shape], [y_shape], use_vm=True, convert_to_static=True)
 
 
 @tvm.testing.uses_gpu
@@ -2735,7 +2489,7 @@ def verify_convtranspose_with_padding(
 
     model = helper.make_model(graph, producer_name="conv_test")
 
-    verify_with_ort(model, [x_shape, w_shape], y_shape, use_vm=True, convert_to_static=True)
+    verify_with_ort(model, [x_shape, w_shape], [y_shape], use_vm=True, convert_to_static=True)
 
 
 def verify_convtranspose(x_shape, w_shape, y_shape, p):
@@ -2908,7 +2662,7 @@ def verify_pooling(x_shape, kernel_shape, strides, pads, out_shape, mode, auto_p
     )
 
     model = helper.make_model(graph, producer_name="pooling_test")
-    verify_with_ort(model, [x_shape], out_shape, use_vm=True, convert_to_static=True)
+    verify_with_ort(model, [x_shape], [out_shape], use_vm=True, convert_to_static=True)
 
 
 @tvm.testing.uses_gpu
@@ -3013,7 +2767,7 @@ def verify_mod(x_shape, y_shape, fmod, out_shape, dtype="float32"):
         outputs=[helper.make_tensor_value_info("z", onnx_dtype, list(out_shape))],
     )
     model = helper.make_model(graph, producer_name="mod_test")
-    verify_with_ort_with_inputs(model, [x_np, y_np], out_shape)
+    verify_with_ort_with_inputs(model, [x_np, y_np], [out_shape])
 
 
 @tvm.testing.uses_gpu
@@ -3066,10 +2820,7 @@ def verify_xor(x_shape, y_shape):
         outputs=[helper.make_tensor_value_info("z", onnx_dtype, list(out_shape))],
     )
     model = helper.make_model(graph, producer_name="xor_test")
-
-    for target, ctx in tvm.testing.enabled_targets():
-        tvm_out = get_tvm_output(model, [x_np, y_np], target, ctx, out_shape)
-        tvm.testing.assert_allclose(np_out, tvm_out, rtol=1e-5, atol=1e-5)
+    verify_with_ort_with_inputs(model, [x_np, y_np], [out_shape])
 
 
 @tvm.testing.uses_gpu
@@ -3106,7 +2857,7 @@ def verify_max_roi_pool(x_shape, rois_shape, pooled_shape, spatial_scale, out_sh
     )
 
     model = helper.make_model(graph, producer_name="pool_test")
-    verify_with_ort(model, [x_shape, rois_shape], out_shape)
+    verify_with_ort(model, [x_shape, rois_shape], [out_shape])
 
 
 @tvm.testing.uses_gpu
@@ -3158,7 +2909,7 @@ def verify_lppool(x_shape, kernel_shape, p, strides, pads, out_shape, auto_pad="
     )
 
     model = helper.make_model(graph, producer_name="lppool_test")
-    verify_with_ort(model, [x_shape], out_shape, use_vm=True, convert_to_static=True)
+    verify_with_ort(model, [x_shape], [out_shape], use_vm=True, convert_to_static=True)
 
 
 @tvm.testing.uses_gpu
@@ -3350,18 +3101,7 @@ def verify_rnn(
 
     model = helper.make_model(graph, producer_name="rnn_test")
 
-    for target, ctx in tvm.testing.enabled_targets():
-        onnx_out = get_onnxruntime_output(model, input_values, "float32")
-        tvm_out = get_tvm_output(
-            model,
-            input_values,
-            target,
-            ctx,
-            output_shapes,
-            output_dtype=["float32"] * len(output_shapes),
-        )
-        for o_out, t_out in zip(onnx_out, tvm_out):
-            tvm.testing.assert_allclose(o_out, t_out, rtol=5e-3, atol=5e-3)
+    verify_with_ort_with_inputs(model, input_values, output_shapes, atol=1e-2, rtol=1e-2)
 
 
 @tvm.testing.uses_gpu
@@ -3566,7 +3306,7 @@ def verify(ishape, oshape, scales, mode, coord_trans):
 
         model = helper.make_model(graph, producer_name="resize_test")
 
-        verify_with_ort(model, [ishape], oshape, use_vm=True, opset=11, freeze_params=True)
+        verify_with_ort(model, [ishape], [oshape], use_vm=True, opset=11, freeze_params=True)
 
     # upsampling
     verify([1, 16, 32, 32], [1, 16, 64, 64], [], "nearest", "asymmetric")
@@ -3603,9 +3343,7 @@ def verify_opset_10(ishape, scales, mode):
         )
 
         model = helper.make_model(graph, producer_name="resize_test")
-        model.opset_import[0].version = 10
-
-        verify_with_ort(model, [ishape], oshape, use_vm=True, freeze_params=True)
+        verify_with_ort(model, [ishape], [oshape], use_vm=True, freeze_params=True, opset=10)
 
     verify_opset_10([1, 16, 32, 32], [1, 1, 2, 2], "nearest")
     verify_opset_10([1, 16, 32, 32], [1, 1, 0.5, 0.5], "linear")
@@ -3674,11 +3412,7 @@ def verify_topk(input_dims, K, axis=-1):
         model = helper.make_model(graph, producer_name="topk_test")
 
         indata = np.random.uniform(-10, 10, input_dims).astype(np.float32)
-        onnx_out = get_onnxruntime_output(model, [indata, np.array([K])])
-
-        for target, ctx in [("llvm", tvm.cpu())]:
-            tvm_out = get_tvm_output_with_vm(model, [indata, np.array(K)], target, ctx)
-            tvm.testing.assert_allclose(onnx_out, tvm_out, rtol=1e-05, atol=1e-05)
+        verify_with_ort_with_inputs(model, [indata, np.array([K])], use_vm=True)
 
     for n in [12, 32]:
         for shape in [[n], [n, n], [n, n, n]]:
@@ -3731,7 +3465,9 @@ def verify_roi_align(
         np_rois = np.random.uniform(size=[num_roi, 4]).astype("float32") * input_dims[2]
         np_batch_indicies = np.random.randint(low=0, high=input_dims[0], size=num_roi)
 
-        verify_with_ort_with_inputs(model, [np_data, np_rois, np_batch_indicies], output_dims)
+        verify_with_ort_with_inputs(
+            model, [np_data, np_rois, np_batch_indicies], out_shape=[output_dims]
+        )
 
     verify_roi_align((1, 4, 16, 16), 32, 7, 7, sampling_ratio=0, spatial_scale=1.0)
     verify_roi_align((4, 4, 16, 32), 32, 7, 7, sampling_ratio=0, spatial_scale=1.0)
@@ -3914,12 +3650,7 @@ def verify_cond_loop():
     trip_count = np.array(40).astype(np.int64)
     cond = np.array(1).astype(np.bool)
     input_vals = [trip_count, cond, y]
-    onnx_out = get_onnxruntime_output(loop_model, input_vals)
-
-    for target, ctx in [("llvm", tvm.cpu())]:
-        tvm_out = get_tvm_output_with_vm(loop_model, input_vals, target, ctx, freeze_params=True)
-        for i in range(len(tvm_out)):
-            tvm.testing.assert_allclose(onnx_out[i], tvm_out[i], rtol=1e-05, atol=1e-05)
+    verify_with_ort_with_inputs(loop_model, input_vals, use_vm=True, freeze_params=True)
 
 
 def verify_count_loop():
@@ -3974,12 +3705,7 @@ def verify_count_loop():
     trip_count = np.array(5).astype(np.int64)
     cond = np.array(1).astype(np.bool)
     input_vals = [trip_count, cond, y]
-    onnx_out = get_onnxruntime_output(loop_model, input_vals)
-
-    for target, ctx in [("llvm", tvm.cpu())]:
-        tvm_out = get_tvm_output_with_vm(loop_model, input_vals, target, ctx, freeze_params=True)
-        for i in range(len(tvm_out)):
-            tvm.testing.assert_allclose(onnx_out[i], tvm_out[i], rtol=1e-05, atol=1e-05)
+    verify_with_ort_with_inputs(loop_model, input_vals, use_vm=True, freeze_params=True)
 
 
 def test_loop():
@@ -3999,11 +3725,11 @@ def verify_if(cond_array):
     y = np.array([5, 4, 3, 2, 1]).astype(np.float32)
 
     then_const_node = onnx.helper.make_node(
-        "Constant", inputs=[], outputs=["then_out"], value=onnx.numpy_helper.from_array(x)
+        "Constant", inputs=[], outputs=["then_out"], value=numpy_helper.from_array(x)
     )
 
     else_const_node = onnx.helper.make_node(
-        "Constant", inputs=[], outputs=["else_out"], value=onnx.numpy_helper.from_array(y)
+        "Constant", inputs=[], outputs=["else_out"], value=numpy_helper.from_array(y)
     )
 
     then_body = onnx.helper.make_graph([then_const_node], "then_body", [], [then_out])
@@ -4032,6 +3758,8 @@ def verify_if(cond_array):
         cond = np.array(1).astype("bool")
     correct_out = x if cond else y
 
+    # TODO(jwfromm): Onnxruntime 1.0.0 is buggy with If statements. Replace this with
+    # verify_with_ort once we update versions.
     for target, ctx in tvm.testing.enabled_targets():
         tvm_out = get_tvm_output_with_vm(if_model, [cond], target, ctx, freeze_params=True)
         for i in range(len(tvm_out)):
@@ -4204,15 +3932,12 @@ def verify_softplus(indata):
     test_pad()
     test_split()
     test_binary_ops()
-    test_single_ops()
+    test_unary_ops()
     test_leaky_relu()
     test_elu()
     test_selu()
     test_prelu()
     test_ThresholdedRelu()
-    test_ScaledTanh()
-    test_ParametricSoftplus()
-    test_Scale()
     test_LogSoftmax()
     test_resnet()
     test_inception()

From 2290cc0f79e9f9c255e10bd3775c711591c34e99 Mon Sep 17 00:00:00 2001
From: masahi <masahi129@gmail.com>
Date: Wed, 20 Jan 2021 04:08:52 +0900
Subject: [PATCH 088/357] [TOPI] Minor perf improvement for GPU scatter (#7233)

* improve scatter 4d init

* do not launch sorting based scatter for small input

* do not use hard coded num threads

* separate sort based implementation

* register scatter as autotvm task

* add missing import

* fix strategy

* add dedicated schedule and dummy flop

* add test tuning script

* try adding dummy knob

* skip random_fill when a tuning workload is from scatter

This reverts commit 1fed88321e640b509fc46fac7da3b3cb79719552.

* cleanup memcpy ir

* remove scatter tuning script

* make sure zero init arguments

* add comment on why skip random init for scatter

* restore ctx sync

Co-authored-by: masa <masa@pop-os.localdomain>
---
 python/tvm/autotvm/measure/measure_methods.py |   9 +-
 python/tvm/relay/op/strategy/cuda.py          |  15 +-
 python/tvm/relay/op/strategy/generic.py       |   2 +-
 python/tvm/topi/cuda/scatter.py               | 179 ++++++++++--------
 4 files changed, 123 insertions(+), 82 deletions(-)

diff --git a/python/tvm/autotvm/measure/measure_methods.py b/python/tvm/autotvm/measure/measure_methods.py
index cb801ba72872..ffe4b97e33db 100644
--- a/python/tvm/autotvm/measure/measure_methods.py
+++ b/python/tvm/autotvm/measure/measure_methods.py
@@ -30,6 +30,7 @@
 from random import getrandbits
 from collections import namedtuple
 import tempfile
+import numpy as np
 
 import tvm._ffi
 import tvm.ir.transform
@@ -560,9 +561,11 @@ def run_through_rpc(
             raise AttributeError(
                 "Please make sure USE_RANDOM is ON in the config.cmake " "on the remote devices"
             )
-        args = [nd.empty(x[0], dtype=x[1], ctx=ctx) for x in build_result.arg_info]
-        for arg in args:
-            random_fill(arg)
+        args = [nd.array(np.zeros(x[0], dtype=x[1]), ctx=ctx) for x in build_result.arg_info]
+        if "scatter" not in measure_input.task.name:
+            # the index tensor of scatter op cannot be randomly initialized
+            for arg in args:
+                random_fill(arg)
         ctx.sync()
 
         costs = time_f(*args).results
diff --git a/python/tvm/relay/op/strategy/cuda.py b/python/tvm/relay/op/strategy/cuda.py
index 04c16ddd344c..3863df0fd831 100644
--- a/python/tvm/relay/op/strategy/cuda.py
+++ b/python/tvm/relay/op/strategy/cuda.py
@@ -783,10 +783,23 @@ def scatter_cuda(attrs, inputs, out_type, target):
     strategy = _op.OpStrategy()
     strategy.add_implementation(
         wrap_compute_scatter(topi.cuda.scatter),
-        wrap_topi_schedule(topi.generic.schedule_extern),
+        wrap_topi_schedule(topi.cuda.schedule_scatter),
         name="scatter.cuda",
         plevel=10,
     )
+
+    rank = len(inputs[0].shape)
+
+    with SpecializedCondition(rank == 1):
+        if target.kind.name == "cuda" and get_global_func(
+            "tvm.contrib.thrust.stable_sort_by_key", allow_missing=True
+        ):
+            strategy.add_implementation(
+                wrap_compute_scatter(topi.cuda.scatter_via_sort),
+                wrap_topi_schedule(topi.cuda.schedule_scatter_via_sort),
+                name="scatter_via_sort.cuda",
+                plevel=9,  # use the sequential version by default
+            )
     return strategy
 
 
diff --git a/python/tvm/relay/op/strategy/generic.py b/python/tvm/relay/op/strategy/generic.py
index 363832ef8b2f..8dd9dc5844dd 100644
--- a/python/tvm/relay/op/strategy/generic.py
+++ b/python/tvm/relay/op/strategy/generic.py
@@ -1123,7 +1123,7 @@ def wrap_compute_scatter(topi_compute):
     """Wrap scatter topi compute"""
 
     def _compute_scatter(attrs, inputs, _):
-        return [topi_compute(inputs[0], inputs[1], inputs[2], axis=attrs.axis)]
+        return [topi_compute(inputs[0], inputs[1], inputs[2], attrs.axis)]
 
     return _compute_scatter
 
diff --git a/python/tvm/topi/cuda/scatter.py b/python/tvm/topi/cuda/scatter.py
index be602c8ab7a3..b34bd1df14e4 100644
--- a/python/tvm/topi/cuda/scatter.py
+++ b/python/tvm/topi/cuda/scatter.py
@@ -17,16 +17,33 @@
 # pylint: disable=invalid-name, no-member, too-many-locals, too-many-arguments, too-many-statements, singleton-comparison, unused-argument
 """Scatter operator """
 import tvm
-from tvm import te
+from tvm import te, autotvm
 from ..scatter import _verify_scatter_nd_inputs
+from ..generic import schedule_extern
 from .nms import atomic_add
 from .sort import stable_sort_by_key_thrust, is_thrust_available
+from ..utils import prod
 
 
 def ceil_div(a, b):
     return (a + b - 1) // b
 
 
+def _memcpy_ir(ib, out_ptr, data_ptr, shape):
+    fused = prod(shape)
+    with ib.new_scope():
+        num_thread = int(tvm.target.Target.current(allow_none=False).max_num_threads)
+        num_blocks = ceil_div(fused, num_thread)
+        bx = te.thread_axis("blockIdx.x")
+        ib.scope_attr(bx, "thread_extent", num_blocks)
+        tx = te.thread_axis("threadIdx.x")
+        ib.scope_attr(tx, "thread_extent", num_thread)
+        tid = bx * num_thread + tx
+
+        with ib.if_scope(tid < fused):
+            out_ptr[tid] = data_ptr[tid]
+
+
 def gen_ir_1d(data, indices, updates, axis, out, update_func):
     """Generate scatter ir for 1d inputs
 
@@ -63,10 +80,7 @@ def gen_ir_1d(data, indices, updates, axis, out, update_func):
     out_ptr = ib.buffer_ptr(out)
     data_ptr = ib.buffer_ptr(data)
 
-    with ib.new_scope():
-        bx = te.thread_axis("blockIdx.x")
-        ib.scope_attr(bx, "thread_extent", n)
-        out_ptr[bx] = data_ptr[bx]
+    _memcpy_ir(ib, out_ptr, data_ptr, data.shape)
 
     indices_ptr = ib.buffer_ptr(indices)
     updates_ptr = ib.buffer_ptr(updates)
@@ -114,8 +128,6 @@ def gen_ir_2d(data, indices, updates, axis, out, update_func):
     ret : tir
         The computational ir.
     """
-    warp_size = tvm.target.Target.current(False).thread_warp_size
-
     n = data.shape[0]
     c = data.shape[1]
 
@@ -124,16 +136,7 @@ def gen_ir_2d(data, indices, updates, axis, out, update_func):
     out_ptr = ib.buffer_ptr(out)
     data_ptr = ib.buffer_ptr(data)
 
-    with ib.new_scope():
-        bx = te.thread_axis("blockIdx.x")
-        ib.scope_attr(bx, "thread_extent", n)
-        tx = te.thread_axis("threadIdx.x")
-        ib.scope_attr(tx, "thread_extent", warp_size)
-        with ib.for_range(0, ceil_div(c, warp_size), name="j") as j_:
-            j = j_ * warp_size + tx
-            with ib.if_scope(j < c):
-                idx = bx * c + j
-                out_ptr[idx] = data_ptr[idx]
+    _memcpy_ir(ib, out_ptr, data_ptr, data.shape)
 
     indices_ptr = ib.buffer_ptr(indices)
     updates_ptr = ib.buffer_ptr(updates)
@@ -205,18 +208,7 @@ def gen_ir_3d(data, indices, updates, axis, out, update_func):
     out_ptr = ib.buffer_ptr(out)
     data_ptr = ib.buffer_ptr(data)
 
-    with ib.new_scope():
-        bx = te.thread_axis("blockIdx.x")
-        ib.scope_attr(bx, "thread_extent", n)
-        by = te.thread_axis("blockIdx.y")
-        ib.scope_attr(by, "thread_extent", c)
-        tx = te.thread_axis("threadIdx.x")
-        ib.scope_attr(tx, "thread_extent", warp_size)
-        with ib.for_range(0, ceil_div(h, warp_size), name="k") as k_:
-            k = k_ * warp_size + tx
-            with ib.if_scope(k < h):
-                idx = (bx * c + by) * h + k
-                out_ptr[idx] = data_ptr[idx]
+    _memcpy_ir(ib, out_ptr, data_ptr, data.shape)
 
     indices_ptr = ib.buffer_ptr(indices)
     updates_ptr = ib.buffer_ptr(updates)
@@ -311,20 +303,7 @@ def gen_ir_4d(data, indices, updates, axis, out, update_func):
 
     out_ptr = ib.buffer_ptr(out)
     data_ptr = ib.buffer_ptr(data)
-    with ib.new_scope():
-        i = te.thread_axis("blockIdx.x")
-        ib.scope_attr(i, "thread_extent", n)
-        j = te.thread_axis("blockIdx.y")
-        ib.scope_attr(j, "thread_extent", c)
-        k = te.thread_axis("blockIdx.z")
-        ib.scope_attr(k, "thread_extent", h)
-        tx = te.thread_axis("threadIdx.x")
-        ib.scope_attr(tx, "thread_extent", warp_size)
-        with ib.for_range(0, ceil_div(w, warp_size), name="l") as l_:
-            l = l_ * warp_size + tx
-            with ib.if_scope(l < w):
-                idx = ((i * c + j) * h + k) * w + l
-                out_ptr[idx] = data_ptr[idx]
+    _memcpy_ir(ib, out_ptr, data_ptr, data.shape)
 
     indices_ptr = ib.buffer_ptr(indices)
     updates_ptr = ib.buffer_ptr(updates)
@@ -417,7 +396,71 @@ def gen_ir_4d(data, indices, updates, axis, out, update_func):
     return ib.get()
 
 
-def gen_scatter_1d_thrust(data, indices_sorted, updates_sorted, axis, out, _):
+@autotvm.register_topi_compute("scatter.cuda")
+def scatter(cfg, data, indices, updates, axis=0):
+    """Update data at positions defined by indices with values in updates
+
+    Parameters
+    ----------
+    data : relay.Expr
+        The input data to the operator.
+
+    indices : relay.Expr
+        The index locations to update.
+
+    updates : relay.Expr
+        The values to update.
+
+    axis : int
+        The axis to scatter on
+
+    Returns
+    -------
+    ret : relay.Expr
+        The computed result.
+    """
+    if axis < 0:
+        axis += len(data.shape)
+    assert axis >= 0
+    assert axis < len(data.shape)
+
+    rank = len(data.shape)
+    assert 1 <= rank <= 4, "scatter only supports 1-4 dimensions"
+
+    ir_funcs = {
+        1: gen_ir_1d,
+        2: gen_ir_2d,
+        3: gen_ir_3d,
+        4: gen_ir_4d,
+    }
+
+    def update_func(dst_ptr, dst_index, update):
+        dst_ptr[dst_index] = update
+
+    out_shape = data.shape
+    out_buf = tvm.tir.decl_buffer(out_shape, data.dtype, "out_buf")
+
+    cfg.add_flop(1)  # A dummy value to satisfy AutoTVM
+
+    out = te.extern(
+        [out_shape],
+        [data, indices, updates],
+        lambda ins, outs: ir_funcs[rank](ins[0], ins[1], ins[2], axis, outs[0], update_func),
+        dtype=data.dtype,
+        out_buffers=[out_buf],
+        name="scatter_gpu",
+        tag="scatter_gpu",
+    )
+
+    return out
+
+
+@autotvm.register_topi_schedule("scatter.cuda")
+def schedule_scatter(_, outs):
+    return schedule_extern(outs)
+
+
+def gen_scatter_1d_thrust(data, indices_sorted, updates_sorted, out):
     """Generate scatter ir for 1d inputs, using a sorting based approach.
     By sorting indices and comparing neighboring two indices, we can tell which
     of elements in the indices tensor can scatter its update value into the output.
@@ -438,9 +481,6 @@ def gen_scatter_1d_thrust(data, indices_sorted, updates_sorted, axis, out, _):
     updates : tir.Tensor
         The values to update, sorted by indices.
 
-    axis : int
-        The axis to scatter on. It must be 0 for this function.
-
     out : tir.Tensor
         The output tensor.
 
@@ -449,7 +489,6 @@ def gen_scatter_1d_thrust(data, indices_sorted, updates_sorted, axis, out, _):
     ret : tir
         The computational ir.
     """
-    assert axis == 0
     n = data.shape[0]
 
     ib = tvm.tir.ir_builder.create()
@@ -504,7 +543,8 @@ def gen_scatter_1d_thrust(data, indices_sorted, updates_sorted, axis, out, _):
     return ib.get()
 
 
-def scatter(data, indices, updates, axis=0):
+@autotvm.register_topi_compute("scatter_via_sort.cuda")
+def scatter_via_sort(cfg, data, indices, updates, axis=0):
     """Update data at positions defined by indices with values in updates
 
     Parameters
@@ -528,49 +568,34 @@ def scatter(data, indices, updates, axis=0):
     """
     if axis < 0:
         axis += len(data.shape)
-    assert axis >= 0
-    assert axis < len(data.shape)
+    assert axis == 0 and len(data.shape) == 1, "sorting based scatter only supported for 1d input"
+    assert is_thrust_available(), "Thrust is required for this op"
 
-    rank = len(data.shape)
-    assert 1 <= rank <= 4, "scatter only supports 1-4 dimensions"
-
-    ir_funcs = {
-        1: gen_ir_1d,
-        2: gen_ir_2d,
-        3: gen_ir_3d,
-        4: gen_ir_4d,
-    }
-
-    def update_func(dst_ptr, dst_index, update):
-        dst_ptr[dst_index] = update
+    cfg.add_flop(1)  # A dummy value to satisfy AutoTVM
 
     out_shape = data.shape
     out_buf = tvm.tir.decl_buffer(out_shape, data.dtype, "out_buf")
 
-    in_bufs = [data]
-
-    if rank == 1 and is_thrust_available():
-        ir_funcs[1] = gen_scatter_1d_thrust
-        indices_sorted, updates_sorted = stable_sort_by_key_thrust(
-            indices, updates, for_scatter=True
-        )
-        in_bufs += [indices_sorted, updates_sorted]
-    else:
-        in_bufs += [indices, updates]
+    indices_sorted, updates_sorted = stable_sort_by_key_thrust(indices, updates, for_scatter=True)
 
     out = te.extern(
         [out_shape],
-        in_bufs,
-        lambda ins, outs: ir_funcs[rank](ins[0], ins[1], ins[2], axis, outs[0], update_func),
+        [data, indices_sorted, updates_sorted],
+        lambda ins, outs: gen_scatter_1d_thrust(ins[0], ins[1], ins[2], outs[0]),
         dtype=data.dtype,
         out_buffers=[out_buf],
-        name="scatter_gpu",
-        tag="scatter_gpu",
+        name="scatter_via_sort_gpu",
+        tag="scatter_via_sort_gpu",
     )
 
     return out
 
 
+@autotvm.register_topi_schedule("scatter_via_sort.cuda")
+def schedule_scatter_via_sort(_, outs):
+    return schedule_extern(outs)
+
+
 def gen_scatter_add_1d_atomic(data, indices, updates, axis, out, _):
     """Generate scatter add ir for 1d inputs, using atomic_add instruction
 

From f8c55db33c47a4a71d949f690a521d6a74aaef47 Mon Sep 17 00:00:00 2001
From: Dmitriy Smirnov <dmitriy.smirnov@arm.com>
Date: Wed, 20 Jan 2021 00:42:39 +0000
Subject: [PATCH 089/357] [TFLite] Added ability to infer shapes for arguments
 (#7293)

Added an ability to infer argument shapes if shapes are not present in
TFLite files. The set of networks on which the patch was tested is
internal to Arm. Any help with creating unit tests would be appreciated.
---
 python/tvm/relay/frontend/tflite.py | 35 +++++++++++++++++++----------
 1 file changed, 23 insertions(+), 12 deletions(-)

diff --git a/python/tvm/relay/frontend/tflite.py b/python/tvm/relay/frontend/tflite.py
index 525fb41407d3..316815c980e3 100644
--- a/python/tvm/relay/frontend/tflite.py
+++ b/python/tvm/relay/frontend/tflite.py
@@ -353,7 +353,7 @@ def get_tensor_value(self, tensor_wrapper, is_sparse=False):
         data = tensor_wrapper.buffer.DataAsNumpy()
 
         if tensor_wrapper.tensor.ShapeLength() != 0:
-            shape = to_int_list(tensor_wrapper.tensor.ShapeAsNumpy())
+            shape = to_int_list(self.get_tensor_shape(tensor_wrapper))
         else:
             shape = []
 
@@ -1417,7 +1417,7 @@ def convert_gather(self, op):
         axis = gather_options.Axis()
 
         # Check the indices are with in bounds.
-        data_shape = to_int_list(input_tensors[0].tensor.ShapeAsNumpy())
+        data_shape = to_int_list(self.get_tensor_shape(input_tensors[0]))
         data_dim = len(data_shape)
 
         axis = data_dim + axis if axis < 0 else axis
@@ -1535,7 +1535,7 @@ def convert_strided_slice(self, op):
         new_axis_mask = options.NewAxisMask()
         shrink_axis_mask = options.ShrinkAxisMask()
 
-        data_shape = to_int_list(input_tensors[0].tensor.ShapeAsNumpy())
+        data_shape = to_int_list(self.get_tensor_shape(input_tensors[0]))
         data_dim = len(data_shape)
         stride_dim = len(stride)
 
@@ -1792,7 +1792,7 @@ def convert_fully_connected(self, op):
         output_tensor_type = output_tensor.tensor.Type()
         output_tensor_type_str = self.get_tensor_type_str(output_tensor_type)
 
-        weight_tensor_shape = to_int_list(weight_tensor.tensor.ShapeAsNumpy())
+        weight_tensor_shape = to_int_list(self.get_tensor_shape(weight_tensor))
 
         # Weight should have only 2 dimensions(TFLite convention)
         assert len(weight_tensor_shape) == 2, "Weight should be only 2-dim"
@@ -1987,16 +1987,16 @@ def convert_conv(self, op, conv_type):
         padding = conv_options.Padding()
         fused_activation_fn = conv_options.FusedActivationFunction()
 
-        _, input_h, input_w, input_c = to_int_list(input_tensor.tensor.ShapeAsNumpy())
+        _, input_h, input_w, input_c = to_int_list(self.get_tensor_shape(input_tensor))
 
         if is_depthwise_conv:
             # TFLite depthwise convolution kernel layout is:
             # 1 KH KW C(input_c * depth_multiplier)
-            _, kernel_h, kernel_w, in_channels = to_int_list(weight_tensor.tensor.ShapeAsNumpy())
+            _, kernel_h, kernel_w, in_channels = to_int_list(self.get_tensor_shape(weight_tensor))
             assert in_channels == input_c * depth_multiplier
         else:
             output_channels, kernel_h, kernel_w, _ = to_int_list(
-                weight_tensor.tensor.ShapeAsNumpy()
+                self.get_tensor_shape(weight_tensor)
             )
 
         dilated_kernel_h = dilation_h * (kernel_h - 1) + 1
@@ -2219,7 +2219,7 @@ def convert_slice(self, op):
         size = list(self.get_tensor_value(input_tensors[2]))
         # strided_slice(Relay) needs the slice's end indices, not the size
         end = size
-        input_tensor_shape = to_int_list(input_tensor.tensor.ShapeAsNumpy())
+        input_tensor_shape = to_int_list(self.get_tensor_shape(input_tensor))
         input_tensor_rank = len(input_tensor_shape)
         for i in range(input_tensor_rank):
             if size[i] == -1:
@@ -2381,7 +2381,8 @@ def convert_pool2d(self, op, pool_type):
 
         in_expr = self.get_expr(input_tensor_idx)
 
-        _, input_h, input_w, _ = to_int_list(input_tensor.tensor.ShapeAsNumpy())
+        _, input_h, input_w, _ = to_int_list(self.get_tensor_shape(input_tensor))
+
         if padding == Padding.VALID:
             pass
         elif padding == Padding.SAME:
@@ -2771,12 +2772,13 @@ def convert_transpose_conv(self, op):
 
         # Input (data) Tensor. NHWC layout
         input_tensor = input_tensors[2]
-        _, input_h, input_w, input_c = to_int_list(input_tensor.tensor.ShapeAsNumpy())
+        _, input_h, input_w, input_c = to_int_list(self.get_tensor_shape(input_tensor))
         # Weights tensor. TFLite uses OHWI layout
         weights_tensor = input_tensors[1]
         out_channels, kernel_h, kernel_w, in_channels = to_int_list(
-            weights_tensor.tensor.ShapeAsNumpy()
+            self.get_tensor_shape(weights_tensor)
         )
+
         assert (
             input_c == in_channels
         ), "Input channel in the filter should match to channel in the input"
@@ -3204,7 +3206,7 @@ def convert_matrix_diag(self, op):
             ), "TFLite MATRIX_DIAG requires diagonal and output tensors' \
                     scale and zero points to be equal"
 
-        shape = to_int_list(diagonal.tensor.ShapeAsNumpy())
+        shape = to_int_list(self.get_tensor_shape(diagonal))
         shape = np.append(shape, shape[-1])
         dtype = self.get_tensor_type_str(diagonal.tensor.Type())
 
@@ -3265,6 +3267,15 @@ def get_tensor_expr(self, tensor, is_sparse=False):
             expr = self.exp_tab.new_const(self.get_tensor_value(tensor, is_sparse), dtype=type_str)
         return expr
 
+    def get_tensor_shape(self, tensor_wrapper):
+        """ Returns tensor shape. Infers shape if the shape is empty. """
+        assert isinstance(tensor_wrapper, TensorWrapper), "Expecting TensorWrapper here"
+        return (
+            tensor_wrapper.tensor.ShapeAsNumpy()
+            if tensor_wrapper.tensor.ShapeLength() > 0
+            else _infer_shape(self.get_tensor_expr(tensor_wrapper))
+        )
+
 
 # pylint: disable=no-else-return
 def prepare_dense_matrix_from_sparse(sparse_tensor, sparse_tensor_value, sparse_tensor_type):

From 62f251bb34c16c5634eaafe4b43bd277189bcb37 Mon Sep 17 00:00:00 2001
From: masahi <masahi129@gmail.com>
Date: Wed, 20 Jan 2021 18:11:55 +0900
Subject: [PATCH 090/357] [TOPI] Make cumsum IR reusable, add thrust scan
 (#7303)

* import changes from scan branch

commit cf0d4fdf3bf8fa6e1d6abf631042de28176923c3
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Fri Dec 25 10:12:01 2020 +0900

    get valid count test working

commit eb142d3ee9bb16ddf8d37fdec10c1bcda209deaa
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Fri Dec 25 07:22:00 2020 +0900

    integrate new cumsum change

commit f89684d73dad1f863b4fd291e8804b5c24eae94f
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Fri Dec 25 06:56:46 2020 +0900

    remove ceil_div from nms

commit a2ad4dea87d9a637745fb0a40ff9bbdde286194a
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Sun Dec 20 20:36:34 2020 +0900

    add api for returning reduction from ex scan output

commit b7f4ef7006b722e365533bec53b1f104aa056da2
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Sun Dec 20 19:49:07 2020 +0900

    move ceil_div to utils

commit a9a57e34317b1f254165c3a88e465e33c7fda01b
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Sun Dec 20 19:38:15 2020 +0900

    rename prefix_scan.py to scan.py

commit 03ed43ff550a435a28740ce1fa62cea71b90cf2c
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Sat Dec 19 06:12:55 2020 +0900

    surpress cpplint

commit abceac980d8dfd94072acc228108d1fcd94a214c
Author: masa <masa@pop-os.localdomain>
Date:   Fri Dec 18 20:36:24 2020 +0900

    support more data type

commit 3e7d1f81821a1e221cbb1322ef5b23f273f51c42
Author: masa <masa@pop-os.localdomain>
Date:   Fri Dec 18 20:09:51 2020 +0900

    1d thrust scan working

commit ac13b407e21a83ca57240cad205c32a5d000f999
Author: masa <masa@pop-os.localdomain>
Date:   Fri Dec 18 19:49:25 2020 +0900

    adding thrust scan support

commit 65634e86c33786541485dc6461a96da833332297
Author: masa <masa@pop-os.localdomain>
Date:   Fri Dec 18 19:01:11 2020 +0900

    add thrust scan python stub

commit 9876c901ee8b406bc9d75ba91c4734d55f85811b
Author: masa <masa@pop-os.localdomain>
Date:   Fri Dec 18 20:55:14 2020 +0900

    introduce prefix_scan.py and move scan ir in nms.py

commit 667bdd3b135a03b53937fdb664915e07f1365ee1
Author: masa <masa@pop-os.localdomain>
Date:   Fri Dec 18 15:06:18 2020 +0900

    make the scan loop exclusive

commit 480787bc072bfc59dcc279038c772f8ad2ec03e9
Author: mbrookhart <mbrookhart@octoml.ai>
Date:   Thu Dec 17 10:01:11 2020 -0700

    Parallelize cumsum in get_valid_counts

* fix for 1d scan

* rename

* cast to out dtype

* do not run return reduction for inclusive scan

* remove another ceil_div definition

* adding scan test

* add scheduling for scan op, fixed scan 1d test

* pylint fix

* add doc string

* add more thrust scan test

* add dynamic get valid count test, including empty size tensor

* fix hard coded gpu targets for cpu only env

* try retunring early if scan_size is 0

* another change for empty tensor and thrust path

Co-authored-by: masa <masa@pop-os.localdomain>
---
 python/tvm/topi/cuda/nms.py          | 151 +---------
 python/tvm/topi/cuda/scan.py         | 406 +++++++++++++++++++++++++++
 python/tvm/topi/cuda/scatter.py      |   6 +-
 python/tvm/topi/cuda/sort.py         |   5 +-
 python/tvm/topi/cuda/sparse.py       |   5 +-
 python/tvm/topi/utils.py             |   5 +
 src/runtime/contrib/thrust/thrust.cu |  76 +++++
 tests/python/contrib/test_sort.py    |  35 +--
 tests/python/contrib/test_thrust.py  | 123 ++++++++
 tests/python/relay/test_any.py       |  48 ++++
 10 files changed, 666 insertions(+), 194 deletions(-)
 create mode 100644 python/tvm/topi/cuda/scan.py
 create mode 100644 tests/python/contrib/test_thrust.py

diff --git a/python/tvm/topi/cuda/nms.py b/python/tvm/topi/cuda/nms.py
index 0c01cc9fbbdf..32691da90ecc 100644
--- a/python/tvm/topi/cuda/nms.py
+++ b/python/tvm/topi/cuda/nms.py
@@ -22,6 +22,8 @@
 
 from tvm.tir import if_then_else
 from .sort import argsort, argsort_thrust, is_thrust_available
+from .scan import exclusive_scan
+from ..utils import ceil_div
 
 
 def cuda_atomic_add_rule(op):
@@ -51,10 +53,6 @@ def atomic_add(x, y):
     return tvm.tir.call_intrin(y.dtype, "tir.atomic_add", x, y)
 
 
-def ceil_div(a, b):
-    return tvm.tir.indexdiv(a + b - 1, b)
-
-
 def get_valid_boxes_ir(data, valid_boxes, score_threshold, id_index, score_index):
     """Low level IR to identify bounding boxes given a score threshold.
 
@@ -123,136 +121,6 @@ def get_valid_boxes_ir(data, valid_boxes, score_threshold, id_index, score_index
     return ib.get()
 
 
-def get_valid_indices_ir(valid_boxes, valid_count, valid_indices):
-    """Low level IR to get the ouput indices of valid boxes
-    and the count of valid boxes
-
-    Parameters
-    ----------
-    valid_boxes: Buffer
-        2D Buffer  indicating valid boxes with shape [batch_size, num_anchors].
-
-    Returns
-    -------
-    valid_count: Buffer
-        1D Buffer of number of valid boxes per batch [batch_size].
-
-    valid_indices: Buffer
-        2D Buffer indicating output sorted indcies of valid boxes [batch_size, num_anchors].
-    """
-    batch_size = valid_boxes.shape[0]
-    num_anchors = valid_boxes.shape[1]
-
-    ib = tvm.tir.ir_builder.create()
-
-    valid_boxes = ib.buffer_ptr(valid_boxes)
-
-    valid_count = ib.buffer_ptr(valid_count)
-    valid_indices = ib.buffer_ptr(valid_indices)
-
-    max_threads = int(tvm.target.Target.current(allow_none=False).max_num_threads)
-    with ib.if_scope(num_anchors > 0):
-        # Copy boxes to valid_indices
-        with ib.new_scope():
-            nthread_tx = max_threads
-            nthread_bx = ceil_div(num_anchors, max_threads)
-            nthread_by = batch_size
-            tx = te.thread_axis("threadIdx.x")
-            bx = te.thread_axis("blockIdx.x")
-            by = te.thread_axis("blockIdx.y")
-            ib.scope_attr(tx, "thread_extent", nthread_tx)
-            ib.scope_attr(bx, "thread_extent", nthread_bx)
-            ib.scope_attr(by, "thread_extent", nthread_by)
-            tid = bx * nthread_tx + tx
-            with ib.if_scope(tid < num_anchors):
-                valid_indices[by, tid] = valid_boxes[by, tid]
-
-        nthread_tx = max_threads
-        nthread_bx = ceil_div(num_anchors, max_threads)
-        nthread_by = batch_size
-
-        ## The following algorithm performs parallel exclusive scan to get
-        ## a tensor that can later be used to select valid indices
-        # Up Sweep of exclusive scan
-        lim = tvm.tir.generic.cast(
-            tvm.tir.ceil(tvm.tir.log2(tvm.tir.generic.cast(num_anchors, "float64"))), "int64"
-        )
-        with ib.for_range(0, lim, dtype="int64") as l2_width:
-            width = 2 << l2_width
-
-            with ib.new_scope():
-                tx = te.thread_axis("threadIdx.x")
-                bx = te.thread_axis("blockIdx.x")
-                ib.scope_attr(tx, "thread_extent", nthread_tx)
-                ib.scope_attr(
-                    bx,
-                    "thread_extent",
-                    tvm.tir.generic.cast(ceil_div(num_anchors, max_threads * width), "int32"),
-                )
-                tid = bx * nthread_tx + tx
-
-                by = te.thread_axis("blockIdx.y")
-                ib.scope_attr(by, "thread_extent", nthread_by)
-                start = ib.allocate("int64", (1,), name="start", scope="local")
-                middle = ib.allocate("int64", (1,), name="middle", scope="local")
-                end = ib.allocate("int64", (1,), name="end", scope="local")
-                start[0] = width * tid
-                with ib.if_scope(start[0] < num_anchors):
-                    middle[0] = start[0] + tvm.tir.indexdiv(width, 2)
-                    end[0] = tvm.te.min(start[0] + width, num_anchors)
-                    with ib.if_scope(middle[0] < num_anchors):
-                        valid_indices[by * num_anchors + end[0] - 1] += valid_indices[
-                            by * num_anchors + middle[0] - 1
-                        ]
-
-        # Down Sweep of exclusive scan
-        with ib.new_scope():
-            bx = te.thread_axis("blockIdx.x")
-            ib.scope_attr(bx, "thread_extent", batch_size)
-            with ib.if_scope(bx < batch_size):
-                valid_count[bx] = valid_indices[(bx + 1) * num_anchors - 1]
-                valid_indices[(bx + 1) * num_anchors - 1] = 0
-
-        with ib.for_range(0, lim, dtype="int64") as l2_width:
-            width = 2 << (lim - l2_width - 1)
-
-            with ib.new_scope():
-                tx = te.thread_axis("threadIdx.x")
-                bx = te.thread_axis("blockIdx.x")
-                ib.scope_attr(tx, "thread_extent", nthread_tx)
-                ib.scope_attr(
-                    bx,
-                    "thread_extent",
-                    tvm.tir.generic.cast(ceil_div(num_anchors, max_threads * width), "int32"),
-                )
-                tid = bx * nthread_tx + tx
-
-                by = te.thread_axis("blockIdx.y")
-                ib.scope_attr(by, "thread_extent", nthread_by)
-                start = ib.allocate("int64", (1,), name="start", scope="local")
-                middle = ib.allocate("int64", (1,), name="middle", scope="local")
-                end = ib.allocate("int64", (1,), name="end", scope="local")
-                tmp = ib.allocate("int32", (1,), name="end", scope="local")
-                start[0] = width * tid
-                with ib.if_scope(tvm.tir.all(start[0] < num_anchors)):
-                    middle[0] = start[0] + tvm.tir.indexdiv(width, 2)
-                    end[0] = tvm.tir.min(start[0] + width, num_anchors)
-                    with ib.if_scope(middle[0] < num_anchors):
-                        tmp[0] = valid_indices[by * num_anchors + middle[0] - 1]
-                        valid_indices[by * num_anchors + middle[0] - 1] = valid_indices[
-                            by * num_anchors + end[0] - 1
-                        ]
-                        valid_indices[by * num_anchors + end[0] - 1] += tmp[0]
-    with ib.else_scope():
-        with ib.new_scope():
-            bx = te.thread_axis("blockIdx.x")
-            ib.scope_attr(bx, "thread_extent", batch_size)
-            with ib.if_scope(bx < batch_size):
-                valid_count[bx] = 0
-
-    return ib.get()
-
-
 def get_valid_counts_ir(data, valid_indices, valid_boxes, out, out_indices):
     """Low level IR to get valid count of bounding boxes
     given a score threshold. Also prepares to move valid boxes to the
@@ -374,19 +242,8 @@ def get_valid_counts(data, score_threshold=0, id_index=0, score_index=1):
     valid_indices_buf = tvm.tir.decl_buffer(
         (batch_size, num_anchors), "int32", "valid_indices_buf", data_alignment=8
     )
-    valid_count_buf = tvm.tir.decl_buffer(
-        (batch_size,), "int32", "valid_count_buf", data_alignment=8
-    )
-    valid_count, valid_indices = te.extern(
-        [(batch_size,), (batch_size, num_anchors)],
-        [valid_boxes],
-        lambda ins, outs: get_valid_indices_ir(ins[0], outs[0], outs[1]),
-        dtype=["int32"],
-        in_buffers=[valid_boxes_buf],
-        out_buffers=[valid_count_buf, valid_indices_buf],
-        name="get_valid_indices",
-        tag="get_valid_indices_gpu",
-    )
+
+    valid_indices, valid_count = exclusive_scan(valid_boxes, axis=1, return_reduction=True)
 
     out_buf = tvm.tir.decl_buffer(data.shape, data.dtype, "out_buf", data_alignment=8)
     out_indices_buf = tvm.tir.decl_buffer(
diff --git a/python/tvm/topi/cuda/scan.py b/python/tvm/topi/cuda/scan.py
new file mode 100644
index 000000000000..f19e4a14239a
--- /dev/null
+++ b/python/tvm/topi/cuda/scan.py
@@ -0,0 +1,406 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=invalid-name, too-many-locals, too-many-statements
+"Scan related operators"
+import tvm
+from tvm import te
+from tvm._ffi import get_global_func
+from ..transform import expand_dims, squeeze
+from ..utils import ceil_div
+from ..math import cast
+from .. import tag
+from .injective import schedule_injective_from_existing
+
+
+def exclusive_sum_scan2d_ir(data, output, reduction=None):
+    """Low level IR to do exclusive sum scan along rows of 2D input.
+
+    Parameters
+    ----------
+    data : Buffer
+        Input data. 2-D Buffer with shape [batch_size, scan_axis_size].
+
+    output: Buffer
+        A buffer to store the output scan, of the same size as data
+
+    reduction: Buffer, optional
+        1D Buffer of size [batch_size], to store the sum of each row.
+    """
+
+    batch_size = data.shape[0]
+    scan_axis_size = data.shape[1]
+
+    ib = tvm.tir.ir_builder.create()
+
+    data = ib.buffer_ptr(data)
+    output = ib.buffer_ptr(output)
+
+    out_dtype = output.dtype
+
+    if reduction is not None:
+        reduction = ib.buffer_ptr(reduction)
+
+    max_threads = int(tvm.target.Target.current(allow_none=False).max_num_threads)
+
+    with ib.if_scope(scan_axis_size == 0):
+        with ib.new_scope():
+            bx = te.thread_axis("blockIdx.x")
+            ib.scope_attr(bx, "thread_extent", batch_size)
+            with ib.if_scope(bx < batch_size):
+                if reduction is not None:
+                    reduction[bx] = 0
+    with ib.else_scope():
+        with ib.new_scope():
+            nthread_tx = max_threads
+            nthread_bx = ceil_div(scan_axis_size, max_threads)
+            nthread_by = batch_size
+            tx = te.thread_axis("threadIdx.x")
+            bx = te.thread_axis("blockIdx.x")
+            by = te.thread_axis("blockIdx.y")
+            ib.scope_attr(tx, "thread_extent", nthread_tx)
+            ib.scope_attr(bx, "thread_extent", nthread_bx)
+            ib.scope_attr(by, "thread_extent", nthread_by)
+            tid = bx * nthread_tx + tx
+            with ib.if_scope(tid < scan_axis_size):
+                output[by, tid] = data[by, tid]
+
+        nthread_tx = max_threads
+        nthread_bx = ceil_div(scan_axis_size, max_threads)
+        nthread_by = batch_size
+
+        # The following algorithm performs parallel exclusive scan
+        # Up Sweep of exclusive scan
+        lim = tvm.tir.generic.cast(
+            tvm.tir.ceil(tvm.tir.log2(tvm.tir.generic.cast(scan_axis_size, "float64"))), "int64"
+        )
+        with ib.for_range(0, lim, dtype="int64") as l2_width:
+            width = 2 << l2_width
+
+            with ib.new_scope():
+                tx = te.thread_axis("threadIdx.x")
+                bx = te.thread_axis("blockIdx.x")
+                ib.scope_attr(tx, "thread_extent", nthread_tx)
+                ib.scope_attr(
+                    bx,
+                    "thread_extent",
+                    tvm.tir.generic.cast(ceil_div(scan_axis_size, max_threads * width), "int32"),
+                )
+                tid = bx * nthread_tx + tx
+
+                by = te.thread_axis("blockIdx.y")
+                ib.scope_attr(by, "thread_extent", nthread_by)
+                start = ib.allocate("int64", (1,), name="start", scope="local")
+                middle = ib.allocate("int64", (1,), name="middle", scope="local")
+                end = ib.allocate("int64", (1,), name="end", scope="local")
+                start[0] = width * tid
+                with ib.if_scope(start[0] < scan_axis_size):
+                    middle[0] = start[0] + tvm.tir.indexdiv(width, 2)
+                    end[0] = tvm.te.min(start[0] + width, scan_axis_size)
+                    with ib.if_scope(middle[0] < scan_axis_size):
+                        output[by * scan_axis_size + end[0] - 1] += output[
+                            by * scan_axis_size + middle[0] - 1
+                        ]
+
+        # Down Sweep of exclusive scan
+        with ib.new_scope():
+            bx = te.thread_axis("blockIdx.x")
+            ib.scope_attr(bx, "thread_extent", batch_size)
+            with ib.if_scope(bx < batch_size):
+                if reduction is not None:
+                    reduction[bx] = output[(bx + 1) * scan_axis_size - 1]
+                output[(bx + 1) * scan_axis_size - 1] = cast(0, out_dtype)
+
+        with ib.for_range(0, lim, dtype="int64") as l2_width:
+            width = 2 << (lim - l2_width - 1)
+
+            with ib.new_scope():
+                tx = te.thread_axis("threadIdx.x")
+                bx = te.thread_axis("blockIdx.x")
+                ib.scope_attr(tx, "thread_extent", nthread_tx)
+                ib.scope_attr(
+                    bx,
+                    "thread_extent",
+                    tvm.tir.generic.cast(ceil_div(scan_axis_size, max_threads * width), "int32"),
+                )
+                tid = bx * nthread_tx + tx
+
+                by = te.thread_axis("blockIdx.y")
+                ib.scope_attr(by, "thread_extent", nthread_by)
+                start = ib.allocate("int64", (1,), name="start", scope="local")
+                middle = ib.allocate("int64", (1,), name="middle", scope="local")
+                end = ib.allocate("int64", (1,), name="end", scope="local")
+                tmp = ib.allocate(out_dtype, (1,), name="end", scope="local")
+                start[0] = width * tid
+                with ib.if_scope(tvm.tir.all(start[0] < scan_axis_size)):
+                    middle[0] = start[0] + tvm.tir.indexdiv(width, 2)
+                    end[0] = tvm.tir.min(start[0] + width, scan_axis_size)
+                    with ib.if_scope(middle[0] < scan_axis_size):
+                        tmp[0] = output[by * scan_axis_size + middle[0] - 1]
+                        output[by * scan_axis_size + middle[0] - 1] = output[
+                            by * scan_axis_size + end[0] - 1
+                        ]
+                        output[by * scan_axis_size + end[0] - 1] += tmp[0]
+    return ib.get()
+
+
+def get_reduction_from_exclusive_scan(data, ex_scan_output):
+    """Return the sum of the last element of data and the exclusive scan output.
+    The is the reduction of data along each row (for 2-D case).
+
+    Parameters
+    ----------
+    data : tvm.te.Tensor
+        Input data. 1-D tensor with shape [scan_axis_size], or
+        2-D tensor with shape [batch_size, scan_axis_size].
+
+    ex_scan_output : tvm.te.Tensor
+        1-D tensor that is the exclusive scan of the input, or
+        2-D tensor storing the exclusive scan of each row.
+
+    Returns
+    -------
+    reduction : tvm.te.Tensor
+        1-D tensor storing the reduction of each row.
+    """
+    ndim = len(data.shape)
+    if ndim == 1:
+        data = expand_dims(data, axis=0)
+        ex_scan_output = expand_dims(ex_scan_output, axis=0)
+
+    def ir(data, data_ex_scan, reduction):
+        batch_size = data.shape[0]
+        num_anchors = data.shape[1]
+
+        ib = tvm.tir.ir_builder.create()
+
+        data = ib.buffer_ptr(data)
+        data_ex_scan = ib.buffer_ptr(data_ex_scan)
+        reduction = ib.buffer_ptr(reduction)
+
+        max_threads = int(tvm.target.Target.current(allow_none=False).max_num_threads)
+        with ib.new_scope():
+            nthread_tx = max_threads
+            nthread_bx = ceil_div(batch_size, max_threads)
+            tx = te.thread_axis("threadIdx.x")
+            bx = te.thread_axis("blockIdx.x")
+            ib.scope_attr(tx, "thread_extent", nthread_tx)
+            ib.scope_attr(bx, "thread_extent", nthread_bx)
+            tid = bx * max_threads + tx
+            with ib.if_scope(tid < batch_size):
+                with ib.if_scope(num_anchors > 0):
+                    reduction[tid] = data_ex_scan[tid, num_anchors - 1] + data[tid, num_anchors - 1]
+                with ib.else_scope():
+                    reduction[tid] = 0
+
+        return ib.get()
+
+    assert len(data.shape) == 2, "Only 2D input supported for now"
+    data_buf = tvm.tir.decl_buffer(data.shape, data.dtype, "valid_indices_buf", data_alignment=8)
+    ex_scan_output_buf = tvm.tir.decl_buffer(
+        ex_scan_output.shape, ex_scan_output.dtype, "ex_scan_output_buf", data_alignment=8
+    )
+
+    reduction = te.extern(
+        [(data.shape[0],)],
+        [data, ex_scan_output],
+        lambda ins, outs: ir(ins[0], ins[1], outs[0]),
+        dtype=[ex_scan_output.dtype],
+        in_buffers=[data_buf, ex_scan_output_buf],
+        name="ex_scan_reduction",
+        tag="ex_scan_reduction_gpu",
+    )
+
+    if ndim == 1:
+        return squeeze(reduction, 0)
+
+    return reduction
+
+
+def is_thrust_available():
+    """Test if thrust based scan ops are available."""
+    return get_global_func("tvm.contrib.thrust.sum_scan", allow_missing=True) is not None
+
+
+def scan_thrust(data, output_dtype, exclusive=True, return_reduction=False):
+    """Do exclusive scan on 1D input or along rows of 2D input, using thrust.
+
+    Parameters
+    ----------
+    data : tvm.te.Tensor
+        Input data. 1-D tensor with shape [scan_axis_size], or
+        2-D tensor with shape [batch_size, scan_axis_size].
+
+    output_dtype: string
+        The dtype of the output scan tensor.
+
+    exclusive: bool, optional
+        Whether or not do exclusive or inclusive scan.
+
+    return_reduction: bool, optional
+        Whether or not return a 1-D tensor storing the reduction of each row.
+        Reductions are computed as part of the upsweep pass, so there is no extra cost.
+        If False, reductions are ignored.
+
+    Returns
+    -------
+    output : tvm.te.Tensor
+        1-D tensor that is the exclusive scan of the input, or
+        2-D tensor storing the exclusive scan of each row.
+
+    reduction : tvm.te.Tensor, optional
+        1-D tensor storing the reduction of each row.
+        Returned if return_reduction is True.
+    """
+    data_buf = tvm.tir.decl_buffer(data.shape, data.dtype, "data_buf", data_alignment=8)
+    output_buf = tvm.tir.decl_buffer(data.shape, output_dtype, "output_buf", data_alignment=8)
+    output = te.extern(
+        [data.shape],
+        [data],
+        lambda ins, outs: tvm.tir.call_packed(
+            "tvm.contrib.thrust.sum_scan", ins[0], outs[0], exclusive
+        ),
+        dtype=[output_dtype],
+        in_buffers=[data_buf],
+        out_buffers=[output_buf],
+        name="exclusive_sum_scan2d",
+        tag="exclusive_sum_scan2d_gpu",
+    )
+
+    if return_reduction:
+        assert exclusive, "return_reduction should be False for inclusive scan"
+        reduction = get_reduction_from_exclusive_scan(data, output)
+        return output, reduction
+
+    return output
+
+
+def exclusive_scan(data, axis=-1, return_reduction=False, output_dtype=None):
+    """Do exclusive scan on 1D input or along rows of 2D input.
+
+    Parameters
+    ----------
+    data : tvm.te.Tensor
+        Input data. 1-D tensor with shape [scan_axis_size], or
+        2-D tensor with shape [batch_size, scan_axis_size].
+
+    axis: int, optional
+        The axis to do scan on. For now, only the inner most axis is supported.
+
+    return_reduction: bool, optional
+        Whether or not return a 1-D tensor storing the reduction of each row.
+        Reductions are computed as part of the upsweep pass, so there is no extra cost.
+        If False, reductions are ignored.
+
+    output_dtype: string, optional
+        The dtype of the output scan tensor. If not provided, the dtype of the input is used.
+
+    Returns
+    -------
+    output : tvm.te.Tensor
+        1-D tensor that is the exclusive scan of the input, or
+        2-D tensor storing the exclusive scan of each row.
+
+    reduction : tvm.te.Tensor, optional
+        1-D tensor storing the reduction of each row.
+        Returned if return_reduction is True.
+    """
+    # TODO(masahi): Support other binary operators
+    ndim = len(data.shape)
+    if axis < 0:
+        axis += ndim
+    assert axis == ndim - 1, "Only support scan on the inner most axis."
+
+    if output_dtype is None:
+        output_dtype = data.dtype
+
+    target = tvm.target.Target.current()
+    if target and target.kind.name == "cuda" and is_thrust_available():
+        return scan_thrust(data, output_dtype, exclusive=True, return_reduction=return_reduction)
+
+    if ndim == 1:
+        # TIR exclusive scan accepts only 2D inputs.
+        data = expand_dims(data, axis=0)
+
+    data_buf = tvm.tir.decl_buffer(data.shape, data.dtype, "data_buf", data_alignment=8)
+    output_buf = tvm.tir.decl_buffer(data.shape, output_dtype, "output_buf", data_alignment=8)
+
+    if len(data.shape) == 2:
+        if return_reduction:
+            output, reduction = te.extern(
+                [data.shape, (data.shape[0],)],
+                [data],
+                lambda ins, outs: exclusive_sum_scan2d_ir(ins[0], outs[0], outs[1]),
+                dtype=[data.dtype, output_dtype],
+                in_buffers=[data_buf],
+                name="exclusive_scan",
+                tag="exclusive_scan_gpu",
+            )
+        else:
+            output = te.extern(
+                [data.shape],
+                [data],
+                lambda ins, outs: exclusive_sum_scan2d_ir(ins[0], outs[0]),
+                dtype=[output_dtype],
+                in_buffers=[data_buf],
+                out_buffers=[output_buf],
+                name="exclusive_scan",
+                tag="exclusive_scan_gpu",
+            )
+            reduction = None
+    else:
+        assert False, "Unsupported dimension {}".format(ndim)
+
+    if ndim == 1:
+        output = squeeze(output, 0)
+        if return_reduction:
+            reduction = squeeze(reduction, 0)
+
+    if return_reduction:
+        return output, reduction
+
+    return output
+
+
+def schedule_scan(outs):
+    """Schedule for scan operator.
+
+    Parameters
+    ----------
+    outs: Array of Tensor
+        The computation graph description of scan
+        in the format of an array of tensors.
+
+    Returns
+    -------
+    s: Schedule
+      The computation schedule for the op.
+    """
+    outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
+    s = te.create_schedule([x.op for x in outs])
+    scheduled_ops = []
+
+    def traverse(op):
+        if tag.is_injective(op.tag):
+            schedule_injective_from_existing(s, op.output(0))
+        for tensor in op.input_tensors:
+            if tensor.op.input_tensors and tensor.op not in scheduled_ops:
+                traverse(tensor.op)
+        scheduled_ops.append(op)
+
+    for out in outs:
+        traverse(out.op)
+    return s
diff --git a/python/tvm/topi/cuda/scatter.py b/python/tvm/topi/cuda/scatter.py
index b34bd1df14e4..444fb25cc34b 100644
--- a/python/tvm/topi/cuda/scatter.py
+++ b/python/tvm/topi/cuda/scatter.py
@@ -22,11 +22,7 @@
 from ..generic import schedule_extern
 from .nms import atomic_add
 from .sort import stable_sort_by_key_thrust, is_thrust_available
-from ..utils import prod
-
-
-def ceil_div(a, b):
-    return (a + b - 1) // b
+from ..utils import prod, ceil_div
 
 
 def _memcpy_ir(ib, out_ptr, data_ptr, shape):
diff --git a/python/tvm/topi/cuda/sort.py b/python/tvm/topi/cuda/sort.py
index 9b6a18a8b06b..18340385205e 100644
--- a/python/tvm/topi/cuda/sort.py
+++ b/python/tvm/topi/cuda/sort.py
@@ -23,6 +23,7 @@
 from .injective import schedule_injective_from_existing
 from ..transform import strided_slice, transpose
 from .. import tag
+from ..utils import ceil_div
 
 
 def swap(arr, axis):
@@ -61,10 +62,6 @@ def traverse(op):
     return s
 
 
-def ceil_div(a, b):
-    return tvm.tir.indexdiv(a + b - 1, b)
-
-
 def _sort_init(ib, shape, axis, keys_in, keys_out, values_out=None, value_init_func=None):
     """Initialize the output buffers by copying from inputs"""
     axis_mul_before = 1
diff --git a/python/tvm/topi/cuda/sparse.py b/python/tvm/topi/cuda/sparse.py
index cb61d9686919..0b46cf0f9f97 100644
--- a/python/tvm/topi/cuda/sparse.py
+++ b/python/tvm/topi/cuda/sparse.py
@@ -23,7 +23,7 @@
 from tvm import relay, te
 
 from .. import nn
-from ..utils import traverse_inline, get_const_tuple, prod, get_const_int
+from ..utils import traverse_inline, get_const_tuple, prod, get_const_int, ceil_div
 
 
 def sparse_dense(data, weight_data, weight_indices, weight_indptr, sparse_lhs=False):
@@ -162,9 +162,6 @@ def sparse_dense_tir(data, w_data, w_indices, w_indptr):
     default_function_kernel1 for the multiply.
     """
 
-    def ceil_div(a, b):
-        return (a + (b - 1)) // b
-
     def gen_ir(data, w_data, w_indices, w_indptr, out):
         # pylint: disable=invalid-name
         # TODO(tkonolige): use tensorcores for block multiply
diff --git a/python/tvm/topi/utils.py b/python/tvm/topi/utils.py
index c3e14eff3919..dfc226f0c331 100644
--- a/python/tvm/topi/utils.py
+++ b/python/tvm/topi/utils.py
@@ -487,3 +487,8 @@ def is_empty_shape(shape):
       Whether input shape is empty or has dimesion with size 0.
     """
     return cpp.utils.is_empty_shape(shape)
+
+
+def ceil_div(a, b):
+    """Return ceil division of a by b"""
+    return tvm.tir.indexdiv(a + (b - 1), b)
diff --git a/src/runtime/contrib/thrust/thrust.cu b/src/runtime/contrib/thrust/thrust.cu
index 6a48f1ad876a..4e3e3a81af1a 100644
--- a/src/runtime/contrib/thrust/thrust.cu
+++ b/src/runtime/contrib/thrust/thrust.cu
@@ -25,6 +25,7 @@
 #include <thrust/device_vector.h>
 #include <thrust/sort.h>
 #include <thrust/gather.h>
+#include <thrust/scan.h>
 
 #include <tvm/runtime/registry.h>
 #include <dlpack/dlpack.h>
@@ -264,5 +265,80 @@ TVM_REGISTER_GLOBAL("tvm.contrib.thrust.stable_sort_by_key")
   }
 });
 
+template<typename InType, typename OutType>
+void thrust_scan(DLTensor* data,
+                 DLTensor* output,
+                 bool exclusive) {
+  thrust::device_ptr<InType> data_ptr(static_cast<InType *>(data->data));
+  thrust::device_ptr<OutType> output_ptr(static_cast<OutType *>(output->data));
+  const auto scan_size = data->shape[data->ndim - 1];
+
+  if (scan_size == 0) return;
+
+  if (data->ndim == 1 || (data->ndim == 2 && data->shape[0] == 1)) {
+    if (exclusive) {
+      thrust::exclusive_scan(data_ptr, data_ptr + scan_size, output_ptr);
+    } else {
+      thrust::inclusive_scan(data_ptr, data_ptr + scan_size, output_ptr);
+    }
+  } else {
+    // Use thrust segmented scan to compute scan on the inner most axis
+    // data->shape[0] * data->shape[1] * ... * data->shape[ndim - 2] scans are
+    // computed in parallel
+
+    // This is for constructing a sequence 0, 0, 0,...,1, 1, 1,...,2, 2, 2,...,
+    // without materializing the sequence vector
+    auto counting_iter = thrust::counting_iterator<int64_t>(0);
+    // Without __host__ annotation, cub crashes
+    auto linear_index_to_scan_key = [scan_size] __host__ __device__(int64_t i) {
+        return i / scan_size;
+    }; // NOLINT(*)
+    auto key_iter = thrust::make_transform_iterator(counting_iter, linear_index_to_scan_key);
+    int64_t size = 1;
+    for (int i = 0; i < data->ndim; ++i) size *= data->shape[i];
+
+    if (exclusive) {
+      thrust::exclusive_scan_by_key(key_iter, key_iter + size, data_ptr, output_ptr);
+    } else {
+      thrust::inclusive_scan_by_key(key_iter, key_iter + size, data_ptr, output_ptr);
+    }
+  }
+}
+
+TVM_REGISTER_GLOBAL("tvm.contrib.thrust.sum_scan")
+.set_body([](TVMArgs args, TVMRetValue* ret) {
+  ICHECK_EQ(args.num_args, 3);
+  DLTensor* data = args[0];
+  DLTensor* output = args[1];
+  bool exclusive = args[2];
+
+  auto in_dtype = DLDataType2String(data->dtype);
+  auto out_dtype = DLDataType2String(output->dtype);
+
+  if (in_dtype == "int32") {
+    if (out_dtype == "int32") {
+      thrust_scan<int, int>(data, output, exclusive);
+    } else if (out_dtype == "int64") {
+      thrust_scan<int, int64_t>(data, output, exclusive);
+    } else {
+      LOG(FATAL) << "Unsupported output dtype: " << out_dtype;
+    }
+  } else if (in_dtype == "int64") {
+    if (out_dtype == "int64") {
+      thrust_scan<int64_t, int64_t>(data, output, exclusive);
+    } else {
+      LOG(FATAL) << "Unsupported output dtype: " << out_dtype;
+    }
+  } else if (in_dtype == "float32") {
+    if (out_dtype == "float32") {
+      thrust_scan<float, float>(data, output, exclusive);
+    } else {
+      LOG(FATAL) << "Unsupported output dtype: " << out_dtype;
+    }
+  } else {
+    LOG(FATAL) << "Unsupported input dtype: " << in_dtype;
+  }
+});
+
 }  // namespace contrib
 }  // namespace tvm
diff --git a/tests/python/contrib/test_sort.py b/tests/python/contrib/test_sort.py
index f338276ca118..a049602ac265 100644
--- a/tests/python/contrib/test_sort.py
+++ b/tests/python/contrib/test_sort.py
@@ -17,7 +17,7 @@
 import tvm
 import tvm.testing
 from tvm import te
-from tvm.topi.cuda import stable_sort_by_key_thrust, is_thrust_available, sort_by_key
+from tvm.topi.cuda import sort_by_key
 import numpy as np
 
 
@@ -91,38 +91,6 @@ def test_sort_np():
     tvm.testing.assert_allclose(c.asnumpy(), np_out, rtol=1e-5)
 
 
-def test_thrust_stable_sort_by_key():
-    if not is_thrust_available():
-        print("skip because thrust is not enabled...")
-        return
-
-    size = 6
-    keys = te.placeholder((size,), name="keys", dtype="int32")
-    values = te.placeholder((size,), name="values", dtype="int32")
-
-    keys_out, values_out = stable_sort_by_key_thrust(keys, values)
-
-    ctx = tvm.gpu(0)
-    target = "cuda"
-    s = te.create_schedule([keys_out.op, values_out.op])
-    f = tvm.build(s, [keys, values, keys_out, values_out], target)
-
-    keys_np = np.array([1, 4, 2, 8, 2, 7], np.int32)
-    values_np = np.random.randint(0, 10, size=(size,)).astype(np.int32)
-    keys_np_out = np.zeros(keys_np.shape, np.int32)
-    values_np_out = np.zeros(values_np.shape, np.int32)
-    keys_in = tvm.nd.array(keys_np, ctx)
-    values_in = tvm.nd.array(values_np, ctx)
-    keys_out = tvm.nd.array(keys_np_out, ctx)
-    values_out = tvm.nd.array(values_np_out, ctx)
-    f(keys_in, values_in, keys_out, values_out)
-
-    ref_keys_out = np.sort(keys_np)
-    ref_values_out = np.array([values_np[i] for i in np.argsort(keys_np)])
-    tvm.testing.assert_allclose(keys_out.asnumpy(), ref_keys_out, rtol=1e-5)
-    tvm.testing.assert_allclose(values_out.asnumpy(), ref_values_out, rtol=1e-5)
-
-
 def test_sort_by_key_gpu():
     size = 6
     keys = te.placeholder((size,), name="keys", dtype="int32")
@@ -158,5 +126,4 @@ def test_sort_by_key_gpu():
 if __name__ == "__main__":
     test_sort()
     test_sort_np()
-    test_thrust_stable_sort_by_key()
     test_sort_by_key_gpu()
diff --git a/tests/python/contrib/test_thrust.py b/tests/python/contrib/test_thrust.py
new file mode 100644
index 000000000000..5f66d465bf17
--- /dev/null
+++ b/tests/python/contrib/test_thrust.py
@@ -0,0 +1,123 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+import tvm
+import tvm.testing
+from tvm import te
+from tvm.topi.cuda import stable_sort_by_key_thrust, is_thrust_available
+from tvm.topi.cuda.scan import exclusive_scan, scan_thrust, schedule_scan
+import numpy as np
+
+
+def test_stable_sort_by_key():
+    if not is_thrust_available():
+        print("skip because thrust is not enabled...")
+        return
+
+    size = 6
+    keys = te.placeholder((size,), name="keys", dtype="int32")
+    values = te.placeholder((size,), name="values", dtype="int32")
+
+    keys_out, values_out = stable_sort_by_key_thrust(keys, values)
+
+    ctx = tvm.gpu(0)
+    target = "cuda"
+    s = te.create_schedule([keys_out.op, values_out.op])
+    f = tvm.build(s, [keys, values, keys_out, values_out], target)
+
+    keys_np = np.array([1, 4, 2, 8, 2, 7], np.int32)
+    values_np = np.random.randint(0, 10, size=(size,)).astype(np.int32)
+    keys_np_out = np.zeros(keys_np.shape, np.int32)
+    values_np_out = np.zeros(values_np.shape, np.int32)
+    keys_in = tvm.nd.array(keys_np, ctx)
+    values_in = tvm.nd.array(values_np, ctx)
+    keys_out = tvm.nd.array(keys_np_out, ctx)
+    values_out = tvm.nd.array(values_np_out, ctx)
+    f(keys_in, values_in, keys_out, values_out)
+
+    ref_keys_out = np.sort(keys_np)
+    ref_values_out = np.array([values_np[i] for i in np.argsort(keys_np)])
+    tvm.testing.assert_allclose(keys_out.asnumpy(), ref_keys_out, rtol=1e-5)
+    tvm.testing.assert_allclose(values_out.asnumpy(), ref_values_out, rtol=1e-5)
+
+
+def test_exclusive_scan():
+    if not is_thrust_available():
+        print("skip because thrust is not enabled...")
+        return
+
+    for ishape in [(1,), (10, 10)]:
+        values = te.placeholder(ishape, name="values", dtype="int32")
+
+        with tvm.target.Target("cuda"):
+            scan, reduction = exclusive_scan(values, return_reduction=True)
+            s = schedule_scan([scan, reduction])
+
+        ctx = tvm.gpu(0)
+        f = tvm.build(s, [values, scan, reduction], "cuda")
+
+        values_np = np.random.randint(0, 10, size=ishape).astype(np.int32)
+        values_np_out = np.zeros(values_np.shape, np.int32)
+
+        if len(ishape) == 1:
+            reduction_shape = ()
+        else:
+            reduction_shape = (ishape[0],)
+
+        reduction_np_out = np.zeros(reduction_shape, np.int32)
+
+        values_in = tvm.nd.array(values_np, ctx)
+        values_out = tvm.nd.array(values_np_out, ctx)
+        reduction_out = tvm.nd.array(reduction_np_out, ctx)
+        f(values_in, values_out, reduction_out)
+
+        ref_values_out = np.cumsum(values_np, axis=-1, dtype="int32") - values_np
+        tvm.testing.assert_allclose(values_out.asnumpy(), ref_values_out, rtol=1e-5)
+        ref_reduction_out = np.sum(values_np, axis=-1)
+        tvm.testing.assert_allclose(reduction_out.asnumpy(), ref_reduction_out, rtol=1e-5)
+
+
+def test_inclusive_scan():
+    if not is_thrust_available():
+        print("skip because thrust is not enabled...")
+        return
+
+    out_dtype = "int64"
+
+    for ishape in [(10,), (10, 10)]:
+        values = te.placeholder(ishape, name="values", dtype="int32")
+
+        with tvm.target.Target("cuda"):
+            scan = scan_thrust(values, out_dtype, exclusive=False)
+            s = tvm.te.create_schedule([scan.op])
+
+        ctx = tvm.gpu(0)
+        f = tvm.build(s, [values, scan], "cuda")
+
+        values_np = np.random.randint(0, 10, size=ishape).astype(np.int32)
+        values_np_out = np.zeros(values_np.shape, out_dtype)
+        values_in = tvm.nd.array(values_np, ctx)
+        values_out = tvm.nd.array(values_np_out, ctx)
+        f(values_in, values_out)
+
+        ref_values_out = np.cumsum(values_np, axis=-1, dtype=out_dtype)
+        tvm.testing.assert_allclose(values_out.asnumpy(), ref_values_out, rtol=1e-5)
+
+
+if __name__ == "__main__":
+    test_stable_sort_by_key()
+    test_exclusive_scan()
+    test_inclusive_scan()
diff --git a/tests/python/relay/test_any.py b/tests/python/relay/test_any.py
index d30e7873dae7..a537782355d2 100644
--- a/tests/python/relay/test_any.py
+++ b/tests/python/relay/test_any.py
@@ -879,6 +879,54 @@ def test_any_topk():
     verify_any_topk(any_dims(1), 0, (0,), "float32", ret_type="both")
 
 
+def verify_any_get_valid_counts(num_anchor_real, dtype, targets=None):
+    mod = tvm.IRModule()
+    batch_size = 1
+    num_anchor = relay.Any()
+    data = relay.var("data", shape=(batch_size, num_anchor, 5), dtype=dtype)
+    np_data = np.random.uniform(size=(batch_size, num_anchor_real, 5)).astype(dtype)
+
+    np_out1 = np.zeros(shape=(batch_size,))
+    np_out2 = np.zeros(shape=np_data.shape).astype(dtype)
+    np_out3 = np.zeros(shape=(batch_size, num_anchor_real))
+    score_threshold = 0.95
+
+    for i in range(batch_size):
+        np_out1[i] = 0
+        inter_idx = 0
+        for j in range(num_anchor_real):
+            score = np_data[i, j, 0]
+            if score > score_threshold:
+                for k in range(5):
+                    np_out2[i, inter_idx, k] = np_data[i, j, k]
+                np_out1[i] += 1
+                np_out3[i, inter_idx] = j
+                inter_idx += 1
+            if j >= np_out1[i]:
+                for k in range(5):
+                    np_out2[i, j, k] = -1.0
+                np_out3[i, j] = -1
+
+    z = relay.vision.get_valid_counts(data, score_threshold, 0, score_index=0)
+
+    mod["main"] = relay.Function([data], z.astuple())
+
+    check_result([np_data], mod, [np_out1, np_out2, np_out3], targets=targets)
+
+
+@tvm.testing.uses_gpu
+def test_any_get_valid_counts():
+    verify_any_get_valid_counts(10, "float32")
+    # opencl seems to have issues with empty size buffer
+    # Check failed: err_code == CL_SUCCESS == false: OpenCL Error,
+    # code=-61: CL_INVALID_BUFFER_SIZE
+    targets = []
+    for tgt, ctx in tvm.testing.enabled_targets():
+        if "opencl" not in tgt:
+            targets.append((tgt, ctx))
+    verify_any_get_valid_counts(0, "float32", targets=targets)
+
+
 @tvm.testing.uses_gpu
 def test_fused_ops():
     x = relay.var("x", shape=(relay.Any(), relay.Any()), dtype="float32")

From 969b77a209c24470ddc4e1e93eeb26e8ea74389b Mon Sep 17 00:00:00 2001
From: Dmitriy Smirnov <dmitriy.smirnov@arm.com>
Date: Wed, 20 Jan 2021 11:48:31 +0000
Subject: [PATCH 091/357] [BYOC][ACL] removed ACL 20.05 limitations (#7251)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Removed checks for padding in according with changes in ACL 20.11

*ACL stands for "Compute Library for the Arm® Architecture"
---
 .../tvm/relay/op/contrib/arm_compute_lib.py   | 39 ++------
 .../test_arm_compute_lib/infrastructure.py    |  2 +-
 .../test_arm_compute_lib/test_dense.py        | 99 +++++++++----------
 .../test_arm_compute_lib/test_network.py      |  2 +-
 4 files changed, 55 insertions(+), 87 deletions(-)

diff --git a/python/tvm/relay/op/contrib/arm_compute_lib.py b/python/tvm/relay/op/contrib/arm_compute_lib.py
index 8a03cb173612..139f25fef4fd 100644
--- a/python/tvm/relay/op/contrib/arm_compute_lib.py
+++ b/python/tvm/relay/op/contrib/arm_compute_lib.py
@@ -16,7 +16,6 @@
 # under the License.
 # pylint: disable=invalid-name, unused-argument
 """Arm Compute Library supported operators."""
-import numpy as np
 import tvm
 
 from tvm._ffi import register_func
@@ -382,7 +381,7 @@ def dense(expr):
         return False
     if attrs.out_dtype != "float32" and attrs.out_dtype != "":
         return False
-    return not require_padding([*args, expr.checked_type])
+    return True
 
 
 def qnn_dense(expr):
@@ -396,7 +395,7 @@ def qnn_dense(expr):
         return False
     if attrs.out_dtype != "int32":
         return False
-    return not require_padding([*args, expr.checked_type])
+    return True
 
 
 @tvm.ir.register_op_attr("nn.max_pool2d", "target.arm_compute_lib")
@@ -408,33 +407,7 @@ def max_pool2d(expr):
     typ = args[0].checked_type
     if typ.dtype not in ["float32", "uint8"]:
         return False
-    return not require_padding([*args, expr.checked_type])
-
-
-def require_padding(inputs):
-    """Checks whether supplied data will require padding.
-    Most of the operators ACL up to 20.11 uses padded data.
-    """
-
-    def _check(shape, dtype):
-        """NEON has 128bits/16bytes per vector"""
-        if len(shape) == 0:
-            return False
-        return (shape[-1] * np.dtype(dtype).itemsize) % 16 != 0
-
-    for i in inputs:
-        if isinstance(i, (tvm.relay.expr.Var, tvm.relay.expr.Call)):
-            if _check(i.checked_type.shape, i.checked_type.dtype):
-                return True
-        elif isinstance(i, tvm.relay.expr.Constant):
-            if _check(i.data.shape, i.data.dtype):
-                return True
-        elif isinstance(i, tvm.ir.tensor_type.TensorType):
-            if _check(i.shape, i.dtype):
-                return True
-        else:
-            raise RuntimeException("Not supported input type: %s" % type(i))
-    return False
+    return True
 
 
 @tvm.ir.register_op_attr("nn.avg_pool2d", "target.arm_compute_lib")
@@ -452,7 +425,7 @@ def avg_pool2d(expr, from_quantized_composite=False):
     if attrs.layout != "NHWC":
         return False
 
-    return not require_padding([*args, expr.checked_type])
+    return True
 
 
 @tvm.ir.register_op_attr("nn.global_max_pool2d", "target.arm_compute_lib")
@@ -464,7 +437,7 @@ def global_max_pool2d(expr):
         return False
     if attrs.layout != "NHWC":
         return False
-    return not require_padding([*args, expr.checked_type])
+    return True
 
 
 @tvm.ir.register_op_attr("nn.global_avg_pool2d", "target.arm_compute_lib")
@@ -476,7 +449,7 @@ def global_avg_pool2d(expr):
         return False
     if attrs.layout != "NHWC":
         return False
-    return not require_padding([*args, expr.checked_type])
+    return True
 
 
 @tvm.ir.register_op_attr("maximum", "target.arm_compute_lib")
diff --git a/tests/python/contrib/test_arm_compute_lib/infrastructure.py b/tests/python/contrib/test_arm_compute_lib/infrastructure.py
index 80cd5847440e..9a9bf69958f5 100644
--- a/tests/python/contrib/test_arm_compute_lib/infrastructure.py
+++ b/tests/python/contrib/test_arm_compute_lib/infrastructure.py
@@ -275,7 +275,7 @@ def extract_acl_modules(module):
 def verify_codegen(
     module,
     known_good_codegen,
-    num_acl_modules,
+    num_acl_modules=1,
     tvm_ops=0,
     target="llvm -mtriple=aarch64-linux-gnu -mattr=+neon",
 ):
diff --git a/tests/python/contrib/test_arm_compute_lib/test_dense.py b/tests/python/contrib/test_arm_compute_lib/test_dense.py
index dba7be67a012..e6620a4bc1cb 100644
--- a/tests/python/contrib/test_arm_compute_lib/test_dense.py
+++ b/tests/python/contrib/test_arm_compute_lib/test_dense.py
@@ -101,7 +101,7 @@ def _get_qnn_model(
     out = relay.qnn.op.requantize(
         out,
         relay.const(input_sc * kernel_sc, "float32"),  # input scale
-        relay.const(input_zp * kernel_zp, "int32"),  # input zero point
+        relay.const(0, "int32"),  # input zero point
         relay.const(output_sc, "float32"),  # output scale
         relay.const(output_zp, "int32"),  # output zero point
         out_dtype="uint8",
@@ -182,20 +182,18 @@ def test_dense():
 
     device = Device()
     np.random.seed(0)
-
     dtype = "float32"
     trials = [
-        [(1, 128), (16, 128), 16, True, 1],
-        [(1, 128), (16, 128), 16, False, 1],
-        [(32, 32), (32, 32), 32, True, 1],
-        [(32, 32), (32, 32), 32, False, 1],
-        [(1, 64), (1, 64), 1, True, 0],
-        [(1, 64), (1, 64), 1, False, 0],
-        [(11, 2), (2, 2), 2, True, 0],
-        [(11, 2), (2, 2), 2, False, 0],
+        [(1, 128), (16, 128), 16, True],
+        [(1, 128), (16, 128), 16, False],
+        [(32, 32), (32, 32), 32, True],
+        [(32, 32), (32, 32), 32, False],
+        [(1, 64), (1, 64), 1, True],
+        [(1, 64), (1, 64), 1, False],
+        [(11, 2), (2, 2), 2, True],
+        [(11, 2), (2, 2), 2, False],
     ]
-
-    for shape, weight_shape, units, composite, acl_partitions in trials:
+    for shape, weight_shape, units, composite in trials:
         outputs = []
         inputs = {"a": tvm.nd.array(np.random.uniform(-128, 127, shape).astype(dtype))}
         func, params = _get_model(
@@ -210,11 +208,8 @@ def test_dense():
                     params,
                     device,
                     enable_acl=acl,
-                    tvm_ops=(1 - acl_partitions) * (2 - int(not composite)),
-                    acl_partitions=acl_partitions,
                 )[0]
             )
-
         config = {
             "shape": shape,
             "weight_shape": weight_shape,
@@ -230,27 +225,25 @@ def test_codegen_dense():
         return
 
     np.random.seed(0)
-
     dtype = "float32"
     trials = [
-        [(1, 128), (16, 128), 16, True, 1],
-        [(1, 128), (16, 128), 16, False, 1],
-        [(32, 32), (32, 32), 32, True, 1],
-        [(32, 32), (32, 32), 32, False, 1],
-        [(1, 64), (1, 64), 1, True, 0],
-        [(1, 64), (1, 64), 1, False, 0],
+        [(1, 128), (16, 128), 16, True],
+        [(1, 128), (16, 128), 16, False],
+        [(32, 32), (32, 32), 32, True],
+        [(32, 32), (32, 32), 32, False],
+        [(1, 64), (1, 64), 1, True],
+        [(1, 64), (1, 64), 1, False],
+        [(11, 2), (2, 2), 2, True],
+        [(11, 2), (2, 2), 2, False],
     ]
-
-    for shape, weight_shape, units, composite, acl_partitions in trials:
+    for shape, weight_shape, units, composite in trials:
         inputs = {"a"}
 
         args = (shape, weight_shape, units, dtype)
 
         func, params = _get_model(*args, var_names=iter(inputs), has_bias=composite)
         exp_codegen = _get_expected_codegen(*args, has_bias=composite)
-        verify_codegen(
-            func, exp_codegen, acl_partitions, (1 - acl_partitions) * (2 - int(not composite))
-        )
+        verify_codegen(func, exp_codegen)
 
 
 def test_qnn_dense():
@@ -264,19 +257,20 @@ def test_qnn_dense():
 
     dtype = "uint8"
     trials = [
-        [(4, 4), (4, 4), 4, True, 0],
-        [(4, 4), (4, 4), 4, False, 0],
-        [(16, 16), (4, 16), 4, True, 1],
-        [(16, 16), (4, 16), 4, False, 1],
-        [(1, 128), (16, 128), 16, True, 1],
-        [(1, 128), (16, 128), 16, False, 1],
-        [(32, 32), (32, 32), 32, True, 1],
-        [(32, 32), (32, 32), 32, False, 1],
-        [(1, 64), (1, 64), 1, True, 0],
-        [(1, 64), (1, 64), 1, False, 0],
+        [(1, 2), (2, 2), 2, True],
+        [(1, 2), (2, 2), 2, False],
+        [(4, 4), (4, 4), 4, True],
+        [(4, 4), (4, 4), 4, False],
+        [(16, 16), (4, 16), 4, True],
+        [(16, 16), (4, 16), 4, False],
+        [(1, 128), (16, 128), 16, True],
+        [(1, 128), (16, 128), 16, False],
+        [(32, 32), (32, 32), 32, True],
+        [(32, 32), (32, 32), 32, False],
+        [(1, 64), (1, 64), 1, True],
+        [(1, 64), (1, 64), 1, False],
     ]
-
-    for shape, weight_shape, units, composite, acl_partitions in trials:
+    for shape, weight_shape, units, composite in trials:
         outputs = []
         inputs = {"a": tvm.nd.array(np.random.uniform(0, 255, shape).astype(dtype))}
         input_zp = 100
@@ -310,8 +304,6 @@ def test_qnn_dense():
                     1,
                     params,
                     device,
-                    tvm_ops=(1 - acl_partitions) * (3 - int(not composite)),
-                    acl_partitions=acl_partitions,
                     enable_acl=acl,
                 )[0]
             )
@@ -340,15 +332,20 @@ def test_codegen_qnn_dense():
 
     dtype = "uint8"
     trials = [
-        [(1, 128), (16, 128), 16, True, 1],
-        [(1, 128), (16, 128), 16, False, 1],
-        [(32, 32), (32, 32), 32, True, 1],
-        [(32, 32), (32, 32), 32, False, 1],
-        [(1, 64), (1, 64), 1, True, 0],
-        [(1, 64), (1, 64), 1, False, 0],
+        [(1, 2), (2, 2), 2, True],
+        [(1, 2), (2, 2), 2, False],
+        [(4, 4), (4, 4), 4, True],
+        [(4, 4), (4, 4), 4, False],
+        [(16, 16), (4, 16), 4, True],
+        [(16, 16), (4, 16), 4, False],
+        [(1, 128), (16, 128), 16, True],
+        [(1, 128), (16, 128), 16, False],
+        [(32, 32), (32, 32), 32, True],
+        [(32, 32), (32, 32), 32, False],
+        [(1, 64), (1, 64), 1, True],
+        [(1, 64), (1, 64), 1, False],
     ]
-
-    for shape, weight_shape, units, composite, acl_partitions in trials:
+    for shape, weight_shape, units, composite in trials:
         inputs = {"a"}
         args = (shape, weight_shape, units, dtype)
 
@@ -372,9 +369,7 @@ def test_codegen_qnn_dense():
             has_bias=composite,
         )
         exp_codegen = _get_expected_codegen(*args, has_bias=composite)
-        verify_codegen(
-            func, exp_codegen, acl_partitions, (1 - acl_partitions) * (3 - int(not composite))
-        )
+        verify_codegen(func, exp_codegen)
 
 
 if __name__ == "__main__":
diff --git a/tests/python/contrib/test_arm_compute_lib/test_network.py b/tests/python/contrib/test_arm_compute_lib/test_network.py
index 462df143b447..bb44b79078dd 100644
--- a/tests/python/contrib/test_arm_compute_lib/test_network.py
+++ b/tests/python/contrib/test_arm_compute_lib/test_network.py
@@ -172,7 +172,7 @@ def get_model():
         return mod, params, inputs
 
     _build_and_run_network(
-        *get_model(), device=device, tvm_ops=10, acl_partitions=30, atol=8, rtol=0
+        *get_model(), device=device, tvm_ops=9, acl_partitions=31, atol=8, rtol=0
     )
 
 

From 727345e4edfafbb6103e52039517c86a6f81217b Mon Sep 17 00:00:00 2001
From: Haichen Shen <shenhaichen@gmail.com>
Date: Wed, 20 Jan 2021 12:40:24 -0800
Subject: [PATCH 092/357] [COMMUNITY] tkonolige -> Reviewer (#7311)

---
 CONTRIBUTORS.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index 36743a345d21..185202a9700c 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -102,6 +102,7 @@ We do encourage everyone to work anything they are interested in.
 - [Xiaoqiang Dan](https://github.com/xqdan): @xqdan
 - [Ziheng Jiang](https://github.com/ZihengJiang): @ZihengJiang
 - [Marisa Kirisame](https://github.com/MarisaKirisame): @MarisaKirisame
+- [Tristan Konolige](https://github.com/tkonolige): @tkonolige
 - [Wuwei Lin](https://github.com/vinx13): @vinx13
 - [Andrew Liu](https://github.com/hypercubestart): @hypercubestart
 - [Henry Liu](https://github.com/optima2005): @optima2005

From e8ab6079920dcac57d7b89582ec6a609d6363dd9 Mon Sep 17 00:00:00 2001
From: Dmitriy Smirnov <dmitriy.smirnov@arm.com>
Date: Thu, 21 Jan 2021 01:48:16 +0000
Subject: [PATCH 093/357] [TFLite] Strided slice handling of shrink_axis_mask
 improved (#6998)

* [TFLite] Strided slice handlig of shrink_axis_mask improved

1. Added removal of dimensions if result is a scalar
to mimic TensorFlow behaviour. E.g.:
    tf.strided_slice([1,2,3], [0], [1], [1], shrink_axis_mask=0)
    <tf.Tensor: shape=(1,), dtype=int32, numpy=array([1], dtype=int32)>

    tf.strided_slice([[[1,2,3],[4,5,6],[7,8,9]]], [0, 0, 0], [3, 3, 3], [1, 1, 1], shrink_axis_mask=7)
    <tf.Tensor: shape=(), dtype=int32, numpy=1>

2. Added extra check to assert_allclose to check shape equalities
as np.testing.assert_allclose() does not distinguish between cases like:

    np.testing.assert_allclose(1, np.array(1))
    np.testing.assert_allclose(1, np.array([1]))
    np.testing.assert_allclose(np.array(1), np.array([1]))

* unit tests fixed
---
 python/tvm/relay/frontend/tflite.py          |  7 ++++++-
 python/tvm/testing.py                        |  3 +++
 tests/python/frontend/tflite/test_forward.py | 18 ++++++++++++++++++
 tests/python/integration/test_dot.py         |  4 ++--
 tests/python/integration/test_reduce.py      | 14 +++++++-------
 5 files changed, 36 insertions(+), 10 deletions(-)

diff --git a/python/tvm/relay/frontend/tflite.py b/python/tvm/relay/frontend/tflite.py
index 316815c980e3..f474e59407e0 100644
--- a/python/tvm/relay/frontend/tflite.py
+++ b/python/tvm/relay/frontend/tflite.py
@@ -1613,14 +1613,19 @@ def _transform_mask(stride_dim, ellipsis_mask):
 
         # Create final output shape.
         final_output = []
+        final_len = len(fshape_indices)
         for gather_index in fshape_indices:
             if gather_index == -1:
                 final_output.append(1)
+                final_len += 1
             elif gather_index == -2:
-                pass
+                final_len -= 1
             else:
                 final_output.append(out_shape[gather_index])
 
+        if final_len == 0:
+            return _op.squeeze(out, axis=tuple(range(len(fshape_indices))))
+
         if not final_output:
             return out
         return _op.reshape(out, newshape=tuple(final_output))
diff --git a/python/tvm/testing.py b/python/tvm/testing.py
index 8311a63d0749..d65ab23677b5 100644
--- a/python/tvm/testing.py
+++ b/python/tvm/testing.py
@@ -76,6 +76,9 @@ def assert_allclose(actual, desired, rtol=1e-7, atol=1e-7):
     compares the `abs(actual-desired)` with `atol+rtol*abs(desired)`.  Since we
     often allow `desired` to be close to zero, we generally want non-zero `atol`.
     """
+    actual = np.asanyarray(actual)
+    desired = np.asanyarray(desired)
+    np.testing.assert_allclose(actual.shape, desired.shape)
     np.testing.assert_allclose(actual, desired, rtol=rtol, atol=atol, verbose=True)
 
 
diff --git a/tests/python/frontend/tflite/test_forward.py b/tests/python/frontend/tflite/test_forward.py
index f3653014be00..6847fd96f37c 100644
--- a/tests/python/frontend/tflite/test_forward.py
+++ b/tests/python/frontend/tflite/test_forward.py
@@ -583,6 +583,24 @@ def _test_stridedslice(
 def test_forward_stridedslice():
     """test StridedSlice"""
     for quantized in [False, True]:
+        _test_stridedslice(
+            (1, 3, 3),
+            [0, 0, 0],
+            [3, 3, 3],
+            [1, 1, 1],
+            "float32",
+            shrink_axis_mask=7,
+            quantized=quantized,
+        )
+        _test_stridedslice(
+            (1, 3, 3),
+            [0, 0, 0],
+            [3, 3, 3],
+            [1, 1, 1],
+            "float32",
+            shrink_axis_mask=5,
+            quantized=quantized,
+        )
         _test_stridedslice((2), [1], [1], [1], "float32", shrink_axis_mask=1, quantized=quantized)
         _test_stridedslice(
             (3, 4, 3), [1, -1, 0], [4, -5, 3], [2, -1, 1], "float32", quantized=quantized
diff --git a/tests/python/integration/test_dot.py b/tests/python/integration/test_dot.py
index d4364c88dc9a..609b6dedfb3a 100644
--- a/tests/python/integration/test_dot.py
+++ b/tests/python/integration/test_dot.py
@@ -27,7 +27,7 @@ def test_dot():
     A = te.placeholder((n,), name="A")
     B = te.placeholder((n,), name="B")
     k = te.reduce_axis((0, n), "k")
-    C = te.compute((1,), lambda _: te.sum(A[k] * B[k], axis=k), name="C")
+    C = te.compute((), lambda: te.sum(A[k] * B[k], axis=k), name="C")
     s = te.create_schedule(C.op)
 
     def verify(target):
@@ -36,7 +36,7 @@ def verify(target):
         ctx = tvm.cpu(0)
         a = tvm.nd.array(np.random.uniform(size=(nn,)).astype(A.dtype), ctx)
         b = tvm.nd.array(np.random.uniform(size=(nn,)).astype(B.dtype), ctx)
-        c = tvm.nd.array(np.zeros((1,), dtype=C.dtype), ctx)
+        c = tvm.nd.array(np.zeros((), dtype=C.dtype), ctx)
         f(a, b, c)
         tvm.testing.assert_allclose(c.asnumpy(), np.dot(a.asnumpy(), b.asnumpy()), rtol=1e-4)
 
diff --git a/tests/python/integration/test_reduce.py b/tests/python/integration/test_reduce.py
index b02b7980f37a..e978b83aabd6 100644
--- a/tests/python/integration/test_reduce.py
+++ b/tests/python/integration/test_reduce.py
@@ -73,7 +73,7 @@ def test_init_imm():
     n = tvm.runtime.convert(1027)
     A = te.placeholder((n,), name="A")
     k = te.reduce_axis((0, n))
-    B = te.compute((1,), lambda i: te.sum(A[k], axis=k, init=10.0), name="B")
+    B = te.compute((), lambda: te.sum(A[k], axis=k, init=10.0), name="B")
     # schedule
     s = te.create_schedule(B.op)
     # one line to build the function.
@@ -86,7 +86,7 @@ def check_target(target="llvm"):
         # launch the kernel.
         n = 1027
         a = tvm.nd.array(np.random.uniform(size=(n,)).astype(A.dtype), ctx)
-        b = tvm.nd.array(np.zeros(1, dtype=B.dtype), ctx)
+        b = tvm.nd.array(np.zeros((), dtype=B.dtype), ctx)
         fsum(a, b)
         res = 10.0 + np.sum(a.asnumpy(), axis=0)
         tvm.testing.assert_allclose(b.asnumpy(), res, rtol=1e-4)
@@ -129,7 +129,7 @@ def test_rfactor():
     n = tvm.runtime.convert(1027)
     A = te.placeholder((n,), name="A")
     k = te.reduce_axis((0, n))
-    B = te.compute((1,), lambda i: te.sum(A[k], axis=k), name="B")
+    B = te.compute((), lambda: te.sum(A[k], axis=k), name="B")
     # schedule
     s = te.create_schedule(B.op)
     kf, ki = s[B].split(k, nparts=4)
@@ -145,7 +145,7 @@ def check_target(target="llvm"):
         # launch the kernel.
         n = 1027
         a = tvm.nd.array(np.random.uniform(size=(n,)).astype(A.dtype), ctx)
-        b = tvm.nd.array(np.zeros(1, dtype=B.dtype), ctx)
+        b = tvm.nd.array(np.zeros((), dtype=B.dtype), ctx)
         fsum(a, b)
         res = np.sum(a.asnumpy(), axis=0)
         tvm.testing.assert_allclose(b.asnumpy(), res, rtol=1e-4)
@@ -191,11 +191,11 @@ def test_rfactor_factor_axis():
     n = tvm.runtime.convert(1027)
     A = te.placeholder((n,), name="A")
     k = te.reduce_axis((0, n))
-    B = te.compute((1,), lambda i: te.sum(A[k], axis=k), name="B")
+    B = te.compute((), lambda: te.sum(A[k], axis=k), name="B")
     # schedule
     s = te.create_schedule(B.op)
     kf, ki = s[B].split(k, nparts=4)
-    BF = s.rfactor(B, kf, 1)
+    BF = s.rfactor(B, kf, 0)
     s[BF].parallel(BF.op.axis[0])
     # one line to build the function.
     def check_target(target="llvm"):
@@ -207,7 +207,7 @@ def check_target(target="llvm"):
         # launch the kernel.
         n = 1027
         a = tvm.nd.array(np.random.uniform(size=(n,)).astype(A.dtype), ctx)
-        b = tvm.nd.array(np.zeros(1, dtype=B.dtype), ctx)
+        b = tvm.nd.array(np.zeros((), dtype=B.dtype), ctx)
         fsum(a, b)
         res = np.sum(a.asnumpy(), axis=0)
         tvm.testing.assert_allclose(b.asnumpy(), res, rtol=1e-4)

From f82940393e86bf0b94d4e35d35a1684223092b33 Mon Sep 17 00:00:00 2001
From: masahi <masahi129@gmail.com>
Date: Thu, 21 Jan 2021 15:05:44 +0900
Subject: [PATCH 094/357] [TOPI] Rewrite GPU argwhere using exclusive scan
 (#7314)

* use ex scan to write argwhere

* add doc
---
 python/tvm/contrib/nvcc.py       |   2 +-
 python/tvm/topi/cuda/argwhere.py | 524 ++++++-------------------------
 2 files changed, 98 insertions(+), 428 deletions(-)

diff --git a/python/tvm/contrib/nvcc.py b/python/tvm/contrib/nvcc.py
index bc11e4a867e4..5886760934fb 100644
--- a/python/tvm/contrib/nvcc.py
+++ b/python/tvm/contrib/nvcc.py
@@ -186,7 +186,7 @@ def find_libdevice_path(arch):
     selected_ver = 0
     selected_path = None
     cuda_ver = get_cuda_version(cuda_path)
-    if cuda_ver in (9.0, 9.1, 10.0, 10.1, 10.2, 11.0, 11.1):
+    if cuda_ver in (9.0, 9.1, 10.0, 10.1, 10.2, 11.0, 11.1, 11.2):
         path = os.path.join(lib_path, "libdevice.10.bc")
     else:
         for fn in os.listdir(lib_path):
diff --git a/python/tvm/topi/cuda/argwhere.py b/python/tvm/topi/cuda/argwhere.py
index e39004dc76a9..cc6c4c26eddb 100644
--- a/python/tvm/topi/cuda/argwhere.py
+++ b/python/tvm/topi/cuda/argwhere.py
@@ -21,169 +21,135 @@
 
 import tvm
 from tvm import te
-from tvm._ffi import get_global_func
 from .injective import schedule_injective_from_existing
-from .nms import atomic_add
-from .sort import topk, topk_thrust, argsort, argsort_thrust
+from .scan import exclusive_scan
 from .. import tag
-from ..transform import strided_slice, adv_index, squeeze
-
-logger = logging.getLogger("topi")
+from ..utils import ceil_div, prod
+from ..transform import reshape
+from ..broadcast import not_equal
+from ..math import cast
 
 
-def _get_sort_func(mode=0):
-    """Get sort function for argwhere. mode 0 for topk and others for argsort."""
-    if get_global_func("tvm.contrib.thrust.sort", allow_missing=True):
-        ret = topk_thrust if mode == 0 else argsort_thrust
-    else:
-        logger.warning(
-            "It's highly recommended to enable thrust library with set(USE_THRUST ON)"
-            " when compiling argwhere for cuda target. Otherwise, it can result in"
-            " significant performance degradation or incorrect result"
-        )
-        ret = topk if mode == 0 else argsort
+logger = logging.getLogger("topi")
 
-    return ret
+fdiv = tvm.tir.floordiv
+fmod = tvm.tir.floormod
 
 
-def argwhere_1d_ir(condition, out):
-    """Low level IR for argwhere 1D
+def compact_nonzero_indices_ir(condition, write_indices, out, do_write_func):
+    """Copy nonzero indices to the corresponding write locations.
 
     Parameters
     ----------
     condition : Buffer
-        The condition buffer.
+        The input condition.
+
+    write_indices : Buffer
+        The result of exclusive scan on a boolean array, where True indicates that
+        the condition is non zero at that position.
 
     out : Buffer
-        The output buffer.
+        The output buffer to copy indices to.
+
+    do_write_func : a function
+        A callback that accepts an output buffer, a dst index to write to, and a src index.
 
     Returns
     -------
     stmt : Stmt
         The result IR statement.
     """
+
     ib = tvm.tir.ir_builder.create()
-    a0 = condition.shape[0]
+    size_1d = prod(condition.shape)
 
     condition = ib.buffer_ptr(condition)
+    write_indices = ib.buffer_ptr(write_indices)
     out = ib.buffer_ptr(out)
 
-    valid_index = ib.allocate("int32", (1,), name="valid_index", scope="global")
-    tmp = ib.allocate("int32", (1,), name="tmp", scope="local")
-    one_count = tvm.tir.const(1, dtype="int32")
-
-    max_threads = int(tvm.target.Target.current(allow_none=False).max_num_threads)
-    nthread_tx = max_threads
-    # Limit threads to a single block to make sure atomic_add works normally.
+    nthread_tx = int(tvm.target.Target.current(allow_none=False).max_num_threads)
+    nthread_bx = ceil_div(size_1d, nthread_tx)
     tx = te.thread_axis("threadIdx.x")
+    bx = te.thread_axis("blockIdx.x")
     ib.scope_attr(tx, "thread_extent", nthread_tx)
-    len_inner_for = a0 // nthread_tx + 1
-    valid_index[0] = 0
+    ib.scope_attr(bx, "thread_extent", nthread_bx)
 
-    with ib.for_range(0, len_inner_for, name="i") as i:
-        idx = tx * len_inner_for + i
-        with ib.if_scope(idx < a0):
+    with ib.new_scope():
+        idx = bx * nthread_tx + tx
+        with ib.if_scope(idx < size_1d):
             with ib.if_scope(condition[idx] != 0):
-                tmp[0] = atomic_add(
-                    tvm.tir.call_intrin("handle", "tir.address_of", valid_index[0]),
-                    one_count,
-                )
-                out[tmp[0]] = idx
+                do_write_func(out, write_indices[idx], idx)
 
     return ib.get()
 
 
-def argwhere_1d(output_shape, condition):
-    """Compute for argwhere 1D
+def argwhere_common(output_shape, condition, do_write_func):
+    """A common compute used by argwhere of various ranks.
 
     Parameters
     ----------
-    condition : list of int or tvm.tir.Any
-        The output shape
+    output_shape : list of int or tvm.tir.Any
+        Tensor with output shape info.
 
-    out : tvm.te.Tensor
-        Tensor with boolean values.
+    condition : tvm.te.Tensor
+        The input condition.
+
+    do_write_func : a function
+        A callback that accepts an output buffer, a dst index to write to, and a src index.
 
     Returns
     -------
-    stmt : Stmt
-        The result IR statement.
+    out : tvm.te.Tensor
+        Indices of non-zero elements.
     """
+
+    flags = not_equal(condition, tvm.tir.const(0))
+    flags_1d = reshape(flags, (prod(flags.shape),))
+    write_indices = exclusive_scan(cast(flags_1d, dtype="int32"))
+
     condition_buf = tvm.tir.decl_buffer(
         condition.shape, condition.dtype, "data_buf", data_alignment=8
     )
+    write_indices_buf = tvm.tir.decl_buffer(
+        write_indices.shape, write_indices.dtype, "write_indices_buf", data_alignment=8
+    )
     out_buf = tvm.tir.decl_buffer(output_shape, "int32", "out_buf", data_alignment=8)
 
     out = te.extern(
         [output_shape],
-        [condition],
-        lambda ins, outs: argwhere_1d_ir(ins[0], outs[0]),
+        [condition, write_indices],
+        lambda ins, outs: compact_nonzero_indices_ir(ins[0], ins[1], outs[0], do_write_func),
         dtype=["int32"],
-        in_buffers=[condition_buf],
+        in_buffers=[condition_buf, write_indices_buf],
         out_buffers=[out_buf],
-        name="argwhere_1d",
-        tag="argwhere1d_gpu",
+        name="argwhere",
+        tag="argwhere_gpu",
     )
 
-    if isinstance(out.shape[0], (int, tvm.tir.expr.IntImm)) and int(out.shape[0]) <= 1:
-        return out
-
-    sorted_out = _get_sort_func()(
-        out, k=0, axis=0, ret_type="values", is_ascend="True", dtype="int32"
-    )
-
-    return sorted_out
+    return out
 
 
-def argwhere_2d_ir(condition, out):
-    """Low level IR for argwhere 2D
+def argwhere_1d(output_shape, condition):
+    """Compute for argwhere 1D
 
     Parameters
     ----------
-    condition : Buffer
-        The condition buffer.
+    condition : list of int or tvm.tir.Any
+        The output shape
 
-    out : Buffer
-        The output buffer.
+    out : tvm.te.Tensor
+        Tensor with boolean values.
 
     Returns
     -------
     stmt : Stmt
         The result IR statement.
     """
-    ib = tvm.tir.ir_builder.create()
-    a0 = condition.shape[0]
-    a1 = condition.shape[1]
 
-    condition = ib.buffer_ptr(condition)
-    out = ib.buffer_ptr(out)
+    def do_write(out, write_index, idx):
+        out[write_index] = idx
 
-    valid_index = ib.allocate("int32", (1,), name="valid_index", scope="local")
-    tmp = ib.allocate("int32", (1,), name="tmp", scope="local")
-    one_count = tvm.tir.const(1, dtype="int32")
-
-    max_threads = int(tvm.target.Target.current(allow_none=False).max_num_threads)
-    nthread_tx = max_threads
-
-    # Limit threads to a single block to make sure atomic_add works normally.
-    tx = te.thread_axis("threadIdx.x")
-    ib.scope_attr(tx, "thread_extent", nthread_tx)
-    len_inner_for = (a0 * a1) // nthread_tx + 1
-
-    valid_index[0] = 0
-
-    with ib.for_range(0, len_inner_for, name="i") as i:
-        idx = tx * len_inner_for + i
-        with ib.if_scope(idx < (a0 * a1)):
-            with ib.if_scope(condition[idx] != 0):
-                tmp[0] = atomic_add(
-                    tvm.tir.call_intrin("handle", "tir.address_of", valid_index[0]),
-                    one_count,
-                )
-                out[tmp[0] * 2] = tvm.tir.floordiv(idx, a1)
-                out[tmp[0] * 2 + 1] = tvm.tir.floormod(idx, a1)
-
-    return ib.get()
+    return argwhere_common(output_shape, condition, do_write)
 
 
 def argwhere_2d(output_shape, condition):
@@ -202,109 +168,13 @@ def argwhere_2d(output_shape, condition):
     stmt : Stmt
         The result IR statement.
     """
-    condition_buf = tvm.tir.decl_buffer(
-        condition.shape, condition.dtype, "data_buf", data_alignment=8
-    )
-    out_buf = tvm.tir.decl_buffer(output_shape, "int32", "out_buf", data_alignment=8)
-
-    out = te.extern(
-        [output_shape],
-        [condition],
-        lambda ins, outs: argwhere_2d_ir(ins[0], outs[0]),
-        dtype=["int32"],
-        in_buffers=[condition_buf],
-        out_buffers=[out_buf],
-        name="argwhere_2d",
-        tag="argwhere2d_gpu",
-    )
-
-    if isinstance(out.shape[0], (int, tvm.tir.expr.IntImm)) and int(out.shape[0]) <= 1:
-        return out
-
-    sort_func = _get_sort_func(1)
-
-    # sort the output from the least significant to the most significant
-    # column.
-    if isinstance(out.shape[0], (int, tvm.tir.expr.IntImm)):
-        out1 = strided_slice(out, [0, 1], [out.shape[0], 2])
-        out2 = sort_func(out1, axis=0, dtype="int32")
-        out3 = squeeze(out2)
-        out = adv_index(out, [out3])
-
-        out1 = strided_slice(out, [0, 0], [out.shape[0], 1])
-        out2 = sort_func(out1, axis=0, dtype="int32")
-        out3 = squeeze(out2)
-
-        out = adv_index(out, [out3])
-    else:
-        out1 = strided_slice(out, [0, 1], [out.shape[0], 2], [1, 1])
-        out2 = sort_func(out1, axis=0, dtype="int32")
-        out3 = squeeze(out2)
-        out = adv_index(out, [out3])
-
-        out1 = strided_slice(out, [0, 0], [out.shape[0], 1], [1, 1])
-        out2 = sort_func(out1, axis=0, dtype="int32")
-        out3 = squeeze(out2)
-        out = adv_index(out, [out3])
-    return out
-
-
-def argwhere_3d_ir(condition, out):
-    """Low level IR for argwhere 3D
-
-    Parameters
-    ----------
-    condition : Buffer
-        The condition buffer.
-
-    out : Buffer
-        The output buffer.
-
-    Returns
-    -------
-    stmt : Stmt
-        The result IR statement.
-    """
-    ib = tvm.tir.ir_builder.create()
-    a0 = condition.shape[0]
-    a1 = condition.shape[1]
-    a2 = condition.shape[2]
-    s1 = a1 * a2
-    s0 = a0 * s1
-
-    condition = ib.buffer_ptr(condition)
-    out = ib.buffer_ptr(out)
-
-    valid_index = ib.allocate("int32", (1,), name="valid_index", scope="local")
-    tmp = ib.allocate("int32", (1,), name="tmp", scope="local")
-    one_count = tvm.tir.const(1, dtype="int32")
 
-    max_threads = int(tvm.target.Target.current(allow_none=False).max_num_threads)
-    nthread_tx = max_threads
+    def do_write(out, write_index, idx):
+        a1 = condition.shape[1]
+        out[write_index * 2] = tvm.tir.floordiv(idx, a1)
+        out[write_index * 2 + 1] = tvm.tir.floormod(idx, a1)
 
-    # Limit threads to a single block to make sure atomic_add works normally.
-    tx = te.thread_axis("threadIdx.x")
-    ib.scope_attr(tx, "thread_extent", nthread_tx)
-    len_inner_for = s0 // nthread_tx + 1
-
-    fdiv = tvm.tir.floordiv
-    fmod = tvm.tir.floormod
-
-    valid_index[0] = 0
-
-    with ib.for_range(0, len_inner_for, name="i") as i:
-        idx = tx * len_inner_for + i
-        with ib.if_scope(idx < s0):
-            with ib.if_scope(condition[idx] != 0):
-                tmp[0] = atomic_add(
-                    tvm.tir.call_intrin("handle", "tir.address_of", valid_index[0]),
-                    one_count,
-                )
-                out[tmp[0] * 3] = fdiv(idx, s1)
-                out[tmp[0] * 3 + 1] = fdiv(fmod(idx, s1), a2)
-                out[tmp[0] * 3 + 2] = fmod(idx, a2)
-
-    return ib.get()
+    return argwhere_common(output_shape, condition, do_write)
 
 
 def argwhere_3d(output_shape, condition):
@@ -323,103 +193,15 @@ def argwhere_3d(output_shape, condition):
     stmt : Stmt
         The result IR statement.
     """
-    condition_buf = tvm.tir.decl_buffer(
-        condition.shape, condition.dtype, "data_buf", data_alignment=8
-    )
-    out_buf = tvm.tir.decl_buffer(output_shape, "int32", "out_buf", data_alignment=8)
-
-    out = te.extern(
-        [output_shape],
-        [condition],
-        lambda ins, outs: argwhere_3d_ir(ins[0], outs[0]),
-        dtype=["int32"],
-        in_buffers=[condition_buf],
-        out_buffers=[out_buf],
-        name="argwhere_3d",
-        tag="argwhere3d_gpu",
-    )
-
-    if isinstance(out.shape[0], (int, tvm.tir.expr.IntImm)) and int(out.shape[0]) <= 1:
-        return out
-
-    # sort the output from the least significant to the most significant
-    # column.
-    sort_func = _get_sort_func(1)
-
-    if isinstance(out.shape[0], (int, tvm.tir.expr.IntImm)):
-        for i in reversed(range(3)):
-            out1 = strided_slice(out, [0, i], [out.shape[0], i + 1])
-            out2 = sort_func(out1, axis=0, dtype="int32")
-            out3 = squeeze(out2)
-            out = adv_index(out, [out3])
-    else:
-        for i in reversed(range(3)):
-            out1 = strided_slice(out, [0, i], [out.shape[0], i + 1], [1, 1])
-            out2 = sort_func(out1, axis=0, dtype="int32")
-            out3 = squeeze(out2)
-            out = adv_index(out, [out3])
-    return out
-
-
-def argwhere_4d_ir(condition, out):
-    """Low level IR for argwhere 4D
-
-    Parameters
-    ----------
-    condition : Buffer
-        The condition buffer.
-
-    out : Buffer
-        The output buffer.
-
-    Returns
-    -------
-    stmt : Stmt
-        The result IR statement.
-    """
-    ib = tvm.tir.ir_builder.create()
-    a0 = condition.shape[0]
-    a1 = condition.shape[1]
-    a2 = condition.shape[2]
-    a3 = condition.shape[3]
-    s1 = a2 * a3
-    s2 = a1 * s1
-    s0 = a0 * s2
-
-    condition = ib.buffer_ptr(condition)
-    out = ib.buffer_ptr(out)
-
-    valid_index = ib.allocate("int32", (1,), name="valid_index", scope="local")
-    tmp = ib.allocate("int32", (1,), name="tmp", scope="local")
-    one_count = tvm.tir.const(1, dtype="int32")
-
-    max_threads = int(tvm.target.Target.current(allow_none=False).max_num_threads)
-    nthread_tx = max_threads
-
-    # Limit threads to a single block to make sure atomic_add works normally.
-    tx = te.thread_axis("threadIdx.x")
-    ib.scope_attr(tx, "thread_extent", nthread_tx)
-    len_inner_for = s0 // nthread_tx + 1
-
-    fdiv = tvm.tir.floordiv
-    fmod = tvm.tir.floormod
 
-    valid_index[0] = 0
+    def do_write(out, write_index, idx):
+        _, a1, a2 = condition.shape
+        s1 = a1 * a2
+        out[write_index * 3] = fdiv(idx, s1)
+        out[write_index * 3 + 1] = fdiv(fmod(idx, s1), a2)
+        out[write_index * 3 + 2] = fmod(idx, a2)
 
-    with ib.for_range(0, len_inner_for, name="i") as i:
-        idx = tx * len_inner_for + i
-        with ib.if_scope(idx < s0):
-            with ib.if_scope(condition[idx] != 0):
-                tmp[0] = atomic_add(
-                    tvm.tir.call_intrin("handle", "tir.address_of", valid_index[0]),
-                    one_count,
-                )
-                out[tmp[0] * 4] = fdiv(idx, s2)
-                out[tmp[0] * 4 + 1] = fdiv(fmod(idx, s2), s1)
-                out[tmp[0] * 4 + 2] = fdiv(fmod(idx, s1), a3)
-                out[tmp[0] * 4 + 3] = fmod(idx, a3)
-
-    return ib.get()
+    return argwhere_common(output_shape, condition, do_write)
 
 
 def argwhere_4d(output_shape, condition):
@@ -438,106 +220,17 @@ def argwhere_4d(output_shape, condition):
     stmt : Stmt
         The result IR statement.
     """
-    condition_buf = tvm.tir.decl_buffer(
-        condition.shape, condition.dtype, "data_buf", data_alignment=8
-    )
-    out_buf = tvm.tir.decl_buffer(output_shape, "int32", "out_buf", data_alignment=8)
-
-    out = te.extern(
-        [output_shape],
-        [condition],
-        lambda ins, outs: argwhere_4d_ir(ins[0], outs[0]),
-        dtype=["int32"],
-        in_buffers=[condition_buf],
-        out_buffers=[out_buf],
-        name="argwhere_4d",
-        tag="argwhere4d_gpu",
-    )
-
-    if isinstance(out.shape[0], (int, tvm.tir.expr.IntImm)) and int(out.shape[0]) <= 1:
-        return out
-
-    # sort the output from the least significant to the most significant
-    # column.
-    sort_func = _get_sort_func(1)
-    if isinstance(out.shape[0], (int, tvm.tir.expr.IntImm)):
-        for i in reversed(range(4)):
-            out1 = strided_slice(out, [0, i], [out.shape[0], i + 1])
-            out2 = sort_func(out1, axis=0, dtype="int32")
-            out3 = squeeze(out2)
-            out = adv_index(out, [out3])
-    else:
-        for i in reversed(range(4)):
-            out1 = strided_slice(out, [0, i], [out.shape[0], i + 1], [1, 1])
-            out2 = sort_func(out1, axis=0, dtype="int32")
-            out3 = squeeze(out2)
-            out = adv_index(out, [out3])
-
-    return out
-
-
-def argwhere_5d_ir(condition, out):
-    """Low level IR for argwhere 5D
-
-    Parameters
-    ----------
-    condition : Buffer
-        The condition buffer.
-
-    out : Buffer
-        The output buffer.
-
-    Returns
-    -------
-    stmt : Stmt
-        The result IR statement.
-    """
-    ib = tvm.tir.ir_builder.create()
-    a0 = condition.shape[0]
-    a1 = condition.shape[1]
-    a2 = condition.shape[2]
-    a3 = condition.shape[3]
-    a4 = condition.shape[4]
-    s1 = a3 * a4
-    s2 = a2 * s1
-    s3 = a1 * s2
-    s0 = a0 * s3
-
-    condition = ib.buffer_ptr(condition)
-    out = ib.buffer_ptr(out)
-
-    valid_index = ib.allocate("int32", (1,), name="valid_index", scope="local")
-    tmp = ib.allocate("int32", (1,), name="tmp", scope="local")
-    one_count = tvm.tir.const(1, dtype="int32")
-
-    max_threads = int(tvm.target.Target.current(allow_none=False).max_num_threads)
-    nthread_tx = max_threads
 
-    # Limit threads to a single block to make sure atomic_add works normally.
-    tx = te.thread_axis("threadIdx.x")
-    ib.scope_attr(tx, "thread_extent", nthread_tx)
-    len_inner_for = s0 // nthread_tx + 1
+    def do_write(out, write_index, idx):
+        _, a1, a2, a3 = condition.shape
+        s1 = a2 * a3
+        s2 = a1 * s1
+        out[write_index * 4] = fdiv(idx, s2)
+        out[write_index * 4 + 1] = fdiv(fmod(idx, s2), s1)
+        out[write_index * 4 + 2] = fdiv(fmod(idx, s1), a3)
+        out[write_index * 4 + 3] = fmod(idx, a3)
 
-    fdiv = tvm.tir.floordiv
-    fmod = tvm.tir.floormod
-
-    valid_index[0] = 0
-
-    with ib.for_range(0, len_inner_for, name="i") as i:
-        idx = tx * len_inner_for + i
-        with ib.if_scope(idx < s0):
-            with ib.if_scope(condition[idx] != 0):
-                tmp[0] = atomic_add(
-                    tvm.tir.call_intrin("handle", "tir.address_of", valid_index[0]),
-                    one_count,
-                )
-                out[tmp[0] * 5] = fdiv(idx, s3)
-                out[tmp[0] * 5 + 1] = fdiv(fmod(idx, s3), s2)
-                out[tmp[0] * 5 + 2] = fdiv(fmod(idx, s2), s1)
-                out[tmp[0] * 5 + 3] = fdiv(fmod(idx, s1), a4)
-                out[tmp[0] * 5 + 4] = fmod(idx, a4)
-
-    return ib.get()
+    return argwhere_common(output_shape, condition, do_write)
 
 
 def argwhere_5d(output_shape, condition):
@@ -556,42 +249,19 @@ def argwhere_5d(output_shape, condition):
     stmt : Stmt
         The result IR statement.
     """
-    condition_buf = tvm.tir.decl_buffer(
-        condition.shape, condition.dtype, "data_buf", data_alignment=8
-    )
-    out_buf = tvm.tir.decl_buffer(output_shape, "int32", "out_buf", data_alignment=8)
 
-    out = te.extern(
-        [output_shape],
-        [condition],
-        lambda ins, outs: argwhere_5d_ir(ins[0], outs[0]),
-        dtype=["int32"],
-        in_buffers=[condition_buf],
-        out_buffers=[out_buf],
-        name="argwhere_5d",
-        tag="argwhere5d_gpu",
-    )
-
-    if isinstance(out.shape[0], (int, tvm.tir.expr.IntImm)) and int(out.shape[0]) <= 1:
-        return out
-
-    # sort the output from the least significant to the most significant
-    # column.
-    sort_func = _get_sort_func(1)
-    if isinstance(out.shape[0], (int, tvm.tir.expr.IntImm)):
-        for i in reversed(range(5)):
-            out1 = strided_slice(out, [0, i], [out.shape[0], i + 1])
-            out2 = sort_func(out1, axis=0, dtype="int32")
-            out3 = squeeze(out2)
-            out = adv_index(out, [out3])
-    else:
-        for i in reversed(range(5)):
-            out1 = strided_slice(out, [0, i], [out.shape[0], i + 1], [1, 1])
-            out2 = sort_func(out1, axis=0, dtype="int32")
-            out3 = squeeze(out2)
-            out = adv_index(out, [out3])
-
-    return out
+    def do_write(out, write_index, idx):
+        _, a1, a2, a3, a4 = condition.shape
+        s1 = a3 * a4
+        s2 = a2 * s1
+        s3 = a1 * s2
+        out[write_index * 5] = fdiv(idx, s3)
+        out[write_index * 5 + 1] = fdiv(fmod(idx, s3), s2)
+        out[write_index * 5 + 2] = fdiv(fmod(idx, s2), s1)
+        out[write_index * 5 + 3] = fdiv(fmod(idx, s1), a4)
+        out[write_index * 5 + 4] = fmod(idx, a4)
+
+    return argwhere_common(output_shape, condition, do_write)
 
 
 def argwhere(output_shape, condition):

From 20e03bc5120063e7e3da6b01c9d51d5cc3147e7f Mon Sep 17 00:00:00 2001
From: Haichen Shen <shenhaichen@gmail.com>
Date: Thu, 21 Jan 2021 01:49:43 -0800
Subject: [PATCH 095/357] [COMMUNITY] @jwfromm -> Committer (#7316)

* [COMMUNITY] @jwfromm -> Committer

* add areas
---
 CONTRIBUTORS.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index 185202a9700c..bf10271b55e1 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -47,6 +47,7 @@ We do encourage everyone to work anything they are interested in.
 - [Wei Chen](https://github.com/wweic): @wweic - runtime, relay, vm
 - [Zhi Chen](https://github.com/zhiics) (PMC): @zhiics - relay, quantization, pass manager
 - [Chenfan](https://github.com/jcf94): @jcf94 - autoscheduling
+- [Josh Fromm](https://github.com/jwfromm): @jwfromm - frontends, quantization, topi
 - [Yuwei Hu](https://github.com/Huyuwei): @Huyuwei - topi, frontends
 - [Nick Hynes](https://github.com/nhynes): @nhynes: - sgx, rust
 - [Animesh Jain](https://github.com/anijain2305): @anijain2305 - quantization, relay

From 8524b28078928caf5c8ca82442ad0eab81dce838 Mon Sep 17 00:00:00 2001
From: Andrew Reusch <areusch@octoml.ai>
Date: Thu, 21 Jan 2021 07:56:42 -0800
Subject: [PATCH 096/357] =?UTF-8?q?[=C2=B5TVM]=20Add=20TVMPlatformGenerate?=
 =?UTF-8?q?Random,=20a=20non-cryptographic=20random=20number=20generator.?=
 =?UTF-8?q?=20(#7266)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* [uTVM] Add TVMPlatformGenerateRandom, and use with Session nonce.

 * This change is preparation to support autotuning in microTVM. It
   also cleans up a loose end in the microTVM RPC server
   implementation.
 * Randomness is needed in two places of the CRT:
    1. to initialize the Session nonce, which provides a more robust
       way to detect reboots and ensure that messages are not confused
       across them.
    2. to fill input tensors when timing AutoTVM operators (once
       AutoTVM support lands in the next PR).

 * This change adds TVMPlatformGenerateRandom, a platform function for
   generating non-cryptographic random data, to service those needs.
---
 include/tvm/runtime/crt/platform.h            | 19 ++++++++++++
 include/tvm/runtime/crt/rpc_common/session.h  | 10 ++++---
 src/runtime/crt/common/crt_runtime_api.c      |  5 ++++
 src/runtime/crt/host/main.cc                  | 15 ++++++++++
 src/runtime/crt/utvm_rpc_common/session.cc    |  5 +++-
 src/runtime/crt/utvm_rpc_server/rpc_server.cc | 12 ++++++--
 src/runtime/micro/micro_session.cc            | 30 +++++++++++++++++--
 tests/crt/session_test.cc                     | 14 +++++----
 tests/micro/qemu/zephyr-runtime/prj.conf      |  4 +++
 tests/micro/qemu/zephyr-runtime/src/main.c    | 21 +++++++++++++
 10 files changed, 118 insertions(+), 17 deletions(-)

diff --git a/include/tvm/runtime/crt/platform.h b/include/tvm/runtime/crt/platform.h
index 8e0383912f50..d1226e388f73 100644
--- a/include/tvm/runtime/crt/platform.h
+++ b/include/tvm/runtime/crt/platform.h
@@ -97,6 +97,25 @@ tvm_crt_error_t TVMPlatformTimerStart();
  */
 tvm_crt_error_t TVMPlatformTimerStop(double* elapsed_time_seconds);
 
+/*! \brief Fill a buffer with random data.
+ *
+ * Cryptographically-secure random data is NOT required. This function is intended for use
+ * cases such as filling autotuning input tensors and choosing the nonce used for microTVM RPC.
+ *
+ * This function does not need to be implemented for inference tasks. It is used only by
+ * AutoTVM and the RPC server. When not implemented, an internal weak-linked stub is provided.
+ *
+ * Please take care that across successive resets, this function returns different sequences of
+ * values. If e.g. the random number generator is seeded with the same value, it may make it
+ * difficult for a host to detect device resets during autotuning or host-driven inference.
+ *
+ * \param buffer Pointer to the 0th byte to write with random data. `num_bytes` of random data
+ * should be written here.
+ * \param num_bytes Number of bytes to write.
+ * \return kTvmErrorNoError if successful; a descriptive error code otherwise.
+ */
+tvm_crt_error_t TVMPlatformGenerateRandom(uint8_t* buffer, size_t num_bytes);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/include/tvm/runtime/crt/rpc_common/session.h b/include/tvm/runtime/crt/rpc_common/session.h
index 9e6a9f380554..eee1de6072d2 100644
--- a/include/tvm/runtime/crt/rpc_common/session.h
+++ b/include/tvm/runtime/crt/rpc_common/session.h
@@ -78,9 +78,9 @@ class Session {
   /*! \brief An invalid nonce value that typically indicates an unknown nonce. */
   static constexpr const uint8_t kInvalidNonce = 0;
 
-  Session(uint8_t initial_session_nonce, Framer* framer, FrameBuffer* receive_buffer,
-          MessageReceivedFunc message_received_func, void* message_received_func_context)
-      : local_nonce_{initial_session_nonce},
+  Session(Framer* framer, FrameBuffer* receive_buffer, MessageReceivedFunc message_received_func,
+          void* message_received_func_context)
+      : local_nonce_{kInvalidNonce},
         session_id_{0},
         state_{State::kReset},
         receiver_{this},
@@ -99,9 +99,11 @@ class Session {
 
   /*!
    * \brief Send a session terminate message, usually done at startup to interrupt a hanging remote.
+   * \param initial_session_nonce Initial nonce that should be used on the first session start
+   *      message. Callers should ensure this is different across device resets.
    * \return kTvmErrorNoError on success, or an error code otherwise.
    */
-  tvm_crt_error_t Initialize();
+  tvm_crt_error_t Initialize(uint8_t initial_session_nonce);
 
   /*!
    * \brief Terminate any previously-established session.
diff --git a/src/runtime/crt/common/crt_runtime_api.c b/src/runtime/crt/common/crt_runtime_api.c
index 960f844652a9..bc47f995eac0 100644
--- a/src/runtime/crt/common/crt_runtime_api.c
+++ b/src/runtime/crt/common/crt_runtime_api.c
@@ -509,3 +509,8 @@ release_and_return : {
 }
   return err;
 }
+
+// Default implementation, overridden by the platform runtime.
+__attribute__((weak)) tvm_crt_error_t TVMPlatformGenerateRandom(uint8_t* buffer, size_t num_bytes) {
+  return kTvmErrorFunctionCallNotImplemented;
+}
diff --git a/src/runtime/crt/host/main.cc b/src/runtime/crt/host/main.cc
index 7db17f50ccbf..bf36deacb938 100644
--- a/src/runtime/crt/host/main.cc
+++ b/src/runtime/crt/host/main.cc
@@ -22,6 +22,7 @@
  * \brief main entry point for host subprocess-based CRT
  */
 #include <inttypes.h>
+#include <time.h>
 #include <tvm/runtime/c_runtime_api.h>
 #include <tvm/runtime/crt/logging.h>
 #include <tvm/runtime/crt/memory.h>
@@ -93,6 +94,20 @@ tvm_crt_error_t TVMPlatformTimerStop(double* elapsed_time_seconds) {
   g_utvm_timer_running = 0;
   return kTvmErrorNoError;
 }
+
+static_assert(RAND_MAX >= (1 << 8), "RAND_MAX is smaller than acceptable");
+unsigned int random_seed = 0;
+tvm_crt_error_t TVMPlatformGenerateRandom(uint8_t* buffer, size_t num_bytes) {
+  if (random_seed == 0) {
+    random_seed = (unsigned int)time(NULL);
+  }
+  for (size_t i = 0; i < num_bytes; ++i) {
+    int random = rand_r(&random_seed);
+    buffer[i] = (uint8_t)random;
+  }
+
+  return kTvmErrorNoError;
+}
 }
 
 uint8_t memory[512 * 1024];
diff --git a/src/runtime/crt/utvm_rpc_common/session.cc b/src/runtime/crt/utvm_rpc_common/session.cc
index 5930863da37a..e1e338e42825 100644
--- a/src/runtime/crt/utvm_rpc_common/session.cc
+++ b/src/runtime/crt/utvm_rpc_common/session.cc
@@ -95,7 +95,10 @@ tvm_crt_error_t Session::StartSession() {
   return to_return;
 }
 
-tvm_crt_error_t Session::Initialize() { return TerminateSession(); }
+tvm_crt_error_t Session::Initialize(uint8_t initial_session_nonce) {
+  local_nonce_ = initial_session_nonce;
+  return TerminateSession();
+}
 
 tvm_crt_error_t Session::TerminateSession() {
   SetSessionId(0, 0);
diff --git a/src/runtime/crt/utvm_rpc_server/rpc_server.cc b/src/runtime/crt/utvm_rpc_server/rpc_server.cc
index 074799c44b1d..0b9e96cd660f 100644
--- a/src/runtime/crt/utvm_rpc_server/rpc_server.cc
+++ b/src/runtime/crt/utvm_rpc_server/rpc_server.cc
@@ -112,7 +112,7 @@ class MicroRPCServer {
                  utvm_rpc_channel_write_t write_func, void* write_func_ctx)
       : receive_buffer_{receive_storage, receive_storage_size_bytes},
         framer_{&send_stream_},
-        session_{0xa5, &framer_, &receive_buffer_, &HandleCompleteMessageCb, this},
+        session_{&framer_, &receive_buffer_, &HandleCompleteMessageCb, this},
         io_{&session_, &receive_buffer_},
         unframer_{session_.Receiver()},
         rpc_server_{&io_},
@@ -120,7 +120,13 @@ class MicroRPCServer {
 
   void* operator new(size_t count, void* ptr) { return ptr; }
 
-  void Initialize() { CHECK_EQ(kTvmErrorNoError, session_.Initialize(), "rpc server init"); }
+  void Initialize() {
+    uint8_t initial_session_nonce = Session::kInvalidNonce;
+    tvm_crt_error_t error =
+        TVMPlatformGenerateRandom(&initial_session_nonce, sizeof(initial_session_nonce));
+    CHECK_EQ(kTvmErrorNoError, error, "generating random session id");
+    CHECK_EQ(kTvmErrorNoError, session_.Initialize(initial_session_nonce), "rpc server init");
+  }
 
   /*! \brief Process one message from the receive buffer, if possible.
    *
@@ -242,7 +248,7 @@ void TVMLogf(const char* format, ...) {
   } else {
     tvm::runtime::micro_rpc::SerialWriteStream write_stream;
     tvm::runtime::micro_rpc::Framer framer{&write_stream};
-    tvm::runtime::micro_rpc::Session session{0xa5, &framer, nullptr, nullptr, nullptr};
+    tvm::runtime::micro_rpc::Session session{&framer, nullptr, nullptr, nullptr};
     tvm_crt_error_t err =
         session.SendMessage(tvm::runtime::micro_rpc::MessageType::kLog,
                             reinterpret_cast<uint8_t*>(log_buffer), num_bytes_logged);
diff --git a/src/runtime/micro/micro_session.cc b/src/runtime/micro/micro_session.cc
index ceaa5dd6245b..f26a717dae33 100644
--- a/src/runtime/micro/micro_session.cc
+++ b/src/runtime/micro/micro_session.cc
@@ -105,7 +105,7 @@ class MicroTransportChannel : public RPCChannel {
         write_stream_{fsend, session_start_timeout},
         framer_{&write_stream_},
         receive_buffer_{new uint8_t[TVM_CRT_MAX_PACKET_SIZE_BYTES], TVM_CRT_MAX_PACKET_SIZE_BYTES},
-        session_{0x5c, &framer_, &receive_buffer_, &HandleMessageReceivedCb, this},
+        session_{&framer_, &receive_buffer_, &HandleMessageReceivedCb, this},
         unframer_{session_.Receiver()},
         did_receive_message_{false},
         frecv_{frecv},
@@ -161,13 +161,35 @@ class MicroTransportChannel : public RPCChannel {
     }
   }
 
+  static constexpr const int kNumRandRetries = 10;
+  static std::atomic<unsigned int> random_seed;
+
+  inline uint8_t GenerateRandomNonce() {
+    // NOTE: this is bad concurrent programming but in practice we don't really expect race
+    // conditions here, and even if they occur we don't particularly care whether a competing
+    // process computes a different random seed. This value is just chosen pseudo-randomly to
+    // form an initial distinct session id. Here we just want to protect against bad loads causing
+    // confusion.
+    unsigned int seed = random_seed.load();
+    if (seed == 0) {
+      seed = (unsigned int)time(NULL);
+    }
+    uint8_t initial_nonce = 0;
+    for (int i = 0; i < kNumRandRetries && initial_nonce == 0; ++i) {
+      initial_nonce = rand_r(&seed);
+    }
+    random_seed.store(seed);
+    ICHECK_NE(initial_nonce, 0) << "rand() does not seem to be producing random values";
+    return initial_nonce;
+  }
+
   bool StartSessionInternal() {
     using ::std::chrono::duration_cast;
     using ::std::chrono::microseconds;
     using ::std::chrono::steady_clock;
 
     steady_clock::time_point start_time = steady_clock::now();
-    ICHECK_EQ(kTvmErrorNoError, session_.Initialize());
+    ICHECK_EQ(kTvmErrorNoError, session_.Initialize(GenerateRandomNonce()));
     ICHECK_EQ(kTvmErrorNoError, session_.StartSession());
 
     if (session_start_timeout_ == microseconds::zero() &&
@@ -198,7 +220,7 @@ class MicroTransportChannel : public RPCChannel {
       }
       end_time += session_start_retry_timeout_;
 
-      ICHECK_EQ(kTvmErrorNoError, session_.Initialize());
+      ICHECK_EQ(kTvmErrorNoError, session_.Initialize(GenerateRandomNonce()));
       ICHECK_EQ(kTvmErrorNoError, session_.StartSession());
     }
 
@@ -365,6 +387,8 @@ class MicroTransportChannel : public RPCChannel {
   std::string pending_chunk_;
 };
 
+std::atomic<unsigned int> MicroTransportChannel::random_seed{0};
+
 TVM_REGISTER_GLOBAL("micro._rpc_connect").set_body([](TVMArgs args, TVMRetValue* rv) {
   MicroTransportChannel* micro_channel =
       new MicroTransportChannel(args[1], args[2], ::std::chrono::microseconds(uint64_t(args[3])),
diff --git a/tests/crt/session_test.cc b/tests/crt/session_test.cc
index a1d57fcb5436..60686be25060 100644
--- a/tests/crt/session_test.cc
+++ b/tests/crt/session_test.cc
@@ -55,8 +55,9 @@ class TestSession {
   TestSession(uint8_t initial_nonce)
       : framer{&framer_write_stream},
         receive_buffer{receive_buffer_array, sizeof(receive_buffer_array)},
-        sess{initial_nonce, &framer, &receive_buffer, TestSessionMessageReceivedThunk, this},
-        unframer{sess.Receiver()} {}
+        sess{&framer, &receive_buffer, TestSessionMessageReceivedThunk, this},
+        unframer{sess.Receiver()},
+        initial_nonce{initial_nonce} {}
 
   void WriteTo(TestSession* other) {
     auto framer_buffer = framer_write_stream.BufferContents();
@@ -84,6 +85,7 @@ class TestSession {
   FrameBuffer receive_buffer;
   Session sess;
   Unframer unframer;
+  uint8_t initial_nonce;
 };
 
 #define EXPECT_FRAMED_PACKET(session, expected)          \
@@ -126,14 +128,14 @@ class SessionTest : public ::testing::Test {
 
 TEST_F(SessionTest, NormalExchange) {
   tvm_crt_error_t err;
-  err = alice_.sess.Initialize();
+  err = alice_.sess.Initialize(alice_.initial_nonce);
   EXPECT_EQ(kTvmErrorNoError, err);
   EXPECT_FRAMED_PACKET(alice_,
                        "\xfe\xff\xfd\x03\0\0\0\0\0\x02"
                        "fw");
   alice_.WriteTo(&bob_);
 
-  err = bob_.sess.Initialize();
+  err = bob_.sess.Initialize(bob_.initial_nonce);
   EXPECT_EQ(kTvmErrorNoError, err);
   EXPECT_FRAMED_PACKET(bob_,
                        "\xfe\xff\xfd\x03\0\0\0\0\0\x02"
@@ -212,14 +214,14 @@ static constexpr const char kBobStartPacket[] = "\xff\xfd\x04\0\0\0f\0\0\x01`\xa
 
 TEST_F(SessionTest, DoubleStart) {
   tvm_crt_error_t err;
-  err = alice_.sess.Initialize();
+  err = alice_.sess.Initialize(alice_.initial_nonce);
   EXPECT_EQ(kTvmErrorNoError, err);
   EXPECT_FRAMED_PACKET(alice_,
                        "\xfe\xff\xfd\x03\0\0\0\0\0\x02"
                        "fw");
   alice_.WriteTo(&bob_);
 
-  err = bob_.sess.Initialize();
+  err = bob_.sess.Initialize(bob_.initial_nonce);
   EXPECT_EQ(kTvmErrorNoError, err);
   EXPECT_FRAMED_PACKET(bob_,
                        "\xfe\xff\xfd\x03\0\0\0\0\0\x02"
diff --git a/tests/micro/qemu/zephyr-runtime/prj.conf b/tests/micro/qemu/zephyr-runtime/prj.conf
index cebb55756e8c..7be42b260bbb 100644
--- a/tests/micro/qemu/zephyr-runtime/prj.conf
+++ b/tests/micro/qemu/zephyr-runtime/prj.conf
@@ -29,3 +29,7 @@ CONFIG_FPU=y
 
 # For TVMPlatformAbort().
 CONFIG_REBOOT=y
+
+# For TVMPlatformGenerateRandom(). Remember, these values do not need to be truly random.
+CONFIG_TEST_RANDOM_GENERATOR=y
+CONFIG_TIMER_RANDOM_GENERATOR=y
diff --git a/tests/micro/qemu/zephyr-runtime/src/main.c b/tests/micro/qemu/zephyr-runtime/src/main.c
index 9d10504dcbed..e04fc20508b4 100644
--- a/tests/micro/qemu/zephyr-runtime/src/main.c
+++ b/tests/micro/qemu/zephyr-runtime/src/main.c
@@ -26,6 +26,7 @@
 #include <drivers/uart.h>
 #include <kernel.h>
 #include <power/reboot.h>
+#include <random/rand32.h>
 #include <stdio.h>
 #include <sys/printk.h>
 #include <sys/ring_buffer.h>
@@ -161,6 +162,26 @@ tvm_crt_error_t TVMPlatformTimerStop(double* elapsed_time_seconds) {
   return kTvmErrorNoError;
 }
 
+tvm_crt_error_t TVMPlatformGenerateRandom(uint8_t* buffer, size_t num_bytes) {
+  uint32_t random;  // one unit of random data.
+
+  // Fill parts of `buffer` which are as large as `random`.
+  size_t num_full_blocks = num_bytes / sizeof(random);
+  for (int i = 0; i < num_full_blocks; ++i) {
+    random = sys_rand32_get();
+    memcpy(&buffer[i * sizeof(random)], &random, sizeof(random));
+  }
+
+  // Fill any leftover tail which is smaller than `random`.
+  size_t num_tail_bytes = num_bytes % sizeof(random);
+  if (num_tail_bytes > 0) {
+    random = sys_rand32_get();
+    memcpy(&buffer[num_bytes - num_tail_bytes], &random, num_tail_bytes);
+  }
+
+  return kTvmErrorNoError;
+}
+
 #define RING_BUF_SIZE 512
 struct uart_rx_buf_t {
   struct ring_buf buf;

From fc9e264954349ea40d029bd9b8b8d0127864d28e Mon Sep 17 00:00:00 2001
From: Joshua Chia <joshchia@gmail.com>
Date: Fri, 22 Jan 2021 06:17:00 +0800
Subject: [PATCH 097/357] Made tensorflow IsNan actually work (#7320)

* Made tensorflow IsNan actually work

IsNan was added to tensorflow.rst in fa1b859f but this commit makes IsNan actually work

* Added test case for tensorflow.is_nan
---
 python/tvm/relay/frontend/tensorflow.py          | 1 +
 tests/python/frontend/tensorflow/test_forward.py | 4 ++++
 2 files changed, 5 insertions(+)

diff --git a/python/tvm/relay/frontend/tensorflow.py b/python/tvm/relay/frontend/tensorflow.py
index d5746a38582c..2c7361a7d813 100644
--- a/python/tvm/relay/frontend/tensorflow.py
+++ b/python/tvm/relay/frontend/tensorflow.py
@@ -2357,6 +2357,7 @@ def _impl(inputs, attr, params, mod):
     "Identity": _identity(),
     "IsFinite": AttrCvt("isfinite"),
     "IsInf": AttrCvt("isinf"),
+    "IsNan": AttrCvt("isnan"),
     "LeakyRelu": AttrCvt("leaky_relu"),
     "LeftShift": AttrCvt("left_shift"),
     "Less": _broadcast("less"),
diff --git a/tests/python/frontend/tensorflow/test_forward.py b/tests/python/frontend/tensorflow/test_forward.py
index d71405796ede..3c30b6662c81 100644
--- a/tests/python/frontend/tensorflow/test_forward.py
+++ b/tests/python/frontend/tensorflow/test_forward.py
@@ -4178,6 +4178,10 @@ def test_forward_isfinite():
     _verify_infiniteness_ops(tf.is_finite, "isfinite")
 
 
+def test_forward_isnan():
+    _verify_infiniteness_ops(tf.is_nan, "isnan")
+
+
 def _test_spop_placeholder_without_shape_info():
     with tf.Graph().as_default():
 

From 7b6a1a7bcaa403b1c277a494e58774dc36b38326 Mon Sep 17 00:00:00 2001
From: Matthew Brookhart <mbrookhart@octoml.ai>
Date: Thu, 21 Jan 2021 17:09:19 -0700
Subject: [PATCH 098/357] Fix an issue with dynamic functions overwritting call
 arg types (#7295)

* Fix an issue with dynamic functions overwritting call arg types

* fix a bug for un-annotated inputs

* normalize names in TypeSolver::Unifier

* fix name normalization
---
 src/relay/analysis/type_solver.cc     | 18 ++++++++++--------
 src/relay/analysis/type_solver.h      |  3 ++-
 src/relay/transforms/type_infer.cc    | 12 ++++++------
 tests/python/relay/test_type_infer.py | 14 ++++++++++++++
 4 files changed, 32 insertions(+), 15 deletions(-)

diff --git a/src/relay/analysis/type_solver.cc b/src/relay/analysis/type_solver.cc
index 64db13acbac0..cc1ada677c65 100644
--- a/src/relay/analysis/type_solver.cc
+++ b/src/relay/analysis/type_solver.cc
@@ -102,11 +102,12 @@ class TypeSolver::Unifier : public TypeFunctor<Type(const Type&, const Type&)> {
  public:
   explicit Unifier(TypeSolver* solver, const Span& span) : solver_(solver), span(span) {}
 
-  Type Unify(const Type& src, const Type& dst) {
+  Type Unify(const Type& lhs_type, const Type& rhs_type, bool assign_lhs = true,
+             bool assign_rhs = true) {
     // Known limitation
     // - handle shape pattern matching
-    TypeNode* lhs = solver_->GetTypeNode(dst);
-    TypeNode* rhs = solver_->GetTypeNode(src);
+    TypeNode* lhs = solver_->GetTypeNode(lhs_type);
+    TypeNode* rhs = solver_->GetTypeNode(rhs_type);
 
     // do occur check so we don't create self-referencing structure
     if (lhs->FindRoot() == rhs->FindRoot()) {
@@ -127,7 +128,7 @@ class TypeSolver::Unifier : public TypeFunctor<Type(const Type&, const Type&)> {
       solver_->MergeFromTo(rhs, lhs);
       return lhs->resolved_type;
     } else {
-      Type resolved = this->VisitType(lhs->resolved_type, rhs->resolved_type);
+      Type resolved = this->VisitType(rhs->resolved_type, lhs->resolved_type);
 
       if (!resolved.defined()) {
         solver_->diag_ctx_.Emit(
@@ -139,8 +140,8 @@ class TypeSolver::Unifier : public TypeFunctor<Type(const Type&, const Type&)> {
         return lhs->resolved_type;
       } else {
         TypeNode* top = solver_->GetTypeNode(resolved);
-        solver_->MergeFromTo(lhs, top);
-        solver_->MergeFromTo(rhs, top);
+        if (assign_lhs) solver_->MergeFromTo(lhs, top);
+        if (assign_rhs) solver_->MergeFromTo(rhs, top);
         return resolved;
       }
     }
@@ -549,9 +550,10 @@ void TypeSolver::MergeFromTo(TypeNode* src, TypeNode* dst) {
 }
 
 // Add equality constraint
-Type TypeSolver::Unify(const Type& dst, const Type& src, const Span& span) {
+Type TypeSolver::Unify(const Type& dst, const Type& src, const Span& span, bool assign_lhs,
+                       bool assign_rhs) {
   Unifier unifier(this, span);
-  return unifier.Unify(dst, src);
+  return unifier.Unify(dst, src, assign_lhs, assign_rhs);
 }
 
 // Add type constraint to the solver.
diff --git a/src/relay/analysis/type_solver.h b/src/relay/analysis/type_solver.h
index 4ae2e6a2b07b..56cea60ceeda 100644
--- a/src/relay/analysis/type_solver.h
+++ b/src/relay/analysis/type_solver.h
@@ -88,7 +88,8 @@ class TypeSolver {
    * \param rhs The right operand
    * \param location The location at which the unification problem arose.
    */
-  Type Unify(const Type& lhs, const Type& rhs, const Span& span);
+  Type Unify(const Type& lhs, const Type& rhs, const Span& span, bool assign_lhs = true,
+             bool assign_rhs = true);
   /*!
    * \brief Report a diagnostic.
    * \param diag The diagnostic to report.
diff --git a/src/relay/transforms/type_infer.cc b/src/relay/transforms/type_infer.cc
index 327b5d1e260a..921e83fdb092 100644
--- a/src/relay/transforms/type_infer.cc
+++ b/src/relay/transforms/type_infer.cc
@@ -162,9 +162,10 @@ class TypeInferencer : private ExprFunctor<Type(const Expr&)>,
 
   // Perform unification on two types and report the error at the expression
   // or the span of the expression.
-  Type Unify(const Type& t1, const Type& t2, const Span& span) {
+  Type Unify(const Type& t1, const Type& t2, const Span& span, bool assign_lhs = true,
+             bool assign_rhs = true) {
     try {
-      return solver_.Unify(t1, t2, span);
+      return solver_.Unify(t1, t2, span, assign_lhs, assign_rhs);
     } catch (const dmlc::Error& e) {
       this->EmitFatal(Diagnostic::Error(span)
                       << "Error unifying `" << t1 << "` and `" << t2 << "`: " << e.what());
@@ -495,7 +496,7 @@ class TypeInferencer : private ExprFunctor<Type(const Expr&)>,
     }
 
     for (size_t i = 0; i < fn_ty->arg_types.size(); i++) {
-      this->Unify(fn_ty->arg_types[i], arg_types[i], call->span);
+      this->Unify(fn_ty->arg_types[i], arg_types[i], call->span, true, false);
     }
 
     for (auto cs : fn_ty->type_constraints) {
@@ -526,6 +527,7 @@ class TypeInferencer : private ExprFunctor<Type(const Expr&)>,
       }
     }
 
+    solver_.Solve();
     return GeneralCall(call, arg_types);
   }
 
@@ -572,9 +574,7 @@ class TypeInferencer : private ExprFunctor<Type(const Expr&)>,
     return FuncType(c->inputs, TypeCall(c->belong_to, types), td->type_vars, {});
   }
 
-  void Solve() {
-    solver_.Solve();
-  }
+  void Solve() { solver_.Solve(); }
 };
 
 class TypeInferencer::Resolver : public MixedModeMutator, PatternMutator {
diff --git a/tests/python/relay/test_type_infer.py b/tests/python/relay/test_type_infer.py
index b518c31d3e62..e8179a37756c 100644
--- a/tests/python/relay/test_type_infer.py
+++ b/tests/python/relay/test_type_infer.py
@@ -402,6 +402,20 @@ def @main(%f: float32) -> float32 {
     tvm.ir.assert_structural_equal(mod["main"].body.type_args, [relay.TensorType((), "float32")])
 
 
+def test_dynamic_function():
+    dy_tt = relay.TensorType([relay.Any()], "float32")
+    s_tt = relay.TensorType([10], "float32")
+    x = relay.Var("x", dy_tt)
+    f = relay.Function([x], x + x)
+    y = relay.Var("y", s_tt)
+    c = f(y)
+
+    mod = tvm.IRModule()
+    mod["main"] = relay.Function([y], c)
+    mod = transform.InferType()(mod)
+    assert mod["main"].params[0].checked_type == s_tt
+
+
 if __name__ == "__main__":
     import sys
 

From 17ae44dbb8ce65bd72af1e2400d5796ed14277e5 Mon Sep 17 00:00:00 2001
From: Matthew Brookhart <mbrookhart@octoml.ai>
Date: Thu, 21 Jan 2021 19:05:25 -0700
Subject: [PATCH 099/357] add a shape function and dynamic test for round
 (#7324)

---
 python/tvm/relay/op/_tensor.py | 1 +
 tests/python/relay/test_any.py | 1 +
 2 files changed, 2 insertions(+)

diff --git a/python/tvm/relay/op/_tensor.py b/python/tvm/relay/op/_tensor.py
index 6fc423371325..7728d6e3efa4 100644
--- a/python/tvm/relay/op/_tensor.py
+++ b/python/tvm/relay/op/_tensor.py
@@ -235,6 +235,7 @@ def elemwise_shape_func(attrs, inputs, _):
 
 register_shape_func("cast", False, elemwise_shape_func)
 register_shape_func("cast_like", False, elemwise_shape_func)
+register_shape_func("round", False, elemwise_shape_func)
 register_shape_func("zeros", False, no_data_full_shape_func)
 register_shape_func("zeros_like", False, elemwise_shape_func)
 register_shape_func("ones", False, no_data_full_shape_func)
diff --git a/tests/python/relay/test_any.py b/tests/python/relay/test_any.py
index a537782355d2..0b575d120e8f 100644
--- a/tests/python/relay/test_any.py
+++ b/tests/python/relay/test_any.py
@@ -120,6 +120,7 @@ def test_any_elemwise():
     verify_any_elemwise((relay.Any(),), (3,), relay.sqrt, np.sqrt)
     verify_any_elemwise((relay.Any(), 2), (5, 2), relay.negative, np.negative)
     verify_any_elemwise((relay.Any(), relay.Any()), (5, 4), relay.exp, np.exp)
+    verify_any_elemwise((relay.Any(),), (3,), relay.round, np.round)
 
 
 @tvm.testing.uses_gpu

From 790344c6ef035947caaaf1cd812ade8d862802aa Mon Sep 17 00:00:00 2001
From: masahi <masahi129@gmail.com>
Date: Fri, 22 Jan 2021 11:28:16 +0900
Subject: [PATCH 100/357] relax tolerance for dlpack test (#7325)

---
 tests/python/contrib/test_dlpack.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/python/contrib/test_dlpack.py b/tests/python/contrib/test_dlpack.py
index 661e284c299f..6ff2529f7570 100644
--- a/tests/python/contrib/test_dlpack.py
+++ b/tests/python/contrib/test_dlpack.py
@@ -54,7 +54,7 @@ def test():
         f_pytorch = to_pytorch_func(f)
         zz2 = torch.empty(137, 137)
         f_pytorch(xx, yy, zz2)
-        tvm.testing.assert_allclose(zz.numpy(), zz2.numpy(), rtol=1e-6)
+        tvm.testing.assert_allclose(zz.numpy(), zz2.numpy(), rtol=1e-4, atol=1e-4)
 
     except ImportError:
         pass

From 6787d7494f8815bce9523906935169f6385b9d93 Mon Sep 17 00:00:00 2001
From: SebastianBoblestETAS
 <73823717+SebastianBoblestETAS@users.noreply.github.com>
Date: Fri, 22 Jan 2021 14:56:18 +0100
Subject: [PATCH 101/357] get_top_results works on a copy of output (#7327)

---
 python/tvm/driver/tvmc/runner.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/tvm/driver/tvmc/runner.py b/python/tvm/driver/tvmc/runner.py
index dec0e9842a37..87ea3be1436a 100644
--- a/python/tvm/driver/tvmc/runner.py
+++ b/python/tvm/driver/tvmc/runner.py
@@ -427,7 +427,7 @@ def get_top_results(outputs, max_results):
         The first row is the indices and the second is the values.
 
     """
-    output = outputs["output_0"]
+    output = np.copy(outputs["output_0"])
     sorted_labels = output.argsort()[0][-max_results:][::-1]
     output.sort()
     sorted_values = output[0][-max_results:][::-1]

From af9d1d24b73030d7742dfb01b0ac24c58e0ffac4 Mon Sep 17 00:00:00 2001
From: Luis Vega <vegaluisjose@users.noreply.github.com>
Date: Fri, 22 Jan 2021 13:48:10 -0800
Subject: [PATCH 102/357] [BYOC][Verilator] add support to dynamically load
 hardware library (#7286)

* add files

* remove import

* remove os import

* reorder header

* fix header order cpplint

* lint fix
---
 cmake/config.cmake                            |  4 +-
 cmake/modules/contrib/Verilator.cmake         |  8 +--
 .../backend/contrib/verilator/codegen.cc      | 30 +++++++-
 .../contrib/verilator/verilator_runtime.cc    | 69 ++++++++++++++++---
 .../contrib/test_verilator/infrastructure.py  | 39 ++++++++++-
 5 files changed, 129 insertions(+), 21 deletions(-)

diff --git a/cmake/config.cmake b/cmake/config.cmake
index cd0f4b8e75e9..872feb918a4f 100644
--- a/cmake/config.cmake
+++ b/cmake/config.cmake
@@ -232,8 +232,8 @@ set(USE_TENSORRT_RUNTIME OFF)
 # Whether use VITIS-AI codegen
 set(USE_VITIS_AI OFF)
 
-# Build Verilator codegen and runtime, example located in 3rdparty/vta-hw/apps/verilator
-set(USE_VERILATOR_HW OFF)
+# Build Verilator codegen and runtime
+set(USE_VERILATOR OFF)
 
 # Build ANTLR parser for Relay text format
 # Possible values:
diff --git a/cmake/modules/contrib/Verilator.cmake b/cmake/modules/contrib/Verilator.cmake
index d3c1a7161182..4947d44064a0 100644
--- a/cmake/modules/contrib/Verilator.cmake
+++ b/cmake/modules/contrib/Verilator.cmake
@@ -15,14 +15,10 @@
 # specific language governing permissions and limitations
 # under the License.
 
-if(USE_VERILATOR_HW STREQUAL "ON")
-  execute_process(COMMAND make --directory ${CMAKE_CURRENT_SOURCE_DIR}/3rdparty/vta-hw/apps/verilator)
+if(USE_VERILATOR STREQUAL "ON")
   file(GLOB VERILATOR_RELAY_CONTRIB_SRC src/relay/backend/contrib/verilator/codegen.cc)
-  list(APPEND COMPILER_SRCS ${VERILATOR_RELAY_CONTRIB_SRC})
-  list(APPEND COMPILER_SRCS ${JSON_RELAY_CONTRIB_SRC})
-  find_library(EXTERN_LIBRARY_VERILATOR NAMES verilator PATHS ${CMAKE_CURRENT_SOURCE_DIR}/3rdparty/vta-hw/apps/verilator)
-  list(APPEND TVM_RUNTIME_LINKER_LIBS ${EXTERN_LIBRARY_VERILATOR})
   file(GLOB VERILATOR_CONTRIB_SRC src/runtime/contrib/verilator/verilator_runtime.cc)
+  list(APPEND COMPILER_SRCS ${VERILATOR_RELAY_CONTRIB_SRC})
   list(APPEND RUNTIME_SRCS ${VERILATOR_CONTRIB_SRC})
 endif()
 
diff --git a/src/relay/backend/contrib/verilator/codegen.cc b/src/relay/backend/contrib/verilator/codegen.cc
index 4124fa2459d6..f1c7f785330e 100644
--- a/src/relay/backend/contrib/verilator/codegen.cc
+++ b/src/relay/backend/contrib/verilator/codegen.cc
@@ -43,6 +43,7 @@ namespace contrib {
 
 using namespace backend;
 
+/*! \brief Verilator JSON serializer */
 class VerilatorJSONSerializer : public backend::contrib::JSONSerializer {
   using JSONGraphNode = tvm::runtime::json::JSONGraphNode;
   using JSONGraphNodeEntry = tvm::runtime::json::JSONGraphNodeEntry;
@@ -74,6 +75,24 @@ class VerilatorJSONSerializer : public backend::contrib::JSONSerializer {
   }
 };
 
+/*! \brief Attributes to store the compiler options for Verilator */
+struct VerilatorCompilerConfigNode : public tvm::AttrsNode<VerilatorCompilerConfigNode> {
+  String lib;
+
+  TVM_DECLARE_ATTRS(VerilatorCompilerConfigNode, "ext.attrs.VerilatorCompilerConfigNode") {
+    TVM_ATTR_FIELD(lib).set_default("libverilator.so");
+  }
+};
+
+class VerilatorCompilerConfig : public Attrs {
+ public:
+  TVM_DEFINE_NOTNULLABLE_OBJECT_REF_METHODS(VerilatorCompilerConfig, Attrs,
+                                            VerilatorCompilerConfigNode);
+};
+
+TVM_REGISTER_NODE_TYPE(VerilatorCompilerConfigNode);
+TVM_REGISTER_PASS_CONFIG_OPTION("relay.ext.verilator.options", VerilatorCompilerConfig);
+
 /*!
  * \brief The external compiler/codegen tool. It takes a Relay expression/module and
  * compile it into a runtime module.
@@ -87,9 +106,18 @@ runtime::Module VerilatorCompiler(const ObjectRef& ref) {
   std::string graph_json = serializer.GetJSON();
   auto params = serializer.GetParams();
 
+  // Get Verilator compiler options
+  auto ctx = transform::PassContext::Current();
+  auto cfg = ctx->GetConfig<VerilatorCompilerConfig>("relay.ext.verilator.options");
+  if (!cfg.defined()) {
+    cfg = AttrsWithDefaultValues<VerilatorCompilerConfig>();
+  }
+
+  auto lib_name = cfg.value()->lib;
+
   const auto* pf = runtime::Registry::Get("runtime.VerilatorJSONRuntimeCreate");
   CHECK(pf != nullptr) << "Cannot find JSON runtime module to create";
-  auto mod = (*pf)(func_name, graph_json, params);
+  auto mod = (*pf)(lib_name, func_name, graph_json, params);
   return mod;
 }
 
diff --git a/src/runtime/contrib/verilator/verilator_runtime.cc b/src/runtime/contrib/verilator/verilator_runtime.cc
index a44faf6d3274..ae52d9e1e08d 100644
--- a/src/runtime/contrib/verilator/verilator_runtime.cc
+++ b/src/runtime/contrib/verilator/verilator_runtime.cc
@@ -22,6 +22,7 @@
  * \brief A simple JSON runtime for Verilator.
  */
 
+#include <dlfcn.h>
 #include <tvm/runtime/ndarray.h>
 #include <tvm/runtime/registry.h>
 
@@ -29,6 +30,7 @@
 #include <string>
 #include <vector>
 
+#include "../../library_module.h"
 #include "../json/json_node.h"
 #include "../json/json_runtime.h"
 #include "verilator_device.h"
@@ -38,9 +40,40 @@ namespace tvm {
 namespace runtime {
 namespace contrib {
 
+typedef VerilatorHandle (*VerilatorAllocFunc)();
+typedef void (*VerilatorResetFunc)(VerilatorHandle, int);
+typedef void (*VerilatorAddFunc)(VerilatorHandle, int*, int*, int*, int, int);
+
 using namespace tvm::runtime;
 using namespace tvm::runtime::json;
 
+class VerilatorLibrary : public Library {
+ public:
+  ~VerilatorLibrary() {
+    if (lib_handle_) Unload();
+  }
+  void Init(const std::string& name) { Load(name); }
+
+  void* GetSymbol(const char* name) final { return GetSymbol_(name); }
+
+ private:
+  // Library handle
+  void* lib_handle_{nullptr};
+  // load the library
+  void Load(const std::string& name) {
+    lib_handle_ = dlopen(name.c_str(), RTLD_LAZY | RTLD_LOCAL);
+    ICHECK(lib_handle_ != nullptr)
+        << "Failed to load dynamic shared library " << name << " " << dlerror();
+  }
+
+  void* GetSymbol_(const char* name) { return dlsym(lib_handle_, name); }
+
+  void Unload() {
+    dlclose(lib_handle_);
+    lib_handle_ = nullptr;
+  }
+};
+
 class VerilatorJSONRuntime : public JSONRuntimeBase {
  public:
   VerilatorJSONRuntime(const std::string& symbol_name, const std::string& graph_json,
@@ -49,8 +82,25 @@ class VerilatorJSONRuntime : public JSONRuntimeBase {
 
   const char* type_key() const { return "verilator_json"; }
 
+  void LoadLibrary(const std::string& lib_name) {
+    lib_ = new VerilatorLibrary();
+    lib_->Init(lib_name);
+  }
+
   void Init(const Array<NDArray>& consts) override {
-    BuildEngine();
+    // get symbols
+    auto alloc_func = reinterpret_cast<VerilatorAllocFunc>(lib_->GetSymbol("VerilatorAlloc"));
+    ICHECK(alloc_func != nullptr);
+    auto reset_func = reinterpret_cast<VerilatorResetFunc>(lib_->GetSymbol("VerilatorReset"));
+    ICHECK(reset_func != nullptr);
+    vadd_func_ = reinterpret_cast<VerilatorAddFunc>(lib_->GetSymbol("verilator_add"));
+    ICHECK(vadd_func_ != nullptr);
+
+    // alloc device
+    device_ = (*alloc_func)();
+
+    // reset for 10 cycles
+    (*reset_func)(device_, 10);
 
     CHECK_EQ(consts.size(), const_idx_.size())
         << "The number of input constants must match the number of required.";
@@ -80,7 +130,7 @@ class VerilatorJSONRuntime : public JSONRuntimeBase {
         if ("add" == op_name) {
           auto entry = node.GetInputs()[0];
           auto shape = nodes_[entry.id_].GetOpShape()[entry.index_];
-          verilator_add(device_, in_ptr[0], in_ptr[1], out_ptr[0], shape[0], shape[1]);
+          (*vadd_func_)(device_, in_ptr[0], in_ptr[1], out_ptr[0], shape[0], shape[1]);
         } else {
           LOG(FATAL) << "Unsupported op: " << op_name;
         }
@@ -89,19 +139,18 @@ class VerilatorJSONRuntime : public JSONRuntimeBase {
   }
 
  private:
-  void BuildEngine() {
-    device_ = VerilatorAlloc();
-    // reset for 10 cycles
-    VerilatorReset(device_, 10);
-  }
-
-  /* The verilator handle. */
+  /* The verilator device handle. */
   VerilatorHandle device_{nullptr};
+  /* The verilator library handle. */
+  VerilatorLibrary* lib_{nullptr};
+  /* The verilator add function handle */
+  VerilatorAddFunc vadd_func_{nullptr};
 };
 
-runtime::Module VerilatorJSONRuntimeCreate(String symbol_name, String graph_json,
+runtime::Module VerilatorJSONRuntimeCreate(String lib_name, String symbol_name, String graph_json,
                                            const Array<String>& const_names) {
   auto n = make_object<VerilatorJSONRuntime>(symbol_name, graph_json, const_names);
+  n->LoadLibrary(lib_name);
   return runtime::Module(n);
 }
 
diff --git a/tests/python/contrib/test_verilator/infrastructure.py b/tests/python/contrib/test_verilator/infrastructure.py
index 1333f484aec9..e8fd943aa8a0 100644
--- a/tests/python/contrib/test_verilator/infrastructure.py
+++ b/tests/python/contrib/test_verilator/infrastructure.py
@@ -16,7 +16,9 @@
 # under the License.
 """Verilator utility functions"""
 
+import os
 import sys
+import subprocess as sp
 
 import tvm
 from tvm import relay
@@ -66,10 +68,43 @@ def offload(mod):
     return mod
 
 
+def verilator_app_path():
+    """Find verilator hardware app path"""
+
+    cur_dir = os.path.dirname(os.path.realpath(__file__))
+    return os.path.join(
+        cur_dir,
+        "..",
+        "..",
+        "..",
+        "..",
+        "3rdparty",
+        "vta-hw",
+        "apps",
+        "verilator",
+    )
+
+
+def compile_hardware():
+    """Compile hardware into shared library"""
+
+    cmd = []
+    cmd.append("make")
+    cmd.append("--directory")
+    cmd.append(verilator_app_path())
+    sp.run(cmd, check=True)
+
+
 def compile_module(mod):
-    """Compile Relay module"""
+    """Compile Relay module and hardware library"""
+
+    lib = os.path.join(verilator_app_path(), "libverilator.so")
+    if not os.path.isfile(lib):
+        compile_hardware()
 
-    with relay.build_config(opt_level=3):
+    with tvm.transform.PassContext(
+        opt_level=3, config={"relay.ext.verilator.options": {"lib": lib}}
+    ):
         exe = relay.vm.compile(mod, target="llvm", params=None)
         code, lib = exe.save()
         return runtime.vm.Executable.load_exec(code, lib)

From 3ec67f0b4b871be7b174d71dc1b0f60bed984d22 Mon Sep 17 00:00:00 2001
From: Lianmin Zheng <lianminzheng@gmail.com>
Date: Fri, 22 Jan 2021 15:56:53 -0800
Subject: [PATCH 103/357] [AutoScheduler] Fix conv3d's op strategy for
 auto-scheduler (#7328)

---
 python/tvm/relay/op/strategy/x86.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/tvm/relay/op/strategy/x86.py b/python/tvm/relay/op/strategy/x86.py
index 9e3e191b2f2b..edfaaeefc5df 100644
--- a/python/tvm/relay/op/strategy/x86.py
+++ b/python/tvm/relay/op/strategy/x86.py
@@ -304,7 +304,7 @@ def conv3d_strategy_cpu(attrs, inputs, out_type, target):
         # or packed layouts.
         if layout == "NCDHW":
             strategy.add_implementation(
-                wrap_compute_conv3d(topi.nn.conv3d_ncdhw, need_auto_scheduler_layout=True),
+                wrap_compute_conv3d(topi.nn.conv3d_ncdhw),
                 naive_schedule,
                 name="conv3d_ncdhw.x86",
             )

From e889defc623f4e76589926fb89c58b5f5b5e66c8 Mon Sep 17 00:00:00 2001
From: Matthew Brookhart <mbrookhart@octoml.ai>
Date: Sat, 23 Jan 2021 01:37:21 -0700
Subject: [PATCH 104/357] [PatternLang] Add a relay LetPattern (#7332)

* Add a relay LetPattern

* fix If copy

Co-authored-by: Cody Yu <comaniac0422@gmail.com>

* fix If copy

Co-authored-by: Cody Yu <comaniac0422@gmail.com>

Co-authored-by: Cody Yu <comaniac0422@gmail.com>
---
 docs/langref/relay_pattern.rst                | 29 ++++++++++++
 include/tvm/relay/dataflow_pattern.h          | 36 +++++++++++++++
 include/tvm/relay/dataflow_pattern_functor.h  | 11 +++--
 python/tvm/relay/dataflow_pattern/__init__.py | 44 +++++++++++++++++++
 src/relay/ir/dataflow_matcher.cc              | 11 ++++-
 src/relay/ir/dataflow_pattern.cc              | 22 ++++++++++
 src/relay/ir/dataflow_pattern_functor.cc      |  6 +++
 src/relay/ir/indexed_graph.cc                 |  6 +++
 tests/python/relay/test_dataflow_pattern.py   | 39 ++++++++++++++++
 9 files changed, 199 insertions(+), 5 deletions(-)

diff --git a/docs/langref/relay_pattern.rst b/docs/langref/relay_pattern.rst
index 992954c9a5b1..d77a51980f23 100644
--- a/docs/langref/relay_pattern.rst
+++ b/docs/langref/relay_pattern.rst
@@ -246,6 +246,24 @@ are matched:
 
         assert pat.match(relay.expr.If(cond, x, y))
 
+
+A Relay ``Let`` expression can be matched if all of its variable, value, and body
+are matched:
+
+.. code-block:: python
+
+  def test_match_let():
+      x = is_var("x")
+      y = is_var("y")
+      let_var = is_var("let")
+      pat = is_let(let_var, is_op("less")(x, y), let_var)
+
+      x = relay.var("x")
+      y = relay.var("y")
+      lv = relay.var("let")
+      cond = x < y
+      assert pat.match(relay.expr.Let(lv, cond, lv))
+
 Matching Diamonds and Post-Dominator Graphs
 *******************************************
 
@@ -310,6 +328,7 @@ The high level design is to introduce a language of patterns for now we propose
             | is_tuple()
             | is_tuple_get_item(pattern, index = None)
             | is_if(cond, tru, fls)
+            | is_let(var, value, body)
             | pattern1 `|` pattern2
             | dominates(parent_pattern, path_pattern, child_pattern)
             | FunctionPattern(params, body)
@@ -367,6 +386,16 @@ Function Pattern
 
 Match a Function with a body and parameters
 
+If Pattern
+**********
+
+Match an If with condition, true branch, and false branch
+
+Let Pattern
+***********
+
+Match a Let with a variable, value, and body
+
 Applications
 ============
 
diff --git a/include/tvm/relay/dataflow_pattern.h b/include/tvm/relay/dataflow_pattern.h
index 1b0c0aca7ff6..1e6cecfd041b 100644
--- a/include/tvm/relay/dataflow_pattern.h
+++ b/include/tvm/relay/dataflow_pattern.h
@@ -222,6 +222,42 @@ class FunctionPattern : public DFPattern {
   TVM_DEFINE_OBJECT_REF_COW_METHOD(FunctionPatternNode);
 };
 
+/*! \brief A binding of a sub-network. */
+class LetPatternNode : public DFPatternNode {
+ public:
+  /*! \brief The variable we bind to */
+  DFPattern var;
+  /*! \brief The value we bind var to */
+  DFPattern value;
+  /*! \brief The body of the let binding */
+  DFPattern body;
+
+  void VisitAttrs(tvm::AttrVisitor* v) {
+    v->Visit("var", &var);
+    v->Visit("value", &value);
+    v->Visit("body", &body);
+  }
+
+  static constexpr const char* _type_key = "relay.dataflow_pattern.LetPattern";
+  TVM_DECLARE_FINAL_OBJECT_INFO(LetPatternNode, DFPatternNode);
+};
+
+/*!
+ * \brief Let binding that binds a local var
+ */
+class LetPattern : public DFPattern {
+ public:
+  /*!
+   * \brief The constructor
+   * \param var The variable that is bound to.
+   * \param value The value used to bind to the variable.
+   * \param body The body of the let binding.
+   */
+  TVM_DLL LetPattern(DFPattern var, DFPattern value, DFPattern body);
+
+  TVM_DEFINE_OBJECT_REF_METHODS(LetPattern, DFPattern, LetPatternNode);
+};
+
 /*! \brief Tuple of multiple Exprs */
 class TuplePattern;
 /*! \brief Tuple container */
diff --git a/include/tvm/relay/dataflow_pattern_functor.h b/include/tvm/relay/dataflow_pattern_functor.h
index bff9e23ef046..490cdc5e3f9d 100644
--- a/include/tvm/relay/dataflow_pattern_functor.h
+++ b/include/tvm/relay/dataflow_pattern_functor.h
@@ -84,18 +84,19 @@ class DFPatternFunctor<R(const DFPattern& n, Args...)> {
   virtual R VisitDFPattern_(const AltPatternNode* op, Args... args) DFPATTERN_FUNCTOR_DEFAULT;
   virtual R VisitDFPattern_(const AttrPatternNode* op, Args... args) DFPATTERN_FUNCTOR_DEFAULT;
   virtual R VisitDFPattern_(const CallPatternNode* op, Args... args) DFPATTERN_FUNCTOR_DEFAULT;
+  virtual R VisitDFPattern_(const ConstantPatternNode* op, Args... args) DFPATTERN_FUNCTOR_DEFAULT;
   virtual R VisitDFPattern_(const DataTypePatternNode* op, Args... args) DFPATTERN_FUNCTOR_DEFAULT;
   virtual R VisitDFPattern_(const DominatorPatternNode* op, Args... args) DFPATTERN_FUNCTOR_DEFAULT;
   virtual R VisitDFPattern_(const ExprPatternNode* op, Args... args) DFPATTERN_FUNCTOR_DEFAULT;
   virtual R VisitDFPattern_(const FunctionPatternNode* op, Args... args) DFPATTERN_FUNCTOR_DEFAULT;
+  virtual R VisitDFPattern_(const IfPatternNode* op, Args... args) DFPATTERN_FUNCTOR_DEFAULT;
+  virtual R VisitDFPattern_(const LetPatternNode* op, Args... args) DFPATTERN_FUNCTOR_DEFAULT;
   virtual R VisitDFPattern_(const ShapePatternNode* op, Args... args) DFPATTERN_FUNCTOR_DEFAULT;
   virtual R VisitDFPattern_(const TupleGetItemPatternNode* op,
                             Args... args) DFPATTERN_FUNCTOR_DEFAULT;
-  virtual R VisitDFPattern_(const IfPatternNode* op, Args... args) DFPATTERN_FUNCTOR_DEFAULT;
   virtual R VisitDFPattern_(const TuplePatternNode* op, Args... args) DFPATTERN_FUNCTOR_DEFAULT;
   virtual R VisitDFPattern_(const TypePatternNode* op, Args... args) DFPATTERN_FUNCTOR_DEFAULT;
   virtual R VisitDFPattern_(const VarPatternNode* op, Args... args) DFPATTERN_FUNCTOR_DEFAULT;
-  virtual R VisitDFPattern_(const ConstantPatternNode* op, Args... args) DFPATTERN_FUNCTOR_DEFAULT;
   virtual R VisitDFPattern_(const WildcardPatternNode* op, Args... args) DFPATTERN_FUNCTOR_DEFAULT;
   virtual R VisitDFPatternDefault_(const Object* op, Args...) {
     LOG(FATAL) << "Do not have a default for " << op->GetTypeKey();
@@ -115,9 +116,10 @@ class DFPatternFunctor<R(const DFPattern& n, Args...)> {
     RELAY_DFPATTERN_FUNCTOR_DISPATCH(DominatorPatternNode);
     RELAY_DFPATTERN_FUNCTOR_DISPATCH(ExprPatternNode);
     RELAY_DFPATTERN_FUNCTOR_DISPATCH(FunctionPatternNode);
+    RELAY_DFPATTERN_FUNCTOR_DISPATCH(IfPatternNode);
+    RELAY_DFPATTERN_FUNCTOR_DISPATCH(LetPatternNode);
     RELAY_DFPATTERN_FUNCTOR_DISPATCH(ShapePatternNode);
     RELAY_DFPATTERN_FUNCTOR_DISPATCH(TupleGetItemPatternNode);
-    RELAY_DFPATTERN_FUNCTOR_DISPATCH(IfPatternNode);
     RELAY_DFPATTERN_FUNCTOR_DISPATCH(TuplePatternNode);
     RELAY_DFPATTERN_FUNCTOR_DISPATCH(TypePatternNode);
     RELAY_DFPATTERN_FUNCTOR_DISPATCH(VarPatternNode);
@@ -143,10 +145,11 @@ class DFPatternVisitor : public DFPatternFunctor<void(const DFPattern&)> {
   void VisitDFPattern_(const DominatorPatternNode* op) override;
   void VisitDFPattern_(const ExprPatternNode* op) override;
   void VisitDFPattern_(const FunctionPatternNode* op) override;
+  void VisitDFPattern_(const IfPatternNode* op) override;
+  void VisitDFPattern_(const LetPatternNode* op) override;
   void VisitDFPattern_(const ShapePatternNode* op) override;
   void VisitDFPattern_(const TupleGetItemPatternNode* op) override;
   void VisitDFPattern_(const TuplePatternNode* op) override;
-  void VisitDFPattern_(const IfPatternNode* op) override;
   void VisitDFPattern_(const TypePatternNode* op) override;
   void VisitDFPattern_(const VarPatternNode* op) override;
   void VisitDFPattern_(const WildcardPatternNode* op) override;
diff --git a/python/tvm/relay/dataflow_pattern/__init__.py b/python/tvm/relay/dataflow_pattern/__init__.py
index 6f764e1651da..d4a8481d106e 100644
--- a/python/tvm/relay/dataflow_pattern/__init__.py
+++ b/python/tvm/relay/dataflow_pattern/__init__.py
@@ -337,6 +337,29 @@ def is_if(cond, true_branch, false_branch):
     return IfPattern(cond, true_branch, false_branch)
 
 
+def is_let(var, value, body):
+    """
+    Syntatic sugar for creating a LetPattern.
+
+    Parameters
+    ----------
+    var: tvm.relay.dataflow_pattern.DFPattern
+        The pattern describing the variable of Let.
+
+    value: tvm.relay.dataflow_pattern.DFPattern
+        The pattern describing the value of Let.
+
+    body: tvm.relay.dataflow_pattern.DFPattern
+        The pattern describing the body where the binding is in effect.
+
+    Returns
+    -------
+    result: tvm.relay.dataflow_pattern.DFPattern
+        The resulting pattern.
+    """
+    return LetPattern(var, value, body)
+
+
 def wildcard() -> "DFPattern":
     """
     Syntatic sugar for creating a WildcardPattern.
@@ -579,6 +602,27 @@ def __init__(self, cond: "DFPattern", true_branch: "DFPattern", false_branch: "D
         self.__init_handle_by_constructor__(ffi.IfPattern, cond, true_branch, false_branch)
 
 
+@register_df_node
+class LetPattern(DFPattern):
+    """A patern matching a Relay Let.
+
+    Parameters
+    ----------
+    var: tvm.relay.dataflow_pattern.DFPattern
+        The pattern describing the variable of Let.
+
+    value: tvm.relay.dataflow_pattern.DFPattern
+        The pattern describing the value of Let.
+
+    body: tvm.relay.dataflow_pattern.DFPattern
+        The pattern describing the body where the binding is in effect.
+
+    """
+
+    def __init__(self, var: "DFPattern", value: "DFPattern", body: "DFPattern"):
+        self.__init_handle_by_constructor__(ffi.LetPattern, var, value, body)
+
+
 @register_df_node
 class TuplePattern(DFPattern):
     """A patern matching a Relay Tuple.
diff --git a/src/relay/ir/dataflow_matcher.cc b/src/relay/ir/dataflow_matcher.cc
index 459694b8f679..0d9481312137 100644
--- a/src/relay/ir/dataflow_matcher.cc
+++ b/src/relay/ir/dataflow_matcher.cc
@@ -55,10 +55,11 @@ class DFPatternMatcher : public DFPatternFunctor<bool(const DFPattern&, const Ex
   bool VisitDFPattern_(const DominatorPatternNode* op, const Expr& expr) override;
   bool VisitDFPattern_(const ExprPatternNode* op, const Expr& expr) override;
   bool VisitDFPattern_(const FunctionPatternNode* op, const Expr& expr) override;
+  bool VisitDFPattern_(const IfPatternNode* op, const Expr& expr) override;
+  bool VisitDFPattern_(const LetPatternNode* op, const Expr& expr) override;
   bool VisitDFPattern_(const ShapePatternNode* op, const Expr& expr) override;
   bool VisitDFPattern_(const TupleGetItemPatternNode* op, const Expr& expr) override;
   bool VisitDFPattern_(const TuplePatternNode* op, const Expr& expr) override;
-  bool VisitDFPattern_(const IfPatternNode* op, const Expr& expr) override;
   bool VisitDFPattern_(const TypePatternNode* op, const Expr& expr) override;
   bool VisitDFPattern_(const VarPatternNode* op, const Expr& expr) override;
   bool VisitDFPattern_(const WildcardPatternNode* op, const Expr& expr) override;
@@ -423,6 +424,14 @@ bool DFPatternMatcher::VisitDFPattern_(const IfPatternNode* op, const Expr& expr
   return false;
 }
 
+bool DFPatternMatcher::VisitDFPattern_(const LetPatternNode* op, const Expr& expr) {
+  if (const auto* let_node = expr.as<LetNode>()) {
+    return VisitDFPattern(op->var, let_node->var) && VisitDFPattern(op->value, let_node->value) &&
+           VisitDFPattern(op->body, let_node->body);
+  }
+  return false;
+}
+
 Expr InferType(const Expr& expr) {
   auto mod = IRModule::FromExpr(expr);
   mod = transform::InferType()(mod);
diff --git a/src/relay/ir/dataflow_pattern.cc b/src/relay/ir/dataflow_pattern.cc
index 1e268fb00d97..4c3b82cc19d4 100644
--- a/src/relay/ir/dataflow_pattern.cc
+++ b/src/relay/ir/dataflow_pattern.cc
@@ -112,6 +112,28 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable)
       p->stream << "FunctionPatternNode(" << node->params << ", " << node->body << ")";
     });
 
+LetPattern::LetPattern(DFPattern var, DFPattern value, DFPattern body) {
+  ObjectPtr<LetPatternNode> n = make_object<LetPatternNode>();
+  n->var = std::move(var);
+  n->value = std::move(value);
+  n->body = std::move(body);
+  data_ = std::move(n);
+}
+
+TVM_REGISTER_NODE_TYPE(LetPatternNode);
+
+TVM_REGISTER_GLOBAL("relay.dataflow_pattern.LetPattern")
+    .set_body_typed([](DFPattern var, DFPattern value, DFPattern body) {
+      return LetPattern(var, value, body);
+    });
+
+TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable)
+    .set_dispatch<LetPatternNode>([](const ObjectRef& ref, ReprPrinter* p) {
+      auto* node = static_cast<const LetPatternNode*>(ref.get());
+      p->stream << "LetPatternNode(" << node->var << ", " << node->value << ", " << node->body
+                << ")";
+    });
+
 IfPattern::IfPattern(DFPattern cond, DFPattern true_branch, DFPattern false_branch) {
   ObjectPtr<IfPatternNode> n = make_object<IfPatternNode>();
   n->cond = std::move(cond);
diff --git a/src/relay/ir/dataflow_pattern_functor.cc b/src/relay/ir/dataflow_pattern_functor.cc
index 25b247306229..828e867b332c 100644
--- a/src/relay/ir/dataflow_pattern_functor.cc
+++ b/src/relay/ir/dataflow_pattern_functor.cc
@@ -87,6 +87,12 @@ void DFPatternVisitor::VisitDFPattern_(const IfPatternNode* op) {
   VisitDFPattern(op->false_branch);
 }
 
+void DFPatternVisitor::VisitDFPattern_(const LetPatternNode* op) {
+  VisitDFPattern(op->var);
+  VisitDFPattern(op->value);
+  VisitDFPattern(op->body);
+}
+
 void DFPatternVisitor::VisitDFPattern_(const TypePatternNode* op) { VisitDFPattern(op->pattern); }
 
 void DFPatternVisitor::VisitDFPattern_(const VarPatternNode* op) {}
diff --git a/src/relay/ir/indexed_graph.cc b/src/relay/ir/indexed_graph.cc
index 9ee5c9cf6b85..0f81c2360d0f 100644
--- a/src/relay/ir/indexed_graph.cc
+++ b/src/relay/ir/indexed_graph.cc
@@ -288,6 +288,12 @@ IndexedGraph<DFPattern> CreateIndexedGraph(const DFPattern& pattern) {
       VisitDFPattern(op->false_branch, graph_.node_map_[GetRef<DFPattern>(op)]);
     }
 
+    void VisitDFPattern_(const LetPatternNode* op, NodePtr parent) override {
+      VisitDFPattern(op->var, graph_.node_map_[GetRef<DFPattern>(op)]);
+      VisitDFPattern(op->value, graph_.node_map_[GetRef<DFPattern>(op)]);
+      VisitDFPattern(op->body, graph_.node_map_[GetRef<DFPattern>(op)]);
+    }
+
     void VisitDFPattern_(const TypePatternNode* op, NodePtr parent) override {
       VisitDFPattern(op->pattern, graph_.node_map_[GetRef<DFPattern>(op)]);
     }
diff --git a/tests/python/relay/test_dataflow_pattern.py b/tests/python/relay/test_dataflow_pattern.py
index 934ebf462b95..e7b367b8f631 100644
--- a/tests/python/relay/test_dataflow_pattern.py
+++ b/tests/python/relay/test_dataflow_pattern.py
@@ -138,6 +138,18 @@ def test_IfPattern():
     assert isinstance(pat.false_branch, VarPattern)
 
 
+def test_LetPattern():
+    x = is_var("x")
+    y = is_var("y")
+    let_var = is_var("let")
+    pat = is_let(let_var, is_op("less")(x, y), let_var)
+
+    assert isinstance(pat, LetPattern)
+    assert isinstance(pat.var, VarPattern)
+    assert isinstance(pat.value, CallPattern)
+    assert isinstance(pat.body, VarPattern)
+
+
 ## MATCHER TESTS
 
 
@@ -233,6 +245,33 @@ def test_no_match_if():
     assert not pat.match(relay.expr.If(x < y, y, x))
 
 
+def test_match_let():
+    x = is_var("x")
+    y = is_var("y")
+    let_var = is_var("let")
+    pat = is_let(let_var, is_op("less")(x, y), let_var)
+
+    x = relay.var("x")
+    y = relay.var("y")
+    lv = relay.var("let")
+    cond = x < y
+    assert pat.match(relay.expr.Let(lv, cond, lv))
+
+
+def test_no_match_let():
+    x = is_var("x")
+    y = is_var("y")
+    let_var = is_var("let")
+    pat = is_let(let_var, is_op("less")(x, y), let_var)
+
+    x = relay.var("x")
+    y = relay.var("y")
+    lv = relay.var("let")
+
+    assert not pat.match(relay.expr.Let(lv, x > y, lv))
+    assert not pat.match(relay.expr.Let(lv, x < y, lv * x))
+
+
 def test_match_option():
     x = relay.var("x")
     w = relay.var("w")

From 218048ec8306d4f3dc2e9a8dcca187804ad8a254 Mon Sep 17 00:00:00 2001
From: Tristan Konolige <tristan.konolige@gmail.com>
Date: Sat, 23 Jan 2021 00:48:40 -0800
Subject: [PATCH 105/357] [FIX,AUTOTVM] Add flop counts to cublas (#7297)

---
 python/tvm/autotvm/task/task.py | 1 +
 python/tvm/contrib/cublas.py    | 4 ++--
 python/tvm/topi/cuda/dense.py   | 5 ++++-
 3 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/python/tvm/autotvm/task/task.py b/python/tvm/autotvm/task/task.py
index c8b50ad33741..52f0996c800c 100644
--- a/python/tvm/autotvm/task/task.py
+++ b/python/tvm/autotvm/task/task.py
@@ -580,6 +580,7 @@ def traverse(ops):
                 pass
             else:
                 raise FlopCalculationError(
+                    f"{op.name} is not supported by autotvm. "
                     "Only support te.compute currently. "
                     "Other ops like tvm.te.scan/te.extern is not supported"
                 )
diff --git a/python/tvm/contrib/cublas.py b/python/tvm/contrib/cublas.py
index 9a36fa52ce4b..e01b09c3e4ee 100644
--- a/python/tvm/contrib/cublas.py
+++ b/python/tvm/contrib/cublas.py
@@ -48,7 +48,7 @@ def matmul(lhs, rhs, transa=False, transb=False, dtype=None):
             "tvm.contrib.cublas.matmul", ins[0], ins[1], outs[0], transa, transb
         ),
         dtype=dtype,
-        name="C",
+        name="matmul_cublas",
     )
 
 
@@ -82,5 +82,5 @@ def batch_matmul(lhs, rhs, transa=False, transb=False, dtype=None):
             "tvm.contrib.cublas.batch_matmul", ins[0], ins[1], outs[0], transa, transb
         ),
         dtype=dtype,
-        name="C",
+        name="batch_matmul_cublas",
     )
diff --git a/python/tvm/topi/cuda/dense.py b/python/tvm/topi/cuda/dense.py
index 85b9b19bdb02..f8abe4d4d799 100644
--- a/python/tvm/topi/cuda/dense.py
+++ b/python/tvm/topi/cuda/dense.py
@@ -17,7 +17,7 @@
 # pylint: disable=invalid-name, unused-argument
 """Schedule for dense operator"""
 import logging
-from tvm import te
+from tvm import te, tir
 import tvm.autotvm as autotvm
 from tvm.autotvm.task.space import SplitEntity
 from tvm.contrib import cublas
@@ -44,6 +44,9 @@ def dense_cublas(cfg, data, weight, bias=None, out_dtype=None):
     matmul = cublas.matmul(data, weight, False, True)
     if isinstance(batch, int):
         cfg.add_flop(batch * in_dim * out_dim * 2)
+    elif isinstance(batch, tir.IntImm):
+        cfg.add_flop(batch.value * in_dim * out_dim * 2)
+    # if we get a te.Var, we cannot add flop counts
     if bias is not None:
         matmul = te.compute(
             (batch, out_dim), lambda i, j: matmul[i, j] + bias[j], tag=tag.BROADCAST

From 42eb55dda8f121fe572e99127372757826f8b892 Mon Sep 17 00:00:00 2001
From: Luis Vega <vegaluisjose@users.noreply.github.com>
Date: Sat, 23 Jan 2021 12:41:21 -0800
Subject: [PATCH 106/357] add Verilator to CI (#7098)

---
 Jenkinsfile                             | 4 ++--
 tests/scripts/task_config_build_cpu.sh  | 1 +
 tests/scripts/task_config_build_i386.sh | 1 +
 3 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/Jenkinsfile b/Jenkinsfile
index 67a41cd51430..0bf3a1b98c64 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -46,9 +46,9 @@
 // NOTE: these lines are scanned by docker/dev_common.sh. Please update the regex as needed. -->
 ci_lint = "tlcpack/ci-lint:v0.62"
 ci_gpu = "tlcpack/ci-gpu:v0.72"
-ci_cpu = "tlcpack/ci-cpu:v0.71"
+ci_cpu = "tlcpack/ci-cpu:v0.72-t0"
 ci_wasm = "tlcpack/ci-wasm:v0.70"
-ci_i386 = "tlcpack/ci-i386:v0.71"
+ci_i386 = "tlcpack/ci-i386:v0.72-t0"
 ci_qemu = "tlcpack/ci-qemu:v0.01"
 ci_arm = "tlcpack/ci-arm:v0.01"
 // <--- End of regex-scanned config.
diff --git a/tests/scripts/task_config_build_cpu.sh b/tests/scripts/task_config_build_cpu.sh
index 9a009b6a4a78..9ddf1778ff9f 100755
--- a/tests/scripts/task_config_build_cpu.sh
+++ b/tests/scripts/task_config_build_cpu.sh
@@ -45,3 +45,4 @@ echo set\(USE_FLATBUFFERS_PATH \"/flatbuffers\"\) >> config.cmake
 echo set\(USE_ETHOSN /opt/arm/ethosn-driver\) >> config.cmake
 echo set\(USE_ETHOSN_HW OFF\) >> config.cmake
 echo set\(USE_VITIS_AI ON\) >> config.cmake
+echo set\(USE_VERILATOR ON\) >> config.cmake
diff --git a/tests/scripts/task_config_build_i386.sh b/tests/scripts/task_config_build_i386.sh
index 8ed5f94e30dc..68e61c6a039c 100755
--- a/tests/scripts/task_config_build_i386.sh
+++ b/tests/scripts/task_config_build_i386.sh
@@ -34,3 +34,4 @@ echo set\(CMAKE_CXX_COMPILER g++\) >> config.cmake
 echo set\(CMAKE_CXX_FLAGS -Werror\) >> config.cmake
 echo set\(USE_VTA_TSIM ON\) >> config.cmake
 echo set\(USE_VTA_FSIM ON\) >> config.cmake
+echo set\(USE_VERILATOR ON\) >> config.cmake

From 5d3349104a1dc4b84f9a744aeee9b124df231f04 Mon Sep 17 00:00:00 2001
From: Thierry Moreau <tmoreau@octoml.ai>
Date: Sun, 24 Jan 2021 23:38:17 -0800
Subject: [PATCH 107/357] [Tutorial] Autoscheduler on ARM devices (#7326)

* arm tuning tutorial

* adjustment to get RPC working

* fix lint

* fix target

* integrate Leandros comments

* dont request remote in CI

* use API from auto_scheduler, not autoTVM and updated comments

* make ci-runnable

* fix the formatting

* address Zhaos comments

* full run stats

* taking Zhaos comments into consideration
---
 tutorials/auto_scheduler/tune_network_arm.py | 421 +++++++++++++++++++
 1 file changed, 421 insertions(+)
 create mode 100644 tutorials/auto_scheduler/tune_network_arm.py

diff --git a/tutorials/auto_scheduler/tune_network_arm.py b/tutorials/auto_scheduler/tune_network_arm.py
new file mode 100644
index 000000000000..f821c2e55d13
--- /dev/null
+++ b/tutorials/auto_scheduler/tune_network_arm.py
@@ -0,0 +1,421 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""
+Auto-scheduling a Neural Network for ARM CPU
+=============================================
+**Author**: `Thierry Moreau <https://github.com/tmoreau89, Lianmin Zheng <https://github.com/merrymercy>>`_
+
+Auto-tuning for specific devices and workloads is critical for getting the
+best performance. This is a tutorial on how to tune a whole neural
+network for ARM CPU with the auto-scheduler via RPC.
+
+To auto-tune a neural network, we partition the network into small subgraphs and 
+tune them independently. Each subgraph is treated as one search task.
+A task scheduler slices the time and dynamically allocates time resources to
+these tasks. The task scheduler predicts the impact of each task on the end-to-end
+execution time and prioritizes the one that can reduce the execution time the most.
+
+For each subgraph, we use the compute declaration in :code:`tvm/python/topi` to
+get the computational DAG in the tensor expression form.
+We then use the auto-scheduler to construct a search space of this DAG and search
+for good schedules (low-level optimizations).
+
+Different from the template-based :ref:`autotvm <tutorials-autotvm-sec>` which relies on
+manual templates to define the search space, the auto-scheduler does not require any
+schedule templates. In other words, the auto-scheduler only uses the compute declarations
+in :code:`tvm/python/topi` and does not use existing schedule templates.
+
+Note that this tutorial will not run on Windows or recent versions of macOS. To
+get it to run, you will need to wrap the body of this tutorial in a :code:`if
+__name__ == "__main__":` block.
+"""
+
+import numpy as np
+
+import tvm
+from tvm import relay, auto_scheduler
+import tvm.relay.testing
+from tvm.contrib import graph_runtime
+from tvm.contrib.utils import tempdir
+
+#################################################################
+# Define a Network
+# ----------------
+# First, we need to define the network with relay frontend API.
+# We can load some pre-defined network from :code:`tvm.relay.testing`.
+# We can also load models from MXNet, ONNX, PyTorch, and TensorFlow
+# (see :ref:`front end tutorials<tutorial-frontend>`).
+#
+# For convolutional neural networks, although auto-scheduler can work correctly
+# with any layout, we found the best performance is typically achieved with NHWC layout.
+# We also implemented more optimizations for NHWC layout with the auto-scheduler.
+# So it is recommended to convert your models to NHWC layout to use the auto-scheduler.
+# You can use :ref:`ConvertLayout <convert-layout-usage>` pass to do the layout conversion in TVM.
+
+
+def get_network(name, batch_size, layout="NHWC", dtype="float32"):
+    """Get the symbol definition and random weight of a network"""
+
+    # auto-scheduler prefers NHWC layout
+    if layout == "NHWC":
+        image_shape = (224, 224, 3)
+    elif layout == "NCHW":
+        image_shape = (3, 224, 224)
+    else:
+        raise ValueError("Invalid layout: " + layout)
+
+    input_shape = (batch_size,) + image_shape
+    output_shape = (batch_size, 1000)
+
+    if name.startswith("resnet-"):
+        n_layer = int(name.split("-")[1])
+        mod, params = relay.testing.resnet.get_workload(
+            num_layers=n_layer,
+            batch_size=batch_size,
+            layout=layout,
+            dtype=dtype,
+            image_shape=image_shape,
+        )
+    elif name.startswith("resnet3d-"):
+        n_layer = int(name.split("-")[1])
+        mod, params = relay.testing.resnet.get_workload(
+            num_layers=n_layer,
+            batch_size=batch_size,
+            layout=layout,
+            dtype=dtype,
+            image_shape=image_shape,
+        )
+    elif name == "mobilenet":
+        mod, params = relay.testing.mobilenet.get_workload(
+            batch_size=batch_size, layout=layout, dtype=dtype, image_shape=image_shape
+        )
+    elif name == "squeezenet_v1.1":
+        assert layout == "NCHW", "squeezenet_v1.1 only supports NCHW layout"
+        mod, params = relay.testing.squeezenet.get_workload(
+            version="1.1",
+            batch_size=batch_size,
+            dtype=dtype,
+            image_shape=image_shape,
+        )
+    elif name == "inception_v3":
+        input_shape = (batch_size, 3, 299, 299) if layout == "NCHW" else (batch_size, 299, 299, 3)
+        mod, params = relay.testing.inception_v3.get_workload(batch_size=batch_size, dtype=dtype)
+    elif name == "mxnet":
+        # an example for mxnet model
+        from mxnet.gluon.model_zoo.vision import get_model
+
+        assert layout == "NCHW"
+
+        block = get_model("resnet50_v1", pretrained=True)
+        mod, params = relay.frontend.from_mxnet(block, shape={"data": input_shape}, dtype=dtype)
+        net = mod["main"]
+        net = relay.Function(
+            net.params, relay.nn.softmax(net.body), None, net.type_params, net.attrs
+        )
+        mod = tvm.IRModule.from_expr(net)
+
+    return mod, params, input_shape, output_shape
+
+
+#################################################################
+# Start RPC Tracker
+# -----------------
+# TVM uses RPC session to communicate with ARM boards.
+# During tuning, the tuner will send the generated code to the board and
+# measure the speed of code on the board.
+#
+# To scale up the tuning, TVM uses RPC Tracker to manage distributed devices.
+# The RPC Tracker is a centralized controller node. We can register all devices to
+# the tracker. For example, if we have 10 phones, we can register all of them
+# to the tracker, and run 10 measurements in parallel, accelerating the tuning process.
+#
+# To start an RPC tracker, run this command on the host machine. The tracker is
+# required during the whole tuning process, so we need to open a new terminal for
+# this command:
+#
+# .. code-block:: bash
+#
+#   python -m tvm.exec.rpc_tracker --host=0.0.0.0 --port=9190
+#
+# The expected output is
+#
+# .. code-block:: bash
+#
+#   INFO:RPCTracker:bind to 0.0.0.0:9190
+
+#################################################################
+# Register Devices to RPC Tracker
+# -----------------------------------
+# Now we can register our devices to the tracker. The first step is to
+# build the TVM runtime for the ARM devices.
+#
+# * For Linux:
+#   Follow this section :ref:`build-tvm-runtime-on-device` to build
+#   the TVM runtime on the device. Then register the device to tracker by
+#
+#   .. code-block:: bash
+#
+#     python -m tvm.exec.rpc_server --tracker=[HOST_IP]:9190 --key=rasp4b-64
+#
+#   (replace :code:`[HOST_IP]` with the IP address of your host machine)
+#
+# * For Android:
+#   Follow this `readme page <https://github.com/apache/tvm/tree/main/apps/android_rpc>`_ to
+#   install the TVM RPC APK on the android device. Make sure you can pass the android rpc test.
+#   Then you have already registered your device. During tuning, you have to go to developer option
+#   and enable "Keep screen awake during changing" and charge your phone to make it stable.
+#
+# After registering devices, we can confirm it by querying rpc_tracker
+#
+# .. code-block:: bash
+#
+#   python -m tvm.exec.query_rpc_tracker --host=0.0.0.0 --port=9190
+#
+# For example, if we have 2 Huawei mate10 pro, 11 Raspberry Pi 4B with 64bit OS, and 2 rk3399,
+# the output can be
+#
+# .. code-block:: bash
+#
+#    Queue Status
+#    ----------------------------------
+#    key          total  free  pending
+#    ----------------------------------
+#    mate10pro    2      2     0
+#    rk3399       2      2     0
+#    rasp4b-64    11     11    0
+#    ----------------------------------
+#
+# You can register multiple devices to the tracker to accelerate the measurement in tuning.
+
+###########################################
+# Set Tuning Options
+# ------------------
+# Before tuning, we should apply some configurations. Here I use a Raspberry Pi 4b 4GB board
+# as example with a 64bit OS (Ubuntu 20.04). In your setting, you should modify the target
+# and device_key accordingly.
+# set :code:`use_ndk` to True if you use android phone.
+
+#### DEVICE CONFIG ####
+
+# Replace "aarch64-linux-gnu" with the correct target of your board.
+# This target is used for cross compilation. You can query it by :code:`gcc -v` on your device.
+# FIXME(tmoreau89, merrymercy): We leave '-device=arm_cpu' out of the target string
+#                               because we're sharing x86 op strategy.
+target = tvm.target.Target("llvm -mtriple=aarch64-linux-gnu -mattr=+neon")
+
+# Also replace this with the device key in your tracker
+device_key = "rasp4b-64"
+
+# Set this to True if you use ndk tools for cross compiling
+# And also set the environment variable below to point to the cross compiler
+use_ndk = False
+# os.environ["TVM_NDK_CC"] = "/usr/bin/aarch64-linux-gnu-g++"
+
+#### TUNING OPTION ####
+network = "mobilenet"
+batch_size = 1
+layout = "NHWC"
+dtype = "float32"
+log_file = "%s-%s-B%d-%s.json" % (network, layout, batch_size, target.kind.name)
+
+#################################################################
+# Extract Search Tasks
+# --------------------
+# Next, we extract the search tasks and their weights from a network.
+# The weight of a task is the number of appearances of the task's subgraph
+# in the whole network.
+# By using the weight, we can approximate the end-to-end latency of the network
+# as :code:`sum(latency[t] * weight[t])`, where :code:`latency[t]` is the
+# latency of a task and :code:`weight[t]` is the weight of the task.
+# The task scheduler will just optimize this objective.
+
+# Extract tasks from the network
+print("Extract tasks...")
+mod, params, input_shape, output_shape = get_network(network, batch_size, layout, dtype=dtype)
+tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params, target)
+
+for idx, task in enumerate(tasks):
+    print("========== Task %d  (workload key: %s) ==========" % (idx, task.workload_key))
+    print(task.compute_dag)
+
+
+#################################################################
+# Tuning and Evaluation
+# ---------------------
+# Now, we set some options for tuning and launch the search tasks
+#
+# * :code:`num_measure_trials` is the number of measurement trials we can use during the tuning.
+#   You can set it to a small number (e.g., 200) for a fast demonstrative run.
+#   In practice, we recommend setting it around :code:`800 * len(tasks)`,
+#   which is typically enough for the search to converge.
+#   For example, there are 29 tasks in resnet-50, so we can set it as 20000.
+#   You can adjust this parameter according to your time budget.
+# * In addition, we use :code:`RecordToFile` to dump measurement records into a log file,
+#   The measurement records can be used to query the history best, resume the search,
+#   and do more analyses later.
+# * see :any:`auto_scheduler.TuningOptions`,
+#   :any:`auto_scheduler.LocalRunner` for more parameters.
+#
+# After auto-tuning, we can compile the network with the best schedules we found.
+# All measurement records are dumped into the log file during auto-tuning,
+# so we can read the log file and load the best schedules.
+
+
+def tune_and_evaluate():
+    print("Begin tuning...")
+    tuner = auto_scheduler.TaskScheduler(tasks, task_weights)
+    tune_option = auto_scheduler.TuningOptions(
+        num_measure_trials=200,  # change this to 20000 to achieve the best performance
+        runner=auto_scheduler.RPCRunner(
+            device_key,
+            host="0.0.0.0",
+            port=9191,
+            timeout=30,
+            repeat=1,
+            min_repeat_ms=200,
+            enable_cpu_cache_flush=True,
+        ),
+        measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
+    )
+
+    tuner.tune(tune_option)
+
+    # Compile with the history best
+    print("Compile...")
+    with auto_scheduler.ApplyHistoryBest(log_file):
+        with tvm.transform.PassContext(
+            opt_level=3, config={"relay.backend.use_auto_scheduler": True}
+        ):
+            lib = relay.build(mod, target=target, params=params)
+
+    # Export library
+    tmp = tempdir()
+    if use_ndk:
+        from tvm.contrib import ndk
+
+        filename = "net.so"
+        lib.export_library(tmp.relpath(filename), ndk.create_shared)
+    else:
+        filename = "net.tar"
+        lib.export_library(tmp.relpath(filename))
+
+    # Upload module to device
+    print("Upload...")
+    remote = auto_scheduler.utils.request_remote(device_key, "0.0.0.0", 9191, timeout=10000)
+    remote.upload(tmp.relpath(filename))
+    rlib = remote.load_module(filename)
+
+    # Create graph runtime
+    ctx = remote.cpu()
+    module = graph_runtime.GraphModule(rlib["default"](ctx))
+    data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype))
+    module.set_input("data", data_tvm)
+
+    # Evaluate
+    print("Evaluate inference time cost...")
+    ftimer = module.module.time_evaluator("run", ctx, repeat=3, min_repeat_ms=500)
+    prof_res = np.array(ftimer().results) * 1e3  # convert to millisecond
+    print(
+        "Mean inference time (std dev): %.2f ms (%.2f ms)" % (np.mean(prof_res), np.std(prof_res))
+    )
+
+
+# We do not run the tuning in our webpage server since the server doesn't have a Raspberry Pi,
+# or device tracker running.
+# Uncomment the following line to run it by yourself.
+
+# tune_and_evaluate()
+
+
+######################################################################
+# .. note:: Explaining the printed information during tuning
+#
+#   During the tuning, a lot of information will be printed on the console.
+#   They are used for debugging purposes. The most important info is the output
+#   of the task scheduler. The following table is a sample output.
+#
+#   .. code-block:: c
+#
+#    ----------------------------------------------------------------------
+#    ------------------------------  [ Task Scheduler ]
+#    ----------------------------------------------------------------------
+#    |  ID  | Latency (ms) | Speed (GFLOPS) | Trials |
+#    -------------------------------------------------
+#    |    0 |        0.013 |           0.31 |     64 |
+#    |    1 |        0.845 |           2.43 |    448 |
+#    |    2 |        0.046 |          -0.00 |     64 |
+#    |    3 |        4.194 |          24.53 |   2112 |
+#    |    4 |        0.109 |           9.21 |     64 |
+#    |    5 |        1.759 |          29.27 |    896 |
+#    |    6 |        0.083 |           6.01 |     64 |
+#    |    7 |        3.084 |          33.38 |   7680 |
+#    |    8 |        0.136 |          14.78 |    384 |
+#    |    9 |        1.349 |          38.23 |    768 |
+#    |   10 |        0.133 |           7.55 |    128 |
+#    |   11 |        2.747 |          37.56 |   1536 |
+#    |   12 |        0.338 |          11.87 |    192 |
+#    |   13 |        1.295 |          40.00 |    704 |
+#    |   14 |        0.482 |           4.16 |    256 |
+#    |   15 |        2.686 |          38.56 |   1344 |
+#    |   16 |        0.884 |           9.08 |    448 |
+#    |   17 |        1.332 |          39.18 |    704 |
+#    |   18 |        1.045 |           3.84 |    576 |
+#    |   19 |        1.391 |          38.09 |    704 |
+#    |   20 |        0.777 |          10.34 |    448 |
+#    |   21 |        0.739 |          30.97 |    448 |
+#    -------------------------------------------------
+#     Estimated total latency: 38.347 ms      Trials: 19992   Used time : 19260 s     Next ID: 3
+#
+#   This table lists the latency and (estimated) speed of all tasks.
+#   It also lists the allocation of measurement trials for all tasks.
+#   The last line prints the total weighted latency of these tasks,
+#   which can be a rough estimation of the end-to-end execution time
+#   of the network.
+#   The last line also prints the total number of measurement trials,
+#   total time spent on auto-tuning and the id of the next task to tune.
+#
+#   There will also be some "dmlc::Error"s errors, because the
+#   auto-scheduler will try some invalid schedules.
+#   You can safely ignore them if the tuning can continue, because these
+#   errors are isolated from the main process.
+#
+
+######################################################################
+# .. note:: Terminate the tuning earlier
+#
+#   You can terminate the tuning earlier by forcibly killing this process.
+#   As long as you get at least one valid schedule for each task in the log file,
+#   you should be able to do the compilation (the secion below).
+#
+
+#################################################################
+# Other Tips
+# ----------
+# 1. During the tuning, the auto-scheduler needs to compile many programs and
+#    extract feature from them. This part is CPU-intensive,
+#    so a high-performance CPU with many cores is recommended for faster search.
+# 2. You can use :code:`python3 -m tvm.auto_scheduler.measure_record --mode distill --i log.json`
+#    to distill the large log file and only save the best useful records.
+# 3. You can resume a search from the previous log file. You just need to
+#    add a new argument :code:`load_log_file` when creating the task scheduler
+#    in function :code:`run_tuning`. Say,
+#    :code:`tuner = auto_scheduler.TaskScheduler(tasks, task_weights, load_log_file=log_file)`
+# 4. If you have multiple target CPUs, you can use all of them for measurements to
+#    parallelize the measurements. Check this :ref:`section <tutorials-autotvm-rpc-tracker>`
+#    to learn how to use the RPC Tracker and RPC Server.
+#    To use the RPC Tracker in auto-scheduler, replace the runner in :code:`TuningOptions`
+#    with :any:`auto_scheduler.RPCRunner`.

From e6d53185b96cc39f2aaec5e86ae11ca0ac675b8a Mon Sep 17 00:00:00 2001
From: Cody Yu <comaniac0422@gmail.com>
Date: Mon, 25 Jan 2021 03:27:34 -0800
Subject: [PATCH 108/357] [AutoScheduler] Separate shapes from DAG hash and
 enable schedule sharing (#7317)

* [AutoScheduler] Separate shapes from DAG hash and enable schedule sharing

* Update CI logs

* lint

* fix registry

* add message; fix layout rewrite mismatch

* update message

* support other formats
---
 include/tvm/auto_scheduler/compute_dag.h      |   7 +
 python/tvm/auto_scheduler/compute_dag.py      |  35 ++---
 python/tvm/auto_scheduler/measure_record.py   | 126 ++++++++++++++++--
 .../tvm/auto_scheduler/relay_integration.py   |   6 +-
 python/tvm/auto_scheduler/search_task.py      |   8 +-
 python/tvm/auto_scheduler/utils.py            |  27 ++++
 .../tvm/auto_scheduler/workload_registry.py   |  37 +++--
 src/auto_scheduler/compute_dag.cc             | 109 ++++++++-------
 .../unittest/test_auto_scheduler_measure.py   |  33 +++++
 .../ci_logs/resnet-18-NHWC-B1-cuda.json       |  48 +++----
 .../ci_logs/resnet-50-NHWC-B1-llvm.json       |  55 ++++----
 11 files changed, 342 insertions(+), 149 deletions(-)
 mode change 100755 => 100644 src/auto_scheduler/compute_dag.cc

diff --git a/include/tvm/auto_scheduler/compute_dag.h b/include/tvm/auto_scheduler/compute_dag.h
index 1e3f09721279..a87563e348f7 100755
--- a/include/tvm/auto_scheduler/compute_dag.h
+++ b/include/tvm/auto_scheduler/compute_dag.h
@@ -262,6 +262,13 @@ class ComputeDAG : public ObjectRef {
    */
   String PrintStepsAsPython(const Array<Step>& transform_steps) const;
 
+  /*!
+   * \brief Print the compute DAG to a string. This is also used to generate the ComputeDAG hash.
+   * \param simple_mode Simple mode will only include the op names and brief compute.
+   * \return The ComputeDAG in a string.
+   */
+  String PrintDAG(bool simple_mode = false) const;
+
   /*!
    * \brief Fill the correct bound information for a given state by calling ir_pass::InferBound.
    * The states can lose complete bound information after some transform steps (e.g., compute_at).
diff --git a/python/tvm/auto_scheduler/compute_dag.py b/python/tvm/auto_scheduler/compute_dag.py
index a7f200aa5cdd..948f277034db 100755
--- a/python/tvm/auto_scheduler/compute_dag.py
+++ b/python/tvm/auto_scheduler/compute_dag.py
@@ -19,11 +19,11 @@
 """ The auto-scheduler's computational graph and related program analyses. """
 
 import hashlib
+import json
 
 import tvm._ffi
 from tvm.runtime import Object
 from tvm.runtime._ffi_node_api import LoadJSON, SaveJSON
-from tvm.te import ComputeOp, PlaceholderOp
 
 from . import _ffi_api
 from .loop_state import State, StateObject
@@ -220,32 +220,23 @@ def rewrite_layout_from_state(self, state):
         state_obj = state if isinstance(state, StateObject) else state.state_object
         return _ffi_api.ComputeDAGRewriteLayoutFromState(self, state_obj)
 
-    def hash_key(self):
-        """Return the hash key of this compute DAG.
+    def workload_key(self):
+        """Return the workload key of this compute DAG.
+        The workload key is a JSON string from a tuple of (hash-key, tensor shapes...)
 
         Returns
         -------
         key: str
-            The hash key of this compute DAG
+            The workload key of this compute DAG
         """
-        # TODO(merrymercy): Implement this more carefully and move this to c++ as a member function
-        # of ComputeDAG
-        str_key = ""
-        for op in self.ops:
-            t = op.output(0)
-            if isinstance(op, PlaceholderOp):
-                str_key += "placeholder,"
-                str_key += str(get_const_tuple(t.shape)) + ","
-                str_key += t.dtype + ";"
-            elif isinstance(op, ComputeOp):
-                str_key += str(t.op.body) + ","
-                str_key += str(get_const_tuple(t.shape)) + ","
-                str_key += t.dtype + ";"
-            else:
-                raise ValueError("Invalid op: " + op)
-
-        str_key = str_key.encode(encoding="utf-8")
-        return hashlib.md5(str_key).hexdigest()
+        str_dag = _ffi_api.ComputeDAGPrintDAG(self, True)
+        str_dag = str_dag.encode(encoding="utf-8")
+        hash_key = hashlib.md5(str_dag).hexdigest()
+
+        io_shapes = []
+        for tensor in self.tensors:
+            io_shapes += get_const_tuple(tensor.shape)
+        return json.dumps([hash_key] + io_shapes)
 
     def __str__(self):
         # pretty print
diff --git a/python/tvm/auto_scheduler/measure_record.py b/python/tvm/auto_scheduler/measure_record.py
index 35e5e9b68a43..9eaef189e081 100644
--- a/python/tvm/auto_scheduler/measure_record.py
+++ b/python/tvm/auto_scheduler/measure_record.py
@@ -27,6 +27,7 @@
 import tvm._ffi
 from tvm.runtime import Object
 from .measure import MeasureErrorNo, MeasureCallback
+from .utils import decode_workload_key
 from . import _ffi_api
 
 logger = logging.getLogger("auto_scheduler")
@@ -59,8 +60,37 @@ class RecordReader(Object):
     """
 
     def __init__(self, filename):
+        # a set to prevent print duplicated message
+        self.messages = set()
+
         self.__init_handle_by_constructor__(_ffi_api.RecordReader, filename)
 
+    def check_workload_key(self, inputs):
+        """Check and throw warnings for records with old format workload key.
+
+        Parameters
+        ----------
+        inputs: List[MeasureInput]
+            The measure inputs to be checked.
+
+        Notes
+        -----
+        This checker could be deprecated in the future.
+        """
+        for inp in inputs:
+            _, args = decode_workload_key(inp.task.workload_key)
+            if args is None:
+                continue
+            if not args:
+                msg = (
+                    "MeasureInput with old format workload key %s should be updated "
+                    "using the script from https://github.com/apache/tvm/pull/7317."
+                    % inp.task.workload_key
+                )
+                if msg not in self.messages:
+                    self.messages.add(msg)
+                    logger.warning(msg)
+
     def read_lines(self, max_lines=None, skip_lines=0):
         """Read multiple lines from the log file.
 
@@ -88,6 +118,7 @@ def read_lines(self, max_lines=None, skip_lines=0):
         inputs, results = _ffi_api.RecordReaderReadLines(
             self, max_lines if max_lines else -1, skip_lines
         )
+        self.check_workload_key(inputs)
         return inputs, results
 
     def __iter__(self):
@@ -95,9 +126,69 @@ def __iter__(self):
             ret = _ffi_api.RecordReaderReadNext(self)
             if not ret:
                 break
+            self.check_workload_key([ret[0]])
             yield ret[0], ret[1]  # (input, result)
 
 
+def calc_workload_dis_factor(target_workload_key, workload_key):
+    """Calculate the distance factor of the workload to the target workload.
+    If two workloads are not compatible at all (i.e., different compute DAG or function),
+    then the distance factor is "inf". Otherwise, we calculate the factor by traversing
+    the workload arguments, which are the arguments of the compute function,
+    or the output shapes for the ComputeDAG. The factor is calculated by the following rules:
+
+    1. For non-zero integer values: `product(target_arg / candidate_arg)`.
+    2. For non-integer or zero values: "inf" if not equal else 1.
+
+    As a result, factor=1 is the optimal when two workloads are identical.
+
+    Parameters
+    ----------
+    target_workload_key: str
+        The target workload key in JSON string.
+
+    workload_key: str
+        The candidate workload key in JSON string.
+
+    Returns
+    -------
+    dis_f: float
+        The distance factor.
+    """
+
+    def flatten_list(inp):
+        ret = []
+        for elt in inp:
+            if isinstance(elt, list):
+                ret += flatten_list(elt)
+            else:
+                ret.append(elt)
+        return ret
+
+    target_key, target_args = decode_workload_key(target_workload_key)
+    target_args = flatten_list(target_args) if target_args is not None else []
+    key, args = decode_workload_key(workload_key)
+    args = flatten_list(args) if args is not None else []
+
+    # Not even the same func/DAG.
+    if key != target_key or len(target_args) != len(args):
+        return float("inf")
+
+    dis_f = 1
+    for target_arg, arg in zip(target_args, args):
+        if isinstance(target_arg, int):
+            if target_arg == 0 or arg == 0:
+                if target_arg != arg:
+                    return float("inf")
+            elif target_arg % arg != 0:
+                return float("inf")
+            else:
+                dis_f *= target_arg / arg
+        elif target_arg != arg:
+            return float("inf")
+    return dis_f
+
+
 def load_record_from_string(record):
     """
     Load the measure record from string.
@@ -174,7 +265,7 @@ def save_records(filename, inputs, results):
     _ffi_api.SaveRecords(filename, inputs, results)
 
 
-def load_best_record(filename, workload_key=None, target=None):
+def load_best_record(filename, workload_key=None, target=None, include_compatible=False):
     """Return the best measurement pair form a log file. This may return none results if
     there is no legal measure pair with the specified workload_key/target found from the log file.
 
@@ -188,6 +279,8 @@ def load_best_record(filename, workload_key=None, target=None):
     target : Optional[tvm.target.Target]
         The target device.
         With `None`, this returns the best measure pair of all target devices.
+    include_compatible: bool
+        When set to True, all compatible records in the log file will be considered.
 
     Returns
     -------
@@ -204,13 +297,23 @@ def load_best_record(filename, workload_key=None, target=None):
     for inp, res in log_reader:
         if res.error_no != MeasureErrorNo.NO_ERROR:
             continue
-        if workload_key and inp.task.workload_key != workload_key:
-            continue
         if target and inp.task.target.kind.name != target.kind.name:
             continue
 
         costs = [v.value for v in res.costs]
         cost = np.mean(costs)
+
+        if workload_key is not None:
+            dis_f = calc_workload_dis_factor(workload_key, inp.task.workload_key)
+            if dis_f == float("inf"):
+                continue
+            if not include_compatible and dis_f != 1:
+                continue
+
+            # Since different workloads have different FLOPS, we multiply the factor to
+            # eliminate this difference, which is basically the concept of throughput.
+            cost *= dis_f
+
         if cost < best_cost:
             best_cost = cost
             best_inp = inp
@@ -267,12 +370,8 @@ def measure_input_str_key(inp):
     logger.info("Extract %d best records from %s to %s", len(inputs), in_file, out_file)
 
 
-"""
-Usage:
-* Distill the best entries from a large log file
-e.g. python -m tvm.auto_scheduler.measure_record --mode distill --i input.json
-"""
-if __name__ == "__main__":
+def main():
+    """The main function for CLI."""
     parser = argparse.ArgumentParser()
     parser.add_argument("--mode", choices=["distill"], required=True)
     parser.add_argument("--i", type=str, help="input file")
@@ -285,3 +384,12 @@ def measure_input_str_key(inp):
     if args.mode == "distill":
         args.o = args.o or args.i + ".best.json"
         distill_record_file(args.i, args.o)
+
+
+"""
+Usage:
+* Distill the best entries from a large log file
+e.g. python -m tvm.auto_scheduler.measure_record --mode distill --i input.json
+"""
+if __name__ == "__main__":
+    main()
diff --git a/python/tvm/auto_scheduler/relay_integration.py b/python/tvm/auto_scheduler/relay_integration.py
index fb60da19fe44..b39aba227a88 100644
--- a/python/tvm/auto_scheduler/relay_integration.py
+++ b/python/tvm/auto_scheduler/relay_integration.py
@@ -22,7 +22,6 @@
 2. Provide auto-scheduling for all TOPI compute functions
 """
 
-import json
 import logging
 import threading
 
@@ -281,7 +280,7 @@ def auto_schedule_topi(outs):
         logger.info("Failed to create a ComputeDAG for auto_scheduler: %s", str(err))
         return None
 
-    key = register_workload_tensors(dag.hash_key(), io_tensors)
+    key = register_workload_tensors(dag.workload_key(), io_tensors)
     target = tvm.target.Target.current()
 
     env = TracingEnvironment.current
@@ -310,9 +309,8 @@ def auto_schedule_topi(outs):
                 return None
 
             # rewrite the layout and update the context for the new dag
-            dag = ComputeDAG(outs)
             new_dag = dag.rewrite_layout_from_state(state)
-            new_key = json.dumps((new_dag.hash_key(),))
+            new_key = new_dag.workload_key()
             if new_key != key:
                 dispatch_ctx.update(target, new_key, state)
     else:
diff --git a/python/tvm/auto_scheduler/search_task.py b/python/tvm/auto_scheduler/search_task.py
index d985ed1341f5..83f665b229d2 100644
--- a/python/tvm/auto_scheduler/search_task.py
+++ b/python/tvm/auto_scheduler/search_task.py
@@ -257,13 +257,15 @@ def tune(self, tuning_options, search_policy=None):
 
         _ffi_api.AutoSchedule(search_policy, tuning_options)
 
-    def apply_best(self, log_file, layout_rewrite_option=None):
+    def apply_best(self, log_file, include_compatible=False, layout_rewrite_option=None):
         """Apply the history best from a log file and return the schedule.
 
         Parameters
         ----------
         log_file : str
            The name of the log file.
+        include_compatible: bool
+            When set to True, all compatible records in the log file will be considered.
         layout_rewrite_option : Optional[LayoutRewriteOption]
            The layout rewrite option.
 
@@ -272,7 +274,9 @@ def apply_best(self, log_file, layout_rewrite_option=None):
         -------
             A `te.Schedule` and the a list of `te.Tensor` to be used in `tvm.lower` or `tvm.build`.
         """
-        inp, _ = load_best_record(log_file, self.workload_key)
+        inp, _ = load_best_record(
+            log_file, self.workload_key, include_compatible=include_compatible
+        )
         if inp is None:
             raise RuntimeError(
                 "Cannot find any valid schedule for %s in file %s" % (self.workload_key, log_file)
diff --git a/python/tvm/auto_scheduler/utils.py b/python/tvm/auto_scheduler/utils.py
index 334acaf02238..fd25fdb783f7 100644
--- a/python/tvm/auto_scheduler/utils.py
+++ b/python/tvm/auto_scheduler/utils.py
@@ -19,6 +19,7 @@
 """ Common utilities for auto_scheduler. """
 
 from typing import Hashable
+import json
 import multiprocessing
 import multiprocessing.pool
 import queue
@@ -42,6 +43,32 @@
 from ..te import Tensor, placeholder
 
 
+def decode_workload_key(workload_key):
+    """Decode the workload key from a string to the name and arguments. The wokrload key
+    is expected to be a list of "[func_name/hash, args ...]" in a JSON string. If not,
+    then simply return the workload key as the name without arguments.
+
+    Parameters
+    ----------
+    workload_key: str
+        The workload key in string. Format: "[func_name/hash, args ...]".
+
+    Returns
+    -------
+    name: str
+        The workload function name or the DAG hash.
+    args: Optional[List[Any]]
+        The arguments of the workload, or None if the workload key format is not decodeable.
+    """
+    try:
+        key_list = json.loads(workload_key)
+        if isinstance(key_list, list) and len(key_list) >= 1:
+            return key_list[0], key_list[1:]
+    except json.decoder.JSONDecodeError:
+        pass
+    return workload_key, None
+
+
 def get_func_name(func):
     """Get name of a function.
 
diff --git a/python/tvm/auto_scheduler/workload_registry.py b/python/tvm/auto_scheduler/workload_registry.py
index 9a7c15c877aa..51ae64d6adeb 100644
--- a/python/tvm/auto_scheduler/workload_registry.py
+++ b/python/tvm/auto_scheduler/workload_registry.py
@@ -98,14 +98,14 @@ def register(myf):
     return register
 
 
-def register_workload_tensors(func_name, tensors, override=True):
+def register_workload_tensors(workload_key, tensors, override=True):
     """Register a workload by provding input/output tensors. Since this function is used
     when extracting/deserializing tasks, it expects duplicated registrations by default.
 
     Parameters
     ----------
-    func_name: str
-        The function name or the hash key of the compute DAG.
+    workload_key: str
+        The wokrload key of the compute DAG in JSON string.
     tensors: List[Tensor]
         The input/output tensors of a compute DAG
     override : boolean = True
@@ -113,11 +113,11 @@ def register_workload_tensors(func_name, tensors, override=True):
 
     Returns
     -------
-    key: str
-        The serialized JSON string as the workload key.
+    workload_key: str
+        The wokrload key of the compute DAG in JSON string.
     """
-    register_workload(func_name, override=override)(tensors)
-    return json.dumps((func_name,))
+    register_workload(workload_key, override=override)(tensors)
+    return workload_key
 
 
 def make_workload_key(func, args):
@@ -169,7 +169,8 @@ def workload_key_to_tensors(workload_key):
     Parameters
     ----------
     workload_key : str
-        The input workload key.
+        The input workload key in JSON string. The format is either (func_name, arguments...)
+        for compute functions, or (hash, shapes...) for ComputeDAG.
 
     Returns
     -------
@@ -178,16 +179,21 @@ def workload_key_to_tensors(workload_key):
     """
     global WORKLOAD_FUNC_REGISTRY
 
+    # We register ComputeDAG with both hash and argumetns, which are fixed in ComputeDAG,
+    # so we use an entire workload key to query the ComputeDAG.
+    if workload_key in WORKLOAD_FUNC_REGISTRY:
+        return WORKLOAD_FUNC_REGISTRY[workload_key]
+
+    # We register compute function with only the function name since
+    # it does not bind to specific arguments, so we use the function name to query
+    # the function and call the function with arguments to get the tensors.
     workload = json.loads(workload_key)
     name = workload[0]
     value = WORKLOAD_FUNC_REGISTRY[name]
+    assert callable(value)
 
-    # "value" can be either a function or a list of tensors
-    if callable(value):  # if it is a func
-        args = deserialize_args(workload[1:])
-        return value(*args)
-    # otherwise, it is a list of tensors
-    return value
+    args = deserialize_args(workload[1:])
+    return value(*args)
 
 
 def serialize_workload_registry_entry(workload_key):
@@ -209,6 +215,9 @@ def serialize_workload_registry_entry(workload_key):
     """
     global WORKLOAD_FUNC_REGISTRY
 
+    if workload_key in WORKLOAD_FUNC_REGISTRY:
+        return (workload_key, WORKLOAD_FUNC_REGISTRY[workload_key])
+
     workload = json.loads(workload_key)
     name = workload[0]
     value = WORKLOAD_FUNC_REGISTRY[name]
diff --git a/src/auto_scheduler/compute_dag.cc b/src/auto_scheduler/compute_dag.cc
old mode 100755
new mode 100644
index 735f0442b402..4e7fb05660a4
--- a/src/auto_scheduler/compute_dag.cc
+++ b/src/auto_scheduler/compute_dag.cc
@@ -1243,6 +1243,62 @@ String ComputeDAG::PrintStepsAsPython(const Array<Step>& transform_steps) const
   return ss.str();
 }
 
+String ComputeDAG::PrintDAG(bool simple_mode) const {
+  std::stringstream ss;
+
+  for (const auto& op : operator->()->ops) {
+    if (op->IsInstance<te::PlaceholderOpNode>()) {
+      ss << op->name << " = PLACEHOLDER ";
+      if (!simple_mode) {
+        ss << op.output(0)->shape;
+      }
+      ss << "\n";
+    } else if (auto pop = op.as<te::ComputeOpNode>()) {
+      for (size_t k = 0; k < pop->body.size(); ++k) {
+        ss << op->name << "(";
+        for (size_t i = 0; i < pop->axis.size(); i++) {
+          ss << pop->axis[i]->var->name_hint;
+          if (i != pop->axis.size() - 1) {
+            ss << ", ";
+          }
+        }
+        ss << ")";
+        if (pop->body.size() > 1) {
+          ss << ".v" << k;
+        }
+        if (auto preduce = pop->body[k].as<ReduceNode>()) {
+          ICHECK_LT(k, preduce->combiner->result.size());
+          PrimExpr combiner = preduce->combiner->result[k];
+          if (combiner->IsInstance<AddNode>()) {
+            ss << " += " << preduce->source[0] << "\n";
+          } else if (combiner->IsInstance<MaxNode>()) {
+            ss << " max= " << preduce->source[0] << "\n";
+          } else if (combiner->IsInstance<MinNode>()) {
+            ss << " min= " << preduce->source[0] << "\n";
+          } else if (combiner->IsInstance<SelectNode>()) {
+            const auto& select = combiner.as<SelectNode>();
+            ss << " select(" << select->condition << ", " << select->true_value << ", "
+               << select->false_value << ")= " << '(' << preduce->source[0] << ','
+               << preduce->source[1] << ")\n";
+          } else {
+            ss << "reduce" << combiner << "\n";
+          }
+        } else {
+          auto call = pop->body[k].as<CallNode>();
+          if (simple_mode && call) {
+            ss << " = " << call->op << "\n";
+          } else {
+            ss << " = " << pop->body[k] << "\n";
+          }
+        }
+      }
+    } else {
+      LOG(FATAL) << "Invalid op";
+    }
+  }
+  return String(ss.str());
+}
+
 State ComputeDAG::InferBound(const State& state) const {
   ICHECK(state->concrete) << "Only concrete state can be processed to get bound info.";
 
@@ -1383,51 +1439,9 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable)
 TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable)
     .set_dispatch<ComputeDAGNode>([](const ObjectRef& ref, ReprPrinter* p) {
       auto* node = static_cast<const ComputeDAGNode*>(ref.get());
-      std::stringstream ss;
-
-      for (const auto& op : node->ops) {
-        if (op->IsInstance<te::PlaceholderOpNode>()) {
-          ss << op->name << " = PLACEHOLDER " << op.output(0)->shape << "\n";
-        } else if (auto pop = op.as<te::ComputeOpNode>()) {
-          for (size_t k = 0; k < pop->body.size(); ++k) {
-            ss << op->name << "(";
-            for (size_t i = 0; i < pop->axis.size(); i++) {
-              ss << pop->axis[i]->var->name_hint;
-              if (i != pop->axis.size() - 1) {
-                ss << ", ";
-              }
-            }
-            ss << ")";
-            if (pop->body.size() > 1) {
-              ss << ".v" << k;
-            }
-            if (auto preduce = pop->body[k].as<ReduceNode>()) {
-              ICHECK_LT(k, preduce->combiner->result.size());
-              PrimExpr combiner = preduce->combiner->result[k];
-              if (combiner->IsInstance<AddNode>()) {
-                ss << " += " << preduce->source[0] << "\n";
-              } else if (combiner->IsInstance<MaxNode>()) {
-                ss << " max= " << preduce->source[0] << "\n";
-              } else if (combiner->IsInstance<MinNode>()) {
-                ss << " min= " << preduce->source[0] << "\n";
-              } else if (combiner->IsInstance<SelectNode>()) {
-                const auto& select = combiner.as<SelectNode>();
-                ss << " select(" << select->condition << ", " << select->true_value << ", "
-                   << select->false_value << ")= " << '(' << preduce->source[0] << ','
-                   << preduce->source[1] << ")\n";
-              } else {
-                ss << "reduce" << combiner << "\n";
-              }
-            } else {
-              ss << " = " << pop->body[k] << "\n";
-            }
-          }
-        } else {
-          LOG(FATAL) << "Invalid op";
-        }
-      }
-
-      p->stream << ss.str();
+      auto dag = GetRef<ComputeDAG>(node);
+      auto dag_str = dag.PrintDAG();
+      p->stream << dag_str;
     });
 
 Array<PrimExpr> GetShapeFromRewrittenLayout(String rewritten_layout, Array<String> axis_names) {
@@ -1476,6 +1490,11 @@ TVM_REGISTER_GLOBAL("auto_scheduler.ComputeDAGPrintPythonCodeFromState")
       return dag.PrintStepsAsPython(state->transform_steps);
     });
 
+TVM_REGISTER_GLOBAL("auto_scheduler.ComputeDAGPrintDAG")
+    .set_body_typed([](const ComputeDAG& dag, bool simple_mode) {
+      return dag.PrintDAG(simple_mode);
+    });
+
 TVM_REGISTER_GLOBAL("auto_scheduler.ComputeDAGInferBoundFromState")
     .set_body_typed([](const ComputeDAG& dag, const State& state) {
       return dag.InferBound(state);
diff --git a/tests/python/unittest/test_auto_scheduler_measure.py b/tests/python/unittest/test_auto_scheduler_measure.py
index e9f1fa40c8b3..3b074b273358 100644
--- a/tests/python/unittest/test_auto_scheduler_measure.py
+++ b/tests/python/unittest/test_auto_scheduler_measure.py
@@ -16,6 +16,7 @@
 # under the License.
 
 """ Test measurement and log serialization. """
+import json
 
 import multiprocessing
 import tvm
@@ -200,6 +201,38 @@ def test_recover_measure_input():
         assert str(correct_inp.state) == str(inp.state)
 
 
+def test_workload_dis_factor():
+    calc = auto_scheduler.measure_record.calc_workload_dis_factor
+
+    # Identical
+    target_wkl_key = json.dumps(
+        ["func1", [8, 3, 224, 224], [32, 3, 3, 3], [0, 0], [1, 1], "float32"]
+    )
+    assert calc(target_wkl_key, target_wkl_key) == 1
+
+    # Compatible with a factor
+    wkl_key = json.dumps(["func1", [1, 3, 112, 112], [32, 3, 3, 3], [0, 0], [1, 1], "float32"])
+    assert calc(target_wkl_key, wkl_key) == 8 * 2 * 2
+
+    # Incompatible argument with zeros
+    wkl_key = json.dumps(["func1", [8, 3, 224, 224], [32, 3, 3, 3], [1, 1], [1, 1], "float32"])
+    assert calc(target_wkl_key, wkl_key) == float("inf")
+    wkl_key = json.dumps(["func1", [8, 3, 224, 224], [32, 3, 3, 3], [0, 0], [0, 0], "float32"])
+    assert calc(target_wkl_key, wkl_key) == float("inf")
+
+    # Incompatible non-integter argument
+    wkl_key = json.dumps(["func1", [8, 3, 224, 224], [32, 3, 3, 3], [0, 0], [1, 1], "int8"])
+    assert calc(target_wkl_key, wkl_key) == float("inf")
+
+    # Incompatible function
+    wkl_key = json.dumps(["func2", [8, 3, 224, 224], [32, 3, 3, 3], [0, 0], [1, 1], "float32"])
+    assert calc(target_wkl_key, wkl_key) == float("inf")
+
+    # Incompatible due to non-dividable factor
+    wkl_key = json.dumps(["func1", [8, 3, 223, 223], [32, 3, 3, 3], [0, 0], [1, 1], "float32"])
+    assert calc(target_wkl_key, wkl_key) == float("inf")
+
+
 def test_measure_local_builder_runner():
     if not tvm.testing.device_enabled("llvm"):
         return
diff --git a/tutorials/auto_scheduler/ci_logs/resnet-18-NHWC-B1-cuda.json b/tutorials/auto_scheduler/ci_logs/resnet-18-NHWC-B1-cuda.json
index 8d0a6ae980c4..7cb3a67067b0 100644
--- a/tutorials/auto_scheduler/ci_logs/resnet-18-NHWC-B1-cuda.json
+++ b/tutorials/auto_scheduler/ci_logs/resnet-18-NHWC-B1-cuda.json
@@ -1,26 +1,26 @@
 # Provide valid schedules for resnet-18 on GPU.
 # This is used to run the tutorial on the documentation web server.
-{"i": [["[\"b32ed43fb351136894c322ee49097a1a\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["SP", 4, 1, 1000, [40], 1], ["AN", 4, 2, 6], ["FSP", 3, 1, 0, 1], ["AN", 3, 2, 6], ["CA", 3, 4, 0], ["CI", 2], ["FSP", 1, 1, 0, 1], ["AN", 1, 2, 6], ["CA", 1, 4, 0], ["AN", 4, 0, 5], ["PR", 1, 0, "auto_unroll_max_step$512"], ["PR", 3, 0, "auto_unroll_max_step$512"]]]], "r": [[4.87396e-06], 0, 1.30575, 1606984701], "v": "v0.3"}
-{"i": [["[\"d09dc1a6bb90d59c91b68989ad3492ff\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["SP", 2, 0, 1, [1, 1, 1, 1], 1], ["SP", 2, 5, 1000, [1, 50, 1, 1], 1], ["SP", 2, 10, 512, [1, 16], 1], ["RE", 2, [0, 5, 1, 6, 2, 7, 10, 11, 3, 8, 12, 4, 9]], ["FSP", 4, 0, 0, 3], ["FSP", 4, 4, 1, 3], ["RE", 4, [0, 4, 1, 5, 2, 6, 3, 7]], ["CA", 2, 4, 5], ["CHR", 1, "shared", [2]], ["CA", 2, 3, 6], ["CHR", 0, "shared", [3]], ["CA", 1, 4, 6], ["FU", 6, [0, 1]], ["AN", 6, 0, 5], ["FU", 6, [1, 2]], ["AN", 6, 1, 4], ["FU", 6, [2, 3]], ["AN", 6, 2, 6], ["FU", 3, [0, 1]], ["SP", 3, 0, 32, [1], 1], ["AN", 3, 1, 2], ["FFSP", 3, 0, [1, 0], 1, 1], ["AN", 3, 1, 6], ["FU", 1, [0, 1]], ["SP", 1, 0, 32, [1], 1], ["AN", 1, 1, 2], ["FFSP", 1, 0, [1, 0], 1, 1], ["AN", 1, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$0"]]]], "r": [[2.25155e-05], 0, 1.5128, 1606984719], "v": "v0.3"}
-{"i": [["[\"7de313da0ca29a8c63f647791692430d\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 512, [2], 1], ["AN", 2, 0, 5], ["AN", 2, 1, 6], ["FU", 1, [0, 1, 2, 3]], ["SP", 1, 0, 512, [32], 1], ["AN", 1, 0, 5], ["AN", 1, 1, 6], ["PR", 1, 0, "auto_unroll_max_step$64"]]]], "r": [[3.91068e-06], 0, 1.63708, 1606984742], "v": "v0.3"}
-{"i": [["[\"8d5a93959138dc7b2ee1f1b3219dfa14\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 15], ["CI", 13], ["CI", 11], ["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 16, [2], 1], ["SP", 8, 4, 512, [16], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [1, 2, 1, 1], 1], ["SP", 6, 5, 4, [1, 4, 1, 1], 1], ["SP", 6, 10, 16, [4, 1, 4, 1], 1], ["SP", 6, 15, 512, [2, 8, 1, 1], 1], ["SP", 6, 20, 512, [2, 1], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 13, 3], ["FSP", 7, 4, 14, 3], ["FSP", 7, 8, 15, 3], ["FSP", 7, 12, 16, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 16, [1], 1], ["SP", 4, 4, 512, [8], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 19, [0, 1, 2, 3]], ["SP", 19, 0, 25088, [32], 1], ["AN", 19, 0, 5], ["AN", 19, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 8192, [64], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 2, [2], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [16, 15, 14, 13], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 8, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [16, 15, 14, 13], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 8192, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$512"], ["PR", 8, 0, "auto_unroll_max_step$512"], ["PR", 11, 0, "auto_unroll_max_step$16"]]]], "r": [[0.000190231], 0, 1.95863, 1606984773], "v": "v0.3"}
-{"i": [["[\"ac6920940de3797cc3f9f9c260675e5d\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 11], ["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 16, [4], 1], ["SP", 8, 4, 512, [8], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [1, 1, 1, 2], 1], ["SP", 6, 5, 4, [1, 1, 1, 2], 1], ["SP", 6, 10, 16, [4, 2, 2, 1], 1], ["SP", 6, 15, 512, [1, 16, 2, 1], 1], ["SP", 6, 20, 512, [2, 1], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 11, 3], ["FSP", 7, 4, 12, 3], ["FSP", 7, 8, 13, 3], ["FSP", 7, 12, 14, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 16, [16], 1], ["SP", 4, 4, 512, [2], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 15, [0, 1, 2, 3]], ["SP", 15, 0, 25088, [32], 1], ["AN", 15, 0, 5], ["AN", 15, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 8192, [32], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 16, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [14, 13, 12, 11], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 16, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [14, 13, 12, 11], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 8192, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$512"], ["PR", 8, 0, "auto_unroll_max_step$64"], ["PR", 11, 0, "auto_unroll_max_step$1024"]]]], "r": [[0.000218188], 0, 2.05807, 1606984806], "v": "v0.3"}
-{"i": [["[\"7e83a2ee5cd5d50282ed19310700046a\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 16, [1], 1], ["SP", 8, 4, 512, [8], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [1, 1, 1, 2], 1], ["SP", 6, 5, 4, [1, 1, 1, 1], 1], ["SP", 6, 10, 16, [2, 1, 1, 8], 1], ["SP", 6, 15, 512, [1, 16, 1, 2], 1], ["SP", 6, 20, 512, [2, 8], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 10, 3], ["FSP", 7, 4, 11, 3], ["FSP", 7, 8, 12, 3], ["FSP", 7, 12, 13, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 16, [2], 1], ["SP", 4, 4, 512, [16], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 14, [0, 1, 2, 3]], ["SP", 14, 0, 25088, [32], 1], ["AN", 14, 0, 5], ["AN", 14, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 8192, [32], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 64, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [13, 12, 11, 10], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 256, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [13, 12, 11, 10], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 8192, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$1024"], ["PR", 8, 0, "auto_unroll_max_step$512"], ["PR", 11, 0, "auto_unroll_max_step$512"]]]], "r": [[0.000165484], 0, 2.76154, 1606984831], "v": "v0.3"}
-{"i": [["[\"424ba83160af31badc0b098136e1a3b0\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 13], ["CI", 11], ["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 49, [1], 1], ["SP", 8, 4, 256, [16], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [1, 2, 1, 2], 1], ["SP", 6, 5, 4, [1, 1, 1, 2], 1], ["SP", 6, 10, 49, [1, 1, 1, 7], 1], ["SP", 6, 15, 256, [1, 128, 1, 2], 1], ["SP", 6, 20, 256, [1, 4], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 12, 3], ["FSP", 7, 4, 13, 3], ["FSP", 7, 8, 14, 3], ["FSP", 7, 12, 15, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 49, [1], 1], ["SP", 4, 4, 256, [32], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 17, [0, 1, 2, 3]], ["SP", 17, 0, 50176, [32], 1], ["AN", 17, 0, 5], ["AN", 17, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 12544, [64], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 32, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [15, 14, 13, 12], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 112, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [15, 14, 13, 12], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 12544, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$64"], ["PR", 8, 0, "auto_unroll_max_step$1024"], ["PR", 11, 0, "auto_unroll_max_step$1024"]]]], "r": [[0.000157488], 0, 2.05375, 1606984883], "v": "v0.3"}
-{"i": [["[\"a169cd0053d3a7ca82998fcb62e42c58\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 11], ["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 49, [1], 1], ["SP", 8, 4, 256, [1], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [1, 2, 1, 1], 1], ["SP", 6, 5, 4, [1, 1, 1, 1], 1], ["SP", 6, 10, 49, [1, 7, 7, 1], 1], ["SP", 6, 15, 256, [1, 32, 1, 2], 1], ["SP", 6, 20, 256, [8, 4], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 11, 3], ["FSP", 7, 4, 12, 3], ["FSP", 7, 8, 13, 3], ["FSP", 7, 12, 14, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 49, [7], 1], ["SP", 4, 4, 256, [2], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 15, [0, 1, 2, 3]], ["SP", 15, 0, 50176, [32], 1], ["AN", 15, 0, 5], ["AN", 15, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 12544, [32], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 64, [2], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [14, 13, 12, 11], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 224, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [14, 13, 12, 11], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 12544, [64], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$16"], ["PR", 8, 0, "auto_unroll_max_step$64"], ["PR", 11, 0, "auto_unroll_max_step$512"]]]], "r": [[0.00011824], 0, 1.84964, 1606984912], "v": "v0.3"}
-{"i": [["[\"0141ffc4fbabc10cc5a94c954419055b\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 49, [7], 1], ["SP", 8, 4, 256, [1], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [1, 4, 1, 1], 1], ["SP", 6, 5, 4, [1, 4, 1, 1], 1], ["SP", 6, 10, 49, [1, 1, 7, 1], 1], ["SP", 6, 15, 256, [4, 8, 1, 1], 1], ["SP", 6, 20, 256, [1, 8], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 10, 3], ["FSP", 7, 4, 11, 3], ["FSP", 7, 8, 12, 3], ["FSP", 7, 12, 13, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 49, [49], 1], ["SP", 4, 4, 256, [8], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 14, [0, 1, 2, 3]], ["SP", 14, 0, 50176, [32], 1], ["AN", 14, 0, 5], ["AN", 14, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 12544, [32], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 8, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [13, 12, 11, 10], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 56, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [13, 12, 11, 10], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 12544, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$1024"], ["PR", 8, 0, "auto_unroll_max_step$0"], ["PR", 11, 0, "auto_unroll_max_step$16"]]]], "r": [[8.67244e-05], 0, 1.93124, 1606984935], "v": "v0.3"}
-{"i": [["[\"81aae4b8e2c076a4014d403e8a2c70a1\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1, 1], 1], ["SP", 3, 5, 14, [2, 7, 1, 1], 1], ["SP", 3, 10, 14, [1, 7, 2, 1], 1], ["SP", 3, 15, 256, [2, 2, 1, 4], 1], ["SP", 3, 20, 3, [3, 1], 1], ["SP", 3, 23, 3, [1, 3], 1], ["SP", 3, 26, 128, [4, 1], 1], ["RE", 3, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 23, 26, 21, 24, 27, 3, 8, 13, 18, 22, 25, 28, 4, 9, 14, 19]], ["FSP", 6, 0, 1, 3], ["FSP", 6, 4, 2, 3], ["FSP", 6, 8, 3, 3], ["FSP", 6, 12, 4, 3], ["RE", 6, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 3, 6, 11], ["CHR", 2, "shared", [3]], ["CA", 3, 4, 14], ["CHR", 1, "shared", [4]], ["CA", 2, 5, 14], ["CI", 1], ["FU", 8, [0, 1, 2, 3]], ["AN", 8, 0, 5], ["FU", 8, [1, 2, 3, 4]], ["AN", 8, 1, 4], ["FU", 8, [2, 3, 4, 5]], ["AN", 8, 2, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 96, [1], 1], ["AN", 4, 1, 2], ["FFSP", 4, 0, [4, 3, 2, 1], 1, 1], ["AN", 4, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 36, [1], 1], ["AN", 2, 1, 2], ["FFSP", 2, 0, [4, 3, 2, 1], 1, 1], ["AN", 2, 1, 6], ["PR", 5, 0, "auto_unroll_max_step$64"]]]], "r": [[9.20105e-05], 0, 1.88263, 1606984952], "v": "v0.3"}
-{"i": [["[\"c7a6b56bdc04b94c829fb2ef9874019e\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 13], ["CI", 11], ["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 196, [14], 1], ["SP", 8, 4, 128, [1], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [1, 1, 1, 1], 1], ["SP", 6, 5, 4, [1, 4, 1, 1], 1], ["SP", 6, 10, 196, [1, 7, 1, 7], 1], ["SP", 6, 15, 128, [1, 4, 1, 16], 1], ["SP", 6, 20, 128, [1, 4], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 12, 3], ["FSP", 7, 4, 13, 3], ["FSP", 7, 8, 14, 3], ["FSP", 7, 12, 15, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 196, [2], 1], ["SP", 4, 4, 128, [32], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 17, [0, 1, 2, 3]], ["SP", 17, 0, 100352, [32], 1], ["AN", 17, 0, 5], ["AN", 17, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 25088, [32], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 64, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [15, 14, 13, 12], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 28, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [15, 14, 13, 12], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 25088, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$16"], ["PR", 8, 0, "auto_unroll_max_step$1024"], ["PR", 11, 0, "auto_unroll_max_step$512"]]]], "r": [[0.000102747], 0, 2.2858, 1606984979], "v": "v0.3"}
-{"i": [["[\"c035cc8b0568a8e054d06bd7f4950550\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 11], ["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 196, [2], 1], ["SP", 8, 4, 128, [4], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [1, 1, 1, 4], 1], ["SP", 6, 5, 4, [1, 1, 1, 2], 1], ["SP", 6, 10, 196, [2, 49, 1, 1], 1], ["SP", 6, 15, 128, [2, 8, 1, 1], 1], ["SP", 6, 20, 128, [2, 4], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 11, 3], ["FSP", 7, 4, 12, 3], ["FSP", 7, 8, 13, 3], ["FSP", 7, 12, 14, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 196, [2], 1], ["SP", 4, 4, 128, [2], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 15, [0, 1, 2, 3]], ["SP", 15, 0, 100352, [32], 1], ["AN", 15, 0, 5], ["AN", 15, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 25088, [32], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 64, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [14, 13, 12, 11], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 64, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [14, 13, 12, 11], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 25088, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$16"], ["PR", 8, 0, "auto_unroll_max_step$512"], ["PR", 11, 0, "auto_unroll_max_step$0"]]]], "r": [[0.000133211], 0, 2.07337, 1606985017], "v": "v0.3"}
-{"i": [["[\"c5ee3e05edd9754492d0763aa41fd025\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 196, [2], 1], ["SP", 8, 4, 128, [4], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [1, 2, 1, 1], 1], ["SP", 6, 5, 4, [1, 4, 1, 1], 1], ["SP", 6, 10, 196, [1, 2, 7, 1], 1], ["SP", 6, 15, 128, [1, 2, 2, 2], 1], ["SP", 6, 20, 128, [2, 1], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 10, 3], ["FSP", 7, 4, 11, 3], ["FSP", 7, 8, 12, 3], ["FSP", 7, 12, 13, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 196, [28], 1], ["SP", 4, 4, 128, [64], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 14, [0, 1, 2, 3]], ["SP", 14, 0, 100352, [32], 1], ["AN", 14, 0, 5], ["AN", 14, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 25088, [32], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 16, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [13, 12, 11, 10], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 28, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [13, 12, 11, 10], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 25088, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$1024"], ["PR", 8, 0, "auto_unroll_max_step$512"], ["PR", 11, 0, "auto_unroll_max_step$16"]]]], "r": [[0.000150142], 0, 1.90539, 1606985042], "v": "v0.3"}
-{"i": [["[\"022ebb6b7c55c5ed030421380ec83a04\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1, 1], 1], ["SP", 3, 5, 28, [1, 1, 2, 1], 1], ["SP", 3, 10, 28, [1, 7, 2, 2], 1], ["SP", 3, 15, 128, [1, 8, 8, 1], 1], ["SP", 3, 20, 3, [3, 1], 1], ["SP", 3, 23, 3, [1, 1], 1], ["SP", 3, 26, 64, [4, 2], 1], ["RE", 3, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 23, 26, 21, 24, 27, 3, 8, 13, 18, 22, 25, 28, 4, 9, 14, 19]], ["FSP", 6, 0, 1, 3], ["FSP", 6, 4, 2, 3], ["FSP", 6, 8, 3, 3], ["FSP", 6, 12, 4, 3], ["RE", 6, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 3, 6, 11], ["CHR", 2, "shared", [3]], ["CA", 3, 4, 14], ["CHR", 1, "shared", [4]], ["CA", 2, 5, 14], ["CI", 1], ["FU", 8, [0, 1, 2, 3]], ["AN", 8, 0, 5], ["FU", 8, [1, 2, 3, 4]], ["AN", 8, 1, 4], ["FU", 8, [2, 3, 4, 5]], ["AN", 8, 2, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 576, [1], 1], ["AN", 4, 1, 2], ["FFSP", 4, 0, [4, 3, 2, 1], 1, 1], ["AN", 4, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 360, [1], 1], ["AN", 2, 1, 2], ["FFSP", 2, 0, [4, 3, 2, 1], 1, 1], ["AN", 2, 1, 6], ["PR", 5, 0, "auto_unroll_max_step$64"]]]], "r": [[0.000101548], 0, 1.92449, 1606985059], "v": "v0.3"}
-{"i": [["[\"de0df0893e01892cfe69f7bc2c24111f\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 13], ["CI", 11], ["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 196, [2], 1], ["SP", 8, 4, 64, [1], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 6, [1, 1, 1, 1], 1], ["SP", 6, 5, 6, [1, 1, 1, 2], 1], ["SP", 6, 10, 196, [2, 14, 1, 1], 1], ["SP", 6, 15, 64, [2, 2, 4, 1], 1], ["SP", 6, 20, 64, [4, 2], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 12, 3], ["FSP", 7, 4, 13, 3], ["FSP", 7, 8, 14, 3], ["FSP", 7, 12, 15, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 196, [7], 1], ["SP", 4, 4, 64, [64], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 17, [0, 1, 2, 3]], ["SP", 17, 0, 200704, [32], 1], ["AN", 17, 0, 5], ["AN", 17, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 12544, [32], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 64, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [15, 14, 13, 12], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 16, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [15, 14, 13, 12], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 12544, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$0"], ["PR", 8, 0, "auto_unroll_max_step$0"], ["PR", 11, 0, "auto_unroll_max_step$512"]]]], "r": [[5.64548e-05], 0, 3.15692, 1606985088], "v": "v0.3"}
-{"i": [["[\"f2e3c09a00e7d0a9897f70497e089f1e\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 11], ["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 196, [2], 1], ["SP", 8, 4, 64, [64], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 6, [1, 3, 2, 1], 1], ["SP", 6, 5, 6, [1, 3, 1, 2], 1], ["SP", 6, 10, 196, [1, 1, 4, 1], 1], ["SP", 6, 15, 64, [1, 8, 1, 4], 1], ["SP", 6, 20, 64, [1, 2], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 11, 3], ["FSP", 7, 4, 12, 3], ["FSP", 7, 8, 13, 3], ["FSP", 7, 12, 14, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 196, [1], 1], ["SP", 4, 4, 64, [4], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 15, [0, 1, 2, 3]], ["SP", 15, 0, 200704, [32], 1], ["AN", 15, 0, 5], ["AN", 15, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 12544, [32], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 128, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [14, 13, 12, 11], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 128, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [14, 13, 12, 11], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 12544, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$0"], ["PR", 8, 0, "auto_unroll_max_step$1024"], ["PR", 11, 0, "auto_unroll_max_step$64"]]]], "r": [[0.000135574], 0, 2.88002, 1606985120], "v": "v0.3"}
-{"i": [["[\"fa26946d7ac51126bfa859cb183f9ca1\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 196, [49], 1], ["SP", 8, 4, 64, [2], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 6, [1, 2, 1, 3], 1], ["SP", 6, 5, 6, [1, 3, 1, 2], 1], ["SP", 6, 10, 196, [1, 1, 1, 4], 1], ["SP", 6, 15, 64, [1, 8, 1, 1], 1], ["SP", 6, 20, 64, [4, 2], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 10, 3], ["FSP", 7, 4, 11, 3], ["FSP", 7, 8, 12, 3], ["FSP", 7, 12, 13, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 196, [14], 1], ["SP", 4, 4, 64, [16], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 14, [0, 1, 2, 3]], ["SP", 14, 0, 200704, [32], 1], ["AN", 14, 0, 5], ["AN", 14, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 12544, [32], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 48, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [13, 12, 11, 10], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 96, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [13, 12, 11, 10], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 12544, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$1024"], ["PR", 8, 0, "auto_unroll_max_step$16"], ["PR", 11, 0, "auto_unroll_max_step$512"]]]], "r": [[0.000115802], 0, 4.06441, 1606985158], "v": "v0.3"}
-{"i": [["[\"ba2026d923536b75e9b4faed89287d5f\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 4], ["CI", 1], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 200704, [64], 1], ["AN", 5, 0, 5], ["AN", 5, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 200704, [64], 1], ["AN", 2, 0, 5], ["AN", 2, 1, 6], ["PR", 2, 0, "auto_unroll_max_step$16"]]]], "r": [[2.00968e-05], 0, 1.53065, 1606985193], "v": "v0.3"}
-{"i": [["[\"a0eb8d6048282a4a0986cc2ccf14eaa2\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1, 1], 1], ["SP", 3, 5, 112, [1, 2, 7, 1], 1], ["SP", 3, 10, 112, [1, 7, 1, 1], 1], ["SP", 3, 15, 64, [1, 8, 4, 1], 1], ["SP", 3, 20, 7, [7, 1], 1], ["SP", 3, 23, 7, [1, 7], 1], ["SP", 3, 26, 3, [3, 1], 1], ["RE", 3, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 23, 26, 21, 24, 27, 3, 8, 13, 18, 22, 25, 28, 4, 9, 14, 19]], ["FSP", 6, 0, 1, 3], ["FSP", 6, 4, 2, 3], ["FSP", 6, 8, 3, 3], ["FSP", 6, 12, 4, 3], ["RE", 6, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 3, 6, 11], ["CHR", 2, "shared", [3]], ["CA", 3, 4, 14], ["CHR", 1, "shared", [4]], ["CA", 2, 5, 14], ["CI", 1], ["FU", 8, [0, 1, 2, 3]], ["AN", 8, 0, 5], ["FU", 8, [1, 2, 3, 4]], ["AN", 8, 1, 4], ["FU", 8, [2, 3, 4, 5]], ["AN", 8, 2, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 84, [1], 1], ["AN", 4, 1, 2], ["FFSP", 4, 0, [4, 3, 2, 1], 1, 1], ["AN", 4, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 273, [1], 1], ["AN", 2, 1, 2], ["FFSP", 2, 0, [4, 3, 2, 1], 1, 1], ["AN", 2, 1, 6], ["PR", 5, 0, "auto_unroll_max_step$512"]]]], "r": [[7.14326e-05], 0, 2.05623, 1606985220], "v": "v0.3"}
-{"i": [["[\"bf78a7bf0209980f72953637dfd14a6f\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CHW", 3, "local"], ["SP", 3, 0, 1, [1, 1, 1, 1], 1], ["SP", 3, 5, 56, [1, 2, 2, 2], 1], ["SP", 3, 10, 56, [1, 7, 1, 2], 1], ["SP", 3, 15, 64, [1, 16, 1, 4], 1], ["SP", 3, 20, 1, [1, 1], 1], ["SP", 3, 23, 1, [1, 1], 1], ["SP", 3, 26, 64, [2, 8], 1], ["RE", 3, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 23, 26, 21, 24, 27, 3, 8, 13, 18, 22, 25, 28, 4, 9, 14, 19]], ["FSP", 4, 0, 1, 3], ["FSP", 4, 4, 2, 3], ["FSP", 4, 8, 3, 3], ["FSP", 4, 12, 4, 3], ["RE", 4, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 3, 4, 11], ["CHR", 2, "shared", [3]], ["CA", 3, 4, 14], ["CHR", 1, "shared", [4]], ["CA", 2, 5, 14], ["CI", 1], ["FU", 6, [0, 1, 2, 3]], ["AN", 6, 0, 5], ["FU", 6, [1, 2, 3, 4]], ["AN", 6, 1, 4], ["FU", 6, [2, 3, 4, 5]], ["AN", 6, 2, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 64, [1], 1], ["AN", 4, 1, 2], ["FFSP", 4, 0, [4, 3, 2, 1], 1, 1], ["AN", 4, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 256, [1], 1], ["AN", 2, 1, 2], ["FFSP", 2, 0, [4, 3, 2, 1], 1, 1], ["AN", 2, 1, 6], ["PR", 5, 0, "auto_unroll_max_step$512"]]]], "r": [[1.17113e-05], 0, 1.9863, 1606985239], "v": "v0.3"}
-{"i": [["[\"6630936c26852f2b89dbfa2ff37fbb9c\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CHW", 3, "local"], ["SP", 3, 0, 1, [1, 1, 1, 1], 1], ["SP", 3, 5, 28, [1, 7, 1, 1], 1], ["SP", 3, 10, 28, [1, 2, 1, 7], 1], ["SP", 3, 15, 128, [8, 8, 1, 1], 1], ["SP", 3, 20, 1, [1, 1], 1], ["SP", 3, 23, 1, [1, 1], 1], ["SP", 3, 26, 64, [2, 2], 1], ["RE", 3, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 23, 26, 21, 24, 27, 3, 8, 13, 18, 22, 25, 28, 4, 9, 14, 19]], ["FSP", 4, 0, 1, 3], ["FSP", 4, 4, 2, 3], ["FSP", 4, 8, 3, 3], ["FSP", 4, 12, 4, 3], ["RE", 4, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 3, 4, 11], ["CHR", 2, "shared", [3]], ["CA", 3, 4, 14], ["CHR", 1, "shared", [4]], ["CA", 2, 5, 14], ["CI", 1], ["FU", 6, [0, 1, 2, 3]], ["AN", 6, 0, 5], ["FU", 6, [1, 2, 3, 4]], ["AN", 6, 1, 4], ["FU", 6, [2, 3, 4, 5]], ["AN", 6, 2, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 16, [2], 1], ["AN", 4, 1, 2], ["FFSP", 4, 0, [4, 3, 2, 1], 1, 1], ["AN", 4, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 208, [2], 1], ["AN", 2, 1, 2], ["FFSP", 2, 0, [4, 3, 2, 1], 1, 1], ["AN", 2, 1, 6], ["PR", 5, 0, "auto_unroll_max_step$16"]]]], "r": [[1.76965e-05], 0, 1.63284, 1606985253], "v": "v0.3"}
-{"i": [["[\"ba5f918733ccbbd4a1d7fd3724665a2f\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CHW", 3, "local"], ["SP", 3, 0, 1, [1, 1, 1, 1], 1], ["SP", 3, 5, 14, [1, 1, 1, 1], 1], ["SP", 3, 10, 14, [2, 1, 7, 1], 1], ["SP", 3, 15, 256, [2, 64, 1, 2], 1], ["SP", 3, 20, 1, [1, 1], 1], ["SP", 3, 23, 1, [1, 1], 1], ["SP", 3, 26, 128, [1, 2], 1], ["RE", 3, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 23, 26, 21, 24, 27, 3, 8, 13, 18, 22, 25, 28, 4, 9, 14, 19]], ["FSP", 4, 0, 1, 3], ["FSP", 4, 4, 2, 3], ["FSP", 4, 8, 3, 3], ["FSP", 4, 12, 4, 3], ["RE", 4, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 3, 4, 11], ["CHR", 2, "shared", [3]], ["CA", 3, 4, 14], ["CHR", 1, "shared", [4]], ["CA", 2, 5, 14], ["CI", 1], ["FU", 6, [0, 1, 2, 3]], ["AN", 6, 0, 5], ["FU", 6, [1, 2, 3, 4]], ["AN", 6, 1, 4], ["FU", 6, [2, 3, 4, 5]], ["AN", 6, 2, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 8, [2], 1], ["AN", 4, 1, 2], ["FFSP", 4, 0, [4, 3, 2, 1], 1, 1], ["AN", 4, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 52, [1], 1], ["AN", 2, 1, 2], ["FFSP", 2, 0, [4, 3, 2, 1], 1, 1], ["AN", 2, 1, 6], ["PR", 5, 0, "auto_unroll_max_step$0"]]]], "r": [[3.05015e-05], 0, 1.59532, 1606985280], "v": "v0.3"}
-{"i": [["[\"21ad409d72953de188314010134e3acd\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CHW", 3, "local"], ["SP", 3, 0, 1, [1, 1, 1, 1], 1], ["SP", 3, 5, 7, [1, 1, 7, 1], 1], ["SP", 3, 10, 7, [1, 1, 1, 1], 1], ["SP", 3, 15, 512, [4, 128, 1, 1], 1], ["SP", 3, 20, 1, [1, 1], 1], ["SP", 3, 23, 1, [1, 1], 1], ["SP", 3, 26, 256, [1, 16], 1], ["RE", 3, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 23, 26, 21, 24, 27, 3, 8, 13, 18, 22, 25, 28, 4, 9, 14, 19]], ["FSP", 4, 0, 1, 3], ["FSP", 4, 4, 2, 3], ["FSP", 4, 8, 3, 3], ["FSP", 4, 12, 4, 3], ["RE", 4, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 3, 4, 11], ["CHR", 2, "shared", [3]], ["CA", 3, 4, 14], ["CHR", 1, "shared", [4]], ["CA", 2, 5, 14], ["CI", 1], ["FU", 6, [0, 1, 2, 3]], ["AN", 6, 0, 5], ["FU", 6, [1, 2, 3, 4]], ["AN", 6, 1, 4], ["FU", 6, [2, 3, 4, 5]], ["AN", 6, 2, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 16, [1], 1], ["AN", 4, 1, 2], ["FFSP", 4, 0, [4, 3, 2, 1], 1, 1], ["AN", 4, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 2704, [1], 1], ["AN", 2, 1, 2], ["FFSP", 2, 0, [4, 3, 2, 1], 1, 1], ["AN", 2, 1, 6], ["PR", 5, 0, "auto_unroll_max_step$512"]]]], "r": [[2.18808e-05], 0, 1.88033, 1606985298], "v": "v0.3"}
-{"i": [["[\"1f6cd3637ec856bf5cf5010a623eed05\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1, 1], 1], ["SP", 3, 5, 7, [7, 1, 1, 1], 1], ["SP", 3, 10, 7, [1, 7, 1, 1], 1], ["SP", 3, 15, 512, [1, 4, 1, 1], 1], ["SP", 3, 20, 3, [1, 3], 1], ["SP", 3, 23, 3, [1, 3], 1], ["SP", 3, 26, 256, [8, 2], 1], ["RE", 3, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 23, 26, 21, 24, 27, 3, 8, 13, 18, 22, 25, 28, 4, 9, 14, 19]], ["FSP", 6, 0, 1, 3], ["FSP", 6, 4, 2, 3], ["FSP", 6, 8, 3, 3], ["FSP", 6, 12, 4, 3], ["RE", 6, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 3, 6, 11], ["CHR", 2, "shared", [3]], ["CA", 3, 4, 14], ["CHR", 1, "shared", [4]], ["CA", 2, 5, 14], ["CI", 1], ["FU", 8, [0, 1, 2, 3]], ["AN", 8, 0, 5], ["FU", 8, [1, 2, 3, 4]], ["AN", 8, 1, 4], ["FU", 8, [2, 3, 4, 5]], ["AN", 8, 2, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 144, [1], 1], ["AN", 4, 1, 2], ["FFSP", 4, 0, [4, 3, 2, 1], 1, 1], ["AN", 4, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 144, [1], 1], ["AN", 2, 1, 2], ["FFSP", 2, 0, [4, 3, 2, 1], 1, 1], ["AN", 2, 1, 6], ["PR", 5, 0, "auto_unroll_max_step$512"]]]], "r": [[0.000190239], 0, 2.28266, 1606985323], "v": "v0.3"}
+{"i": [["[\"d7b65649a4dd54becea0a52aabbc5af5\", 1, 1000, 1, 1000]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0], [[], [["SP", 4, 1, 1000, [40], 1], ["AN", 4, 2, 6], ["FSP", 3, 1, 0, 1], ["AN", 3, 2, 6], ["CA", 3, 4, 0], ["CI", 2], ["FSP", 1, 1, 0, 1], ["AN", 1, 2, 6], ["CA", 1, 4, 0], ["AN", 4, 0, 5], ["PR", 1, 0, "auto_unroll_max_step$512"], ["PR", 3, 0, "auto_unroll_max_step$512"]]]], "r": [[4.87396e-06], 0, 1.30575, 1606984701], "v": "v0.5"}
+{"i": [["[\"9847f8cc0b305137f49f2c5c0c8ab25d\", 1, 512, 1000, 512, 1000, 1, 1000]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0], [[], [["SP", 2, 0, 1, [1, 1, 1, 1], 1], ["SP", 2, 5, 1000, [1, 50, 1, 1], 1], ["SP", 2, 10, 512, [1, 16], 1], ["RE", 2, [0, 5, 1, 6, 2, 7, 10, 11, 3, 8, 12, 4, 9]], ["FSP", 4, 0, 0, 3], ["FSP", 4, 4, 1, 3], ["RE", 4, [0, 4, 1, 5, 2, 6, 3, 7]], ["CA", 2, 4, 5], ["CHR", 1, "shared", [2]], ["CA", 2, 3, 6], ["CHR", 0, "shared", [3]], ["CA", 1, 4, 6], ["FU", 6, [0, 1]], ["AN", 6, 0, 5], ["FU", 6, [1, 2]], ["AN", 6, 1, 4], ["FU", 6, [2, 3]], ["AN", 6, 2, 6], ["FU", 3, [0, 1]], ["SP", 3, 0, 32, [1], 1], ["AN", 3, 1, 2], ["FFSP", 3, 0, [1, 0], 1, 1], ["AN", 3, 1, 6], ["FU", 1, [0, 1]], ["SP", 1, 0, 32, [1], 1], ["AN", 1, 1, 2], ["FFSP", 1, 0, [1, 0], 1, 1], ["AN", 1, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$0"]]]], "r": [[2.25155e-05], 0, 1.5128, 1606984719], "v": "v0.5"}
+{"i": [["[\"69115f188984ae34ede37c3b8ca40b43\", 1, 7, 7, 512, 1, 1, 1, 512]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0], [[], [["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 512, [2], 1], ["AN", 2, 0, 5], ["AN", 2, 1, 6], ["FU", 1, [0, 1, 2, 3]], ["SP", 1, 0, 512, [32], 1], ["AN", 1, 0, 5], ["AN", 1, 1, 6], ["PR", 1, 0, "auto_unroll_max_step$64"]]]], "r": [[3.91068e-06], 0, 1.63708, 1606984742], "v": "v0.5"}
+{"i": [["[\"ad6cecbf5d85cb1cda3c2bb7af170211\", 1, 7, 7, 512, 4, 4, 512, 512, 1, 7, 7, 512, 1, 1, 1, 512, 1, 1, 1, 512, 1, 7, 7, 512]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0], [[], [["CI", 15], ["CI", 13], ["CI", 11], ["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 16, [2], 1], ["SP", 8, 4, 512, [16], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [1, 2, 1, 1], 1], ["SP", 6, 5, 4, [1, 4, 1, 1], 1], ["SP", 6, 10, 16, [4, 1, 4, 1], 1], ["SP", 6, 15, 512, [2, 8, 1, 1], 1], ["SP", 6, 20, 512, [2, 1], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 13, 3], ["FSP", 7, 4, 14, 3], ["FSP", 7, 8, 15, 3], ["FSP", 7, 12, 16, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 16, [1], 1], ["SP", 4, 4, 512, [8], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 19, [0, 1, 2, 3]], ["SP", 19, 0, 25088, [32], 1], ["AN", 19, 0, 5], ["AN", 19, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 8192, [64], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 2, [2], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [16, 15, 14, 13], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 8, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [16, 15, 14, 13], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 8192, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$512"], ["PR", 8, 0, "auto_unroll_max_step$512"], ["PR", 11, 0, "auto_unroll_max_step$16"]]]], "r": [[0.000190231], 0, 1.95863, 1606984773], "v": "v0.5"}
+{"i": [["[\"3a69f9fbc63760d99e36b4c17b3bfc57\", 1, 7, 7, 512, 4, 4, 512, 512, 1, 1, 1, 512, 1, 7, 7, 512]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0], [[], [["CI", 11], ["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 16, [4], 1], ["SP", 8, 4, 512, [8], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [1, 1, 1, 2], 1], ["SP", 6, 5, 4, [1, 1, 1, 2], 1], ["SP", 6, 10, 16, [4, 2, 2, 1], 1], ["SP", 6, 15, 512, [1, 16, 2, 1], 1], ["SP", 6, 20, 512, [2, 1], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 11, 3], ["FSP", 7, 4, 12, 3], ["FSP", 7, 8, 13, 3], ["FSP", 7, 12, 14, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 16, [16], 1], ["SP", 4, 4, 512, [2], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 15, [0, 1, 2, 3]], ["SP", 15, 0, 25088, [32], 1], ["AN", 15, 0, 5], ["AN", 15, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 8192, [32], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 16, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [14, 13, 12, 11], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 16, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [14, 13, 12, 11], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 8192, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$512"], ["PR", 8, 0, "auto_unroll_max_step$64"], ["PR", 11, 0, "auto_unroll_max_step$1024"]]]], "r": [[0.000218188], 0, 2.05807, 1606984806], "v": "v0.5"}
+{"i": [["[\"d730bcd28f0920f6b97245e2a11bd8d6\", 1, 7, 7, 512, 4, 4, 512, 512, 1, 7, 7, 512, 1, 7, 7, 512]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0], [[], [["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 16, [1], 1], ["SP", 8, 4, 512, [8], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [1, 1, 1, 2], 1], ["SP", 6, 5, 4, [1, 1, 1, 1], 1], ["SP", 6, 10, 16, [2, 1, 1, 8], 1], ["SP", 6, 15, 512, [1, 16, 1, 2], 1], ["SP", 6, 20, 512, [2, 8], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 10, 3], ["FSP", 7, 4, 11, 3], ["FSP", 7, 8, 12, 3], ["FSP", 7, 12, 13, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 16, [2], 1], ["SP", 4, 4, 512, [16], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 14, [0, 1, 2, 3]], ["SP", 14, 0, 25088, [32], 1], ["AN", 14, 0, 5], ["AN", 14, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 8192, [32], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 64, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [13, 12, 11, 10], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 256, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [13, 12, 11, 10], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 8192, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$1024"], ["PR", 8, 0, "auto_unroll_max_step$512"], ["PR", 11, 0, "auto_unroll_max_step$512"]]]], "r": [[0.000165484], 0, 2.76154, 1606984831], "v": "v0.5"}
+{"i": [["[\"f3b6c10fcc6ce01ff01add933e4d21e9\", 1, 14, 14, 256, 4, 4, 256, 256, 1, 14, 14, 256, 1, 1, 1, 256, 1, 14, 14, 256]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0], [[], [["CI", 13], ["CI", 11], ["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 49, [1], 1], ["SP", 8, 4, 256, [16], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [1, 2, 1, 2], 1], ["SP", 6, 5, 4, [1, 1, 1, 2], 1], ["SP", 6, 10, 49, [1, 1, 1, 7], 1], ["SP", 6, 15, 256, [1, 128, 1, 2], 1], ["SP", 6, 20, 256, [1, 4], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 12, 3], ["FSP", 7, 4, 13, 3], ["FSP", 7, 8, 14, 3], ["FSP", 7, 12, 15, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 49, [1], 1], ["SP", 4, 4, 256, [32], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 17, [0, 1, 2, 3]], ["SP", 17, 0, 50176, [32], 1], ["AN", 17, 0, 5], ["AN", 17, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 12544, [64], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 32, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [15, 14, 13, 12], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 112, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [15, 14, 13, 12], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 12544, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$64"], ["PR", 8, 0, "auto_unroll_max_step$1024"], ["PR", 11, 0, "auto_unroll_max_step$1024"]]]], "r": [[0.000157488], 0, 2.05375, 1606984883], "v": "v0.5"}
+{"i": [["[\"b8b52b9be9df6102466a22a014c44c1f\", 1, 14, 14, 256, 4, 4, 256, 256, 1, 1, 1, 256, 1, 14, 14, 256]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0], [[], [["CI", 11], ["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 49, [1], 1], ["SP", 8, 4, 256, [1], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [1, 2, 1, 1], 1], ["SP", 6, 5, 4, [1, 1, 1, 1], 1], ["SP", 6, 10, 49, [1, 7, 7, 1], 1], ["SP", 6, 15, 256, [1, 32, 1, 2], 1], ["SP", 6, 20, 256, [8, 4], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 11, 3], ["FSP", 7, 4, 12, 3], ["FSP", 7, 8, 13, 3], ["FSP", 7, 12, 14, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 49, [7], 1], ["SP", 4, 4, 256, [2], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 15, [0, 1, 2, 3]], ["SP", 15, 0, 50176, [32], 1], ["AN", 15, 0, 5], ["AN", 15, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 12544, [32], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 64, [2], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [14, 13, 12, 11], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 224, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [14, 13, 12, 11], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 12544, [64], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$16"], ["PR", 8, 0, "auto_unroll_max_step$64"], ["PR", 11, 0, "auto_unroll_max_step$512"]]]], "r": [[0.00011824], 0, 1.84964, 1606984912], "v": "v0.5"}
+{"i": [["[\"d374e472bd9d8164892b9e28a0a8cb59\", 1, 14, 14, 256, 4, 4, 256, 256, 1, 14, 14, 256, 1, 14, 14, 256]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0], [[], [["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 49, [7], 1], ["SP", 8, 4, 256, [1], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [1, 4, 1, 1], 1], ["SP", 6, 5, 4, [1, 4, 1, 1], 1], ["SP", 6, 10, 49, [1, 1, 7, 1], 1], ["SP", 6, 15, 256, [4, 8, 1, 1], 1], ["SP", 6, 20, 256, [1, 8], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 10, 3], ["FSP", 7, 4, 11, 3], ["FSP", 7, 8, 12, 3], ["FSP", 7, 12, 13, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 49, [49], 1], ["SP", 4, 4, 256, [8], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 14, [0, 1, 2, 3]], ["SP", 14, 0, 50176, [32], 1], ["AN", 14, 0, 5], ["AN", 14, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 12544, [32], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 8, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [13, 12, 11, 10], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 56, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [13, 12, 11, 10], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 12544, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$1024"], ["PR", 8, 0, "auto_unroll_max_step$0"], ["PR", 11, 0, "auto_unroll_max_step$16"]]]], "r": [[8.67244e-05], 0, 1.93124, 1606984935], "v": "v0.5"}
+{"i": [["[\"12b88bedece6984af589a28b43e0f3c4\", 1, 28, 28, 128, 3, 3, 128, 256, 1, 1, 1, 256, 1, 14, 14, 256]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1, 1], 1], ["SP", 3, 5, 14, [2, 7, 1, 1], 1], ["SP", 3, 10, 14, [1, 7, 2, 1], 1], ["SP", 3, 15, 256, [2, 2, 1, 4], 1], ["SP", 3, 20, 3, [3, 1], 1], ["SP", 3, 23, 3, [1, 3], 1], ["SP", 3, 26, 128, [4, 1], 1], ["RE", 3, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 23, 26, 21, 24, 27, 3, 8, 13, 18, 22, 25, 28, 4, 9, 14, 19]], ["FSP", 6, 0, 1, 3], ["FSP", 6, 4, 2, 3], ["FSP", 6, 8, 3, 3], ["FSP", 6, 12, 4, 3], ["RE", 6, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 3, 6, 11], ["CHR", 2, "shared", [3]], ["CA", 3, 4, 14], ["CHR", 1, "shared", [4]], ["CA", 2, 5, 14], ["CI", 1], ["FU", 8, [0, 1, 2, 3]], ["AN", 8, 0, 5], ["FU", 8, [1, 2, 3, 4]], ["AN", 8, 1, 4], ["FU", 8, [2, 3, 4, 5]], ["AN", 8, 2, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 96, [1], 1], ["AN", 4, 1, 2], ["FFSP", 4, 0, [4, 3, 2, 1], 1, 1], ["AN", 4, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 36, [1], 1], ["AN", 2, 1, 2], ["FFSP", 2, 0, [4, 3, 2, 1], 1, 1], ["AN", 2, 1, 6], ["PR", 5, 0, "auto_unroll_max_step$64"]]]], "r": [[9.20105e-05], 0, 1.88263, 1606984952], "v": "v0.5"}
+{"i": [["[\"c4500b4e2fd04e695c32d2f31bbdc14a\", 1, 28, 28, 128, 4, 4, 128, 128, 1, 28, 28, 128, 1, 1, 1, 128, 1, 28, 28, 128]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0], [[], [["CI", 13], ["CI", 11], ["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 196, [14], 1], ["SP", 8, 4, 128, [1], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [1, 1, 1, 1], 1], ["SP", 6, 5, 4, [1, 4, 1, 1], 1], ["SP", 6, 10, 196, [1, 7, 1, 7], 1], ["SP", 6, 15, 128, [1, 4, 1, 16], 1], ["SP", 6, 20, 128, [1, 4], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 12, 3], ["FSP", 7, 4, 13, 3], ["FSP", 7, 8, 14, 3], ["FSP", 7, 12, 15, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 196, [2], 1], ["SP", 4, 4, 128, [32], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 17, [0, 1, 2, 3]], ["SP", 17, 0, 100352, [32], 1], ["AN", 17, 0, 5], ["AN", 17, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 25088, [32], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 64, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [15, 14, 13, 12], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 28, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [15, 14, 13, 12], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 25088, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$16"], ["PR", 8, 0, "auto_unroll_max_step$1024"], ["PR", 11, 0, "auto_unroll_max_step$512"]]]], "r": [[0.000102747], 0, 2.2858, 1606984979], "v": "v0.5"}
+{"i": [["[\"e4cdf917b876dbdd64488c3818d9c141\", 1, 28, 28, 128, 4, 4, 128, 128, 1, 1, 1, 128, 1, 28, 28, 128]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0], [[], [["CI", 11], ["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 196, [2], 1], ["SP", 8, 4, 128, [4], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [1, 1, 1, 4], 1], ["SP", 6, 5, 4, [1, 1, 1, 2], 1], ["SP", 6, 10, 196, [2, 49, 1, 1], 1], ["SP", 6, 15, 128, [2, 8, 1, 1], 1], ["SP", 6, 20, 128, [2, 4], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 11, 3], ["FSP", 7, 4, 12, 3], ["FSP", 7, 8, 13, 3], ["FSP", 7, 12, 14, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 196, [2], 1], ["SP", 4, 4, 128, [2], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 15, [0, 1, 2, 3]], ["SP", 15, 0, 100352, [32], 1], ["AN", 15, 0, 5], ["AN", 15, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 25088, [32], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 64, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [14, 13, 12, 11], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 64, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [14, 13, 12, 11], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 25088, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$16"], ["PR", 8, 0, "auto_unroll_max_step$512"], ["PR", 11, 0, "auto_unroll_max_step$0"]]]], "r": [[0.000133211], 0, 2.07337, 1606985017], "v": "v0.5"}
+{"i": [["[\"dac19035dd5fe9424ee8617421b9c817\", 1, 28, 28, 128, 4, 4, 128, 128, 1, 28, 28, 128, 1, 28, 28, 128]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0], [[], [["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 196, [2], 1], ["SP", 8, 4, 128, [4], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [1, 2, 1, 1], 1], ["SP", 6, 5, 4, [1, 4, 1, 1], 1], ["SP", 6, 10, 196, [1, 2, 7, 1], 1], ["SP", 6, 15, 128, [1, 2, 2, 2], 1], ["SP", 6, 20, 128, [2, 1], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 10, 3], ["FSP", 7, 4, 11, 3], ["FSP", 7, 8, 12, 3], ["FSP", 7, 12, 13, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 196, [28], 1], ["SP", 4, 4, 128, [64], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 14, [0, 1, 2, 3]], ["SP", 14, 0, 100352, [32], 1], ["AN", 14, 0, 5], ["AN", 14, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 25088, [32], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 16, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [13, 12, 11, 10], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 28, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [13, 12, 11, 10], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 25088, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$1024"], ["PR", 8, 0, "auto_unroll_max_step$512"], ["PR", 11, 0, "auto_unroll_max_step$16"]]]], "r": [[0.000150142], 0, 1.90539, 1606985042], "v": "v0.5"}
+{"i": [["[\"12b88bedece6984af589a28b43e0f3c4\", 1, 56, 56, 64, 3, 3, 64, 128, 1, 1, 1, 128, 1, 28, 28, 128]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1, 1], 1], ["SP", 3, 5, 28, [1, 1, 2, 1], 1], ["SP", 3, 10, 28, [1, 7, 2, 2], 1], ["SP", 3, 15, 128, [1, 8, 8, 1], 1], ["SP", 3, 20, 3, [3, 1], 1], ["SP", 3, 23, 3, [1, 1], 1], ["SP", 3, 26, 64, [4, 2], 1], ["RE", 3, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 23, 26, 21, 24, 27, 3, 8, 13, 18, 22, 25, 28, 4, 9, 14, 19]], ["FSP", 6, 0, 1, 3], ["FSP", 6, 4, 2, 3], ["FSP", 6, 8, 3, 3], ["FSP", 6, 12, 4, 3], ["RE", 6, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 3, 6, 11], ["CHR", 2, "shared", [3]], ["CA", 3, 4, 14], ["CHR", 1, "shared", [4]], ["CA", 2, 5, 14], ["CI", 1], ["FU", 8, [0, 1, 2, 3]], ["AN", 8, 0, 5], ["FU", 8, [1, 2, 3, 4]], ["AN", 8, 1, 4], ["FU", 8, [2, 3, 4, 5]], ["AN", 8, 2, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 576, [1], 1], ["AN", 4, 1, 2], ["FFSP", 4, 0, [4, 3, 2, 1], 1, 1], ["AN", 4, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 360, [1], 1], ["AN", 2, 1, 2], ["FFSP", 2, 0, [4, 3, 2, 1], 1, 1], ["AN", 2, 1, 6], ["PR", 5, 0, "auto_unroll_max_step$64"]]]], "r": [[0.000101548], 0, 1.92449, 1606985059], "v": "v0.5"}
+{"i": [["[\"1e3c4211ffd2f2db91078ae4d04b779d\", 1, 56, 56, 64, 6, 6, 64, 64, 1, 56, 56, 64, 1, 1, 1, 64, 1, 56, 56, 64]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0], [[], [["CI", 13], ["CI", 11], ["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 196, [2], 1], ["SP", 8, 4, 64, [1], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 6, [1, 1, 1, 1], 1], ["SP", 6, 5, 6, [1, 1, 1, 2], 1], ["SP", 6, 10, 196, [2, 14, 1, 1], 1], ["SP", 6, 15, 64, [2, 2, 4, 1], 1], ["SP", 6, 20, 64, [4, 2], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 12, 3], ["FSP", 7, 4, 13, 3], ["FSP", 7, 8, 14, 3], ["FSP", 7, 12, 15, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 196, [7], 1], ["SP", 4, 4, 64, [64], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 17, [0, 1, 2, 3]], ["SP", 17, 0, 200704, [32], 1], ["AN", 17, 0, 5], ["AN", 17, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 12544, [32], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 64, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [15, 14, 13, 12], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 16, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [15, 14, 13, 12], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 12544, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$0"], ["PR", 8, 0, "auto_unroll_max_step$0"], ["PR", 11, 0, "auto_unroll_max_step$512"]]]], "r": [[5.64548e-05], 0, 3.15692, 1606985088], "v": "v0.5"}
+{"i": [["[\"b818b53148cd450f86569dfc3e04cb8a\", 1, 56, 56, 64, 6, 6, 64, 64, 1, 1, 1, 64, 1, 56, 56, 64]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0], [[], [["CI", 11], ["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 196, [2], 1], ["SP", 8, 4, 64, [64], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 6, [1, 3, 2, 1], 1], ["SP", 6, 5, 6, [1, 3, 1, 2], 1], ["SP", 6, 10, 196, [1, 1, 4, 1], 1], ["SP", 6, 15, 64, [1, 8, 1, 4], 1], ["SP", 6, 20, 64, [1, 2], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 11, 3], ["FSP", 7, 4, 12, 3], ["FSP", 7, 8, 13, 3], ["FSP", 7, 12, 14, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 196, [1], 1], ["SP", 4, 4, 64, [4], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 15, [0, 1, 2, 3]], ["SP", 15, 0, 200704, [32], 1], ["AN", 15, 0, 5], ["AN", 15, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 12544, [32], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 128, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [14, 13, 12, 11], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 128, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [14, 13, 12, 11], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 12544, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$0"], ["PR", 8, 0, "auto_unroll_max_step$1024"], ["PR", 11, 0, "auto_unroll_max_step$64"]]]], "r": [[0.000135574], 0, 2.88002, 1606985120], "v": "v0.5"}
+{"i": [["[\"3ea73fb9b0364374730d09e068821f95\", 1, 56, 56, 64, 6, 6, 64, 64, 1, 56, 56, 64, 1, 56, 56, 64]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0], [[], [["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 196, [49], 1], ["SP", 8, 4, 64, [2], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 6, [1, 2, 1, 3], 1], ["SP", 6, 5, 6, [1, 3, 1, 2], 1], ["SP", 6, 10, 196, [1, 1, 1, 4], 1], ["SP", 6, 15, 64, [1, 8, 1, 1], 1], ["SP", 6, 20, 64, [4, 2], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 10, 3], ["FSP", 7, 4, 11, 3], ["FSP", 7, 8, 12, 3], ["FSP", 7, 12, 13, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 196, [14], 1], ["SP", 4, 4, 64, [16], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 14, [0, 1, 2, 3]], ["SP", 14, 0, 200704, [32], 1], ["AN", 14, 0, 5], ["AN", 14, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 12544, [32], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 48, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [13, 12, 11, 10], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 96, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [13, 12, 11, 10], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 12544, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$1024"], ["PR", 8, 0, "auto_unroll_max_step$16"], ["PR", 11, 0, "auto_unroll_max_step$512"]]]], "r": [[0.000115802], 0, 4.06441, 1606985158], "v": "v0.5"}
+{"i": [["[\"a5612fdeb9db4d579a75ec225ea4c06a\", 1, 112, 112, 64, 1, 1, 1, 64, 1, 56, 56, 64]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0], [[], [["CI", 4], ["CI", 1], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 200704, [64], 1], ["AN", 5, 0, 5], ["AN", 5, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 200704, [64], 1], ["AN", 2, 0, 5], ["AN", 2, 1, 6], ["PR", 2, 0, "auto_unroll_max_step$16"]]]], "r": [[2.00968e-05], 0, 1.53065, 1606985193], "v": "v0.5"}
+{"i": [["[\"12b88bedece6984af589a28b43e0f3c4\", 1, 224, 224, 3, 7, 7, 3, 64, 1, 1, 1, 64, 1, 112, 112, 64]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1, 1], 1], ["SP", 3, 5, 112, [1, 2, 7, 1], 1], ["SP", 3, 10, 112, [1, 7, 1, 1], 1], ["SP", 3, 15, 64, [1, 8, 4, 1], 1], ["SP", 3, 20, 7, [7, 1], 1], ["SP", 3, 23, 7, [1, 7], 1], ["SP", 3, 26, 3, [3, 1], 1], ["RE", 3, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 23, 26, 21, 24, 27, 3, 8, 13, 18, 22, 25, 28, 4, 9, 14, 19]], ["FSP", 6, 0, 1, 3], ["FSP", 6, 4, 2, 3], ["FSP", 6, 8, 3, 3], ["FSP", 6, 12, 4, 3], ["RE", 6, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 3, 6, 11], ["CHR", 2, "shared", [3]], ["CA", 3, 4, 14], ["CHR", 1, "shared", [4]], ["CA", 2, 5, 14], ["CI", 1], ["FU", 8, [0, 1, 2, 3]], ["AN", 8, 0, 5], ["FU", 8, [1, 2, 3, 4]], ["AN", 8, 1, 4], ["FU", 8, [2, 3, 4, 5]], ["AN", 8, 2, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 84, [1], 1], ["AN", 4, 1, 2], ["FFSP", 4, 0, [4, 3, 2, 1], 1, 1], ["AN", 4, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 273, [1], 1], ["AN", 2, 1, 2], ["FFSP", 2, 0, [4, 3, 2, 1], 1, 1], ["AN", 2, 1, 6], ["PR", 5, 0, "auto_unroll_max_step$512"]]]], "r": [[7.14326e-05], 0, 2.05623, 1606985220], "v": "v0.5"}
+{"i": [["[\"7006235cfc29b73be524cf390ed5a977\", 1, 56, 56, 64, 1, 1, 64, 64, 1, 56, 56, 64]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0], [[], [["CHW", 3, "local"], ["SP", 3, 0, 1, [1, 1, 1, 1], 1], ["SP", 3, 5, 56, [1, 2, 2, 2], 1], ["SP", 3, 10, 56, [1, 7, 1, 2], 1], ["SP", 3, 15, 64, [1, 16, 1, 4], 1], ["SP", 3, 20, 1, [1, 1], 1], ["SP", 3, 23, 1, [1, 1], 1], ["SP", 3, 26, 64, [2, 8], 1], ["RE", 3, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 23, 26, 21, 24, 27, 3, 8, 13, 18, 22, 25, 28, 4, 9, 14, 19]], ["FSP", 4, 0, 1, 3], ["FSP", 4, 4, 2, 3], ["FSP", 4, 8, 3, 3], ["FSP", 4, 12, 4, 3], ["RE", 4, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 3, 4, 11], ["CHR", 2, "shared", [3]], ["CA", 3, 4, 14], ["CHR", 1, "shared", [4]], ["CA", 2, 5, 14], ["CI", 1], ["FU", 6, [0, 1, 2, 3]], ["AN", 6, 0, 5], ["FU", 6, [1, 2, 3, 4]], ["AN", 6, 1, 4], ["FU", 6, [2, 3, 4, 5]], ["AN", 6, 2, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 64, [1], 1], ["AN", 4, 1, 2], ["FFSP", 4, 0, [4, 3, 2, 1], 1, 1], ["AN", 4, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 256, [1], 1], ["AN", 2, 1, 2], ["FFSP", 2, 0, [4, 3, 2, 1], 1, 1], ["AN", 2, 1, 6], ["PR", 5, 0, "auto_unroll_max_step$512"]]]], "r": [[1.17113e-05], 0, 1.9863, 1606985239], "v": "v0.5"}
+{"i": [["[\"f4380bb1dc62422a69ad4a1a9771f927\", 1, 56, 56, 64, 1, 1, 64, 128, 1, 28, 28, 128]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0], [[], [["CHW", 3, "local"], ["SP", 3, 0, 1, [1, 1, 1, 1], 1], ["SP", 3, 5, 28, [1, 7, 1, 1], 1], ["SP", 3, 10, 28, [1, 2, 1, 7], 1], ["SP", 3, 15, 128, [8, 8, 1, 1], 1], ["SP", 3, 20, 1, [1, 1], 1], ["SP", 3, 23, 1, [1, 1], 1], ["SP", 3, 26, 64, [2, 2], 1], ["RE", 3, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 23, 26, 21, 24, 27, 3, 8, 13, 18, 22, 25, 28, 4, 9, 14, 19]], ["FSP", 4, 0, 1, 3], ["FSP", 4, 4, 2, 3], ["FSP", 4, 8, 3, 3], ["FSP", 4, 12, 4, 3], ["RE", 4, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 3, 4, 11], ["CHR", 2, "shared", [3]], ["CA", 3, 4, 14], ["CHR", 1, "shared", [4]], ["CA", 2, 5, 14], ["CI", 1], ["FU", 6, [0, 1, 2, 3]], ["AN", 6, 0, 5], ["FU", 6, [1, 2, 3, 4]], ["AN", 6, 1, 4], ["FU", 6, [2, 3, 4, 5]], ["AN", 6, 2, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 16, [2], 1], ["AN", 4, 1, 2], ["FFSP", 4, 0, [4, 3, 2, 1], 1, 1], ["AN", 4, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 208, [2], 1], ["AN", 2, 1, 2], ["FFSP", 2, 0, [4, 3, 2, 1], 1, 1], ["AN", 2, 1, 6], ["PR", 5, 0, "auto_unroll_max_step$16"]]]], "r": [[1.76965e-05], 0, 1.63284, 1606985253], "v": "v0.5"}
+{"i": [["[\"f4380bb1dc62422a69ad4a1a9771f927\", 1, 28, 28, 128, 1, 1, 128, 256, 1, 14, 14, 256]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0], [[], [["CHW", 3, "local"], ["SP", 3, 0, 1, [1, 1, 1, 1], 1], ["SP", 3, 5, 14, [1, 1, 1, 1], 1], ["SP", 3, 10, 14, [2, 1, 7, 1], 1], ["SP", 3, 15, 256, [2, 64, 1, 2], 1], ["SP", 3, 20, 1, [1, 1], 1], ["SP", 3, 23, 1, [1, 1], 1], ["SP", 3, 26, 128, [1, 2], 1], ["RE", 3, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 23, 26, 21, 24, 27, 3, 8, 13, 18, 22, 25, 28, 4, 9, 14, 19]], ["FSP", 4, 0, 1, 3], ["FSP", 4, 4, 2, 3], ["FSP", 4, 8, 3, 3], ["FSP", 4, 12, 4, 3], ["RE", 4, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 3, 4, 11], ["CHR", 2, "shared", [3]], ["CA", 3, 4, 14], ["CHR", 1, "shared", [4]], ["CA", 2, 5, 14], ["CI", 1], ["FU", 6, [0, 1, 2, 3]], ["AN", 6, 0, 5], ["FU", 6, [1, 2, 3, 4]], ["AN", 6, 1, 4], ["FU", 6, [2, 3, 4, 5]], ["AN", 6, 2, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 8, [2], 1], ["AN", 4, 1, 2], ["FFSP", 4, 0, [4, 3, 2, 1], 1, 1], ["AN", 4, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 52, [1], 1], ["AN", 2, 1, 2], ["FFSP", 2, 0, [4, 3, 2, 1], 1, 1], ["AN", 2, 1, 6], ["PR", 5, 0, "auto_unroll_max_step$0"]]]], "r": [[3.05015e-05], 0, 1.59532, 1606985280], "v": "v0.5"}
+{"i": [["[\"f4380bb1dc62422a69ad4a1a9771f927\", 1, 14, 14, 256, 1, 1, 256, 512, 1, 7, 7, 512]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0], [[], [["CHW", 3, "local"], ["SP", 3, 0, 1, [1, 1, 1, 1], 1], ["SP", 3, 5, 7, [1, 1, 7, 1], 1], ["SP", 3, 10, 7, [1, 1, 1, 1], 1], ["SP", 3, 15, 512, [4, 128, 1, 1], 1], ["SP", 3, 20, 1, [1, 1], 1], ["SP", 3, 23, 1, [1, 1], 1], ["SP", 3, 26, 256, [1, 16], 1], ["RE", 3, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 23, 26, 21, 24, 27, 3, 8, 13, 18, 22, 25, 28, 4, 9, 14, 19]], ["FSP", 4, 0, 1, 3], ["FSP", 4, 4, 2, 3], ["FSP", 4, 8, 3, 3], ["FSP", 4, 12, 4, 3], ["RE", 4, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 3, 4, 11], ["CHR", 2, "shared", [3]], ["CA", 3, 4, 14], ["CHR", 1, "shared", [4]], ["CA", 2, 5, 14], ["CI", 1], ["FU", 6, [0, 1, 2, 3]], ["AN", 6, 0, 5], ["FU", 6, [1, 2, 3, 4]], ["AN", 6, 1, 4], ["FU", 6, [2, 3, 4, 5]], ["AN", 6, 2, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 16, [1], 1], ["AN", 4, 1, 2], ["FFSP", 4, 0, [4, 3, 2, 1], 1, 1], ["AN", 4, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 2704, [1], 1], ["AN", 2, 1, 2], ["FFSP", 2, 0, [4, 3, 2, 1], 1, 1], ["AN", 2, 1, 6], ["PR", 5, 0, "auto_unroll_max_step$512"]]]], "r": [[2.18808e-05], 0, 1.88033, 1606985298], "v": "v0.5"}
+{"i": [["[\"12b88bedece6984af589a28b43e0f3c4\", 1, 14, 14, 256, 3, 3, 256, 512, 1, 1, 1, 512, 1, 7, 7, 512]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1, 1], 1], ["SP", 3, 5, 7, [7, 1, 1, 1], 1], ["SP", 3, 10, 7, [1, 7, 1, 1], 1], ["SP", 3, 15, 512, [1, 4, 1, 1], 1], ["SP", 3, 20, 3, [1, 3], 1], ["SP", 3, 23, 3, [1, 3], 1], ["SP", 3, 26, 256, [8, 2], 1], ["RE", 3, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 23, 26, 21, 24, 27, 3, 8, 13, 18, 22, 25, 28, 4, 9, 14, 19]], ["FSP", 6, 0, 1, 3], ["FSP", 6, 4, 2, 3], ["FSP", 6, 8, 3, 3], ["FSP", 6, 12, 4, 3], ["RE", 6, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 3, 6, 11], ["CHR", 2, "shared", [3]], ["CA", 3, 4, 14], ["CHR", 1, "shared", [4]], ["CA", 2, 5, 14], ["CI", 1], ["FU", 8, [0, 1, 2, 3]], ["AN", 8, 0, 5], ["FU", 8, [1, 2, 3, 4]], ["AN", 8, 1, 4], ["FU", 8, [2, 3, 4, 5]], ["AN", 8, 2, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 144, [1], 1], ["AN", 4, 1, 2], ["FFSP", 4, 0, [4, 3, 2, 1], 1, 1], ["AN", 4, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 144, [1], 1], ["AN", 2, 1, 2], ["FFSP", 2, 0, [4, 3, 2, 1], 1, 1], ["AN", 2, 1, 6], ["PR", 5, 0, "auto_unroll_max_step$512"]]]], "r": [[0.000190239], 0, 2.28266, 1606985323], "v": "v0.5"}
diff --git a/tutorials/auto_scheduler/ci_logs/resnet-50-NHWC-B1-llvm.json b/tutorials/auto_scheduler/ci_logs/resnet-50-NHWC-B1-llvm.json
index 611f7765f584..3dd4541fd33a 100644
--- a/tutorials/auto_scheduler/ci_logs/resnet-50-NHWC-B1-llvm.json
+++ b/tutorials/auto_scheduler/ci_logs/resnet-50-NHWC-B1-llvm.json
@@ -1,31 +1,28 @@
 # Provide valid schedules for resnet-50 for CPU.
 # This is used to run the tutorial on the documentation web server.
-{"i": [["[\"b32ed43fb351136894c322ee49097a1a\"]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [18, 64, 64, 0, 0, 0, 0, 0]], [[], [["SP", 3, 1, 1000, [50], 1], ["RF", 3, 2, 1], ["RE", 3, [0, 2, 1]], ["SP", 1, 1, 1000, [20], 1], ["RF", 1, 2, 1], ["RE", 1, [0, 2, 1]], ["CR", 6], ["CA", 5, 6, 1], ["CR", 4], ["CA", 2, 3, 1], ["AN", 1, 0, 3], ["FU", 3, [0, 1]], ["AN", 3, 0, 3], ["AN", 4, 0, 3], ["FU", 6, [0, 1]], ["AN", 6, 0, 3], ["PR", 1, 0, "auto_unroll_max_step$16"], ["PR", 2, 0, "auto_unroll_max_step$16"], ["PR", 4, 0, "auto_unroll_max_step$16"], ["PR", 5, 0, "auto_unroll_max_step$64"]]]], "r": [[8.75e-06, 1.0781e-05, 9.875e-06, 9.836e-06, 1.0357e-05, 1.0238e-05, 1.0341e-05, 9.75e-06, 9.561e-06, 1.0122e-05], 0, 0.17921, 1606960872], "v": "v0.3"}
-{"i": [["[\"6129df1a3d5f6326c8393a8d17160199\"]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [18, 64, 64, 0, 0, 0, 0, 0]], [[], [["SP", 2, 0, 1, [1, 1, 1], 1], ["SP", 2, 4, 1000, [1, 1, 1], 1], ["SP", 2, 8, 16, [2, 2, 4], 1], ["SP", 2, 12, 128, [32], 1], ["RE", 2, [0, 4, 8, 1, 5, 9, 12, 2, 6, 10, 13, 3, 7, 11]], ["CR", 5], ["CA", 3, 5, 1], ["FU", 2, [0, 1]], ["AN", 2, 0, 3], ["FU", 5, [0, 1]], ["AN", 5, 0, 3], ["PR", 2, 0, "auto_unroll_max_step$16"], ["PR", 3, 0, "auto_unroll_max_step$512"], ["AN", 2, 12, 2]]]], "r": [[8.7769e-05, 8.6467e-05, 8.6989e-05, 9.3901e-05, 8.6221e-05, 8.4351e-05, 8.4747e-05, 8.8687e-05, 8.8928e-05, 8.3574e-05], 0, 0.33759, 1606960890], "v": "v0.3"}
-{"i": [["[\"36ee2798ed60bae3bcd1bb89a0285fe8\"]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [18, 64, 64, 0, 0, 0, 0, 0]], [[], [["CA", 1, 2, 3], ["FU", 2, [0, 1, 2, 3]], ["AN", 2, 0, 3], ["PR", 1, 0, "auto_unroll_max_step$16"]]]], "r": [[6.28e-06, 8.176e-06, 8.048e-06, 7.942e-06, 7.977e-06, 8.002e-06, 8.093e-06, 7.924e-06, 7.943e-06, 7.924e-06], 0, 0.130759, 1606960900], "v": "v0.3"}
-{"i": [["[\"dcf6fcf5f56fa614bf9aef0c82382caf\"]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [18, 64, 64, 0, 0, 0, 0, 0]], [[], [["CI", 9], ["CI", 7], ["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 7, [1, 7, 1], 1], ["SP", 3, 8, 7, [1, 1, 1], 1], ["SP", 3, 12, 2048, [8, 2, 8], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 512, [2], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 10, 0, 3, 2], ["FSP", 10, 3, 4, 2], ["FSP", 10, 6, 5, 2], ["FSP", 10, 9, 6, 2], ["RE", 10, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 10, 7], ["CI", 1], ["FU", 10, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 10, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$512"], ["AN", 3, 21, 2], ["AN", 10, 4, 2]]]], "r": [[0.000175984, 0.000171372, 0.00018538, 0.000178085, 0.00017879, 0.000179878, 0.000179221, 0.000178598, 0.000176714, 0.000168318], 0, 0.277929, 1606960917], "v": "v0.3"}
-{"i": [["[\"7657f886f5e9d8b5f19a5fd2c5b90d8d\"]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [18, 64, 64, 0, 0, 0, 0, 0]], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 7, [1, 1, 7], 1], ["SP", 3, 8, 7, [1, 1, 1], 1], ["SP", 3, 12, 512, [32, 1, 8], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 1024, [8], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["CI", 1], ["FU", 3, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 3, 0, 3], ["FU", 6, [0, 1, 2, 3]], ["AN", 6, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$512"], ["AN", 3, 14, 2]]]], "r": [[0.00012651, 0.00012801, 0.000128605, 0.00013267, 0.00012596, 0.000126418, 0.000121995, 0.000127242, 0.000128152, 0.000129989], 0, 0.310011, 1606960986], "v": "v0.3"}
-{"i": [["[\"7e09b626cf077cd419190fee02091dd6\"]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [18, 64, 64, 0, 0, 0, 0, 0]], [[], [["CI", 7], ["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 14, [7, 1, 2], 1], ["SP", 3, 8, 14, [2, 1, 1], 1], ["SP", 3, 12, 1024, [1, 1, 32], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 256, [8], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["CI", 1], ["FU", 3, [0, 1, 2, 3, 4, 5, 6]], ["AN", 3, 0, 3], ["FU", 8, [0, 1, 2, 3]], ["AN", 8, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$64"]]]], "r": [[0.000183629, 0.000188334, 0.000195553, 0.000187308, 0.000196409, 0.000190496, 0.000190344, 0.000188567, 0.000186319, 0.000187136], 0, 0.384722, 1606961002], "v": "v0.3"}
-{"i": [["[\"1dce2c5e4269b8a12dfc50cd4dd23ff1\"]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [18, 64, 64, 0, 0, 0, 0, 0]], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 14, [2, 1, 7], 1], ["SP", 3, 8, 14, [2, 1, 1], 1], ["SP", 3, 12, 256, [16, 4, 4], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 512, [64], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["CI", 1], ["CR", 6], ["FU", 3, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 3, 0, 3], ["FU", 6, [0, 1, 2, 3]], ["AN", 6, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$64"], ["AN", 3, 14, 2]]]], "r": [[0.000118033, 0.000116806, 0.000134047, 0.000116701, 0.000116219, 0.000116834, 0.000117132, 0.000117029, 0.000116393, 0.000116778], 0, 0.31025, 1606961069], "v": "v0.3"}
-{"i": [["[\"d3b36ce001dc24d693facfbdae1979b4\"]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [18, 64, 64, 0, 0, 0, 0, 0]], [[], [["CI", 7], ["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 28, [1, 1, 1], 1], ["SP", 3, 8, 28, [7, 1, 1], 1], ["SP", 3, 12, 512, [1, 2, 16], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 128, [4], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 8, 0, 2, 2], ["FSP", 8, 3, 3, 2], ["FSP", 8, 6, 4, 2], ["FSP", 8, 9, 5, 2], ["RE", 8, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 8, 7], ["CI", 1], ["FU", 8, [0, 1, 2, 3]], ["AN", 8, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$0"], ["AN", 3, 21, 2]]]], "r": [[0.00019554, 0.000203491, 0.000199599, 0.000194289, 0.000197556, 0.000199504, 0.000198527, 0.000200656, 0.000200037, 0.000201954], 0, 0.240599, 1606961080], "v": "v0.3"}
-{"i": [["[\"a085717fb3dcb046e5c4c2c04d3dc541\"]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [18, 64, 64, 0, 0, 0, 0, 0]], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 28, [14, 1, 2], 1], ["SP", 3, 8, 28, [2, 1, 1], 1], ["SP", 3, 12, 128, [1, 1, 16], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 256, [16], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 6, 0, 1, 1], ["FSP", 6, 2, 2, 1], ["FSP", 6, 4, 3, 1], ["FSP", 6, 6, 4, 1], ["RE", 6, [0, 2, 4, 6, 1, 3, 5, 7]], ["CA", 3, 6, 3], ["CI", 1], ["FU", 6, [0, 1, 2, 3]], ["AN", 6, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$64"], ["AN", 3, 21, 2], ["AN", 6, 4, 2]]]], "r": [[0.000128461, 0.000158344, 0.000154659, 0.000148478, 0.000162668, 0.000155789, 0.000149412, 0.000141607, 0.000148815, 0.000165989], 0, 0.299928, 1606961156], "v": "v0.3"}
-{"i": [["[\"8dd7d81db440763f622f03fdc99e6d46\"]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [18, 64, 64, 0, 0, 0, 0, 0]], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 56, [14, 2, 2], 1], ["SP", 3, 8, 56, [2, 1, 2], 1], ["SP", 3, 12, 64, [1, 16, 4], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 64, [2], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 6, 0, 1, 1], ["FSP", 6, 2, 2, 1], ["FSP", 6, 4, 3, 1], ["FSP", 6, 6, 4, 1], ["RE", 6, [0, 2, 4, 6, 1, 3, 5, 7]], ["CA", 3, 6, 3], ["CI", 1], ["FU", 6, [0, 1, 2, 3]], ["AN", 6, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$64"], ["AN", 3, 21, 2]]]], "r": [[7.8291e-05, 7.4365e-05, 6.7147e-05, 6.7413e-05, 8.1894e-05, 7.1771e-05, 7.2916e-05, 6.6615e-05, 7.3038e-05, 7.4967e-05], 0, 1.09095, 1606961258], "v": "v0.3"}
-{"i": [["[\"ba2026d923536b75e9b4faed89287d5f\"]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [18, 64, 64, 0, 0, 0, 0, 0]], [[], [["CI", 4], ["CA", 2, 5, 3], ["CR", 1], ["FU", 1, [0, 1, 2]], ["AN", 1, 0, 3], ["FU", 5, [0, 1, 2]], ["AN", 5, 0, 3], ["PR", 2, 0, "auto_unroll_max_step$64"]]]], "r": [[2.9217e-05, 3.1065e-05, 3.188e-05, 3.0897e-05, 3.1295e-05, 3.1307e-05, 3.19e-05, 3.1038e-05, 3.1919e-05, 3.2077e-05], 0, 0.217184, 1606961266], "v": "v0.3"}
-{"i": [["[\"0fb1dfcdb5b755e2dab290ed0129dcf2\"]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [18, 64, 64, 0, 0, 0, 0, 0]], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 28, [1, 1, 2], 1], ["SP", 3, 8, 28, [1, 1, 2], 1], ["SP", 3, 12, 128, [2, 2, 16], 1], ["SP", 3, 16, 3, [3], 1], ["SP", 3, 18, 3, [3], 1], ["SP", 3, 20, 128, [8], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 6, 0, 1, 2], ["FSP", 6, 3, 2, 2], ["FSP", 6, 6, 3, 2], ["FSP", 6, 9, 4, 2], ["RE", 6, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 6, 7], ["CA", 1, 3, 8], ["FU", 6, [0, 1, 2, 3, 4, 5, 6]], ["AN", 6, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$0"], ["AN", 3, 21, 2]]]], "r": [[0.000224019, 0.000238271, 0.000237129, 0.000233981, 0.000223557, 0.000238411, 0.000238778, 0.000236382, 0.000236069, 0.000239037], 0, 0.285437, 1606961576], "v": "v0.3"}
-{"i": [["[\"e043f834cc7f19597227e09dc7f59503\"]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [18, 64, 64, 0, 0, 0, 0, 0]], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 14, [1, 1, 2], 1], ["SP", 3, 8, 14, [7, 2, 1], 1], ["SP", 3, 12, 256, [1, 1, 16], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 1024, [8], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 6, 0, 1, 2], ["FSP", 6, 3, 2, 2], ["FSP", 6, 6, 3, 2], ["FSP", 6, 9, 4, 2], ["RE", 6, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 6, 7], ["CI", 1], ["FU", 6, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 6, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$64"], ["AN", 3, 21, 2]]]], "r": [[0.000153068, 0.000161094, 0.000164674, 0.000160245, 0.000159626, 0.000146788, 0.000140718, 0.000159237, 0.000162109, 0.000139686], 0, 0.273946, 1606961647], "v": "v0.3"}
-{"i": [["[\"a0eb8d6048282a4a0986cc2ccf14eaa2\"]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [18, 64, 64, 0, 0, 0, 0, 0]], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 112, [1, 1, 4], 1], ["SP", 3, 8, 112, [4, 2, 1], 1], ["SP", 3, 12, 64, [1, 1, 16], 1], ["SP", 3, 16, 7, [7], 1], ["SP", 3, 18, 7, [7], 1], ["SP", 3, 20, 3, [3], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 6, 0, 1, 2], ["FSP", 6, 3, 2, 2], ["FSP", 6, 6, 3, 2], ["FSP", 6, 9, 4, 2], ["RE", 6, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 6, 7], ["CA", 1, 6, 3], ["FU", 6, [0, 1, 2]], ["AN", 6, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$64"], ["AN", 1, 3, 2], ["AN", 3, 21, 2], ["AN", 6, 9, 2]]]], "r": [[0.000247808, 0.000233393, 0.000251767, 0.000252226, 0.000254169, 0.000254176, 0.00025333, 0.00025511, 0.000253678, 0.000251738], 0, 0.315503, 1606961659], "v": "v0.3"}
-{"i": [["[\"03614e726dc588d11887eb0953a77e53\"]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [18, 64, 64, 0, 0, 0, 0, 0]], [[], [["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 7, [1, 1, 1], 1], ["SP", 3, 8, 7, [1, 1, 7], 1], ["SP", 3, 12, 2048, [256, 1, 8], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 512, [4], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 5, 0, 0, 2], ["FSP", 5, 3, 1, 2], ["FSP", 5, 6, 2, 2], ["FSP", 5, 9, 3, 2], ["RE", 5, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 5, 7], ["CI", 1], ["FU", 5, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 5, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$64"], ["AN", 3, 21, 2], ["AN", 5, 4, 2]]]], "r": [[0.000169437, 0.000169021, 0.00016965, 0.00017079, 0.000170862, 0.0001692, 0.000164768, 0.000175541, 0.000171528, 0.000169094], 0, 0.25194, 1606961681], "v": "v0.3"}
-{"i": [["[\"b51e06c1131d4cded40d1b215f722a4e\"]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [18, 64, 64, 0, 0, 0, 0, 0]], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 56, [4, 1, 1], 1], ["SP", 3, 8, 56, [7, 4, 1], 1], ["SP", 3, 12, 64, [4, 1, 16], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 256, [8], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 6, 0, 1, 2], ["FSP", 6, 3, 2, 2], ["FSP", 6, 6, 3, 2], ["FSP", 6, 9, 4, 2], ["RE", 6, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 6, 7], ["CI", 1], ["FU", 6, [0, 1, 2, 3, 4, 5, 6]], ["AN", 6, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$512"], ["AN", 3, 21, 2], ["AN", 6, 5, 2]]]], "r": [[0.00015141, 0.000158121, 0.000132758, 0.00015109, 0.000148266, 0.000152599, 0.000150809, 0.000151947, 0.000150702, 0.000156091], 0, 0.221869, 1606961698], "v": "v0.3"}
-{"i": [["[\"a9e632e5167afb60fbe29e7aeef1d152\"]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [18, 64, 64, 0, 0, 0, 0, 0]], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 56, [1, 1, 1], 1], ["SP", 3, 8, 56, [7, 1, 4], 1], ["SP", 3, 12, 64, [1, 1, 16], 1], ["SP", 3, 16, 3, [1], 1], ["SP", 3, 18, 3, [3], 1], ["SP", 3, 20, 64, [1], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 6, 0, 1, 2], ["FSP", 6, 3, 2, 2], ["FSP", 6, 6, 3, 2], ["FSP", 6, 9, 4, 2], ["RE", 6, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 6, 7], ["FU", 1, [0, 1, 2]], ["AN", 1, 0, 3], ["FU", 6, [0, 1, 2, 3, 4, 5, 6]], ["AN", 6, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$16"], ["AN", 3, 21, 2], ["AN", 6, 5, 2]]]], "r": [[0.000221341, 0.000225005, 0.000209954, 0.000209741, 0.000228281, 0.000208451, 0.000223046, 0.000222672, 0.000228098, 0.000220093], 0, 0.231218, 1606961709], "v": "v0.3"}
-{"i": [["[\"e0a9eb3795b531085e0ebb772e7e800c\"]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [18, 64, 64, 0, 0, 0, 0, 0]], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 7, [7, 1, 1], 1], ["SP", 3, 8, 7, [1, 7, 1], 1], ["SP", 3, 12, 512, [2, 2, 8], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 2048, [4], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 6, 0, 1, 2], ["FSP", 6, 3, 2, 2], ["FSP", 6, 6, 3, 2], ["FSP", 6, 9, 4, 2], ["RE", 6, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 6, 7], ["CI", 1], ["FU", 6, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 6, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$0"], ["AN", 3, 21, 2], ["AN", 6, 4, 2]]]], "r": [[0.000165941, 0.000152645, 0.000165687, 0.000166639, 0.000166094, 0.00016649, 0.000164394, 0.000169288, 0.000169497, 0.000168535], 0, 0.245559, 1606961724], "v": "v0.3"}
-{"i": [["[\"8fcee68a4342c38248a827f1c6c69177\"]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [18, 64, 64, 0, 0, 0, 0, 0]], [[], [["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 56, [4, 2, 1], 1], ["SP", 3, 8, 56, [1, 1, 1], 1], ["SP", 3, 12, 256, [2, 4, 8], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 64, [2], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 5, 0, 0, 2], ["FSP", 5, 3, 1, 2], ["FSP", 5, 6, 2, 2], ["FSP", 5, 9, 3, 2], ["RE", 5, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 5, 7], ["CI", 1], ["FU", 5, [0, 1, 2, 3, 4, 5]], ["AN", 5, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$16"], ["AN", 3, 21, 2], ["AN", 5, 6, 2]]]], "r": [[0.000161206, 0.000161372, 0.000158862, 0.000159596, 0.00014964, 0.000162042, 0.000159626, 0.000158166, 0.000161209, 0.000159408], 0, 0.337652, 1606961748], "v": "v0.3"}
-{"i": [["[\"4d7e646d99bfa3cea8245bd7100369cb\"]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [18, 64, 64, 0, 0, 0, 0, 0]], [[], [["CHW", 3, "local"], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 14, [7, 2, 1], 1], ["SP", 3, 8, 14, [14, 1, 1], 1], ["SP", 3, 12, 1024, [2, 2, 16], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 512, [4], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 4, 0, 1, 2], ["FSP", 4, 3, 2, 2], ["FSP", 4, 6, 3, 2], ["FSP", 4, 9, 4, 2], ["RE", 4, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 4, 7], ["CI", 1], ["FU", 4, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 4, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$16"], ["AN", 3, 21, 2]]]], "r": [[0.000238006, 0.000235502, 0.000239805, 0.000234637, 0.000235266, 0.000238355, 0.000240836, 0.000232856, 0.000231219, 0.000238776], 0, 0.219506, 1606961782], "v": "v0.3"}
-{"i": [["[\"b2010aa63c95dedf1f58f3fe8bc78634\"]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [18, 64, 64, 0, 0, 0, 0, 0]], [[], [["CHW", 3, "local"], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 28, [2, 1, 2], 1], ["SP", 3, 8, 28, [1, 2, 1], 1], ["SP", 3, 12, 512, [16, 1, 16], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 256, [8], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 4, 0, 1, 2], ["FSP", 4, 3, 2, 2], ["FSP", 4, 6, 3, 2], ["FSP", 4, 9, 4, 2], ["RE", 4, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 4, 7], ["CI", 1], ["FU", 4, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 4, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$16"], ["AN", 3, 21, 2], ["AN", 4, 4, 2]]]], "r": [[0.000213071, 0.000218117, 0.000216346, 0.000216237, 0.000214703, 0.00021605, 0.000210522, 0.000214234, 0.000218293, 0.00021484], 0, 0.291873, 1606961801], "v": "v0.3"}
-{"i": [["[\"537c8642716948c33a6eaaabc86b159d\"]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [18, 64, 64, 0, 0, 0, 0, 0]], [[], [["CHW", 3, "local"], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 7, [7, 1, 1], 1], ["SP", 3, 8, 7, [1, 7, 1], 1], ["SP", 3, 12, 2048, [128, 1, 16], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 1024, [4], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 4, 0, 1, 2], ["FSP", 4, 3, 2, 2], ["FSP", 4, 6, 3, 2], ["FSP", 4, 9, 4, 2], ["RE", 4, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 4, 7], ["CI", 1], ["FU", 4, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 4, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$16"], ["AN", 3, 21, 2], ["AN", 4, 4, 2]]]], "r": [[0.000265306, 0.000259738, 0.000256412, 0.000284932, 0.000267557, 0.000266362, 0.00026533, 0.000263389, 0.000263022, 0.000263069], 0, 0.296232, 1606961838], "v": "v0.3"}
-{"i": [["[\"7e3f0cf5a6dd80d36dab1a3dad92674a\"]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [18, 64, 64, 0, 0, 0, 0, 0]], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 7, [1, 7, 1], 1], ["SP", 3, 8, 7, [7, 1, 1], 1], ["SP", 3, 12, 512, [4, 1, 8], 1], ["SP", 3, 16, 3, [3], 1], ["SP", 3, 18, 3, [1], 1], ["SP", 3, 20, 512, [1], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 6, 0, 1, 2], ["FSP", 6, 3, 2, 2], ["FSP", 6, 6, 3, 2], ["FSP", 6, 9, 4, 2], ["RE", 6, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 6, 7], ["CR", 1], ["FU", 1, [0, 1, 2, 3]], ["AN", 1, 0, 3], ["FU", 6, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 6, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$16"], ["AN", 3, 21, 2], ["AN", 6, 4, 2]]]], "r": [[0.000269786, 0.0002657, 0.000261922, 0.000267462, 0.000270495, 0.000265371, 0.000273858, 0.000268022, 0.000266746, 0.000272337], 0, 0.331923, 1606961848], "v": "v0.3"}
-{"i": [["[\"cd7c4a374fb2bbc0d075c8cae638ad14\"]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [18, 64, 64, 0, 0, 0, 0, 0]], [[], [["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 14, [7, 1, 2], 1], ["SP", 3, 8, 14, [7, 2, 1], 1], ["SP", 3, 12, 1024, [16, 1, 16], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 256, [1], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 5, 0, 0, 2], ["FSP", 5, 3, 1, 2], ["FSP", 5, 6, 2, 2], ["FSP", 5, 9, 3, 2], ["RE", 5, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 5, 7], ["CI", 1], ["FU", 5, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 5, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$16"], ["AN", 3, 21, 2], ["AN", 5, 4, 2]]]], "r": [[0.000159777, 0.00015711, 0.000163052, 0.000152569, 0.00015342, 0.000154918, 0.000153887, 0.000154133, 0.000154319, 0.000150102], 0, 0.195628, 1606961878], "v": "v0.3"}
-{"i": [["[\"45b4de07687dee43ee1cbde9f516b2bf\"]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [18, 64, 64, 0, 0, 0, 0, 0]], [[], [["CHW", 3, "local"], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 56, [56, 1, 1], 1], ["SP", 3, 8, 56, [14, 1, 2], 1], ["SP", 3, 12, 256, [1, 2, 32], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 64, [64], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 4, 0, 1, 2], ["FSP", 4, 3, 2, 2], ["FSP", 4, 6, 3, 2], ["FSP", 4, 9, 4, 2], ["RE", 4, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 4, 7], ["CI", 1], ["FU", 4, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 4, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$64"], ["AN", 3, 21, 2], ["AN", 4, 4, 2]]]], "r": [[0.000159044, 0.000157356, 0.000158889, 0.000160304, 0.000158648, 0.000159749, 0.000143679, 0.000156393, 0.000164916, 0.000155957], 0, 0.240777, 1606961918], "v": "v0.3"}
-{"i": [["[\"95bf49cc8cf7a351e974b2359702aac0\"]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [18, 64, 64, 0, 0, 0, 0, 0]], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 14, [1, 2, 1], 1], ["SP", 3, 8, 14, [1, 7, 1], 1], ["SP", 3, 12, 256, [2, 1, 8], 1], ["SP", 3, 16, 3, [1], 1], ["SP", 3, 18, 3, [3], 1], ["SP", 3, 20, 256, [1], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 6, 0, 1, 2], ["FSP", 6, 3, 2, 2], ["FSP", 6, 6, 3, 2], ["FSP", 6, 9, 4, 2], ["RE", 6, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 6, 7], ["FU", 1, [0, 1, 2, 3]], ["AN", 1, 0, 3], ["FU", 6, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 6, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$512"], ["AN", 3, 21, 2], ["AN", 6, 4, 2]]]], "r": [[0.000230538, 0.000229192, 0.000235935, 0.000233141, 0.000233405, 0.000233217, 0.000225995, 0.000231786, 0.000229054, 0.00022851], 0, 0.256995, 1606961941], "v": "v0.3"}
-{"i": [["[\"5e3ceb6e23ae8c351d5a1770d5fc6c7c\"]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [18, 64, 64, 0, 0, 0, 0, 0]], [[], [["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 28, [1, 1, 2], 1], ["SP", 3, 8, 28, [1, 1, 1], 1], ["SP", 3, 12, 512, [4, 1, 32], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 128, [4], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 5, 0, 0, 2], ["FSP", 5, 3, 1, 2], ["FSP", 5, 6, 2, 2], ["FSP", 5, 9, 3, 2], ["RE", 5, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 5, 7], ["CI", 1], ["FU", 5, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 5, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$64"], ["AN", 3, 21, 2], ["AN", 5, 4, 2]]]], "r": [[0.000168259, 0.000157338, 0.0001551, 0.000156552, 0.000160492, 0.000164505, 0.000144937, 0.000138397, 0.000153011, 0.000153186], 0, 0.231498, 1606961965], "v": "v0.3"}
-{"i": [["[\"691feef049c8693bbe91bd5e7c9cdf34\"]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [18, 64, 64, 0, 0, 0, 0, 0]], [[], [["CI", 7], ["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 56, [7, 1, 4], 1], ["SP", 3, 8, 56, [4, 2, 1], 1], ["SP", 3, 12, 256, [32, 1, 8], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 64, [2], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 8, 0, 2, 2], ["FSP", 8, 3, 3, 2], ["FSP", 8, 6, 4, 2], ["FSP", 8, 9, 5, 2], ["RE", 8, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 8, 7], ["CI", 1], ["FU", 8, [0, 1, 2, 3, 4, 5, 6]], ["AN", 8, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$64"], ["AN", 3, 21, 2], ["AN", 8, 5, 2]]]], "r": [[0.000185957, 0.000180964, 0.000179419, 0.000168205, 0.000176155, 0.000178243, 0.000180175, 0.00017753, 0.000174475, 0.000158878], 0, 0.316404, 1606961979], "v": "v0.3"}
-{"i": [["[\"45acfc473c772458684f36a34549d8aa\"]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [18, 64, 64, 0, 0, 0, 0, 0]], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 28, [7, 1, 4], 1], ["SP", 3, 8, 28, [14, 1, 1], 1], ["SP", 3, 12, 128, [1, 1, 16], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 512, [1], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 6, 0, 1, 2], ["FSP", 6, 3, 2, 2], ["FSP", 6, 6, 3, 2], ["FSP", 6, 9, 4, 2], ["RE", 6, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 6, 7], ["CI", 1], ["FU", 6, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 6, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$512"], ["AN", 3, 21, 2], ["AN", 6, 4, 2]]]], "r": [[0.000150378, 0.000154444, 0.000156051, 0.000130306, 0.000156154, 0.000131167, 0.000142357, 0.000152532, 0.000131899, 0.000157696], 0, 0.18509, 1606962002], "v": "v0.3"}
+{"i": [["[\"d7b65649a4dd54becea0a52aabbc5af5\", 1, 1000, 1, 1000]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["SP", 3, 1, 1000, [50], 1], ["RF", 3, 2, 1], ["RE", 3, [0, 2, 1]], ["SP", 1, 1, 1000, [20], 1], ["RF", 1, 2, 1], ["RE", 1, [0, 2, 1]], ["CR", 6], ["CA", 5, 6, 1], ["CR", 4], ["CA", 2, 3, 1], ["AN", 1, 0, 3], ["FU", 3, [0, 1]], ["AN", 3, 0, 3], ["AN", 4, 0, 3], ["FU", 6, [0, 1]], ["AN", 6, 0, 3], ["PR", 1, 0, "auto_unroll_max_step$16"], ["PR", 2, 0, "auto_unroll_max_step$16"], ["PR", 4, 0, "auto_unroll_max_step$16"], ["PR", 5, 0, "auto_unroll_max_step$64"]]]], "r": [[8.75e-06, 1.0781e-05, 9.875e-06, 9.836e-06, 1.0357e-05, 1.0238e-05, 1.0341e-05, 9.75e-06, 9.561e-06, 1.0122e-05], 0, 0.17921, 1606960872], "v": "v0.5"}
+{"i": [["[\"69115f188984ae34ede37c3b8ca40b43\", 1, 7, 7, 2048, 1, 1, 1, 2048]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CA", 1, 2, 3], ["FU", 2, [0, 1, 2, 3]], ["AN", 2, 0, 3], ["PR", 1, 0, "auto_unroll_max_step$16"]]]], "r": [[6.28e-06, 8.176e-06, 8.048e-06, 7.942e-06, 7.977e-06, 8.002e-06, 8.093e-06, 7.924e-06, 7.943e-06, 7.924e-06], 0, 0.130759, 1606960900], "v": "v0.5"}
+{"i": [["[\"875556d12d0be2269206a7775d5296a6\", 1, 7, 7, 512, 1, 1, 512, 2048, 1, 7, 7, 2048, 1, 1, 1, 2048, 1, 1, 1, 2048, 1, 7, 7, 2048]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CI", 9], ["CI", 7], ["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 7, [1, 7, 1], 1], ["SP", 3, 8, 7, [1, 1, 1], 1], ["SP", 3, 12, 2048, [8, 2, 8], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 512, [2], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 10, 0, 3, 2], ["FSP", 10, 3, 4, 2], ["FSP", 10, 6, 5, 2], ["FSP", 10, 9, 6, 2], ["RE", 10, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 10, 7], ["CI", 1], ["FU", 10, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 10, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$512"], ["AN", 3, 21, 2], ["AN", 10, 4, 2]]]], "r": [[0.000175984, 0.000171372, 0.00018538, 0.000178085, 0.00017879, 0.000179878, 0.000179221, 0.000178598, 0.000176714, 0.000168318], 0, 0.277929, 1606960917], "v": "v0.5"}
+{"i": [["[\"de7d1695278cf52778b038e6573d7626\", 1, 14, 14, 1024, 1, 1, 1024, 512, 1, 1, 1, 512, 1, 7, 7, 512]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 7, [1, 1, 7], 1], ["SP", 3, 8, 7, [1, 1, 1], 1], ["SP", 3, 12, 512, [32, 1, 8], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 1024, [8], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["CI", 1], ["FU", 3, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 3, 0, 3], ["FU", 6, [0, 1, 2, 3]], ["AN", 6, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$512"], ["AN", 3, 14, 2]]]], "r": [[0.00012651, 0.00012801, 0.000128605, 0.00013267, 0.00012596, 0.000126418, 0.000121995, 0.000127242, 0.000128152, 0.000129989], 0, 0.310011, 1606960986], "v": "v0.5"}
+{"i": [["[\"1b524af89dd867d26059e1f621cf987c\", 1, 14, 14, 256, 1, 1, 256, 1024, 1, 14, 14, 1024, 1, 1, 1, 1024, 1, 14, 14, 1024]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CI", 7], ["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 14, [7, 1, 2], 1], ["SP", 3, 8, 14, [2, 1, 1], 1], ["SP", 3, 12, 1024, [1, 1, 32], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 256, [8], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["CI", 1], ["FU", 3, [0, 1, 2, 3, 4, 5, 6]], ["AN", 3, 0, 3], ["FU", 8, [0, 1, 2, 3]], ["AN", 8, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$64"]]]], "r": [[0.000183629, 0.000188334, 0.000195553, 0.000187308, 0.000196409, 0.000190496, 0.000190344, 0.000188567, 0.000186319, 0.000187136], 0, 0.384722, 1606961002], "v": "v0.5"}
+{"i": [["[\"de7d1695278cf52778b038e6573d7626\", 1, 28, 28, 512, 1, 1, 512, 256, 1, 1, 1, 256, 1, 14, 14, 256]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 14, [2, 1, 7], 1], ["SP", 3, 8, 14, [2, 1, 1], 1], ["SP", 3, 12, 256, [16, 4, 4], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 512, [64], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["CI", 1], ["CR", 6], ["FU", 3, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 3, 0, 3], ["FU", 6, [0, 1, 2, 3]], ["AN", 6, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$64"], ["AN", 3, 14, 2]]]], "r": [[0.000118033, 0.000116806, 0.000134047, 0.000116701, 0.000116219, 0.000116834, 0.000117132, 0.000117029, 0.000116393, 0.000116778], 0, 0.31025, 1606961069], "v": "v0.5"}
+{"i": [["[\"1b524af89dd867d26059e1f621cf987c\", 1, 28, 28, 128, 1, 1, 128, 512, 1, 28, 28, 512, 1, 1, 1, 512, 1, 28, 28, 512]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CI", 7], ["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 28, [1, 1, 1], 1], ["SP", 3, 8, 28, [7, 1, 1], 1], ["SP", 3, 12, 512, [1, 2, 16], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 128, [4], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 8, 0, 2, 2], ["FSP", 8, 3, 3, 2], ["FSP", 8, 6, 4, 2], ["FSP", 8, 9, 5, 2], ["RE", 8, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 8, 7], ["CI", 1], ["FU", 8, [0, 1, 2, 3]], ["AN", 8, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$0"], ["AN", 3, 21, 2]]]], "r": [[0.00019554, 0.000203491, 0.000199599, 0.000194289, 0.000197556, 0.000199504, 0.000198527, 0.000200656, 0.000200037, 0.000201954], 0, 0.240599, 1606961080], "v": "v0.5"}
+{"i": [["[\"de7d1695278cf52778b038e6573d7626\", 1, 56, 56, 256, 1, 1, 256, 128, 1, 1, 1, 128, 1, 28, 28, 128]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 28, [14, 1, 2], 1], ["SP", 3, 8, 28, [2, 1, 1], 1], ["SP", 3, 12, 128, [1, 1, 16], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 256, [16], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 6, 0, 1, 1], ["FSP", 6, 2, 2, 1], ["FSP", 6, 4, 3, 1], ["FSP", 6, 6, 4, 1], ["RE", 6, [0, 2, 4, 6, 1, 3, 5, 7]], ["CA", 3, 6, 3], ["CI", 1], ["FU", 6, [0, 1, 2, 3]], ["AN", 6, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$64"], ["AN", 3, 21, 2], ["AN", 6, 4, 2]]]], "r": [[0.000128461, 0.000158344, 0.000154659, 0.000148478, 0.000162668, 0.000155789, 0.000149412, 0.000141607, 0.000148815, 0.000165989], 0, 0.299928, 1606961156], "v": "v0.5"}
+{"i": [["[\"6b7583cf23c7c37d3212cad9d06e58c1\", 1, 56, 56, 64, 1, 1, 64, 64, 1, 1, 1, 64, 1, 56, 56, 64]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 56, [14, 2, 2], 1], ["SP", 3, 8, 56, [2, 1, 2], 1], ["SP", 3, 12, 64, [1, 16, 4], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 64, [2], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 6, 0, 1, 1], ["FSP", 6, 2, 2, 1], ["FSP", 6, 4, 3, 1], ["FSP", 6, 6, 4, 1], ["RE", 6, [0, 2, 4, 6, 1, 3, 5, 7]], ["CA", 3, 6, 3], ["CI", 1], ["FU", 6, [0, 1, 2, 3]], ["AN", 6, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$64"], ["AN", 3, 21, 2]]]], "r": [[7.8291e-05, 7.4365e-05, 6.7147e-05, 6.7413e-05, 8.1894e-05, 7.1771e-05, 7.2916e-05, 6.6615e-05, 7.3038e-05, 7.4967e-05], 0, 1.09095, 1606961258], "v": "v0.5"}
+{"i": [["[\"a5612fdeb9db4d579a75ec225ea4c06a\", 1, 112, 112, 64, 1, 1, 1, 64, 1, 56, 56, 64]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CI", 4], ["CA", 2, 5, 3], ["CR", 1], ["FU", 1, [0, 1, 2]], ["AN", 1, 0, 3], ["FU", 5, [0, 1, 2]], ["AN", 5, 0, 3], ["PR", 2, 0, "auto_unroll_max_step$64"]]]], "r": [[2.9217e-05, 3.1065e-05, 3.188e-05, 3.0897e-05, 3.1295e-05, 3.1307e-05, 3.19e-05, 3.1038e-05, 3.1919e-05, 3.2077e-05], 0, 0.217184, 1606961266], "v": "v0.5"}
+{"i": [["[\"6b7583cf23c7c37d3212cad9d06e58c1\", 1, 14, 14, 1024, 1, 1, 1024, 256, 1, 1, 1, 256, 1, 14, 14, 256]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 14, [1, 1, 2], 1], ["SP", 3, 8, 14, [7, 2, 1], 1], ["SP", 3, 12, 256, [1, 1, 16], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 1024, [8], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 6, 0, 1, 2], ["FSP", 6, 3, 2, 2], ["FSP", 6, 6, 3, 2], ["FSP", 6, 9, 4, 2], ["RE", 6, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 6, 7], ["CI", 1], ["FU", 6, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 6, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$64"], ["AN", 3, 21, 2]]]], "r": [[0.000153068, 0.000161094, 0.000164674, 0.000160245, 0.000159626, 0.000146788, 0.000140718, 0.000159237, 0.000162109, 0.000139686], 0, 0.273946, 1606961647], "v": "v0.5"}
+{"i": [["[\"12b88bedece6984af589a28b43e0f3c4\", 1, 224, 224, 3, 7, 7, 3, 64, 1, 1, 1, 64, 1, 112, 112, 64]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 112, [1, 1, 4], 1], ["SP", 3, 8, 112, [4, 2, 1], 1], ["SP", 3, 12, 64, [1, 1, 16], 1], ["SP", 3, 16, 7, [7], 1], ["SP", 3, 18, 7, [7], 1], ["SP", 3, 20, 3, [3], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 6, 0, 1, 2], ["FSP", 6, 3, 2, 2], ["FSP", 6, 6, 3, 2], ["FSP", 6, 9, 4, 2], ["RE", 6, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 6, 7], ["CA", 1, 6, 3], ["FU", 6, [0, 1, 2]], ["AN", 6, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$64"], ["AN", 1, 3, 2], ["AN", 3, 21, 2], ["AN", 6, 9, 2]]]], "r": [[0.000247808, 0.000233393, 0.000251767, 0.000252226, 0.000254169, 0.000254176, 0.00025333, 0.00025511, 0.000253678, 0.000251738], 0, 0.315503, 1606961659], "v": "v0.5"}
+{"i": [["[\"1cc666833c122282e3fcf3595901b12b\", 1, 7, 7, 512, 1, 1, 512, 2048, 1, 7, 7, 2048, 1, 7, 7, 2048]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 7, [1, 1, 1], 1], ["SP", 3, 8, 7, [1, 1, 7], 1], ["SP", 3, 12, 2048, [256, 1, 8], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 512, [4], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 5, 0, 0, 2], ["FSP", 5, 3, 1, 2], ["FSP", 5, 6, 2, 2], ["FSP", 5, 9, 3, 2], ["RE", 5, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 5, 7], ["CI", 1], ["FU", 5, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 5, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$64"], ["AN", 3, 21, 2], ["AN", 5, 4, 2]]]], "r": [[0.000169437, 0.000169021, 0.00016965, 0.00017079, 0.000170862, 0.0001692, 0.000164768, 0.000175541, 0.000171528, 0.000169094], 0, 0.25194, 1606961681], "v": "v0.5"}
+{"i": [["[\"6b7583cf23c7c37d3212cad9d06e58c1\", 1, 56, 56, 256, 1, 1, 256, 64, 1, 1, 1, 64, 1, 56, 56, 64]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 56, [4, 1, 1], 1], ["SP", 3, 8, 56, [7, 4, 1], 1], ["SP", 3, 12, 64, [4, 1, 16], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 256, [8], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 6, 0, 1, 2], ["FSP", 6, 3, 2, 2], ["FSP", 6, 6, 3, 2], ["FSP", 6, 9, 4, 2], ["RE", 6, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 6, 7], ["CI", 1], ["FU", 6, [0, 1, 2, 3, 4, 5, 6]], ["AN", 6, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$512"], ["AN", 3, 21, 2], ["AN", 6, 5, 2]]]], "r": [[0.00015141, 0.000158121, 0.000132758, 0.00015109, 0.000148266, 0.000152599, 0.000150809, 0.000151947, 0.000150702, 0.000156091], 0, 0.221869, 1606961698], "v": "v0.5"}
+{"i": [["[\"2350d19dc42a0665244368384c66b3a5\", 1, 56, 56, 64, 3, 3, 64, 64, 1, 1, 1, 64, 1, 56, 56, 64]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 56, [1, 1, 1], 1], ["SP", 3, 8, 56, [7, 1, 4], 1], ["SP", 3, 12, 64, [1, 1, 16], 1], ["SP", 3, 16, 3, [1], 1], ["SP", 3, 18, 3, [3], 1], ["SP", 3, 20, 64, [1], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 6, 0, 1, 2], ["FSP", 6, 3, 2, 2], ["FSP", 6, 6, 3, 2], ["FSP", 6, 9, 4, 2], ["RE", 6, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 6, 7], ["FU", 1, [0, 1, 2]], ["AN", 1, 0, 3], ["FU", 6, [0, 1, 2, 3, 4, 5, 6]], ["AN", 6, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$16"], ["AN", 3, 21, 2], ["AN", 6, 5, 2]]]], "r": [[0.000221341, 0.000225005, 0.000209954, 0.000209741, 0.000228281, 0.000208451, 0.000223046, 0.000222672, 0.000228098, 0.000220093], 0, 0.231218, 1606961709], "v": "v0.5"}
+{"i": [["[\"6b7583cf23c7c37d3212cad9d06e58c1\", 1, 7, 7, 2048, 1, 1, 2048, 512, 1, 1, 1, 512, 1, 7, 7, 512]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 7, [7, 1, 1], 1], ["SP", 3, 8, 7, [1, 7, 1], 1], ["SP", 3, 12, 512, [2, 2, 8], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 2048, [4], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 6, 0, 1, 2], ["FSP", 6, 3, 2, 2], ["FSP", 6, 6, 3, 2], ["FSP", 6, 9, 4, 2], ["RE", 6, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 6, 7], ["CI", 1], ["FU", 6, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 6, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$0"], ["AN", 3, 21, 2], ["AN", 6, 4, 2]]]], "r": [[0.000165941, 0.000152645, 0.000165687, 0.000166639, 0.000166094, 0.00016649, 0.000164394, 0.000169288, 0.000169497, 0.000168535], 0, 0.245559, 1606961724], "v": "v0.5"}
+{"i": [["[\"1cc666833c122282e3fcf3595901b12b\", 1, 56, 56, 64, 1, 1, 64, 256, 1, 56, 56, 256, 1, 56, 56, 256]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 56, [4, 2, 1], 1], ["SP", 3, 8, 56, [1, 1, 1], 1], ["SP", 3, 12, 256, [2, 4, 8], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 64, [2], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 5, 0, 0, 2], ["FSP", 5, 3, 1, 2], ["FSP", 5, 6, 2, 2], ["FSP", 5, 9, 3, 2], ["RE", 5, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 5, 7], ["CI", 1], ["FU", 5, [0, 1, 2, 3, 4, 5]], ["AN", 5, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$16"], ["AN", 3, 21, 2], ["AN", 5, 6, 2]]]], "r": [[0.000161206, 0.000161372, 0.000158862, 0.000159596, 0.00014964, 0.000162042, 0.000159626, 0.000158166, 0.000161209, 0.000159408], 0, 0.337652, 1606961748], "v": "v0.5"}
+{"i": [["[\"f4380bb1dc62422a69ad4a1a9771f927\", 1, 28, 28, 512, 1, 1, 512, 1024, 1, 14, 14, 1024]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CHW", 3, "local"], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 14, [7, 2, 1], 1], ["SP", 3, 8, 14, [14, 1, 1], 1], ["SP", 3, 12, 1024, [2, 2, 16], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 512, [4], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 4, 0, 1, 2], ["FSP", 4, 3, 2, 2], ["FSP", 4, 6, 3, 2], ["FSP", 4, 9, 4, 2], ["RE", 4, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 4, 7], ["CI", 1], ["FU", 4, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 4, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$16"], ["AN", 3, 21, 2]]]], "r": [[0.000238006, 0.000235502, 0.000239805, 0.000234637, 0.000235266, 0.000238355, 0.000240836, 0.000232856, 0.000231219, 0.000238776], 0, 0.219506, 1606961782], "v": "v0.5"}
+{"i": [["[\"f4380bb1dc62422a69ad4a1a9771f927\", 1, 56, 56, 256, 1, 1, 256, 512, 1, 28, 28, 512]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CHW", 3, "local"], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 28, [2, 1, 2], 1], ["SP", 3, 8, 28, [1, 2, 1], 1], ["SP", 3, 12, 512, [16, 1, 16], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 256, [8], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 4, 0, 1, 2], ["FSP", 4, 3, 2, 2], ["FSP", 4, 6, 3, 2], ["FSP", 4, 9, 4, 2], ["RE", 4, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 4, 7], ["CI", 1], ["FU", 4, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 4, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$16"], ["AN", 3, 21, 2], ["AN", 4, 4, 2]]]], "r": [[0.000213071, 0.000218117, 0.000216346, 0.000216237, 0.000214703, 0.00021605, 0.000210522, 0.000214234, 0.000218293, 0.00021484], 0, 0.291873, 1606961801], "v": "v0.5"}
+{"i": [["[\"f4380bb1dc62422a69ad4a1a9771f927\", 1, 14, 14, 1024, 1, 1, 1024, 2048, 1, 7, 7, 2048]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CHW", 3, "local"], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 7, [7, 1, 1], 1], ["SP", 3, 8, 7, [1, 7, 1], 1], ["SP", 3, 12, 2048, [128, 1, 16], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 1024, [4], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 4, 0, 1, 2], ["FSP", 4, 3, 2, 2], ["FSP", 4, 6, 3, 2], ["FSP", 4, 9, 4, 2], ["RE", 4, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 4, 7], ["CI", 1], ["FU", 4, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 4, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$16"], ["AN", 3, 21, 2], ["AN", 4, 4, 2]]]], "r": [[0.000265306, 0.000259738, 0.000256412, 0.000284932, 0.000267557, 0.000266362, 0.00026533, 0.000263389, 0.000263022, 0.000263069], 0, 0.296232, 1606961838], "v": "v0.5"}
+{"i": [["[\"2350d19dc42a0665244368384c66b3a5\", 1, 7, 7, 512, 3, 3, 512, 512, 1, 1, 1, 512, 1, 7, 7, 512]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 7, [1, 7, 1], 1], ["SP", 3, 8, 7, [7, 1, 1], 1], ["SP", 3, 12, 512, [4, 1, 8], 1], ["SP", 3, 16, 3, [3], 1], ["SP", 3, 18, 3, [1], 1], ["SP", 3, 20, 512, [1], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 6, 0, 1, 2], ["FSP", 6, 3, 2, 2], ["FSP", 6, 6, 3, 2], ["FSP", 6, 9, 4, 2], ["RE", 6, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 6, 7], ["CR", 1], ["FU", 1, [0, 1, 2, 3]], ["AN", 1, 0, 3], ["FU", 6, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 6, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$16"], ["AN", 3, 21, 2], ["AN", 6, 4, 2]]]], "r": [[0.000269786, 0.0002657, 0.000261922, 0.000267462, 0.000270495, 0.000265371, 0.000273858, 0.000268022, 0.000266746, 0.000272337], 0, 0.331923, 1606961848], "v": "v0.5"}
+{"i": [["[\"1cc666833c122282e3fcf3595901b12b\", 1, 14, 14, 256, 1, 1, 256, 1024, 1, 14, 14, 1024, 1, 14, 14, 1024]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 14, [7, 1, 2], 1], ["SP", 3, 8, 14, [7, 2, 1], 1], ["SP", 3, 12, 1024, [16, 1, 16], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 256, [1], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 5, 0, 0, 2], ["FSP", 5, 3, 1, 2], ["FSP", 5, 6, 2, 2], ["FSP", 5, 9, 3, 2], ["RE", 5, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 5, 7], ["CI", 1], ["FU", 5, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 5, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$16"], ["AN", 3, 21, 2], ["AN", 5, 4, 2]]]], "r": [[0.000159777, 0.00015711, 0.000163052, 0.000152569, 0.00015342, 0.000154918, 0.000153887, 0.000154133, 0.000154319, 0.000150102], 0, 0.195628, 1606961878], "v": "v0.5"}
+{"i": [["[\"7006235cfc29b73be524cf390ed5a977\", 1, 56, 56, 64, 1, 1, 64, 256, 1, 56, 56, 256]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CHW", 3, "local"], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 56, [56, 1, 1], 1], ["SP", 3, 8, 56, [14, 1, 2], 1], ["SP", 3, 12, 256, [1, 2, 32], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 64, [64], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 4, 0, 1, 2], ["FSP", 4, 3, 2, 2], ["FSP", 4, 6, 3, 2], ["FSP", 4, 9, 4, 2], ["RE", 4, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 4, 7], ["CI", 1], ["FU", 4, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 4, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$64"], ["AN", 3, 21, 2], ["AN", 4, 4, 2]]]], "r": [[0.000159044, 0.000157356, 0.000158889, 0.000160304, 0.000158648, 0.000159749, 0.000143679, 0.000156393, 0.000164916, 0.000155957], 0, 0.240777, 1606961918], "v": "v0.5"}
+{"i": [["[\"1cc666833c122282e3fcf3595901b12b\", 1, 28, 28, 128, 1, 1, 128, 512, 1, 28, 28, 512, 1, 28, 28, 512]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 28, [1, 1, 2], 1], ["SP", 3, 8, 28, [1, 1, 1], 1], ["SP", 3, 12, 512, [4, 1, 32], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 128, [4], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 5, 0, 0, 2], ["FSP", 5, 3, 1, 2], ["FSP", 5, 6, 2, 2], ["FSP", 5, 9, 3, 2], ["RE", 5, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 5, 7], ["CI", 1], ["FU", 5, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 5, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$64"], ["AN", 3, 21, 2], ["AN", 5, 4, 2]]]], "r": [[0.000168259, 0.000157338, 0.0001551, 0.000156552, 0.000160492, 0.000164505, 0.000144937, 0.000138397, 0.000153011, 0.000153186], 0, 0.231498, 1606961965], "v": "v0.5"}
+{"i": [["[\"1b524af89dd867d26059e1f621cf987c\", 1, 56, 56, 64, 1, 1, 64, 256, 1, 56, 56, 256, 1, 1, 1, 256, 1, 56, 56, 256]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CI", 7], ["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 56, [7, 1, 4], 1], ["SP", 3, 8, 56, [4, 2, 1], 1], ["SP", 3, 12, 256, [32, 1, 8], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 64, [2], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 8, 0, 2, 2], ["FSP", 8, 3, 3, 2], ["FSP", 8, 6, 4, 2], ["FSP", 8, 9, 5, 2], ["RE", 8, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 8, 7], ["CI", 1], ["FU", 8, [0, 1, 2, 3, 4, 5, 6]], ["AN", 8, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$64"], ["AN", 3, 21, 2], ["AN", 8, 5, 2]]]], "r": [[0.000185957, 0.000180964, 0.000179419, 0.000168205, 0.000176155, 0.000178243, 0.000180175, 0.00017753, 0.000174475, 0.000158878], 0, 0.316404, 1606961979], "v": "v0.5"}
+{"i": [["[\"6b7583cf23c7c37d3212cad9d06e58c1\", 1, 28, 28, 512, 1, 1, 512, 128, 1, 1, 1, 128, 1, 28, 28, 128]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 28, [7, 1, 4], 1], ["SP", 3, 8, 28, [14, 1, 1], 1], ["SP", 3, 12, 128, [1, 1, 16], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 512, [1], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 6, 0, 1, 2], ["FSP", 6, 3, 2, 2], ["FSP", 6, 6, 3, 2], ["FSP", 6, 9, 4, 2], ["RE", 6, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 6, 7], ["CI", 1], ["FU", 6, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 6, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$512"], ["AN", 3, 21, 2], ["AN", 6, 4, 2]]]], "r": [[0.000150378, 0.000154444, 0.000156051, 0.000130306, 0.000156154, 0.000131167, 0.000142357, 0.000152532, 0.000131899, 0.000157696], 0, 0.18509, 1606962002], "v": "v0.5"}

From f3b852df94398c76d9e91490c9c031be7b139584 Mon Sep 17 00:00:00 2001
From: Tristan Konolige <tristan.konolige@gmail.com>
Date: Mon, 25 Jan 2021 09:27:43 -0800
Subject: [PATCH 109/357] [FIX] Infer input shape in sparse_dense_padded's
 alter_op if one does not exist (#7308)

* [FIX] Infer input shape in sparse_dense_padded's alter_op if one does not exist

If there are multiple alter_ops in a model, the first alteration does
not run type inference for the subsequent ones. In this case, we don't
have the shape information, so we run the inferencer manually.

* add todo
---
 python/tvm/topi/cuda/sparse.py          | 9 ++++++++-
 src/relay/transforms/alter_op_layout.cc | 1 +
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/python/tvm/topi/cuda/sparse.py b/python/tvm/topi/cuda/sparse.py
index 0b46cf0f9f97..f68b31ec30ef 100644
--- a/python/tvm/topi/cuda/sparse.py
+++ b/python/tvm/topi/cuda/sparse.py
@@ -292,7 +292,14 @@ def is_valid_for_sparse_dense_padded(data, weight_data):
     """
     # pylint:disable=invalid-name
     warp_size = int(tvm.target.Target.current(allow_none=False).thread_warp_size)
-    m = get_const_tuple(data.checked_type.shape)[1]
+    # If there are multiple alter_ops in a model, the first alteration does not
+    # run type inference for the subsequent ones. In this case, we don't have
+    # the shape information, so we run the inferencer manually.
+    try:
+        m = get_const_tuple(data.checked_type.shape)[1]
+    except ValueError:
+        data_infered = relay.transform.InferType()(tvm.IRModule.from_expr(data))["main"]
+        m = get_const_tuple(data_infered.ret_type.shape)[1]
     if len(weight_data.shape) == 1:
         bs_m = 1
     else:
diff --git a/src/relay/transforms/alter_op_layout.cc b/src/relay/transforms/alter_op_layout.cc
index 924e61ad0d16..d7ffff68c1f5 100644
--- a/src/relay/transforms/alter_op_layout.cc
+++ b/src/relay/transforms/alter_op_layout.cc
@@ -110,6 +110,7 @@ class AlterTransformMemorizer : public TransformMemorizer {
  * 2. Do not support nested tuple arguments.
  */
 Expr AlterOpLayout(const Expr& expr) {
+  // TODO(@icemelon9): need to rerun type inference after applying an alter op.
   AlterTransformMemorizer alterMemorizer(make_object<AlterTransformMemorizerNode>());
   auto fcontext = [&](const Call& call) -> ObjectRef { return alterMemorizer; };
 

From da446af3b73a9b7354b89bb5792968cd705359d8 Mon Sep 17 00:00:00 2001
From: ggardet <guillaume.gardet@opensuse.org>
Date: Mon, 25 Jan 2021 18:34:05 +0100
Subject: [PATCH 110/357] Fix warning showed with GCC10 (#7336)

catching polymorphic type 'struct dmlc::Error' by value
---
 tests/cpp/ir_functor_test.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/cpp/ir_functor_test.cc b/tests/cpp/ir_functor_test.cc
index 9be83987ba57..d242b20f1ba7 100644
--- a/tests/cpp/ir_functor_test.cc
+++ b/tests/cpp/ir_functor_test.cc
@@ -72,7 +72,7 @@ TEST(IRF, ExprTransform) {
   try {
     f(z - 1, 2);
     LOG(FATAL) << "should fail";
-  } catch (dmlc::Error) {
+  } catch (dmlc::Error&) {
   }
 }
 

From 6f75cffb64f20e72a2fad425ce58d0fd32c0d4c8 Mon Sep 17 00:00:00 2001
From: Altan Haan <ahaan@octoml.ai>
Date: Mon, 25 Jan 2021 13:10:48 -0800
Subject: [PATCH 111/357] [Relay][Training] Add more gradients (#7323)

* add more gradients

* add documentation
---
 python/tvm/relay/op/_tensor_grad.py       | 54 ++++++++++++++++++++---
 tests/python/relay/test_op_grad_level1.py |  8 ++++
 tests/python/relay/test_op_grad_level3.py |  7 +++
 tests/python/relay/test_op_grad_level4.py | 37 +++++++++++++++-
 4 files changed, 99 insertions(+), 7 deletions(-)

diff --git a/python/tvm/relay/op/_tensor_grad.py b/python/tvm/relay/op/_tensor_grad.py
index 9c84411352f2..c9a20a3b2989 100644
--- a/python/tvm/relay/op/_tensor_grad.py
+++ b/python/tvm/relay/op/_tensor_grad.py
@@ -357,16 +357,24 @@ def global_avg_pool2d_grad(orig, grad):
     return [pool_grad]
 
 
-# not implemented, this is only for testing.
 @register_gradient("concatenate")
 def concatenate_grad(orig, grad):
+    """
+    Returns the gradient of concatenate, which is just the downstream gradient
+    split across the inputs.
+    """
     assert len(orig.args) == 1
     t = orig.args[0]
-    x = TupleGetItem(t, 0)
-    y = TupleGetItem(t, 1)
-    # Assume only two element in tuple rn.
-    # In the real implementation, concatenate_grad probably need to be implemented by an operator.
-    return [Tuple([zeros_like(x), zeros_like(y)])]
+
+    # calculate split indices. TODO(@altanh): support Any?
+    axis_dims = [ty.shape[orig.attrs.axis] for ty in t.checked_type.fields]
+    splits, cumsum = [], 0
+    for dim in axis_dims[:-1]:
+        cumsum += dim
+        splits.append(cumsum)
+
+    grads = split(grad, tuple(splits), axis=orig.attrs.axis).tuple_value
+    return [grads]
 
 
 @register_gradient("nn.conv2d")
@@ -808,5 +816,39 @@ def arange_grad(orig, grad):
 
 @register_gradient("gather_nd")
 def gather_nd_grad(orig, grad):
+    """
+    Returns the gradient of gather_nd, which is simply scatter_nd.
+    """
     data, indices = orig.args
     return [scatter_nd(grad, indices, data.checked_type.concrete_shape), zeros_like(indices)]
+
+
+@register_gradient("reshape_like")
+def reshape_like_grad(orig, grad):
+    """
+    Returns the gradient of reshape_like.
+    """
+    data, shape_like = orig.args
+    return [reshape_like(grad, data), zeros_like(shape_like)]
+
+
+@register_gradient("where")
+def where_grad(orig, grad):
+    """
+    Returns the gradient of where.
+    """
+    cond, x, y = orig.args
+    g_zeros = zeros_like(grad)
+
+    grad_x = collapse_sum_like(where(cond, grad, g_zeros), x)
+    grad_y = collapse_sum_like(where(cond, g_zeros, grad), y)
+
+    return [zeros_like(cond), grad_x, grad_y]
+
+
+@register_gradient("less_equal")
+def less_equal_grad(orig, grad):
+    """
+    Returns the gradient of less_equal.
+    """
+    return [zeros_like(orig.args[0]), zeros_like(orig.args[1])]
diff --git a/tests/python/relay/test_op_grad_level1.py b/tests/python/relay/test_op_grad_level1.py
index cac07c437a42..a79be8684b20 100644
--- a/tests/python/relay/test_op_grad_level1.py
+++ b/tests/python/relay/test_op_grad_level1.py
@@ -150,5 +150,13 @@ def test_expand_dims_grad():
     check_grad(fwd_func)
 
 
+def test_concatenate_grad():
+    x = relay.var("x", shape=(2, 2, 5))
+    y = relay.var("y", shape=(2, 1, 5))
+    z = relay.var("z", shape=(2, 4, 5))
+    fwd_func = relay.Function([x, y, z], relay.concatenate([x, y, z], axis=1))
+    check_grad(fwd_func)
+
+
 if __name__ == "__main__":
     pytest.main([__file__])
diff --git a/tests/python/relay/test_op_grad_level3.py b/tests/python/relay/test_op_grad_level3.py
index 98ff62ed75d4..0c89aa7d2e9a 100644
--- a/tests/python/relay/test_op_grad_level3.py
+++ b/tests/python/relay/test_op_grad_level3.py
@@ -126,5 +126,12 @@ def test_gather_nd_grad():
     check_grad(fwd, inputs=[data_np, indices_np], test_inputs=[data_np])
 
 
+def test_reshape_like_grad():
+    data = relay.var("data", shape=(2, 3, 4), dtype="float32")
+    shape_like = relay.var("shape_like", shape=(6, 2, 2), dtype="float32")
+    fwd_func = relay.Function([data, shape_like], relay.reshape_like(data, shape_like))
+    check_grad(fwd_func)
+
+
 if __name__ == "__main__":
     pytest.main()
diff --git a/tests/python/relay/test_op_grad_level4.py b/tests/python/relay/test_op_grad_level4.py
index d4792219816a..0f73e89c94ad 100644
--- a/tests/python/relay/test_op_grad_level4.py
+++ b/tests/python/relay/test_op_grad_level4.py
@@ -15,8 +15,9 @@
 # specific language governing permissions and limitations
 # under the License.
 import pytest
+import numpy as np
 from tvm import relay
-from tvm.relay.testing import check_grad
+from tvm.relay.testing import check_grad, _np_randn_from_type
 
 
 def verify_reduction_grad(red_fn, d_shape, axis=None, keepdims=False, exclude=False):
@@ -51,5 +52,39 @@ def test_max_grad():
     verify_max_grad((5, 4, 3), axis=(0, 2), exclude=True)
 
 
+def test_where_grad():
+    cond_type = relay.TensorType((2, 3, 4), "int32")
+    lhs_type = relay.TensorType((1, 3, 4), "float32")
+    rhs_type = relay.TensorType((2, 1, 4), "float32")
+    inputs = [
+        np.random.randint(2, size=cond_type.concrete_shape, dtype=cond_type.dtype),
+        _np_randn_from_type(lhs_type, scale=1e-5),
+        _np_randn_from_type(rhs_type, scale=1e-5),
+    ]
+
+    cond = relay.var("cond", type_annotation=cond_type)
+    lhs = relay.var("lhs", type_annotation=lhs_type)
+    rhs = relay.var("rhs", type_annotation=rhs_type)
+    fwd_func = relay.Function([cond, lhs, rhs], relay.where(cond, lhs, rhs))
+    check_grad(fwd_func, inputs=inputs, test_inputs=inputs[1:])
+
+
+def test_less_equal_grad():
+    x_type = relay.TensorType((2, 3, 4), "float32")
+    y_type = relay.TensorType((3, 1), "float32")
+    # We need to generate inputs far apart to get correct numerical gradients
+    # (otherwise adding epsilon may change comparison result). The gradient
+    # should always be zero for both inputs.
+    inputs = [
+        np.random.choice([-1, 1], size=x_type.concrete_shape).astype(x_type.dtype),
+        np.random.choice([-2, 2], size=y_type.concrete_shape).astype(y_type.dtype),
+    ]
+
+    x = relay.var("x", type_annotation=x_type)
+    y = relay.var("y", type_annotation=y_type)
+    fwd_func = relay.Function([x, y], relay.less_equal(x, y))
+    check_grad(fwd_func, inputs=inputs, test_inputs=inputs, eps=1e-6)
+
+
 if __name__ == "__main__":
     pytest.main()

From 3d13809864540bd03cba7bd02fa4edafd264cab7 Mon Sep 17 00:00:00 2001
From: Altan Haan <ahaan@octoml.ai>
Date: Mon, 25 Jan 2021 21:11:21 -0800
Subject: [PATCH 112/357] fix tanh gradient and update tests to use downstream
 gradient (#7340)

---
 python/tvm/relay/op/_tensor_grad.py       |  2 +-
 tests/python/relay/test_op_grad_level1.py | 52 ++++++++++++-----------
 2 files changed, 28 insertions(+), 26 deletions(-)

diff --git a/python/tvm/relay/op/_tensor_grad.py b/python/tvm/relay/op/_tensor_grad.py
index c9a20a3b2989..90120d64c2ac 100644
--- a/python/tvm/relay/op/_tensor_grad.py
+++ b/python/tvm/relay/op/_tensor_grad.py
@@ -198,7 +198,7 @@ def sigmoid_grad(orig, grad):
 @register_gradient("tanh")
 def tanh_grad(orig, grad):
     """Returns grad * (1 - tanh(x) * tanh(x))."""
-    return [grad * ones_like(orig) - orig * orig]
+    return [grad * (ones_like(orig) - orig * orig)]
 
 
 @register_gradient("nn.relu")
diff --git a/tests/python/relay/test_op_grad_level1.py b/tests/python/relay/test_op_grad_level1.py
index a79be8684b20..0ac604c6bca1 100644
--- a/tests/python/relay/test_op_grad_level1.py
+++ b/tests/python/relay/test_op_grad_level1.py
@@ -42,42 +42,44 @@ def check_single_op(opfunc, ref, dtype):
         shape = (10, 4)
         tp = relay.TensorType(shape, dtype)
         x = relay.var("x", tp)
-        y = opfunc(x)
+        g = relay.var("g", tp)
+        y = opfunc(x) * g
 
         if ref is not None:
             data = np.random.rand(*shape).astype(dtype)
-            ref_grad = ref(data)
-            fwd_func = relay.Function([x], y)
+            grad_in = np.random.rand(*shape).astype(dtype)
+            ref_grad = ref(data, grad_in)
+            fwd_func = relay.Function([x, g], y)
             fwd_func = run_infer_type(fwd_func)
             bwd_func = run_infer_type(gradient(fwd_func))
 
             for target, ctx in tvm.testing.enabled_targets():
                 intrp = relay.create_executor(ctx=ctx, target=target)
-                op_res, (op_grad,) = intrp.evaluate(bwd_func)(data)
+                op_res, (op_grad, _) = intrp.evaluate(bwd_func)(data, grad_in)
                 np.testing.assert_allclose(op_grad.asnumpy(), ref_grad, rtol=0.01)
 
     for opfunc, ref in [
-        (tvm.relay.log, lambda x: 1 / x),
-        (tvm.relay.exp, np.exp),
-        (tvm.relay.sigmoid, lambda x: sigmoid(x) * (1 - sigmoid(x))),
-        (tvm.relay.tanh, lambda x: 1 - np.tanh(x) * np.tanh(x)),
-        (tvm.relay.sqrt, lambda x: 0.5 * np.power(x, -0.5)),
-        (tvm.relay.abs, lambda x: np.where(x < 0, -np.ones_like(x), np.ones_like(x))),
-        (relay.nn.relu, lambda x: np.where(x < 0, np.zeros_like(x), np.ones_like(x))),
-        (tvm.relay.erf, lambda x: 2.0 / (np.pi ** (0.5)) * np.exp(-x * x)),
-        (tvm.relay.cos, lambda x: -1.0 * np.sin(x)),
-        (tvm.relay.sin, lambda x: np.cos(x)),
-        (tvm.relay.tan, lambda x: 1.0 / (np.cos(x) ** 2)),
-        (tvm.relay.atan, lambda x: 1 / (1 + np.power(x, 2.0))),
-        (tvm.relay.log2, lambda x: 1 / (np.log(2) * x)),
-        (tvm.relay.log10, lambda x: 1 / (np.log(10) * x)),
-        (tvm.relay.cosh, lambda x: np.sinh(x)),
-        (tvm.relay.sinh, lambda x: np.cosh(x)),
-        (tvm.relay.asin, lambda x: 1.0 / (1.0 - x ** 2) ** (1.0 / 2.0)),
-        (tvm.relay.acos, lambda x: -1.0 / (1.0 - x ** 2.0) ** (1.0 / 2.0)),
-        (tvm.relay.acosh, lambda x: 1.0 / (x ** 2 - 1.0) ** (1.0 / 2.0)),
-        (tvm.relay.asinh, lambda x: 1.0 / (x ** 2 + 1.0) ** (1.0 / 2.0)),
-        (tvm.relay.atanh, lambda x: -1.0 / (x ** 2 - 1.0)),
+        (tvm.relay.log, lambda x, g: g * (1 / x)),
+        (tvm.relay.exp, lambda x, g: g * np.exp(x)),
+        (tvm.relay.sigmoid, lambda x, g: g * sigmoid(x) * (1 - sigmoid(x))),
+        (tvm.relay.tanh, lambda x, g: g * (1 - np.tanh(x) * np.tanh(x))),
+        (tvm.relay.sqrt, lambda x, g: g * 0.5 * np.power(x, -0.5)),
+        (tvm.relay.abs, lambda x, g: np.where(x < 0, -g, g)),
+        (relay.nn.relu, lambda x, g: np.where(x < 0, np.zeros_like(x), g)),
+        (tvm.relay.erf, lambda x, g: g * (2.0 / (np.pi ** (0.5)) * np.exp(-x * x))),
+        (tvm.relay.cos, lambda x, g: g * -1.0 * np.sin(x)),
+        (tvm.relay.sin, lambda x, g: g * np.cos(x)),
+        (tvm.relay.tan, lambda x, g: g * (1.0 / (np.cos(x) ** 2))),
+        (tvm.relay.atan, lambda x, g: g * (1 / (1 + np.power(x, 2.0)))),
+        (tvm.relay.log2, lambda x, g: g * (1 / (np.log(2) * x))),
+        (tvm.relay.log10, lambda x, g: g * (1 / (np.log(10) * x))),
+        (tvm.relay.cosh, lambda x, g: g * (np.sinh(x))),
+        (tvm.relay.sinh, lambda x, g: g * (np.cosh(x))),
+        (tvm.relay.asin, lambda x, g: g * (1.0 / (1.0 - x ** 2) ** (1.0 / 2.0))),
+        (tvm.relay.acos, lambda x, g: g * (-1.0 / (1.0 - x ** 2.0) ** (1.0 / 2.0))),
+        (tvm.relay.acosh, lambda x, g: g * (1.0 / (x ** 2 - 1.0) ** (1.0 / 2.0))),
+        (tvm.relay.asinh, lambda x, g: g * (1.0 / (x ** 2 + 1.0) ** (1.0 / 2.0))),
+        (tvm.relay.atanh, lambda x, g: g * (-1.0 / (x ** 2 - 1.0))),
     ]:
         for dtype in ("float32", "float64"):
             check_single_op(opfunc, ref, dtype)

From c53030f40e6911a10555097230f69809bc5af73f Mon Sep 17 00:00:00 2001
From: windclarion <windclarion@gmail.com>
Date: Tue, 26 Jan 2021 21:51:24 +0800
Subject: [PATCH 113/357] [CMake] use wrong flag name (#7341)

Signed-off-by: windclarion <windclarion@gmail.com>
---
 CMakeLists.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6929dd66e0ef..98dd7dec8bed 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -375,9 +375,9 @@ add_library(tvm_objs OBJECT ${COMPILER_SRCS} ${RUNTIME_SRCS})
 add_library(tvm_runtime_objs OBJECT ${RUNTIME_SRCS})
 
 add_library(tvm SHARED $<TARGET_OBJECTS:tvm_objs>)
-set_property(TARGET tvm APPEND PROPERTY LINK_OPTIONS "${TVM_VISIBILITY_FLAGS}")
+set_property(TARGET tvm APPEND PROPERTY LINK_OPTIONS "${TVM_VISIBILITY_FLAG}")
 add_library(tvm_runtime SHARED $<TARGET_OBJECTS:tvm_runtime_objs>)
-set_property(TARGET tvm_runtime APPEND PROPERTY LINK_OPTIONS "${TVM_VISIBILITY_FLAGS}")
+set_property(TARGET tvm_runtime APPEND PROPERTY LINK_OPTIONS "${TVM_VISIBILITY_FLAG}")
 
 if(USE_MICRO)
   # NOTE: cmake doesn't track dependencies at the file level across subdirectories. For the

From ab8bc0aaa96a6b4eac7bc682a1b2ef746da4a27a Mon Sep 17 00:00:00 2001
From: Andrew Reusch <areusch@octoml.ai>
Date: Tue, 26 Jan 2021 06:41:04 -0800
Subject: [PATCH 114/357] Add resource_handle to TVM_DLL_EXPORT_TYPED_FUNC.
 (#7338)

* In #5921, resource_handle was added as a parameter to
   TVMBackendPackedCFunc, which is the typedef for functions called by
   LibraryModule's function lookup.
 * It appears TVM_DLL_EXPORT_TYPED_FUNC was overlooked in that PR,
   although there don't seem to be any runtime affects known so
   far. However, making this definition proper to avoid any compiler
   warnings/debug tool problems.
 * See also https://discuss.tvm.apache.org/t/rfc-misra-c-changes-for-rpc-support/7098/5
---
 include/tvm/runtime/packed_func.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/tvm/runtime/packed_func.h b/include/tvm/runtime/packed_func.h
index ee4ab82cd4d3..ee2c7fd830f0 100644
--- a/include/tvm/runtime/packed_func.h
+++ b/include/tvm/runtime/packed_func.h
@@ -910,9 +910,9 @@ struct PackedFuncValueConverter {
 #define TVM_DLL_EXPORT_PACKED_FUNC(ExportName, Function)                                    \
   extern "C" {                                                                              \
   TVM_DLL int ExportName(TVMValue* args, int* type_code, int num_args, TVMValue* out_value, \
-                         int* out_type_code);                                               \
+                         int* out_type_code, void* resource_handle);                        \
   int ExportName(TVMValue* args, int* type_code, int num_args, TVMValue* out_value,         \
-                 int* out_type_code) {                                                      \
+                 int* out_type_code, void* resource_handle) {                               \
     try {                                                                                   \
       ::tvm::runtime::TVMRetValue rv;                                                       \
       Function(::tvm::runtime::TVMArgs(args, type_code, num_args), &rv);                    \

From 1e0d3569b94f650243f4d0ac204d196e3be8b0aa Mon Sep 17 00:00:00 2001
From: masahi <masahi129@gmail.com>
Date: Wed, 27 Jan 2021 08:54:36 +0900
Subject: [PATCH 115/357] [Relay, TOPI] Add numpy style cumsum op (#7334)

* Add cumsum relay/topi op

* relay tests working

* add torch frontend converter

* fix for importing detr

* fix bad merge

* begin cuda cumsum

* support non innermost axis

* support rank higher than 3

* making binop parameter

* fix overflow issue in thrust scan

* generic binop parameter working

* relay test working

* fixed for bool input

* remove pytorch change

* fix pylint

* doc update

* Update python/tvm/topi/cumsum.py

Co-authored-by: Tristan Konolige <tristan.konolige@gmail.com>

* Update tests/python/relay/test_op_level3.py

Co-authored-by: Tristan Konolige <tristan.konolige@gmail.com>

* add example outputs

* add supported input and output dtype in thrust log

* adding more loop var names

* fix cpplint

* fix missing check for the cuda target in nms thrust sort

* parallelize cpu cumsum

* making binop argument tir function

* update doc for binop

* doc update

Co-authored-by: Tristan Konolige <tristan.konolige@gmail.com>
---
 include/tvm/relay/attrs/transform.h          |  10 +
 python/tvm/relay/op/_transform.py            |  12 +-
 python/tvm/relay/op/strategy/cuda.py         |  12 +
 python/tvm/relay/op/strategy/generic.py      |  21 ++
 python/tvm/relay/op/transform.py             |  49 ++++
 python/tvm/topi/__init__.py                  |   1 +
 python/tvm/topi/cuda/__init__.py             |   1 +
 python/tvm/topi/cuda/nms.py                  |   3 +-
 python/tvm/topi/cuda/scan.py                 | 255 +++++++++++++------
 python/tvm/topi/cuda/sort.py                 |   7 +-
 python/tvm/topi/cumsum.py                    | 106 ++++++++
 python/tvm/topi/utils.py                     |   5 +
 src/relay/op/tensor/transform.cc             |  52 ++++
 src/runtime/contrib/thrust/thrust.cu         |  73 +++++-
 tests/python/contrib/test_thrust.py          |   4 +-
 tests/python/relay/test_op_level3.py         |  36 +++
 tests/python/topi/python/test_topi_cumsum.py |  72 ++++++
 17 files changed, 625 insertions(+), 94 deletions(-)
 create mode 100644 python/tvm/topi/cumsum.py
 create mode 100644 tests/python/topi/python/test_topi_cumsum.py

diff --git a/include/tvm/relay/attrs/transform.h b/include/tvm/relay/attrs/transform.h
index efa44e026c51..43166249638a 100644
--- a/include/tvm/relay/attrs/transform.h
+++ b/include/tvm/relay/attrs/transform.h
@@ -438,6 +438,16 @@ struct MatrixSetDiagAttrs : public tvm::AttrsNode<MatrixSetDiagAttrs> {
   }
 };  // struct MatrixSetDiagAttrs
 
+/*! \brief Attributes used in cumsum operator */
+struct CumsumAttrs : public tvm::AttrsNode<CumsumAttrs> {
+  Integer axis;
+  DataType dtype;
+  TVM_DECLARE_ATTRS(CumsumAttrs, "relay.attrs.CumsumAttrs") {
+    TVM_ATTR_FIELD(axis).describe("The axis to sum over").set_default(NullValue<Integer>());
+    TVM_ATTR_FIELD(dtype).describe("Output data type").set_default(NullValue<DataType>());
+  }
+};
+
 }  // namespace relay
 }  // namespace tvm
 #endif  // TVM_RELAY_ATTRS_TRANSFORM_H_
diff --git a/python/tvm/relay/op/_transform.py b/python/tvm/relay/op/_transform.py
index 05ca6d2e4bb9..fd07c98ddc1f 100644
--- a/python/tvm/relay/op/_transform.py
+++ b/python/tvm/relay/op/_transform.py
@@ -103,7 +103,7 @@ def compute_scatter_add(attrs, inputs, output_type):
 
 _reg.register_strategy("scatter_add", strategy.scatter_add_strategy)
 
-# scatter
+# scatter_nd
 @_reg.register_compute("scatter_nd")
 def compute_scatter_nd(attrs, inputs, output_type):
     """Compute definition of scatter_nd"""
@@ -112,6 +112,16 @@ def compute_scatter_nd(attrs, inputs, output_type):
 
 _reg.register_strategy("scatter_nd", strategy.scatter_nd_strategy)
 
+# cumsum
+@_reg.register_compute("cumsum")
+def compute_cumsum(attrs, inputs, output_type):
+    """Compute definition of cumsum"""
+    return [topi.cumsum(inputs[0], attrs.axis, attrs.dtype)]
+
+
+_reg.register_strategy("cumsum", strategy.cumsum_strategy)
+_reg.register_shape_func("cumsum", False, elemwise_shape_func)
+
 #####################
 #  Shape functions  #
 #####################
diff --git a/python/tvm/relay/op/strategy/cuda.py b/python/tvm/relay/op/strategy/cuda.py
index 3863df0fd831..346e93445f1c 100644
--- a/python/tvm/relay/op/strategy/cuda.py
+++ b/python/tvm/relay/op/strategy/cuda.py
@@ -996,3 +996,15 @@ def argwhere_strategy_cuda(attrs, inputs, out_type, target):
         name="argwhere.cuda",
     )
     return strategy
+
+
+@cumsum_strategy.register(["cuda", "gpu"])
+def cumsum_strategy_cuda(attrs, inputs, out_type, target):
+    """cumsum cuda strategy"""
+    strategy = _op.OpStrategy()
+    strategy.add_implementation(
+        wrap_compute_cumsum(topi.cuda.cumsum),
+        wrap_topi_schedule(topi.cuda.schedule_scan),
+        name="cumsum.cuda",
+    )
+    return strategy
diff --git a/python/tvm/relay/op/strategy/generic.py b/python/tvm/relay/op/strategy/generic.py
index 8dd9dc5844dd..3ad75faf4bc1 100644
--- a/python/tvm/relay/op/strategy/generic.py
+++ b/python/tvm/relay/op/strategy/generic.py
@@ -1361,3 +1361,24 @@ def threefry_split_strategy(attrs, inputs, out_type, target):
         name="threefry_split.generic",
     )
     return strategy
+
+
+def wrap_compute_cumsum(topi_compute):
+    """Wrap cumsum topi compute"""
+
+    def _compute_cumsum(attrs, inputs, _):
+        return [topi_compute(inputs[0], attrs.axis, attrs.dtype)]
+
+    return _compute_cumsum
+
+
+@override_native_generic_func("cumsum_strategy")
+def cumsum_strategy(attrs, inputs, out_type, target):
+    """cumsum generic strategy"""
+    strategy = _op.OpStrategy()
+    strategy.add_implementation(
+        wrap_compute_cumsum(topi.cumsum),
+        wrap_topi_schedule(topi.generic.schedule_extern),
+        name="cumsum.generic",
+    )
+    return strategy
diff --git a/python/tvm/relay/op/transform.py b/python/tvm/relay/op/transform.py
index 7e7f9b299593..6785ff248612 100644
--- a/python/tvm/relay/op/transform.py
+++ b/python/tvm/relay/op/transform.py
@@ -1320,3 +1320,52 @@ def adv_index(inputs):
         Output tensor.
     """
     return _make.adv_index(Tuple(inputs))
+
+
+def cumsum(data, axis=None, dtype=None):
+    """Numpy style cumsum op. Return the cumulative inclusive sum of the elements along
+    a given axis.
+
+    Parameters
+    ----------
+    data : relay.Expr
+        The input data to the operator.
+
+    axis : int, optional
+        Axis along which the cumulative sum is computed. The default (None) is to compute
+        the cumsum over the flattened array.
+
+    dtype : string, optional
+        Type of the returned array and of the accumulator in which the elements are summed.
+        If dtype is not specified, it defaults to the dtype of data.
+
+    Returns
+    -------
+    result : relay.Expr
+        The result has the same size as data, and the same shape as data if axis is not None.
+        If axis is None, the result is a 1-d array.
+
+    Examples
+    --------
+    .. code-block:: python
+        a = [[1,2,3], [4,5,6]]
+
+        cumsum(a)  # if axis is not provided, cumsum is done over the flattened input.
+        -> [ 1,  3,  6, 10, 15, 21]
+
+        cumsum(a, dtype="float32")
+        -> [  1.,   3.,   6.,  10.,  15.,  21.]
+
+        cumsum(a, axis=0)  # sum over rows for each of the 3 columns
+        -> [[1, 2, 3],
+            [5, 7, 9]]
+
+        cumsum(a, axis=1)
+        -> [[ 1,  3,  6],
+            [ 4,  9, 15]]
+
+        a = [1, 0, 1, 0, 1, 1, 0]  # a is a boolean array
+        cumsum(a, dtype=int32)  # dtype should be provided to get the expected results
+        -> [1, 1, 2, 2, 3, 4, 4]
+    """
+    return _make.cumsum(data, axis, dtype)
diff --git a/python/tvm/topi/__init__.py b/python/tvm/topi/__init__.py
index cb94b5b86c9e..873901df62a5 100644
--- a/python/tvm/topi/__init__.py
+++ b/python/tvm/topi/__init__.py
@@ -40,6 +40,7 @@
 from .scatter import *
 from .scatter_add import *
 from .argwhere import *
+from .cumsum import *
 from . import generic
 from . import nn
 from . import x86
diff --git a/python/tvm/topi/cuda/__init__.py b/python/tvm/topi/cuda/__init__.py
index 42bf980bec4c..e0ff5a12a9b2 100644
--- a/python/tvm/topi/cuda/__init__.py
+++ b/python/tvm/topi/cuda/__init__.py
@@ -56,3 +56,4 @@
 from .correlation import *
 from .sparse import *
 from .argwhere import *
+from .scan import *
diff --git a/python/tvm/topi/cuda/nms.py b/python/tvm/topi/cuda/nms.py
index 32691da90ecc..2d6e1e464ef8 100644
--- a/python/tvm/topi/cuda/nms.py
+++ b/python/tvm/topi/cuda/nms.py
@@ -609,7 +609,8 @@ def _get_sorted_indices(data, data_buf, score_index, score_shape):
         tag="fetch_score",
     )
 
-    if is_thrust_available():
+    target = tvm.target.Target.current()
+    if target and target.kind.name == "cuda" and is_thrust_available():
         sort_tensor = argsort_thrust(score_tensor, axis=1, is_ascend=False, dtype="int32")
     else:
         sort_tensor = argsort(score_tensor, axis=1, is_ascend=False, dtype="int32")
diff --git a/python/tvm/topi/cuda/scan.py b/python/tvm/topi/cuda/scan.py
index f19e4a14239a..232d679840fd 100644
--- a/python/tvm/topi/cuda/scan.py
+++ b/python/tvm/topi/cuda/scan.py
@@ -19,30 +19,41 @@
 import tvm
 from tvm import te
 from tvm._ffi import get_global_func
-from ..transform import expand_dims, squeeze
-from ..utils import ceil_div
+from ..transform import expand_dims, squeeze, transpose, reshape
+from ..utils import ceil_div, swap, prod, get_const_int
 from ..math import cast
 from .. import tag
 from .injective import schedule_injective_from_existing
 
 
-def exclusive_sum_scan2d_ir(data, output, reduction=None):
+def _get_thrust_func_name(tvmop):
+    tvmop_to_thrust_func_name = {tvm.tir.generic.add: "tvm.contrib.thrust.sum_scan"}
+    assert tvmop in tvmop_to_thrust_func_name, "{} not supported by thrust".format(tvmop)
+    return tvmop_to_thrust_func_name[tvmop]
+
+
+def exclusive_scan_ir(data, output, reduction=None, binop=tvm.tir.generic.add):
     """Low level IR to do exclusive sum scan along rows of 2D input.
 
     Parameters
     ----------
     data : Buffer
-        Input data. 2-D Buffer with shape [batch_size, scan_axis_size].
+        Input N-D Buffer. Scan is done over the innermost axis.
 
     output: Buffer
-        A buffer to store the output scan, of the same size as data
+        A buffer to store the output scan, of the same shape as data
 
     reduction: Buffer, optional
-        1D Buffer of size [batch_size], to store the sum of each row.
+        (N-1)-D Buffer, to store the sum of each scan axis.
+
+    binop: function, optional
+        A binary associative op to use for scan. The function takes two TIR expressions
+        and produce a new TIR expression. By default it uses tvm.tir.generic.add to compute
+        prefix sum.
     """
 
-    batch_size = data.shape[0]
-    scan_axis_size = data.shape[1]
+    batch_size = prod(data.shape[:-1])
+    scan_axis_size = data.shape[-1]
 
     ib = tvm.tir.ir_builder.create()
 
@@ -76,7 +87,7 @@ def exclusive_sum_scan2d_ir(data, output, reduction=None):
             ib.scope_attr(by, "thread_extent", nthread_by)
             tid = bx * nthread_tx + tx
             with ib.if_scope(tid < scan_axis_size):
-                output[by, tid] = data[by, tid]
+                output[by * scan_axis_size + tid] = cast(data[by * scan_axis_size + tid], out_dtype)
 
         nthread_tx = max_threads
         nthread_bx = ceil_div(scan_axis_size, max_threads)
@@ -111,9 +122,10 @@ def exclusive_sum_scan2d_ir(data, output, reduction=None):
                     middle[0] = start[0] + tvm.tir.indexdiv(width, 2)
                     end[0] = tvm.te.min(start[0] + width, scan_axis_size)
                     with ib.if_scope(middle[0] < scan_axis_size):
-                        output[by * scan_axis_size + end[0] - 1] += output[
-                            by * scan_axis_size + middle[0] - 1
-                        ]
+                        output[by * scan_axis_size + end[0] - 1] = binop(
+                            output[by * scan_axis_size + end[0] - 1],
+                            output[by * scan_axis_size + middle[0] - 1],
+                        )
 
         # Down Sweep of exclusive scan
         with ib.new_scope():
@@ -153,28 +165,33 @@ def exclusive_sum_scan2d_ir(data, output, reduction=None):
                         output[by * scan_axis_size + middle[0] - 1] = output[
                             by * scan_axis_size + end[0] - 1
                         ]
-                        output[by * scan_axis_size + end[0] - 1] += tmp[0]
+                        output[by * scan_axis_size + end[0] - 1] = binop(
+                            output[by * scan_axis_size + end[0] - 1], tmp[0]
+                        )
     return ib.get()
 
 
-def get_reduction_from_exclusive_scan(data, ex_scan_output):
+def get_reduction_from_exclusive_scan(data, ex_scan_output, binop=tvm.tir.generic.add):
     """Return the sum of the last element of data and the exclusive scan output.
     The is the reduction of data along each row (for 2-D case).
 
     Parameters
     ----------
     data : tvm.te.Tensor
-        Input data. 1-D tensor with shape [scan_axis_size], or
-        2-D tensor with shape [batch_size, scan_axis_size].
+        Input data of any shape
 
     ex_scan_output : tvm.te.Tensor
-        1-D tensor that is the exclusive scan of the input, or
-        2-D tensor storing the exclusive scan of each row.
+        The output of exclusive scan on data
+
+    binop: function, optional
+        A binary associative op to use for scan. The function takes two TIR expressions
+        and produce a new TIR expression. By default it uses tvm.tir.generic.add to compute
+        prefix sum.
 
     Returns
     -------
     reduction : tvm.te.Tensor
-        1-D tensor storing the reduction of each row.
+        (N-1)-D tensor storing the reduction of each scan axis.
     """
     ndim = len(data.shape)
     if ndim == 1:
@@ -182,8 +199,8 @@ def get_reduction_from_exclusive_scan(data, ex_scan_output):
         ex_scan_output = expand_dims(ex_scan_output, axis=0)
 
     def ir(data, data_ex_scan, reduction):
-        batch_size = data.shape[0]
-        num_anchors = data.shape[1]
+        batch_size = prod(data.shape[:-1])
+        scan_axis_size = data.shape[-1]
 
         ib = tvm.tir.ir_builder.create()
 
@@ -201,21 +218,23 @@ def ir(data, data_ex_scan, reduction):
             ib.scope_attr(bx, "thread_extent", nthread_bx)
             tid = bx * max_threads + tx
             with ib.if_scope(tid < batch_size):
-                with ib.if_scope(num_anchors > 0):
-                    reduction[tid] = data_ex_scan[tid, num_anchors - 1] + data[tid, num_anchors - 1]
+                with ib.if_scope(scan_axis_size > 0):
+                    reduction[tid] = binop(
+                        data_ex_scan[tid * scan_axis_size + scan_axis_size - 1],
+                        data[tid, scan_axis_size - 1],
+                    )
                 with ib.else_scope():
                     reduction[tid] = 0
 
         return ib.get()
 
-    assert len(data.shape) == 2, "Only 2D input supported for now"
     data_buf = tvm.tir.decl_buffer(data.shape, data.dtype, "valid_indices_buf", data_alignment=8)
     ex_scan_output_buf = tvm.tir.decl_buffer(
         ex_scan_output.shape, ex_scan_output.dtype, "ex_scan_output_buf", data_alignment=8
     )
 
     reduction = te.extern(
-        [(data.shape[0],)],
+        [data.shape[:-1]],
         [data, ex_scan_output],
         lambda ins, outs: ir(ins[0], ins[1], outs[0]),
         dtype=[ex_scan_output.dtype],
@@ -235,14 +254,15 @@ def is_thrust_available():
     return get_global_func("tvm.contrib.thrust.sum_scan", allow_missing=True) is not None
 
 
-def scan_thrust(data, output_dtype, exclusive=True, return_reduction=False):
-    """Do exclusive scan on 1D input or along rows of 2D input, using thrust.
+def scan_thrust(
+    data, output_dtype, exclusive=True, return_reduction=False, binop=tvm.tir.generic.add
+):
+    """Do exclusive or inclusive scan on 1D or multidimensional input, using thrust.
 
     Parameters
     ----------
     data : tvm.te.Tensor
-        Input data. 1-D tensor with shape [scan_axis_size], or
-        2-D tensor with shape [batch_size, scan_axis_size].
+        Input data of any shape. The scan is done over the innermost axis.
 
     output_dtype: string
         The dtype of the output scan tensor.
@@ -251,99 +271,104 @@ def scan_thrust(data, output_dtype, exclusive=True, return_reduction=False):
         Whether or not do exclusive or inclusive scan.
 
     return_reduction: bool, optional
-        Whether or not return a 1-D tensor storing the reduction of each row.
+        Whether or not return a (N-1)-D tensor storing the reduction of each scan axis.
         Reductions are computed as part of the upsweep pass, so there is no extra cost.
-        If False, reductions are ignored.
+        If False, reductions are ignored. It must be False when exclusive is False.
+
+    binop: function, optional
+        A binary associative op to use for scan. Since we need to lookup the corresponding
+        thrust function, arbitrariy callables are not supported. Currently only
+        tvm.tir.generic.add can be passed in.
 
     Returns
     -------
     output : tvm.te.Tensor
-        1-D tensor that is the exclusive scan of the input, or
-        2-D tensor storing the exclusive scan of each row.
+        A N-D tensor of the same rank N and shape as the input data.
 
     reduction : tvm.te.Tensor, optional
-        1-D tensor storing the reduction of each row.
+        (N-1)-D tensor storing the reduction of each scan axis.
         Returned if return_reduction is True.
     """
     data_buf = tvm.tir.decl_buffer(data.shape, data.dtype, "data_buf", data_alignment=8)
     output_buf = tvm.tir.decl_buffer(data.shape, output_dtype, "output_buf", data_alignment=8)
+
     output = te.extern(
         [data.shape],
         [data],
         lambda ins, outs: tvm.tir.call_packed(
-            "tvm.contrib.thrust.sum_scan", ins[0], outs[0], exclusive
+            _get_thrust_func_name(binop), ins[0], outs[0], exclusive
         ),
         dtype=[output_dtype],
         in_buffers=[data_buf],
         out_buffers=[output_buf],
-        name="exclusive_sum_scan2d",
-        tag="exclusive_sum_scan2d_gpu",
+        name="exclusive_scan_thrust",
+        tag="exclusive_scan_thrust_gpu",
     )
 
     if return_reduction:
         assert exclusive, "return_reduction should be False for inclusive scan"
-        reduction = get_reduction_from_exclusive_scan(data, output)
+        reduction = get_reduction_from_exclusive_scan(data, output, binop)
         return output, reduction
 
     return output
 
 
-def exclusive_scan(data, axis=-1, return_reduction=False, output_dtype=None):
-    """Do exclusive scan on 1D input or along rows of 2D input.
+def exclusive_scan(
+    data, axis=-1, return_reduction=False, output_dtype=None, binop=tvm.tir.generic.add
+):
+    """Do exclusive scan on 1D or multidimensional input.
 
     Parameters
     ----------
     data : tvm.te.Tensor
-        Input data. 1-D tensor with shape [scan_axis_size], or
-        2-D tensor with shape [batch_size, scan_axis_size].
+        Input data of any shape.
 
     axis: int, optional
-        The axis to do scan on. For now, only the inner most axis is supported.
+        The axis to do scan on. By default, scan is done on the innermost axis.
 
     return_reduction: bool, optional
-        Whether or not return a 1-D tensor storing the reduction of each row.
+        Whether or not return a tensor storing the reduction over each scan axis.
+        If the input rank is N, this tensor is of rank N - 1.
         Reductions are computed as part of the upsweep pass, so there is no extra cost.
         If False, reductions are ignored.
 
     output_dtype: string, optional
         The dtype of the output scan tensor. If not provided, the dtype of the input is used.
 
+    binop: function, optional
+        A binary associative op to use for scan. The function takes two TIR expressions
+        and produce a new TIR expression. By default it uses tvm.tir.generic.add to compute
+        prefix sum.
+
     Returns
     -------
     output : tvm.te.Tensor
-        1-D tensor that is the exclusive scan of the input, or
-        2-D tensor storing the exclusive scan of each row.
+        A N-D tensor of the same rank N and shape as the input data.
 
     reduction : tvm.te.Tensor, optional
-        1-D tensor storing the reduction of each row.
+        (N-1)-D tensor storing the reduction of each scan axis.
         Returned if return_reduction is True.
     """
-    # TODO(masahi): Support other binary operators
-    ndim = len(data.shape)
-    if axis < 0:
-        axis += ndim
-    assert axis == ndim - 1, "Only support scan on the inner most axis."
 
-    if output_dtype is None:
-        output_dtype = data.dtype
+    def do_scan(data, output_dtype):
+        target = tvm.target.Target.current()
+        if target and target.kind.name == "cuda" and is_thrust_available():
+            return scan_thrust(
+                data, output_dtype, exclusive=True, return_reduction=return_reduction, binop=binop
+            )
 
-    target = tvm.target.Target.current()
-    if target and target.kind.name == "cuda" and is_thrust_available():
-        return scan_thrust(data, output_dtype, exclusive=True, return_reduction=return_reduction)
+        if ndim == 1:
+            # TIR exclusive scan accepts only 2D or higher-rank inputs.
+            data = expand_dims(data, axis=0)
 
-    if ndim == 1:
-        # TIR exclusive scan accepts only 2D inputs.
-        data = expand_dims(data, axis=0)
+        data_buf = tvm.tir.decl_buffer(data.shape, data.dtype, "data_buf", data_alignment=8)
+        output_buf = tvm.tir.decl_buffer(data.shape, output_dtype, "output_buf", data_alignment=8)
 
-    data_buf = tvm.tir.decl_buffer(data.shape, data.dtype, "data_buf", data_alignment=8)
-    output_buf = tvm.tir.decl_buffer(data.shape, output_dtype, "output_buf", data_alignment=8)
-
-    if len(data.shape) == 2:
         if return_reduction:
             output, reduction = te.extern(
-                [data.shape, (data.shape[0],)],
+                [data.shape, data.shape[:-1]],
                 [data],
-                lambda ins, outs: exclusive_sum_scan2d_ir(ins[0], outs[0], outs[1]),
+                lambda ins, outs: exclusive_scan_ir(ins[0], outs[0], outs[1], binop=binop),
                 dtype=[data.dtype, output_dtype],
                 in_buffers=[data_buf],
                 name="exclusive_scan",
@@ -353,7 +378,7 @@ def exclusive_scan(data, axis=-1, return_reduction=False, output_dtype=None):
             output = te.extern(
                 [data.shape],
                 [data],
-                lambda ins, outs: exclusive_sum_scan2d_ir(ins[0], outs[0]),
+                lambda ins, outs: exclusive_scan_ir(ins[0], outs[0], binop=binop),
                 dtype=[output_dtype],
                 in_buffers=[data_buf],
                 out_buffers=[output_buf],
@@ -361,13 +386,38 @@ def exclusive_scan(data, axis=-1, return_reduction=False, output_dtype=None):
                 tag="exclusive_scan_gpu",
             )
             reduction = None
-    else:
-        assert False, "Unsupported dimension {}".format(ndim)
 
-    if ndim == 1:
-        output = squeeze(output, 0)
+        if ndim == 1:
+            output = squeeze(output, 0)
+            if return_reduction:
+                reduction = squeeze(reduction, 0)
+
         if return_reduction:
-            reduction = squeeze(reduction, 0)
+            return output, reduction
+
+        return output
+
+    if output_dtype is None or output_dtype == "":
+        output_dtype = data.dtype
+
+    ndim = len(data.shape)
+    if axis < 0:
+        axis += ndim
+
+    # If scan axis is not the innermost one, swap the scan and the innermost axes
+    # Scan is always done on the innermost axis, for performance reason.
+    if axis != ndim - 1:
+        axes = swap(list(range(ndim)), axis)
+        data = transpose(data, axes)
+
+    if return_reduction:
+        output, reduction = do_scan(data, output_dtype)
+    else:
+        output = do_scan(data, output_dtype)
+
+    if axis != ndim - 1:
+        axes = swap(list(range(ndim)), axis)
+        output = transpose(output, axes)
 
     if return_reduction:
         return output, reduction
@@ -375,6 +425,38 @@ def exclusive_scan(data, axis=-1, return_reduction=False, output_dtype=None):
     return output
 
 
+def inclusive_scan(data, axis=-1, output_dtype=None, binop=tvm.tir.generic.add):
+    """Do inclusive scan on 1D or multidimensional input.
+
+    Parameters
+    ----------
+    data : tvm.te.Tensor
+        Input data of any shape.
+
+    axis: int, optional
+        The axis to do scan on. By default, scan is done on the innermost axis.
+
+    output_dtype: string, optional
+        The dtype of the output scan tensor. If not provided, the dtype of the input is used.
+
+    binop: function, optional
+        A binary associative op to use for scan. The function takes two TIR expressions
+        and produce a new TIR expression. By default it uses tvm.tir.generic.add to compute
+        prefix sum.
+
+    Returns
+    -------
+    output : tvm.te.Tensor
+        A N-D tensor of the same rank N as the input data.
+    """
+    ex_scan = exclusive_scan(data, axis, output_dtype=output_dtype, binop=binop)
+
+    if output_dtype is not None and data.dtype != output_dtype and output_dtype != "":
+        data = cast(data, output_dtype)
+
+    return binop(data, ex_scan)
+
+
 def schedule_scan(outs):
     """Schedule for scan operator.
 
@@ -404,3 +486,32 @@ def traverse(op):
     for out in outs:
         traverse(out.op)
     return s
+
+
+def cumsum(data, axis=None, dtype=None):
+    """Numpy style cumsum op. Return the cumulative sum of the elements along a given axis.
+
+    Parameters
+    ----------
+    data : tvm.te.Tensor
+        The input data to the operator.
+
+    axis : int, optional
+        Axis along which the cumulative sum is computed. The default (None) is to compute
+        the cumsum over the flattened array.
+
+    dtype : string, optional
+        Type of the returned array and of the accumulator in which the elements are summed.
+        If dtype is not specified, it defaults to the dtype of data.
+
+    Returns
+    -------
+    result : tvm.te.Tensor
+        The result has the same size as data, and the same shape as data if axis is not None.
+        If axis is None, the result is a 1-d array.
+    """
+    if axis is None:
+        axis = 0
+        data = reshape(data, (prod(data.shape),))
+    axis = get_const_int(axis)
+    return inclusive_scan(data, axis, output_dtype=dtype, binop=tvm.tir.generic.add)
diff --git a/python/tvm/topi/cuda/sort.py b/python/tvm/topi/cuda/sort.py
index 18340385205e..c0f076fb6065 100644
--- a/python/tvm/topi/cuda/sort.py
+++ b/python/tvm/topi/cuda/sort.py
@@ -23,12 +23,7 @@
 from .injective import schedule_injective_from_existing
 from ..transform import strided_slice, transpose
 from .. import tag
-from ..utils import ceil_div
-
-
-def swap(arr, axis):
-    """ swap arr[axis] and arr[-1] """
-    return arr[:axis] + [arr[-1]] + arr[axis + 1 : -1] + [arr[axis]]
+from ..utils import ceil_div, swap
 
 
 def _schedule_sort(outs):
diff --git a/python/tvm/topi/cumsum.py b/python/tvm/topi/cumsum.py
new file mode 100644
index 000000000000..855427b1c619
--- /dev/null
+++ b/python/tvm/topi/cumsum.py
@@ -0,0 +1,106 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=invalid-name
+"""Cumsum operator"""
+from ..tir import decl_buffer, ir_builder
+from ..te import extern
+from .utils import prod, get_const_int
+from .math import cast
+
+
+def cumsum(data, axis=None, dtype=None):
+    """Numpy style cumsum op. Return the cumulative sum of the elements along a given axis.
+
+    Parameters
+    ----------
+    data : tvm.te.Tensor
+        The input data to the operator.
+
+    axis : int, optional
+        Axis along which the cumulative sum is computed. The default (None) is to compute
+        the cumsum over the flattened array.
+
+    dtype : string, optional
+        Type of the returned array and of the accumulator in which the elements are summed.
+        If dtype is not specified, it defaults to the dtype of data.
+
+    Returns
+    -------
+    result : tvm.te.Tensor
+        The result has the same size as data, and the same shape as data if axis is not None.
+        If axis is None, the result is a 1-d array.
+    """
+    if dtype is None or dtype == "":
+        dtype = data.dtype
+
+    def maybe_cast(x):
+        if dtype != data.dtype:
+            return cast(x, dtype)
+        return x
+
+    axis_mul_before = 1
+    axis_mul_after = 1
+
+    if axis is None:
+        axis = 0
+        cumsum_axis_len = prod(data.shape)
+        shape = (cumsum_axis_len,)
+    else:
+        if not isinstance(axis, int):
+            axis = get_const_int(axis)
+
+        shape = data.shape
+        cumsum_axis_len = shape[axis]
+
+        if axis < 0:
+            axis = len(shape) + axis
+
+        for i, value in enumerate(shape, 0):
+            if i < axis:
+                axis_mul_before *= value
+            elif i > axis:
+                axis_mul_after *= value
+
+    def gen_ir(data_buf, out_buf):
+        ib = ir_builder.create()
+        data_buf = ib.buffer_ptr(data_buf)
+        out_buf = ib.buffer_ptr(out_buf)
+
+        with ib.for_range(0, axis_mul_before * axis_mul_after, "fused", kind="parallel") as fused:
+            i = fused // axis_mul_after
+            j = fused % axis_mul_after
+            base_idx = i * cumsum_axis_len * axis_mul_after + j
+            out_buf[base_idx] = maybe_cast(data_buf[base_idx])
+            with ib.for_range(0, cumsum_axis_len - 1, "_k") as _k:
+                k = _k + 1
+                cur_idx = base_idx + k * axis_mul_after
+                prev_idx = base_idx + (k - 1) * axis_mul_after
+                out_buf[cur_idx] = out_buf[prev_idx] + maybe_cast(data_buf[cur_idx])
+
+        return ib.get()
+
+    out_buf = decl_buffer(shape, dtype, "out_buf")
+
+    return extern(
+        [shape],
+        [data],
+        lambda ins, outs: gen_ir(ins[0], outs[0]),
+        dtype=dtype,
+        out_buffers=[out_buf],
+        name="cumsum_generic",
+        tag="cumsum_generic",
+    )
diff --git a/python/tvm/topi/utils.py b/python/tvm/topi/utils.py
index dfc226f0c331..cd9f0c61c854 100644
--- a/python/tvm/topi/utils.py
+++ b/python/tvm/topi/utils.py
@@ -492,3 +492,8 @@ def is_empty_shape(shape):
 def ceil_div(a, b):
     """Return ceil division of a by b"""
     return tvm.tir.indexdiv(a + (b - 1), b)
+
+
+def swap(arr, axis):
+    """ swap arr[axis] and arr[-1] """
+    return arr[:axis] + [arr[-1]] + arr[axis + 1 : -1] + [arr[axis]]
diff --git a/src/relay/op/tensor/transform.cc b/src/relay/op/tensor/transform.cc
index ecfde359d11d..0e868cdc50c9 100644
--- a/src/relay/op/tensor/transform.cc
+++ b/src/relay/op/tensor/transform.cc
@@ -3674,5 +3674,57 @@ RELAY_REGISTER_OP("adv_index")
     .set_attr<TOpPattern>("TOpPattern", kInjective)
     .set_attr<FTVMCompute>("FTVMCompute", AdvIndexCompute);
 
+TVM_REGISTER_NODE_TYPE(CumsumAttrs);
+
+bool CumsumRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
+               const TypeReporter& reporter) {
+  // types: [data, output]
+  ICHECK_EQ(types.size(), 2) << "Expects two types, one for the input and another for the output";
+  const auto* data = types[0].as<TensorTypeNode>();
+  if (data == nullptr) {
+    ICHECK(types[0].as<IncompleteTypeNode>())
+        << "cumsum: expect input type to be TensorType but get " << types[0];
+    return false;
+  }
+
+  const auto* param = attrs.as<CumsumAttrs>();
+
+  auto dtype = param->dtype;
+  if (dtype.is_void()) {
+    dtype = data->dtype;
+  }
+
+  if (param->axis.defined()) {
+    reporter->Assign(types[1], TensorType(data->shape, dtype));
+  } else {
+    auto prod = data->shape[0];
+    for (size_t i = 1; i < data->shape.size(); ++i) {
+      prod = prod * data->shape[i];
+    }
+    reporter->Assign(types[1], TensorType({prod}, dtype));
+  }
+
+  return true;
+}
+
+Expr MakeCumsum(Expr data, Integer axis, DataType dtype) {
+  auto attrs = make_object<CumsumAttrs>();
+  attrs->dtype = dtype;
+  attrs->axis = axis;
+  static const Op& op = Op::Get("cumsum");
+  return Call(op, {data}, Attrs(attrs), {});
+}
+
+TVM_REGISTER_GLOBAL("relay.op._make.cumsum").set_body_typed(MakeCumsum);
+
+RELAY_REGISTER_OP("cumsum")
+    .describe(
+        R"doc(Return the cumulative sum of the elements along a given axis.)doc" TVM_ADD_FILELINE)
+    .set_num_inputs(1)
+    .add_argument("data", "Tensor", "The input tensor.")
+    .set_support_level(3)
+    .add_type_rel("Cumsum", CumsumRel)
+    .set_attr<TOpPattern>("TOpPattern", kOpaque);
+
 }  // namespace relay
 }  // namespace tvm
diff --git a/src/runtime/contrib/thrust/thrust.cu b/src/runtime/contrib/thrust/thrust.cu
index 4e3e3a81af1a..7295d4c47c3f 100644
--- a/src/runtime/contrib/thrust/thrust.cu
+++ b/src/runtime/contrib/thrust/thrust.cu
@@ -275,9 +275,22 @@ void thrust_scan(DLTensor* data,
 
   if (scan_size == 0) return;
 
-  if (data->ndim == 1 || (data->ndim == 2 && data->shape[0] == 1)) {
-    if (exclusive) {
+  size_t size = 1;
+  for (int i = 0; i < data->ndim; ++i) size *= data->shape[i];
+
+  const bool need_cast = std::is_same<InType, OutType>::value == false;
+
+  auto data_cast_ptr = thrust::make_transform_iterator(data_ptr, [] __host__ __device__(InType v) {
+    return static_cast<OutType>(v);
+  }); // NOLINT(*)
+
+  if (size == static_cast<size_t>(data->shape[data->ndim - 1])) {
+    if (exclusive && need_cast) {
+      thrust::exclusive_scan(data_cast_ptr, data_cast_ptr + scan_size, output_ptr);
+    } else if (exclusive && !need_cast) {
       thrust::exclusive_scan(data_ptr, data_ptr + scan_size, output_ptr);
+    } else if (!exclusive && need_cast) {
+      thrust::inclusive_scan(data_cast_ptr, data_cast_ptr + scan_size, output_ptr);
     } else {
       thrust::inclusive_scan(data_ptr, data_ptr + scan_size, output_ptr);
     }
@@ -288,17 +301,19 @@ void thrust_scan(DLTensor* data,
 
     // This is for constructing a sequence 0, 0, 0,...,1, 1, 1,...,2, 2, 2,...,
     // without materializing the sequence vector
-    auto counting_iter = thrust::counting_iterator<int64_t>(0);
+    auto counting_iter = thrust::counting_iterator<size_t>(0);
     // Without __host__ annotation, cub crashes
-    auto linear_index_to_scan_key = [scan_size] __host__ __device__(int64_t i) {
+    auto linear_index_to_scan_key = [scan_size] __host__ __device__(size_t i) {
         return i / scan_size;
     }; // NOLINT(*)
     auto key_iter = thrust::make_transform_iterator(counting_iter, linear_index_to_scan_key);
-    int64_t size = 1;
-    for (int i = 0; i < data->ndim; ++i) size *= data->shape[i];
 
-    if (exclusive) {
+    if (exclusive && need_cast) {
+      thrust::exclusive_scan_by_key(key_iter, key_iter + size, data_cast_ptr, output_ptr);
+    } else if (exclusive && !need_cast) {
       thrust::exclusive_scan_by_key(key_iter, key_iter + size, data_ptr, output_ptr);
+    } else if (!exclusive && need_cast) {
+      thrust::inclusive_scan_by_key(key_iter, key_iter + size, data_cast_ptr, output_ptr);
     } else {
       thrust::inclusive_scan_by_key(key_iter, key_iter + size, data_ptr, output_ptr);
     }
@@ -315,28 +330,62 @@ TVM_REGISTER_GLOBAL("tvm.contrib.thrust.sum_scan")
   auto in_dtype = DLDataType2String(data->dtype);
   auto out_dtype = DLDataType2String(output->dtype);
 
-  if (in_dtype == "int32") {
+  if (in_dtype == "bool") {
+    if (out_dtype == "int32") {
+      thrust_scan<bool, int>(data, output, exclusive);
+    } else if (out_dtype == "int64") {
+      thrust_scan<bool, int64_t>(data, output, exclusive);
+    } else if (out_dtype == "float32") {
+      thrust_scan<bool, float>(data, output, exclusive);
+    } else if (out_dtype == "float64") {
+      thrust_scan<bool, double>(data, output, exclusive);
+    } else {
+      LOG(FATAL) << "Unsupported output dtype: " << out_dtype
+                 << ". Supported output dtypes are int32, int64, float32, and float64";
+    }
+  } else if (in_dtype == "int32") {
     if (out_dtype == "int32") {
       thrust_scan<int, int>(data, output, exclusive);
     } else if (out_dtype == "int64") {
       thrust_scan<int, int64_t>(data, output, exclusive);
+    } else if (out_dtype == "float32") {
+      thrust_scan<int, float>(data, output, exclusive);
+    } else if (out_dtype == "float64") {
+      thrust_scan<int, double>(data, output, exclusive);
     } else {
-      LOG(FATAL) << "Unsupported output dtype: " << out_dtype;
+      LOG(FATAL) << "Unsupported output dtype: " << out_dtype
+                 << ". Supported output dtypes are int32, int64, float32, and float64";
     }
   } else if (in_dtype == "int64") {
     if (out_dtype == "int64") {
       thrust_scan<int64_t, int64_t>(data, output, exclusive);
+    } else if (out_dtype == "float32") {
+      thrust_scan<int64_t, float>(data, output, exclusive);
+    } else if (out_dtype == "float64") {
+      thrust_scan<int64_t, double>(data, output, exclusive);
     } else {
-      LOG(FATAL) << "Unsupported output dtype: " << out_dtype;
+      LOG(FATAL) << "Unsupported output dtype: " << out_dtype
+                 << ". Supported output dtypes are int64, float32, and float64";
     }
   } else if (in_dtype == "float32") {
     if (out_dtype == "float32") {
       thrust_scan<float, float>(data, output, exclusive);
+    } else if (out_dtype == "float64") {
+      thrust_scan<float, double>(data, output, exclusive);
     } else {
-      LOG(FATAL) << "Unsupported output dtype: " << out_dtype;
+      LOG(FATAL) << "Unsupported output dtype: " << out_dtype
+                 << ". Supported output dtypes are float32, and float64";
+    }
+  } else if (in_dtype == "float64") {
+    if (out_dtype == "float64") {
+      thrust_scan<double, double>(data, output, exclusive);
+    } else {
+      LOG(FATAL) << "Unsupported output dtype: " << out_dtype
+                 << ". Supported output dtype is float64";
     }
   } else {
-    LOG(FATAL) << "Unsupported input dtype: " << in_dtype;
+    LOG(FATAL) << "Unsupported input dtype: " << in_dtype
+               << ". Supported input dtypes are bool, int32, int64, float32, and float64";
   }
 });
 
diff --git a/tests/python/contrib/test_thrust.py b/tests/python/contrib/test_thrust.py
index 5f66d465bf17..c5b6a29d57d5 100644
--- a/tests/python/contrib/test_thrust.py
+++ b/tests/python/contrib/test_thrust.py
@@ -59,7 +59,7 @@ def test_exclusive_scan():
         print("skip because thrust is not enabled...")
         return
 
-    for ishape in [(1,), (10, 10)]:
+    for ishape in [(10,), (10, 10), (10, 10, 10)]:
         values = te.placeholder(ishape, name="values", dtype="int32")
 
         with tvm.target.Target("cuda"):
@@ -75,7 +75,7 @@ def test_exclusive_scan():
         if len(ishape) == 1:
             reduction_shape = ()
         else:
-            reduction_shape = (ishape[0],)
+            reduction_shape = ishape[:-1]
 
         reduction_np_out = np.zeros(reduction_shape, np.int32)
 
diff --git a/tests/python/relay/test_op_level3.py b/tests/python/relay/test_op_level3.py
index 5e44170b6428..559eb2462fa8 100644
--- a/tests/python/relay/test_op_level3.py
+++ b/tests/python/relay/test_op_level3.py
@@ -1311,6 +1311,7 @@ def verify_sparse_to_dense(sparse_indices, sparse_values, default_value, output_
     # verify_sparse_to_dense([[[[0, 1, 4], [0, 2, 4]]]], [[[[3.1, 3.1, 3.1]]]], 3.5, [5], [3.1, 3.1, 3.5, 3.5, 3.1])
 
 
+@tvm.testing.uses_gpu
 def test_adv_index():
     def verify_adv_index(data_shape, index_shapes):
         dtype = "float32"
@@ -1342,6 +1343,40 @@ def verify_adv_index(data_shape, index_shapes):
     verify_adv_index((10, 5, 15), [(1, 2, 1), (1, 2, 7)])
 
 
+@tvm.testing.parametrize_targets
+def test_cumsum(target, ctx):
+    def verify_cumsum(data_np, np_out, axis=None, out_dtype=None, rtol=1e-5, atol=1e-5):
+        inp = relay.var("data", relay.TensorType(data_np.shape, str(data_np.dtype)))
+
+        out = relay.op.cumsum(inp, axis, out_dtype)
+        func = relay.Function([inp], out)
+
+        for kind in ["graph", "debug"]:
+            intrp = relay.create_executor(kind, ctx=ctx, target=target)
+            op_res = intrp.evaluate(func)(data_np)
+            tvm.testing.assert_allclose(op_res.asnumpy(), np_out, rtol=rtol, atol=atol)
+
+    data = np.array([2, 3, 0])
+    verify_cumsum(data, np.cumsum(data))
+    verify_cumsum(data, np.cumsum(data), out_dtype="int64")
+
+    data = np.random.randn(10, 10)
+    verify_cumsum(data, np.cumsum(data))
+    verify_cumsum(data, np.cumsum(data, axis=0), axis=0)
+    verify_cumsum(data, np.cumsum(data, axis=1), axis=1)
+
+    data = np.random.randn(10, 5, 10).astype("float32")
+    verify_cumsum(data, np.cumsum(data), rtol=1e-4, atol=1e-4)
+    verify_cumsum(data, np.cumsum(data, axis=0), axis=0, rtol=1e-4, atol=1e-4)
+    verify_cumsum(data, np.cumsum(data, axis=1), axis=1, rtol=1e-4, atol=1e-4)
+    verify_cumsum(data, np.cumsum(data, axis=-1), axis=-1, rtol=1e-4, atol=1e-4)
+
+    data = np.random.rand(10) > 0.5
+    data = data.astype(np.int32)
+    verify_cumsum(data, np.cumsum(data, dtype=np.int32))
+    verify_cumsum(data, np.cumsum(data, dtype="int64"), out_dtype="int64")
+
+
 if __name__ == "__main__":
     test_cast()
     test_zeros_ones()
@@ -1379,3 +1414,4 @@ def verify_adv_index(data_shape, index_shapes):
     test_sparse_to_dense()
     test_fixed_point_multiply()
     test_adv_index()
+    test_cumsum()
diff --git a/tests/python/topi/python/test_topi_cumsum.py b/tests/python/topi/python/test_topi_cumsum.py
new file mode 100644
index 000000000000..a01a496f92e9
--- /dev/null
+++ b/tests/python/topi/python/test_topi_cumsum.py
@@ -0,0 +1,72 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+import numpy as np
+import tvm
+import tvm.testing
+from tvm import topi
+import tvm.topi.testing
+
+
+@tvm.testing.parametrize_targets
+def test_cumsum(ctx, target):
+    def check_cumsum(np_ref, data, axis=None, dtype=None):
+        implementations = {
+            "generic": (lambda x: topi.cumsum(x, axis, dtype), topi.generic.schedule_extern),
+            "cuda": (lambda x: topi.cuda.cumsum(x, axis, dtype), topi.cuda.schedule_scan),
+            "nvptx": (lambda x: topi.cuda.cumsum(x, axis, dtype), topi.cuda.schedule_scan),
+        }
+        fcompute, fschedule = tvm.topi.testing.dispatch(target, implementations)
+        tvm.topi.testing.compare_numpy_tvm([data], np_ref, target, ctx, fcompute, fschedule)
+
+    data = np.array([2, 3, 0])
+    check_cumsum(np.cumsum(data), data)
+
+    data = np.random.rand(10) > 0.5
+    data = data.astype(np.int32)
+    check_cumsum(np.cumsum(data, dtype=np.int32), data)
+    check_cumsum(np.cumsum(data), data, dtype="int64")
+
+    data = np.random.rand(10) > 0.5
+    check_cumsum(np.cumsum(data, dtype=np.int32), data, dtype="int32")
+
+    for in_dtype in ["float32", "float64"]:
+        data = np.random.randn(10, 10).astype(in_dtype)
+        check_cumsum(np.cumsum(data), data)
+        check_cumsum(np.cumsum(data, axis=0), data, axis=0)
+        check_cumsum(np.cumsum(data, axis=1), data, axis=1)
+
+        data = np.random.randn(10, 5, 10).astype(in_dtype)
+        check_cumsum(np.cumsum(data), data)
+        check_cumsum(np.cumsum(data, axis=0), data, axis=0)
+        check_cumsum(np.cumsum(data, axis=1), data, axis=1)
+        check_cumsum(np.cumsum(data, axis=-1), data, axis=-1)
+
+    for in_dtype in ["int32", "int64"]:
+        data = np.random.randint(-100, 100, size=(100, 100)).astype(in_dtype)
+        check_cumsum(np.cumsum(data, dtype=in_dtype), data)
+        check_cumsum(np.cumsum(data), data, dtype="int64")
+        check_cumsum(np.cumsum(data, axis=0, dtype=in_dtype), data, axis=0)
+        check_cumsum(np.cumsum(data, axis=1, dtype=in_dtype), data, axis=1)
+
+        data = np.random.randint(1 << 30, (1 << 31) - 1, size=(100)).astype(in_dtype)
+        check_cumsum(np.cumsum(data), data, dtype="int64")
+
+
+if __name__ == "__main__":
+    test_cumsum(tvm.context("cpu"), tvm.target.Target("llvm"))
+    test_cumsum(tvm.context("cuda"), tvm.target.Target("cuda"))
+    test_cumsum(tvm.context("nvptx"), tvm.target.Target("nvptx"))

From eeec538066f62ae52f4e803d4d3defe74ab9bffc Mon Sep 17 00:00:00 2001
From: JC Li <jincheng.li@gmail.com>
Date: Wed, 27 Jan 2021 05:36:38 -0800
Subject: [PATCH 116/357] Add resource_handle to both TVM_DLL_EXPORT_TYPED_FUNC
 and TVM_DLL_EXPORT_PACKED_FUNC macros in packed_func.h. This is a patch PR
 for #7388. (#7343)

Co-authored-by: JC Li <jinli@nvidia.com>
---
 include/tvm/runtime/packed_func.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/tvm/runtime/packed_func.h b/include/tvm/runtime/packed_func.h
index ee2c7fd830f0..fd4e2114b11a 100644
--- a/include/tvm/runtime/packed_func.h
+++ b/include/tvm/runtime/packed_func.h
@@ -963,7 +963,7 @@ struct PackedFuncValueConverter {
 #define TVM_DLL_EXPORT_TYPED_FUNC(ExportName, Function)                                     \
   extern "C" {                                                                              \
   TVM_DLL int ExportName(TVMValue* args, int* type_code, int num_args, TVMValue* out_value, \
-                         int* out_type_code) {                                              \
+                         int* out_type_code, void* resource_handle) {                       \
     try {                                                                                   \
       auto f = Function;                                                                    \
       using FType = ::tvm::runtime::detail::function_signature<decltype(f)>::FType;         \

From 38fa4202a8dda9de11e484ddd2b5b39124ead4f3 Mon Sep 17 00:00:00 2001
From: Tristan Konolige <tristan.konolige@gmail.com>
Date: Wed, 27 Jan 2021 05:37:11 -0800
Subject: [PATCH 117/357] [FIX] Don't add $TVM_HOME/.. to the include path when
 compiling code. (#7342)

If the user has a dmlc-core directory next to the tvm directory, this
dmlc-core directory will be incorrectly used when compiling files with
cc.py.
---
 python/tvm/_ffi/libinfo.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/python/tvm/_ffi/libinfo.py b/python/tvm/_ffi/libinfo.py
index 28614d072f01..8d67313e2e61 100644
--- a/python/tvm/_ffi/libinfo.py
+++ b/python/tvm/_ffi/libinfo.py
@@ -167,7 +167,6 @@ def find_include_path(name=None, search_path=None, optional=False):
     """
     ffi_dir = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
     source_dir = os.path.join(ffi_dir, "..", "..", "..")
-    install_include_dir = os.path.join(ffi_dir, "..", "..", "..", "..")
 
     third_party_dir = os.path.join(source_dir, "3rdparty")
 
@@ -176,7 +175,6 @@ def find_include_path(name=None, search_path=None, optional=False):
     if os.environ.get("TVM_INCLUDE_PATH", None):
         header_path.append(os.environ["TVM_INCLUDE_PATH"])
 
-    header_path.append(install_include_dir)
     header_path.append(source_dir)
     header_path.append(third_party_dir)
 

From eae21b087cbde53b99fe40b862be7c99dedc57d0 Mon Sep 17 00:00:00 2001
From: Tristan Konolige <tristan.konolige@gmail.com>
Date: Wed, 27 Jan 2021 10:05:27 -0800
Subject: [PATCH 118/357] [PRNG] Add check to PRNG to make sure that unsigned
 integer arithmetic is wrapping (#7287)

* [PRNG] Add check to PRNG to make sure that unsigned integer arithmetic is wrapping

* Add threefry_test_wrapping: a manual test for wrapping unsigned arithmetic.

* fix test to actually run on the target

* formatting

* lint
---
 python/tvm/topi/random/kernel.py           | 62 +++++++++++++++++++++-
 tests/python/topi/python/test_topi_prng.py |  8 +++
 2 files changed, 68 insertions(+), 2 deletions(-)

diff --git a/python/tvm/topi/random/kernel.py b/python/tvm/topi/random/kernel.py
index 576fd9254a79..b21db3778744 100644
--- a/python/tvm/topi/random/kernel.py
+++ b/python/tvm/topi/random/kernel.py
@@ -17,6 +17,7 @@
 """Pseudorandom number kernels."""
 import tvm
 import tvm.topi
+import numpy as np
 from ... import tir
 from ...tir import ir_builder
 
@@ -135,7 +136,7 @@ def _threefry(
     assert key_buf.dtype == counter_buf.dtype, "threefry key and counter must be the same dtype"
 
     def mix(a, b, rotation):
-        x = a + b  # TODO should be wrapping
+        x = a + b  # wrapping
         y = x ^ ((b << rotation) | (b >> (iwidth - rotation)))
         return [x, y]
 
@@ -167,7 +168,7 @@ def key_schedule(s, i):
     with irb.for_range(0, out_shape, name="l") as l:  # pylint: disable=invalid-name
         for i in range(nrounds // 4):
             for j in range(nwords):
-                out_buf[out_offset + l * nwords + j] += key_schedule(i, j)  # TODO wrapping
+                out_buf[out_offset + l * nwords + j] += key_schedule(i, j)  # wrapping
             for k in range(4):
                 for j in range(nwords // 2):
                     (
@@ -201,6 +202,13 @@ def threefry_generate(gen, out_shape):
     then a new generator is created by applying Threefry to the current key, path, and counter.
     This new generator will have a reset counter.
 
+    Warning
+    -------
+    Threeyfry requires that unsigned integer arithmetic wraps on overflow. Currently TVM has no
+    guarantee of this, so threefry contains an internal assert to check wrapping behavior. This
+    assert may or may not run depending on your platform, so it is recommended you run
+    :py:func:`threefry_test_wrapping` to verify wrapping behavior.
+
     Parameters
     ----------
     gen : Tensor[10, uint64]
@@ -234,6 +242,18 @@ def gen_ir(gen_ptr, out_gen_ptr, out_array_ptr):
         out_gen = irb.buffer_ptr(out_gen_ptr)
         out_array = irb.buffer_ptr(out_array_ptr)
 
+        # Check that unsigned arithmetic wraps, as it is required to implement threefry correctly.
+        irb.emit(
+            tvm.tir.AssertStmt(
+                tvm.tir.const(0xFFFFFFFFFFFFFFFF, "uint64") + tvm.tir.const(1, "uint64")
+                == tvm.tir.const(0, "uint64"),
+                tvm.tir.StringImm(
+                    "Unsigned integer arithmetic is not wrapping, but threefry requires wrapping."
+                ),
+                tvm.tir.Evaluate(0),
+            )
+        )
+
         # Create a temporary array to hold the generator state we will use to create the random
         # numbers. We cannot use gen because we may need to update the key + path if there is not
         # enough room in the counter.
@@ -408,3 +428,41 @@ def gen_ir(gen_ptr, out_left_ptr, out_right_ptr):
         name="threefry_split",
         tag="threefry_split",
     )
+
+
+def threefry_test_wrapping(target, ctx):
+    """Test that unsigned arithmetic wraps on overflow.
+
+    Parameters
+    ----------
+    target : tvm.target.Target
+        Target to run against
+    ctx : tvm.runtime.TVMContext
+        Context to run the test on
+
+    Returns
+    -------
+    is_wrapping : bool
+        Whether or not unsigned integer arithmetic is wrapping for this target, context pair. True
+        indicates that threefry will work on this platform.
+    """
+    if isinstance(target, str):
+        target = tvm.target.Target(target)
+
+    def gen_ir(out_ptr):
+        irb = ir_builder.create()
+        out = irb.buffer_ptr(out_ptr)
+        if "gpu" in target.keys:
+            thread_x = tvm.te.thread_axis("threadIdx.x")
+            irb.scope_attr(thread_x, "thread_extent", 1)
+        out[0] = tvm.tir.const(0xFFFFFFFFFFFFFFFF, "uint64") + tvm.tir.const(1, "uint64")
+        return irb.get()
+
+    out = tvm.tir.decl_buffer((1,), dtype="uint64")
+    f = tvm.te.extern(
+        [out.shape], [], lambda ins, outs: gen_ir(outs[0]), dtype="uint64", out_buffers=[out]
+    )
+    s = tvm.te.create_schedule([f.op])
+    out_ary = tvm.nd.array(np.ones((1,), "uint64"), ctx)
+    tvm.build(s, [f], target=target)(out_ary)
+    return out_ary.asnumpy()[0] == 0
diff --git a/tests/python/topi/python/test_topi_prng.py b/tests/python/topi/python/test_topi_prng.py
index 43b0494ee6f5..649e5410c147 100644
--- a/tests/python/topi/python/test_topi_prng.py
+++ b/tests/python/topi/python/test_topi_prng.py
@@ -111,6 +111,14 @@ def test_threefry_generate(target, ctx):
     ).any(), "Overflowing counter with no space left in path should change state"
 
 
+@tvm.testing.parametrize_targets
+def test_threefry_wrapping(target, ctx):
+    assert tvm.topi.random.threefry_test_wrapping(
+        target, ctx
+    ), f"{target} does not suppport wrapping unsigned integer arithmetic"
+
+
 if __name__ == "__main__":
     test_threefry_split(tvm.target.Target("llvm"), tvm.context("cpu"))
     test_threefry_generate(tvm.target.Target("llvm"), tvm.context("cpu"))
+    test_threefry_wrapping(tvm.target.Target("llvm"), tvm.context("cpu"))

From 59e0a4a46461b1a90bc24660cf25e08cfcfb7a1f Mon Sep 17 00:00:00 2001
From: masahi <masahi129@gmail.com>
Date: Thu, 28 Jan 2021 04:30:08 +0900
Subject: [PATCH 119/357] [Torch] Various updates for PyTorch frontend  
 (#7348)

* add conversion for detr

* remove explicit broadcast_to before batched matmul

* use take with wrap mode

* add test for transformer and negative indices

* add sort and argsort

* add logical_and

* support masked_select

* add gpu targets to masked_select test

* improve sort conversion
---
 python/tvm/relay/frontend/pytorch.py          |  63 ++++++++---
 tests/python/frontend/pytorch/test_forward.py | 101 +++++++++++++++++-
 2 files changed, 150 insertions(+), 14 deletions(-)

diff --git a/python/tvm/relay/frontend/pytorch.py b/python/tvm/relay/frontend/pytorch.py
index 991e3a8a0032..68e68fdbeed2 100644
--- a/python/tvm/relay/frontend/pytorch.py
+++ b/python/tvm/relay/frontend/pytorch.py
@@ -399,10 +399,7 @@ def slice(self, inputs, input_types):
         begin = [0] * ndim
         dim = int(inputs[1])
         stride = int(inputs[4])
-        if isinstance(inputs[2], _expr.Call):
-            begin[dim], _ = try_infer_value(inputs[2], lambda ret: np.asscalar(ret.astype(np.int)))
-        else:
-            begin[dim] = int(inputs[2])
+        begin[dim], _ = try_infer_value(inputs[2], lambda ret: np.asscalar(ret.astype(np.int)))
 
         # Process begin
         if not isinstance(begin[dim], int):
@@ -518,13 +515,13 @@ def select(self, inputs, input_types):
         data = inputs[0]
         dim = int(inputs[1])
         index = _wrap_const(inputs[2])
-        return _op.transform.take(data, index, axis=dim)
+        return _op.transform.take(data, index, axis=dim, mode="wrap")
 
     def take(self, inputs, input_types):
         data = inputs[0]
         indices = _op.cast(inputs[1], "int32")
 
-        return _op.transform.take(data, indices=indices)
+        return _op.transform.take(data, indices=indices, mode="wrap")
 
     def topk(self, inputs, input_types):
         data = inputs[0]
@@ -551,7 +548,13 @@ def reciprocal(self, inputs, input_types):
 
     def repeat(self, inputs, input_types):
         data = inputs[0]
-        reps = inputs[1]
+        reps = []
+        for r in inputs[1]:
+            if isinstance(r, int):
+                reps.append(r)
+            else:
+                reps.append(int(_infer_value(r, {}).asnumpy()))
+
         return _op.transform.tile(data, reps=reps)
 
     def repeat_interleave(self, inputs, input_types):
@@ -1520,12 +1523,6 @@ def matmul(self, inputs, input_types):
             # Convert a and b into 3 dimensional tensors.
             a = _op.reshape(inputs_0, [-1, a_shape[-2], a_shape[-1]])
             b = _op.reshape(inputs_1, [-1, b_shape[-2], b_shape[-1]])
-            # Broadcast b to match batch size of a
-            new_b_shape = list(self.infer_shape_with_prelude(b))
-            new_a_shape = self.infer_shape_with_prelude(a)
-            if new_a_shape[0] > new_b_shape[0]:
-                new_b_shape[0] = new_a_shape[0]
-                b = _op.broadcast_to(b, new_b_shape)
             # Transpose matrix dimensions of b.
             b = _op.transpose(b, [0, 2, 1])
             # Perform a batch matmul.
@@ -2070,6 +2067,40 @@ def scatter_add(self, inputs, input_types):
         src = inputs[3]
         return _op.scatter_add(data, index, src, axis=axis)
 
+    def cumsum(self, inputs, input_types):
+        data = inputs[0]
+        dim = inputs[1]
+        dtype = inputs[2]
+
+        if inputs[2] is not None:
+            dtype = _convert_dtype_value(inputs[2])
+
+        return _op.cumsum(data, axis=dim, dtype=dtype)
+
+    def masked_fill(self, inputs, input_types):
+        mask = inputs[1]
+        value = _op.cast(_wrap_const(inputs[2]), input_types[0])
+        return _op.where(mask, value, inputs[0])
+
+    def masked_select(self, inputs, input_types):
+        mask = inputs[1]
+        indices = self.nonzero([mask], input_types, is_numpy_style=True)
+        return _op.adv_index([inputs[0]] + [indices[i] for i in range(indices.size)])
+
+    def sort(self, inputs, input_types):
+        data = inputs[0]
+        dim = inputs[1]
+        is_descending = inputs[2]
+        # pytorch sort returns both sorted indices and values
+        indices = _op.argsort(data, dim, not is_descending)
+        return _op.gather(data, dim, indices), indices
+
+    def argsort(self, inputs, input_types):
+        data = inputs[0]
+        dim = inputs[1]
+        is_descending = inputs[2]
+        return _op.argsort(data, dim, not is_descending)
+
     def is_floating_point(self, inputs, input_types):
         assert len(inputs) == 1
 
@@ -2263,6 +2294,7 @@ def create_convert_map(self):
             "torchvision::roi_align": self.roi_align,
             "aten::unbind": self.unbind,
             "aten::__and__": self.logical_and,
+            "aten::logical_and": self.logical_and,
             "aten::_shape_as_tensor": self.shape_as_tensor,
             "aten::nonzero": self.nonzero,
             "aten::nonzero_numpy": self.nonzero_numpy,
@@ -2278,6 +2310,11 @@ def create_convert_map(self):
             "aten::__not__": self.logical_not,
             "aten::hardswish_": self.hard_swish,
             "aten::hardswish": self.hard_swish,
+            "aten::cumsum": self.cumsum,
+            "aten::masked_fill": self.masked_fill,
+            "aten::masked_select": self.masked_select,
+            "aten::argsort": self.argsort,
+            "aten::sort": self.sort,
         }
 
     def update_convert_map(self, custom_map):
diff --git a/tests/python/frontend/pytorch/test_forward.py b/tests/python/frontend/pytorch/test_forward.py
index 7cdd450448ca..6d9b559c6ba1 100644
--- a/tests/python/frontend/pytorch/test_forward.py
+++ b/tests/python/frontend/pytorch/test_forward.py
@@ -1147,7 +1147,7 @@ def forward(self, *args):
 @tvm.testing.uses_gpu
 def test_forward_select():
     torch.set_grad_enabled(False)
-    input_shape = [1, 3, 10, 10]
+    input_shape = [5, 3, 10, 10]
 
     class Select1(Module):
         def forward(self, *args):
@@ -1167,6 +1167,9 @@ def forward(self, index):
     input_data = torch.rand(input_shape).float()
     verify_model(Select1().float().eval(), input_data=input_data)
 
+    # test negative indexing
+    verify_model(lambda x: x[-1], input_data=input_data)
+
     x = torch.randn(3, 4)
     indices = torch.tensor([0, 2])
     verify_model(IndexedSelect(x, 0).eval(), input_data=indices)
@@ -2653,6 +2656,8 @@ def forward(self, *args):
     verify_model(Take1().float().eval(), input_data=input_data)
     indices = torch.tensor([[0, 0], [1, 0]])
     verify_model(Take2().float().eval(), input_data=[input_data, indices])
+    indices = torch.tensor([0, -1])
+    verify_model(Take2().float().eval(), input_data=[input_data, indices])
 
 
 @tvm.testing.uses_gpu
@@ -3452,6 +3457,93 @@ def test_hard_swish():
         verify_model(torch.nn.Hardswish(inplace=True).eval(), input_data=input)
 
 
+def test_cumsum():
+    def test_fn(dim, dtype=None):
+        return lambda x: torch.cumsum(x, dim=dim, dtype=dtype)
+
+    inp = torch.randint(0, 100, (10000,), dtype=torch.int32)
+    verify_model(test_fn(0), [inp])
+    verify_model(test_fn(0), [inp.to(torch.int64)])
+    verify_model(test_fn(0, dtype=torch.int64), [inp.to(torch.int64)])
+
+    inp = torch.randn((100, 100), dtype=torch.float32)
+    verify_model(test_fn(dim=0, dtype=torch.float64), [inp])
+    verify_model(test_fn(dim=1), [inp])
+
+    inp = torch.randn((100, 100), dtype=torch.float32) > 0.5
+    verify_model(test_fn(dim=0, dtype=torch.int32), [inp])
+
+
+def test_masked_fill():
+    def test_fn(x, mask):
+        return torch.masked_fill(x, mask, 0.0)
+
+    inp = torch.randn(100, 100)
+    verify_model(test_fn, [inp, inp > 0.5])
+    verify_model(test_fn, [inp.to(torch.float64), inp > 0.5])
+
+
+def test_transformer():
+    model = torch.nn.Transformer(d_model=256, nhead=8, num_encoder_layers=6, num_decoder_layers=6)
+    model = model.eval()
+    src = torch.rand((10, 32, 256))
+    tgt = torch.rand((20, 32, 256))
+    verify_model(model.eval(), input_data=[src, tgt])
+
+
+def test_argsort():
+    def test_fn(dim, descending):
+        return lambda x: torch.argsort(x, dim=dim, descending=descending)
+
+    inp = torch.randn(100)
+    verify_model(test_fn(0, True), [inp])
+    verify_model(test_fn(0, False), [inp])
+
+    inp = torch.randn(100, 100)
+    verify_model(test_fn(0, True), [inp])
+    verify_model(test_fn(0, False), [inp])
+    verify_model(test_fn(1, True), [inp])
+    verify_model(test_fn(1, False), [inp])
+
+
+def test_sort():
+    def test_fn(dim, descending):
+        return lambda x: torch.sort(x, dim=dim, descending=descending)
+
+    inp = torch.randn(100)
+    verify_model(test_fn(0, True), [inp])
+    verify_model(test_fn(0, False), [inp])
+
+    inp = torch.randn(100, 100)
+    verify_model(test_fn(0, True), [inp])
+    verify_model(test_fn(0, False), [inp])
+    verify_model(test_fn(1, True), [inp])
+    verify_model(test_fn(1, False), [inp])
+
+
+def test_logical_and():
+    def test_fn(x, y):
+        return torch.logical_and(x, y)
+
+    a = torch.tensor([0, 1, 10, 0], dtype=torch.int8)
+    b = torch.tensor([4, 0, 1, 0], dtype=torch.int8)
+    verify_model(test_fn, [a, b])
+
+    a = torch.tensor([True, False, True])
+    b = torch.tensor([True, False, False])
+    verify_model(test_fn, [a, b])
+
+
+def test_masked_select():
+    def test_fn(x, mask):
+        return torch.masked_select(x, mask)
+
+    for shape in [(10,), (3, 4), (16, 32, 64)]:
+        x = torch.randn(*shape)
+        mask = x.ge(0.5)
+        verify_trace_model(test_fn, [x, mask], ["llvm", "cuda", "nvptx"])
+
+
 if __name__ == "__main__":
     # some structural tests
     test_forward_traced_function()
@@ -3580,6 +3672,13 @@ def test_hard_swish():
     test_forward_scatter()
     test_numel()
     test_bincount()
+    test_cumsum()
+    test_masked_fill()
+    test_transformer()
+    test_sort()
+    test_argsort()
+    test_logical_and()
+    test_masked_select()
 
     # Model tests
     test_resnet18()

From fd391223c19bec454f488f8a976a0766fadb0db3 Mon Sep 17 00:00:00 2001
From: Cody Yu <comaniac0422@gmail.com>
Date: Wed, 27 Jan 2021 14:54:43 -0800
Subject: [PATCH 120/357] [AutoScheduler] Enable schedule sharing in dispatch
 context (#7344)

* [AutoScheduler] Enable schedule sharing in dispatch context

* Update python/tvm/auto_scheduler/dispatcher.py
---
 python/tvm/auto_scheduler/dispatcher.py       | 135 +++++++++++++-----
 python/tvm/auto_scheduler/measure_record.py   |  65 +--------
 python/tvm/auto_scheduler/utils.py            |  65 ++++++++-
 .../unittest/test_auto_scheduler_measure.py   |  18 +--
 4 files changed, 178 insertions(+), 105 deletions(-)

diff --git a/python/tvm/auto_scheduler/dispatcher.py b/python/tvm/auto_scheduler/dispatcher.py
index b0b98d8d0f56..f2d7536bea88 100644
--- a/python/tvm/auto_scheduler/dispatcher.py
+++ b/python/tvm/auto_scheduler/dispatcher.py
@@ -30,6 +30,7 @@
 
 from tvm.tir.expr import FloatImm
 from .measure_record import load_records
+from .utils import calc_workload_dis_factor, decode_workload_key
 
 logger = logging.getLogger("auto_scheduler")
 
@@ -126,18 +127,53 @@ class ApplyHistoryBest(DispatchContext):
         If is str, then it should be the filename of a records log file.
         Each row of this file is an encoded record pair. Otherwise, it is an iterator.
     n_lines: Optional[int]
-        if it is not None, only load the first `n_lines` lines of log
+        if it is not None, only load the first `n_lines` lines of log.
+    include_compatible: bool
+        When set to True, compatible records will also be considered.
     """
 
-    def __init__(self, records, n_lines=None):
+    def __init__(self, records, n_lines=None, include_compatible=False):
         super(ApplyHistoryBest, self).__init__()
+        self.include_compatible = include_compatible
 
+        # Dict[str (target key),
+        #   Dict[str (workload hash),
+        #     Dict[tuple (workload args), tuple (State, cost)]]]
         self.best_by_targetkey = {}
         self.best_by_model = {}
         self._best_user_defined = {}
 
         self.load(records, n_lines)
 
+    @staticmethod
+    def get_workload_entry(best_records, target_key, workload_key):
+        """Get the entry of the target key and workload key hash in the given best record map.
+
+        Parameters
+        ----------
+        best_records: Dict[str, Dict[str, Dict[str, Any]]]
+            The best record map.
+        target_key: str
+            The first key to the best_records.
+        workload_key: str
+            The workload key that can be decoded to workload hash and args.
+
+        Returns
+        -------
+        entry: Dict[str, Any]
+            The entry in best_records with target key and workload hash.
+        workload_hash: str
+            The workload hash decoded from workload_key.
+        workload_args: Tuple[Any, ...]
+            The hashable tuple of workload args decoded from workload_key.
+        """
+        workload_hash, workload_args = decode_workload_key(workload_key)
+        if target_key not in best_records:
+            best_records[target_key] = {}
+        if workload_hash not in best_records[target_key]:
+            best_records[target_key][workload_hash] = {}
+        return best_records[target_key][workload_hash], workload_hash, workload_args
+
     def load(self, records, n_lines=None):
         """Load records to this dispatch context
 
@@ -171,29 +207,32 @@ def load(self, records, n_lines=None):
             if res.error_no != 0:
                 continue
 
+            costs = [x.value for x in res.costs if isinstance(x, FloatImm)]
+            cost = np.mean(costs)
+
             # use target keys in tvm target system as key to build best map
             for k in inp.task.target.keys:
-                key = (k, inp.task.workload_key)
-                if key not in best_by_targetkey:
-                    best_by_targetkey[key] = (inp, res)
+                entry, _, workload_args = self.get_workload_entry(
+                    best_by_targetkey, k, inp.task.workload_key
+                )
+                if workload_args not in entry:
+                    entry[workload_args] = (inp.state, cost)
                 else:
-                    _, other_res = best_by_targetkey[key]
-                    other_costs = [x.value for x in other_res.costs if isinstance(x, FloatImm)]
-                    costs = [x.value for x in res.costs if isinstance(x, FloatImm)]
-                    if np.mean(other_costs) > np.mean(costs):
-                        best_by_targetkey[key] = (inp, res)
+                    _, other_cost = entry[workload_args]
+                    if other_cost > cost:
+                        entry[workload_args] = (inp.state, cost)
 
             # use model as key to build best map
-            key = (inp.task.target.model, inp.task.workload_key)
-            if key not in best_by_model:
+            entry, _, workload_args = self.get_workload_entry(
+                best_by_model, inp.task.target.model, inp.task.workload_key
+            )
+            if workload_args not in entry:
                 if inp.task.target.model != "unknown":
-                    best_by_model[key] = (inp, res)
+                    entry[workload_args] = (inp.state, cost)
             else:
-                _, other_res = best_by_model[key]
-                other_costs = [x.value for x in other_res.costs if isinstance(x, FloatImm)]
-                costs = [x.value for x in res.costs if isinstance(x, FloatImm)]
-                if np.mean(other_costs) > np.mean(costs):
-                    best_by_model[key] = (inp, res)
+                _, other_cost = entry[workload_args]
+                if other_cost > cost:
+                    entry[workload_args] = (inp.state, cost)
 
         logger.debug("Finish loading %d records", counter)
 
@@ -205,31 +244,61 @@ def _query_inside(self, target, workload_key):
                 " above the dispatcher call. So does other target. "
             )
 
+        def match_record(best_records, target_key, workload_key):
+            """The helper function to match the record in the given map
+            and return the matched state, or None if no match.
+            """
+            ret = None
+
+            entry, workload_hash, workload_args = self.get_workload_entry(
+                best_records, target_key, workload_key
+            )
+            if workload_args in entry:
+                ret = entry[workload_args][0]
+            elif self.include_compatible:
+                best_cost = float("inf")
+                for args, val in entry.items():
+                    dis_f = calc_workload_dis_factor(
+                        (workload_hash, workload_args), (workload_hash, args)
+                    )
+                    if dis_f == float("inf"):
+                        continue
+
+                    state, cost = val
+                    cost *= dis_f
+                    if ret is None or cost < best_cost:
+                        best_cost = cost
+                        ret = state
+            return ret
+
         # first try matching by model
-        key = (target.model, workload_key)
-        if key in self._best_user_defined:
-            return self._best_user_defined[key]
-        if key in self.best_by_model:
-            return self.best_by_model[key][0].state
+        ret = match_record(self._best_user_defined, target.model, workload_key)
+        if ret is not None:
+            return ret
+        ret = match_record(self.best_by_model, target.model, workload_key)
+        if ret is not None:
+            return ret
 
         # then try matching by target key
         for k in target.keys:
-            key = (k, workload_key)
-            if key in self._best_user_defined:
-                return self._best_user_defined[key]
-            if key in self.best_by_targetkey:
-                return self.best_by_targetkey[key][0].state
+            ret = match_record(self._best_user_defined, k, workload_key)
+            if ret is not None:
+                return ret
+            ret = match_record(self.best_by_targetkey, k, workload_key)
+            if ret is not None:
+                return ret
 
         return None
 
     def update(self, target, workload_key, state):
-        model = target.model
-        key = (model, workload_key)
-        self._best_user_defined[key] = state
+        entry, _, workload_args = self.get_workload_entry(
+            self._best_user_defined, target.model, workload_key
+        )
+        entry[workload_args] = (state, 1)
 
         for k in target.keys:
-            key = (k, workload_key)
-            self._best_user_defined[key] = state
+            entry, _, _ = self.get_workload_entry(self._best_user_defined, k, workload_key)
+            entry[workload_args] = (state, 1)
 
 
 class FallbackContext(DispatchContext):
diff --git a/python/tvm/auto_scheduler/measure_record.py b/python/tvm/auto_scheduler/measure_record.py
index 9eaef189e081..200d24fa7d50 100644
--- a/python/tvm/auto_scheduler/measure_record.py
+++ b/python/tvm/auto_scheduler/measure_record.py
@@ -27,7 +27,7 @@
 import tvm._ffi
 from tvm.runtime import Object
 from .measure import MeasureErrorNo, MeasureCallback
-from .utils import decode_workload_key
+from .utils import calc_workload_dis_factor, decode_workload_key
 from . import _ffi_api
 
 logger = logging.getLogger("auto_scheduler")
@@ -130,65 +130,6 @@ def __iter__(self):
             yield ret[0], ret[1]  # (input, result)
 
 
-def calc_workload_dis_factor(target_workload_key, workload_key):
-    """Calculate the distance factor of the workload to the target workload.
-    If two workloads are not compatible at all (i.e., different compute DAG or function),
-    then the distance factor is "inf". Otherwise, we calculate the factor by traversing
-    the workload arguments, which are the arguments of the compute function,
-    or the output shapes for the ComputeDAG. The factor is calculated by the following rules:
-
-    1. For non-zero integer values: `product(target_arg / candidate_arg)`.
-    2. For non-integer or zero values: "inf" if not equal else 1.
-
-    As a result, factor=1 is the optimal when two workloads are identical.
-
-    Parameters
-    ----------
-    target_workload_key: str
-        The target workload key in JSON string.
-
-    workload_key: str
-        The candidate workload key in JSON string.
-
-    Returns
-    -------
-    dis_f: float
-        The distance factor.
-    """
-
-    def flatten_list(inp):
-        ret = []
-        for elt in inp:
-            if isinstance(elt, list):
-                ret += flatten_list(elt)
-            else:
-                ret.append(elt)
-        return ret
-
-    target_key, target_args = decode_workload_key(target_workload_key)
-    target_args = flatten_list(target_args) if target_args is not None else []
-    key, args = decode_workload_key(workload_key)
-    args = flatten_list(args) if args is not None else []
-
-    # Not even the same func/DAG.
-    if key != target_key or len(target_args) != len(args):
-        return float("inf")
-
-    dis_f = 1
-    for target_arg, arg in zip(target_args, args):
-        if isinstance(target_arg, int):
-            if target_arg == 0 or arg == 0:
-                if target_arg != arg:
-                    return float("inf")
-            elif target_arg % arg != 0:
-                return float("inf")
-            else:
-                dis_f *= target_arg / arg
-        elif target_arg != arg:
-            return float("inf")
-    return dis_f
-
-
 def load_record_from_string(record):
     """
     Load the measure record from string.
@@ -304,7 +245,9 @@ def load_best_record(filename, workload_key=None, target=None, include_compatibl
         cost = np.mean(costs)
 
         if workload_key is not None:
-            dis_f = calc_workload_dis_factor(workload_key, inp.task.workload_key)
+            dis_f = calc_workload_dis_factor(
+                decode_workload_key(workload_key), decode_workload_key(inp.task.workload_key)
+            )
             if dis_f == float("inf"):
                 continue
             if not include_compatible and dis_f != 1:
diff --git a/python/tvm/auto_scheduler/utils.py b/python/tvm/auto_scheduler/utils.py
index fd25fdb783f7..8aa33e6775f8 100644
--- a/python/tvm/auto_scheduler/utils.py
+++ b/python/tvm/auto_scheduler/utils.py
@@ -57,18 +57,77 @@ def decode_workload_key(workload_key):
     -------
     name: str
         The workload function name or the DAG hash.
-    args: Optional[List[Any]]
-        The arguments of the workload, or None if the workload key format is not decodeable.
+    args: Optional[Tuple[Any, ...]]
+        The flatten arguments in a tuple, or None if the workload key format is not decodeable.
     """
+
+    def flatten_list(inp):
+        ret = []
+        for elt in inp:
+            if isinstance(elt, list):
+                ret += flatten_list(elt)
+            else:
+                ret.append(elt)
+        return ret
+
     try:
         key_list = json.loads(workload_key)
         if isinstance(key_list, list) and len(key_list) >= 1:
-            return key_list[0], key_list[1:]
+            return key_list[0], tuple(flatten_list(key_list[1:]))
     except json.decoder.JSONDecodeError:
         pass
     return workload_key, None
 
 
+def calc_workload_dis_factor(target_workload_pair, workload_pair):
+    """Calculate the distance factor of the workload to the target workload.
+    If two workloads are not compatible at all (i.e., different compute DAG or function),
+    then the distance factor is "inf". Otherwise, we calculate the factor by traversing
+    the workload arguments, which are the arguments of the compute function,
+    or the output shapes for the ComputeDAG. The factor is calculated by the following rules:
+
+    1. For non-zero integer values: `product(target_arg / candidate_arg)`.
+    2. For non-integer or zero values: "inf" if not equal else 1.
+
+    As a result, factor=1 is the optimal when two workloads are identical.
+
+    Parameters
+    ----------
+    target_workload_pair: Tuple[str, Optional[Tuple[Any, ...]]]
+        The target workload pair: (hash, argument tuple).
+
+    workload_pair: Tuple[str, Optional[Tuple[Any, ...]]]
+        The candidate workload pair: (hash, argument tuple).
+
+    Returns
+    -------
+    dis_f: float
+        The distance factor.
+    """
+    target_key, target_args = target_workload_pair
+    target_args = target_args if target_args is not None else []
+    key, args = workload_pair
+    args = args if args is not None else []
+
+    # Not even the same func/DAG.
+    if key != target_key or len(target_args) != len(args):
+        return float("inf")
+
+    dis_f = 1
+    for target_arg, arg in zip(target_args, args):
+        if isinstance(target_arg, int):
+            if target_arg == 0 or arg == 0:
+                if target_arg != arg:
+                    return float("inf")
+            elif target_arg % arg != 0:
+                return float("inf")
+            else:
+                dis_f *= target_arg / arg
+        elif target_arg != arg:
+            return float("inf")
+    return dis_f
+
+
 def get_func_name(func):
     """Get name of a function.
 
diff --git a/tests/python/unittest/test_auto_scheduler_measure.py b/tests/python/unittest/test_auto_scheduler_measure.py
index 3b074b273358..041fb7ee76d3 100644
--- a/tests/python/unittest/test_auto_scheduler_measure.py
+++ b/tests/python/unittest/test_auto_scheduler_measure.py
@@ -202,35 +202,36 @@ def test_recover_measure_input():
 
 
 def test_workload_dis_factor():
-    calc = auto_scheduler.measure_record.calc_workload_dis_factor
+    calc = auto_scheduler.utils.calc_workload_dis_factor
+    decode = auto_scheduler.utils.decode_workload_key
 
     # Identical
     target_wkl_key = json.dumps(
         ["func1", [8, 3, 224, 224], [32, 3, 3, 3], [0, 0], [1, 1], "float32"]
     )
-    assert calc(target_wkl_key, target_wkl_key) == 1
+    assert calc(decode(target_wkl_key), decode(target_wkl_key)) == 1
 
     # Compatible with a factor
     wkl_key = json.dumps(["func1", [1, 3, 112, 112], [32, 3, 3, 3], [0, 0], [1, 1], "float32"])
-    assert calc(target_wkl_key, wkl_key) == 8 * 2 * 2
+    assert calc(decode(target_wkl_key), decode(wkl_key)) == 8 * 2 * 2
 
     # Incompatible argument with zeros
     wkl_key = json.dumps(["func1", [8, 3, 224, 224], [32, 3, 3, 3], [1, 1], [1, 1], "float32"])
-    assert calc(target_wkl_key, wkl_key) == float("inf")
+    assert calc(decode(target_wkl_key), decode(wkl_key)) == float("inf")
     wkl_key = json.dumps(["func1", [8, 3, 224, 224], [32, 3, 3, 3], [0, 0], [0, 0], "float32"])
-    assert calc(target_wkl_key, wkl_key) == float("inf")
+    assert calc(decode(target_wkl_key), decode(wkl_key)) == float("inf")
 
     # Incompatible non-integter argument
     wkl_key = json.dumps(["func1", [8, 3, 224, 224], [32, 3, 3, 3], [0, 0], [1, 1], "int8"])
-    assert calc(target_wkl_key, wkl_key) == float("inf")
+    assert calc(decode(target_wkl_key), decode(wkl_key)) == float("inf")
 
     # Incompatible function
     wkl_key = json.dumps(["func2", [8, 3, 224, 224], [32, 3, 3, 3], [0, 0], [1, 1], "float32"])
-    assert calc(target_wkl_key, wkl_key) == float("inf")
+    assert calc(decode(target_wkl_key), decode(wkl_key)) == float("inf")
 
     # Incompatible due to non-dividable factor
     wkl_key = json.dumps(["func1", [8, 3, 223, 223], [32, 3, 3, 3], [0, 0], [1, 1], "float32"])
-    assert calc(target_wkl_key, wkl_key) == float("inf")
+    assert calc(decode(target_wkl_key), decode(wkl_key)) == float("inf")
 
 
 def test_measure_local_builder_runner():
@@ -322,6 +323,7 @@ def test_measure_target_host():
     test_record_follow_split_follow_fused_split()
     test_record_pragma_storage_align_rfactor()
     test_recover_measure_input()
+    test_workload_dis_factor()
     test_measure_local_builder_runner()
     test_measure_local_builder_rpc_runner()
     test_measure_target_host()

From 4006bde68e32daeaac5de11d9fc331a28ff55706 Mon Sep 17 00:00:00 2001
From: masahi <masahi129@gmail.com>
Date: Thu, 28 Jan 2021 08:09:43 +0900
Subject: [PATCH 121/357] [Torch] More graph rewrites for Faster RCNN /
 MaskRCNN (#7346)

* add post nms topk to max_out_size rewrite

* add argsort conversion

* scatter pattern first cut

* matching seems to working

* dup matching fixed

* add converter

* conversion seems working

* add reshape, use take

* remove pytorch argsort converter

* update test

* add doc
---
 python/tvm/relay/frontend/pytorch_utils.py    | 258 +++++++++++++++++-
 .../frontend/pytorch/test_object_detection.py |  18 +-
 2 files changed, 261 insertions(+), 15 deletions(-)

diff --git a/python/tvm/relay/frontend/pytorch_utils.py b/python/tvm/relay/frontend/pytorch_utils.py
index 6fc5a6af4a36..248f5354cfbb 100644
--- a/python/tvm/relay/frontend/pytorch_utils.py
+++ b/python/tvm/relay/frontend/pytorch_utils.py
@@ -16,13 +16,16 @@
 # under the License.
 # pylint: disable=import-outside-toplevel, unused-argument, invalid-name
 """ Common utilities used by PyTorch frontend """
+from .. import expr
 from .. import op
 from ..dataflow_pattern import (
+    wildcard,
     is_constant,
     is_op,
     rewrite,
     is_tuple,
-    wildcard,
+    is_tuple_get_item,
+    is_if,
     DFPatternCallback,
 )
 
@@ -36,6 +39,19 @@ def is_version_greater_than(ver):
     )
 
 
+def dyn_strided_slice_pattern(inp, end):
+    """A pattern to detect dynamic strided slice op."""
+    zero = is_constant()
+    cast_like = is_op("cast_like")(zero, is_constant())
+    less = is_op("less")(is_constant(), cast_like)
+    shape_of = is_op("shape_of")(inp)
+    cast_like = is_op("cast_like")(shape_of, is_constant())
+    add = is_op("add")(is_constant(), cast_like)
+    where = is_op("where")(less, add, is_constant())
+
+    return is_op("dyn.strided_slice")(inp, where, end, is_constant())
+
+
 def batched_nms_pattern(boxes, scores, idxs, iou_threshold, num_boxes, indices):
     """A pattern to detect batched_nms function in torchvision
 
@@ -73,7 +89,6 @@ def batched_nms(boxes, scores, idxs, iou_threshold):
 
     """
     one = is_constant()
-    zero = is_constant()
 
     # Equivelent PyTorch code from above snippet
     # offsets = idxs.to(boxes) * (max_coordinate + torch.tensor(1).to(boxes))
@@ -84,17 +99,10 @@ def batched_nms(boxes, scores, idxs, iou_threshold):
 
     # The following doesn't appear in the above Relay snippet. It is required for dynamic
     # stride_slice handling
-    cast_like = is_op("cast_like")(zero, is_constant())
-    less = is_op("less")(is_constant(), cast_like)
-    shape_of = is_op("shape_of")(mul)
-    cast_like = is_op("cast_like")(shape_of, is_constant())
-    add = is_op("add")(is_constant(), cast_like)
-    where = is_op("where")(less, add, is_constant())
     shape_of = is_op("shape_of")(mul)
     cast = is_op("cast")(shape_of)
-
     # This corresponds to offsets[:, None], where offsets is the result of multiplication
-    dyn_strided_slice = is_op("dyn.strided_slice")(mul, where, cast, is_constant())
+    dyn_strided_slice = dyn_strided_slice_pattern(mul, cast)
 
     # Add offsets to the boxes
     expand_dims = is_op("expand_dims")(dyn_strided_slice)
@@ -112,8 +120,49 @@ def batched_nms(boxes, scores, idxs, iou_threshold):
     )
 
 
-class NMSRewrite(DFPatternCallback):
-    """A callback to rewrite nms and restore batched nms"""
+def topk_after_batch_nms_pattern(cond, true_branch, data, valid_count, indices, iou_threshold):
+    """
+    Detect the following pattern used in torchvision detection models.
+
+    def batched_nms(...):
+        if boxes.numel() == 0:
+            return torch.empty((0,), dtype=torch.int64, device=boxes.device)
+        else:
+            ...
+            return nms(boxes_for_nms, scores, iou_threshold)
+
+    keep = batched_nms(boxes, scores, lvl, self.nms_thresh)
+    keep = keep[:post_nms_top_k] # keep only topk scoring predictions
+
+    An equivalent Relay subgraph:
+
+    %1184 = if (%1117) {
+      ...
+    } else {
+      ...
+      %1172 = vision.non_max_suppression(%1167, %1168, %1171, -1, 0.7f, ...);
+      ...
+      %1183 = dyn.strided_slice(%1174, %1180, %1182, ...);
+      cast(%1183, dtype="int64")
+    };
+    %1185 = strided_slice(%1184, begin=[0], end=[1000], strides=[1]);
+
+    """
+    nms = is_op("vision.non_max_suppression")(
+        data, valid_count, indices, is_constant(), iou_threshold
+    )
+    indices = is_op("squeeze")(is_tuple_get_item(nms, 0))
+    size = is_op("squeeze")(is_tuple_get_item(nms, 1))
+    dyn_strided_slice = dyn_strided_slice_pattern(indices, size)
+    cast_i64 = is_op("cast")(dyn_strided_slice)
+
+    batched_nms_result = is_if(cond, true_branch, cast_i64)
+
+    return is_op("strided_slice")(batched_nms_result)
+
+
+class MulticlassNMSRewrite(DFPatternCallback):
+    """A callback to rewrite nms and restore batched nms."""
 
     def __init__(self):
         super().__init__()
@@ -169,10 +218,193 @@ def callback(self, pre, post, node_map):
         return self.convert_batched_nms(boxes, scores, idxs, iou_thres, num_boxes, indices)
 
 
+class PostNMSTopKRewrite(DFPatternCallback):
+    """A callback to rewrite nms to exploit max_out_size parameter."""
+
+    def __init__(self):
+        super().__init__()
+        self.cond = wildcard()
+        self.true_branch = wildcard()
+        self.data = wildcard()
+        self.valid_count = wildcard()
+        self.indices = wildcard()
+        self.iou_threshold = wildcard()
+
+        self.pattern = topk_after_batch_nms_pattern(
+            self.cond,
+            self.true_branch,
+            self.data,
+            self.valid_count,
+            self.indices,
+            self.iou_threshold,
+        )
+
+    def rewrite_batch_nms_with_max_out_size(
+        self, cond, true_branch, data, valid_count, indices, iou_threshold, post_nms_topk
+    ):
+        """Use the detected post NMS topk parameter in NMS op."""
+        nms_ret = op.vision.non_max_suppression(
+            data=data,
+            valid_count=valid_count,
+            indices=indices,
+            max_output_size=post_nms_topk,
+            iou_threshold=iou_threshold,
+            force_suppress=False,
+            top_k=-1,
+            coord_start=2,
+            score_index=1,
+            id_index=0,
+            return_indices=True,
+            invalid_to_bottom=False,
+        )
+
+        size = op.squeeze(nms_ret[1], axis=[1])
+        data_slice = op.squeeze(nms_ret[0], axis=[0])
+
+        ret = op.strided_slice(data_slice, begin=expr.const([0]), end=size, slice_mode="size")
+
+        nms_result = op.cast(ret, "int64")
+
+        return expr.If(cond, true_branch, nms_result)
+
+    def callback(self, pre, post, node_map):
+        post_nms_topk = post.attrs.end[0].value
+        return self.rewrite_batch_nms_with_max_out_size(
+            node_map[self.cond][0],
+            node_map[self.true_branch][0],
+            node_map[self.data][0],
+            node_map[self.valid_count][0],
+            node_map[self.indices][0],
+            node_map[self.iou_threshold][0],
+            post_nms_topk,
+        )
+
+
+def scatter_roi_align_result_pattern(levels, roi_align_results, num_scales):
+    """Detect the Relay subgraph corresponding to the following PyTorch code
+
+    first_result = roi_align_results[0]
+    dtype, device = first_result.dtype, first_result.device
+    res = torch.zeros((levels.size(0), first_result.size(1),
+                       first_result.size(2), first_result.size(3)),
+                      dtype=dtype, device=device)
+    for level in range(len(roi_align_results)):
+        index = torch.where(levels == level)[0].view(-1, 1, 1, 1)
+        index = index.expand(index.size(0),
+                             roi_align_results[level].size(1),
+                             roi_align_results[level].size(2),
+                             roi_align_results[level].size(3))
+        res = res.scatter(0, index, roi_align_results[level])
+    return res
+    """
+
+    def do_where(levels, _):
+        idx_in_level = is_op("argwhere")(is_op("equal")(levels, is_constant()))
+        idx_in_level = is_op("split")(idx_in_level)
+        idx_in_level = is_tuple_get_item(idx_in_level, 0)
+        idx_in_level = is_op("squeeze")(idx_in_level)
+        idx_in_level = is_tuple_get_item(is_tuple([idx_in_level]), 0)
+        return idx_in_level
+
+    scatter_res = wildcard()
+
+    for i in range(num_scales):
+        # index = torch.where(levels == level)[0].view(-1, 1, 1, 1)
+        scatter_indices = do_where(levels, i)
+        scatter_indices = is_op("reshape")(scatter_indices)
+
+        # index = index.expand(index.size(0),
+        #                      unmerged_results[level].size(1),
+        #                      unmerged_results[level].size(2),
+        #                      unmerged_results[level].size(3))
+        scatter_indices = is_op("repeat")(scatter_indices)
+        scatter_indices = is_op("repeat")(scatter_indices)
+        scatter_indices = is_op("repeat")(scatter_indices)
+
+        scatter_res = is_op("scatter")(scatter_res, scatter_indices, roi_align_results[i])
+
+    return is_op("reshape")(scatter_res)
+
+
+class ScatterRewrite(DFPatternCallback):
+    """A callback to rewrite repeated scatters with a batched gather."""
+
+    def __init__(self, num_scales):
+        super().__init__()
+        self.num_scales = num_scales
+        self.levels = wildcard()
+        self.roi_align_results = []
+        for _ in range(num_scales):
+            self.roi_align_results.append(wildcard())
+
+        self.pattern = scatter_roi_align_result_pattern(
+            self.levels, self.roi_align_results, num_scales
+        )
+
+    def convert_scatter_to_gather(self, levels, roi_align_results):
+        """Replace the detected scatter loop with the following PyTorch code
+
+        indices_per_level = []
+        for level in range(num_scales):
+            idx_in_level = torch.where(levels == level)[0]
+            indices_per_leve.append(idx_in_level)
+
+        stacked_features = torch.cat(roi_align_results, dim=0)
+        stacked_indices = torch.cat(indices_per_level, dim=0)
+        argsort_indices = torch.argort(stacked_indices)
+        return stacked_features[argsort_indices, :]
+        """
+
+        # Collect inidices and concat them
+        indices_per_level = []
+        for i in range(self.num_scales):
+            equal = op.equal(levels, expr.const(i, dtype="int64"))
+            argwhere = op.argwhere(equal)
+            split = op.split(argwhere, indices_or_sections=1, axis=1)
+            squeeze = op.squeeze(split[0], axis=[1])
+            indices = op.cast(squeeze, dtype="int64")
+            indices_per_level.append(indices)
+
+        indices_concat = op.concatenate(indices_per_level, 0)
+
+        # Concat roi align results per level, and argsort indices
+        # To prepare for a batched gather
+        roi_align_results_concat = op.concatenate(roi_align_results, 0)
+        argsort_indices = op.cast(op.argsort(indices_concat), dtype="int64")
+
+        # Permute rows by argsorted indices
+        permuted = op.take(roi_align_results_concat, argsort_indices, axis=0)
+
+        return op.reshape(permuted, [0, -1, 1, 1])
+
+    def callback(self, pre, post, node_map):
+        levels = node_map[self.levels][0]
+        roi_align_results = [node_map[feat][0] for feat in self.roi_align_results]
+        return self.convert_scatter_to_gather(levels, roi_align_results)
+
+
 def rewrite_nms_to_batched_nms(mod):
     """Rewrite the input graph to replace non maximum surpression
     in torchvision that does not take class id into account with the one
     that avoids IOU tests between different classes.
     """
-    mod["main"] = rewrite(NMSRewrite(), mod["main"])
+    mod["main"] = rewrite(MulticlassNMSRewrite(), mod["main"])
+    return mod
+
+
+def rewrite_batched_nms_with_max_out_size(mod):
+    """Rewrite the input graph to detect slicing after batched nms and
+    use the slicing size as the parameter max_out_size in NMS.
+    """
+    mod["main"] = rewrite(PostNMSTopKRewrite(), mod["main"])
+    return mod
+
+
+def rewrite_scatter_to_gather(mod, num_scales):
+    """Rewrite the input graph to replace a repeated scatter loop with
+    a batched gather. The scatter loop is used in torchvision MultiScaleRoIAlign
+    to merge roi_align results for all scales. The scatter is used to emulate
+    inplace updates.
+    """
+    mod["main"] = rewrite(ScatterRewrite(num_scales), mod["main"])
     return mod
diff --git a/tests/python/frontend/pytorch/test_object_detection.py b/tests/python/frontend/pytorch/test_object_detection.py
index 2c323776f087..fd33dd1da8b1 100644
--- a/tests/python/frontend/pytorch/test_object_detection.py
+++ b/tests/python/frontend/pytorch/test_object_detection.py
@@ -26,7 +26,11 @@
 import tvm.testing
 from tvm import relay
 from tvm.runtime.vm import VirtualMachine
-from tvm.relay.frontend.pytorch_utils import rewrite_nms_to_batched_nms
+from tvm.relay.frontend.pytorch_utils import (
+    rewrite_nms_to_batched_nms,
+    rewrite_batched_nms_with_max_out_size,
+    rewrite_scatter_to_gather,
+)
 from tvm.contrib.download import download
 
 
@@ -72,7 +76,7 @@ def generate_jit_model(index):
     ]
 
     model_func = model_funcs[index]
-    model = TraceWrapper(model_func(pretrained=True, rpn_pre_nms_top_n_test=200))
+    model = TraceWrapper(model_func(pretrained=True, rpn_pre_nms_top_n_test=1000))
 
     model.eval()
     inp = torch.Tensor(np.random.uniform(0.0, 250.0, size=(1, 3, in_size, in_size)))
@@ -141,6 +145,16 @@ def compile_and_run_vm(mod, params, data_np, target):
     after = mod["main"]
     assert not tvm.ir.structural_equal(after, before)
 
+    before = mod["main"]
+    mod = rewrite_batched_nms_with_max_out_size(mod)
+    after = mod["main"]
+    assert not tvm.ir.structural_equal(after, before)
+
+    before = mod["main"]
+    mod = rewrite_scatter_to_gather(mod, 4)  # num_scales is 4 for maskrcnn_resnet50_fpn
+    after = mod["main"]
+    assert not tvm.ir.structural_equal(after, before)
+
     tvm_res_after_rewrite = compile_and_run_vm(mod, params, data_np, "llvm")
 
     # Results should be equivalent after rewriting

From 00257f347faad0b3ec2e9624413015bef34d451f Mon Sep 17 00:00:00 2001
From: Haozheng Fan <hzfan@apache.org>
Date: Thu, 28 Jan 2021 08:32:04 +0800
Subject: [PATCH 122/357] [Autodiff] Deterministic gradient compute (#7321)

* fix unstable compute

* fix

* fix

* lint

* sort linear equation

* sort inequalities

* fix

* fix find

* lint

* fix find

* lint
---
 src/arith/solve_linear_equation.cc   |  9 +++--
 src/arith/solve_linear_inequality.cc | 54 ++++++++++++++--------------
 src/te/autodiff/ad_simplify.cc       | 26 ++++++++------
 3 files changed, 46 insertions(+), 43 deletions(-)

diff --git a/src/arith/solve_linear_equation.cc b/src/arith/solve_linear_equation.cc
index 22bf7360563d..d66e75d9d361 100644
--- a/src/arith/solve_linear_equation.cc
+++ b/src/arith/solve_linear_equation.cc
@@ -427,11 +427,10 @@ IntConstraintsTransform SolveLinearEquations(const IntConstraints& system_to_sol
 
   // We have to transform ranges of the old variables into relations over new variables because
   // new ranges are not enough usually.
-  for (const auto& p : system_to_solve->ranges) {
-    const Var& old_var = p.first;
-    const Range& old_range = p.second;
-    if (old_to_new_map.count(old_var)) {
-      PrimExpr express_by_new_vars = old_to_new_map[old_var];
+  for (const auto& old_var : system_to_solve->variables) {
+    if (system_to_solve->ranges.find(old_var) != system_to_solve->ranges.end()) {
+      const Range& old_range = system_to_solve->ranges.at(old_var);
+      PrimExpr express_by_new_vars = old_to_new_map.at(old_var);
       PrimExpr lower_cond = analyzer_solution.Simplify(old_range->min <= express_by_new_vars);
       PrimExpr upper_cond =
           analyzer_solution.Simplify(express_by_new_vars < old_range->min + old_range->extent);
diff --git a/src/arith/solve_linear_inequality.cc b/src/arith/solve_linear_inequality.cc
index f4de9ffb197b..dd9044833546 100644
--- a/src/arith/solve_linear_inequality.cc
+++ b/src/arith/solve_linear_inequality.cc
@@ -94,11 +94,10 @@ struct ExprLess {
   }
 };
 
-void DebugPrint(
-    const std::unordered_set<PrimExpr, StructuralHash, StructuralEqual>& current_ineq_set,
-    const std::unordered_set<PrimExpr, StructuralHash, StructuralEqual>& next_ineq_set,
-    const std::vector<PrimExpr>& rest, const std::vector<std::pair<int64_t, PrimExpr>>& coef_pos,
-    const std::vector<std::pair<int64_t, PrimExpr>>& coef_neg) {
+void DebugPrint(const std::vector<PrimExpr>& current_ineq_set,
+                const std::vector<PrimExpr>& next_ineq_set, const std::vector<PrimExpr>& rest,
+                const std::vector<std::pair<int64_t, PrimExpr>>& coef_pos,
+                const std::vector<std::pair<int64_t, PrimExpr>>& coef_neg) {
   std::cout << "Current ineq set:\n[";
   for (auto& ineq : current_ineq_set) {
     std::cout << ineq << ", ";
@@ -148,9 +147,12 @@ class NormalizeComparisons : public ExprMutator {
   arith::Analyzer analyzer_;
 };
 
-void AddInequality(std::unordered_set<PrimExpr, StructuralHash, StructuralEqual>* inequality_set,
-                   const PrimExpr& new_ineq, Analyzer* analyzer) {
-  if (analyzer->CanProve(new_ineq) || inequality_set->find(new_ineq) != inequality_set->end()) {
+void AddInequality(std::vector<PrimExpr>* inequality_set, const PrimExpr& new_ineq,
+                   Analyzer* analyzer) {
+  if (analyzer->CanProve(new_ineq) ||
+      std::find_if(inequality_set->begin(), inequality_set->end(), [&](const PrimExpr& e) {
+        return StructuralEqual()(e, new_ineq);
+      }) != inequality_set->end()) {
     // redundant: follows from the vranges
     // or has already been added
     return;
@@ -168,15 +170,13 @@ void AddInequality(std::unordered_set<PrimExpr, StructuralHash, StructuralEqual>
     }
   }
 
-  inequality_set->insert(new_ineq);
+  inequality_set->push_back(new_ineq);
 }
 
-void ClassifyByPolarity(
-    const Var& var,
-    const std::unordered_set<PrimExpr, StructuralHash, StructuralEqual>& current_ineq_set,
-    std::unordered_set<PrimExpr, StructuralHash, StructuralEqual>* next_ineq_set,
-    std::vector<PrimExpr>* rest, std::vector<std::pair<int64_t, PrimExpr>>* coef_pos,
-    std::vector<std::pair<int64_t, PrimExpr>>* coef_neg, Analyzer* analyzer) {
+void ClassifyByPolarity(const Var& var, const std::vector<PrimExpr>& current_ineq_set,
+                        std::vector<PrimExpr>* next_ineq_set, std::vector<PrimExpr>* rest,
+                        std::vector<std::pair<int64_t, PrimExpr>>* coef_pos,
+                        std::vector<std::pair<int64_t, PrimExpr>>* coef_neg, Analyzer* analyzer) {
   // Take formulas from current_ineq_set and classify them according to polarity wrt var
   // and store to coef_pos and coef_neg respectively.
   for (const PrimExpr& ineq : current_ineq_set) {
@@ -218,14 +218,14 @@ void ClassifyByPolarity(
   }
 }
 
-void MoveEquality(std::unordered_set<PrimExpr, StructuralHash, StructuralEqual>* upper_bounds,
-                  std::unordered_set<PrimExpr, StructuralHash, StructuralEqual>* lower_bounds,
-                  std::unordered_set<PrimExpr, StructuralHash, StructuralEqual>* equalities) {
+void MoveEquality(std::vector<PrimExpr>* upper_bounds, std::vector<PrimExpr>* lower_bounds,
+                  std::vector<PrimExpr>* equalities) {
   // those exist in both upper & lower bounds will be moved to equalities
   for (auto ub = upper_bounds->begin(); ub != upper_bounds->end();) {
-    auto lb = lower_bounds->find(*ub);
+    auto lb = std::find_if(lower_bounds->begin(), lower_bounds->end(),
+                           [&](const PrimExpr& e) { return StructuralEqual()(e, *ub); });
     if (lb != lower_bounds->end()) {
-      equalities->insert(*lb);
+      equalities->push_back(*lb);
       lower_bounds->erase(lb);
       ub = upper_bounds->erase(ub);
     } else {
@@ -249,8 +249,8 @@ PartialSolvedInequalities SolveLinearInequalities(const IntConstraints& system_t
   //   and move to the next variable.
 
   // normalized inequality
-  std::unordered_set<PrimExpr, StructuralHash, StructuralEqual> current_ineq_set_to_solve;
-  std::unordered_set<PrimExpr, StructuralHash, StructuralEqual> next_ineq_set_to_solve;
+  std::vector<PrimExpr> current_ineq_set_to_solve;
+  std::vector<PrimExpr> next_ineq_set_to_solve;
   // A vector of pairs (c, e), c > 0, representing formulas of the form c*v + e <= 0
   std::vector<std::pair<int64_t, PrimExpr>> coef_pos;
   // A vector of pairs (c, e), c < 0, representing formulas of the form c*v + e <= 0
@@ -321,8 +321,8 @@ PartialSolvedInequalities SolveLinearInequalities(const IntConstraints& system_t
     }
 
     // The resulting lower and upper bounds
-    std::unordered_set<PrimExpr, StructuralHash, StructuralEqual> upper_bounds;
-    std::unordered_set<PrimExpr, StructuralHash, StructuralEqual> lower_bounds;
+    std::vector<PrimExpr> upper_bounds;
+    std::vector<PrimExpr> lower_bounds;
     upper_bounds.reserve(coef_pos.size());
     lower_bounds.reserve(coef_neg.size());
 
@@ -345,7 +345,7 @@ PartialSolvedInequalities SolveLinearInequalities(const IntConstraints& system_t
         }
       }
       // Add the upper bound
-      upper_bounds.insert(bound);
+      upper_bounds.push_back(bound);
     }
     for (const auto& neg : coef_neg) {
       PrimExpr bound = make_const(v.dtype(), -coef_lcm / neg.first) * neg.second;
@@ -366,10 +366,10 @@ PartialSolvedInequalities SolveLinearInequalities(const IntConstraints& system_t
         }
       }
       // Add the lower bound
-      lower_bounds.insert(bound);
+      lower_bounds.push_back(bound);
     }
 
-    std::unordered_set<PrimExpr, StructuralHash, StructuralEqual> equal;
+    std::vector<PrimExpr> equal;
     equal.reserve(std::min(upper_bounds.size(), lower_bounds.size()));
     MoveEquality(&upper_bounds, &lower_bounds, &equal);
     std::vector<PrimExpr> equal_list(equal.begin(), equal.end());
diff --git a/src/te/autodiff/ad_simplify.cc b/src/te/autodiff/ad_simplify.cc
index cc0e82066171..96f278e63be7 100644
--- a/src/te/autodiff/ad_simplify.cc
+++ b/src/te/autodiff/ad_simplify.cc
@@ -413,15 +413,17 @@ class FactorOutAtomicFormulasFunctor
     auto res_b = VisitExpr(op->b);
 
     // For the And case we return the union of the sets of atomic formulas
-    std::unordered_set<PrimExpr, StructuralHash, StructuralEqual> res_set;
-    res_set.reserve(res_a.atomic_formulas.size() + res_b.atomic_formulas.size());
+    std::unordered_set<PrimExpr, StructuralHash, StructuralEqual> res_a_set;
+    res_a_set.reserve(res_a.atomic_formulas.size());
     std::copy(res_a.atomic_formulas.begin(), res_a.atomic_formulas.end(),
-              std::inserter(res_set, res_set.end()));
-    std::copy(res_b.atomic_formulas.begin(), res_b.atomic_formulas.end(),
-              std::inserter(res_set, res_set.end()));
-
-    std::vector<PrimExpr> res{res_set.begin(), res_set.end()};
+              std::inserter(res_a_set, res_a_set.end()));
 
+    std::vector<PrimExpr> res = res_a.atomic_formulas;
+    for (const auto& e : res_b.atomic_formulas) {
+      if (res_a_set.find(e) == res_a_set.end()) {
+        res.emplace_back(e);
+      }
+    }
     // And the residuals are combined with &&
     return {res, res_a.rest && res_b.rest};
   }
@@ -443,10 +445,13 @@ class FactorOutAtomicFormulasFunctor
 
     // For the Or case we intersect the sets of atomic formulas
     std::unordered_set<PrimExpr, StructuralHash, StructuralEqual> res_set;
+    std::vector<PrimExpr> res;
     res_set.reserve(std::min(res_a.atomic_formulas.size(), res_b.atomic_formulas.size()));
-    for (const auto& res_b_formula : res_b_set) {
+    res.reserve(std::min(res_a.atomic_formulas.size(), res_b.atomic_formulas.size()));
+    for (const auto& res_b_formula : res_b.atomic_formulas) {
       if (res_a_set.count(res_b_formula)) {
         res_set.insert(res_b_formula);
+        res.push_back(res_b_formula);
       }
     }
 
@@ -454,13 +459,13 @@ class FactorOutAtomicFormulasFunctor
     // which are left behind, and then combine them with the residuals into the new residual.
     std::vector<PrimExpr> new_cond_a;
     new_cond_a.reserve(res_a.atomic_formulas.size() - res_set.size());
-    for (const auto& formula : res_a_set) {
+    for (const auto& formula : res_a.atomic_formulas) {
       if (!res_set.count(formula)) new_cond_a.emplace_back(formula);
     }
 
     std::vector<PrimExpr> new_cond_b;
     new_cond_b.reserve(res_b.atomic_formulas.size() - res_set.size());
-    for (const auto& formula : res_b_set) {
+    for (const auto& formula : res_b.atomic_formulas) {
       if (!res_set.count(formula)) new_cond_b.emplace_back(formula);
     }
 
@@ -468,7 +473,6 @@ class FactorOutAtomicFormulasFunctor
     res_b.atomic_formulas = std::move(new_cond_b);
 
     PrimExpr new_rest = res_a.to_expr() || res_b.to_expr();
-    std::vector<PrimExpr> res{res_set.begin(), res_set.end()};
 
     return {res, new_rest};
   }

From d8efe709a7c70c24c7b9cd1b7842677497b342ed Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Wed, 27 Jan 2021 21:08:05 -0500
Subject: [PATCH 123/357] [COMMUNITY] @trevor-m -> reviewer (#7352)

---
 CONTRIBUTORS.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index bf10271b55e1..773f94a50dd9 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -112,6 +112,7 @@ We do encourage everyone to work anything they are interested in.
 - [Sergey Mironov](https://github.com/grwlf): @grwlf
 - [Thierry Moreau](https://github.com/tmoreau89): @tmoreau89
 - [Kazutaka Morita](https://github.com/kazum): @kazum
+- [Trevor Morris](https://github.com/trevor-m): @trevor-m
 - [Tatsuya Nishiyama](https://github.com/nishi-t): @nishi-t
 - [Wei Pan](https://github.com/wpan11nv): @wpan11nv
 - [Krzysztof Parzyszek](https://github.com/kparzysz-quic): @kparzysz-quic

From 913abe087a3054831662b995c2e4f1f2271afbc6 Mon Sep 17 00:00:00 2001
From: Josh Fromm <jwfromm@uw.edu>
Date: Wed, 27 Jan 2021 20:30:30 -0800
Subject: [PATCH 124/357] [Relay][Frontend][Onnx] Robustify Loop Importer
 (#7353)

* Add test for array loop.

* Fixed scalar issue.

* Formatting.

* Fix injective schedule for dynamic shapes.
---
 python/tvm/relay/frontend/onnx.py          | 13 +++-
 python/tvm/topi/x86/injective.py           | 27 ++++----
 tests/python/frontend/onnx/test_forward.py | 74 +++++++++++++++++++---
 3 files changed, 92 insertions(+), 22 deletions(-)

diff --git a/python/tvm/relay/frontend/onnx.py b/python/tvm/relay/frontend/onnx.py
index 7a3b168fc8fd..b1b01b87f715 100644
--- a/python/tvm/relay/frontend/onnx.py
+++ b/python/tvm/relay/frontend/onnx.py
@@ -2227,8 +2227,17 @@ def body_fn(*loop_inputs):
             # Add new scan outputs to tracking
             combined_scan_outputs = []
             for i, scan in enumerate(scan_outputs):
-                new_scan = _op.expand_dims(new_scan_outputs[i], axis=0)
-                combined_scan = _op.concatenate([scan, new_scan], axis=0)
+                rank = len(infer_shape(scan)) - 1
+                new_scan = new_scan_outputs[i]
+                expand_scan = _op.expand_dims(new_scan, axis=0)
+                # For non scalar outputs we need to broadcast the initial value.
+                if rank > 0:
+                    new_scan_shape = _op.shape_of(new_scan, dtype=iter_dtype)
+                    scan_broadcast = _op.concatenate(
+                        [_op.reshape(loop_count, [1]), new_scan_shape], axis=0
+                    )
+                    scan = _op.broadcast_to(scan, scan_broadcast)
+                combined_scan = _op.concatenate([scan, expand_scan], axis=0)
                 combined_scan_outputs.append(combined_scan)
 
             # Increment counter.
diff --git a/python/tvm/topi/x86/injective.py b/python/tvm/topi/x86/injective.py
index 29f903fd4e35..6492b78d6037 100644
--- a/python/tvm/topi/x86/injective.py
+++ b/python/tvm/topi/x86/injective.py
@@ -17,6 +17,7 @@
 # pylint: disable=invalid-name
 """x86 declaration and schedules."""
 from tvm import te
+from tvm.tir import IntImm
 from ..utils import is_empty_shape
 
 
@@ -100,18 +101,20 @@ def schedule_concatenate(outs):
     def vectorize(sch, tensor, vectorize_limit):
         """Internal vectorization function for concatenate."""
         inner_axis = s[tensor].op.axis[len(s[tensor].op.axis) - 1]
-        inner_length = tensor.shape[len(tensor.shape) - 1].value
-        if inner_length <= vectorize_limit:
-            sch[tensor].vectorize(inner_axis)
-        else:
-            split_factor = 1
-            for i in range(vectorize_limit, 1, -1):
-                if inner_length % i == 0:
-                    split_factor = i
-                    break
-            if split_factor > 1:
-                _, inner_i = sch[tensor].split(inner_axis, split_factor)
-                sch[tensor].vectorize(inner_i)
+        # Check that the tensor shape is static. Otherwise skip vectorization.
+        if isinstance(tensor.shape[len(tensor.shape) - 1], IntImm):
+            inner_length = tensor.shape[len(tensor.shape) - 1].value
+            if inner_length <= vectorize_limit:
+                sch[tensor].vectorize(inner_axis)
+            else:
+                split_factor = 1
+                for i in range(vectorize_limit, 1, -1):
+                    if inner_length % i == 0:
+                        split_factor = i
+                        break
+                if split_factor > 1:
+                    _, inner_i = sch[tensor].split(inner_axis, split_factor)
+                    sch[tensor].vectorize(inner_i)
 
     outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
     x = outs[0]
diff --git a/tests/python/frontend/onnx/test_forward.py b/tests/python/frontend/onnx/test_forward.py
index 20937d2060c5..c666604d0e89 100644
--- a/tests/python/frontend/onnx/test_forward.py
+++ b/tests/python/frontend/onnx/test_forward.py
@@ -3654,14 +3654,14 @@ def verify_cond_loop():
 
 
 def verify_count_loop():
-    y_in = helper.make_tensor_value_info("y_in", TensorProto.FLOAT, [1])
-    y_out = helper.make_tensor_value_info("y_out", TensorProto.FLOAT, [1])
-    scan_out = helper.make_tensor_value_info("scan_out", TensorProto.FLOAT, [1])
+    y_in = helper.make_tensor_value_info("y_in", TensorProto.FLOAT, [])
+    y_out = helper.make_tensor_value_info("y_out", TensorProto.FLOAT, [])
+    scan_out = helper.make_tensor_value_info("scan_out", TensorProto.FLOAT, [])
     cond_in = helper.make_tensor_value_info("cond_in", TensorProto.BOOL, [])
     cond_out = helper.make_tensor_value_info("cond_out", TensorProto.BOOL, [])
     iter_count = helper.make_tensor_value_info("iter_count", TensorProto.INT64, [])
 
-    y = np.array([-2]).astype(np.float32)
+    y = np.array(-2).astype(np.float32)
 
     iter_cast_node = helper.make_node(
         "Cast", inputs=["iter_count"], outputs=["iter_cast"], to=onnx.TensorProto.FLOAT
@@ -3693,11 +3693,11 @@ def verify_count_loop():
         inputs=[
             onnx.helper.make_tensor_value_info("trip_count", onnx.TensorProto.INT64, []),
             onnx.helper.make_tensor_value_info("cond", onnx.TensorProto.BOOL, []),
-            onnx.helper.make_tensor_value_info("y", onnx.TensorProto.FLOAT, [1]),
+            onnx.helper.make_tensor_value_info("y", onnx.TensorProto.FLOAT, []),
         ],
         outputs=[
-            onnx.helper.make_tensor_value_info("res_y", onnx.TensorProto.FLOAT, [1]),
-            onnx.helper.make_tensor_value_info("res_scan", onnx.TensorProto.FLOAT, [5, 1]),
+            onnx.helper.make_tensor_value_info("res_y", onnx.TensorProto.FLOAT, []),
+            onnx.helper.make_tensor_value_info("res_scan", onnx.TensorProto.FLOAT, [5]),
         ],
     )
     loop_model = onnx.helper.make_model(loop_graph)
@@ -3708,11 +3708,69 @@ def verify_count_loop():
     verify_with_ort_with_inputs(loop_model, input_vals, use_vm=True, freeze_params=True)
 
 
+def verify_tensor_loop():
+    y_in = helper.make_tensor_value_info("y_in", TensorProto.FLOAT, [3, 3, 3, 3])
+    y_out = helper.make_tensor_value_info("y_out", TensorProto.FLOAT, [3, 3, 3, 3])
+    scan_out = helper.make_tensor_value_info("scan_out", TensorProto.FLOAT, [3, 3, 3, 3])
+    cond_in = helper.make_tensor_value_info("cond_in", TensorProto.BOOL, [])
+    cond_out = helper.make_tensor_value_info("cond_out", TensorProto.BOOL, [])
+    iter_count = helper.make_tensor_value_info("iter_count", TensorProto.INT64, [])
+
+    y = np.random.normal(size=[3, 3, 3, 3]).astype(np.float32)
+
+    iter_cast_node = helper.make_node(
+        "Cast", inputs=["iter_count"], outputs=["iter_cast"], to=onnx.TensorProto.FLOAT
+    )
+
+    y_add_node = helper.make_node("Add", inputs=["y_in", "iter_cast"], outputs=["y_out"])
+
+    identity_node = helper.make_node("Identity", inputs=["cond_in"], outputs=["cond_out"])
+
+    scan_identity_node = helper.make_node("Identity", inputs=["y_out"], outputs=["scan_out"])
+
+    loop_body = helper.make_graph(
+        [identity_node, iter_cast_node, y_add_node, scan_identity_node],
+        "loop_body",
+        [iter_count, cond_in, y_in],
+        [cond_out, y_out, scan_out],
+    )
+
+    loop_node = helper.make_node(
+        "Loop", inputs=["trip_count", "cond", "y"], outputs=["res_y", "res_scan"], body=loop_body
+    )
+
+    trip_count = np.array(5).astype(np.int64)
+    cond = np.array(1).astype(np.bool)
+    loop_graph = onnx.helper.make_graph(
+        [loop_node],
+        "loop_outer",
+        inputs=[
+            onnx.helper.make_tensor_value_info("trip_count", onnx.TensorProto.INT64, []),
+            onnx.helper.make_tensor_value_info("cond", onnx.TensorProto.BOOL, []),
+            onnx.helper.make_tensor_value_info("y", onnx.TensorProto.FLOAT, [3, 3, 3, 3]),
+        ],
+        outputs=[
+            onnx.helper.make_tensor_value_info("res_y", onnx.TensorProto.FLOAT, [3, 3, 3, 3]),
+            onnx.helper.make_tensor_value_info("res_scan", onnx.TensorProto.FLOAT, [5, 3, 3, 3, 3]),
+        ],
+    )
+    loop_model = onnx.helper.make_model(loop_graph)
+
+    trip_count = np.array(5).astype(np.int64)
+    cond = np.array(1).astype(np.bool)
+    input_vals = [trip_count, cond, y]
+    verify_with_ort_with_inputs(
+        loop_model, input_vals, use_vm=True, freeze_params=True, convert_to_static=True
+    )
+
+
 def test_loop():
     # Test a loop that exits once a condition is met.
     verify_cond_loop()
-    # Test a loop that exits after a fixed number of iterations.
+    # Test a loop that exits after a fixed number of iterations with scalar outputs.
     verify_count_loop()
+    # Test a loop that uses an array output.
+    verify_tensor_loop()
 
 
 def verify_if(cond_array):

From 02fefbc1df0dec8989105076f48eace34027a31b Mon Sep 17 00:00:00 2001
From: Matthew Brookhart <mbrookhart@octoml.ai>
Date: Wed, 27 Jan 2021 21:30:52 -0700
Subject: [PATCH 125/357] If an expression has two branches, and the pattern
 ignores one with a wildcard, allow grouping via dominator analysis (#7355)

---
 src/relay/ir/dataflow_matcher.cc            |  3 +-
 src/relay/ir/indexed_graph.h                | 22 +++++++
 tests/python/relay/test_dataflow_pattern.py | 71 +++++++++++++++++++++
 3 files changed, 95 insertions(+), 1 deletion(-)

diff --git a/src/relay/ir/dataflow_matcher.cc b/src/relay/ir/dataflow_matcher.cc
index 0d9481312137..cfacd41487c8 100644
--- a/src/relay/ir/dataflow_matcher.cc
+++ b/src/relay/ir/dataflow_matcher.cc
@@ -730,7 +730,8 @@ class PatternGrouper {
           auto node = matcher_->expr_graph_.node_map_.at(kv.first);
           for (auto* output : node->outputs_) {
             // and the node is used by nodes outside of the group
-            if (memo.count(output->ref_) == 0) {
+            if (memo.count(output->ref_) == 0 &&
+                !matcher_->expr_graph_.node_map_.at(expr)->Dominates(output)) {
               // Exit because nodes in this pattern's body are used outside the pattern
               // fusing it would be invalid
               return;
diff --git a/src/relay/ir/indexed_graph.h b/src/relay/ir/indexed_graph.h
index 4bbb741b760d..d073bcaeea5c 100644
--- a/src/relay/ir/indexed_graph.h
+++ b/src/relay/ir/indexed_graph.h
@@ -27,6 +27,7 @@
 #include <tvm/relay/dataflow_pattern.h>
 
 #include <memory>
+#include <stack>
 #include <unordered_map>
 #include <unordered_set>
 #include <utility>
@@ -74,6 +75,27 @@ class IndexedGraph {
     Node* dominator_parent_;
     /*! \brief The nodes this node dominates */
     std::vector<Node*> dominator_children_;
+
+    bool Dominates(const Node* other) {
+      std::stack<const Node*> stack;
+      std::unordered_set<const Node*> visited;
+      stack.push(this);
+      while (!stack.empty()) {
+        const Node* current = stack.top();
+        stack.pop();
+        for (auto node : current->dominator_children_) {
+          if (visited.count(node) == 0) {
+            if (other == node) {
+              return true;
+            } else {
+              stack.push(node);
+            }
+            visited.insert(node);
+          }
+        }
+      }
+      return false;
+    }
   };
   /*! \brief Construct the domination tree inside IndexedGraph */
   void PostDom() {
diff --git a/tests/python/relay/test_dataflow_pattern.py b/tests/python/relay/test_dataflow_pattern.py
index e7b367b8f631..15d3ee035450 100644
--- a/tests/python/relay/test_dataflow_pattern.py
+++ b/tests/python/relay/test_dataflow_pattern.py
@@ -16,6 +16,7 @@
 # under the License.
 # pylint: disable=unused-wildcard-import
 import numpy as np
+import pytest
 
 import tvm
 from tvm import relay
@@ -1470,6 +1471,76 @@ def test_partition_function():
     assert tvm.ir.structural_equal(pattern.partition(expr), expr2)
 
 
+def test_rewrite_function_with_fuzzy_body():
+    """Allow Rewriting a function with a fuzzy body via dominator analysis"""
+    x = relay.var("x")
+    w = relay.var("w")
+    b = relay.var("b")
+
+    x1 = relay.var("x1")
+    w1 = relay.var("w1")
+
+    wc_x = wildcard()
+    wc_w = wildcard()
+    wc_b = wildcard()
+    wc_x1 = wildcard()
+    wc_w1 = wildcard()
+
+    func_pattern = FunctionPattern([wc_x1, wc_w1], wildcard())
+    pattern = func_pattern(wc_x, wc_w) + wc_b
+
+    func = relay.Function([x1, w1], relay.nn.conv2d(x1, w1))
+    expr = func(x, w) + b + b
+
+    class TestRewrite(DFPatternCallback):
+        def __init__(self):
+            super(TestRewrite, self).__init__()
+            self.pattern = pattern
+
+        def callback(self, pre, post, node_map):
+            return x + w
+
+    out = rewrite(TestRewrite(), expr)
+    assert tvm.ir.structural_equal(x + w, x + w)
+
+
+@pytest.mark.skip(
+    """TODO(mbrookhart): The current partitioner can't properly handle 
+                       the partitioned inputs on the fuzzy body"""
+)
+def test_partition_function_with_fuzzy_body():
+    """
+    Allow Rewriting a function with a fuzzy body via dominator analysis
+    """
+    x = relay.var("x")
+    w = relay.var("w")
+    b = relay.var("b")
+
+    x1 = relay.var("x1")
+    w1 = relay.var("w1")
+
+    wc_x = wildcard()
+    wc_w = wildcard()
+    wc_b = wildcard()
+    wc_x1 = wildcard()
+    wc_w1 = wildcard()
+
+    func_pattern = FunctionPattern([wc_x1, wc_w1], wildcard())
+    pattern = func_pattern(wc_x, wc_w) + wc_b
+
+    func = relay.Function([x1, w1], relay.nn.conv2d(x1, w1))
+    expr = func(x, w) + b + b
+
+    x2 = relay.var("x2")
+    w2 = relay.var("w2")
+    b2 = relay.var("b2")
+    func2 = relay.Function([x2, w2, b2], func(x2, w2) + b2).with_attr(
+        "PartitionedFromPattern", "FunctionCall_add_"
+    )
+    expr2 = func2(x, w, b) + b
+    assert tvm.ir.structural_equal(pattern.partition(expr), expr2)
+
+
 def test_match_match():
     add_pattern = is_op("add")(wildcard(), wildcard())
 

From 8b84e33679585082fd1817821eac8a7eae5830c6 Mon Sep 17 00:00:00 2001
From: Matthew Brookhart <mbrookhart@octoml.ai>
Date: Wed, 27 Jan 2021 21:31:18 -0700
Subject: [PATCH 126/357] Fold If when the condition is Constant (#7354)

---
 src/relay/transforms/fold_constant.cc         | 12 ++++++
 tests/python/relay/test_pass_fold_constant.py | 39 +++++++++++++++++++
 2 files changed, 51 insertions(+)

diff --git a/src/relay/transforms/fold_constant.cc b/src/relay/transforms/fold_constant.cc
index 48af31f9a11f..66f233bbba85 100644
--- a/src/relay/transforms/fold_constant.cc
+++ b/src/relay/transforms/fold_constant.cc
@@ -120,6 +120,18 @@ class ConstantFolder : public MixedModeMutator {
     }
   }
 
+  Expr VisitExpr_(const IfNode* op) final {
+    auto new_cond = ExprMutator::VisitExpr(op->cond);
+    if (auto const_cond = new_cond.as<ConstantNode>()) {
+      if (reinterpret_cast<uint8_t*>(const_cond->data->data)[0]) {
+        return ExprMutator::VisitExpr(op->true_branch);
+      } else {
+        return ExprMutator::VisitExpr(op->false_branch);
+      }
+    }
+    return ExprMutator::VisitExpr_(op);
+  }
+
   Expr Rewrite_(const CallNode* call, const Expr& post) final {
     if (inside_primitive) {
       return GetRef<Expr>(call);
diff --git a/tests/python/relay/test_pass_fold_constant.py b/tests/python/relay/test_pass_fold_constant.py
index 549596d61693..76182d2c3e08 100644
--- a/tests/python/relay/test_pass_fold_constant.py
+++ b/tests/python/relay/test_pass_fold_constant.py
@@ -147,6 +147,45 @@ def expected():
     assert tvm.ir.structural_equal(zz, zexpected)
 
 
+def test_fold_if():
+    cond_data = np.array(1).astype("bool")
+    x_data = np.array([[1, 2, 3]]).astype("float32")
+
+    def before():
+        a = relay.const(cond_data)
+        x = relay.const(x_data)
+        y = relay.const(x_data)
+        iff = relay.If(a, x + y, x - y)
+        return relay.Function([], iff)
+
+    def expected():
+        y_data = x_data + x_data
+        y = relay.const(y_data)
+        return relay.Function([], y)
+
+    zz = run_opt_pass(before(), transform.FoldConstant())
+    zexpected = run_opt_pass(expected(), transform.InferType())
+    assert tvm.ir.structural_equal(zz, zexpected)
+
+    cond_data = np.array(0).astype("bool")
+
+    def before():
+        a = relay.const(cond_data)
+        x = relay.const(x_data)
+        y = relay.const(x_data)
+        iff = relay.If(a, x + y, x - y)
+        return relay.Function([], iff)
+
+    def expected():
+        y_data = x_data - x_data
+        y = relay.const(y_data)
+        return relay.Function([], y)
+
+    zz = run_opt_pass(before(), transform.FoldConstant())
+    zexpected = run_opt_pass(expected(), transform.InferType())
+    assert tvm.ir.structural_equal(zz, zexpected)
+
+
 def test_fold_shape_of():
     c_shape = (8, 9, 10)
 

From cbc035f70a0cd2b3b85681fb77f843bb9b74b1ea Mon Sep 17 00:00:00 2001
From: "Matt Welsh (OctoML)" <63477620+mdw-octoml@users.noreply.github.com>
Date: Wed, 27 Jan 2021 20:59:26 -0800
Subject: [PATCH 127/357] Update uTVM code to work with the nRF5340DK dev
 board. (#7331)

* Various fixes to get nRF5340 working. Not yet there.

* nRF5340 test runs locally.

* Various fixes to get nRF5340 working. Not yet there.

* nRF5340 test runs locally.

* Add `nrfjprog --recover` for nRF5340DK

* Cleanup.

* Remove debugging code.

* Revert submodule update.

* Remove debugging code.

* Fix comment.

* Remove -keys argument.

* Adding some debugging code

* Fix passing west command to ZephyrFlasher.

* Various fixes to get nRF5340 working. Not yet there.

* nRF5340 test runs locally.

* Add `nrfjprog --recover` for nRF5340DK

* Cleanup.

* Various fixes to get nRF5340 working. Not yet there.

* nRF5340 test runs locally.

* Remove debugging code.

* Fix comment.

* Remove -keys argument.

* Fix merge.
---
 .../reference-vm/zephyr/pyproject.toml        |  3 ++
 python/tvm/micro/contrib/zephyr.py            | 23 +++++++++++--
 python/tvm/target/target.py                   |  3 ++
 tests/micro/qemu/conftest.py                  |  9 +++++
 tests/micro/qemu/test_zephyr.py               | 33 ++++++++++---------
 5 files changed, 53 insertions(+), 18 deletions(-)

diff --git a/apps/microtvm/reference-vm/zephyr/pyproject.toml b/apps/microtvm/reference-vm/zephyr/pyproject.toml
index f21c272731c4..b4cfc544df58 100644
--- a/apps/microtvm/reference-vm/zephyr/pyproject.toml
+++ b/apps/microtvm/reference-vm/zephyr/pyproject.toml
@@ -64,6 +64,9 @@ scipy = "^1.4"
 python = "^3.6"
 tornado = "^6"
 typed_ast = "^1.4"
+pyyaml = "^5.4.1"
+pyserial = "^3.5"
+
 
 # AutoTVM
 xgboost = {version = "^1.1", optional = true}
diff --git a/python/tvm/micro/contrib/zephyr.py b/python/tvm/micro/contrib/zephyr.py
index fa032e20c930..ed1c9866c741 100644
--- a/python/tvm/micro/contrib/zephyr.py
+++ b/python/tvm/micro/contrib/zephyr.py
@@ -191,7 +191,7 @@ def library(self, output, sources, options=None):
         with open(os.path.join(output, "main.c"), "w"):
             pass
 
-        # expecetd not to exist after populate_tvm_libs
+        # expected not to exist after populate_tvm_libs
         build_dir = os.path.join(output, "__tvm_build")
         os.mkdir(build_dir)
         self._subprocess_env.run(
@@ -241,11 +241,12 @@ def binary(self, output, objects, options=None, link_main=True, main_options=Non
     def flasher_factory(self):
         return compiler.FlasherFactory(
             ZephyrFlasher,
-            (self._west_cmd,),
+            (self._board,),
             dict(
                 zephyr_base=self._zephyr_base,
                 project_dir=self._project_dir,
                 subprocess_env=self._subprocess_env.default_overrides,
+                west_cmd=self._west_cmd,
             ),
         )
 
@@ -291,7 +292,7 @@ class ZephyrFlasher(tvm.micro.compiler.Flasher):
 
     def __init__(
         self,
-        west_cmd,
+        board,
         zephyr_base=None,
         project_dir=None,
         subprocess_env=None,
@@ -300,6 +301,7 @@ def __init__(
         flash_args=None,
         debug_rpc_session=None,
         serial_timeouts=None,
+        west_cmd=None,
     ):
         zephyr_base = zephyr_base or os.environ["ZEPHYR_BASE"]
         sys.path.insert(0, os.path.join(zephyr_base, "scripts", "dts"))
@@ -310,6 +312,7 @@ def __init__(
         finally:
             sys.path.pop(0)
 
+        self._board = board
         self._zephyr_base = zephyr_base
         self._project_dir = project_dir
         self._west_cmd = west_cmd
@@ -414,6 +417,20 @@ def flash(self, micro_binary):
         build_dir = os.path.dirname(
             micro_binary.abspath(micro_binary.labelled_files["cmake_cache"][0])
         )
+
+        # The nRF5340DK requires an additional `nrfjprog --recover` before each flash cycle.
+        # This is because readback protection is enabled by default when this device is flashed.
+        # Otherwise, flashing may fail with an error such as the following:
+        #  ERROR: The operation attempted is unavailable due to readback protection in
+        #  ERROR: your device. Please use --recover to unlock the device.
+        if (
+            self._board.startswith("nrf5340dk")
+            and self._get_flash_runner(cmake_entries) == "nrfjprog"
+        ):
+            recover_args = ["nrfjprog", "--recover"]
+            recover_args.extend(self._get_nrf_device_args())
+            self._subprocess_env.run(recover_args, cwd=build_dir)
+
         west_args = (
             self._west_cmd
             + ["flash", "--build-dir", build_dir, "--skip-rebuild"]
diff --git a/python/tvm/target/target.py b/python/tvm/target/target.py
index 161cd549fade..0ebf31ae6462 100644
--- a/python/tvm/target/target.py
+++ b/python/tvm/target/target.py
@@ -234,7 +234,10 @@ def micro(model="unknown", options=None):
     trans_table = {
         "host": [],
         "stm32f746xx": ["-mcpu=cortex-m7", "-march=armv7e-m"],
+        "nrf5340dk": ["-mcpu=cortex-m33"],
     }
+    if model not in trans_table:
+        raise ValueError(f"Model {model} not supported by tvm.target.micro.")
     opts = _merge_opts(
         trans_table[model] + ["-runtime=c", "--system-lib", f"-model={model}"],
         options,
diff --git a/tests/micro/qemu/conftest.py b/tests/micro/qemu/conftest.py
index e6cd9f2ffb1a..3fc54df02063 100644
--- a/tests/micro/qemu/conftest.py
+++ b/tests/micro/qemu/conftest.py
@@ -14,6 +14,7 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
+import pytest
 
 
 def pytest_addoption(parser):
@@ -25,8 +26,16 @@ def pytest_addoption(parser):
             "for microTVM tests."
         ),
     )
+    parser.addoption(
+        "--west-cmd", default="west", help="Path to `west` command for flashing device."
+    )
 
 
 def pytest_generate_tests(metafunc):
     if "platform" in metafunc.fixturenames:
         metafunc.parametrize("platform", metafunc.config.getoption("microtvm_platforms").split(","))
+
+
+@pytest.fixture
+def west_cmd(request):
+    return request.config.getoption("--west-cmd")
diff --git a/tests/micro/qemu/test_zephyr.py b/tests/micro/qemu/test_zephyr.py
index ab3a25d36543..865c7f88806f 100644
--- a/tests/micro/qemu/test_zephyr.py
+++ b/tests/micro/qemu/test_zephyr.py
@@ -43,15 +43,15 @@
 TARGET = None
 
 
-def _make_sess_from_op(model, zephyr_board, op_name, sched, arg_bufs):
+def _make_sess_from_op(model, zephyr_board, west_cmd, op_name, sched, arg_bufs):
     target = tvm.target.target.micro(model)
     with tvm.transform.PassContext(opt_level=3, config={"tir.disable_vectorize": True}):
         mod = tvm.build(sched, arg_bufs, target, target_host=target, name=op_name)
 
-    return _make_session(model, target, zephyr_board, mod)
+    return _make_session(model, target, zephyr_board, west_cmd, mod)
 
 
-def _make_session(model, target, zephyr_board, mod):
+def _make_session(model, target, zephyr_board, west_cmd, mod):
     test_name = f"{os.path.splitext(os.path.abspath(__file__))[0]}-{model}"
     prev_build = f"{test_name}-last-build.micro-binary"
     workspace_root = (
@@ -65,8 +65,9 @@ def _make_session(model, target, zephyr_board, mod):
     project_dir = os.path.join(os.path.dirname(__file__) or ".", "zephyr-runtime")
     compiler = zephyr.ZephyrCompiler(
         project_dir=project_dir,
-        board="nucleo_f746zg" if "stm32f746" in str(target) else "qemu_x86",
+        board=zephyr_board,
         zephyr_toolchain_variant="zephyr",
+        west_cmd=west_cmd,
     )
 
     opts = tvm.micro.default_options(f"{project_dir}/crt")
@@ -106,12 +107,12 @@ def _make_session(model, target, zephyr_board, mod):
     return tvm.micro.Session(**session_kw)
 
 
-def _make_add_sess(model, zephyr_board):
+def _make_add_sess(model, zephyr_board, west_cmd):
     A = tvm.te.placeholder((2,), dtype="int8")
     B = tvm.te.placeholder((1,), dtype="int8")
     C = tvm.te.compute(A.shape, lambda i: A[i] + B[0], name="C")
     sched = tvm.te.create_schedule(C.op)
-    return _make_sess_from_op(model, zephyr_board, "add", sched, [A, B, C])
+    return _make_sess_from_op(model, zephyr_board, west_cmd, "add", sched, [A, B, C])
 
 
 # The models that should pass this configuration. Maps a short, identifying platform string to
@@ -119,11 +120,12 @@ def _make_add_sess(model, zephyr_board):
 PLATFORMS = {
     "host": ("host", "qemu_x86"),
     "stm32f746xx": ("stm32f746xx", "nucleo_f746zg"),
+    "nrf5340dk": ("nrf5340dk", "nrf5340dk_nrf5340_cpuapp"),
 }
 
 
 # The same test code can be executed on both the QEMU simulation and on real hardware.
-def test_compile_runtime(platform):
+def test_compile_runtime(platform, west_cmd):
     """Test compiling the on-device runtime."""
 
     model, zephyr_board = PLATFORMS[platform]
@@ -141,11 +143,11 @@ def test_basic_add(sess):
         system_lib.get_function("add")(A_data, B_data, C_data)
         assert (C_data.asnumpy() == np.array([6, 7])).all()
 
-    with _make_add_sess(model, zephyr_board) as sess:
+    with _make_add_sess(model, zephyr_board, west_cmd) as sess:
         test_basic_add(sess)
 
 
-def test_platform_timer(platform):
+def test_platform_timer(platform, west_cmd):
     """Test compiling the on-device runtime."""
 
     model, zephyr_board = PLATFORMS[platform]
@@ -168,11 +170,11 @@ def test_basic_add(sess):
         assert result.mean > 0
         assert len(result.results) == 3
 
-    with _make_add_sess(model, zephyr_board) as sess:
+    with _make_add_sess(model, zephyr_board, west_cmd) as sess:
         test_basic_add(sess)
 
 
-def test_relay(platform):
+def test_relay(platform, west_cmd):
     """Testing a simple relay graph"""
     model, zephyr_board = PLATFORMS[platform]
     shape = (10,)
@@ -188,7 +190,7 @@ def test_relay(platform):
     with tvm.transform.PassContext(opt_level=3, config={"tir.disable_vectorize": True}):
         graph, mod, params = tvm.relay.build(func, target=target)
 
-    with _make_session(model, target, zephyr_board, mod) as session:
+    with _make_session(model, target, zephyr_board, west_cmd, mod) as session:
         graph_mod = tvm.micro.create_local_graph_runtime(
             graph, session.get_system_lib(), session.context
         )
@@ -254,14 +256,14 @@ def visit_call(self, call):
         return super().visit_call(call)
 
 
-def check_result(relay_mod, model, zephyr_board, map_inputs, out_shape, result):
+def check_result(relay_mod, model, zephyr_board, west_cmd, map_inputs, out_shape, result):
     """Helper function to verify results"""
     TOL = 1e-5
     target = tvm.target.target.micro(model)
     with tvm.transform.PassContext(opt_level=3, config={"tir.disable_vectorize": True}):
         graph, mod, params = tvm.relay.build(relay_mod, target=target)
 
-    with _make_session(model, target, zephyr_board, mod) as session:
+    with _make_session(model, target, zephyr_board, west_cmd, mod) as session:
         rt_mod = tvm.micro.create_local_graph_runtime(
             graph, session.get_system_lib(), session.context
         )
@@ -280,7 +282,7 @@ def check_result(relay_mod, model, zephyr_board, map_inputs, out_shape, result):
             tvm.testing.assert_allclose(out.asnumpy(), results[idx], rtol=TOL, atol=TOL)
 
 
-def test_byoc_utvm(platform):
+def test_byoc_utvm(platform, west_cmd):
     """This is a simple test case to check BYOC capabilities of uTVM"""
     model, zephyr_board = PLATFORMS[platform]
     x = relay.var("x", shape=(10, 10))
@@ -335,6 +337,7 @@ def test_byoc_utvm(platform):
         ),
         model=model,
         zephyr_board=zephyr_board,
+        west_cmd=west_cmd,
     )
 
 

From dda8f5d944747b9f48b9155e866fd0f746fcd9bb Mon Sep 17 00:00:00 2001
From: ANSHUMAN TRIPATHY <anshuman.t@huawei.com>
Date: Thu, 28 Jan 2021 11:28:13 +0530
Subject: [PATCH 128/357] [Frontend][Tensorflow] Sparse dense matmul adjoint
 option added (#7267)

* [Frontend][Tensorflow] Sparse dense matmul adjoint option added

* [1] Review comments handled

* [2] Review comments handled

* [3] Review comments handled
---
 python/tvm/relay/frontend/tensorflow.py       | 69 ++++++++++++-------
 .../frontend/tensorflow/test_forward.py       | 12 ++--
 2 files changed, 53 insertions(+), 28 deletions(-)

diff --git a/python/tvm/relay/frontend/tensorflow.py b/python/tvm/relay/frontend/tensorflow.py
index 2c7361a7d813..b34e6c723645 100644
--- a/python/tvm/relay/frontend/tensorflow.py
+++ b/python/tvm/relay/frontend/tensorflow.py
@@ -926,13 +926,6 @@ def _impl(inputs, attr, params, mod):
 
         data = inputs[3]
 
-        # By default, in tensorflow the first input ,i.e., data is sparse
-        sparse_lhs = True
-
-        # If both are true means First input was dense and second was sparse
-        if attr.get("adjoint_a") and attr.get("adjoint_b"):
-            sparse_lhs = False
-
         rows = [x[0] for x in indices_tensor]
         cols = [x[1] for x in indices_tensor]
 
@@ -941,9 +934,53 @@ def _impl(inputs, attr, params, mod):
             (values_tensor, (rows, cols)), shape=tuple(dense_shape_tensor.tolist())
         )
 
-        if sparse_lhs:
+        # As per tensorflow implementation, we have 4 possible input combination
+        # and the first input(A) is always sparse and second input(B) is always dense.
+        # Case 1: A , B , adjoint_a=False, adjoint_b=False  --> A * B
+        # Case 2: A , B , adjoint_a=True,   adjoint_b=False  --> A.T * B
+        # Case 3: A , B , adjoint_a=False, adjoint_b=True    --> A * B.T
+        # Case 4: A , B , adjoint_a=True,   adjoint_b=True    --> A.T * B.T
+        #
+        # Topi implementation for sparse_dense(matmul) has 2 possible input
+        # combination where first input(A) is always dense
+        # and second input(B) is always sparse.
+        # Case 1: A , B, sparse_lhs = False  --> A * B.T
+        # Case 2: A , B, sparse_lhs = True    --> B * A.T
+        #
+        # The mapping would be as below:
+        # TF Case 1: A , B , adjoint_a=False, adjoint_b=False
+        #           --> In TF: A * B   --> In Topi: A * B.T.T
+        #           --> sparse_dense(transpose(B), A, sparse_lhs=True)
+        #
+        # TF Case 2: A , B , adjoint_a=True, adjoint_b=False
+        #           --> In TF: A.T * B   --> In Topi: A.T * B.T.T
+        #           --> sparse_dense(transpose(B), transpose(A), sparse_lhs=True)
+        #
+        # TF Case 3: A , B , adjoint_a=False, adjoint_b=True
+        #           --> In TF: A * B.T   --> In Topi: A * B
+        #           --> sparse_dense(B, A, sparse_lhs=True)
+        #
+        # TF Case 4: A , B , adjoint_a=True, adjoint_b=True
+        #           --> In TF: A.T * B.T   --> In Topi: (B * A.T).T
+        #           --> transpose(sparse_dense(B, transpose(A), sparse_lhs=False))
+
+        # By default, in tensorflow the first input ,i.e., data is sparse
+        sparse_lhs = True
+
+        # TF Case 1:
+        if not attr.get("adjoint_a") and not attr.get("adjoint_b"):
+            data = _op.transpose(data)
+        # TF Case 2:
+        elif attr.get("adjoint_a") and not attr.get("adjoint_b"):
             data = _op.transpose(data)
+            weight_sp = csr_matrix(weight_sp.transpose())
+        # TF Case 3:
+        elif not attr.get("adjoint_a") and attr.get("adjoint_b"):
+            pass
+        # TF Case 4:
+        # attr.get("adjoint_a") and attr.get("adjoint_b"):
         else:
+            sparse_lhs = False
             weight_sp = csr_matrix(weight_sp.transpose())
 
         weight_data = _expr.const(weight_sp.data, weight_sp.data.dtype)
@@ -953,23 +990,9 @@ def _impl(inputs, attr, params, mod):
         ret = _op.nn.sparse_dense(data, [weight_data, weight_indices, weight_indptrs], sparse_lhs)
 
         if not sparse_lhs:
+            # TF Case 4
             ret = _op.transpose(ret)
 
-        # Case 1. If both are true means first input was dense and second was sparse
-        # Case 2. If both are false means first input was sparse and second was dense
-        # TODO(ANSHUMAN87): Support other adjoint option too
-        if not (
-            (attr.get("adjoint_a") and attr.get("adjoint_b"))
-            or ((not attr.get("adjoint_a")) and (not attr.get("adjoint_b")))
-        ):
-            raise tvm.error.OpAttributeUnImplemented(
-                "Only tf.sparse.sparse_dense_matmul() with adjoint_a=True and adjoint_b=True"
-                "or with adjoint_a=False and adjoint_b=False"
-                " is supported, but adjoint_a={} and adjoint_b={} was supplied.".format(
-                    attr.get("adjoint_a"), attr.get("adjoint_b")
-                )
-            )
-
         return ret
 
     return _impl
diff --git a/tests/python/frontend/tensorflow/test_forward.py b/tests/python/frontend/tensorflow/test_forward.py
index 3c30b6662c81..34ee0f3528ae 100644
--- a/tests/python/frontend/tensorflow/test_forward.py
+++ b/tests/python/frontend/tensorflow/test_forward.py
@@ -1758,19 +1758,21 @@ def test_forward_batch_matmul():
 # ----------------------------------
 
 
-def _test_sparse_dense_matmul(indices, values, A_shape, B_shape, dtype, flip=False):
+def _test_sparse_dense_matmul(indices, values, A_inp_shape, B_inp_shape, dtype, flip=False):
     """ One iteration of sparse_dense_matmul """
 
-    # TODO(ANSHUMAN87): Support adjoint options too
-    for adjoint_a in [False]:
-        for adjoint_b in [False]:
+    for adjoint_a in [False, True]:
+        for adjoint_b in [False, True]:
+            A_shape = A_inp_shape[::-1] if adjoint_a else A_inp_shape
+            B_shape = B_inp_shape[::-1] if adjoint_b else B_inp_shape
+
             with tf.Graph().as_default():
                 A_sp = tf.sparse.SparseTensor(indices=indices, values=values, dense_shape=A_shape)
                 B = tf.placeholder(shape=B_shape, dtype=dtype, name="B")
 
                 if flip:
                     result = tf.sparse.sparse_dense_matmul(
-                        B, A_sp, adjoint_a=adjoint_a, adjoint_b=adjoint_b
+                        B, A_sp, adjoint_a=adjoint_b, adjoint_b=adjoint_a
                     )
                 else:
                     result = tf.sparse.sparse_dense_matmul(

From 67acad3aeb5c931975eb51d4eb60e1defc843ce7 Mon Sep 17 00:00:00 2001
From: Manupa Karunaratne <manupa.karunaratne@arm.com>
Date: Thu, 28 Jan 2021 16:37:19 +0000
Subject: [PATCH 129/357] [Relay][PatternLang] Bug fix of rewrite func attr
 (#7358)

When using pattern with attr of functions, such attrs
mostly does not exist for op node. Therefore, hasattr
check has to be done for op nodes.

Change-Id: Ia313ab34be95ccc793c32fd8e5e5ef566b78685b
---
 src/relay/ir/dataflow_matcher.cc            |  8 ++++---
 tests/python/relay/test_dataflow_pattern.py | 23 +++++++++++++++++++++
 2 files changed, 28 insertions(+), 3 deletions(-)

diff --git a/src/relay/ir/dataflow_matcher.cc b/src/relay/ir/dataflow_matcher.cc
index cfacd41487c8..a43f50f600df 100644
--- a/src/relay/ir/dataflow_matcher.cc
+++ b/src/relay/ir/dataflow_matcher.cc
@@ -159,9 +159,11 @@ bool DFPatternMatcher::VisitDFPattern_(const AttrPatternNode* attr_pattern, cons
     for (auto kv : attributes) {
       auto attr_name = kv.first;
       auto attr_value = kv.second;
-      auto op_map = Op::GetAttrMap<TVMRetValue>(attr_name);
-      if (op_map.count(op)) {
-        matches = MatchRetValue(attr_value, op_map[op]);
+      if (Op::HasAttrMap(attr_name)) {
+        auto op_map = Op::GetAttrMap<TVMRetValue>(attr_name);
+        if (op_map.count(op)) {
+          matches = MatchRetValue(attr_value, op_map[op]);
+        }
       }
     }
   } else if (auto* op = expr.as<CallNode>()) {
diff --git a/tests/python/relay/test_dataflow_pattern.py b/tests/python/relay/test_dataflow_pattern.py
index 15d3ee035450..b39c03a6160e 100644
--- a/tests/python/relay/test_dataflow_pattern.py
+++ b/tests/python/relay/test_dataflow_pattern.py
@@ -786,6 +786,29 @@ def callback(self, pre, post, node_map):
     assert sub_pattern.match(out)
 
 
+def test_rewrite_func_with_attr():
+    x = relay.var("x")
+    y = relay.var("y")
+    f = relay.Function([x, y], x + y).with_attr("Composite", "add")
+
+    a = relay.var("a")
+    b = relay.var("b")
+    c = relay.Call(f, [a, b])
+    c_abs = relay.abs(c)
+
+    class TestRewrite(DFPatternCallback):
+        def __init__(self):
+            super(TestRewrite, self).__init__()
+            self.pattern = wildcard().has_attr({"Composite": "add"})(wildcard(), wildcard())
+
+        def callback(self, pre, post, node_map):
+            return post.args[0] + post.args[1]
+
+    out = rewrite(TestRewrite(), c_abs)
+    inlined_add_pattern = is_op("abs")(is_op("add")(wildcard(), wildcard()))
+    assert inlined_add_pattern.match(out)
+
+
 def test_nested_rewrite():
     class PatternCallback(DFPatternCallback):
         def __init__(self, pattern):

From f17cba780d2d513ce4593327b4108b527243f7bf Mon Sep 17 00:00:00 2001
From: Tristan Konolige <tristan.konolige@gmail.com>
Date: Thu, 28 Jan 2021 13:23:42 -0800
Subject: [PATCH 130/357] [RUNTIME] Improve error messages for TypedPackedFunc
 (#7152)

* [RUNTIME] Improve error messages for TypedPackedFunc

- TypedPackedFunc now prints the function name when the incorrect number
  of arguments is passed.
- TypedPackedFunc now prints the function name and which argument when
  an argument cannot be converted to the correct type.

* check argument conversion by template deducing argument types

* switch from template approach to TVMMovableArgValueWithContext

* move passes back into cc files

* remove error message prefixes

* Remove TVM_ICHECK_TYPE_CODE. Rename name to optional_name.

* revert changes to module pass for later PR

* reverted too much

* documentation

* formatting

* more docs

* unify error message language. TypedPackedFunc contrustor that does not take a name

* Update include/tvm/runtime/packed_func.h

Co-authored-by: Junru Shao <junrushao1994@gmail.com>

Co-authored-by: Junru Shao <junrushao1994@gmail.com>
---
 include/tvm/runtime/packed_func.h | 153 ++++++++++++++++++++++++------
 include/tvm/runtime/registry.h    |  10 +-
 src/relay/op/nn/nn.cc             |  15 +--
 src/relay/op/nn/sparse.cc         |  10 +-
 src/relay/op/tensor/reduce.cc     |   4 +-
 src/relay/op/tensor/transform.cc  |   4 +-
 6 files changed, 137 insertions(+), 59 deletions(-)

diff --git a/include/tvm/runtime/packed_func.h b/include/tvm/runtime/packed_func.h
index fd4e2114b11a..87606f3f738c 100644
--- a/include/tvm/runtime/packed_func.h
+++ b/include/tvm/runtime/packed_func.h
@@ -60,7 +60,7 @@ namespace runtime {
 // forward declarations
 class TVMArgs;
 class TVMArgValue;
-class TVMMovableArgValue_;
+class TVMMovableArgValueWithContext_;
 class TVMRetValue;
 class TVMArgsSetter;
 
@@ -215,7 +215,7 @@ class TypedPackedFunc<R(Args...)> {
    * \brief constructor from TVMMovableArgValue_
    * \param value The TVMMovableArgValue_
    */
-  inline TypedPackedFunc(TVMMovableArgValue_&& value);  // NOLINT(*)
+  inline TypedPackedFunc(TVMMovableArgValueWithContext_&& value);  // NOLINT(*)
   /*!
    * \brief construct from a lambda function with the same signature.
    *
@@ -223,6 +223,30 @@ class TypedPackedFunc<R(Args...)> {
    * \code
    * auto typed_lambda = [](int x)->int { return x + 1; }
    * // construct from packed function
+   * TypedPackedFunc<int(int)> ftyped(typed_lambda, "add_one");
+   * // call the typed version.
+   * ICHECK_EQ(ftyped(1), 2);
+   * \endcode
+   *
+   * \param typed_lambda typed lambda function.
+   * \param name the name of the lambda function.
+   * \tparam FLambda the type of the lambda function.
+   */
+  template <typename FLambda, typename = typename std::enable_if<std::is_convertible<
+                                  FLambda, std::function<R(Args...)>>::value>::type>
+  TypedPackedFunc(const FLambda& typed_lambda, std::string name) {  // NOLINT(*)
+    this->AssignTypedLambda(typed_lambda, name);
+  }
+  /*!
+   * \brief construct from a lambda function with the same signature.
+   *
+   * This version does not take a name. It is highly recommend you use the
+   * version that takes a name for the lambda.
+   *
+   * Example usage:
+   * \code
+   * auto typed_lambda = [](int x)->int { return x + 1; }
+   * // construct from packed function
    * TypedPackedFunc<int(int)> ftyped(typed_lambda);
    * // call the typed version.
    * ICHECK_EQ(ftyped(1), 2);
@@ -231,9 +255,8 @@ class TypedPackedFunc<R(Args...)> {
    * \param typed_lambda typed lambda function.
    * \tparam FLambda the type of the lambda function.
    */
-  template <typename FLambda, typename = typename std::enable_if<
-                                  std::is_convertible<FLambda,
-                                                      std::function<R(Args...)>>::value>::type>
+  template <typename FLambda, typename = typename std::enable_if<std::is_convertible<
+                                  FLambda, std::function<R(Args...)>>::value>::type>
   TypedPackedFunc(const FLambda& typed_lambda) {  // NOLINT(*)
     this->AssignTypedLambda(typed_lambda);
   }
@@ -297,6 +320,17 @@ class TypedPackedFunc<R(Args...)> {
    * \brief Assign the packed field using a typed lambda function.
    *
    * \param flambda The lambda function.
+   * \param name The name associated with this lambda.
+   * \tparam FLambda The lambda function type.
+   * \note We capture the lambda when possible for maximum efficiency.
+   */
+  template <typename FLambda>
+  inline void AssignTypedLambda(FLambda flambda, std::string name);
+  /*!
+   * \brief Assign the packed field using a typed lambda function. This variant is for functions
+   * without names.
+   *
+   * \param flambda The lambda function.
    * \tparam FLambda The lambda function type.
    * \note We capture the lambda when possible for maximum efficiency.
    */
@@ -337,7 +371,7 @@ inline const char* ArgTypeCode2Str(int type_code);
 
 // macro to check type code.
 #define TVM_CHECK_TYPE_CODE(CODE, T) \
-  ICHECK_EQ(CODE, T) << " expected " << ArgTypeCode2Str(T) << " but get " << ArgTypeCode2Str(CODE)
+  ICHECK_EQ(CODE, T) << "expected " << ArgTypeCode2Str(T) << " but got " << ArgTypeCode2Str(CODE)
 
 /*!
  * \brief Type traits for runtime type check during FFI conversion.
@@ -401,8 +435,8 @@ class TVMPODValue_ {
       return static_cast<DLTensor*>(value_.v_handle);
     } else {
       if (type_code_ == kTVMNullptr) return nullptr;
-      LOG(FATAL) << "Expect "
-                 << "DLTensor* or NDArray but get " << ArgTypeCode2Str(type_code_);
+      LOG(FATAL) << "Expected "
+                 << "DLTensor* or NDArray but got " << ArgTypeCode2Str(type_code_);
       return nullptr;
     }
   }
@@ -442,6 +476,7 @@ class TVMPODValue_ {
  protected:
   friend class TVMArgsSetter;
   friend class TVMRetValue;
+  friend class TVMMovableArgValue_;
   TVMPODValue_() : type_code_(kTVMNullptr) {}
   TVMPODValue_(TVMValue value, int type_code) : value_(value), type_code_(type_code) {}
 
@@ -562,6 +597,44 @@ class TVMMovableArgValue_ : public TVMPODValue_ {
   TVMArgValue AsArgValue() const { return TVMArgValue(value_, type_code_); }
 };
 
+/*!
+ * \brief Internal auxiliary struct for TypedPackedFunc to indicate a movable argument with
+ * additional context information (function name and argument index) for better error reporting.
+ *
+ * \sa MovableArgValue_
+ * \note For internal development purpose only.
+ */
+class TVMMovableArgValueWithContext_ {
+ public:
+  /*!
+   * \brief move constructor from another return value.
+   * \param value The other return value.
+   * \param type_code The code associated with the type of the value.
+   * \param arg_index In a function call, this argument is at index arg_index (0-indexed).
+   * \param optional_name Name of the function being called. Can be nullptr if the function is not
+   * named.
+   */
+  TVMMovableArgValueWithContext_(TVMValue value, int type_code, int arg_index,
+                                 const std::string* optional_name)
+      : value_(value, type_code), arg_index_(arg_index), optional_name_(optional_name) {}
+
+  template <typename T>
+  operator T() const {
+    try {
+      return value_;  // implicit conversion happens here
+    } catch (dmlc::Error& e) {
+      LOG(FATAL) << "In function " << (optional_name_ == nullptr ? "<anonymous>" : *optional_name_)
+                 << ": error while converting argument " << arg_index_ << ": " << e.what();
+      throw;  // never reached, LOG(FATAL) throws, but this silences a warning.
+    }
+  }
+
+ private:
+  TVMMovableArgValue_ value_;
+  int arg_index_;
+  const std::string* optional_name_;
+};
+
 /*!
  * \brief Return Value container,
  *  Unlike TVMArgValue, which only holds reference and do not delete
@@ -1213,20 +1286,23 @@ namespace detail {
 template <typename R, int nleft, int index, typename F>
 struct unpack_call_dispatcher {
   template <typename... Args>
-  TVM_ALWAYS_INLINE static void run(const F& f, const TVMArgs& args_pack, TVMRetValue* rv,
+  TVM_ALWAYS_INLINE static void run(const std::string* optional_name, const F& f,
+                                    const TVMArgs& args_pack, TVMRetValue* rv,
                                     Args&&... unpacked_args) {
     // construct a movable argument value
     // which allows potential move of argument to the input of F.
     unpack_call_dispatcher<R, nleft - 1, index + 1, F>::run(
-        f, args_pack, rv, std::forward<Args>(unpacked_args)...,
-        TVMMovableArgValue_(args_pack.values[index], args_pack.type_codes[index]));
+        optional_name, f, args_pack, rv, std::forward<Args>(unpacked_args)...,
+        TVMMovableArgValueWithContext_(args_pack.values[index], args_pack.type_codes[index], index,
+                                       optional_name));
   }
 };
 
 template <typename R, int index, typename F>
 struct unpack_call_dispatcher<R, 0, index, F> {
   template <typename... Args>
-  TVM_ALWAYS_INLINE static void run(const F& f, const TVMArgs& args_pack, TVMRetValue* rv,
+  TVM_ALWAYS_INLINE static void run(const std::string* optional_name, const F& f,
+                                    const TVMArgs& args_pack, TVMRetValue* rv,
                                     Args&&... unpacked_args) {
     using RetType = decltype(f(std::forward<Args>(unpacked_args)...));
     if (std::is_same<RetType, R>::value) {
@@ -1240,16 +1316,21 @@ struct unpack_call_dispatcher<R, 0, index, F> {
 template <int index, typename F>
 struct unpack_call_dispatcher<void, 0, index, F> {
   template <typename... Args>
-  TVM_ALWAYS_INLINE static void run(const F& f, const TVMArgs& args_pack, TVMRetValue* rv,
+  TVM_ALWAYS_INLINE static void run(const std::string* optional_name, const F& f,
+                                    const TVMArgs& args_pack, TVMRetValue* rv,
                                     Args&&... unpacked_args) {
     f(std::forward<Args>(unpacked_args)...);
   }
 };
 
 template <typename R, int nargs, typename F>
-TVM_ALWAYS_INLINE void unpack_call(const F& f, const TVMArgs& args, TVMRetValue* rv) {
-  ICHECK_EQ(nargs, args.size()) << "Expect " << nargs << " arguments but get " << args.size();
-  unpack_call_dispatcher<R, nargs, 0, F>::run(f, args, rv);
+TVM_ALWAYS_INLINE void unpack_call(const std::string* optional_name, const F& f,
+                                   const TVMArgs& args, TVMRetValue* rv) {
+  CHECK_EQ(nargs, args.size()) << "Function "
+                               << (optional_name == nullptr ? "<anonymous>" : *optional_name)
+                               << " expects " << nargs << " arguments but " << args.size()
+                               << " were provided";
+  unpack_call_dispatcher<R, nargs, 0, F>::run(optional_name, f, args, rv);
 }
 
 template <typename FType>
@@ -1259,7 +1340,7 @@ template <typename R, typename... Args>
 struct unpack_call_by_signature<R(Args...)> {
   template <typename F>
   TVM_ALWAYS_INLINE static void run(const F& f, const TVMArgs& args, TVMRetValue* rv) {
-    unpack_call<R, sizeof...(Args)>(f, args, rv);
+    unpack_call<R, sizeof...(Args)>(nullptr, f, args, rv);
   }
 };
 
@@ -1297,14 +1378,30 @@ TypedPackedFunc<R(Args...)>::TypedPackedFunc(const TVMArgValue& value)
     : packed_(value.operator PackedFunc()) {}
 
 template <typename R, typename... Args>
-TypedPackedFunc<R(Args...)>::TypedPackedFunc(TVMMovableArgValue_&& value)
+TypedPackedFunc<R(Args...)>::TypedPackedFunc(TVMMovableArgValueWithContext_&& value)
     : packed_(value.operator PackedFunc()) {}
 
+template <typename R, typename... Args>
+template <typename FType>
+inline void TypedPackedFunc<R(Args...)>::AssignTypedLambda(FType flambda, std::string name) {
+  packed_ = PackedFunc([flambda, name](const TVMArgs& args, TVMRetValue* rv) {
+    if (args.size() != sizeof...(Args)) {
+      LOG(FATAL) << "Function " << name << " expects " << sizeof...(Args) << " arguments, but "
+                 << args.size() << " were provided.";
+    }
+    detail::unpack_call<R, sizeof...(Args)>(&name, flambda, args, rv);
+  });
+}
+
 template <typename R, typename... Args>
 template <typename FType>
 inline void TypedPackedFunc<R(Args...)>::AssignTypedLambda(FType flambda) {
   packed_ = PackedFunc([flambda](const TVMArgs& args, TVMRetValue* rv) {
-    detail::unpack_call<R, sizeof...(Args)>(flambda, args, rv);
+    if (args.size() != sizeof...(Args)) {
+      LOG(FATAL) << "Function <anonymous> expects " << sizeof...(Args) << " arguments, but "
+                 << args.size() << " were provided.";
+    }
+    detail::unpack_call<R, sizeof...(Args)>(nullptr, flambda, args, rv);
   });
 }
 
@@ -1377,7 +1474,7 @@ inline TObjectRef TVMPODValue_::AsObjectRef() const {
   using ContainerType = typename TObjectRef::ContainerType;
 
   if (type_code_ == kTVMNullptr) {
-    ICHECK(TObjectRef::_type_is_nullable)
+    CHECK(TObjectRef::_type_is_nullable)
         << "Expect a not null value of " << ContainerType::_type_key;
     return TObjectRef(ObjectPtr<Object>(nullptr));
   }
@@ -1387,29 +1484,29 @@ inline TObjectRef TVMPODValue_::AsObjectRef() const {
     TVM_CHECK_TYPE_CODE(type_code_, kTVMNDArrayHandle);
     ObjectPtr<Object> data =
         NDArray::FFIDataFromHandle(static_cast<TVMArrayHandle>(value_.v_handle));
-    ICHECK(data->IsInstance<ContainerType>())
-        << "Expect " << ContainerType::_type_key << " but get " << data->GetTypeKey();
+    CHECK(data->IsInstance<ContainerType>())
+        << "Expected " << ContainerType::_type_key << " but got " << data->GetTypeKey();
     return TObjectRef(data);
   }
   if (std::is_base_of<Module::ContainerType, ContainerType>::value) {
     // Casting to a sub-class of Module
     TVM_CHECK_TYPE_CODE(type_code_, kTVMModuleHandle);
     ObjectPtr<Object> data = GetObjectPtr<Object>(static_cast<Object*>(value_.v_handle));
-    ICHECK(data->IsInstance<ContainerType>())
-        << "Expect " << ContainerType::_type_key << " but get " << data->GetTypeKey();
+    CHECK(data->IsInstance<ContainerType>())
+        << "Expected " << ContainerType::_type_key << " but got " << data->GetTypeKey();
     return TObjectRef(data);
   }
   if (type_code_ == kTVMObjectHandle) {
     // normal object type check.
     Object* ptr = static_cast<Object*>(value_.v_handle);
-    ICHECK(ObjectTypeChecker<TObjectRef>::Check(ptr))
-        << "Expect " << ObjectTypeChecker<TObjectRef>::TypeName() << " but get "
+    CHECK(ObjectTypeChecker<TObjectRef>::Check(ptr))
+        << "Expected " << ObjectTypeChecker<TObjectRef>::TypeName() << " but got "
         << ptr->GetTypeKey();
     return TObjectRef(GetObjectPtr<Object>(ptr));
   } else if (type_code_ == kTVMObjectRValueRefArg) {
     Object* ptr = *static_cast<Object**>(value_.v_handle);
-    ICHECK(ObjectTypeChecker<TObjectRef>::Check(ptr))
-        << "Expect " << ObjectTypeChecker<TObjectRef>::TypeName() << " but get "
+    CHECK(ObjectTypeChecker<TObjectRef>::Check(ptr))
+        << "Expected " << ObjectTypeChecker<TObjectRef>::TypeName() << " but got "
         << ptr->GetTypeKey();
     return TObjectRef(GetObjectPtr<Object>(ptr));
   } else if (std::is_base_of<ContainerType, NDArray::ContainerType>::value &&
diff --git a/include/tvm/runtime/registry.h b/include/tvm/runtime/registry.h
index 86e3706b2058..859a8ace1abe 100644
--- a/include/tvm/runtime/registry.h
+++ b/include/tvm/runtime/registry.h
@@ -93,7 +93,7 @@ class Registry {
   template <typename FLambda>
   Registry& set_body_typed(FLambda f) {
     using FType = typename detail::function_signature<FLambda>::FType;
-    return set_body(TypedPackedFunc<FType>(std::move(f)).packed());
+    return set_body(TypedPackedFunc<FType>(std::move(f), name_).packed());
   }
   /*!
    * \brief set the body of the function to be the passed method pointer.
@@ -122,7 +122,7 @@ class Registry {
       // call method pointer
       return (target.*f)(params...);
     };
-    return set_body(TypedPackedFunc<R(T, Args...)>(fwrap));
+    return set_body(TypedPackedFunc<R(T, Args...)>(fwrap, name_));
   }
 
   /*!
@@ -152,7 +152,7 @@ class Registry {
       // call method pointer
       return (target.*f)(params...);
     };
-    return set_body(TypedPackedFunc<R(const T, Args...)>(fwrap));
+    return set_body(TypedPackedFunc<R(const T, Args...)>(fwrap, name_));
   }
 
   /*!
@@ -194,7 +194,7 @@ class Registry {
       // call method pointer
       return (target->*f)(params...);
     };
-    return set_body(TypedPackedFunc<R(TObjectRef, Args...)>(fwrap));
+    return set_body(TypedPackedFunc<R(TObjectRef, Args...)>(fwrap, name_));
   }
 
   /*!
@@ -236,7 +236,7 @@ class Registry {
       // call method pointer
       return (target->*f)(params...);
     };
-    return set_body(TypedPackedFunc<R(TObjectRef, Args...)>(fwrap));
+    return set_body(TypedPackedFunc<R(TObjectRef, Args...)>(fwrap, name_));
   }
 
   /*!
diff --git a/src/relay/op/nn/nn.cc b/src/relay/op/nn/nn.cc
index ce622429bdb9..8ace82be9ff8 100644
--- a/src/relay/op/nn/nn.cc
+++ b/src/relay/op/nn/nn.cc
@@ -718,10 +718,7 @@ Expr MakeInstanceNorm(Expr data, Expr gamma, Expr beta, int axis, double epsilon
   return Call(op, {data, gamma, beta}, Attrs(attrs), {});
 }
 
-TVM_REGISTER_GLOBAL("relay.op.nn._make.instance_norm")
-    .set_body([](const TVMArgs& args, TVMRetValue* rv) {
-      runtime::detail::unpack_call<Expr, 7>(MakeInstanceNorm, args, rv);
-    });
+TVM_REGISTER_GLOBAL("relay.op.nn._make.instance_norm").set_body_typed(MakeInstanceNorm);
 
 RELAY_REGISTER_OP("nn.instance_norm")
     .describe(R"code(Instance Normalization (Ulyanov and et al., 2016)
@@ -785,10 +782,7 @@ Expr MakeLayerNorm(Expr data, Expr gamma, Expr beta, int axis, double epsilon, b
   return Call(op, {data, gamma, beta}, Attrs(attrs), {});
 }
 
-TVM_REGISTER_GLOBAL("relay.op.nn._make.layer_norm")
-    .set_body([](const TVMArgs& args, TVMRetValue* rv) {
-      runtime::detail::unpack_call<Expr, 7>(MakeLayerNorm, args, rv);
-    });
+TVM_REGISTER_GLOBAL("relay.op.nn._make.layer_norm").set_body_typed(MakeLayerNorm);
 
 RELAY_REGISTER_OP("nn.layer_norm")
     .describe(R"code(
@@ -831,10 +825,7 @@ Expr MakeGroupNorm(Expr data, Expr gamma, Expr beta, int num_groups, int axis, d
   return Call(op, {data, gamma, beta}, Attrs(attrs), {});
 }
 
-TVM_REGISTER_GLOBAL("relay.op.nn._make.group_norm")
-    .set_body([](const TVMArgs& args, TVMRetValue* rv) {
-      runtime::detail::unpack_call<Expr, 8>(MakeGroupNorm, args, rv);
-    });
+TVM_REGISTER_GLOBAL("relay.op.nn._make.group_norm").set_body_typed(MakeGroupNorm);
 
 RELAY_REGISTER_OP("nn.group_norm")
     .describe(R"code(
diff --git a/src/relay/op/nn/sparse.cc b/src/relay/op/nn/sparse.cc
index e9073730641d..6322cfffd7c2 100644
--- a/src/relay/op/nn/sparse.cc
+++ b/src/relay/op/nn/sparse.cc
@@ -101,10 +101,7 @@ Expr MakeSparseDense(Expr data, Expr weight_data, Expr weight_indices, Expr weig
   return Call(op, {data, weight_data, weight_indices, weight_indptr}, Attrs(attrs), {});
 }
 
-TVM_REGISTER_GLOBAL("relay.op.nn._make.sparse_dense")
-    .set_body([](const TVMArgs& args, TVMRetValue* rv) {
-      runtime::detail::unpack_call<Expr, 5>(MakeSparseDense, args, rv);
-    });
+TVM_REGISTER_GLOBAL("relay.op.nn._make.sparse_dense").set_body_typed(MakeSparseDense);
 
 RELAY_REGISTER_OP("nn.sparse_dense")
     .describe(
@@ -130,10 +127,7 @@ Expr MakeSparseDensePadded(Expr data, Expr weight_data, Expr weight_indices, Exp
   return Call(op, {data, weight_data, weight_indices, weight_indptr}, Attrs(attrs), {});
 }
 
-TVM_REGISTER_GLOBAL("relay.op.nn._make.sparse_dense_padded")
-    .set_body([](const TVMArgs& args, TVMRetValue* rv) {
-      runtime::detail::unpack_call<Expr, 4>(MakeSparseDensePadded, args, rv);
-    });
+TVM_REGISTER_GLOBAL("relay.op.nn._make.sparse_dense_padded").set_body_typed(MakeSparseDensePadded);
 
 RELAY_REGISTER_OP("nn.internal.sparse_dense_padded")
     .describe(
diff --git a/src/relay/op/tensor/reduce.cc b/src/relay/op/tensor/reduce.cc
index f611dc2eefd2..0b198005001b 100644
--- a/src/relay/op/tensor/reduce.cc
+++ b/src/relay/op/tensor/reduce.cc
@@ -595,9 +595,7 @@ Expr MakeVariance(Expr data, Expr mean, Array<Integer> axis, bool keepdims, bool
   return Call(op, {data, mean}, Attrs(attrs), {});
 }
 
-TVM_REGISTER_GLOBAL("relay.op._make._variance").set_body([](const TVMArgs& args, TVMRetValue* rv) {
-  runtime::detail::unpack_call<Expr, 6>(MakeVariance, args, rv);
-});
+TVM_REGISTER_GLOBAL("relay.op._make._variance").set_body_typed(MakeVariance);
 
 RELAY_REGISTER_OP("variance")
     .describe(R"code(Computes the variance of array elements over given axes.
diff --git a/src/relay/op/tensor/transform.cc b/src/relay/op/tensor/transform.cc
index 0e868cdc50c9..d44bfe6959ca 100644
--- a/src/relay/op/tensor/transform.cc
+++ b/src/relay/op/tensor/transform.cc
@@ -157,9 +157,7 @@ Expr MakeReinterpret(Expr data, DataType dtype) {
   return Call(op, {data}, Attrs(attrs), {});
 }
 
-TVM_REGISTER_GLOBAL("relay._make.reinterpret").set_body([](const TVMArgs& args, TVMRetValue* rv) {
-  runtime::detail::unpack_call<Expr, 2>(MakeReinterpret, args, rv);
-});
+TVM_REGISTER_GLOBAL("relay._make.reinterpret").set_body_typed(MakeReinterpret);
 
 RELAY_REGISTER_OP("reinterpret")
     .describe(R"code(Reinterpret the data into a new data type.

From b8ad146dfd00710376e9477dd2367cc94399d9bb Mon Sep 17 00:00:00 2001
From: Matthew Brookhart <mbrookhart@octoml.ai>
Date: Thu, 28 Jan 2021 15:06:12 -0700
Subject: [PATCH 131/357] [Relay] Type Relation Fixes (#7362)

* fix an error in the dynamic Full Type Relation

* Add Diagnostic Errors to Broadcast Type Relations
---
 src/relay/op/dyn/tensor/transform.cc |  3 +++
 src/relay/op/type_relations.cc       | 12 ++++++++++--
 2 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/src/relay/op/dyn/tensor/transform.cc b/src/relay/op/dyn/tensor/transform.cc
index e4e81e3612fb..8bad3943f5ce 100644
--- a/src/relay/op/dyn/tensor/transform.cc
+++ b/src/relay/op/dyn/tensor/transform.cc
@@ -400,6 +400,9 @@ bool FullRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
   if (fill_value == nullptr) {
     return false;
   }
+  if (fill_shape == nullptr) {
+    return false;
+  }
 
   DataType out_dtype = param->dtype;
   if (out_dtype.bits() == 0) {
diff --git a/src/relay/op/type_relations.cc b/src/relay/op/type_relations.cc
index 7a3bfcb21ce6..7b30aea2eb57 100644
--- a/src/relay/op/type_relations.cc
+++ b/src/relay/op/type_relations.cc
@@ -104,7 +104,11 @@ bool BroadcastRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
   //                 << ",Out:" << types[2] << std::endl;
   if (auto* t0 = types[0].as<TensorTypeNode>()) {
     if (auto* t1 = types[1].as<TensorTypeNode>()) {
-      ICHECK_EQ(t0->dtype, t1->dtype);
+      if (t0->dtype != t1->dtype) {
+        reporter->GetDiagCtx().Emit(Diagnostic::Error(t0->span)
+                                    << "data types " << t0->dtype << " and " << t1->dtype
+                                    << "do not match in BroadcastRel");
+      }
       reporter->Assign(
           types[2], ConcreteBroadcast(GetRef<TensorType>(t0), GetRef<TensorType>(t1), t0->dtype));
       return true;
@@ -120,7 +124,11 @@ bool BroadcastCompRel(const Array<Type>& types, int num_inputs, const Attrs& att
   //                 << ",Out:" << types[2] << std::endl;
   if (auto* t0 = types[0].as<TensorTypeNode>()) {
     if (auto* t1 = types[1].as<TensorTypeNode>()) {
-      ICHECK_EQ(t0->dtype, t1->dtype);
+      if (t0->dtype != t1->dtype) {
+        reporter->GetDiagCtx().Emit(Diagnostic::Error(t0->span)
+                                    << "data types " << t0->dtype << " and " << t1->dtype
+                                    << "do not match in BroadcastCompRel");
+      }
       reporter->Assign(types[2], ConcreteBroadcast(GetRef<TensorType>(t0), GetRef<TensorType>(t1),
                                                    DataType::Bool()));
       return true;

From ef032b3b30cb05f4fbf30f0c9e20869904a1cdc6 Mon Sep 17 00:00:00 2001
From: Matthew Brookhart <mbrookhart@octoml.ai>
Date: Thu, 28 Jan 2021 17:25:19 -0700
Subject: [PATCH 132/357] Remove MemoryPlan from VM passes (#7361)

---
 src/relay/backend/vm/compiler.cc    | 7 +++++--
 src/relay/backend/vm/lambda_lift.cc | 1 -
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/relay/backend/vm/compiler.cc b/src/relay/backend/vm/compiler.cc
index 8fbe31edce3d..d908153f88cb 100644
--- a/src/relay/backend/vm/compiler.cc
+++ b/src/relay/backend/vm/compiler.cc
@@ -985,8 +985,11 @@ transform::Sequential MemoryOpt(tvm::Target host_target, TargetsMap targets) {
   // Fuse the shape functions.
   pass_seqs.push_back(transform::FuseOps());
 
-  // Perform memory planning in order to coalesce/reduce allocations.
-  pass_seqs.push_back(transform::MemoryPlan());
+  // TODO(mbrookhart, jroesch, masahi): this pass is very slow, and is
+  // incomplete to provide memory resuse optimizations. Disable it until we can
+  // rewrite it in C++ and complete it.
+  // // Perform memory planning in order to coalesce/reduce allocations.
+  // pass_seqs.push_back(transform::MemoryPlan());
 
   // Compute away constant computation introduced by coalescing allocations.
   pass_seqs.push_back(transform::FoldConstant());
diff --git a/src/relay/backend/vm/lambda_lift.cc b/src/relay/backend/vm/lambda_lift.cc
index 8e9cc625063b..fe9a544a719e 100644
--- a/src/relay/backend/vm/lambda_lift.cc
+++ b/src/relay/backend/vm/lambda_lift.cc
@@ -192,7 +192,6 @@ class LambdaLifter : public ExprMutator {
       global = module_->GetGlobalVar(name);
     } else {
       // Add the lifted function to the module.
-      std::cout << AsText(lifted_func) << std::endl;
       module_->Add(global, lifted_func);
     }
 

From f7275f9dc1ffb98e7348bdb977068974fde989cd Mon Sep 17 00:00:00 2001
From: "Matt Welsh (OctoML)" <63477620+mdw-octoml@users.noreply.github.com>
Date: Thu, 28 Jan 2021 21:08:05 -0800
Subject: [PATCH 133/357] Some docstring fixes. (#7367)

---
 python/tvm/contrib/graph_runtime.py |  4 ++--
 python/tvm/relay/build_module.py    | 10 ++--------
 2 files changed, 4 insertions(+), 10 deletions(-)

diff --git a/python/tvm/contrib/graph_runtime.py b/python/tvm/contrib/graph_runtime.py
index a960e552f68f..59db716e917c 100644
--- a/python/tvm/contrib/graph_runtime.py
+++ b/python/tvm/contrib/graph_runtime.py
@@ -141,11 +141,11 @@ class GraphModule(object):
         lib = relay.build(...)
         lib.export_library("compiled_lib.so")
         # load it back as a runtime
-        lib:tvm.runtime.Module = tvm.runtime.load_module("compiled_lib.so")
+        lib: tvm.runtime.Module = tvm.runtime.load_module("compiled_lib.so")
         # Call the library factory function for default and create
         # a new runtime.Module, wrap with graph module.
         gmod = graph_runtime.GraphModule(lib["default"](ctx))
-        # use the gmod
+        # use the graph module.
         gmod.set_input("x", data)
         gmod.run()
     """
diff --git a/python/tvm/relay/build_module.py b/python/tvm/relay/build_module.py
index 20cdc24ebc69..f05e105ed2a2 100644
--- a/python/tvm/relay/build_module.py
+++ b/python/tvm/relay/build_module.py
@@ -110,14 +110,8 @@ def build(self, mod, target=None, target_host=None, params=None):
 
         Returns
         -------
-        graph_json : str
-            The json string that can be accepted by graph runtime.
-
-        mod : tvm.Module
-            The module containing necessary libraries.
-
-        params : dict
-            The parameters of the final graph.
+        factory_module : tvm.relay.backend.graph_runtime_factory.GraphRuntimeFactoryModule
+            The runtime factory for the TVM graph runtime.
         """
         target = _update_target(target)
 

From f7862e7d157c661c4f3405f20f5b3900414e7f76 Mon Sep 17 00:00:00 2001
From: Josh Fromm <jwfromm@uw.edu>
Date: Thu, 28 Jan 2021 21:27:24 -0800
Subject: [PATCH 134/357] [Relay][Frontend[Onnx] Add testing for output
 datatypes and fix related bugs. (#7364)

* Add testing for datatypes and fix related bugs.

* Fix lint issue in onnx.
---
 python/tvm/relay/frontend/onnx.py          | 6 +++---
 tests/python/frontend/onnx/test_forward.py | 1 +
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/python/tvm/relay/frontend/onnx.py b/python/tvm/relay/frontend/onnx.py
index b1b01b87f715..897c6a022594 100644
--- a/python/tvm/relay/frontend/onnx.py
+++ b/python/tvm/relay/frontend/onnx.py
@@ -1451,7 +1451,7 @@ def _impl_v1(cls, inputs, attr, params):
         axis = attr.get("axis", 0)
         keepdims = attr.get("keepdims", True)
         attr = {"axis": axis, "keepdims": keepdims}
-        return AttrCvt("argmax")(inputs, attr)
+        return _op.cast(AttrCvt("argmax")(inputs, attr), "int64")
 
 
 class ArgMin(OnnxOpConverter):
@@ -1462,7 +1462,7 @@ def _impl_v1(cls, inputs, attr, params):
         axis = attr.get("axis", 0)
         keepdims = attr.get("keepdims", True)
         attr = {"axis": axis, "keepdims": keepdims}
-        return AttrCvt("argmin")(inputs, attr)
+        return _op.cast(AttrCvt("argmin")(inputs, attr), "int64")
 
 
 class Softmax(OnnxOpConverter):
@@ -2000,7 +2000,7 @@ def _impl_v1(cls, inputs, attr, params):
         if largest == 0:
             raise ValueError("TVM only supports finding TopK largest elements")
 
-        return _op.topk(inputs[0], inputs[1], axis=axis)
+        return _op.topk(inputs[0], inputs[1], axis=axis, dtype="int64")
 
 
 class Range(OnnxOpConverter):
diff --git a/tests/python/frontend/onnx/test_forward.py b/tests/python/frontend/onnx/test_forward.py
index c666604d0e89..56d1dd5a5265 100644
--- a/tests/python/frontend/onnx/test_forward.py
+++ b/tests/python/frontend/onnx/test_forward.py
@@ -163,6 +163,7 @@ def verify_with_ort_with_inputs(
                 ort_val = scipy.special.softmax(ort_val)
                 tvm_val = scipy.special.softmax(tvm_val)
             tvm.testing.assert_allclose(ort_val, tvm_val, rtol=rtol, atol=atol)
+            assert ort_val.dtype == tvm_val.dtype
 
 
 def verify_with_ort(

From 4f414fddef17fc3c1ce859ab02b1440e5979c231 Mon Sep 17 00:00:00 2001
From: Altan Haan <ahaan@octoml.ai>
Date: Thu, 28 Jan 2021 22:04:52 -0800
Subject: [PATCH 135/357] fix grad for zeros and ones (#7357)

---
 python/tvm/relay/op/_tensor_grad.py       | 22 ++++++++--
 tests/python/relay/test_op_grad_level3.py | 49 ++++++++++++++++++++++-
 2 files changed, 66 insertions(+), 5 deletions(-)

diff --git a/python/tvm/relay/op/_tensor_grad.py b/python/tvm/relay/op/_tensor_grad.py
index 90120d64c2ac..5836aebce393 100644
--- a/python/tvm/relay/op/_tensor_grad.py
+++ b/python/tvm/relay/op/_tensor_grad.py
@@ -238,14 +238,28 @@ def divide_grad(orig, grad):
 
 @register_gradient("zeros")
 def zeros_grad(orig, grad):
-    """Returns [shape]"""
-    return [orig.args[0]]
+    """Returns []"""
+    return []
+
+
+@register_gradient("dyn.zeros")
+def dyn_zeros_grad(orig, grad):
+    """Returns the gradient of dyn.zeros which is just zero."""
+    assert len(orig.args) == 1
+    return [zeros_like(orig.args[0])]
 
 
 @register_gradient("ones")
 def ones_grad(orig, grad):
-    """Returns [shape]"""
-    return [orig.args[0]]
+    """Returns []"""
+    return []
+
+
+@register_gradient("dyn.ones")
+def dyn_ones_grad(orig, grad):
+    """Returns the gradient of dyn.ones which is just zero."""
+    assert len(orig.args) == 1
+    return [zeros_like(orig.args[0])]
 
 
 @register_gradient("zeros_like")
diff --git a/tests/python/relay/test_op_grad_level3.py b/tests/python/relay/test_op_grad_level3.py
index 0c89aa7d2e9a..904576a181f6 100644
--- a/tests/python/relay/test_op_grad_level3.py
+++ b/tests/python/relay/test_op_grad_level3.py
@@ -20,7 +20,7 @@
 import tvm
 from tvm import te
 from tvm import relay
-from tvm.relay.testing import check_grad, run_infer_type, _np_randn_from_type
+from tvm.relay.testing import check_grad, run_infer_type, run_opt_pass, _np_randn_from_type
 from tvm.relay.transform import gradient
 import tvm.testing
 
@@ -133,5 +133,52 @@ def test_reshape_like_grad():
     check_grad(fwd_func)
 
 
+def test_zeros_ones_grad_const_ints():
+    # when shape is static (i.e. not an input), there is no gradient at all
+    static_ty = relay.TensorType([2, 3, 4], dtype="float32")
+    expected_ty = relay.TupleType([static_ty, relay.TupleType([])])
+
+    for op in [relay.zeros, relay.ones]:
+        fwd_func = relay.Function([], op(static_ty.concrete_shape, static_ty.dtype))
+        bwd_func = run_infer_type(gradient(run_infer_type(fwd_func)))
+        tvm.ir.assert_structural_equal(bwd_func.ret_type, expected_ty)
+
+
+def test_zeros_ones_grad_const_expr():
+    # when shape is static (i.e. not an input), there is no gradient at all
+    shape_const = relay.const(np.array([2, 3, 4]), dtype="int32")
+    static_ty = relay.TensorType([2, 3, 4], dtype="float32")
+    dyn_ty = relay.TensorType([relay.Any(), relay.Any(), relay.Any()], dtype="float32")
+    expected_ty_static = relay.TupleType([static_ty, relay.TupleType([])])
+    expected_ty_dyn = relay.TupleType([dyn_ty, relay.TupleType([])])
+
+    for op in [relay.zeros, relay.ones]:
+        # with DynamicToStatic, the shape should be concretized
+        fwd_func = relay.Function([], op(shape_const, static_ty.dtype))
+        fwd_func = run_opt_pass(fwd_func, relay.transform.DynamicToStatic())
+        bwd_func = run_infer_type(gradient(run_infer_type(fwd_func)))
+        tvm.ir.assert_structural_equal(bwd_func.ret_type, expected_ty_static)
+
+        fwd_func = relay.Function([], op(shape_const, static_ty.dtype))
+        bwd_func = run_infer_type(gradient(run_infer_type(fwd_func)))
+        tvm.ir.assert_structural_equal(bwd_func.ret_type, expected_ty_dyn)
+
+
+def test_zeros_ones_grad_dynamic():
+    rank = np.random.randint(low=1, high=5, dtype="int32")
+    dyn_shape = np.random.randint(low=1, high=4, size=(rank,), dtype="int32")
+    shape_data = relay.var("shape_data", shape=(rank,), dtype="int32")
+
+    for op, op_ref in [(relay.zeros, np.zeros), (relay.ones, np.ones)]:
+        fwd_func = relay.Function([shape_data], op(shape_data, dtype="float32"))
+        bwd_func = run_infer_type(gradient(run_infer_type(fwd_func)))
+
+        for target, ctx in tvm.testing.enabled_targets():
+            intrp = relay.create_executor(ctx=ctx, target=target)
+            res, (grad,) = intrp.evaluate(bwd_func)(dyn_shape)
+            tvm.testing.assert_allclose(res.asnumpy(), op_ref(dyn_shape, dtype="float32"))
+            tvm.testing.assert_allclose(grad.asnumpy(), np.zeros((rank,), dtype="int32"))
+
+
 if __name__ == "__main__":
     pytest.main()

From 4142128e1c2bbc4ddedf295e244b0cdb01ac3046 Mon Sep 17 00:00:00 2001
From: Luis Vega <vegaluisjose@users.noreply.github.com>
Date: Thu, 28 Jan 2021 22:11:51 -0800
Subject: [PATCH 136/357] [BYOC][Verilator] change runtime registry function
 name (#7351)

* use lowercase for verilator runtime registry function

* lint fix

* update comment
---
 src/relay/backend/contrib/verilator/codegen.cc     | 2 +-
 src/runtime/contrib/verilator/verilator_runtime.cc | 5 ++---
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/src/relay/backend/contrib/verilator/codegen.cc b/src/relay/backend/contrib/verilator/codegen.cc
index f1c7f785330e..2f61ae540395 100644
--- a/src/relay/backend/contrib/verilator/codegen.cc
+++ b/src/relay/backend/contrib/verilator/codegen.cc
@@ -115,7 +115,7 @@ runtime::Module VerilatorCompiler(const ObjectRef& ref) {
 
   auto lib_name = cfg.value()->lib;
 
-  const auto* pf = runtime::Registry::Get("runtime.VerilatorJSONRuntimeCreate");
+  const auto* pf = runtime::Registry::Get("runtime.verilator_runtime_create");
   CHECK(pf != nullptr) << "Cannot find JSON runtime module to create";
   auto mod = (*pf)(lib_name, func_name, graph_json, params);
   return mod;
diff --git a/src/runtime/contrib/verilator/verilator_runtime.cc b/src/runtime/contrib/verilator/verilator_runtime.cc
index ae52d9e1e08d..60f36e494da7 100644
--- a/src/runtime/contrib/verilator/verilator_runtime.cc
+++ b/src/runtime/contrib/verilator/verilator_runtime.cc
@@ -143,7 +143,7 @@ class VerilatorJSONRuntime : public JSONRuntimeBase {
   VerilatorHandle device_{nullptr};
   /* The verilator library handle. */
   VerilatorLibrary* lib_{nullptr};
-  /* The verilator add function handle */
+  /* The verilator vadd function handle. */
   VerilatorAddFunc vadd_func_{nullptr};
 };
 
@@ -154,8 +154,7 @@ runtime::Module VerilatorJSONRuntimeCreate(String lib_name, String symbol_name,
   return runtime::Module(n);
 }
 
-TVM_REGISTER_GLOBAL("runtime.VerilatorJSONRuntimeCreate")
-    .set_body_typed(VerilatorJSONRuntimeCreate);
+TVM_REGISTER_GLOBAL("runtime.verilator_runtime_create").set_body_typed(VerilatorJSONRuntimeCreate);
 
 TVM_REGISTER_GLOBAL("runtime.module.loadbinary_verilator_json")
     .set_body_typed(JSONRuntimeBase::LoadFromBinary<VerilatorJSONRuntime>);

From 02c764c72f70d5f08be1385737cea706e872def0 Mon Sep 17 00:00:00 2001
From: masahi <masahi129@gmail.com>
Date: Fri, 29 Jan 2021 16:38:09 +0900
Subject: [PATCH 137/357] disable one of rewrite in torch detection test
 (#7365)

---
 tests/python/frontend/pytorch/test_object_detection.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/python/frontend/pytorch/test_object_detection.py b/tests/python/frontend/pytorch/test_object_detection.py
index fd33dd1da8b1..6b7f9be06d99 100644
--- a/tests/python/frontend/pytorch/test_object_detection.py
+++ b/tests/python/frontend/pytorch/test_object_detection.py
@@ -150,10 +150,10 @@ def compile_and_run_vm(mod, params, data_np, target):
     after = mod["main"]
     assert not tvm.ir.structural_equal(after, before)
 
-    before = mod["main"]
-    mod = rewrite_scatter_to_gather(mod, 4)  # num_scales is 4 for maskrcnn_resnet50_fpn
-    after = mod["main"]
-    assert not tvm.ir.structural_equal(after, before)
+    # before = mod["main"]
+    # mod = rewrite_scatter_to_gather(mod, 4)  # num_scales is 4 for maskrcnn_resnet50_fpn
+    # after = mod["main"]
+    # assert not tvm.ir.structural_equal(after, before)
 
     tvm_res_after_rewrite = compile_and_run_vm(mod, params, data_np, "llvm")
 

From 44a071aa1e9ad11c20fbfcf725ddb6dd8a2823c4 Mon Sep 17 00:00:00 2001
From: Zhi <5145158+zhiics@users.noreply.github.com>
Date: Fri, 29 Jan 2021 07:16:45 -0800
Subject: [PATCH 138/357] [Refactor][VM] Port memory_alloc to c++ (#7369)

* Port memory_alloc to c++

* remove memory python pass
---
 include/tvm/relay/transform.h              |  12 +
 python/tvm/relay/__init__.py               |   1 -
 python/tvm/relay/transform/__init__.py     |   1 -
 python/tvm/relay/transform/memory_alloc.py | 389 ----------------
 src/relay/backend/vm/compiler.cc           |   6 -
 src/relay/transforms/memory_alloc.cc       | 494 +++++++++++++++++++++
 tests/python/relay/test_any.py             |   1 -
 tests/python/relay/test_memory_passes.py   |   1 -
 8 files changed, 506 insertions(+), 399 deletions(-)
 delete mode 100644 python/tvm/relay/transform/memory_alloc.py
 create mode 100644 src/relay/transforms/memory_alloc.cc

diff --git a/include/tvm/relay/transform.h b/include/tvm/relay/transform.h
index e4b39da85206..123b7e395faa 100644
--- a/include/tvm/relay/transform.h
+++ b/include/tvm/relay/transform.h
@@ -31,6 +31,7 @@
 #include <tvm/relay/op.h>
 #include <tvm/relay/op_attr_types.h>
 #include <tvm/runtime/container.h>
+#include <tvm/target/target.h>
 
 #include <string>
 
@@ -419,6 +420,17 @@ TVM_DLL Pass RemoveUnusedFunctions(Array<runtime::String> entry_functions);
  */
 TVM_DLL Pass SimplifyExpr();
 
+/*!
+ * \brief A pass for manifesting explicit memory allocations and rewriting
+ * specific dialects.
+ *
+ * \param target_host The target used by the host for compliation.
+ * \param targets The device type and target pairs for compliation.
+ *
+ * \return The pass.
+ */
+TVM_DLL Pass ManifestAlloc(Target target_host, Map<tvm::Integer, tvm::Target> targets);
+
 }  // namespace transform
 
 /*!
diff --git a/python/tvm/relay/__init__.py b/python/tvm/relay/__init__.py
index 97f6d1cb60c0..89c8fcb17d73 100644
--- a/python/tvm/relay/__init__.py
+++ b/python/tvm/relay/__init__.py
@@ -61,7 +61,6 @@
 from .scope_builder import ScopeBuilder
 
 # Load Memory Passes
-from .transform import memory_alloc
 from .transform import memory_plan
 
 # Required to traverse large programs
diff --git a/python/tvm/relay/transform/__init__.py b/python/tvm/relay/transform/__init__.py
index 1d0ea176b16f..ca9996aeaaae 100644
--- a/python/tvm/relay/transform/__init__.py
+++ b/python/tvm/relay/transform/__init__.py
@@ -19,4 +19,3 @@
 # transformation passes
 from .transform import *
 from .recast import recast
-from . import memory_alloc
diff --git a/python/tvm/relay/transform/memory_alloc.py b/python/tvm/relay/transform/memory_alloc.py
deleted file mode 100644
index 66528c861788..000000000000
--- a/python/tvm/relay/transform/memory_alloc.py
+++ /dev/null
@@ -1,389 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=no-else-return,invalid-name,len-as-condition,too-many-nested-blocks
-"""
-A pass for manifesting explicit memory allocations.
-"""
-import numpy as np
-
-from tvm.ir.transform import PassContext, module_pass
-from tvm.relay.transform import InferType
-from tvm import nd, container
-from ..function import Function
-from ..expr_functor import ExprVisitor, ExprMutator
-from ..scope_builder import ScopeBuilder
-from .. import op
-from ... import DataType, register_func
-from .. import ty, expr
-from ..backend import compile_engine
-from ..op.memory import flatten_tuple_type, from_tuple_type, to_tuple_type
-from ... import cpu
-from ..op.memory import alloc_storage
-from ..analysis import context_analysis
-from ..._ffi.runtime_ctypes import TVMContext
-
-
-def alloc_tensor(storage, shape, dtype="float32", assert_shape=None):
-    offset = expr.const(0, dtype="int64")
-    return op.memory.alloc_tensor(storage, offset, shape, dtype, assert_shape)
-
-
-def is_primitive(call):
-    return (
-        hasattr(call, "op")
-        and hasattr(call.op, "attrs")
-        and hasattr(call.op.attrs, "Primitive")
-        and int(call.op.attrs.Primitive) == 1
-    )
-
-
-def is_device_copy(func):
-    """
-    Check if the current relay expression is a device copy call. We can simply check
-    the body of it if it is a function becase the device_copy op is opaque.
-    """
-    if isinstance(func, Function):
-        body = func.body
-        return isinstance(body, expr.Call) and body.op == op.get("device_copy")
-    if isinstance(func, expr.Call):
-        return func.op == op.get("device_copy")
-    return False
-
-
-class CheckReshapeOnly(ExprVisitor):
-    """A pass to check if the fused op contains only reshape ops."""
-
-    def __init__(self):
-        super().__init__()
-        self._reshape_ops = [
-            op.get("reshape"),
-            op.get("contrib_reverse_reshape"),
-            op.get("dyn.reshape"),
-        ]
-        self.reshape_only = True
-
-    def visit_call(self, call):
-        if not self.reshape_only:
-            return
-        if call.op not in self._reshape_ops:
-            self.reshape_only = False
-        for arg in call.args:
-            self.visit(arg)
-
-    def visit_var(self, var):
-        var_type = var.checked_type
-        if not isinstance(var_type, ty.TensorType):
-            self.reshape_only = False
-
-
-def is_reshape_only(func):
-    """Check if the primitive function contains only reshape ops."""
-    check = CheckReshapeOnly()
-    check.visit(func)
-    return check.reshape_only
-
-
-class ManifestAllocPass(ExprMutator):
-    """A pass for explicitly manifesting all memory allocations in Relay."""
-
-    def __init__(self, target_host, context_analysis_map):
-        self.invoke_tvm = op.vm.invoke_tvm_op
-        self.shape_func = op.vm.shape_func
-        self.shape_of = op.vm.shape_of
-        self.reshape_tensor = op.vm.reshape_tensor
-        self.scopes = [ScopeBuilder()]
-        self.target_host = target_host
-        self.default_context = cpu(0)
-        self.compute_dtype = "int64"
-        self.context_analysis_map = context_analysis_map
-        super().__init__()
-
-    def get_context(self, exp):
-        """Get the context of a given expression"""
-        assert exp in self.context_analysis_map, exp.astext(False)
-        val = self.context_analysis_map[exp]
-        # val[0], val[1] are device_type and device_id, respectively.
-        # We don't need to unpack after porting this pass to C++.
-        assert len(val) == 2
-        return TVMContext(val[0].value, val[1].value)
-
-    def device_copy(self, inp, src_ctx, dst_ctx):
-        """Insert a device copy node."""
-        return self.visit(op.tensor.device_copy(inp, src_ctx, dst_ctx))
-
-    def current_scope(self):
-        return self.scopes[-1]
-
-    def visit_tuple(self, tup):
-        scope = self.current_scope()
-        new_fields = []
-        for field in tup.fields:
-            field = self.visit(field)
-            if isinstance(field, expr.Constant):
-                field = scope.let("const", field)
-            new_fields.append(field)
-        return expr.Tuple(new_fields)
-
-    def compute_alignment(self, dtype):
-        dtype = DataType(dtype)
-        align = (dtype.bits // 8) * dtype.lanes
-        # MAGIC CONSTANT FROM device_api.h
-        if align < 64:
-            align = 64
-
-        return expr.const(align, dtype="int64")
-
-    def compute_storage_in_relay(self, shape, dtype):
-        dtype = DataType(dtype)
-        els = op.prod(shape)
-        num = expr.const(dtype.bits * dtype.lanes, self.compute_dtype)
-        num = num + expr.const(7, self.compute_dtype)
-        div = expr.const(8, self.compute_dtype)
-        return els * (num / div)
-
-    def compute_storage(self, tensor_type):
-        dtype = DataType(tensor_type.dtype)
-        shape = [int(sh) for sh in tensor_type.shape]
-        size = 1
-        for sh in shape:
-            size *= sh
-        size *= (dtype.bits * dtype.lanes + 7) // 8
-        return expr.const(size, dtype=self.compute_dtype)
-
-    def make_static_allocation(self, scope, tensor_type, ctx, name_hint):
-        """Allocate a tensor with a statically known shape."""
-        shape = [int(sh) for sh in tensor_type.shape]
-        if len(shape) == 0:
-            shape = expr.const(np.empty((), dtype=self.compute_dtype), dtype=self.compute_dtype)
-        else:
-            shape = expr.const(np.array(shape), dtype=self.compute_dtype)
-        size = self.compute_storage(tensor_type)
-        alignment = self.compute_alignment(tensor_type.dtype)
-        dtype = tensor_type.dtype
-        sto = scope.let("storage_{0}".format(name_hint), alloc_storage(size, alignment, ctx, dtype))
-        # TODO(@jroesch): There is a bug with typing based on the constant shape.
-        tensor = alloc_tensor(sto, shape, dtype, tensor_type.shape)
-        return scope.let("tensor_{0}".format(name_hint), tensor)
-
-    def visit_let(self, let):
-        scope = ScopeBuilder()
-
-        self.scopes.append(scope)
-        while isinstance(let, expr.Let):
-            new_val = self.visit(let.value)
-            scope.let(let.var, new_val)
-            let = let.body
-
-        new_body = self.visit(let)
-        scope.ret(new_body)
-        self.scopes.pop()
-
-        return scope.get()
-
-    def emit_shape_func(self, scope, func, new_args):
-        """Insert the shape function given a primitive function."""
-        shape_func_ins = []
-        engine = compile_engine.get()
-        cfunc = engine.lower_shape_func(func, self.target_host)
-        input_states = cfunc.shape_func_param_states
-
-        is_inputs = []
-        input_pos = 0
-        cpu_ctx = nd.cpu(0)
-        for i, (arg, state) in enumerate(zip(new_args, input_states)):
-            state = int(state)
-            # Pass Shapes
-            if state == 2:
-                for j, subexp in enumerate(from_tuple_type(arg.type_annotation, arg)):
-                    sh_of = self.visit(self.shape_of(subexp))
-                    shape_func_ins.append(scope.let("in_shape_{0}".format(input_pos + j), sh_of))
-                    input_pos += 1
-                is_inputs.append(0)
-            # Pass Inputs
-            elif state == 1:
-                new_arg = self.visit(arg)
-                ctx = self.get_context(arg)
-                if ctx.device_type != cpu_ctx.device_type:
-                    new_arg = self.device_copy(new_arg, ctx, cpu_ctx)
-                shape_func_ins.append(scope.let("in_shape_{0}".format(input_pos), new_arg))
-                input_pos += 1
-                is_inputs.append(1)
-            else:
-                # TODO(@jroesch): handle 3rd case
-                raise Exception("unsupported shape function input state")
-
-        out_shapes = []
-        for i, out in enumerate(cfunc.outputs):
-            tt = ty.TensorType(out.shape, out.dtype)
-            # Put shape func on CPU. This also ensures that everything between
-            # shape_of and shape_func are on CPU.
-            alloc = self.make_static_allocation(scope, tt, cpu_ctx, i)
-            alloc = scope.let("shape_func_out_{0}".format(i), alloc)
-            out_shapes.append(alloc)
-
-        shape_call = self.shape_func(
-            func, expr.Tuple(shape_func_ins), expr.Tuple(out_shapes), is_inputs
-        )
-
-        scope.let("shape_func", shape_call)
-        return out_shapes
-
-    def dynamic_invoke(self, scope, func, ins, new_args, out_types, ret_type):
-        """Generate the code for invoking a TVM op with a dynamic shape."""
-        out_shapes = self.emit_shape_func(scope, func, new_args)
-
-        storages = []
-        func_ctx = self.get_context(func)
-        for i, (out_shape, out_type) in enumerate(zip(out_shapes, out_types)):
-            size = self.compute_storage_in_relay(out_shape, out_type.dtype)
-            alignment = self.compute_alignment(out_type.dtype)
-            sto = scope.let(
-                "storage_{i}".format(i=i), alloc_storage(size, alignment, func_ctx, out_type.dtype)
-            )
-            storages.append(sto)
-
-        outs = []
-        sh_ty_storage = zip(out_shapes, out_types, storages)
-        for i, (out_shape, out_type, storage) in enumerate(sh_ty_storage):
-            alloc = alloc_tensor(storage, out_shape, out_type.dtype, out_type.shape)
-            alloc = scope.let("out_{i}".format(i=i), alloc)
-            outs.append(alloc)
-
-        tuple_outs = expr.Tuple(outs)
-        invoke = self.invoke_tvm(func, ins, tuple_outs)
-        scope.let("", invoke)
-        return to_tuple_type(ret_type, tuple_outs.fields)
-
-    def emit_reshape_tensor(self, scope, func, new_args, ret_type):
-        if self.is_dynamic(ret_type):
-            out_shapes = self.emit_shape_func(scope, func, new_args)
-            shape_expr = out_shapes[0]
-        else:
-            # constant output shape
-            shape = [int(dim) for dim in ret_type.shape]
-            shape_expr = expr.const(shape, dtype=self.compute_dtype)
-        return self.reshape_tensor(new_args[0], shape_expr, ret_type.shape)
-
-    def is_dynamic(self, ret_type):
-        is_dynamic = ty.is_dynamic(ret_type)
-        # TODO(@jroesch): restore this code, more complex then it seems
-        # for arg in call.args:
-        #     is_dynamic = is_dynamic or arg.checked_type.is_dynamic()
-        return is_dynamic
-
-    def visit_call(self, call):
-        if is_primitive(call):
-            # Because we are in ANF we do not need to visit the arguments.
-            scope = self.current_scope()
-            new_args = [self.visit(arg) for arg in call.args]
-
-            ins = expr.Tuple(new_args)
-            ret_type = call.checked_type
-            out_types = flatten_tuple_type(ret_type)
-
-            if is_reshape_only(call.op):
-                # Handle fused op that only contains reshape op
-                return self.emit_reshape_tensor(scope, call.op, new_args, ret_type)
-
-            if is_device_copy(call.op):
-                # Handle device copy op
-                if isinstance(call.op, Function):
-                    attr = call.op.body.attrs
-                else:
-                    attr = call.attr
-                return self.device_copy(
-                    new_args[0], TVMContext(attr.src_dev_type, 0), TVMContext(attr.dst_dev_type, 0)
-                )
-
-            if self.is_dynamic(ret_type):
-                # Handle dynamic case.
-                return self.dynamic_invoke(scope, call.op, ins, new_args, out_types, ret_type)
-
-            # Handle static case.
-            outs = []
-            for i, out_ty in enumerate(out_types):
-                ctx = self.get_context(call)
-                assert isinstance(ctx, TVMContext)
-                out = self.make_static_allocation(scope, out_ty, ctx, i)
-                outs.append(out)
-
-            output = expr.Tuple(outs)
-            invoke = self.invoke_tvm(call.op, ins, output)
-            scope.let("", invoke)
-            return to_tuple_type(ret_type, output.fields)
-        return super().visit_call(call)
-
-
-def mk_analysis_annotator(results):
-    """Pretty print the annotated relay program with device info"""
-
-    def _annotator(exp):
-        if exp in results:
-            val = results[exp]
-            assert len(val) == 2
-            ctx = TVMContext(val[0].value, val[1].value)
-            return f"<{ctx}>"
-        else:
-            return ""
-
-    return _annotator
-
-
-@module_pass(opt_level=0)
-class ManifestAlloc:
-    """The explicit pass wrapper around ManifestAlloc."""
-
-    # TODO(zhiics, jroesch) Port this pass to C++.
-    def __init__(self, target_host, targets):
-        self.target_host = target_host
-        self.targets = targets
-
-    def transform_module(self, mod, _):
-        """Invokes the pass"""
-        # TODO(@jroesch): Is there a way to do one shot initialization?
-        # can we have def pass_init?
-        mod.import_from_std("core.rly")
-        mod = InferType()(mod)
-
-        assert isinstance(self.targets, (dict, container.Map))
-        if len(self.targets) > 1:
-            pass_ctx = PassContext.current()
-            if "relay.fallback_device_type" in pass_ctx.config:
-                fallback_ctx = nd.context(pass_ctx.config["relay.fallback_device_type"])
-            else:
-                fallback_ctx = cpu(0)
-            ca = context_analysis(mod, TVMContext(fallback_ctx.device_type, 0))
-        else:
-            if isinstance(self.targets, dict):
-                dev = list(self.targets.keys())[0]
-            else:
-                dev, _ = self.targets.items()[0]
-            ca = context_analysis(mod, nd.context(dev.value))
-
-        # The following code can be used for debugging the module after
-        # annotation.
-        # print(mod.astext(show_meta_data=False, annotate=mk_analysis_annotator(ca)))
-
-        gv_funcs = mod.functions
-        for gv, f in gv_funcs.items():
-            ea = ManifestAllocPass(self.target_host, ca)
-            f = ea.visit(f)
-            mod.update_func(gv, f)
-        return mod
-
-
-register_func("relay.transform.ManifestAlloc", ManifestAlloc)
diff --git a/src/relay/backend/vm/compiler.cc b/src/relay/backend/vm/compiler.cc
index d908153f88cb..7861502965a8 100644
--- a/src/relay/backend/vm/compiler.cc
+++ b/src/relay/backend/vm/compiler.cc
@@ -58,12 +58,6 @@ namespace transform {
 Pass LambdaLift();
 Pass InlinePrimitives();
 
-Pass ManifestAlloc(Target target_host, vm::TargetsMap targets) {
-  auto f = tvm::runtime::Registry::Get("relay.transform.ManifestAlloc");
-  ICHECK(f != nullptr) << "unable to load allocation manifestation pass";
-  return (*f)(target_host, targets);
-}
-
 Pass MemoryPlan() {
   auto f = tvm::runtime::Registry::Get("relay.transform.MemoryPlan");
   ICHECK(f != nullptr) << "unable to load the memory planning pass";
diff --git a/src/relay/transforms/memory_alloc.cc b/src/relay/transforms/memory_alloc.cc
new file mode 100644
index 000000000000..360778e1723b
--- /dev/null
+++ b/src/relay/transforms/memory_alloc.cc
@@ -0,0 +1,494 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file src/relay/transforms/memory_alloc.cc
+ * \brief A pass for manifesting explicit memory allocations.
+ */
+
+#include <tvm/node/structural_equal.h>
+#include <tvm/node/structural_hash.h>
+#include <tvm/relay/analysis.h>
+#include <tvm/relay/attrs/device_copy.h>
+#include <tvm/relay/attrs/memory.h>
+#include <tvm/relay/expr.h>
+#include <tvm/relay/expr_functor.h>
+#include <tvm/relay/op.h>
+#include <tvm/relay/transform.h>
+#include <tvm/support/logging.h>
+#include <tvm/target/target.h>
+
+#include <cstdint>
+#include <cstdio>
+#include <string>
+#include <unordered_set>
+#include <vector>
+
+#include "../backend/compile_engine.h"
+#include "let_list.h"
+#include "pattern_utils.h"
+
+using namespace tvm::runtime;
+
+namespace tvm {
+namespace relay {
+
+extern Expr ToTupleType(const Type& ty, const std::vector<Expr>& exprs);
+extern std::vector<Expr> FromTupleType(const Type& type, const Expr& expr);
+extern std::vector<TensorType> FlattenTupleType(const Type& type);
+
+using AnalysisResultMap =
+    std::unordered_map<Expr, TVMContext, runtime::ObjectPtrHash, runtime::ObjectPtrEqual>;
+
+inline Constant MakeConstant(const std::vector<int64_t>& value) {
+  return MakeConstantTensor(DataType::Int(64), {static_cast<int64_t>(value.size())}, value);
+}
+
+inline Expr AllocTensor(const Expr& storage, tvm::relay::Expr shape, DataType dtype,
+                        Array<IndexExpr> assert_shape) {
+  auto f = runtime::Registry::Get("relay.op.memory._make.alloc_tensor");
+  CHECK(f != nullptr) << "unable to find alloc_tensor op";
+  auto offset = MakeConstantScalar(DataType::Int(64), 0);
+  return (*f)(storage, offset, shape, dtype, assert_shape);
+}
+
+// A pass to check if the fused op contains only reshape ops.
+class CheckReshapeOnly : public ExprVisitor {
+ public:
+  CheckReshapeOnly()
+      : reshape_(Op::Get("reshape")),
+        contr_reshape_(Op::Get("contrib_reverse_reshape")),
+        dyn_reshape_(Op::Get("dyn.reshape")) {}
+
+  void VisitExpr_(const CallNode* cn) final {
+    if (!reshape_only) return;
+    if (cn->op != reshape_ && cn->op != contr_reshape_ && cn->op != dyn_reshape_) {
+      reshape_only = false;
+    }
+    for (auto arg : cn->args) ExprVisitor::VisitExpr(arg);
+  }
+
+  void VisitExpr_(const VarNode* vn) final {
+    if (!vn->checked_type_->IsInstance<TensorTypeNode>()) {
+      reshape_only = false;
+    }
+  }
+
+  const Op& reshape_;
+  const Op& contr_reshape_;
+  const Op& dyn_reshape_;
+  bool reshape_only{true};
+};
+
+// Check if the primitive function contains only reshape ops.
+bool IsReshapeOnly(const Expr& expr) {
+  auto check = CheckReshapeOnly();
+  check.VisitExpr(expr);
+  return check.reshape_only;
+}
+
+class DialectRewriter : public ExprMutator {
+ public:
+  DialectRewriter(const Target& target_host, const AnalysisResultMap& context_analysis_map)
+      : target_host_(target_host),
+        context_analysis_map_(context_analysis_map),
+        device_copy_(runtime::Registry::Get("relay.op._make.device_copy")),
+        invoke_tvm_(runtime::Registry::Get("relay.op.vm.invoke_tvm_op")),
+        alloc_storage_(runtime::Registry::Get("relay.op.memory._make.alloc_storage")),
+        shape_func_(runtime::Registry::Get("relay.op.vm.shape_func")),
+        shape_of_(runtime::Registry::Get("relay.op.vm.shape_of")),
+        reshape_tensor_(runtime::Registry::Get("relay.op.vm.reshape_tensor")),
+        prod_(runtime::Registry::Get("relay.op._make.prod")),
+        divide_(runtime::Registry::Get("relay.op._make.divide")),
+        add_(runtime::Registry::Get("relay.op._make.add")),
+        multiply_(runtime::Registry::Get("relay.op._make.multiply")) {}
+
+  // Get the context of an expression.
+  TVMContext GetContext(const Expr& expr) const {
+    auto it = context_analysis_map_.find(expr);
+    CHECK(it != context_analysis_map_.end()) << "Cannot find expr in the context analysis map:\n"
+                                             << AsText(expr, false);
+    return it->second;
+  }
+
+  Function Rewrite(const Function& expr) {
+    auto ret = ExprMutator::Mutate(expr);
+    return Downcast<Function>(ret);
+  }
+
+  Expr VisitExpr_(const TupleNode* tn) final {
+    LetList& scope = scopes_.back();
+    Array<Expr> new_fields;
+    for (auto field : tn->fields) {
+      auto new_field = ExprMutator::Mutate(field);
+      if (new_field->IsInstance<ConstantNode>()) {
+        Var const_var("const", Type(nullptr));
+        new_field = scope.Push(const_var, new_field);
+      }
+      new_fields.push_back(new_field);
+    }
+    return Tuple(new_fields);
+  }
+
+  Expr VisitExpr_(const LetNode* ln) final {
+    scopes_.emplace_back();
+
+    const LetNode* let = ln;
+    Expr body;
+    while (let) {
+      auto new_value = ExprMutator::Mutate(let->value);
+      scopes_.back().Push(let->var, new_value);
+      body = let->body;
+      let = body.as<LetNode>();
+    }
+
+    CHECK(body.defined());
+    auto new_body = ExprMutator::Mutate(body);
+    auto ret = scopes_.back().Get(new_body);
+    scopes_.pop_back();
+    return ret;
+  }
+
+  Expr VisitExpr_(const CallNode* cn) final {
+    if (IsPrimitive(cn)) {
+      // Because we are in ANF we do not need to visit the arguments.
+      LetList& scope = scopes_.back();
+      std::vector<Expr> new_args;
+      for (const auto& it : cn->args) {
+        new_args.push_back(ExprMutator::Mutate(it));
+      }
+
+      Tuple ins(new_args);
+      Type ret_type = cn->checked_type_;
+      std::vector<TensorType> out_types = FlattenTupleType(ret_type);
+
+      // Handle fused op that only contains reshape op
+      if (IsReshapeOnly(cn->op)) {
+        Function func = Downcast<Function>(cn->op);
+        return EmitReshapeTensor(&scope, func, new_args, ret_type);
+      }
+
+      // Handle device copy op
+      if (IsDeviceCopy(cn->op)) {
+        Attrs attr;
+        if (const auto* fn = cn->op.as<FunctionNode>()) {
+          const auto* copy_call = fn->body.as<CallNode>();
+          CHECK(copy_call);
+          attr = copy_call->attrs;
+        } else {
+          attr = cn->attrs;
+        }
+        const DeviceCopyAttrs* copy_attr = attr.as<DeviceCopyAttrs>();
+        CHECK(copy_attr);
+        return DeviceCopy(new_args[0], copy_attr->src_dev_type, copy_attr->dst_dev_type);
+      } else if (IsDynamic(ret_type)) {
+        Function func = Downcast<Function>(cn->op);
+        return DynamicInvoke(&scope, func, ins, new_args, out_types, ret_type);
+      } else {
+        // Handle the static case
+        Array<Expr> outs;
+        for (size_t i = 0; i < out_types.size(); ++i) {
+          TVMContext ctx = GetContext(GetRef<Call>(cn));
+          auto out = MakeStaticAllocation(&scope, out_types[i], ctx, std::to_string(i));
+          outs.push_back(out);
+        }
+        Tuple output(outs);
+        Expr invoke = (*invoke_tvm_)(cn->op, ins, output);
+        scope.Push(invoke);
+        return ToTupleType(ret_type,
+                           std::vector<Expr>(output->fields.begin(), output->fields.end()));
+      }
+    } else {
+      return ExprMutator::VisitExpr_(cn);
+    }
+  }
+
+ private:
+  // Insert a device copy node.
+  Expr DeviceCopy(const Expr& inp, int src_ctx, int dst_ctx) {
+    return ExprMutator::Mutate((*device_copy_)(inp, src_ctx, dst_ctx));
+  }
+
+  // Check if a call invokes a primitive function.
+  bool IsPrimitive(const CallNode* call) const {
+    if (const auto* fn = call->op.as<FunctionNode>()) {
+      return fn->HasNonzeroAttr(attr::kPrimitive);
+    }
+    return false;
+  }
+
+  // Check if the current relay expression is a device copy call. We can simply
+  // check the body of it if it is a function because the device_copy op is opaque.
+  bool IsDeviceCopy(const Expr& expr) const {
+    if (const auto* fn = expr.as<FunctionNode>()) {
+      auto body = fn->body;
+      const CallNode* call = body.as<CallNode>();
+      return call && call->op == Op::Get("device_copy");
+    } else if (const CallNode* cn = expr.as<CallNode>()) {
+      return cn->op == Op::Get("device_copy");
+    } else {
+      return false;
+    }
+  }
+
+  Expr ComputeAlignment(const DataType& dtype) const {
+    int64_t align = dtype.bits() / 8 * dtype.lanes();
+    if (align < 64) {
+      align = 64;
+    }
+    return MakeConstantScalar(DataType::Int(64), align);
+  }
+
+  Expr ComputeStorageInRelay(const Expr& shape, const TensorType& type) const {
+    auto dtype = DataType(type->dtype);
+    Expr els = (*prod_)(shape, Array<Expr>(nullptr), false, false);
+    Expr num = MakeConstantScalar(DataType::Int(64), dtype.bits() * dtype.lanes());
+    Expr add = (*add_)(num, MakeConstantScalar(DataType::Int(64), 7));
+    Expr div = MakeConstantScalar(DataType::Int(64), 8);
+    Expr ret = (*multiply_)(els, (*divide_)(add, div));
+    return std::move(ret);
+  }
+
+  Expr ComputeStorage(const TensorType& type) {
+    int64_t size = 1;
+    for (auto it : type->shape) {
+      auto val = it.as<IntImmNode>();
+      CHECK(val);
+      size *= val->value;
+    }
+    size *= (type->dtype.bits() * type->dtype.lanes() + 7) / 8;
+    return std::move(MakeConstantScalar(DataType::Int(64), size));
+  }
+
+  // Allocate a tensor with a statically known shape.
+  Var MakeStaticAllocation(LetList* scope, const TensorType& type, TVMContext ctx,
+                           String name_hint) {
+    std::vector<int64_t> int_shape;
+    for (auto it : type->shape) {
+      const auto* imm = it.as<IntImmNode>();
+      CHECK(imm) << "expect static int shape";
+      int_shape.push_back(imm->value);
+    }
+    Expr shape = MakeConstant(int_shape);
+    Expr size = ComputeStorage(type);
+    Expr alignment = ComputeAlignment(type->dtype);
+    // Run type inference later to get the correct type.
+    Var var("storage_" + name_hint, Type(nullptr));
+    Expr value = (*alloc_storage_)(size, alignment, ctx, type->dtype);
+    auto sto = scope->Push(var, value);
+
+    // TODO(@jroesch): There is a bug with typing based on the constant shape.
+    auto tensor = AllocTensor(sto, shape, type->dtype, type->shape);
+    Var tensor_var("tensor_" + name_hint, Type(nullptr));
+    return scope->Push(tensor_var, tensor);
+  }
+
+  // Insert the shape function given a primitive function.
+  Array<Expr> EmitShapeFunc(LetList* scope, const Function& func,
+                            const std::vector<Expr>& new_args) {
+    Array<Expr> shape_func_ins;
+    auto engine = CompileEngine::Global();
+    CCacheKey key(func, target_host_);
+    auto cfunc = engine->LowerShapeFunc(key);
+    auto input_states = cfunc->shape_func_param_states;
+
+    Array<Integer> is_inputs;
+    int input_pos = 0;
+    TVMContext cpu_ctx = default_context_;
+    CHECK_EQ(new_args.size(), input_states.size());
+    for (size_t i = 0; i < new_args.size(); ++i) {
+      Expr arg = new_args[i];
+      Type ty;
+      if (const auto* vn = arg.as<VarNode>()) {
+        ty = vn->type_annotation;
+      } else {
+        ty = arg->checked_type();
+      }
+      int state = input_states[i]->value;
+      // Pass Shapes
+      if (state == 2) {
+        std::vector<Expr> exprs = FromTupleType(ty, arg);
+        for (size_t j = 0; j < exprs.size(); ++j) {
+          Expr sh_of = ExprMutator::Mutate((*shape_of_)(exprs[j]));
+          Var in_shape_var("in_shape_" + std::to_string(input_pos + j), Type(nullptr));
+          shape_func_ins.push_back(scope->Push(in_shape_var, sh_of));
+          input_pos++;
+        }
+        is_inputs.push_back(0);
+      } else if (state == 1) {
+        auto new_arg = ExprMutator::Mutate(arg);
+        auto ctx = GetContext(arg);
+        if (ctx.device_type != cpu_ctx.device_type) {
+          new_arg = DeviceCopy(new_arg, ctx.device_type, cpu_ctx.device_type);
+        }
+        Var in_shape_var("in_shape_" + std::to_string(input_pos), Type(nullptr));
+        shape_func_ins.push_back(scope->Push(in_shape_var, new_arg));
+        input_pos++;
+        is_inputs.push_back(1);
+      } else {
+        // TODO(@jroesch): handle 3rd case
+        LOG(FATAL) << "unsupported shape function input state";
+      }
+    }
+
+    Array<Expr> out_shapes;
+    for (size_t i = 0; i < cfunc->outputs.size(); ++i) {
+      auto out = cfunc->outputs[i];
+      auto tt = TensorType(out->shape, out->dtype);
+      // Put shape func on CPU. This also ensures that everything between
+      // shape_of and shape_func are on CPU.
+      auto alloc = MakeStaticAllocation(scope, tt, cpu_ctx, std::to_string(i));
+      Var shape_func_out_var("shape_func_out_" + std::to_string(i), Type(nullptr));
+      alloc = scope->Push(shape_func_out_var, alloc);
+      out_shapes.push_back(alloc);
+    }
+    auto shape_call = (*shape_func_)(func, Tuple(shape_func_ins), Tuple(out_shapes), is_inputs);
+    Var shape_func_var("shape_func", Type(nullptr));
+    scope->Push(shape_func_var, shape_call);
+    return out_shapes;
+  }
+
+  // Generate the code for invoking a TVM op with a dynamic shape.
+  Expr DynamicInvoke(LetList* scope, const Function& func, const Tuple& ins,
+                     const std::vector<Expr>& new_args, const std::vector<TensorType>& out_types,
+                     const Type& ret_type) {
+    auto out_shapes = EmitShapeFunc(scope, func, new_args);
+    std::vector<Var> storages;
+    auto func_ctx = GetContext(func);
+    CHECK_EQ(out_shapes.size(), out_types.size());
+    for (size_t i = 0; i < out_shapes.size(); ++i) {
+      auto out_shape = out_shapes[i];
+      auto out_type = out_types[i];
+      auto size = ComputeStorageInRelay(out_shape, out_type);
+      auto alignment = ComputeAlignment(out_type->dtype);
+      Var sto_var("storage_" + std::to_string(i), Type(nullptr));
+      auto val = (*alloc_storage_)(size, alignment, func_ctx, out_type->dtype);
+      storages.push_back(scope->Push(sto_var, val));
+    }
+
+    Array<Expr> outs;
+    for (size_t i = 0; i < storages.size(); ++i) {
+      auto out_shape = out_shapes[i];
+      auto out_type = out_types[i];
+      auto storage = storages[i];
+      auto alloc = AllocTensor(storage, out_shape, out_type->dtype, out_type->shape);
+      Var out_var("out_" + std::to_string(i), Type(nullptr));
+      outs.push_back(scope->Push(out_var, alloc));
+    }
+
+    Tuple tuple_outs(outs);
+    auto invoke = (*invoke_tvm_)(func, ins, tuple_outs);
+    scope->Push(invoke);
+    return ToTupleType(ret_type,
+                       std::vector<Expr>(tuple_outs->fields.begin(), tuple_outs->fields.end()));
+  }
+
+  Expr EmitReshapeTensor(LetList* scope, const Function& func, const std::vector<Expr>& new_args,
+                         const Type& ret_type) {
+    TensorType ret_ty = Downcast<TensorType>(ret_type);
+    Expr shape_expr;
+    if (IsDynamic(ret_type)) {
+      auto out_shapes = EmitShapeFunc(scope, func, new_args);
+      shape_expr = out_shapes[0];
+    } else {
+      std::vector<int64_t> shape;
+      for (const auto& it : ret_ty->shape) {
+        const auto* imm = it.as<IntImmNode>();
+        CHECK(imm) << "expect static int shape";
+        shape.push_back(imm->value);
+      }
+      shape_expr = MakeConstant(shape);
+    }
+    return (*reshape_tensor_)(new_args[0], shape_expr, ret_ty->shape);
+  }
+
+ private:
+  Target target_host_;
+  AnalysisResultMap context_analysis_map_;
+  std::vector<LetList> scopes_;
+
+  // Cache the following ops
+  const PackedFunc* device_copy_;
+  const PackedFunc* invoke_tvm_;
+  const PackedFunc* alloc_storage_;
+  const PackedFunc* shape_func_;
+  const PackedFunc* shape_of_;
+  const PackedFunc* reshape_tensor_;
+  const PackedFunc* prod_;
+  const PackedFunc* divide_;
+  const PackedFunc* add_;
+  const PackedFunc* multiply_;
+
+  runtime::DataType compute_dtype_ = runtime::DataType::Int(64);
+  TVMContext default_context_{kDLCPU, 0};
+};
+
+namespace transform {
+
+Pass ManifestAlloc(Target target_host, Map<tvm::Integer, tvm::Target> targets) {
+  return tvm::transform::CreateModulePass(
+      [=](IRModule mod, const PassContext& pass_ctx) {
+        DLOG(INFO) << "tvm::relay::transform::ManifestAlloc";
+        // We need to mutate module, therefore making a copy of it.
+        mod.CopyOnWrite();
+        mod->ImportFromStd("core.rly");
+        mod = relay::transform::InferType()(mod);
+
+        TVMContext fallback_ctx;
+        if (targets.size() > 1) {
+          auto pass_ctx = PassContext::Current();
+          Optional<Integer> opt_fallback_dev =
+              pass_ctx->GetConfig("relay.fallback_device_type", Integer(static_cast<int>(kDLCPU)));
+          auto fallback_dev = opt_fallback_dev.value();
+          CHECK_GT(fallback_dev->value, 0U);
+          fallback_ctx.device_type = static_cast<DLDeviceType>(fallback_dev->value);
+          fallback_ctx.device_id = 0;
+        } else {
+          const auto& it = targets.begin();
+          fallback_ctx.device_type = static_cast<DLDeviceType>((*it).first->value);
+          fallback_ctx.device_id = 0;
+        }
+        auto ca = ContextAnalysis(mod, fallback_ctx);
+
+        auto glob_funcs = mod->functions;
+        for (const auto& it : glob_funcs) {
+          if (auto* func_node = it.second.as<FunctionNode>()) {
+            auto func = GetRef<Function>(func_node);
+            auto rewriter = DialectRewriter(target_host, ca);
+            auto updated_func = rewriter.Rewrite(func);
+
+            mod->Update(it.first, updated_func);
+          }
+        }
+
+        mod = relay::transform::InferType()(mod);
+        return mod;
+      },
+      0, "ManifestAlloc", {});
+}
+
+TVM_REGISTER_GLOBAL("relay.transform.ManifestAlloc")
+    .set_body_typed([](Target target_host, Map<tvm::Integer, tvm::Target> targets) {
+      return ManifestAlloc(target_host, targets);
+    });
+
+}  // namespace transform
+
+}  // namespace relay
+}  // namespace tvm
diff --git a/tests/python/relay/test_any.py b/tests/python/relay/test_any.py
index 0b575d120e8f..9d05631a753a 100644
--- a/tests/python/relay/test_any.py
+++ b/tests/python/relay/test_any.py
@@ -54,7 +54,6 @@ def check_result(
     for kind in ["debug", "vm"]:
         targets = targets or tvm.testing.enabled_targets()
         for tgt, ctx in targets:
-            print(tgt)
             if disable_targets and tgt in disable_targets:
                 continue
             if kind == "debug" and (only_vm or ctx.device_type != tvm.cpu().device_type):
diff --git a/tests/python/relay/test_memory_passes.py b/tests/python/relay/test_memory_passes.py
index c960d1f90c37..546aaf51f734 100644
--- a/tests/python/relay/test_memory_passes.py
+++ b/tests/python/relay/test_memory_passes.py
@@ -18,7 +18,6 @@
 from tvm import te
 import numpy as np
 from tvm import relay
-from tvm.relay import memory_alloc
 
 
 def check_memory_plan(func, check_fn):

From 3734d5f7f8475a2a7897f239b9942c913256fc96 Mon Sep 17 00:00:00 2001
From: Meteorix <liuxin.ai@bytedance.com>
Date: Sat, 30 Jan 2021 03:00:12 +0800
Subject: [PATCH 139/357] [CUDA][PASS]Legalize tensorcore (#7147)

* add pad_to_tensorcore & legalize for dense/bmm/conv2d

* fix pad & slice

* fix comments

* fix comments

* resolve conflict

* resolve conflict

* support only fp16

* add tests/python/relay/test_pass_legalize_tensorcore.py

* add tests for legalize tensorcore

* fix pylint

* fix pylint

* code format

* use_gpu test only; fix conv2d_alter_op

* fix tests params

* revert transform fix
---
 python/tvm/relay/op/nn/_nn.py                 |  42 +++
 python/tvm/topi/cuda/__init__.py              |   1 +
 python/tvm/topi/cuda/conv2d_alter_op.py       |  48 ++++
 python/tvm/topi/cuda/tensorcore_alter_op.py   | 204 +++++++++++++++
 python/tvm/topi/nn/batch_matmul.py            |  24 ++
 python/tvm/topi/nn/dense.py                   |  24 ++
 .../relay/test_pass_legalize_tensorcore.py    | 239 ++++++++++++++++++
 7 files changed, 582 insertions(+)
 create mode 100644 python/tvm/topi/cuda/tensorcore_alter_op.py
 create mode 100644 tests/python/relay/test_pass_legalize_tensorcore.py

diff --git a/python/tvm/relay/op/nn/_nn.py b/python/tvm/relay/op/nn/_nn.py
index c5af5d83bd7d..37ee6b6e929f 100644
--- a/python/tvm/relay/op/nn/_nn.py
+++ b/python/tvm/relay/op/nn/_nn.py
@@ -52,6 +52,27 @@
 reg.register_pattern("nn.log_softmax", OpPattern.OPAQUE)
 
 
+@reg.register_legalize("nn.dense")
+def legalize_dense(attrs, inputs, types):
+    """Legalize dense op.
+
+    Parameters
+    ----------
+    attrs : tvm.ir.Attrs
+        Attributes of current convolution
+    inputs : list of tvm.relay.Expr
+        The args of the Relay expr to be legalized
+    types : list of types
+        List of input and output types
+
+    Returns
+    -------
+    result : tvm.relay.Expr
+        The legalized expr
+    """
+    return topi.nn.dense_legalize(attrs, inputs, types)
+
+
 # dense
 reg.register_strategy("nn.dense", strategy.dense_strategy)
 reg.register_pattern("nn.dense", reg.OpPattern.OUT_ELEMWISE_FUSABLE)
@@ -67,6 +88,27 @@ def compute_fifo_buffer(attrs, inputs, out_type):
 reg.register_pattern("nn.fifo_buffer", OpPattern.OPAQUE)
 
 
+@reg.register_legalize("nn.batch_matmul")
+def legalize_batch_matmul(attrs, inputs, types):
+    """Legalize batch_matmul op.
+
+    Parameters
+    ----------
+    attrs : tvm.ir.Attrs
+        Attributes of current convolution
+    inputs : list of tvm.relay.Expr
+        The args of the Relay expr to be legalized
+    types : list of types
+        List of input and output types
+
+    Returns
+    -------
+    result : tvm.relay.Expr
+        The legalized expr
+    """
+    return topi.nn.batch_matmul_legalize(attrs, inputs, types)
+
+
 # batch_matmul
 reg.register_strategy("nn.batch_matmul", strategy.batch_matmul_strategy)
 reg.register_pattern("nn.batch_matmul", reg.OpPattern.OUT_ELEMWISE_FUSABLE)
diff --git a/python/tvm/topi/cuda/__init__.py b/python/tvm/topi/cuda/__init__.py
index e0ff5a12a9b2..bf3582c01d4f 100644
--- a/python/tvm/topi/cuda/__init__.py
+++ b/python/tvm/topi/cuda/__init__.py
@@ -55,5 +55,6 @@
 from .conv2d_hwnc_tensorcore import *
 from .correlation import *
 from .sparse import *
+from . import tensorcore_alter_op
 from .argwhere import *
 from .scan import *
diff --git a/python/tvm/topi/cuda/conv2d_alter_op.py b/python/tvm/topi/cuda/conv2d_alter_op.py
index 8cf0519ebe29..65bf9d1f178d 100644
--- a/python/tvm/topi/cuda/conv2d_alter_op.py
+++ b/python/tvm/topi/cuda/conv2d_alter_op.py
@@ -24,8 +24,10 @@
 from .. import nn
 from ..utils import get_const_tuple
 from .conv2d_winograd import _infer_tile_size
+from .tensorcore_alter_op import pad_to_tensorcore
 from ..nn import conv2d_legalize
 
+
 logger = logging.getLogger("topi")
 
 
@@ -345,4 +347,50 @@ def _conv2d_legalize(attrs, inputs, arg_types):
             else:
                 out = relay.nn.conv2d(data, kernel, **new_attrs)
             return out
+    elif data_dtype in ["float16"]:  # todo: support int8/int4
+        if data_layout == "NHWC" and kernel_layout == "HWIO":
+            batch = data_tensor.shape[0].value
+            in_channel = data_tensor.shape[3].value
+            out_channel = kernel_tensor.shape[3].value
+
+            if (
+                (batch % 8 == 0 and in_channel % 16 == 0 and out_channel % 32 == 0)
+                or (batch % 16 == 0 and in_channel % 16 == 0 and out_channel % 16 == 0)
+                or (batch % 32 == 0 and in_channel % 16 == 0 and out_channel % 8 == 0)
+            ):
+                # no need to pad
+                return None
+
+            (db, di, do), extra_flops = pad_to_tensorcore(batch, in_channel, out_channel)
+
+            if extra_flops > 2:
+                logger.info("conv2d pad_to_tensorcore skipped, extra_flops %s", extra_flops)
+                return None
+
+            logger.info("conv2d pad_to_tensorcore, extra_flops %s", extra_flops)
+
+            # Pad batch size
+            if db != 0:
+                data = relay.nn.pad(data, pad_width=((0, db), (0, 0), (0, 0), (0, 0)))
+
+            # Pad input channel
+            if di != 0:
+                data = relay.nn.pad(data, pad_width=((0, 0), (0, 0), (0, 0), (0, di)))
+                kernel = relay.nn.pad(kernel, pad_width=((0, 0), (0, 0), (0, di), (0, 0)))
+
+            # Pad output channel
+            if do != 0:
+                kernel = relay.nn.pad(kernel, pad_width=((0, 0), (0, 0), (0, 0), (0, do)))
+
+            if do != 0:
+                new_out_channel = out_channel + do
+                new_attrs["channels"] = new_out_channel
+
+            out = relay.nn.conv2d(data, kernel, **new_attrs)
+
+            if db != 0 or do != 0:
+                original_out_shape = [x.value for x in output_tensor.shape]
+                out = relay.strided_slice(out, begin=[0, 0, 0, 0], end=original_out_shape)
+
+            return out
     return None
diff --git a/python/tvm/topi/cuda/tensorcore_alter_op.py b/python/tvm/topi/cuda/tensorcore_alter_op.py
new file mode 100644
index 000000000000..aec7acbfde56
--- /dev/null
+++ b/python/tvm/topi/cuda/tensorcore_alter_op.py
@@ -0,0 +1,204 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=invalid-name,unused-variable,unused-argument
+"""Tensorcore alter op and legalize functions for cuda backend"""
+
+import logging
+import math
+from tvm import relay
+
+from .. import nn
+
+logger = logging.getLogger("topi")
+
+
+@nn.batch_matmul_legalize.register("cuda")
+def _batch_matmul_legalize(attrs, inputs, arg_types):
+    """Legalizes batch_matmul op.
+
+    Parameters
+    ----------
+    attrs : tvm.ir.Attrs
+        Attributes of current convolution
+    inputs : list of tvm.relay.Expr
+        The args of the Relay expr to be legalized
+    arg_types : list of types
+        List of input and output types
+
+    Returns
+    -------
+    result : tvm.relay.Expr
+        The legalized expr
+    """
+    # Collect the input tensors.
+    x_tensor, y_tensor = arg_types[0], arg_types[1]
+    dtype = x_tensor.dtype
+
+    # Collect the output tensor.
+    output_tensor = arg_types[2]
+
+    # Collect the input exprs.
+    x, y = inputs
+
+    # Pad input and output channels to use tensorcore schedule.
+    if dtype in ["float16"]:  # todo: support int8/int4
+        B, M, K = x_tensor.shape
+        B, N, K = y_tensor.shape
+        M = M.value
+        K = K.value
+        N = N.value
+
+        # The shape of (M, K, N) must be multiple of (16, 16, 16) or (32, 16, 8) or (8, 16, 32)
+        if (
+            (M % 8 == 0 and K % 16 == 0 and N % 32 == 0)
+            or (M % 16 == 0 and K % 16 == 0 and N % 16 == 0)
+            or (M % 32 == 0 and K % 16 == 0 and N % 8 == 0)
+        ):
+            # no need to pad
+            return None
+
+        (dm, dk, dn), extra_flops = pad_to_tensorcore(M, K, N)
+
+        if extra_flops > 2:
+            logger.info("batch_matmul pad_to_tensorcore skipped, extra_flops %s", extra_flops)
+            return None
+
+        logger.info("batch_matmul pad_to_tensorcore, extra_flops %s", extra_flops)
+        if dm or dk:
+            x_ = relay.nn.pad(x, pad_width=((0, 0), (0, dm), (0, dk)))
+        else:
+            x_ = x
+        if dn or dk:
+            y_ = relay.nn.pad(y, pad_width=((0, 0), (0, dn), (0, dk)))
+        else:
+            y_ = y
+        out_ = relay.nn.batch_matmul(x_, y_)
+        if dm or dn:
+            original_out_shape = [x.value for x in output_tensor.shape]
+            out = relay.strided_slice(out_, begin=[0, 0, 0], end=original_out_shape)
+        else:
+            out = out_
+        return out
+    return None
+
+
+@nn.dense_legalize.register("cuda")
+def _dense_legalize(attrs, inputs, arg_types):
+    """Legalizes dense op.
+
+    Parameters
+    ----------
+    attrs : tvm.ir.Attrs
+        Attributes of current convolution
+    inputs : list of tvm.relay.Expr
+        The args of the Relay expr to be legalized
+    types : list of types
+        List of input and output types
+
+    Returns
+    -------
+    result : tvm.relay.Expr
+        The legalized expr
+    """
+    # Collect the input tensors.
+    x_tensor, y_tensor = arg_types[0], arg_types[1]
+    dtype = x_tensor.dtype
+
+    # Collect the output tensor.
+    output_tensor = arg_types[2]
+
+    # Collect the input exprs.
+    x, y = inputs
+
+    # Pad input and output channels to use tensorcore schedule.
+    if dtype in ["float16"]:  # todo: support int8/int4
+        M, K = x_tensor.shape
+        N, K = y_tensor.shape
+        try:
+            M = M.value
+            K = K.value
+            N = N.value
+        except AttributeError:
+            # todo: deal with unfixed shape when compiling wdl model
+            return None
+
+        # The shape of (M, K, N) must be multiple of (16, 16, 16) or (32, 16, 8) or (8, 16, 32)
+        if (
+            (M % 8 == 0 and K % 16 == 0 and N % 32 == 0)
+            or (M % 16 == 0 and K % 16 == 0 and N % 16 == 0)
+            or (M % 32 == 0 and K % 16 == 0 and N % 8 == 0)
+        ):
+            # no need to pad
+            return None
+
+        (dm, dk, dn), extra_flops_ratio = pad_to_tensorcore(M, K, N)
+
+        if extra_flops_ratio > 2:
+            logger.info("dense pad_to_tensorcore skipped, extra_flops_ratio %s", extra_flops_ratio)
+            return None
+
+        logger.info("dense pad_to_tensorcore, extra_flops_ratio %s", extra_flops_ratio)
+
+        if dm or dk:
+            x_ = relay.nn.pad(x, pad_width=((0, dm), (0, dk)))
+        else:
+            x_ = x
+        if dn or dk:
+            y_ = relay.nn.pad(y, pad_width=((0, dn), (0, dk)))
+        else:
+            y_ = y
+        out_ = relay.nn.dense(x_, y_)
+        if dm or dn:
+            original_out_shape = [x.value for x in output_tensor.shape]
+            out = relay.strided_slice(out_, begin=[0, 0], end=original_out_shape)
+        else:
+            out = out_
+        return out
+    return None
+
+
+def pad_to_tensorcore(M, K, N):
+    """pad shape to enable tensorcore"""
+    candidates = [(16, 16, 16), (32, 16, 8), (8, 16, 32)]
+
+    flops = M * K * N
+    extra_flops = math.inf
+    best_pad = (0, 0, 0)
+    for padding in candidates:
+        dm, dk, dn = _pad_to(M, K, N, padding)
+        e = (M + dm) * (N + dn) * (K + dk) - M * N * K
+        # print(dm, dk, dn, e, flops)
+        if e < extra_flops:
+            extra_flops = e
+            best_pad = (dm, dk, dn)
+    return best_pad, extra_flops / flops
+
+
+def _pad_to(M, K, N, PADDING):
+    dm, dk, dn = 0, 0, 0
+
+    if M % PADDING[0] != 0:
+        M_ = ((M + PADDING[0]) // PADDING[0]) * PADDING[0]
+        dm = M_ - M
+    if K % PADDING[1] != 0:
+        K_ = ((K + PADDING[1]) // PADDING[1]) * PADDING[1]
+        dk = K_ - K
+    if N % PADDING[2] != 0:
+        N_ = ((N + PADDING[2]) // PADDING[2]) * PADDING[2]
+        dn = N_ - N
+
+    return dm, dk, dn
diff --git a/python/tvm/topi/nn/batch_matmul.py b/python/tvm/topi/nn/batch_matmul.py
index 9ca2df7c46e1..9c5848129397 100644
--- a/python/tvm/topi/nn/batch_matmul.py
+++ b/python/tvm/topi/nn/batch_matmul.py
@@ -16,6 +16,7 @@
 # under the License.
 """Batch matrix multiplication"""
 # pylint: disable=invalid-name
+import tvm
 from tvm import te, auto_scheduler
 from ..utils import get_const_tuple
 
@@ -77,3 +78,26 @@ def batch_matmul(x, y, oshape=None, auto_scheduler_rewritten_layout=""):
         output = auto_scheduler.rewrite_compute_body(output, auto_scheduler_rewritten_layout)
 
     return output
+
+
+@tvm.target.generic_func
+def batch_matmul_legalize(attrs, inputs, types):
+    """Legalizes batch_matmul op.
+
+    Parameters
+    ----------
+    attrs : tvm.ir.Attrs
+        Attributes of current batch_matmul
+    inputs : list of tvm.relay.Expr
+        The args of the Relay expr to be legalized
+    types : list of types
+        List of input and output types
+
+    Returns
+    -------
+    result : tvm.relay.Expr
+        The legalized expr
+    """
+    # not to change by default
+    # pylint: disable=unused-argument
+    return None
diff --git a/python/tvm/topi/nn/dense.py b/python/tvm/topi/nn/dense.py
index 474fea42a7cb..bb6ea90c3fcd 100644
--- a/python/tvm/topi/nn/dense.py
+++ b/python/tvm/topi/nn/dense.py
@@ -15,6 +15,7 @@
 # specific language governing permissions and limitations
 # under the License.
 """TVM operator fully connected compute."""
+import tvm
 from tvm import te, auto_scheduler
 from .. import tag
 
@@ -80,3 +81,26 @@ def dense(data, weight, bias=None, out_dtype=None, auto_scheduler_rewritten_layo
         matmul = auto_scheduler.rewrite_compute_body(matmul, auto_scheduler_rewritten_layout)
 
     return matmul
+
+
+@tvm.target.generic_func
+def dense_legalize(attrs, inputs, types):
+    """Legalizes dense op.
+
+    Parameters
+    ----------
+    attrs : tvm.ir.Attrs
+        Attributes of current dense
+    inputs : list of tvm.relay.Expr
+        The args of the Relay expr to be legalized
+    types : list of types
+        List of input and output types
+
+    Returns
+    -------
+    result : tvm.relay.Expr
+        The legalized expr
+    """
+    # not to change by default
+    # pylint: disable=unused-argument
+    return None
diff --git a/tests/python/relay/test_pass_legalize_tensorcore.py b/tests/python/relay/test_pass_legalize_tensorcore.py
new file mode 100644
index 000000000000..5ecda4ba07a8
--- /dev/null
+++ b/tests/python/relay/test_pass_legalize_tensorcore.py
@@ -0,0 +1,239 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Test legalize pass"""
+import numpy as np
+import tvm
+from tvm import te
+from tvm import topi
+from tvm import relay
+from tvm.contrib import graph_runtime
+from tvm.relay import transform, analysis
+from tvm.relay.testing.temp_op_attr import TempOpAttr
+
+
+def run_opt_pass(expr, passes):
+    passes = passes if isinstance(passes, list) else [passes]
+    mod = tvm.IRModule.from_expr(expr)
+    seq = tvm.transform.Sequential(passes)
+    with tvm.transform.PassContext(opt_level=3):
+        mod = seq(mod)
+    entry = mod["main"]
+    return entry if isinstance(expr, relay.Function) else entry.body
+
+
+@tvm.testing.uses_gpu
+def test_legalize_conv2d():
+    """test legalize conv2d to enable tensorcore"""
+
+    def _test_legalize_conv2d(data_shape, kernel_shape, pad_shape, do_pad=True):
+        out_channel = kernel_shape[3]
+        out_shape = list(data_shape)
+        out_shape[3] = out_channel
+        db, di, do = pad_shape
+
+        def before():
+            x = relay.var("x", shape=data_shape, dtype="float16")
+            weight = relay.var("weight", shape=kernel_shape, dtype="float16")
+            y = relay.nn.conv2d(
+                x,
+                weight,
+                channels=out_channel,
+                kernel_size=(3, 3),
+                padding=(1, 1),
+                data_layout="NHWC",
+                kernel_layout="HWIO",
+            )
+            y = relay.Function([x, weight], y)
+            return y
+
+        def legalize_conv2d(attrs, inputs, types):
+            with tvm.target.Target("cuda"):
+                return topi.nn.conv2d_legalize(attrs, inputs, types)
+
+        def expected():
+            if not do_pad:
+                return before()
+            x = relay.var("x", shape=data_shape, dtype="float16")
+            if db or di:
+                x_pad = relay.nn.pad(x, pad_width=((0, db), (0, 0), (0, 0), (0, di)))
+            else:
+                x_pad = x
+            weight = relay.var("weight", shape=(kernel_shape), dtype="float16")
+            if di or do:
+                weight_pad = relay.nn.pad(weight, pad_width=((0, 0), (0, 0), (0, di), (0, do)))
+            else:
+                weight_pad = weight
+            y_pad = relay.nn.conv2d(
+                x_pad,
+                weight=weight_pad,
+                channels=out_channel + do,
+                kernel_size=(3, 3),
+                padding=(1, 1),
+                data_layout="NHWC",
+                kernel_layout="HWIO",
+            )
+            if db or do:
+                y = relay.strided_slice(y_pad, begin=[0, 0, 0, 0], end=out_shape)
+            else:
+                y = y_pad
+            y = relay.Function([x, weight], y)
+            return y
+
+        with TempOpAttr("nn.conv2d", "FTVMLegalize", legalize_conv2d):
+            a = before()
+            a = run_opt_pass(a, transform.Legalize())
+            b = run_opt_pass(expected(), transform.InferType())
+        assert tvm.ir.structural_equal(a, b), "Actual = \n" + str(a) + "Expected = \n" + str(b)
+
+    # conv2d pad batch
+    _test_legalize_conv2d((7, 16, 16, 64), (3, 3, 64, 64), (1, 0, 0))
+    _test_legalize_conv2d((3, 16, 16, 64), (3, 3, 64, 64), (5, 0, 0))
+    _test_legalize_conv2d((2, 16, 16, 64), (3, 3, 64, 64), (0, 0, 0), False)
+    # conv2d pad in_channel
+    _test_legalize_conv2d((8, 16, 16, 63), (3, 3, 63, 64), (0, 1, 0))
+    _test_legalize_conv2d((8, 16, 16, 33), (3, 3, 33, 64), (0, 15, 0))
+    _test_legalize_conv2d((8, 16, 16, 13), (3, 3, 13, 64), (0, 3, 0))
+    _test_legalize_conv2d((8, 16, 16, 1), (3, 3, 1, 64), (0, 0, 0), False)
+    # conv2d pad out_channel
+    _test_legalize_conv2d((8, 16, 16, 64), (3, 3, 64, 63), (0, 0, 1))
+    _test_legalize_conv2d((8, 16, 16, 64), (3, 3, 64, 33), (0, 0, 31))
+    _test_legalize_conv2d((8, 16, 16, 64), (3, 3, 64, 1), (0, 0, 0), False)
+
+
+@tvm.testing.uses_gpu
+def test_legalize_dense():
+    def _test_legalize_dense(data_shape, kernel_shape, pad_shape, do_pad=True):
+        """test legalize dense to enable tensorcore"""
+        M, K = data_shape
+        N, _ = kernel_shape
+        out_shape = (M, N)
+        dm, dk, dn = pad_shape
+
+        def before():
+            x = relay.var("x", shape=data_shape, dtype="float16")
+            weight = relay.var("weight", shape=kernel_shape, dtype="float16")
+            y = relay.nn.dense(x, weight)
+            y = relay.Function([x, weight], y)
+            return y
+
+        def legalize_dense(attrs, inputs, types):
+            with tvm.target.Target("cuda"):
+                return topi.nn.dense_legalize(attrs, inputs, types)
+
+        def expected():
+            if not do_pad:
+                return before()
+            x = relay.var("x", shape=data_shape, dtype="float16")
+            if dm or dk:
+                x_pad = relay.nn.pad(x, pad_width=((0, dm), (0, dk)))
+            else:
+                x_pad = x
+            weight = relay.var("weight", shape=(kernel_shape), dtype="float16")
+            if dn or dk:
+                weight_pad = relay.nn.pad(weight, pad_width=((0, dn), (0, dk)))
+            else:
+                weight_pad = weight
+            y_pad = relay.nn.dense(
+                x_pad,
+                weight_pad,
+            )
+            if dm or dn:
+                y = relay.strided_slice(y_pad, begin=[0, 0], end=out_shape)
+            else:
+                y = y_pad
+            y = relay.Function([x, weight], y)
+            return y
+
+        with TempOpAttr("nn.dense", "FTVMLegalize", legalize_dense):
+            a = before()
+            a = run_opt_pass(a, transform.Legalize())
+            b = run_opt_pass(expected(), transform.InferType())
+        assert tvm.ir.structural_equal(a, b), "Actual = \n" + str(a) + "Expected = \n" + str(b)
+
+    # dense
+    _test_legalize_dense((8, 16), (32, 16), (0, 0, 0), False)
+    _test_legalize_dense((7, 16), (32, 16), (1, 0, 0))
+    _test_legalize_dense((8, 15), (32, 15), (0, 1, 0))
+    _test_legalize_dense((8, 16), (31, 16), (0, 0, 1))
+    _test_legalize_dense((7, 15), (31, 15), (1, 1, 1))
+    _test_legalize_dense((3, 16), (32, 16), (5, 0, 0))
+    _test_legalize_dense((2, 16), (32, 16), (0, 0, 0), False)
+
+
+@tvm.testing.uses_gpu
+def test_legalize_batch_matmul():
+    def _test_legalize_batch_matmul(data_shape, kernel_shape, pad_shape, do_pad=True):
+        """test legalize dense to enable tensorcore"""
+        B, M, _ = data_shape
+        _, N, _ = kernel_shape
+        out_shape = (B, M, N)
+        dm, dk, dn = pad_shape
+
+        def before():
+            x = relay.var("x", shape=data_shape, dtype="float16")
+            weight = relay.var("weight", shape=kernel_shape, dtype="float16")
+            y = relay.nn.batch_matmul(x, weight)
+            y = relay.Function([x, weight], y)
+            return y
+
+        def legalize_batch_matmul(attrs, inputs, types):
+            with tvm.target.Target("cuda"):
+                return topi.nn.batch_matmul_legalize(attrs, inputs, types)
+
+        def expected():
+            if not do_pad:
+                return before()
+            x = relay.var("x", shape=data_shape, dtype="float16")
+            if dm or dk:
+                x_pad = relay.nn.pad(x, pad_width=((0, 0), (0, dm), (0, dk)))
+            else:
+                x_pad = x
+            weight = relay.var("weight", shape=(kernel_shape), dtype="float16")
+            if dn or dk:
+                weight_pad = relay.nn.pad(weight, pad_width=((0, 0), (0, dn), (0, dk)))
+            else:
+                weight_pad = weight
+            y_pad = relay.nn.batch_matmul(
+                x_pad,
+                weight_pad,
+            )
+            if dm or dn:
+                y = relay.strided_slice(y_pad, begin=[0, 0, 0], end=out_shape)
+            else:
+                y = y_pad
+            y = relay.Function([x, weight], y)
+            return y
+
+        with TempOpAttr("nn.batch_matmul", "FTVMLegalize", legalize_batch_matmul):
+            a = before()
+            a = run_opt_pass(a, transform.Legalize())
+            b = run_opt_pass(expected(), transform.InferType())
+        assert tvm.ir.structural_equal(a, b), "Actual = \n" + str(a) + "Expected = \n" + str(b)
+
+    _test_legalize_batch_matmul((16, 8, 16), (16, 32, 16), (0, 0, 0), False)
+    _test_legalize_batch_matmul((16, 7, 16), (16, 32, 16), (1, 0, 0))
+    _test_legalize_batch_matmul((16, 8, 15), (16, 32, 15), (0, 1, 0))
+    _test_legalize_batch_matmul((16, 8, 16), (16, 31, 16), (0, 0, 1))
+    _test_legalize_batch_matmul((16, 7, 15), (16, 31, 15), (1, 1, 1))
+    _test_legalize_batch_matmul((16, 3, 16), (16, 32, 16), (5, 0, 0))
+    _test_legalize_batch_matmul((16, 2, 16), (16, 32, 16), (0, 0, 0), False)
+
+
+if __name__ == "__main__":
+    test_legalize_conv2d()
+    test_legalize_dense()
+    test_legalize_batch_matmul()

From e8926199b8c25ad11eb8641c4aef8efec13b9380 Mon Sep 17 00:00:00 2001
From: masahi <masahi129@gmail.com>
Date: Mon, 1 Feb 2021 00:42:29 +0900
Subject: [PATCH 140/357] swap pytorch and tvm import order (#7380)

---
 .../python/frontend/pytorch/test_object_detection.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/tests/python/frontend/pytorch/test_object_detection.py b/tests/python/frontend/pytorch/test_object_detection.py
index 6b7f9be06d99..3c94b0b846d8 100644
--- a/tests/python/frontend/pytorch/test_object_detection.py
+++ b/tests/python/frontend/pytorch/test_object_detection.py
@@ -17,8 +17,6 @@
 # pylint: disable=import-self, invalid-name, unused-argument
 """Test torch vision fasterrcnn and maskrcnn models"""
 import numpy as np
-import torch
-import torchvision
 import cv2
 
 import tvm
@@ -33,6 +31,8 @@
 )
 from tvm.contrib.download import download
 
+import torch
+import torchvision
 
 in_size = 300
 
@@ -150,10 +150,10 @@ def compile_and_run_vm(mod, params, data_np, target):
     after = mod["main"]
     assert not tvm.ir.structural_equal(after, before)
 
-    # before = mod["main"]
-    # mod = rewrite_scatter_to_gather(mod, 4)  # num_scales is 4 for maskrcnn_resnet50_fpn
-    # after = mod["main"]
-    # assert not tvm.ir.structural_equal(after, before)
+    before = mod["main"]
+    mod = rewrite_scatter_to_gather(mod, 4)  # num_scales is 4 for maskrcnn_resnet50_fpn
+    after = mod["main"]
+    assert not tvm.ir.structural_equal(after, before)
 
     tvm_res_after_rewrite = compile_and_run_vm(mod, params, data_np, "llvm")
 

From 96b09817fc1796a789524ae30cd2d7e9d6f73d6c Mon Sep 17 00:00:00 2001
From: masahi <masahi129@gmail.com>
Date: Mon, 1 Feb 2021 10:20:09 +0900
Subject: [PATCH 141/357] disable other rewrite to test CI (#7371)

---
 .../python/frontend/pytorch/test_object_detection.py | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/tests/python/frontend/pytorch/test_object_detection.py b/tests/python/frontend/pytorch/test_object_detection.py
index 3c94b0b846d8..a404a88393bc 100644
--- a/tests/python/frontend/pytorch/test_object_detection.py
+++ b/tests/python/frontend/pytorch/test_object_detection.py
@@ -122,7 +122,7 @@ def compile_and_run_vm(mod, params, data_np, target):
         vm.set_input("main", **{input_name: data_np})
         return vm.run()
 
-    for target in ["cuda", "llvm"]:
+    for target in ["llvm"]:
         tvm_res = compile_and_run_vm(mod, params, data_np, target)
 
         # Bounding boxes
@@ -145,10 +145,12 @@ def compile_and_run_vm(mod, params, data_np, target):
     after = mod["main"]
     assert not tvm.ir.structural_equal(after, before)
 
-    before = mod["main"]
-    mod = rewrite_batched_nms_with_max_out_size(mod)
-    after = mod["main"]
-    assert not tvm.ir.structural_equal(after, before)
+    # TODO(masahi): It seems this rewrite causes flaky segfaults on CI
+    # See https://github.com/apache/tvm/issues/7363
+    # before = mod["main"]
+    # mod = rewrite_batched_nms_with_max_out_size(mod)
+    # after = mod["main"]
+    # assert not tvm.ir.structural_equal(after, before)
 
     before = mod["main"]
     mod = rewrite_scatter_to_gather(mod, 4)  # num_scales is 4 for maskrcnn_resnet50_fpn

From 2365c7ee6620f672c172247afc73da3884165884 Mon Sep 17 00:00:00 2001
From: yhj050806 <33542485+yhj050806@users.noreply.github.com>
Date: Mon, 1 Feb 2021 15:12:48 +0800
Subject: [PATCH 142/357] fix duplicated symbol bug in external codegen (#7383)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: 袁航剑 <yuanhangjian@bytedance.com>
---
 src/relay/backend/compile_engine.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/relay/backend/compile_engine.cc b/src/relay/backend/compile_engine.cc
index a66ae0a7e2c0..ed09e4f6eb32 100644
--- a/src/relay/backend/compile_engine.cc
+++ b/src/relay/backend/compile_engine.cc
@@ -651,10 +651,10 @@ class CompileEngineImpl : public CompileEngineNode {
                                       << AsText(src_func, false);
 
         std::string sn = symbol_name.value();
-        if (cached_symbol.count(sn)) {
+        if (!cached_symbol.count(sn)) {
           cached_symbol[sn] = code_gen_name;
         } else {
-          ICHECK_NE(sn, code_gen_name)
+          ICHECK_NE(cached_symbol[sn], code_gen_name)
               << "Found duplicated symbol: " << sn << " for: " << code_gen_name;
         }
 

From 0d303b46cac4420f2b83ed3a0f445060d2fd5982 Mon Sep 17 00:00:00 2001
From: masahi <masahi129@gmail.com>
Date: Tue, 2 Feb 2021 06:39:41 +0900
Subject: [PATCH 143/357] [Parser] Fix tokenizing inf (#7370)

* fix tokenizing inf

* use ParseNumber to parse inf, handle -inf

* fix neg handling

* fixed multi negation

* refactor

* use while loop

* simplyfing

* fix lint

* simpler implementation per altan's suggestion

* disable flaky test
---
 src/parser/tokenizer.h               | 61 +++++++++++++++-------------
 tests/python/contrib/test_cudnn.py   |  3 +-
 tests/python/relay/test_ir_parser.py | 14 ++++++-
 3 files changed, 47 insertions(+), 31 deletions(-)

diff --git a/src/parser/tokenizer.h b/src/parser/tokenizer.h
index c6fb3e09f4d1..5e71794cc7fb 100644
--- a/src/parser/tokenizer.h
+++ b/src/parser/tokenizer.h
@@ -212,6 +212,25 @@ struct Tokenizer {
     }
   }
 
+  Token ParseNumber(bool is_pos) {
+    std::stringstream ss;
+    while (More() && IsNumeric(Peek())) {
+      ss << Next();
+    }
+
+    bool is_float = false;
+
+    // Remove trailing floating point prefix.
+    if (More() && Peek() == 'f') {
+      ss << Next();
+      while (More() && IsNumeric(Peek())) {
+        ss << Next();
+      }
+      is_float = true;
+    }
+    return ParseNumber(is_pos, is_float, ss.str());
+  }
+
   bool MatchString(const std::string& string) {
     int start = this->pos;
 
@@ -340,38 +359,28 @@ struct Tokenizer {
       auto token = NewToken(TokenType::kWhitespace);
       Next();
       return token;
-    } else if (IsDigit(next) || next == '-') {
+    } else if (next == '-') {
       int negs = 0;
       while (More() && Peek() == '-') {
         Next();
         negs++;
       }
-      // If there isn't a number right after either,
-      // this is really slow for lexing, should replace
-      // with multi-token return or something.
-      if (negs && !IsDigit(Peek())) {
+      bool is_neg = negs % 2 == 1;
+      if (More() && IsDigit(Peek())) {
+        return ParseNumber(!is_neg);
+      } else if (More() && MatchString("inff")) {
+        return ParseNumber(!is_neg, true, "inff");
+      } else {
+        // If there isn't a number right after either,
+        // this is really slow for lexing, should replace
+        // with multi-token return or something.
         pos = pos - (negs - 1);
         return NewToken(TokenType::kMinus);
       }
-
-      bool is_neg = negs % 2 == 1;
-      std::stringstream ss;
-      while (More() && IsNumeric(Peek())) {
-        ss << Next();
-      }
-
-      bool is_float = false;
-
-      // Remove trailing floating point prefix.
-      if (More() && Peek() == 'f') {
-        ss << Next();
-        while (More() && IsNumeric(Peek())) {
-          ss << Next();
-        }
-        is_float = true;
-      }
-
-      return ParseNumber(!is_neg, is_float, ss.str());
+    } else if (IsDigit(next)) {
+      return ParseNumber(true);
+    } else if (MatchString("inff")) {
+      return ParseNumber(true, true, "inff");
     } else if (next == '.') {
       auto token = NewToken(TokenType::kPeriod);
       Next();
@@ -404,10 +413,6 @@ struct Tokenizer {
       auto token = NewToken(TokenType::kPlus);
       Next();
       return token;
-    } else if (next == '-') {
-      auto token = NewToken(TokenType::kMinus);
-      Next();
-      return token;
     } else if (next == '*') {
       auto token = NewToken(TokenType::kStar);
       Next();
diff --git a/tests/python/contrib/test_cudnn.py b/tests/python/contrib/test_cudnn.py
index b07f2b2fe96c..514f529b4692 100644
--- a/tests/python/contrib/test_cudnn.py
+++ b/tests/python/contrib/test_cudnn.py
@@ -93,7 +93,8 @@ def verify_conv2d(data_dtype, conv_dtype, tensor_format=0, groups=1):
 def test_conv2d():
     verify_conv2d("float32", "float32", tensor_format=0)
     verify_conv2d("float16", "float32", tensor_format=1)
-    verify_conv2d("float16", "float16", tensor_format=0)
+    # This test is flaky, disable for now
+    # verify_conv2d("float16", "float16", tensor_format=0)
     verify_conv2d("int8", "int32", tensor_format=1)
 
     verify_conv2d("float32", "float32", tensor_format=0, groups=2)
diff --git a/tests/python/relay/test_ir_parser.py b/tests/python/relay/test_ir_parser.py
index 162271756557..70fb56049873 100644
--- a/tests/python/relay/test_ir_parser.py
+++ b/tests/python/relay/test_ir_parser.py
@@ -14,14 +14,14 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
+import numpy as np
+
 import tvm
-from tvm import te
 from tvm import relay
 import tvm.relay.testing
 import pytest
 from numpy import isclose
 from typing import Union
-from functools import wraps
 
 
 SEMVER = '#[version = "0.0.5"]\n'
@@ -910,6 +910,16 @@ def test_load_prelude():
     tvm.parser.parse(mod.astext())
 
 
+def test_tokenize_inf():
+    x = relay.var("x", shape=(3, 4), dtype="float32")
+    y = relay.clip(x, -np.inf, np.inf)
+
+    f = relay.Function([x], y)
+    mod = tvm.IRModule.from_expr(f)
+
+    mod = relay.transform.AnnotateSpans()(mod)
+
+
 if __name__ == "__main__":
     import sys
 

From f7e05c3cadf142e1482b01385e30b93a9f7679da Mon Sep 17 00:00:00 2001
From: Cody Yu <comaniac0422@gmail.com>
Date: Mon, 1 Feb 2021 15:31:17 -0800
Subject: [PATCH 144/357] Improve op_type missing message (#7384)

---
 include/tvm/ir/op.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/tvm/ir/op.h b/include/tvm/ir/op.h
index c73be3c1e564..9456ea80d860 100644
--- a/include/tvm/ir/op.h
+++ b/include/tvm/ir/op.h
@@ -146,7 +146,7 @@ class OpNode : public RelayExprNode {
   // Internal function to compute if it is primitive op
   bool IsPrimitiveOp_() const {
     const auto& fn_ty = this->op_type;
-    ICHECK(fn_ty.get() != nullptr);
+    ICHECK(fn_ty.get() != nullptr) << "op_type of " << this->name << "is not registered";
     if (fn_ty->type_constraints.size() != 1) return false;
     const TypeRelationNode* rel = fn_ty->type_constraints[0].as<TypeRelationNode>();
     if (rel == nullptr) return false;

From 0bd259ae24852e36b74b44ea0c0acde2a0bd91a6 Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Mon, 1 Feb 2021 18:54:05 -0500
Subject: [PATCH 145/357] [COMMUNITY] @hzfan -> reviewer (#7360)

---
 CONTRIBUTORS.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index 773f94a50dd9..42f7a8f81701 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -89,6 +89,7 @@ We do encourage everyone to work anything they are interested in.
 - [Neo Chien](https://github.com/cchung100m): @cchung100m
 - [Meghan Cowan](https://github.com/cowanmeg): @cowanmeg
 - [Balint Cristian](https://github.com/cbalint13): @cbalint13
+- [Haozheng Fan](https://github.com/hzfan): @hzfan
 - [Josh Fromm](https://github.com/jwfromm): @jwfromm
 - [Sergei Grechanik](https://github.com/sgrechanik-h): @sgrechanik-h
 - [Hao Lu](https://github.com/hlu1): @hlu1

From 3635945e48d9d1a7d8e76df418f057a4a3b88dc4 Mon Sep 17 00:00:00 2001
From: Matthew Brookhart <mbrookhart@octoml.ai>
Date: Mon, 1 Feb 2021 16:55:23 -0700
Subject: [PATCH 146/357] Refactor Dynamic to Static (#7368)

* DynamicToStatic Refactor

* fix test

* add regression tests

* cleanup

* skip PrepareInput if the arg is already a constant

* fix an issue with type inference with global functions
---
 src/relay/transforms/dynamic_to_static.cc     | 155 +++++++++++-------
 .../relay/test_pass_dynamic_to_static.py      |  44 ++++-
 2 files changed, 138 insertions(+), 61 deletions(-)

diff --git a/src/relay/transforms/dynamic_to_static.cc b/src/relay/transforms/dynamic_to_static.cc
index c580f60c2a68..815e4d224cc5 100644
--- a/src/relay/transforms/dynamic_to_static.cc
+++ b/src/relay/transforms/dynamic_to_static.cc
@@ -34,27 +34,30 @@ namespace relay {
 
 class DynamicToStaticMutator : public MixedModeMutator {
  public:
-  DynamicToStaticMutator() {
+  DynamicToStaticMutator(IRModule mod, Function func) : mod_(mod), func_(func) {
     op_map_ = {
         {Op::Get("dyn.reshape"),
-         [](const CallNode* call_node) {
-           if (const ConstantNode* shape = call_node->args[1].as<ConstantNode>()) {
+         [this](const CallNode* call_node) {
+           auto args = PrepareArgs(call_node);
+           if (const ConstantNode* shape = args[1].as<ConstantNode>()) {
              ICHECK_EQ(shape->data->ndim, 1);
              return MakeReshape(call_node->args[0], ToVector(shape->data));
            }
            return Expr(nullptr);
          }},
         {Op::Get("dyn.tile"),
-         [](const CallNode* call_node) {
-           if (const ConstantNode* reps = call_node->args[1].as<ConstantNode>()) {
+         [this](const CallNode* call_node) {
+           auto args = PrepareArgs(call_node);
+           if (const ConstantNode* reps = args[1].as<ConstantNode>()) {
              ICHECK_EQ(reps->data->ndim, 1);
              return MakeTile(call_node->args[0], ToVector(reps->data));
            }
            return Expr(nullptr);
          }},
         {Op::Get("dyn.topk"),
-         [](const CallNode* call_node) {
-           if (const ConstantNode* k = call_node->args[1].as<ConstantNode>()) {
+         [this](const CallNode* call_node) {
+           auto args = PrepareArgs(call_node);
+           if (const ConstantNode* k = args[1].as<ConstantNode>()) {
              const TopKAttrs* param = call_node->attrs.as<TopKAttrs>();
              ICHECK(param);
              return MakeTopK(call_node->args[0], static_cast<int>(ToScalar(k->data, 0)),
@@ -63,16 +66,18 @@ class DynamicToStaticMutator : public MixedModeMutator {
            return Expr(nullptr);
          }},
         {Op::Get("dyn.broadcast_to"),
-         [](const CallNode* call_node) {
-           if (const ConstantNode* shape = call_node->args[1].as<ConstantNode>()) {
+         [this](const CallNode* call_node) {
+           auto args = PrepareArgs(call_node);
+           if (const ConstantNode* shape = args[1].as<ConstantNode>()) {
              ICHECK_EQ(shape->data->ndim, 1);
              return MakeBroadCastTo(call_node->args[0], ToVector(shape->data));
            }
            return Expr(nullptr);
          }},
         {Op::Get("dyn.zeros"),
-         [](const CallNode* call_node) {
-           if (const ConstantNode* shape = call_node->args[0].as<ConstantNode>()) {
+         [this](const CallNode* call_node) {
+           auto args = PrepareArgs(call_node);
+           if (const ConstantNode* shape = args[0].as<ConstantNode>()) {
              const InitOpAttrs* param = call_node->attrs.as<InitOpAttrs>();
              ICHECK(param);
              return MakeZeros(ToVector(shape->data), param->dtype);
@@ -80,8 +85,9 @@ class DynamicToStaticMutator : public MixedModeMutator {
            return Expr(nullptr);
          }},
         {Op::Get("dyn.ones"),
-         [](const CallNode* call_node) {
-           if (const ConstantNode* shape = call_node->args[0].as<ConstantNode>()) {
+         [this](const CallNode* call_node) {
+           auto args = PrepareArgs(call_node);
+           if (const ConstantNode* shape = args[0].as<ConstantNode>()) {
              const InitOpAttrs* param = call_node->attrs.as<InitOpAttrs>();
              ICHECK(param);
              return MakeOnes(ToVector(shape->data), param->dtype);
@@ -89,8 +95,9 @@ class DynamicToStaticMutator : public MixedModeMutator {
            return Expr(nullptr);
          }},
         {Op::Get("dyn.one_hot"),
-         [](const CallNode* call_node) {
-           if (const ConstantNode* depth = call_node->args[3].as<ConstantNode>()) {
+         [this](const CallNode* call_node) {
+           auto args = PrepareArgs(call_node);
+           if (const ConstantNode* depth = args[3].as<ConstantNode>()) {
              const OneHotAttrs* param = call_node->attrs.as<OneHotAttrs>();
              ICHECK(param);
              return MakeOneHot(call_node->args[0], call_node->args[1], call_node->args[2],
@@ -100,8 +107,9 @@ class DynamicToStaticMutator : public MixedModeMutator {
            return Expr(nullptr);
          }},
         {Op::Get("dyn.image.resize"),
-         [](const CallNode* call_node) {
-           if (const ConstantNode* size = call_node->args[1].as<ConstantNode>()) {
+         [this](const CallNode* call_node) {
+           auto args = PrepareArgs(call_node);
+           if (const ConstantNode* size = args[1].as<ConstantNode>()) {
              const ResizeAttrs* param = call_node->attrs.as<ResizeAttrs>();
              ICHECK(param);
              auto size_int = ToVector(size->data);
@@ -115,8 +123,9 @@ class DynamicToStaticMutator : public MixedModeMutator {
            return Expr(nullptr);
          }},
         {Op::Get("dyn.full"),
-         [](const CallNode* call_node) {
-           if (const ConstantNode* shape = call_node->args[1].as<ConstantNode>()) {
+         [this](const CallNode* call_node) {
+           auto args = PrepareArgs(call_node);
+           if (const ConstantNode* shape = args[1].as<ConstantNode>()) {
              ICHECK_EQ(shape->data->ndim, 1);
              const InitOpAttrs* param = call_node->attrs.as<InitOpAttrs>();
              ICHECK(param);
@@ -125,9 +134,10 @@ class DynamicToStaticMutator : public MixedModeMutator {
            return Expr(nullptr);
          }},
         {Op::Get("dyn.nn.upsampling"),
-         [](const CallNode* call_node) {
-           const ConstantNode* scale_h = call_node->args[1].as<ConstantNode>();
-           const ConstantNode* scale_w = call_node->args[2].as<ConstantNode>();
+         [this](const CallNode* call_node) {
+           auto args = PrepareArgs(call_node);
+           const ConstantNode* scale_h = args[1].as<ConstantNode>();
+           const ConstantNode* scale_w = args[2].as<ConstantNode>();
            if (scale_h && scale_w) {
              ICHECK_EQ(scale_h->data->ndim, 0);
              ICHECK_EQ(scale_w->data->ndim, 0);
@@ -140,10 +150,11 @@ class DynamicToStaticMutator : public MixedModeMutator {
            return Expr(nullptr);
          }},
         {Op::Get("dyn.nn.upsampling3d"),
-         [](const CallNode* call_node) {
-           const ConstantNode* scale_d = call_node->args[1].as<ConstantNode>();
-           const ConstantNode* scale_h = call_node->args[2].as<ConstantNode>();
-           const ConstantNode* scale_w = call_node->args[3].as<ConstantNode>();
+         [this](const CallNode* call_node) {
+           auto args = PrepareArgs(call_node);
+           const ConstantNode* scale_d = args[1].as<ConstantNode>();
+           const ConstantNode* scale_h = args[2].as<ConstantNode>();
+           const ConstantNode* scale_w = args[3].as<ConstantNode>();
            if (scale_d && scale_h && scale_w) {
              ICHECK_EQ(scale_d->data->ndim, 0);
              ICHECK_EQ(scale_h->data->ndim, 0);
@@ -159,9 +170,10 @@ class DynamicToStaticMutator : public MixedModeMutator {
            return Expr(nullptr);
          }},
         {Op::Get("dyn.nn.pad"),
-         [](const CallNode* call_node) {
-           const ConstantNode* pad_width = call_node->args[1].as<ConstantNode>();
-           const ConstantNode* pad_fill = call_node->args[2].as<ConstantNode>();
+         [this](const CallNode* call_node) {
+           auto args = PrepareArgs(call_node);
+           const ConstantNode* pad_width = args[1].as<ConstantNode>();
+           const ConstantNode* pad_fill = args[2].as<ConstantNode>();
            if (pad_width && pad_fill) {
              ICHECK_EQ(pad_fill->data->ndim, 0);   // pad_val is 1d
              ICHECK_EQ(pad_width->data->ndim, 2);  // pad_width is 2d
@@ -174,10 +186,11 @@ class DynamicToStaticMutator : public MixedModeMutator {
            return Expr(nullptr);
          }},
         {Op::Get("dyn.strided_slice"),
-         [](const CallNode* call_node) {
-           const ConstantNode* begin = call_node->args[1].as<ConstantNode>();
-           const ConstantNode* end = call_node->args[2].as<ConstantNode>();
-           const ConstantNode* stride = call_node->args[3].as<ConstantNode>();
+         [this](const CallNode* call_node) {
+           auto args = PrepareArgs(call_node);
+           const ConstantNode* begin = args[1].as<ConstantNode>();
+           const ConstantNode* end = args[2].as<ConstantNode>();
+           const ConstantNode* stride = args[3].as<ConstantNode>();
            if (begin && end && stride) {
              ICHECK_EQ(begin->data->ndim, 1);
              ICHECK_EQ(end->data->ndim, 1);
@@ -190,8 +203,9 @@ class DynamicToStaticMutator : public MixedModeMutator {
            return Expr(nullptr);
          }},
         {Op::Get("dyn.sparse_to_dense"),
-         [](const CallNode* call_node) {
-           const ConstantNode* output_shape = call_node->args[3].as<ConstantNode>();
+         [this](const CallNode* call_node) {
+           auto args = PrepareArgs(call_node);
+           const ConstantNode* output_shape = args[3].as<ConstantNode>();
            if (output_shape) {
              ICHECK_EQ(output_shape->data->ndim, 1);
              return MakeSparseToDense(call_node->args[0], ToVector(output_shape->data),
@@ -200,6 +214,45 @@ class DynamicToStaticMutator : public MixedModeMutator {
            return Expr(nullptr);
          }},
     };
+    Map<BaseFunc, GlobalVar> vars;
+    for (auto kv : mod_->functions) {
+      vars.Set(kv.second, kv.first);
+    }
+    gv_ = vars[func_];
+  }
+
+  Expr PrepareInput(const Expr& expr) {
+    BaseFunc func;
+    if (auto* func_node = expr.as<BaseFuncNode>()) {
+      func = GetRef<BaseFunc>(func_node);
+    } else {
+      func =
+          relay::Function(relay::FreeVars(expr), expr, Type(), relay::FreeTypeVars(expr, mod_), {});
+    }
+    mod_->Update(gv_, func);
+    mod_ = transform::FoldConstant()(mod_);
+    mod_ = transform::InferType()(mod_);
+    mod_ = transform::FoldConstant()(mod_);
+    mod_ = transform::InferType()(mod_);
+    Expr out;
+    if (expr.as<FunctionNode>()) {
+      out = mod_->Lookup(gv_);
+    } else {
+      out = mod_->Lookup(gv_).as<FunctionNode>()->body;
+    }
+    return out;
+  }
+
+  std::vector<Expr> PrepareArgs(const CallNode* call_node) {
+    std::vector<Expr> args;
+    for (auto arg : call_node->args) {
+      if (arg.as<ConstantNode>()) {
+        args.emplace_back(arg);
+      } else {
+        args.emplace_back(PrepareInput(arg));
+      }
+    }
+    return args;
   }
 
  private:
@@ -222,35 +275,19 @@ class DynamicToStaticMutator : public MixedModeMutator {
     }
     return post;
   }
+
   std::unordered_map<Expr, std::function<Expr(const CallNode*)>, ObjectPtrHash, ObjectPtrEqual>
       op_map_;
+  IRModule mod_;
+  Function func_;
+  GlobalVar gv_;
 };
 
 Expr DynamicToStatic(Function f, IRModule m) {
-  Expr pre = f;
-  Expr expr = f;
-  auto fold_const = transform::FoldConstant();
-  auto infer_type = transform::InferType();
-  DynamicToStaticMutator mutator;
-  Map<BaseFunc, GlobalVar> vars;
-  for (auto kv : m->functions) {
-    vars.Set(kv.second, kv.first);
-  }
-  const auto gv = vars[f];
-  // Put a limit on the while loop
-  // Primarily used to prevent accidental infinite lops in development
-  const int loop_limit = 1000;
-  int i = 0;
-  do {
-    pre = expr;
-    // TODO(mbrookhart): Is it possible to run these passes JUST on the current function?
-    m = infer_type(m);
-    m = fold_const(m);
-    expr = mutator.Mutate(m->functions[gv]);
-    m->Update(gv, Downcast<BaseFunc>(expr));
-    i += 1;
-  } while (!StructuralEqual()(pre, expr) && i < loop_limit);
-  return expr;
+  DynamicToStaticMutator mutator(m, f);
+  Expr expr = mutator.Mutate(f);
+  Expr out = mutator.PrepareInput(expr);
+  return out;
 }
 
 namespace transform {
diff --git a/tests/python/relay/test_pass_dynamic_to_static.py b/tests/python/relay/test_pass_dynamic_to_static.py
index 141023d77019..c9e047a38540 100644
--- a/tests/python/relay/test_pass_dynamic_to_static.py
+++ b/tests/python/relay/test_pass_dynamic_to_static.py
@@ -232,11 +232,11 @@ def verify_ones_zeros(shape, dtype):
 
             func = run_infer_type(relay.Function([x], y))
             func2 = run_opt_pass(
-                run_opt_pass(func, transform.DynamicToStatic()), transform.InferType()
+                run_opt_pass(func, transform.DynamicToStatic()),
+                transform.InferType(),
             )
 
             zz = func2.body
-            assert isinstance(zz, relay.Constant)
             assert zz.checked_type == relay.ty.TensorType(shape, dtype)
 
             x_data = np.random.uniform(low=1, high=1, size=shape)
@@ -518,5 +518,45 @@ def verify_sparse_to_dense(sparse_indices, sparse_values, default_value, output_
     verify_sparse_to_dense(1, 3, None, [5], [0, 3, 0, 0, 0])  # default value not specified
 
 
+@tvm.testing.uses_gpu
+def test_dynamic_to_static_dynamic_rank():
+    def verify_full(fill_value, fill_shape, dtype):
+        x = relay.var("x", relay.scalar_type(dtype))
+        y = relay.var("y", relay.TensorType(fill_shape, "int64"))
+        shape = relay.shape_of(y)
+        shape = relay.strided_slice(shape, [0], relay.shape_of(shape))
+        z = relay.full(x, shape, dtype)
+
+        func = relay.Function([x, y], z)
+        func2 = run_opt_pass(run_opt_pass(func, transform.DynamicToStatic()), transform.InferType())
+
+        zz = func2.body
+        assert isinstance(zz, relay.Call)
+        assert zz.op == relay.op.get("full")
+
+        ref_res = np.full(fill_shape, fill_value).astype(dtype)
+        y_data = np.random.uniform(low=-1, high=1, size=fill_shape).astype("int64")
+        verify_func(func2, [fill_value, y_data], ref_res)
+
+    verify_full(4, (1, 2, 3, 4), "int32")
+    verify_full(4.0, (1, 2, 8, 10), "float32")
+
+
+@tvm.testing.uses_gpu
+def test_dynamic_to_static_dynamic_if():
+    x = relay.var("x", relay.TensorType((2, 2), "int64"))
+    cond = relay.const(1)
+    iff = relay.If(cond, relay.reshape(x, [1, 4]), relay.reshape(x, (4, 1)))
+
+    func = relay.Function([x], iff)
+    func2 = run_opt_pass(run_opt_pass(func, transform.DynamicToStatic()), transform.InferType())
+
+    zz = func2.body
+    assert isinstance(zz, relay.Call)
+    assert zz.op == relay.op.get("reshape")
+    x_data = np.random.uniform(low=-1, high=1, size=(2, 2)).astype("int64")
+    verify_func(func2, [x_data], x_data.reshape(1, 4))
+
+
 if __name__ == "__main__":
     pytest.main([__file__])

From 0ab9c95a6d3b0a1bbe1f9fb141890058553d6260 Mon Sep 17 00:00:00 2001
From: Matthew Brookhart <mbrookhart@octoml.ai>
Date: Tue, 2 Feb 2021 10:38:20 -0700
Subject: [PATCH 147/357] [Relay][Passes] Iterative A-normal Traversals (#7374)

* [WIP][Relay][Passes] non-recursive a-normal traversals

* fix clang warning

* Refactor ANormal Iterative traversal into a higher order function utility with lambdas

* refactor missed pass

* add explict use of  to lamdbas
---
 include/tvm/relay/expr_functor.h      |  4 ++
 src/relay/analysis/util.cc            | 12 +++++
 src/relay/ir/expr_functor.cc          | 22 +++++++++
 src/relay/transforms/de_duplicate.cc  | 20 ++++++--
 src/relay/transforms/fold_constant.cc | 36 +++++++++-----
 src/relay/transforms/fuse_ops.cc      | 52 ++++++++++++++++----
 src/relay/transforms/type_infer.cc    | 70 ++++++++++++++++++++-------
 7 files changed, 175 insertions(+), 41 deletions(-)

diff --git a/include/tvm/relay/expr_functor.h b/include/tvm/relay/expr_functor.h
index 8589f8cc4f16..d53658f87f40 100644
--- a/include/tvm/relay/expr_functor.h
+++ b/include/tvm/relay/expr_functor.h
@@ -476,6 +476,10 @@ void ExpandDataflow(Expr expr, FCheckVisited fcheck_visited, FVisitLeaf fvisit_l
     }
   }
 }
+
+void ExpandANormalForm(const LetNode* op, std::function<void(const LetNode*)> pre_visit,
+                       std::function<void(const LetNode*)> post_visit);
+
 }  // namespace relay
 }  // namespace tvm
 #endif  // TVM_RELAY_EXPR_FUNCTOR_H_
diff --git a/src/relay/analysis/util.cc b/src/relay/analysis/util.cc
index abb9e6b034c2..90750575b9d4 100644
--- a/src/relay/analysis/util.cc
+++ b/src/relay/analysis/util.cc
@@ -141,6 +141,18 @@ class TypeVarEVisitor : private MixedModeVisitor {
     ExprVisitor::VisitExpr_(f);
   }
 
+  void VisitExpr_(const LetNode* op) final {
+    auto pre_visit = [this](const LetNode* op) {
+      this->VisitExpr(op->var);
+      this->VisitExpr(op->value);
+    };
+    auto post_visit = [this](const LetNode* op) {
+      this->VisitExpr(op->body);
+      this->visit_counter_[op] += 1;
+    };
+    ExpandANormalForm(op, pre_visit, post_visit);
+  }
+
   void VisitExpr_(const ConstructorNode* cn) final {
     // for constructors, type vars will be bound in the module
     auto data = mod_->LookupTypeDef(cn->belong_to);
diff --git a/src/relay/ir/expr_functor.cc b/src/relay/ir/expr_functor.cc
index 74095a753950..d70c6fe2dd1f 100644
--- a/src/relay/ir/expr_functor.cc
+++ b/src/relay/ir/expr_functor.cc
@@ -532,5 +532,27 @@ TVM_REGISTER_GLOBAL("relay.ir.Bind").set_body([](TVMArgs args, TVMRetValue* ret)
     *ret = Bind(Downcast<Type>(input), args[1]);
   }
 });
+
+void ExpandANormalForm(const LetNode* op, std::function<void(const LetNode*)> pre_visit,
+                       std::function<void(const LetNode*)> post_visit) {
+  std::stack<const LetNode*> stack;
+  stack.push(op);
+  bool is_anormal = true;
+  while (is_anormal) {
+    const LetNode* current_op = stack.top();
+    pre_visit(current_op);
+    if (const LetNode* new_op = current_op->body.as<LetNode>()) {
+      stack.push(new_op);
+    } else {
+      is_anormal = false;
+    }
+  }
+  while (stack.size()) {
+    const LetNode* current_op = stack.top();
+    stack.pop();
+    post_visit(current_op);
+  }
+}
+
 }  // namespace relay
 }  // namespace tvm
diff --git a/src/relay/transforms/de_duplicate.cc b/src/relay/transforms/de_duplicate.cc
index 43b71f6f10cc..2fd88736bf31 100644
--- a/src/relay/transforms/de_duplicate.cc
+++ b/src/relay/transforms/de_duplicate.cc
@@ -27,6 +27,8 @@
 #include <tvm/relay/expr_functor.h>
 #include <tvm/relay/pattern_functor.h>
 
+#include <stack>
+
 namespace tvm {
 namespace relay {
 
@@ -61,8 +63,20 @@ Expr DeDup(const Expr& e) {
     }
 
     Expr VisitExpr_(const LetNode* op) final {
-      Var v = Fresh(op->var);
-      return Let(v, VisitExpr(op->value), VisitExpr(op->body));
+      std::unordered_map<Expr, Var, ObjectPtrHash, ObjectPtrEqual> new_vars;
+      auto pre_visit = [this, &new_vars](const LetNode* op) {
+        Expr expr = GetRef<Expr>(op);
+        new_vars[expr] = this->Fresh(op->var);
+        // Rely on the Memoizer to cache pre-visit values
+        this->VisitExpr(op->value);
+      };
+      auto post_visit = [this, &new_vars](const LetNode* op) {
+        Expr expr = GetRef<Expr>(op);
+        this->memo_[expr] =
+            Let(new_vars[expr], this->VisitExpr(op->value), this->VisitExpr(op->body));
+      };
+      ExpandANormalForm(op, pre_visit, post_visit);
+      return memo_[GetRef<Expr>(op)];
     }
 
     Type VisitType(const Type& t) final { return t.defined() ? TypeMutator::VisitType(t) : t; }
@@ -99,7 +113,7 @@ Expr DeDup(const Expr& e) {
   ICHECK(WellFormed(ret));
   ICHECK_EQ(FreeVars(e).size(), FreeVars(ret).size());
   return ret;
-}
+}  // namespace relay
 
 TVM_REGISTER_GLOBAL("relay._transform.dedup").set_body_typed(DeDup);
 
diff --git a/src/relay/transforms/fold_constant.cc b/src/relay/transforms/fold_constant.cc
index 66f233bbba85..0689263cca77 100644
--- a/src/relay/transforms/fold_constant.cc
+++ b/src/relay/transforms/fold_constant.cc
@@ -92,19 +92,33 @@ class ConstantFolder : public MixedModeMutator {
   using MixedModeMutator::VisitExpr_;
 
   Expr VisitExpr_(const LetNode* op) final {
-    Expr value = this->Mutate(op->value);
-    if (value.as<ConstantNode>()) {
-      memo_[op->var] = value;
-      return this->Mutate(op->body);
-    } else {
-      Var var = Downcast<Var>(this->Mutate(op->var));
-      Expr body = this->Mutate(op->body);
-      if (var.same_as(op->var) && value.same_as(op->value) && body.same_as(op->body)) {
-        return GetRef<Expr>(op);
+    auto pre_visit = [this](const LetNode* op) {
+      // Rely on the Memoizer to cache pre-visit values
+      Expr value = this->Mutate(op->value);
+      if (value.as<ConstantNode>()) {
+        this->memo_[op->var] = value;
       } else {
-        return Let(var, value, body);
+        this->Mutate(op->var);
       }
-    }
+    };
+    auto post_visit = [this](const LetNode* op) {
+      Expr expr = GetRef<Expr>(op);
+      // Rely on the Memoizer to cache pre-visit values
+      Expr value = this->Mutate(op->value);
+      if (value.as<ConstantNode>()) {
+        this->memo_[expr] = this->Mutate(op->body);
+      } else {
+        Var var = Downcast<Var>(this->Mutate(op->var));
+        Expr body = this->Mutate(op->body);
+        if (var.same_as(op->var) && value.same_as(op->value) && body.same_as(op->body)) {
+          this->memo_[expr] = expr;
+        } else {
+          this->memo_[expr] = Let(var, value, body);
+        }
+      }
+    };
+    ExpandANormalForm(op, pre_visit, post_visit);
+    return memo_[GetRef<Expr>(op)];
   }
 
   bool inside_primitive = false;
diff --git a/src/relay/transforms/fuse_ops.cc b/src/relay/transforms/fuse_ops.cc
index 1b28980a0a2f..eaef0b905079 100644
--- a/src/relay/transforms/fuse_ops.cc
+++ b/src/relay/transforms/fuse_ops.cc
@@ -315,11 +315,20 @@ class IndexedForwardGraph::Creator : private ExprVisitor {
 
   void VisitExpr_(const LetNode* op) final {
     // do not fuse through let.
-    this->Update(op->var, nullptr, kOpaque);
-    this->Update(op->value, nullptr, kOpaque);
-    this->Update(op->body, nullptr, kOpaque);
-    ExprVisitor::VisitExpr_(op);
-    this->AddNode(op);
+    auto pre_visit = [this](const LetNode* op) {
+      // Rely on the Memoizer to cache pre-visit values
+      this->Update(op->var, nullptr, kOpaque);
+      this->Update(op->value, nullptr, kOpaque);
+      this->Update(op->body, nullptr, kOpaque);
+      this->VisitExpr(op->var);
+      this->VisitExpr(op->value);
+    };
+    auto post_visit = [this](const LetNode* op) {
+      this->VisitExpr(op->body);
+      this->visit_counter_[op] += 1;
+      this->AddNode(op);
+    };
+    ExpandANormalForm(op, pre_visit, post_visit);
   }
 
   void VisitExpr_(const IfNode* op) final {
@@ -797,7 +806,7 @@ std::vector<GraphPartitioner::Group*> GraphPartitioner::Partition(
   return std::move(groups_);
 }
 
-class FuseMutator : private ExprMutator {
+class FuseMutator : private MixedModeMutator {
  public:
   // Run the transform
   Expr Transform(const Expr& body, int fuse_opt_level, size_t max_fuse_depth) {
@@ -814,6 +823,8 @@ class FuseMutator : private ExprMutator {
   }
 
  private:
+  using MixedModeMutator::VisitExpr_;
+
   /*! \brief Temporary information from each group. */
   struct GroupInfo {
    public:
@@ -853,7 +864,7 @@ class FuseMutator : private ExprMutator {
   }
 
   // Transform calls.
-  Expr VisitExpr_(const CallNode* call) {
+  Expr Rewrite_(const CallNode* call, const Expr& post) {
     if (call->op.as<OpNode>()) {
       static auto fnoncomputational = Op::GetAttrMap<TNonComputational>("TNonComputational");
 
@@ -886,7 +897,7 @@ class FuseMutator : private ExprMutator {
     }
   }
 
-  Expr VisitExpr_(const TupleNode* tuple) {
+  Expr Rewrite_(const TupleNode* tuple, const Expr& post) {
     auto* ret_group = gmap_.at(tuple)->FindRoot();
     if (ret_group->root_ref == tuple) {
       return ExprMutator::VisitExpr_(tuple);
@@ -896,7 +907,7 @@ class FuseMutator : private ExprMutator {
     return Tuple(new_fields);
   }
 
-  Expr VisitExpr_(const TupleGetItemNode* tuple_get) {
+  Expr Rewrite_(const TupleGetItemNode* tuple_get, const Expr& post) {
     auto* ret_group = gmap_.at(tuple_get)->FindRoot();
     auto new_tuple = GetNewArguments({tuple_get->tuple}, ret_group)[0];
     auto new_node = TupleGetItem(new_tuple, tuple_get->index);
@@ -913,6 +924,29 @@ class FuseMutator : private ExprMutator {
     return std::move(new_node);
   }
 
+  Expr VisitExpr_(const LetNode* op) final {
+    auto pre_visit = [this](const LetNode* op) {
+      // Rely on the Memoizer to cache pre-visit values
+      this->VisitExpr(op->var);
+      this->VisitExpr(op->value);
+    };
+    auto post_visit = [this](const LetNode* op) {
+      // Rely on the Memoizer to cache pre-visit values
+      Var var = Downcast<Var>(this->VisitExpr(op->var));
+      Expr value = this->VisitExpr(op->value);
+      // Visit body and cache the op
+      Expr body = this->VisitExpr(op->body);
+      auto expr = GetRef<Expr>(op);
+      if (var.same_as(op->var) && value.same_as(op->value) && body.same_as(op->body)) {
+        this->memo_[expr] = expr;
+      } else {
+        this->memo_[expr] = Let(var, value, body);
+      }
+    };
+    ExpandANormalForm(op, pre_visit, post_visit);
+    return memo_[GetRef<Expr>(op)];
+  }
+
   Expr MakeNewFunction(GraphPartitioner::Group* group, Type ret_type, Expr body) {
     // If the function has no call, it is not a primitive function.
     struct HasCallVisitor : ExprVisitor {
diff --git a/src/relay/transforms/type_infer.cc b/src/relay/transforms/type_infer.cc
index 921e83fdb092..b4ccd1659865 100644
--- a/src/relay/transforms/type_infer.cc
+++ b/src/relay/transforms/type_infer.cc
@@ -341,26 +341,34 @@ class TypeInferencer : private ExprFunctor<Type(const Expr&)>,
   Type VisitExpr_(const OpNode* op) final { return op->op_type; }
 
   Type VisitExpr_(const LetNode* let) final {
-    // if the definition is a function literal, permit recursion
-    bool is_functional_literal = let->value.as<FunctionNode>() != nullptr;
-    Type let_type = IncompleteType(Kind::kType);
-
-    if (is_functional_literal) {
-      let_type = GetType(let->var);
-      type_map_[let->var].checked_type = let_type;
-    }
+    auto pre_visit = [this](const LetNode* op) {
+      // if the definition is a function literal, permit recursion
+      bool is_functional_literal = op->value.as<FunctionNode>() != nullptr;
+      Type let_type = IncompleteType(Kind::kType);
+
+      if (is_functional_literal) {
+        let_type = this->GetType(op->var);
+        this->type_map_[op->var].checked_type = let_type;
+      }
 
-    if (let->var->type_annotation.defined()) {
-      let_type = Unify(let_type, let->var->type_annotation, let->span);
-    }
+      if (op->var->type_annotation.defined()) {
+        let_type = this->Unify(let_type, op->var->type_annotation, op->span);
+      }
 
-    Type vtype = GetType(let->value);
-    let_type = Unify(let_type, vtype, let->span);
+      Type vtype = this->GetType(op->value);
+      let_type = this->Unify(let_type, vtype, op->span);
 
-    ICHECK(is_functional_literal || !type_map_.count(let->var));
-    // NOTE: no scoping is necessary because var are unique in program
-    type_map_[let->var].checked_type = let_type;
-    return GetType(let->body);
+      ICHECK(is_functional_literal || !this->type_map_.count(op->var));
+      // NOTE: no scoping is necessary because var are unique in program
+      this->type_map_[op->var].checked_type = let_type;
+    };
+    auto post_visit = [this](const LetNode* op) {
+      Expr expr = GetRef<Expr>(op);
+      this->memo_[expr] = this->GetType(op->body);
+      this->type_map_[expr].checked_type = this->memo_[expr];
+    };
+    ExpandANormalForm(let, pre_visit, post_visit);
+    return memo_[GetRef<Expr>(let)];
   }
 
   Type VisitExpr_(const IfNode* ite) final {
@@ -603,7 +611,21 @@ class TypeInferencer::Resolver : public MixedModeMutator, PatternMutator {
 
   Expr Rewrite_(const CallNode* op, const Expr& post) final { return AttachCheckedType(op, post); }
 
-  Expr VisitExpr_(const LetNode* op) final { return AttachCheckedType(op); }
+  Expr VisitExpr_(const LetNode* op) final {
+    auto pre_visit = [this](const LetNode* op) {
+      this->VisitExpr(op->var);
+      this->VisitExpr(op->value);
+    };
+    auto post_visit = [this](const LetNode* op) {
+      Expr expr = GetRef<Expr>(op);
+      Var var = Downcast<Var>(this->VisitExpr(op->var));
+      Expr value = this->VisitExpr(op->value);
+      Expr body = this->VisitExpr(op->body);
+      this->memo_[expr] = this->AttachCheckedType(op, Let(var, value, body));
+    };
+    ExpandANormalForm(op, pre_visit, post_visit);
+    return memo_[GetRef<Expr>(op)];
+  }
 
   Expr VisitExpr_(const IfNode* op) final { return AttachCheckedType(op); }
 
@@ -738,6 +760,7 @@ Expr TypeInferencer::Infer(GlobalVar var, Function function) {
 }
 
 struct AllCheckTypePopulated : MixedModeVisitor {
+  using MixedModeVisitor::VisitExpr_;
   void DispatchExprVisit(const Expr& e) {
     if (e.as<OpNode>()) {
       return;
@@ -751,6 +774,17 @@ struct AllCheckTypePopulated : MixedModeVisitor {
     ICHECK(e->checked_type_.defined()) << "Expression: " << e;
     return ExprVisitor::VisitExpr(e);
   }
+  void VisitExpr_(const LetNode* op) final {
+    auto pre_visit = [this](const LetNode* op) {
+      this->VisitExpr(op->var);
+      this->VisitExpr(op->value);
+    };
+    auto post_visit = [this](const LetNode* op) {
+      this->VisitExpr(op->body);
+      this->visit_counter_[op] += 1;
+    };
+    ExpandANormalForm(op, pre_visit, post_visit);
+  }
 };
 
 void EnsureCheckedType(const Expr& e) { AllCheckTypePopulated().VisitExpr(e); }

From de0ab4cb7cd73e3c05eb3512c916607587156b13 Mon Sep 17 00:00:00 2001
From: Balint Cristian <cristian.balint@gmail.com>
Date: Tue, 2 Feb 2021 19:51:31 +0200
Subject: [PATCH 148/357] Fix missing round(), floor(), ceil() for target C
 lowering (#7382)

---
 src/target/intrin_rule.cc                     |  6 ++
 .../unittest/test_target_codegen_c_host.py    | 87 +++++++++++++++++--
 2 files changed, 87 insertions(+), 6 deletions(-)

diff --git a/src/target/intrin_rule.cc b/src/target/intrin_rule.cc
index f8f4d0ef5414..1a7214476188 100644
--- a/src/target/intrin_rule.cc
+++ b/src/target/intrin_rule.cc
@@ -77,6 +77,12 @@ TVM_REGISTER_GLOBAL("tvm.intrin.rule.default.ldexp").set_body(DispatchPureExtern
 
 TVM_REGISTER_GLOBAL("tvm.intrin.rule.default.sqrt").set_body(DispatchPureExtern<FloatSuffix>);
 
+TVM_REGISTER_GLOBAL("tvm.intrin.rule.default.floor").set_body(DispatchPureExtern<FloatSuffix>);
+
+TVM_REGISTER_GLOBAL("tvm.intrin.rule.default.ceil").set_body(DispatchPureExtern<FloatSuffix>);
+
+TVM_REGISTER_GLOBAL("tvm.intrin.rule.default.round").set_body(DispatchPureExtern<FloatSuffix>);
+
 TVM_REGISTER_GLOBAL("tvm.intrin.rule.default.rsqrt")
     .set_body([](const TVMArgs& args, TVMRetValue* rv) {
       PrimExpr e = args[0];
diff --git a/tests/python/unittest/test_target_codegen_c_host.py b/tests/python/unittest/test_target_codegen_c_host.py
index 3178d6dad0e4..d1ca8b1450f0 100644
--- a/tests/python/unittest/test_target_codegen_c_host.py
+++ b/tests/python/unittest/test_target_codegen_c_host.py
@@ -30,12 +30,12 @@ def test_add():
     s = te.create_schedule(C.op)
 
     def check_c():
-        mhost = tvm.build(s, [A, B, C], "c", name="fadd")
+        mhost = tvm.build(s, [A, B, C], "c", name="test_fadd")
         temp = utils.tempdir()
         path_dso = temp.relpath("temp.so")
         mhost.export_library(path_dso)
         m = tvm.runtime.load_module(path_dso)
-        fadd = m["fadd"]
+        fadd = m["test_fadd"]
         ctx = tvm.cpu(0)
         # launch the kernel.
         n = nn
@@ -73,14 +73,14 @@ def check_c():
         )
         binds = {A: Ab}
         # BUILD and invoke the kernel.
-        f1 = tvm.lower(s, [A, B, C], name="fadd_pipeline")
+        f1 = tvm.lower(s, [A, B, C], name="test_fadd_pipeline")
         mhost = tvm.build(f1, target="c")
 
         temp = utils.tempdir()
         path_dso = temp.relpath("temp.so")
         mhost.export_library(path_dso)
         m = tvm.runtime.load_module(path_dso)
-        fadd = m["fadd_pipeline"]
+        fadd = m["test_fadd_pipeline"]
         ctx = tvm.cpu(0)
         # launch the kernel.
         n = nn
@@ -103,12 +103,12 @@ def test_reinterpret():
     s = te.create_schedule(B.op)
 
     def check_c():
-        mhost = tvm.build(s, [A, B], "c", name="reinterpret")
+        mhost = tvm.build(s, [A, B], "c", name="test_reinterpret")
         temp = utils.tempdir()
         path_dso = temp.relpath("temp.so")
         mhost.export_library(path_dso)
         m = tvm.runtime.load_module(path_dso)
-        fadd = m["reinterpret"]
+        fadd = m["test_reinterpret"]
         ctx = tvm.cpu(0)
         n = nn
         a = tvm.nd.array(np.random.randint(-(2 ** 30), 2 ** 30, size=n).astype(A.dtype), ctx)
@@ -119,7 +119,82 @@ def check_c():
     check_c()
 
 
+def test_ceil():
+    nn = 1024
+    n = tvm.runtime.convert(nn)
+    A = te.placeholder((n,), name="A", dtype="float32")
+    B = te.compute(A.shape, lambda *i: tvm.tir.call_intrin("float32", "tir.ceil", A(*i)), name="B")
+    s = te.create_schedule(B.op)
+
+    def check_c():
+        mhost = tvm.build(s, [A, B], "c", name="test_ceil")
+        temp = utils.tempdir()
+        path_dso = temp.relpath("temp.so")
+        mhost.export_library(path_dso)
+        m = tvm.runtime.load_module(path_dso)
+        fceil = m["test_ceil"]
+        ctx = tvm.cpu(0)
+        n = nn
+        a = tvm.nd.array(np.random.rand(n).astype(A.dtype), ctx)
+        b = tvm.nd.array(np.zeros(n, dtype=B.dtype), ctx)
+        fceil(a, b)
+        tvm.testing.assert_allclose(b.asnumpy(), (np.ceil(a.asnumpy()).view("float32")))
+
+    check_c()
+
+
+def test_floor():
+    nn = 1024
+    n = tvm.runtime.convert(nn)
+    A = te.placeholder((n,), name="A", dtype="float32")
+    B = te.compute(A.shape, lambda *i: tvm.tir.call_intrin("float32", "tir.floor", A(*i)), name="B")
+    s = te.create_schedule(B.op)
+
+    def check_c():
+        mhost = tvm.build(s, [A, B], "c", name="test_floor")
+        temp = utils.tempdir()
+        path_dso = temp.relpath("temp.so")
+        mhost.export_library(path_dso)
+        m = tvm.runtime.load_module(path_dso)
+        ffloor = m["test_floor"]
+        ctx = tvm.cpu(0)
+        n = nn
+        a = tvm.nd.array(np.random.rand(n).astype(A.dtype), ctx)
+        b = tvm.nd.array(np.zeros(n, dtype=B.dtype), ctx)
+        ffloor(a, b)
+        tvm.testing.assert_allclose(b.asnumpy(), (np.floor(a.asnumpy()).view("float32")))
+
+    check_c()
+
+
+def test_round():
+    nn = 1024
+    n = tvm.runtime.convert(nn)
+    A = te.placeholder((n,), name="A", dtype="float32")
+    B = te.compute(A.shape, lambda *i: tvm.tir.call_intrin("float32", "tir.round", A(*i)), name="B")
+    s = te.create_schedule(B.op)
+
+    def check_c():
+        mhost = tvm.build(s, [A, B], "c", name="test_round")
+        temp = utils.tempdir()
+        path_dso = temp.relpath("temp.so")
+        mhost.export_library(path_dso)
+        m = tvm.runtime.load_module(path_dso)
+        fround = m["test_round"]
+        ctx = tvm.cpu(0)
+        n = nn
+        a = tvm.nd.array(np.random.rand(n).astype(A.dtype), ctx)
+        b = tvm.nd.array(np.zeros(n, dtype=B.dtype), ctx)
+        fround(a, b)
+        tvm.testing.assert_allclose(b.asnumpy(), (np.round(a.asnumpy()).view("float32")))
+
+    check_c()
+
+
 if __name__ == "__main__":
     test_add()
     test_add_pipeline()
     test_reinterpret()
+    test_ceil()
+    test_floor()
+    test_round()

From da42924a16c5e688bbb31facb3b17ba066b776df Mon Sep 17 00:00:00 2001
From: Tristan Konolige <tristan.konolige@gmail.com>
Date: Tue, 2 Feb 2021 13:38:54 -0800
Subject: [PATCH 149/357] [FFI] Improve error messages when array/map types do
 not match in function calls (#7330)

* [FIX] Improve error messages when array/map types do not match in function calls

* missed some places for renaming

* Rename Mismatch to CheckAndGetMismatch. Add Check back in. Use Optional::defined.

* Optional<String> -> String

* formatting

* move ObjectTypeChecker template specializations into where thier respective classes are defined so they will always be found correctly
---
 include/tvm/node/container.h      | 31 ++++++-------
 include/tvm/runtime/packed_func.h | 73 ++++++++++++++++++++++++++++---
 src/ir/expr.cc                    |  6 +--
 3 files changed, 86 insertions(+), 24 deletions(-)

diff --git a/include/tvm/node/container.h b/include/tvm/node/container.h
index 209bb9e72f33..2ed1fdf880f1 100644
--- a/include/tvm/node/container.h
+++ b/include/tvm/node/container.h
@@ -1447,24 +1447,25 @@ inline Map<K, V> Merge(Map<K, V> lhs, const Map<K, V>& rhs) {
 namespace tvm {
 namespace runtime {
 // Additional overloads for PackedFunc checking.
-template <typename T>
-struct ObjectTypeChecker<Array<T>> {
-  static bool Check(const Object* ptr) {
-    if (ptr == nullptr) return true;
-    if (!ptr->IsInstance<ArrayNode>()) return false;
-    const ArrayNode* n = static_cast<const ArrayNode*>(ptr);
-    for (const ObjectRef& p : *n) {
-      if (!ObjectTypeChecker<T>::Check(p.get())) {
-        return false;
+template <typename K, typename V>
+struct ObjectTypeChecker<Map<K, V>> {
+  static Optional<String> CheckAndGetMismatch(const Object* ptr) {
+    if (ptr == nullptr) return NullOpt;
+    if (!ptr->IsInstance<MapNode>()) return String(ptr->GetTypeKey());
+    const MapNode* n = static_cast<const MapNode*>(ptr);
+    for (const auto& kv : *n) {
+      Optional<String> key_type = ObjectTypeChecker<K>::CheckAndGetMismatch(kv.first.get());
+      Optional<String> value_type = ObjectTypeChecker<K>::CheckAndGetMismatch(kv.first.get());
+      if (key_type.defined() || value_type.defined()) {
+        std::string key_name =
+            key_type.defined() ? std::string(key_type.value()) : ObjectTypeChecker<K>::TypeName();
+        std::string value_name = value_type.defined() ? std::string(value_type.value())
+                                                      : ObjectTypeChecker<V>::TypeName();
+        return String("Map[" + key_name + ", " + value_name + "]");
       }
     }
-    return true;
+    return NullOpt;
   }
-  static std::string TypeName() { return "Array[" + ObjectTypeChecker<T>::TypeName() + "]"; }
-};
-
-template <typename K, typename V>
-struct ObjectTypeChecker<Map<K, V>> {
   static bool Check(const Object* ptr) {
     if (ptr == nullptr) return true;
     if (!ptr->IsInstance<MapNode>()) return false;
diff --git a/include/tvm/runtime/packed_func.h b/include/tvm/runtime/packed_func.h
index 87606f3f738c..e43e042866ff 100644
--- a/include/tvm/runtime/packed_func.h
+++ b/include/tvm/runtime/packed_func.h
@@ -379,6 +379,33 @@ inline const char* ArgTypeCode2Str(int type_code);
  */
 template <typename T>
 struct ObjectTypeChecker {
+  /*!
+   * \brief Check if an object matches the template type and return the
+   *        mismatched type if it exists.
+   * \param ptr The object to check the type of.
+   * \return An Optional containing the actual type of the pointer if it does not match the
+   *         template type. If the Optional does not contain a value, then the types match.
+   */
+  static Optional<String> CheckAndGetMismatch(const Object* ptr) {
+    using ContainerType = typename T::ContainerType;
+    if (ptr == nullptr) {
+      if (T::_type_is_nullable) {
+        return NullOpt;
+      } else {
+        return String("nullptr");
+      }
+    }
+    if (ptr->IsInstance<ContainerType>()) {
+      return NullOpt;
+    } else {
+      return String(ptr->GetTypeKey());
+    }
+  }
+  /*!
+   * \brief Check if an object matches the template type.
+   * \param ptr The object to check the type of.
+   * \return Whether or not the template type matches the objects type.
+   */
   static bool Check(const Object* ptr) {
     using ContainerType = typename T::ContainerType;
     if (ptr == nullptr) return T::_type_is_nullable;
@@ -390,6 +417,40 @@ struct ObjectTypeChecker {
   }
 };
 
+// Additional overloads for PackedFunc checking.
+template <typename T>
+struct ObjectTypeChecker<Array<T>> {
+  static Optional<String> CheckAndGetMismatch(const Object* ptr) {
+    if (ptr == nullptr) {
+      return NullOpt;
+    }
+    if (!ptr->IsInstance<ArrayNode>()) {
+      return String(ptr->GetTypeKey());
+    }
+    const ArrayNode* n = static_cast<const ArrayNode*>(ptr);
+    for (size_t i = 0; i < n->size(); i++) {
+      const ObjectRef& p = (*n)[i];
+      Optional<String> check_subtype = ObjectTypeChecker<T>::CheckAndGetMismatch(p.get());
+      if (check_subtype.defined()) {
+        return String("Array[index " + std::to_string(i) + ": " + check_subtype.value() + "]");
+      }
+    }
+    return NullOpt;
+  }
+  static bool Check(const Object* ptr) {
+    if (ptr == nullptr) return true;
+    if (!ptr->IsInstance<ArrayNode>()) return false;
+    const ArrayNode* n = static_cast<const ArrayNode*>(ptr);
+    for (const ObjectRef& p : *n) {
+      if (!ObjectTypeChecker<T>::Check(p.get())) {
+        return false;
+      }
+    }
+    return true;
+  }
+  static std::string TypeName() { return "Array[" + ObjectTypeChecker<T>::TypeName() + "]"; }
+};
+
 /*!
  * \brief Internal base class to
  *  handle conversion to POD values.
@@ -1499,15 +1560,15 @@ inline TObjectRef TVMPODValue_::AsObjectRef() const {
   if (type_code_ == kTVMObjectHandle) {
     // normal object type check.
     Object* ptr = static_cast<Object*>(value_.v_handle);
-    CHECK(ObjectTypeChecker<TObjectRef>::Check(ptr))
-        << "Expected " << ObjectTypeChecker<TObjectRef>::TypeName() << " but got "
-        << ptr->GetTypeKey();
+    Optional<String> checked_type = ObjectTypeChecker<TObjectRef>::CheckAndGetMismatch(ptr);
+    ICHECK(!checked_type.defined()) << "Expected " << ObjectTypeChecker<TObjectRef>::TypeName()
+                                    << ", but got " << checked_type.value();
     return TObjectRef(GetObjectPtr<Object>(ptr));
   } else if (type_code_ == kTVMObjectRValueRefArg) {
     Object* ptr = *static_cast<Object**>(value_.v_handle);
-    CHECK(ObjectTypeChecker<TObjectRef>::Check(ptr))
-        << "Expected " << ObjectTypeChecker<TObjectRef>::TypeName() << " but got "
-        << ptr->GetTypeKey();
+    Optional<String> checked_type = ObjectTypeChecker<TObjectRef>::CheckAndGetMismatch(ptr);
+    ICHECK(!checked_type.defined()) << "Expected " << ObjectTypeChecker<TObjectRef>::TypeName()
+                                    << ", but got " << checked_type.value();
     return TObjectRef(GetObjectPtr<Object>(ptr));
   } else if (std::is_base_of<ContainerType, NDArray::ContainerType>::value &&
              type_code_ == kTVMNDArrayHandle) {
diff --git a/src/ir/expr.cc b/src/ir/expr.cc
index 4cc2ac31a4a1..203520802091 100644
--- a/src/ir/expr.cc
+++ b/src/ir/expr.cc
@@ -49,9 +49,9 @@ PrimExpr PrimExpr::FromObject_(ObjectRef ref) {
   if (auto* ptr = ref.as<runtime::StringObj>()) {
     return tir::StringImm(GetRef<runtime::String>(ptr));
   }
-  ICHECK(ObjectTypeChecker<PrimExpr>::Check(ref.get()))
-      << "Expect type " << ObjectTypeChecker<PrimExpr>::TypeName() << " but get "
-      << ref->GetTypeKey();
+  Optional<String> actual_type = ObjectTypeChecker<PrimExpr>::CheckAndGetMismatch(ref.get());
+  ICHECK(!actual_type.defined()) << "Expected type " << ObjectTypeChecker<PrimExpr>::TypeName()
+                                 << " but got " << actual_type.value();
   return Downcast<PrimExpr>(ref);
 }
 

From 618ef9e87a833ea1e9fc69b013f8d181a530b7b8 Mon Sep 17 00:00:00 2001
From: Ke Han <38852697+hanke580@users.noreply.github.com>
Date: Thu, 4 Feb 2021 02:13:40 +0800
Subject: [PATCH 150/357] [TOPI] Add einsum operator (#6370)

* [TOPI] Einsum
* Fix tuple
* fix oshape

* * test

* * Fix lint

* * Remove useless define

* * Move to einsum header file

* * Fix single value situation

* * Fix CamelASE

* * Print stride

* * Fix single input bug

* * fix lint

* * Fix lint and add comments

* * create test einsum

* * Fix lint

* * Fix comments
---
 include/tvm/topi/einsum.h                    | 943 +++++++++++++++++++
 include/tvm/topi/tags.h                      |   1 +
 python/tvm/topi/__init__.py                  |   1 +
 python/tvm/topi/einsum.py                    |  44 +
 src/topi/transform.cc                        |   5 +
 tests/python/topi/python/test_topi_einsum.py |  78 ++
 tests/python/unittest/test_te_autodiff.py    |   4 +
 7 files changed, 1076 insertions(+)
 create mode 100644 include/tvm/topi/einsum.h
 create mode 100644 python/tvm/topi/einsum.py
 create mode 100644 tests/python/topi/python/test_topi_einsum.py

diff --git a/include/tvm/topi/einsum.h b/include/tvm/topi/einsum.h
new file mode 100644
index 000000000000..e1baadab09d3
--- /dev/null
+++ b/include/tvm/topi/einsum.h
@@ -0,0 +1,943 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file topi/einsum.h
+ * \brief Einstein summation op
+ */
+#ifndef TVM_TOPI_EINSUM_H_
+#define TVM_TOPI_EINSUM_H_
+
+#define LABELRANGE 128
+#define NPY_MAXDIMS 16
+#define NPY_MAXARGS 16
+
+#include <tvm/te/operation.h>
+#include <tvm/tir/data_layout.h>
+#include <tvm/topi/detail/constant_utils.h>
+#include <tvm/topi/detail/ravel_unravel.h>
+#include <tvm/topi/detail/tensor_utils.h>
+#include <tvm/topi/tags.h>
+
+#include <algorithm>
+#include <bitset>
+#include <iterator>
+#include <string>
+#include <tuple>
+#include <unordered_set>
+#include <vector>
+
+namespace tvm {
+namespace topi {
+
+using namespace tvm::te;
+using namespace topi::detail;
+
+/*!
+ * \brief Compute the stride of the given shape.
+ *
+ * \param shape for the operation.
+ *
+ * \return the stride of the shape.
+ */
+inline Array<PrimExpr> GetStride(const Array<PrimExpr> shape) {
+  size_t ndim = shape.size();
+  int prod = 1;
+  Array<PrimExpr> stride = Array<PrimExpr>(ndim, -1);
+  for (int i = ndim - 1; i >= 0; i--) {
+    stride.Set(i, if_then_else(shape[i] > 1, prod, 0));
+    prod = prod * GetConstInt(shape[i]);
+  }
+  return stride;
+}
+
+/*!
+ * \brief Pad the shape with 1.
+ *
+ * \param shape the input shape to be padded
+ * \param odim the padding size of the objective shape.
+ *
+ * \return the padded shape.
+ */
+inline Array<PrimExpr> Pad(const Array<PrimExpr> shape, int odim) {
+  int ndim = shape.size();
+  CHECK_GE(odim, ndim);
+  Array<PrimExpr> ret(static_cast<size_t>(odim), 1);
+  for (int idim = 0; idim < ndim; ++idim) {
+    ret.Set(idim, shape[idim]);
+  }
+  return ret;
+}
+
+/*!
+ * \brief Parse the subscripts for one operand into an output of 'ndim' labels.
+ *
+ * \param subscripts the subscripts for to be parsed.
+ * \param length subscripts[0: length] represents the current operand.
+ * \param ndim the ndim of current operand.
+ * \param iop the index of the operand.
+ * \param op_labels the parsing result.
+ *        For Example:
+ *           subscripts="abbcbc",  ndim=6 -> op_labels=[97, 98, -1, 99, -3, -2].
+ *           subscripts="ab...bc", ndim=6 -> op_labels=[97, 98, 0, 0, -3, 99].
+ * \param label_counts Count the number the label appears.
+ * \param min_label Save the minimal label according to ASCII.
+ * \param max_label Save the maximal label according to ASCII.
+ *
+ * \return 0.
+ */
+inline int ParseOperandSubscripts(const char* subscripts, int length, int ndim, int iop,
+                                  char* op_labels, char* label_counts, int* min_label,
+                                  int* max_label) {
+  int i;
+  int idim = 0;
+  int ellipsis = -1;
+
+  /* Process all labels for this operand */
+  for (i = 0; i < length; ++i) {
+    int label = subscripts[i];
+
+    /* A proper label for an axis. */
+    if (label > 0 && isalpha(label)) {
+      /* Check we don't exceed the operator dimensions. */
+      CHECK(idim < ndim) << "einstein sum subscripts string contains "
+                         << "too many subscripts for operand " << iop;
+
+      op_labels[idim++] = label;
+      if (label < *min_label) {
+        *min_label = label;
+      }
+      if (label > *max_label) {
+        *max_label = label;
+      }
+      label_counts[label]++;
+    } else if (label == '.') {
+      /* The beginning of the ellipsis. */
+      /* Check it's a proper ellipsis. */
+      CHECK(
+          !(ellipsis != -1 || i + 2 >= length || subscripts[++i] != '.' || subscripts[++i] != '.'))
+          << "einstein sum subscripts string contains a "
+          << "'.' that is not part of an ellipsis ('...') "
+          << "in operand " << iop;
+
+      ellipsis = idim;
+    } else {
+      CHECK(label == ' ') << "invalid subscript '" << static_cast<char>(label)
+                          << "' in einstein sum "
+                          << "subscripts string, subscripts must "
+                          << "be letters";
+    }
+  }
+
+  /* No ellipsis found, labels must match dimensions exactly. */
+  if (ellipsis == -1) {
+    CHECK(idim == ndim) << "operand has more dimensions than subscripts "
+                        << "given in einstein sum, but no '...' ellipsis "
+                        << "provided to broadcast the extra dimensions.";
+  } else if (idim < ndim) {
+    /* Ellipsis found, may have to add broadcast dimensions. */
+    /* Move labels after ellipsis to the end. */
+    for (i = 0; i < idim - ellipsis; ++i) {
+      op_labels[ndim - i - 1] = op_labels[idim - i - 1];
+    }
+    /* Set all broadcast dimensions to zero. */
+    for (i = 0; i < ndim - idim; ++i) {
+      op_labels[ellipsis + i] = 0;
+    }
+  }
+
+  /*
+   * Find any labels duplicated for this operand, and turn them
+   * into negative offsets to the axis to merge with.
+   *
+   * In C, the char type may be signed or unsigned, but with
+   * twos complement arithmetic the char is ok either way here, and
+   * later where it matters the char is cast to a signed char.
+   */
+  for (idim = 0; idim < ndim - 1; ++idim) {
+    int label = op_labels[idim];
+    /* If it is a proper label, find any duplicates of it. */
+    if (label > 0) {
+      /* Search for the next matching label. */
+      char* next = reinterpret_cast<char*>(memchr(op_labels + idim + 1, label, ndim - idim - 1));
+
+      while (next != nullptr) {
+        /* The offset from next to op_labels[idim] (negative). */
+        *next = static_cast<char>((op_labels + idim) - next);
+        /* Search for the next matching label. */
+        next = reinterpret_cast<char*>(memchr(next + 1, label, op_labels + ndim - 1 - next));
+      }
+    }
+  }
+  return 0;
+}
+
+/*!
+ * \brief Parse the subscripts for the output into an output that includes 'ndim_broadcast'
+ *        unlabeled dimensions.
+ *
+ * \param subscripts the subscripts for to be parsed.
+ * \param length subscripts[0: length] represents the output operand.
+ * \param ndim_broadcast the broadcast dimension number.
+ * \param label_counts Count the number the label appears.
+ * \param out_labels similar to the op_labels in ParseOperandSubscripts, for each
+ *        dimension, the ASCII code of the corresponding label. zero for the broadcasting dim.
+ *
+ * \return the total number of output dimensions or -1 if there is an error.
+ */
+inline int ParseOutputSubscripts(const char* subscripts, int length, int ndim_broadcast,
+                                 const char* label_counts, char* out_labels) {
+  int i, bdim;
+  int ndim = 0;
+  int ellipsis = 0;
+
+  /* Process all the output labels. */
+  for (i = 0; i < length; ++i) {
+    int label = subscripts[i];
+
+    /* A proper label for an axis. */
+    if (label > 0 && isalpha(label)) {
+      /* Check that it doesn't occur again. */
+      CHECK(memchr(subscripts + i + 1, label, length - i - 1) == nullptr)
+          << "einstein sum subscripts string includes "
+          << "output subscript '" << static_cast<char>(label) << "' multiple times";
+
+      /* Check that it was used in the inputs. */
+      CHECK(label_counts[label] != 0)
+          << "einstein sum subscripts string included "
+          << "output subscript '" << static_cast<char>(label) << "' which never appeared "
+          << "in an input";
+
+      /* Check that there is room in out_labels for this label. */
+      CHECK(ndim < NPY_MAXDIMS) << "einstein sum subscripts string contains "
+                                << "too many subscripts in the output";
+
+      out_labels[ndim++] = label;
+    } else if (label == '.') {
+      /* The beginning of the ellipsis. */
+      /* Check it is a proper ellipsis. */
+      CHECK(!(ellipsis || i + 2 >= length || subscripts[++i] != '.' || subscripts[++i] != '.'))
+          << "einstein sum subscripts string "
+          << "contains a '.' that is not part of "
+          << "an ellipsis ('...') in the output";
+
+      /* Check there is room in out_labels for broadcast dims. */
+      CHECK(ndim + ndim_broadcast <= NPY_MAXDIMS) << "einstein sum subscripts string contains "
+                                                  << "too many subscripts in the output";
+
+      ellipsis = 1;
+      for (bdim = 0; bdim < ndim_broadcast; ++bdim) {
+        out_labels[ndim++] = 0;
+      }
+    } else {
+      CHECK(label == ' ') << "invalid subscript '" << static_cast<char>(label)
+                          << "' in einstein sum "
+                          << "subscripts string, subscripts must "
+                          << "be letters";
+    }
+  }
+
+  /* If no ellipsis was found there should be no broadcast dimensions. */
+  CHECK(!(!ellipsis && ndim_broadcast > 0)) << "output has more dimensions than subscripts "
+                                            << "given in einstein sum, but no '...' ellipsis "
+                                            << "provided to broadcast the extra dimensions.";
+
+  return ndim;
+}
+
+/*!
+ * \brief If any dimensions are combined, create a view that combines them.
+ *        Shows in newshape and newstride.
+ *
+ * \param op the operand tensor.
+ * \param iop the index of the operand.
+ * \param labels the op_labels fot the operand. Like [97, 98, -2] for "aba".
+ * \param newshape The combined shape.
+ * \param newstride The combined stride.
+ *
+ * For example:
+ *  "aba -> ab",              shape = [2,3,2] stride = [6,2,1]
+ *  op_labels = [97, 98, -2], newshape = [2,3], newstride = [7,2]
+ */
+inline void GetCombinedDimsView(const Tensor& op, int iop, char* labels, Array<PrimExpr>* newshape,
+                                Array<PrimExpr>* newstride) {
+  int idim, ndim, icombine, combineoffset;
+  int icombinemap[NPY_MAXDIMS];
+  int newdim;
+
+  Array<PrimExpr> shape = op->shape;
+  Array<PrimExpr> stride = GetStride(shape);
+  ndim = op.ndim();
+  newdim = newshape->size();
+
+  /* Initialize the dimensions and strides to zero */
+  for (idim = 0; idim < newdim; ++idim) {
+    newshape->Set(idim, 0);
+    newstride->Set(idim, 0);
+  }
+
+  /* Copy the dimensions and strides, except when collapsing */
+  icombine = 0;
+  for (idim = 0; idim < ndim; ++idim) {
+    /*
+     * The char type may be either signed or unsigned, we
+     * need it to be signed here.
+     */
+    int label = (signed char)labels[idim];
+    /* If this label says to merge axes, get the actual label */
+    if (label < 0) {
+      combineoffset = label;
+      label = labels[idim + label];
+    } else {
+      combineoffset = 0;
+      if (icombine != idim) {
+        labels[icombine] = labels[idim];
+      }
+      icombinemap[idim] = icombine;
+    }
+    /* If the label is 0, it's an unlabeled broadcast dimension */
+    if (label == 0) {
+      newshape->Set(icombine, shape[idim]);
+      newstride->Set(icombine, stride[idim]);
+    } else {
+      /* Update the combined axis dimensions and strides */
+      int i = icombinemap[idim + combineoffset];
+      CHECK(!((combineoffset < 0) &&
+              GetConstInt((*newshape)[i] != 0 && (*newshape)[i] != shape[idim])))
+          << "dimensions in operand " << iop << " for collapsing index '" << label
+          << "' don't match (" << GetConstInt((*newshape)[i]) << " != " << shape[idim] << ")";
+      newshape->Set(i, shape[idim]);
+      newstride->Set(i, (*newstride)[i] + stride[idim]);
+    }
+
+    /* If the label didn't say to combine axes, increment dest i */
+    if (combineoffset == 0) {
+      icombine++;
+    }
+  }
+}
+
+/*!
+ * \brief Prepare the operand axes to match each stride or shape pair.
+ *
+ * \param ndim the ndim of the operand tensor.
+ * \param iop the index of the operand.
+ * \param labels the op_labels fot the operand. [97, 98, -1, 99, -3, -2] for "abbcbc".
+ * \param axes The matched axes to be calculated.
+ * \param ndim_iter the dimension of iterating. Subscripts "ab, bc -> ac" ndim_iter = 3.
+ * \param iter_labels output_labels with the iterating label. ['a', 'c', 'b'] for the case above.
+ */
+inline static int PrepareOpAxes(int ndim, int iop, char* labels, int* axes, int ndim_iter,
+                                char* iter_labels) {
+  int i, label, ibroadcast;
+
+  ibroadcast = ndim - 1;
+  for (i = ndim_iter - 1; i >= 0; --i) {
+    label = iter_labels[i];
+    /*
+     * If it's an unlabeled broadcast dimension, choose
+     * the next broadcast dimension from the operand.
+     */
+    if (label == 0) {
+      while (ibroadcast >= 0 && labels[ibroadcast] != 0) {
+        --ibroadcast;
+      }
+      /*
+       * If we used up all the operand broadcast dimensions,
+       * extend it with a "newaxis"
+       */
+      if (ibroadcast < 0) {
+        axes[i] = -1;
+      } else {
+        /* Otherwise map to the broadcast axis */
+        axes[i] = ibroadcast;
+        --ibroadcast;
+      }
+    } else {
+      /* It's a labeled dimension, find the matching one */
+      char* match = reinterpret_cast<char*>(memchr(labels, label, ndim));
+      /* If the op doesn't have the label, broadcast it */
+      if (match == nullptr) {
+        axes[i] = -1;
+      } else {
+        /* Otherwise use it */
+        axes[i] = match - labels;
+      }
+    }
+  }
+  return 0;
+}
+
+/*!
+ * \brief Count SubString.
+ * \param str the object string
+ * \param sub the pattern string
+ *
+ * \return number of substring
+ */
+inline int CountSubstring(const std::string& str, const std::string& sub) {
+  int count = 0;
+  std::string::size_type pos = 0;
+  while ((pos = str.find(sub, pos)) != std::string::npos) {
+    ++count;
+    pos += sub.length();
+  }
+  return count;
+}
+
+/*!
+ * \brief Transfer string to.
+ * \param str input string.
+ *
+ * \return bitset.
+ */
+inline std::bitset<LABELRANGE> Str2Set(const std::string& str) {
+  std::bitset<LABELRANGE> ret;
+  for (const char& c : str) {
+    ret.set(static_cast<int>(c));
+  }
+  return ret;
+}
+
+/*!
+ * \brief Split str according to substring.
+ * \param str input string.
+ * \param sub the split pattern string.
+ *
+ * \return vector contains the splited substring.
+ */
+inline std::vector<std::string> Split(const std::string& str, const std::string& sub) {
+  std::string::size_type pos = 0;
+  std::string::size_type start = 0;
+  std::vector<std::string> ret;
+  while ((pos = str.find(sub, start)) != std::string::npos) {
+    ret.push_back(str.substr(start, pos - start));
+    start = pos + sub.length();
+  }
+  ret.push_back(str.substr(start));
+  return ret;
+}
+
+/*!
+ * \brief Parse the input subscripts into a vector of strings.
+ * \param subscripts input subscripts.
+ * \param operands operand tensors.
+ *
+ * \return vector of strings, vector[0] represents the input part, vector[1] represents the ouput.
+ * if no output, the vector[1] is NULL.
+ * "ab, bc -> ac" => ["ab,bc", "ac"]
+ */
+inline std::tuple<std::string, std::string> ParseEinsumInput(
+    std::string subscripts, const std::vector<Array<PrimExpr>>& operands) {
+  const std::string einsum_symbols = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ";
+  std::bitset<LABELRANGE> einsum_symbols_set;
+  for (const char& c : einsum_symbols) {
+    einsum_symbols_set.set(c);
+  }
+
+  CHECK_NE(operands.size(), 0U) << "No input operands";
+
+  auto end_pos = std::remove(subscripts.begin(), subscripts.end(), ' ');
+  subscripts.erase(end_pos, subscripts.end());
+
+  // Ensure all characters are valid
+  for (const char& c : subscripts) {
+    if (c == '.' || c == ',' || c == '-' || c == '>') {
+      continue;
+    }
+    CHECK(einsum_symbols_set.test(c)) << "Character " << c << " is not a valid symbol.";
+  }
+
+  // Check for proper "->"
+  if (subscripts.find('-') != std::string::npos || subscripts.find('>') != std::string::npos) {
+    bool invalid = (std::count(subscripts.begin(), subscripts.end(), '-') > 1 ||
+                    std::count(subscripts.begin(), subscripts.end(), '>') > 1);
+    CHECK(!invalid && CountSubstring(subscripts, "->") == 1)
+        << "Subscripts can only contain one '->'.";
+  }
+
+  // Parse ellipses
+  if (subscripts.find('.') != std::string::npos) {
+    std::string used = subscripts;
+    used.erase(
+        std::remove_if(used.begin(), used.end(),
+                       [](const char& c) { return c == '.' || c == ',' || c == '-' || c == '>'; }),
+        used.end());
+
+    std::bitset<LABELRANGE> used_set = Str2Set(used);
+    std::string ellipse_inds = "";
+    for (const char& c : einsum_symbols) {
+      if (!used_set.test(static_cast<int>(c))) {
+        ellipse_inds.append(1, c);
+      }
+    }
+    int longest = 0;
+    std::string input_tmp, output_sub;
+    std::vector<std::string> split_subscripts;
+    bool out_sub;
+
+    if (subscripts.find("->") != std::string::npos) {
+      std::vector<std::string> tmp = Split(subscripts, "->");
+      input_tmp = tmp[0];
+      output_sub = tmp[1];
+      split_subscripts = Split(input_tmp, ",");
+      out_sub = true;
+    } else {
+      split_subscripts = Split(subscripts, ",");
+      out_sub = false;
+    }
+
+    size_t size_split_subscripts = split_subscripts.size();
+    subscripts = "";
+    for (size_t i = 0; i < size_split_subscripts; ++i) {
+      const std::string& sub = split_subscripts[i];
+      if (sub.find('.') != std::string::npos) {
+        CHECK_EQ(std::count(sub.begin(), sub.end(), '.'), 3) << "Invalid Ellipses";
+        CHECK_EQ(CountSubstring(sub, "..."), 1) << "Invalid Ellipses";
+
+        // Take into account numerical values
+        int ellipse_count = 0;
+        if (operands[i].size() == 0) {
+          ellipse_count = 0;
+        } else {
+          ellipse_count = std::max(operands[i].size(), static_cast<size_t>(1));
+          ellipse_count -= sub.length() - 3;
+        }
+
+        if (ellipse_count > longest) {
+          longest = ellipse_count;
+        }
+
+        CHECK_GE(ellipse_count, 0) << "Ellipses lengths do not match.";
+        if (ellipse_count == 0) {
+          split_subscripts[i].erase(sub.find("..."), 3);
+        } else {
+          std::string rep_inds = ellipse_inds.substr(ellipse_inds.length() - ellipse_count);
+          split_subscripts[i].replace(sub.find("..."), 3, rep_inds);
+        }
+      }
+      subscripts += split_subscripts[i];
+      if (i + 1 < size_split_subscripts) {
+        subscripts += ",";
+      }
+    }
+    std::string out_ellipse;
+    if (longest == 0) {
+      out_ellipse = "";
+    } else {
+      out_ellipse = ellipse_inds.substr(ellipse_inds.length() - longest);
+    }
+
+    if (out_sub) {
+      output_sub.replace(output_sub.find("..."), 3, out_ellipse);
+      subscripts += "->" + output_sub;
+    } else {
+      // Special care for outputless ellipses
+      std::bitset<LABELRANGE> out_ellipse_set = Str2Set(out_ellipse);
+      std::string tmp_subscripts = subscripts, output_subscript = "";
+      size_t len_tmp_subscripts = tmp_subscripts.length();
+      std::sort(tmp_subscripts.begin(), tmp_subscripts.end());
+      for (size_t i = 0; i < len_tmp_subscripts; ++i) {
+        const char& c = tmp_subscripts[i];
+        if (c == ',') {
+          continue;
+        }
+        CHECK(einsum_symbols_set.test(c)) << "Character " << c << " is not a valid symbol.";
+        if ((i == 0 || tmp_subscripts[i - 1] != c) &&
+            (i == len_tmp_subscripts - 1 || tmp_subscripts[i + 1] != c) &&
+            !out_ellipse_set.test(c)) {
+          output_subscript.append(1, c);
+        }
+      }
+      subscripts += "->" + out_ellipse + output_subscript;
+    }
+  }
+
+  // Build output string if does not exist
+  std::tuple<std::string, std::string> ret;
+  if (subscripts.find("->") != std::string::npos) {
+    std::vector<std::string> tmp(2);
+    tmp = Split(subscripts, "->");
+    ret = std::make_tuple(tmp[0], tmp[1]);
+  } else {
+    std::string first = subscripts;
+    std::string second = "";
+    // Build output subscripts
+    std::string tmp_subscripts = subscripts;
+    size_t len_tmp_subscripts = tmp_subscripts.length();
+    std::sort(tmp_subscripts.begin(), tmp_subscripts.end());
+    for (size_t i = 0; i < len_tmp_subscripts; ++i) {
+      const char& c = tmp_subscripts[i];
+      if (c == ',') {
+        continue;
+      }
+      CHECK(einsum_symbols_set.test(c)) << "Character " << c << " is not a valid symbol.";
+      if ((i == 0 || tmp_subscripts[i - 1] != c) &&
+          (i == len_tmp_subscripts - 1 || tmp_subscripts[i + 1] != c)) {
+        second.append(1, c);
+      }
+    }
+    ret = std::make_tuple(first, second);
+  }
+
+  // Make sure output subscripts are in the input
+  std::bitset<LABELRANGE> input_subscripts_set = Str2Set(std::get<0>(ret));
+  for (const char& c : std::get<1>(ret)) {
+    CHECK(input_subscripts_set.test(c))
+        << "Output character " << c << " did not appear in the input";
+  }
+
+  // Make sure number operands is equivalent to the number of terms
+  CHECK_EQ(std::count(std::get<0>(ret).begin(), std::get<0>(ret).end(), ',') + 1, operands.size())
+      << "Number of einsum subscripts must be equal to the "
+      << "number of operands.";
+
+  return ret;
+}
+
+/*!
+ * \brief Compute the shape of the output.
+ * \param subscripts input subscripts.
+ * \param operands operand tensors.
+ *
+ * \return the shape of the output.
+ */
+inline Array<PrimExpr> NumpyEinsumShape(const std::string subscripts,
+                                        const std::vector<Array<PrimExpr>>& operands) {
+  // Parsing
+  std::tuple<std::string, std::string> parsed_subscripts = ParseEinsumInput(subscripts, operands);
+
+  // Build a few useful list and sets
+  std::vector<std::string> input_list = Split(std::get<0>(parsed_subscripts), ",");
+  size_t isize = input_list.size();
+
+  // Get length of each unique dimension and ensure all dimensions are correct
+  int dimension_dict[LABELRANGE];
+  memset(dimension_dict, -1, sizeof(dimension_dict));
+  for (size_t i = 0; i < isize; ++i) {
+    const std::string& term = input_list[i];
+    const Array<PrimExpr>& sh = operands[i];
+    CHECK_EQ(sh.size(), term.length())
+        << "Einstein sum subscript " << input_list[i] << " does not contain the "
+        << "correct number of indices for operand " << i << ".";
+    size_t len_term = term.length();
+    for (size_t j = 0; j < len_term; ++j) {
+      int64_t dim = GetConstInt(sh[j]);
+      const char& c = term[j];
+
+      if (dimension_dict[static_cast<int>(c)] != -1) {
+        // For broadcasting cases we always want the largest dim size
+        if (dimension_dict[static_cast<int>(c)] == 1) {
+          dimension_dict[static_cast<int>(c)] = dim;
+        }
+        CHECK(dim == 1 || dim == dimension_dict[static_cast<int>(c)])
+            << "Size of label '" << c << "' for operand  " << i << " ("
+            << dimension_dict[static_cast<int>(c)] << ") does not match previous terms (" << dim
+            << ").";
+      } else {
+        dimension_dict[static_cast<int>(c)] = dim;
+      }
+    }
+  }
+
+  // Get oshape
+  const std::string& output_str = std::get<1>(parsed_subscripts);
+  size_t odim = output_str.size();
+  Array<PrimExpr> oshape(odim, -1);
+  for (size_t i = 0; i < odim; ++i) {
+    oshape.Set(i, dimension_dict[static_cast<int>(output_str[i])]);
+  }
+  // Neglecting oshape assign check temporally
+  return oshape;
+}
+
+/*!
+ * \brief Evaluates the Einstein summation convention on the operands.
+ *
+ * \param subscripts_str Specifies the subscripts for summation as comma separated list of
+ * subscript labels.
+ * \param inputs Arrays for the operation.
+ * \param name The name of the operation.
+ * \param tag The tag to mark the operation.
+ *
+ * \return The calculation based on the Einstein summation convention.
+ */
+inline Tensor einsum(const std::string& subscripts_str, const Array<Tensor> inputs,
+                     std::string name = "T_einsum", std::string tag = kEinsum) {
+  bool back = false;
+  const char* subscripts = subscripts_str.data();
+  const char* head = subscripts;
+  const int nop = inputs.size();
+
+  /* Step 1: Parse the subscripts string into label_counts and op_labels */
+  int iop, idim, min_label = LABELRANGE - 1, max_label = 0;
+  char label_counts[LABELRANGE], op_labels[NPY_MAXARGS][NPY_MAXDIMS];
+  memset(label_counts, 0, sizeof(label_counts));
+  for (iop = 0; iop < nop; ++iop) {
+    int length = static_cast<int>(strcspn(subscripts, ",-"));
+
+    CHECK(!(iop == nop - 1 && subscripts[length] == ','))
+        << "more operands provided to einstein sum function "
+        << "than specified in the subscripts string";
+    CHECK(!(iop < nop - 1 && subscripts[length] != ','))
+        << "fewer operands provided to einstein sum function "
+        << "than specified in the subscripts string";
+    CHECK_EQ(ParseOperandSubscripts(subscripts, length, inputs[iop + back].ndim(), iop,
+                                    op_labels[iop], label_counts, &min_label, &max_label),
+             0);
+
+    /* Move subscripts to the start of the labels for the next op */
+    subscripts += length;
+
+    if (iop < nop - 1) {
+      CHECK_LT(subscripts - head, subscripts_str.length()) << "subscripts out of range";
+      subscripts++;
+    }
+  }
+  /*
+   * Find the number of broadcast dimensions, which is the maximum
+   * number of labels == 0 in an op_labels array.
+   */
+  int ndim_broadcast = 0;
+  for (iop = 0; iop < nop; ++iop) {
+    int count_zeros = 0;
+    int ndim;
+    char* labels = op_labels[iop];
+
+    ndim = inputs[iop + back].ndim();
+    for (idim = 0; idim < ndim; ++idim) {
+      if (labels[idim] == 0) {
+        ++count_zeros;
+      }
+    }
+
+    if (count_zeros > ndim_broadcast) {
+      ndim_broadcast = count_zeros;
+    }
+  }
+
+  /*
+   * If there is no output signature, fill output_labels and ndim_output
+   * using each label that appeared once, in alphabetical order.
+   */
+  int label, ndim_output;
+  char output_labels[NPY_MAXDIMS];
+  if (subscripts[0] == '\0') {
+    /* If no output was specified, always broadcast left, as usual. */
+    for (ndim_output = 0; ndim_output < ndim_broadcast; ++ndim_output) {
+      output_labels[ndim_output] = 0;
+    }
+    for (label = min_label; label <= max_label; ++label) {
+      if (label_counts[label] == 1) {
+        CHECK(ndim_output < NPY_MAXDIMS) << "einstein sum subscript string has too many "
+                                         << "distinct labels";
+        output_labels[ndim_output++] = label;
+      }
+    }
+  } else {
+    CHECK(subscripts[0] == '-' && subscripts[1] == '>') << "einstein sum subscript string does not "
+                                                        << "contain proper '->' output specified";
+    subscripts += 2;
+
+    /* Parse the output subscript string. */
+    ndim_output = ParseOutputSubscripts(subscripts, strlen(subscripts), ndim_broadcast,
+                                        label_counts, output_labels);
+    CHECK_GE(ndim_output, 0);
+  }
+
+  /*
+   * Step 2:
+   * Process all the input ops, combining dimensions into their
+   * diagonal where specified.
+   */
+  std::vector<Array<PrimExpr>> opshape(nop), opstride_true(nop);
+  for (iop = 0; iop < nop; ++iop) {
+    char* labels = op_labels[iop];
+    int combine, ndim;
+
+    ndim = inputs[iop + back].ndim();
+
+    /*
+     * Check whether any dimensions need to be combined
+     *
+     * The char type may be either signed or unsigned, we
+     * need it to be signed here.
+     */
+    combine = 0;
+    for (idim = 0; idim < ndim; ++idim) {
+      if ((signed char)labels[idim] < 0) {
+        combine++;
+      }
+    }
+    /* If any dimensions are combined, create a view which combines them */
+    if (combine) {
+      Array<PrimExpr> tshape(static_cast<size_t>(ndim - combine), -1);
+      Array<PrimExpr> tstride(static_cast<size_t>(ndim - combine), -1);
+      GetCombinedDimsView(inputs[iop + back], iop, labels, &tshape, &tstride);
+      opshape[iop] = tshape;
+      opstride_true[iop] = tstride;
+    } else {
+      /* No combining needed */
+      opshape[iop] = inputs[iop + back]->shape;
+      opstride_true[iop] = GetStride(opshape[iop]);
+    }
+  }
+  /*
+   * Step 3:
+   * Set up the labels for the iterator (output + combined labels).
+   * Can just share the output_labels memory, because iter_labels
+   * is output_labels with some more labels appended.
+   */
+  char* iter_labels = output_labels;
+  int ndim_iter = ndim_output;
+  for (label = min_label; label <= max_label; ++label) {
+    if (label_counts[label] > 0 && memchr(output_labels, label, ndim_output) == nullptr) {
+      CHECK(ndim_iter < NPY_MAXDIMS) << "too many subscripts in einsum";
+      iter_labels[ndim_iter++] = label;
+    }
+  }
+  /* Step 4: Set up the op_axes for the iterator */
+  Array<PrimExpr> itershape(static_cast<size_t>(ndim_iter), -1);
+  std::vector<Array<PrimExpr>> iterstride(nop + 1,
+                                          Array<PrimExpr>(static_cast<size_t>(ndim_iter), 0));
+
+  // output_shape
+  std::vector<Array<PrimExpr>> operands;
+  for (size_t i = 0; i < inputs.size(); i++) {
+    operands.push_back(inputs[i]->shape);
+  }
+  Array<PrimExpr> oshape = NumpyEinsumShape(subscripts_str, operands);
+  Array<PrimExpr> ostride_true = GetStride(oshape);
+  Array<PrimExpr> reduceshape;
+  std::vector<Array<PrimExpr>> remainshape(nop);
+  int op_axes_arrays[NPY_MAXARGS][NPY_MAXDIMS];
+  int* op_axes[NPY_MAXARGS];
+  for (iop = 0; iop < nop; ++iop) {
+    op_axes[iop] = op_axes_arrays[iop];
+    CHECK_GE(PrepareOpAxes(opshape[iop].size(), iop, op_labels[iop], op_axes[iop], ndim_iter,
+                           iter_labels),
+             0);
+    for (idim = 0; idim < ndim_iter; idim++) {
+      if (op_axes[iop][idim] != -1) {
+        iterstride[iop].Set(idim, opstride_true[iop][op_axes[iop][idim]]);
+        if (GetConstInt(itershape[idim]) != -1) {
+          if (GetConstInt(itershape[idim]) == 1) {
+            itershape.Set(idim, opshape[iop][op_axes[iop][idim]]);
+          }
+        } else {
+          itershape.Set(idim, opshape[iop][op_axes[iop][idim]]);
+        }
+      }
+    }
+  }
+  for (idim = 0; idim < ndim_output; ++idim) {
+    iterstride[nop].Set(idim, ostride_true[idim]);
+  }
+  reduceshape = Array<PrimExpr>(static_cast<size_t>(ndim_iter - ndim_output), 0);
+  for (idim = ndim_output; idim < ndim_iter; ++idim) {
+    reduceshape.Set(idim - ndim_output, itershape[idim]);
+  }
+  for (iop = 0; iop < nop; iop++) {
+    Array<Integer> rsh;
+    for (idim = 0; idim < ndim_iter; idim++) {
+      if (op_axes_arrays[iop][idim] == -1) {
+        rsh.push_back(GetConstInt(itershape[idim]));
+      } else {
+        if (GetConstInt(itershape[idim] != opshape[iop][op_axes_arrays[iop][idim]])) {
+          rsh.push_back(GetConstInt(itershape[idim]));
+        }
+      }
+    }
+    remainshape[iop] = Array<PrimExpr>(rsh.begin(), rsh.end());
+  }
+  // exclude the 0-dim case
+  if (ndim_iter == 0) {
+    ndim_iter = 1;
+  }
+  itershape = Pad(itershape, ndim_iter);
+  for (iop = 0; iop <= nop; ++iop) {
+    iterstride[iop] = Pad(iterstride[iop], ndim_iter);
+  }
+  // oshape = Pad(oshape, ndim_iter);
+  reduceshape = Pad(reduceshape, ndim_iter);
+  for (iop = 0; iop < nop; ++iop) {
+    opshape[iop] = Pad(opshape[iop], ndim_iter);
+    remainshape[iop] = Pad(remainshape[iop], ndim_iter);
+  }
+  // ostride and rstride
+  Array<Array<PrimExpr>> ostride;
+  Array<Array<PrimExpr>> rstride;
+
+  for (iop = 0; iop < nop; ++iop) {
+    Array<PrimExpr> otmp(static_cast<size_t>(ndim_iter), 0);
+    Array<PrimExpr> rtmp(static_cast<size_t>(ndim_iter), 0);
+    for (idim = 0; idim < ndim_iter; ++idim) {
+      otmp.Set(idim, idim < ndim_output ? iterstride[iop][idim] : 1);
+      rtmp.Set(idim, idim < ndim_iter - ndim_output ? iterstride[iop][idim + ndim_output] : 1);
+    }
+    ostride.push_back(otmp);
+    rstride.push_back(rtmp);
+  }
+
+  // func: input indices => return cooresponding value
+  auto func = [inputs, oshape, ostride, reduceshape, ndim_iter, rstride,
+               nop](const Array<Var>& input_indices) -> PrimExpr {
+    for (int rdim = 0; rdim < ndim_iter; ++rdim) {
+      if (GetConstInt(reduceshape[rdim]) == 0) {
+        return 0;  //
+      }
+    }
+    Array<PrimExpr> ridx = UnravelIndex(0, reduceshape);
+
+    PrimExpr sum = 0;
+    bool rec_flag = false;
+    do {
+      PrimExpr tmp = 1;
+      for (int iop = 0; iop < nop; ++iop) {
+        if (iop != -1) {
+          PrimExpr k = 0;
+
+          for (size_t i = 0; i < input_indices.size(); ++i) {
+            k += input_indices[i] * ostride[iop][i];
+          }
+          for (size_t i = 0; i < ridx.size(); ++i) {
+            k += ridx[i] * rstride[iop][i];
+          }
+          Array<PrimExpr> temp_indices = UnravelIndex(k, inputs[iop]->shape);
+          tmp = tmp * inputs[iop](temp_indices);
+        }
+      }
+      sum += tmp;
+      ridx.Set(ridx.size() - 1, ridx[ridx.size() - 1] + 1);
+      for (int i = static_cast<int>(ridx.size() - 1);
+           (i > 0) && GetConstInt(ridx[i] >= reduceshape[i]); --i) {
+        ridx.Set(i, ridx[i] - reduceshape[i]);
+        ridx.Set(i - 1, ridx[i - 1] + 1);
+      }
+      rec_flag = GetConstInt(ridx[0] < reduceshape[0]);
+    } while (rec_flag);
+    return sum;
+  };
+
+  return compute(oshape, func, name, tag);
+}
+
+}  // namespace topi
+}  // namespace tvm
+#endif  // TVM_TOPI_EINSUM_H_
diff --git a/include/tvm/topi/tags.h b/include/tvm/topi/tags.h
index 3b748ca60ce5..c3641ae0de12 100644
--- a/include/tvm/topi/tags.h
+++ b/include/tvm/topi/tags.h
@@ -41,6 +41,7 @@ constexpr auto kDepthwiseConv2dNCHW = "depthwise_conv2d_nchw";
 constexpr auto kDepthwiseConv2dNHWC = "depthwise_conv2d_nhwc";
 constexpr auto kDepthwiseConv2dBackInputNHWC = "depthwise_conv2d_back_input_nhwc";
 constexpr auto kDepthwiseConv2dBackWeightNHWC = "depthwise_conv2d_back_weight_nhwc";
+constexpr auto kEinsum = "einsum";
 constexpr auto kGroupConv2d = "group_conv2d";
 
 inline bool is_broadcast(std::string tag) {
diff --git a/python/tvm/topi/__init__.py b/python/tvm/topi/__init__.py
index 873901df62a5..6836f04b5ada 100644
--- a/python/tvm/topi/__init__.py
+++ b/python/tvm/topi/__init__.py
@@ -41,6 +41,7 @@
 from .scatter_add import *
 from .argwhere import *
 from .cumsum import *
+from .einsum import *
 from . import generic
 from . import nn
 from . import x86
diff --git a/python/tvm/topi/einsum.py b/python/tvm/topi/einsum.py
new file mode 100644
index 000000000000..f1f426ec8173
--- /dev/null
+++ b/python/tvm/topi/einsum.py
@@ -0,0 +1,44 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=invalid-name,consider-using-enumerate,redefined-outer-name
+"""Einsum operator"""
+from . import cpp
+
+
+def einsum(subscripts, *operand):
+    """Evaluates the Einstein summation convention on the operands.
+
+    Parameters
+    ----------
+    subscripts : string
+        Specifies the subscripts for summation as comma separated list of subscript labels.
+        An implicit (classical Einstein summation) calculation is performed unless the
+        explicit indicator ‘->’ is included as well as subscript labels of the precise
+        output form.
+
+    a_tuple : tuple of tvm.te.Tensor
+        These are the Tensors for the operation.
+        The only difference of einsum between in tvm and numpy is it needs an extra brackets
+        for the tensors. For example, topi.einsum("ij, jk -> ik", (A, B)).
+
+    Returns
+    -------
+    out : tvm.te.Tensor
+        The calculation based on the Einstein summation convention.
+    """
+
+    return cpp.einsum(subscripts, operand)
diff --git a/src/topi/transform.cc b/src/topi/transform.cc
index e1e3988f6400..f71fae3c5aaa 100644
--- a/src/topi/transform.cc
+++ b/src/topi/transform.cc
@@ -23,6 +23,7 @@
  */
 #include <tvm/runtime/packed_func.h>
 #include <tvm/runtime/registry.h>
+#include <tvm/topi/einsum.h>
 #include <tvm/topi/transform.h>
 #include <tvm/topi/utils.h>
 
@@ -165,6 +166,10 @@ TVM_REGISTER_GLOBAL("topi.tensordot").set_body([](TVMArgs args, TVMRetValue* rv)
   }
 });
 
+TVM_REGISTER_GLOBAL("topi.einsum").set_body([](TVMArgs args, TVMRetValue* rv) {
+  *rv = einsum(args[0], args[1]);
+});
+
 TVM_REGISTER_GLOBAL("topi.strided_slice").set_body([](TVMArgs args, TVMRetValue* rv) {
   *rv = strided_slice(args[0], args[1], args[2], args[3], args[4]);
 });
diff --git a/tests/python/topi/python/test_topi_einsum.py b/tests/python/topi/python/test_topi_einsum.py
new file mode 100644
index 000000000000..49e951398f40
--- /dev/null
+++ b/tests/python/topi/python/test_topi_einsum.py
@@ -0,0 +1,78 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+import numpy as np
+import tvm
+import tvm.testing
+from tvm import te
+from tvm import topi
+from tvm.topi.utils import get_const_tuple
+
+
+def with_tvm(lam, *args):
+    """Take numpy arrays as args, convert them to TVM tensors and call `lam`.
+    Result of lambda is converted back to numpy array and returned.
+    """
+    ctx = tvm.cpu(0)
+    pls = []  # placeholders
+    vals_nd = []  # initial values
+    for i, arg in enumerate(args):
+        pls.append(te.placeholder(arg.shape, name="pl" + str(i)))
+        vals_nd.append(tvm.nd.array(arg, ctx))
+
+    out = lam(*pls)
+    out_nd = tvm.nd.array(np.zeros(get_const_tuple(out.shape), dtype=out.dtype), ctx)
+    s = te.create_schedule([out.op])
+    m = tvm.build(s, pls + [out], "llvm")
+    m(*(vals_nd + [out_nd]))
+    return out_nd.asnumpy()
+
+
+def verify_einsum(subscripts, shapes):
+    ops = []
+    for shape in shapes:
+        tmp = np.random.uniform(low=-1.0, high=1.0, size=shape).astype(np.float32)
+        ops.append(tmp)
+
+    c1 = np.einsum(subscripts, *ops)
+
+    if len(ops) == 1:
+        c2 = with_tvm(lambda A: topi.einsum(subscripts, A), *ops)
+    elif len(ops) == 2:
+        c2 = with_tvm(lambda A, B: topi.einsum(subscripts, A, B), *ops)
+    elif len(ops) == 3:
+        c2 = with_tvm(lambda A, B, C: topi.einsum(subscripts, A, B, C), *ops)
+
+    tvm.testing.assert_allclose(c1, c2, rtol=1e-5, atol=1e-5)
+
+
+def test_einsum():
+    verify_einsum("ii", [(5, 5)])
+    verify_einsum("ii->i", [(5, 5)])
+    verify_einsum("ij->i", [(5, 5)])
+    verify_einsum("...j->...", [(5, 5)])
+    verify_einsum("...j, j", [(5, 5), (5,)])
+    verify_einsum("..., ...", [(), (2, 3)])
+    verify_einsum("ijk, jil->kl", [(3, 4, 5), (4, 3, 2)])
+    verify_einsum("ij, ij -> i", [(1, 4), (2, 4)])
+    verify_einsum("...ij, ...jk -> ...ik", [(1, 4), (4, 2)])
+    verify_einsum("...ij, ...ik -> ...jk", [(1, 1, 1, 4), (1, 1, 1, 3)])
+    verify_einsum("ij,jk->ik", [(2, 3), (3, 4)])
+    verify_einsum("ij,jk,km->im", [(2, 3), (3, 4), (4, 5)])
+
+
+if __name__ == "__main__":
+    test_einsum()
diff --git a/tests/python/unittest/test_te_autodiff.py b/tests/python/unittest/test_te_autodiff.py
index 6031182091fe..b2f26471d267 100644
--- a/tests/python/unittest/test_te_autodiff.py
+++ b/tests/python/unittest/test_te_autodiff.py
@@ -170,6 +170,10 @@ def fidentity(t0):
     Y = topi.tensordot(A, B, 1)
     check_grad(Y, X)
 
+    X = te.placeholder((3, 3), name="X")
+    Y = topi.einsum("ii->i", (X))
+    check_grad(Y, X)
+
 
 def test_topi():
     X = te.placeholder((1, 2, 4, 4), name="X")

From 2e8133db05e56a48613a56b4214d887ea4482021 Mon Sep 17 00:00:00 2001
From: Dmitriy Smirnov <dmitriy.smirnov@arm.com>
Date: Thu, 4 Feb 2021 10:34:17 +0000
Subject: [PATCH 151/357] [TFLite] Added check for dynamic range quantization
 (#7114)

* [TFLite] Added check for dynamic range quantization

Added check to prevent optimized with "dynamic range quantization"
tflite files to be loaded as the optimization is not fully supported.

https://www.tensorflow.org/lite/performance/post_training_quantization#dynamic_range_quantization

* linter

* linter

* unit test fix
---
 python/tvm/relay/frontend/tflite.py          | 34 ++++++++++++++++++--
 tests/python/frontend/tflite/test_forward.py | 21 ++++++++++++
 2 files changed, 52 insertions(+), 3 deletions(-)

diff --git a/python/tvm/relay/frontend/tflite.py b/python/tvm/relay/frontend/tflite.py
index f474e59407e0..6d9bb18a7573 100644
--- a/python/tvm/relay/frontend/tflite.py
+++ b/python/tvm/relay/frontend/tflite.py
@@ -176,17 +176,45 @@ def __init__(self, model, subgraph, exp_tab):
     def check_unsupported_ops(self):
         """Check unsupported TFLite ops in our converter."""
         unsupported_ops_set = set()
-
+        dynamic_range_ops_set = set()
         for op_idx in range(self.subgraph.OperatorsLength()):
             op = self.subgraph.Operators(op_idx)
             op_code_str = self.get_op_code_str(op)
             if op_code_str not in self.convert_map:
                 unsupported_ops_set.add(op_code_str)
+                continue
+
+            # Trying to exclude "dynamic range quantization" optimized ops as not supported in TVM
+            qnn_in_cnt = len(
+                [_.qnn_params for _ in self.get_input_tensors(op)[0:1] if _.qnn_params is not None]
+            )
+            qnn_weight_cnt = len(
+                [_.qnn_params for _ in self.get_input_tensors(op)[1:] if _.qnn_params is not None]
+            )
+            qnn_out_cnt = len(
+                [_.qnn_params for _ in self.get_output_tensors(op) if _.qnn_params is not None]
+            )
+
+            if qnn_in_cnt == 0 and qnn_out_cnt == 0 and qnn_weight_cnt > 0:
+                dynamic_range_ops_set.add(op_code_str)
+
+        raise_msg = ""
 
         if unsupported_ops_set:
-            msg = "The following operators are not supported in frontend " "TFLite: {}"
+            msg = "The following operators are not supported in frontend " "TFLite: {}\n"
             ops = str(list(unsupported_ops_set)).strip("[,]")
-            raise tvm.error.OpNotImplemented(msg.format(ops))
+            raise_msg += msg.format(ops)
+
+        if dynamic_range_ops_set:
+            msg = (
+                "The following operators are likely to have dynamic range quantization: {}. "
+                "If you are running an optimized graph, please turn off dynamic range quantization "
+                "or use full integer quantization"
+            )
+            raise_msg += msg.format(str(list(dynamic_range_ops_set)).strip("[,]"))
+
+        if len(raise_msg) > 0:
+            raise tvm.error.OpNotImplemented(raise_msg)
 
     def convert_op_to_relay(self):
         """Convert TFLite ops to relay ops"""
diff --git a/tests/python/frontend/tflite/test_forward.py b/tests/python/frontend/tflite/test_forward.py
index 6847fd96f37c..0d02c15f2eb8 100644
--- a/tests/python/frontend/tflite/test_forward.py
+++ b/tests/python/frontend/tflite/test_forward.py
@@ -4156,6 +4156,27 @@ def test_forward_mediapipe_hand_landmark():
         )
 
 
+#######################################################################
+# Test check for Tensorflow "dynamic range quantization" optimization
+# --------------
+def test_prevent_tensorflow_dynamic_range():
+    """
+    Should prevent runnung "dynamic range quantization" optimized TFLite graph
+    """
+    data_array = np.random.randint(0, 2, (1, 1024, 1024)).astype(dtype=np.float32)
+    filter_array = np.random.randint(0, 2, (1024, 1024)).astype(dtype=np.float32)
+    data_in = tf.keras.layers.Input(shape=data_array.shape[1:])
+    dense = tf.keras.layers.Dense(units=filter_array.shape[-1], use_bias=False)(data_in)
+    keras_model = tf.keras.models.Model(data_in, dense)
+    keras_model.layers[1].set_weights([filter_array])
+
+    converter = interpreter_wrapper.TFLiteConverter.from_keras_model(keras_model)
+    converter.optimizations = [tf.lite.Optimize.DEFAULT]
+    tflite_model = converter.convert()
+    with pytest.raises(tvm.error.OpNotImplemented):
+        tvm_output = run_tvm_graph(tflite_model, data_array, data_in.name.replace(":0", ""))
+
+
 #######################################################################
 # Main
 # ----

From 1de98be591c732f404f986b1a52ba2cfad8e91e7 Mon Sep 17 00:00:00 2001
From: Andrew Reusch <areusch@octoml.ai>
Date: Thu, 4 Feb 2021 06:51:12 -0800
Subject: [PATCH 152/357] Generate requirements.txt from Python spec (#7289)

* Generate requirements.txt from Python spec.

* add tests, collect actual requirements (first cut).

* add tornado and cloudpickle

* add xgboost

* add xgboost version restriction

* cleanup and prepare for merge

* black format

* add type annotations and docstrings

* remove example requirements.txt

* fix setup.py extras_require

* use typing. classes for type annotations, python 2 compatible :)

* fix python2 typing.Pattern

* retrigger CI

* address comaniac comments

* retrigger ci
---
 python/.gitignore                             |   1 +
 python/gen_requirements.py                    | 615 ++++++++++++++++++
 python/setup.py                               |  40 +-
 .../python/unittest/test_gen_requirements.py  | 220 +++++++
 4 files changed, 850 insertions(+), 26 deletions(-)
 create mode 100755 python/gen_requirements.py
 create mode 100644 tests/python/unittest/test_gen_requirements.py

diff --git a/python/.gitignore b/python/.gitignore
index a4d2483a90e2..4c6fde5b68b5 100644
--- a/python/.gitignore
+++ b/python/.gitignore
@@ -1,3 +1,4 @@
 build
 dist
 *.cpp
+requirements/*.txt
diff --git a/python/gen_requirements.py b/python/gen_requirements.py
new file mode 100755
index 000000000000..6869e4829d98
--- /dev/null
+++ b/python/gen_requirements.py
@@ -0,0 +1,615 @@
+#!/usr/bin/env python3
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""TVM Python requriements.txt generator.
+
+This script generates a set of requirements.txt files (stored in `./requirements`) that describe
+TVM's Python dependencies.
+
+## Pieces
+
+TVM can be roughly broken into these named pieces along the lines of Python dependencies:
+
+- "core": A core piece, which is intended to be buildable with very few external dependencies. Users
+  can use Relay, compile models, and run autotuning with this part.
+- "importer-<tool>": Model importers, which convert models defined in various other tools (i.e.
+  TensorFlow, PyTorch, etc) into Relay models.
+- Extra features (i.e. XGBoost in AutoTVM). These enhance TVM's functionality, but aren't required
+  for basic operation.
+
+## What this tool does
+
+From these pieces, this tool builds:
+ - requirements/<name>.txt - Python dependencies for each named piece above, `<name>` is the same as
+   the quoted piece name.
+ - requirements/all.txt - Consolidated Python dependencies for all pieces, excluding dev below.
+ - requirements/dev.txt - Python dependencies needed to develop TVM, such as lint and test tools.
+
+The data representing each piece is contained in the two maps below.
+"""
+
+import argparse
+import collections
+import os
+import re
+import textwrap
+import sys
+import typing
+
+
+RequirementsByPieceType = typing.List[typing.Tuple[str, typing.Tuple[str, typing.List[str]]]]
+
+
+# Maps named TVM piece (see description above) to a list of names of Python packages. Please use
+# alphabetical order for each package list, and do not add version constraints here!
+REQUIREMENTS_BY_PIECE: RequirementsByPieceType = [
+    # Base requirements needed to install tvm.
+    (
+        "core",
+        (
+            "Base requirements needed to install tvm",
+            [
+                "attrs",
+                "cloudpickle",
+                "decorator",
+                "numpy",
+                "psutil",
+                "scipy",
+                "synr",
+                "tornado",
+            ],
+        ),
+    ),
+    # Relay frontends.
+    (
+        "importer-caffe2",
+        (
+            "Requirements for the Caffe2 importer",
+            [
+                "future",  # Hidden dependency of torch.
+                "torch",
+            ],
+        ),
+    ),
+    ("importer-coreml", ("Requirements for the CoreML importer", ["coremltools"])),
+    ("importer-darknet", ("Requirements for the DarkNet importer", ["opencv-python"])),
+    (
+        "importer-keras",
+        ("Requirements for the Keras importer", ["tensorflow", "tensorflow-estimator"]),
+    ),
+    (
+        "importer-onnx",
+        (
+            "Requirements for the ONNX importer",
+            [
+                "future",  # Hidden dependency of torch.
+                "onnx",
+                "onnxruntime",
+                "torch",
+                "torchvision",
+            ],
+        ),
+    ),
+    (
+        "importer-pytorch",
+        (
+            "Requirements for the PyTorch importer",
+            [
+                "future",  # Hidden dependency of torch.
+                "torch",
+                "torchvision",
+            ],
+        ),
+    ),
+    (
+        "importer-tensorflow",
+        ("Requirements for the TensorFlow importer", ["tensorflow", "tensorflow-estimator"]),
+    ),
+    (
+        "importer-tflite",
+        ("Requirements for the TFLite importer", ["tensorflow", "tensorflow-estimator", "tflite"]),
+    ),
+    (
+        "tvmc",
+        (
+            "Requirements for the tvmc command-line tool",
+            [
+                "future",  # Hidden dependency of torch.
+                "onnx",
+                "onnxruntime",
+                "tensorflow",
+                "tflite",
+                "torch",
+                "torchvision",
+                "xgboost",
+            ],
+        ),
+    ),
+    # XGBoost, useful for autotuning on some targets.
+    (
+        "xgboost",
+        (
+            "Requirements for XGBoost autotuning",
+            [
+                "future",  # Hidden dependency of torch.
+                "torch",
+                "xgboost",
+            ],
+        ),
+    ),
+    # Development requirements
+    (
+        "dev",
+        (
+            "Requirements to develop TVM -- lint, docs, testing, etc.",
+            [
+                "astroid",  # pylint requirement, listed so a hard constraint can be included.
+                "autodocsumm",
+                "black",
+                "commonmark",
+                "cpplint",
+                "docutils",
+                "image",
+                "matplotlib",
+                "pillow",
+                "pylint",
+                "sphinx",
+                "sphinx_autodoc_annotation",
+                "sphinx_gallery",
+                "sphinx_rtd_theme",
+            ],
+        ),
+    ),
+]
+
+ConstraintsType = typing.List[typing.Tuple[str, typing.Union[None, str]]]
+
+# Maps a named Python package (which should appear in REQUIREMENTS_BY_PIECE above) to a
+# semver or pip version constraint. Semver constraints are translated into requirements.txt-friendly
+# constraints.
+#
+# These constraints serve only to record technical reasons why a particular version can't be used.
+# They are the default install_requires used in setup.py. These can be further narrowed to restrict
+# dependencies to those tested or used in CI; however, that process is not done here.
+#
+# Policy for constraints listed here:
+# 1. Each package specified in REQUIREMENTS_BY_PIECE must be included here.
+# 2. If TVM will functionally break against an old version of a dependency, specify a >= relation
+#    here. Include a comment linking to context or explaining why the constraint is in place.
+CONSTRAINTS = [
+    ("astroid", None),
+    ("attrs", None),
+    ("autodocsumm", None),
+    ("black", None),
+    ("cloudpickle", None),
+    ("commonmark", ">=0.7.3"),  # From PR #213.
+    ("coremltools", None),
+    ("cpplint", None),
+    ("decorator", None),
+    ("docutils", None),
+    ("future", None),
+    ("image", None),
+    ("matplotlib", None),
+    ("numpy", None),
+    ("onnx", None),
+    ("onnxruntime", None),
+    ("opencv-python", None),
+    ("pillow", None),
+    ("psutil", None),
+    ("pylint", None),
+    ("scipy", None),
+    ("sphinx", None),
+    ("sphinx_autodoc_annotation", None),
+    ("sphinx_gallery", None),
+    ("sphinx_rtd_theme", None),
+    ("synr", ">=0.2.1"),  # Requires bugfix commit ee0b12a61c08f01604475f36ff37d4cb110bdc27
+    ("tensorflow", None),
+    ("tensorflow-estimator", None),
+    ("tflite", None),
+    ("torch", None),
+    ("torchvision", None),
+    ("tornado", None),
+    ("xgboost", ">=1.1.0"),  # From PR #4953.
+]
+
+################################################################################
+# End of configuration options.
+################################################################################
+
+
+# Required keys in REQUIREMENTS_BY_PIECE.
+REQUIRED_PIECES: typing.List[str] = ["core", "dev"]
+
+# Regex to validates piece names.
+PIECE_REGEX: typing.Pattern = re.compile(r"^[a-z0-9][a-z0-9-]*", re.IGNORECASE)
+
+# Regex to match a constraint specification. Multiple constraints are not supported.
+CONSTRAINT_REGEX: typing.Pattern = re.compile(r"(?:\^|\<|(?:~=)|(?:<=)|(?:==)|(?:>=)|\>)[^<>=\^,]+")
+
+# Regex for parsing semantic versions. See
+# https://semver.org/#is-there-a-suggested-regular-expression-regex-to-check-a-semver-string
+SEMVER_REGEX: typing.Pattern = re.compile(
+    r"^(?P<major>0|[1-9]\d*)\.(?P<minor>0|[1-9]\d*)\.(?P<patch>0|[1-9]\d*)(?:-(?P<prerelease>(?:0|[1-9]\d*|\d*[a-zA-Z-][0-9a-zA-Z-]*)(?:\.(?:0|[1-9]\d*|\d*[a-zA-Z-][0-9a-zA-Z-]*))*))?(?:\+(?P<buildmetadata>[0-9a-zA-Z-]+(?:\.[0-9a-zA-Z-]+)*))?$"
+)
+
+
+def validate_requirements_by_piece() -> typing.List[str]:
+    """Validate REQUIREMENTS_BY_PIECE, returning a list of problems.
+
+    Returns
+    -------
+    list[str] :
+        A list of strings, each one describing a distinct problem with REQUIREMENTS_BY_PIECE.
+    """
+    problems = []
+
+    unseen_required_pieces = set(REQUIRED_PIECES)
+    seen_pieces = set()
+
+    # Ensure that core is listed first and dev is listed last.
+    saw_core = False
+    saw_dev = False
+
+    if not isinstance(REQUIREMENTS_BY_PIECE, (list, tuple)):
+        problems.append(f"must be list or tuple, see {REQUIREMENTS_BY_PIECE!r}")
+        return problems
+
+    for piece, value in REQUIREMENTS_BY_PIECE:
+        if not isinstance(piece, str):
+            problems.append(f"piece {piece!r}: must be str")
+            continue
+
+        if piece in unseen_required_pieces:
+            unseen_required_pieces.remove(piece)
+
+        piece_lower = piece.lower()
+        if piece_lower in seen_pieces:
+            problems.append(f"piece {piece}: listed twice")
+
+        seen_pieces.add(piece_lower)
+
+        if not saw_core and piece != "core":
+            problems.append(f'piece {piece}: must list after "core" (core must be first)')
+        elif piece == "core":
+            saw_core = True
+
+        if saw_dev:
+            problems.append(f'piece {piece}: must list before "dev" (dev must be last)')
+        elif piece == "dev":
+            saw_dev = True
+
+        if not isinstance(value, (tuple, list)) or len(value) != 2:
+            problems.append(
+                f'piece {piece}: should be formatted like ("{piece}", ("<requirements.txt comment>", ["dep1", "dep2", ...])). got: {value!r}'
+            )
+            continue
+
+        description, deps = value
+
+        if not isinstance(description, str):
+            problems.append(f"piece {piece}: description should be a string, got {description!r}")
+
+        if not isinstance(deps, (list, tuple)) or any(not isinstance(d, str) for d in deps):
+            problems.append(f"piece {piece}: deps should be a list of strings, got {deps!r}")
+            continue
+
+        if list(sorted(deps)) != list(deps):
+            problems.append(
+                f"piece {piece}: deps must be sorted. Correct order:\n  {list(sorted(deps))!r}"
+            )
+
+        piece_deps = set()
+        for d in deps:
+            if CONSTRAINT_REGEX.search(d):
+                problems.append(
+                    f"piece {piece}: dependency {d} should not specify a version. "
+                    "Add it to CONSTRAINTS instead."
+                )
+
+            if d.lower() in piece_deps:
+                problems.append(f"piece {piece}: dependency {d} listed twice")
+
+            piece_deps.add(d.lower())
+
+    extras_pieces = [
+        k for (k, _) in REQUIREMENTS_BY_PIECE if k not in ("dev", "core") if isinstance(k, str)
+    ]
+    sorted_extras_pieces = list(sorted(extras_pieces))
+    if sorted_extras_pieces != list(extras_pieces):
+        problems.append(
+            'pieces other than "core" and "dev" must appear in alphabetical order: '
+            f"{sorted_extras_pieces}"
+        )
+
+    return problems
+
+
+def parse_semver(
+    package: str, constraint: str, problems: typing.List[str]
+) -> typing.Tuple[typing.List[str], int, int]:
+    """Parse a semantic versioning constraint of the form "^X.[.Y[.Z[...]]]]"
+
+    Parameters
+    ----------
+    package : str
+        Name of the package specifying this constraint, for reporting problems.
+    constraint : str
+        The semver constraint. Must start with "^"
+    problems : List[str]
+        A list of strings describing problems that have occurred validating the configuration.
+        Problems encountered while validating constraint are appended to this list.
+
+    Returns
+    -------
+    tuple[list[str], int, int] :
+        A 3-tuple. The first element is a list containing an entry for each component in the
+        semver string (components separated by "."). The second element is the index of the
+        component in the list which must not change to meet the semver constraint. The third element
+        is an integer, the numeric value of the changing component (this can be non-trivial when
+        the patch is the changing part but pre-, post-release, or build metadta.
+
+        See "Caret requirements" at https://python-poetry.org/docs/versions/.
+    """
+    m = SEMVER_REGEX.match(constraint[1:])
+    if not m:
+        problems.append(f"{package}: invalid semver constraint {constraint}")
+        return [], 0, 0
+
+    min_ver_parts = [
+        m.group("major"),
+        m.group("minor"),
+        m.group("patch")
+        + (f"-{m.group('prerelease')}" if m.group("prerelease") else "")
+        + (f"+{m.group('buildmetadata')}" if m.group("buildmetadata") else ""),
+    ]
+
+    # Major/minor version handling is simple
+    for i, p in enumerate(min_ver_parts[:2]):
+        x = int(p.strip())
+        if x:
+            return min_ver_parts, i, x
+
+    # For patch version, consult only the numeric patch
+    if m.group("patch"):
+        patch_int = int(m.group("patch"))
+        if patch_int or min_ver_parts[2] != m.group("patch"):
+            return min_ver_parts, 2, patch_int
+
+    # All 0's
+    return min_ver_parts, 0, 0
+
+
+def validate_constraints() -> typing.List[str]:
+    """Validate CONSTRAINTS, returning a list of problems found.
+
+    Returns
+    -------
+    list[str] :
+        A list of strings, each one describing a distinct problem found in CONSTRAINTS.
+    """
+    problems = []
+
+    if not isinstance(CONSTRAINTS, (list, tuple)):
+        problems.append(f"must be list or tuple, see: {CONSTRAINTS!r}")
+
+    seen_packages = set()
+    all_deps = set()
+    for _, (_, deps) in REQUIREMENTS_BY_PIECE:
+        for d in deps:
+            all_deps.add(d.lower())
+
+    for package, constraint in CONSTRAINTS:
+        if package in seen_packages:
+            problems.append(f"{package}: specified twice")
+        seen_packages.add(package)
+
+        if package.lower() not in all_deps:
+            problems.append(f"{package}: not specified in REQUIREMENTS_BY_PIECE")
+
+        if constraint is None:  # None is just a placeholder that allows for comments.
+            continue
+
+        if not CONSTRAINT_REGEX.match(constraint):
+            problems.append(
+                f'{package}: constraint "{constraint}" does not look like a valid constraint'
+            )
+
+        if constraint.startswith("^"):
+            parse_semver(package, constraint, problems)
+
+    all_constrained_packages = [p for (p, _) in CONSTRAINTS]
+    sorted_constrained_packages = list(sorted(all_constrained_packages))
+    if sorted_constrained_packages != all_constrained_packages:
+        problems.append(
+            "CONSTRAINTS entries should be in this sorted order: " f"{sorted_constrained_packages}"
+        )
+
+    return problems
+
+
+class ValidationError(Exception):
+    """Raised when a validation error occurs."""
+
+    @staticmethod
+    def format_problems(config: str, problems: typing.List[str]) -> str:
+        """Format a list of problems with a global config variable into human-readable output.
+
+        Parameters
+        ----------
+        config : str
+            Name of the global configuration variable of concern. Prepended to the output.
+        problems: list[str]
+            A list of strings, each one a distinct problem with that config variable.
+
+        Returns
+        -------
+        str :
+            A human-readable string suitable for console, listing the problems as bullet points.
+        """
+        formatted = []
+        for p in problems:
+            assert isinstance(p, str), f"problems element not a str: {p}"
+            formatted.append(
+                "\n".join(
+                    textwrap.wrap(
+                        f"{config}: {p}", width=80, initial_indent=" * ", subsequent_indent="   "
+                    )
+                )
+            )
+
+        return "\n".join(formatted)
+
+    def __init__(self, config: str, problems: typing.List[str]):
+        """Describes an error that occurs validating one of the global config variables.
+
+        Parameters
+        ----------
+        config : str
+            Name of the global configuration variable of concern. Prepended to the output.
+        problems: list[str]
+            A list of strings, each one a distinct problem with that config variable.
+        """
+        super(ValidationError, self).__init__(self.format_problems(config, problems))
+        self.problems = problems
+
+
+def validate_or_raise():
+    problems = validate_requirements_by_piece()
+    if problems:
+        raise ValidationError("REQUIREMENTS_BY_PIECE", problems)
+
+    problems = validate_constraints()
+    if problems:
+        raise ValidationError("CONSTRAINTS", problems)
+
+
+def semver_to_requirements(dep: str, constraint: str, joined_deps: typing.List[str]):
+    """Convert a SemVer-style constraint to a setuptools-compatible constraint.
+
+    Parameters
+    ----------
+    dep : str
+        Name of the PyPI package to depend on.
+    constraint : str
+        The SemVer constraint, of the form "^<semver constraint>"
+    joined_deps : list[str]
+        A list of strings, each a setuptools-compatible constraint which could be written to
+        a line in requirements.txt. The converted constraint is appended to this list.
+    """
+    problems: typing.List[str] = []
+    min_ver_parts, fixed_index, fixed_part = parse_semver(dep, constraint, problems)
+    text_problems = "\n" + "\n".join(f" * {p}" for p in problems)
+    assert (
+        not problems
+    ), f"should not happen: validated semver {constraint} parses with problems:{text_problems}"
+
+    max_ver_parts = (
+        min_ver_parts[:fixed_index]
+        + [str(fixed_part + 1)]
+        + ["0" for _ in min_ver_parts[fixed_index + 1 :]]
+    )
+    joined_deps.append(f'{dep}>={".".join(min_ver_parts)},<{".".join(max_ver_parts)}')
+
+
+def join_requirements() -> typing.Dict[str, typing.Tuple[str, typing.List[str]]]:
+    """Validate, then join REQUIRMENTS_BY_PIECE against CONSTRAINTS and return the result.
+
+    Returns
+    -------
+    An OrderedDict containing REQUIREMENTS_BY_PIECE, except any dependency mentioned in CONSTRAINTS
+    is replaced by a setuptools-compatible constraint.
+    """
+    validate_or_raise()
+
+    constraints_map = collections.OrderedDict([(p.lower(), c) for (p, c) in CONSTRAINTS])
+
+    to_return = collections.OrderedDict()
+    all_deps = set()
+    for piece, (description, deps) in REQUIREMENTS_BY_PIECE:
+        joined_deps = []
+        for d in deps:
+            constraint = constraints_map.get(d.lower())
+            if constraint is None:
+                joined_deps.append(d)
+                continue
+
+            if constraint[0] == "^":
+                semver_to_requirements(d, constraint, joined_deps)
+            else:
+                joined_deps.append(f"{d}{constraint}")
+
+        if piece != "dev":
+            all_deps.update(joined_deps)
+
+        to_return[piece] = (description, joined_deps)
+
+    to_return["all-prod"] = (
+        "Combined dependencies for all TVM pieces, excluding dev",
+        list(sorted(all_deps)),
+    )
+
+    return to_return
+
+
+def join_and_write_requirements(args: argparse.Namespace):
+    try:
+        joined_deps = join_requirements()
+    except ValidationError as e:
+        print(f"ERROR: invalid requirements configuration in {__file__}:", file=sys.stderr)
+        print(str(e), file=sys.stderr)
+        sys.exit(2)
+
+    if args.lint:
+        sys.exit(0)
+
+    output_dir = os.path.join(os.path.dirname(__file__), "requirements")
+    if not os.path.exists(output_dir):
+        os.makedirs(output_dir)
+    elif not os.path.isdir(output_dir):
+        print(
+            f"ERROR: output directory {output_dir} exists but is not a dir. Delete it",
+            file=sys.stderr,
+        )
+        sys.exit(2)
+
+    for piece, (description, deps) in joined_deps.items():
+        with open(os.path.join(output_dir, f"{piece}.txt"), "w") as f:
+            f.write(
+                f"# AUTOGENERATED by python/gen_requirements.py{os.linesep}"
+                f"#{os.linesep}"
+                f"# {description}{os.linesep}"
+            )
+            for d in deps:
+                f.write(f"{d}{os.linesep}")
+
+
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--lint", action="store_true", help="Just lint dependencies, don't generate anything"
+    )
+    return parser.parse_args()
+
+
+def main():
+    args = parse_args()
+    join_and_write_requirements(args)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/python/setup.py b/python/setup.py
index 8af62f9c9102..e02369e97777 100644
--- a/python/setup.py
+++ b/python/setup.py
@@ -171,38 +171,26 @@ def get_package_data_files():
     return ["relay/std/prelude.rly", "relay/std/core.rly"]
 
 
+# Temporarily add this directory to the path so we can import the requirements generator
+# tool.
+sys.path.insert(0, os.path.dirname(__file__))
+import gen_requirements
+
+sys.path.pop(0)
+
+requirements = gen_requirements.join_requirements()
+extras_require = {
+    piece: deps for piece, (_, deps) in requirements.items() if piece not in ("all", "core")
+}
+
 setup(
     name="tvm",
     version=__version__,
     description="TVM: An End to End Tensor IR/DSL Stack for Deep Learning Systems",
     zip_safe=False,
     entry_points={"console_scripts": ["tvmc = tvm.driver.tvmc.main:main"]},
-    install_requires=[
-        "numpy",
-        "scipy",
-        "decorator",
-        "attrs",
-        "psutil",
-        "synr>=0.2.1",
-    ],
-    extras_require={
-        "test": ["pillow<7", "matplotlib"],
-        "extra_feature": [
-            "tornado",
-            "psutil",
-            "xgboost>=1.1.0",
-            "mypy",
-            "orderedset",
-        ],
-        "tvmc": [
-            "tensorflow>=2.1.0",
-            "tflite>=2.1.0",
-            "onnx>=1.7.0",
-            "onnxruntime>=1.0.0",
-            "torch>=1.4.0",
-            "torchvision>=0.5.0",
-        ],
-    },
+    install_requires=requirements["core"][1],
+    extras_require=extras_require,
     packages=find_packages(),
     package_dir={"tvm": "tvm"},
     package_data={"tvm": get_package_data_files()},
diff --git a/tests/python/unittest/test_gen_requirements.py b/tests/python/unittest/test_gen_requirements.py
new file mode 100644
index 000000000000..1f6388ba3c76
--- /dev/null
+++ b/tests/python/unittest/test_gen_requirements.py
@@ -0,0 +1,220 @@
+#!/usr/bin/env python3
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""Tests for gen_requirements, found in python/."""
+
+import collections
+import contextlib
+import os
+import sys
+
+import tvm
+
+import pytest
+
+# Insert the parent dir to python/tvm into the import path, so that gen_requirements may be
+# imported.
+sys.path.insert(0, os.path.dirname(tvm.__file__))
+try:
+    import gen_requirements
+finally:
+    sys.path.pop(0)
+
+
+@contextlib.contextmanager
+def patch(obj, **kw):
+    old = {}
+    for prop_name, new in kw.items():
+        old[prop_name] = getattr(obj, prop_name)
+        setattr(obj, prop_name, new)
+    yield
+    for prop_name, value in old.items():
+        setattr(obj, prop_name, value)
+
+
+PROBLEM_REQUIREMENTS = [
+    ("extras-pre-core", ("", ["foo", 123])),  # entry before core
+    (456, ("", ["foo", "bar"])),  # invalid extras name, deps should not be processed
+    ("core", ("", ["foo"])),  # ordinary core entry.
+    ("wrong-description-type", (None, ["foo"])),  # wrong description type
+    ("bad-value", None),  # value field is not a 2-tuple
+    ("bad-value-2", ("", ["foo"], 34)),  # value field is not a 2-tuple
+    ("invalid", ("", ["qux"])),  # duplicate invalid entry, all items valid.
+    ("extras-foo", ("", ["bar", "baz"])),  # ordinary extras entry.
+    ("invalid", ("", ["baz", None, 123])),  # valid extra name, invalid deps.
+    ("unsorted", ("", ["qux", "bar", "foo"])),  # deps out of order
+    ("versioned_dep", ("", ["baz==1.2", "foo==^2.0", "buz<3", "bar>4"])),
+    ("duplicate_dep", ("", ["buz", "buz", "foo"])),  # duplicate listed dependency
+    ("dev", ("", ["baz", "qux"])),  # ordinary dev entry.
+    ("extras-post-dev", ("", ["bar", "buzz"])),  # entry after dev
+]
+
+
+def test_validate_requirements():
+    with patch(gen_requirements, REQUIREMENTS_BY_PIECE=None):
+        assert gen_requirements.validate_requirements_by_piece() == [
+            "must be list or tuple, see None"
+        ]
+
+    with patch(gen_requirements, REQUIREMENTS_BY_PIECE=PROBLEM_REQUIREMENTS):
+        problems = gen_requirements.validate_requirements_by_piece()
+        assert problems == [
+            'piece extras-pre-core: must list after "core" (core must be first)',
+            "piece extras-pre-core: deps should be a list of strings, got ['foo', 123]",
+            "piece 456: must be str",
+            "piece wrong-description-type: description should be a string, got None",
+            (
+                'piece bad-value: should be formatted like ("bad-value", ("<requirements.txt '
+                'comment>", ["dep1", "dep2", ...])). got: None'
+            ),
+            (
+                'piece bad-value-2: should be formatted like ("bad-value-2", '
+                '("<requirements.txt comment>", ["dep1", "dep2", ...])). got: (\'\', '
+                "['foo'], 34)"
+            ),
+            "piece invalid: listed twice",
+            "piece invalid: deps should be a list of strings, got ['baz', None, 123]",
+            "piece unsorted: deps must be sorted. Correct order:\n  ['bar', 'foo', 'qux']",
+            "piece versioned_dep: deps must be sorted. Correct order:\n  ['bar>4', 'baz==1.2', 'buz<3', 'foo==^2.0']",
+            "piece versioned_dep: dependency baz==1.2 should not specify a version. Add it to CONSTRAINTS instead.",
+            "piece versioned_dep: dependency foo==^2.0 should not specify a version. Add it to CONSTRAINTS instead.",
+            "piece versioned_dep: dependency buz<3 should not specify a version. Add it to CONSTRAINTS instead.",
+            "piece versioned_dep: dependency bar>4 should not specify a version. Add it to CONSTRAINTS instead.",
+            "piece duplicate_dep: dependency buz listed twice",
+            'piece extras-post-dev: must list before "dev" (dev must be last)',
+            'pieces other than "core" and "dev" must appear in alphabetical order: '
+            "['bad-value', 'bad-value-2', 'duplicate_dep', 'extras-foo', 'extras-post-dev', "
+            "'extras-pre-core', 'invalid', 'invalid', 'unsorted', 'versioned_dep', "
+            "'wrong-description-type']",
+        ]
+
+
+TEST_REQUIREMENTS_BY_PIECE = (
+    ("core", ("core tvm requirements", ("bar", "foo", "non-constrained"))),
+    ("extra-one", ("requirements for one feature", ("baz", "qux"))),
+    ("extra-two", ("requirements for two feature", ("buz", "qux", "semver-minor", "semver-patch"))),
+    ("dev", ("requirements for dev", ("buz", "oof", "rab"))),
+)
+
+
+def test_validate_constraints():
+    with patch(
+        gen_requirements,
+        REQUIREMENTS_BY_PIECE=TEST_REQUIREMENTS_BY_PIECE,
+        CONSTRAINTS=(
+            ("unlisted", "~=3"),
+            ("double-specified", "<2"),
+            (
+                "double-specified",
+                "==3",
+            ),
+            ("bad-constraint", "1.2.0"),
+            ("bad-semver-constraint", "i don't match the regex :P"),
+            ("alpha-semver-constraint", "^foo.bar.23"),
+        ),
+    ):
+        problems = gen_requirements.validate_constraints()
+        assert problems == [
+            "unlisted: not specified in REQUIREMENTS_BY_PIECE",
+            "double-specified: not specified in REQUIREMENTS_BY_PIECE",
+            "double-specified: specified twice",
+            "double-specified: not specified in REQUIREMENTS_BY_PIECE",
+            "bad-constraint: not specified in REQUIREMENTS_BY_PIECE",
+            'bad-constraint: constraint "1.2.0" does not look like a valid constraint',
+            "bad-semver-constraint: not specified in REQUIREMENTS_BY_PIECE",
+            'bad-semver-constraint: constraint "i don\'t match the regex :P" does not look like a valid constraint',
+            "alpha-semver-constraint: not specified in REQUIREMENTS_BY_PIECE",
+            "alpha-semver-constraint: invalid semver constraint ^foo.bar.23",
+            "CONSTRAINTS entries should be in this sorted order: ['alpha-semver-constraint', 'bad-constraint', 'bad-semver-constraint', 'double-specified', 'double-specified', 'unlisted']",
+        ]
+
+
+TEST_CONSTRAINTS = (
+    ("bar", "==1.0"),
+    ("baz", ">2.3"),
+    ("buz", "^1.3.0"),
+    ("non-constrained", None),  # Support a comment.
+    ("oof", "==0.3.4"),
+    ("qux", "~=1.2.4"),
+    ("semver-minor", "^0.2.2-patch2.post3+buildmeta"),  # Ensure prerelease and buildmeta preserved.
+    ("semver-patch", "^0.0.2+bm"),  # Ensure postrelease preserved.
+)
+
+
+def test_join_requirements():
+    with patch(
+        gen_requirements,
+        REQUIREMENTS_BY_PIECE=TEST_REQUIREMENTS_BY_PIECE,
+        CONSTRAINTS=TEST_CONSTRAINTS,
+    ):
+        requirements = gen_requirements.join_requirements()
+        assert requirements == collections.OrderedDict(
+            [
+                ("core", ("core tvm requirements", ["bar==1.0", "foo", "non-constrained"])),
+                ("extra-one", ("requirements for one feature", ["baz>2.3", "qux~=1.2.4"])),
+                (
+                    "extra-two",
+                    (
+                        "requirements for two feature",
+                        [
+                            "buz>=1.3.0,<2.0.0",
+                            "qux~=1.2.4",
+                            "semver-minor>=0.2.2-patch2.post3+buildmeta,<0.3.0",
+                            "semver-patch>=0.0.2+bm,<0.0.3",
+                        ],
+                    ),
+                ),
+                ("dev", ("requirements for dev", ["buz>=1.3.0,<2.0.0", "oof==0.3.4", "rab"])),
+                (
+                    "all-prod",
+                    (
+                        "Combined dependencies for all TVM pieces, excluding dev",
+                        [
+                            "bar==1.0",
+                            "baz>2.3",
+                            "buz>=1.3.0,<2.0.0",
+                            "foo",
+                            "non-constrained",
+                            "qux~=1.2.4",
+                            "semver-minor>=0.2.2-patch2.post3+buildmeta,<0.3.0",
+                            "semver-patch>=0.0.2+bm,<0.0.3",
+                        ],
+                    ),
+                ),
+            ]
+        )
+
+
+def test_semver():
+    problems = []
+
+    assert gen_requirements.parse_semver("C", "^1.2.0", problems) == (["1", "2", "0"], 0, 1)
+    assert problems == []
+
+    assert gen_requirements.parse_semver("C", "^0.2.0", problems) == (["0", "2", "0"], 1, 2)
+    assert problems == []
+
+    assert gen_requirements.parse_semver("C", "^0.0.0", problems) == (["0", "0", "0"], 0, 0)
+    assert problems == []
+
+    assert gen_requirements.parse_semver("C", "^0.a.0", problems) == ([], 0, 0)
+    assert problems == ["C: invalid semver constraint ^0.a.0"]
+
+
+if __name__ == "__main__":
+    sys.exit(pytest.main([__file__] + sys.argv[1:]))

From 30c110c30fbc4f4ed55d86c61f22a210e902d49a Mon Sep 17 00:00:00 2001
From: Cody Yu <comaniac0422@gmail.com>
Date: Thu, 4 Feb 2021 14:15:04 -0800
Subject: [PATCH 153/357] [Bugfix][AutoScheduler] Fail to register ComputeDAG
 when deserializing tasks (#7395)

* [Bugfix][AutoScheduler] Fail to register ComputeDAG when deserialize tasks

* fix test

* trigger ci
---
 python/tvm/auto_scheduler/search_task.py            | 13 +++++++------
 .../unittest/test_auto_scheduler_compute_dag.py     |  2 +-
 2 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/python/tvm/auto_scheduler/search_task.py b/python/tvm/auto_scheduler/search_task.py
index 83f665b229d2..175c2fa06c39 100644
--- a/python/tvm/auto_scheduler/search_task.py
+++ b/python/tvm/auto_scheduler/search_task.py
@@ -30,7 +30,7 @@
 from .compute_dag import ComputeDAG, LayoutRewriteOption
 from .cost_model import XGBModel
 from .search_policy import SketchPolicy
-from .workload_registry import register_workload_tensors
+from .workload_registry import WORKLOAD_FUNC_REGISTRY, register_workload_tensors
 from . import _ffi_api
 
 
@@ -335,11 +335,12 @@ def __setstate__(self, state):
         except Exception:  # pylint: disable=broad-except
             raise RuntimeError("Invalid workload key %s" % state["workload_key"])
 
-        # The workload from a compute DAG does not have arguments and is not registered
-        # by default so we register it here. If the workload has already been registered,
-        # the later registration overrides the prvious one.
-        if len(workload) == 1:
-            register_workload_tensors(workload[0], state["compute_dag"].tensors)
+        # workload[0] is either the compute function name or the ComputeDAG hash.
+        # The compute functions are already registered when importing TVM, so here
+        # we only register the ComputeDAG workloads. If the same workload has
+        # already been registered, the later registration overrides the prvious one.
+        if workload[0] not in WORKLOAD_FUNC_REGISTRY:
+            register_workload_tensors(state["workload_key"], state["compute_dag"].tensors)
 
         self.__init_handle_by_constructor__(
             _ffi_api.SearchTask,
diff --git a/tests/python/unittest/test_auto_scheduler_compute_dag.py b/tests/python/unittest/test_auto_scheduler_compute_dag.py
index 60b986ec37b2..b303ef56c1d2 100644
--- a/tests/python/unittest/test_auto_scheduler_compute_dag.py
+++ b/tests/python/unittest/test_auto_scheduler_compute_dag.py
@@ -121,7 +121,7 @@ def test_stage_order():
     )
 
     task2 = pickle.loads(pickle.dumps(task))
-    assert "test-key" in auto_scheduler.workload_registry.WORKLOAD_FUNC_REGISTRY
+    assert '["test-key"]' in auto_scheduler.workload_registry.WORKLOAD_FUNC_REGISTRY
     assert str(task.compute_dag.get_init_state()) == str(task2.compute_dag.get_init_state())
     assert len(task.compute_dag.get_init_state().stage_ops) == len(
         task2.compute_dag.get_init_state().stage_ops

From 9aec47486457c3fde9c6d113adb1e0cd1844be82 Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Thu, 4 Feb 2021 17:36:26 -0500
Subject: [PATCH 154/357] [CI] Temporary increase ci timeout (#7403)

---
 Jenkinsfile                              | 4 ++--
 tests/scripts/task_python_frontend.sh    | 2 ++
 tests/scripts/task_python_integration.sh | 2 ++
 3 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/Jenkinsfile b/Jenkinsfile
index 0bf3a1b98c64..ad7a4e0ad31d 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -65,7 +65,7 @@ tvm_multilib = "build/libtvm.so, " +
 // command to start a docker container
 docker_run = 'docker/bash.sh'
 // timeout in minutes
-max_time = 120
+max_time = 240
 
 def per_exec_ws(folder) {
   return "workspace/exec_${env.EXECUTOR_NUMBER}/" + folder
@@ -327,7 +327,7 @@ stage('Integration Test') {
         init_git()
         unpack_lib('cpu', tvm_multilib)
         timeout(time: max_time, unit: 'MINUTES') {
-          sh "${docker_run} ${ci_gpu} ./tests/scripts/task_ci_python_setup.sh"
+          sh "${docker_run} ${ci_cpu} ./tests/scripts/task_ci_python_setup.sh"
           sh "${docker_run} ${ci_cpu} ./tests/scripts/task_python_frontend_cpu.sh"
         }
       }
diff --git a/tests/scripts/task_python_frontend.sh b/tests/scripts/task_python_frontend.sh
index 3c5839bc7e1c..a0011c5934f0 100755
--- a/tests/scripts/task_python_frontend.sh
+++ b/tests/scripts/task_python_frontend.sh
@@ -31,6 +31,8 @@ find . -type f -path "*.pyc" | xargs rm -f
 # Rebuild cython
 make cython3
 
+exit 0
+
 echo "Running relay MXNet frontend test..."
 python3 -m pytest tests/python/frontend/mxnet
 
diff --git a/tests/scripts/task_python_integration.sh b/tests/scripts/task_python_integration.sh
index ef86d6917424..c449b85a4a68 100755
--- a/tests/scripts/task_python_integration.sh
+++ b/tests/scripts/task_python_integration.sh
@@ -27,6 +27,8 @@ export LD_LIBRARY_PATH="build:${LD_LIBRARY_PATH:-}"
 export TVM_BIND_THREADS=0
 export TVM_NUM_THREADS=2
 
+exit 0
+
 # cleanup pycache
 find . -type f -path "*.pyc" | xargs rm -f
 

From f1b9663fa1d3d413523124a3782b958cfbf88957 Mon Sep 17 00:00:00 2001
From: Cody Yu <comaniac0422@gmail.com>
Date: Thu, 4 Feb 2021 16:12:59 -0800
Subject: [PATCH 155/357] [RPC] Replace timestamp with counter (#7389)

---
 python/tvm/rpc/tracker.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/python/tvm/rpc/tracker.py b/python/tvm/rpc/tracker.py
index 557c9ae24d40..e1c366e99b0d 100644
--- a/python/tvm/rpc/tracker.py
+++ b/python/tvm/rpc/tracker.py
@@ -42,9 +42,9 @@
 # pylint: disable=invalid-name
 
 import heapq
-import time
 import logging
 import socket
+import threading
 import multiprocessing
 import errno
 import struct
@@ -112,10 +112,12 @@ def summary(self):
 
 
 class PriorityScheduler(Scheduler):
-    """Priority based scheduler, FIFO based on time"""
+    """Priority based scheduler, FIFO based on request order"""
 
     def __init__(self, key):
         self._key = key
+        self._request_cnt = 0
+        self._lock = threading.Lock()
         self._values = []
         self._requests = []
 
@@ -134,7 +136,9 @@ def put(self, value):
         self._schedule()
 
     def request(self, user, priority, callback):
-        heapq.heappush(self._requests, (-priority, time.time(), callback))
+        with self._lock:
+            heapq.heappush(self._requests, (-priority, self._request_cnt, callback))
+            self._request_cnt += 1
         self._schedule()
 
     def remove(self, value):

From c118b081eee4a8f9df8d67a9ae1da525414e3af4 Mon Sep 17 00:00:00 2001
From: Ritwik Das <ritwikdas54@gmail.com>
Date: Thu, 4 Feb 2021 20:47:15 -0800
Subject: [PATCH 156/357] Support negative pad values (#7375)

* Support negative pad values

* Update test_op_level2.py

* Update pad.cc

* Update test_op_level2.py

* PR Comments

* Update pad.cc

* Address PR Comments

* CI Error

* CI Error

* CI Error

Co-authored-by: Ubuntu <ubuntu@ip-172-31-28-115.us-east-2.compute.internal>
---
 src/relay/op/nn/pad.cc               |  9 +++--
 tests/python/relay/test_op_level2.py | 51 +++++++++++++++++++++-------
 2 files changed, 43 insertions(+), 17 deletions(-)

diff --git a/src/relay/op/nn/pad.cc b/src/relay/op/nn/pad.cc
index 5b9988b101eb..c6b987eb42aa 100644
--- a/src/relay/op/nn/pad.cc
+++ b/src/relay/op/nn/pad.cc
@@ -139,14 +139,13 @@ bool PadRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
     ICHECK(width1 != nullptr);
     ICHECK(width2 != nullptr);
 
-    ICHECK(*width1 >= 0) << "Param width elements should be positive but first pad width at "
-                         << "index " << i << " is " << *width1 << ".";
-    ICHECK(*width2 >= 0) << "Param width elements should be positive but first pad width at "
-                         << "index " << i << " is " << *width2 << ".";
-
     if (!data->shape[i].as<tir::AnyNode>()) {
       auto padding = tir::make_const(data->shape[i].dtype(), *width1 + *width2);
       oshape.push_back(data->shape[i] + padding);
+      if (tir::as_const_int(data->shape[i])) {
+        ICHECK(topi::detail::GetConstInt(data->shape[i] + padding) >= 0)
+            << "Output shape post padding should be positive but got " << data->shape[i] + padding;
+      }
     } else {
       oshape.push_back(data->shape[i]);
     }
diff --git a/tests/python/relay/test_op_level2.py b/tests/python/relay/test_op_level2.py
index 06bd01b4189a..1a1f451f4c74 100644
--- a/tests/python/relay/test_op_level2.py
+++ b/tests/python/relay/test_op_level2.py
@@ -1171,14 +1171,19 @@ def test_flatten_infer_type():
 
 @tvm.testing.uses_gpu
 def test_pad_infer_type():
-    # entirely concrete case
+    # entirely concrete cases
     n, c, h, w = 1, 2, 3, 4
     t = relay.var("t", relay.TensorType((n, c, h, w), "float32"))
     y = relay.nn.pad(t, ((1, 1), (2, 2), (3, 3), (4, 4)))
-    "pad_width=" in y.astext()
     yy = run_infer_type(y)
     assert yy.checked_type == relay.TensorType((3, 6, 9, 12), "float32")
 
+    n, c, h, w = 4, 6, 3, 5
+    t = relay.var("t", relay.TensorType((n, c, h, w), "float32"))
+    y = relay.nn.pad(t, ((-1, -1), (2, -2), (0, -3), (4, 4)), pad_mode="reflect")
+    yy = run_infer_type(y)
+    assert yy.checked_type == relay.TensorType((2, 6, 0, 13), "float32")
+
     # some symbolic values
     n, c, h, w = te.size_var("n"), 2, 3, te.size_var("w")
     t = relay.var("t", relay.TensorType((n, c, h, w), "float32"))
@@ -1186,20 +1191,42 @@ def test_pad_infer_type():
     yy = run_infer_type(y)
     assert yy.checked_type == relay.TensorType((n + 2, 6, 9, w + 8), "float32")
 
+    n, c, h, w = te.size_var("n"), te.size_var("c"), te.size_var("h"), te.size_var("w")
+    t = relay.var("t", relay.TensorType((n, c, h, w), "float32"))
+    y = relay.nn.pad(t, ((-1, -1), (-2, -2), (1, -3), (4, 4)))
+    yy = run_infer_type(y)
+    assert yy.checked_type == relay.TensorType((n + (-2), c + (-4), h + (-2), w + 8), "float32")
+
 
 @tvm.testing.uses_gpu
 def test_pad_run():
     def _test_run(dtype):
-        dshape = (4, 10, 7, 7)
-        x = relay.var("x", shape=dshape)
-        y = relay.nn.pad(x, ((1, 1), (2, 2), (3, 3), (4, 4)))
-        func = relay.Function([x], y)
-        data = np.random.uniform(size=dshape).astype(dtype)
-        ref_res = np.pad(data, ((1, 1), (2, 2), (3, 3), (4, 4)), "constant")
-        for target, ctx in tvm.testing.enabled_targets():
-            intrp1 = relay.create_executor("graph", ctx=ctx, target=target)
-            op_res1 = intrp1.evaluate(func)(data)
-            tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5, atol=1e-5)
+        dshape_list = [(4, 10, 7, 7), (4, 6, 3, 5)]
+        pad_list = [((1, 1), (2, 2), (3, 3), (4, 4)), ((-1, -1), (2, -2), (0, -2), (4, 4))]
+
+        for dshape, pad in zip(dshape_list, pad_list):
+            x = relay.var("x", shape=dshape)
+            y = relay.nn.pad(x, pad)
+            func = relay.Function([x], y)
+            data = np.random.uniform(size=dshape).astype(dtype)
+            mod_pad = []
+            mod_data = data
+            for axis, (pad_x, pad_y) in enumerate(pad):
+                indices = range(dshape[axis])
+                if pad_x < 0:
+                    indices = indices[abs(pad_x) :]
+                    pad_x = 0
+                if pad_y < 0:
+                    indices = indices[:pad_y]
+                    pad_y = 0
+                mod_data = np.take(mod_data, indices, axis)
+                mod_pad.append((pad_x, pad_y))
+
+            ref_res = np.pad(mod_data, tuple(mod_pad), "constant")
+            for target, ctx in tvm.testing.enabled_targets():
+                intrp1 = relay.create_executor("graph", ctx=ctx, target=target)
+                op_res1 = intrp1.evaluate(func)(data)
+                tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5, atol=1e-5)
 
     _test_run("float32")
     _test_run("int32")

From 38c9eb1a7876eb1ff837424e72d5a4870bdf7e1c Mon Sep 17 00:00:00 2001
From: Ritwik Das <ritwikdas54@gmail.com>
Date: Thu, 4 Feb 2021 20:49:07 -0800
Subject: [PATCH 157/357] Fix Bug in Bilinear Interpolation and Add Deform Conv
 to PT FrontEnd (#7397)

* Fix Bug in Bilinear Interpolation

* Add NHWC Tests

* clean

* Fix Bug and Add Deformable Conv PyTorch for completeness

* Add Tensor Utils

* Remove stuff

* Include vector

* PR Comments

* Empty Commit for CI

Co-authored-by: Ubuntu <ubuntu@ip-172-31-42-251.us-east-2.compute.internal>
---
 include/tvm/topi/detail/tensor_utils.h        | 95 +++++++++++--------
 python/tvm/relay/frontend/pytorch.py          | 27 ++++++
 .../topi/testing/deformable_conv2d_python.py  | 26 +++--
 python/tvm/topi/testing/roi_align_python.py   | 34 ++++---
 python/tvm/topi/vision/rcnn/roi_align.py      |  4 +-
 tests/python/frontend/pytorch/test_forward.py | 88 ++++++++++++++++-
 tests/python/relay/test_op_level5.py          | 71 ++++++++++----
 7 files changed, 257 insertions(+), 88 deletions(-)

diff --git a/include/tvm/topi/detail/tensor_utils.h b/include/tvm/topi/detail/tensor_utils.h
index 65a760b1397c..397c70c9451e 100644
--- a/include/tvm/topi/detail/tensor_utils.h
+++ b/include/tvm/topi/detail/tensor_utils.h
@@ -26,6 +26,7 @@
 
 #include <tvm/te/operation.h>
 
+#include <vector>
 namespace tvm {
 namespace topi {
 namespace detail {
@@ -64,29 +65,36 @@ inline bool is_empty_shape(const Array<PrimExpr>& x) {
  */
 inline PrimExpr bilinear_sample_nchw(const Tensor& input, const Array<PrimExpr>& indices,
                                      const PrimExpr max_y, const PrimExpr max_x) {
+  auto batch_id = indices[0];
+  auto channel_id = indices[1];
   auto in_y = indices[2];
-  auto yf = tvm::floor(in_y);
-  auto yc = tvm::cast(DataType::Int(32), tvm::ceil(in_y));
-
-  auto y0 = tvm::cast(DataType::Int(32), tvm::floor(in_y));
-  auto y1 = tvm::if_then_else((yc > max_y), max_y, yc);
-  auto y_lerp = in_y - yf;
-
   auto in_x = indices[3];
-  auto xf = tvm::floor(in_x);
-  auto xc = tvm::cast(DataType::Int(32), tvm::ceil(in_x));
-
-  auto x0 = tvm::cast(DataType::Int(32), tvm::floor(in_x));
-  auto x1 = tvm::if_then_else((xc > max_x), max_x, xc);
-  auto x_lerp = in_x - xf;
 
-  auto A = input(indices[0], indices[1], y0, x0);
-  auto B = input(indices[0], indices[1], y0, x1);
-  auto C = input(indices[0], indices[1], y1, x0);
-  auto D = input(indices[0], indices[1], y1, x1);
-
-  return A * (1 - x_lerp) * (1 - y_lerp) + B * x_lerp * (1 - y_lerp) + C * (1 - x_lerp) * y_lerp +
-         D * x_lerp * y_lerp;
+  auto y_low = tvm::cast(DataType::Int(32), tvm::floor(in_y));
+  auto y_high = y_low + 1;
+
+  auto x_low = tvm::cast(DataType::Int(32), tvm::floor(in_x));
+  auto x_high = x_low + 1;
+
+  auto wy_h = in_y - y_low;
+  auto wx_h = in_x - x_low;
+  auto wy_l = 1 - wy_h;
+  auto wx_l = 1 - wx_h;
+
+  PrimExpr val = 0;
+  std::vector<std::vector<PrimExpr>> wx_xp{{wx_l, x_low}, {wx_h, x_high}};
+  std::vector<std::vector<PrimExpr>> wy_yp{{wy_l, y_low}, {wy_h, y_high}};
+  for (auto wx_xp_ele : wx_xp) {
+    for (auto wy_yp_ele : wy_yp) {
+      auto wx = wx_xp_ele[0];
+      auto xp = wx_xp_ele[1];
+      auto wy = wy_yp_ele[0];
+      auto yp = wy_yp_ele[1];
+      val += tvm::if_then_else(0 <= yp && yp <= max_y && 0 <= xp && xp <= max_x,
+                               wx * wy * input(batch_id, channel_id, yp, xp), 0);
+    }
+  }
+  return val;
 }
 
 /*!
@@ -101,29 +109,36 @@ inline PrimExpr bilinear_sample_nchw(const Tensor& input, const Array<PrimExpr>&
  */
 inline PrimExpr bilinear_sample_nhwc(const Tensor& input, const Array<PrimExpr>& indices,
                                      const PrimExpr max_y, const PrimExpr max_x) {
+  auto batch_id = indices[0];
+  auto channel_id = indices[3];
   auto in_y = indices[1];
-  auto yf = tvm::floor(in_y);
-  auto yc = tvm::cast(DataType::Int(32), tvm::ceil(in_y));
-
-  auto y0 = tvm::cast(DataType::Int(32), tvm::floor(in_y));
-  auto y1 = tvm::if_then_else((yc > max_y), max_y, yc);
-  auto y_lerp = in_y - yf;
-
   auto in_x = indices[2];
-  auto xf = tvm::floor(in_x);
-  auto xc = tvm::cast(DataType::Int(32), tvm::ceil(in_x));
-
-  auto x0 = tvm::cast(DataType::Int(32), tvm::floor(in_x));
-  auto x1 = tvm::if_then_else((xc > max_x), max_x, xc);
-  auto x_lerp = in_x - xf;
 
-  auto A = input(indices[0], y0, x0, indices[3]);
-  auto B = input(indices[0], y0, x1, indices[3]);
-  auto C = input(indices[0], y1, x0, indices[3]);
-  auto D = input(indices[0], y1, x1, indices[3]);
-
-  return A * (1 - x_lerp) * (1 - y_lerp) + B * x_lerp * (1 - y_lerp) + C * (1 - x_lerp) * y_lerp +
-         D * x_lerp * y_lerp;
+  auto y_low = tvm::cast(DataType::Int(32), tvm::floor(in_y));
+  auto y_high = y_low + 1;
+
+  auto x_low = tvm::cast(DataType::Int(32), tvm::floor(in_x));
+  auto x_high = x_low + 1;
+
+  auto wy_h = in_y - y_low;
+  auto wx_h = in_x - x_low;
+  auto wy_l = 1 - wy_h;
+  auto wx_l = 1 - wx_h;
+
+  PrimExpr val = 0;
+  std::vector<std::vector<PrimExpr>> wx_xp{{wx_l, x_low}, {wx_h, x_high}};
+  std::vector<std::vector<PrimExpr>> wy_yp{{wy_l, y_low}, {wy_h, y_high}};
+  for (auto wx_xp_ele : wx_xp) {
+    for (auto wy_yp_ele : wy_yp) {
+      auto wx = wx_xp_ele[0];
+      auto xp = wx_xp_ele[1];
+      auto wy = wy_yp_ele[0];
+      auto yp = wy_yp_ele[1];
+      val += tvm::if_then_else(0 <= yp && yp <= max_y && 0 <= xp && xp <= max_x,
+                               wx * wy * input(batch_id, yp, xp, channel_id), 0);
+    }
+  }
+  return val;
 }
 
 }  // namespace detail
diff --git a/python/tvm/relay/frontend/pytorch.py b/python/tvm/relay/frontend/pytorch.py
index 68e68fdbeed2..246ed97b14e9 100644
--- a/python/tvm/relay/frontend/pytorch.py
+++ b/python/tvm/relay/frontend/pytorch.py
@@ -1928,6 +1928,32 @@ def roi_align(self, inputs, input_types):
 
         return _op.vision.roi_align(data, boxes, output_size, spatial_scale, sample_ratio)
 
+    def deform_conv2d(self, inputs, input_types):
+        data = inputs[0]
+        weight = inputs[1]
+        offset = inputs[2]
+        strides = (inputs[4], inputs[5])
+        padding = (inputs[6], inputs[7])
+        dilation = (inputs[8], inputs[9])
+        groups = inputs[10]
+        deformable_groups = inputs[11]
+        weight_shape = self.infer_shape(weight)
+        output_channels = weight_shape[0]
+        kernel_size = (weight_shape[2], weight_shape[3])
+
+        return _op.nn.deformable_conv2d(
+            data,
+            offset,
+            weight,
+            strides,
+            padding,
+            dilation,
+            deformable_groups,
+            groups,
+            output_channels,
+            kernel_size,
+        )
+
     def unbind(self, inputs, input_types):
         data = inputs[0]
         dim = int(inputs[1])
@@ -2292,6 +2318,7 @@ def create_convert_map(self):
             "torchvision::nms": self.nms,
             "aten::logsumexp": self.logsumexp,
             "torchvision::roi_align": self.roi_align,
+            "torchvision::deform_conv2d": self.deform_conv2d,
             "aten::unbind": self.unbind,
             "aten::__and__": self.logical_and,
             "aten::logical_and": self.logical_and,
diff --git a/python/tvm/topi/testing/deformable_conv2d_python.py b/python/tvm/topi/testing/deformable_conv2d_python.py
index 093084397ff1..758a70eb4cc1 100644
--- a/python/tvm/topi/testing/deformable_conv2d_python.py
+++ b/python/tvm/topi/testing/deformable_conv2d_python.py
@@ -17,6 +17,7 @@
 # pylint: disable=invalid-name, too-many-locals, too-many-arguments
 """Deformable convolution in python"""
 import itertools
+import math
 import numpy as np
 from tvm.topi.nn.utils import get_pad_tuple
 
@@ -80,15 +81,22 @@ def deformable_conv2d_nchw_python(
         dilation_h, dilation_w = dilation
 
     def _bilinear(n, c, h, w):
-        low_h, low_w = int(h), int(w)
-        high_h = min(low_h + 1, in_height - 1)
-        high_w = min(low_w + 1, in_width - 1)
-        y_lerp = h - low_h
-        x_lerp = w - low_w
-
-        bottom = (1 - x_lerp) * a_np[n, c, low_h, low_w] + x_lerp * a_np[n, c, low_h, high_w]
-        top = (1 - x_lerp) * a_np[n, c, high_h, low_w] + x_lerp * a_np[n, c, high_h, high_w]
-        return (1 - y_lerp) * bottom + y_lerp * top
+        y_low = int(math.floor(h))
+        x_low = int(math.floor(w))
+        y_high = y_low + 1
+        x_high = x_low + 1
+
+        wy_h = h - y_low
+        wx_h = w - x_low
+        wy_l = 1 - wy_h
+        wx_l = 1 - wx_h
+
+        val = 0
+        for wx, xp in zip((wx_l, wx_h), (x_low, x_high)):
+            for wy, yp in zip((wy_l, wy_h), (y_low, y_high)):
+                if 0 <= yp < in_height and 0 <= xp < in_width:
+                    val += wx * wy * a_np[n, c, yp, xp]
+        return val
 
     a_deform = np.zeros((batch, in_channel, out_height, out_width, kernel_h, kernel_w), dtype=dtype)
     for n, h, w in itertools.product(range(batch), range(out_height), range(out_width)):
diff --git a/python/tvm/topi/testing/roi_align_python.py b/python/tvm/topi/testing/roi_align_python.py
index 5bb292c46fbb..abef25f0b994 100644
--- a/python/tvm/topi/testing/roi_align_python.py
+++ b/python/tvm/topi/testing/roi_align_python.py
@@ -31,25 +31,29 @@ def roi_align_nchw_python(a_np, rois_np, pooled_size, spatial_scale, sample_rati
     else:
         pooled_size_h, pooled_size_w = pooled_size
 
-    def _bilinear(b, c, y, x):
+    def _bilinear(n, c, y, x):
         if y < -1 or y > height or x < -1 or x > width:
             return 0
-        y = max(y, 0.0)
-        x = max(x, 0.0)
-        y_low = int(y)
-        x_low = int(x)
 
-        y_high = min(y_low + 1, height - 1)
-        x_high = min(x_low + 1, width - 1)
+        y = min(max(y, 0), height - 1)
+        x = min(max(x, 0), width - 1)
 
-        ly = y - y_low
-        lx = x - x_low
-        return (
-            (1 - ly) * (1 - lx) * a_np[b, c, y_low, x_low]
-            + (1 - ly) * lx * a_np[b, c, y_low, x_high]
-            + ly * (1 - lx) * a_np[b, c, y_high, x_low]
-            + ly * lx * a_np[b, c, y_high, x_high]
-        )
+        y_low = int(math.floor(y))
+        x_low = int(math.floor(x))
+        y_high = y_low + 1
+        x_high = x_low + 1
+
+        wy_h = y - y_low
+        wx_h = x - x_low
+        wy_l = 1 - wy_h
+        wx_l = 1 - wx_h
+
+        val = 0
+        for wx, xp in zip((wx_l, wx_h), (x_low, x_high)):
+            for wy, yp in zip((wy_l, wy_h), (y_low, y_high)):
+                if 0 <= yp < height and 0 <= xp < width:
+                    val += wx * wy * a_np[n, c, yp, xp]
+        return val
 
     for i in range(num_roi):
         roi = rois_np[i]
diff --git a/python/tvm/topi/vision/rcnn/roi_align.py b/python/tvm/topi/vision/rcnn/roi_align.py
index a51ba33a6c45..30824770b7b2 100644
--- a/python/tvm/topi/vision/rcnn/roi_align.py
+++ b/python/tvm/topi/vision/rcnn/roi_align.py
@@ -60,8 +60,8 @@ def roi_align_nchw(data, rois, pooled_size, spatial_scale, sample_ratio=-1):
 
     def _bilinear(i, c, y, x):
         outside = tvm.tir.any(y < -1.0, x < -1.0, y > height, x > width)
-        y = tvm.te.max(y, 0.0)
-        x = tvm.te.max(x, 0.0)
+        y = tvm.te.min(tvm.te.max(y, 0.0), height - 1)
+        x = tvm.te.min(tvm.te.max(x, 0.0), width - 1)
         val = bilinear_sample_nchw(data, (i, c, y, x), height - 1, width - 1)
         return tvm.tir.if_then_else(outside, 0.0, val)
 
diff --git a/tests/python/frontend/pytorch/test_forward.py b/tests/python/frontend/pytorch/test_forward.py
index 6d9b559c6ba1..8d968e9760c9 100644
--- a/tests/python/frontend/pytorch/test_forward.py
+++ b/tests/python/frontend/pytorch/test_forward.py
@@ -216,7 +216,6 @@ def verify_model(model_name, input_data=[], custom_convert_map={}, rtol=1e-5, at
 
                 assert_shapes_match(baseline_output, compiled_output)
                 tvm.testing.assert_allclose(baseline_output, compiled_output, rtol=rtol, atol=atol)
-
     del model_name
     del baseline_model
     torch.cuda.empty_cache()
@@ -924,6 +923,85 @@ def test_forward_conv_transpose():
     verify_model(torch.nn.ConvTranspose1d(3, 12, 3, bias=False), input_data=conv1d_input_data)
 
 
+def test_forward_deform_conv():
+    torch.set_grad_enabled(False)
+
+    def test_run(
+        batch_size,
+        in_channels,
+        out_channels,
+        in_height,
+        in_width,
+        out_height,
+        out_width,
+        offset_groups,
+        kh,
+        kw,
+        groups,
+    ):
+        input_shape = [batch_size, in_channels, in_height, in_width]
+        offset_shape = [batch_size, 2 * offset_groups * kh * kw, out_height, out_width]
+        weight_shape = [out_channels, in_channels // groups, kh, kw]
+        input_data = torch.rand(input_shape)
+        offset_data = torch.rand(offset_shape)
+        weight_data = torch.rand(weight_shape)
+
+        class DeformConv2D(Module):
+            def forward(self, *args):
+                return torchvision.ops.deform_conv2d(args[0], args[1], args[2])
+
+        verify_model(
+            DeformConv2D().float().eval(),
+            input_data=[input_data, offset_data, weight_data],
+            rtol=1e-4,
+            atol=1e-4,
+        )
+
+    batch_size = 4
+    in_channels, out_channels = 4, 6
+    in_height, in_width = 10, 10
+    out_height, out_width = 8, 8
+    offset_groups = 2
+    kh, kw = 3, 3
+    groups = 1
+
+    test_run(
+        batch_size,
+        in_channels,
+        out_channels,
+        in_height,
+        in_width,
+        out_height,
+        out_width,
+        offset_groups,
+        kh,
+        kw,
+        groups,
+    )
+
+    batch_size = 5
+    in_channels, out_channels = 4, 6
+    in_height, in_width = 10, 10
+    out_height, out_width = 8, 8
+    offset_groups = 1
+    kh, kw = 3, 3
+    groups = 1
+
+    test_run(
+        batch_size,
+        in_channels,
+        out_channels,
+        in_height,
+        in_width,
+        out_height,
+        out_width,
+        offset_groups,
+        kh,
+        kw,
+        groups,
+    )
+
+
 @tvm.testing.uses_gpu
 def test_forward_threshold():
     torch.set_grad_enabled(False)
@@ -1700,7 +1778,7 @@ def test_forward_roi_align():
     """ROI align"""
     torch.set_grad_enabled(False)
 
-    class ROIAlgin(Module):
+    class ROIAlign(Module):
         def __init__(self, output_sizes, spatial_scale=1.0, sampling_ratio=-1):
             super().__init__()
             self.spatial_scale = spatial_scale
@@ -1721,9 +1799,9 @@ def forward(self, *args):
     in_batch = torch.zeros((35, 1), dtype=torch.float)
     in_boxes = torch.cat([in_batch, in_boxes], dim=1)
 
-    verify_model(ROIAlgin(7), [in_data, in_boxes])
-    verify_model(ROIAlgin((10, 10), 0.7, 5), [in_data, in_boxes])
-    verify_model(ROIAlgin(15, 0.9, 3), [in_data, in_boxes])
+    verify_model(ROIAlign(7), [in_data, in_boxes])
+    verify_model(ROIAlign((10, 10), 0.7, 5), [in_data, in_boxes])
+    verify_model(ROIAlign(15, 0.9, 3), [in_data, in_boxes])
 
 
 @tvm.testing.uses_gpu
diff --git a/tests/python/relay/test_op_level5.py b/tests/python/relay/test_op_level5.py
index cdf3b240507b..6d7d401d706b 100644
--- a/tests/python/relay/test_op_level5.py
+++ b/tests/python/relay/test_op_level5.py
@@ -837,11 +837,31 @@ def test_infer_type(batch, in_channel, size, out_channel, deformable_groups, gro
     test_infer_type(1, 4, 16, 4, 4, 1, "NHWC")
     test_infer_type(2, 4, 16, 4, 1, 2, "NHWC")
 
-    def test_run(batch, in_channel, size, out_channel, deformable_groups, groups):
+    def test_run(batch, in_channel, size, out_channel, deformable_groups, groups, layout):
         kernel_size = (3, 3)
-        data_shape = (batch, in_channel, size, size)
-        offset_shape = (batch, 2 * kernel_size[0] * kernel_size[1] * deformable_groups, size, size)
-        kernel_shape = (out_channel, in_channel // groups, kernel_size[0], kernel_size[1])
+        if layout == "NCHW":
+            kernel_layout = "OIHW"
+            data_shape = (batch, in_channel, size, size)
+            kernel_shape = (out_channel, in_channel // groups, kernel_size[0], kernel_size[1])
+            out_shape = (batch, out_channel, size, size)
+            offset_shape = (
+                batch,
+                2 * kernel_size[0] * kernel_size[1] * deformable_groups,
+                out_shape[2],
+                out_shape[3],
+            )
+        else:
+            kernel_layout = "HWIO"
+            data_shape = (batch, size, size, in_channel)
+            kernel_shape = (kernel_size[0], kernel_size[1], in_channel // groups, out_channel)
+            out_shape = (batch, size, size, out_channel)
+            offset_shape = (
+                batch,
+                out_shape[1],
+                out_shape[2],
+                2 * kernel_size[0] * kernel_size[1] * deformable_groups,
+            )
+
         dtype = "float32"
         data = relay.var("data", shape=data_shape, dtype=dtype)
         offset = relay.var("offset")
@@ -853,6 +873,8 @@ def test_run(batch, in_channel, size, out_channel, deformable_groups, groups):
             strides=(1, 1),
             padding=(1, 1),
             dilation=(1, 1),
+            data_layout=layout,
+            kernel_layout=kernel_layout,
             kernel_size=kernel_size,
             deformable_groups=deformable_groups,
             groups=groups,
@@ -862,25 +884,40 @@ def test_run(batch, in_channel, size, out_channel, deformable_groups, groups):
         data = np.random.uniform(size=data_shape).astype(dtype)
         offset = np.random.uniform(size=offset_shape).astype(dtype)
         kernel = np.random.uniform(size=kernel_shape).astype(dtype)
-        ref_res = tvm.topi.testing.deformable_conv2d_nchw_python(
-            data,
-            offset,
-            kernel,
-            stride=(1, 1),
-            padding=(1, 1),
-            dilation=(1, 1),
-            deformable_groups=deformable_groups,
-            groups=groups,
-        )
-
+        if layout == "NCHW":
+            ref_res = tvm.topi.testing.deformable_conv2d_nchw_python(
+                data,
+                offset,
+                kernel,
+                stride=(1, 1),
+                padding=(1, 1),
+                dilation=(1, 1),
+                deformable_groups=deformable_groups,
+                groups=groups,
+            )
+        else:
+            ref_res = tvm.topi.testing.deformable_conv2d_nhwc_python(
+                data,
+                offset,
+                kernel,
+                stride=(1, 1),
+                padding=(1, 1),
+                dilation=(1, 1),
+                deformable_groups=deformable_groups,
+                groups=groups,
+            )
         for target, ctx in tvm.testing.enabled_targets():
+            if target == "cuda" and layout == "NHWC":
+                continue  # Cannot run NHWC layout on cuda target, only on llvm
             for kind in ["graph", "debug"]:
                 intrp1 = relay.create_executor(kind, ctx=ctx, target=target)
                 op_res1 = intrp1.evaluate(func)(data, offset, kernel)
                 tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5, atol=1e-5)
 
-    test_run(1, 4, 16, 4, 1, 1)
-    test_run(2, 4, 16, 4, 4, 1)
+    test_run(1, 4, 16, 4, 1, 1, "NCHW")
+    test_run(1, 4, 16, 4, 1, 1, "NHWC")
+    test_run(2, 4, 16, 4, 4, 1, "NCHW")
+    test_run(2, 4, 16, 4, 4, 1, "NHWC")
 
 
 @tvm.testing.uses_gpu

From d8313d08fd0a742921a19f30a060761af8bdeaa2 Mon Sep 17 00:00:00 2001
From: Cody Yu <comaniac0422@gmail.com>
Date: Thu, 4 Feb 2021 23:35:28 -0800
Subject: [PATCH 158/357] [AutoScheduler] Support early_stopping per task
 (#7377)

* [AutoScheduler] Support early_stopping per task

* address comment

* fix test

* Update python/tvm/auto_scheduler/task_scheduler.py

* Update python/tvm/auto_scheduler/task_scheduler.py

* trigger ci

* trigger ci
---
 python/tvm/auto_scheduler/task_scheduler.py | 47 ++++++++++++++++-----
 1 file changed, 36 insertions(+), 11 deletions(-)

diff --git a/python/tvm/auto_scheduler/task_scheduler.py b/python/tvm/auto_scheduler/task_scheduler.py
index 420b5f765a97..b6b05298aef7 100644
--- a/python/tvm/auto_scheduler/task_scheduler.py
+++ b/python/tvm/auto_scheduler/task_scheduler.py
@@ -246,6 +246,9 @@ def __init__(
         # task_cts[i] saves how many times task i is tuned
         self.task_cts = [0 for _ in range(len(self.tasks))]
 
+        # task_best_cts[i] saves the round task i found the best latency
+        self.task_best_cts = [0 for _ in range(len(self.tasks))]
+
         # task_costs_history[i] saves the latency history of task i
         self.task_costs_history = [[] for _ in range(len(self.tasks))]
 
@@ -281,13 +284,14 @@ def tune(
         search_policy="default",
         search_policy_params=None,
         adapative_training=False,
+        per_task_early_stopping=None,
     ):
         """Tune a batch of tasks together.
 
         Parameters
         ----------
         tune_option: TuningOptions
-            The options of tuning
+            The tuning options applied to all tasks.
         search_policy: : Union[str, List[SearchPolicy]] = "default"
             The list of search policies.
             If it is str,
@@ -299,10 +303,17 @@ def tune(
         adapative_training : bool = False
             Option used by XGBModel to reduce the model training frequency when there're
             too many logs.
+        per_task_early_stopping : Optional[int]
+            Stop tuning a task early if getting no improvement after n measurements.
         """
         # init members
         self.tune_option = tune_option
-        early_stopping = 1e20 if tune_option.early_stopping < 0 else tune_option.early_stopping
+        self.early_stopping_all = (
+            1e20 if tune_option.early_stopping < 0 else tune_option.early_stopping
+        )
+        self.early_stopping_task = (
+            1e20 if per_task_early_stopping is None else per_task_early_stopping
+        )
 
         self.measurer = ProgramMeasurer(
             tune_option.builder,
@@ -417,13 +428,13 @@ def tune(
             if self.cur_score < self.best_score:
                 self.best_score = self.cur_score
                 self.best_ct = self.ct
-            elif self.ct - self.best_ct >= early_stopping and all(
+            elif self.ct - self.best_ct >= self.early_stopping_all and all(
                 cost < 1e9 for cost in self.best_costs
             ):
                 if self.tune_option.verbose >= 1:
                     print(
                         "Stop early since no performance improvement in the last "
-                        + str(early_stopping)
+                        + str(self.early_stopping_all)
                         + " measurement trials."
                     )
                 break
@@ -439,15 +450,22 @@ def _tune_task(self, task_idx):
             self.num_measures_per_round, self.measurer
         )
 
+        self.task_cts[task_idx] += 1
+
         for res in measure_results:
             cost = array_mean(res.costs)
             if cost < self.best_costs[task_idx]:
+                self.task_best_cts[task_idx] = self.task_cts[task_idx]
                 self.best_costs[task_idx] = cost
 
-        if len(measure_inputs) == 0:
+        # Stop tuning this task in the rest of the process if its search space has been
+        # fully explored or it has no improvement for a long while.
+        no_change_trials = (
+            self.task_cts[task_idx] - self.task_best_cts[task_idx]
+        ) * self.num_measures_per_round
+        if len(measure_inputs) == 0 or no_change_trials > self.early_stopping_task:
             self.dead_tasks.add(task_idx)
 
-        self.task_cts[task_idx] += 1
         self.task_costs_history[task_idx].append(self.best_costs[task_idx])
 
         self.ct += len(measure_inputs)
@@ -494,17 +512,24 @@ def _restore_status(self, log_file, num_measures_per_round):
             if task_idx is None:
                 continue
 
+            self.task_cts[task_idx] += 1
+
             if res.error_no == 0:
-                self.best_costs[task_idx] = min(self.best_costs[task_idx], array_mean(res.costs))
+                cost = array_mean(res.costs)
+                if self.best_costs[task_idx] < cost:
+                    self.best_costs[task_idx] = cost
+                    self.task_best_cts = self.task_cts[task_idx]
 
-            self.task_cts[task_idx] += 1
+        for idx in range(len(self.tasks)):
+            if self.task_cts[idx] - self.task_best_cts[idx] > self.early_stopping_task:
+                self.dead_tasks.add(idx)
 
-        for i in range(len(self.tasks)):
             # The computation of taks_cts is just an estimation.
             # The estimation may not be accurate if the log file is changed externally or
             # `num_measures_per_round` is different from the last tuning.
-            self.task_cts[i] = int(self.task_cts[i] / num_measures_per_round + 0.5)
-            self.task_costs_history[i].append(self.best_costs[i])
+            self.task_cts[idx] = int(self.task_cts[idx] / num_measures_per_round + 0.5)
+            self.task_best_cts[idx] = int(self.task_best_cts[idx] / num_measures_per_round + 0.5)
+            self.task_costs_history[idx].append(self.best_costs[idx])
 
         self.cur_score = self._compute_score(self.best_costs)
 

From 132cf6bd6a94252b50265e406a2b11cb037028b6 Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Fri, 5 Feb 2021 02:50:28 -0500
Subject: [PATCH 159/357] [CI] Add back the tests after timeout adjusted
 (#7408)

---
 tests/scripts/task_python_frontend.sh    | 2 --
 tests/scripts/task_python_integration.sh | 2 --
 2 files changed, 4 deletions(-)

diff --git a/tests/scripts/task_python_frontend.sh b/tests/scripts/task_python_frontend.sh
index a0011c5934f0..3c5839bc7e1c 100755
--- a/tests/scripts/task_python_frontend.sh
+++ b/tests/scripts/task_python_frontend.sh
@@ -31,8 +31,6 @@ find . -type f -path "*.pyc" | xargs rm -f
 # Rebuild cython
 make cython3
 
-exit 0
-
 echo "Running relay MXNet frontend test..."
 python3 -m pytest tests/python/frontend/mxnet
 
diff --git a/tests/scripts/task_python_integration.sh b/tests/scripts/task_python_integration.sh
index c449b85a4a68..ef86d6917424 100755
--- a/tests/scripts/task_python_integration.sh
+++ b/tests/scripts/task_python_integration.sh
@@ -27,8 +27,6 @@ export LD_LIBRARY_PATH="build:${LD_LIBRARY_PATH:-}"
 export TVM_BIND_THREADS=0
 export TVM_NUM_THREADS=2
 
-exit 0
-
 # cleanup pycache
 find . -type f -path "*.pyc" | xargs rm -f
 

From 91e07e1f3a7fe6ca047bf2acf57880f0b5393395 Mon Sep 17 00:00:00 2001
From: Josh Fromm <jwfromm@uw.edu>
Date: Fri, 5 Feb 2021 07:52:40 -0800
Subject: [PATCH 160/357] [Relay][Frontend][Onnx] Refactor where importer to
 support dynamic shapes. (#7394)

* Refactor where importer to support dynamic shapes.

* Add a test for dynamic where.
---
 python/tvm/relay/frontend/onnx.py          | 48 +++++++++-------------
 tests/python/frontend/onnx/test_forward.py | 17 ++++++--
 2 files changed, 33 insertions(+), 32 deletions(-)

diff --git a/python/tvm/relay/frontend/onnx.py b/python/tvm/relay/frontend/onnx.py
index 897c6a022594..c423598a2ee7 100644
--- a/python/tvm/relay/frontend/onnx.py
+++ b/python/tvm/relay/frontend/onnx.py
@@ -1560,34 +1560,26 @@ class Where(OnnxOpConverter):
 
     @classmethod
     def _impl_v9(cls, inputs, attr, params):
-        condition_shape = infer_shape(inputs[0])
-        x_shape = infer_shape(inputs[1])
-        y_shape = infer_shape(inputs[2])
-
-        # condition, x, and y can all be broadcasted.
-        # broadcast each of them to the longest shape.
-        # if two shapes have the same number of dimensions,
-        # try to choose the one that doesn't have "1" as
-        # a dimension.
-        shapes = [condition_shape, x_shape, y_shape]
-        shape_lens = [len(shape) for shape in shapes]
-        max_size = max(shape_lens)
-        max_size_idxs = [i for i, x in enumerate(shape_lens) if x == max_size]
-        broadcast_idx = max_size_idxs[0]
-        if len(max_size_idxs) > 1:
-            for idx in max_size_idxs:
-                if 1 not in shapes[idx]:
-                    broadcast_idx = idx
-
-        broadcast_shape = shapes[broadcast_idx]
-
-        if condition_shape != broadcast_shape:
-            inputs[0] = _op.broadcast_to(inputs[0], broadcast_shape)
-        if x_shape != broadcast_shape:
-            inputs[1] = _op.broadcast_to(inputs[1], broadcast_shape)
-        if y_shape != broadcast_shape:
-            inputs[2] = _op.broadcast_to(inputs[2], broadcast_shape)
-        return _op.where(inputs[0], inputs[1], inputs[2])
+        condition_rank = len(infer_shape(inputs[0]))
+        x_rank = len(infer_shape(inputs[1]))
+        y_rank = len(infer_shape(inputs[2]))
+        ranks = [condition_rank, x_rank, y_rank]
+
+        # If one rank is longer than others, then we can broadcast
+        # to that shape.
+        max_rank = max(ranks)
+        max_rank_idxs = [i for i, x in enumerate(ranks) if x == max_rank]
+        broadcast_shape = _op.shape_of(inputs[max_rank_idxs[0]])
+        # If two or more inputs have the same rank, compute the broadcast
+        # shape by taking the maximum value of each dimensions.
+        if len(max_rank_idxs) > 1:
+            for idx in max_rank_idxs:
+                broadcast_shape = _op.maximum(broadcast_shape, _op.shape_of(inputs[idx]))
+
+        condition = _op.broadcast_to(inputs[0], broadcast_shape)
+        x = _op.broadcast_to(inputs[1], broadcast_shape)
+        y = _op.broadcast_to(inputs[2], broadcast_shape)
+        return _op.where(condition, x, y)
 
 
 class Or(Elemwise):
diff --git a/tests/python/frontend/onnx/test_forward.py b/tests/python/frontend/onnx/test_forward.py
index 56d1dd5a5265..515fc32ef88d 100644
--- a/tests/python/frontend/onnx/test_forward.py
+++ b/tests/python/frontend/onnx/test_forward.py
@@ -2107,10 +2107,18 @@ def test_erf():
     verify_erf(x, z)
 
 
-def verify_where(condition, x, y, dtype, outdata):
-    node = helper.make_node("Where", inputs=["condition", "x", "y"], outputs=["out"])
+def verify_where(condition, x, y, dtype, outdata, dynamic=False):
+    node_list = []
+    where_inputs = ["condition", "x", "y"]
+    if dynamic:
+        shape_node = helper.make_node("Shape", ["x"], ["shape"])
+        reshape_node = helper.make_node("Reshape", ["x", "shape"], ["X"])
+        where_inputs[1] = "X"
+        node_list += [shape_node, reshape_node]
+    node = helper.make_node("Where", inputs=where_inputs, outputs=["out"])
+    node_list.append(node)
     graph = helper.make_graph(
-        [node],
+        node_list,
         "where_test",
         inputs=[
             helper.make_tensor_value_info("condition", TensorProto.BOOL, list(condition.shape)),
@@ -2120,7 +2128,7 @@ def verify_where(condition, x, y, dtype, outdata):
         outputs=[helper.make_tensor_value_info("out", dtype, list(outdata.shape))],
     )
     model = helper.make_model(graph, producer_name="where_test")
-    verify_with_ort_with_inputs(model, [condition, x, y], [outdata.shape])
+    verify_with_ort_with_inputs(model, [condition, x, y], [outdata.shape], use_vm=True)
 
 
 @tvm.testing.uses_gpu
@@ -2156,6 +2164,7 @@ def test_where():
     y = np.array([[1], [7]], dtype=np.float32)
     outdata = np.where(condition, x, y)
     verify_where(condition, x, y, TensorProto.FLOAT, outdata)
+    verify_where(condition, x, y, TensorProto.FLOAT, outdata, dynamic=True)
 
 
 def verify_or(indata, dtype):

From 4df530d1ea88a5b4028a1879311b1032645be253 Mon Sep 17 00:00:00 2001
From: Xiyou Zhou <xiyou.zhou@gmail.com>
Date: Fri, 5 Feb 2021 10:57:45 -0800
Subject: [PATCH 161/357] Add cuda tags and unit test (#7410)

* Add cuda tags and unit test

* Add missing space

* Remove extra indent

* Modify macro def position

* Fix clang format

* Fix clang format for set_config
---
 src/target/tag.cc                           | 266 +++++++++++++++++++-
 src/target/target_kind.cc                   |   3 +
 tests/python/unittest/test_target_target.py |  20 ++
 3 files changed, 283 insertions(+), 6 deletions(-)

diff --git a/src/target/tag.cc b/src/target/tag.cc
index 8198435a9494..94960894d45d 100644
--- a/src/target/tag.cc
+++ b/src/target/tag.cc
@@ -21,6 +21,8 @@
  * \file src/target/target_tag.cc
  * \brief Target tag registry
  */
+
+#include <tvm/ir/expr.h>
 #include <tvm/runtime/registry.h>
 #include <tvm/target/tag.h>
 #include <tvm/target/target.h>
@@ -66,12 +68,264 @@ Target TargetTag::AddTag(String name, Map<String, ObjectRef> config, bool overri
   return Target(config);
 }
 
-/**********  Register Target tags  **********/
+/**********  Register Target tags  ***********/
+
+#define TVM_REGISTER_CUDA_TAG(Name, Arch, SharedMem, RegPerBlock) \
+  TVM_REGISTER_TARGET_TAG(Name).set_config({                      \
+      {"kind", String("cuda")},                                   \
+      {"arch", String(Arch)},                                     \
+      {"shared_memory_per_block", Integer(SharedMem)},            \
+      {"registers_per_block", Integer(RegPerBlock)},              \
+      {"max_threads_per_block", Integer(1024)},                   \
+      {"thread_warp_size", Integer(32)},                          \
+  });
 
-TVM_REGISTER_TARGET_TAG("nvidia/rtx2080ti")
-    .set_config({
-        {"kind", String("cuda")},
-        {"arch", String("sm_75")},
-    });
+TVM_REGISTER_CUDA_TAG("nvidia/tesla-k80", "sm_37", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/tesla-k40", "sm_35", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/tesla-k20", "sm_35", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/tesla-c2075", "sm_20", 49152, 32768);
+TVM_REGISTER_CUDA_TAG("nvidia/tesla-c2050", "sm_20", 49152, 32768);
+TVM_REGISTER_CUDA_TAG("nvidia/tesla-c2070", "sm_20", 49152, 32768);
+TVM_REGISTER_CUDA_TAG("nvidia/nvidia-a100", "sm_80", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/nvidia-t4", "sm_75", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/nvidia-v100", "sm_70", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/tesla-p100", "sm_60", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/tesla-p40", "sm_61", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/tesla-p4", "sm_61", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/tesla-m60", "sm_52", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/tesla-m40", "sm_52", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/tesla-k80", "sm_37", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/tesla-k40", "sm_35", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/tesla-k20", "sm_35", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/tesla-k10", "sm_30", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/quadro-rtx-8000", "sm_75", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/quadro-rtx-6000", "sm_75", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/quadro-rtx-5000", "sm_75", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/quadro-rtx-4000", "sm_75", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/quadro-gv100", "sm_70", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/quadro-gp100", "sm_60", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/quadro-p6000", "sm_61", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/quadro-p5000", "sm_61", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/quadro-p4000", "sm_61", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/quadro-p2200", "sm_61", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/quadro-p2000", "sm_61", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/quadro-p1000", "sm_61", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/quadro-p620", "sm_61", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/quadro-p600", "sm_61", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/quadro-p400", "sm_61", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/quadro-m6000-24gb", "sm_52", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/quadro-m6000", "sm_52", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/quadro-k6000", "sm_35", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/quadro-m5000", "sm_52", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/quadro-k5200", "sm_35", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/quadro-k5000", "sm_30", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/quadro-m4000", "sm_52", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/quadro-k4200", "sm_30", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/quadro-k4000", "sm_30", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/quadro-m2000", "sm_52", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/quadro-k2200", "sm_50", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/quadro-k2000", "sm_30", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/quadro-k2000d", "sm_30", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/quadro-k1200", "sm_50", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/quadro-k620", "sm_50", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/quadro-k600", "sm_30", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/quadro-k420", "sm_30", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/quadro-410", "sm_30", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/quadro-plex-7000", "sm_20", 49152, 32768);
+TVM_REGISTER_CUDA_TAG("nvidia/rtx-5000", "sm_75", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/rtx-4000", "sm_75", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/rtx-3000", "sm_75", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/t2000", "sm_75", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/t1000", "sm_75", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/p620", "sm_61", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/p520", "sm_61", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/quadro-p5200", "sm_61", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/quadro-p4200", "sm_61", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/quadro-p3200", "sm_61", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/quadro-p5000", "sm_61", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/quadro-p4000", "sm_61", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/quadro-p3000", "sm_61", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/quadro-p2000", "sm_61", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/quadro-p1000", "sm_61", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/quadro-p600", "sm_61", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/quadro-p500", "sm_61", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/quadro-m5500m", "sm_52", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/quadro-m2200", "sm_52", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/quadro-m1200", "sm_50", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/quadro-m620", "sm_52", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/quadro-m520", "sm_50", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/quadro-k6000m", "sm_30", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/quadro-k5200m", "sm_30", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/quadro-k5100m", "sm_30", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/quadro-m5000m", "sm_50", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/quadro-k500m", "sm_30", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/quadro-k4200m", "sm_30", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/quadro-k4100m", "sm_30", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/quadro-m4000m", "sm_50", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/quadro-k3100m", "sm_30", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/quadro-m3000m", "sm_50", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/quadro-k2200m", "sm_30", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/quadro-k2100m", "sm_30", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/quadro-m2000m", "sm_50", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/quadro-k1100m", "sm_30", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/quadro-m1000m", "sm_50", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/quadro-k620m", "sm_50", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/quadro-k610m", "sm_35", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/quadro-m600m", "sm_50", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/quadro-k510m", "sm_35", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/quadro-m500m", "sm_50", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/nvidia-nvs-810", "sm_50", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/nvidia-nvs-510", "sm_30", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/nvidia-nvs-315", "sm_21", 49152, 32768);
+TVM_REGISTER_CUDA_TAG("nvidia/nvidia-nvs-310", "sm_21", 49152, 32768);
+TVM_REGISTER_CUDA_TAG("nvidia/nvs-5400m", "sm_21", 49152, 32768);
+TVM_REGISTER_CUDA_TAG("nvidia/nvs-5200m", "sm_21", 49152, 32768);
+TVM_REGISTER_CUDA_TAG("nvidia/nvs-4200m", "sm_21", 49152, 32768);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-rtx-3090", "sm_86", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-rtx-3080", "sm_86", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-rtx-3070", "sm_86", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/nvidia-titan-rtx", "sm_75", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-rtx-2080-ti", "sm_75", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-rtx-2080", "sm_75", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-rtx-2070", "sm_75", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-rtx-2060", "sm_75", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/nvidia-titan-v", "sm_70", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/nvidia-titan-xp", "sm_61", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/nvidia-titan-x", "sm_61", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-1080-ti", "sm_61", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-1080", "sm_61", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-1070-ti", "sm_61", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-1070", "sm_61", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-1060", "sm_61", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-1050", "sm_61", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-titan-x", "sm_52", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-titan-z", "sm_35", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-titan-black", "sm_35", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-titan", "sm_35", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-980-ti", "sm_52", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-980", "sm_52", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-970", "sm_52", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-960", "sm_52", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-950", "sm_52", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-780-ti", "sm_35", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-780", "sm_35", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-770", "sm_30", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-760", "sm_30", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-750-ti", "sm_50", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-750", "sm_50", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-690", "sm_30", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-680", "sm_30", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-670", "sm_30", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-660-ti", "sm_30", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-660", "sm_30", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-650-ti-boost", "sm_30", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-650-ti", "sm_30", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-650", "sm_30", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-560-ti", "sm_21", 49152, 32768);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-550-ti", "sm_21", 49152, 32768);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-460", "sm_21", 49152, 32768);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-gts-450", "sm_21", 49152, 32768);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-gts-450*", "sm_21", 49152, 32768);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-590", "sm_20", 49152, 32768);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-580", "sm_20", 49152, 32768);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-570", "sm_20", 49152, 32768);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-480", "sm_20", 49152, 32768);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-470", "sm_20", 49152, 32768);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-465", "sm_20", 49152, 32768);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-gt-740", "sm_30", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-gt-730", "sm_35", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-gt-730-ddr3,128bit", "sm_21", 49152, 32768);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-gt-720", "sm_35", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-gt-705*", "sm_35", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-gt-640-(gddr5)", "sm_35", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-gt-640-(gddr3)", "sm_21", 49152, 32768);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-gt-630", "sm_21", 49152, 32768);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-gt-620", "sm_21", 49152, 32768);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-gt-610", "sm_21", 49152, 32768);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-gt-520", "sm_21", 49152, 32768);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-gt-440", "sm_21", 49152, 32768);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-gt-440*", "sm_21", 49152, 32768);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-gt-430", "sm_21", 49152, 32768);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-gt-430*", "sm_21", 49152, 32768);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-rtx-2080", "sm_75", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-rtx-2070", "sm_75", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-rtx-2060", "sm_75", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-1080", "sm_61", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-1070", "sm_61", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-1060", "sm_61", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-980", "sm_52", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-980m", "sm_52", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-970m", "sm_52", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-965m", "sm_52", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-960m", "sm_50", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-950m", "sm_50", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-940m", "sm_50", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-930m", "sm_50", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-920m", "sm_35", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-910m", "sm_52", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-880m", "sm_30", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-870m", "sm_30", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-860m-cuda-sm_30", "sm_30", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-860m-sm-50", "sm_50", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-850m", "sm_50", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-840m", "sm_50", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-830m", "sm_50", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-820m", "sm_21", 49152, 32768);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-800m", "sm_21", 49152, 32768);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-780m", "sm_30", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-770m", "sm_30", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-765m", "sm_30", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-760m", "sm_30", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-680mx", "sm_30", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-680m", "sm_30", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-675mx", "sm_30", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-675m", "sm_21", 49152, 32768);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-670mx", "sm_30", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-670m", "sm_21", 49152, 32768);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-660m", "sm_30", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-gt-755m", "sm_30", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-gt-750m", "sm_30", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-gt-650m", "sm_30", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-gt-745m", "sm_30", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-gt-645m", "sm_30", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-gt-740m", "sm_30", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-gt-730m", "sm_30", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-gt-640m", "sm_30", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-gt-640m-le", "sm_30", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-gt-735m", "sm_30", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-gt-635m", "sm_21", 49152, 32768);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-gt-730m", "sm_30", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-gt-630m", "sm_21", 49152, 32768);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-gt-625m", "sm_21", 49152, 32768);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-gt-720m", "sm_21", 49152, 32768);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-gt-620m", "sm_21", 49152, 32768);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-710m", "sm_21", 49152, 32768);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-705m", "sm_21", 49152, 32768);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-610m", "sm_21", 49152, 32768);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-580m", "sm_21", 49152, 32768);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-570m", "sm_21", 49152, 32768);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-560m", "sm_21", 49152, 32768);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-gt-555m", "sm_21", 49152, 32768);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-gt-550m", "sm_21", 49152, 32768);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-gt-540m", "sm_21", 49152, 32768);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-gt-525m", "sm_21", 49152, 32768);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-gt-520mx", "sm_21", 49152, 32768);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-gt-520m", "sm_21", 49152, 32768);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-485m", "sm_21", 49152, 32768);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-470m", "sm_21", 49152, 32768);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-460m", "sm_21", 49152, 32768);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-gt-445m", "sm_21", 49152, 32768);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-gt-435m", "sm_21", 49152, 32768);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-gt-420m", "sm_21", 49152, 32768);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-gt-415m", "sm_21", 49152, 32768);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-480m", "sm_20", 49152, 32768);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-710m", "sm_21", 49152, 32768);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-410m", "sm_21", 49152, 32768);
+TVM_REGISTER_CUDA_TAG("nvidia/jetson-agx-xavier", "sm_72", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/jetson-nano", "sm_53", 49152, 32768);
+TVM_REGISTER_CUDA_TAG("nvidia/jetson-tx2", "sm_62", 49152, 32768);
+TVM_REGISTER_CUDA_TAG("nvidia/jetson-tx1", "sm_53", 49152, 32768);
+TVM_REGISTER_CUDA_TAG("nvidia/tegra-x1", "sm_53", 49152, 32768);
 
+#undef TVM_REGISTER_CUDA_TAG
 }  // namespace tvm
diff --git a/src/target/target_kind.cc b/src/target/target_kind.cc
index 903c3dcfefb5..cee708f80b5a 100644
--- a/src/target/target_kind.cc
+++ b/src/target/target_kind.cc
@@ -230,6 +230,9 @@ TVM_REGISTER_TARGET_KIND("cuda", kDLGPU)
     .add_attr_option<Bool>("system-lib")
     .add_attr_option<Integer>("max_num_threads", Integer(1024))
     .add_attr_option<Integer>("thread_warp_size", Integer(32))
+    .add_attr_option<Integer>("shared_memory_per_block")
+    .add_attr_option<Integer>("registers_per_block")
+    .add_attr_option<Integer>("max_threads_per_block")
     .set_default_keys({"cuda", "gpu"});
 
 TVM_REGISTER_TARGET_KIND("nvptx", kDLGPU)
diff --git a/tests/python/unittest/test_target_target.py b/tests/python/unittest/test_target_target.py
index 643043f13663..973f14958d9a 100644
--- a/tests/python/unittest/test_target_target.py
+++ b/tests/python/unittest/test_target_target.py
@@ -131,6 +131,26 @@ def test_composite_target():
     assert opencl_device.kind.name == "opencl"
 
 
+def test_target_tag_0():
+    tgt = tvm.target.Target("nvidia/geforce-rtx-2080-ti")
+    assert tgt.kind.name == "cuda"
+    assert tgt.attrs["arch"] == "sm_75"
+    assert tgt.attrs["shared_memory_per_block"] == 49152
+    assert tgt.attrs["max_threads_per_block"] == 1024
+    assert tgt.attrs["thread_warp_size"] == 32
+    assert tgt.attrs["registers_per_block"] == 65536
+
+
+def test_target_tag_1():
+    tgt = tvm.target.Target("nvidia/jetson-nano")
+    assert tgt.kind.name == "cuda"
+    assert tgt.attrs["arch"] == "sm_53"
+    assert tgt.attrs["shared_memory_per_block"] == 49152
+    assert tgt.attrs["max_threads_per_block"] == 1024
+    assert tgt.attrs["thread_warp_size"] == 32
+    assert tgt.attrs["registers_per_block"] == 32768
+
+
 if __name__ == "__main__":
     test_target_dispatch()
     test_target_string_parse()

From fc08430adda1bac02de95d4c1dab9ca6a0572f50 Mon Sep 17 00:00:00 2001
From: Matthew Brookhart <mbrookhart@octoml.ai>
Date: Sat, 6 Feb 2021 01:54:01 -0700
Subject: [PATCH 162/357] check for dynamic rank before accessing value (#7414)

---
 src/relay/op/dyn/tensor/transform.cc | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/relay/op/dyn/tensor/transform.cc b/src/relay/op/dyn/tensor/transform.cc
index 8bad3943f5ce..9724a92e8776 100644
--- a/src/relay/op/dyn/tensor/transform.cc
+++ b/src/relay/op/dyn/tensor/transform.cc
@@ -64,8 +64,9 @@ bool ReshapeRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
     return false;
   }
 
-  // Doesn't support dynamic output rank
-  for (int i = 0; i < newshape->shape[0].as<IntImmNode>()->value; i++) {
+  const IntImmNode* rank = newshape->shape[0].as<IntImmNode>();
+  ICHECK(rank != nullptr) << "Dynamic Reshape doesn't support Dynamic Rank";
+  for (int i = 0; i < rank->value; i++) {
     oshape.push_back(Any());
   }
 

From 1f846f07258f0ce67ffbc0219bf85b128756b96c Mon Sep 17 00:00:00 2001
From: masahi <masahi129@gmail.com>
Date: Sun, 7 Feb 2021 08:33:47 +0900
Subject: [PATCH 163/357] [VM] Minor refactor for C++ memory alloc (#7413)

* started moving things to header

* directly call InvokeTVMOp

* done all memory op

* also refactor AllocTensor

* declare Prod

* remove cached func for Add, Multiply, Divide

* lint fix

* revert test change

* remove tensor.h and declare Prod in pattern_utils.h
---
 src/relay/op/device_copy.cc           | 73 -----------------------
 src/relay/op/memory/memory.cc         | 83 ++++++++++++++++++++-------
 src/relay/op/memory/memory.h          | 46 +++++++++++++++
 src/relay/op/tensor/reduce.cc         |  6 +-
 src/relay/op/vm/vm.cc                 | 49 +++++++++-------
 src/relay/op/vm/vm.h                  | 40 +++++++++++++
 src/relay/transforms/fold_constant.cc |  8 ---
 src/relay/transforms/memory_alloc.cc  | 57 +++++-------------
 src/relay/transforms/pattern_utils.h  |  4 ++
 9 files changed, 200 insertions(+), 166 deletions(-)
 delete mode 100644 src/relay/op/device_copy.cc
 create mode 100644 src/relay/op/memory/memory.h
 create mode 100644 src/relay/op/vm/vm.h

diff --git a/src/relay/op/device_copy.cc b/src/relay/op/device_copy.cc
deleted file mode 100644
index 997eec5a333f..000000000000
--- a/src/relay/op/device_copy.cc
+++ /dev/null
@@ -1,73 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- *
- * \file src/relay/op/device_copy.cc
- * \brief Crossing device data copy operator.
- *
- * The pattern of this operator is registered as kOpaque. Hence, it could be
- * used as "barrier" to avoid fusing operators belonging to differen devices.
- */
-
-#include <tvm/relay/attrs/device_copy.h>
-#include <tvm/relay/expr.h>
-#include <tvm/relay/op.h>
-#include <tvm/relay/op_attr_types.h>
-#include <tvm/tir/expr.h>
-#include <tvm/topi/elemwise.h>
-
-#include "../transforms/infer_layout_utils.h"
-#include "type_relations.h"
-
-namespace tvm {
-namespace relay {
-
-// relay.device_copy
-TVM_REGISTER_NODE_TYPE(DeviceCopyAttrs);
-
-TVM_REGISTER_GLOBAL("relay.op._make.device_copy")
-    .set_body_typed([](Expr data, int src_dev_type, int dst_dev_type) {
-      auto attrs = make_object<DeviceCopyAttrs>();
-      attrs->src_dev_type = src_dev_type;
-      attrs->dst_dev_type = dst_dev_type;
-      static const Op& op = Op::Get("device_copy");
-      return Call(op, {data}, Attrs(attrs), {});
-    });
-
-RELAY_REGISTER_OP("device_copy")
-    .describe(R"code(
-Copy data from one tensor to another. The source and destination might be
-on different devices.
-)code" TVM_ADD_FILELINE)
-    .set_num_inputs(1)
-    .add_argument("data", "Tensor", "The input data.")
-    .set_support_level(10)
-    .add_type_rel("Identity", IdentityRel)
-    .set_attr<TOpPattern>("TOpPattern", kOpaque)
-    .set_attr<TOpIsStateful>("TOpIsStateful", false)
-    .set_attr<FInferCorrectLayout>("FInferCorrectLayout", ElemwiseArbitraryLayout)
-    .set_attr<FTVMCompute>("FTVMCompute",
-                           [](const Attrs& attrs, const Array<te::Tensor>& inputs,
-                              const Type& out_dtype) -> Array<te::Tensor> {
-                             return {topi::identity(inputs[0])};
-                           });
-
-}  // namespace relay
-}  // namespace tvm
diff --git a/src/relay/op/memory/memory.cc b/src/relay/op/memory/memory.cc
index c0edf467815a..287564ba4f21 100644
--- a/src/relay/op/memory/memory.cc
+++ b/src/relay/op/memory/memory.cc
@@ -22,6 +22,9 @@
  * \brief Operators for manifest shape-aware memory allocation in Relay.
  */
 
+#include "memory.h"
+
+#include <tvm/node/node.h>
 #include <tvm/relay/attrs/memory.h>
 #include <tvm/relay/expr.h>
 #include <tvm/relay/op.h>
@@ -29,9 +32,12 @@
 #include <tvm/runtime/data_type.h>
 #include <tvm/topi/elemwise.h>
 
+#include <vector>
+
 #include "../../transforms/infer_layout_utils.h"
 #include "../op_common.h"
 #include "../type_relations.h"
+#include "tvm/relay/attrs/device_copy.h"
 
 namespace tvm {
 namespace relay {
@@ -42,15 +48,16 @@ TVM_REGISTER_NODE_TYPE(AllocTensorAttrs);
 // The passing value in attrs and args doesn't seem super great.
 // We should consider a better solution, i.e the type relation
 // being able to see the arguments as well?
-TVM_REGISTER_GLOBAL("relay.op.memory._make.alloc_storage")
-    .set_body_typed([](Expr size, Expr alignment, TVMContext ctx, DataType dtype_hint) {
-      auto attrs = make_object<AllocStorageAttrs>();
-      attrs->dtype = dtype_hint;
-      attrs->device_id = ctx.device_id;
-      attrs->device_type = ctx.device_type;
-      static const Op& op = Op::Get("memory.alloc_storage");
-      return Call(op, {size, alignment}, Attrs(attrs), {});
-    });
+Expr AllocStorage(Expr size, Expr alignment, TVMContext ctx, DataType dtype_hint) {
+  auto attrs = make_object<AllocStorageAttrs>();
+  attrs->dtype = dtype_hint;
+  attrs->device_id = ctx.device_id;
+  attrs->device_type = ctx.device_type;
+  static const Op& op = Op::Get("memory.alloc_storage");
+  return Call(op, {size, alignment}, Attrs(attrs), {});
+}
+
+TVM_REGISTER_GLOBAL("relay.op.memory._make.alloc_storage").set_body_typed(AllocStorage);
 
 bool AllocStorageRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
                      const TypeReporter& reporter) {
@@ -90,19 +97,20 @@ RELAY_REGISTER_OP("memory.alloc_storage")
                              return {topi::identity(inputs[0])};
                            });
 
-TVM_REGISTER_GLOBAL("relay.op.memory._make.alloc_tensor")
-    .set_body_typed([](Expr storage, Expr offset, tvm::relay::Expr shape, DataType dtype,
-                       Array<IndexExpr> assert_shape) {
-      auto attrs = make_object<AllocTensorAttrs>();
-      attrs->dtype = dtype;
-      if (assert_shape.defined()) {
-        attrs->assert_shape = assert_shape;
-      } else {
-        attrs->const_shape = Downcast<Constant>(shape);
-      }
-      static const Op& op = Op::Get("memory.alloc_tensor");
-      return Call(op, {storage, offset, shape}, Attrs(attrs), {});
-    });
+Expr AllocTensor(Expr storage, Expr offset, tvm::relay::Expr shape, DataType dtype,
+                 Array<IndexExpr> assert_shape) {
+  auto attrs = make_object<AllocTensorAttrs>();
+  attrs->dtype = dtype;
+  if (assert_shape.defined()) {
+    attrs->assert_shape = assert_shape;
+  } else {
+    attrs->const_shape = Downcast<Constant>(shape);
+  }
+  static const Op& op = Op::Get("memory.alloc_tensor");
+  return Call(op, {storage, offset, shape}, Attrs(attrs), {});
+}
+
+TVM_REGISTER_GLOBAL("relay.op.memory._make.alloc_tensor").set_body_typed(AllocTensor);
 
 std::vector<int64_t> FromConstShape(Constant konst) {
   runtime::NDArray shape = konst->data;
@@ -299,5 +307,36 @@ TVM_REGISTER_GLOBAL("relay.op.memory._make.ToTupleType")
       return ToTupleType(t, std::vector<Expr>(array.begin(), array.end()));
     });
 
+// relay.device_copy
+TVM_REGISTER_NODE_TYPE(DeviceCopyAttrs);
+
+Expr DeviceCopy(Expr data, int src_dev_type, int dst_dev_type) {
+  auto attrs = make_object<DeviceCopyAttrs>();
+  attrs->src_dev_type = src_dev_type;
+  attrs->dst_dev_type = dst_dev_type;
+  static const Op& op = Op::Get("device_copy");
+  return Call(op, {data}, Attrs(attrs), {});
+}
+
+TVM_REGISTER_GLOBAL("relay.op._make.device_copy").set_body_typed(DeviceCopy);
+
+RELAY_REGISTER_OP("device_copy")
+    .describe(R"code(
+Copy data from one tensor to another. The source and destination might be
+on different devices.
+)code" TVM_ADD_FILELINE)
+    .set_num_inputs(1)
+    .add_argument("data", "Tensor", "The input data.")
+    .set_support_level(10)
+    .add_type_rel("Identity", IdentityRel)
+    .set_attr<TOpPattern>("TOpPattern", kOpaque)
+    .set_attr<TOpIsStateful>("TOpIsStateful", false)
+    .set_attr<FInferCorrectLayout>("FInferCorrectLayout", ElemwiseArbitraryLayout)
+    .set_attr<FTVMCompute>("FTVMCompute",
+                           [](const Attrs& attrs, const Array<te::Tensor>& inputs,
+                              const Type& out_dtype) -> Array<te::Tensor> {
+                             return {topi::identity(inputs[0])};
+                           });
+
 }  // namespace relay
 }  // namespace tvm
diff --git a/src/relay/op/memory/memory.h b/src/relay/op/memory/memory.h
new file mode 100644
index 000000000000..6e184507bad5
--- /dev/null
+++ b/src/relay/op/memory/memory.h
@@ -0,0 +1,46 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file src/relay/op/memory/memory.h
+ * \brief Operators for memory related operations in Relay.
+ */
+
+#ifndef TVM_RELAY_OP_MEMORY_MEMORY_H_
+#define TVM_RELAY_OP_MEMORY_MEMORY_H_
+
+#include <vector>
+
+#include "tvm/relay/expr.h"
+
+namespace tvm {
+namespace relay {
+
+Expr AllocStorage(Expr size, Expr alignment, TVMContext ctx, DataType dtype_hint);
+Expr DeviceCopy(Expr data, int src_dev_type, int dst_dev_type);
+Expr AllocTensor(Expr storage, Expr offset, tvm::relay::Expr shape, DataType dtype,
+                 Array<IndexExpr> assert_shape);
+Expr ToTupleType(const Type& ty, const std::vector<Expr>& exprs);
+std::vector<Expr> FromTupleType(const Type& type, const Expr& expr);
+std::vector<TensorType> FlattenTupleType(const Type& type);
+
+}  // namespace relay
+}  // namespace tvm
+
+#endif  // TVM_RELAY_OP_MEMORY_MEMORY_H_
diff --git a/src/relay/op/tensor/reduce.cc b/src/relay/op/tensor/reduce.cc
index 0b198005001b..4fa8aca4f3a9 100644
--- a/src/relay/op/tensor/reduce.cc
+++ b/src/relay/op/tensor/reduce.cc
@@ -475,7 +475,11 @@ Array<te::Tensor> ProdCompute(const Attrs& attrs, const Array<te::Tensor>& input
   return ReduceCompute(attrs, inputs, out_type, topi::prod);
 }
 
-RELAY_REGISTER_REDUCE_OP("prod")
+TVM_REGISTER_GLOBAL("relay.op._make.prod").set_body_typed(Prod);
+
+RELAY_REGISTER_OP("prod")
+    .set_num_inputs(1)
+    .add_argument("data", "Tensor", "The input tensor.")
     .describe(R"code(Computes the products of array elements over given axes.
 
 Example::
diff --git a/src/relay/op/vm/vm.cc b/src/relay/op/vm/vm.cc
index 0fb79206d71d..a74a259a114f 100644
--- a/src/relay/op/vm/vm.cc
+++ b/src/relay/op/vm/vm.cc
@@ -22,6 +22,8 @@
  * \brief Dialect operators for Relay VM.
  */
 
+#include "vm.h"
+
 #include <tvm/relay/attrs/memory.h>
 #include <tvm/relay/attrs/vm.h>
 #include <tvm/relay/expr.h>
@@ -30,6 +32,8 @@
 #include <tvm/runtime/data_type.h>
 #include <tvm/topi/elemwise.h>
 
+#include <utility>
+
 #include "../../transforms/infer_layout_utils.h"
 #include "../op_common.h"
 #include "../type_relations.h"
@@ -52,20 +56,23 @@ RELAY_REGISTER_OP("vm.shape_of")
     .set_attr<TNonComputational>("TNonComputational", true)
     .set_attr<FInferCorrectLayout>("FInferCorrectLayout", ElemwiseArbitraryLayout);
 
-TVM_REGISTER_GLOBAL("relay.op.vm.shape_of").set_body_typed([](Expr expr) {
+Expr ShapeOf(Expr expr) {
   auto attrs = make_object<ShapeOfAttrs>();
   attrs->dtype = DataType::Int(64);
   static const Op& op = Op::Get("vm.shape_of");
   return Call(op, {expr}, Attrs(attrs), {});
-});
+}
+
+TVM_REGISTER_GLOBAL("relay.op.vm.shape_of").set_body_typed(ShapeOf);
+
+Expr ShapeFunc(Expr func, Expr inputs, Expr outputs, Array<tvm::Integer> is_input) {
+  static const Op& op = Op::Get("vm.shape_func");
+  auto attrs = make_object<ShapeFuncAttrs>();
+  attrs->is_input = is_input;
+  return Call(op, {func, inputs, outputs}, Attrs(attrs), {});
+}
 
-TVM_REGISTER_GLOBAL("relay.op.vm.shape_func")
-    .set_body_typed([](Expr func, Expr inputs, Expr outputs, Array<tvm::Integer> is_input) {
-      static const Op& op = Op::Get("vm.shape_func");
-      auto attrs = make_object<ShapeFuncAttrs>();
-      attrs->is_input = is_input;
-      return Call(op, {func, inputs, outputs}, Attrs(attrs), {});
-    });
+TVM_REGISTER_GLOBAL("relay.op.vm.shape_func").set_body_typed(ShapeFunc);
 
 bool ShapeFuncRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
                   const TypeReporter& reporter) {
@@ -162,10 +169,11 @@ bool InvokeTVMOpRel(const Array<Type>& types, int num_inputs, const Attrs& attrs
   return true;
 }
 
-TVM_REGISTER_GLOBAL("relay.op.vm.invoke_tvm_op")
-    .set_body_typed([](Expr func, Expr inputs, Expr outputs) {
-      return Call(Op::Get("vm.invoke_tvm_op"), {func, inputs, outputs}, Attrs());
-    });
+Expr InvokeTVMOp(Expr func, Expr inputs, Expr outputs) {
+  return Call(Op::Get("vm.invoke_tvm_op"), {func, inputs, outputs}, Attrs());
+}
+
+TVM_REGISTER_GLOBAL("relay.op.vm.invoke_tvm_op").set_body_typed(InvokeTVMOp);
 
 RELAY_REGISTER_OP("vm.invoke_tvm_op")
     .describe(R"code(Invoke an operation compiled by TVM.)code" TVM_ADD_FILELINE)
@@ -212,13 +220,14 @@ RELAY_REGISTER_OP("vm.reshape_tensor")
     .set_attr<TNonComputational>("TNonComputational", true)
     .set_attr<FInferCorrectLayout>("FInferCorrectLayout", ElemwiseArbitraryLayout);
 
-TVM_REGISTER_GLOBAL("relay.op.vm.reshape_tensor")
-    .set_body_typed([](Expr data, Expr shape, Array<PrimExpr> newshape) {
-      static const Op& op = Op::Get("vm.reshape_tensor");
-      auto attrs = make_object<ReshapeTensorAttrs>();
-      attrs->newshape = std::move(newshape);
-      return Call(op, {data, shape}, Attrs(attrs), {});
-    });
+Expr ReshapeTensor(Expr data, Expr shape, Array<PrimExpr> newshape) {
+  static const Op& op = Op::Get("vm.reshape_tensor");
+  auto attrs = make_object<ReshapeTensorAttrs>();
+  attrs->newshape = std::move(newshape);
+  return Call(op, {data, shape}, Attrs(attrs), {});
+}
+
+TVM_REGISTER_GLOBAL("relay.op.vm.reshape_tensor").set_body_typed(ReshapeTensor);
 
 }  // namespace relay
 }  // namespace tvm
diff --git a/src/relay/op/vm/vm.h b/src/relay/op/vm/vm.h
new file mode 100644
index 000000000000..802c8100125a
--- /dev/null
+++ b/src/relay/op/vm/vm.h
@@ -0,0 +1,40 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file src/relay/op/vm/vm.h
+ * \brief Dialect operators for Relay VM.
+ */
+#ifndef TVM_RELAY_OP_VM_VM_H_
+#define TVM_RELAY_OP_VM_VM_H_
+
+#include "tvm/relay/expr.h"
+
+namespace tvm {
+namespace relay {
+
+Expr InvokeTVMOp(Expr func, Expr inputs, Expr outputs);
+Expr ShapeFunc(Expr func, Expr inputs, Expr outputs, Array<tvm::Integer> is_input);
+Expr ShapeOf(Expr expr);
+Expr ReshapeTensor(Expr data, Expr shape, Array<PrimExpr> newshape);
+
+}  // namespace relay
+}  // namespace tvm
+
+#endif  // TVM_RELAY_OP_VM_VM_H_
diff --git a/src/relay/transforms/fold_constant.cc b/src/relay/transforms/fold_constant.cc
index 0689263cca77..657d4db993b0 100644
--- a/src/relay/transforms/fold_constant.cc
+++ b/src/relay/transforms/fold_constant.cc
@@ -82,10 +82,6 @@ class ConstantFolder : public MixedModeMutator {
         device_copy_op_(Op::Get("device_copy")),
         shape_of_op_(Op::Get("shape_of")),
         vm_shape_of_op_(Op::Get("vm.shape_of")),
-        invoke_tvm_op_(Op::Get("vm.invoke_tvm_op")),
-        shape_func_op_(Op::Get("vm.shape_func")),
-        alloc_tensor_op_(Op::Get("memory.alloc_tensor")),
-        alloc_storage_op_(Op::Get("memory.alloc_storage")),
         cast_op_(Op::Get("cast")),
         ndarray_size_op_(Op::Get("ndarray_size")) {}
 
@@ -217,10 +213,6 @@ class ConstantFolder : public MixedModeMutator {
   const Op& device_copy_op_;
   const Op& shape_of_op_;
   const Op& vm_shape_of_op_;
-  const Op& invoke_tvm_op_;
-  const Op& shape_func_op_;
-  const Op& alloc_tensor_op_;
-  const Op& alloc_storage_op_;
   const Op& cast_op_;
   const Op& ndarray_size_op_;
 
diff --git a/src/relay/transforms/memory_alloc.cc b/src/relay/transforms/memory_alloc.cc
index 360778e1723b..b8c87909a025 100644
--- a/src/relay/transforms/memory_alloc.cc
+++ b/src/relay/transforms/memory_alloc.cc
@@ -41,6 +41,8 @@
 #include <vector>
 
 #include "../backend/compile_engine.h"
+#include "../op/memory/memory.h"
+#include "../op/vm/vm.h"
 #include "let_list.h"
 #include "pattern_utils.h"
 
@@ -49,10 +51,6 @@ using namespace tvm::runtime;
 namespace tvm {
 namespace relay {
 
-extern Expr ToTupleType(const Type& ty, const std::vector<Expr>& exprs);
-extern std::vector<Expr> FromTupleType(const Type& type, const Expr& expr);
-extern std::vector<TensorType> FlattenTupleType(const Type& type);
-
 using AnalysisResultMap =
     std::unordered_map<Expr, TVMContext, runtime::ObjectPtrHash, runtime::ObjectPtrEqual>;
 
@@ -62,10 +60,8 @@ inline Constant MakeConstant(const std::vector<int64_t>& value) {
 
 inline Expr AllocTensor(const Expr& storage, tvm::relay::Expr shape, DataType dtype,
                         Array<IndexExpr> assert_shape) {
-  auto f = runtime::Registry::Get("relay.op.memory._make.alloc_tensor");
-  CHECK(f != nullptr) << "unable to find alloc_tensor op";
   auto offset = MakeConstantScalar(DataType::Int(64), 0);
-  return (*f)(storage, offset, shape, dtype, assert_shape);
+  return AllocTensor(storage, offset, shape, dtype, assert_shape);
 }
 
 // A pass to check if the fused op contains only reshape ops.
@@ -106,18 +102,7 @@ bool IsReshapeOnly(const Expr& expr) {
 class DialectRewriter : public ExprMutator {
  public:
   DialectRewriter(const Target& target_host, const AnalysisResultMap& context_analysis_map)
-      : target_host_(target_host),
-        context_analysis_map_(context_analysis_map),
-        device_copy_(runtime::Registry::Get("relay.op._make.device_copy")),
-        invoke_tvm_(runtime::Registry::Get("relay.op.vm.invoke_tvm_op")),
-        alloc_storage_(runtime::Registry::Get("relay.op.memory._make.alloc_storage")),
-        shape_func_(runtime::Registry::Get("relay.op.vm.shape_func")),
-        shape_of_(runtime::Registry::Get("relay.op.vm.shape_of")),
-        reshape_tensor_(runtime::Registry::Get("relay.op.vm.reshape_tensor")),
-        prod_(runtime::Registry::Get("relay.op._make.prod")),
-        divide_(runtime::Registry::Get("relay.op._make.divide")),
-        add_(runtime::Registry::Get("relay.op._make.add")),
-        multiply_(runtime::Registry::Get("relay.op._make.multiply")) {}
+      : target_host_(target_host), context_analysis_map_(context_analysis_map) {}
 
   // Get the context of an expression.
   TVMContext GetContext(const Expr& expr) const {
@@ -209,7 +194,7 @@ class DialectRewriter : public ExprMutator {
           outs.push_back(out);
         }
         Tuple output(outs);
-        Expr invoke = (*invoke_tvm_)(cn->op, ins, output);
+        Expr invoke = InvokeTVMOp(cn->op, ins, output);
         scope.Push(invoke);
         return ToTupleType(ret_type,
                            std::vector<Expr>(output->fields.begin(), output->fields.end()));
@@ -222,7 +207,7 @@ class DialectRewriter : public ExprMutator {
  private:
   // Insert a device copy node.
   Expr DeviceCopy(const Expr& inp, int src_ctx, int dst_ctx) {
-    return ExprMutator::Mutate((*device_copy_)(inp, src_ctx, dst_ctx));
+    return ExprMutator::Mutate(relay::DeviceCopy(inp, src_ctx, dst_ctx));
   }
 
   // Check if a call invokes a primitive function.
@@ -257,11 +242,11 @@ class DialectRewriter : public ExprMutator {
 
   Expr ComputeStorageInRelay(const Expr& shape, const TensorType& type) const {
     auto dtype = DataType(type->dtype);
-    Expr els = (*prod_)(shape, Array<Expr>(nullptr), false, false);
+    Expr els = Prod(shape, Array<Integer>(nullptr), false, false);
     Expr num = MakeConstantScalar(DataType::Int(64), dtype.bits() * dtype.lanes());
-    Expr add = (*add_)(num, MakeConstantScalar(DataType::Int(64), 7));
+    Expr add = Add(num, MakeConstantScalar(DataType::Int(64), 7));
     Expr div = MakeConstantScalar(DataType::Int(64), 8);
-    Expr ret = (*multiply_)(els, (*divide_)(add, div));
+    Expr ret = Multiply(els, Divide(add, div));
     return std::move(ret);
   }
 
@@ -290,7 +275,7 @@ class DialectRewriter : public ExprMutator {
     Expr alignment = ComputeAlignment(type->dtype);
     // Run type inference later to get the correct type.
     Var var("storage_" + name_hint, Type(nullptr));
-    Expr value = (*alloc_storage_)(size, alignment, ctx, type->dtype);
+    Expr value = AllocStorage(size, alignment, ctx, type->dtype);
     auto sto = scope->Push(var, value);
 
     // TODO(@jroesch): There is a bug with typing based on the constant shape.
@@ -325,7 +310,7 @@ class DialectRewriter : public ExprMutator {
       if (state == 2) {
         std::vector<Expr> exprs = FromTupleType(ty, arg);
         for (size_t j = 0; j < exprs.size(); ++j) {
-          Expr sh_of = ExprMutator::Mutate((*shape_of_)(exprs[j]));
+          Expr sh_of = ExprMutator::Mutate(ShapeOf(exprs[j]));
           Var in_shape_var("in_shape_" + std::to_string(input_pos + j), Type(nullptr));
           shape_func_ins.push_back(scope->Push(in_shape_var, sh_of));
           input_pos++;
@@ -358,7 +343,7 @@ class DialectRewriter : public ExprMutator {
       alloc = scope->Push(shape_func_out_var, alloc);
       out_shapes.push_back(alloc);
     }
-    auto shape_call = (*shape_func_)(func, Tuple(shape_func_ins), Tuple(out_shapes), is_inputs);
+    auto shape_call = ShapeFunc(func, Tuple(shape_func_ins), Tuple(out_shapes), is_inputs);
     Var shape_func_var("shape_func", Type(nullptr));
     scope->Push(shape_func_var, shape_call);
     return out_shapes;
@@ -378,7 +363,7 @@ class DialectRewriter : public ExprMutator {
       auto size = ComputeStorageInRelay(out_shape, out_type);
       auto alignment = ComputeAlignment(out_type->dtype);
       Var sto_var("storage_" + std::to_string(i), Type(nullptr));
-      auto val = (*alloc_storage_)(size, alignment, func_ctx, out_type->dtype);
+      auto val = AllocStorage(size, alignment, func_ctx, out_type->dtype);
       storages.push_back(scope->Push(sto_var, val));
     }
 
@@ -393,7 +378,7 @@ class DialectRewriter : public ExprMutator {
     }
 
     Tuple tuple_outs(outs);
-    auto invoke = (*invoke_tvm_)(func, ins, tuple_outs);
+    auto invoke = InvokeTVMOp(func, ins, tuple_outs);
     scope->Push(invoke);
     return ToTupleType(ret_type,
                        std::vector<Expr>(tuple_outs->fields.begin(), tuple_outs->fields.end()));
@@ -415,7 +400,7 @@ class DialectRewriter : public ExprMutator {
       }
       shape_expr = MakeConstant(shape);
     }
-    return (*reshape_tensor_)(new_args[0], shape_expr, ret_ty->shape);
+    return ReshapeTensor(new_args[0], shape_expr, ret_ty->shape);
   }
 
  private:
@@ -423,18 +408,6 @@ class DialectRewriter : public ExprMutator {
   AnalysisResultMap context_analysis_map_;
   std::vector<LetList> scopes_;
 
-  // Cache the following ops
-  const PackedFunc* device_copy_;
-  const PackedFunc* invoke_tvm_;
-  const PackedFunc* alloc_storage_;
-  const PackedFunc* shape_func_;
-  const PackedFunc* shape_of_;
-  const PackedFunc* reshape_tensor_;
-  const PackedFunc* prod_;
-  const PackedFunc* divide_;
-  const PackedFunc* add_;
-  const PackedFunc* multiply_;
-
   runtime::DataType compute_dtype_ = runtime::DataType::Int(64);
   TVMContext default_context_{kDLCPU, 0};
 };
diff --git a/src/relay/transforms/pattern_utils.h b/src/relay/transforms/pattern_utils.h
index 8ef86e088193..bc0fcc9f2988 100644
--- a/src/relay/transforms/pattern_utils.h
+++ b/src/relay/transforms/pattern_utils.h
@@ -644,6 +644,10 @@ static inline Expr Sum(Expr data, Array<Integer> axis, bool keepdims, bool exclu
   return MakeReduce(data, axis, keepdims, exclude, "sum");
 }
 
+static inline Expr Prod(Expr data, Array<Integer> axis, bool keepdims, bool exclude) {
+  return MakeReduce(data, axis, keepdims, exclude, "prod");
+}
+
 static inline Expr Reshape(Expr data, Array<Integer> newshape) {
   return MakeReshape(data, newshape);
 }

From 9daf3fee71db91e1adae8410ab8c70846df764ed Mon Sep 17 00:00:00 2001
From: dlexplorer <70961591+dlexplorer@users.noreply.github.com>
Date: Sun, 7 Feb 2021 05:47:30 +0300
Subject: [PATCH 164/357] Fix AutoScheduler for anaconda python (#7387)

In case of non cpython flavour of python, the task passed to measure process
should be serialized using pickle approach. The task includes workload
which is a list of Tensors. The list should be serialized and deserialized
as an atomic object.
---
 .../tvm/auto_scheduler/workload_registry.py   | 18 ++++++---
 .../unittest/test_auto_scheduler_measure.py   | 39 +++++++++++++++++++
 2 files changed, 51 insertions(+), 6 deletions(-)

diff --git a/python/tvm/auto_scheduler/workload_registry.py b/python/tvm/auto_scheduler/workload_registry.py
index 51ae64d6adeb..cd8f8c9d1a3e 100644
--- a/python/tvm/auto_scheduler/workload_registry.py
+++ b/python/tvm/auto_scheduler/workload_registry.py
@@ -35,6 +35,7 @@
 import json
 
 import tvm._ffi
+from tvm.runtime._ffi_node_api import LoadJSON, SaveJSON
 from .utils import serialize_args, deserialize_args, get_func_name
 
 logger = logging.getLogger("auto_scheduler")
@@ -216,13 +217,17 @@ def serialize_workload_registry_entry(workload_key):
     global WORKLOAD_FUNC_REGISTRY
 
     if workload_key in WORKLOAD_FUNC_REGISTRY:
-        return (workload_key, WORKLOAD_FUNC_REGISTRY[workload_key])
+        sname = workload_key
+    else:
+        workload = json.loads(workload_key)
+        sname = workload[0]
 
-    workload = json.loads(workload_key)
-    name = workload[0]
-    value = WORKLOAD_FUNC_REGISTRY[name]
+    svalue = WORKLOAD_FUNC_REGISTRY[sname]
+    if not callable(svalue):
+        # pylint: disable=assignment-from-no-return
+        svalue = SaveJSON(svalue)
 
-    return name, value
+    return sname, svalue
 
 
 def deserialize_workload_registry_entry(data):
@@ -239,7 +244,8 @@ def deserialize_workload_registry_entry(data):
 
     name, value = data
     if name not in WORKLOAD_FUNC_REGISTRY:
-        WORKLOAD_FUNC_REGISTRY[name] = value
+        # pylint: disable=assignment-from-no-return
+        WORKLOAD_FUNC_REGISTRY[name] = LoadJSON(value)
 
 
 def save_workload_func_registry(filename):
diff --git a/tests/python/unittest/test_auto_scheduler_measure.py b/tests/python/unittest/test_auto_scheduler_measure.py
index 041fb7ee76d3..cc9d7a41548d 100644
--- a/tests/python/unittest/test_auto_scheduler_measure.py
+++ b/tests/python/unittest/test_auto_scheduler_measure.py
@@ -24,8 +24,10 @@
 from tvm import te, auto_scheduler
 import tempfile
 import tvm.testing
+import pickle
 
 from test_auto_scheduler_common import matmul_auto_scheduler_test, get_tiled_matmul
+from tvm.auto_scheduler import workload_registry
 
 
 def record_common(dag, s):
@@ -255,6 +257,42 @@ def test_measure_local_builder_runner():
         assert mress[0].error_no == 0
 
 
+def test_dag_measure_local_builder_runner():
+    if not tvm.testing.device_enabled("llvm"):
+        return
+
+    A = te.placeholder((512, 512), name="A")
+    B = te.placeholder((512, 512), name="B")
+    k = te.reduce_axis((0, 512), name="k")
+    C = te.compute((512, 512), lambda i, j: te.sum(A[i][k] * B[k][j], axis=[k]), name="C")
+    D = topi.nn.relu(C)
+    E = topi.nn.relu(D)
+
+    tensors = [A, B, E]
+    dag = auto_scheduler.ComputeDAG(tensors)
+    key = workload_registry.register_workload_tensors(dag.workload_key(), tensors)
+    transfer_data = workload_registry.serialize_workload_registry_entry(key)
+    f_data = pickle.dumps(transfer_data)
+    f_new = pickle.loads(f_data)
+    del workload_registry.WORKLOAD_FUNC_REGISTRY[key]
+    workload_registry.deserialize_workload_registry_entry(f_new)
+
+    target = tvm.target.Target("llvm")
+    task = auto_scheduler.SearchTask(compute_dag=dag, workload_key=key, target=target)
+
+    for enable_cpu_cache_flush in [True, False]:
+        minp = auto_scheduler.MeasureInput(task, task.compute_dag.init_state)
+        local_builder = auto_scheduler.LocalBuilder()
+        local_runner = auto_scheduler.LocalRunner(
+            timeout=60, enable_cpu_cache_flush=enable_cpu_cache_flush
+        )
+
+        bress = local_builder.build([minp])
+        assert bress[0].error_no == 0
+        mress = local_runner.run([minp], bress)
+        assert mress[0].error_no == 0
+
+
 def test_measure_local_builder_rpc_runner():
     if not tvm.testing.device_enabled("llvm"):
         return
@@ -325,5 +363,6 @@ def test_measure_target_host():
     test_recover_measure_input()
     test_workload_dis_factor()
     test_measure_local_builder_runner()
+    test_dag_measure_local_builder_runner()
     test_measure_local_builder_rpc_runner()
     test_measure_target_host()

From 33f30aff0dac784836dc4bf4e12bf1cf845a6370 Mon Sep 17 00:00:00 2001
From: Matthew Bentham <matthew.bentham@arm.com>
Date: Sun, 7 Feb 2021 21:47:44 +0000
Subject: [PATCH 165/357] Fix compilation when Arm FP16 extensions are enabled
 (#7386)

Fixes incorrect number of template parameters in call to sort()

Signed-off-by: Matthew Bentham <matthew.bentham@arm.com>
---
 src/runtime/contrib/sort/sort.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/runtime/contrib/sort/sort.cc b/src/runtime/contrib/sort/sort.cc
index fba57d923b38..66f36ffa50d6 100644
--- a/src/runtime/contrib/sort/sort.cc
+++ b/src/runtime/contrib/sort/sort.cc
@@ -289,7 +289,7 @@ TVM_REGISTER_GLOBAL("tvm.contrib.sort.sort").set_body([](TVMArgs args, TVMRetVal
     sort<double>(input, output, axis, is_ascend);
 #if (__ARM_FEATURE_FP16_SCALAR_ARITHMETIC == 1)
   } else if (data_dtype == "float16") {
-    sort<__fp16, __fp16>(input, output, axis, is_ascend);
+    sort<__fp16>(input, output, axis, is_ascend);
 #endif
   } else if (data_dtype == "int32") {
     sort<int32_t>(input, output, axis, is_ascend);

From 5103bb6a6374dfde2cbf5171dc8637b911df1322 Mon Sep 17 00:00:00 2001
From: Andrew Reusch <areusch@octoml.ai>
Date: Mon, 8 Feb 2021 10:56:51 -0800
Subject: [PATCH 166/357] Jenkinsfile changes for #7333. (#7388)

---
 Jenkinsfile                    | 22 +++++++++++-----------
 tests/scripts/task_ci_setup.sh | 33 +++++++++++++++++++++++++++++++++
 2 files changed, 44 insertions(+), 11 deletions(-)
 create mode 100755 tests/scripts/task_ci_setup.sh

diff --git a/Jenkinsfile b/Jenkinsfile
index ad7a4e0ad31d..6bf6dcfa966a 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -181,7 +181,7 @@ stage('Build') {
         make(ci_cpu, 'build', '-j2')
         pack_lib('cpu', tvm_multilib)
         timeout(time: max_time, unit: 'MINUTES') {
-          sh "${docker_run} ${ci_cpu} ./tests/scripts/task_ci_python_setup.sh"
+          sh "${docker_run} ${ci_cpu} ./tests/scripts/task_ci_setup.sh"
           sh "${docker_run} ${ci_cpu} ./tests/scripts/task_python_unittest.sh"
           sh "${docker_run} ${ci_cpu} ./tests/scripts/task_python_integration.sh"
           sh "${docker_run} ${ci_cpu} ./tests/scripts/task_python_vta_fsim.sh"
@@ -199,7 +199,7 @@ stage('Build') {
         sh "${docker_run} ${ci_wasm} ./tests/scripts/task_config_build_wasm.sh"
         make(ci_wasm, 'build', '-j2')
         timeout(time: max_time, unit: 'MINUTES') {
-          sh "${docker_run} ${ci_wasm} ./tests/scripts/task_ci_python_setup.sh"
+          sh "${docker_run} ${ci_wasm} ./tests/scripts/task_ci_setup.sh"
           sh "${docker_run} ${ci_wasm} ./tests/scripts/task_web_wasm.sh"
         }
       }
@@ -232,7 +232,7 @@ stage('Build') {
         sh "${docker_run} ${ci_qemu} ./tests/scripts/task_config_build_qemu.sh"
         make(ci_qemu, 'build', '-j2')
         timeout(time: max_time, unit: 'MINUTES') {
-          sh "${docker_run} ${ci_qemu} ./tests/scripts/task_ci_python_setup.sh"
+          sh "${docker_run} ${ci_qemu} ./tests/scripts/task_ci_setup.sh"
           sh "${docker_run} ${ci_qemu} ./tests/scripts/task_python_microtvm.sh"
         }
       }
@@ -247,7 +247,7 @@ stage('Unit Test') {
         init_git()
         unpack_lib('gpu', tvm_multilib)
         timeout(time: max_time, unit: 'MINUTES') {
-          sh "${docker_run} ${ci_gpu} ./tests/scripts/task_ci_python_setup.sh"
+          sh "${docker_run} ${ci_gpu} ./tests/scripts/task_ci_setup.sh"
           sh "${docker_run} ${ci_gpu} ./tests/scripts/task_sphinx_precheck.sh"
           sh "${docker_run} ${ci_gpu} ./tests/scripts/task_python_unittest_gpuonly.sh"
           sh "${docker_run} ${ci_gpu} ./tests/scripts/task_python_integration_gpuonly.sh"
@@ -261,7 +261,7 @@ stage('Unit Test') {
         init_git()
         unpack_lib('i386', tvm_multilib)
         timeout(time: max_time, unit: 'MINUTES') {
-          sh "${docker_run} ${ci_i386} ./tests/scripts/task_ci_python_setup.sh"
+          sh "${docker_run} ${ci_i386} ./tests/scripts/task_ci_setup.sh"
           sh "${docker_run} ${ci_i386} ./tests/scripts/task_python_unittest.sh"
           sh "${docker_run} ${ci_i386} ./tests/scripts/task_python_integration.sh"
           sh "${docker_run} ${ci_i386} ./tests/scripts/task_python_vta_fsim.sh"
@@ -275,7 +275,7 @@ stage('Unit Test') {
         init_git()
         unpack_lib('arm', tvm_multilib)
         timeout(time: max_time, unit: 'MINUTES') {
-          sh "${docker_run} ${ci_arm} ./tests/scripts/task_ci_python_setup.sh"
+          sh "${docker_run} ${ci_arm} ./tests/scripts/task_ci_setup.sh"
           sh "${docker_run} ${ci_arm} ./tests/scripts/task_python_unittest.sh"
           // sh "${docker_run} ${ci_arm} ./tests/scripts/task_python_integration.sh"
         }
@@ -288,7 +288,7 @@ stage('Unit Test') {
         init_git()
         unpack_lib('gpu', tvm_multilib)
         timeout(time: max_time, unit: 'MINUTES') {
-          sh "${docker_run} ${ci_gpu} ./tests/scripts/task_ci_python_setup.sh"
+          sh "${docker_run} ${ci_gpu} ./tests/scripts/task_ci_setup.sh"
           sh "${docker_run} ${ci_gpu} ./tests/scripts/task_java_unittest.sh"
         }
       }
@@ -303,7 +303,7 @@ stage('Integration Test') {
         init_git()
         unpack_lib('gpu', tvm_multilib)
         timeout(time: max_time, unit: 'MINUTES') {
-          sh "${docker_run} ${ci_gpu} ./tests/scripts/task_ci_python_setup.sh"
+          sh "${docker_run} ${ci_gpu} ./tests/scripts/task_ci_setup.sh"
           sh "${docker_run} ${ci_gpu} ./tests/scripts/task_python_topi.sh"
         }
       }
@@ -315,7 +315,7 @@ stage('Integration Test') {
         init_git()
         unpack_lib('gpu', tvm_multilib)
         timeout(time: max_time, unit: 'MINUTES') {
-          sh "${docker_run} ${ci_gpu} ./tests/scripts/task_ci_python_setup.sh"
+          sh "${docker_run} ${ci_gpu} ./tests/scripts/task_ci_setup.sh"
           sh "${docker_run} ${ci_gpu} ./tests/scripts/task_python_frontend.sh"
         }
       }
@@ -327,7 +327,7 @@ stage('Integration Test') {
         init_git()
         unpack_lib('cpu', tvm_multilib)
         timeout(time: max_time, unit: 'MINUTES') {
-          sh "${docker_run} ${ci_cpu} ./tests/scripts/task_ci_python_setup.sh"
+          sh "${docker_run} ${ci_cpu} ./tests/scripts/task_ci_setup.sh"
           sh "${docker_run} ${ci_cpu} ./tests/scripts/task_python_frontend_cpu.sh"
         }
       }
@@ -339,7 +339,7 @@ stage('Integration Test') {
         init_git()
         unpack_lib('gpu', tvm_multilib)
         timeout(time: max_time, unit: 'MINUTES') {
-          sh "${docker_run} ${ci_gpu} ./tests/scripts/task_ci_python_setup.sh"
+          sh "${docker_run} ${ci_gpu} ./tests/scripts/task_ci_setup.sh"
           sh "${docker_run} ${ci_gpu} ./tests/scripts/task_python_docs.sh"
         }
         pack_lib('mydocs', 'docs.tgz')
diff --git a/tests/scripts/task_ci_setup.sh b/tests/scripts/task_ci_setup.sh
new file mode 100755
index 000000000000..f48ed49a2266
--- /dev/null
+++ b/tests/scripts/task_ci_setup.sh
@@ -0,0 +1,33 @@
+#!/bin/bash
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+set -e
+set -u
+set -o pipefail
+
+# Script to setup additional python env.
+#
+# Use the following command to install the
+# package to /workspace/.local, these additional
+# packages will have precedence over the system packages.
+#
+# command: python3 -m pip install --user <package>==<version>
+#
+echo "Addtiional setup in" ${CI_IMAGE_NAME}
+
+python3 -m pip install --user tlcpack-sphinx-addon==0.1.4 synr==0.2.1

From 0e7e2dc290e336a709811ebf6d95328d06522f23 Mon Sep 17 00:00:00 2001
From: Andrew Reusch <areusch@octoml.ai>
Date: Mon, 8 Feb 2021 12:13:50 -0800
Subject: [PATCH 167/357] =?UTF-8?q?[=C2=B5TVM]=20Add=20VMWare=20to=20Refer?=
 =?UTF-8?q?ence=20VM=20instructions=20(#7221)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* support vmware_desktop provider for microTVM reference VM

* update tutorial

* python format

* try to fix sphinx warning

* fix sphinx warning

* retrigger CI
---
 apps/microtvm/reference-vm/base-box-tool.py   | 68 +++++++++++++++----
 apps/microtvm/reference-vm/zephyr/Vagrantfile | 10 +++
 .../base-box/Vagrantfile.packer-template      |  7 ++
 .../reference-vm/zephyr/base-box/setup.sh     | 24 +++++--
 tutorials/micro/micro_reference_vm.py         | 12 ++--
 5 files changed, 98 insertions(+), 23 deletions(-)

diff --git a/apps/microtvm/reference-vm/base-box-tool.py b/apps/microtvm/reference-vm/base-box-tool.py
index c317a373bd8b..0e82dc2e9c0e 100755
--- a/apps/microtvm/reference-vm/base-box-tool.py
+++ b/apps/microtvm/reference-vm/base-box-tool.py
@@ -18,6 +18,7 @@
 
 
 import argparse
+import copy
 import json
 import logging
 import os
@@ -38,6 +39,7 @@
 ALL_PROVIDERS = (
     "parallels",
     "virtualbox",
+    "vmware_desktop",
 )
 
 
@@ -141,9 +143,27 @@ def attach_parallels(uuid, vid_hex=None, pid_hex=None, serial=None):
     )
 
 
+def attach_vmware(uuid, vid_hex=None, pid_hex=None, serial=None):
+    print("NOTE: vmware doesn't seem to support automatic attaching of devices :(")
+    print("The VMWare VM UUID is {uuid}")
+    print("Please attach the following usb device using the VMWare GUI:")
+    if vid_hex is not None:
+        print(f" - VID: {vid_hex}")
+    if pid_hex is not None:
+        print(f" - PID: {pid_hex}")
+    if serial is not None:
+        print(f" - Serial: {serial}")
+    if vid_hex is None and pid_hex is None and serial is None:
+        print(" - (no specifications given for USB device)")
+    print()
+    print("Press [Enter] when the USB device is attached")
+    input()
+
+
 ATTACH_USB_DEVICE = {
     "parallels": attach_parallels,
     "virtualbox": attach_virtualbox,
+    "vmware_desktop": attach_vmware,
 }
 
 
@@ -153,6 +173,7 @@ def generate_packer_config(file_path, providers):
         builders.append(
             {
                 "type": "vagrant",
+                "box_name": f"microtvm-base-{provider_name}",
                 "output_dir": f"output-packer-{provider_name}",
                 "communicator": "ssh",
                 "source_path": "generic/ubuntu1804",
@@ -175,10 +196,19 @@ def generate_packer_config(file_path, providers):
 def build_command(args):
     generate_packer_config(
         os.path.join(THIS_DIR, args.platform, "base-box", "packer.json"),
-        args.provider.split(",") or ALL_PROVIDERS,
+        args.provider or ALL_PROVIDERS,
     )
+    env = None
+    packer_args = ["packer", "build"]
+    if args.debug_packer:
+        env = copy.copy(os.environ)
+        env["PACKER_LOG"] = "1"
+        env["PACKER_LOG_PATH"] = "packer.log"
+        packer_args += ["-debug"]
+
+    packer_args += ["packer.json"]
     subprocess.check_call(
-        ["packer", "build", "packer.json"], cwd=os.path.join(THIS_DIR, args.platform, "base-box")
+        packer_args, cwd=os.path.join(THIS_DIR, args.platform, "base-box"), env=env
     )
 
 
@@ -318,16 +348,17 @@ def test_command(args):
 
 
 def release_command(args):
-    subprocess.check_call(
-        [
-            "vagrant",
-            "cloud",
-            "version",
-            "create",
-            f"tlcpack/microtvm-{args.platform}",
-            args.release_version,
-        ]
-    )
+    if not args.skip_creating_release_version:
+        subprocess.check_call(
+            [
+                "vagrant",
+                "cloud",
+                "version",
+                "create",
+                f"tlcpack/microtvm-{args.platform}",
+                args.release_version,
+            ]
+        )
     if not args.release_version:
         sys.exit(f"--release-version must be specified")
 
@@ -399,6 +430,19 @@ def parse_args():
         "--release-version",
         help="Version to release, in the form 'x.y.z'. Must be specified with release.",
     )
+    parser.add_argument(
+        "--skip-creating-release-version",
+        action="store_true",
+        help="With release, skip creating the version and just upload for this provider.",
+    )
+    parser.add_argument(
+        "--debug-packer",
+        action="store_true",
+        help=(
+            "When the build command is given, run packer in debug mode, and write log to the "
+            "base-box directory"
+        ),
+    )
 
     return parser.parse_args()
 
diff --git a/apps/microtvm/reference-vm/zephyr/Vagrantfile b/apps/microtvm/reference-vm/zephyr/Vagrantfile
index 5a73d1f5e79b..b7f9e4d2363d 100644
--- a/apps/microtvm/reference-vm/zephyr/Vagrantfile
+++ b/apps/microtvm/reference-vm/zephyr/Vagrantfile
@@ -57,4 +57,14 @@ Vagrant.configure("2") do |config|
     end
   end
 
+  config.vm.provider "vmware_desktop" do |vm, overrides|
+    vm.vmx["usb_xhci.present"] = "TRUE"
+    vm.vmx["usb.present"] = "TRUE"
+    vm.vmx["ehci.present"] = "TRUE"
+    dirs_to_mount.each do |d|
+      overrides.vm.synced_folder d.to_s, d.to_s
+    end
+    vm.gui = true
+  end
+
 end
diff --git a/apps/microtvm/reference-vm/zephyr/base-box/Vagrantfile.packer-template b/apps/microtvm/reference-vm/zephyr/base-box/Vagrantfile.packer-template
index b1fff9c63806..38f9a20b56cf 100644
--- a/apps/microtvm/reference-vm/zephyr/base-box/Vagrantfile.packer-template
+++ b/apps/microtvm/reference-vm/zephyr/base-box/Vagrantfile.packer-template
@@ -36,5 +36,12 @@ Vagrant.configure("2") do |config|
     config.vm.synced_folder ".", "/vagrant", disabled: true
   {{- end}}
 
+
+  {{ if eq .BoxName "microtvm-base-vmware_desktop" -}}
+  config.vm.provision "shell", inline: "touch ~/skip_zeroing_disk", privileged: false
+  {{- end}}
+
+  # NOTE: setup.sh resides in the parent directory (../) because this template is expanded into a
+  # sub-directory of base-box (output-packer-*).
   config.vm.provision "shell", path: "../setup.sh", privileged: false
 end
diff --git a/apps/microtvm/reference-vm/zephyr/base-box/setup.sh b/apps/microtvm/reference-vm/zephyr/base-box/setup.sh
index fd758064f4ca..52af947c3e89 100644
--- a/apps/microtvm/reference-vm/zephyr/base-box/setup.sh
+++ b/apps/microtvm/reference-vm/zephyr/base-box/setup.sh
@@ -18,6 +18,13 @@
 
 set -e
 
+skip_zeroing_disk=0
+if [ -e "$HOME/skip_zeroing_disk" ]; then
+    echo "NOTE: will not zero disk at the end due to VMWare Fusion bug"
+    echo "See: https://communities.vmware.com/t5/VMware-Fusion-Discussions/VMWare-Fusion-Pro-11-15-6-16696540-causes-macOS-crash-during/m-p/2284011#M139190"
+    skip_zeroing_disk=1
+fi
+
 sudo apt update
 sudo apt install -y build-essential
 sudo apt-get --purge remove modemmanager  # required to access serial ports.
@@ -96,10 +103,15 @@ sed -i "/^# If not running interactively,/ i\\ " ~/.bashrc
 
 # Clean box for packaging as a base box
 sudo apt-get clean
-EMPTY_FILE="$HOME/EMPTY"
-dd if=/dev/zero "of=${EMPTY_FILE}" bs=1M || /bin/true
-if [ ! -e "${EMPTY_FILE}" ]; then
-    echo "failed to zero empty sectors on disk"
-    exit 2
+if [ $skip_zeroing_disk -eq 0 ]; then
+    echo "Zeroing disk..."
+    EMPTY_FILE="$HOME/EMPTY"
+    dd if=/dev/zero "of=${EMPTY_FILE}" bs=1M || /bin/true
+    if [ ! -e "${EMPTY_FILE}" ]; then
+        echo "failed to zero empty sectors on disk"
+        exit 2
+    fi
+    rm -f "${EMPTY_FILE}"
+else
+    echo "NOTE: skipping zeroing disk due to command-line argument."
 fi
-rm -f "${EMPTY_FILE}"
diff --git a/tutorials/micro/micro_reference_vm.py b/tutorials/micro/micro_reference_vm.py
index 07d29401c0e8..93395a44c8ae 100644
--- a/tutorials/micro/micro_reference_vm.py
+++ b/tutorials/micro/micro_reference_vm.py
@@ -59,15 +59,17 @@
 
 A minimal set of prerequisites are needed:
 
-
 1. `Vagrant <https://vagrantup.com>`__
-2. A supported Virtual Machine hypervisor.
-   `VirtualBox <https://www.virtualbox.org>`__ is one suggested free hypervisor, but please note
+2. A supported Virtual Machine hypervisor (**VirtualBox**, **Parallels**, or **VMWare Fusion/Workstation**).
+   `VirtualBox <https://www.virtualbox.org>`__ is a suggested free hypervisor, but please note
    that the `VirtualBox Extension Pack`_ is required for proper USB forwarding. If using VirtualBox,
    also consider installing the `vbguest <https://github.com/dotless-de/vagrant-vbguest>`_ plugin.
 
 .. _VirtualBox Extension Pack: https://www.virtualbox.org/wiki/Downloads#VirtualBox6.1.16OracleVMVirtualBoxExtensionPack
 
+3. If required for your hypervisor, the
+   `Vagrant provider plugin <https://github.com/hashicorp/vagrant/wiki/Available-Vagrant-Plugins#providers>`__ (or see `here <https://www.vagrantup.com/vmware>`__ for VMWare).
+
 First boot
 ----------
 
@@ -75,9 +77,9 @@
 
 .. code-block:: bash
 
-    # Replace zepyhr with the name of a different platform, if you are not using Zephyr.
+    # Replace zephyr with the name of a different platform, if you are not using Zephyr.
     ~/.../tvm $ cd apps/microtvm/reference-vm/zephyr
-    # Replace <provider_name> with the name of the hypervisor you wish to use (i.e. virtualbox).
+    # Replace <provider_name> with the name of the hypervisor you wish to use (i.e. virtualbox, parallels, vmware_desktop).
     ~/.../tvm/apps/microtvm/reference-vm/zephyr $ vagrant up --provider=<provider_name>
 
 

From c789a2959dac48f217afca917034f1509a690f83 Mon Sep 17 00:00:00 2001
From: Andrew Reusch <areusch@octoml.ai>
Date: Mon, 8 Feb 2021 12:21:48 -0800
Subject: [PATCH 168/357] Generate JUnitXML from pytest (#7407)

* Generate JUnitXML from pytest.

* address tkonolige comments
---
 tests/scripts/setup-pytest-env.sh             | 16 ++++++++++++-
 .../task_python_arm_compute_library.sh        |  5 ++--
 tests/scripts/task_python_ethosn_tests.sh     |  4 ++--
 tests/scripts/task_python_frontend.sh         | 14 +++++------
 tests/scripts/task_python_frontend_cpu.sh     |  6 ++---
 tests/scripts/task_python_integration.sh      | 23 +++++++++++--------
 .../task_python_integration_gpuonly.sh        |  1 +
 tests/scripts/task_python_microtvm.sh         |  3 +--
 tests/scripts/task_python_nightly.sh          |  2 +-
 tests/scripts/task_python_topi.sh             |  2 +-
 tests/scripts/task_python_unittest.sh         | 10 ++++----
 tests/scripts/task_python_unittest_gpuonly.sh |  1 +
 tests/scripts/task_python_vta_fsim.sh         |  6 +++--
 tests/scripts/task_python_vta_tsim.sh         |  6 +++--
 14 files changed, 62 insertions(+), 37 deletions(-)

diff --git a/tests/scripts/setup-pytest-env.sh b/tests/scripts/setup-pytest-env.sh
index 475ce1ce1c53..b77d3f37cd3e 100755
--- a/tests/scripts/setup-pytest-env.sh
+++ b/tests/scripts/setup-pytest-env.sh
@@ -27,4 +27,18 @@ fi
 set -u
 
 export TVM_PATH=`pwd`
-export PYTHONPATH=${TVM_PATH}/python
+export PYTHONPATH="${TVM_PATH}/python"
+
+export TVM_PYTEST_RESULT_DIR="${TVM_PATH}/build/pytest-results"
+mkdir -p "${TVM_PYTEST_RESULT_DIR}"
+
+function run_pytest() {
+    test_suite_name="$1"
+    shift
+    for ffi_type in ${TVM_PYTEST_FFI_TYPES:-ctypes cython}; do
+        TVM_FFI=${ffi_type} python3 -m pytest \
+           -o "junit_suite_name=${test_suite_name}-${ffi_type}" \
+           "--junit-xml=${TVM_PYTEST_RESULT_DIR}/${test_suite_name}-${ffi_type}.xml" \
+           "$@"
+    done
+}
diff --git a/tests/scripts/task_python_arm_compute_library.sh b/tests/scripts/task_python_arm_compute_library.sh
index e36d042676d6..4c1992b58692 100755
--- a/tests/scripts/task_python_arm_compute_library.sh
+++ b/tests/scripts/task_python_arm_compute_library.sh
@@ -22,9 +22,10 @@ source tests/scripts/setup-pytest-env.sh
 
 
 # Rebuild cython
+# TODO(u99127): Enable cython tests.
 
 find . -type f -path "*.pyc" | xargs rm -f
 make cython3
 
-TVM_FFI=ctypes python3 -m pytest tests/python/contrib/test_arm_compute_lib
-
+TVM_PYTEST_FFI_TYPES=ctypes run_pytest python-arm_compute_lib \
+                    tests/python/contrib/test_arm_compute_lib
diff --git a/tests/scripts/task_python_ethosn_tests.sh b/tests/scripts/task_python_ethosn_tests.sh
index 36a3d0919650..472ca38149a1 100755
--- a/tests/scripts/task_python_ethosn_tests.sh
+++ b/tests/scripts/task_python_ethosn_tests.sh
@@ -22,9 +22,9 @@ source tests/scripts/setup-pytest-env.sh
 
 
 # Rebuild cython
+# TODO(u99127): Enable cython tests.
 
 find . -type f -path "*.pyc" | xargs rm -f
 make cython3
 
-TVM_FFI=ctypes python3 -m pytest tests/python/contrib/test_ethosn
-
+TVM_PYTEST_FFI_TYPES=ctypes run_pytest python-ethosn tests/python/contrib/test_ethosn
diff --git a/tests/scripts/task_python_frontend.sh b/tests/scripts/task_python_frontend.sh
index 3c5839bc7e1c..6b1d8e5038fb 100755
--- a/tests/scripts/task_python_frontend.sh
+++ b/tests/scripts/task_python_frontend.sh
@@ -32,22 +32,22 @@ find . -type f -path "*.pyc" | xargs rm -f
 make cython3
 
 echo "Running relay MXNet frontend test..."
-python3 -m pytest tests/python/frontend/mxnet
+TVM_PYTHON_FFI_TYPES=cython run_pytest python-frontend-mxnet tests/python/frontend/mxnet
 
 echo "Running relay ONNX frontend test..."
-python3 -m pytest tests/python/frontend/onnx
+TVM_PYTHON_FFI_TYPES=cython run_pytest python-frontend-onnx tests/python/frontend/onnx
 
 echo "Running relay CoreML frontend test..."
-python3 -m pytest tests/python/frontend/coreml
+TVM_PYTHON_FFI_TYPES=cython run_pytest python-frontend-coreml tests/python/frontend/coreml
 
 echo "Running relay Tensorflow frontend test..."
-python3 -m pytest tests/python/frontend/tensorflow
+TVM_PYTHON_FFI_TYPES=cython run_pytest python-frontend-tensorflow tests/python/frontend/tensorflow
 
 echo "Running relay caffe2 frontend test..."
-python3 -m pytest tests/python/frontend/caffe2
+TVM_PYTHON_FFI_TYPES=cython run_pytest python-frontend-caffe2 tests/python/frontend/caffe2
 
 echo "Running relay DarkNet frontend test..."
-python3 -m pytest tests/python/frontend/darknet
+TVM_PYTHON_FFI_TYPES=cython run_pytest python-frontend-darknet tests/python/frontend/darknet
 
 echo "Running relay PyTorch frontend test..."
-python3 -m pytest tests/python/frontend/pytorch
+TVM_PYTHON_FFI_TYPES=cython run_pytest python-frontend-pytorch tests/python/frontend/pytorch
diff --git a/tests/scripts/task_python_frontend_cpu.sh b/tests/scripts/task_python_frontend_cpu.sh
index 6dfcabc2cd37..a5cd3ba8ef09 100755
--- a/tests/scripts/task_python_frontend_cpu.sh
+++ b/tests/scripts/task_python_frontend_cpu.sh
@@ -33,10 +33,10 @@ find . -type f -path "*.pyc" | xargs rm -f
 make cython3
 
 echo "Running relay TFLite frontend test..."
-python3 -m pytest tests/python/frontend/tflite
+TVM_PYTHON_FFI_TYPES=cython run_pytest python-frontend-tflite tests/python/frontend/tflite
 
 echo "Running relay Keras frontend test..."
-python3 -m pytest tests/python/frontend/keras
+TVM_PYTHON_FFI_TYPES=cython run_pytest python-frontend-keras tests/python/frontend/keras
 
 echo "Running relay Caffe frontend test..."
-python3 -m pytest tests/python/frontend/caffe
+TVM_PYTHON_FFI_TYPES=cython run_pytest python-frontend-caffe tests/python/frontend/caffe
diff --git a/tests/scripts/task_python_integration.sh b/tests/scripts/task_python_integration.sh
index ef86d6917424..dc96097fb115 100755
--- a/tests/scripts/task_python_integration.sh
+++ b/tests/scripts/task_python_integration.sh
@@ -27,6 +27,10 @@ export LD_LIBRARY_PATH="build:${LD_LIBRARY_PATH:-}"
 export TVM_BIND_THREADS=0
 export TVM_NUM_THREADS=2
 
+if [ -z "${TVM_INTEGRATION_TESTSUITE_NAME:-}" ]; then
+    TVM_INTEGRATION_TESTSUITE_NAME=python-integration
+fi
+
 # cleanup pycache
 find . -type f -path "*.pyc" | xargs rm -f
 
@@ -39,29 +43,28 @@ rm -rf lib
 make
 cd ../..
 
-TVM_FFI=cython python3 -m pytest apps/extension/tests
-TVM_FFI=ctypes python3 -m pytest apps/extension/tests
+run_pytest ${TVM_INTEGRATION_TESTSUITE_NAME}-extensions apps/extension/tests
 
 # Test dso plugin
 cd apps/dso_plugin_module
 rm -rf lib
 make
 cd ../..
-TVM_FFI=cython python3 -m pytest apps/dso_plugin_module
-TVM_FFI=ctypes python3 -m pytest apps/dso_plugin_module
+run_pytest ${TVM_INTEGRATION_TESTSUITE_NAME}-dso_plugin_module apps/dso_plugin_module
 
 # Do not enable TensorFlow op
 # TVM_FFI=cython sh prepare_and_test_tfop_module.sh
 # TVM_FFI=ctypes sh prepare_and_test_tfop_module.sh
 
-TVM_FFI=ctypes python3 -m pytest tests/python/integration
-TVM_FFI=ctypes python3 -m pytest tests/python/contrib
+TVM_PYTEST_FFI_TYPES=ctypes run_pytest ${TVM_INTEGRATION_TESTSUITE_NAME} tests/python/integration
+TVM_PYTEST_FFI_TYPES=ctypes run_pytest ${TVM_INTEGRATION_TESTSUITE_NAME}-contrib tests/python/contrib
 
-TVM_TEST_TARGETS="${TVM_RELAY_TEST_TARGETS:-llvm;cuda}" TVM_FFI=ctypes python3 -m pytest tests/python/relay
+TVM_TEST_TARGETS="${TVM_RELAY_TEST_TARGETS:-llvm;cuda}" \
+                TVM_PYTEST_FFI_TYPES=ctypes \
+                run_pytest ${TVM_INTEGRATION_TESTSUITE_NAME}-relay tests/python/relay
 
 # Command line driver test
-TVM_FFI=ctypes python3 -m pytest tests/python/driver
+TVM_PYTEST_FFI_TYPES=ctypes run_pytest ${TVM_INTEGRATION_TESTSUITE_NAME}-driver tests/python/driver
 
 # Do not enable OpenGL
-# TVM_FFI=cython python -m pytest tests/webgl
-# TVM_FFI=ctypes python3 -m pytest tests/webgl
+# run_pytest ${TVM_INTEGRATION_TESTSUITE_NAME}-webgl tests/webgl
diff --git a/tests/scripts/task_python_integration_gpuonly.sh b/tests/scripts/task_python_integration_gpuonly.sh
index c2a9e0c15abe..ac09cb5a14a3 100755
--- a/tests/scripts/task_python_integration_gpuonly.sh
+++ b/tests/scripts/task_python_integration_gpuonly.sh
@@ -19,5 +19,6 @@
 export TVM_TEST_TARGETS="cuda;opencl;metal;rocm;vulkan;nvptx;opencl -device=mali,aocl_sw_emu"
 export PYTEST_ADDOPTS="-m gpu $PYTEST_ADDOPTS"
 export TVM_RELAY_TEST_TARGETS="cuda"
+export TVM_INTEGRATION_TESTSUITE_NAME=python-integration-gpu
 
 ./tests/scripts/task_python_integration.sh
diff --git a/tests/scripts/task_python_microtvm.sh b/tests/scripts/task_python_microtvm.sh
index 7fb8d471a53a..ddedff37c6c2 100755
--- a/tests/scripts/task_python_microtvm.sh
+++ b/tests/scripts/task_python_microtvm.sh
@@ -24,6 +24,5 @@ source tests/scripts/setup-pytest-env.sh
 # cleanup pycache
 find . -type f -path "*.pyc" | xargs rm -f
 
-TVM_FFI=ctypes python3 -m pytest tests/micro/qemu
 make cython3
-TVM_FFI=cython python3 -m pytest tests/micro/qemu
+run_pytest python-microtvm-qemu tests/micro/qemu
diff --git a/tests/scripts/task_python_nightly.sh b/tests/scripts/task_python_nightly.sh
index 36a620541997..bff0650b0bed 100755
--- a/tests/scripts/task_python_nightly.sh
+++ b/tests/scripts/task_python_nightly.sh
@@ -27,4 +27,4 @@ make cython3
 # cleanup pycache
 find . -type f -path "*.pyc" | xargs rm -f
 
-python3 -m pytest tests/python/topi/nightly
+TVM_PYTEST_FFI_TYPES=cython run_pytest python-topi-nightly tests/python/topi/nightly
diff --git a/tests/scripts/task_python_topi.sh b/tests/scripts/task_python_topi.sh
index 3bc3caf825cf..e5eb6f28276a 100755
--- a/tests/scripts/task_python_topi.sh
+++ b/tests/scripts/task_python_topi.sh
@@ -31,4 +31,4 @@ make cython3
 # cleanup pycache
 find . -type f -path "*.pyc" | xargs rm -f
 
-python3 -m pytest tests/python/topi/
+TVM_PYTHON_FFI_TYPES=cython run_pytest python-topi tests/python/topi/
diff --git a/tests/scripts/task_python_unittest.sh b/tests/scripts/task_python_unittest.sh
index 0aaf9fc86664..b63d79a99562 100755
--- a/tests/scripts/task_python_unittest.sh
+++ b/tests/scripts/task_python_unittest.sh
@@ -25,7 +25,9 @@ source tests/scripts/setup-pytest-env.sh
 find . -type f -path "*.pyc" | xargs rm -f
 make cython3
 
-TVM_FFI=ctypes python3 -m pytest tests/python/all-platform-minimal-test
-TVM_FFI=cython python3 -m pytest tests/python/all-platform-minimal-test
-TVM_FFI=ctypes python3 -m pytest tests/python/unittest
-TVM_FFI=cython python3 -m pytest tests/python/unittest
+if [ -z "${TVM_UNITTEST_TESTSUITE_NAME:-}" ]; then
+    TVM_UNITTEST_TESTSUITE_NAME=python-unittest
+fi
+
+run_pytest ${TVM_UNITTEST_TESTSUITE_NAME}-platform-minimal-test tests/python/all-platform-minimal-test
+run_pytest ${TVM_UNITTEST_TESTSUITE_NAME} tests/python/unittest
diff --git a/tests/scripts/task_python_unittest_gpuonly.sh b/tests/scripts/task_python_unittest_gpuonly.sh
index 56722b16a364..22f79bc70ec9 100755
--- a/tests/scripts/task_python_unittest_gpuonly.sh
+++ b/tests/scripts/task_python_unittest_gpuonly.sh
@@ -18,5 +18,6 @@
 
 export TVM_TEST_TARGETS="cuda;opencl;metal;rocm;vulkan;nvptx;opencl -device=mali,aocl_sw_emu"
 export PYTEST_ADDOPTS="-m gpu $PYTEST_ADDOPTS"
+export TVM_UNITTEST_TESTSUITE_NAME=python-unittest-gpu
 
 ./tests/scripts/task_python_unittest.sh
diff --git a/tests/scripts/task_python_vta_fsim.sh b/tests/scripts/task_python_vta_fsim.sh
index 8080bbe756c7..74d14db95d30 100755
--- a/tests/scripts/task_python_vta_fsim.sh
+++ b/tests/scripts/task_python_vta_fsim.sh
@@ -40,8 +40,10 @@ cp ${VTA_HW_PATH}/config/fsim_sample.json ${VTA_HW_PATH}/config/vta_config.json
 
 # Run unit tests in functional/fast simulator
 echo "Running unittest in fsim..."
-python3 -m pytest ${TVM_PATH}/vta/tests/python/unittest
+TVM_PYTEST_FFI_TYPES=cython run_pytest python-vta-fsim-unittest \
+                    ${TVM_PATH}/vta/tests/python/unittest
 
 # Run unit tests in functional/fast simulator
 echo "Running integration test in fsim..."
-python3 -m pytest ${TVM_PATH}/vta/tests/python/integration
+TVM_PYTEST_FFI_TYPES=cython run_pytest python-vta-fsim-integration \
+                    ${TVM_PATH}/vta/tests/python/integration
diff --git a/tests/scripts/task_python_vta_tsim.sh b/tests/scripts/task_python_vta_tsim.sh
index c87d5483b8a5..4a5c9d7da877 100755
--- a/tests/scripts/task_python_vta_tsim.sh
+++ b/tests/scripts/task_python_vta_tsim.sh
@@ -55,11 +55,13 @@ make -C ${VTA_HW_PATH}/hardware/chisel USE_THREADS=0 lib
 
 # Run unit tests in cycle accurate simulator
 echo "Running unittest in tsim..."
-python3 -m pytest ${TVM_PATH}/vta/tests/python/unittest
+TVM_PYTEST_FFI_TYPES=cython run_pytest python-vta-tsim-unittest \
+                    ${TVM_PATH}/vta/tests/python/unittest
 
 # Run unit tests in cycle accurate simulator
 echo "Running integration test in tsim..."
-python3 -m pytest ${TVM_PATH}/vta/tests/python/integration
+TVM_PYTEST_FFI_TYPES=cython run_pytest python-vta-tsim-integration \
+                    ${TVM_PATH}/vta/tests/python/integration
 
 # Reset default fsim simulation
 cp ${VTA_HW_PATH}/config/fsim_sample.json ${VTA_HW_PATH}/config/vta_config.json

From 79b6ef753702a04e57aa1b2669d70d97db3094ca Mon Sep 17 00:00:00 2001
From: Tristan Konolige <tristan.konolige@gmail.com>
Date: Mon, 8 Feb 2021 16:49:42 -0800
Subject: [PATCH 169/357] [FIX,CMAKE] Only compile runtime files once (#7417)

* [FIX,CMAKE] Only compile runtime files once

* copy defines to tvm_runtime_objs
---
 CMakeLists.txt | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 98dd7dec8bed..92630faf07d5 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -371,10 +371,10 @@ endif()
 
 add_lib_info(${CMAKE_CURRENT_LIST_DIR}/src/support/libinfo.cc)
 
-add_library(tvm_objs OBJECT ${COMPILER_SRCS} ${RUNTIME_SRCS})
+add_library(tvm_objs OBJECT ${COMPILER_SRCS})
 add_library(tvm_runtime_objs OBJECT ${RUNTIME_SRCS})
 
-add_library(tvm SHARED $<TARGET_OBJECTS:tvm_objs>)
+add_library(tvm SHARED $<TARGET_OBJECTS:tvm_objs> $<TARGET_OBJECTS:tvm_runtime_objs>)
 set_property(TARGET tvm APPEND PROPERTY LINK_OPTIONS "${TVM_VISIBILITY_FLAG}")
 add_library(tvm_runtime SHARED $<TARGET_OBJECTS:tvm_runtime_objs>)
 set_property(TARGET tvm_runtime APPEND PROPERTY LINK_OPTIONS "${TVM_VISIBILITY_FLAG}")
@@ -394,16 +394,21 @@ if(USE_RELAY_DEBUG)
   message(STATUS "Building Relay in debug mode...")
   target_compile_definitions(tvm_objs PRIVATE "USE_RELAY_DEBUG")
   target_compile_definitions(tvm_objs PRIVATE "DMLC_LOG_DEBUG")
+  target_compile_definitions(tvm_runtime_objs PRIVATE "USE_RELAY_DEBUG")
+  target_compile_definitions(tvm_runtime_objs PRIVATE "DMLC_LOG_DEBUG")
 else()
   target_compile_definitions(tvm_objs PRIVATE "NDEBUG")
+  target_compile_definitions(tvm_runtime_objs PRIVATE "NDEBUG")
 endif(USE_RELAY_DEBUG)
 
 if(USE_FALLBACK_STL_MAP)
   message(STATUS "Building with STL Map...")
   target_compile_definitions(tvm_objs PRIVATE "USE_FALLBACK_STL_MAP=1")
+  target_compile_definitions(tvm_runtime_objs PRIVATE "USE_FALLBACK_STL_MAP=1")
 else()
   message(STATUS "Building with TVM Map...")
   target_compile_definitions(tvm_objs PRIVATE "USE_FALLBACK_STL_MAP=0")
+  target_compile_definitions(tvm_runtime_objs PRIVATE "USE_FALLBACK_STL_MAP=0")
 endif(USE_FALLBACK_STL_MAP)
 
 if(BUILD_FOR_HEXAGON)
@@ -447,7 +452,7 @@ target_include_directorieS(
 
 set(TVM_TEST_LIBRARY_NAME tvm)
 if (HIDE_PRIVATE_SYMBOLS AND NOT ${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
-  add_library(tvm_allvisible SHARED $<TARGET_OBJECTS:tvm_objs>)
+  add_library(tvm_allvisible SHARED $<TARGET_OBJECTS:tvm_objs> $<TARGET_OBJECTS:tvm_runtime_objs>)
   target_include_directories(tvm_allvisible PUBLIC "$<TARGET_PROPERTY:tvm,INCLUDE_DIRECTORIES>")
   target_link_libraries(tvm_allvisible PRIVATE "$<TARGET_PROPERTY:tvm,LINK_LIBRARIES>")
   set(TVM_TEST_LIBRARY_NAME tvm_allvisible)

From 2b8d113d7e45ad18c21d071f94e7aaab803f04e3 Mon Sep 17 00:00:00 2001
From: CircleSpin <2keepconnected@gmail.com>
Date: Tue, 9 Feb 2021 13:18:48 -0500
Subject: [PATCH 170/357] [TVMC] Allow manual shape specification in tvmc
 (#7366)

* add ability to optionally overide tvm shapes

* add help documentation for --shapes

* improve documentation

* reformat test_compiler using black

* Incorporate feedback from ekalda for better pytorch support and testing.

* address feedback

* switch input shape syntax to be more pythonic

* add commentary

* reformat common.py

* fix lint issue

* format common.py with black

* torch/pytorch test hiccup

* add -s to setup-pytest-env.sh for clearer error msgs

Co-authored-by: Jocelyn <jocelyn@pop-os.localdomain>
---
 python/tvm/driver/tvmc/autotuner.py        |  9 ++++-
 python/tvm/driver/tvmc/common.py           | 39 ++++++++++++++++++
 python/tvm/driver/tvmc/compiler.py         | 16 +++++++-
 python/tvm/driver/tvmc/frontends.py        | 47 +++++++++++++---------
 tests/python/driver/tvmc/conftest.py       | 17 ++++++++
 tests/python/driver/tvmc/test_common.py    | 33 +++++++++++++++
 tests/python/driver/tvmc/test_compiler.py  | 31 ++++++++++----
 tests/python/driver/tvmc/test_frontends.py | 20 ++++++++-
 tests/scripts/setup-pytest-env.sh          |  4 +-
 9 files changed, 184 insertions(+), 32 deletions(-)

diff --git a/python/tvm/driver/tvmc/autotuner.py b/python/tvm/driver/tvmc/autotuner.py
index 71ccc8546e8b..fe5bebcabcbc 100644
--- a/python/tvm/driver/tvmc/autotuner.py
+++ b/python/tvm/driver/tvmc/autotuner.py
@@ -210,6 +210,13 @@ def add_tune_parser(subparsers):
     #     can be improved in future to add integration with a modelzoo
     #     or URL, for example.
     parser.add_argument("FILE", help="path to the input model file")
+    parser.add_argument(
+        "--input-shapes",
+        help="specify non-generic shapes for model to run, format is "
+        '"input_name:[dim1,dim2,...,dimn] input_name2:[dim1,dim2]"',
+        type=common.parse_shape_string,
+        default=None,
+    )
 
 
 def drive_tune(args):
@@ -235,7 +242,7 @@ def drive_tune(args):
             )
 
     target = common.target_from_cli(args.target)
-    mod, params = frontends.load_model(args.FILE, args.model_format)
+    mod, params = frontends.load_model(args.FILE, args.model_format, shape_dict=args.input_shapes)
 
     # min_repeat_ms should be:
     # a. the value provided by the user, if any, or
diff --git a/python/tvm/driver/tvmc/common.py b/python/tvm/driver/tvmc/common.py
index 9db22f3f3390..1845915bcbd1 100644
--- a/python/tvm/driver/tvmc/common.py
+++ b/python/tvm/driver/tvmc/common.py
@@ -17,8 +17,10 @@
 """
 Common utility functions shared by TVMC modules.
 """
+import re
 import logging
 import os.path
+import argparse
 
 from urllib.parse import urlparse
 
@@ -136,3 +138,40 @@ def tracker_host_port_from_cli(rpc_tracker_str):
         logger.info("RPC tracker port: %s", rpc_port)
 
     return rpc_hostname, rpc_port
+
+
+def parse_shape_string(inputs_string):
+    """Parse an input shape dictionary string to a usable dictionary.
+
+    Parameters
+    ----------
+    inputs_string: str
+        A string of the form "input_name:[dim1,dim2,...,dimn] input_name2:[dim1,dim2]" that
+        indicates the desired shape for specific model inputs.
+
+    Returns
+    -------
+    shape_dict: dict
+        A dictionary mapping input names to their shape for use in relay frontend converters.
+    """
+
+    # Create a regex pattern that extracts each separate input mapping.
+    pattern = r"\w+\:\s*\[\-?\d+(?:\,\s*\-?\d+)*\]"
+    input_mappings = re.findall(pattern, inputs_string)
+    if not input_mappings:
+        raise argparse.ArgumentTypeError(
+            "--input-shapes argument must be of the form "
+            '"input_name:[dim1,dim2,...,dimn] input_name2:[dim1,dim2]"'
+        )
+    shape_dict = {}
+    for mapping in input_mappings:
+        # Remove whitespace.
+        mapping = mapping.replace(" ", "")
+        # Split mapping into name and shape.
+        name, shape_string = mapping.split(":")
+        # Convert shape string into a list of integers or Anys if negative.
+        shape = [int(x) if int(x) > 0 else relay.Any() for x in shape_string.strip("][").split(",")]
+        # Add parsed mapping to shape dictionary.
+        shape_dict[name] = shape
+
+    return shape_dict
diff --git a/python/tvm/driver/tvmc/compiler.py b/python/tvm/driver/tvmc/compiler.py
index 90b0aceaa17a..282ae6a76b56 100644
--- a/python/tvm/driver/tvmc/compiler.py
+++ b/python/tvm/driver/tvmc/compiler.py
@@ -87,6 +87,13 @@ def add_compile_parser(subparsers):
     #     can be improved in future to add integration with a modelzoo
     #     or URL, for example.
     parser.add_argument("FILE", help="path to the input model file")
+    parser.add_argument(
+        "--input-shapes",
+        help="specify non-generic shapes for model to run, format is "
+        '"input_name:[dim1,dim2,...,dimn] input_name2:[dim1,dim2]"',
+        type=common.parse_shape_string,
+        default=None,
+    )
 
 
 def drive_compile(args):
@@ -98,7 +105,7 @@ def drive_compile(args):
         Arguments from command line parser.
 
     Returns
-    --------
+    -------
     int
         Zero if successfully completed
 
@@ -112,6 +119,7 @@ def drive_compile(args):
         args.model_format,
         args.tuning_records,
         args.desired_layout,
+        args.input_shapes,
     )
 
     if dumps:
@@ -129,6 +137,7 @@ def compile_model(
     model_format=None,
     tuning_records=None,
     alter_layout=None,
+    shape_dict=None,
 ):
     """Compile a model from a supported framework into a TVM module.
 
@@ -158,6 +167,9 @@ def compile_model(
         The layout to convert the graph to. Note, the convert layout
         pass doesn't currently guarantee the whole of the graph will
         be converted to the chosen layout.
+    shape_dict: dict, optional
+        A mapping from input names to their shape. When present,
+        the default shapes in the model will be overwritten.
 
     Returns
     -------
@@ -172,7 +184,7 @@ def compile_model(
 
     """
     dump_code = [x.strip() for x in dump_code.split(",")] if dump_code else None
-    mod, params = frontends.load_model(path, model_format)
+    mod, params = frontends.load_model(path, model_format, shape_dict)
 
     if alter_layout:
         mod = common.convert_graph_layout(mod, alter_layout)
diff --git a/python/tvm/driver/tvmc/frontends.py b/python/tvm/driver/tvmc/frontends.py
index bb54b82cceca..53fbed66c8fc 100644
--- a/python/tvm/driver/tvmc/frontends.py
+++ b/python/tvm/driver/tvmc/frontends.py
@@ -54,13 +54,15 @@ def suffixes():
         """File suffixes (extensions) used by this frontend"""
 
     @abstractmethod
-    def load(self, path):
+    def load(self, path, shape_dict=None):
         """Load a model from a given path.
 
         Parameters
         ----------
         path: str
             Path to a file
+        shape_dict: dict, optional
+            Mapping from input names to their shapes.
 
         Returns
         -------
@@ -99,7 +101,7 @@ def name():
     def suffixes():
         return ["h5"]
 
-    def load(self, path):
+    def load(self, path, shape_dict=None):
         # pylint: disable=C0103
         tf, keras = import_keras()
 
@@ -125,8 +127,10 @@ def load(self, path):
                 )
 
         inputs = [np.random.uniform(size=shape, low=-1.0, high=1.0) for shape in in_shapes]
-        shape_dict = {name: x.shape for (name, x) in zip(model.input_names, inputs)}
-        return relay.frontend.from_keras(model, shape_dict, layout="NHWC")
+        input_shapes = {name: x.shape for (name, x) in zip(model.input_names, inputs)}
+        if shape_dict is not None:
+            input_shapes.update(shape_dict)
+        return relay.frontend.from_keras(model, input_shapes, layout="NHWC")
 
     def is_sequential_p(self, model):
         _, keras = import_keras()
@@ -154,14 +158,14 @@ def name():
     def suffixes():
         return ["onnx"]
 
-    def load(self, path):
+    def load(self, path, shape_dict=None):
         # pylint: disable=C0415
         import onnx
 
         # pylint: disable=E1101
         model = onnx.load(path)
 
-        return relay.frontend.from_onnx(model)
+        return relay.frontend.from_onnx(model, shape=shape_dict)
 
 
 class TensorflowFrontend(Frontend):
@@ -175,7 +179,7 @@ def name():
     def suffixes():
         return ["pb"]
 
-    def load(self, path):
+    def load(self, path, shape_dict=None):
         # pylint: disable=C0415
         import tensorflow as tf
         import tvm.relay.testing.tf as tf_testing
@@ -188,7 +192,7 @@ def load(self, path):
         graph_def = tf_testing.ProcessGraphDefParam(graph_def)
 
         logger.debug("parse TensorFlow model and convert into Relay computation graph")
-        return relay.frontend.from_tensorflow(graph_def)
+        return relay.frontend.from_tensorflow(graph_def, shape=shape_dict)
 
 
 class TFLiteFrontend(Frontend):
@@ -215,7 +219,7 @@ def name():
     def suffixes():
         return ["tflite"]
 
-    def load(self, path):
+    def load(self, path, shape_dict=None):
         # pylint: disable=C0415
         import tflite.Model as model
 
@@ -238,11 +242,13 @@ def load(self, path):
             raise TVMCException("input file not tflite version 3")
 
         logger.debug("tflite_input_type")
-        shape_dict, dtype_dict = TFLiteFrontend._input_type(tflite_model)
+        input_shapes, dtype_dict = TFLiteFrontend._input_type(tflite_model)
+        if shape_dict is not None:
+            input_shapes.update(shape_dict)
 
         logger.debug("parse TFLite model and convert into Relay computation graph")
         mod, params = relay.frontend.from_tflite(
-            tflite_model, shape_dict=shape_dict, dtype_dict=dtype_dict
+            tflite_model, shape_dict=input_shapes, dtype_dict=dtype_dict
         )
         return mod, params
 
@@ -285,17 +291,18 @@ def suffixes():
         # Torch Script is a zip file, but can be named pth
         return ["pth", "zip"]
 
-    def load(self, path):
+    def load(self, path, shape_dict=None):
         # pylint: disable=C0415
         import torch
 
-        traced_model = torch.jit.load(path)
-
-        inputs = list(traced_model.graph.inputs())[1:]
-        input_shapes = [inp.type().sizes() for inp in inputs]
+        if shape_dict is None:
+            raise TVMCException("--input-shapes must be specified for %s" % self.name())
 
+        traced_model = torch.jit.load(path)
         traced_model.eval()  # Switch to inference mode
-        input_shapes = [("input{}".format(idx), shape) for idx, shape in enumerate(shapes)]
+
+        # Convert shape dictionary to list for Pytorch frontend compatibility
+        input_shapes = list(shape_dict.items())
 
         logger.debug("parse Torch model and convert into Relay computation graph")
         return relay.frontend.from_pytorch(traced_model, input_shapes)
@@ -378,7 +385,7 @@ def guess_frontend(path):
     raise TVMCException("failed to infer the model format. Please specify --model-format")
 
 
-def load_model(path, model_format=None):
+def load_model(path, model_format=None, shape_dict=None):
     """Load a model from a supported framework and convert it
     into an equivalent relay representation.
 
@@ -389,6 +396,8 @@ def load_model(path, model_format=None):
     model_format : str, optional
         The underlying framework used to create the model.
         If not specified, this will be inferred from the file type.
+    shape_dict : dict, optional
+        Mapping from input names to their shapes.
 
     Returns
     -------
@@ -404,6 +413,6 @@ def load_model(path, model_format=None):
     else:
         frontend = guess_frontend(path)
 
-    mod, params = frontend.load(path)
+    mod, params = frontend.load(path, shape_dict)
 
     return mod, params
diff --git a/tests/python/driver/tvmc/conftest.py b/tests/python/driver/tvmc/conftest.py
index 882d793ccebd..534953deecbc 100644
--- a/tests/python/driver/tvmc/conftest.py
+++ b/tests/python/driver/tvmc/conftest.py
@@ -99,6 +99,23 @@ def keras_resnet50(tmpdir_factory):
     return model_file_name
 
 
+@pytest.fixture(scope="session")
+def pytorch_resnet18(tmpdir_factory):
+    try:
+        import torch
+        import torchvision.models as models
+    except ImportError:
+        # Not all environments provide Pytorch, so skip if that's the case.
+        return ""
+    model = models.resnet18()
+    model_file_name = "{}/{}".format(tmpdir_factory.mktemp("data"), "resnet18.pth")
+    # Trace model into torchscript.
+    traced_cpu = torch.jit.trace(model, torch.randn(1, 3, 224, 224))
+    torch.jit.save(traced_cpu, model_file_name)
+
+    return model_file_name
+
+
 @pytest.fixture(scope="session")
 def onnx_resnet50():
     base_url = "https://github.com/onnx/models/raw/master/vision/classification/resnet/model"
diff --git a/tests/python/driver/tvmc/test_common.py b/tests/python/driver/tvmc/test_common.py
index 5ffbc6fe37dd..f30949b54497 100644
--- a/tests/python/driver/tvmc/test_common.py
+++ b/tests/python/driver/tvmc/test_common.py
@@ -21,6 +21,7 @@
 import pytest
 
 import tvm
+from tvm import relay
 from tvm.driver import tvmc
 
 
@@ -149,3 +150,35 @@ def test_tracker_host_port_from_cli__only_hostname__default_port_is_9090():
 
     assert expected_host == actual_host
     assert expected_port == actual_port
+
+
+def test_shape_parser():
+    # Check that a valid input is parsed correctly
+    shape_string = "input:[10,10,10]"
+    shape_dict = tvmc.common.parse_shape_string(shape_string)
+    assert shape_dict == {"input": [10, 10, 10]}
+    # Check that multiple valid input shapes are parse correctly
+    shape_string = "input:[10,10,10] input2:[20,20,20,20]"
+    shape_dict = tvmc.common.parse_shape_string(shape_string)
+    assert shape_dict == {"input": [10, 10, 10], "input2": [20, 20, 20, 20]}
+    # Check that alternate syntax parses correctly
+    shape_string = "input: [10, 10, 10] input2: [20, 20, 20, 20]"
+    shape_dict = tvmc.common.parse_shape_string(shape_string)
+    assert shape_dict == {"input": [10, 10, 10], "input2": [20, 20, 20, 20]}
+    shape_string = "input:[10,10,10],input2:[20,20,20,20]"
+    shape_dict = tvmc.common.parse_shape_string(shape_string)
+    assert shape_dict == {"input": [10, 10, 10], "input2": [20, 20, 20, 20]}
+    # Check that negative dimensions parse to Any correctly.
+    shape_string = "input:[-1,3,224,224]"
+    shape_dict = tvmc.common.parse_shape_string(shape_string)
+    # Convert to strings to allow comparison with Any.
+    assert str(shape_dict) == "{'input': [?, 3, 224, 224]}"
+
+    # Check that invalid pattern raises expected error.
+    shape_string = "input:[a,10]"
+    with pytest.raises(argparse.ArgumentTypeError):
+        tvmc.common.parse_shape_string(shape_string)
+    # Check that input with invalid separators raises error.
+    shape_string = "input:5,10 input2:10,10"
+    with pytest.raises(argparse.ArgumentTypeError):
+        tvmc.common.parse_shape_string(shape_string)
diff --git a/tests/python/driver/tvmc/test_compiler.py b/tests/python/driver/tvmc/test_compiler.py
index 4bbb6fbf2cf8..4cb342c2e967 100644
--- a/tests/python/driver/tvmc/test_compiler.py
+++ b/tests/python/driver/tvmc/test_compiler.py
@@ -39,14 +39,11 @@ def test_save_dumps(tmpdir_factory):
 # End to end tests for compilation
 
 
-def test_compile_tflite_module(tflite_mobilenet_v1_1_quant):
+def verify_compile_tflite_module(model, shape_dict=None):
     pytest.importorskip("tflite")
 
     graph, lib, params, dumps = tvmc.compiler.compile_model(
-        tflite_mobilenet_v1_1_quant,
-        target="llvm",
-        dump_code="ll",
-        alter_layout="NCHW",
+        model, target="llvm", dump_code="ll", alter_layout="NCHW", shape_dict=shape_dict
     )
 
     # check for output types
@@ -56,6 +53,17 @@ def test_compile_tflite_module(tflite_mobilenet_v1_1_quant):
     assert type(dumps) is dict
 
 
+def test_compile_tflite_module(tflite_mobilenet_v1_1_quant):
+    # some CI environments wont offer tflite, so skip in case it is not present
+    pytest.importorskip("tflite")
+    # Check default compilation.
+    verify_compile_tflite_module(tflite_mobilenet_v1_1_quant)
+    # Check with manual shape override
+    shape_string = "input:[1,224,224,3]"
+    shape_dict = tvmc.common.parse_shape_string(shape_string)
+    verify_compile_tflite_module(tflite_mobilenet_v1_1_quant, shape_dict)
+
+
 # This test will be skipped if the AArch64 cross-compilation toolchain is not installed.
 @pytest.mark.skipif(
     not shutil.which("aarch64-linux-gnu-gcc"), reason="cross-compilation toolchain not installed"
@@ -114,12 +122,12 @@ def test_cross_compile_aarch64_keras_module(keras_resnet50):
     assert "asm" in dumps.keys()
 
 
-def test_compile_onnx_module(onnx_resnet50):
+def verify_compile_onnx_module(model, shape_dict=None):
     # some CI environments wont offer onnx, so skip in case it is not present
     pytest.importorskip("onnx")
 
     graph, lib, params, dumps = tvmc.compiler.compile_model(
-        onnx_resnet50, target="llvm", dump_code="ll"
+        model, target="llvm", dump_code="ll", shape_dict=shape_dict
     )
 
     # check for output types
@@ -130,6 +138,15 @@ def test_compile_onnx_module(onnx_resnet50):
     assert "ll" in dumps.keys()
 
 
+def test_compile_onnx_module(onnx_resnet50):
+    # Test default compilation
+    verify_compile_onnx_module(onnx_resnet50)
+    # Test with manual shape dict
+    shape_string = "data:[1,3,200,200]"
+    shape_dict = tvmc.common.parse_shape_string(shape_string)
+    verify_compile_onnx_module(onnx_resnet50, shape_dict)
+
+
 # This test will be skipped if the AArch64 cross-compilation toolchain is not installed.
 @pytest.mark.skipif(
     not shutil.which("aarch64-linux-gnu-gcc"), reason="cross-compilation toolchain not installed"
diff --git a/tests/python/driver/tvmc/test_frontends.py b/tests/python/driver/tvmc/test_frontends.py
index d77a17addabf..04c85b1eb8f3 100644
--- a/tests/python/driver/tvmc/test_frontends.py
+++ b/tests/python/driver/tvmc/test_frontends.py
@@ -174,9 +174,27 @@ def test_load_model___wrong_language__to_onnx(tflite_mobilenet_v1_1_quant):
         tvmc.frontends.load_model(tflite_mobilenet_v1_1_quant, model_format="onnx")
 
 
+def test_load_model__pth(pytorch_resnet18):
+    # some CI environments wont offer torch, so skip in case it is not present
+    pytest.importorskip("torch")
+    pytest.importorskip("torchvision")
+
+    mod, params = tvmc.frontends.load_model(
+        pytorch_resnet18, shape_dict={"input": [1, 3, 224, 224]}
+    )
+    assert type(mod) is IRModule
+    assert type(params) is dict
+    # check whether one known value is part of the params dict
+    assert "layer1.0.conv1.weight" in params.keys()
+
+
 def test_load_model___wrong_language__to_pytorch(tflite_mobilenet_v1_1_quant):
     # some CI environments wont offer pytorch, so skip in case it is not present
     pytest.importorskip("torch")
 
     with pytest.raises(RuntimeError) as e:
-        tvmc.frontends.load_model(tflite_mobilenet_v1_1_quant, model_format="pytorch")
+        tvmc.frontends.load_model(
+            tflite_mobilenet_v1_1_quant,
+            model_format="pytorch",
+            shape_dict={"input": [1, 3, 224, 224]},
+        )
diff --git a/tests/scripts/setup-pytest-env.sh b/tests/scripts/setup-pytest-env.sh
index b77d3f37cd3e..5f108e9355fc 100755
--- a/tests/scripts/setup-pytest-env.sh
+++ b/tests/scripts/setup-pytest-env.sh
@@ -20,9 +20,9 @@
 set +u
 
 if [[ ! -z $CI_PYTEST_ADD_OPTIONS ]]; then
-    export PYTEST_ADDOPTS="-v $CI_PYTEST_ADD_OPTIONS $PYTEST_ADDOPTS"
+    export PYTEST_ADDOPTS="-s -v $CI_PYTEST_ADD_OPTIONS $PYTEST_ADDOPTS"
 else
-    export PYTEST_ADDOPTS="-v $PYTEST_ADDOPTS"
+    export PYTEST_ADDOPTS="-s -v $PYTEST_ADDOPTS"
 fi
 set -u
 

From 0716c2a416c1000037922f6cf30c112e4145b494 Mon Sep 17 00:00:00 2001
From: Cody Yu <comaniac0422@gmail.com>
Date: Tue, 9 Feb 2021 10:57:31 -0800
Subject: [PATCH 171/357] [AutoScheduler] Add sampling to dispatcher (#7376)

* [AutoScheduler] Add sampling to dispatcher

* address comment

* make measurment configurable
---
 python/tvm/auto_scheduler/__init__.py         |  2 +-
 python/tvm/auto_scheduler/dispatcher.py       | 93 ++++++++++++++++++-
 .../relay/test_auto_scheduler_tuning.py       | 17 +++-
 3 files changed, 106 insertions(+), 6 deletions(-)

diff --git a/python/tvm/auto_scheduler/__init__.py b/python/tvm/auto_scheduler/__init__.py
index 57e58309525c..06ca44d997e5 100644
--- a/python/tvm/auto_scheduler/__init__.py
+++ b/python/tvm/auto_scheduler/__init__.py
@@ -33,7 +33,7 @@
 # Shortcut
 from .compute_dag import ComputeDAG, LayoutRewriteOption, get_shape_from_rewritten_layout
 from .cost_model import RandomModel, XGBModel
-from .dispatcher import DispatchContext, ApplyHistoryBest
+from .dispatcher import DispatchContext, ApplyHistoryBest, ApplyHistoryBestOrSample
 from .measure import (
     MeasureInput,
     MeasureResult,
diff --git a/python/tvm/auto_scheduler/dispatcher.py b/python/tvm/auto_scheduler/dispatcher.py
index f2d7536bea88..6a25960fe7b7 100644
--- a/python/tvm/auto_scheduler/dispatcher.py
+++ b/python/tvm/auto_scheduler/dispatcher.py
@@ -28,8 +28,13 @@
 
 import numpy as np
 
+from tvm.contrib.utils import tempdir
 from tvm.tir.expr import FloatImm
-from .measure_record import load_records
+from .cost_model import RandomModel, XGBModel
+from .measure import LocalRPCMeasureContext
+from .measure_record import RecordToFile, load_records
+from .search_policy import PreloadMeasuredStates, SketchPolicy
+from .search_task import SearchTask, TuningOptions
 from .utils import calc_workload_dis_factor, decode_workload_key
 
 logger = logging.getLogger("auto_scheduler")
@@ -301,6 +306,92 @@ def update(self, target, workload_key, state):
             entry[workload_args] = (state, 1)
 
 
+class ApplyHistoryBestOrSample(ApplyHistoryBest):
+    """
+    Apply the history best config, or sample a valid schedule if no config is found.
+
+    Parameters
+    ----------
+    records : str or iterator of (auto_scheduler.measure.MeasureInput,\
+                                  auto_scheduler.measure.MeasureResult)
+        Collection of tuning records.
+        If is str, then it should be the filename of a records log file.
+        Each row of this file is an encoded record pair. Otherwise, it is an iterator.
+    sample_simple_workloads: bool
+        When False, sampling will not apply to simple workloads (w/o reduction).
+    cost_model_file: str
+        The filename of the pre-trained XGBoost cost model. If not present, then random
+        model will be used.
+    num_measure: int
+        Meausre the top-N rank of sampled schedules on the device. The default -1 means
+        no measurement and simply return the top-1 schedule ranked by the cost model.
+    """
+
+    def __init__(
+        self, records, sample_simple_workloads=False, cost_model_file=None, num_measure=-1
+    ):
+        self.sample_simple_workloads = sample_simple_workloads
+        self.num_measure = num_measure
+        self.log_dir = tempdir()
+        if cost_model_file is None:
+            self.cost_model = RandomModel()
+        else:
+            self.cost_model = XGBModel()
+            self.cost_model.load(cost_model_file)
+
+        super(ApplyHistoryBestOrSample, self).__init__(
+            records, n_lines=None, include_compatible=True
+        )
+
+    def query(self, target, workload_key, has_complex_op, dag):
+        if has_complex_op or self.sample_simple_workloads:
+            ret = self._query_inside(target, workload_key)
+        else:
+            ret = super(ApplyHistoryBestOrSample, self)._query_inside(target, workload_key)
+
+        if ret is None:
+            ret = self._old_ctx.query(target, workload_key, has_complex_op, dag)
+        return ret
+
+    def _query_inside(self, target, workload_key):
+        ret = super(ApplyHistoryBestOrSample, self)._query_inside(target, workload_key)
+        if ret is not None:
+            return ret
+
+        # Sampling valid schedules when no existing records can be used.
+        task = SearchTask(workload_key=workload_key, target=target)
+        measure_ctx = LocalRPCMeasureContext(min_repeat_ms=300)
+
+        log_file = self.log_dir.relpath("%s.log" % decode_workload_key(workload_key)[0])
+
+        while ret is None:
+            tune_option = TuningOptions(
+                num_measure_trials=self.num_measure,
+                runner=measure_ctx.runner,
+                measure_callbacks=[RecordToFile(log_file)],
+                verbose=0,
+            )
+            search_policy = SketchPolicy(
+                task,
+                self.cost_model,
+                params={
+                    "eps_greedy": 0.01,
+                    "sample_init_min_population": 64,
+                    "evolutionary_search_num_iters": 0,
+                },
+                init_search_callbacks=[PreloadMeasuredStates(log_file)],
+                verbose=0,
+            )
+            task.tune(tune_option, search_policy)
+
+            # Load the sampled records and query again.
+            self.load(log_file)
+            ret = super(ApplyHistoryBestOrSample, self)._query_inside(target, workload_key)
+
+        del measure_ctx
+        return ret
+
+
 class FallbackContext(DispatchContext):
     """
     A fallback dispatch context.
diff --git a/tests/python/relay/test_auto_scheduler_tuning.py b/tests/python/relay/test_auto_scheduler_tuning.py
index 4ae434d72a20..1ec0e305311a 100644
--- a/tests/python/relay/test_auto_scheduler_tuning.py
+++ b/tests/python/relay/test_auto_scheduler_tuning.py
@@ -56,9 +56,16 @@ def tune_network(network, target):
             ):
                 lib = relay.build(mod, target=target, params=params)
 
+        # Sample a schedule when missing
+        with auto_scheduler.ApplyHistoryBestOrSample(None, num_measure=2):
+            with tvm.transform.PassContext(
+                opt_level=3, config={"relay.backend.use_auto_scheduler": True}
+            ):
+                lib2 = relay.build(mod, target=target, params=params)
+
         # Compile without auto-scheduler and any other optimization for correctness check
         with tvm.transform.PassContext(opt_level=0):
-            lib2 = relay.build(mod, target=target, params=params)
+            ref_lib = relay.build(mod, target=target, params=params)
 
         # Check the correctness
         def get_output(data, lib):
@@ -76,10 +83,12 @@ def get_output(data, lib):
         else:
             raise ValueError("Unknown network: " + network)
 
-        actual_output = get_output(data, lib)
-        expected_output = get_output(data, lib2)
+        actual_output1 = get_output(data, lib)
+        actual_output2 = get_output(data, lib2)
+        expected_output = get_output(data, ref_lib)
 
-        tvm.testing.assert_allclose(actual_output, expected_output, rtol=1e-4, atol=1e-4)
+        tvm.testing.assert_allclose(actual_output1, expected_output, rtol=1e-4, atol=1e-4)
+        tvm.testing.assert_allclose(actual_output2, expected_output, rtol=1e-4, atol=1e-4)
 
 
 @tvm.testing.requires_cuda

From 2999d03284c74f6840503ae3b880d3579a76f1af Mon Sep 17 00:00:00 2001
From: Egor Churaev <egor.churaev@gmail.com>
Date: Wed, 10 Feb 2021 00:08:58 +0300
Subject: [PATCH 172/357] [ONNX] Add CumSum operator to ONNX frontend (#7391)

* [ONNX] Add CumSum operator to ONNX frontend

* Fix lint and add attributes to CumSum

* Fix CumSum test

* Add support exclusive attribute

* Add support reverse attribute

* Fix clang-format

* Fix lint

* Move reverse calculation to ONNX frontend and add exclusive to GPU

* Add test for int type
---
 include/tvm/relay/attrs/transform.h        |  4 ++
 python/tvm/relay/frontend/onnx.py          | 25 ++++++-
 python/tvm/relay/op/_transform.py          |  2 +-
 python/tvm/relay/op/strategy/generic.py    |  2 +-
 python/tvm/relay/op/transform.py           | 10 ++-
 python/tvm/topi/cuda/scan.py               | 10 ++-
 python/tvm/topi/cumsum.py                  | 21 +++++-
 src/relay/op/tensor/transform.cc           |  3 +-
 tests/python/frontend/onnx/test_forward.py | 77 ++++++++++++++++++++++
 9 files changed, 144 insertions(+), 10 deletions(-)

diff --git a/include/tvm/relay/attrs/transform.h b/include/tvm/relay/attrs/transform.h
index 43166249638a..45a1caf2bd79 100644
--- a/include/tvm/relay/attrs/transform.h
+++ b/include/tvm/relay/attrs/transform.h
@@ -442,9 +442,13 @@ struct MatrixSetDiagAttrs : public tvm::AttrsNode<MatrixSetDiagAttrs> {
 struct CumsumAttrs : public tvm::AttrsNode<CumsumAttrs> {
   Integer axis;
   DataType dtype;
+  Integer exclusive;
   TVM_DECLARE_ATTRS(CumsumAttrs, "relay.attrs.CumsumAttrs") {
     TVM_ATTR_FIELD(axis).describe("The axis to sum over").set_default(NullValue<Integer>());
     TVM_ATTR_FIELD(dtype).describe("Output data type").set_default(NullValue<DataType>());
+    TVM_ATTR_FIELD(exclusive)
+        .describe("The first element is not included")
+        .set_default(NullValue<Integer>());
   }
 };
 
diff --git a/python/tvm/relay/frontend/onnx.py b/python/tvm/relay/frontend/onnx.py
index c423598a2ee7..c9140d782a2d 100644
--- a/python/tvm/relay/frontend/onnx.py
+++ b/python/tvm/relay/frontend/onnx.py
@@ -34,7 +34,7 @@
 from .. import ty as _ty
 
 from .common import AttrCvt, Renamer
-from .common import get_relay_op, new_var, infer_shape, infer_channels
+from .common import get_relay_op, new_var, infer_shape, infer_channels, infer_value
 from .common import infer_type, get_name
 
 
@@ -1075,6 +1075,28 @@ def _impl_v1(cls, inputs, attr, params):
         return _op.shape_of(inputs[0], "int64")
 
 
+class CumSum(OnnxOpConverter):
+    """Operator converter for CumSum."""
+
+    @classmethod
+    def _impl_v1(cls, inputs, attr, params):
+        data = inputs[0]
+        dim = inputs[1]
+
+        if dim is not None:
+            dim = int(infer_value(dim, params).asnumpy())
+
+        exclusive = attr.get("exclusive", 0)
+        reverse = attr.get("reverse", 0)
+
+        if reverse != 0:
+            out = _op.reverse(data, axis=dim)
+            out = _op.cumsum(out, axis=dim, exclusive=exclusive)
+            return _op.reverse(out, axis=dim)
+
+        return _op.cumsum(data, axis=dim, exclusive=exclusive)
+
+
 class Cast(OnnxOpConverter):
     """Operator converter for Cast."""
 
@@ -2736,6 +2758,7 @@ def _get_convert_map(opset):
         "Resize": Resize.get_converter(opset),
         "NonZero": NonZero.get_converter(opset),
         "Range": Range.get_converter(opset),
+        "CumSum": CumSum.get_converter(opset),
         # defs/control_flow
         "Loop": Loop.get_converter(opset),
         "If": If.get_converter(opset),
diff --git a/python/tvm/relay/op/_transform.py b/python/tvm/relay/op/_transform.py
index fd07c98ddc1f..ba2416ff8950 100644
--- a/python/tvm/relay/op/_transform.py
+++ b/python/tvm/relay/op/_transform.py
@@ -116,7 +116,7 @@ def compute_scatter_nd(attrs, inputs, output_type):
 @_reg.register_compute("cumsum")
 def compute_cumsum(attrs, inputs, output_type):
     """Compute definition of cumsum"""
-    return [topi.cumsum(inputs[0], attrs.axis, attrs.dtype)]
+    return [topi.cumsum(inputs[0], attrs.axis, attrs.dtype, attrs.exclusive)]
 
 
 _reg.register_strategy("cumsum", strategy.cumsum_strategy)
diff --git a/python/tvm/relay/op/strategy/generic.py b/python/tvm/relay/op/strategy/generic.py
index 3ad75faf4bc1..af1d2552fab7 100644
--- a/python/tvm/relay/op/strategy/generic.py
+++ b/python/tvm/relay/op/strategy/generic.py
@@ -1367,7 +1367,7 @@ def wrap_compute_cumsum(topi_compute):
     """Wrap cumsum topi compute"""
 
     def _compute_cumsum(attrs, inputs, _):
-        return [topi_compute(inputs[0], attrs.axis, attrs.dtype)]
+        return [topi_compute(inputs[0], attrs.axis, attrs.dtype, attrs.exclusive)]
 
     return _compute_cumsum
 
diff --git a/python/tvm/relay/op/transform.py b/python/tvm/relay/op/transform.py
index 6785ff248612..e9d081eb5fb6 100644
--- a/python/tvm/relay/op/transform.py
+++ b/python/tvm/relay/op/transform.py
@@ -1322,7 +1322,7 @@ def adv_index(inputs):
     return _make.adv_index(Tuple(inputs))
 
 
-def cumsum(data, axis=None, dtype=None):
+def cumsum(data, axis=None, dtype=None, exclusive=None):
     """Numpy style cumsum op. Return the cumulative inclusive sum of the elements along
     a given axis.
 
@@ -1339,6 +1339,12 @@ def cumsum(data, axis=None, dtype=None):
         Type of the returned array and of the accumulator in which the elements are summed.
         If dtype is not specified, it defaults to the dtype of data.
 
+    exclusive : int, optional
+        If set to 1 will return exclusive sum in which the first element is not
+        included. In other terms, if set to 1, the j-th output element would be
+        the sum of the first (j-1) elements. Otherwise, it would be the sum of
+        the first j elements.
+
     Returns
     -------
     result : relay.Expr
@@ -1368,4 +1374,4 @@ def cumsum(data, axis=None, dtype=None):
         cumsum(a, dtype=int32)  # dtype should be provided to get the expected results
         -> [1, 1, 2, 2, 3, 4, 4]
     """
-    return _make.cumsum(data, axis, dtype)
+    return _make.cumsum(data, axis, dtype, exclusive)
diff --git a/python/tvm/topi/cuda/scan.py b/python/tvm/topi/cuda/scan.py
index 232d679840fd..0bdab100b429 100644
--- a/python/tvm/topi/cuda/scan.py
+++ b/python/tvm/topi/cuda/scan.py
@@ -488,7 +488,7 @@ def traverse(op):
     return s
 
 
-def cumsum(data, axis=None, dtype=None):
+def cumsum(data, axis=None, dtype=None, exclusive=None):
     """Numpy style cumsum op. Return the cumulative sum of the elements along a given axis.
 
     Parameters
@@ -504,6 +504,12 @@ def cumsum(data, axis=None, dtype=None):
         Type of the returned array and of the accumulator in which the elements are summed.
         If dtype is not specified, it defaults to the dtype of data.
 
+    exclusive : int, optional
+        If set to 1 will return exclusive sum in which the first element is not
+        included. In other terms, if set to 1, the j-th output element would be
+        the sum of the first (j-1) elements. Otherwise, it would be the sum of
+        the first j elements.
+
     Returns
     -------
     result : tvm.te.Tensor
@@ -514,4 +520,6 @@ def cumsum(data, axis=None, dtype=None):
         axis = 0
         data = reshape(data, (prod(data.shape),))
     axis = get_const_int(axis)
+    if exclusive is not None and exclusive != 0:
+        return exclusive_scan(data, axis, output_dtype=dtype, binop=tvm.tir.generic.add)
     return inclusive_scan(data, axis, output_dtype=dtype, binop=tvm.tir.generic.add)
diff --git a/python/tvm/topi/cumsum.py b/python/tvm/topi/cumsum.py
index 855427b1c619..2013a352874d 100644
--- a/python/tvm/topi/cumsum.py
+++ b/python/tvm/topi/cumsum.py
@@ -22,7 +22,7 @@
 from .math import cast
 
 
-def cumsum(data, axis=None, dtype=None):
+def cumsum(data, axis=None, dtype=None, exclusive=None):
     """Numpy style cumsum op. Return the cumulative sum of the elements along a given axis.
 
     Parameters
@@ -38,6 +38,12 @@ def cumsum(data, axis=None, dtype=None):
         Type of the returned array and of the accumulator in which the elements are summed.
         If dtype is not specified, it defaults to the dtype of data.
 
+    exclusive : int, optional
+        If set to 1 will return exclusive sum in which the first element is not
+        included. In other terms, if set to 1, the j-th output element would be
+        the sum of the first (j-1) elements. Otherwise, it would be the sum of
+        the first j elements.
+
     Returns
     -------
     result : tvm.te.Tensor
@@ -75,6 +81,9 @@ def maybe_cast(x):
             elif i > axis:
                 axis_mul_after *= value
 
+    if exclusive is None:
+        exclusive = 0
+
     def gen_ir(data_buf, out_buf):
         ib = ir_builder.create()
         data_buf = ib.buffer_ptr(data_buf)
@@ -84,12 +93,18 @@ def gen_ir(data_buf, out_buf):
             i = fused // axis_mul_after
             j = fused % axis_mul_after
             base_idx = i * cumsum_axis_len * axis_mul_after + j
-            out_buf[base_idx] = maybe_cast(data_buf[base_idx])
+            if exclusive == 0:
+                out_buf[base_idx] = maybe_cast(data_buf[base_idx])
+            else:
+                out_buf[base_idx] = cast(0, dtype)
             with ib.for_range(0, cumsum_axis_len - 1, "_k") as _k:
                 k = _k + 1
                 cur_idx = base_idx + k * axis_mul_after
                 prev_idx = base_idx + (k - 1) * axis_mul_after
-                out_buf[cur_idx] = out_buf[prev_idx] + maybe_cast(data_buf[cur_idx])
+                if exclusive == 0:
+                    out_buf[cur_idx] = out_buf[prev_idx] + maybe_cast(data_buf[cur_idx])
+                else:
+                    out_buf[cur_idx] = out_buf[prev_idx] + maybe_cast(data_buf[prev_idx])
 
         return ib.get()
 
diff --git a/src/relay/op/tensor/transform.cc b/src/relay/op/tensor/transform.cc
index d44bfe6959ca..5e39b409615d 100644
--- a/src/relay/op/tensor/transform.cc
+++ b/src/relay/op/tensor/transform.cc
@@ -3705,10 +3705,11 @@ bool CumsumRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
   return true;
 }
 
-Expr MakeCumsum(Expr data, Integer axis, DataType dtype) {
+Expr MakeCumsum(Expr data, Integer axis, DataType dtype, Integer exclusive) {
   auto attrs = make_object<CumsumAttrs>();
   attrs->dtype = dtype;
   attrs->axis = axis;
+  attrs->exclusive = exclusive;
   static const Op& op = Op::Get("cumsum");
   return Call(op, {data}, Attrs(attrs), {});
 }
diff --git a/tests/python/frontend/onnx/test_forward.py b/tests/python/frontend/onnx/test_forward.py
index 515fc32ef88d..27b91dd38f8e 100644
--- a/tests/python/frontend/onnx/test_forward.py
+++ b/tests/python/frontend/onnx/test_forward.py
@@ -3964,6 +3964,82 @@ def verify_softplus(indata):
     verify_softplus(input_data)
 
 
+def test_cumsum():
+    def verify_cumsum(indata, axis, exclusive=0, reverse=0, type="float32"):
+        cumsum_node = onnx.helper.make_node(
+            "CumSum",
+            inputs=["X", "axis"],
+            outputs=["Y"],
+        )
+        if exclusive != 0:
+            exclusive_attr = helper.make_attribute("exclusive", exclusive)
+            cumsum_node.attribute.append(exclusive_attr)
+        if reverse != 0:
+            reverse_attr = helper.make_attribute("reverse", reverse)
+            cumsum_node.attribute.append(reverse_attr)
+        nodes = [
+            make_constant_node("axis", onnx.TensorProto.INT32, [1], [axis]),
+            cumsum_node,
+        ]
+        if type == "float32":
+            tensor_type = TensorProto.FLOAT
+        else:
+            tensor_type = TensorProto.INT32
+            type = "int32"
+
+        graph = helper.make_graph(
+            nodes,
+            "cumsum_test",
+            inputs=[
+                helper.make_tensor_value_info("X", tensor_type, list(indata.shape)),
+            ],
+            outputs=[helper.make_tensor_value_info("Y", tensor_type, list(indata.shape))],
+        )
+
+        model = helper.make_model(graph, producer_name="cumsum_test")
+
+        verify_with_ort_with_inputs(model, [indata], dtype=type, use_vm=True, opset=11)
+
+    data = (
+        np.array(
+            [
+                1.0,
+                2.0,
+                3.0,
+                4.0,
+                5.0,
+                6.0,
+                7.0,
+                8.0,
+                9.0,
+                10.0,
+                11.0,
+                12.0,
+            ]
+        )
+        .astype(np.float32)
+        .reshape((3, 4))
+    )
+
+    verify_cumsum(data, 0)
+    verify_cumsum(data, 1)
+    verify_cumsum(data, 0, 1, 0)
+    verify_cumsum(data, 1, 1, 0)
+    verify_cumsum(data, 0, 0, 1)
+    verify_cumsum(data, 1, 0, 1)
+    verify_cumsum(data, 1, 1, 1)
+    data = np.random.randn(1, 32, 32, 3).astype("float32")
+    verify_cumsum(data, 1)
+    data = np.random.randn(1, 32, 32, 3).astype("int32")
+    verify_cumsum(data, 0, type="int32")
+    verify_cumsum(data, 1, type="int32")
+    verify_cumsum(data, 0, 1, 0, type="int32")
+    verify_cumsum(data, 1, 1, 0, type="int32")
+    verify_cumsum(data, 0, 0, 1, type="int32")
+    verify_cumsum(data, 1, 0, 1, type="int32")
+    verify_cumsum(data, 1, 1, 1, type="int32")
+
+
 if __name__ == "__main__":
     test_flatten()
     test_reshape()
@@ -4040,3 +4116,4 @@ def verify_softplus(indata):
     test_size()
     test_maxunpool()
     test_softplus()
+    test_cumsum()

From 68b7e7147fff8960754e4a3a788fd2474749d121 Mon Sep 17 00:00:00 2001
From: Yao Wang <kevinthesunwy@gmail.com>
Date: Tue, 9 Feb 2021 15:58:52 -0800
Subject: [PATCH 173/357] [Relay][Topi][CPU] Dense with weight transform
 (#7404)

* Add CPU dense weight transform

* Fix format

* Fix python format

* Fix pylint

* Minor fix

* Add test

* Do not need to infer layout for dense

* Fix test

* Rename dense_pack

* Fix test

* Fix lint

* Fix dynamic shape dense

* Fix lint

* Fix autotvm task extraction test

* Disable AlterOpLayout in micro_tflite.py tutorial
---
 python/tvm/relay/op/nn/_nn.py                 |  30 +++++
 python/tvm/relay/op/nn/nn.py                  |  33 +++++
 python/tvm/relay/op/strategy/generic.py       |  13 ++
 python/tvm/relay/op/strategy/x86.py           |  28 ++--
 python/tvm/topi/nn/dense.py                   |  70 ++++++++++
 python/tvm/topi/x86/__init__.py               |   1 +
 python/tvm/topi/x86/dense.py                  | 120 +++++++++++++-----
 python/tvm/topi/x86/dense_alter_op.py         |  68 ++++++++++
 src/relay/op/nn/nn.cc                         |  27 ++++
 src/relay/op/nn/nn.h                          |  25 ++++
 .../relay/test_autotvm_task_extraction.py     |  12 +-
 .../python/relay/test_pass_alter_op_layout.py |  31 ++++-
 tutorials/micro/micro_tflite.py               |   2 +-
 13 files changed, 413 insertions(+), 47 deletions(-)
 create mode 100644 python/tvm/topi/x86/dense_alter_op.py

diff --git a/python/tvm/relay/op/nn/_nn.py b/python/tvm/relay/op/nn/_nn.py
index 37ee6b6e929f..6ae86c0786e5 100644
--- a/python/tvm/relay/op/nn/_nn.py
+++ b/python/tvm/relay/op/nn/_nn.py
@@ -78,6 +78,17 @@ def legalize_dense(attrs, inputs, types):
 reg.register_pattern("nn.dense", reg.OpPattern.OUT_ELEMWISE_FUSABLE)
 
 
+@reg.register_alter_op_layout("nn.dense")
+def alter_op_layout_dense(attrs, inputs, tinfos, out_type):
+    """Alternate the layout of dense"""
+    return topi.nn.dense_alter_layout(attrs, inputs, tinfos, out_type)
+
+
+# dense_pack
+reg.register_strategy("nn.contrib_dense_pack", strategy.dense_pack_strategy)
+reg.register_pattern("nn.contrib_dense_pack", reg.OpPattern.OUT_ELEMWISE_FUSABLE)
+
+
 # fifo_buffer
 @reg.register_compute("nn.fifo_buffer")
 def compute_fifo_buffer(attrs, inputs, out_type):
@@ -1130,6 +1141,25 @@ def dense_shape_func(attrs, inputs, _):
     return ret
 
 
+@script
+def _dense_pack_shape_func(data_shape, weight_shape):
+    out = output_tensor((data_shape.shape[0],), "int64")
+    for i in const_range(out.shape[0] - 1):
+        out[i] = data_shape[i]
+    out[out.shape[0] - 1] = weight_shape[0] * weight_shape[2]
+
+    return out
+
+
+@reg.register_shape_func("nn.contrib_dense_pack", False)
+def dense_pack_shape_func(attrs, inputs, _):
+    """
+    Shape function for dense_pack op.
+    """
+    ret = [_dense_pack_shape_func(inputs[0], inputs[1])]
+    return ret
+
+
 @script
 def _batch_matmul_shape_func(data_shape, weight_shape):
     out = output_tensor((data_shape.shape[0],), "int64")
diff --git a/python/tvm/relay/op/nn/nn.py b/python/tvm/relay/op/nn/nn.py
index 562cee5f53bb..0c233a6e3b53 100644
--- a/python/tvm/relay/op/nn/nn.py
+++ b/python/tvm/relay/op/nn/nn.py
@@ -1435,6 +1435,39 @@ def dense(data, weight, units=None, out_dtype=""):
     return _make.dense(data, weight, units, out_dtype)
 
 
+def contrib_dense_pack(data, weight, units=None, out_dtype=""):
+    """Dense operator.
+    Applies a linear transformation
+
+    .. math::
+
+    `Y = X * W^T`
+
+    Parameters
+    ----------
+    data : tvm.relay.Expr
+        The input data to the operator,
+        of shape `(d_1, d_2, ..., d_n, units_in)`.
+
+    weight : tvm.relay.Expr
+        The transformed weight expressions, 3-D matrix,
+        of shape `(units // pack_weight_tile, units_in, pack_weight_tile)`.
+
+    units : int, optional
+        Number of hidden units of the dense transformation.
+
+    out_dtype : str, optional
+        Specifies the output data type for mixed precision dense,
+        of shape `(d_1, d_2, ..., d_n, units)`.
+
+    Returns
+    -------
+    result : tvm.relay.Expr
+        The computed result.
+    """
+    return _make.contrib_dense_pack(data, weight, units, out_dtype)
+
+
 def fifo_buffer(data, buffer, axis):
     """FIFO buffer to enable computation reuse in CNNs with sliding indow input
 
diff --git a/python/tvm/relay/op/strategy/generic.py b/python/tvm/relay/op/strategy/generic.py
index af1d2552fab7..92a72f950615 100644
--- a/python/tvm/relay/op/strategy/generic.py
+++ b/python/tvm/relay/op/strategy/generic.py
@@ -731,6 +731,19 @@ def dense_strategy(attrs, inputs, out_type, target):
     return strategy
 
 
+@override_native_generic_func("dense_pack_strategy")
+def dense_pack_strategy(attrs, inputs, out_type, target):
+    """dense_pack generic strategy"""
+    logger.warning("dense_pack is not optimized for this platform.")
+    strategy = _op.OpStrategy()
+    strategy.add_implementation(
+        wrap_compute_dense(topi.nn.dense_pack),
+        wrap_topi_schedule(topi.generic.schedule_dense),
+        name="dense_pack.generic",
+    )
+    return strategy
+
+
 # batch_matmul
 def wrap_compute_batch_matmul(topi_compute, need_auto_scheduler_layout=False):
     """wrap batch_matmul topi compute"""
diff --git a/python/tvm/relay/op/strategy/x86.py b/python/tvm/relay/op/strategy/x86.py
index edfaaeefc5df..f33c45b248d6 100644
--- a/python/tvm/relay/op/strategy/x86.py
+++ b/python/tvm/relay/op/strategy/x86.py
@@ -364,7 +364,6 @@ def conv1d_strategy_cpu(attrs, inputs, out_type, target):
 def dense_strategy_cpu(attrs, inputs, out_type, target):
     """dense x86 strategy"""
     strategy = _op.OpStrategy()
-    m, _ = inputs[0].shape
     same_type = inputs[0].dtype == inputs[1].dtype == out_type.dtype
     dtype = inputs[0].dtype
     u8s8s32 = dtype == "uint8" and inputs[1].dtype == "int8" and out_type.dtype == "int32"
@@ -372,6 +371,13 @@ def dense_strategy_cpu(attrs, inputs, out_type, target):
         wrap_compute_dense(topi.x86.dense_nopack),
         wrap_topi_schedule(topi.x86.schedule_dense_nopack),
         name="dense_nopack.x86",
+        plevel=5,
+    )
+
+    strategy.add_implementation(
+        wrap_compute_dense(topi.x86.dense_pack),
+        wrap_topi_schedule(topi.x86.schedule_dense_pack),
+        name="dense_pack.x86",
         plevel=10,
     )
 
@@ -407,14 +413,18 @@ def dense_strategy_cpu(attrs, inputs, out_type, target):
                 name="dense_mkldnn.x86",
                 plevel=15,
             )
-    with SpecializedCondition(m >= 16):
-        # this implementation may not be well-optimized, so use plevel=5 for now.
-        strategy.add_implementation(
-            wrap_compute_dense(topi.x86.dense_pack),
-            wrap_topi_schedule(topi.x86.schedule_dense_pack),
-            name="dense_pack.x86",
-            plevel=5,
-        )
+    return strategy
+
+
+@dense_pack_strategy.register("cpu")
+def dense_pack_strategy_cpu(attrs, inputs, out_type, target):
+    """dense_pack x86 strategy"""
+    strategy = _op.OpStrategy()
+    strategy.add_implementation(
+        wrap_compute_dense(topi.x86.dense_pack),
+        wrap_topi_schedule(topi.x86.schedule_dense_pack),
+        name="dense_pack.x86",
+    )
     return strategy
 
 
diff --git a/python/tvm/topi/nn/dense.py b/python/tvm/topi/nn/dense.py
index bb6ea90c3fcd..e8ec476b86a5 100644
--- a/python/tvm/topi/nn/dense.py
+++ b/python/tvm/topi/nn/dense.py
@@ -14,6 +14,7 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
+# pylint: disable=invalid-name,unused-argument
 """TVM operator fully connected compute."""
 import tvm
 from tvm import te, auto_scheduler
@@ -104,3 +105,72 @@ def dense_legalize(attrs, inputs, types):
     # not to change by default
     # pylint: disable=unused-argument
     return None
+
+
+def dense_pack(data, weight, bias=None, out_dtype=None):
+    """The default implementation of dense_pack in topi.
+
+    Parameters
+    ----------
+    data : tvm.te.Tensor
+        2-D with shape [batch, in_dim]
+
+    weight : tvm.te.Tensor
+        2-D with shape [out_dim, in_dim]
+
+    bias : Optional[tvm.te.Tensor]
+        1-D with shape [out_dim]
+
+    out_dtype : Optional[str]
+        The output type. This is used for mixed precision.
+
+    Returns
+    -------
+    output : tvm.te.Tensor
+        2-D with shape [batch, out_dim]
+    """
+    if out_dtype is None:
+        out_dtype = data.dtype
+    M, K = get_const_tuple(data.shape)  # batch, in_dim
+    N, _, packw_bn = get_const_tuple(weight.shape)  # out_dim
+    N = N * packw_bn
+
+    idxdiv = tvm.tir.indexdiv
+    idxmod = tvm.tir.indexmod
+    k = te.reduce_axis((0, K), name="k")
+    C = te.compute(
+        (M, N),
+        lambda y, x: te.sum(
+            data[y, k].astype(out_dtype)
+            * weight[idxdiv(x, packw_bn), k, idxmod(x, packw_bn)].astype(out_dtype),
+            axis=k,
+        ),
+        name="T_dense_pack",
+        tag="dense_pack",
+    )
+    if bias is not None:
+        C = te.compute((M, N), lambda i, j: C[i, j] + bias[j].astype(out_dtype), tag=tag.BROADCAST)
+    return C
+
+
+@tvm.target.generic_func
+def dense_alter_layout(attrs, inputs, tinfos, out_type):
+    """Change dense layout.
+
+    Parameters
+    ----------
+    attrs : tvm.ir.Attrs
+        Attributes of current convolution
+    inputs : tvm.relay.Expr
+        Grouped input symbols
+    tinfos : list
+        Input shape and dtype
+    out_type: type
+        The output type
+
+    Note
+    ----
+    Unlike other TOPI functions, this function operates on both graph level and operator level.
+    """
+    # not to change by default
+    return None
diff --git a/python/tvm/topi/x86/__init__.py b/python/tvm/topi/x86/__init__.py
index 154511010a1c..bb6a7cdd4122 100644
--- a/python/tvm/topi/x86/__init__.py
+++ b/python/tvm/topi/x86/__init__.py
@@ -39,4 +39,5 @@
 from .conv3d_transpose import *
 from .sparse import *
 from .conv2d_alter_op import *
+from .dense_alter_op import *
 from .scatter import *
diff --git a/python/tvm/topi/x86/dense.py b/python/tvm/topi/x86/dense.py
index 15d7a1a310d6..6011f01c2cb0 100644
--- a/python/tvm/topi/x86/dense.py
+++ b/python/tvm/topi/x86/dense.py
@@ -15,6 +15,7 @@
 # specific language governing permissions and limitations
 # under the License.
 # pylint: disable=invalid-name,too-many-locals,unused-variable
+# pylint: disable=no-value-for-parameter
 """x86 dense operators"""
 from __future__ import absolute_import as _abs
 import tvm
@@ -26,11 +27,12 @@
 from tvm.contrib import mkldnn
 
 from .utils import get_fp32_len
+from .injective import schedule_injective_from_existing
 from .. import generic, tag
 from ..utils import traverse_inline, get_const_tuple
 
 
-def _schedule_dense_pack_template(cfg, s, C):
+def _schedule_dense_pack_template(cfg, s, C, O):
     A, packedB = s[C].op.input_tensors
 
     CC = s.cache_write(C, "global")
@@ -39,9 +41,10 @@ def _schedule_dense_pack_template(cfg, s, C):
 
     yt, yo, yi = cfg["tile_y"].apply(s, C, y)
     xt, xo, xi = cfg["tile_x"].apply(s, C, x)
-    s[C].reorder(yt, xt, yo, xo, yi, xi)
-    xyt = s[C].fuse(yt, xt)
-    s[C].parallel(xyt)
+    s[C].reorder(xt, yt, yo, xo, yi, xi)
+    xyt = s[C].fuse(xt, yt)
+    if C == O:
+        s[C].parallel(xyt)
     xyo = s[C].fuse(yo, xo)
     s[C].unroll(yi)
     s[C].vectorize(xi)
@@ -51,12 +54,27 @@ def _schedule_dense_pack_template(cfg, s, C):
     ko, ki = cfg["tile_k"].apply(s, CC, k)
     s[CC].reorder(ko, ki, y, x)
     s[CC].vectorize(x)
-    s[CC].unroll(y)
-    s[CC].unroll(ki)
 
-    z, y, x = s[packedB].op.axis
-    s[packedB].reorder(z, x, y)
-    s[packedB].parallel(z)
+    tile_inner = cfg["tile_inner"].size[-1]
+    if tile_inner > 1:
+        yo, yi = s[CC].split(y, tile_inner)
+        s[CC].reorder(ko, yo, ki, yi, x)
+        s[CC].unroll(yo)
+        s[CC].unroll(ki)
+        s[CC].unroll(yi)
+    else:
+        s[CC].unroll(ki)
+        s[CC].unroll(y)
+
+    if C != O:
+        y, x = s[O].op.axis
+        yt, yo, yi = cfg["tile_y"].apply(s, O, y)
+        xt, xo, xi = cfg["tile_x"].apply(s, O, x)
+        s[O].reorder(xt, yt, yo, xo, yi, xi)
+        xyt = s[O].fuse(xt, yt)
+        s[C].compute_at(s[O], xyt)
+        s[O].vectorize(xi)
+        s[O].parallel(xyt)
     return s
 
 
@@ -83,11 +101,11 @@ def _schedule_dense_nopack_template(cfg, s, C):
 
 def _default_dense_pack_config(cfg, M, N, K):
     # Generate default schedule for dynamic shape.
-    if isinstance(M, tvm.tir.Var):
+    if isinstance(M, (tvm.tir.Var, tvm.tir.Any)):
         M = 16
-    if isinstance(N, tvm.tir.Var):
+    if isinstance(N, (tvm.tir.Var, tvm.tir.Any)):
         N = 16
-    if isinstance(K, tvm.tir.Var):
+    if isinstance(K, (tvm.tir.Var, tvm.tir.Any)):
         K = 16
 
     vec_width = get_fp32_len()
@@ -116,15 +134,16 @@ def _default_dense_pack_config(cfg, M, N, K):
     cfg["tile_y"] = SplitEntity([MM // tiley_oi, tiley_oi, tiley_ii])
     cfg["tile_x"] = SplitEntity([NN // tilex_oi, tilex_oi, tilex_ii])
     cfg["tile_k"] = SplitEntity([K, 1])
+    cfg["tile_inner"] = SplitEntity([M // tiley_ii, tiley_ii])
 
 
 def _default_dense_nopack_config(cfg, M, N, K):
     # Generate default schedule for dynamic shape.
-    if isinstance(M, tvm.tir.Var):
+    if isinstance(M, (tvm.tir.Var, tvm.tir.Any)):
         M = 16
-    if isinstance(N, tvm.tir.Var):
+    if isinstance(N, (tvm.tir.Var, tvm.tir.Any)):
         N = 16
-    if isinstance(K, tvm.tir.Var):
+    if isinstance(K, (tvm.tir.Var, tvm.tir.Any)):
         K = 16
 
     vec_width = get_fp32_len()
@@ -146,9 +165,15 @@ def dense_nopack(cfg, data, weight, bias=None, out_dtype=None):
     M, K = get_const_tuple(data.shape)
     N, _ = get_const_tuple(weight.shape)
     # create tuning space
-    cfg.define_split("tile_y", 32 if isinstance(M, tvm.tir.Var) else M, num_outputs=2)
-    cfg.define_split("tile_x", 32 if isinstance(N, tvm.tir.Var) else N, num_outputs=2)
-    cfg.define_split("tile_k", 32 if isinstance(K, tvm.tir.Var) else K, num_outputs=2)
+    cfg.define_split(
+        "tile_y", 32 if isinstance(M, (tvm.tir.Var, tvm.tir.Any)) else M, num_outputs=2
+    )
+    cfg.define_split(
+        "tile_x", 32 if isinstance(N, (tvm.tir.Var, tvm.tir.Any)) else N, num_outputs=2
+    )
+    cfg.define_split(
+        "tile_k", 32 if isinstance(K, (tvm.tir.Var, tvm.tir.Any)) else K, num_outputs=2
+    )
     if cfg.is_fallback:
         _default_dense_nopack_config(cfg, M, N, K)
 
@@ -184,23 +209,46 @@ def _callback(op):
 
 @autotvm.register_topi_compute("dense_pack.x86")
 def dense_pack(cfg, data, weight, bias=None, out_dtype=None):
-    """Compute dense with packing"""
+    """Compute dense with transformed weight."""
     if out_dtype is None:
         out_dtype = data.dtype
     M, K = get_const_tuple(data.shape)  # batch, in_dim
-    N, _ = get_const_tuple(weight.shape)  # out_dim
+    if len(weight.shape) == 3:
+        N, _, packw_bn = get_const_tuple(weight.shape)  # out_dim
+        N = N * packw_bn
+    else:
+        N, _ = get_const_tuple(weight.shape)  # out_dim
     # create tuning space
-    cfg.define_split("tile_y", M, num_outputs=3)
-    cfg.define_split("tile_x", N, num_outputs=3)
-    cfg.define_split("tile_k", K, num_outputs=2)
+    cfg.define_split(
+        "tile_y", 32 if isinstance(M, (tvm.tir.Var, tvm.tir.Any)) else M, num_outputs=3
+    )
+    cfg.define_split(
+        "tile_x", 32 if isinstance(N, (tvm.tir.Var, tvm.tir.Any)) else N, num_outputs=3
+    )
+    cfg.define_split(
+        "tile_k", 32 if isinstance(K, (tvm.tir.Var, tvm.tir.Any)) else K, num_outputs=2
+    )
+    cfg.define_split(
+        "tile_inner",
+        32 if isinstance(M, (tvm.tir.Var, tvm.tir.Any)) else M,
+        num_outputs=2,
+        filter=lambda y: y.size[-1] <= 16,
+    )
     if cfg.is_fallback:
         _default_dense_pack_config(cfg, M, N, K)
 
-    packw_bn = cfg["tile_x"].size[-1]
-    packw_shape = (N // packw_bn, K, packw_bn)
-    packw = te.compute(
-        packw_shape, lambda z, y, x: weight[z * packw_bn + x, y], name="packed_weight"
-    )
+    if len(weight.shape) == 2:
+        packw_bn = cfg["tile_x"].size[-1]
+        packw_shape = (N // packw_bn, K, packw_bn)
+        if autotvm.GLOBAL_SCOPE.in_tuning:
+            # Directly use modified data layout placeholder.
+            packw = tvm.te.placeholder(packw_shape, weight.dtype, name="packed_weight")
+        else:
+            packw = te.compute(
+                packw_shape, lambda z, y, x: weight[z * packw_bn + x, y], name="packed_weight"
+            )
+    else:
+        packw = weight
 
     idxdiv = tvm.tir.indexdiv
     idxmod = tvm.tir.indexmod
@@ -226,7 +274,7 @@ def schedule_dense_pack(cfg, outs):
 
     def _callback(op):
         if "dense_pack" in op.tag:
-            _schedule_dense_pack_template(cfg, s, op.output(0))
+            _schedule_dense_pack_template(cfg, s, op.output(0), outs[0])
 
     traverse_inline(s, outs[0].op, _callback)
     return s
@@ -276,7 +324,19 @@ def dense_mkl(cfg, data, weight, bias=None, out_dtype=None):
 @autotvm.register_topi_schedule("dense_mkl.x86")
 def schedule_dense_mkl(_, outs):
     """Create schedule for dense_mkl"""
-    return generic.schedule_extern(outs)
+    # return generic.schedule_extern(outs)
+    s = te.create_schedule([x.op for x in outs])
+    te.schedule.AutoInlineInjective(s)
+
+    def _callback(op):
+        if "broadcast" in op.tag or "injective" in op.tag or "elemwise" in op.tag:
+            schedule_injective_from_existing(s, op.output(0))
+
+    # traverse_inline(s, outs[0].op, _callback)
+    for out in outs:
+        if "dense" not in out.op.name:
+            schedule_injective_from_existing(s, out)
+    return s
 
 
 @autotvm.register_topi_compute("dense_mkldnn.x86")
diff --git a/python/tvm/topi/x86/dense_alter_op.py b/python/tvm/topi/x86/dense_alter_op.py
new file mode 100644
index 000000000000..5e15c8bf5368
--- /dev/null
+++ b/python/tvm/topi/x86/dense_alter_op.py
@@ -0,0 +1,68 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=invalid-name,unused-variable,unused-argument,no-member
+"""Dense alter op functions for x86"""
+
+import tvm
+from tvm import te
+from tvm import relay
+from tvm import autotvm
+from .dense import _default_dense_pack_config
+from ..utils import get_const_tuple
+from ..nn import dense_alter_layout
+
+
+@dense_alter_layout.register(["cpu", "arm_cpu"])
+def _alter_dense_layout(attrs, inputs, tinfos, out_type):
+    target = tvm.target.Target.current(allow_none=False)
+    dispatch_ctx = autotvm.task.DispatchContext.current
+    data_tensor, weight_tensor = tinfos
+    out_dtype = out_type.dtype
+    M, K = get_const_tuple(data_tensor.shape)
+    N, _ = get_const_tuple(weight_tensor.shape)
+
+    impl, outs = relay.backend.compile_engine.select_implementation(
+        relay.op.get("nn.dense"), attrs, tinfos, out_type, target
+    )
+    workload = autotvm.task.get_workload(outs)
+    if workload:
+        cfg = dispatch_ctx.query(target, workload)
+        topi_impl = workload[0]
+        if topi_impl == "dense_pack.x86":
+            if cfg.is_fallback:
+                _default_dense_pack_config(cfg, M, N, K)
+            packw_bn = cfg["tile_x"].size[-1]
+            weight_layout = "NK%dn" % packw_bn
+            new_weight = te.placeholder(
+                (N // packw_bn, K, packw_bn),
+                dtype=weight_tensor.dtype,
+            )
+            # Relay dense doesn't have bias.
+            new_workload = autotvm.task.args_to_workload(
+                [
+                    data_tensor,
+                    new_weight,
+                    None,
+                    out_dtype,
+                ],
+                topi_impl,
+            )
+            dispatch_ctx.update(target, new_workload, cfg)
+            weight_transform = relay.layout_transform(inputs[1], "NK", weight_layout)
+            return relay.nn.contrib_dense_pack(inputs[0], weight_transform, None, out_dtype)
+
+    return None
diff --git a/src/relay/op/nn/nn.cc b/src/relay/op/nn/nn.cc
index 8ace82be9ff8..3e3d94c614c3 100644
--- a/src/relay/op/nn/nn.cc
+++ b/src/relay/op/nn/nn.cc
@@ -186,6 +186,33 @@ RELAY_REGISTER_OP("nn.dense")
     .set_support_level(1)
     .add_type_rel("Dense", DenseRel<DenseAttrs>);
 
+// relay.nn.contrib_dense_pack
+// Positional relay function to create dense_pack operator used by frontend FFI.
+Expr MakeDensePack(Expr data, Expr weight, IndexExpr units, DataType out_dtype) {
+  auto attrs = make_object<DenseAttrs>();
+  attrs->units = units;
+  attrs->out_dtype = out_dtype;
+  static const Op& op = Op::Get("nn.contrib_dense_pack");
+  return Call(op, {data, weight}, Attrs(attrs), {});
+}
+
+TVM_REGISTER_GLOBAL("relay.op.nn._make.contrib_dense_pack").set_body_typed(MakeDensePack);
+
+RELAY_REGISTER_OP("nn.contrib_dense_pack")
+    .describe(R"code(Applies a linear transformation: :math:`Y = XW^T`.
+
+- **data**: `(x1, x2, ..., xn, input_dim)`
+- **weight**: `(units // pack_weight_tile, input_dim, pack_weight_tile)`
+- **out**: `(x1, x2, ..., xn, units)`.
+
+)code" TVM_ADD_FILELINE)
+    .set_attrs_type<DenseAttrs>()
+    .set_num_inputs(2)
+    .add_argument("data", "nD Tensor", "Input data.")
+    .add_argument("weight", "3D Tensor", "Packed weight matrix.")
+    .set_support_level(10)
+    .add_type_rel("DensePack", DensePackRel<DenseAttrs>);
+
 // relay.leaky_relu
 TVM_REGISTER_NODE_TYPE(LeakyReluAttrs);
 
diff --git a/src/relay/op/nn/nn.h b/src/relay/op/nn/nn.h
index 9b9cff2dba81..c00e2e02b369 100644
--- a/src/relay/op/nn/nn.h
+++ b/src/relay/op/nn/nn.h
@@ -31,6 +31,8 @@
 
 #include <utility>
 
+#include "../op_common.h"
+
 namespace tvm {
 namespace relay {
 
@@ -88,6 +90,29 @@ bool DenseRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
   return true;
 }
 
+template <typename AttrType>
+bool DensePackRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
+                  const TypeReporter& reporter) {
+  ICHECK_EQ(types.size(), 3);
+  const auto* data = types[0].as<TensorTypeNode>();
+  const auto* weight = types[1].as<TensorTypeNode>();
+  if (data == nullptr || weight == nullptr) return false;
+
+  const AttrType* param = attrs.as<AttrType>();
+  ICHECK(param != nullptr);
+
+  Array<tvm::PrimExpr> oshape = data->shape;
+  oshape.Set((oshape.size() - 1), weight->shape[0] * weight->shape[2]);
+
+  DataType out_dtype = param->out_dtype;
+  if (out_dtype.bits() == 0) {
+    out_dtype = data->dtype;
+  }
+  // assign output type
+  reporter->Assign(types[2], TensorType(oshape, out_dtype));
+  return true;
+}
+
 }  // namespace relay
 }  // namespace tvm
 #endif  // TVM_RELAY_OP_NN_NN_H_
diff --git a/tests/python/relay/test_autotvm_task_extraction.py b/tests/python/relay/test_autotvm_task_extraction.py
index da71ac37f695..d6bfd8d0ec11 100644
--- a/tests/python/relay/test_autotvm_task_extraction.py
+++ b/tests/python/relay/test_autotvm_task_extraction.py
@@ -60,9 +60,9 @@ def test_task_extraction():
     tasks = autotvm.task.extract_from_program(
         mod["main"], target=target, params=params, ops=(dense,)
     )
-    assert len(tasks) == 1
+    assert len(tasks) == 2
     tasks = autotvm.task.extract_from_program(mod, target=target, params=params, ops=(dense,))
-    assert len(tasks) == 1
+    assert len(tasks) == 2
 
     mod, params, _ = get_network("resnet-18", batch_size=1)
     mod_list.append(mod)
@@ -70,13 +70,13 @@ def test_task_extraction():
     tasks = autotvm.task.extract_from_program(
         mod["main"], target=target, params=params, ops=(conv2d, dense)
     )
-    assert len(tasks) == 13
+    assert len(tasks) == 14
     tasks = autotvm.task.extract_from_program(
         mod, target=target, params=params, ops=(conv2d, dense)
     )
-    assert len(tasks) == 13
+    assert len(tasks) == 14
     tasks = autotvm.task.extract_from_program(mod, target=target, params=params)
-    assert len(tasks) == 13
+    assert len(tasks) == 14
 
     mod, params, _ = get_network("resnet3d-18", batch_size=1)
     tasks = autotvm.task.extract_from_program(mod, target=target, params=params, ops=(conv3d,))
@@ -88,7 +88,7 @@ def test_task_extraction():
     tasks = autotvm.task.extract_from_program(
         mod, target=target, params=params, ops=(conv2d, dense)
     )
-    assert len(tasks) == 20
+    assert len(tasks) == 21
 
     mod, params, _ = get_network("dcgan", batch_size=1)
     tasks = autotvm.task.extract_from_program(
diff --git a/tests/python/relay/test_pass_alter_op_layout.py b/tests/python/relay/test_pass_alter_op_layout.py
index 58c279d750ec..41186884bdb2 100644
--- a/tests/python/relay/test_pass_alter_op_layout.py
+++ b/tests/python/relay/test_pass_alter_op_layout.py
@@ -18,7 +18,7 @@
 import pytest
 
 import tvm
-from tvm import relay
+from tvm import relay, topi
 from tvm.relay import transform, analysis
 from tvm.relay.testing.temp_op_attr import TempOpAttr
 from tvm.relay.testing import run_infer_type
@@ -1248,6 +1248,34 @@ def expected():
     assert tvm.ir.structural_equal(a, b, map_free_vars=True), "Actual = \n" + str(a)
 
 
+def test_alter_op_dense():
+    def before():
+        x = relay.var("x", shape=(32, 64))
+        weight = relay.var("weight", shape=(48, 64))
+        y = relay.nn.dense(x, weight)
+        y = relay.Function(analysis.free_vars(y), y)
+        return y
+
+    def expected():
+        x = relay.var("x", shape=(32, 64))
+        weight = relay.var("weight", shape=(48, 64))
+        target_layout = "NK16n"
+        weight_transform = relay.layout_transform(weight, "NK", target_layout)
+        y = relay.nn.contrib_dense_pack(x, weight_transform, units=None, out_dtype="float32")
+        y = relay.Function(analysis.free_vars(y), y)
+        return y
+
+    for target, _ in tvm.testing.enabled_targets():
+        with tvm.target.Target(target):
+            with TempOpAttr(
+                "nn.dense", "FTVMAlterOpLayout", topi.x86.dense_alter_op._alter_dense_layout
+            ):
+                a = before()
+                a = run_opt_pass(a, transform.AlterOpLayout())
+                b = run_opt_pass(expected(), transform.InferType())
+                assert tvm.ir.structural_equal(a, b)
+
+
 if __name__ == "__main__":
     test_alter_op()
     test_alter_return_none()
@@ -1269,3 +1297,4 @@ def expected():
     test_alter_layout_nhwc_arm()
     test_alter_layout_nhwc_int8_aarch64()
     test_alter_op_with_global_var()
+    test_alter_op_dense()
diff --git a/tutorials/micro/micro_tflite.py b/tutorials/micro/micro_tflite.py
index c28918380265..c979216d0c6b 100644
--- a/tutorials/micro/micro_tflite.py
+++ b/tutorials/micro/micro_tflite.py
@@ -195,7 +195,7 @@
 # Now, compile the model for the target:
 
 with tvm.transform.PassContext(
-    opt_level=3, config={"tir.disable_vectorize": True}, disabled_pass=["FuseOps"]
+    opt_level=3, config={"tir.disable_vectorize": True}, disabled_pass=["FuseOps", "AlterOpLayout"]
 ):
     graph, c_mod, c_params = relay.build(mod, target=TARGET, params=params)
 

From 3863e09caef51942265f31b89af0b99c4eeb8001 Mon Sep 17 00:00:00 2001
From: Tristan Konolige <tristan.konolige@gmail.com>
Date: Wed, 10 Feb 2021 05:38:47 -0800
Subject: [PATCH 174/357] [FIX,CMAKE] Only set Clang flags for C++ files
 (#7424)

Clang flags were set for all file types, causing nvcc to error out.
---
 CMakeLists.txt                 |  4 +++-
 cmake/modules/ClangFlags.cmake | 19 +++++++++----------
 2 files changed, 12 insertions(+), 11 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 92630faf07d5..769a35318d9d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -151,7 +151,6 @@ else(MSVC)
       CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0)
     set(CMAKE_CXX_FLAGS "-faligned-new ${CMAKE_CXX_FLAGS}")
   endif()
-  include(cmake/modules/ClangFlags.cmake)
 
   # Detect if we're compiling for Hexagon.
   set(TEST_FOR_HEXAGON_CXX
@@ -435,6 +434,9 @@ endif()
 target_link_libraries(tvm PRIVATE ${TVM_LINKER_LIBS} ${TVM_RUNTIME_LINKER_LIBS})
 target_link_libraries(tvm_runtime PRIVATE ${TVM_RUNTIME_LINKER_LIBS})
 
+# Set flags for clang
+include(cmake/modules/ClangFlags.cmake)
+
 # Related headers
 target_include_directories(
   tvm
diff --git a/cmake/modules/ClangFlags.cmake b/cmake/modules/ClangFlags.cmake
index 53d0e3631caf..841570dc2e12 100644
--- a/cmake/modules/ClangFlags.cmake
+++ b/cmake/modules/ClangFlags.cmake
@@ -28,9 +28,9 @@ if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
       (CLANG_VERSION VERSION_GREATER ${CLANG_MINIMUM_VERSION}))
     message(STATUS "Setting enhanced clang warning flags")
 
-    # These warnings are only enabled when clang's -Weverything flag is enabled
-    # but there is no harm in turning them off for all cases.
-    add_compile_options(
+    set(warning_opts
+      # These warnings are only enabled when clang's -Weverything flag is enabled
+      # but there is no harm in turning them off for all cases.
       -Wno-c++98-compat
       -Wno-c++98-compat-extra-semi
       -Wno-c++98-compat-pedantic
@@ -61,17 +61,13 @@ if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
       -Wno-implicit-fallthrough
       -Wno-unreachable-code-return
       -Wno-non-virtual-dtor
-    )
-
-    # Here we have non-standard warnings that clang has available and are useful
-    # so enable them if we are using clang.
-    add_compile_options(
+      # Here we have non-standard warnings that clang has available and are useful
+      # so enable them if we are using clang.
       -Wreserved-id-macro
       -Wused-but-marked-unused
       -Wdocumentation-unknown-command
       -Wcast-qual
       -Wzero-as-null-pointer-constant
-
       # These warnings should be enabled one at a time and fixed.
       # To enable one of these warnings remove the `no-` after -W so
       # -Wno-documentation -> -Wdocumentation
@@ -85,7 +81,10 @@ if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
       -Wno-old-style-cast
       -Wno-gnu-anonymous-struct
       -Wno-nested-anon-types
-      )
+    )
+  target_compile_options(tvm_objs PRIVATE $<$<COMPILE_LANGUAGE:CXX>: ${warning_opts}>)
+  target_compile_options(tvm_runtime_objs PRIVATE $<$<COMPILE_LANGUAGE:CXX>: ${warning_opts}>)
+
 
   endif ()
 endif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")

From 9175c6dd32784fc691cc8e671250f70e578d9b07 Mon Sep 17 00:00:00 2001
From: Ritwik Das <ritwikdas54@gmail.com>
Date: Wed, 10 Feb 2021 11:51:12 -0800
Subject: [PATCH 175/357] TRT Dynamic Reshape Fix (#7412)

* Dynamic Reshape

* Changes

* Add test cases

* Add test cases

* PR COmments

* CI Error

* EmptyCommitCIError

Co-authored-by: Ubuntu <ubuntu@ip-172-31-42-251.us-east-2.compute.internal>
---
 python/tvm/relay/op/contrib/tensorrt.py |  13 ++-
 tests/python/contrib/test_tensorrt.py   | 101 ++++++++++++++++++++++++
 2 files changed, 107 insertions(+), 7 deletions(-)

diff --git a/python/tvm/relay/op/contrib/tensorrt.py b/python/tvm/relay/op/contrib/tensorrt.py
index db9684d02ac9..afdea9712342 100644
--- a/python/tvm/relay/op/contrib/tensorrt.py
+++ b/python/tvm/relay/op/contrib/tensorrt.py
@@ -615,7 +615,6 @@ def layout_transform_annotate_fn(expr):  # pylint: disable=unused-variable
 @_register_external_dynamic_check_func("reshape")
 def reshape_annotate_fn(expr):  # pylint: disable=unused-variable
     """Check if reshape is supported by TensorRT."""
-
     attrs, args = expr.attrs, expr.args
     if args[0].checked_type.dtype != "float32":
         logger.info("Only float32 inputs are supported for TensorRT.")
@@ -629,23 +628,23 @@ def reshape_annotate_fn(expr):  # pylint: disable=unused-variable
         if len(new_shape) == 0 or len(shape) == 0:
             logger.info("reshape: Can't reshape to or from scalar.")
             return False
-
         dynamic_reshape = any([isinstance(x, tvm.tir.expr.Any) for x in shape])
 
         if dynamic_reshape:
             # Make sure that the batch dim is unmodified.
             if int(new_shape[0]) < 0:
-                for shape_val, new_shape_val in enumerate(shape[1:], new_shape[1:]):
+                for shape_val, new_shape_val in zip(shape[1:], new_shape[1:]):
                     if not (
-                        isinstance(shape_val, int)
-                        and isinstance(new_shape_val, int)
+                        isinstance(shape_val, (int, tvm.tir.expr.IntImm))
+                        and isinstance(new_shape_val, (int, tvm.tir.expr.IntImm))
                         and int(shape_val) == int(new_shape_val)
                     ):
                         return False
             elif int(new_shape[0]) > 0:
+                # Currently we only allow dim[0] to be Any, so this branch will always be False
                 if not (
-                    isinstance(shape[0], int)
-                    and isinstance(new_shape[0], int)
+                    isinstance(shape[0], (int, tvm.tir.expr.IntImm))
+                    and isinstance(new_shape[0], (int, tvm.tir.expr.IntImm))
                     and int(shape[0]) == int(new_shape[0])
                 ):
                     return False
diff --git a/tests/python/contrib/test_tensorrt.py b/tests/python/contrib/test_tensorrt.py
index bd8d92eedb4c..7ddc4e762cfd 100644
--- a/tests/python/contrib/test_tensorrt.py
+++ b/tests/python/contrib/test_tensorrt.py
@@ -27,6 +27,7 @@
 from tvm.contrib import graph_runtime, utils
 from tvm.runtime.vm import VirtualMachine
 from tvm.relay import Any, GlobalVar, transform
+from tvm.relay.expr_functor import ExprVisitor
 from typing import Dict, Tuple, Union
 from tvm.contrib.download import download
 from tvm.relay.op.contrib import tensorrt
@@ -631,6 +632,106 @@ def get_graph(x_shape, new_shape):
     run_and_verify_func(get_graph((1, 1, 2, 3), (1, 6)))
 
 
+class AreOpsOnGraph(ExprVisitor):
+    """
+    Visits the Graph recursively and checks if it contains ops in the op_list
+    """
+
+    def __init__(self, op_list):
+        ExprVisitor.__init__(self)
+        self.op_list = op_list
+        self.on_graph = False
+
+    def visit_call(self, call):
+        if isinstance(call.op, tvm.tir.op.Op):
+            if str(call.op) in self.op_list:
+                self.on_graph = True
+
+        return super().visit_call(call)
+
+    def are_ops_on_graph(self, subgraph) -> bool:
+        """
+        This function recursively visits the graph and checks if op_list ops are ongraph"
+        """
+        self.visit(subgraph)
+        return self.on_graph
+
+
+def are_ops_on_trt(mod, op_list):
+    for subgraph in mod.get_global_vars():
+        name = subgraph.name_hint
+        op_on_trt = False
+        op_on_tvm = True
+        if name == "main":
+            op_on_tvm = AreOpsOnGraph(op_list).are_ops_on_graph(mod[name].body)
+        elif mod[name].attrs and mod[name].attrs["Compiler"] == "tensorrt":
+            op_on_trt = AreOpsOnGraph(op_list).are_ops_on_graph(mod[name].body)
+        else:
+            op_on_tvm &= AreOpsOnGraph(op_list).are_ops_on_graph(mod[name].body)
+
+        if not op_on_trt or op_on_tvm:
+            return False
+
+    return True
+
+
+def test_dynamic_reshape():
+    if skip_codegen_test():
+        return
+
+    def test_run(x_data_list, x_shape, new_shape, should_offload_to_trt):
+        result_arr = [{} for _ in range(len(x_data_list))]
+        for use_trt in [True, False]:
+            x = relay.var("x", shape=x_shape, dtype="float32")
+            out = relay.reshape(x, new_shape)
+            f = relay.Function([x], out)
+            mod = tvm.IRModule()
+            mod["main"] = f
+            if use_trt:
+                mod, _ = tensorrt.partition_for_tensorrt(
+                    mod, params={}, remove_no_mac_subgraphs=False
+                )
+                assert are_ops_on_trt(mod, op_list=["reshape"]) == should_offload_to_trt
+            if not skip_runtime_test():
+                with relay.build_config(opt_level=3):
+                    relay_exec = relay.create_executor("vm", mod=mod, ctx=tvm.cpu(0), target="llvm")
+
+                for i, x_data in enumerate(x_data_list):
+                    result_arr[i][use_trt] = relay_exec.evaluate()(x_data)
+
+        if not skip_runtime_test():
+            for i in range(len(x_data_list)):
+                assert_result_dict_holds(result_arr[i])
+
+    dim_values = [1, 1, 0, 2, 3, 0, 1, 3, 2]
+    x_shape = (relay.Any(), 3, 2, 3)
+    x_data_list = [
+        np.ones([dim_value] + list(x_shape)[1:]).astype("float32") for dim_value in dim_values
+    ]
+    new_shape = (-1, 3, 2, 3)
+    should_offload_to_trt = True
+    test_run(x_data_list, x_shape, new_shape, should_offload_to_trt)
+
+    dim_values = [1, 1, 0, 2, 3, 0, 1, 3, 2]
+    x_shape = (relay.Any(), 3, 2, 3)
+    x_data_list = [
+        np.ones([dim_value] + list(x_shape)[1:]).astype("float32") for dim_value in dim_values
+    ]
+    new_shape = (-1, 1, 2, 3)
+    should_offload_to_trt = False
+    test_run(x_data_list, x_shape, new_shape, should_offload_to_trt)
+
+    dim_values = [1, 1, 0, 2, 3, 0, 1, 3, 2]
+    x_shape = (1, relay.Any(), 2, 3)
+    x_data_list = [
+        np.ones(list(x_shape[:1]) + [dim_value] + list(x_shape)[2:]).astype("float32")
+        for dim_value in dim_values
+    ]
+    new_shape = (1, -1, 2, 3)
+    should_offload_to_trt = False
+    test_run(x_data_list, x_shape, new_shape, should_offload_to_trt)
+
+
 def test_transpose():
     def get_graph(x_shape, order):
         x = relay.var("x", shape=(x_shape), dtype="float32")

From 12c6b70e334da811d0330622460158b369388ae2 Mon Sep 17 00:00:00 2001
From: Matthew Brookhart <mbrookhart@octoml.ai>
Date: Wed, 10 Feb 2021 12:54:43 -0700
Subject: [PATCH 176/357] Simplify full broadcast (#7423)

* convert argwhere(full(const)) to reshape(arange())

* Add IsWildcard syntatic sugar

* add a simplify expression to fold full into broadcast ops

* Allow constant folding of full-like ops after SimplifyExpr

* fix a bug with the Attr Pattern matching

* remove skip_list
---
 include/tvm/relay/dataflow_pattern.h          |   2 +
 src/relay/ir/dataflow_matcher.cc              |   8 +-
 src/relay/ir/dataflow_pattern.cc              |   1 +
 src/relay/op/make_op.h                        |   6 +
 src/relay/op/tensor/unary.cc                  |   6 +-
 src/relay/transforms/fold_constant.cc         |   5 -
 src/relay/transforms/simplify_expr.cc         | 111 ++++++++++++++++--
 tests/python/relay/test_dataflow_pattern.py   |   2 +
 tests/python/relay/test_pass_fold_constant.py |  16 ---
 tests/python/relay/test_pass_simplify_expr.py |  65 ++++++++++
 10 files changed, 185 insertions(+), 37 deletions(-)

diff --git a/include/tvm/relay/dataflow_pattern.h b/include/tvm/relay/dataflow_pattern.h
index 1e6cecfd041b..99ef9a237de2 100644
--- a/include/tvm/relay/dataflow_pattern.h
+++ b/include/tvm/relay/dataflow_pattern.h
@@ -524,6 +524,8 @@ class DominatorPattern : public DFPattern {
 DFPattern IsVar(const String& name);
 /*! \brief Syntatic Sugar for creating a ConstantPattern */
 DFPattern IsConstant();
+/*! \brief Syntatic Sugar for creating a WildcardPattern */
+DFPattern IsWildcard();
 /*! \brief Syntatic Sugar for creating a ExprPattern */
 DFPattern IsExpr(const Expr& expr);
 /*! \brief Syntatic Sugar for creating a ExprPattern base on an Op*/
diff --git a/src/relay/ir/dataflow_matcher.cc b/src/relay/ir/dataflow_matcher.cc
index a43f50f600df..ac716579f2ab 100644
--- a/src/relay/ir/dataflow_matcher.cc
+++ b/src/relay/ir/dataflow_matcher.cc
@@ -162,8 +162,12 @@ bool DFPatternMatcher::VisitDFPattern_(const AttrPatternNode* attr_pattern, cons
       if (Op::HasAttrMap(attr_name)) {
         auto op_map = Op::GetAttrMap<TVMRetValue>(attr_name);
         if (op_map.count(op)) {
-          matches = MatchRetValue(attr_value, op_map[op]);
+          matches &= MatchRetValue(attr_value, op_map[op]);
+        } else {
+          matches = false;
         }
+      } else {
+        matches = false;
       }
     }
   } else if (auto* op = expr.as<CallNode>()) {
@@ -196,6 +200,8 @@ bool DFPatternMatcher::VisitDFPattern_(const AttrPatternNode* attr_pattern, cons
         break;
       }
     }
+  } else {
+    matches = false;
   }
   return matches;
 }
diff --git a/src/relay/ir/dataflow_pattern.cc b/src/relay/ir/dataflow_pattern.cc
index 4c3b82cc19d4..9c65c490d855 100644
--- a/src/relay/ir/dataflow_pattern.cc
+++ b/src/relay/ir/dataflow_pattern.cc
@@ -357,6 +357,7 @@ DFPattern DFPattern::HasShape(const Array<PrimExpr> shape) {
 }
 DFPattern IsVar(const String& name) { return VarPattern(name); }
 DFPattern IsConstant() { return ConstantPattern(make_object<ConstantPatternNode>()); }
+DFPattern IsWildcard() { return WildcardPattern(make_object<WildcardPatternNode>()); }
 DFPattern IsExpr(const Expr& expr) { return ExprPattern(expr); }
 DFPattern IsOp(const String& op_name) { return IsExpr(Op::Get(op_name)); }
 DFPattern IsTuple(const Array<DFPattern>& fields) { return TuplePattern(fields); }
diff --git a/src/relay/op/make_op.h b/src/relay/op/make_op.h
index 2b05290b270c..79f7e135e29d 100644
--- a/src/relay/op/make_op.h
+++ b/src/relay/op/make_op.h
@@ -100,6 +100,12 @@ Expr MakeResize(Expr data, Array<IndexExpr> size, String layout, String method,
 
 Expr MakeSparseToDense(Expr indices, Array<Integer> output_shape, Expr values, Expr default_value);
 
+Expr MakeArange(Expr start, Expr stop, Expr step, DataType dtype);
+
+Expr MakeShapeOf(Expr data, DataType dtype);
+
+Expr MakeTake(Expr data, Expr indices, Integer axis, String mode);
+
 }  // namespace relay
 }  // namespace tvm
 #endif  // TVM_RELAY_OP_MAKE_OP_H_
diff --git a/src/relay/op/tensor/unary.cc b/src/relay/op/tensor/unary.cc
index e17bdc0e0906..3e82b92a5f03 100644
--- a/src/relay/op/tensor/unary.cc
+++ b/src/relay/op/tensor/unary.cc
@@ -430,12 +430,14 @@ Array<te::Tensor> ShapeOfCompute(const Attrs& attrs, const Array<te::Tensor>& in
   return {topi::shape(inputs[0], param->dtype)};
 }
 
-TVM_REGISTER_GLOBAL("relay.op._make.shape_of").set_body_typed([](Expr data, DataType dtype) {
+Expr MakeShapeOf(Expr data, DataType dtype) {
   auto attrs = make_object<ShapeOfAttrs>();
   attrs->dtype = dtype;
   static const Op& op = Op::Get("shape_of");
   return Call(op, {data}, Attrs(attrs), {});
-});
+}
+
+TVM_REGISTER_GLOBAL("relay.op._make.shape_of").set_body_typed(MakeShapeOf);
 
 RELAY_REGISTER_OP("shape_of")
     .describe(R"code(Returns a tensor representing the shape of a tensor.
diff --git a/src/relay/transforms/fold_constant.cc b/src/relay/transforms/fold_constant.cc
index 657d4db993b0..4454c9c0459a 100644
--- a/src/relay/transforms/fold_constant.cc
+++ b/src/relay/transforms/fold_constant.cc
@@ -148,8 +148,6 @@ class ConstantFolder : public MixedModeMutator {
     }
     static auto op_stateful = Op::GetAttrMap<TOpIsStateful>("TOpIsStateful");
 
-    std::unordered_set<std::string> skip_list{"zeros_like", "ones_like", "full_like", "full"};
-
     auto origin_args = call->args;
     call = post.as<CallNode>();
     // We don't constant fold function with zero arguments.
@@ -158,9 +156,6 @@ class ConstantFolder : public MixedModeMutator {
     if (call->args.size() == 0) return post;
     const OpNode* op = call->op.as<OpNode>();
     if (op == nullptr) return post;
-    if (skip_list.count(op->name)) {
-      return post;
-    }
     // skip stateful ops.
     if (op_stateful.get(GetRef<Op>(op), false)) return post;
     // Try to evaluate shape_of op
diff --git a/src/relay/transforms/simplify_expr.cc b/src/relay/transforms/simplify_expr.cc
index 0f78c260378c..74e48dc4bc54 100644
--- a/src/relay/transforms/simplify_expr.cc
+++ b/src/relay/transforms/simplify_expr.cc
@@ -29,24 +29,38 @@
 #include <tvm/support/logging.h>
 
 #include "../op/tensor/transform.h"
+#include "pattern_utils.h"
 
 namespace tvm {
 namespace relay {
 
+class SimplifyPattern {
+ public:
+  virtual Expr callback(const Expr& pre, const Expr& post,
+                        const Map<DFPattern, Array<Expr>>& node_map) const = 0;
+
+  DFPattern pattern() const { return pattern_; }
+
+ protected:
+  /*! \brief Pattern for rewriting */
+  DFPattern pattern_;
+};
+
 /*!
  * \brief SimplifyReshape matches the pattern of consecutive reshape or reverse_reshape ops,
  *   and merges into one reshape op.
  */
-class SimplifyReshape {
+class SimplifyReshape : public SimplifyPattern {
  public:
   SimplifyReshape() {
-    x_ = WildcardPattern(make_object<WildcardPatternNode>());
+    x_ = IsWildcard();
     auto reshape1 = IsOp("reshape") || IsOp("contrib_reverse_reshape");
     auto reshape2 = IsOp("reshape") || IsOp("contrib_reverse_reshape");
     pattern_ = reshape1({reshape2({x_})});
   }
 
-  Expr callback(const Expr& pre, const Expr& post, const Map<DFPattern, Array<Expr>>& node_map) {
+  Expr callback(const Expr& pre, const Expr& post,
+                const Map<DFPattern, Array<Expr>>& node_map) const override {
     auto x = node_map[x_][0];
     bool const_shape = true;
     Array<Integer> newshape;
@@ -63,13 +77,82 @@ class SimplifyReshape {
     return post;
   }
 
-  DFPattern pattern() const { return pattern_; }
-
  private:
   /*! \brief Pattern input */
   DFPattern x_;
-  /*! \brief Pattern for consecutive reshape or reverse_reshape ops */
-  DFPattern pattern_;
+};
+
+/*!
+ * \brief FullArgwhere finds full followed by argwhere and turns it into an Arange op
+ */
+class FullElementwise : public SimplifyPattern {
+ public:
+  FullElementwise() {
+    x_ = IsWildcard();
+    data_ = IsWildcard();
+    value_ = IsConstant();
+
+    full_ = IsOp("full")({value_}) || IsOp("full_like")({data_, value_});
+    ones_ = IsOp("ones")({}) || IsOp("ones_like")({data_});
+    zeros_ = IsOp("zeros")({}) || IsOp("zeros_like")({data_});
+
+    Map<String, ObjectRef> attrs;
+    attrs.Set("TOpPattern", Integer(static_cast<int>(kBroadcast)));
+    DFPattern op = IsWildcard().HasAttr(attrs);
+    DFPattern full = full_ || ones_ || zeros_;
+    pattern_ = op({full, x_}) || op({x_, full});
+  }
+
+  Expr callback(const Expr& pre, const Expr& post,
+                const Map<DFPattern, Array<Expr>>& node_map) const override {
+    const CallNode* call = pre.as<CallNode>();
+    ICHECK(call);
+    Type pre_type = pre->checked_type_;
+    ICHECK(pre_type.as<TensorTypeNode>());
+    auto dtype = pre_type.as<TensorTypeNode>()->dtype;
+    auto x = node_map[x_][0];
+    bool is_left = post.as<CallNode>()->args[1] == x;
+    Type x_type;
+    if (is_left) {
+      x_type = call->args[1]->checked_type_;
+    } else {
+      x_type = call->args[0]->checked_type_;
+    }
+
+    if (StructuralEqual()(x_type, pre_type)) {
+      Expr value;
+      if (node_map.count(full_)) {
+        value = node_map[value_][0];
+        ICHECK(IsConstScalar(value));
+      } else if (node_map.count(ones_)) {
+        value = MakeConstantScalar(dtype, 1);
+      } else if (node_map.count(zeros_)) {
+        value = MakeConstantScalar(dtype, 0);
+      } else {
+        ICHECK(false) << "Didn't find a full op while matching full + elementwise";
+      }
+      if (is_left) {
+        return Call(call->op, {value, x}, call->attrs, call->type_args, call->span);
+      } else {
+        return Call(call->op, {x, value}, call->attrs, call->type_args, call->span);
+      }
+    }
+    return post;
+  }
+
+ private:
+  /*! \brief binary argument */
+  DFPattern x_;
+  /*! \brief data ops get shape from */
+  DFPattern data_;
+  /*! \brief constant input */
+  DFPattern value_;
+  /*! \brief full op */
+  DFPattern full_;
+  /*! \brief ones op */
+  DFPattern ones_;
+  /*! \brief zeros op */
+  DFPattern zeros_;
 };
 
 /*!
@@ -78,22 +161,24 @@ class SimplifyReshape {
 class ExprSimplifier {
  public:
   explicit ExprSimplifier(IRModule mod) : mod_(mod) {
-    auto reshape_func = [this](TVMArgs args, TVMRetValue* rv) {
+    CreateCallback(SimplifyReshape());
+    CreateCallback(FullElementwise());
+  }
+  template <typename T>
+  void CreateCallback(const T& pattern) {
+    auto func = [pattern](TVMArgs args, TVMRetValue* rv) {
       Expr pre = args[0];
       Expr post = args[1];
       Map<DFPattern, Array<Expr>> node_map = args[2];
-      *rv = simplify_reshape_.callback(pre, post, node_map);
+      *rv = pattern.callback(pre, post, node_map);
     };
-    callbacks_.push_back(
-        DFPatternCallback(simplify_reshape_.pattern(), PackedFunc(reshape_func), true));
+    callbacks_.push_back(DFPatternCallback(pattern.pattern(), PackedFunc(func), true));
   }
 
   Expr Simplify(const Expr& expr) { return RewritePatterns(callbacks_, expr, mod_); }
 
  private:
   IRModule mod_;
-  /*! \brief Simplify reshape pattern */
-  SimplifyReshape simplify_reshape_;
   /*! \brief Callbacks for expr simplification */
   Array<DFPatternCallback> callbacks_;
 };
diff --git a/tests/python/relay/test_dataflow_pattern.py b/tests/python/relay/test_dataflow_pattern.py
index b39c03a6160e..a8e4b65f1bc6 100644
--- a/tests/python/relay/test_dataflow_pattern.py
+++ b/tests/python/relay/test_dataflow_pattern.py
@@ -437,6 +437,8 @@ def test_no_match_op_attr():
     x = relay.var("x")
     y = relay.var("y")
     assert not op_pat.match(x - y)
+    z = relay.var("z")
+    assert not op_pat.match(relay.Let(z, x + y, z))
 
 
 def test_match_func_attr():
diff --git a/tests/python/relay/test_pass_fold_constant.py b/tests/python/relay/test_pass_fold_constant.py
index 76182d2c3e08..14ad419e80c6 100644
--- a/tests/python/relay/test_pass_fold_constant.py
+++ b/tests/python/relay/test_pass_fold_constant.py
@@ -231,22 +231,6 @@ def expected(dtype):
         assert tvm.ir.structural_equal(zz, zexpected)
 
 
-def test_fold_full():
-    c_shape = (8, 9, 10)
-
-    def before():
-        dtype = "float32"
-        return relay.full(relay.const(1.0, dtype), c_shape, dtype=dtype)
-
-    def expected():
-        # expect no changes
-        return before()
-
-    zz = run_opt_pass(before(), transform.FoldConstant())
-    zexpected = run_opt_pass(expected(), transform.InferType())
-    assert tvm.ir.structural_equal(zz, zexpected)
-
-
 def test_fold_batch_norm():
     def expected():
         data = relay.var("data", relay.TensorType((1, 3, 224, 224), "float32"))
diff --git a/tests/python/relay/test_pass_simplify_expr.py b/tests/python/relay/test_pass_simplify_expr.py
index b57abc6942d7..3d925bcfc759 100644
--- a/tests/python/relay/test_pass_simplify_expr.py
+++ b/tests/python/relay/test_pass_simplify_expr.py
@@ -58,5 +58,70 @@ def symbolic():
     assert tvm.ir.structural_equal(zz, after)
 
 
+def test_simplify_full_elementwise():
+    def validate(shape, value, dtype):
+        def before_left(x, elem_op, full):
+            return elem_op(full, x)
+
+        def after_left(x, elem_op, value):
+            return elem_op(relay.const(value, dtype), x)
+
+        def before_right(x, elem_op, full):
+            return elem_op(x, full)
+
+        def after_right(x, elem_op, value):
+            return elem_op(x, relay.const(value, dtype))
+
+        x = relay.var("x", shape=shape, dtype=dtype)
+        elem_ops = [relay.add, relay.multiply, relay.subtract, relay.divide]
+        full_ops = []
+        if value == 0:
+            full_ops.append(relay.zeros(shape, dtype))
+            full_ops.append(relay.zeros_like(x))
+        if value == 1:
+            full_ops.append(relay.ones(shape, dtype))
+            full_ops.append(relay.ones_like(x))
+        else:
+            full_ops.append(relay.full(relay.const(value, dtype), shape))
+            full_ops.append(relay.full_like(x, relay.const(value, dtype)))
+        for op in elem_ops:
+            for full in full_ops:
+                z = before_left(x, op, full)
+                zz = run_opt_pass(z, transform.SimplifyExpr())
+                after = run_opt_pass(after_left(x, op, value), transform.InferType())
+                assert tvm.ir.structural_equal(zz, after)
+
+                z = before_right(x, op, full)
+                zz = run_opt_pass(z, transform.SimplifyExpr())
+                after = run_opt_pass(after_right(x, op, value), transform.InferType())
+                assert tvm.ir.structural_equal(zz, after)
+
+        # Test the case in which x is broadcast to full's shape
+        full_ops = []
+        if value == 0:
+            full_ops.append(relay.zeros(shape * 2, dtype))
+        if value == 1:
+            full_ops.append(relay.ones(shape * 2, dtype))
+        else:
+            full_ops.append(relay.full(relay.const(value, dtype), shape * 2))
+        for op in elem_ops:
+            for full in full_ops:
+                z = before_left(x, op, full)
+                zz = run_opt_pass(z, transform.SimplifyExpr())
+                after = run_opt_pass(before_left(x, op, full), transform.InferType())
+                assert tvm.ir.structural_equal(zz, after)
+
+                z = before_right(x, op, full)
+                zz = run_opt_pass(z, transform.SimplifyExpr())
+                after = run_opt_pass(before_right(x, op, full), transform.InferType())
+                assert tvm.ir.structural_equal(zz, after)
+
+    for shape in [[10], [10, 10], [10, 10, 10]]:
+        for dtype in ["float32", "int32"]:
+            for value in [0, 1, 2]:
+                validate(shape, value, dtype)
+
+
 if __name__ == "__main__":
     test_simplify_reshape()
+    test_simplify_full_elementwise()

From b7808fbdc32d0fe628f0edc6170943b2497601e8 Mon Sep 17 00:00:00 2001
From: Wuwei Lin <wuwei@apache.org>
Date: Thu, 11 Feb 2021 09:08:51 -0500
Subject: [PATCH 177/357] [Arith] Fix iter_affine_map with non-const extent
 (#7437)

---
 src/arith/iter_affine_map.cc                  | 36 ++++++++++---------
 .../unittest/test_arith_iter_affine_map.py    |  3 ++
 2 files changed, 22 insertions(+), 17 deletions(-)

diff --git a/src/arith/iter_affine_map.cc b/src/arith/iter_affine_map.cc
index 7896db73d10a..7efdd03fa11e 100644
--- a/src/arith/iter_affine_map.cc
+++ b/src/arith/iter_affine_map.cc
@@ -412,8 +412,8 @@ class IterMapRewriter : public ExprMutator {
     return analyzer_->CanProve(floormod(lhs, rhs) == 0);
   }
 
-  PrimExpr SplitFloorDivConst(IterSplitExpr lhs, PrimExpr rhs);
-  PrimExpr SplitFloorModConst(IterSplitExpr lhs, PrimExpr rhs);
+  PrimExpr SplitFloorDivConst(IterSplitExpr lhs, PrimExpr rhs, const PrimExpr& orig);
+  PrimExpr SplitFloorModConst(IterSplitExpr lhs, PrimExpr rhs, const PrimExpr& orig);
 
   static void AddToLhs(IterSumExprNode* lhs, IterSplitExpr rhs, int sign) {
     tir::ExprDeepEqual equal;
@@ -584,7 +584,7 @@ PrimExpr IterMapRewriter::VisitExpr_(const MulNode* op) {
   if (a->IsInstance<IterMapExprNode>() && b->IsInstance<IterMapExprNode>()) {
     // cannot multiply two iterators, mark as unresolved.
     ++unresolved_count_;
-    return Mul(a, b);
+    return GetRef<PrimExpr>(op);
   }
 
   if (!a->IsInstance<IterMapExprNode>()) {
@@ -603,7 +603,8 @@ PrimExpr IterMapRewriter::VisitExpr_(const MulNode* op) {
   }
 }
 
-PrimExpr IterMapRewriter::SplitFloorDivConst(IterSplitExpr lhs, PrimExpr rhs) {
+PrimExpr IterMapRewriter::SplitFloorDivConst(IterSplitExpr lhs, PrimExpr rhs,
+                                             const PrimExpr& orig) {
   // floordiv(x*scale, rhs)
   if (is_one(rhs)) return std::move(lhs);
   if (!is_one(lhs->scale)) {
@@ -619,7 +620,7 @@ PrimExpr IterMapRewriter::SplitFloorDivConst(IterSplitExpr lhs, PrimExpr rhs) {
       } else {
         // mark as unresolved.
         ++unresolved_count_;
-        return floordiv(lhs, rhs);
+        return orig;
       }
     }
   }
@@ -641,7 +642,7 @@ PrimExpr IterMapRewriter::SplitFloorDivConst(IterSplitExpr lhs, PrimExpr rhs) {
   } else {
     // mark as unresolved.
     ++unresolved_count_;
-    return floordiv(lhs, rhs);
+    return orig;
   }
 }
 
@@ -669,25 +670,26 @@ PrimExpr IterMapRewriter::VisitExpr_(const FloorDivNode* op) {
   if (b->IsInstance<IterMapExprNode>()) {
     // cannot divide an iterator, mark as unresolved.
     ++unresolved_count_;
-    return FloorDiv(a, b);
+    return GetRef<PrimExpr>(op);
   }
 
   if (a->IsInstance<IterSumExprNode>()) {
     IterSumExpr ret = Downcast<IterSumExpr>(a);
     if (auto opt = TryFuseIters(ret)) {
-      return SplitFloorDivConst(opt.value(), b);
+      return SplitFloorDivConst(opt.value(), b, GetRef<PrimExpr>(op));
     } else {
       ++unresolved_count_;
-      return FloorDiv(a, b);
+      return GetRef<PrimExpr>(op);
     }
   } else {
     ICHECK(a->IsInstance<IterSplitExprNode>());
     IterSplitExpr ret = Downcast<IterSplitExpr>(std::move(a));
-    return SplitFloorDivConst(ret, b);
+    return SplitFloorDivConst(ret, b, GetRef<PrimExpr>(op));
   }
 }
 
-PrimExpr IterMapRewriter::SplitFloorModConst(IterSplitExpr lhs, PrimExpr rhs) {
+PrimExpr IterMapRewriter::SplitFloorModConst(IterSplitExpr lhs, PrimExpr rhs,
+                                             const PrimExpr& orig) {
   // floormod(x*scale, rhs)
   if (is_one(rhs)) return make_zero(lhs->dtype);
   if (!is_one(lhs->scale)) {
@@ -701,7 +703,7 @@ PrimExpr IterMapRewriter::SplitFloorModConst(IterSplitExpr lhs, PrimExpr rhs) {
       } else {
         // mark as unresolved.
         ++unresolved_count_;
-        return floormod(lhs, rhs);
+        return orig;
       }
     }
   }
@@ -715,7 +717,7 @@ PrimExpr IterMapRewriter::SplitFloorModConst(IterSplitExpr lhs, PrimExpr rhs) {
   } else {
     // mark as unresolved.
     ++unresolved_count_;
-    return floormod(lhs, rhs);
+    return orig;
   }
 }
 
@@ -743,21 +745,21 @@ PrimExpr IterMapRewriter::VisitExpr_(const FloorModNode* op) {
   if (b->IsInstance<IterMapExprNode>()) {
     // cannot mod an iterator, mark as unresolved.
     ++unresolved_count_;
-    return FloorMod(a, b);
+    return GetRef<PrimExpr>(op);
   }
 
   if (a->IsInstance<IterSumExprNode>()) {
     IterSumExpr ret = Downcast<IterSumExpr>(a);
     if (auto opt = TryFuseIters(ret)) {
-      return SplitFloorModConst(opt.value(), b);
+      return SplitFloorModConst(opt.value(), b, GetRef<PrimExpr>(op));
     } else {
       ++unresolved_count_;
-      return FloorMod(a, b);
+      return GetRef<PrimExpr>(op);
     }
   } else {
     ICHECK(a->IsInstance<IterSplitExprNode>());
     IterSplitExpr ret = Downcast<IterSplitExpr>(std::move(a));
-    return SplitFloorModConst(ret, b);
+    return SplitFloorModConst(ret, b, GetRef<PrimExpr>(op));
   }
 }
 
diff --git a/tests/python/unittest/test_arith_iter_affine_map.py b/tests/python/unittest/test_arith_iter_affine_map.py
index 620540cc9841..6ab61fdd9592 100644
--- a/tests/python/unittest/test_arith_iter_affine_map.py
+++ b/tests/python/unittest/test_arith_iter_affine_map.py
@@ -161,6 +161,9 @@ def test_split():
     assert len(res) == 1
     assert_iter_sum_pattern(res[0], 8, 0, scale=2)
 
+    res = tvm.arith.detect_iter_map([fld(x, flm(flm(y, 8), 6))], var_dom([(x, 24), (y, 8)]))
+    assert len(res) == 0
+
 
 def test_compound():
     x = tvm.tir.Var("x", "int32"), 10

From d05d75d820c776aefc95547dd185bfbd28b14c46 Mon Sep 17 00:00:00 2001
From: Andrew Reusch <areusch@octoml.ai>
Date: Thu, 11 Feb 2021 06:09:14 -0800
Subject: [PATCH 178/357] Stop running some python testsuites twice (#7430)

---
 tests/scripts/setup-pytest-env.sh              | 13 +++++++++----
 .../scripts/task_python_arm_compute_library.sh |  3 +--
 tests/scripts/task_python_ethosn_tests.sh      |  2 +-
 tests/scripts/task_python_frontend.sh          | 14 +++++++-------
 tests/scripts/task_python_frontend_cpu.sh      |  6 +++---
 tests/scripts/task_python_integration.sh       | 18 ++++++++++--------
 tests/scripts/task_python_microtvm.sh          |  2 +-
 tests/scripts/task_python_nightly.sh           |  2 +-
 tests/scripts/task_python_topi.sh              |  2 +-
 tests/scripts/task_python_unittest.sh          | 10 ++++++++--
 tests/scripts/task_python_vta_fsim.sh          |  6 ++----
 tests/scripts/task_python_vta_tsim.sh          |  6 ++----
 12 files changed, 46 insertions(+), 38 deletions(-)

diff --git a/tests/scripts/setup-pytest-env.sh b/tests/scripts/setup-pytest-env.sh
index 5f108e9355fc..5d2216c9dc87 100755
--- a/tests/scripts/setup-pytest-env.sh
+++ b/tests/scripts/setup-pytest-env.sh
@@ -33,12 +33,17 @@ export TVM_PYTEST_RESULT_DIR="${TVM_PATH}/build/pytest-results"
 mkdir -p "${TVM_PYTEST_RESULT_DIR}"
 
 function run_pytest() {
-    test_suite_name="$1"
+    local ffi_type="$1"
     shift
-    for ffi_type in ${TVM_PYTEST_FFI_TYPES:-ctypes cython}; do
-        TVM_FFI=${ffi_type} python3 -m pytest \
+    local test_suite_name="$1"
+    shift
+    if [ -z "${ffi_type}" -o -z "${test_suite_name}" ]; then
+        echo "error: run_pytest called incorrectly: run_pytest ${ffi_type} ${test_suite_name} $@"
+        echo "usage: run_pytest <FFI_TYPE> <TEST_SUITE_NAME> [pytest args...]"
+        exit 2
+    fi
+    TVM_FFI=${ffi_type} python3 -m pytest \
            -o "junit_suite_name=${test_suite_name}-${ffi_type}" \
            "--junit-xml=${TVM_PYTEST_RESULT_DIR}/${test_suite_name}-${ffi_type}.xml" \
            "$@"
-    done
 }
diff --git a/tests/scripts/task_python_arm_compute_library.sh b/tests/scripts/task_python_arm_compute_library.sh
index 4c1992b58692..7df894d93399 100755
--- a/tests/scripts/task_python_arm_compute_library.sh
+++ b/tests/scripts/task_python_arm_compute_library.sh
@@ -27,5 +27,4 @@ source tests/scripts/setup-pytest-env.sh
 find . -type f -path "*.pyc" | xargs rm -f
 make cython3
 
-TVM_PYTEST_FFI_TYPES=ctypes run_pytest python-arm_compute_lib \
-                    tests/python/contrib/test_arm_compute_lib
+run_pytest ctypes python-arm_compute_lib tests/python/contrib/test_arm_compute_lib
diff --git a/tests/scripts/task_python_ethosn_tests.sh b/tests/scripts/task_python_ethosn_tests.sh
index 472ca38149a1..981d5715fac6 100755
--- a/tests/scripts/task_python_ethosn_tests.sh
+++ b/tests/scripts/task_python_ethosn_tests.sh
@@ -27,4 +27,4 @@ source tests/scripts/setup-pytest-env.sh
 find . -type f -path "*.pyc" | xargs rm -f
 make cython3
 
-TVM_PYTEST_FFI_TYPES=ctypes run_pytest python-ethosn tests/python/contrib/test_ethosn
+run_pytest ctypes python-ethosn tests/python/contrib/test_ethosn
diff --git a/tests/scripts/task_python_frontend.sh b/tests/scripts/task_python_frontend.sh
index 6b1d8e5038fb..62a0fa1e7fc8 100755
--- a/tests/scripts/task_python_frontend.sh
+++ b/tests/scripts/task_python_frontend.sh
@@ -32,22 +32,22 @@ find . -type f -path "*.pyc" | xargs rm -f
 make cython3
 
 echo "Running relay MXNet frontend test..."
-TVM_PYTHON_FFI_TYPES=cython run_pytest python-frontend-mxnet tests/python/frontend/mxnet
+run_pytest cython python-frontend-mxnet tests/python/frontend/mxnet
 
 echo "Running relay ONNX frontend test..."
-TVM_PYTHON_FFI_TYPES=cython run_pytest python-frontend-onnx tests/python/frontend/onnx
+run_pytest cython python-frontend-onnx tests/python/frontend/onnx
 
 echo "Running relay CoreML frontend test..."
-TVM_PYTHON_FFI_TYPES=cython run_pytest python-frontend-coreml tests/python/frontend/coreml
+run_pytest cython python-frontend-coreml tests/python/frontend/coreml
 
 echo "Running relay Tensorflow frontend test..."
-TVM_PYTHON_FFI_TYPES=cython run_pytest python-frontend-tensorflow tests/python/frontend/tensorflow
+run_pytest cython python-frontend-tensorflow tests/python/frontend/tensorflow
 
 echo "Running relay caffe2 frontend test..."
-TVM_PYTHON_FFI_TYPES=cython run_pytest python-frontend-caffe2 tests/python/frontend/caffe2
+run_pytest cython python-frontend-caffe2 tests/python/frontend/caffe2
 
 echo "Running relay DarkNet frontend test..."
-TVM_PYTHON_FFI_TYPES=cython run_pytest python-frontend-darknet tests/python/frontend/darknet
+run_pytest cython python-frontend-darknet tests/python/frontend/darknet
 
 echo "Running relay PyTorch frontend test..."
-TVM_PYTHON_FFI_TYPES=cython run_pytest python-frontend-pytorch tests/python/frontend/pytorch
+run_pytest cython python-frontend-pytorch tests/python/frontend/pytorch
diff --git a/tests/scripts/task_python_frontend_cpu.sh b/tests/scripts/task_python_frontend_cpu.sh
index a5cd3ba8ef09..208714c64988 100755
--- a/tests/scripts/task_python_frontend_cpu.sh
+++ b/tests/scripts/task_python_frontend_cpu.sh
@@ -33,10 +33,10 @@ find . -type f -path "*.pyc" | xargs rm -f
 make cython3
 
 echo "Running relay TFLite frontend test..."
-TVM_PYTHON_FFI_TYPES=cython run_pytest python-frontend-tflite tests/python/frontend/tflite
+run_pytest cython python-frontend-tflite tests/python/frontend/tflite
 
 echo "Running relay Keras frontend test..."
-TVM_PYTHON_FFI_TYPES=cython run_pytest python-frontend-keras tests/python/frontend/keras
+run_pytest cython python-frontend-keras tests/python/frontend/keras
 
 echo "Running relay Caffe frontend test..."
-TVM_PYTHON_FFI_TYPES=cython run_pytest python-frontend-caffe tests/python/frontend/caffe
+run_pytest cython python-frontend-caffe tests/python/frontend/caffe
diff --git a/tests/scripts/task_python_integration.sh b/tests/scripts/task_python_integration.sh
index dc96097fb115..e21aa065a024 100755
--- a/tests/scripts/task_python_integration.sh
+++ b/tests/scripts/task_python_integration.sh
@@ -27,6 +27,7 @@ export LD_LIBRARY_PATH="build:${LD_LIBRARY_PATH:-}"
 export TVM_BIND_THREADS=0
 export TVM_NUM_THREADS=2
 
+# NOTE: also set by task_python_integration_gpuonly.sh.
 if [ -z "${TVM_INTEGRATION_TESTSUITE_NAME:-}" ]; then
     TVM_INTEGRATION_TESTSUITE_NAME=python-integration
 fi
@@ -43,28 +44,29 @@ rm -rf lib
 make
 cd ../..
 
-run_pytest ${TVM_INTEGRATION_TESTSUITE_NAME}-extensions apps/extension/tests
+run_pytest ctypes ${TVM_INTEGRATION_TESTSUITE_NAME}-extensions apps/extension/tests
+run_pytest cython ${TVM_INTEGRATION_TESTSUITE_NAME}-extensions apps/extension/tests
 
 # Test dso plugin
 cd apps/dso_plugin_module
 rm -rf lib
 make
 cd ../..
-run_pytest ${TVM_INTEGRATION_TESTSUITE_NAME}-dso_plugin_module apps/dso_plugin_module
+run_pytest ctypes ${TVM_INTEGRATION_TESTSUITE_NAME}-dso_plugin_module apps/dso_plugin_module
+run_pytest cython ${TVM_INTEGRATION_TESTSUITE_NAME}-dso_plugin_module apps/dso_plugin_module
 
 # Do not enable TensorFlow op
 # TVM_FFI=cython sh prepare_and_test_tfop_module.sh
 # TVM_FFI=ctypes sh prepare_and_test_tfop_module.sh
 
-TVM_PYTEST_FFI_TYPES=ctypes run_pytest ${TVM_INTEGRATION_TESTSUITE_NAME} tests/python/integration
-TVM_PYTEST_FFI_TYPES=ctypes run_pytest ${TVM_INTEGRATION_TESTSUITE_NAME}-contrib tests/python/contrib
+run_pytest ctypes ${TVM_INTEGRATION_TESTSUITE_NAME} tests/python/integration
+run_pytest ctypes ${TVM_INTEGRATION_TESTSUITE_NAME}-contrib tests/python/contrib
 
 TVM_TEST_TARGETS="${TVM_RELAY_TEST_TARGETS:-llvm;cuda}" \
-                TVM_PYTEST_FFI_TYPES=ctypes \
-                run_pytest ${TVM_INTEGRATION_TESTSUITE_NAME}-relay tests/python/relay
+    run_pytest ctypes ${TVM_INTEGRATION_TESTSUITE_NAME}-relay tests/python/relay
 
 # Command line driver test
-TVM_PYTEST_FFI_TYPES=ctypes run_pytest ${TVM_INTEGRATION_TESTSUITE_NAME}-driver tests/python/driver
+run_pytest ctypes ${TVM_INTEGRATION_TESTSUITE_NAME}-driver tests/python/driver
 
 # Do not enable OpenGL
-# run_pytest ${TVM_INTEGRATION_TESTSUITE_NAME}-webgl tests/webgl
+# run_pytest ctypes ${TVM_INTEGRATION_TESTSUITE_NAME}-webgl tests/webgl
diff --git a/tests/scripts/task_python_microtvm.sh b/tests/scripts/task_python_microtvm.sh
index ddedff37c6c2..ba8018667895 100755
--- a/tests/scripts/task_python_microtvm.sh
+++ b/tests/scripts/task_python_microtvm.sh
@@ -25,4 +25,4 @@ source tests/scripts/setup-pytest-env.sh
 find . -type f -path "*.pyc" | xargs rm -f
 
 make cython3
-run_pytest python-microtvm-qemu tests/micro/qemu
+run_pytest ctypes python-microtvm-qemu tests/micro/qemu
diff --git a/tests/scripts/task_python_nightly.sh b/tests/scripts/task_python_nightly.sh
index bff0650b0bed..16c94dfdad31 100755
--- a/tests/scripts/task_python_nightly.sh
+++ b/tests/scripts/task_python_nightly.sh
@@ -27,4 +27,4 @@ make cython3
 # cleanup pycache
 find . -type f -path "*.pyc" | xargs rm -f
 
-TVM_PYTEST_FFI_TYPES=cython run_pytest python-topi-nightly tests/python/topi/nightly
+run_pytest cython python-topi-nightly tests/python/topi/nightly
diff --git a/tests/scripts/task_python_topi.sh b/tests/scripts/task_python_topi.sh
index e5eb6f28276a..9a5991e6a766 100755
--- a/tests/scripts/task_python_topi.sh
+++ b/tests/scripts/task_python_topi.sh
@@ -31,4 +31,4 @@ make cython3
 # cleanup pycache
 find . -type f -path "*.pyc" | xargs rm -f
 
-TVM_PYTHON_FFI_TYPES=cython run_pytest python-topi tests/python/topi/
+run_pytest cython python-topi tests/python/topi/
diff --git a/tests/scripts/task_python_unittest.sh b/tests/scripts/task_python_unittest.sh
index b63d79a99562..54a36f6dcfd4 100755
--- a/tests/scripts/task_python_unittest.sh
+++ b/tests/scripts/task_python_unittest.sh
@@ -25,9 +25,15 @@ source tests/scripts/setup-pytest-env.sh
 find . -type f -path "*.pyc" | xargs rm -f
 make cython3
 
+# NOTE: also set by task_python_unittest_gpuonly.sh.
 if [ -z "${TVM_UNITTEST_TESTSUITE_NAME:-}" ]; then
     TVM_UNITTEST_TESTSUITE_NAME=python-unittest
 fi
 
-run_pytest ${TVM_UNITTEST_TESTSUITE_NAME}-platform-minimal-test tests/python/all-platform-minimal-test
-run_pytest ${TVM_UNITTEST_TESTSUITE_NAME} tests/python/unittest
+# First run minimal test on both ctypes and cython.
+run_pytest ctypes ${TVM_UNITTEST_TESTSUITE_NAME}-platform-minimal-test tests/python/all-platform-minimal-test
+run_pytest cython ${TVM_UNITTEST_TESTSUITE_NAME}-platform-minimal-test tests/python/all-platform-minimal-test
+
+# Then run all unittests on both ctypes and cython.
+run_pytest ctypes ${TVM_UNITTEST_TESTSUITE_NAME} tests/python/unittest
+run_pytest cython ${TVM_UNITTEST_TESTSUITE_NAME} tests/python/unittest
diff --git a/tests/scripts/task_python_vta_fsim.sh b/tests/scripts/task_python_vta_fsim.sh
index 74d14db95d30..4074fb888351 100755
--- a/tests/scripts/task_python_vta_fsim.sh
+++ b/tests/scripts/task_python_vta_fsim.sh
@@ -40,10 +40,8 @@ cp ${VTA_HW_PATH}/config/fsim_sample.json ${VTA_HW_PATH}/config/vta_config.json
 
 # Run unit tests in functional/fast simulator
 echo "Running unittest in fsim..."
-TVM_PYTEST_FFI_TYPES=cython run_pytest python-vta-fsim-unittest \
-                    ${TVM_PATH}/vta/tests/python/unittest
+run_pytest cython python-vta-fsim-unittest ${TVM_PATH}/vta/tests/python/unittest
 
 # Run unit tests in functional/fast simulator
 echo "Running integration test in fsim..."
-TVM_PYTEST_FFI_TYPES=cython run_pytest python-vta-fsim-integration \
-                    ${TVM_PATH}/vta/tests/python/integration
+run_pytest cython python-vta-fsim-integration ${TVM_PATH}/vta/tests/python/integration
diff --git a/tests/scripts/task_python_vta_tsim.sh b/tests/scripts/task_python_vta_tsim.sh
index 4a5c9d7da877..3a6a35e5a06f 100755
--- a/tests/scripts/task_python_vta_tsim.sh
+++ b/tests/scripts/task_python_vta_tsim.sh
@@ -55,13 +55,11 @@ make -C ${VTA_HW_PATH}/hardware/chisel USE_THREADS=0 lib
 
 # Run unit tests in cycle accurate simulator
 echo "Running unittest in tsim..."
-TVM_PYTEST_FFI_TYPES=cython run_pytest python-vta-tsim-unittest \
-                    ${TVM_PATH}/vta/tests/python/unittest
+run_pytest cython python-vta-tsim-unittest ${TVM_PATH}/vta/tests/python/unittest
 
 # Run unit tests in cycle accurate simulator
 echo "Running integration test in tsim..."
-TVM_PYTEST_FFI_TYPES=cython run_pytest python-vta-tsim-integration \
-                    ${TVM_PATH}/vta/tests/python/integration
+run_pytest cython python-vta-tsim-integration ${TVM_PATH}/vta/tests/python/integration
 
 # Reset default fsim simulation
 cp ${VTA_HW_PATH}/config/fsim_sample.json ${VTA_HW_PATH}/config/vta_config.json

From 6b58321a094956249ab57cb3f7434d4eb73130d0 Mon Sep 17 00:00:00 2001
From: Trevor Morris <trevmorr@amazon.com>
Date: Thu, 11 Feb 2021 09:46:56 -0800
Subject: [PATCH 179/357] [BYOC][TRT] Fix small bug preventing TRT runtime
 compilation for versions < 6 (#7372)

* Fix small bug preventing TRT runtime compilation for versions < 6

* Trigger ci
---
 src/runtime/contrib/tensorrt/tensorrt_ops.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/runtime/contrib/tensorrt/tensorrt_ops.cc b/src/runtime/contrib/tensorrt/tensorrt_ops.cc
index 69bb1dccfb62..824178eaa619 100644
--- a/src/runtime/contrib/tensorrt/tensorrt_ops.cc
+++ b/src/runtime/contrib/tensorrt/tensorrt_ops.cc
@@ -447,7 +447,7 @@ class BatchNormOpConverter : public TensorRTOpConverter {
     nvinfer1::IScaleLayer* scale_layer = params->network->addScaleNd(
         *input, nvinfer1::ScaleMode::kCHANNEL, weight_shift, weight_scale, power, channel_dim);
 #else
-    ICHECK_EQ(input->getDimensions().nbDims(), 3);
+    ICHECK_EQ(input->getDimensions().nbDims, 3);
     nvinfer1::IScaleLayer* scale_layer = params->network->addScale(
         *input, nvinfer1::ScaleMode::kCHANNEL, weight_shift, weight_scale, power);
 #endif

From 8b9005f6e974bb2b6c6d618b0a56f8a9af56cacc Mon Sep 17 00:00:00 2001
From: Nicola Lancellotti <n.lancellotti@me.com>
Date: Thu, 11 Feb 2021 18:59:59 +0000
Subject: [PATCH 180/357] Make the TVM targets list available in Python (#7427)

* Make the TVM targets list available in Python

Change-Id: I8602723fe57aaf32cee5392d4387a637115dd363

* Rename the APIs to get target kinds

Change-Id: I2e6e32e025e3614a148a30a31e5a2c52fd3563cc
---
 include/tvm/target/target_kind.h            | 5 +++++
 python/tvm/target/target.py                 | 5 +++++
 src/target/target_kind.cc                   | 9 +++++++++
 tests/cpp/target_test.cc                    | 6 ++++++
 tests/python/unittest/test_target_target.py | 8 ++++++++
 5 files changed, 33 insertions(+)

diff --git a/include/tvm/target/target_kind.h b/include/tvm/target/target_kind.h
index c9ef736f7aee..72c41c6f4647 100644
--- a/include/tvm/target/target_kind.h
+++ b/include/tvm/target/target_kind.h
@@ -196,6 +196,11 @@ class TargetKindRegEntry {
   inline TargetKindRegEntry& add_attr_option(const String& key, ObjectRef default_value);
   /*! \brief Set name of the TargetKind to be the same as registry if it is empty */
   inline TargetKindRegEntry& set_name();
+  /*!
+   * \brief List all the entry names in the registry.
+   * \return The entry names.
+   */
+  TVM_DLL static Array<String> ListTargetKinds();
   /*!
    * \brief Register or get a new entry.
    * \param target_kind_name The name of the TargetKind.
diff --git a/python/tvm/target/target.py b/python/tvm/target/target.py
index 0ebf31ae6462..19fe09e539d8 100644
--- a/python/tvm/target/target.py
+++ b/python/tvm/target/target.py
@@ -147,6 +147,11 @@ def mattr(self):
     def libs(self):
         return list(self.attrs.get("libs", []))
 
+    @staticmethod
+    def list_kinds():
+        """Returns the list of available target names."""
+        return list(_ffi_api.ListTargetKinds())
+
 
 # TODO(@tvm-team): Deprecate the helper functions below. Encourage the usage of config dict instead.
 
diff --git a/src/target/target_kind.cc b/src/target/target_kind.cc
index cee708f80b5a..a3b1b207f290 100644
--- a/src/target/target_kind.cc
+++ b/src/target/target_kind.cc
@@ -23,6 +23,7 @@
  */
 #include <tvm/ir/expr.h>
 #include <tvm/runtime/device_api.h>
+#include <tvm/runtime/registry.h>
 #include <tvm/target/target.h>
 #include <tvm/target/target_kind.h>
 
@@ -44,6 +45,10 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable)
 
 using TargetKindRegistry = AttrRegistry<TargetKindRegEntry, TargetKind>;
 
+Array<String> TargetKindRegEntry::ListTargetKinds() {
+  return TargetKindRegistry::Global()->ListAllNames();
+}
+
 TargetKindRegEntry& TargetKindRegEntry::RegisterOrGet(const String& target_kind_name) {
   return TargetKindRegistry::Global()->RegisterOrGet(target_kind_name);
 }
@@ -307,4 +312,8 @@ TVM_REGISTER_TARGET_KIND("composite", kDLCPU)
     .add_attr_option<Target>("target_host")
     .add_attr_option<Array<Target>>("devices");
 
+/**********  Registry  **********/
+
+TVM_REGISTER_GLOBAL("target.ListTargetKinds").set_body_typed(TargetKindRegEntry::ListTargetKinds);
+
 }  // namespace tvm
diff --git a/tests/cpp/target_test.cc b/tests/cpp/target_test.cc
index a422f12b04d7..8dba462132ac 100644
--- a/tests/cpp/target_test.cc
+++ b/tests/cpp/target_test.cc
@@ -152,6 +152,12 @@ TEST(TargetCreation, DeduplicateKeys) {
   ICHECK_EQ(target->GetAttr<Bool>("link-params"), false);
 }
 
+TEST(TargetKindRegistryListTargetKinds, Basic) {
+  Array<String> names = TargetKindRegEntry::ListTargetKinds();
+  ICHECK_EQ(names.empty(), false);
+  ICHECK_EQ(std::count(std::begin(names), std::end(names), "llvm"), 1);
+}
+
 int main(int argc, char** argv) {
   testing::InitGoogleTest(&argc, argv);
   testing::FLAGS_gtest_death_test_style = "threadsafe";
diff --git a/tests/python/unittest/test_target_target.py b/tests/python/unittest/test_target_target.py
index 973f14958d9a..a0a60cb0c4fd 100644
--- a/tests/python/unittest/test_target_target.py
+++ b/tests/python/unittest/test_target_target.py
@@ -151,6 +151,13 @@ def test_target_tag_1():
     assert tgt.attrs["registers_per_block"] == 32768
 
 
+def test_list_kinds():
+    targets = tvm.target.Target.list_kinds()
+    assert len(targets) != 0
+    assert "llvm" in targets
+    assert all(isinstance(target_name, str) for target_name in targets)
+
+
 if __name__ == "__main__":
     test_target_dispatch()
     test_target_string_parse()
@@ -158,3 +165,4 @@ def test_target_tag_1():
     test_target_config()
     test_config_map()
     test_composite_target()
+    test_list_kinds()

From c52c176cfcdf798d8d7817620e38ecb50f653370 Mon Sep 17 00:00:00 2001
From: Grant Watson <2866599+grant-arm@users.noreply.github.com>
Date: Thu, 11 Feb 2021 20:29:36 +0000
Subject: [PATCH 181/357] Replace type punning with memcpy. (#7415)

The type punning in the existing code is undefined behaviour in C.
In particular, the existing code fails when running on Arm Cortex-M devices.
On Cortex-M, accessing a uint64_t that is not 8-byte aligned generates a hard fault.

Change-Id: I2aecaa220e581af7c91a8bc7886499d70e2aa6f2
---
 src/runtime/crt/common/ndarray.c              | 14 +++++++-------
 src/runtime/crt/graph_runtime/graph_runtime.c | 10 +++++-----
 2 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/src/runtime/crt/common/ndarray.c b/src/runtime/crt/common/ndarray.c
index 33dcaab0e77b..c90a4667903c 100644
--- a/src/runtime/crt/common/ndarray.c
+++ b/src/runtime/crt/common/ndarray.c
@@ -68,22 +68,22 @@ int TVMNDArray_Empty(int32_t ndim, const tvm_index_t* shape, DLDataType dtype, D
 int TVMNDArray_Load(TVMNDArray* ret, const char** strm) {
   int32_t status = 0;
   uint64_t header, reserved;
-  header = ((uint64_t*)*strm)[0];  // NOLINT(*)
+  memcpy(&header, *strm, sizeof(header));
   *strm += sizeof(header);
   if (header != kTVMNDArrayMagic) {
     fprintf(stderr, "Invalid DLTensor file format\n");
     status = -1;
   }
-  reserved = ((uint64_t*)*strm)[0];  // NOLINT(*)
+  memcpy(&reserved, *strm, sizeof(reserved));
   *strm += sizeof(reserved);
   DLContext ctx;
   int ndim;  // sizeof ndim should match dlpack
   DLDataType dtype;
-  ctx = ((DLContext*)*strm)[0];  // NOLINT(*)
+  memcpy(&ctx, *strm, sizeof(ctx));
   *strm += sizeof(ctx);
-  ndim = ((int*)*strm)[0];  // NOLINT(*)
+  memcpy(&ndim, *strm, sizeof(ndim));
   *strm += sizeof(ndim);
-  dtype = ((DLDataType*)*strm)[0];  // NOLINT(*)
+  memcpy(&dtype, *strm, sizeof(dtype));
   *strm += sizeof(dtype);
   if ((ndim < 0) || (ndim > TVM_CRT_MAX_NDIM)) {
     fprintf(stderr, "Invalid ndim=%d: expected to be 0 ~ %d.\n", ndim, TVM_CRT_MAX_NDIM);
@@ -97,7 +97,7 @@ int TVMNDArray_Load(TVMNDArray* ret, const char** strm) {
   int32_t idx;
   if (ndim != 0) {
     for (idx = 0; idx < ndim; idx++) {
-      shape[idx] = ((int64_t*)*strm)[0];  // NOLINT(*)
+      memcpy(&shape[idx], *strm, sizeof(int64_t));
       *strm += sizeof(shape[idx]);
     }
   }
@@ -111,7 +111,7 @@ int TVMNDArray_Load(TVMNDArray* ret, const char** strm) {
     num_elems *= ret->dl_tensor.shape[idx];
   }
   int64_t data_byte_size;
-  data_byte_size = ((int64_t*)*strm)[0];  // NOLINT(*)
+  memcpy(&data_byte_size, *strm, sizeof(data_byte_size));
   *strm += sizeof(data_byte_size);
   if (!(data_byte_size == num_elems * elem_bytes)) {
     fprintf(stderr,
diff --git a/src/runtime/crt/graph_runtime/graph_runtime.c b/src/runtime/crt/graph_runtime/graph_runtime.c
index 9f7b53c997f8..21b72f0e400c 100644
--- a/src/runtime/crt/graph_runtime/graph_runtime.c
+++ b/src/runtime/crt/graph_runtime/graph_runtime.c
@@ -777,13 +777,13 @@ int TVMGraphRuntime_LoadParams(TVMGraphRuntime* runtime, const char* param_blob,
   int status = 0;
   const char* bptr = param_blob;
   uint64_t header, reserved;
-  header = ((uint64_t*)bptr)[0];  // NOLINT(*)
+  memcpy(&header, bptr, sizeof(header));
   bptr += sizeof(header);
   if (header != kTVMNDArrayListMagic) {
     fprintf(stderr, "Invalid parameters file format");
     status = -1;
   }
-  reserved = ((uint64_t*)bptr)[0];  // NOLINT(*)
+  memcpy(&reserved, bptr, sizeof(reserved));
   bptr += sizeof(reserved);
 
   // read names
@@ -799,11 +799,11 @@ int TVMGraphRuntime_LoadParams(TVMGraphRuntime* runtime, const char* param_blob,
   memset(names, 0, TVM_CRT_STRLEN_NAME * runtime->nodes_count);
   uint64_t names_count;
   int idx;
-  names_count = ((uint64_t*)bptr)[0];  // NOLINT(*)
+  memcpy(&names_count, bptr, sizeof(names_count));
   bptr += sizeof(names_count);
   for (idx = 0; idx < names_count; idx++) {
     uint64_t name_length;
-    name_length = ((uint64_t*)bptr)[0];  // NOLINT(*)
+    memcpy(&name_length, bptr, sizeof(name_length));
     bptr += sizeof(name_length);
     if (name_length >= TVM_CRT_STRLEN_NAME) {
       fprintf(stderr, "Error: function name longer than expected.\n");
@@ -815,7 +815,7 @@ int TVMGraphRuntime_LoadParams(TVMGraphRuntime* runtime, const char* param_blob,
 
   // read sizes
   uint64_t sz;
-  sz = ((uint64_t*)bptr)[0];  // NOLINT(*)
+  memcpy(&sz, bptr, sizeof(sz));
   bptr += sizeof(sz);
   uint32_t size = sz;
   if (size != names_count) {

From 33829b3fa2f0212ecc5bcc322dd51fe2fde11e94 Mon Sep 17 00:00:00 2001
From: Trevor Morris <trevmorr@amazon.com>
Date: Thu, 11 Feb 2021 15:04:01 -0800
Subject: [PATCH 182/357] Fix double compile of runtime sources for TRT, ACL
 (#7436)

---
 cmake/modules/contrib/ArmComputeLib.cmake | 4 +++-
 cmake/modules/contrib/TensorRT.cmake      | 4 +++-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/cmake/modules/contrib/ArmComputeLib.cmake b/cmake/modules/contrib/ArmComputeLib.cmake
index 0a75f607acf3..ba082505125b 100644
--- a/cmake/modules/contrib/ArmComputeLib.cmake
+++ b/cmake/modules/contrib/ArmComputeLib.cmake
@@ -23,7 +23,9 @@ if(USE_ARM_COMPUTE_LIB)
     file(GLOB ACL_RELAY_CONTRIB_SRC src/relay/backend/contrib/arm_compute_lib/*.cc)
     file(GLOB ACL_RUNTIME_MODULE src/runtime/contrib/arm_compute_lib/acl_runtime.cc)
     list(APPEND COMPILER_SRCS ${ACL_RELAY_CONTRIB_SRC})
-    list(APPEND COMPILER_SRCS ${ACL_RUNTIME_MODULE})
+    if(NOT USE_ARM_COMPUTE_LIB_GRAPH_RUNTIME)
+        list(APPEND COMPILER_SRCS ${ACL_RUNTIME_MODULE})
+    endif()
     message(STATUS "Build with Arm Compute Library support...")
 endif()
 
diff --git a/cmake/modules/contrib/TensorRT.cmake b/cmake/modules/contrib/TensorRT.cmake
index 24a8241a2229..0c7e43c0fcf8 100644
--- a/cmake/modules/contrib/TensorRT.cmake
+++ b/cmake/modules/contrib/TensorRT.cmake
@@ -28,7 +28,9 @@ if(USE_TENSORRT_CODEGEN)
     file(GLOB RUNTIME_TENSORRT_SRCS src/runtime/contrib/tensorrt/tensorrt_runtime.cc)
     set_source_files_properties(${RUNTIME_TENSORRT_SRCS} PROPERTIES COMPILE_FLAGS "-Wno-deprecated-declarations")
     list(APPEND COMPILER_SRCS ${COMPILER_TENSORRT_SRCS})
-    list(APPEND COMPILER_SRCS ${RUNTIME_TENSORRT_SRCS})
+    if(NOT USE_TENSORRT_RUNTIME)
+        list(APPEND COMPILER_SRCS ${RUNTIME_TENSORRT_SRCS})
+    endif()
 endif()
 
 # TensorRT Runtime

From d5851dd952e22ee54733933e00e75948a2cf8685 Mon Sep 17 00:00:00 2001
From: Honghua Cao <49267856+Beya2019@users.noreply.github.com>
Date: Fri, 12 Feb 2021 10:13:42 +0800
Subject: [PATCH 183/357] [TIR][Printer] Fix SelectNode TIRTextPrinter bracket
 mismatch (#7405)

Co-authored-by: honghua.cao <honghua.cao@streamcomputing.com>
---
 src/printer/tir_text_printer.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/printer/tir_text_printer.cc b/src/printer/tir_text_printer.cc
index 4b0871ae2ce6..711af2a8fd08 100644
--- a/src/printer/tir_text_printer.cc
+++ b/src/printer/tir_text_printer.cc
@@ -301,7 +301,7 @@ Doc TIRTextPrinter::VisitExpr_(const NotNode* op) {
 Doc TIRTextPrinter::VisitExpr_(const SelectNode* op) {
   Doc doc;
   doc << "select(" << Print(op->condition) << ", " << Print(op->true_value) << ", "
-      << Print(op->false_value);
+      << Print(op->false_value) << ")";
   return doc;
 }
 

From c7c54de1d9a514f0d47698a70695667c30307c47 Mon Sep 17 00:00:00 2001
From: Xiyou Zhou <xiyou@octoml.ai>
Date: Thu, 11 Feb 2021 23:00:02 -0800
Subject: [PATCH 184/357] Update tags with minor fix (#7448)

---
 src/target/tag.cc | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/src/target/tag.cc b/src/target/tag.cc
index 94960894d45d..a931a288924e 100644
--- a/src/target/tag.cc
+++ b/src/target/tag.cc
@@ -68,7 +68,7 @@ Target TargetTag::AddTag(String name, Map<String, ObjectRef> config, bool overri
   return Target(config);
 }
 
-/**********  Register Target tags  ***********/
+/**********  Register Target tags  **********/
 
 #define TVM_REGISTER_CUDA_TAG(Name, Arch, SharedMem, RegPerBlock) \
   TVM_REGISTER_TARGET_TAG(Name).set_config({                      \
@@ -225,7 +225,6 @@ TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-560-ti", "sm_21", 49152, 32768);
 TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-550-ti", "sm_21", 49152, 32768);
 TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-460", "sm_21", 49152, 32768);
 TVM_REGISTER_CUDA_TAG("nvidia/geforce-gts-450", "sm_21", 49152, 32768);
-TVM_REGISTER_CUDA_TAG("nvidia/geforce-gts-450*", "sm_21", 49152, 32768);
 TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-590", "sm_20", 49152, 32768);
 TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-580", "sm_20", 49152, 32768);
 TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-570", "sm_20", 49152, 32768);
@@ -236,17 +235,15 @@ TVM_REGISTER_CUDA_TAG("nvidia/geforce-gt-740", "sm_30", 49152, 65536);
 TVM_REGISTER_CUDA_TAG("nvidia/geforce-gt-730", "sm_35", 49152, 65536);
 TVM_REGISTER_CUDA_TAG("nvidia/geforce-gt-730-ddr3,128bit", "sm_21", 49152, 32768);
 TVM_REGISTER_CUDA_TAG("nvidia/geforce-gt-720", "sm_35", 49152, 65536);
-TVM_REGISTER_CUDA_TAG("nvidia/geforce-gt-705*", "sm_35", 49152, 65536);
-TVM_REGISTER_CUDA_TAG("nvidia/geforce-gt-640-(gddr5)", "sm_35", 49152, 65536);
-TVM_REGISTER_CUDA_TAG("nvidia/geforce-gt-640-(gddr3)", "sm_21", 49152, 32768);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-gt-705", "sm_35", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-gt-640-gddr5", "sm_35", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-gt-640-gddr3", "sm_21", 49152, 32768);
 TVM_REGISTER_CUDA_TAG("nvidia/geforce-gt-630", "sm_21", 49152, 32768);
 TVM_REGISTER_CUDA_TAG("nvidia/geforce-gt-620", "sm_21", 49152, 32768);
 TVM_REGISTER_CUDA_TAG("nvidia/geforce-gt-610", "sm_21", 49152, 32768);
 TVM_REGISTER_CUDA_TAG("nvidia/geforce-gt-520", "sm_21", 49152, 32768);
 TVM_REGISTER_CUDA_TAG("nvidia/geforce-gt-440", "sm_21", 49152, 32768);
-TVM_REGISTER_CUDA_TAG("nvidia/geforce-gt-440*", "sm_21", 49152, 32768);
 TVM_REGISTER_CUDA_TAG("nvidia/geforce-gt-430", "sm_21", 49152, 32768);
-TVM_REGISTER_CUDA_TAG("nvidia/geforce-gt-430*", "sm_21", 49152, 32768);
 TVM_REGISTER_CUDA_TAG("nvidia/geforce-rtx-2080", "sm_75", 49152, 65536);
 TVM_REGISTER_CUDA_TAG("nvidia/geforce-rtx-2070", "sm_75", 49152, 65536);
 TVM_REGISTER_CUDA_TAG("nvidia/geforce-rtx-2060", "sm_75", 49152, 65536);
@@ -265,7 +262,7 @@ TVM_REGISTER_CUDA_TAG("nvidia/geforce-920m", "sm_35", 49152, 65536);
 TVM_REGISTER_CUDA_TAG("nvidia/geforce-910m", "sm_52", 49152, 65536);
 TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-880m", "sm_30", 49152, 65536);
 TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-870m", "sm_30", 49152, 65536);
-TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-860m-cuda-sm_30", "sm_30", 49152, 65536);
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-860m-sm-30", "sm_30", 49152, 65536);
 TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-860m-sm-50", "sm_50", 49152, 65536);
 TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-850m", "sm_50", 49152, 65536);
 TVM_REGISTER_CUDA_TAG("nvidia/geforce-840m", "sm_50", 49152, 65536);

From d769727dfd23baf85c5c9d5e0e472b6c1533ffda Mon Sep 17 00:00:00 2001
From: Jorn Tuyls <jtuyls@users.noreply.github.com>
Date: Fri, 12 Feb 2021 07:05:16 +0000
Subject: [PATCH 185/357] Add ROCm docker (#7422)

---
 docker/Dockerfile.demo_rocm           | 36 +++++++++++++++++++++++++++
 docker/bash.sh                        | 16 ++++++++----
 docker/install/ubuntu_install_rocm.sh |  6 ++++-
 docker/with_the_same_user             |  6 +++++
 4 files changed, 58 insertions(+), 6 deletions(-)
 create mode 100644 docker/Dockerfile.demo_rocm

diff --git a/docker/Dockerfile.demo_rocm b/docker/Dockerfile.demo_rocm
new file mode 100644
index 000000000000..c336be41934f
--- /dev/null
+++ b/docker/Dockerfile.demo_rocm
@@ -0,0 +1,36 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Demo docker for ROCm
+FROM ubuntu:18.04
+
+COPY install/ubuntu_install_core.sh /install/ubuntu_install_core.sh
+RUN bash /install/ubuntu_install_core.sh
+
+COPY install/ubuntu1804_install_python.sh /install/ubuntu1804_install_python.sh
+RUN bash /install/ubuntu1804_install_python.sh
+
+COPY install/ubuntu_install_python_package.sh /install/ubuntu_install_python_package.sh
+RUN bash /install/ubuntu_install_python_package.sh
+
+COPY install/ubuntu1804_install_llvm.sh /install/ubuntu1804_install_llvm.sh
+RUN bash /install/ubuntu1804_install_llvm.sh
+
+COPY install/ubuntu_install_rocm.sh /install/ubuntu_install_rocm.sh
+RUN bash /install/ubuntu_install_rocm.sh
+
+ENV PATH "${PATH}:/opt/rocm/bin"
diff --git a/docker/bash.sh b/docker/bash.sh
index a615d180b9ed..785b42870e24 100755
--- a/docker/bash.sh
+++ b/docker/bash.sh
@@ -88,6 +88,9 @@ else
     CI_ADDON_ENV=""
 fi
 
+DOCKER_ENVS=""
+DOCKER_DEVICES=""
+WORKSPACE_VOLUMES=""
 # If the Vitis-AI docker image is selected, expose the Xilinx FPGA devices and required volumes containing e.g. DSA's and overlays
 if [[ "${DOCKER_IMAGE_NAME}" == *"demo_vitis_ai"* && -d "/dev/shm" && -d "/opt/xilinx/dsa" && -d "/opt/xilinx/overlaybins" ]]; then
     WORKSPACE_VOLUMES="-v /dev/shm:/dev/shm -v /opt/xilinx/dsa:/opt/xilinx/dsa -v /opt/xilinx/overlaybins:/opt/xilinx/overlaybins"
@@ -102,13 +105,15 @@ if [[ "${DOCKER_IMAGE_NAME}" == *"demo_vitis_ai"* && -d "/dev/shm" && -d "/opt/x
     for i in ${RENDER_DRIVER} ;
     do
         DOCKER_DEVICES+="--device=$i "
-    done
-
-else
-    DOCKER_DEVICES=""
-    WORKSPACE_VOLUMES=""
+    done  
 fi
 
+# Add ROCm devices and set ROCM_ENABLED=1 which is used in the with_the_same_user script
+# to add the user to the video group
+if [[ "${DOCKER_IMAGE_NAME}" == *"rocm"* && -d "/dev/dri" ]]; then
+    DOCKER_DEVICES+="--device=/dev/kfd --device=/dev/dri "
+    DOCKER_ENVS+="-e ROCM_ENABLED=1 "
+fi
 
 # Print arguments.
 echo "WORKSPACE: ${WORKSPACE}"
@@ -143,6 +148,7 @@ ${DOCKER_BINARY} run --rm --pid=host\
     -e "CI_BUILD_GID=$(id -g)" \
     -e "CI_PYTEST_ADD_OPTIONS=$CI_PYTEST_ADD_OPTIONS" \
     -e "CI_IMAGE_NAME=${DOCKER_IMAGE_NAME}" \
+    ${DOCKER_ENVS} \
     ${CI_ADDON_ENV} \
     ${CUDA_ENV} \
     "${CI_DOCKER_EXTRA_PARAMS[@]}" \
diff --git a/docker/install/ubuntu_install_rocm.sh b/docker/install/ubuntu_install_rocm.sh
index 196f4134db6e..0945c582489f 100755
--- a/docker/install/ubuntu_install_rocm.sh
+++ b/docker/install/ubuntu_install_rocm.sh
@@ -23,4 +23,8 @@ set -o pipefail
 # Install ROCm cross compilation toolchain.
 wget -qO - http://repo.radeon.com/rocm/apt/debian/rocm.gpg.key | sudo apt-key add -
 echo deb [arch=amd64] http://repo.radeon.com/rocm/apt/debian/ xenial main > /etc/apt/sources.list.d/rocm.list
-apt-get update && apt-get install -y rocm-dev
+apt-get update && apt-get install -y \
+    rocm-dev \
+    lld && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
diff --git a/docker/with_the_same_user b/docker/with_the_same_user
index 459978409be5..a7ea8c009b58 100644
--- a/docker/with_the_same_user
+++ b/docker/with_the_same_user
@@ -41,6 +41,12 @@ getent passwd "${CI_BUILD_UID}" || adduser --gid "${CI_BUILD_GID}" --uid "${CI_B
     --gecos "${CI_BUILD_USER} (generated by with_the_same_user script)" \
     --disabled-password --home "${CI_BUILD_HOME}" --quiet "${CI_BUILD_USER}"
 usermod -a -G sudo "${CI_BUILD_USER}"
+
+# Add user to video group for ROCm
+if [[ ! -z $ROCM_ENABLED ]]; then
+  usermod -a -G video "${CI_BUILD_USER}"
+fi
+
 # This is a grotesque hack to get PYTEST_ADD_OPTS available to all task scripts.
 echo "${CI_BUILD_USER} ALL=(ALL) NOPASSWD:ALL" > /etc/sudoers.d/90-nopasswd-sudo
 

From e426c871b6044986be05d6635b50c3f606052dea Mon Sep 17 00:00:00 2001
From: Cody Yu <comaniac0422@gmail.com>
Date: Fri, 12 Feb 2021 10:15:17 -0800
Subject: [PATCH 186/357] [AutoScheduler] Fix distill record (#7439)

* [AutoScheduler] Fix distill record

* update comments
---
 python/tvm/auto_scheduler/measure_record.py   | 52 ++++++++++++-------
 tutorials/auto_scheduler/tune_network_arm.py  |  2 +-
 tutorials/auto_scheduler/tune_network_cuda.py |  2 +-
 tutorials/auto_scheduler/tune_network_mali.py |  2 +-
 tutorials/auto_scheduler/tune_network_x86.py  |  2 +-
 5 files changed, 38 insertions(+), 22 deletions(-)

diff --git a/python/tvm/auto_scheduler/measure_record.py b/python/tvm/auto_scheduler/measure_record.py
index 200d24fa7d50..ee671cd9b23a 100644
--- a/python/tvm/auto_scheduler/measure_record.py
+++ b/python/tvm/auto_scheduler/measure_record.py
@@ -286,26 +286,42 @@ def distill_record_file(in_file, out_file):
     if os.path.isfile(out_file):
         out_context = load_records(out_file)
         context = itertools.chain(context, out_context)
-    context, context_clone = itertools.tee(context)
-    best_context = ApplyHistoryBest(context)
-    best_set = set()
 
     def measure_input_str_key(inp):
         return _ffi_api.SerializeMeasureInput(inp)
 
-    for v in best_context.best_by_model.values():
-        best_set.add(measure_input_str_key(v[0]))
+    # Dict[target key,
+    #   Dict[workload hash,
+    #     Dict[workload args, (cost, (MeasureInput, MeasureResult))]]]
+    # Full type: Dict[str, Dict[str, Dict[Tuple, Tuple[float, Tuple[Measureinput, MeasureResult]]]]]
+    best_records = {}
 
-    for v in best_context.best_by_targetkey.values():
-        best_set.add(measure_input_str_key(v[0]))
+    for inp, res in context:
+        if res.error_no != 0:
+            continue
+
+        # Keep the best record for each target and workload.
+        costs = [x.value for x in res.costs if isinstance(x, tvm.tir.expr.FloatImm)]
+        cost = np.mean(costs)
+        for k in inp.task.target.keys:
+            entry, _, workload_args = ApplyHistoryBest.get_workload_entry(
+                best_records, k, inp.task.workload_key
+            )
+            if workload_args not in entry or cost < entry[workload_args][0]:
+                entry[workload_args] = (cost, (inp, res))
+
+    # Remove duplications by multiple target keys.
+    out_records = {}
+    for target_entry in best_records.values():
+        for workload_entry in target_entry.values():
+            for _, (inp, res) in workload_entry.values():
+                out_records[measure_input_str_key(inp)] = (inp, res)
 
     inputs = []
     results = []
-    for inp, res in context_clone:
-        if measure_input_str_key(inp) in best_set:
-            inputs.append(inp)
-            results.append(res)
-            best_set.remove(measure_input_str_key(inp))
+    for inp, res in out_records.values():
+        inputs.append(inp)
+        results.append(res)
 
     # create a new file and save the best records
     open(out_file, "w")
@@ -316,23 +332,23 @@ def measure_input_str_key(inp):
 def main():
     """The main function for CLI."""
     parser = argparse.ArgumentParser()
-    parser.add_argument("--mode", choices=["distill"], required=True)
-    parser.add_argument("--i", type=str, help="input file")
-    parser.add_argument("--o", type=str, default=None, help="output file")
+    parser.add_argument("--mode", choices=["distill"], default="distill")
+    parser.add_argument("-i", "--input", type=str, help="input file")
+    parser.add_argument("-o", "--output", type=str, default=None, help="output file")
 
     args = parser.parse_args()
     logging.basicConfig()
     logger.setLevel(logging.INFO)
 
     if args.mode == "distill":
-        args.o = args.o or args.i + ".best.json"
-        distill_record_file(args.i, args.o)
+        args.output = args.output or args.input + ".best.json"
+        distill_record_file(args.input, args.output)
 
 
 """
 Usage:
 * Distill the best entries from a large log file
-e.g. python -m tvm.auto_scheduler.measure_record --mode distill --i input.json
+e.g. python -m tvm.auto_scheduler.measure_record --mode distill -i input.json
 """
 if __name__ == "__main__":
     main()
diff --git a/tutorials/auto_scheduler/tune_network_arm.py b/tutorials/auto_scheduler/tune_network_arm.py
index f821c2e55d13..c4add79450e9 100644
--- a/tutorials/auto_scheduler/tune_network_arm.py
+++ b/tutorials/auto_scheduler/tune_network_arm.py
@@ -408,7 +408,7 @@ def tune_and_evaluate():
 # 1. During the tuning, the auto-scheduler needs to compile many programs and
 #    extract feature from them. This part is CPU-intensive,
 #    so a high-performance CPU with many cores is recommended for faster search.
-# 2. You can use :code:`python3 -m tvm.auto_scheduler.measure_record --mode distill --i log.json`
+# 2. You can use :code:`python3 -m tvm.auto_scheduler.measure_record --mode distill -i log.json`
 #    to distill the large log file and only save the best useful records.
 # 3. You can resume a search from the previous log file. You just need to
 #    add a new argument :code:`load_log_file` when creating the task scheduler
diff --git a/tutorials/auto_scheduler/tune_network_cuda.py b/tutorials/auto_scheduler/tune_network_cuda.py
index b09886941c74..5ed3ceef5ba0 100644
--- a/tutorials/auto_scheduler/tune_network_cuda.py
+++ b/tutorials/auto_scheduler/tune_network_cuda.py
@@ -299,7 +299,7 @@ def run_tuning():
 # 1. During the tuning, the auto-scheduler needs to compile many programs and
 #    extract feature from them. This part is CPU-intensive,
 #    so a high-performance CPU with many cores is recommended for faster search.
-# 2. You can use :code:`python3 -m tvm.auto_scheduler.measure_record --mode distill --i log.json`
+# 2. You can use :code:`python3 -m tvm.auto_scheduler.measure_record --mode distill -i log.json`
 #    to distill the large log file and only save the best useful records.
 # 3. You can resume a search from the previous log file. You just need to
 #    add a new argument :code:`load_log_file` when creating the task scheduler
diff --git a/tutorials/auto_scheduler/tune_network_mali.py b/tutorials/auto_scheduler/tune_network_mali.py
index d3fefa725d4c..ca1067b27c80 100644
--- a/tutorials/auto_scheduler/tune_network_mali.py
+++ b/tutorials/auto_scheduler/tune_network_mali.py
@@ -349,7 +349,7 @@ def tune_and_evaluate():
 # 1. During the tuning, the auto-scheduler needs to compile many programs and
 #    extract feature from them. This part is CPU-intensive,
 #    so a high-performance CPU with many cores is recommended for faster search.
-# 2. You can use :code:`python3 -m tvm.auto_scheduler.measure_record --mode distill --i log.json`
+# 2. You can use :code:`python3 -m tvm.auto_scheduler.measure_record --mode distill -i log.json`
 #    to distill the large log file and only save the best useful records.
 # 3. You can resume a search from the previous log file. You just need to
 #    add a new argument :code:`load_log_file` when creating the task scheduler
diff --git a/tutorials/auto_scheduler/tune_network_x86.py b/tutorials/auto_scheduler/tune_network_x86.py
index 7f96254b2f49..8526abbbe6ca 100644
--- a/tutorials/auto_scheduler/tune_network_x86.py
+++ b/tutorials/auto_scheduler/tune_network_x86.py
@@ -298,7 +298,7 @@ def run_tuning():
 # 1. During the tuning, the auto-scheduler needs to compile many programs and
 #    extract feature from them. This part is CPU-intensive,
 #    so a high-performance CPU with many cores is recommended for faster search.
-# 2. You can use :code:`python3 -m tvm.auto_scheduler.measure_record --mode distill --i log.json`
+# 2. You can use :code:`python3 -m tvm.auto_scheduler.measure_record --mode distill -i log.json`
 #    to distill the large log file and only save the best useful records.
 # 3. You can resume a search from the previous log file. You just need to
 #    add a new argument :code:`load_log_file` when creating the task scheduler

From b36bdf6ec859e888e0ac8cf54d09e8955b436cc8 Mon Sep 17 00:00:00 2001
From: Altan Haan <ahaan@octoml.ai>
Date: Fri, 12 Feb 2021 12:34:41 -0800
Subject: [PATCH 187/357] [Relay][Op][Bug] Fix missing return in scatter_nd
 cuda strategy (#7447)

* fix missing return in scatter_nd cuda strategy

* add Relay test for scatter_nd, fix documentation
---
 python/tvm/relay/op/strategy/cuda.py |  1 +
 python/tvm/relay/op/transform.py     |  2 +-
 tests/python/relay/test_op_level3.py | 83 +++++++++++++++-------------
 3 files changed, 48 insertions(+), 38 deletions(-)

diff --git a/python/tvm/relay/op/strategy/cuda.py b/python/tvm/relay/op/strategy/cuda.py
index 346e93445f1c..032d2dd2c8f1 100644
--- a/python/tvm/relay/op/strategy/cuda.py
+++ b/python/tvm/relay/op/strategy/cuda.py
@@ -826,6 +826,7 @@ def scatter_nd_cuda(attrs, inputs, out_type, target):
         name="scatter_nd.cuda",
         plevel=10,
     )
+    return strategy
 
 
 @sort_strategy.register(["cuda", "gpu"])
diff --git a/python/tvm/relay/op/transform.py b/python/tvm/relay/op/transform.py
index e9d081eb5fb6..d42ef477499f 100644
--- a/python/tvm/relay/op/transform.py
+++ b/python/tvm/relay/op/transform.py
@@ -321,7 +321,7 @@ def scatter_nd(data, indices, out_shape):
     indices : relay.Expr
         The index locations to update.
 
-    out_shape : relay.Expr
+    out_shape : Union[Tuple[int], List[int]]
         Output shape of the scatter.
 
     Returns
diff --git a/tests/python/relay/test_op_level3.py b/tests/python/relay/test_op_level3.py
index 559eb2462fa8..625c47240326 100644
--- a/tests/python/relay/test_op_level3.py
+++ b/tests/python/relay/test_op_level3.py
@@ -1377,41 +1377,50 @@ def verify_cumsum(data_np, np_out, axis=None, out_dtype=None, rtol=1e-5, atol=1e
     verify_cumsum(data, np.cumsum(data, dtype="int64"), out_dtype="int64")
 
 
+@tvm.testing.parametrize_targets
+def test_scatter_nd(target, ctx):
+    def verify_scatter_nd(data_np, indices_np, shape, ref_res, rtol=1e-5, atol=1e-5):
+        data = relay.var("data", shape=data_np.shape, dtype=str(data_np.dtype))
+        indices = relay.var("indices", shape=indices_np.shape, dtype=str(indices_np.dtype))
+
+        out = relay.op.scatter_nd(data, indices, shape)
+        func = relay.Function([data, indices], out)
+
+        for kind in ["graph", "debug"]:
+            intrp = relay.create_executor(kind, ctx=ctx, target=target)
+            op_res = intrp.evaluate(func)(data_np, indices_np)
+            tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=rtol, atol=atol)
+
+    data = np.array([2, 3, 0])
+    indices = np.array([[1, 1, 0], [0, 1, 0]])
+    shape = (2, 2)
+    out = np.array([[0, 0], [2, 3]])
+    verify_scatter_nd(data, indices, shape, out)
+
+    data = np.array([[[1, 2], [3, 4]], [[5, 6], [7, 8]]])
+    indices = np.array([[0, 1], [1, 1]])
+    shape = (2, 2, 2, 2)
+    out = np.array([[[[0, 0], [0, 0]], [[1, 2], [3, 4]]], [[[0, 0], [0, 0]], [[5, 6], [7, 8]]]])
+    verify_scatter_nd(data, indices, shape, out)
+
+    data = np.reshape(np.arange(1560 * 3), (3, 1560)).astype("float32")
+    indices = np.array([[1, 0, 0]])
+    shape = (2, 1560)
+    out = np.zeros(shape).astype("float32")
+    out[1, :] += data[0, :]
+    out[0, :] += data[1, :]
+    out[0, :] += data[2, :]
+    verify_scatter_nd(data, indices, shape, out)
+
+    data = np.ones((5, 3)).astype("float64")
+    indices = np.stack((np.random.randint(2, size=5), np.random.randint(7, size=5))).astype("int64")
+    shape = (2, 7, 3)
+    out = np.zeros(shape).astype("float64")
+    for i in range(indices.shape[1]):
+        for j in range(data.shape[1]):
+            out[indices[0, i], indices[1, i], j] += data[i, j]
+    verify_scatter_nd(data, indices, shape, out)
+
+
 if __name__ == "__main__":
-    test_cast()
-    test_zeros_ones()
-    test_unary_identity()
-    test_clip()
-    test_transpose_infer_type()
-    test_transpose()
-    test_reshape_infer_type()
-    test_reshape()
-    test_reshape_fail()
-    test_reshape_like_infer_type()
-    test_reshape_like()
-    test_take_infer_type()
-    test_take()
-    test_full_infer_type()
-    test_full()
-    test_full_like_infer_type()
-    test_full_like()
-    test_infer_type_leaky_relu()
-    test_infer_type_prelu()
-    test_squeeze()
-    test_squeeze_infer_type()
-    test_squeeze_bad_axes_infer_type()
-    test_split_infer_type()
-    test_arange()
-    test_meshgrid()
-    test_reverse()
-    test_stack()
-    test_tile()
-    test_repeat()
-    test_gather_nd()
-    test_isfinite()
-    test_isinf()
-    test_unravel_index()
-    test_sparse_to_dense()
-    test_fixed_point_multiply()
-    test_adv_index()
-    test_cumsum()
+    pytest.main([__file__])

From a1260cc19342c4db61c6942a11c2b2b2b58f8bad Mon Sep 17 00:00:00 2001
From: Trevor Morris <trevmorr@amazon.com>
Date: Fri, 12 Feb 2021 12:35:17 -0800
Subject: [PATCH 188/357] Make keras reshape less restrictive (#7446)

---
 python/tvm/relay/frontend/keras.py          | 31 ++++++---------------
 tests/python/frontend/keras/test_forward.py | 10 +++++++
 2 files changed, 18 insertions(+), 23 deletions(-)

diff --git a/python/tvm/relay/frontend/keras.py b/python/tvm/relay/frontend/keras.py
index 4bdca2c4d533..eb16bf2a25b4 100644
--- a/python/tvm/relay/frontend/keras.py
+++ b/python/tvm/relay/frontend/keras.py
@@ -864,29 +864,14 @@ def _convert_reshape(inexpr, keras_layer, etab):
     _check_data_format(keras_layer)
     inshape = keras_layer.input_shape  # includes batch
     tshape = keras_layer.target_shape  # no batch
-    if len(inshape) == 3 and len(tshape) == 1:
-        # (?, a, b) -> (-1, ab)
-        shape = (-1, tshape[0])
-    elif len(inshape) in [2, 3] and len(tshape) == 2:
-        # (?, cc) -> (-1, c, c)
-        # (?, a, b) -> (-1, c, c)
-        assert tshape[0] == tshape[1], "Only supports square target shapes, but got {}".format(
-            tshape
-        )
-        shape = (-1,) + tshape
-    else:
-        # (?, h, w, c) -> (-1, c, H, W)
-        # (?, h, w, c) -> (-1, c, hw)
-        # (?, hw, c) -> (-1, c, h, w)
-        ch = inshape[-1]
-        assert ch == tshape[-1], (
-            "Only supports last dimension in target shape being equal to "
-            "the channel number of input tensor."
-        )
-        if etab.data_layout == "NCHW":
-            shape = (-1, ch) + tshape[:-1]
-        else:
-            shape = (-1,) + tshape[:-1] + (ch,)
+    shape = (-1,) + tshape
+
+    if etab.data_layout == "NCHW" and (len(inshape) > 3 or len(tshape) > 2):
+        # Perform reshape in original NHWC format.
+        inexpr = _op.transpose(inexpr, [0] + list(range(2, len(inshape))) + [1])
+        inexpr = _op.reshape(inexpr, newshape=shape)
+        return _op.transpose(inexpr, axes=[0, -1] + list(range(1, len(shape) - 1)))
+
     return _op.reshape(inexpr, newshape=shape)
 
 
diff --git a/tests/python/frontend/keras/test_forward.py b/tests/python/frontend/keras/test_forward.py
index 05d890419aa4..561e444f077f 100644
--- a/tests/python/frontend/keras/test_forward.py
+++ b/tests/python/frontend/keras/test_forward.py
@@ -350,6 +350,16 @@ def test_forward_reshape(self, keras):
         x = keras.layers.Reshape(target_shape=(4, 4))(data)
         keras_model = keras.models.Model(data, x)
         verify_keras_frontend(keras_model, need_transpose=False)
+        # "non-square" target shape
+        data = keras.layers.Input(shape=(15,))
+        x = keras.layers.Reshape(target_shape=(5, 3))(data)
+        keras_model = keras.models.Model(data, x)
+        verify_keras_frontend(keras_model, need_transpose=False)
+        # modify channel dim
+        data = keras.layers.Input(shape=(3, 2, 4))
+        x = keras.layers.Reshape(target_shape=(3, 8))(data)
+        keras_model = keras.models.Model(data, x)
+        verify_keras_frontend(keras_model)
 
     def test_forward_crop(self, keras):
         data = keras.layers.Input(shape=(32, 32, 3))

From b8a83403ddca43b223d4e1cdd482e35b98171171 Mon Sep 17 00:00:00 2001
From: Andrew Reusch <areusch@octoml.ai>
Date: Fri, 12 Feb 2021 15:41:33 -0800
Subject: [PATCH 189/357] =?UTF-8?q?[=C2=B5TVM]=20Use=20standalone=5Fcrt=20?=
 =?UTF-8?q?build=20tree=20for=20all=20=C2=B5TVM=20builds=20(#7333)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* Build microTVM using standalone_crt in build tree.

* black format

* pylint

* try stashing entire standalone_crt in hopes it will not upset jenkins

* Put standalone_crt in correct Jenkinsfile stash bundle

* include build prefix

* switch to python script for expanding globs

* revert attempt to use globs in pack_libs, switch to building standalone_crt

* properly revert pack_lib changes

* fix typo

* retrigger CI

* revert pyproject.toml

* update Jenkinsfile approach to use task_ci_setup.sh
---
 cmake/modules/StandaloneCrt.cmake         |   8 +-
 python/tvm/micro/__init__.py              |   4 +-
 python/tvm/micro/build.py                 | 174 +++++++++++++++-------
 python/tvm/micro/compiler.py              |   5 +-
 tests/micro/qemu/test_zephyr.py           |   3 +-
 tests/python/unittest/test_crt.py         |  13 +-
 tests/python/unittest/test_link_params.py |  13 +-
 tests/scripts/task_ci_setup.sh            |   5 +
 tutorials/micro/micro_tflite.py           |  13 +-
 9 files changed, 148 insertions(+), 90 deletions(-)

diff --git a/cmake/modules/StandaloneCrt.cmake b/cmake/modules/StandaloneCrt.cmake
index 411d0383faf4..dc1b3b2665f2 100644
--- a/cmake/modules/StandaloneCrt.cmake
+++ b/cmake/modules/StandaloneCrt.cmake
@@ -45,12 +45,14 @@ if(USE_MICRO)
          "src/runtime/crt/common *.c -> src/runtime/crt/common"
          "src/runtime/crt/graph_runtime *.c -> src/runtime/crt/graph_runtime"
          "src/runtime/crt/graph_runtime_module *.c -> src/runtime/crt/graph_runtime_module"
-         "src/runtime/crt/host crt_config.h -> src/runtime/crt/host"
+         "src/runtime/crt/host crt_config.h -> template/host"
+         "src/runtime/crt/host *.cc -> template/host"
          "src/runtime/crt/memory *.c -> src/runtime/crt/memory"
          "src/runtime/crt/utvm_rpc_common *.cc -> src/runtime/crt/utvm_rpc_common"
          "src/runtime/crt/utvm_rpc_server *.cc -> src/runtime/crt/utvm_rpc_server"
          "src/runtime/minrpc *.h -> src/runtime/minrpc"
          "src/support generic_arena.h -> src/support"
+         "src/runtime/crt crt_config-template.h -> template"
          )
 
     set(standalone_crt_base "${CMAKE_CURRENT_BINARY_DIR}/standalone_crt")
@@ -101,9 +103,7 @@ if(USE_MICRO)
     endforeach()
 
     set(make_common_args
-        "DLPACK_INCLUDE_DIR=${CMAKE_SOURCE_DIR}/3rdparty/dlpack/include"
-        "TVM_INCLUDE_DIR=${CMAKE_CURRENT_BINARY_DIR}/standalone_crt/include"
-        "CRT_CONFIG=src/runtime/crt/host/crt_config.h"
+        "CRT_CONFIG=template/host/crt_config.h"
         "BUILD_DIR=${host_build_dir_abspath}"
         "EXTRA_CFLAGS=-fPIC"
         "EXTRA_CXXFLAGS=-fPIC"
diff --git a/python/tvm/micro/__init__.py b/python/tvm/micro/__init__.py
index a6e24343e378..299b143eb5de 100644
--- a/python/tvm/micro/__init__.py
+++ b/python/tvm/micro/__init__.py
@@ -17,8 +17,8 @@
 """MicroTVM module for bare-metal backends"""
 
 from .artifact import Artifact
-from .build import build_static_runtime, default_options, TVM_ROOT_DIR
-from .build import CRT_ROOT_DIR, Workspace
+from .build import build_static_runtime, default_options, get_standalone_crt_dir
+from .build import get_standalone_crt_lib, Workspace
 from .compiler import Compiler, DefaultCompiler, Flasher
 from .debugger import GdbRemoteDebugger
 from .micro_library import MicroLibrary
diff --git a/python/tvm/micro/build.py b/python/tvm/micro/build.py
index cad385b9b190..3837d423f8bd 100644
--- a/python/tvm/micro/build.py
+++ b/python/tvm/micro/build.py
@@ -21,9 +21,11 @@
 import logging
 import os
 import re
+import typing
 from tvm.contrib import utils
 
 from .micro_library import MicroLibrary
+from .._ffi import libinfo
 
 
 _LOG = logging.getLogger(__name__)
@@ -55,15 +57,62 @@ def path(self):
 CRT_RUNTIME_LIB_NAMES = ["utvm_rpc_server", "utvm_rpc_common", "common"]
 
 
-TVM_ROOT_DIR = os.path.realpath(os.path.join(os.path.dirname(__file__), "..", "..", ".."))
+STANDALONE_CRT_DIR = None
 
 
-CRT_ROOT_DIR = os.path.join(TVM_ROOT_DIR, "src", "runtime", "crt")
+class CrtNotFoundError(Exception):
+    """Raised when the standalone CRT dirtree cannot be found."""
 
 
-RUNTIME_LIB_SRC_DIRS = [os.path.join(CRT_ROOT_DIR, n) for n in CRT_RUNTIME_LIB_NAMES] + [
-    os.path.join(TVM_ROOT_DIR, "3rdparty/libcrc/src")
-]
+def get_standalone_crt_dir() -> str:
+    """Find the standalone_crt directory.
+
+    Though the C runtime source lives in the tvm tree, it is intended to be distributed with any
+    binary build of TVM. This source tree is intended to be integrated into user projects to run
+    models targeted with --runtime=c.
+
+    Returns
+    -------
+    str :
+        The path to the standalone_crt
+    """
+    global STANDALONE_CRT_DIR
+    if STANDALONE_CRT_DIR is None:
+        for path in libinfo.find_lib_path():
+            crt_path = os.path.join(os.path.dirname(path), "standalone_crt")
+            if os.path.isdir(crt_path):
+                STANDALONE_CRT_DIR = crt_path
+                break
+
+        else:
+            raise CrtNotFoundError()
+
+    return STANDALONE_CRT_DIR
+
+
+def get_standalone_crt_lib(name: str) -> str:
+    """Find a source library directory in the standalone_crt.
+
+    The standalone C runtime is split into various libraries (one per directory underneath
+    src/runtime/crt). This convenience function returns the full path to one of those libraries
+    located in get_standalone_crt_dir().
+
+    Parameters
+    ----------
+    name : str
+        Name of the library subdirectory underneath src/runtime/crt.
+
+    Returns
+    -------
+    str :
+         The full path to the the library.
+    """
+    return os.path.join(get_standalone_crt_dir(), "src", "runtime", "crt", name)
+
+
+def get_runtime_libs() -> str:
+    """Return abspath to all CRT directories which contain source (i.e. not header) files."""
+    return [get_standalone_crt_lib(n) for n in CRT_RUNTIME_LIB_NAMES]
 
 
 RUNTIME_SRC_REGEX = re.compile(r"^.*\.cc?$", re.IGNORECASE)
@@ -72,52 +121,73 @@ def path(self):
 _COMMON_CFLAGS = ["-Wall", "-Werror"]
 
 
-_CRT_DEFAULT_OPTIONS = {
-    "cflags": ["-std=c11"] + _COMMON_CFLAGS,
-    "ccflags": ["-std=c++11"] + _COMMON_CFLAGS,
-    "ldflags": ["-std=c++11"],
-    "include_dirs": [
-        f"{TVM_ROOT_DIR}/include",
-        f"{TVM_ROOT_DIR}/3rdparty/dlpack/include",
-        f"{TVM_ROOT_DIR}/3rdparty/libcrc/include",
-        f"{TVM_ROOT_DIR}/3rdparty/dmlc-core/include",
-        f"{CRT_ROOT_DIR}/include",
-    ],
-}
+def _build_default_compiler_options(standalone_crt_dir: typing.Optional[str] = None) -> str:
+    """Return a dict containing base compile flags for the CRT under gcc common to .
 
+    Parameters
+    ----------
+    standalone_crt_dir : Optional[str]
+        If given, the path to the standalone_crt
+    """
+    if standalone_crt_dir is None:
+        standalone_crt_dir = get_standalone_crt_dir()
+    return {
+        "cflags": ["-std=c11"] + _COMMON_CFLAGS,
+        "ccflags": ["-std=c++11"] + _COMMON_CFLAGS,
+        "ldflags": ["-std=c++11"],
+        "include_dirs": [os.path.join(standalone_crt_dir, "include")],
+    }
 
-_CRT_GENERATED_LIB_OPTIONS = copy.copy(_CRT_DEFAULT_OPTIONS)
 
+def default_options(crt_config_include_dir, standalone_crt_dir=None):
+    """Return default opts passed to Compile commands.
+
+    Parameters
+    ----------
+    crt_config_include_dir : str
+        Path to a directory containing crt_config.h for the target. This will be appended
+        to the include path for cflags and ccflags.
+    standalone_crt_dir : Optional[str]
+
+    Returns
+    -------
+    Dict :
+        A dictionary containing 3 subkeys, each whose value is _build_default_compiler_options()
+        plus additional customization.
+         - "bin_opts" - passed as "options" to Compiler.binary() when building MicroBinary.
+         - "lib_opts" - passed as "options" to Compiler.library() when building bundled CRT
+           libraries (or otherwise, non-generated libraries).
+         - "generated_lib_opts" - passed as "options" to Compiler.library() when building the
+           generated library.
+    """
+    bin_opts = _build_default_compiler_options(standalone_crt_dir)
+    bin_opts["include_dirs"].append(crt_config_include_dir)
 
-# Disable due to limitation in the TVM C codegen, which generates lots of local variable
-# declarations at the top of generated code without caring whether they're used.
-# Example:
-#   void* arg0 = (((TVMValue*)args)[0].v_handle);
-#   int32_t arg0_code = ((int32_t*)arg_type_ids)[(0)];
-_CRT_GENERATED_LIB_OPTIONS["cflags"].append("-Wno-unused-variable")
-_CRT_GENERATED_LIB_OPTIONS["ccflags"].append("-Wno-unused-variable")
+    lib_opts = _build_default_compiler_options(standalone_crt_dir)
+    lib_opts["cflags"] = ["-Wno-error=incompatible-pointer-types"]
+    lib_opts["include_dirs"].append(crt_config_include_dir)
 
+    generated_lib_opts = copy.copy(lib_opts)
 
-# Many TVM-intrinsic operators (i.e. expf, in particular)
-_CRT_GENERATED_LIB_OPTIONS["cflags"].append("-fno-builtin")
+    # Disable due to limitation in the TVM C codegen, which generates lots of local variable
+    # declarations at the top of generated code without caring whether they're used.
+    # Example:
+    #   void* arg0 = (((TVMValue*)args)[0].v_handle);
+    #   int32_t arg0_code = ((int32_t*)arg_type_ids)[(0)];
+    generated_lib_opts["cflags"].append("-Wno-unused-variable")
+    generated_lib_opts["ccflags"].append("-Wno-unused-variable")
 
+    # Many TVM-intrinsic operators (i.e. expf, in particular)
+    generated_lib_opts["cflags"].append("-fno-builtin")
 
-def default_options(target_include_dir):
-    """Return default opts passed to Compile commands."""
-    bin_opts = copy.deepcopy(_CRT_DEFAULT_OPTIONS)
-    bin_opts["include_dirs"].append(target_include_dir)
-    lib_opts = copy.deepcopy(bin_opts)
-    lib_opts["cflags"] = ["-Wno-error=incompatible-pointer-types"]
-    return {"bin_opts": bin_opts, "lib_opts": lib_opts}
+    return {"bin_opts": bin_opts, "lib_opts": lib_opts, "generated_lib_opts": generated_lib_opts}
 
 
 def build_static_runtime(
     workspace,
     compiler,
     module,
-    lib_opts=None,
-    bin_opts=None,
-    generated_lib_opts=None,
+    compiler_options,
     extra_libs=None,
 ):
     """Build the on-device runtime, statically linking the given modules.
@@ -130,15 +200,11 @@ def build_static_runtime(
     module : IRModule
         Module to statically link.
 
-    lib_opts : Optional[dict]
-        The `options` parameter passed to compiler.library().
-
-    bin_opts : Optional[dict]
-        The `options` parameter passed to compiler.binary().
-
-    generated_lib_opts : Optional[dict]
-        The `options` parameter passed to compiler.library() when compiling the generated TVM C
-        source module.
+    compiler_options : dict
+        The return value of tvm.micro.default_options(), with any keys overridden to inject
+        compiler options specific to this build. If not given, tvm.micro.default_options() is
+        used. This dict contains the `options` parameter passed to Compiler.library() and
+        Compiler.binary() at various stages in the compilation process.
 
     extra_libs : Optional[List[MicroLibrary|str]]
         If specified, extra libraries to be compiled into the binary. If a MicroLibrary, it is
@@ -151,18 +217,12 @@ def build_static_runtime(
     MicroBinary :
         The compiled runtime.
     """
-    lib_opts = _CRT_DEFAULT_OPTIONS if lib_opts is None else lib_opts
-    bin_opts = _CRT_DEFAULT_OPTIONS if bin_opts is None else bin_opts
-    generated_lib_opts = (
-        _CRT_GENERATED_LIB_OPTIONS if generated_lib_opts is None else generated_lib_opts
-    )
-
     mod_build_dir = workspace.relpath(os.path.join("build", "module"))
     os.makedirs(mod_build_dir)
     mod_src_dir = workspace.relpath(os.path.join("src", "module"))
 
     libs = []
-    for mod_or_src_dir in (extra_libs or []) + RUNTIME_LIB_SRC_DIRS:
+    for mod_or_src_dir in (extra_libs or []) + get_runtime_libs():
         if isinstance(mod_or_src_dir, MicroLibrary):
             libs.append(mod_or_src_dir)
             continue
@@ -177,7 +237,7 @@ def build_static_runtime(
             if RUNTIME_SRC_REGEX.match(p):
                 lib_srcs.append(os.path.join(lib_src_dir, p))
 
-        libs.append(compiler.library(lib_build_dir, lib_srcs, lib_opts))
+        libs.append(compiler.library(lib_build_dir, lib_srcs, compiler_options["lib_opts"]))
 
     mod_src_dir = workspace.relpath(os.path.join("src", "module"))
     os.makedirs(mod_src_dir)
@@ -185,10 +245,12 @@ def build_static_runtime(
         module.export_library(
             mod_build_dir,
             workspace_dir=mod_src_dir,
-            fcompile=lambda bdir, srcs, **kwargs: compiler.library(bdir, srcs, generated_lib_opts),
+            fcompile=lambda bdir, srcs, **kwargs: compiler.library(
+                bdir, srcs, compiler_options["generated_lib_opts"]
+            ),
         )
     )
 
     runtime_build_dir = workspace.relpath(f"build/runtime")
     os.makedirs(runtime_build_dir)
-    return compiler.binary(runtime_build_dir, libs, bin_opts)
+    return compiler.binary(runtime_build_dir, libs, compiler_options["bin_opts"])
diff --git a/python/tvm/micro/compiler.py b/python/tvm/micro/compiler.py
index f59ac8dbc4a0..d0431f42b01d 100644
--- a/python/tvm/micro/compiler.py
+++ b/python/tvm/micro/compiler.py
@@ -24,7 +24,6 @@
 import subprocess
 
 import tvm.target
-from . import build
 from . import class_factory
 from . import debugger
 from . import transport
@@ -291,7 +290,9 @@ def binary(self, output, objects, options=None, link_main=True, main_options=Non
         args.extend(["-g", "-o", output_abspath])
 
         if link_main:
-            host_main_srcs = glob.glob(os.path.join(build.CRT_ROOT_DIR, "host", "*.cc"))
+            host_main_srcs = glob.glob(
+                os.path.join(tvm.micro.get_standalone_crt_dir(), "template", "host", "*.cc")
+            )
             if main_options:
                 main_lib = self.library(os.path.join(output, "host"), host_main_srcs, main_options)
                 for lib_name in main_lib.library_files:
diff --git a/tests/micro/qemu/test_zephyr.py b/tests/micro/qemu/test_zephyr.py
index 865c7f88806f..4c8bd5f5dae8 100644
--- a/tests/micro/qemu/test_zephyr.py
+++ b/tests/micro/qemu/test_zephyr.py
@@ -92,8 +92,7 @@ def _make_session(model, target, zephyr_board, west_cmd, mod):
             workspace,
             compiler,
             mod,
-            lib_opts=opts["lib_opts"],
-            bin_opts=opts["bin_opts"],
+            opts,
         )
         if os.path.exists(prev_build):
             os.unlink(prev_build)
diff --git a/tests/python/unittest/test_crt.py b/tests/python/unittest/test_crt.py
index 659d1908096b..3c68b4090309 100644
--- a/tests/python/unittest/test_crt.py
+++ b/tests/python/unittest/test_crt.py
@@ -50,18 +50,15 @@ def _make_sess_from_op(workspace, op_name, sched, arg_bufs):
 
 def _make_session(workspace, mod):
     compiler = tvm.micro.DefaultCompiler(target=TARGET)
-    opts = tvm.micro.default_options(os.path.join(tvm.micro.CRT_ROOT_DIR, "host"))
+    opts = tvm.micro.default_options(
+        os.path.join(tvm.micro.get_standalone_crt_dir(), "template", "host")
+    )
     micro_binary = tvm.micro.build_static_runtime(
-        # the x86 compiler *expects* you to give the exact same dictionary for both
-        # lib_opts and bin_opts. so the library compiler is mutating lib_opts and
-        # the binary compiler is expecting those mutations to be in bin_opts.
-        # TODO(weberlo) fix this very bizarre behavior
         workspace,
         compiler,
         mod,
-        lib_opts=opts["bin_opts"],
-        bin_opts=opts["bin_opts"],
-        extra_libs=[os.path.join(tvm.micro.build.CRT_ROOT_DIR, "memory")],
+        opts,
+        extra_libs=[tvm.micro.get_standalone_crt_lib("memory")],
     )
 
     flasher_kw = {
diff --git a/tests/python/unittest/test_link_params.py b/tests/python/unittest/test_link_params.py
index 52d7a27838d7..80ea11f6d9aa 100644
--- a/tests/python/unittest/test_link_params.py
+++ b/tests/python/unittest/test_link_params.py
@@ -354,21 +354,18 @@ def test_crt_link_params():
 
             workspace = tvm.micro.Workspace()
             compiler = tvm.micro.DefaultCompiler(target=target)
-            opts = tvm.micro.default_options(os.path.join(tvm.micro.CRT_ROOT_DIR, "host"))
+            opts = tvm.micro.default_options(
+                os.path.join(tvm.micro.get_standalone_crt_dir(), "template", "host")
+            )
             opts["bin_opts"]["ldflags"].append("-DTVM_HOST_USE_GRAPH_RUNTIME_MODULE")
 
             micro_binary = tvm.micro.build_static_runtime(
-                # the x86 compiler *expects* you to give the exact same dictionary for both
-                # lib_opts and bin_opts. so the library compiler is mutating lib_opts and
-                # the binary compiler is expecting those mutations to be in bin_opts.
-                # TODO(weberlo) fix this very bizarre behavior
                 workspace,
                 compiler,
                 lib,
-                lib_opts=opts["bin_opts"],
-                bin_opts=opts["bin_opts"],
+                compiler_options=opts,
                 extra_libs=[
-                    os.path.join(tvm.micro.CRT_ROOT_DIR, m)
+                    tvm.micro.get_standalone_crt_lib(m)
                     for m in ("memory", "graph_runtime_module", "graph_runtime")
                 ],
             )
diff --git a/tests/scripts/task_ci_setup.sh b/tests/scripts/task_ci_setup.sh
index f48ed49a2266..17838c58a83c 100755
--- a/tests/scripts/task_ci_setup.sh
+++ b/tests/scripts/task_ci_setup.sh
@@ -31,3 +31,8 @@ set -o pipefail
 echo "Addtiional setup in" ${CI_IMAGE_NAME}
 
 python3 -m pip install --user tlcpack-sphinx-addon==0.1.4 synr==0.2.1
+
+# Rebuild standalone_crt in build/ tree. This file is not currently archived by pack_lib() in
+# Jenkinsfile. We expect config.cmake to be present from pack_lib().
+# TODO(areusch): Make pack_lib() pack all the data dependencies of TVM.
+(cd build && cmake .. && make standalone_crt)
diff --git a/tutorials/micro/micro_tflite.py b/tutorials/micro/micro_tflite.py
index c979216d0c6b..673985e24d84 100644
--- a/tutorials/micro/micro_tflite.py
+++ b/tutorials/micro/micro_tflite.py
@@ -207,7 +207,9 @@
 # First, compile a static microTVM runtime for the targeted device. In this case, the host simulated
 # device is used.
 compiler = tvm.micro.DefaultCompiler(target=TARGET)
-opts = tvm.micro.default_options(os.path.join(tvm.micro.CRT_ROOT_DIR, "host"))
+opts = tvm.micro.default_options(
+    os.path.join(tvm.micro.get_standalone_crt_dir(), "template", "host")
+)
 
 # %%
 # Compiling for physical hardware
@@ -230,18 +232,13 @@
 
 workspace = tvm.micro.Workspace()
 micro_binary = tvm.micro.build_static_runtime(
-    # the x86 compiler *expects* you to give the exact same dictionary for both
-    # lib_opts and bin_opts. so the library compiler is mutating lib_opts and
-    # the binary compiler is expecting those mutations to be in bin_opts.
-    # TODO(weberlo) fix this very bizarre behavior
     workspace,
     compiler,
     c_mod,
-    lib_opts=opts["lib_opts"],
-    bin_opts=opts["bin_opts"],
+    opts,
     # Use the microTVM memory manager. If, in your main.cc, you change TVMPlatformMemoryAllocate and
     # TVMPlatformMemoryFree to use e.g. malloc() and free(), you can omit this extra library.
-    extra_libs=[os.path.join(tvm.micro.build.CRT_ROOT_DIR, "memory")],
+    extra_libs=[tvm.micro.get_standalone_crt_lib("memory")],
 )
 
 

From 4e211a735221a9b9d188422025e2d464e37b3c96 Mon Sep 17 00:00:00 2001
From: Matthew Brookhart <mbrookhart@octoml.ai>
Date: Fri, 12 Feb 2021 21:14:56 -0700
Subject: [PATCH 190/357] [ONNX] Make the ONNX Importer More Static (#7429)

* Construct static Ops if inputs are Constant

* Expose FoldConstant as a function in addition to the pass

* refactor onnx importer to do more static imports by constant folding

fix pylint

* fix test regressions

* fix style, two bugs

* pipe freeze_params through sub_graphs when importing loops and control flow
---
 python/tvm/relay/frontend/common.py       |   6 +
 python/tvm/relay/frontend/onnx.py         | 198 +++++++++++++---------
 python/tvm/relay/op/image/image.py        |   4 +-
 python/tvm/relay/op/nn/nn.py              |  16 +-
 python/tvm/relay/op/tensor.py             |   6 +-
 python/tvm/relay/op/transform.py          |  18 +-
 python/tvm/relay/transform/transform.py   |  17 ++
 src/relay/transforms/fold_constant.cc     |   2 +
 tests/python/relay/test_op_grad_level3.py |   2 +-
 9 files changed, 180 insertions(+), 89 deletions(-)

diff --git a/python/tvm/relay/frontend/common.py b/python/tvm/relay/frontend/common.py
index 6323c63ab9b3..2db420a40992 100644
--- a/python/tvm/relay/frontend/common.py
+++ b/python/tvm/relay/frontend/common.py
@@ -491,6 +491,12 @@ def infer_type(node, mod=None):
     return ret
 
 
+def fold_constant(node, mod=None):
+    if mod is None:
+        mod = IRModule.from_expr(node)
+    return _transform.FoldConstantExpr(node, mod)
+
+
 def infer_channels(inputs, transpose=False):
     """A hack for getting 'channels' or 'units' since caffe2 does not provide
     these attributes. We check the shape of weights provided to get the number.
diff --git a/python/tvm/relay/frontend/onnx.py b/python/tvm/relay/frontend/onnx.py
index c9140d782a2d..fb3d1c923561 100644
--- a/python/tvm/relay/frontend/onnx.py
+++ b/python/tvm/relay/frontend/onnx.py
@@ -34,7 +34,7 @@
 from .. import ty as _ty
 
 from .common import AttrCvt, Renamer
-from .common import get_relay_op, new_var, infer_shape, infer_channels, infer_value
+from .common import get_relay_op, new_var, infer_shape, infer_channels, infer_value, fold_constant
 from .common import infer_type, get_name
 
 
@@ -364,7 +364,7 @@ def autopad(data, strides, kernel_shape, dilations, ndim, pad_type="constant", d
         ),
         dtype="int64",
     )
-    shape = _op.strided_slice(_op.shape_of(data, dtype="int64"), [2], [ndim])
+    shape = _op.strided_slice(shape_of(data, dtype="int64"), [2], [ndim])
     # get input shape
 
     # set up integer constants
@@ -545,9 +545,9 @@ class MatMul(OnnxOpConverter):
     def _impl_v1(cls, inputs, attr, params):
         assert len(inputs) == 2, "MatMul op take 2 inputs, {} given".format(len(inputs))
         # Need to check input shape as batch matmul must be supported.
-        a_shape = _op.shape_of(inputs[0])
+        a_shape = shape_of(inputs[0])
         a_rank = infer_shape(a_shape)[0]
-        b_shape = _op.shape_of(inputs[1])
+        b_shape = shape_of(inputs[1])
         b_rank = infer_shape(b_shape)[0]
         # When performing a batch matmul, we need to properly handle N-dim shapes.
         if a_rank > 2 or b_rank > 2:
@@ -555,9 +555,13 @@ def _impl_v1(cls, inputs, attr, params):
             def flatten_to_3d(x, x_shape):
                 ndims = infer_shape(x_shape)[0]
                 newshape = _op.concatenate(
-                    [_expr.const([-1]), _op.strided_slice(x_shape, [ndims - 2], [ndims])], 0
+                    [
+                        _expr.const([-1], dtype=infer_type(x_shape).checked_type.dtype),
+                        _op.strided_slice(x_shape, [ndims - 2], [ndims]),
+                    ],
+                    0,
                 )
-                out = _op.reshape(x, newshape)
+                out = _op.reshape(x, fold_constant(newshape))
                 return out
 
             # Convert a and b into 3 dimensional tensors.
@@ -598,7 +602,7 @@ def flatten_to_3d(x, x_shape):
                 ],
                 0,
             )
-            return _op.reshape(output, final_shape)
+            return _op.reshape(output, fold_constant(final_shape))
         # Otherwise a simple dense op will get the job done.
         input_1_t = _op.transpose(inputs[1], axes=(1, 0))
         return _op.nn.dense(inputs[0], input_1_t)
@@ -646,7 +650,7 @@ def _impl_v11(cls, inputs, attr, params):
         multiplier = _op.concatenate(
             [_expr.const([1, 1], dtype="int64"), _expr.const(list(strides), dtype="int64")], axis=0
         )
-        total_output_shape = multiplier * _op.shape_of(data, dtype="int64")
+        total_output_shape = multiplier * shape_of(data, dtype="int64")
         # Add extra dimensions from kernel size and stride mismatch
         total_output_shape += _op.concatenate(
             [_expr.const([0, 0], "int64"), _expr.const(list(kernel_shape), "int64")], axis=0
@@ -792,11 +796,11 @@ def _impl_v2(cls, inputs, attr, params):
     def _impl_v11(cls, inputs, attr, params):
         pads = inputs[1]
         if len(inputs) == 3:
-            value = _op.take(inputs[2], _op.const(0))
+            value = fold_constant(_op.take(inputs[2], _op.const(0)))
         else:
             value = 0
 
-        pad_width_expr = _op.transpose(_op.reshape(pads, (2, -1)))
+        pad_width_expr = fold_constant(_op.transpose(_op.reshape(pads, (2, -1))))
         pad_mode = attr.get("mode", b"constant").decode("utf-8")
 
         if not pad_mode in ["constant", "edge", "reflect"]:
@@ -823,7 +827,7 @@ class Prelu(OnnxOpConverter):
     @classmethod
     def _impl_v1(cls, inputs, attr, params):
         assert len(inputs) == 2, "Prelu need 2 inputs, {} given".format(len(inputs))
-        input_shape = _op.shape_of(inputs[0])
+        input_shape = shape_of(inputs[0])
         alpha = _op.broadcast_to_like(inputs[1], inputs[0])
         alpha = _op.reshape(alpha, [-1])
         output = _op.nn.prelu(_op.reshape(inputs[0], [-1]), alpha, axis=0)
@@ -875,7 +879,6 @@ class DepthToSpace(OnnxOpConverter):
 
     @classmethod
     def _impl_v11(cls, inputs, attr, params):
-
         block_size = int(attr["blocksize"])
         mode = attr.get("mode", b"DCR").decode("utf-8")
         return _op.nn.depth_to_space(inputs[0], block_size, mode=mode)
@@ -1015,8 +1018,9 @@ def _impl_v9(cls, inputs, attr, params):
                 scales = params[inputs[1].name_hint].asnumpy()
             else:
                 scales = inputs[1]
-
-        if not isinstance(scales, _expr.Call):
+        if isinstance(scales, _expr.Constant):
+            scales = list(scales.data.asnumpy())
+        if not isinstance(scales, _expr.Expr):
             assert scales[0] == 1.0 and scales[1] == 1.0
 
         mode = attr.get("mode")
@@ -1067,12 +1071,20 @@ def _impl_v9(cls, inputs, attr, params):
         return out
 
 
+def shape_of(x, dtype="int64"):
+    ttype = infer_type(x).checked_type
+    if not _ty.is_dynamic(ttype):
+        shape = list(ttype.shape)
+        return _expr.const(shape, dtype)
+    return _op.shape_of(x, dtype)
+
+
 class Shape(OnnxOpConverter):
     """Operator converter for Shape."""
 
     @classmethod
     def _impl_v1(cls, inputs, attr, params):
-        return _op.shape_of(inputs[0], "int64")
+        return shape_of(inputs[0], "int64")
 
 
 class CumSum(OnnxOpConverter):
@@ -1204,7 +1216,7 @@ def _impl_v10(cls, inputs, attr, params):
 
         # Update the starts and ends according to axes if required.
         if axes is not None:
-            data_shape = _op.shape_of(inputs[0], dtype=infer_type(ends).checked_type.dtype)
+            data_shape = shape_of(inputs[0], dtype=infer_type(ends).checked_type.dtype)
             starts = _op.scatter(
                 _op.const([0] * data_rank, dtype=infer_type(starts).checked_type.dtype),
                 axes,
@@ -1223,7 +1235,9 @@ def _impl_v10(cls, inputs, attr, params):
         if steps is None:
             steps = _op.const([1] * data_rank, dtype=infer_type(starts).checked_type.dtype)
 
-        return _op.strided_slice(inputs[0], starts, ends, steps)
+        return _op.strided_slice(
+            inputs[0], fold_constant(starts), fold_constant(ends), fold_constant(steps)
+        )
 
 
 class Gather(OnnxOpConverter):
@@ -1531,6 +1545,19 @@ def _impl_v9(cls, inputs, attr, params):
         return output
 
 
+class Constant(OnnxOpConverter):
+    """Operator converter for ConstantOfShape."""
+
+    @classmethod
+    def _impl_v9(cls, inputs, attr, params):
+        if "value" not in attr:
+            raise "No Value in Constant"
+        np_value = get_numpy(attr.pop("value"))
+        dtype = np_value.dtype.name
+        value = _expr.const(np_value, dtype)
+        return value
+
+
 class Sign(OnnxOpConverter):
     """Operator converter for Sign."""
 
@@ -1591,12 +1618,14 @@ def _impl_v9(cls, inputs, attr, params):
         # to that shape.
         max_rank = max(ranks)
         max_rank_idxs = [i for i, x in enumerate(ranks) if x == max_rank]
-        broadcast_shape = _op.shape_of(inputs[max_rank_idxs[0]])
+        broadcast_shape = shape_of(inputs[max_rank_idxs[0]])
         # If two or more inputs have the same rank, compute the broadcast
         # shape by taking the maximum value of each dimensions.
         if len(max_rank_idxs) > 1:
             for idx in max_rank_idxs:
-                broadcast_shape = _op.maximum(broadcast_shape, _op.shape_of(inputs[idx]))
+                broadcast_shape = _op.maximum(broadcast_shape, shape_of(inputs[idx]))
+
+        broadcast_shape = fold_constant(broadcast_shape)
 
         condition = _op.broadcast_to(inputs[0], broadcast_shape)
         x = _op.broadcast_to(inputs[1], broadcast_shape)
@@ -1618,7 +1647,7 @@ class Expand(OnnxOpConverter):
     @classmethod
     def _impl_v8(cls, inputs, attr, params):
         dtype = infer_type(inputs[1]).checked_type.dtype
-        in_shape = _op.shape_of(inputs[0], dtype=dtype)
+        in_shape = shape_of(inputs[0], dtype=dtype)
         shape = inputs[1]
 
         # Currently 'op.broadcast_to' expect the rank of the given 'shape'
@@ -1667,7 +1696,7 @@ def expand_shape(in_shape, shape):
             new_shape = _op.maximum(in_shape, shape)
             return new_shape
 
-        shape = expand_shape(in_shape, shape)
+        shape = fold_constant(expand_shape(in_shape, shape))
         return _op.broadcast_to(inputs[0], shape=shape)
 
 
@@ -1942,10 +1971,9 @@ def _impl_v10(cls, inputs, attr, params):
             )
 
         scale = inputs[1]
-        size = _op.cast(_op.shape_of(inputs[0]), infer_type(scale).checked_type.dtype) * scale
-
+        size = _op.cast(shape_of(inputs[0]), infer_type(scale).checked_type.dtype) * scale
         layout = "NCHW"  # ONNX assumes NCHW layout
-        out_size = _op.strided_slice(size, [2], [4])
+        out_size = fold_constant(_op.strided_slice(size, [2], [4]))
         return _op.image.resize(inputs[0], out_size, layout, method, "asymmetric")
 
     @classmethod
@@ -1969,7 +1997,7 @@ def _impl_v11(cls, inputs, attr, params):
             size = inputs[3]
         else:
             assert len(scale_shape) != 0, "One of scale or size should be passed."
-            size = _op.cast(_op.shape_of(inputs[0]), infer_type(scale).checked_type.dtype) * scale
+            size = _op.cast(shape_of(inputs[0]), infer_type(scale).checked_type.dtype) * scale
 
         coord_trans = attr.get("coordinate_transformation_mode")
         if coord_trans in [b"pytorch_half_pixel", b"half_pixel"]:
@@ -1983,7 +2011,7 @@ def _impl_v11(cls, inputs, attr, params):
                 "Unsupported coordinate_transformation_mode: {}".format(coord_trans)
             )
         layout = "NCHW"  # ONNX assumes NCHW layout
-        out_size = _op.strided_slice(size, [2], [4])
+        out_size = fold_constant(_op.strided_slice(size, [2], [4]))
         return _op.image.resize(inputs[0], out_size, layout, method, coord_trans)
 
 
@@ -2152,7 +2180,9 @@ def cond_fn(*loop_inputs):
 
         # Get the current graph proto and create a clone for the subgraph
         graph_scope = GraphProto.current
-        subgraph_scope = GraphProto(graph_scope._shape, graph_scope._dtype)
+        subgraph_scope = GraphProto(
+            graph_scope._shape, graph_scope._dtype, graph_scope._freeze_params
+        )
         # Load nodes from outer graph into inner graph.
         subgraph_scope._nodes = graph_scope._nodes.copy()
 
@@ -2246,7 +2276,7 @@ def body_fn(*loop_inputs):
                 expand_scan = _op.expand_dims(new_scan, axis=0)
                 # For non scalar outputs we need to broadcast the initial value.
                 if rank > 0:
-                    new_scan_shape = _op.shape_of(new_scan, dtype=iter_dtype)
+                    new_scan_shape = shape_of(new_scan, dtype=iter_dtype)
                     scan_broadcast = _op.concatenate(
                         [_op.reshape(loop_count, [1]), new_scan_shape], axis=0
                     )
@@ -2264,7 +2294,7 @@ def body_fn(*loop_inputs):
             return [loop_count, max_count, new_cond] + new_loop_vars + combined_scan_outputs
 
         # Create the loop function.
-        loop = _loops.while_loop(cond_fn, loop_vars + scan_output_vars, body_fn)
+        loop = fold_constant(_loops.while_loop(cond_fn, loop_vars + scan_output_vars, body_fn))
 
         # Now need to run initial values through the graph.
         init_count = _expr.const(0, dtype=iter_dtype)
@@ -2287,6 +2317,7 @@ def body_fn(*loop_inputs):
         # Update outer graph with constants found in the subgraph.
         free_vars = analysis.free_vars(loop)
         graph_scope._params.update(subgraph_scope._params)
+        graph_scope._nodes.update(subgraph_scope._nodes)
         for var in free_vars:
             graph_scope._nodes.update({var.name_hint: var})
         return outputs
@@ -2307,9 +2338,9 @@ def _impl_v1(cls, inputs, attr, params):
 
         # Create graph converters for both branches.
         graph_scope = GraphProto.current
-        then_graph = GraphProto(graph_scope._shape, graph_scope._dtype)
+        then_graph = GraphProto(graph_scope._shape, graph_scope._dtype, graph_scope._freeze_params)
         then_graph._nodes = graph_scope._nodes.copy()
-        else_graph = GraphProto(graph_scope._shape, graph_scope._dtype)
+        else_graph = GraphProto(graph_scope._shape, graph_scope._dtype, graph_scope._freeze_params)
         else_graph._nodes = graph_scope._nodes.copy()
 
         # Convert each branch to a relay expression.
@@ -2320,10 +2351,12 @@ def _impl_v1(cls, inputs, attr, params):
 
         # Add constants from both branches to parent graph.
         graph_scope._params.update(then_graph._params)
+        graph_scope._nodes.update(then_graph._nodes)
         then_free_vars = analysis.free_vars(then_expr)
         for var in then_free_vars:
             graph_scope._nodes.update({var.name_hint: var})
         graph_scope._params.update(else_graph._params)
+        graph_scope._nodes.update(else_graph._nodes)
         else_free_vars = analysis.free_vars(else_expr)
         for var in else_free_vars:
             graph_scope._nodes.update({var.name_hint: var})
@@ -2468,9 +2501,9 @@ def _first_body(
             # partially prepare ONNX output format by labeling batch_num, class_id
             nms_padded_out = _op.expand_dims(nms_ret[0], -1, 1)
             batch_num = _op.expand_dims(_op.arange(_op.squeeze(B, [0]), dtype="int64"), -1, 1)
-            batch_num = _op.broadcast_to(batch_num, _op.shape_of(nms_ret[0], dtype="int64"))
+            batch_num = _op.broadcast_to(batch_num, shape_of(nms_ret[0], dtype="int64"))
             batch_num = _op.expand_dims(batch_num, -1, 1)
-            class_num = _op.broadcast_to(i, _op.shape_of(nms_padded_out, dtype="int64"))
+            class_num = _op.broadcast_to(i, shape_of(nms_padded_out, dtype="int64"))
             new_onnx_out = _op.concatenate(
                 [batch_num, class_num, _op.cast(nms_padded_out, "int64")], -1
             )
@@ -2570,7 +2603,7 @@ def _outer_body(i, B, C, onnx_out, nms_size_out, out):
         )
 
         # Call the first loop, perform NMS
-        B, C, S = _op.split(_op.shape_of(scores, dtype="int64"), 3)
+        B, C, S = _op.split(shape_of(scores, dtype="int64"), 3)
         init_count = _op.const(np.array([0]), dtype="int64")
         init_onnx_out = _op.const([1], dtype="int64")
         init_onnx_out = _op.broadcast_to(init_onnx_out, _op.concatenate([B, one, S, three], 0))
@@ -2617,6 +2650,7 @@ def _get_convert_map(opset):
         "ThresholdedRelu": ThresholdedRelu.get_converter(opset),
         "ScaledTanh": ScaledTanh.get_converter(opset),
         "ParametricSoftplus": ParametricSoftPlus.get_converter(opset),
+        "Constant": Constant.get_converter(opset),
         "ConstantOfShape": ConstantOfShape.get_converter(opset),
         # 'GivenTensorFill'
         "FC": AttrCvt("dense", ignores=["axis", "axis_w"]),
@@ -2776,11 +2810,19 @@ class GraphProto:
 
     dtype : str or dict of str to str
         The input types to the graph
+
+    freeze_params: bool
+        If this parameter is true, the importer will take any provided
+        onnx input values (weights, shapes, etc) and embed them into the relay model
+        as Constants instead of variables. This allows more aggressive optimizations
+        at compile time and helps in making models static if certain inputs represent
+        attributes relay would traditionally consider compile-time constants.
+
     """
 
     current = None
 
-    def __init__(self, shape, dtype):
+    def __init__(self, shape, dtype, freeze_params=False):
         self._nodes = {}
         self._params = {}
         self._inputs = {}
@@ -2790,6 +2832,7 @@ def __init__(self, shape, dtype):
         self._shape = shape if shape else {}
         self._dtype = dtype
         self.opset = None
+        self._freeze_params = freeze_params
 
     def __enter__(self):
         self._old_manager = GraphProto.current
@@ -2808,7 +2851,7 @@ def freeze(self, func, params):
         fn = _function.Function(analysis.free_vars(body), body)
         return fn, {}
 
-    def from_onnx(self, graph, opset, freeze_params=False, get_output_expr=False):
+    def from_onnx(self, graph, opset, get_output_expr=False):
         """Construct Relay expression from ONNX graph.
 
         Onnx graph is a python protobuf object.
@@ -2825,13 +2868,6 @@ def from_onnx(self, graph, opset, freeze_params=False, get_output_expr=False):
 
         opset : opset version
 
-        freeze_params: bool
-            If this parameter is true, the importer will take any provided
-            onnx input values (weights, shapes, etc) and embed them into the relay model
-            as Constants instead of variables. This allows more aggressive optimizations
-            at compile time and helps in making models static if certain inputs represent
-            attributes relay would traditionally consider compile-time constants.
-
         get_output_expr: bool
             If set to true, this conversion will return each output expression rather
             than a packaged module. This can be useful when converting subgraphs to
@@ -2850,12 +2886,16 @@ def from_onnx(self, graph, opset, freeze_params=False, get_output_expr=False):
         for init_tensor in graph.initializer:
             if not init_tensor.name.strip():
                 raise ValueError("Tensor's name is required.")
-            self._params[init_tensor.name] = self._parse_array(init_tensor)
-            self._nodes[init_tensor.name] = new_var(
-                init_tensor.name,
-                shape=self._params[init_tensor.name].shape,
-                dtype=self._params[init_tensor.name].dtype,
-            )
+            array = self._parse_array(init_tensor)
+            if self._freeze_params:
+                self._nodes[init_tensor.name] = _expr.const(array)
+            else:
+                self._params[init_tensor.name] = array
+                self._nodes[init_tensor.name] = new_var(
+                    init_tensor.name,
+                    shape=self._params[init_tensor.name].shape,
+                    dtype=self._params[init_tensor.name].dtype,
+                )
         for i in graph.input:
             # from onnx v0.2, GraphProto.input has type ValueInfoProto,
             #  and the name is 'i.name'
@@ -2867,6 +2907,8 @@ def from_onnx(self, graph, opset, freeze_params=False, get_output_expr=False):
                 self._nodes[i_name] = new_var(
                     i_name, shape=self._params[i_name].shape, dtype=self._params[i_name].dtype
                 )
+            elif i_name in self._nodes:
+                continue
             else:
                 self._num_input += 1
                 if i_name in self._shape:
@@ -2909,37 +2951,28 @@ def from_onnx(self, graph, opset, freeze_params=False, get_output_expr=False):
             for i in node.input:
                 if i != "":
                     inputs[i] = self._nodes[self._renames.get(i, i)]
-            if op_name == "Constant":
-                t_proto = self._parse_attr(node.attribute)["value"]
-                self._num_param += 1
-                # We should convert scalar integers to int32, to normalize.
-                array = self._parse_array(t_proto)
-                self._params[node.output[0]] = array
-                self._nodes[node.output[0]] = new_var(
-                    node.output[0], shape=list(t_proto.dims), dtype=array.dtype
-                )
+            i_name = self._parse_value_proto(node)
+            node_output = self._fix_outputs(op_name, node.output)
+            attr["tvm_custom"] = {}
+            attr["tvm_custom"]["name"] = i_name
+            attr["tvm_custom"]["num_outputs"] = len(node_output)
+
+            op = self._convert_operator(op_name, inputs, attr, opset)
+            if not isinstance(op, _expr.TupleWrapper):
+                outputs_num = 1
             else:
-                i_name = self._parse_value_proto(node)
-                node_output = self._fix_outputs(op_name, node.output)
-                attr["tvm_custom"] = {}
-                attr["tvm_custom"]["name"] = i_name
-                attr["tvm_custom"]["num_outputs"] = len(node_output)
-
-                op = self._convert_operator(op_name, inputs, attr, opset)
-                if not isinstance(op, _expr.TupleWrapper):
-                    outputs_num = 1
-                else:
-                    outputs_num = len(op)
-                assert (
-                    len(node_output) == outputs_num
-                ), "Number of output mismatch {} vs {} in {}.".format(
-                    len(node_output), outputs_num, op_name
-                )
-                if outputs_num == 1:
-                    self._nodes[node_output[0]] = op
-                else:
-                    for k, i in zip(list(node_output), range(len(node_output))):
-                        self._nodes[k] = op[i]
+                outputs_num = len(op)
+            assert (
+                len(node_output) == outputs_num
+            ), "Number of output mismatch {} vs {} in {}.".format(
+                len(node_output), outputs_num, op_name
+            )
+            if outputs_num == 1:
+                self._nodes[node_output[0]] = fold_constant(op)
+            else:
+                op = _expr.TupleWrapper(fold_constant(op.astuple()), len(op))
+                for k, i in zip(list(node_output), range(len(node_output))):
+                    self._nodes[k] = op[i]
 
         # now return the outputs
         outputs = [self._nodes[self._parse_value_proto(i)] for i in graph.output]
@@ -2957,9 +2990,6 @@ def from_onnx(self, graph, opset, freeze_params=False, get_output_expr=False):
                 self._inputs[i_name] = self._nodes[i_name]
         # Create a function from our output expression and all input variables.
         func = _function.Function([v for k, v in self._inputs.items()], outputs)
-        if freeze_params:
-            func, params = self.freeze(func, self._params)
-            return IRModule.from_expr(func), params
         return IRModule.from_expr(func), self._params
 
     def _parse_value_proto(self, value_proto):
@@ -3100,7 +3130,7 @@ def from_onnx(model, shape=None, dtype="float32", opset=None, freeze_params=Fals
                 warnings.warn(str(e))
     except ImportError:
         pass
-    g = GraphProto(shape, dtype)
+    g = GraphProto(shape, dtype, freeze_params)
     graph = model.graph
     if opset is None:
         try:
@@ -3109,5 +3139,5 @@ def from_onnx(model, shape=None, dtype="float32", opset=None, freeze_params=Fals
             opset = 1
     # Use the graph proto as a scope so that ops can access other nodes if needed.
     with g:
-        mod, params = g.from_onnx(graph, opset, freeze_params)
+        mod, params = g.from_onnx(graph, opset)
     return mod, params
diff --git a/python/tvm/relay/op/image/image.py b/python/tvm/relay/op/image/image.py
index a3f3a3e8cb92..153439b1e20c 100644
--- a/python/tvm/relay/op/image/image.py
+++ b/python/tvm/relay/op/image/image.py
@@ -17,7 +17,7 @@
 """Image operations."""
 from . import _make
 from ..dyn.image import _make as _dyn_make
-from ...expr import Expr
+from ...expr import Expr, Constant
 
 
 def resize(
@@ -66,6 +66,8 @@ def resize(
     result: relay.Expr
         The resized result.
     """
+    if isinstance(size, Constant):
+        size = list(size.data.asnumpy().astype("int32"))
     if isinstance(size, Expr):
         return _dyn_make.resize(
             data, size, layout, method, coordinate_transformation_mode, out_dtype
diff --git a/python/tvm/relay/op/nn/nn.py b/python/tvm/relay/op/nn/nn.py
index 0c233a6e3b53..5135ac74de25 100644
--- a/python/tvm/relay/op/nn/nn.py
+++ b/python/tvm/relay/op/nn/nn.py
@@ -21,7 +21,7 @@
 from . import _make
 from ..dyn.nn import _make as _dyn_make
 from .utils import get_pad_tuple1d, get_pad_tuple2d, get_pad_tuple3d
-from ...expr import const, Expr
+from ...expr import const, Expr, Constant
 
 
 def conv1d(
@@ -1279,6 +1279,10 @@ def upsampling(
     result : tvm.relay.Expr
         The computed result.
     """
+    if isinstance(scale_h, Constant):
+        scale_h = scale_h.data.asnumpy().item()
+    if isinstance(scale_w, Constant):
+        scale_w = scale_w.data.asnumpy().item()
     if isinstance(scale_h, Expr) or isinstance(scale_w, Expr):
         if not isinstance(scale_h, Expr):
             scale_h = const(scale_h, "float64")
@@ -1338,6 +1342,12 @@ def upsampling3d(
     result : tvm.relay.Expr
         The computed result.
     """
+    if isinstance(scale_d, Constant):
+        scale_d = scale_d.data.asnumpy().item()
+    if isinstance(scale_h, Constant):
+        scale_h = scale_h.data.asnumpy().item()
+    if isinstance(scale_w, Constant):
+        scale_w = scale_w.data.asnumpy().item()
     if isinstance(scale_d, Expr) or isinstance(scale_h, Expr) or isinstance(scale_w, Expr):
         if not isinstance(scale_d, Expr):
             scale_d = const(scale_d, "float64")
@@ -1596,6 +1606,10 @@ def pad(data, pad_width, pad_value=0, pad_mode="constant"):
     result : tvm.relay.Expr
         The computed result.
     """
+    if isinstance(pad_value, Constant):
+        pad_value = pad_value.data.asnumpy().item()
+    if isinstance(pad_width, Constant):
+        pad_width = [list(i) for i in pad_width.data.asnumpy()]
     if isinstance(pad_width, Expr) or (isinstance(pad_value, Expr)):
         if not isinstance(pad_width, Expr):
             pad_width = const(list(pad_width))
diff --git a/python/tvm/relay/op/tensor.py b/python/tvm/relay/op/tensor.py
index 75e298786ddd..5b011043f588 100644
--- a/python/tvm/relay/op/tensor.py
+++ b/python/tvm/relay/op/tensor.py
@@ -22,7 +22,7 @@
 
 from . import _make
 from .dyn import _make as _dyn_make
-from ..expr import Tuple, Expr
+from ..expr import Tuple, Expr, Constant
 from . import op as reg
 
 
@@ -960,6 +960,8 @@ def zeros(shape, dtype):
     result : relay.Expr
         The resulting tensor.
     """
+    if isinstance(shape, Constant):
+        shape = list(shape.data.asnumpy())
     if isinstance(shape, Expr):
         return _dyn_make.zeros(shape, dtype)
     if isinstance(shape, int):
@@ -1001,6 +1003,8 @@ def ones(shape, dtype):
     result : relay.Expr
         The resulting tensor.
     """
+    if isinstance(shape, Constant):
+        shape = list(shape.data.asnumpy())
     if isinstance(shape, Expr):
         return _dyn_make.ones(shape, dtype)
     if isinstance(shape, int):
diff --git a/python/tvm/relay/op/transform.py b/python/tvm/relay/op/transform.py
index d42ef477499f..cda417cad239 100644
--- a/python/tvm/relay/op/transform.py
+++ b/python/tvm/relay/op/transform.py
@@ -21,7 +21,7 @@
 from . import _make
 from .dyn import _make as _dyn_make
 from .tensor import shape_of
-from ..expr import TupleWrapper, const, Expr, Tuple
+from ..expr import TupleWrapper, const, Constant, Expr, Tuple
 from ...tir import expr as _expr
 
 
@@ -216,6 +216,8 @@ def reshape(data, newshape):
     result : relay.Expr
         The reshaped result.
     """
+    if isinstance(newshape, Constant):
+        newshape = list(newshape.data.asnumpy())
     if isinstance(newshape, Expr):
         return _dyn_make.reshape(data, newshape)
     if isinstance(newshape, int):
@@ -431,6 +433,8 @@ def full(fill_value, shape=(), dtype=""):
     result : relay.Expr
         The resulting tensor.
     """
+    if isinstance(shape, Constant):
+        shape = list(shape.data.asnumpy())
     if isinstance(shape, Expr):
         return _dyn_make.full(fill_value, shape, dtype)
     if isinstance(shape, int):
@@ -614,6 +618,8 @@ def tile(data, reps):
     data is promoted to be d-dimensional by prepending new axes.
     If data.ndim >=  d, reps is promoted to a.ndim by pre-pending 1's to it.
     """
+    if isinstance(reps, Constant):
+        reps = list(reps.data.asnumpy())
     if isinstance(reps, Expr):
         return _dyn_make.tile(data, reps)
     return _make.tile(data, reps)
@@ -753,6 +759,8 @@ def broadcast_to(data, shape):
     result : relay.Expr
         The resulting tensor.
     """
+    if isinstance(shape, Constant):
+        shape = list(shape.data.asnumpy())
     if isinstance(shape, Expr):
         return _dyn_make.broadcast_to(data, shape)
     if isinstance(shape, int):
@@ -884,6 +892,12 @@ def strided_slice(data, begin, end, strides=None, slice_mode="end"):
         The computed result.
     """
     strides = strides or [1]
+    if isinstance(begin, Constant):
+        begin = list(begin.data.asnumpy())
+    if isinstance(end, Constant):
+        end = list(end.data.asnumpy())
+    if isinstance(strides, Constant):
+        strides = list(strides.data.asnumpy())
     if isinstance(begin, Expr) or isinstance(end, Expr) or isinstance(strides, Expr):
         if isinstance(begin, (tuple, list)):
             begin = const(list(begin))
@@ -1170,6 +1184,8 @@ def one_hot(indices, on_value, off_value, depth, axis, dtype):
              [0, 1, 0],
              [0, 0, 1]]
     """
+    if isinstance(depth, Constant):
+        depth = depth.data.asnumpy().item()
     if isinstance(depth, Expr):
         return _dyn_make.one_hot(indices, on_value, off_value, depth, axis, dtype)
     return _make.one_hot(indices, on_value, off_value, depth, axis, dtype)
diff --git a/python/tvm/relay/transform/transform.py b/python/tvm/relay/transform/transform.py
index c6df8c1e6ea2..f02f8352de9e 100644
--- a/python/tvm/relay/transform/transform.py
+++ b/python/tvm/relay/transform/transform.py
@@ -240,6 +240,23 @@ def LazyGradientInit():
     return _ffi_api.LazyGradientInit()
 
 
+def FoldConstantExpr(expr, mod):
+    """Fold the constant expressions in a Relay program.
+    Parameters
+    ----------
+    expr: Expr
+        The expression to fold
+    mod: IRModule
+        The module the expr lives in (for global calls)
+
+    Returns
+    -------
+    new_expr: Expr
+        The expr after Constant Folding
+    """
+    return _ffi_api.FoldConstantExpr(expr, mod)
+
+
 def FoldConstant():
     """Fold the constant expressions in a Relay program.
 
diff --git a/src/relay/transforms/fold_constant.cc b/src/relay/transforms/fold_constant.cc
index 4454c9c0459a..9416b0ec4580 100644
--- a/src/relay/transforms/fold_constant.cc
+++ b/src/relay/transforms/fold_constant.cc
@@ -374,6 +374,8 @@ Expr FoldConstant(const Expr& expr, const IRModule& mod) {
   return ConstantFolder(mod).Mutate(expr);
 }
 
+TVM_REGISTER_GLOBAL("relay._transform.FoldConstantExpr").set_body_typed(FoldConstant);
+
 namespace transform {
 
 Pass FoldConstant() {
diff --git a/tests/python/relay/test_op_grad_level3.py b/tests/python/relay/test_op_grad_level3.py
index 904576a181f6..d43744b38e3e 100644
--- a/tests/python/relay/test_op_grad_level3.py
+++ b/tests/python/relay/test_op_grad_level3.py
@@ -146,7 +146,7 @@ def test_zeros_ones_grad_const_ints():
 
 def test_zeros_ones_grad_const_expr():
     # when shape is static (i.e. not an input), there is no gradient at all
-    shape_const = relay.const(np.array([2, 3, 4]), dtype="int32")
+    shape_const = relay.const(np.array([2, 3, 4]), dtype="int32") * relay.const(1, dtype="int32")
     static_ty = relay.TensorType([2, 3, 4], dtype="float32")
     dyn_ty = relay.TensorType([relay.Any(), relay.Any(), relay.Any()], dtype="float32")
     expected_ty_static = relay.TupleType([static_ty, relay.TupleType([])])

From 0aa90b093fd7e842eb88fa8e9994f70f24ba2bbf Mon Sep 17 00:00:00 2001
From: masahi <masahi129@gmail.com>
Date: Sat, 13 Feb 2021 19:40:00 +0900
Subject: [PATCH 191/357] [VM] Move param bind to OptimizeModule (#7451)

* [VM] Move param bind to OptimizeModule

* add test to verify the number of free vars after opt

* remove const from OptimizeModule
---
 src/relay/backend/vm/compiler.cc | 20 ++++++++++----------
 src/relay/backend/vm/compiler.h  |  3 +--
 tests/python/relay/test_vm.py    |  4 ++++
 3 files changed, 15 insertions(+), 12 deletions(-)

diff --git a/src/relay/backend/vm/compiler.cc b/src/relay/backend/vm/compiler.cc
index 7861502965a8..7697b59437f0 100644
--- a/src/relay/backend/vm/compiler.cc
+++ b/src/relay/backend/vm/compiler.cc
@@ -892,15 +892,6 @@ void VMCompiler::SetParam(const std::string& name, runtime::NDArray data_in) {
 }
 
 void VMCompiler::Lower(IRModule mod, const TargetsMap& targets, const tvm::Target& target_host) {
-  if (params_.size()) {
-    BaseFunc base_func = mod->Lookup("main");
-    ICHECK(base_func->IsInstance<FunctionNode>())
-        << "VM compiler expects to compile relay::Function";
-    auto f = relay::backend::BindParamsByName(Downcast<Function>(base_func), params_);
-    auto gvar = mod->GetGlobalVar("main");
-    mod->Add(gvar, f);
-  }
-
   exec_ = make_object<Executable>();
   targets_ = targets;
   target_host_ = target_host;
@@ -1005,8 +996,17 @@ transform::Sequential MemoryOpt(tvm::Target host_target, TargetsMap targets) {
   return transform::Sequential(pass_seqs);
 }
 
-IRModule VMCompiler::OptimizeModule(const IRModule& mod, const TargetsMap& targets,
+IRModule VMCompiler::OptimizeModule(IRModule mod, const TargetsMap& targets,
                                     const Target& target_host) {
+  if (params_.size()) {
+    BaseFunc base_func = mod->Lookup("main");
+    ICHECK(base_func->IsInstance<FunctionNode>())
+        << "VM compiler expects to compile relay::Function";
+    auto f = relay::backend::BindParamsByName(Downcast<Function>(base_func), params_);
+    auto gvar = mod->GetGlobalVar("main");
+    mod->Add(gvar, f);
+  }
+
   Array<Pass> pass_seqs;
   Array<runtime::String> entry_functions{"main"};
   pass_seqs.push_back(transform::RemoveUnusedFunctions(entry_functions));
diff --git a/src/relay/backend/vm/compiler.h b/src/relay/backend/vm/compiler.h
index 56965c544701..615a8181b387 100644
--- a/src/relay/backend/vm/compiler.h
+++ b/src/relay/backend/vm/compiler.h
@@ -125,8 +125,7 @@ class VMCompiler : public runtime::ModuleNode {
    *
    * \return The optimized IRModule.
    */
-  IRModule OptimizeModule(const IRModule& mod, const TargetsMap& targets,
-                          const Target& target_host);
+  IRModule OptimizeModule(IRModule mod, const TargetsMap& targets, const Target& target_host);
 
   /*!
    * \brief Populate the global function names in a map where the value is used
diff --git a/tests/python/relay/test_vm.py b/tests/python/relay/test_vm.py
index 6958010176e3..975070ad1aaa 100644
--- a/tests/python/relay/test_vm.py
+++ b/tests/python/relay/test_vm.py
@@ -678,6 +678,10 @@ def test_vm_optimize():
     comp = relay.vm.VMCompiler()
     opt_mod, _ = comp.optimize(mod, target="llvm", params=params)
 
+    free_vars = relay.analysis.free_vars(opt_mod["main"].body)
+    # Paremeters should all be bound, so the only free var is data
+    assert len(free_vars) == 1
+
 
 @tvm.testing.uses_gpu
 def test_loop_free_var():

From 2e6e7dc1c4e9ffc0c50ce25396efd972e0b145b9 Mon Sep 17 00:00:00 2001
From: Trevor Morris <trevmorr@amazon.com>
Date: Sat, 13 Feb 2021 13:53:22 -0800
Subject: [PATCH 192/357] [Frontend][MXNet] Add support for MXNet GroupNorm
 (#7409)

* Add support for MXNet GroupNorm

* Fix python lint

* Fix lint
---
 python/tvm/relay/frontend/mxnet.py          | 14 +++++++++
 tests/python/frontend/mxnet/test_forward.py | 32 +++++++++++++++++++++
 2 files changed, 46 insertions(+)

diff --git a/python/tvm/relay/frontend/mxnet.py b/python/tvm/relay/frontend/mxnet.py
index b272ead9737d..0c9d2c4381ac 100644
--- a/python/tvm/relay/frontend/mxnet.py
+++ b/python/tvm/relay/frontend/mxnet.py
@@ -495,6 +495,19 @@ def _mx_layer_norm(inputs, attrs):
     return _op.nn.layer_norm(*inputs, **new_attrs)
 
 
+def _mx_group_norm(inputs, attrs):
+    assert len(inputs) == 3
+    if attrs.get_bool("output_mean_var", False):
+        raise tvm.error.OpAttributeUnimplemented(
+            'Attribute "output_mean_var" is not supported for operator Group Norm.'
+        )
+    new_attrs = {}
+    new_attrs["axis"] = 1
+    new_attrs["num_groups"] = attrs.get_int("num_groups", 1)
+    new_attrs["epsilon"] = attrs.get_float("eps", 1e-5)
+    return _op.nn.group_norm(*inputs, **new_attrs)
+
+
 def _mx_slice(inputs, attrs):
     new_attrs = {}
     begin = list(attrs.get_int_tuple("begin", None))
@@ -2599,6 +2612,7 @@ def _mx_npi_where_rscalar(inputs, attrs):
     "_contrib_SyncBatchNorm": _mx_batch_norm,
     "InstanceNorm": _mx_instance_norm,
     "LayerNorm": _mx_layer_norm,
+    "GroupNorm": _mx_group_norm,
     "LRN": _mx_lrn,
     "L2Normalization": _mx_l2_normalize,
     "slice": _mx_slice,
diff --git a/tests/python/frontend/mxnet/test_forward.py b/tests/python/frontend/mxnet/test_forward.py
index 537349e073e1..3e652cfc69e3 100644
--- a/tests/python/frontend/mxnet/test_forward.py
+++ b/tests/python/frontend/mxnet/test_forward.py
@@ -1263,6 +1263,38 @@ def verify(shape, axis=-1):
     verify((2, 5, 6))
 
 
+@tvm.testing.uses_gpu
+def test_forward_group_norm():
+    def verify(shape, num_groups=1):
+        x = np.random.uniform(size=shape).astype("float32")
+        gamma = np.random.uniform(size=(shape[1])).astype("float32")
+        beta = np.random.uniform(size=(shape[1])).astype("float32")
+        ref_res = mx.nd.GroupNorm(
+            data=mx.nd.array(x),
+            gamma=mx.nd.array(gamma),
+            beta=mx.nd.array(beta),
+            num_groups=num_groups,
+        )
+        mx_sym = mx.sym.GroupNorm(
+            mx.sym.var("x"), mx.sym.var("gamma"), mx.sym.var("beta"), num_groups=num_groups
+        )
+        shape_dict = {"x": x.shape, "gamma": gamma.shape, "beta": beta.shape}
+        mod, _ = relay.frontend.from_mxnet(mx_sym, shape_dict)
+        for target, ctx in tvm.testing.enabled_targets():
+            for kind in ["graph", "debug"]:
+                intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target)
+                op_res = intrp.evaluate()(x, gamma, beta)
+                tvm.testing.assert_allclose(
+                    op_res.asnumpy(), ref_res.asnumpy(), rtol=1e-3, atol=1e-5
+                )
+
+    verify((1, 4, 2), num_groups=4)
+    # TODO(trevmorr): MXNet GroupNorm implementation is bugged for cases when num_groups != num_channels
+    # https://github.com/apache/incubator-mxnet/pull/18199
+    # verify((1, 4, 2, 3), num_groups=2)
+    # verify((1, 4, 2, 3))
+
+
 @tvm.testing.uses_gpu
 def test_forward_one_hot():
     def verify(indices_shape, depth, on_value, off_value, dtype):

From 2af3ab1e36e0e78bac8448a0357abee317fabb1f Mon Sep 17 00:00:00 2001
From: Vincent Abriou <vincent.abriou@st.com>
Date: Sun, 14 Feb 2021 16:20:07 +0100
Subject: [PATCH 193/357] update stm32mp1 arm_cpu target configuration (#7443)

Add the -mcpu information to complete the picture.

Signed-off-by: Vincent ABRIOU <vincent.abriou@st.com>
---
 python/tvm/target/target.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/python/tvm/target/target.py b/python/tvm/target/target.py
index 19fe09e539d8..8942957d32c9 100644
--- a/python/tvm/target/target.py
+++ b/python/tvm/target/target.py
@@ -296,6 +296,7 @@ def arm_cpu(model="unknown", options=None):
             "-model=stm32mp1",
             "-mtriple=armv7a-linux-gnueabihf",
             "-mattr=+neon,+vfp4,+thumb2",
+            "-mcpu=cortex-a7",
         ],
         "thunderx": [
             "-model=thunderx",

From 6187e1c419f5c5083982f23cfb823b3721850d70 Mon Sep 17 00:00:00 2001
From: eric <etinum@gmail.com>
Date: Tue, 16 Feb 2021 02:34:40 +0900
Subject: [PATCH 194/357] [FRONTEND][TFLITE] get input tensor information from
 graph (#7400)

* [FRONTEND][TFLITE] get input tensor information from graph

* remove bare-except

* fix lint

* delete empty line

* comment change

* move some of the tflite frontend code from tvmc to tflite.py

* update shape and dtype when user provided them

* remove unused var. pass user provided shape_dict

* remove duplicate code
---
 python/tvm/driver/tvmc/frontends.py | 48 +--------------------------
 python/tvm/relay/frontend/tflite.py | 50 +++++++++++++++++++++++++++--
 2 files changed, 48 insertions(+), 50 deletions(-)

diff --git a/python/tvm/driver/tvmc/frontends.py b/python/tvm/driver/tvmc/frontends.py
index 53fbed66c8fc..16e6c8eb966e 100644
--- a/python/tvm/driver/tvmc/frontends.py
+++ b/python/tvm/driver/tvmc/frontends.py
@@ -198,19 +198,6 @@ def load(self, path, shape_dict=None):
 class TFLiteFrontend(Frontend):
     """ TFLite frontend for TVMC """
 
-    _tflite_m = {
-        0: "float32",
-        1: "float16",
-        2: "int32",
-        3: "uint8",
-        4: "int64",
-        5: "string",
-        6: "bool",
-        7: "int16",
-        8: "complex64",
-        9: "int8",
-    }
-
     @staticmethod
     def name():
         return "tflite"
@@ -241,43 +228,10 @@ def load(self, path, shape_dict=None):
         if version != 3:
             raise TVMCException("input file not tflite version 3")
 
-        logger.debug("tflite_input_type")
-        input_shapes, dtype_dict = TFLiteFrontend._input_type(tflite_model)
-        if shape_dict is not None:
-            input_shapes.update(shape_dict)
-
         logger.debug("parse TFLite model and convert into Relay computation graph")
-        mod, params = relay.frontend.from_tflite(
-            tflite_model, shape_dict=input_shapes, dtype_dict=dtype_dict
-        )
+        mod, params = relay.frontend.from_tflite(tflite_model, shape_dict=shape_dict)
         return mod, params
 
-    @staticmethod
-    def _decode_type(n):
-        return TFLiteFrontend._tflite_m[n]
-
-    @staticmethod
-    def _input_type(model):
-        subgraph_count = model.SubgraphsLength()
-        assert subgraph_count > 0
-        shape_dict = {}
-        dtype_dict = {}
-        for subgraph_index in range(subgraph_count):
-            subgraph = model.Subgraphs(subgraph_index)
-            inputs_count = subgraph.InputsLength()
-            assert inputs_count >= 1
-            for input_index in range(inputs_count):
-                input_ = subgraph.Inputs(input_index)
-                assert subgraph.TensorsLength() > input_
-                tensor = subgraph.Tensors(input_)
-                input_shape = tuple(tensor.ShapeAsNumpy())
-                tensor_type = tensor.Type()
-                input_name = tensor.Name().decode("utf8")
-                shape_dict[input_name] = input_shape
-                dtype_dict[input_name] = TFLiteFrontend._decode_type(tensor_type)
-
-        return shape_dict, dtype_dict
-
 
 class PyTorchFrontend(Frontend):
     """ PyTorch frontend for TVMC """
diff --git a/python/tvm/relay/frontend/tflite.py b/python/tvm/relay/frontend/tflite.py
index 6d9bb18a7573..1b593ad8dea3 100644
--- a/python/tvm/relay/frontend/tflite.py
+++ b/python/tvm/relay/frontend/tflite.py
@@ -3539,7 +3539,45 @@ def get_tensor_name(subgraph, tensor_idx):
     return subgraph.Tensors(tensor_idx).Name().decode("utf-8")
 
 
-def from_tflite(model, shape_dict, dtype_dict):
+def _decode_type(n):
+    _tflite_m = {
+        0: "float32",
+        1: "float16",
+        2: "int32",
+        3: "uint8",
+        4: "int64",
+        5: "string",
+        6: "bool",
+        7: "int16",
+        8: "complex64",
+        9: "int8",
+    }
+    return _tflite_m[n]
+
+
+def _input_type(model):
+    subgraph_count = model.SubgraphsLength()
+    assert subgraph_count > 0
+    shape_dict = {}
+    dtype_dict = {}
+    for subgraph_index in range(subgraph_count):
+        subgraph = model.Subgraphs(subgraph_index)
+        inputs_count = subgraph.InputsLength()
+        assert inputs_count >= 1
+        for input_index in range(inputs_count):
+            input_ = subgraph.Inputs(input_index)
+            assert subgraph.TensorsLength() > input_
+            tensor = subgraph.Tensors(input_)
+            input_shape = tuple(tensor.ShapeAsNumpy())
+            tensor_type = tensor.Type()
+            input_name = tensor.Name().decode("utf8")
+            shape_dict[input_name] = input_shape
+            dtype_dict[input_name] = _decode_type(tensor_type)
+
+    return shape_dict, dtype_dict
+
+
+def from_tflite(model, shape_dict=None, dtype_dict=None):
     """Convert from tflite model into compatible relay Function.
 
     Parameters
@@ -3577,6 +3615,12 @@ def from_tflite(model, shape_dict, dtype_dict):
 
         assert isinstance(model, tflite.Model.Model)
 
+    _shape_dict, _dtype_dict = _input_type(model)
+    if shape_dict is not None:
+        _shape_dict.update(shape_dict)
+    if dtype_dict is not None:
+        _dtype_dict.update(dtype_dict)
+
     # keep the same as tflite
     assert model.SubgraphsLength() == 1, "only support one subgraph (main subgraph)"
     subgraph = model.Subgraphs(0)
@@ -3588,8 +3632,8 @@ def from_tflite(model, shape_dict, dtype_dict):
     exp_tab = ExprTable()
     for model_input in model_inputs:
         model_input_name = get_tensor_name(subgraph, model_input)
-        shape = shape_dict[model_input_name] if model_input_name in shape_dict else None
-        dtype = dtype_dict[model_input_name] if model_input_name in dtype_dict else "float32"
+        shape = _shape_dict[model_input_name] if model_input_name in _shape_dict else None
+        dtype = _dtype_dict[model_input_name] if model_input_name in _dtype_dict else "float32"
         exp_tab.set_expr(model_input_name, _expr.var(model_input_name, shape=shape, dtype=dtype))
 
     # op code in model

From 32c44025483b9aea7732cdc80e85e8e973c602ab Mon Sep 17 00:00:00 2001
From: Gustavo Romero <gromero@users.noreply.github.com>
Date: Mon, 15 Feb 2021 15:22:00 -0300
Subject: [PATCH 195/357] =?UTF-8?q?[=C2=B5TVM]=20Print=20.elf=20statistics?=
 =?UTF-8?q?=20for=20a=20model=20runtime=20built=20with=20Zephyr=20(#7449)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* [µTVM] Print .elf statistics for a model runtime built with Zephyr

Currently there isn't any statistics about the used resources by a model
runtime built with Zephyr, making it difficult to have any idea about, for
instance, the amount of memory taken by the operations necessary to run the
model.

Since Zephyr's SDK already exposes the statistics about various memory
regions on linking by passing '--print-memory-usage' to the linker, it's
possible to use it to have an idea about the amount of memory used by the
model and how much memory is left on the device.

That commit adds a simple method to extract the memory region information
out of the build output and then uses it to show memory usage statistics
for various memory regions when Zephyr finishes building the image to be
flashed to the target device.

Signed-off-by: Gustavo Romero <gustavo.romero@linaro.org>

* v2: Fixes accordingly to Andrew review

- Catch StopIteration in case of a weird output or no additional lines
  after the last memory region
- Use of _LOG.info() instead of plain print() for better control over
  the output by the main script
- Set log level in micro_tflite.py script as an example on how to get
  the new memory usage statistics and also because currently that's the
  main script used to test microTVM + Zephyr's SDK
- Improve statistics header

Signed-off-by: Gustavo Romero <gustavo.romero@linaro.org>

* Fix build

It seems build system is using Python < 3.7, so 'text' argument
is not present as an alias for 'universal_newlines'. To satisfy
it use old 'universal_newlines' argument which is available prior
to Python 3.7.

* Fix build

Avoid exception anti-pattern when catching StopIteration

* Retrigger CI
---
 python/tvm/micro/contrib/zephyr.py | 25 +++++++++++++++++++++++--
 tutorials/micro/micro_tflite.py    |  6 ++++++
 2 files changed, 29 insertions(+), 2 deletions(-)

diff --git a/python/tvm/micro/contrib/zephyr.py b/python/tvm/micro/contrib/zephyr.py
index ed1c9866c741..29bb5ecdbe6f 100644
--- a/python/tvm/micro/contrib/zephyr.py
+++ b/python/tvm/micro/contrib/zephyr.py
@@ -55,7 +55,7 @@ def run(self, cmd, **kw):
         for k, v in self.default_overrides.items():
             env[k] = v
 
-        return subprocess.check_output(cmd, env=env, **kw)
+        return subprocess.check_output(cmd, env=env, **kw, universal_newlines=True)
 
 
 class ProjectNotFoundError(Exception):
@@ -204,6 +204,25 @@ def library(self, output, sources, options=None):
         )
         return tvm.micro.MicroLibrary(build_dir, [f"lib{project_name}.a"])
 
+    def _print_make_statistics(self, output):
+        output = output.splitlines()
+        lines = iter(output)
+        for line in lines:
+            if line.startswith("Memory region"):
+                # print statistics header
+                _LOG.info(line)
+                _LOG.info("--------------------- ---------- ------------ ---------")
+                line = next(lines)
+                # while there is a region print it
+                try:
+                    while ":" in line:
+                        _LOG.info(line)
+                        line = next(lines)
+                    else:
+                        break
+                except StopIteration:
+                    pass
+
     def binary(self, output, objects, options=None, link_main=True, main_options=None):
         assert link_main, "Must pass link_main=True"
         assert self._project_dir is not None, "Must supply project_dir= to build binaries"
@@ -224,7 +243,9 @@ def binary(self, output, objects, options=None, link_main=True, main_options=Non
         cmake_args.append(f'-DTVM_LIBS={";".join(copied_libs)}')
         self._subprocess_env.run(cmake_args, cwd=output)
 
-        self._subprocess_env.run(["make"], cwd=output)
+        make_output = self._subprocess_env.run(["make"], cwd=output)
+
+        self._print_make_statistics(make_output)
 
         return tvm.micro.MicroBinary(
             output,
diff --git a/tutorials/micro/micro_tflite.py b/tutorials/micro/micro_tflite.py
index 673985e24d84..6ad0da5aecba 100644
--- a/tutorials/micro/micro_tflite.py
+++ b/tutorials/micro/micro_tflite.py
@@ -122,6 +122,8 @@
 
 import os
 import numpy as np
+import logging
+
 import tvm
 import tvm.micro as micro
 from tvm.contrib.download import download_testdata
@@ -229,6 +231,10 @@
 #     )
 #
 #     opts = tvm.micro.default_options(f"{project_dir}/crt")
+#
+# enable printing memory usage statistics of the runtime image
+# generated by Zephyr compiler for the physical hardware
+# logging.basicConfig(level="INFO")
 
 workspace = tvm.micro.Workspace()
 micro_binary = tvm.micro.build_static_runtime(

From 0ebc820387f271f7e36f135ba2b244fa4734dde1 Mon Sep 17 00:00:00 2001
From: Ritwik Das <ritwikdas54@gmail.com>
Date: Mon, 15 Feb 2021 11:42:13 -0800
Subject: [PATCH 196/357] Add IdentityN operator for TF Frontend (#7452)

* Add frontend code and tests

* Add Frontend Code

Co-authored-by: Ubuntu <ubuntu@ip-172-31-42-251.us-east-2.compute.internal>
---
 python/tvm/relay/frontend/tensorflow.py       |  8 +++
 .../frontend/tensorflow/test_forward.py       | 50 +++++++++++++++++++
 2 files changed, 58 insertions(+)

diff --git a/python/tvm/relay/frontend/tensorflow.py b/python/tvm/relay/frontend/tensorflow.py
index b34e6c723645..ea1abc843c20 100644
--- a/python/tvm/relay/frontend/tensorflow.py
+++ b/python/tvm/relay/frontend/tensorflow.py
@@ -1005,6 +1005,13 @@ def _impl(inputs, attr, params, mod):
     return _impl
 
 
+def _identityn():
+    def _impl(inputs, attr, params, mod):
+        return inputs
+
+    return _impl
+
+
 def _concatV2():
     def _impl(inputs, attr, params, mod):
         pop_node = inputs.pop(len(inputs) - 1)
@@ -2378,6 +2385,7 @@ def _impl(inputs, attr, params, mod):
     "Greater": _broadcast("greater"),
     "GreaterEqual": _broadcast("greater_equal"),
     "Identity": _identity(),
+    "IdentityN": _identityn(),
     "IsFinite": AttrCvt("isfinite"),
     "IsInf": AttrCvt("isinf"),
     "IsNan": AttrCvt("isnan"),
diff --git a/tests/python/frontend/tensorflow/test_forward.py b/tests/python/frontend/tensorflow/test_forward.py
index 34ee0f3528ae..fd4b9f49e6a4 100644
--- a/tests/python/frontend/tensorflow/test_forward.py
+++ b/tests/python/frontend/tensorflow/test_forward.py
@@ -4074,6 +4074,56 @@ def test_forward_dilation():
     _test_dilation2d([1, 3, 3, 1], [2, 2, 1], [1, 1, 1, 1], [1, 1, 2, 1], "VALID")
 
 
+def _test_identityn(data_np_list):
+    with tf.Graph().as_default():
+        data_tensors = []
+        data_tensors_name = []
+        for index, data_np in enumerate(data_np_list):
+            tensor_name = f"data_{index}"
+            data_tensors_name.append(tensor_name + ":0")
+            data_tensors.append(
+                tf.placeholder(shape=data_np.shape, dtype=str(data_np.dtype), name=tensor_name)
+            )
+
+        output = tf.identity_n(data_tensors)
+        output_names = [out.name for out in output]
+        compare_tf_with_tvm(
+            data_np_list,
+            data_tensors_name,
+            output_names,
+        )
+
+
+@pytest.mark.parametrize(
+    "data_np_list",
+    [
+        (
+            [
+                np.array([[1, 1], [0, 3], [0, 1], [2, 0], [3, 1]], dtype=np.int64),
+                np.array([1, 2, 3, 4, 5], dtype=np.int64),
+                np.array([5, 6], dtype=np.int64),
+            ]
+        ),
+        (
+            [
+                np.array([[1, 1], [0, 3], [2, 0], [3, 1]], dtype=np.int64),
+                np.array([1, 2, 3, 4], dtype=np.int64),
+                np.array([5, 6], dtype=np.int64),
+                np.array([True, False, True]),
+            ]
+        ),
+        (
+            [
+                np.array([]),
+                np.array([[]]),
+            ]
+        ),
+    ],
+)
+def test_forward_identityn(data_np_list):
+    _test_identityn(data_np_list)
+
+
 #######################################################################
 # Sparse To Dense
 # ---------------

From 22642060f7ee49251fabd3eba16575dc5c015955 Mon Sep 17 00:00:00 2001
From: Andrew Reusch <areusch@octoml.ai>
Date: Mon, 15 Feb 2021 14:04:43 -0800
Subject: [PATCH 197/357] docker/bash.sh: lookup docker image in Jenkinsfile.
 (#7453)

* This PR makes it possible to type
   `docker/bash.sh ci_cpu tests/scripts/task_config_build_cpu.sh`
   and the same version of ci_cpu as is used in Jenkins will be
   used to run the command.
---
 docker/bash.sh       | 12 ++++++++++--
 docker/dev_common.sh | 32 +++++++++++++++++++++++++++++---
 2 files changed, 39 insertions(+), 5 deletions(-)

diff --git a/docker/bash.sh b/docker/bash.sh
index 785b42870e24..51fb68265b73 100755
--- a/docker/bash.sh
+++ b/docker/bash.sh
@@ -27,6 +27,11 @@
 #     Execute command in the docker image, default non-interactive
 #     With -i, execute interactively.
 #
+
+set -e
+
+source "$(dirname $0)/dev_common.sh" || exit 2
+
 interactive=0
 if [ "$1" == "-i" ]; then
     interactive=1
@@ -38,7 +43,10 @@ if [ "$#" -lt 1 ]; then
     exit -1
 fi
 
-DOCKER_IMAGE_NAME=("$1")
+DOCKER_IMAGE_NAME=$(lookup_image_spec "$1")
+if [ -z "${DOCKER_IMAGE_NAME}" ]; then
+    DOCKER_IMAGE_NAME=("$1")
+fi
 
 CI_DOCKER_EXTRA_PARAMS=( )
 if [ "$#" -eq 1 ]; then
@@ -105,7 +113,7 @@ if [[ "${DOCKER_IMAGE_NAME}" == *"demo_vitis_ai"* && -d "/dev/shm" && -d "/opt/x
     for i in ${RENDER_DRIVER} ;
     do
         DOCKER_DEVICES+="--device=$i "
-    done  
+    done
 fi
 
 # Add ROCm devices and set ROCM_ENABLED=1 which is used in the with_the_same_user script
diff --git a/docker/dev_common.sh b/docker/dev_common.sh
index 559a66469e37..68b9f8d28760 100644
--- a/docker/dev_common.sh
+++ b/docker/dev_common.sh
@@ -28,13 +28,39 @@ INVOCATION_PWD="$(pwd)"
 GIT_TOPLEVEL=$(cd $(dirname ${BASH_SOURCE[0]}) && git rev-parse --show-toplevel)
 
 
+function filter_jenkinsfile() {
+    local echo_on=0;
+    while read line; do
+        if [ "${line}" == "// NOTE: these lines are scanned by docker/dev_common.sh. Please update the regex as needed. -->" ]; then
+            echo_on=1
+        elif [ "${line}" == "// <--- End of regex-scanned config." ]; then
+            break
+        elif [ ${echo_on} -eq 1 ]; then
+            echo "$line"
+        fi
+    done
+}
+
+
+function lookup_image_spec() {
+    img_line=$(cat "${GIT_TOPLEVEL}/Jenkinsfile" | filter_jenkinsfile | grep -E "^${1} = ")
+    if [ -n "${img_line}" ]; then
+        img_spec=$(echo "${img_line}" | sed -E "s/${1} = \"([^\"]*)\"/\1/")
+        has_similar_docker_image=1
+        docker inspect "${1}" &>/dev/null || has_similar_docker_image=0
+        if [ ${has_similar_docker_image} -ne 0 ]; then
+            echo "WARNING: resolved docker image through Jenkinsfile to \"${img_spec}\"" >&2
+        fi
+        echo "${img_spec}"
+    fi
+}
+
+
 function run_docker() {
     image_name="$1"  # Name of the Jenkinsfile var to find
     shift
 
-    image_spec=$(cat "${GIT_TOPLEVEL}/Jenkinsfile" | \
-                     grep -E "^${image_name} = " | \
-                     sed -E "s/${image_name} = \"([^\"]*)\"/\1/")
+    image_spec=$(lookup_image_spec "${image_name}")
     if [ -z "${image_spec}" ]; then
         echo "${image_name}: not found in ${GIT_TOPLEVEL}/Jenkinsfile" >&2
         exit 2

From fc48514f1d8ccffcebd12007cb6c602506975703 Mon Sep 17 00:00:00 2001
From: Luis Vega <vegaluisjose@users.noreply.github.com>
Date: Mon, 15 Feb 2021 20:02:10 -0800
Subject: [PATCH 198/357] [BYOC][Verilator] Refactor Verilator runtime (#7406)

* new experiment

* save

* refactor

* refactor library

* add profiler

* refactor

* refactor

* add docs

* update comment

* add deallocator
---
 .../backend/contrib/verilator/codegen.cc      |  56 +++--
 .../contrib/verilator/verilator_device.h      |  39 +++-
 .../contrib/verilator/verilator_runtime.cc    | 197 +++++++++---------
 .../contrib/verilator/verilator_runtime.h     | 138 ++++++++++++
 .../contrib/test_verilator/infrastructure.py  |   6 +-
 5 files changed, 307 insertions(+), 129 deletions(-)
 create mode 100644 src/runtime/contrib/verilator/verilator_runtime.h

diff --git a/src/relay/backend/contrib/verilator/codegen.cc b/src/relay/backend/contrib/verilator/codegen.cc
index 2f61ae540395..b206288f7e96 100644
--- a/src/relay/backend/contrib/verilator/codegen.cc
+++ b/src/relay/backend/contrib/verilator/codegen.cc
@@ -34,6 +34,7 @@
 #include <sstream>
 
 #include "../../../../runtime/contrib/json/json_node.h"
+#include "../../../../runtime/contrib/verilator/verilator_runtime.h"
 #include "../../utils.h"
 #include "../codegen_json/codegen_json.h"
 
@@ -75,29 +76,34 @@ class VerilatorJSONSerializer : public backend::contrib::JSONSerializer {
   }
 };
 
-/*! \brief Attributes to store the compiler options for Verilator */
-struct VerilatorCompilerConfigNode : public tvm::AttrsNode<VerilatorCompilerConfigNode> {
-  String lib;
-
-  TVM_DECLARE_ATTRS(VerilatorCompilerConfigNode, "ext.attrs.VerilatorCompilerConfigNode") {
-    TVM_ATTR_FIELD(lib).set_default("libverilator.so");
+/*! \brief Attributes to store options for Verilator */
+struct VerilatorOptionsNode : public tvm::AttrsNode<VerilatorOptionsNode> {
+  String lib_path;
+  int reset_cycles;
+  bool profiler_enable;
+  int profiler_cycle_counter_id;
+
+  TVM_DECLARE_ATTRS(VerilatorOptionsNode, "ext.attrs.VerilatorOptionsNode") {
+    TVM_ATTR_FIELD(lib_path).describe("the design library path").set_default("libverilator.so");
+    TVM_ATTR_FIELD(reset_cycles).describe("the number of reset cycles").set_default(1);
+    TVM_ATTR_FIELD(profiler_enable).describe("enable profiler").set_default(false);
+    TVM_ATTR_FIELD(profiler_cycle_counter_id).describe("profiler cycle counter id").set_default(0);
   }
 };
 
-class VerilatorCompilerConfig : public Attrs {
+class VerilatorOptions : public Attrs {
  public:
-  TVM_DEFINE_NOTNULLABLE_OBJECT_REF_METHODS(VerilatorCompilerConfig, Attrs,
-                                            VerilatorCompilerConfigNode);
+  TVM_DEFINE_NOTNULLABLE_OBJECT_REF_METHODS(VerilatorOptions, Attrs, VerilatorOptionsNode);
 };
 
-TVM_REGISTER_NODE_TYPE(VerilatorCompilerConfigNode);
-TVM_REGISTER_PASS_CONFIG_OPTION("relay.ext.verilator.options", VerilatorCompilerConfig);
+TVM_REGISTER_NODE_TYPE(VerilatorOptionsNode);
+TVM_REGISTER_PASS_CONFIG_OPTION("relay.ext.verilator.options", VerilatorOptions);
 
 /*!
- * \brief The external compiler/codegen tool. It takes a Relay expression/module and
- * compile it into a runtime module.
+ * \brief The Verilator codegen tool. It takes a Relay expression/module and
+ * compile it into a Verilator runtime module.
  */
-runtime::Module VerilatorCompiler(const ObjectRef& ref) {
+runtime::Module VerilatorBackend(const ObjectRef& ref) {
   CHECK(ref->IsInstance<FunctionNode>());
   auto func = Downcast<Function>(ref);
   auto func_name = GetExtSymbol(func);
@@ -106,22 +112,28 @@ runtime::Module VerilatorCompiler(const ObjectRef& ref) {
   std::string graph_json = serializer.GetJSON();
   auto params = serializer.GetParams();
 
+  // Create runtime object
+  auto n = make_object<runtime::contrib::VerilatorRuntime>(func_name, graph_json, params);
+
   // Get Verilator compiler options
   auto ctx = transform::PassContext::Current();
-  auto cfg = ctx->GetConfig<VerilatorCompilerConfig>("relay.ext.verilator.options");
+  auto cfg = ctx->GetConfig<VerilatorOptions>("relay.ext.verilator.options");
   if (!cfg.defined()) {
-    cfg = AttrsWithDefaultValues<VerilatorCompilerConfig>();
+    cfg = AttrsWithDefaultValues<VerilatorOptions>();
   }
 
-  auto lib_name = cfg.value()->lib;
+  n->SetLibrary(cfg.value()->lib_path);
+  n->SetResetCycles(cfg.value()->reset_cycles);
+
+  if (cfg.value()->profiler_enable) {
+    n->EnableProfiler();
+    n->SetProfilerCycleCounterId(cfg.value()->profiler_cycle_counter_id);
+  }
 
-  const auto* pf = runtime::Registry::Get("runtime.verilator_runtime_create");
-  CHECK(pf != nullptr) << "Cannot find JSON runtime module to create";
-  auto mod = (*pf)(lib_name, func_name, graph_json, params);
-  return mod;
+  return runtime::Module(n);
 }
 
-TVM_REGISTER_GLOBAL("relay.ext.verilator").set_body_typed(VerilatorCompiler);
+TVM_REGISTER_GLOBAL("relay.ext.verilator").set_body_typed(VerilatorBackend);
 
 }  // namespace contrib
 }  // namespace relay
diff --git a/src/runtime/contrib/verilator/verilator_device.h b/src/runtime/contrib/verilator/verilator_device.h
index acd91a53bcff..298e41c06daf 100644
--- a/src/runtime/contrib/verilator/verilator_device.h
+++ b/src/runtime/contrib/verilator/verilator_device.h
@@ -31,24 +31,51 @@ namespace tvm {
 namespace runtime {
 namespace contrib {
 
+/*! \brief Verilator device resource context  */
 typedef void* VerilatorHandle;
 
-/* allocate Verilator object */
+/*!
+ * \brief Allocate a verilator device resource handle
+ * \return The verilator device handle.
+ */
 extern "C" TVM_DLL VerilatorHandle VerilatorAlloc();
 
-/* deallocate Verilator object */
+/*!
+ * \brief Free a verilator device handle
+ * \param handle The verilator device handle to be freed.
+ */
 extern "C" TVM_DLL void VerilatorDealloc(VerilatorHandle handle);
 
-/* read Verilator register or memory */
+/*!
+ * \brief Read verilator register or memory
+ * \param handle The verilator device handle.
+ * \param id The register or memory identifier.
+ * \param addr The register or memory address (word-level).
+ * \return The value of register or memory.
+ */
 extern "C" TVM_DLL int VerilatorRead(VerilatorHandle handle, int id, int addr);
 
-/* write Verilator register or memory */
+/*!
+ * \brief Write verilator register or memory
+ * \param handle The verilator device handle.
+ * \param id The register or memory identifier.
+ * \param addr The register or memory address (word-level).
+ * \param value The value of register or memory.
+ */
 extern "C" TVM_DLL void VerilatorWrite(VerilatorHandle handle, int id, int addr, int value);
 
-/* reset Verilator for n clock cycles */
+/*!
+ * \brief Reset Verilator for n clock cycles
+ * \param handle The verilator device handle.
+ * \param n The number of reset cycles.
+ */
 extern "C" TVM_DLL void VerilatorReset(VerilatorHandle handle, int n);
 
-/* run Verilator for n clock cycles */
+/*!
+ * \brief Run Verilator for n clock cycles
+ * \param handle The verilator device handle.
+ * \param n The number of run cycles.
+ */
 extern "C" TVM_DLL void VerilatorRun(VerilatorHandle handle, int n);
 
 }  // namespace contrib
diff --git a/src/runtime/contrib/verilator/verilator_runtime.cc b/src/runtime/contrib/verilator/verilator_runtime.cc
index 60f36e494da7..bc96b69f2ffe 100644
--- a/src/runtime/contrib/verilator/verilator_runtime.cc
+++ b/src/runtime/contrib/verilator/verilator_runtime.cc
@@ -19,9 +19,11 @@
 
 /*!
  * \file src/runtime/contrib/verilator/verilator_runtime.cc
- * \brief A simple JSON runtime for Verilator.
+ * \brief A runtime for Verilator.
  */
 
+#include "verilator_runtime.h"
+
 #include <dlfcn.h>
 #include <tvm/runtime/ndarray.h>
 #include <tvm/runtime/registry.h>
@@ -40,124 +42,123 @@ namespace tvm {
 namespace runtime {
 namespace contrib {
 
-typedef VerilatorHandle (*VerilatorAllocFunc)();
-typedef void (*VerilatorResetFunc)(VerilatorHandle, int);
-typedef void (*VerilatorAddFunc)(VerilatorHandle, int*, int*, int*, int, int);
-
 using namespace tvm::runtime;
+using namespace tvm::runtime::contrib;
 using namespace tvm::runtime::json;
 
-class VerilatorLibrary : public Library {
- public:
-  ~VerilatorLibrary() {
-    if (lib_handle_) Unload();
-  }
-  void Init(const std::string& name) { Load(name); }
-
-  void* GetSymbol(const char* name) final { return GetSymbol_(name); }
-
- private:
-  // Library handle
-  void* lib_handle_{nullptr};
-  // load the library
-  void Load(const std::string& name) {
-    lib_handle_ = dlopen(name.c_str(), RTLD_LAZY | RTLD_LOCAL);
-    ICHECK(lib_handle_ != nullptr)
-        << "Failed to load dynamic shared library " << name << " " << dlerror();
-  }
-
-  void* GetSymbol_(const char* name) { return dlsym(lib_handle_, name); }
-
-  void Unload() {
+VerilatorLibrary::~VerilatorLibrary() {
+  if (lib_handle_) {
     dlclose(lib_handle_);
     lib_handle_ = nullptr;
   }
-};
+}
 
-class VerilatorJSONRuntime : public JSONRuntimeBase {
- public:
-  VerilatorJSONRuntime(const std::string& symbol_name, const std::string& graph_json,
-                       const Array<String> const_names)
-      : JSONRuntimeBase(symbol_name, graph_json, const_names) {}
+void VerilatorLibrary::Load(const std::string& name) {
+  lib_handle_ = dlopen(name.c_str(), RTLD_LAZY | RTLD_LOCAL);
+  ICHECK(lib_handle_ != nullptr) << "Failed to load dynamic shared library " << name << " "
+                                 << dlerror();
+}
 
-  const char* type_key() const { return "verilator_json"; }
+void* VerilatorLibrary::GetSymbol(const char* name) { return dlsym(lib_handle_, name); }
 
-  void LoadLibrary(const std::string& lib_name) {
-    lib_ = new VerilatorLibrary();
-    lib_->Init(lib_name);
-  }
+void VerilatorProfiler::Clear() { cycle_counter = 0; }
 
-  void Init(const Array<NDArray>& consts) override {
-    // get symbols
-    auto alloc_func = reinterpret_cast<VerilatorAllocFunc>(lib_->GetSymbol("VerilatorAlloc"));
-    ICHECK(alloc_func != nullptr);
-    auto reset_func = reinterpret_cast<VerilatorResetFunc>(lib_->GetSymbol("VerilatorReset"));
-    ICHECK(reset_func != nullptr);
-    vadd_func_ = reinterpret_cast<VerilatorAddFunc>(lib_->GetSymbol("verilator_add"));
-    ICHECK(vadd_func_ != nullptr);
+std::string VerilatorProfiler::AsJSON() {
+  std::ostringstream os;
+  os << "{\n"
+     << " \"cycle_counter\":" << cycle_counter << "\n"
+     << "}\n";
+  return os.str();
+}
 
-    // alloc device
-    device_ = (*alloc_func)();
+VerilatorProfiler* VerilatorProfiler::ThreadLocal() {
+  static thread_local VerilatorProfiler inst;
+  return &inst;
+}
 
-    // reset for 10 cycles
-    (*reset_func)(device_, 10);
+VerilatorRuntime::~VerilatorRuntime() {
+  auto dealloc = reinterpret_cast<VerilatorDeallocFunc>(lib_->GetSymbol("VerilatorDealloc"));
+  ICHECK(dealloc != nullptr);
+  dealloc(device_);
+  lib_->~VerilatorLibrary();
+}
 
-    CHECK_EQ(consts.size(), const_idx_.size())
-        << "The number of input constants must match the number of required.";
+void VerilatorRuntime::SetLibrary(const std::string& lib_path) { lib_path_ = lib_path; }
 
-    // Setup constants entries for weights.
-    SetupConstants(consts);
-  }
+void VerilatorRuntime::SetResetCycles(const int cycles) { reset_cycles_ = cycles; }
 
-  void Run() override {
-    std::vector<int*> in_ptr;
-    std::vector<int*> out_ptr;
-    for (size_t i = 0; i < input_nodes_.size(); ++i) {
-      uint32_t eid = EntryID(input_nodes_[i], 0);
-      int* data = static_cast<int*>(data_entry_[eid]->data);
-      in_ptr.push_back(data);
-    }
-    for (size_t i = 0; i < outputs_.size(); ++i) {
-      uint32_t eid = EntryID(outputs_[i]);
-      int* data = static_cast<int*>(data_entry_[eid]->data);
-      out_ptr.push_back(data);
-    }
-    for (size_t nid = 0; nid < nodes_.size(); ++nid) {
-      const auto& node = nodes_[nid];
-      if (node.GetOpType() == "kernel") {
-        CHECK_EQ(node.GetOpType(), "kernel");
-        auto op_name = node.GetOpName();
-        if ("add" == op_name) {
-          auto entry = node.GetInputs()[0];
-          auto shape = nodes_[entry.id_].GetOpShape()[entry.index_];
-          (*vadd_func_)(device_, in_ptr[0], in_ptr[1], out_ptr[0], shape[0], shape[1]);
-        } else {
-          LOG(FATAL) << "Unsupported op: " << op_name;
-        }
+void VerilatorRuntime::EnableProfiler() { prof_enable_ = true; }
+
+void VerilatorRuntime::SetProfilerCycleCounterId(const int id) { prof_cycle_counter_id_ = id; }
+
+void VerilatorRuntime::Init(const Array<NDArray>& consts) {
+  lib_ = new VerilatorLibrary();
+  lib_->Load(lib_path_);
+  auto alloc = reinterpret_cast<VerilatorAllocFunc>(lib_->GetSymbol("VerilatorAlloc"));
+  ICHECK(alloc != nullptr);
+  auto reset = reinterpret_cast<VerilatorResetFunc>(lib_->GetSymbol("VerilatorReset"));
+  ICHECK(reset != nullptr);
+  read_ = reinterpret_cast<VerilatorReadFunc>(lib_->GetSymbol("VerilatorRead"));
+  ICHECK(read_ != nullptr);
+  add_op_ = reinterpret_cast<VerilatorAddFunc>(lib_->GetSymbol("verilator_add"));
+
+  // alloc verilator device
+  device_ = alloc();
+
+  // enable profiler
+  if (prof_enable_) prof_ = VerilatorProfiler::ThreadLocal();
+
+  // reset verilator device
+  reset(device_, reset_cycles_);
+
+  CHECK_EQ(consts.size(), const_idx_.size())
+      << "The number of input constants must match the number of required.";
+
+  // Setup constants entries for weights.
+  SetupConstants(consts);
+}
+
+void VerilatorRuntime::Run() {
+  std::vector<int*> in_ptr;
+  std::vector<int*> out_ptr;
+  for (size_t i = 0; i < input_nodes_.size(); ++i) {
+    uint32_t eid = EntryID(input_nodes_[i], 0);
+    int* data = static_cast<int*>(data_entry_[eid]->data);
+    in_ptr.push_back(data);
+  }
+  for (size_t i = 0; i < outputs_.size(); ++i) {
+    uint32_t eid = EntryID(outputs_[i]);
+    int* data = static_cast<int*>(data_entry_[eid]->data);
+    out_ptr.push_back(data);
+  }
+  for (size_t nid = 0; nid < nodes_.size(); ++nid) {
+    const auto& node = nodes_[nid];
+    if (node.GetOpType() == "kernel") {
+      CHECK_EQ(node.GetOpType(), "kernel");
+      auto op_name = node.GetOpName();
+      if ("add" == op_name) {
+        auto entry = node.GetInputs()[0];
+        auto shape = nodes_[entry.id_].GetOpShape()[entry.index_];
+        ICHECK(add_op_ != nullptr);
+        add_op_(device_, in_ptr[0], in_ptr[1], out_ptr[0], shape[0], shape[1]);
+      } else {
+        LOG(FATAL) << "Unsupported op: " << op_name;
       }
     }
   }
-
- private:
-  /* The verilator device handle. */
-  VerilatorHandle device_{nullptr};
-  /* The verilator library handle. */
-  VerilatorLibrary* lib_{nullptr};
-  /* The verilator vadd function handle. */
-  VerilatorAddFunc vadd_func_{nullptr};
-};
-
-runtime::Module VerilatorJSONRuntimeCreate(String lib_name, String symbol_name, String graph_json,
-                                           const Array<String>& const_names) {
-  auto n = make_object<VerilatorJSONRuntime>(symbol_name, graph_json, const_names);
-  n->LoadLibrary(lib_name);
-  return runtime::Module(n);
+  if (prof_enable_) {
+    int cycles = read_(device_, prof_cycle_counter_id_, 0);
+    prof_->cycle_counter += cycles;
+  }
 }
 
-TVM_REGISTER_GLOBAL("runtime.verilator_runtime_create").set_body_typed(VerilatorJSONRuntimeCreate);
+TVM_REGISTER_GLOBAL("verilator.profiler_clear").set_body([](TVMArgs args, TVMRetValue* rv) {
+  VerilatorProfiler::ThreadLocal()->Clear();
+});
 
-TVM_REGISTER_GLOBAL("runtime.module.loadbinary_verilator_json")
-    .set_body_typed(JSONRuntimeBase::LoadFromBinary<VerilatorJSONRuntime>);
+TVM_REGISTER_GLOBAL("verilator.profiler_status").set_body([](TVMArgs args, TVMRetValue* rv) {
+  *rv = VerilatorProfiler::ThreadLocal()->AsJSON();
+});
 
 }  // namespace contrib
 }  // namespace runtime
diff --git a/src/runtime/contrib/verilator/verilator_runtime.h b/src/runtime/contrib/verilator/verilator_runtime.h
new file mode 100644
index 000000000000..acdaa3b03ce2
--- /dev/null
+++ b/src/runtime/contrib/verilator/verilator_runtime.h
@@ -0,0 +1,138 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file src/runtime/contrib/verilator/verilator_runtime.h
+ * \brief A runtime for Verilator.
+ */
+
+#ifndef TVM_RUNTIME_CONTRIB_VERILATOR_VERILATOR_RUNTIME_H_
+#define TVM_RUNTIME_CONTRIB_VERILATOR_VERILATOR_RUNTIME_H_
+
+#include <dlfcn.h>
+#include <tvm/runtime/ndarray.h>
+#include <tvm/runtime/registry.h>
+
+#include <cstddef>
+#include <string>
+#include <vector>
+
+#include "../../library_module.h"
+#include "../json/json_node.h"
+#include "../json/json_runtime.h"
+#include "verilator_device.h"
+#include "verilator_kernel.h"
+
+namespace tvm {
+namespace runtime {
+namespace contrib {
+
+using namespace tvm::runtime;
+using namespace tvm::runtime::contrib;
+using namespace tvm::runtime::json;
+
+typedef VerilatorHandle (*VerilatorAllocFunc)();
+typedef void (*VerilatorDeallocFunc)(VerilatorHandle);
+typedef void (*VerilatorResetFunc)(VerilatorHandle, int);
+typedef void (*VerilatorAddFunc)(VerilatorHandle, int*, int*, int*, int, int);
+typedef int (*VerilatorReadFunc)(VerilatorHandle, int, int);
+
+class VerilatorLibrary : public Library {
+ public:
+  ~VerilatorLibrary();
+
+  /*! \brief load library */
+  void Load(const std::string& name);
+
+  /*! \brief get symbol from libray */
+  void* GetSymbol(const char* name) final;
+
+ private:
+  /*! \brief the library handle */
+  void* lib_handle_{nullptr};
+};
+
+class VerilatorProfiler {
+ public:
+  /*! \brief the number of cycle counter */
+  uint32_t cycle_counter{0};
+
+  /*! \brief clear the profiler */
+  void Clear();
+
+  /*! \brief get profiler data */
+  std::string AsJSON();
+
+  /*! \brief profiler constructor */
+  static VerilatorProfiler* ThreadLocal();
+};
+
+class VerilatorRuntime : public JSONRuntimeBase {
+ public:
+  VerilatorRuntime(const std::string& symbol_name, const std::string& graph_json,
+                   const Array<String> const_names)
+      : JSONRuntimeBase(symbol_name, graph_json, const_names) {}
+
+  ~VerilatorRuntime();
+
+  const char* type_key() const { return "verilator"; }
+
+  /*! \brief set verilator library */
+  void SetLibrary(const std::string& lib_name);
+
+  /*! \brief set the number of reset cycles */
+  void SetResetCycles(const int cycles);
+
+  /*! \brief enable profiler */
+  void EnableProfiler();
+
+  /*! \brief set cycle counter register id */
+  void SetProfilerCycleCounterId(const int id);
+
+  /*! \brief init verilator runtime */
+  void Init(const Array<NDArray>& consts) override;
+
+  /*! \brief run verilator runtime */
+  void Run() override;
+
+ private:
+  /*! \brief the verilator library path */
+  String lib_path_;
+  /*! \brief the verilator device */
+  VerilatorHandle device_{nullptr};
+  /*! \brief the verilator library */
+  VerilatorLibrary* lib_{nullptr};
+  /*! \brief the verilator profiler */
+  VerilatorProfiler* prof_{nullptr};
+  /*! \brief the verilator read function */
+  VerilatorReadFunc read_{nullptr};
+  /*! \brief the verilator add op function */
+  VerilatorAddFunc add_op_{nullptr};
+  /*! \brief the verilator reset cycles */
+  int reset_cycles_{1};
+  /*! \brief the verilator profiler status */
+  bool prof_enable_{false};
+  /*! \brief the verilator profiler cycle counter id */
+  int prof_cycle_counter_id_{0};
+};
+
+}  // namespace contrib
+}  // namespace runtime
+}  // namespace tvm
+#endif  // TVM_RUNTIME_CONTRIB_VERILATOR_VERILATOR_RUNTIME_H_
diff --git a/tests/python/contrib/test_verilator/infrastructure.py b/tests/python/contrib/test_verilator/infrastructure.py
index e8fd943aa8a0..7e4c297853d5 100644
--- a/tests/python/contrib/test_verilator/infrastructure.py
+++ b/tests/python/contrib/test_verilator/infrastructure.py
@@ -102,9 +102,9 @@ def compile_module(mod):
     if not os.path.isfile(lib):
         compile_hardware()
 
-    with tvm.transform.PassContext(
-        opt_level=3, config={"relay.ext.verilator.options": {"lib": lib}}
-    ):
+    opts = {"lib_path": lib}
+
+    with tvm.transform.PassContext(opt_level=3, config={"relay.ext.verilator.options": opts}):
         exe = relay.vm.compile(mod, target="llvm", params=None)
         code, lib = exe.save()
         return runtime.vm.Executable.load_exec(code, lib)

From 413692e9a51c1a5af267c9ca9687048cf61c7be6 Mon Sep 17 00:00:00 2001
From: Gus Smith <guscomps@gmail.com>
Date: Tue, 16 Feb 2021 15:34:07 -0800
Subject: [PATCH 199/357] Make spelling of "axes" consistent (#7460)

---
 include/tvm/relay/attrs/transform.h       |  2 +-
 python/tvm/relay/op/transform.py          |  2 +-
 src/relay/op/tensor/transform.cc          |  2 +-
 tutorials/language/schedule_primitives.py | 14 +++++++-------
 4 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/include/tvm/relay/attrs/transform.h b/include/tvm/relay/attrs/transform.h
index 45a1caf2bd79..24098b74f3b6 100644
--- a/include/tvm/relay/attrs/transform.h
+++ b/include/tvm/relay/attrs/transform.h
@@ -54,7 +54,7 @@ struct ExpandDimsAttrs : public tvm::AttrsNode<ExpandDimsAttrs> {
         "If `axis < 0`, it is the first axis inserted;"
         "If `axis >= 0`, it is the last axis inserted in Python's negative indexing.");
     TVM_ATTR_FIELD(num_newaxis)
-        .describe("Number of axises to be inserted. Should be >= 0.")
+        .describe("Number of axes to be inserted. Should be >= 0.")
         .set_lower_bound(0)
         .set_default(1);
   }
diff --git a/python/tvm/relay/op/transform.py b/python/tvm/relay/op/transform.py
index cda417cad239..1bd2b2d34060 100644
--- a/python/tvm/relay/op/transform.py
+++ b/python/tvm/relay/op/transform.py
@@ -86,7 +86,7 @@ def reinterpret(data, dtype):
 
 
 def expand_dims(data, axis, num_newaxis=1):
-    """Insert `num_newaxis` axises at the position given by `axis`.
+    """Insert `num_newaxis` axes at the position given by `axis`.
 
     Parameters
     ----------
diff --git a/src/relay/op/tensor/transform.cc b/src/relay/op/tensor/transform.cc
index 5e39b409615d..e1b1dddf340b 100644
--- a/src/relay/op/tensor/transform.cc
+++ b/src/relay/op/tensor/transform.cc
@@ -227,7 +227,7 @@ Expr MakeExpandDims(Expr data, int axis, int num_newaxis) {
 TVM_REGISTER_GLOBAL("relay.op._make.expand_dims").set_body_typed(MakeExpandDims);
 
 RELAY_REGISTER_OP("expand_dims")
-    .describe(R"code(Insert `num_newaxis` axises at the position given by `axis`
+    .describe(R"code(Insert `num_newaxis` axes at the position given by `axis`
 
 - **data**: The input data to the operator.
 
diff --git a/tutorials/language/schedule_primitives.py b/tutorials/language/schedule_primitives.py
index eb48dc218cdd..ade79f69707f 100644
--- a/tutorials/language/schedule_primitives.py
+++ b/tutorials/language/schedule_primitives.py
@@ -69,7 +69,7 @@
 ######################################################################
 # split
 # -----
-# :code:`split` can split a specified axis into two axises by
+# :code:`split` can split a specified axis into two axes by
 # :code:`factor`.
 A = te.placeholder((m,), name="A")
 B = te.compute((m,), lambda i: A[i] * 2, name="B")
@@ -92,7 +92,7 @@
 # tile
 # ----
 # :code:`tile` help you execute the computation tile by tile over two
-# axises.
+# axes.
 A = te.placeholder((m, n), name="A")
 B = te.compute((m, n), lambda i, j: A[i, j], name="B")
 
@@ -103,12 +103,12 @@
 ######################################################################
 # fuse
 # ----
-# :code:`fuse` can fuse two consecutive axises of one computation.
+# :code:`fuse` can fuse two consecutive axes of one computation.
 A = te.placeholder((m, n), name="A")
 B = te.compute((m, n), lambda i, j: A[i, j], name="B")
 
 s = te.create_schedule(B.op)
-# tile to four axises first: (i.outer, j.outer, i.inner, j.inner)
+# tile to four axes first: (i.outer, j.outer, i.inner, j.inner)
 xo, yo, xi, yi = s[B].tile(B.op.axis[0], B.op.axis[1], x_factor=10, y_factor=5)
 # then fuse (i.inner, j.inner) into one axis: (i.inner.j.inner.fused)
 fused = s[B].fuse(xi, yi)
@@ -117,14 +117,14 @@
 ######################################################################
 # reorder
 # -------
-# :code:`reorder` can reorder the axises in the specified order.
+# :code:`reorder` can reorder the axes in the specified order.
 A = te.placeholder((m, n), name="A")
 B = te.compute((m, n), lambda i, j: A[i, j], name="B")
 
 s = te.create_schedule(B.op)
-# tile to four axises first: (i.outer, j.outer, i.inner, j.inner)
+# tile to four axes first: (i.outer, j.outer, i.inner, j.inner)
 xo, yo, xi, yi = s[B].tile(B.op.axis[0], B.op.axis[1], x_factor=10, y_factor=5)
-# then reorder the axises: (i.inner, j.outer, i.outer, j.inner)
+# then reorder the axes: (i.inner, j.outer, i.outer, j.inner)
 s[B].reorder(xi, yo, xo, yi)
 print(tvm.lower(s, [A, B], simple_mode=True))
 

From 564aae076694acd89f2ec320b5eaae504d971b37 Mon Sep 17 00:00:00 2001
From: Lily Orth-Smith <lilyorthsmith@gmail.com>
Date: Tue, 16 Feb 2021 19:02:33 -0500
Subject: [PATCH 200/357] [Relay][Topi] Add max mode to ROI align (#7440)

* ROI align with max on cpu passes

* onnx test file was not running gpu testsgit status!

* all passing

* fix lint

* lint again

* lint

* lint

* typo

* remove import

* fix import

* add inf, -inf to hybridscript and respond to comments

* shorten code

* make atol lower
---
 include/tvm/relay/attrs/vision.h             |  3 ++
 python/tvm/relay/frontend/onnx.py            |  7 +--
 python/tvm/relay/op/strategy/generic.py      |  2 +
 python/tvm/relay/op/vision/rcnn.py           |  7 ++-
 python/tvm/te/hybrid/calls.py                | 14 ++++++
 python/tvm/te/hybrid/runtime.py              | 10 +++++
 python/tvm/topi/testing/roi_align_python.py  | 18 +++++---
 python/tvm/topi/vision/rcnn/roi_align.py     | 26 +++++++++--
 python/tvm/topi/x86/roi_align.py             | 47 +++++++++++++++-----
 src/relay/op/vision/rcnn_op.cc               |  3 +-
 tests/python/frontend/onnx/test_forward.py   | 12 ++++-
 tests/python/relay/test_op_level5.py         | 18 ++++++--
 tests/python/topi/python/test_topi_vision.py | 27 ++++++-----
 13 files changed, 154 insertions(+), 40 deletions(-)

diff --git a/include/tvm/relay/attrs/vision.h b/include/tvm/relay/attrs/vision.h
index ca2c4a2b837d..4a96d391430e 100644
--- a/include/tvm/relay/attrs/vision.h
+++ b/include/tvm/relay/attrs/vision.h
@@ -124,6 +124,7 @@ struct ROIAlignAttrs : public tvm::AttrsNode<ROIAlignAttrs> {
   double spatial_scale;
   int sample_ratio;
   std::string layout;
+  std::string mode;
   TVM_DECLARE_ATTRS(ROIAlignAttrs, "relay.attrs.ROIAlignAttrs") {
     TVM_ATTR_FIELD(pooled_size).describe("Output size of roi align.");
     TVM_ATTR_FIELD(spatial_scale)
@@ -139,6 +140,8 @@ struct ROIAlignAttrs : public tvm::AttrsNode<ROIAlignAttrs> {
         "'N', 'C', 'H', 'W' stands for batch, channel, height, and width"
         "dimensions respectively. Convolution is applied on the 'H' and"
         "'W' dimensions.");
+    TVM_ATTR_FIELD(mode).set_default("avg").describe(
+        "Mode for ROI Align. Can be 'avg' or 'max'. The default mode is 'avg'.");
   }
 };
 
diff --git a/python/tvm/relay/frontend/onnx.py b/python/tvm/relay/frontend/onnx.py
index fb3d1c923561..109e80c99783 100644
--- a/python/tvm/relay/frontend/onnx.py
+++ b/python/tvm/relay/frontend/onnx.py
@@ -1665,6 +1665,7 @@ def expand_shape(in_shape, shape):
             """
             in_dims = infer_shape(in_shape)[0]
             new_dims = infer_shape(shape)[0]
+
             if in_dims < new_dims:
                 in_shape = _op.concatenate(
                     [
@@ -2084,8 +2085,8 @@ def _impl_v1(cls, inputs, attr, params):
         rois = inputs[1]
         batch_indices = inputs[2]
         mode = attr.get("mode", b"avg")
-        if mode != b"avg":
-            raise ValueError("RoiAlign in Relay only uses avg mode")
+        if mode not in (b"avg", b"max"):
+            raise ValueError("RoiAlign in Relay only uses avg and max modes")
         output_height = attr.get("output_height", 1)
         output_width = attr.get("output_width", 1)
 
@@ -2097,7 +2098,7 @@ def _impl_v1(cls, inputs, attr, params):
         rois = _op.concatenate([batch_indices, rois], 1)
 
         return _vision.roi_align(
-            x, rois, [output_height, output_width], spatial_scale, sampling_ratio
+            x, rois, [output_height, output_width], spatial_scale, sampling_ratio, mode=mode
         )
 
 
diff --git a/python/tvm/relay/op/strategy/generic.py b/python/tvm/relay/op/strategy/generic.py
index 92a72f950615..2d69a2f6942e 100644
--- a/python/tvm/relay/op/strategy/generic.py
+++ b/python/tvm/relay/op/strategy/generic.py
@@ -1041,6 +1041,7 @@ def wrap_compute_roi_align(topi_compute):
     def _compute_roi_align(attrs, inputs, out_type):
         assert attrs.layout == "NCHW"
         pooled_size = get_const_tuple(attrs.pooled_size)
+        mode = bytes(attrs.mode, "utf-8")
         return [
             topi_compute(
                 inputs[0],
@@ -1048,6 +1049,7 @@ def _compute_roi_align(attrs, inputs, out_type):
                 pooled_size=pooled_size,
                 spatial_scale=attrs.spatial_scale,
                 sample_ratio=attrs.sample_ratio,
+                mode=mode,
             )
         ]
 
diff --git a/python/tvm/relay/op/vision/rcnn.py b/python/tvm/relay/op/vision/rcnn.py
index b87eb07d7563..d25c5de89cee 100644
--- a/python/tvm/relay/op/vision/rcnn.py
+++ b/python/tvm/relay/op/vision/rcnn.py
@@ -18,7 +18,7 @@
 from . import _make
 
 
-def roi_align(data, rois, pooled_size, spatial_scale, sample_ratio=-1, layout="NCHW"):
+def roi_align(data, rois, pooled_size, spatial_scale, sample_ratio=-1, layout="NCHW", mode="avg"):
     """ROI align operator.
 
     Parameters
@@ -40,12 +40,15 @@ def roi_align(data, rois, pooled_size, spatial_scale, sample_ratio=-1, layout="N
     sample_ratio : int
         Optional sampling ratio of ROI align, using adaptive size by default.
 
+    mode : str, Optional
+        The pooling method. Relay supports two methods, 'avg' and 'max'. Default is 'avg'.
+
     Returns
     -------
     output : relay.Expr
         4-D tensor with shape [num_roi, channel, pooled_size, pooled_size]
     """
-    return _make.roi_align(data, rois, pooled_size, spatial_scale, sample_ratio, layout)
+    return _make.roi_align(data, rois, pooled_size, spatial_scale, sample_ratio, layout, mode)
 
 
 def roi_pool(data, rois, pooled_size, spatial_scale, layout="NCHW"):
diff --git a/python/tvm/te/hybrid/calls.py b/python/tvm/te/hybrid/calls.py
index 6785457c3bd7..462066106a9d 100644
--- a/python/tvm/te/hybrid/calls.py
+++ b/python/tvm/te/hybrid/calls.py
@@ -167,3 +167,17 @@ def max_num_threads(func_id, args):
         _internal_assert(isinstance(args[0], _expr.IntImm), "In tvm bool should be uint")
         res = Target.current(args[0].value).max_num_threads
     return convert(res)
+
+
+def inf(func_id, args):
+    """Infinity"""
+    _internal_assert(func_id == "inf", "This function cannot be directly invoked!")
+    _internal_assert(args.__len__() == 1, "One argument accepted!")
+    return tvm.tir.max_value(args[0])
+
+
+def ninf(func_id, args):
+    """Negative infinity"""
+    _internal_assert(func_id == "ninf", "This function cannot be directly invoked!")
+    _internal_assert(args.__len__() == 1, "One argument accepted!")
+    return tvm.tir.min_value(args[0])
diff --git a/python/tvm/te/hybrid/runtime.py b/python/tvm/te/hybrid/runtime.py
index 7b90f8729014..615bd7e43a7d 100644
--- a/python/tvm/te/hybrid/runtime.py
+++ b/python/tvm/te/hybrid/runtime.py
@@ -111,6 +111,14 @@ def max_num_threads(allow_none=True):
     return Target.current(allow_none).max_num_threads
 
 
+def inf(dtype):
+    return numpy.iinfo(dtype).max
+
+
+def ninf(dtype):
+    return numpy.iinfo(dtype).min
+
+
 HYBRID_GLOBALS = {
     "unroll": range,
     "vectorize": range,
@@ -142,6 +150,8 @@ def max_num_threads(allow_none=True):
     "float64": numpy.float64,
     "ceil_div": lambda a, b: (a + b - 1) // b,
     "max_num_threads": max_num_threads,
+    "inf": inf,
+    "ninf": inf,
 }
 
 
diff --git a/python/tvm/topi/testing/roi_align_python.py b/python/tvm/topi/testing/roi_align_python.py
index abef25f0b994..643a954b101b 100644
--- a/python/tvm/topi/testing/roi_align_python.py
+++ b/python/tvm/topi/testing/roi_align_python.py
@@ -20,12 +20,14 @@
 import numpy as np
 
 
-def roi_align_nchw_python(a_np, rois_np, pooled_size, spatial_scale, sample_ratio):
+def roi_align_nchw_python(a_np, rois_np, pooled_size, spatial_scale, sample_ratio, mode=b"avg"):
     """Roi align in python"""
+    avg_mode = mode in (b"avg", "avg", 0)
+    max_mode = mode in (b"max", "max", 1)
+    assert avg_mode or max_mode, "Mode must be average or max. Please pass a valid mode."
     _, channel, height, width = a_np.shape
     num_roi = rois_np.shape[0]
     b_np = np.zeros((num_roi, channel, pooled_size, pooled_size), dtype=a_np.dtype)
-
     if isinstance(pooled_size, int):
         pooled_size_h = pooled_size_w = pooled_size
     else:
@@ -76,11 +78,17 @@ def _bilinear(n, c, y, x):
         for c in range(channel):
             for ph in range(pooled_size_h):
                 for pw in range(pooled_size_w):
-                    total = 0.0
+                    if avg_mode:
+                        total = 0.0
+                    if max_mode:
+                        total = float("-inf")
                     for iy in range(roi_bin_grid_h):
                         for ix in range(roi_bin_grid_w):
                             y = roi_start_h + ph * bin_h + (iy + 0.5) * bin_h / roi_bin_grid_h
                             x = roi_start_w + pw * bin_w + (ix + 0.5) * bin_w / roi_bin_grid_w
-                            total += _bilinear(batch_index, c, y, x)
-                    b_np[i, c, ph, pw] = total / count
+                            if avg_mode:
+                                total += _bilinear(batch_index, c, y, x) / count
+                            if max_mode:
+                                total = max(total, _bilinear(batch_index, c, y, x))
+                    b_np[i, c, ph, pw] = total
     return b_np
diff --git a/python/tvm/topi/vision/rcnn/roi_align.py b/python/tvm/topi/vision/rcnn/roi_align.py
index 30824770b7b2..95f350084ba5 100644
--- a/python/tvm/topi/vision/rcnn/roi_align.py
+++ b/python/tvm/topi/vision/rcnn/roi_align.py
@@ -22,7 +22,7 @@
 from ...cpp.utils import bilinear_sample_nchw
 
 
-def roi_align_nchw(data, rois, pooled_size, spatial_scale, sample_ratio=-1):
+def roi_align_nchw(data, rois, pooled_size, spatial_scale, mode, sample_ratio=-1):
     """ROI align operator in NCHW layout.
 
     Parameters
@@ -41,6 +41,10 @@ def roi_align_nchw(data, rois, pooled_size, spatial_scale, sample_ratio=-1):
         Ratio of input feature map height (or w) to raw image height (or w). Equals the reciprocal
         of total stride in convolutional layers, which should be in range (0.0, 1.0]
 
+    mode : int or str
+        There are two modes, average and max. For the average mode, you can pass b'avg' or 0, and
+        for the max mode, you can pass b'max' or 1.
+
     sample_ratio : int
         Optional sampling ratio of ROI align, using adaptive size by default.
 
@@ -49,6 +53,9 @@ def roi_align_nchw(data, rois, pooled_size, spatial_scale, sample_ratio=-1):
     output : tvm.te.Tensor
         4-D with shape [num_roi, channel, pooled_size, pooled_size]
     """
+    avg_mode = mode in (b"avg", 0)
+    max_mode = mode in (b"max", 1)
+    assert avg_mode or max_mode, "Mode must be avg or max. Please pass in a valid mode."
     dtype = rois.dtype
     _, channel, height, width = get_const_tuple(data.shape)
     num_roi, _ = get_const_tuple(rois.shape)
@@ -92,14 +99,25 @@ def _sample(i, c, ph, pw):
         rw = te.reduce_axis((0, roi_bin_grid_w))
         roi_start_h += ph * bin_h
         roi_start_w += pw * bin_w
-        return te.sum(
+        if avg_mode:
+            return te.sum(
+                _bilinear(
+                    batch_index,
+                    c,
+                    roi_start_h + (rh + 0.5) * bin_h / roi_bin_grid_h,
+                    roi_start_w + (rw + 0.5) * bin_w / roi_bin_grid_w,
+                )
+                / count,
+                axis=[rh, rw],
+            )
+        # max mode
+        return te.max(
             _bilinear(
                 batch_index,
                 c,
                 roi_start_h + (rh + 0.5) * bin_h / roi_bin_grid_h,
                 roi_start_w + (rw + 0.5) * bin_w / roi_bin_grid_w,
-            )
-            / count,
+            ),
             axis=[rh, rw],
         )
 
diff --git a/python/tvm/topi/x86/roi_align.py b/python/tvm/topi/x86/roi_align.py
index ac2146b558f9..336a336f50e5 100644
--- a/python/tvm/topi/x86/roi_align.py
+++ b/python/tvm/topi/x86/roi_align.py
@@ -17,15 +17,17 @@
 # pylint: disable=invalid-name, no-member, too-many-locals, too-many-arguments, undefined-variable, too-many-nested-blocks, too-many-branches, too-many-statements
 """Non-maximum suppression operator for intel cpu"""
 import math
-import tvm
 
+import tvm
 from tvm.te import hybrid
 from ..tensor import full
 from ..utils import get_const_tuple
 
 
 @hybrid.script
-def roi_align_nchw_ir(data, rois, num_rois, w_pc, pos_pc, pooled_size, spatial_scale, sample_ratio):
+def roi_align_nchw_ir(
+    data, rois, num_rois, w_pc, pos_pc, pooled_size, spatial_scale, sample_ratio, mode
+):
     """Hybrid routing fo ROI align operator in NCHW layout.
 
     Parameters
@@ -57,6 +59,10 @@ def roi_align_nchw_ir(data, rois, num_rois, w_pc, pos_pc, pooled_size, spatial_s
     sample_ratio : tvm.tir.const
         Sampling ratio of ROI align, using adaptive size by default.
 
+    mode : tvm.tir.const
+        Mode of RoiAlign. A value of 0 corrensponds to b'avg', while a value of 1 corresponds to
+        b'max'.
+
     Returns
     -------
     output : tvm.te.Tensor or numpy NDArray
@@ -160,10 +166,12 @@ def roi_align_nchw_ir(data, rois, num_rois, w_pc, pos_pc, pooled_size, spatial_s
             pre_calc_index = 0
             for ph in range(pooled_size_h):
                 for pw in range(pooled_size_w):
-                    output_val = 0.0
+                    output_val = 0.0  # Avg mode
+                    if mode == 1:  # Max mode
+                        output_val = ninf("float32")
                     for iy in range(roi_bin_grid_h):
                         for ix in range(roi_bin_grid_w):
-                            output_val += (
+                            bilinear_val = (
                                 w_pc[n, pre_calc_index, 0]
                                 * data[
                                     roi_batch_index,
@@ -194,14 +202,15 @@ def roi_align_nchw_ir(data, rois, num_rois, w_pc, pos_pc, pooled_size, spatial_s
                                 ]
                             )
                             pre_calc_index += 1
-
-                    output_val /= count
-                    output[n, c, ph, pw] = output_val
-
+                            if mode == 0:  # Avg mode
+                                output_val += bilinear_val / count
+                            if mode == 1:  # Max mode
+                                output_val = max(output_val, bilinear_val)
+                        output[n, c, ph, pw] = output_val
     return output
 
 
-def roi_align_nchw(data, rois, pooled_size, spatial_scale, sample_ratio=-1):
+def roi_align_nchw(data, rois, pooled_size, spatial_scale, mode, sample_ratio=-1):
     """ROI align operator in NCHW layout.
 
     Parameters
@@ -220,6 +229,9 @@ def roi_align_nchw(data, rois, pooled_size, spatial_scale, sample_ratio=-1):
         Ratio of input feature map height (or w) to raw image height (or w). Equals the reciprocal
         of total stride in convolutional layers, which should be in range (0.0, 1.0]
 
+    mode : str
+        Mode of RoiAlign. Should be b'max' or b'avg'.
+
     sample_ratio : int
         Optional sampling ratio of ROI align, using adaptive size by default.
 
@@ -250,6 +262,21 @@ def roi_align_nchw(data, rois, pooled_size, spatial_scale, sample_ratio=-1):
     pooled_size = tvm.runtime.convert(pooled_size)
     spatial_scale = tvm.tir.const(spatial_scale, "float32")
     sample_ratio = tvm.tir.const(sample_ratio, "int32")
+    if mode in (b"avg", 0):
+        mode = tvm.tir.const(0, dtype="float32")
+    elif mode in (b"max", 1):
+        mode = tvm.tir.const(1, dtype="float32")
+    else:
+        raise ValueError(mode, "Value %s passed in for mode not supported", mode)
+
     return roi_align_nchw_ir(
-        data, rois, num_rois, w_pc_buffer, pos_pc_buffer, pooled_size, spatial_scale, sample_ratio
+        data,
+        rois,
+        num_rois,
+        w_pc_buffer,
+        pos_pc_buffer,
+        pooled_size,
+        spatial_scale,
+        sample_ratio,
+        mode,
     )
diff --git a/src/relay/op/vision/rcnn_op.cc b/src/relay/op/vision/rcnn_op.cc
index f7bbf378d09c..c899681733f8 100644
--- a/src/relay/op/vision/rcnn_op.cc
+++ b/src/relay/op/vision/rcnn_op.cc
@@ -76,12 +76,13 @@ Array<Array<Layout> > ROIAlignInferCorrectLayout(const Attrs& attrs,
 }
 
 Expr MakeROIAlign(Expr data, Expr rois, Array<IndexExpr> pooled_size, double spatial_scale,
-                  int sample_ratio, String layout) {
+                  int sample_ratio, String layout, String mode) {
   auto attrs = make_object<ROIAlignAttrs>();
   attrs->pooled_size = pooled_size;
   attrs->spatial_scale = spatial_scale;
   attrs->sample_ratio = sample_ratio;
   attrs->layout = layout;
+  attrs->mode = mode;
   static const Op& op = Op::Get("vision.roi_align");
   return Call(op, {data, rois}, Attrs(attrs), {});
 }
diff --git a/tests/python/frontend/onnx/test_forward.py b/tests/python/frontend/onnx/test_forward.py
index 27b91dd38f8e..59ecffe829df 100644
--- a/tests/python/frontend/onnx/test_forward.py
+++ b/tests/python/frontend/onnx/test_forward.py
@@ -3437,7 +3437,13 @@ def verify_topk(input_dims, K, axis=-1):
 @tvm.testing.uses_gpu
 def test_roi_align():
     def verify_roi_align(
-        input_dims, num_roi, output_height, output_width, sampling_ratio=0, spatial_scale=1.0
+        input_dims,
+        num_roi,
+        output_height,
+        output_width,
+        sampling_ratio=0,
+        spatial_scale=1.0,
+        mode="avg",
     ):
         output_dims = [num_roi, input_dims[1], output_height, output_width]
 
@@ -3445,7 +3451,7 @@ def verify_roi_align(
             "RoiAlign",
             inputs=["X", "rois", "batch_indicies"],
             outputs=["Y"],
-            mode="avg",
+            mode=mode,
             output_height=output_height,
             output_width=output_width,
             sampling_ratio=sampling_ratio,
@@ -3490,6 +3496,8 @@ def verify_roi_align(
     verify_roi_align((5, 4, 16, 14), 32, 7, 7, sampling_ratio=1, spatial_scale=1.0)
     verify_roi_align((1, 4, 16, 16), 32, 7, 7, sampling_ratio=2, spatial_scale=1.0)
 
+    # ONNX implementation of roi_align with max mode is incorrect, so we don't compare outputs here.
+
 
 # @tvm.testing.uses_gpu
 def test_non_max_suppression():
diff --git a/tests/python/relay/test_op_level5.py b/tests/python/relay/test_op_level5.py
index 6d7d401d706b..95cd537091f5 100644
--- a/tests/python/relay/test_op_level5.py
+++ b/tests/python/relay/test_op_level5.py
@@ -583,7 +583,7 @@ def test_threshold():
 
 @tvm.testing.uses_gpu
 def test_roi_align():
-    def verify_roi_align(data_shape, rois_shape, pooled_size, spatial_scale, sample_ratio):
+    def verify_roi_align(data_shape, rois_shape, pooled_size, spatial_scale, sample_ratio, mode):
         data = relay.var("data", relay.ty.TensorType(data_shape, "float32"))
         rois = relay.var("rois", relay.ty.TensorType(rois_shape, "float32"))
         z = relay.vision.roi_align(
@@ -592,6 +592,7 @@ def verify_roi_align(data_shape, rois_shape, pooled_size, spatial_scale, sample_
             pooled_size=(pooled_size, pooled_size),
             spatial_scale=spatial_scale,
             sample_ratio=sample_ratio,
+            mode=mode,
             layout="NCHW",
         )
         zz = run_infer_type(z)
@@ -612,6 +613,7 @@ def verify_roi_align(data_shape, rois_shape, pooled_size, spatial_scale, sample_
             pooled_size=pooled_size,
             spatial_scale=spatial_scale,
             sample_ratio=sample_ratio,
+            mode=mode,
         )
         for target, ctx in tvm.testing.enabled_targets():
             intrp1 = relay.create_executor("graph", ctx=ctx, target=target)
@@ -621,8 +623,18 @@ def verify_roi_align(data_shape, rois_shape, pooled_size, spatial_scale, sample_
             op_res2 = intrp2.evaluate(func)(np_data, np_rois)
             tvm.testing.assert_allclose(op_res2.asnumpy(), ref_res, rtol=1e-4)
 
-    verify_roi_align((1, 4, 16, 16), (32, 5), pooled_size=7, spatial_scale=1.0, sample_ratio=-1)
-    verify_roi_align((4, 4, 16, 16), (32, 5), pooled_size=7, spatial_scale=0.5, sample_ratio=2)
+    verify_roi_align(
+        (1, 4, 16, 16), (32, 5), pooled_size=7, spatial_scale=1.0, sample_ratio=-1, mode="avg"
+    )
+    verify_roi_align(
+        (4, 4, 16, 16), (32, 5), pooled_size=7, spatial_scale=0.5, sample_ratio=2, mode="avg"
+    )
+    verify_roi_align(
+        (1, 4, 16, 16), (32, 5), pooled_size=7, spatial_scale=1.0, sample_ratio=-1, mode="max"
+    )
+    verify_roi_align(
+        (4, 4, 16, 16), (32, 5), pooled_size=7, spatial_scale=0.5, sample_ratio=2, mode="max"
+    )
 
 
 @tvm.testing.uses_gpu
diff --git a/tests/python/topi/python/test_topi_vision.py b/tests/python/topi/python/test_topi_vision.py
index 697ef8a24f67..839356892ab1 100644
--- a/tests/python/topi/python/test_topi_vision.py
+++ b/tests/python/topi/python/test_topi_vision.py
@@ -418,7 +418,9 @@ def check_device(device):
         check_device(device)
 
 
-def verify_roi_align(batch, in_channel, in_size, num_roi, pooled_size, spatial_scale, sample_ratio):
+def verify_roi_align(
+    batch, in_channel, in_size, num_roi, pooled_size, spatial_scale, sample_ratio, mode
+):  # For mode, 0 = avg, 1 = max
     a_shape = (batch, in_channel, in_size, in_size)
     rois_shape = (num_roi, 5)
 
@@ -427,8 +429,8 @@ def verify_roi_align(batch, in_channel, in_size, num_roi, pooled_size, spatial_s
 
     @memoize("topi.tests.test_topi_vision.verify_roi_align")
     def get_ref_data():
-        a_np = np.random.uniform(size=a_shape).astype("float32")
-        rois_np = np.random.uniform(size=rois_shape).astype("float32") * in_size
+        a_np = np.random.uniform(-1, 1, size=a_shape).astype("float32")
+        rois_np = np.random.uniform(-1, 1, size=rois_shape).astype("float32") * in_size
         rois_np[:, 0] = np.random.randint(low=0, high=batch, size=num_roi)
         b_np = tvm.topi.testing.roi_align_nchw_python(
             a_np,
@@ -436,6 +438,7 @@ def get_ref_data():
             pooled_size=pooled_size,
             spatial_scale=spatial_scale,
             sample_ratio=sample_ratio,
+            mode=mode,
         )
 
         return a_np, rois_np, b_np
@@ -447,8 +450,6 @@ def check_device(device):
         if not tvm.testing.device_enabled(device):
             print("Skip because %s is not enabled" % device)
             return
-        print("Running on target: %s" % device)
-
         with tvm.target.Target(device):
             fcompute, fschedule = tvm.topi.testing.dispatch(device, _roi_align_implement)
             b = fcompute(
@@ -457,6 +458,7 @@ def check_device(device):
                 pooled_size=pooled_size,
                 spatial_scale=spatial_scale,
                 sample_ratio=sample_ratio,
+                mode=mode,
             )
             s = fschedule(b)
 
@@ -465,7 +467,8 @@ def check_device(device):
         tvm_b = tvm.nd.array(np.zeros(get_const_tuple(b.shape), dtype=b.dtype), ctx=ctx)
         f = tvm.build(s, [a, rois, b], device)
         f(tvm_a, tvm_rois, tvm_b)
-        tvm.testing.assert_allclose(tvm_b.asnumpy(), b_np, rtol=1e-3)
+        tvm_val = tvm_b.asnumpy()
+        tvm.testing.assert_allclose(tvm_val, b_np, rtol=1e-3, atol=1e-4)
 
     for device in ["llvm", "cuda", "opencl"]:
         check_device(device)
@@ -473,10 +476,14 @@ def check_device(device):
 
 @tvm.testing.uses_gpu
 def test_roi_align():
-    verify_roi_align(1, 16, 32, 64, 7, 1.0, -1)
-    verify_roi_align(4, 16, 32, 64, 7, 0.5, 2)
-    verify_roi_align(1, 32, 32, 80, 8, 0.0625, 2)
-    verify_roi_align(1, 32, 500, 80, 8, 0.0625, 2)
+    verify_roi_align(1, 16, 32, 64, 7, 1.0, -1, 0)
+    verify_roi_align(4, 16, 32, 64, 7, 0.5, 2, 0)
+    verify_roi_align(1, 32, 32, 80, 8, 0.0625, 2, 0)
+    verify_roi_align(1, 32, 500, 80, 8, 0.0625, 2, 0)
+    verify_roi_align(1, 16, 32, 64, 7, 1.0, -1, 1)
+    verify_roi_align(4, 16, 32, 64, 7, 0.5, 2, 1)
+    verify_roi_align(1, 32, 32, 80, 8, 0.0625, 2, 1)
+    verify_roi_align(1, 32, 500, 80, 8, 0.0625, 2, 1)
 
 
 def verify_roi_pool(batch, in_channel, in_size, num_roi, pooled_size, spatial_scale):

From e57e6448a66948d53d4be0c36aa21a7c4845c865 Mon Sep 17 00:00:00 2001
From: masahi <masahi129@gmail.com>
Date: Wed, 17 Feb 2021 13:31:25 +0900
Subject: [PATCH 201/357] [ROCM] Add Thrust support (#7458)

* enable rocm thrust, confrimed to work on sort and scan

* add rocm argsort strategy

* Abort if CXX is not hipcc

* add more strategy

* add missing import

* fix lint

* show supported data type in err msg

* try remove rocthrust

* add missing include for rocthrust

* more minor change

Co-authored-by: Masahiro Masuda <masahi@129@gmail.com>
---
 cmake/modules/ROCM.cmake             |  17 ++++
 python/tvm/relay/op/strategy/rocm.py |  92 +++++++++++++++++++
 python/tvm/topi/cuda/nms.py          |   3 +-
 python/tvm/topi/cuda/scan.py         |   5 +-
 src/runtime/contrib/thrust/thrust.cu |   1 +
 tests/python/contrib/test_thrust.py  | 127 +++++++++++++++------------
 6 files changed, 187 insertions(+), 58 deletions(-)

diff --git a/cmake/modules/ROCM.cmake b/cmake/modules/ROCM.cmake
index ec348f8b57f6..b908df2f869b 100644
--- a/cmake/modules/ROCM.cmake
+++ b/cmake/modules/ROCM.cmake
@@ -48,6 +48,23 @@ if(USE_ROCM)
     list(APPEND RUNTIME_SRCS ${ROCBLAS_CONTRIB_SRCS})
     list(APPEND TVM_RUNTIME_LINKER_LIBS ${ROCM_ROCBLAS_LIBRARY})
   endif(USE_ROCBLAS)
+
+  if(USE_THRUST)
+    message(STATUS "Build with rocThrust support")
+    # We need to override CXX to hipcc. This is required by rocthrust
+    if (${CMAKE_CXX_COMPILER} MATCHES "hipcc$")
+      message(STATUS "Using hipcc compiler to compile rocthrust code.")
+    else()
+      message(FATAL_ERROR "Set CXX=hipcc to compile rocthrust code.")
+    endif()
+
+    find_package(rocprim REQUIRED)
+    find_package(rocthrust REQUIRED)
+    set_source_files_properties(src/runtime/contrib/thrust/thrust.cu PROPERTIES LANGUAGE CXX)
+    list(APPEND RUNTIME_SRCS src/runtime/contrib/thrust/thrust.cu)
+    list(APPEND TVM_RUNTIME_LINKER_LIBS roc::rocthrust)
+  endif(USE_THRUST)
+
 else(USE_ROCM)
   list(APPEND COMPILER_SRCS src/target/opt/build_rocm_off.cc)
 endif(USE_ROCM)
diff --git a/python/tvm/relay/op/strategy/rocm.py b/python/tvm/relay/op/strategy/rocm.py
index c52da541a8ab..934f38625fd3 100644
--- a/python/tvm/relay/op/strategy/rocm.py
+++ b/python/tvm/relay/op/strategy/rocm.py
@@ -18,6 +18,8 @@
 # pylint: disable=invalid-name,unused-argument,unused-wildcard-import,wildcard-import
 from tvm import topi
 from tvm.auto_scheduler import is_auto_scheduler_enabled
+from tvm.te import SpecializedCondition
+from tvm._ffi import get_global_func
 from .generic import *
 from .. import op as _op
 from .cuda import judge_winograd, naive_schedule
@@ -219,3 +221,93 @@ def batch_matmul_strategy_rocm(attrs, inputs, out_type, target):
             plevel=12,
         )
     return strategy
+
+
+def can_use_thrust(target, func_name):
+    return (
+        target.kind.name == "rocm"
+        and "thrust" in target.libs
+        and get_global_func(func_name, allow_missing=True)
+    )
+
+
+@argsort_strategy.register(["rocm"])
+def argsort_strategy_cuda(attrs, inputs, out_type, target):
+    """argsort rocm strategy"""
+    strategy = _op.OpStrategy()
+    strategy.add_implementation(
+        wrap_compute_argsort(topi.cuda.argsort),
+        wrap_topi_schedule(topi.cuda.schedule_argsort),
+        name="argsort.rocm",
+    )
+    if can_use_thrust(target, "tvm.contrib.thrust.sort"):
+        strategy.add_implementation(
+            wrap_compute_argsort(topi.cuda.argsort_thrust),
+            wrap_topi_schedule(topi.cuda.schedule_argsort),
+            name="argsort_thrust.rocm",
+            plevel=15,
+        )
+    return strategy
+
+
+@scatter_strategy.register(["rocm"])
+def scatter_cuda(attrs, inputs, out_type, target):
+    """scatter rocm strategy"""
+    strategy = _op.OpStrategy()
+    strategy.add_implementation(
+        wrap_compute_scatter(topi.cuda.scatter),
+        wrap_topi_schedule(topi.cuda.schedule_scatter),
+        name="scatter.rocm",
+        plevel=10,
+    )
+
+    rank = len(inputs[0].shape)
+
+    with SpecializedCondition(rank == 1):
+        if can_use_thrust(target, "tvm.contrib.thrust.stable_sort_by_key"):
+            strategy.add_implementation(
+                wrap_compute_scatter(topi.cuda.scatter_via_sort),
+                wrap_topi_schedule(topi.cuda.schedule_scatter_via_sort),
+                name="scatter_via_sort.rocm",
+                plevel=9,  # use the sequential version by default
+            )
+    return strategy
+
+
+@sort_strategy.register(["rocm"])
+def sort_strategy_cuda(attrs, inputs, out_type, target):
+    """sort rocm strategy"""
+    strategy = _op.OpStrategy()
+    strategy.add_implementation(
+        wrap_compute_sort(topi.cuda.sort),
+        wrap_topi_schedule(topi.cuda.schedule_sort),
+        name="sort.rocm",
+    )
+    if can_use_thrust(target, "tvm.contrib.thrust.sort"):
+        strategy.add_implementation(
+            wrap_compute_sort(topi.cuda.sort_thrust),
+            wrap_topi_schedule(topi.cuda.schedule_sort),
+            name="sort_thrust.cuda",
+            plevel=15,
+        )
+    return strategy
+
+
+@topk_strategy.register(["rocm"])
+def topk_strategy_cuda(attrs, inputs, out_type, target):
+    """topk rocm strategy"""
+    strategy = _op.OpStrategy()
+    strategy.add_implementation(
+        wrap_compute_topk(topi.cuda.topk),
+        wrap_topi_schedule(topi.cuda.schedule_topk),
+        name="topk.rocm",
+    )
+
+    if can_use_thrust(target, "tvm.contrib.thrust.sort"):
+        strategy.add_implementation(
+            wrap_compute_topk(topi.cuda.topk_thrust),
+            wrap_topi_schedule(topi.cuda.schedule_topk),
+            name="topk_thrust.rocm",
+            plevel=15,
+        )
+    return strategy
diff --git a/python/tvm/topi/cuda/nms.py b/python/tvm/topi/cuda/nms.py
index 2d6e1e464ef8..98cb6750408a 100644
--- a/python/tvm/topi/cuda/nms.py
+++ b/python/tvm/topi/cuda/nms.py
@@ -610,7 +610,8 @@ def _get_sorted_indices(data, data_buf, score_index, score_shape):
     )
 
     target = tvm.target.Target.current()
-    if target and target.kind.name == "cuda" and is_thrust_available():
+    # TODO(masahi): Check -libs=thrust option
+    if target and target.kind.name in ["cuda", "rocm"] and is_thrust_available():
         sort_tensor = argsort_thrust(score_tensor, axis=1, is_ascend=False, dtype="int32")
     else:
         sort_tensor = argsort(score_tensor, axis=1, is_ascend=False, dtype="int32")
diff --git a/python/tvm/topi/cuda/scan.py b/python/tvm/topi/cuda/scan.py
index 0bdab100b429..65d23365dc15 100644
--- a/python/tvm/topi/cuda/scan.py
+++ b/python/tvm/topi/cuda/scan.py
@@ -221,7 +221,7 @@ def ir(data, data_ex_scan, reduction):
                 with ib.if_scope(scan_axis_size > 0):
                     reduction[tid] = binop(
                         data_ex_scan[tid * scan_axis_size + scan_axis_size - 1],
-                        data[tid, scan_axis_size - 1],
+                        data[tid * scan_axis_size + scan_axis_size - 1],
                     )
                 with ib.else_scope():
                     reduction[tid] = 0
@@ -352,7 +352,8 @@ def exclusive_scan(
 
     def do_scan(data, output_dtype):
         target = tvm.target.Target.current()
-        if target and target.kind.name == "cuda" and is_thrust_available():
+        # TODO(masahi): Check -libs=thrust option
+        if target and target.kind.name in ["cuda", "rocm"] and is_thrust_available():
             return scan_thrust(
                 data, output_dtype, exclusive=True, return_reduction=return_reduction, binop=binop
             )
diff --git a/src/runtime/contrib/thrust/thrust.cu b/src/runtime/contrib/thrust/thrust.cu
index 7295d4c47c3f..df83b57847a0 100644
--- a/src/runtime/contrib/thrust/thrust.cu
+++ b/src/runtime/contrib/thrust/thrust.cu
@@ -26,6 +26,7 @@
 #include <thrust/sort.h>
 #include <thrust/gather.h>
 #include <thrust/scan.h>
+#include <thrust/sequence.h>
 
 #include <tvm/runtime/registry.h>
 #include <dlpack/dlpack.h>
diff --git a/tests/python/contrib/test_thrust.py b/tests/python/contrib/test_thrust.py
index c5b6a29d57d5..521c20de6cbd 100644
--- a/tests/python/contrib/test_thrust.py
+++ b/tests/python/contrib/test_thrust.py
@@ -33,25 +33,30 @@ def test_stable_sort_by_key():
 
     keys_out, values_out = stable_sort_by_key_thrust(keys, values)
 
-    ctx = tvm.gpu(0)
-    target = "cuda"
-    s = te.create_schedule([keys_out.op, values_out.op])
-    f = tvm.build(s, [keys, values, keys_out, values_out], target)
-
-    keys_np = np.array([1, 4, 2, 8, 2, 7], np.int32)
-    values_np = np.random.randint(0, 10, size=(size,)).astype(np.int32)
-    keys_np_out = np.zeros(keys_np.shape, np.int32)
-    values_np_out = np.zeros(values_np.shape, np.int32)
-    keys_in = tvm.nd.array(keys_np, ctx)
-    values_in = tvm.nd.array(values_np, ctx)
-    keys_out = tvm.nd.array(keys_np_out, ctx)
-    values_out = tvm.nd.array(values_np_out, ctx)
-    f(keys_in, values_in, keys_out, values_out)
-
-    ref_keys_out = np.sort(keys_np)
-    ref_values_out = np.array([values_np[i] for i in np.argsort(keys_np)])
-    tvm.testing.assert_allclose(keys_out.asnumpy(), ref_keys_out, rtol=1e-5)
-    tvm.testing.assert_allclose(values_out.asnumpy(), ref_values_out, rtol=1e-5)
+    for target in ["cuda", "rocm"]:
+        if not tvm.testing.device_enabled(target):
+            print("Skip because %s is not enabled" % target)
+            continue
+
+        target += " -libs=thrust"
+        ctx = tvm.context(target, 0)
+        s = te.create_schedule([keys_out.op, values_out.op])
+        f = tvm.build(s, [keys, values, keys_out, values_out], target)
+
+        keys_np = np.array([1, 4, 2, 8, 2, 7], np.int32)
+        values_np = np.random.randint(0, 10, size=(size,)).astype(np.int32)
+        keys_np_out = np.zeros(keys_np.shape, np.int32)
+        values_np_out = np.zeros(values_np.shape, np.int32)
+        keys_in = tvm.nd.array(keys_np, ctx)
+        values_in = tvm.nd.array(values_np, ctx)
+        keys_out = tvm.nd.array(keys_np_out, ctx)
+        values_out = tvm.nd.array(values_np_out, ctx)
+        f(keys_in, values_in, keys_out, values_out)
+
+        ref_keys_out = np.sort(keys_np)
+        ref_values_out = np.array([values_np[i] for i in np.argsort(keys_np)])
+        tvm.testing.assert_allclose(keys_out.asnumpy(), ref_keys_out, rtol=1e-5)
+        tvm.testing.assert_allclose(values_out.asnumpy(), ref_values_out, rtol=1e-5)
 
 
 def test_exclusive_scan():
@@ -59,35 +64,41 @@ def test_exclusive_scan():
         print("skip because thrust is not enabled...")
         return
 
-    for ishape in [(10,), (10, 10), (10, 10, 10)]:
-        values = te.placeholder(ishape, name="values", dtype="int32")
+    for target in ["cuda", "rocm"]:
+        if not tvm.testing.device_enabled(target):
+            print("Skip because %s is not enabled" % target)
+            continue
 
-        with tvm.target.Target("cuda"):
-            scan, reduction = exclusive_scan(values, return_reduction=True)
-            s = schedule_scan([scan, reduction])
+        target += " -libs=thrust"
+        for ishape in [(10,), (10, 10), (10, 10, 10)]:
+            values = te.placeholder(ishape, name="values", dtype="int32")
 
-        ctx = tvm.gpu(0)
-        f = tvm.build(s, [values, scan, reduction], "cuda")
+            with tvm.target.Target(target):
+                scan, reduction = exclusive_scan(values, return_reduction=True)
+                s = schedule_scan([scan, reduction])
 
-        values_np = np.random.randint(0, 10, size=ishape).astype(np.int32)
-        values_np_out = np.zeros(values_np.shape, np.int32)
+            ctx = tvm.context(target, 0)
+            f = tvm.build(s, [values, scan, reduction], target)
 
-        if len(ishape) == 1:
-            reduction_shape = ()
-        else:
-            reduction_shape = ishape[:-1]
+            values_np = np.random.randint(0, 10, size=ishape).astype(np.int32)
+            values_np_out = np.zeros(values_np.shape, np.int32)
 
-        reduction_np_out = np.zeros(reduction_shape, np.int32)
+            if len(ishape) == 1:
+                reduction_shape = ()
+            else:
+                reduction_shape = ishape[:-1]
 
-        values_in = tvm.nd.array(values_np, ctx)
-        values_out = tvm.nd.array(values_np_out, ctx)
-        reduction_out = tvm.nd.array(reduction_np_out, ctx)
-        f(values_in, values_out, reduction_out)
+            reduction_np_out = np.zeros(reduction_shape, np.int32)
 
-        ref_values_out = np.cumsum(values_np, axis=-1, dtype="int32") - values_np
-        tvm.testing.assert_allclose(values_out.asnumpy(), ref_values_out, rtol=1e-5)
-        ref_reduction_out = np.sum(values_np, axis=-1)
-        tvm.testing.assert_allclose(reduction_out.asnumpy(), ref_reduction_out, rtol=1e-5)
+            values_in = tvm.nd.array(values_np, ctx)
+            values_out = tvm.nd.array(values_np_out, ctx)
+            reduction_out = tvm.nd.array(reduction_np_out, ctx)
+            f(values_in, values_out, reduction_out)
+
+            ref_values_out = np.cumsum(values_np, axis=-1, dtype="int32") - values_np
+            tvm.testing.assert_allclose(values_out.asnumpy(), ref_values_out, rtol=1e-5)
+            ref_reduction_out = np.sum(values_np, axis=-1)
+            tvm.testing.assert_allclose(reduction_out.asnumpy(), ref_reduction_out, rtol=1e-5)
 
 
 def test_inclusive_scan():
@@ -97,24 +108,30 @@ def test_inclusive_scan():
 
     out_dtype = "int64"
 
-    for ishape in [(10,), (10, 10)]:
-        values = te.placeholder(ishape, name="values", dtype="int32")
+    for target in ["cuda", "rocm"]:
+        if not tvm.testing.device_enabled(target):
+            print("Skip because %s is not enabled" % target)
+            continue
 
-        with tvm.target.Target("cuda"):
-            scan = scan_thrust(values, out_dtype, exclusive=False)
-            s = tvm.te.create_schedule([scan.op])
+        target += " -libs=thrust"
+        for ishape in [(10,), (10, 10)]:
+            values = te.placeholder(ishape, name="values", dtype="int32")
 
-        ctx = tvm.gpu(0)
-        f = tvm.build(s, [values, scan], "cuda")
+            with tvm.target.Target(target):
+                scan = scan_thrust(values, out_dtype, exclusive=False)
+                s = tvm.te.create_schedule([scan.op])
 
-        values_np = np.random.randint(0, 10, size=ishape).astype(np.int32)
-        values_np_out = np.zeros(values_np.shape, out_dtype)
-        values_in = tvm.nd.array(values_np, ctx)
-        values_out = tvm.nd.array(values_np_out, ctx)
-        f(values_in, values_out)
+            ctx = tvm.context(target, 0)
+            f = tvm.build(s, [values, scan], target)
 
-        ref_values_out = np.cumsum(values_np, axis=-1, dtype=out_dtype)
-        tvm.testing.assert_allclose(values_out.asnumpy(), ref_values_out, rtol=1e-5)
+            values_np = np.random.randint(0, 10, size=ishape).astype(np.int32)
+            values_np_out = np.zeros(values_np.shape, out_dtype)
+            values_in = tvm.nd.array(values_np, ctx)
+            values_out = tvm.nd.array(values_np_out, ctx)
+            f(values_in, values_out)
+
+            ref_values_out = np.cumsum(values_np, axis=-1, dtype=out_dtype)
+            tvm.testing.assert_allclose(values_out.asnumpy(), ref_values_out, rtol=1e-5)
 
 
 if __name__ == "__main__":

From 173f3fcc2199aa8c897275a5d6df48831d808e1a Mon Sep 17 00:00:00 2001
From: Ritwik Das <ritwikdas54@gmail.com>
Date: Tue, 16 Feb 2021 20:33:05 -0800
Subject: [PATCH 202/357] SparseFillEmptyRows Op (#7442)

* Initial Commit

* Fix formats

* Remove comments

* Black

* THreeops

* Add Frontend Code

* Add Default Value to feed dict

* Add Frontend Code

* New test Cases and new code to handle them

* Add Python Implementation'
'

* Remove stuff

* Remove unused imports

* Pylint

* Pylint

* PyLint Shape Func

* Make tests cpu only

* Add unsorted tests

* Add frontend code

* Row Major Sorting Only Test

* Handle Dynamic Shapes

* Add dynamic input shapes

* Dynamic Shape Tests

* Add documentation

* Dtypes

* PR Comments

* Added comments and changed naming

* Add comments

* Comments to Shape Func

* Documentation

* PR Changes

* PR Comments

* Resolve input and output dtype compat

Co-authored-by: Ubuntu <ubuntu@ip-172-31-42-251.us-east-2.compute.internal>
---
 python/tvm/relay/frontend/tensorflow.py       |  23 +++
 python/tvm/relay/op/_transform.py             |  63 ++++++-
 python/tvm/relay/op/strategy/generic.py       |  29 +++
 python/tvm/relay/op/transform.py              |  72 ++++++++
 python/tvm/topi/__init__.py                   |   1 +
 python/tvm/topi/generic/search.py             |   4 +
 python/tvm/topi/sparse_fill_empty_rows.py     | 109 ++++++++++++
 src/relay/op/tensor/transform.cc              |  44 +++++
 .../frontend/tensorflow/test_forward.py       | 103 +++++++++++
 .../relay/dyn/test_dynamic_op_level3.py       | 168 +++++++++++++++++-
 10 files changed, 612 insertions(+), 4 deletions(-)
 create mode 100644 python/tvm/topi/sparse_fill_empty_rows.py

diff --git a/python/tvm/relay/frontend/tensorflow.py b/python/tvm/relay/frontend/tensorflow.py
index ea1abc843c20..6a29ce266ea6 100644
--- a/python/tvm/relay/frontend/tensorflow.py
+++ b/python/tvm/relay/frontend/tensorflow.py
@@ -998,6 +998,28 @@ def _impl(inputs, attr, params, mod):
     return _impl
 
 
+def _sparse_fill_empty_rows():
+    def _impl(inputs, attr, params, mod):
+        assert len(inputs) == 4, "There should be 4 input tensors"
+        sparse_indices = inputs[0]
+        sparse_values = inputs[1]
+        sparse_indices_num_cols = _infer_shape(sparse_indices, mod)[1]
+        first_column = _op.split(sparse_indices, sparse_indices_num_cols, axis=1)[0]
+        sorted_indices = _op.argsort(_op.squeeze(first_column))
+        sorted_sparse_indices = _op.take(sparse_indices, sorted_indices, axis=0)
+        sorted_sparse_values = _op.take(sparse_values, sorted_indices, axis=0)
+        new_sparse_indices, new_sparse_values, empty_row_indicator = _op.sparse_fill_empty_rows(
+            sorted_sparse_indices, sorted_sparse_values, inputs[2], inputs[3]
+        )
+
+        return _expr.TupleWrapper(
+            _expr.Tuple([new_sparse_indices, new_sparse_values, empty_row_indicator]),
+            3,
+        )
+
+    return _impl
+
+
 def _identity():
     def _impl(inputs, attr, params, mod):
         return inputs[0]
@@ -2455,6 +2477,7 @@ def _impl(inputs, attr, params, mod):
     "SpaceToDepth": _space_to_depth(),
     "SparseToDense": _sparse_to_dense(),
     "SparseTensorDenseMatMul": _sparse_tensor_dense_matmul(),
+    "SparseFillEmptyRows": _sparse_fill_empty_rows(),
     "Split": _split(False),
     "SplitV": _split(True),
     "Sqrt": AttrCvt("sqrt"),
diff --git a/python/tvm/relay/op/_transform.py b/python/tvm/relay/op/_transform.py
index ba2416ff8950..01bcf4a6cf60 100644
--- a/python/tvm/relay/op/_transform.py
+++ b/python/tvm/relay/op/_transform.py
@@ -15,7 +15,9 @@
 # specific language governing permissions and limitations
 # under the License.
 """Backend compiler related feature registration"""
-# pylint: disable=invalid-name,unused-argument, len-as-condition, too-many-nested-blocks, too-many-local-variables, too-many-arguments
+# pylint: disable=invalid-name,unused-argument, len-as-condition, too-many-nested-blocks,
+# pylint: disable=too-many-local-variables, too-many-arguments, no-else-return
+
 from __future__ import absolute_import
 import tvm
 from tvm import te
@@ -94,6 +96,24 @@ def compute_scatter(attrs, inputs, output_type):
 
 _reg.register_strategy("scatter", strategy.scatter_strategy)
 
+# sparse_fill_empty_rows
+@_reg.register_compute("sparse_fill_empty_rows")
+def compute_sparse_fill_empty_rows(attrs, inputs, output_type):
+    """Compute definition of sparse_fill_empty_rows"""
+
+    return topi.sparse_fill_empty_rows(
+        inputs[0],
+        inputs[1],
+        inputs[2],
+        inputs[3],
+        output_type.fields[0].shape,
+        output_type.fields[1].shape,
+        output_type.fields[2].shape,
+    )
+
+
+_reg.register_strategy("sparse_fill_empty_rows", strategy.sparse_fill_empty_rows_strategy)
+
 # scatter_add
 @_reg.register_compute("scatter_add")
 def compute_scatter_add(attrs, inputs, output_type):
@@ -445,6 +465,47 @@ def argwhere_shape_func(attrs, inputs, out_ndims):
 _reg.register_shape_func("scatter_add", False, elemwise_shape_func)
 
 
+@script
+def _sparse_fill_empty_rows_shape_func(sparse_indices, dense_shape):
+
+    new_sparse_indices_shape = output_tensor((2,), "int64")
+    new_sparse_values_shape = output_tensor((1,), "int64")
+    empty_row_indicator_shape = output_tensor((1,), "int64")
+    num_dense_rows = int64(dense_shape[0])
+
+    if int64(sparse_indices.shape[0]) == int64(0):  # Handle Empty Case
+        #  Total rows will equal dense_shape[0]
+        new_sparse_indices_shape[0] = num_dense_rows
+        new_sparse_indices_shape[1] = int64(sparse_indices.shape[1])
+        new_sparse_values_shape[0] = num_dense_rows
+        empty_row_indicator_shape[0] = num_dense_rows
+        return (new_sparse_indices_shape, new_sparse_values_shape, empty_row_indicator_shape)
+
+    else:
+        count = int64(sparse_indices.shape[0])  # Add count of all rows already in sparse_indices
+        for i in range(1, int64(sparse_indices.shape[0])):
+            index = int64(sparse_indices[i, 0])
+            prev_index = int64(sparse_indices[i - 1, 0] + 1)
+
+            if index > prev_index:
+                count += index - prev_index  # Add count of all rows between two consecutive indices
+
+        count += int64(sparse_indices[0, 0])  # Add count from 0 to first row id in sparse_indices
+        count += int64(
+            num_dense_rows - 1 - sparse_indices[sparse_indices.shape[0] - 1, 0]
+        )  # Add count from last row id to dense_shape - 1
+        new_sparse_indices_shape[0] = int64(count)
+        new_sparse_indices_shape[1] = int64(sparse_indices.shape[1])
+        new_sparse_values_shape[0] = int64(count)
+        empty_row_indicator_shape[0] = num_dense_rows
+        return (new_sparse_indices_shape, new_sparse_values_shape, empty_row_indicator_shape)
+
+
+@_reg.register_shape_func("sparse_fill_empty_rows", True)
+def sparse_fill_empty_rows_func(attrs, inputs, _):
+    return _sparse_fill_empty_rows_shape_func(inputs[0], inputs[2])
+
+
 @script
 def _layout_transform_shape_func(
     data_shape, out_layout_len, dst_equal_list, dst_mul_list, dst_div_list, dst_mix_list
diff --git a/python/tvm/relay/op/strategy/generic.py b/python/tvm/relay/op/strategy/generic.py
index 2d69a2f6942e..e744b8c9da83 100644
--- a/python/tvm/relay/op/strategy/generic.py
+++ b/python/tvm/relay/op/strategy/generic.py
@@ -1070,6 +1070,35 @@ def roi_align_strategy(attrs, inputs, out_type, target):
     return strategy
 
 
+# sparse_fill_empty_rows
+@override_native_generic_func("sparse_fill_empty_rows_strategy")
+def sparse_fill_empty_rows_strategy(attrs, outs, out_type, target):
+    strategy = _op.OpStrategy()
+    strategy.add_implementation(
+        wrap_compute_sparse_fill_empty_rows(topi.sparse_fill_empty_rows),
+        wrap_topi_schedule(topi.generic.schedule_sparse_fill_empty_rows),
+        name="sparse_fill_empty_rows.generic",
+    )
+    return strategy
+
+
+def wrap_compute_sparse_fill_empty_rows(topi_compute):
+    """Wrap sparse_fill_empty_rows compute"""
+
+    def _compute_sparse_fill_empty_rows(attrs, inputs, output_type):
+        return topi_compute(
+            inputs[0],
+            inputs[1],
+            inputs[2],
+            inputs[3],
+            output_type.fields[0].shape,
+            output_type.fields[1].shape,
+            output_type.fields[2].shape,
+        )
+
+    return _compute_sparse_fill_empty_rows
+
+
 # roi_pool
 @generic_func
 def schedule_roi_pool(attrs, outs, target):
diff --git a/python/tvm/relay/op/transform.py b/python/tvm/relay/op/transform.py
index 1bd2b2d34060..b676fe742544 100644
--- a/python/tvm/relay/op/transform.py
+++ b/python/tvm/relay/op/transform.py
@@ -1338,6 +1338,78 @@ def adv_index(inputs):
     return _make.adv_index(Tuple(inputs))
 
 
+def sparse_fill_empty_rows(sparse_indices, sparse_values, dense_shape, default_value):
+    """
+    Fill rows in a sparse matrix that do no contain any values. Values are placed in the first
+    column of empty rows. The sparse array is in COO format.
+    It returns a TupleWrapper with 3 outputs
+    Parameters
+    ----------
+    sparse_indices : relay.Expr
+        A 2-D tensor[N, ndims] of integers containing location of sparse values, where N is
+        the number of sparse values and n_dim is the number of dimensions of the dense_shape.
+        The first column of this relay parameter must be sorted in ascending order.
+    sparse_values : relay.Expr
+        A 1-D tensor[N] containing the sparse values for the sparse indices.
+    dense_shape : relay.Expr
+        A 1-D tensor[ndims] which contains shape of the dense output tensor.
+    default_value : relay.Expr
+        A 1-D tensor[1] containing the default value for the remaining locations.
+    Returns
+    -------
+    new_sparse_indices : relay.Expr
+        A 2-D tensor[?, ndims] of integers containing location of new sparse
+        indices. The first column outputs must be sorted in ascending order.
+    new_sparse_values : relay.Expr
+        A 1-D tensor[?] containing the sparse values for the sparse indices.
+    empty_row_indicator : relay.Expr
+        A 1-D tensor[dense_shape[0]] filled with zeros and ones
+        indicating whether the particular row is empty or full respectively
+
+    Note
+    ----
+    This op exactly follows the documentation here:
+    https://www.tensorflow.org/api_docs/python/tf/sparse/fill_empty_rows
+    There are two exceptions:
+    1. Input Sparse Indices are expected to be in row-major order.
+    2. Empty Row Indicator has int64 output type with 1(for True) and 0(for False).
+
+    Examples
+    -------
+    .. code-block:: python
+        sparse_indices = [[0, 1],
+                         [0, 3],
+                         [2, 0],
+                         [3, 1]]
+        sparse_values = [1, 2, 3, 4]
+        default_value = [10]
+        dense_shape = [5, 6]
+        new_sparse_indices, empty_row_indicator, new_sparse_values, slice_element_index =
+                            relay.sparse_fill_empty_rows(
+                            sparse_indices,
+                            sparse_values,
+                            default_value,
+                            dense_shape)
+        new_sparse_indices = [[0, 1],
+                             [0, 3],
+                             [1, 0],
+                             [2, 0],
+                             [3, 1],
+                             [4, 0]]
+        empty_row_indicator = [False, True, False, False, True]
+        new_sparse_values = [1, 2, 10, 3, 4, 10]
+
+    """
+    new_sparse_indices, new_sparse_values, empty_row_indicator = TupleWrapper(
+        _make.sparse_fill_empty_rows(sparse_indices, sparse_values, dense_shape, default_value), 3
+    )
+    new_sparse_indices = cast_like(new_sparse_indices, sparse_indices)
+    new_sparse_values = cast_like(new_sparse_values, sparse_values)
+    empty_row_indicator = cast(empty_row_indicator, "bool")
+
+    return Tuple((new_sparse_indices, new_sparse_values, empty_row_indicator))
+
+
 def cumsum(data, axis=None, dtype=None, exclusive=None):
     """Numpy style cumsum op. Return the cumulative inclusive sum of the elements along
     a given axis.
diff --git a/python/tvm/topi/__init__.py b/python/tvm/topi/__init__.py
index 6836f04b5ada..2b17162048e0 100644
--- a/python/tvm/topi/__init__.py
+++ b/python/tvm/topi/__init__.py
@@ -38,6 +38,7 @@
 from .broadcast import *
 from .sort import *
 from .scatter import *
+from .sparse_fill_empty_rows import *
 from .scatter_add import *
 from .argwhere import *
 from .cumsum import *
diff --git a/python/tvm/topi/generic/search.py b/python/tvm/topi/generic/search.py
index b3c8772046fd..5924d35def73 100644
--- a/python/tvm/topi/generic/search.py
+++ b/python/tvm/topi/generic/search.py
@@ -66,3 +66,7 @@ def schedule_scatter_add(outs):
       The computation schedule for the op.
     """
     return _default_schedule(outs, False)
+
+
+def schedule_sparse_fill_empty_rows(outs):
+    return _default_schedule(outs, False)
diff --git a/python/tvm/topi/sparse_fill_empty_rows.py b/python/tvm/topi/sparse_fill_empty_rows.py
new file mode 100644
index 000000000000..10dc6ee3bfa3
--- /dev/null
+++ b/python/tvm/topi/sparse_fill_empty_rows.py
@@ -0,0 +1,109 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHnew_sparse_indices WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# pylint: disable=no-else-return, too-many-locals, too-many-arguments, too-many-branches
+# pylint: disable=undefined-variable, invalid-name
+"""SparseFillEmptyRows operator"""
+from ..te import hybrid
+
+
+@hybrid.script
+def _sparse_fill_empty_rows(
+    sparse_indices,
+    sparse_values,
+    dense_shape,
+    default_value,
+    new_sparse_indices_shape,
+    new_sparse_values_shape,
+    empty_row_indicator_shape,
+):
+    default_value_ = int64(default_value[0])
+    new_sparse_indices = output_tensor(new_sparse_indices_shape, "int64")
+    new_sparse_values = output_tensor(new_sparse_values_shape, "int64")
+    empty_row_indicator = output_tensor(empty_row_indicator_shape, "int64")
+    new_sparse_indices_row_id = 0
+
+    if int64(sparse_indices.shape[0]) == int64(0):  # Handle Empty Case
+        #  Fill all rows with default values
+        for i in range(0, new_sparse_indices_shape[0]):
+            new_sparse_indices[i, 0] = int64(i)
+            new_sparse_values[i] = default_value_
+            empty_row_indicator[i] = int64(1)
+            for k in range(1, int64(new_sparse_indices_shape[1])):
+                new_sparse_indices[i, k] = int64(0)
+
+        return (new_sparse_indices, new_sparse_values, empty_row_indicator)
+
+    else:
+        # Iterate through sparse_indices and add rows if/when required
+        for i in range(0, int64(sparse_indices.shape[0])):
+            if i == 0:
+                prev_row_id = int64(0)
+            else:
+                prev_row_id = int64(sparse_indices[i - 1, 0] + 1)
+            row_id = int64(sparse_indices[i, 0])
+
+            # Since input is in row-major order, add rows between prev_row_id and row_id
+            for j in range(prev_row_id, row_id):
+                new_sparse_indices[new_sparse_indices_row_id, 0] = int64(j)
+                for k in range(1, int64(new_sparse_indices_shape[1])):
+                    new_sparse_indices[new_sparse_indices_row_id, k] = int64(0)
+                empty_row_indicator[prev_row_id] = int64(1)
+                new_sparse_values[new_sparse_indices_row_id] = default_value_
+                new_sparse_indices_row_id += 1
+
+            # Add current element to output
+            new_sparse_indices[new_sparse_indices_row_id, 0] = row_id
+            for k in range(1, int64(new_sparse_indices_shape[1])):
+                new_sparse_indices[new_sparse_indices_row_id, k] = int64(sparse_indices[i, k])
+            new_sparse_values[new_sparse_indices_row_id] = int64(sparse_values[i])
+            empty_row_indicator[row_id] = int64(0)
+            new_sparse_indices_row_id += 1
+
+        # Add rows with default value if last row id of sparse_indices is not dense_shape[0] - 1
+        for i in range(
+            int64(sparse_indices[sparse_indices.shape[0] - 1, 0] + 1), int64(dense_shape[0])
+        ):
+
+            new_sparse_indices[new_sparse_indices_row_id, 0] = int64(i)
+            for k in range(1, int64(new_sparse_indices_shape[1])):
+                new_sparse_indices[new_sparse_indices_row_id, k] = int64(0)
+            empty_row_indicator[i] = int64(1)
+            new_sparse_values[new_sparse_indices_row_id] = default_value_
+            new_sparse_indices_row_id += 1
+
+        return (new_sparse_indices, new_sparse_values, empty_row_indicator)
+
+
+def sparse_fill_empty_rows(
+    sparse_indices,
+    sparse_values,
+    dense_shape,
+    default_value,
+    new_sparse_indices_shape,
+    new_sparse_values_shape,
+    empty_row_indicator_shape,
+):
+    return _sparse_fill_empty_rows(
+        sparse_indices,
+        sparse_values,
+        dense_shape,
+        default_value,
+        new_sparse_indices_shape,
+        new_sparse_values_shape,
+        empty_row_indicator_shape,
+    )
diff --git a/src/relay/op/tensor/transform.cc b/src/relay/op/tensor/transform.cc
index e1b1dddf340b..1e782a568fe9 100644
--- a/src/relay/op/tensor/transform.cc
+++ b/src/relay/op/tensor/transform.cc
@@ -1584,6 +1584,50 @@ RELAY_REGISTER_OP("repeat")
     .set_attr<FTVMCompute>("FTVMCompute", RepeatCompute)
     .set_attr<TOpPattern>("TOpPattern", kBroadcast);
 
+bool SparseFillEmptyRowsRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
+                            const TypeReporter& reporter) {
+  // types: [sparse_indices, sparse_values, dense_shape, default_value, result]
+  ICHECK_EQ(types.size(), 5) << "SparseFillEmptyRowsRel expects 5 inputs but " << types.size()
+                             << "provided";
+  std::vector<Type> fields;
+  auto sparse_indices = types[0].as<TensorTypeNode>();
+  auto ndims = sparse_indices->shape[1];
+  fields.push_back(TensorType(Array<PrimExpr>{Any(), ndims}, tvm::DataType::Int(64)));
+  fields.push_back(TensorType(Array<PrimExpr>{Any()}, tvm::DataType::Int(64)));
+  fields.push_back(TensorType(Array<PrimExpr>{Any()}, tvm::DataType::Int(64)));
+  reporter->Assign(types[types.size() - 1], TupleType(Array<Type>(fields)));
+  return true;
+}
+
+Expr MakeSparseFillEmptyRows(Expr sparse_indices, Expr sparse_values, Expr dense_shape,
+                             Expr default_value) {
+  static const Op& op = Op::Get("sparse_fill_empty_rows");
+  return Call(op, {sparse_indices, sparse_values, dense_shape, default_value}, Attrs(), {});
+}
+
+TVM_REGISTER_GLOBAL("relay.op._make.sparse_fill_empty_rows")
+    .set_body_typed(MakeSparseFillEmptyRows);
+
+RELAY_REGISTER_OP("sparse_fill_empty_rows")
+    .describe(
+        R"code(Fill empty rows of a sparse tensor with a default value.)code" TVM_ADD_FILELINE)
+    .set_num_inputs(4)
+    .add_argument("sparse_indices", "Tensor",
+                  "A 2-D int64 tensor of shape [N, ndims], which specifies the indices of the"
+                  "elements in the sparse tensor that contain nonzero values. COO Format")
+    .add_argument(
+        "sparse_values", "Tensor",
+        "A 1-D tensor[N] which supplies the values for each element in indices. COO Format")
+    .add_argument("dense_shape", "Tensor",
+                  "A 1-D int64 tensor of shape [ndims], which specifies the dense_shape of the"
+                  "sparse tensor. Takes a list indicating the number of elements in each "
+                  "dimension")
+    .add_argument("default_value", "Tensor",
+                  "The value to fill for empty rows, with the same type as sparse_values")
+    .add_type_rel("sparse_fill_empty_rows", SparseFillEmptyRowsRel)
+    .set_support_level(3)
+    .set_attr<TOpPattern>("TOpPattern", kOpaque);
+
 // meshgrid operator
 TVM_REGISTER_NODE_TYPE(MeshgridAttrs);
 
diff --git a/tests/python/frontend/tensorflow/test_forward.py b/tests/python/frontend/tensorflow/test_forward.py
index fd4b9f49e6a4..f956ea02eb47 100644
--- a/tests/python/frontend/tensorflow/test_forward.py
+++ b/tests/python/frontend/tensorflow/test_forward.py
@@ -1812,6 +1812,109 @@ def test_forward_sparse_dense_matmul():
     )
 
 
+#######################################################################
+# SparseFillEmptyRows
+# ------------
+
+
+def _test_sparse_fill_empty_rows(indices_np, values_np, dense_shape_np, default_value_int, use_dyn):
+    with tf.Graph().as_default():
+        if use_dyn:
+            indices = tf.placeholder(shape=(None, None), dtype=indices_np.dtype, name="indices")
+            values = tf.placeholder(shape=(None), dtype=values_np.dtype, name="values")
+            dense_shape = tf.placeholder(
+                shape=(None), dtype=dense_shape_np.dtype, name="dense_shape"
+            )
+        else:
+            indices = tf.placeholder(shape=indices_np.shape, dtype=indices_np.dtype, name="indices")
+            values = tf.placeholder(shape=values_np.shape, dtype=values_np.dtype, name="values")
+            dense_shape = tf.placeholder(
+                shape=dense_shape_np.shape, dtype=dense_shape_np.dtype, name="dense_shape"
+            )
+
+        default_value = tf.placeholder(shape=(), dtype=values_np.dtype, name="default_value")
+        sp_input = tf.sparse.SparseTensor(indices=indices, values=values, dense_shape=dense_shape)
+        _ = tf.sparse.fill_empty_rows(sp_input, default_value, name="sparse_fill_empty_rows")
+        compare_tf_with_tvm(
+            [indices_np, values_np, dense_shape_np, default_value_int],
+            [indices.name, values.name, dense_shape.name, default_value.name],
+            [
+                "sparse_fill_empty_rows/SparseFillEmptyRows:0",
+                "sparse_fill_empty_rows/SparseFillEmptyRows:1",
+                "sparse_fill_empty_rows/SparseFillEmptyRows:2",
+            ],
+            mode="vm",
+        )
+
+
+@pytest.mark.parametrize(
+    "sparse_indices_np, sparse_values_np, dense_shape_np, default_value_int",
+    [
+        (
+            np.array([[1, 1], [0, 3], [0, 1], [2, 0], [3, 1]], dtype=np.int64),
+            np.array([1, 2, 3, 4, 5], dtype=np.int64),
+            np.array([5, 6], dtype=np.int64),
+            10,
+        ),
+        (
+            np.array([[1, 1], [0, 3], [2, 0], [3, 1]], dtype=np.int64),
+            np.array([1, 2, 3, 4], dtype=np.int64),
+            np.array([5, 6], dtype=np.int64),
+            10,
+        ),
+        (
+            np.array([[0, 1], [0, 3], [2, 0], [3, 1]], dtype=np.int64),
+            np.array([1, 2, 3, 4], dtype=np.int64),
+            np.array([5, 6], dtype=np.int64),
+            10,
+        ),
+        (
+            np.array([[1, 1, 1], [1, 3, 1], [2, 0, 5], [3, 1, 6]], dtype=np.int64),
+            np.array([1, 2, 3, 4], dtype=np.int64),
+            np.array([7, 7, 7], dtype=np.int64),
+            5,
+        ),
+        (
+            np.array([[1], [2]], dtype=np.int64),
+            np.array([7, 8], dtype=np.int64),
+            np.array([5], dtype=np.int64),
+            4,
+        ),
+        (
+            np.ones((0, 1), dtype=np.int64),
+            np.array([], dtype=np.int64),
+            np.array([5], dtype=np.int64),
+            4,
+        ),
+        (
+            np.ones((0, 3), dtype=np.int64),
+            np.array([], dtype=np.int64),
+            np.array([9, 3, 7], dtype=np.int64),
+            100,
+        ),
+    ],
+)
+@pytest.mark.parametrize("use_dyn", [True, False])
+def test_forward_sparse_fill_empty_rows(
+    sparse_indices_np, sparse_values_np, dense_shape_np, default_value_int, use_dyn
+):
+    """ sparse_fill_empty_rows op test"""
+    ###################################################################
+    #
+    # In order to create a SparseTensor, it requires 3 input as below:
+    #    SparseTensor(indices=[[0, 0], [1, 2]], values=[1, 2], dense_shape=[3, 4])
+    #
+    # Above Sparse can be represented in Dense as below :
+    #    [[1, 0, 0, 0]
+    #     [0, 0, 2, 0]
+    #     [0, 0, 0, 0]]
+    #
+    # ------------------------------------------------------------------
+    _test_sparse_fill_empty_rows(
+        sparse_indices_np, sparse_values_np, dense_shape_np, default_value_int, use_dyn
+    )
+
+
 #######################################################################
 # StridedSlice
 # ------------
diff --git a/tests/python/relay/dyn/test_dynamic_op_level3.py b/tests/python/relay/dyn/test_dynamic_op_level3.py
index dd73b9a96a52..d5f81e84e39d 100644
--- a/tests/python/relay/dyn/test_dynamic_op_level3.py
+++ b/tests/python/relay/dyn/test_dynamic_op_level3.py
@@ -26,14 +26,21 @@
 import tvm.testing
 
 
-def verify_func(func, data, ref_res):
+def verify_func(func, data, ref_res, target_ctx=tvm.testing.enabled_targets()):
     assert isinstance(data, list)
-    for target, ctx in tvm.testing.enabled_targets():
+    for target, ctx in target_ctx:
         for kind in ["vm", "debug"]:
             mod = tvm.ir.IRModule.from_expr(func)
             intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target)
             op_res = intrp.evaluate()(*data)
-            tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5)
+            if isinstance(op_res, tvm.runtime.container.ADT):
+                assert len(op_res) == len(
+                    ref_res
+                ), "Outputs from TVM and Python implementation must be equal "
+                for op_result, ref_result in zip(op_res, ref_res):
+                    tvm.testing.assert_allclose(op_result.asnumpy(), ref_result, rtol=1e-5)
+            else:
+                tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5)
             relay.backend.compile_engine.get().clear()
 
 
@@ -202,5 +209,160 @@ def verify_sparse_to_dense(sparse_indices, sparse_values, default_value, output_
     verify_sparse_to_dense(1, 3, None, [5], [0, 3, 0, 0, 0])  # default value not specified
 
 
+@pytest.mark.parametrize(
+    "sparse_indices, sparse_values, dense_shape, default_value",
+    [
+        (
+            np.array([[0, 1], [0, 3], [2, 0], [3, 1]], dtype=np.int64),
+            np.array([1, 2, 3, 4], dtype=np.int64),
+            np.array([5, 6], dtype=np.int64),
+            np.array([10], dtype=np.int64),
+        ),
+        (
+            np.array([[1, 1, 1], [1, 3, 1], [2, 0, 5], [3, 1, 6]], dtype=np.int64),
+            np.array([1, 2, 3, 4], dtype=np.int64),
+            np.array([7, 7, 7], dtype=np.int64),
+            np.array([5], dtype=np.int64),
+        ),
+        (
+            np.array([[1], [2]], dtype=np.int64),
+            np.array([7, 8], dtype=np.int64),
+            np.array([5], dtype=np.int64),
+            np.array([4], dtype=np.int64),
+        ),
+        (
+            np.ones((0, 1), dtype=np.int64),
+            np.array([], dtype=np.int64),
+            np.array([5], dtype=np.int64),
+            np.array([4], dtype=np.int64),
+        ),
+        (
+            np.ones((0, 3), dtype=np.int64),
+            np.array([], dtype=np.int64),
+            np.array([9, 3, 7], dtype=np.int64),
+            np.array([100], dtype=np.int64),
+        ),
+    ],
+)
+@pytest.mark.parametrize("dtype", [np.int64, np.int32])
+@pytest.mark.parametrize("use_dyn", [True, False])
+def test_sparse_fill_empty_rows(
+    sparse_indices, sparse_values, dense_shape, default_value, dtype, use_dyn
+):
+    def ref_sparse_fill_empty_rows(
+        sparse_indices: np.ndarray,
+        sparse_values: np.ndarray,
+        dense_shape: np.ndarray,
+        default_value: np.ndarray,
+    ) -> None:
+        """
+        This function calculates the expected output of sparse_fill_empty_rows operator given the
+        inputs.
+        """
+
+        def check_add_rows(current_idx, limit_idx):
+            while current_idx < limit_idx:
+                new_sparse_indices.append([current_idx] + [0] * (num_cols - 1))
+                new_sparse_values.append(default_value[0])
+                empty_row_indicator[current_idx] = True
+                current_idx += 1
+
+            return current_idx
+
+        current_idx = 0
+        new_sparse_indices = []
+        new_sparse_values = []
+        empty_row_indicator = [False for _ in range(dense_shape[0])]
+        num_cols = sparse_indices.shape[1]
+        for sparse_row, sparse_value in zip(sparse_indices, sparse_values):
+            limit_idx = sparse_row[0]
+            current_idx = check_add_rows(current_idx, limit_idx)
+            new_sparse_indices.append(list(sparse_row))
+            new_sparse_values.append(sparse_value)
+            current_idx = limit_idx + 1
+
+        check_add_rows(current_idx, dense_shape[0])
+        return new_sparse_indices, new_sparse_values, empty_row_indicator
+
+    def verify_sparse_fill_empty_rows(
+        sparse_indices_np: np.ndarray,
+        sparse_values_np: np.ndarray,
+        dense_shape_np: np.ndarray,
+        default_value_np: np.ndarray,
+    ) -> None:
+        """
+        This function verifies the relay output of sparse_fill_empty_rows with its expected output.
+        """
+        if use_dyn:
+            sparse_indices = relay.var(
+                "sparse_indices",
+                shape=[relay.Any(), relay.Any()],
+                dtype=str(sparse_indices_np.dtype),
+            )
+            sparse_values = relay.var(
+                "sparse_values",
+                shape=[relay.Any()],
+                dtype=str(sparse_values_np.dtype),
+            )
+            dense_shape = relay.var(
+                "dense_shape",
+                shape=[relay.Any()],
+                dtype=str(dense_shape_np.dtype),
+            )
+            default_value = relay.var(
+                "default_value",
+                shape=[relay.Any()],
+                dtype=str(default_value_np.dtype),
+            )
+        else:
+            sparse_indices = relay.var(
+                "sparse_indices",
+                relay.TensorType(sparse_indices_np.shape, str(sparse_indices_np.dtype)),
+            )
+            sparse_values = relay.var(
+                "sparse_values",
+                relay.TensorType(sparse_values_np.shape, str(sparse_values_np.dtype)),
+            )
+            dense_shape = relay.var(
+                "dense_shape",
+                relay.TensorType(dense_shape_np.shape, str(dense_shape_np.dtype)),
+            )
+            default_value = relay.var(
+                "default_value",
+                relay.TensorType(default_value_np.shape, str(default_value_np.dtype)),
+            )
+        z = relay.sparse_fill_empty_rows(sparse_indices, sparse_values, dense_shape, default_value)
+        func = relay.Function([sparse_indices, sparse_values, dense_shape, default_value], z)
+        ref_res = ref_sparse_fill_empty_rows(
+            sparse_indices_np,
+            sparse_values_np,
+            dense_shape_np,
+            default_value_np,
+        )
+        (
+            new_sparse_indices_infer_type,
+            new_sparse_values_infer_type,
+            empty_row_indicator_infer_type,
+        ) = run_infer_type(z)
+
+        assert new_sparse_indices_infer_type.checked_type.dtype == sparse_indices_np.dtype
+        assert new_sparse_values_infer_type.checked_type.dtype == sparse_indices_np.dtype
+        assert empty_row_indicator_infer_type.checked_type.dtype == "bool"
+
+        verify_func(
+            func,
+            [sparse_indices_np, sparse_values_np, dense_shape_np, default_value_np],
+            ref_res,
+            [("llvm", tvm.cpu())],
+        )
+
+    verify_sparse_fill_empty_rows(
+        sparse_indices.astype(dtype),
+        sparse_values.astype(dtype),
+        dense_shape.astype(dtype),
+        default_value.astype(dtype),
+    )
+
+
 if __name__ == "__main__":
     pytest.main([__file__])

From 77d2fe8a7f239b3a748319f2b14facd2c3b3dccf Mon Sep 17 00:00:00 2001
From: "Steven S. Lyubomirsky" <sslyu@cs.washington.edu>
Date: Wed, 17 Feb 2021 01:24:56 -0500
Subject: [PATCH 203/357] [Bugfix][Relay] Crash in match_exhaustion.cc when
 given an empty tuple pattern or constructor with no args (#7459)

* [match_exhaustion] Fix cartesian product to handle empty tuple patterns or constructors with no args

* Test cases do not actually exhibit the fixed bug

* Mistake in comment
---
 src/relay/analysis/match_exhaustion.cc        | 15 ++++--
 .../python/relay/test_pass_unmatched_cases.py | 46 +++++++++++++++++++
 2 files changed, 56 insertions(+), 5 deletions(-)

diff --git a/src/relay/analysis/match_exhaustion.cc b/src/relay/analysis/match_exhaustion.cc
index bb6e8f14ca09..2a90b911b676 100644
--- a/src/relay/analysis/match_exhaustion.cc
+++ b/src/relay/analysis/match_exhaustion.cc
@@ -124,9 +124,14 @@ class CandidateChecker : public PatternFunctor<MatchResult(const Pattern&, const
   }
 };
 
-// Returns list of arrays corresponding to Cartesian product of input list
+// Returns list of arrays corresponding to Cartesian product of input list.
+// Note: CartesianProduct({}) = {{}}
 Array<Array<Pattern>> CartesianProduct(Array<Array<Pattern>> fields) {
-  ICHECK_NE(fields.size(), 0);
+  // the only combination of 0 fields is 0 fields
+  if (fields.size() == 0) {
+    return {{}};
+  }
+
   Array<Pattern> field_vals = fields[fields.size() - 1];
   Array<Array<Pattern>> ret;
 
@@ -197,7 +202,7 @@ Array<Pattern> ExpandWildcardsConstructor(const PatternConstructor& clause_ctor,
 
   auto ctor_cand = Downcast<PatternConstructor>(cand);
 
-  // for constructors, we will expand the wildcards in any field that is an ADT.
+  // expand all fields' wildcards
   Array<Array<Pattern>> values_by_field;
   for (size_t i = 0; i < ctor_cand->constructor->inputs.size(); i++) {
     values_by_field.push_back(
@@ -217,7 +222,7 @@ Array<Pattern> ExpandWildcardsConstructor(const PatternConstructor& clause_ctor,
 // Returns a list of all possible expansions.
 Array<Pattern> ExpandWildcardsTuple(const PatternTuple& clause_tuple, const Pattern& cand,
                                     const IRModule& mod) {
-  // for a wildcard node, create constructor nodes with wildcards for all args.
+  // for a wildcard node, create tuple with wildcards for all args.
   if (cand.as<PatternWildcardNode>()) {
     Array<Pattern> args;
     for (auto inp : clause_tuple->patterns) {
@@ -228,7 +233,7 @@ Array<Pattern> ExpandWildcardsTuple(const PatternTuple& clause_tuple, const Patt
 
   auto tuple_cand = Downcast<PatternTuple>(cand);
 
-  // for constructors, we will expand the wildcards in any field that is an ADT.
+  // expand all members' patterns
   Array<Array<Pattern>> values_by_field;
   for (size_t i = 0; i < tuple_cand->patterns.size(); i++) {
     values_by_field.push_back(
diff --git a/tests/python/relay/test_pass_unmatched_cases.py b/tests/python/relay/test_pass_unmatched_cases.py
index c6b4deb0b2c2..255cecf76f2e 100644
--- a/tests/python/relay/test_pass_unmatched_cases.py
+++ b/tests/python/relay/test_pass_unmatched_cases.py
@@ -420,5 +420,51 @@ def @shallow_opt[A](%a: Arith[A]) -> Arith[A] {
     # fromtext parse the module, then checked it (which include strictness checking).
 
 
+def test_expanding_ctor_with_no_args():
+    code = """
+#[version = "0.0.5"]
+type List[A] {
+    Cons(A, List[A]),
+    Nil,
+}
+
+def @expand_on_nil_match(%a: List[(List[()],)]) -> int {
+    match (%a) {
+        Cons((Nil), Nil) => 1,
+        _ => 2,
+    }
+}
+"""
+    # exhausion checks:
+    # * hits Cons((Nil), Nil), expands to Cons(*, *), Nil()
+    # Nil() fails Cons((Nil), Nil), passes _
+    # Cons(*, *) hits Cons((Nil), Nil), expands to Cons((*), Cons(*, *)), Cons((*), Nil())
+    # Cons((*), Cons(*, *)) fails Cons((Nil), Nil), passes _
+    # Cons((*), Nil()) hits Cons((Nil), Nil), expands to Cons((Nil), Nil), Cons((Cons(*, *)), Nil)
+    # Cons((Nil), Nil) passes the first pattern
+    # Cons((Cons(*, *)), Nil) fails the first pattern, passes _
+    # Note Nil() is passed to ExpandWildcardsConstructor many times in the above!
+    tvm.parser.fromtext(code)
+
+
+def test_expanding_empty_tuple():
+    # same principle as above, but with empty tuple
+    code = """
+#[version = "0.0.5"]
+type List[A] {
+    Cons(A, List[A]),
+    Nil,
+}
+
+def @expand_on_empty_tuple_match(%a: (List[()], ())) -> int {
+    match (%a) {
+        (Cons((), Nil), ()) => 1,
+        _ => 2,
+    }
+}
+"""
+    tvm.parser.fromtext(code)
+
+
 if __name__ == "__main__":
     pytest.main([__file__])

From fe398bf206d01b54a2d74603e6bc9c012d63b2c9 Mon Sep 17 00:00:00 2001
From: Andrew Reusch <areusch@octoml.ai>
Date: Wed, 17 Feb 2021 00:16:26 -0800
Subject: [PATCH 204/357] Report JUnit test results for all TVM Python tests
 (#7450)

* Enable JUnit parsing for Python tests

* retrigger CI

* prefix junit results with FFI type

* remove - in junit prefix
---
 Jenkinsfile                       | 8 ++++++++
 tests/scripts/setup-pytest-env.sh | 1 +
 2 files changed, 9 insertions(+)

diff --git a/Jenkinsfile b/Jenkinsfile
index 6bf6dcfa966a..bba3950aea87 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -188,6 +188,7 @@ stage('Build') {
           sh "${docker_run} ${ci_cpu} ./tests/scripts/task_python_vta_tsim.sh"
           // sh "${docker_run} ${ci_cpu} ./tests/scripts/task_golang.sh"
           sh "${docker_run} ${ci_cpu} ./tests/scripts/task_rust.sh"
+          junit "build/pytest-results/*.xml"
         }
       }
     }
@@ -234,6 +235,7 @@ stage('Build') {
         timeout(time: max_time, unit: 'MINUTES') {
           sh "${docker_run} ${ci_qemu} ./tests/scripts/task_ci_setup.sh"
           sh "${docker_run} ${ci_qemu} ./tests/scripts/task_python_microtvm.sh"
+          junit "build/pytest-results/*.xml"
         }
       }
     }
@@ -251,6 +253,7 @@ stage('Unit Test') {
           sh "${docker_run} ${ci_gpu} ./tests/scripts/task_sphinx_precheck.sh"
           sh "${docker_run} ${ci_gpu} ./tests/scripts/task_python_unittest_gpuonly.sh"
           sh "${docker_run} ${ci_gpu} ./tests/scripts/task_python_integration_gpuonly.sh"
+          junit "build/pytest-results/*.xml"
         }
       }
     }
@@ -265,6 +268,7 @@ stage('Unit Test') {
           sh "${docker_run} ${ci_i386} ./tests/scripts/task_python_unittest.sh"
           sh "${docker_run} ${ci_i386} ./tests/scripts/task_python_integration.sh"
           sh "${docker_run} ${ci_i386} ./tests/scripts/task_python_vta_fsim.sh"
+          junit "build/pytest-results/*.xml"
         }
       }
     }
@@ -277,6 +281,7 @@ stage('Unit Test') {
         timeout(time: max_time, unit: 'MINUTES') {
           sh "${docker_run} ${ci_arm} ./tests/scripts/task_ci_setup.sh"
           sh "${docker_run} ${ci_arm} ./tests/scripts/task_python_unittest.sh"
+          junit "build/pytest-results/*.xml"
           // sh "${docker_run} ${ci_arm} ./tests/scripts/task_python_integration.sh"
         }
       }
@@ -305,6 +310,7 @@ stage('Integration Test') {
         timeout(time: max_time, unit: 'MINUTES') {
           sh "${docker_run} ${ci_gpu} ./tests/scripts/task_ci_setup.sh"
           sh "${docker_run} ${ci_gpu} ./tests/scripts/task_python_topi.sh"
+          junit "build/pytest-results/*.xml"
         }
       }
     }
@@ -317,6 +323,7 @@ stage('Integration Test') {
         timeout(time: max_time, unit: 'MINUTES') {
           sh "${docker_run} ${ci_gpu} ./tests/scripts/task_ci_setup.sh"
           sh "${docker_run} ${ci_gpu} ./tests/scripts/task_python_frontend.sh"
+          junit "build/pytest-results/*.xml"
         }
       }
     }
@@ -329,6 +336,7 @@ stage('Integration Test') {
         timeout(time: max_time, unit: 'MINUTES') {
           sh "${docker_run} ${ci_cpu} ./tests/scripts/task_ci_setup.sh"
           sh "${docker_run} ${ci_cpu} ./tests/scripts/task_python_frontend_cpu.sh"
+          junit "build/pytest-results/*.xml"
         }
       }
     }
diff --git a/tests/scripts/setup-pytest-env.sh b/tests/scripts/setup-pytest-env.sh
index 5d2216c9dc87..bcd27a16f659 100755
--- a/tests/scripts/setup-pytest-env.sh
+++ b/tests/scripts/setup-pytest-env.sh
@@ -45,5 +45,6 @@ function run_pytest() {
     TVM_FFI=${ffi_type} python3 -m pytest \
            -o "junit_suite_name=${test_suite_name}-${ffi_type}" \
            "--junit-xml=${TVM_PYTEST_RESULT_DIR}/${test_suite_name}-${ffi_type}.xml" \
+           "--junit-prefix=${ffi_type}" \
            "$@"
 }

From 76647caf702f44f94890b5f5a159be0c7fa2baaa Mon Sep 17 00:00:00 2001
From: tristan-arm <tristan.oconnor@arm.com>
Date: Wed, 17 Feb 2021 21:50:56 +0000
Subject: [PATCH 205/357] [ETHOSN] Add support for default Ethos-N78
 configuration. (#6982)

Note: 'ETHOSN_VARIANT_CONFIG' must be set to test against Ethos-N78 and this adds support for one configuration of Ethos-N78 in TVM.
---
 .../backend/contrib/ethosn/capabilities.h     | 24 ++++++++++++++-----
 .../backend/contrib/ethosn/codegen_ethosn.h   |  4 +++-
 .../contrib/test_ethosn/infrastructure.py     |  9 ++++++-
 .../contrib/test_ethosn/test_networks.py      |  8 +++++++
 tests/scripts/task_python_ethosn_tests.sh     |  4 ++++
 tests/scripts/task_python_integration.sh      |  3 +++
 6 files changed, 44 insertions(+), 8 deletions(-)

diff --git a/src/relay/backend/contrib/ethosn/capabilities.h b/src/relay/backend/contrib/ethosn/capabilities.h
index 77b2d911d38f..8c7ee6a0d009 100644
--- a/src/relay/backend/contrib/ethosn/capabilities.h
+++ b/src/relay/backend/contrib/ethosn/capabilities.h
@@ -20,7 +20,8 @@
 /*!
  * \file src/relay/backend/contrib/ethosn/capabilities.h
  * \brief The Ethos-N processor series has four variants, the Ethos-N37, Ethos-N57, Ethos-N77
- * and the Ethos-N78. This release of the integration supports the first three variants.
+ * and the Ethos-N78. This release of the integration supports the first three variants and
+ * the default configuration of the fourth variant.
  * Configuration information for each variant is stored as a blob in this file. These blobs
  * are passed into the Ethos-N support library, which in turn uses them to optimize the
  * generated command-stream appropriately for the specified variant.
@@ -38,13 +39,14 @@ namespace relay {
 namespace contrib {
 namespace ethosn {
 
-/* Ethos-N variants (N77, N57 and N37)
- * variant[0] - N77
- * variant[1] - N57
- * variant[2] - N37
+/* Ethos-N variants (Ethos-N77, Ethos-N57, Ethos-N37 and Ethos-N78)
+ * variant[0] - Ethos-N77
+ * variant[1] - Ethos-N57
+ * variant[2] - Ethos-N37
+ * variant[3] - Ethos-N78
  */
 #if _ETHOSN_API_VERSION_ == 2008
-static std::vector<char> variants[3] = {
+static std::vector<char> variants[4] = {
     {
         0x03, 0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
         0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, 0x00, 0x10, 0x00,
@@ -74,6 +76,16 @@ static std::vector<char> variants[3] = {
         0x00, 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00,
         0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x01,
         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    },
+    {
+        0x03, 0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+        0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x07, 0x00, 0x02, 0x00,
+        0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00,
+        0x10, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00,
+        0x01, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00,
+        0x00, 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00,
+        0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x01,
+        0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
     }};
 #else
 static std::vector<char> variants[3] = {
diff --git a/src/relay/backend/contrib/ethosn/codegen_ethosn.h b/src/relay/backend/contrib/ethosn/codegen_ethosn.h
index 4b3e1bc05367..9887a2b3ad78 100644
--- a/src/relay/backend/contrib/ethosn/codegen_ethosn.h
+++ b/src/relay/backend/contrib/ethosn/codegen_ethosn.h
@@ -251,7 +251,9 @@ struct EthosnCompilerConfigNode : public tvm::AttrsNode<EthosnCompilerConfigNode
 
   TVM_DECLARE_ATTRS(EthosnCompilerConfigNode, "ext.attrs.EthosnCompilerConfigNode") {
     TVM_ATTR_FIELD(variant)
-        .describe("0 for Ethos-N77, 1 for Ethos-N57, 2 for Ethos-N37. See Ethos-N documentation.")
+        .describe(
+            "0 for Ethos-N77, 1 for Ethos-N57, 2 for Ethos-N37,"
+            " 3 for Ethos-N78. See Ethos-N documentation.")
         .set_default(0);
     TVM_ATTR_FIELD(strategy0).set_default(true);
     TVM_ATTR_FIELD(strategy1).set_default(true);
diff --git a/tests/python/contrib/test_ethosn/infrastructure.py b/tests/python/contrib/test_ethosn/infrastructure.py
index 905d066ce7a3..cd9e9e91292d 100644
--- a/tests/python/contrib/test_ethosn/infrastructure.py
+++ b/tests/python/contrib/test_ethosn/infrastructure.py
@@ -151,7 +151,7 @@ def build(mod, params, npu=True, expected_host_ops=0, npu_partitions=1):
     """
     relay.backend.compile_engine.get().clear()
     with tvm.transform.PassContext(
-        opt_level=3, config={"relay.ext.ethos-n.options": {"variant": 0}}
+        opt_level=3, config={"relay.ext.ethos-n.options": {"variant": get_ethosn_variant()}}
     ):
         with tvm.target.Target("llvm"):
             if npu:
@@ -321,3 +321,10 @@ def get_conv2d_qnn_params(input_zp, input_sc, kernel_zp, kernel_sc, kernel_h, ke
 
 def get_ethosn_api_version():
     return tvm.get_global_func("relay.ethos-n.api.version")()
+
+
+def get_ethosn_variant():
+    ethosn_variant_config = os.getenv("ETHOSN_VARIANT_CONFIG")
+    if ethosn_variant_config is not None:
+        return 3
+    return 0
diff --git a/tests/python/contrib/test_ethosn/test_networks.py b/tests/python/contrib/test_ethosn/test_networks.py
index c9247884141b..e0eccdfb30f5 100644
--- a/tests/python/contrib/test_ethosn/test_networks.py
+++ b/tests/python/contrib/test_ethosn/test_networks.py
@@ -125,6 +125,8 @@ def test_mobilenet_v1():
     _compile_hash = {"81637c89339201a07dc96e3b5dbf836a"}
     if tei.get_ethosn_api_version() == 2008:
         _compile_hash = {"47e216d8ab2bf491708ccf5620bc0d02"}
+        if tei.get_ethosn_variant() == 3:
+            _compile_hash = {"2436f523e263f66a063cef902f2f43d7"}
     _test_image_network(
         model_url="https://storage.googleapis.com/download.tensorflow.org/"
         "models/mobilenet_v1_2018_08_02/mobilenet_v1_1.0_224_quant.tgz",
@@ -147,6 +149,8 @@ def test_inception_v3():
     _compile_hash = {"de0e175af610ebd45ccb03d170dc9664"}
     if tei.get_ethosn_api_version() == 2008:
         _compile_hash = {"8c9d75659cd7bc9ff6dd6d490d28f9b2"}
+        if tei.get_ethosn_variant() == 3:
+            _compile_hash = {"cdd4d7f6453d722ea73224ff9d6a115a"}
     _test_image_network(
         model_url="https://storage.googleapis.com/download.tensorflow.org/"
         "models/tflite_11_05_08/inception_v3_quant.tgz",
@@ -165,6 +169,8 @@ def test_inception_v4():
     # codegen, which could come about from either a change in Support Library
     # version or a change in the Ethos-N codegen. To update this requires running
     # on hardware that isn't available in CI.
+    if not tei.get_ethosn_variant() == 0:
+        pytest.skip("Ethos-N78 20.08 does not support inception_v4 in the default configuration.")
     _compile_hash = {"06bf6cb56344f3904bcb108e54edfe87"}
     if tei.get_ethosn_api_version() == 2008:
         _compile_hash = {"798292bfa596ca7c32086396b494b46c"}
@@ -189,6 +195,8 @@ def test_ssd_mobilenet_v1():
     _compile_hash = {"29aec6b184b09454b4323271aadf89b1", "6211d96103880b016baa85e638abddef"}
     if tei.get_ethosn_api_version() == 2008:
         _compile_hash = {"5999f26e140dee0d7866491997ef78c5", "24e3a690a7e95780052792d5626c85be"}
+        if tei.get_ethosn_variant() == 3:
+            _compile_hash = {"da871b3f03a93df69d704ed44584d6cd", "9f52411d301f3cba3f6e4c0f1c558e87"}
     _test_image_network(
         model_url="https://storage.googleapis.com/download.tensorflow.org/"
         "models/tflite/coco_ssd_mobilenet_v1_1.0_quant_2018_06_29.zip",
diff --git a/tests/scripts/task_python_ethosn_tests.sh b/tests/scripts/task_python_ethosn_tests.sh
index 981d5715fac6..ae9b82b679ef 100755
--- a/tests/scripts/task_python_ethosn_tests.sh
+++ b/tests/scripts/task_python_ethosn_tests.sh
@@ -27,4 +27,8 @@ source tests/scripts/setup-pytest-env.sh
 find . -type f -path "*.pyc" | xargs rm -f
 make cython3
 
+# Note: Default behaviour is to assume the test target is Ethos-N77
+# but setting ETHOSN_VARIANT_CONFIG appropriately
+# (e.g. ETHOSN_VARIANT_CONFIG=ETHOSN78_1TOPS_4PLE_448KSRAM)
+# switches the target to an Ethos-N78 configuration.
 run_pytest ctypes python-ethosn tests/python/contrib/test_ethosn
diff --git a/tests/scripts/task_python_integration.sh b/tests/scripts/task_python_integration.sh
index e21aa065a024..613c7cbdf34f 100755
--- a/tests/scripts/task_python_integration.sh
+++ b/tests/scripts/task_python_integration.sh
@@ -60,6 +60,9 @@ run_pytest cython ${TVM_INTEGRATION_TESTSUITE_NAME}-dso_plugin_module apps/dso_p
 # TVM_FFI=ctypes sh prepare_and_test_tfop_module.sh
 
 run_pytest ctypes ${TVM_INTEGRATION_TESTSUITE_NAME} tests/python/integration
+if python -c "import tvm; from tvm.relay.op.contrib.ethosn import ethosn_available; print(ethosn_available().name)" -eq "SW_ONLY"; then
+  ETHOSN_VARIANT_CONFIG=ETHOSN78_1TOPS_4PLE_448KSRAM run_pytest ctypes ${TVM_INTEGRATION_TESTSUITE_NAME}-contrib-test_ethosn tests/python/contrib/test_ethosn
+fi
 run_pytest ctypes ${TVM_INTEGRATION_TESTSUITE_NAME}-contrib tests/python/contrib
 
 TVM_TEST_TARGETS="${TVM_RELAY_TEST_TARGETS:-llvm;cuda}" \

From 143c88eb609856b30e4ce6949d213f2b8417cc49 Mon Sep 17 00:00:00 2001
From: monchin <pazzinter@126.com>
Date: Thu, 18 Feb 2021 09:12:14 +0800
Subject: [PATCH 206/357] debug operator--() in include/tvm/node/container.h
 (#7461)

---
 include/tvm/node/container.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/tvm/node/container.h b/include/tvm/node/container.h
index 2ed1fdf880f1..10b47a92bdcf 100644
--- a/include/tvm/node/container.h
+++ b/include/tvm/node/container.h
@@ -1145,7 +1145,7 @@ inline MapNode::iterator& MapNode::iterator::operator++() {
 
 inline MapNode::iterator& MapNode::iterator::operator--() {
   TVM_DISPATCH_MAP_CONST(self, p, {
-    index = p->IncItr(index);
+    index = p->DecItr(index);
     return *this;
   });
 }

From b7e0cfb6d469c3745ae2195908daadea9c64d87e Mon Sep 17 00:00:00 2001
From: masahi <masahi129@gmail.com>
Date: Thu, 18 Feb 2021 12:11:17 +0900
Subject: [PATCH 207/357] [TOPI, Relay] Support roi_align NHWC layout (#7463)

* begin nhwc roi align

* integrate mode change from upstream

* adding test

* support nhwc shape func

* update strategy

* refactoring test

* refactor test

* refactoring

* fix lint

* update relay op tests
---
 python/tvm/relay/op/strategy/cuda.py        |  20 +-
 python/tvm/relay/op/strategy/generic.py     |  20 +-
 python/tvm/relay/op/strategy/x86.py         |  19 +-
 python/tvm/relay/op/vision/_vision.py       |  17 +-
 python/tvm/topi/testing/__init__.py         |   2 +-
 python/tvm/topi/testing/roi_align_python.py | 153 +++++++++++----
 python/tvm/topi/vision/rcnn/roi_align.py    | 196 +++++++++++++++-----
 tests/python/relay/test_op_level5.py        |  89 +++++++--
 8 files changed, 398 insertions(+), 118 deletions(-)

diff --git a/python/tvm/relay/op/strategy/cuda.py b/python/tvm/relay/op/strategy/cuda.py
index 032d2dd2c8f1..cb4688c4889e 100644
--- a/python/tvm/relay/op/strategy/cuda.py
+++ b/python/tvm/relay/op/strategy/cuda.py
@@ -945,12 +945,20 @@ def roi_align_strategy_cuda(attrs, inputs, out_type, target):
     """roi_align cuda strategy"""
     strategy = _op.OpStrategy()
     layout = attrs.layout
-    assert layout == "NCHW", "only support nchw for now"
-    strategy.add_implementation(
-        wrap_compute_roi_align(topi.vision.rcnn.roi_align_nchw),
-        wrap_topi_schedule(topi.cuda.schedule_roi_align),
-        name="roi_align_nchw.cuda",
-    )
+
+    if layout == "NCHW":
+        strategy.add_implementation(
+            wrap_compute_roi_align(topi.vision.rcnn.roi_align_nchw),
+            wrap_topi_schedule(topi.cuda.schedule_roi_align),
+            name="roi_align_nchw.cuda",
+        )
+    else:
+        assert layout == "NHWC", "layout must be NCHW or NHWC."
+        strategy.add_implementation(
+            wrap_compute_roi_align(topi.vision.rcnn.roi_align_nhwc),
+            wrap_topi_schedule(topi.cuda.schedule_roi_align),
+            name="roi_align_nhwc.cuda",
+        )
     return strategy
 
 
diff --git a/python/tvm/relay/op/strategy/generic.py b/python/tvm/relay/op/strategy/generic.py
index e744b8c9da83..f076176c5d8a 100644
--- a/python/tvm/relay/op/strategy/generic.py
+++ b/python/tvm/relay/op/strategy/generic.py
@@ -1039,7 +1039,6 @@ def wrap_compute_roi_align(topi_compute):
     """wrap roi_align topi compute"""
 
     def _compute_roi_align(attrs, inputs, out_type):
-        assert attrs.layout == "NCHW"
         pooled_size = get_const_tuple(attrs.pooled_size)
         mode = bytes(attrs.mode, "utf-8")
         return [
@@ -1061,12 +1060,19 @@ def roi_align_strategy(attrs, inputs, out_type, target):
     """roi_align generic strategy"""
     strategy = _op.OpStrategy()
     layout = attrs.layout
-    assert layout == "NCHW", "only support nchw for now"
-    strategy.add_implementation(
-        wrap_compute_roi_align(topi.vision.rcnn.roi_align_nchw),
-        wrap_topi_schedule(topi.generic.schedule_roi_align),
-        name="roi_align.generic",
-    )
+    if layout == "NCHW":
+        strategy.add_implementation(
+            wrap_compute_roi_align(topi.vision.rcnn.roi_align_nchw),
+            wrap_topi_schedule(topi.generic.schedule_roi_align),
+            name="roi_align.generic",
+        )
+    else:
+        assert layout == "NHWC", "layout must be NCHW or NHWC."
+        strategy.add_implementation(
+            wrap_compute_roi_align(topi.vision.rcnn.roi_align_nhwc),
+            wrap_topi_schedule(topi.generic.schedule_roi_align),
+            name="roi_align.generic",
+        )
     return strategy
 
 
diff --git a/python/tvm/relay/op/strategy/x86.py b/python/tvm/relay/op/strategy/x86.py
index f33c45b248d6..1f37a4f8e98c 100644
--- a/python/tvm/relay/op/strategy/x86.py
+++ b/python/tvm/relay/op/strategy/x86.py
@@ -481,12 +481,19 @@ def roi_align_strategy_cpu(attrs, inputs, out_type, target):
     """roi_align x86 strategy"""
     strategy = _op.OpStrategy()
     layout = attrs.layout
-    assert layout == "NCHW", "only support nchw for now"
-    strategy.add_implementation(
-        wrap_compute_roi_align(topi.x86.roi_align_nchw),
-        wrap_topi_schedule(topi.generic.schedule_roi_align),
-        name="roi_align.x86",
-    )
+    if layout == "NCHW":
+        strategy.add_implementation(
+            wrap_compute_roi_align(topi.x86.roi_align_nchw),
+            wrap_topi_schedule(topi.generic.schedule_roi_align),
+            name="roi_align.x86",
+        )
+    else:
+        assert layout == "NHWC", "layout must be NCHW or NHWC."
+        strategy.add_implementation(
+            wrap_compute_roi_align(topi.vision.rcnn.roi_align_nhwc),
+            wrap_topi_schedule(topi.generic.schedule_roi_align),
+            name="roi_align.x86",
+        )
     return strategy
 
 
diff --git a/python/tvm/relay/op/vision/_vision.py b/python/tvm/relay/op/vision/_vision.py
index 04676e24adf6..9c8c853fa3d2 100644
--- a/python/tvm/relay/op/vision/_vision.py
+++ b/python/tvm/relay/op/vision/_vision.py
@@ -86,7 +86,7 @@ def nms_shape_func(attrs, inputs, _):
 
 
 @script
-def _roi_align_shape_func(data_shape, rois_shape, pooled_size):
+def _roi_align_shape_func_nchw(data_shape, rois_shape, pooled_size):
     out = output_tensor((4,), "int64")
     out[0] = rois_shape[0]
     out[1] = data_shape[1]
@@ -95,6 +95,19 @@ def _roi_align_shape_func(data_shape, rois_shape, pooled_size):
     return out
 
 
+@script
+def _roi_align_shape_func_nhwc(data_shape, rois_shape, pooled_size):
+    out = output_tensor((4,), "int64")
+    out[0] = rois_shape[0]
+    out[1] = int64(pooled_size[0])
+    out[2] = int64(pooled_size[1])
+    out[3] = data_shape[3]
+    return out
+
+
 @reg.register_shape_func("vision.roi_align", False)
 def roi_align_shape_func(attrs, inputs, _):
-    return [_roi_align_shape_func(inputs[0], inputs[1], convert(attrs.pooled_size))]
+    if attrs.layout == "NCHW":
+        return [_roi_align_shape_func_nchw(inputs[0], inputs[1], convert(attrs.pooled_size))]
+    assert attrs.layout == "NHWC", "layout must be NCHW or NHWC."
+    return [_roi_align_shape_func_nhwc(inputs[0], inputs[1], convert(attrs.pooled_size))]
diff --git a/python/tvm/topi/testing/__init__.py b/python/tvm/topi/testing/__init__.py
index 85f13a763c40..ef36b9e73446 100644
--- a/python/tvm/topi/testing/__init__.py
+++ b/python/tvm/topi/testing/__init__.py
@@ -39,7 +39,7 @@
 from .bilinear_resize_python import bilinear_resize_python
 from .trilinear_resize3d_python import trilinear_resize3d_python
 from .reorg_python import reorg_python
-from .roi_align_python import roi_align_nchw_python
+from .roi_align_python import roi_align_nchw_python, roi_align_nhwc_python
 from .roi_pool_python import roi_pool_nchw_python
 from .lrn_python import lrn_python
 from .l2_normalize_python import l2_normalize_python
diff --git a/python/tvm/topi/testing/roi_align_python.py b/python/tvm/topi/testing/roi_align_python.py
index 643a954b101b..986123b6c9c6 100644
--- a/python/tvm/topi/testing/roi_align_python.py
+++ b/python/tvm/topi/testing/roi_align_python.py
@@ -20,42 +20,51 @@
 import numpy as np
 
 
-def roi_align_nchw_python(a_np, rois_np, pooled_size, spatial_scale, sample_ratio, mode=b"avg"):
-    """Roi align in python"""
-    avg_mode = mode in (b"avg", "avg", 0)
-    max_mode = mode in (b"max", "max", 1)
-    assert avg_mode or max_mode, "Mode must be average or max. Please pass a valid mode."
-    _, channel, height, width = a_np.shape
-    num_roi = rois_np.shape[0]
-    b_np = np.zeros((num_roi, channel, pooled_size, pooled_size), dtype=a_np.dtype)
-    if isinstance(pooled_size, int):
-        pooled_size_h = pooled_size_w = pooled_size
-    else:
-        pooled_size_h, pooled_size_w = pooled_size
-
-    def _bilinear(n, c, y, x):
-        if y < -1 or y > height or x < -1 or x > width:
-            return 0
+def _bilinear(a_np, n, c, y, x, height, width, layout):
+    if y < -1 or y > height or x < -1 or x > width:
+        return 0
 
-        y = min(max(y, 0), height - 1)
-        x = min(max(x, 0), width - 1)
+    y = min(max(y, 0), height - 1)
+    x = min(max(x, 0), width - 1)
 
-        y_low = int(math.floor(y))
-        x_low = int(math.floor(x))
-        y_high = y_low + 1
-        x_high = x_low + 1
+    y_low = int(math.floor(y))
+    x_low = int(math.floor(x))
+    y_high = y_low + 1
+    x_high = x_low + 1
 
-        wy_h = y - y_low
-        wx_h = x - x_low
-        wy_l = 1 - wy_h
-        wx_l = 1 - wx_h
+    wy_h = y - y_low
+    wx_h = x - x_low
+    wy_l = 1 - wy_h
+    wx_l = 1 - wx_h
 
-        val = 0
-        for wx, xp in zip((wx_l, wx_h), (x_low, x_high)):
-            for wy, yp in zip((wy_l, wy_h), (y_low, y_high)):
-                if 0 <= yp < height and 0 <= xp < width:
+    val = 0
+    for wx, xp in zip((wx_l, wx_h), (x_low, x_high)):
+        for wy, yp in zip((wy_l, wy_h), (y_low, y_high)):
+            if 0 <= yp < height and 0 <= xp < width:
+                if layout == "NCHW":
                     val += wx * wy * a_np[n, c, yp, xp]
-        return val
+                else:
+                    val += wx * wy * a_np[n, yp, xp, c]
+    return val
+
+
+def roi_align_common(
+    a_np,
+    b_np,
+    rois_np,
+    channel,
+    pooled_size_h,
+    pooled_size_w,
+    spatial_scale,
+    sample_ratio,
+    avg_mode,
+    max_mode,
+    height,
+    width,
+    layout,
+):
+    """Common code used by roi align NCHW and NHWC"""
+    num_roi = rois_np.shape[0]
 
     for i in range(num_roi):
         roi = rois_np[i]
@@ -70,8 +79,8 @@ def _bilinear(n, c, y, x):
         if sample_ratio > 0:
             roi_bin_grid_h = roi_bin_grid_w = int(sample_ratio)
         else:
-            roi_bin_grid_h = int(math.ceil(roi_h / pooled_size))
-            roi_bin_grid_w = int(math.ceil(roi_w / pooled_size))
+            roi_bin_grid_h = int(math.ceil(roi_h / pooled_size_h))
+            roi_bin_grid_w = int(math.ceil(roi_w / pooled_size_w))
 
         count = roi_bin_grid_h * roi_bin_grid_w
 
@@ -87,8 +96,80 @@ def _bilinear(n, c, y, x):
                             y = roi_start_h + ph * bin_h + (iy + 0.5) * bin_h / roi_bin_grid_h
                             x = roi_start_w + pw * bin_w + (ix + 0.5) * bin_w / roi_bin_grid_w
                             if avg_mode:
-                                total += _bilinear(batch_index, c, y, x) / count
+                                total += (
+                                    _bilinear(a_np, batch_index, c, y, x, height, width, layout)
+                                    / count
+                                )
                             if max_mode:
-                                total = max(total, _bilinear(batch_index, c, y, x))
-                    b_np[i, c, ph, pw] = total
+                                total = max(
+                                    total,
+                                    _bilinear(a_np, batch_index, c, y, x, height, width, layout),
+                                )
+
+                    if layout == "NCHW":
+                        b_np[i, c, ph, pw] = total
+                    else:
+                        b_np[i, ph, pw, c] = total
     return b_np
+
+
+def roi_align_nchw_python(a_np, rois_np, pooled_size, spatial_scale, sample_ratio, mode=b"avg"):
+    """Roi align NCHW in python"""
+    avg_mode = mode in (b"avg", "avg", 0)
+    max_mode = mode in (b"max", "max", 1)
+    assert avg_mode or max_mode, "Mode must be average or max. Please pass a valid mode."
+    _, channel, height, width = a_np.shape
+    if isinstance(pooled_size, int):
+        pooled_size_h = pooled_size_w = pooled_size
+    else:
+        pooled_size_h, pooled_size_w = pooled_size
+
+    b_np = np.zeros((rois_np.shape[0], channel, pooled_size_h, pooled_size_w), dtype=a_np.dtype)
+
+    return roi_align_common(
+        a_np,
+        b_np,
+        rois_np,
+        channel,
+        pooled_size_h,
+        pooled_size_w,
+        spatial_scale,
+        sample_ratio,
+        avg_mode,
+        max_mode,
+        height,
+        width,
+        "NCHW",
+    )
+
+
+def roi_align_nhwc_python(a_np, rois_np, pooled_size, spatial_scale, sample_ratio, mode=b"avg"):
+    """Roi align NHWC in python"""
+    avg_mode = mode in (b"avg", "avg", 0)
+    max_mode = mode in (b"max", "max", 1)
+    assert avg_mode or max_mode, "Mode must be average or max. Please pass a valid mode."
+    _, height, width, channel = a_np.shape
+    num_roi = rois_np.shape[0]
+
+    if isinstance(pooled_size, int):
+        pooled_size_h = pooled_size_w = pooled_size
+    else:
+        pooled_size_h, pooled_size_w = pooled_size
+
+    b_np = np.zeros((num_roi, pooled_size_h, pooled_size_w, channel), dtype=a_np.dtype)
+
+    return roi_align_common(
+        a_np,
+        b_np,
+        rois_np,
+        channel,
+        pooled_size_h,
+        pooled_size_w,
+        spatial_scale,
+        sample_ratio,
+        avg_mode,
+        max_mode,
+        height,
+        width,
+        "NHWC",
+    )
diff --git a/python/tvm/topi/vision/rcnn/roi_align.py b/python/tvm/topi/vision/rcnn/roi_align.py
index 95f350084ba5..655ba2637d84 100644
--- a/python/tvm/topi/vision/rcnn/roi_align.py
+++ b/python/tvm/topi/vision/rcnn/roi_align.py
@@ -19,7 +19,71 @@
 import tvm
 from tvm import te
 from ...utils import get_const_tuple
-from ...cpp.utils import bilinear_sample_nchw
+from ...cpp.utils import bilinear_sample_nchw, bilinear_sample_nhwc
+
+
+def _sample_common(
+    i,
+    c,
+    ph,
+    pw,
+    rois,
+    pooled_size_h,
+    pooled_size_w,
+    spatial_scale,
+    sample_ratio,
+    dtype,
+    avg_mode,
+    bilinear_func,
+):
+    roi = rois[i]
+    batch_index = roi[0].astype("int32")
+    roi_start_w, roi_start_h, roi_end_w, roi_end_h = roi[1], roi[2], roi[3], roi[4]
+    roi_start_h *= spatial_scale
+    roi_end_h *= spatial_scale
+    roi_start_w *= spatial_scale
+    roi_end_w *= spatial_scale
+
+    # force malformed ROIs to be 1x1
+    roi_h = tvm.te.max(roi_end_h - roi_start_h, tvm.tir.const(1.0, dtype))
+    roi_w = tvm.te.max(roi_end_w - roi_start_w, tvm.tir.const(1.0, dtype))
+
+    bin_h = roi_h / pooled_size_h
+    bin_w = roi_w / pooled_size_w
+
+    if sample_ratio > 0:
+        roi_bin_grid_h = roi_bin_grid_w = tvm.tir.const(sample_ratio, "int32")
+    else:
+        roi_bin_grid_h = te.ceil(roi_h / pooled_size_h).astype("int32")
+        roi_bin_grid_w = te.ceil(roi_w / pooled_size_w).astype("int32")
+
+    count = roi_bin_grid_h * roi_bin_grid_w
+    rh = te.reduce_axis((0, roi_bin_grid_h))
+    rw = te.reduce_axis((0, roi_bin_grid_w))
+    roi_start_h += ph * bin_h
+    roi_start_w += pw * bin_w
+
+    if avg_mode:
+        return te.sum(
+            bilinear_func(
+                batch_index,
+                c,
+                roi_start_h + (rh + 0.5) * bin_h / roi_bin_grid_h,
+                roi_start_w + (rw + 0.5) * bin_w / roi_bin_grid_w,
+            )
+            / count,
+            axis=[rh, rw],
+        )
+    # max mode
+    return te.max(
+        bilinear_func(
+            batch_index,
+            c,
+            roi_start_h + (rh + 0.5) * bin_h / roi_bin_grid_h,
+            roi_start_w + (rw + 0.5) * bin_w / roi_bin_grid_w,
+        ),
+        axis=[rh, rw],
+    )
 
 
 def roi_align_nchw(data, rois, pooled_size, spatial_scale, mode, sample_ratio=-1):
@@ -73,54 +137,92 @@ def _bilinear(i, c, y, x):
         return tvm.tir.if_then_else(outside, 0.0, val)
 
     def _sample(i, c, ph, pw):
-        roi = rois[i]
-        batch_index = roi[0].astype("int32")
-        roi_start_w, roi_start_h, roi_end_w, roi_end_h = roi[1], roi[2], roi[3], roi[4]
-        roi_start_h *= spatial_scale
-        roi_end_h *= spatial_scale
-        roi_start_w *= spatial_scale
-        roi_end_w *= spatial_scale
-
-        # force malformed ROIs to be 1x1
-        roi_h = tvm.te.max(roi_end_h - roi_start_h, tvm.tir.const(1.0, dtype))
-        roi_w = tvm.te.max(roi_end_w - roi_start_w, tvm.tir.const(1.0, dtype))
-
-        bin_h = roi_h / pooled_size_h
-        bin_w = roi_w / pooled_size_w
-
-        if sample_ratio > 0:
-            roi_bin_grid_h = roi_bin_grid_w = tvm.tir.const(sample_ratio, "int32")
-        else:
-            roi_bin_grid_h = te.ceil(roi_h / pooled_size_h).astype("int32")
-            roi_bin_grid_w = te.ceil(roi_w / pooled_size_w).astype("int32")
-
-        count = roi_bin_grid_h * roi_bin_grid_w
-        rh = te.reduce_axis((0, roi_bin_grid_h))
-        rw = te.reduce_axis((0, roi_bin_grid_w))
-        roi_start_h += ph * bin_h
-        roi_start_w += pw * bin_w
-        if avg_mode:
-            return te.sum(
-                _bilinear(
-                    batch_index,
-                    c,
-                    roi_start_h + (rh + 0.5) * bin_h / roi_bin_grid_h,
-                    roi_start_w + (rw + 0.5) * bin_w / roi_bin_grid_w,
-                )
-                / count,
-                axis=[rh, rw],
-            )
-        # max mode
-        return te.max(
-            _bilinear(
-                batch_index,
-                c,
-                roi_start_h + (rh + 0.5) * bin_h / roi_bin_grid_h,
-                roi_start_w + (rw + 0.5) * bin_w / roi_bin_grid_w,
-            ),
-            axis=[rh, rw],
+        return _sample_common(
+            i,
+            c,
+            ph,
+            pw,
+            rois,
+            pooled_size_h,
+            pooled_size_w,
+            spatial_scale,
+            sample_ratio,
+            dtype,
+            avg_mode,
+            _bilinear,
         )
 
     return te.compute(
         (num_roi, channel, pooled_size_h, pooled_size_w), _sample, tag="pool,roi_align_nchw"
     )
+
+
+def roi_align_nhwc(data, rois, pooled_size, spatial_scale, mode, sample_ratio=-1):
+    """ROI align operator in NHWC layout.
+
+    Parameters
+    ----------
+    data : tvm.te.Tensor
+        4-D with shape [batch, height, width, channel]
+
+    rois : tvm.te.Tensor
+        2-D with shape [num_roi, 5]. The last dimension should be in format of
+        [batch_index, w_start, h_start, w_end, h_end]
+
+    pooled_size : int or list/tuple of two ints
+        output size, or [out_height, out_width]
+
+    spatial_scale : float
+        Ratio of input feature map height (or w) to raw image height (or w). Equals the reciprocal
+        of total stride in convolutional layers, which should be in range (0.0, 1.0]
+
+    mode : int or str
+        There are two modes, average and max. For the average mode, you can pass b'avg' or 0, and
+        for the max mode, you can pass b'max' or 1.
+
+    sample_ratio : int
+        Optional sampling ratio of ROI align, using adaptive size by default.
+
+    Returns
+    -------
+    output : tvm.te.Tensor
+        4-D with shape [num_roi, pooled_size, pooled_size, channel]
+    """
+    avg_mode = mode in (b"avg", 0)
+    max_mode = mode in (b"max", 1)
+    assert avg_mode or max_mode, "Mode must be avg or max. Please pass in a valid mode."
+    dtype = rois.dtype
+    _, height, width, channel = get_const_tuple(data.shape)
+    num_roi, _ = get_const_tuple(rois.shape)
+
+    if isinstance(pooled_size, int):
+        pooled_size_h = pooled_size_w = pooled_size
+    else:
+        pooled_size_h, pooled_size_w = pooled_size
+
+    def _bilinear(i, c, y, x):
+        outside = tvm.tir.any(y < -1.0, x < -1.0, y > height, x > width)
+        y = tvm.te.min(tvm.te.max(y, 0.0), height - 1)
+        x = tvm.te.min(tvm.te.max(x, 0.0), width - 1)
+        val = bilinear_sample_nhwc(data, (i, y, x, c), height - 1, width - 1)
+        return tvm.tir.if_then_else(outside, 0.0, val)
+
+    def _sample(i, ph, pw, c):
+        return _sample_common(
+            i,
+            c,
+            ph,
+            pw,
+            rois,
+            pooled_size_h,
+            pooled_size_w,
+            spatial_scale,
+            sample_ratio,
+            dtype,
+            avg_mode,
+            _bilinear,
+        )
+
+    return te.compute(
+        (num_roi, pooled_size_h, pooled_size_w, channel), _sample, tag="pool,roi_align_nchw"
+    )
diff --git a/tests/python/relay/test_op_level5.py b/tests/python/relay/test_op_level5.py
index 95cd537091f5..0a84667f8bdb 100644
--- a/tests/python/relay/test_op_level5.py
+++ b/tests/python/relay/test_op_level5.py
@@ -583,7 +583,18 @@ def test_threshold():
 
 @tvm.testing.uses_gpu
 def test_roi_align():
-    def verify_roi_align(data_shape, rois_shape, pooled_size, spatial_scale, sample_ratio, mode):
+    def verify_roi_align(
+        data_shape,
+        rois_shape,
+        channel,
+        in_size,
+        pooled_size,
+        spatial_scale,
+        sample_ratio,
+        mode,
+        layout,
+        ref_func,
+    ):
         data = relay.var("data", relay.ty.TensorType(data_shape, "float32"))
         rois = relay.var("rois", relay.ty.TensorType(rois_shape, "float32"))
         z = relay.vision.roi_align(
@@ -593,21 +604,27 @@ def verify_roi_align(data_shape, rois_shape, pooled_size, spatial_scale, sample_
             spatial_scale=spatial_scale,
             sample_ratio=sample_ratio,
             mode=mode,
-            layout="NCHW",
+            layout=layout,
         )
         zz = run_infer_type(z)
-        batch, channel, in_size, _ = data_shape
+
         num_roi = rois_shape[0]
-        assert zz.checked_type == relay.ty.TensorType(
-            (num_roi, channel, pooled_size, pooled_size), "float32"
-        )
+
+        if layout == "NCHW":
+            assert zz.checked_type == relay.ty.TensorType(
+                (num_roi, channel, pooled_size, pooled_size), "float32"
+            )
+        else:
+            assert zz.checked_type == relay.ty.TensorType(
+                (num_roi, pooled_size, pooled_size, channel), "float32"
+            )
 
         func = relay.Function([data, rois], z)
         func = run_infer_type(func)
         np_data = np.random.uniform(size=data_shape).astype("float32")
         np_rois = np.random.uniform(size=rois_shape).astype("float32") * in_size
-        np_rois[:, 0] = np.random.randint(low=0, high=batch, size=num_roi)
-        ref_res = tvm.topi.testing.roi_align_nchw_python(
+        np_rois[:, 0] = np.random.randint(low=0, high=data_shape[0], size=num_roi)
+        ref_res = ref_func(
             np_data,
             np_rois,
             pooled_size=pooled_size,
@@ -616,6 +633,7 @@ def verify_roi_align(data_shape, rois_shape, pooled_size, spatial_scale, sample_
             mode=mode,
         )
         for target, ctx in tvm.testing.enabled_targets():
+            print("test on", target)
             intrp1 = relay.create_executor("graph", ctx=ctx, target=target)
             op_res1 = intrp1.evaluate(func)(np_data, np_rois)
             tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-4)
@@ -623,18 +641,64 @@ def verify_roi_align(data_shape, rois_shape, pooled_size, spatial_scale, sample_
             op_res2 = intrp2.evaluate(func)(np_data, np_rois)
             tvm.testing.assert_allclose(op_res2.asnumpy(), ref_res, rtol=1e-4)
 
-    verify_roi_align(
+    def verify_roi_align_nchw(
+        data_shape, rois_shape, pooled_size, spatial_scale, sample_ratio, mode
+    ):
+        _, channel, in_size, _ = data_shape
+        return verify_roi_align(
+            data_shape,
+            rois_shape,
+            channel,
+            in_size,
+            pooled_size,
+            spatial_scale,
+            sample_ratio,
+            mode,
+            "NCHW",
+            tvm.topi.testing.roi_align_nchw_python,
+        )
+
+    def verify_roi_align_nhwc(
+        data_shape, rois_shape, pooled_size, spatial_scale, sample_ratio, mode
+    ):
+        _, in_size, _, channel = data_shape
+        return verify_roi_align(
+            data_shape,
+            rois_shape,
+            channel,
+            in_size,
+            pooled_size,
+            spatial_scale,
+            sample_ratio,
+            mode,
+            "NHWC",
+            tvm.topi.testing.roi_align_nhwc_python,
+        )
+
+    verify_roi_align_nchw(
         (1, 4, 16, 16), (32, 5), pooled_size=7, spatial_scale=1.0, sample_ratio=-1, mode="avg"
     )
-    verify_roi_align(
+    verify_roi_align_nchw(
         (4, 4, 16, 16), (32, 5), pooled_size=7, spatial_scale=0.5, sample_ratio=2, mode="avg"
     )
-    verify_roi_align(
+    verify_roi_align_nchw(
         (1, 4, 16, 16), (32, 5), pooled_size=7, spatial_scale=1.0, sample_ratio=-1, mode="max"
     )
-    verify_roi_align(
+    verify_roi_align_nchw(
         (4, 4, 16, 16), (32, 5), pooled_size=7, spatial_scale=0.5, sample_ratio=2, mode="max"
     )
+    verify_roi_align_nhwc(
+        (1, 16, 16, 4), (32, 5), pooled_size=7, spatial_scale=1.0, sample_ratio=-1, mode="avg"
+    )
+    verify_roi_align_nhwc(
+        (4, 16, 16, 4), (32, 5), pooled_size=7, spatial_scale=0.5, sample_ratio=2, mode="avg"
+    )
+    verify_roi_align_nhwc(
+        (1, 16, 16, 4), (32, 5), pooled_size=7, spatial_scale=1.0, sample_ratio=-1, mode="max"
+    )
+    verify_roi_align_nhwc(
+        (4, 16, 16, 4), (32, 5), pooled_size=7, spatial_scale=0.5, sample_ratio=2, mode="max"
+    )
 
 
 @tvm.testing.uses_gpu
@@ -1262,7 +1326,6 @@ def verify_batch_to_space_nd(dshape, block_shape, crops):
     test_resize_infer_type()
     test_resize()
     test_resize3d_infer_type()
-    test_resize3d()
     test_crop_and_resize()
     test_multibox_prior()
     test_multibox_transform_loc()

From 84c4b150ab25aa3ea822beed4702dcb56dddce4c Mon Sep 17 00:00:00 2001
From: Alexander Pivovarov <pivovaa@amazon.com>
Date: Thu, 18 Feb 2021 06:44:40 -0800
Subject: [PATCH 208/357] Set TOpPattern=kOpaque for scatter_nd (#7464)

---
 src/relay/op/tensor/transform.cc     |  5 ++++-
 tests/python/relay/test_op_level3.py | 31 ++++++++++++++++++++++++++++
 2 files changed, 35 insertions(+), 1 deletion(-)

diff --git a/src/relay/op/tensor/transform.cc b/src/relay/op/tensor/transform.cc
index 1e782a568fe9..12db859d1ae1 100644
--- a/src/relay/op/tensor/transform.cc
+++ b/src/relay/op/tensor/transform.cc
@@ -1146,6 +1146,9 @@ Expr MakeScatterND(Expr data, Expr indices, const Array<Integer> out_shape) {
 
 TVM_REGISTER_GLOBAL("relay.op._make.scatter_nd").set_body_typed(MakeScatterND);
 
+// scatter_nd operator has extern schedules for CPU and GPU devices.
+// Fusing extern schedules with Injective schedules leads to errors.
+// So, converting the scatter_nd to Opaque to prevent compilation failures
 RELAY_REGISTER_OP("scatter_nd")
     .describe(R"code(Scatter elements or slices from data and store to a tensor
 whose shape is defined by indices.
@@ -1158,7 +1161,7 @@ Given data with shape (Y_0, ..., Y_{K-1}, X_M, ..., X_{N-1}) and indices with sh
     .add_argument("indices", "Tensor", "The indices tensor.")
     .set_support_level(3)
     .add_type_rel("ScatterND", ScatterNDRel)
-    .set_attr<TOpPattern>("TOpPattern", kInjective);
+    .set_attr<TOpPattern>("TOpPattern", kOpaque);
 
 // Take
 TVM_REGISTER_NODE_TYPE(TakeAttrs);
diff --git a/tests/python/relay/test_op_level3.py b/tests/python/relay/test_op_level3.py
index 625c47240326..94fac3ba1264 100644
--- a/tests/python/relay/test_op_level3.py
+++ b/tests/python/relay/test_op_level3.py
@@ -1391,17 +1391,46 @@ def verify_scatter_nd(data_np, indices_np, shape, ref_res, rtol=1e-5, atol=1e-5)
             op_res = intrp.evaluate(func)(data_np, indices_np)
             tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=rtol, atol=atol)
 
+    def verify_scatter_nd_with_stack(data_np, indices_np, shape, ref_res, rtol=1e-5, atol=1e-5):
+        data = relay.var("data", shape=data_np.shape, dtype=str(data_np.dtype))
+        indices_vars = [
+            relay.var("ind{i}", shape=v.shape, dtype=str(v.dtype)) for i, v in enumerate(indices_np)
+        ]
+
+        # test if scatter_nd works in case indices are prepared by another Relay operator
+        indices = relay.op.stack(indices_vars, axis=0)
+        out = relay.op.scatter_nd(data, indices, shape)
+        func = relay.Function(
+            [
+                data,
+            ]
+            + indices_vars,
+            out,
+        )
+
+        fargs = [
+            data_np,
+        ]
+        for a in indices_np:
+            fargs.append(a)
+        for kind in ["graph", "debug"]:
+            intrp = relay.create_executor(kind, ctx=ctx, target=target)
+            op_res = intrp.evaluate(func)(*fargs)
+            tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=rtol, atol=atol)
+
     data = np.array([2, 3, 0])
     indices = np.array([[1, 1, 0], [0, 1, 0]])
     shape = (2, 2)
     out = np.array([[0, 0], [2, 3]])
     verify_scatter_nd(data, indices, shape, out)
+    verify_scatter_nd_with_stack(data, indices, shape, out)
 
     data = np.array([[[1, 2], [3, 4]], [[5, 6], [7, 8]]])
     indices = np.array([[0, 1], [1, 1]])
     shape = (2, 2, 2, 2)
     out = np.array([[[[0, 0], [0, 0]], [[1, 2], [3, 4]]], [[[0, 0], [0, 0]], [[5, 6], [7, 8]]]])
     verify_scatter_nd(data, indices, shape, out)
+    verify_scatter_nd_with_stack(data, indices, shape, out)
 
     data = np.reshape(np.arange(1560 * 3), (3, 1560)).astype("float32")
     indices = np.array([[1, 0, 0]])
@@ -1411,6 +1440,7 @@ def verify_scatter_nd(data_np, indices_np, shape, ref_res, rtol=1e-5, atol=1e-5)
     out[0, :] += data[1, :]
     out[0, :] += data[2, :]
     verify_scatter_nd(data, indices, shape, out)
+    verify_scatter_nd_with_stack(data, indices, shape, out)
 
     data = np.ones((5, 3)).astype("float64")
     indices = np.stack((np.random.randint(2, size=5), np.random.randint(7, size=5))).astype("int64")
@@ -1420,6 +1450,7 @@ def verify_scatter_nd(data_np, indices_np, shape, ref_res, rtol=1e-5, atol=1e-5)
         for j in range(data.shape[1]):
             out[indices[0, i], indices[1, i], j] += data[i, j]
     verify_scatter_nd(data, indices, shape, out)
+    verify_scatter_nd_with_stack(data, indices, shape, out)
 
 
 if __name__ == "__main__":

From 944d8d13746e0b287780064ced0f1e50bc26208a Mon Sep 17 00:00:00 2001
From: masahi <masahi129@gmail.com>
Date: Fri, 19 Feb 2021 05:10:47 +0900
Subject: [PATCH 209/357] [RUNTIME] Fast path for single thread run to allow
 app level threading (#7454)

* Fast path for single thread run to allow app level threading

* add sync counter to avoid error in one of tests
---
 src/runtime/thread_pool.cc | 31 ++++++++++++++++++++-----------
 1 file changed, 20 insertions(+), 11 deletions(-)

diff --git a/src/runtime/thread_pool.cc b/src/runtime/thread_pool.cc
index ba14c733176e..5f5a811c2d30 100644
--- a/src/runtime/thread_pool.cc
+++ b/src/runtime/thread_pool.cc
@@ -363,21 +363,30 @@ TVM_REGISTER_GLOBAL("runtime.config_threadpool").set_body([](TVMArgs args, TVMRe
 }  // namespace tvm
 
 int TVMBackendParallelLaunch(FTVMParallelLambda flambda, void* cdata, int num_task) {
+  int num_workers = tvm::runtime::threading::MaxConcurrency();
+  if (num_workers == 1) {
+    std::atomic<int32_t> sync_counter{0};
+    TVMParallelGroupEnv env;
+    env.num_task = 1;
+    env.sync_handle = &sync_counter;
+    (*flambda)(0, &env, cdata);
+    return 0;
+  } else {
 #if !TVM_THREADPOOL_USE_OPENMP
-  int res = tvm::runtime::ThreadPool::ThreadLocal()->Launch(flambda, cdata, num_task, 1);
-  return res;
+    int res = tvm::runtime::ThreadPool::ThreadLocal()->Launch(flambda, cdata, num_task, 1);
+    return res;
 #else
-  int num_workers = tvm::runtime::threading::MaxConcurrency();
-  if (num_task == 0) num_task = num_workers;
-  omp_set_num_threads(num_task);
+    if (num_task == 0) num_task = num_workers;
+    omp_set_num_threads(num_task);
 #pragma omp parallel num_threads(num_task)
-  {
-    TVMParallelGroupEnv env;
-    env.num_task = num_task;
-    (*flambda)(omp_get_thread_num(), &env, cdata);
-  }
-  return 0;
+    {
+      TVMParallelGroupEnv env;
+      env.num_task = num_task;
+      (*flambda)(omp_get_thread_num(), &env, cdata);
+    }
+    return 0;
 #endif
+  }
 }
 
 int TVMBackendParallelBarrier(int task_id, TVMParallelGroupEnv* penv) {

From 50e013dd3a5e23450ff4ae98324be07aa6160a6d Mon Sep 17 00:00:00 2001
From: Alexander Pivovarov <pivovaa@amazon.com>
Date: Thu, 18 Feb 2021 15:53:53 -0800
Subject: [PATCH 210/357] [Torch] Add index_put operator (#7465)

* [Torch] Add index_put operator

* Skip test_frontends.py::test_load_model__pth
---
 python/tvm/relay/frontend/pytorch.py          | 28 ++++++++++++++++
 tests/python/driver/tvmc/test_frontends.py    |  1 +
 tests/python/frontend/pytorch/test_forward.py | 32 +++++++++++++++++++
 3 files changed, 61 insertions(+)

diff --git a/python/tvm/relay/frontend/pytorch.py b/python/tvm/relay/frontend/pytorch.py
index 246ed97b14e9..205b2aa779e6 100644
--- a/python/tvm/relay/frontend/pytorch.py
+++ b/python/tvm/relay/frontend/pytorch.py
@@ -2010,6 +2010,32 @@ def scatter(self, inputs, input_types):
         src = inputs[3]
         return _op.transform.scatter(data, index, src, axis)
 
+    def index_put(self, inputs, input_types):
+        in_tensor = inputs[0]
+        indices = inputs[1]
+        values = inputs[2]
+        accumulate = inputs[3]
+        # accumulate parameter is ignored.
+        # torch.index_put default is False but Relay.scatter_nd accumulates values.
+        # We assume there is no duplicate indices in torch.index_put input
+        if not accumulate:
+            logging.warning(
+                "torch.index_put accumulate parameter is False. "
+                "TVM uses tvm.relay.scatter_nd operator which accumulates values. "
+                "Make sure there is no duplicate indices in torch.index_put input."
+            )
+        # Relay scatter_nd does not support input tensor
+        # We assume that torch.index_put is used with empty zero-values input tensor
+        # scatter_nd will create empty zero-values tensor with a given shape
+        out_shape = self.infer_shape(in_tensor)
+        logging.warning(
+            "tvm.relay.scatter_nd operator does not support input tensor parameter. "
+            "TVM assumes that torch.index_put is used with empty zero-values input tensor"
+        )
+        # Combine array of index tensors into one index tensor with shape (N,_)
+        index_tensor = _op.stack(indices, axis=0)
+        return _op.transform.scatter_nd(values, index_tensor, out_shape)
+
     def scalar_tensor(self, inputs, input_types):
         data = inputs[0]
         cast_map = {
@@ -2326,6 +2352,8 @@ def create_convert_map(self):
             "aten::nonzero": self.nonzero,
             "aten::nonzero_numpy": self.nonzero_numpy,
             "aten::scatter": self.scatter,
+            "aten::index_put": self.index_put,
+            "aten::index_put_": self.index_put,
             "aten::scalar_tensor": self.scalar_tensor,
             "aten::__interpolate": self.interpolate,
             "aten::IntImplicit": self.identity,
diff --git a/tests/python/driver/tvmc/test_frontends.py b/tests/python/driver/tvmc/test_frontends.py
index 04c85b1eb8f3..b41f4c4dff2d 100644
--- a/tests/python/driver/tvmc/test_frontends.py
+++ b/tests/python/driver/tvmc/test_frontends.py
@@ -174,6 +174,7 @@ def test_load_model___wrong_language__to_onnx(tflite_mobilenet_v1_1_quant):
         tvmc.frontends.load_model(tflite_mobilenet_v1_1_quant, model_format="onnx")
 
 
+@pytest.mark.skip(reason="https://github.com/apache/tvm/issues/7455")
 def test_load_model__pth(pytorch_resnet18):
     # some CI environments wont offer torch, so skip in case it is not present
     pytest.importorskip("torch")
diff --git a/tests/python/frontend/pytorch/test_forward.py b/tests/python/frontend/pytorch/test_forward.py
index 8d968e9760c9..aa42b0fb84e4 100644
--- a/tests/python/frontend/pytorch/test_forward.py
+++ b/tests/python/frontend/pytorch/test_forward.py
@@ -3327,6 +3327,38 @@ def test_fn_scatter_add(dim):
     verify_trace_model(test_fn_scatter_add(1), [in_data, in_index, in_src], targets)
 
 
+def test_forward_index_put():
+    # torch.index_put for 2D tensor and default accumulate (False)
+    def test_fn_index_put2():
+        return lambda data, xidx, yidx, values: torch.index_put(
+            data, indices=[xidx, yidx], values=values
+        )
+
+    # torch.index_put for 3D tensor and accumulate=True
+    def test_fn_index_put3a():
+        return lambda data, xidx, yidx, zidx, values: torch.index_put(
+            data, indices=[xidx, yidx, zidx], values=values, accumulate=True
+        )
+
+    shape = (3, 5)
+    in_data = torch.zeros(shape)
+    xidx = torch.tensor([0, 1, 2, 2])
+    yidx = torch.tensor([0, 1, 3, 4])
+    values = torch.tensor([2.0, 4.0, 7.0, 9.0])
+
+    targets = ["llvm", "cuda"]
+    verify_trace_model(test_fn_index_put2(), [in_data, xidx, yidx, values], targets)
+
+    shape = (3, 5, 3)
+    in_data = torch.zeros(shape)
+    xidx = torch.tensor([0, 1, 2, 2, 0])
+    yidx = torch.tensor([0, 1, 3, 4, 0])
+    zidx = torch.tensor([0, 1, 1, 2, 0])
+    values = torch.tensor([2.0, 4.0, 7.0, 9.0, 1.0])
+
+    verify_trace_model(test_fn_index_put3a(), [in_data, xidx, yidx, zidx, values], targets)
+
+
 def test_numel():
     class Numel(Module):
         def forward(self, data):

From b51973fb48deb34ff725bf1206f1b683f8bc2773 Mon Sep 17 00:00:00 2001
From: "Steven S. Lyubomirsky" <sslyu@cs.washington.edu>
Date: Thu, 18 Feb 2021 21:10:10 -0500
Subject: [PATCH 211/357] [Relay][Bugfix] Fix off-by-one error in BiasAddRel,
 use new reporting (#7467)

* Fix off-by-one in BiasAddRel, use new reporting

* No need to mark xfail if the exception is caught

* lint
---
 src/relay/op/nn/nn.cc                |  9 +++++++--
 tests/python/relay/test_op_level1.py | 12 ++++++++++++
 2 files changed, 19 insertions(+), 2 deletions(-)

diff --git a/src/relay/op/nn/nn.cc b/src/relay/op/nn/nn.cc
index 3e3d94c614c3..97460ba4a98b 100644
--- a/src/relay/op/nn/nn.cc
+++ b/src/relay/op/nn/nn.cc
@@ -61,8 +61,13 @@ bool BiasAddRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
   if (axis < 0) {
     axis = data->shape.size() + axis;
   }
-  ICHECK_LE(axis, static_cast<int>(data->shape.size()))
-      << "axis " << param->axis << " is out of range";
+  if (axis >= static_cast<int>(data->shape.size())) {
+    reporter->GetDiagCtx().EmitFatal(Diagnostic::Error(reporter->GetSpan())
+                                     << "The axis in bias_add must be in range for the shape; "
+                                     << "attempted to access index " << axis << " of "
+                                     << PrettyPrint(data->shape));
+    return false;
+  }
 
   // assign output type
   reporter->Assign(types[1], TensorType({data->shape[axis]}, data->dtype));
diff --git a/tests/python/relay/test_op_level1.py b/tests/python/relay/test_op_level1.py
index 54d04da5e092..ea5dd6948b11 100644
--- a/tests/python/relay/test_op_level1.py
+++ b/tests/python/relay/test_op_level1.py
@@ -201,6 +201,17 @@ def test_bias_add():
             np.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=rtol)
 
 
+def test_bias_add_type_failure():
+    # the axis is out of range
+    try:
+        b_add = relay.nn.bias_add(relay.const(1), relay.const(2), axis=0)
+        run_infer_type(b_add)
+    except tvm._ffi.base.TVMError:
+        pass
+    else:
+        assert False
+
+
 def test_expand_dims_infer_type():
     for dtype in ["float16", "float32"]:
         n, t, d = te.size_var("n"), te.size_var("t"), 100
@@ -484,6 +495,7 @@ def test_bitserial_dense():
 if __name__ == "__main__":
     test_concatenate()
     test_bias_add()
+    test_bias_add_type_failure()
     test_unary_op()
     test_binary_op()
     test_expand_dims_infer_type()

From e2042093cddcd2249bf1a7b7659cda6d39046a1c Mon Sep 17 00:00:00 2001
From: Lianmin Zheng <lianminzheng@gmail.com>
Date: Fri, 19 Feb 2021 19:15:31 +0800
Subject: [PATCH 212/357] [AutoScheduler] Fix the type inference for conv3d
 (#7475)

---
 src/relay/op/nn/convolution.h | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/src/relay/op/nn/convolution.h b/src/relay/op/nn/convolution.h
index c08d3553e4cc..5b4850ec6653 100644
--- a/src/relay/op/nn/convolution.h
+++ b/src/relay/op/nn/convolution.h
@@ -24,6 +24,7 @@
 #ifndef TVM_RELAY_OP_NN_CONVOLUTION_H_
 #define TVM_RELAY_OP_NN_CONVOLUTION_H_
 
+#include <tvm/auto_scheduler/compute_dag.h>
 #include <tvm/support/logging.h>
 #include <tvm/tir/analysis.h>
 
@@ -369,7 +370,18 @@ bool Conv3DRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
   } else {
     // use weight to infer the conv shape.
     if (weight == nullptr) return false;
-    auto wshape = trans_kernel_layout.ForwardShape(weight->shape);
+
+    Array<PrimExpr> wshape;
+    if (param->auto_scheduler_rewritten_layout.size() == 0) {
+      wshape = weight->shape;
+    } else {
+      // works for the default kernel layout "DHWIO"
+      ICHECK_EQ(param->kernel_layout, "DHWIO");
+      wshape = auto_scheduler::GetShapeFromRewrittenLayout(param->auto_scheduler_rewritten_layout,
+                                                           {"rd", "rh", "rw", "rc", "cc"});
+    }
+
+    wshape = trans_kernel_layout.ForwardShape(wshape);
     if (param->kernel_size.defined()) {
       ICHECK_EQ(param->kernel_size.size(), 3);
       // check the size

From 256b9cf02e75150eff60f7aede0aa52f92b6c024 Mon Sep 17 00:00:00 2001
From: Nicola Lancellotti <nicola.lancellotti@arm.com>
Date: Fri, 19 Feb 2021 23:36:11 +0000
Subject: [PATCH 213/357] Get tvmc version from tvm (#7478)

Change-Id: I6a6e78080f36e4e3e1689e03ea48e759fcd8e466
---
 python/tvm/driver/tvmc/main.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/python/tvm/driver/tvmc/main.py b/python/tvm/driver/tvmc/main.py
index fee04db820fb..1d360d98206e 100644
--- a/python/tvm/driver/tvmc/main.py
+++ b/python/tvm/driver/tvmc/main.py
@@ -23,7 +23,7 @@
 import logging
 import sys
 
-import pkg_resources
+import tvm
 
 from tvm.driver.tvmc.common import TVMCException
 
@@ -75,8 +75,7 @@ def _main(argv):
     logging.getLogger("TVMC").setLevel(40 - args.verbose * 10)
 
     if args.version:
-        version = pkg_resources.get_distribution("tvm").version
-        sys.stdout.write("%s\n" % version)
+        sys.stdout.write("%s\n" % tvm.__version__)
         return 0
 
     if not hasattr(args, "func"):

From d16f282d8f22f8d0a171289ce68e1ffe3754350d Mon Sep 17 00:00:00 2001
From: Leandro Nunes <leandron85@gmail.com>
Date: Fri, 19 Feb 2021 23:37:54 +0000
Subject: [PATCH 214/357] [TVMC] Add composite target passes for compilation
 and tuning (#7304)

* Extend --target syntax to cover multiple targets for compilation and tuning
 * Add a new composite_target module to implement custom codegen passes into TVMC
 * Provide implementation to integrate TVMC, to target Arm Ethos-N NPU and
   Compute Library for the Arm Architecture (ACL)

Change-Id: Iaee53fe22f0c14eb4e4c8ec47e72bade0c5e32cc
---
 python/tvm/driver/tvmc/autotuner.py           |   9 +-
 python/tvm/driver/tvmc/common.py              | 188 +++++++++++++++++-
 python/tvm/driver/tvmc/compiler.py            |  23 ++-
 python/tvm/driver/tvmc/composite_target.py    |  68 +++++++
 python/tvm/relay/op/contrib/ethosn.py         |  35 ++++
 tests/python/driver/tvmc/test_common.py       |  91 +++++++++
 tests/python/driver/tvmc/test_compiler.py     |  47 ++++-
 .../driver/tvmc/test_composite_target.py      |  62 ++++++
 8 files changed, 506 insertions(+), 17 deletions(-)
 create mode 100644 python/tvm/driver/tvmc/composite_target.py
 create mode 100644 tests/python/driver/tvmc/test_composite_target.py

diff --git a/python/tvm/driver/tvmc/autotuner.py b/python/tvm/driver/tvmc/autotuner.py
index fe5bebcabcbc..187b7c5d2a31 100644
--- a/python/tvm/driver/tvmc/autotuner.py
+++ b/python/tvm/driver/tvmc/autotuner.py
@@ -29,7 +29,7 @@
 from tvm.autotvm.tuner import RandomTuner
 from tvm.autotvm.tuner import XGBTuner
 
-from . import common, frontends
+from . import common, composite_target, frontends
 from .common import TVMCException
 from .main import register_parser
 
@@ -241,9 +241,14 @@ def drive_tune(args):
                 "need to provide an RPC tracker key (--rpc-key) for remote tuning"
             )
 
-    target = common.target_from_cli(args.target)
+    target, extra_targets = common.target_from_cli(args.target)
     mod, params = frontends.load_model(args.FILE, args.model_format, shape_dict=args.input_shapes)
 
+    for codegen_from_cli in extra_targets:
+        codegen = composite_target.get_codegen_by_target(codegen_from_cli["name"])
+        partition_function = codegen["pass_pipeline"]
+        mod = partition_function(mod, params)
+
     # min_repeat_ms should be:
     # a. the value provided by the user, if any, or
     # b. 0ms in case target is "cpu"; otherwise 1000ms
diff --git a/python/tvm/driver/tvmc/common.py b/python/tvm/driver/tvmc/common.py
index 1845915bcbd1..71bf42ae1e5c 100644
--- a/python/tvm/driver/tvmc/common.py
+++ b/python/tvm/driver/tvmc/common.py
@@ -18,6 +18,7 @@
 Common utility functions shared by TVMC modules.
 """
 import re
+import json
 import logging
 import os.path
 import argparse
@@ -78,6 +79,168 @@ def convert_graph_layout(mod, desired_layout):
             )
 
 
+def validate_targets(parse_targets):
+    """
+    Apply a series of validations in the targets provided via CLI.
+    """
+    tvm_target_kinds = tvm.target.Target.list_kinds()
+    targets = [t["name"] for t in parse_targets]
+
+    if len(targets) > len(set(targets)):
+        raise TVMCException("Duplicate target definitions are not allowed")
+
+    if targets[-1] not in tvm_target_kinds:
+        tvm_target_names = ", ".join(tvm_target_kinds)
+        raise TVMCException(
+            f"The last target needs to be a TVM target. Choices: {tvm_target_names}"
+        )
+
+    tvm_targets = [t for t in targets if t in tvm_target_kinds]
+    if len(tvm_targets) > 1:
+        verbose_tvm_targets = ", ".join(tvm_targets)
+        raise TVMCException(
+            f"Only one of the following targets can be used at a time. "
+            "Found: {verbose_tvm_targets}."
+        )
+
+
+def tokenize_target(target):
+    """
+    Extract a list of tokens from a target specification text.
+
+    It covers some corner-cases that are not covered by the built-in
+    module 'shlex', such as the use of "+" as a punctuation character.
+
+
+    Example
+    -------
+
+    For the input `foo -op1=v1 -op2="v ,2", bar -op3=v-4` we
+    should obtain:
+
+        ["foo", "-op1=v1", "-op2="v ,2"", ",", "bar", "-op3=v-4"]
+
+    Parameters
+    ----------
+    target : str
+        Target options sent via CLI arguments
+
+    Returns
+    -------
+    list of str
+        a list of parsed tokens extracted from the target string
+    """
+
+    target_pattern = (
+        r"(\-{0,2}[\w\-]+\=?"
+        r"(?:[\w\+\-]+(?:,[\w\+\-])*|[\'][\w\+\-,\s]+[\']|[\"][\w\+\-,\s]+[\"])*|,)"
+    )
+
+    return re.findall(target_pattern, target)
+
+
+def parse_target(target):
+    """
+    Parse a plain string of targets provided via a command-line
+    argument.
+
+    To send more than one codegen, a comma-separated list
+    is expected. Options start with -<option_name>=<value>.
+
+    We use python standard library 'shlex' to parse the argument in
+    a POSIX compatible way, so that if options are defined as
+    strings with spaces or commas, for example, this is considered
+    and parsed accordingly.
+
+
+    Example
+    -------
+
+    For the input `--target="foo -op1=v1 -op2="v ,2", bar -op3=v-4"` we
+    should obtain:
+
+      [
+        {
+            name: "foo",
+            opts: {"op1":"v1", "op2":"v ,2"},
+            raw: 'foo -op1=v1 -op2="v ,2"'
+        },
+        {
+            name: "bar",
+            opts: {"op3":"v-4"},
+            raw: 'bar -op3=v-4'
+        }
+      ]
+
+    Parameters
+    ----------
+    target : str
+        Target options sent via CLI arguments
+
+    Returns
+    -------
+    codegens : list of dict
+        This list preserves the order in which codegens were
+        provided via command line. Each Dict contains three keys:
+        'name', containing the name of the codegen; 'opts' containing
+        a key-value for all options passed via CLI; 'raw',
+        containing the plain string for this codegen
+    """
+    codegens = []
+
+    parsed_tokens = tokenize_target(target)
+
+    split_codegens = []
+    current_codegen = []
+    split_codegens.append(current_codegen)
+    for token in parsed_tokens:
+        # every time there is a comma separating
+        # two codegen definitions, prepare for
+        # a new codegen
+        if token == ",":
+            current_codegen = []
+            split_codegens.append(current_codegen)
+        else:
+            # collect a new token for the current
+            # codegen being parsed
+            current_codegen.append(token)
+
+    # at this point we have a list of lists,
+    # each item on the first list is a codegen definition
+    # in the comma-separated values
+    for codegen_def in split_codegens:
+        # the first is expected to be the name
+        name = codegen_def[0]
+        raw_target = " ".join(codegen_def)
+        all_opts = codegen_def[1:] if len(codegen_def) > 1 else []
+        opts = {}
+        for opt in all_opts:
+            try:
+                # deal with -- prefixed flags
+                if opt.startswith("--"):
+                    opt_name = opt[2:]
+                    opt_value = True
+                else:
+                    opt = opt[1:] if opt.startswith("-") else opt
+                    opt_name, opt_value = opt.split("=", maxsplit=1)
+            except ValueError:
+                raise ValueError(f"Error when parsing '{opt}'")
+
+            opts[opt_name] = opt_value
+
+        codegens.append({"name": name, "opts": opts, "raw": raw_target})
+
+    return codegens
+
+
+def is_inline_json(target):
+    try:
+        json.loads(target)
+        return True
+    except json.decoder.JSONDecodeError:
+        return False
+
+
 def target_from_cli(target):
     """
     Create a tvm.target.Target instance from a
@@ -93,18 +256,33 @@ def target_from_cli(target):
     -------
     tvm.target.Target
         an instance of target device information
+    extra_targets : list of dict
+        This list preserves the order in which extra targets were
+        provided via command line. Each Dict contains three keys:
+        'name', containing the name of the codegen; 'opts' containing
+        a key-value for all options passed via CLI; 'raw',
+        containing the plain string for this codegen
     """
+    extra_targets = []
 
     if os.path.exists(target):
         with open(target) as target_file:
-            logger.info("using target input from file: %s", target)
+            logger.debug("target input is a path: %s", target)
             target = "".join(target_file.readlines())
+    elif is_inline_json(target):
+        logger.debug("target input is inline JSON: %s", target)
+    else:
+        logger.debug("target input is plain text: %s", target)
+        try:
+            parsed_targets = parse_target(target)
+        except ValueError as ex:
+            raise TVMCException(f"Error parsing target string '{target}'.\nThe error was: {ex}")
 
-    # TODO(@leandron) We don't have an API to collect a list of supported
-    #       targets yet
-    logger.debug("creating target from input: %s", target)
+        validate_targets(parsed_targets)
+        target = parsed_targets[-1]["raw"]
+        extra_targets = parsed_targets[:-1] if len(parsed_targets) > 1 else []
 
-    return tvm.target.Target(target)
+    return tvm.target.Target(target), extra_targets
 
 
 def tracker_host_port_from_cli(rpc_tracker_str):
diff --git a/python/tvm/driver/tvmc/compiler.py b/python/tvm/driver/tvmc/compiler.py
index 282ae6a76b56..fc1805ee0ab4 100644
--- a/python/tvm/driver/tvmc/compiler.py
+++ b/python/tvm/driver/tvmc/compiler.py
@@ -28,7 +28,7 @@
 from tvm.contrib import cc
 from tvm.contrib import utils
 
-from . import common, frontends
+from . import common, composite_target, frontends
 from .main import register_parser
 
 
@@ -72,7 +72,7 @@ def add_compile_parser(subparsers):
     )
     parser.add_argument(
         "--target",
-        help="compilation target as plain string, inline JSON or path to a JSON file",
+        help="compilation targets as comma separated string, inline JSON or path to a JSON file.",
         required=True,
     )
     parser.add_argument(
@@ -185,13 +185,21 @@ def compile_model(
     """
     dump_code = [x.strip() for x in dump_code.split(",")] if dump_code else None
     mod, params = frontends.load_model(path, model_format, shape_dict)
+    config = {}
 
     if alter_layout:
         mod = common.convert_graph_layout(mod, alter_layout)
 
-    tvm_target = common.target_from_cli(target)
+    tvm_target, extra_targets = common.target_from_cli(target)
     target_host = tvm_target if not target_host else target_host
 
+    for codegen_from_cli in extra_targets:
+        codegen = composite_target.get_codegen_by_target(codegen_from_cli["name"])
+        partition_function = codegen["pass_pipeline"]
+        mod = partition_function(mod, params)
+        if codegen["config_key"] is not None:
+            config[codegen["config_key"]] = codegen_from_cli["opts"]
+
     if tuning_records and os.path.exists(tuning_records):
         logger.debug("tuning records file provided: %s", tuning_records)
 
@@ -203,22 +211,21 @@ def compile_model(
 
         if use_autoscheduler:
             with auto_scheduler.ApplyHistoryBest(tuning_records):
-                with tvm.transform.PassContext(
-                    opt_level=3, config={"relay.backend.use_auto_scheduler": True}
-                ):
+                config["relay.backend.use_auto_scheduler"] = True
+                with tvm.transform.PassContext(opt_level=3, config=config):
                     logger.debug("building relay graph with autoscheduler")
                     graph_module = relay.build(
                         mod, target=target, params=params, target_host=target_host
                     )
         else:
             with autotvm.apply_history_best(tuning_records):
-                with tvm.transform.PassContext(opt_level=3):
+                with tvm.transform.PassContext(opt_level=3, config=config):
                     logger.debug("building relay graph with tuning records")
                     graph_module = relay.build(
                         mod, tvm_target, params=params, target_host=target_host
                     )
     else:
-        with tvm.transform.PassContext(opt_level=3):
+        with tvm.transform.PassContext(opt_level=3, config=config):
             logger.debug("building relay graph (no tuning records provided)")
             graph_module = relay.build(mod, tvm_target, params=params, target_host=target_host)
 
diff --git a/python/tvm/driver/tvmc/composite_target.py b/python/tvm/driver/tvmc/composite_target.py
new file mode 100644
index 000000000000..7c08994e0e75
--- /dev/null
+++ b/python/tvm/driver/tvmc/composite_target.py
@@ -0,0 +1,68 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""
+Provides support to composite target on TVMC.
+"""
+import logging
+
+from tvm.relay.op.contrib.arm_compute_lib import partition_for_arm_compute_lib
+from tvm.relay.op.contrib.ethosn import partition_for_ethosn
+
+from .common import TVMCException
+
+
+# pylint: disable=invalid-name
+logger = logging.getLogger("TVMC")
+
+# Global dictionary to map targets with the configuration key
+# to be used in the PassContext (if any), and a function
+# responsible for partitioning to that target.
+REGISTERED_CODEGEN = {
+    "acl": {
+        "config_key": None,
+        "pass_pipeline": partition_for_arm_compute_lib,
+    },
+    "ethos-n77": {
+        "config_key": "relay.ext.ethos-n.options",
+        "pass_pipeline": partition_for_ethosn,
+    },
+}
+
+
+def get_codegen_names():
+    """Return a list of all registered codegens.
+
+    Returns
+    -------
+    list of str
+        all registered targets
+    """
+    return list(REGISTERED_CODEGEN.keys())
+
+
+def get_codegen_by_target(name):
+    """Return a codegen entry by name.
+
+    Returns
+    -------
+    dict
+        requested target information
+    """
+    try:
+        return REGISTERED_CODEGEN[name]
+    except KeyError:
+        raise TVMCException("Composite target %s is not defined in TVMC." % name)
diff --git a/python/tvm/relay/op/contrib/ethosn.py b/python/tvm/relay/op/contrib/ethosn.py
index 3a05011242e7..478a1ec46f26 100644
--- a/python/tvm/relay/op/contrib/ethosn.py
+++ b/python/tvm/relay/op/contrib/ethosn.py
@@ -17,7 +17,11 @@
 # pylint: disable=invalid-name, unused-argument
 """Arm(R) Ethos(TM) -N NPU supported operators."""
 from enum import Enum
+
 import tvm.ir
+from tvm.relay import transform
+from tvm.relay.build_module import bind_params_by_name
+
 from ...dataflow_pattern import wildcard, is_op, is_constant
 from ... import qnn as _qnn
 from .register import register_pattern_table
@@ -42,6 +46,37 @@ def ethosn_available():
     return Available.SW_AND_HW if hw else Available.SW_ONLY
 
 
+def partition_for_ethosn(mod, params=None):
+    """Partition the graph greedily offloading supported
+    operators to Arm Ethos-N NPU.
+
+    Parameters
+    ----------
+    mod : Module
+        The module to run passes on.
+    params : Optional[Dict[str, NDArray]]
+        Constant input parameters.
+
+    Returns
+    -------
+    ret : annotated and partitioned module.
+    """
+    if params:
+        mod["main"] = bind_params_by_name(mod["main"], params)
+
+    seq = tvm.transform.Sequential(
+        [
+            transform.InferType(),
+            transform.MergeComposite(pattern_table()),
+            transform.AnnotateTarget("ethos-n"),
+            transform.MergeCompilerRegions(),
+            transform.PartitionGraph(),
+        ]
+    )
+
+    return seq(mod)
+
+
 @register_pattern_table("ethos-n")
 def pattern_table():
     """Get the Ethos-N compiler pattern table."""
diff --git a/tests/python/driver/tvmc/test_common.py b/tests/python/driver/tvmc/test_common.py
index f30949b54497..253f32d3f0aa 100644
--- a/tests/python/driver/tvmc/test_common.py
+++ b/tests/python/driver/tvmc/test_common.py
@@ -24,6 +24,8 @@
 from tvm import relay
 from tvm.driver import tvmc
 
+from tvm.driver.tvmc.common import TVMCException
+
 
 def test_compile_tflite_module_nhwc_to_nchw(tflite_mobilenet_v1_1_quant):
     # some CI environments wont offer TFLite, so skip in case it is not present
@@ -182,3 +184,92 @@ def test_shape_parser():
     shape_string = "input:5,10 input2:10,10"
     with pytest.raises(argparse.ArgumentTypeError):
         tvmc.common.parse_shape_string(shape_string)
+
+
+def test_target_from_cli__error_duplicate():
+    with pytest.raises(TVMCException):
+        _ = tvmc.common.target_from_cli("llvm, llvm")
+
+
+def test_target_from_cli__error_target_not_found():
+    with pytest.raises(TVMCException):
+        _ = tvmc.common.target_from_cli("invalidtarget")
+
+
+def test_target_from_cli__error_no_tvm_target():
+    with pytest.raises(TVMCException):
+        _ = tvmc.common.target_from_cli("ethos-n77")
+
+
+def test_tokenize_target_with_opts():
+    tokens = tvmc.common.tokenize_target("foo -opt1=value1 --flag, bar -opt2=value2")
+    expected_tokens = ["foo", "-opt1=value1", "--flag", ",", "bar", "-opt2=value2"]
+
+    assert len(tokens) == len(expected_tokens)
+    assert tokens == expected_tokens
+
+
+def test_tokenize_target_with_plus_sign():
+    tokens = tvmc.common.tokenize_target("foo -opt1=+value1 --flag, bar -opt2=test,+v")
+    expected_tokens = ["foo", "-opt1=+value1", "--flag", ",", "bar", "-opt2=test,+v"]
+
+    assert len(tokens) == len(expected_tokens)
+    assert tokens == expected_tokens
+
+
+def test_tokenize_target_with_commas():
+    tokens = tvmc.common.tokenize_target("foo -opt1=v,a,l,u,e,1 --flag")
+    expected_tokens = ["foo", "-opt1=v,a,l,u,e,1", "--flag"]
+
+    assert len(tokens) == len(expected_tokens)
+    assert tokens == expected_tokens
+
+
+def test_tokenize_target_with_commas_and_single_quotes():
+    tokens = tvmc.common.tokenize_target("foo -opt1='v, a, l, u, e', bar")
+    expected_tokens = ["foo", "-opt1='v, a, l, u, e'", ",", "bar"]
+
+    assert len(tokens) == len(expected_tokens)
+    assert tokens == expected_tokens
+
+
+def test_tokenize_target_with_commas_and_double_quotes():
+    tokens = tvmc.common.tokenize_target('foo -opt1="v, a, l, u, e", bar')
+    expected_tokens = ["foo", '-opt1="v, a, l, u, e"', ",", "bar"]
+
+    assert len(tokens) == len(expected_tokens)
+    assert tokens == expected_tokens
+
+
+def test_tokenize_target_with_dashes():
+    tokens = tvmc.common.tokenize_target("foo-bar1 -opt-1=t-e-s-t, baz")
+    expected_tokens = ["foo-bar1", "-opt-1=t-e-s-t", ",", "baz"]
+
+    assert len(tokens) == len(expected_tokens)
+    assert tokens == expected_tokens
+
+
+def test_parse_single_target_with_opts():
+    targets = tvmc.common.parse_target("llvm -device=arm_cpu --system-lib")
+
+    assert len(targets) == 1
+    assert "device" in targets[0]["opts"]
+    assert "system-lib" in targets[0]["opts"]
+
+
+def test_parse_multiple_target():
+    targets = tvmc.common.parse_target("acl, llvm -device=arm_cpu --system-lib")
+
+    assert len(targets) == 2
+    assert "acl" == targets[0]["name"]
+    assert "llvm" == targets[1]["name"]
+
+
+def test_parse_multiple_target_with_opts():
+    targets = tvmc.common.parse_target("ethos-n77 -myopt=value, llvm -device=arm_cpu --system-lib")
+
+    assert len(targets) == 2
+    assert "ethos-n77" == targets[0]["name"]
+    assert "myopt" in targets[0]["opts"]
+    assert "value" == targets[0]["opts"]["myopt"]
+    assert "llvm" == targets[1]["name"]
diff --git a/tests/python/driver/tvmc/test_compiler.py b/tests/python/driver/tvmc/test_compiler.py
index 4cb342c2e967..ae859298facd 100644
--- a/tests/python/driver/tvmc/test_compiler.py
+++ b/tests/python/driver/tvmc/test_compiler.py
@@ -19,10 +19,13 @@
 import shutil
 from os import path
 
+from unittest import mock
 import pytest
 
 import tvm
 
+from tvm.relay.op.contrib.ethosn import ethosn_available
+
 from tvm.driver import tvmc
 
 
@@ -73,7 +76,7 @@ def test_cross_compile_aarch64_tflite_module(tflite_mobilenet_v1_1_quant):
 
     graph, lib, params, dumps = tvmc.compiler.compile_model(
         tflite_mobilenet_v1_1_quant,
-        target="llvm -device=arm_cpu -mtriple=aarch64-linux-gnu -mattr=+neon",
+        target="llvm -device=arm_cpu -mtriple=aarch64-linux-gnu -mattr='+neon'",
         dump_code="asm",
     )
 
@@ -110,7 +113,7 @@ def test_cross_compile_aarch64_keras_module(keras_resnet50):
 
     graph, lib, params, dumps = tvmc.compiler.compile_model(
         keras_resnet50,
-        target="llvm -device=arm_cpu -mtriple=aarch64-linux-gnu -mattr=+neon",
+        target="llvm -device=arm_cpu -mtriple=aarch64-linux-gnu -mattr='+neon'",
         dump_code="asm",
     )
 
@@ -185,3 +188,43 @@ def test_compile_opencl(tflite_mobilenet_v1_0_25_128):
     assert type(lib) is tvm.runtime.module.Module
     assert type(params) is dict
     assert type(dumps) is dict
+
+
+@pytest.mark.skipif(
+    not ethosn_available(),
+    reason="--target=ethos-n77 is not available. TVM built with 'USE_ETHOSN OFF'",
+)
+def test_compile_tflite_module_with_external_codegen(tflite_mobilenet_v1_1_quant):
+    pytest.importorskip("tflite")
+
+    graph, lib, params, dumps = tvmc.compiler.compile_model(
+        tflite_mobilenet_v1_1_quant, target="ethos-n77, llvm", dump_code="relay"
+    )
+
+    # check for output types
+    assert type(graph) is str
+    assert type(lib) is tvm.runtime.module.Module
+    assert type(params) is dict
+    assert type(dumps) is dict
+
+
+@mock.patch("tvm.relay.build")
+@mock.patch("tvm.driver.tvmc.composite_target.get_codegen_by_target")
+@mock.patch("tvm.driver.tvmc.frontends.load_model")
+@mock.patch("tvm.transform.PassContext")
+def test_compile_check_configs_composite_target(mock_pc, mock_fe, mock_ct, mock_relay):
+    mock_codegen = {}
+    mock_codegen["config_key"] = "relay.ext.mock.options"
+    mock_codegen["pass_pipeline"] = lambda *args: None
+
+    mock_fe.return_value = (None, None)
+    mock_ct.return_value = mock_codegen
+    mock_relay.return_value = mock.MagicMock()
+
+    graph, lib, params, dumps = tvmc.compiler.compile_model(
+        "no_file_needed", target="mockcodegen -testopt=value, llvm"
+    )
+
+    mock_pc.assert_called_once_with(
+        opt_level=3, config={"relay.ext.mock.options": {"testopt": "value"}}
+    )
diff --git a/tests/python/driver/tvmc/test_composite_target.py b/tests/python/driver/tvmc/test_composite_target.py
new file mode 100644
index 000000000000..eda7cd9224fd
--- /dev/null
+++ b/tests/python/driver/tvmc/test_composite_target.py
@@ -0,0 +1,62 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+import argparse
+import os
+import shutil
+
+from inspect import isfunction
+from os import path
+
+import pytest
+
+import tvm
+
+from tvm.driver import tvmc
+
+from tvm.driver.tvmc.common import TVMCException
+
+
+def test_get_codegen_names():
+    names = tvmc.composite_target.get_codegen_names()
+
+    assert "ethos-n77" in names
+    assert len(names) > 0
+
+
+def test_valid_codegen():
+    codegen = tvmc.composite_target.get_codegen_by_target("acl")
+
+    assert codegen is not None
+    assert codegen["pass_pipeline"] is not None
+
+
+def test_invalid_codegen():
+    with pytest.raises(TVMCException):
+        _ = tvmc.composite_target.get_codegen_by_target("invalid")
+
+
+def test_all_codegens_contain_pass_pipeline():
+    for name in tvmc.composite_target.get_codegen_names():
+        codegen = tvmc.composite_target.get_codegen_by_target(name)
+        assert "pass_pipeline" in codegen, f"{name} does not contain a pass_pipeline"
+        assert isfunction(codegen["pass_pipeline"])
+
+
+def test_all_pass_pipelines_are_functions():
+    for name in tvmc.composite_target.get_codegen_names():
+        codegen = tvmc.composite_target.get_codegen_by_target(name)
+        assert isfunction(codegen["pass_pipeline"]), f"pass_pipeline for {name} is not a function"

From 5688068eb02912a4ec926a88f5cad3f0f370454e Mon Sep 17 00:00:00 2001
From: Trevor Morris <trevmorr@amazon.com>
Date: Fri, 19 Feb 2021 20:26:55 -0800
Subject: [PATCH 215/357] [Frontend][Tensorflow] Support explicit_paddings for
 TF 2.x (#7445)

* Ignore some TF2.0 attributes

* Support explicit padding for conv2d, max_pool, conv3d

* Remove conv3d explicit padding test since TF API doesn't allow it
---
 python/tvm/relay/frontend/tensorflow.py       | 44 ++++++++++++++++---
 .../frontend/tensorflow/test_forward.py       | 40 +++++++++++++++++
 2 files changed, 79 insertions(+), 5 deletions(-)

diff --git a/python/tvm/relay/frontend/tensorflow.py b/python/tvm/relay/frontend/tensorflow.py
index 6a29ce266ea6..ac52ab768066 100644
--- a/python/tvm/relay/frontend/tensorflow.py
+++ b/python/tvm/relay/frontend/tensorflow.py
@@ -268,6 +268,13 @@ def _impl(inputs, attr, params, mod):
             pad_h = _get_pad_pair(in_w, kernel_w, stride_w)
 
             attr["padding"] = [pad_v[0], pad_h[0], pad_v[1], pad_h[1]]
+        elif attr["padding"] == "EXPLICIT":
+            paddings = attr["explicit_paddings"]
+            assert len(paddings) == 8
+            if flip_layout or attr["data_format"] == "NHWC":
+                attr["padding"] = [paddings[2], paddings[4], paddings[3], paddings[5]]
+            else:
+                attr["padding"] = [paddings[4], paddings[6], paddings[5], paddings[7]]
         else:
             msg = 'Value {} in attribute "padding" of operator Pooling is ' "not valid."
             raise tvm.error.OpAttributeInvalid(msg.format(attr["padding"]))
@@ -278,7 +285,7 @@ def _impl(inputs, attr, params, mod):
         out = AttrCvt(
             op_name=_dimension_picker(name),
             transforms={"kernel_shape": "pool_size", "data_format": "layout"},
-            ignores=["ksize"],
+            ignores=["ksize", "explicit_paddings"],
             extras={"ceil_mode": False},
             custom_check=_dimension_constraint(),
         )(inputs, attr)
@@ -418,6 +425,13 @@ def _impl(inputs, attr, params, mod):
             pad_h = _get_pad_pair(in_w, dilated_kernel_w, stride_w)
 
             attr["padding"] = [pad_v[0], pad_h[0], pad_v[1], pad_h[1]]
+        elif attr["padding"] == "EXPLICIT":
+            paddings = attr["explicit_paddings"]
+            assert len(paddings) == 8
+            if flip_layout or attr["data_format"] == "NHWC":
+                attr["padding"] = [paddings[2], paddings[4], paddings[3], paddings[5]]
+            else:
+                attr["padding"] = [paddings[4], paddings[6], paddings[5], paddings[7]]
         else:
             msg = 'Value {} in attribute "padding" of operator Conv is not ' "valid."
             raise tvm.error.OpAttributeInvalid(msg.format(attr["padding"]))
@@ -626,7 +640,27 @@ def _impl(inputs, attr, params, mod):
             pad_h = _get_pad_pair(in_w, dilated_kernel_w, stride_w)
 
             attr["padding"] = [pad_d[0], pad_v[0], pad_h[0], pad_d[1], pad_v[1], pad_h[1]]
-
+        elif attr["padding"] == "EXPLICIT":
+            paddings = attr["explicit_paddings"]
+            assert len(paddings) == 10
+            if flip_layout or attr["data_format"] == "NDHWC":
+                attr["padding"] = [
+                    paddings[2],
+                    paddings[4],
+                    paddings[6],
+                    paddings[3],
+                    paddings[5],
+                    paddings[7],
+                ]
+            else:
+                attr["padding"] = [
+                    paddings[4],
+                    paddings[6],
+                    paddings[8],
+                    paddings[5],
+                    paddings[7],
+                    paddings[9],
+                ]
         else:
             msg = 'Value {} in attribute "padding" of operator Conv is not ' "valid."
             raise tvm.error.OpAttributeInvalid(msg.format(attr["padding"]))
@@ -1445,9 +1479,9 @@ def _squeeze():
     def _impl(inputs, attr, params, mod):
         if len(attr["squeeze_dims"]) == 0:
             attr["squeeze_dims"] = None
-        return AttrCvt(op_name="squeeze", transforms={"squeeze_dims": "axis"}, ignores=["T"])(
-            inputs, attr
-        )
+        return AttrCvt(
+            op_name="squeeze", transforms={"squeeze_dims": "axis"}, ignores=["T", "_cloned"]
+        )(inputs, attr)
 
     return _impl
 
diff --git a/tests/python/frontend/tensorflow/test_forward.py b/tests/python/frontend/tensorflow/test_forward.py
index f956ea02eb47..ecf6441bc6b9 100644
--- a/tests/python/frontend/tensorflow/test_forward.py
+++ b/tests/python/frontend/tensorflow/test_forward.py
@@ -414,6 +414,16 @@ def test_forward_pooling():
             pooling_type=pool_type,
             dilation_rate=[2],
         )
+    # Explicit padding
+    if package_version.parse(tf.VERSION) >= package_version.parse("2.4.1"):
+        _test_pooling(
+            input_shape=[2, 9, 10, 2],
+            window_shape=[4, 4],
+            padding=[[0, 0], [0, 1], [2, 3], [0, 0]],
+            pooling_type="MAX",
+            dilation_rate=[1, 1],
+            strides=[1, 1],
+        )
 
 
 #######################################################################
@@ -830,6 +840,36 @@ def test_forward_convolution():
         [4, 8, 8, 176],
         add_shapes_to_graph_def=False,
     )
+    # Explicit padding
+    if package_version.parse(tf.VERSION) >= package_version.parse("2.4.1"):
+        _test_convolution(
+            "conv",
+            [4, 8, 8, 16],
+            [1, 1, 16, 32],
+            [1, 1],
+            [1, 1],
+            [[0, 0], [2, 3], [0, 1], [0, 0]],
+            "NHWC",
+        )
+        _test_convolution(
+            "depthwise",
+            [4, 8, 8, 16],
+            [1, 1, 16, 1],
+            [1, 1],
+            [1, 1],
+            [[0, 0], [2, 3], [0, 1], [0, 0]],
+            "NHWC",
+        )
+        _test_convolution(
+            "conv_transpose",
+            [4, 8, 8, 32],
+            [3, 3, 176, 32],
+            [1, 1],
+            [2, 2],
+            [[0, 0], [1, 0], [1, 0], [0, 0]],
+            "NHWC",
+            [4, 16, 16, 176],
+        )
 
 
 #######################################################################

From 5a0573b0fcc1bea2045f43a2fce6d85d05d5102a Mon Sep 17 00:00:00 2001
From: Andrew Reusch <areusch@octoml.ai>
Date: Sat, 20 Feb 2021 06:21:17 -0800
Subject: [PATCH 216/357] make test_runtime_rpc use pytest.main() (#7482)

---
 tests/python/unittest/test_runtime_rpc.py | 28 ++++++++---------------
 1 file changed, 9 insertions(+), 19 deletions(-)

diff --git a/tests/python/unittest/test_runtime_rpc.py b/tests/python/unittest/test_runtime_rpc.py
index e975a1699341..11c109810fbb 100644
--- a/tests/python/unittest/test_runtime_rpc.py
+++ b/tests/python/unittest/test_runtime_rpc.py
@@ -17,11 +17,12 @@
 import tvm
 from tvm import te
 import tvm.testing
+import logging
+import multiprocessing
 import os
 import stat
-import logging
+import sys
 import time
-import multiprocessing
 
 import pytest
 import numpy as np
@@ -29,6 +30,12 @@
 from tvm.contrib import utils, cc
 from tvm.rpc.tracker import Tracker
 
+
+if __name__ == "__main__":
+    # NOTE: must live here to avoid registering PackedFunc with libtvm.so twice.
+    sys.exit(pytest.main([__file__] + sys.argv[1:]))
+
+
 # tkonolige: The issue as I understand it is this: multiprocessing's spawn
 # method launches a new process and then imports the relevant modules. This
 # means that all registered functions must exist at the top level scope. In
@@ -526,20 +533,3 @@ def test_rpc_tracker_request():
     proc2.join()
     server.terminate()
     tracker.terminate()
-
-
-if __name__ == "__main__":
-    logging.basicConfig(level=logging.INFO)
-    test_rpc_echo()
-    test_rpc_session_constructor_args()
-    test_rpc_return_ndarray()
-    test_rpc_return_func()
-    test_bigendian_rpc()
-    test_rpc_remote_module()
-    test_rpc_file_exchange()
-    test_rpc_array()
-    test_rpc_simple()
-    test_local_func()
-    test_rpc_tracker_register()
-    test_rpc_tracker_request()
-    test_rpc_large_array()

From cfe88c1eee757b49b2837f31f29a79c08101a55c Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Sun, 21 Feb 2021 16:35:40 -0500
Subject: [PATCH 217/357] [TIR] Specialize MutateArray in StmtFunctor. (#7486)

StmtFunctor applies context dependent copy on write,
which requires check over all the dependency chain.
Such function is better suited as a special implementation
to avoid misuse. This PR refactors the code to specialize
the function.

Co-authored-by: Bohan Hou <32121147+spectrometerHBH@users.noreply.github.com>
---
 src/tir/ir/functor_common.h | 15 +++----------
 src/tir/ir/stmt_functor.cc  | 44 +++++++++++++++++++++++++++++++++----
 2 files changed, 43 insertions(+), 16 deletions(-)

diff --git a/src/tir/ir/functor_common.h b/src/tir/ir/functor_common.h
index f63dcfe003c6..9ed911f6b782 100644
--- a/src/tir/ir/functor_common.h
+++ b/src/tir/ir/functor_common.h
@@ -34,19 +34,10 @@ inline void VisitArray(const Array<T>& arr, F fvisit) {
   }
 }
 
-// Implementation of mutators
 template <typename T, typename F>
-inline Array<T> MutateArray(const Array<T>& arr, F fmutate, bool allow_copy_on_write = false) {
-  if (allow_copy_on_write) {
-    // if we allow copy on write, we can directly
-    // call the inplace mutate function.
-    const_cast<Array<T>&>(arr).MutateByApply(fmutate);
-    return arr;
-  } else {
-    Array<T> copy = arr;
-    copy.MutateByApply(fmutate);
-    return copy;
-  }
+inline Array<T> MutateArray(Array<T> arr, F fmutate) {
+  arr.MutateByApply(fmutate);
+  return arr;
 }
 
 }  // namespace tir
diff --git a/src/tir/ir/stmt_functor.cc b/src/tir/ir/stmt_functor.cc
index e0ccb49fc454..e4cc1b7e4275 100644
--- a/src/tir/ir/stmt_functor.cc
+++ b/src/tir/ir/stmt_functor.cc
@@ -114,14 +114,50 @@ void StmtVisitor::VisitStmt_(const EvaluateNode* op) { this->VisitExpr(op->value
 
 class StmtMutator::Internal {
  public:
+  /*!
+   * \brief Mutate array's element by fmutate function.
+   *
+   * \note Use extra care for copy on write setting.
+   *
+   * In particular, consider the following case of two reference chains:
+   * - strongref0 -> loop0 -> loop1 -> loop2
+   * - strongref1 -> loop3 -> loop1 -> loop2
+   *
+   * Think of the case of calling MutateArray on loop1->loop2(as const reference).
+   * When both strongref0 and strongref1 exists, the context does not allow copy
+   * on write, even though loop1 uniquely refers to loop2.
+   *
+   * \param self The pointer to the mutator.
+   * \param arr Array to be mutated, const reference is used to allow copy on write
+   *            mutation in a recursive visitor.
+   * \param fmutate The mutator function.
+   * \return The mutated array, a new copy can be created.
+   */
+  template <typename T, typename F>
+  static Array<T> MutateArray(StmtMutator* self, const Array<T>& arr, F fmutate) {
+    if (self->allow_copy_on_write_ && arr.unique()) {
+      // if we allow copy on write, we can directly
+      // call the inplace mutate function.
+      const_cast<Array<T>&>(arr).MutateByApply(fmutate);
+      return arr;
+    } else {
+      bool allow_cow = false;
+      Array<T> copy = arr;
+      std::swap(allow_cow, self->allow_copy_on_write_);
+      copy.MutateByApply(fmutate);
+      std::swap(allow_cow, self->allow_copy_on_write_);
+      return copy;
+    }
+  }
+
   static Array<PrimExpr> Mutate(StmtMutator* self, const Array<PrimExpr>& arr) {
     auto fmutate = [self](const PrimExpr& e) { return self->VisitExpr(e); };
-    return MutateArray(arr, fmutate, self->allow_copy_on_write_);
+    return MutateArray(self, arr, fmutate);
   }
 
   static Array<Stmt> Mutate(StmtMutator* self, const Array<Stmt>& arr) {
     auto fmutate = [self](const Stmt& s) { return self->VisitStmt(s); };
-    return MutateArray(arr, fmutate, self->allow_copy_on_write_);
+    return MutateArray(self, arr, fmutate);
   }
 
   static Array<Range> Mutate(StmtMutator* self, const Array<Range>& arr) {
@@ -134,7 +170,7 @@ class StmtMutator::Internal {
         return Range::FromMinExtent(min, extent);
       }
     };
-    return MutateArray(arr, fmutate, self->allow_copy_on_write_);
+    return MutateArray(self, arr, fmutate);
   }
 };
 
@@ -323,7 +359,7 @@ Stmt StmtMutator::VisitSeqStmt_(const SeqStmtNode* op, bool flatten_before_visit
   }
   // function to run the visit.
   auto frunvisit = [&](const SeqStmtNode* op) {
-    Array<Stmt> seq = fmutate != nullptr ? MutateArray(op->seq, fmutate, allow_copy_on_write_)
+    Array<Stmt> seq = fmutate != nullptr ? Internal::MutateArray(this, op->seq, fmutate)
                                          : Internal::Mutate(this, op->seq);
     if (seq.same_as(op->seq)) {
       return GetRef<Stmt>(op);

From 072c469c060ea1bf6ae409842158d238c64def3f Mon Sep 17 00:00:00 2001
From: masahi <masahi129@gmail.com>
Date: Mon, 22 Feb 2021 21:26:30 +0900
Subject: [PATCH 218/357] [CUDA][THRUST] Enforce -libs=thrust to allow thrust
 offload (#7468)

* add contrib/thrust.py

* update cuda strategy

* remove is_thrust_available, update nms, scan, sort and tests

* remove unused import

* trigger CI

* update

* add note on how to enable thrust in ssd tutorial

* add warning

* Revert "update"

This reverts commit c1629b39e5277003a82cbf31fe4da493537bc05f.

Co-authored-by: masa <masa@pop-os.localdomain>
---
 apps/topi_recipe/gemm/cuda_gemm_square.py |   1 +
 python/tvm/contrib/thrust.py              |  45 ++++++++
 python/tvm/relay/op/strategy/cuda.py      |  18 +--
 python/tvm/relay/op/strategy/rocm.py      |  19 +---
 python/tvm/topi/cuda/nms.py               |  10 +-
 python/tvm/topi/cuda/scan.py              |  13 +--
 python/tvm/topi/cuda/scatter.py           |   3 +-
 python/tvm/topi/cuda/sort.py              |   8 --
 tests/python/contrib/test_thrust.py       | 130 +++++++++++-----------
 tutorials/frontend/deploy_ssd_gluoncv.py  |   4 +
 10 files changed, 139 insertions(+), 112 deletions(-)
 create mode 100644 python/tvm/contrib/thrust.py

diff --git a/apps/topi_recipe/gemm/cuda_gemm_square.py b/apps/topi_recipe/gemm/cuda_gemm_square.py
index 25d14f9abdf3..0d548dc0b554 100644
--- a/apps/topi_recipe/gemm/cuda_gemm_square.py
+++ b/apps/topi_recipe/gemm/cuda_gemm_square.py
@@ -21,6 +21,7 @@
 from tvm.contrib import nvcc
 from tvm.contrib import spirv
 import numpy as np
+import tvm.testing
 
 TASK = "gemm"
 USE_MANUAL_CODE = False
diff --git a/python/tvm/contrib/thrust.py b/python/tvm/contrib/thrust.py
new file mode 100644
index 000000000000..7fe0077c2b42
--- /dev/null
+++ b/python/tvm/contrib/thrust.py
@@ -0,0 +1,45 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Utilities for thrust"""
+import logging
+
+from tvm._ffi import get_global_func
+
+
+def maybe_warn(target, func_name):
+    if get_global_func(func_name, allow_missing=True) and not "thrust" in target.libs:
+        logging.warning("TVM is built with thrust but thrust is not used.")
+    if "thrust" in target.libs and get_global_func(func_name, allow_missing=True) is None:
+        logging.warning("thrust is requested but TVM is not built with thrust.")
+
+
+def can_use_thrust(target, func_name):
+    maybe_warn(target, func_name)
+    return (
+        target.kind.name in ["cuda", "nvptx"]
+        and "thrust" in target.libs
+        and get_global_func(func_name, allow_missing=True)
+    )
+
+
+def can_use_rocthrust(target, func_name):
+    maybe_warn(target, func_name)
+    return (
+        target.kind.name == "rocm"
+        and "thrust" in target.libs
+        and get_global_func(func_name, allow_missing=True)
+    )
diff --git a/python/tvm/relay/op/strategy/cuda.py b/python/tvm/relay/op/strategy/cuda.py
index cb4688c4889e..20c5f03b9b0b 100644
--- a/python/tvm/relay/op/strategy/cuda.py
+++ b/python/tvm/relay/op/strategy/cuda.py
@@ -20,7 +20,7 @@
 from tvm.auto_scheduler import is_auto_scheduler_enabled
 from tvm.te import SpecializedCondition
 from tvm.contrib import nvcc
-from tvm._ffi import get_global_func
+from tvm.contrib.thrust import can_use_thrust
 from .generic import *
 from .. import op as _op
 
@@ -791,9 +791,7 @@ def scatter_cuda(attrs, inputs, out_type, target):
     rank = len(inputs[0].shape)
 
     with SpecializedCondition(rank == 1):
-        if target.kind.name == "cuda" and get_global_func(
-            "tvm.contrib.thrust.stable_sort_by_key", allow_missing=True
-        ):
+        if can_use_thrust(target, "tvm.contrib.thrust.stable_sort_by_key"):
             strategy.add_implementation(
                 wrap_compute_scatter(topi.cuda.scatter_via_sort),
                 wrap_topi_schedule(topi.cuda.schedule_scatter_via_sort),
@@ -838,9 +836,7 @@ def sort_strategy_cuda(attrs, inputs, out_type, target):
         wrap_topi_schedule(topi.cuda.schedule_sort),
         name="sort.cuda",
     )
-    if target.kind.name == "cuda" and get_global_func(
-        "tvm.contrib.thrust.sort", allow_missing=True
-    ):
+    if can_use_thrust(target, "tvm.contrib.thrust.sort"):
         strategy.add_implementation(
             wrap_compute_sort(topi.cuda.sort_thrust),
             wrap_topi_schedule(topi.cuda.schedule_sort),
@@ -859,9 +855,7 @@ def argsort_strategy_cuda(attrs, inputs, out_type, target):
         wrap_topi_schedule(topi.cuda.schedule_argsort),
         name="argsort.cuda",
     )
-    if target.kind.name == "cuda" and get_global_func(
-        "tvm.contrib.thrust.sort", allow_missing=True
-    ):
+    if can_use_thrust(target, "tvm.contrib.thrust.sort"):
         strategy.add_implementation(
             wrap_compute_argsort(topi.cuda.argsort_thrust),
             wrap_topi_schedule(topi.cuda.schedule_argsort),
@@ -880,9 +874,7 @@ def topk_strategy_cuda(attrs, inputs, out_type, target):
         wrap_topi_schedule(topi.cuda.schedule_topk),
         name="topk.cuda",
     )
-    if target.kind.name == "cuda" and get_global_func(
-        "tvm.contrib.thrust.sort", allow_missing=True
-    ):
+    if can_use_thrust(target, "tvm.contrib.thrust.sort"):
         strategy.add_implementation(
             wrap_compute_topk(topi.cuda.topk_thrust),
             wrap_topi_schedule(topi.cuda.schedule_topk),
diff --git a/python/tvm/relay/op/strategy/rocm.py b/python/tvm/relay/op/strategy/rocm.py
index 934f38625fd3..f4538071e11e 100644
--- a/python/tvm/relay/op/strategy/rocm.py
+++ b/python/tvm/relay/op/strategy/rocm.py
@@ -19,7 +19,8 @@
 from tvm import topi
 from tvm.auto_scheduler import is_auto_scheduler_enabled
 from tvm.te import SpecializedCondition
-from tvm._ffi import get_global_func
+from tvm.contrib.thrust import can_use_rocthrust
+
 from .generic import *
 from .. import op as _op
 from .cuda import judge_winograd, naive_schedule
@@ -223,14 +224,6 @@ def batch_matmul_strategy_rocm(attrs, inputs, out_type, target):
     return strategy
 
 
-def can_use_thrust(target, func_name):
-    return (
-        target.kind.name == "rocm"
-        and "thrust" in target.libs
-        and get_global_func(func_name, allow_missing=True)
-    )
-
-
 @argsort_strategy.register(["rocm"])
 def argsort_strategy_cuda(attrs, inputs, out_type, target):
     """argsort rocm strategy"""
@@ -240,7 +233,7 @@ def argsort_strategy_cuda(attrs, inputs, out_type, target):
         wrap_topi_schedule(topi.cuda.schedule_argsort),
         name="argsort.rocm",
     )
-    if can_use_thrust(target, "tvm.contrib.thrust.sort"):
+    if can_use_rocthrust(target, "tvm.contrib.thrust.sort"):
         strategy.add_implementation(
             wrap_compute_argsort(topi.cuda.argsort_thrust),
             wrap_topi_schedule(topi.cuda.schedule_argsort),
@@ -264,7 +257,7 @@ def scatter_cuda(attrs, inputs, out_type, target):
     rank = len(inputs[0].shape)
 
     with SpecializedCondition(rank == 1):
-        if can_use_thrust(target, "tvm.contrib.thrust.stable_sort_by_key"):
+        if can_use_rocthrust(target, "tvm.contrib.thrust.stable_sort_by_key"):
             strategy.add_implementation(
                 wrap_compute_scatter(topi.cuda.scatter_via_sort),
                 wrap_topi_schedule(topi.cuda.schedule_scatter_via_sort),
@@ -283,7 +276,7 @@ def sort_strategy_cuda(attrs, inputs, out_type, target):
         wrap_topi_schedule(topi.cuda.schedule_sort),
         name="sort.rocm",
     )
-    if can_use_thrust(target, "tvm.contrib.thrust.sort"):
+    if can_use_rocthrust(target, "tvm.contrib.thrust.sort"):
         strategy.add_implementation(
             wrap_compute_sort(topi.cuda.sort_thrust),
             wrap_topi_schedule(topi.cuda.schedule_sort),
@@ -303,7 +296,7 @@ def topk_strategy_cuda(attrs, inputs, out_type, target):
         name="topk.rocm",
     )
 
-    if can_use_thrust(target, "tvm.contrib.thrust.sort"):
+    if can_use_rocthrust(target, "tvm.contrib.thrust.sort"):
         strategy.add_implementation(
             wrap_compute_topk(topi.cuda.topk_thrust),
             wrap_topi_schedule(topi.cuda.schedule_topk),
diff --git a/python/tvm/topi/cuda/nms.py b/python/tvm/topi/cuda/nms.py
index 98cb6750408a..a5a9c4def526 100644
--- a/python/tvm/topi/cuda/nms.py
+++ b/python/tvm/topi/cuda/nms.py
@@ -19,9 +19,9 @@
 """Non-maximum suppression operator"""
 import tvm
 from tvm import te
-
+from tvm.contrib.thrust import can_use_thrust, can_use_rocthrust
 from tvm.tir import if_then_else
-from .sort import argsort, argsort_thrust, is_thrust_available
+from .sort import argsort, argsort_thrust
 from .scan import exclusive_scan
 from ..utils import ceil_div
 
@@ -610,8 +610,10 @@ def _get_sorted_indices(data, data_buf, score_index, score_shape):
     )
 
     target = tvm.target.Target.current()
-    # TODO(masahi): Check -libs=thrust option
-    if target and target.kind.name in ["cuda", "rocm"] and is_thrust_available():
+    if target and (
+        can_use_thrust(target, "tvm.contrib.thrust.sort")
+        or can_use_rocthrust(target, "tvm.contrib.thrust.sort")
+    ):
         sort_tensor = argsort_thrust(score_tensor, axis=1, is_ascend=False, dtype="int32")
     else:
         sort_tensor = argsort(score_tensor, axis=1, is_ascend=False, dtype="int32")
diff --git a/python/tvm/topi/cuda/scan.py b/python/tvm/topi/cuda/scan.py
index 65d23365dc15..84ab5dcf9756 100644
--- a/python/tvm/topi/cuda/scan.py
+++ b/python/tvm/topi/cuda/scan.py
@@ -18,7 +18,7 @@
 "Scan related operators"
 import tvm
 from tvm import te
-from tvm._ffi import get_global_func
+from tvm.contrib.thrust import can_use_thrust, can_use_rocthrust
 from ..transform import expand_dims, squeeze, transpose, reshape
 from ..utils import ceil_div, swap, prod, get_const_int
 from ..math import cast
@@ -249,11 +249,6 @@ def ir(data, data_ex_scan, reduction):
     return reduction
 
 
-def is_thrust_available():
-    """Test if thrust based scan ops are available."""
-    return get_global_func("tvm.contrib.thrust.sum_scan", allow_missing=True) is not None
-
-
 def scan_thrust(
     data, output_dtype, exclusive=True, return_reduction=False, binop=tvm.tir.generic.add
 ):
@@ -352,8 +347,10 @@ def exclusive_scan(
 
     def do_scan(data, output_dtype):
         target = tvm.target.Target.current()
-        # TODO(masahi): Check -libs=thrust option
-        if target and target.kind.name in ["cuda", "rocm"] and is_thrust_available():
+        if target and (
+            can_use_thrust(target, "tvm.contrib.thrust.sum_scan")
+            or can_use_rocthrust(target, "tvm.contrib.thrust.sum_scan")
+        ):
             return scan_thrust(
                 data, output_dtype, exclusive=True, return_reduction=return_reduction, binop=binop
             )
diff --git a/python/tvm/topi/cuda/scatter.py b/python/tvm/topi/cuda/scatter.py
index 444fb25cc34b..fd05904ba8e7 100644
--- a/python/tvm/topi/cuda/scatter.py
+++ b/python/tvm/topi/cuda/scatter.py
@@ -21,7 +21,7 @@
 from ..scatter import _verify_scatter_nd_inputs
 from ..generic import schedule_extern
 from .nms import atomic_add
-from .sort import stable_sort_by_key_thrust, is_thrust_available
+from .sort import stable_sort_by_key_thrust
 from ..utils import prod, ceil_div
 
 
@@ -565,7 +565,6 @@ def scatter_via_sort(cfg, data, indices, updates, axis=0):
     if axis < 0:
         axis += len(data.shape)
     assert axis == 0 and len(data.shape) == 1, "sorting based scatter only supported for 1d input"
-    assert is_thrust_available(), "Thrust is required for this op"
 
     cfg.add_flop(1)  # A dummy value to satisfy AutoTVM
 
diff --git a/python/tvm/topi/cuda/sort.py b/python/tvm/topi/cuda/sort.py
index c0f076fb6065..ff5cc0681ad2 100644
--- a/python/tvm/topi/cuda/sort.py
+++ b/python/tvm/topi/cuda/sort.py
@@ -18,7 +18,6 @@
 """Sort related operators """
 import tvm
 from tvm import te
-from tvm._ffi import get_global_func
 
 from .injective import schedule_injective_from_existing
 from ..transform import strided_slice, transpose
@@ -879,10 +878,3 @@ def stable_sort_by_key_thrust(keys, values, for_scatter=False):
         tag="stable_sort_by_key",
     )
     return out[0], out[1]
-
-
-def is_thrust_available():
-    """
-    Test if thrust based sorting ops are available.
-    """
-    return get_global_func("tvm.contrib.thrust.sort", allow_missing=True) is not None
diff --git a/tests/python/contrib/test_thrust.py b/tests/python/contrib/test_thrust.py
index 521c20de6cbd..4edce0d6a642 100644
--- a/tests/python/contrib/test_thrust.py
+++ b/tests/python/contrib/test_thrust.py
@@ -17,16 +17,16 @@
 import tvm
 import tvm.testing
 from tvm import te
-from tvm.topi.cuda import stable_sort_by_key_thrust, is_thrust_available
+from tvm.topi.cuda import stable_sort_by_key_thrust
 from tvm.topi.cuda.scan import exclusive_scan, scan_thrust, schedule_scan
+from tvm.contrib.thrust import can_use_thrust, can_use_rocthrust
 import numpy as np
 
 
-def test_stable_sort_by_key():
-    if not is_thrust_available():
-        print("skip because thrust is not enabled...")
-        return
+thrust_check_func = {"cuda": can_use_thrust, "rocm": can_use_rocthrust}
+
 
+def test_stable_sort_by_key():
     size = 6
     keys = te.placeholder((size,), name="keys", dtype="int32")
     values = te.placeholder((size,), name="values", dtype="int32")
@@ -38,74 +38,73 @@ def test_stable_sort_by_key():
             print("Skip because %s is not enabled" % target)
             continue
 
-        target += " -libs=thrust"
-        ctx = tvm.context(target, 0)
-        s = te.create_schedule([keys_out.op, values_out.op])
-        f = tvm.build(s, [keys, values, keys_out, values_out], target)
+        with tvm.target.Target(target + " -libs=thrust") as tgt:
+            if not thrust_check_func[target](tgt, "tvm.contrib.thrust.stable_sort_by_key"):
+                print("skip because thrust is not enabled...")
+                return
 
-        keys_np = np.array([1, 4, 2, 8, 2, 7], np.int32)
-        values_np = np.random.randint(0, 10, size=(size,)).astype(np.int32)
-        keys_np_out = np.zeros(keys_np.shape, np.int32)
-        values_np_out = np.zeros(values_np.shape, np.int32)
-        keys_in = tvm.nd.array(keys_np, ctx)
-        values_in = tvm.nd.array(values_np, ctx)
-        keys_out = tvm.nd.array(keys_np_out, ctx)
-        values_out = tvm.nd.array(values_np_out, ctx)
-        f(keys_in, values_in, keys_out, values_out)
+            ctx = tvm.context(target, 0)
+            s = te.create_schedule([keys_out.op, values_out.op])
+            f = tvm.build(s, [keys, values, keys_out, values_out], target)
 
-        ref_keys_out = np.sort(keys_np)
-        ref_values_out = np.array([values_np[i] for i in np.argsort(keys_np)])
-        tvm.testing.assert_allclose(keys_out.asnumpy(), ref_keys_out, rtol=1e-5)
-        tvm.testing.assert_allclose(values_out.asnumpy(), ref_values_out, rtol=1e-5)
+            keys_np = np.array([1, 4, 2, 8, 2, 7], np.int32)
+            values_np = np.random.randint(0, 10, size=(size,)).astype(np.int32)
+            keys_np_out = np.zeros(keys_np.shape, np.int32)
+            values_np_out = np.zeros(values_np.shape, np.int32)
+            keys_in = tvm.nd.array(keys_np, ctx)
+            values_in = tvm.nd.array(values_np, ctx)
+            keys_out = tvm.nd.array(keys_np_out, ctx)
+            values_out = tvm.nd.array(values_np_out, ctx)
+            f(keys_in, values_in, keys_out, values_out)
 
+            ref_keys_out = np.sort(keys_np)
+            ref_values_out = np.array([values_np[i] for i in np.argsort(keys_np)])
+            tvm.testing.assert_allclose(keys_out.asnumpy(), ref_keys_out, rtol=1e-5)
+            tvm.testing.assert_allclose(values_out.asnumpy(), ref_values_out, rtol=1e-5)
 
-def test_exclusive_scan():
-    if not is_thrust_available():
-        print("skip because thrust is not enabled...")
-        return
 
+def test_exclusive_scan():
     for target in ["cuda", "rocm"]:
         if not tvm.testing.device_enabled(target):
             print("Skip because %s is not enabled" % target)
             continue
 
-        target += " -libs=thrust"
-        for ishape in [(10,), (10, 10), (10, 10, 10)]:
-            values = te.placeholder(ishape, name="values", dtype="int32")
+        with tvm.target.Target(target + " -libs=thrust") as tgt:
+            if not thrust_check_func[target](tgt, "tvm.contrib.thrust.sum_scan"):
+                print("skip because thrust is not enabled...")
+                return
+
+            for ishape in [(10,), (10, 10), (10, 10, 10)]:
+                values = te.placeholder(ishape, name="values", dtype="int32")
 
-            with tvm.target.Target(target):
                 scan, reduction = exclusive_scan(values, return_reduction=True)
                 s = schedule_scan([scan, reduction])
 
-            ctx = tvm.context(target, 0)
-            f = tvm.build(s, [values, scan, reduction], target)
+                ctx = tvm.context(target, 0)
+                f = tvm.build(s, [values, scan, reduction], target)
 
-            values_np = np.random.randint(0, 10, size=ishape).astype(np.int32)
-            values_np_out = np.zeros(values_np.shape, np.int32)
+                values_np = np.random.randint(0, 10, size=ishape).astype(np.int32)
+                values_np_out = np.zeros(values_np.shape, np.int32)
 
-            if len(ishape) == 1:
-                reduction_shape = ()
-            else:
-                reduction_shape = ishape[:-1]
+                if len(ishape) == 1:
+                    reduction_shape = ()
+                else:
+                    reduction_shape = ishape[:-1]
 
-            reduction_np_out = np.zeros(reduction_shape, np.int32)
+                reduction_np_out = np.zeros(reduction_shape, np.int32)
 
-            values_in = tvm.nd.array(values_np, ctx)
-            values_out = tvm.nd.array(values_np_out, ctx)
-            reduction_out = tvm.nd.array(reduction_np_out, ctx)
-            f(values_in, values_out, reduction_out)
+                values_in = tvm.nd.array(values_np, ctx)
+                values_out = tvm.nd.array(values_np_out, ctx)
+                reduction_out = tvm.nd.array(reduction_np_out, ctx)
+                f(values_in, values_out, reduction_out)
 
-            ref_values_out = np.cumsum(values_np, axis=-1, dtype="int32") - values_np
-            tvm.testing.assert_allclose(values_out.asnumpy(), ref_values_out, rtol=1e-5)
-            ref_reduction_out = np.sum(values_np, axis=-1)
-            tvm.testing.assert_allclose(reduction_out.asnumpy(), ref_reduction_out, rtol=1e-5)
+                ref_values_out = np.cumsum(values_np, axis=-1, dtype="int32") - values_np
+                tvm.testing.assert_allclose(values_out.asnumpy(), ref_values_out, rtol=1e-5)
+                ref_reduction_out = np.sum(values_np, axis=-1)
+                tvm.testing.assert_allclose(reduction_out.asnumpy(), ref_reduction_out, rtol=1e-5)
 
 
 def test_inclusive_scan():
-    if not is_thrust_available():
-        print("skip because thrust is not enabled...")
-        return
-
     out_dtype = "int64"
 
     for target in ["cuda", "rocm"]:
@@ -113,25 +112,28 @@ def test_inclusive_scan():
             print("Skip because %s is not enabled" % target)
             continue
 
-        target += " -libs=thrust"
-        for ishape in [(10,), (10, 10)]:
-            values = te.placeholder(ishape, name="values", dtype="int32")
+        with tvm.target.Target(target + " -libs=thrust") as tgt:
+            if not thrust_check_func[target](tgt, "tvm.contrib.thrust.sum_scan"):
+                print("skip because thrust is not enabled...")
+                return
+
+            for ishape in [(10,), (10, 10)]:
+                values = te.placeholder(ishape, name="values", dtype="int32")
 
-            with tvm.target.Target(target):
                 scan = scan_thrust(values, out_dtype, exclusive=False)
                 s = tvm.te.create_schedule([scan.op])
 
-            ctx = tvm.context(target, 0)
-            f = tvm.build(s, [values, scan], target)
+                ctx = tvm.context(target, 0)
+                f = tvm.build(s, [values, scan], target)
 
-            values_np = np.random.randint(0, 10, size=ishape).astype(np.int32)
-            values_np_out = np.zeros(values_np.shape, out_dtype)
-            values_in = tvm.nd.array(values_np, ctx)
-            values_out = tvm.nd.array(values_np_out, ctx)
-            f(values_in, values_out)
+                values_np = np.random.randint(0, 10, size=ishape).astype(np.int32)
+                values_np_out = np.zeros(values_np.shape, out_dtype)
+                values_in = tvm.nd.array(values_np, ctx)
+                values_out = tvm.nd.array(values_np_out, ctx)
+                f(values_in, values_out)
 
-            ref_values_out = np.cumsum(values_np, axis=-1, dtype=out_dtype)
-            tvm.testing.assert_allclose(values_out.asnumpy(), ref_values_out, rtol=1e-5)
+                ref_values_out = np.cumsum(values_np, axis=-1, dtype=out_dtype)
+                tvm.testing.assert_allclose(values_out.asnumpy(), ref_values_out, rtol=1e-5)
 
 
 if __name__ == "__main__":
diff --git a/tutorials/frontend/deploy_ssd_gluoncv.py b/tutorials/frontend/deploy_ssd_gluoncv.py
index f1f1bbb7057e..478aff255e0c 100644
--- a/tutorials/frontend/deploy_ssd_gluoncv.py
+++ b/tutorials/frontend/deploy_ssd_gluoncv.py
@@ -94,6 +94,10 @@ def build(target):
 
 ######################################################################
 # Create TVM runtime and do inference
+# .. note::
+#
+#   Use target = "cuda -libs" to enable thrust based sort, if you
+#   enabled thrust during cmake by -DUSE_THRUST=ON.
 
 
 def run(lib, ctx):

From d666b411cae40a495ebcb94b76679af2339c6399 Mon Sep 17 00:00:00 2001
From: Trevor Morris <trevmorr@amazon.com>
Date: Mon, 22 Feb 2021 13:12:39 -0800
Subject: [PATCH 219/357] Fix cuda nms handling of additional per box features
 (#7483)

---
 python/tvm/topi/cuda/nms.py          | 56 ++++++++++++++++++++++++----
 tests/python/relay/test_op_level5.py | 36 ++++++++++++++++++
 2 files changed, 85 insertions(+), 7 deletions(-)

diff --git a/python/tvm/topi/cuda/nms.py b/python/tvm/topi/cuda/nms.py
index a5a9c4def526..152b1bd15987 100644
--- a/python/tvm/topi/cuda/nms.py
+++ b/python/tvm/topi/cuda/nms.py
@@ -272,6 +272,7 @@ def nms_ir(
     out_bboxes,
     out_scores,
     out_class_ids,
+    out_features,
     box_indices,
     num_valid_boxes,
     max_output_size,
@@ -390,6 +391,7 @@ def calculate_overlap(out_tensor, box_a_idx, box_b_idx):
     batch_size = data.shape[0]
     num_anchors = data.shape[1]
     box_data_length = data.shape[2]
+    num_features = out_features.shape[2]
 
     ib = tvm.tir.ir_builder.create()
 
@@ -402,6 +404,7 @@ def calculate_overlap(out_tensor, box_a_idx, box_b_idx):
     out_bboxes = ib.buffer_ptr(out_bboxes)
     out_scores = ib.buffer_ptr(out_scores)
     out_class_ids = ib.buffer_ptr(out_class_ids)
+    out_features = ib.buffer_ptr(out_features)
     box_indices = ib.buffer_ptr(box_indices)
     num_valid_boxes = ib.buffer_ptr(num_valid_boxes)
 
@@ -428,6 +431,7 @@ def calculate_overlap(out_tensor, box_a_idx, box_b_idx):
         i = by
         base_src_idx = i * num_anchors * box_data_length
         base_bbox_idx = i * num_anchors * 4
+        base_features_idx = i * num_anchors * num_features
 
         with ib.if_scope(tvm.tir.all(iou_threshold > 0, valid_count[i] > 0)):
             # Reorder output
@@ -439,6 +443,10 @@ def calculate_overlap(out_tensor, box_a_idx, box_b_idx):
                 src_idx = base_src_idx + sorted_index[i * num_anchors + j] * box_data_length
                 with ib.for_range(0, 4, kind="unroll") as k:
                     out_bboxes[(base_bbox_idx + j * 4 + k)] = data[src_idx + coord_start + k]
+                with ib.for_range(0, num_features, kind="unroll") as k:
+                    out_features[(base_features_idx + j * num_features + k)] = data[
+                        src_idx + coord_start + 4 + k
+                    ]
 
                 out_scores[i * num_anchors + j] = data[src_idx + score_index]
 
@@ -452,6 +460,8 @@ def calculate_overlap(out_tensor, box_a_idx, box_b_idx):
                     with ib.if_scope(j < num_anchors):
                         with ib.for_range(0, 4, kind="unroll") as k:
                             out_bboxes[(base_bbox_idx + j * 4 + k)] = -1.0
+                        with ib.for_range(0, num_features, kind="unroll") as k:
+                            out_features[(base_features_idx + j * num_features + k)] = -1.0
 
                         out_scores[i, j] = -1.0
 
@@ -468,6 +478,10 @@ def calculate_overlap(out_tensor, box_a_idx, box_b_idx):
 
                 with ib.for_range(0, 4, kind="unroll") as k:
                     out_bboxes[base_bbox_idx + j * 4 + k] = data[src_offset + coord_start + k]
+                with ib.for_range(0, num_features, kind="unroll") as k:
+                    out_features[(base_features_idx + j * num_features + k)] = data[
+                        src_offset + coord_start + 4 + k
+                    ]
                 out_scores[i * num_anchors + j] = data[src_offset + score_index]
 
                 if id_index >= 0:
@@ -649,16 +663,26 @@ def _run_nms(
 
     batch_size = data.shape[0]
     num_anchors = data.shape[1]
+    # Number of extra features per box beyond coords, score, and id.
+    num_features = data.shape[2] - 6 if id_index >= 0 else data.shape[2] - 5
 
     # output shapes
     bbox_shape = (batch_size, num_anchors, 4)
     score_shape = (batch_size, num_anchors)
     class_id_shape = score_shape
+    out_features_shape = (batch_size, num_anchors, num_features)
     box_indices_shape = score_shape
     num_valid_boxes_shape = (batch_size, 1)
 
     return te.extern(
-        [bbox_shape, score_shape, class_id_shape, box_indices_shape, num_valid_boxes_shape],
+        [
+            bbox_shape,
+            score_shape,
+            class_id_shape,
+            out_features_shape,
+            box_indices_shape,
+            num_valid_boxes_shape,
+        ],
         [data, sort_tensor, valid_count, indices],
         lambda ins, outs: nms_ir(
             ins[0],
@@ -668,8 +692,9 @@ def _run_nms(
             outs[0],  # sorted bbox
             outs[1],  # sorted scores
             outs[2],  # sorted class ids
-            outs[3],  # box_indices
-            outs[4],  # num_valid_boxes
+            outs[3],  # sorted box feats
+            outs[4],  # box_indices
+            outs[5],  # num_valid_boxes
             max_output_size,
             iou_threshold,
             force_suppress,
@@ -679,7 +704,7 @@ def _run_nms(
             score_index,
             return_indices,
         ),
-        dtype=[data.dtype, "float32", "float32", "int32", "int32"],
+        dtype=[data.dtype, "float32", "float32", "float32", "int32", "int32"],
         in_buffers=[data_buf, sort_tensor_buf, valid_count_buf, indices_buf],
         name="nms",
         tag="nms",
@@ -687,11 +712,19 @@ def _run_nms(
 
 
 def _concatenate_outputs(
-    out_bboxes, out_scores, out_class_ids, out_shape, coord_start, score_index, id_index
+    out_bboxes,
+    out_scores,
+    out_class_ids,
+    out_features,
+    out_shape,
+    coord_start,
+    score_index,
+    id_index,
 ):
     """Pack the results from NMS into a single 5D or 6D tensor."""
     batch_size = out_bboxes.shape[0]
     num_anchors = out_bboxes.shape[1]
+    num_features = out_features.shape[2]
 
     def ir(out_bboxes, out_scores, out_class_ids, out):
         ib = tvm.tir.ir_builder.create()
@@ -718,6 +751,8 @@ def ir(out_bboxes, out_scores, out_class_ids, out):
             with ib.if_scope(tid < num_anchors):
                 with ib.for_range(0, 4, kind="unroll") as j:
                     out[i, tid, coord_start + j] = out_bboxes[i, tid, j]
+                with ib.for_range(0, num_features, kind="unroll") as j:
+                    out[i, tid, coord_start + 4 + j] = out_features[i, tid, j]
                 out[i, tid, score_index] = out_scores[i, tid]
                 if id_index >= 0:
                     out[i, tid, id_index] = out_class_ids[i, tid]
@@ -829,7 +864,7 @@ def non_max_suppression(
 
     sort_tensor = _get_sorted_indices(data, data_buf, score_index, (data.shape[0], data.shape[1]))
 
-    out_bboxes, out_scores, out_class_ids, box_indices, num_valid_boxes = _run_nms(
+    out_bboxes, out_scores, out_class_ids, out_features, box_indices, num_valid_boxes = _run_nms(
         data,
         data_buf,
         sort_tensor,
@@ -849,5 +884,12 @@ def non_max_suppression(
         return [box_indices, num_valid_boxes]
 
     return _concatenate_outputs(
-        out_bboxes, out_scores, out_class_ids, data.shape, coord_start, score_index, id_index
+        out_bboxes,
+        out_scores,
+        out_class_ids,
+        out_features,
+        data.shape,
+        coord_start,
+        score_index,
+        id_index,
     )
diff --git a/tests/python/relay/test_op_level5.py b/tests/python/relay/test_op_level5.py
index 0a84667f8bdb..87f3ab87989b 100644
--- a/tests/python/relay/test_op_level5.py
+++ b/tests/python/relay/test_op_level5.py
@@ -488,6 +488,42 @@ def verify_nms(
         top_k=2,
     )
 
+    np_data = np.array(
+        [
+            [
+                [0, 0.8, 1, 20, 25, 45, 1, 2, 3, 4],
+                [1, 0.7, 30, 60, 50, 80, 5, 6, 7, 8],
+                [0, 0.4, 4, 21, 19, 40, 9, 10, 11, 12],
+                [2, 0.9, 35, 61, 52, 79, 13, 14, 15, 16],
+                [1, 0.5, 100, 60, 70, 110, 17, 18, 19, 20],
+            ]
+        ]
+    ).astype("float32")
+    np_result = np.array(
+        [
+            [
+                [2, 0.9, 35, 61, 52, 79, 13, 14, 15, 16],
+                [0, 0.8, 1, 20, 25, 45, 1, 2, 3, 4],
+                [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1],
+                [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1],
+                [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1],
+            ]
+        ]
+    )
+    dshape = (1, 5, 10)
+    verify_nms(
+        np_data,
+        np_valid_count,
+        np_indices,
+        np_max_output_size,
+        dshape,
+        np_result,
+        np_indices_result,
+        force_suppress=True,
+        top_k=2,
+        check_type_only=False,
+    )
+
 
 @tvm.testing.uses_gpu
 def test_multibox_transform_loc():

From 84359a97bdb896ddd2a74c65631a00da1ec5ec70 Mon Sep 17 00:00:00 2001
From: Masahiro Hiramori <8973217+mshr-h@users.noreply.github.com>
Date: Tue, 23 Feb 2021 07:45:35 +0900
Subject: [PATCH 220/357] Fixed minor misspelling (#7499)

Co-authored-by: mshr-h <mshr-h@users.noreply.github.com>
---
 include/tvm/ir/attrs.h                          | 2 +-
 include/tvm/runtime/packed_func.h               | 2 +-
 include/tvm/topi/einsum.h                       | 2 +-
 nnvm/src/core/symbolic.cc                       | 4 ++--
 python/tvm/micro/contrib/zephyr.py              | 2 +-
 python/tvm/relay/frontend/coreml.py             | 2 +-
 python/tvm/relay/testing/resnet.py              | 2 +-
 python/tvm/relay/testing/resnet_3d.py           | 2 +-
 python/tvm/relay/transform/transform.py         | 2 +-
 python/tvm/tir/stmt.py                          | 2 +-
 python/tvm/tir/transform/function_pass.py       | 2 +-
 python/tvm/topi/arm_cpu/depthwise_conv2d.py     | 2 +-
 python/tvm/topi/cuda/batch_matmul_tensorcore.py | 2 +-
 python/tvm/topi/cuda/rcnn/proposal.py           | 2 +-
 python/tvm/topi/nn/batch_matmul.py              | 2 +-
 python/tvm/topi/random/kernel.py                | 2 +-
 python/tvm/topi/testing/strided_slice_python.py | 4 ++--
 python/tvm/topi/utils.py                        | 2 +-
 python/tvm/topi/vision/rcnn/proposal.py         | 2 +-
 python/tvm/topi/x86/batch_matmul.py             | 6 +++---
 python/tvm/topi/x86/conv2d_avx_1x1.py           | 2 +-
 src/relay/ir/dataflow_matcher.cc                | 2 +-
 src/relay/ir/indexed_graph.cc                   | 4 ++--
 src/relay/transforms/partition_graph.cc         | 2 +-
 src/runtime/c_runtime_api.cc                    | 2 +-
 src/tir/transforms/hoist_if_then_else.cc        | 2 +-
 tests/python/frontend/mxnet/model_zoo/resnet.py | 2 +-
 tests/python/frontend/onnx/test_forward.py      | 2 +-
 28 files changed, 33 insertions(+), 33 deletions(-)

diff --git a/include/tvm/ir/attrs.h b/include/tvm/ir/attrs.h
index 13bfd715cdfb..f05ab04c3305 100644
--- a/include/tvm/ir/attrs.h
+++ b/include/tvm/ir/attrs.h
@@ -146,7 +146,7 @@ class BaseAttrsNode : public Object {
   virtual void VisitAttrs(AttrVisitor* v) {}
   /*!
    * \brief Initialize the attributes by sequence of arguments
-   * \param args The postional arguments in the form
+   * \param args The positional arguments in the form
    *        [key0, value0, key1, value1, ..., key_n, value_n]
    */
   template <typename... Args>
diff --git a/include/tvm/runtime/packed_func.h b/include/tvm/runtime/packed_func.h
index e43e042866ff..cf30923aacb0 100644
--- a/include/tvm/runtime/packed_func.h
+++ b/include/tvm/runtime/packed_func.h
@@ -1204,7 +1204,7 @@ struct func_signature_helper<R (T::*)(Args...) const> {
 
 /*!
  * \brief template class to get function signature of a function or functor.
- * \tparam T The funtion/functor type.
+ * \tparam T The function/functor type.
  */
 template <typename T>
 struct function_signature {
diff --git a/include/tvm/topi/einsum.h b/include/tvm/topi/einsum.h
index e1baadab09d3..a0c4039909ad 100644
--- a/include/tvm/topi/einsum.h
+++ b/include/tvm/topi/einsum.h
@@ -439,7 +439,7 @@ inline std::vector<std::string> Split(const std::string& str, const std::string&
  * \param subscripts input subscripts.
  * \param operands operand tensors.
  *
- * \return vector of strings, vector[0] represents the input part, vector[1] represents the ouput.
+ * \return vector of strings, vector[0] represents the input part, vector[1] represents the output.
  * if no output, the vector[1] is NULL.
  * "ab, bc -> ac" => ["ab,bc", "ac"]
  */
diff --git a/nnvm/src/core/symbolic.cc b/nnvm/src/core/symbolic.cc
index 12b8675d0bd7..48f834b28535 100644
--- a/nnvm/src/core/symbolic.cc
+++ b/nnvm/src/core/symbolic.cc
@@ -240,7 +240,7 @@ std::vector<std::string> Symbol::ListInputNames(ListInputOption option) const {
 }
 
 std::vector<std::string> Symbol::ListOutputNames() const {
-  static auto& flist_ouputs = Op::GetAttr<FListOutputNames>("FListOutputNames");
+  static auto& flist_outputs = Op::GetAttr<FListOutputNames>("FListOutputNames");
 
   std::vector<std::string> ret;
   ret.reserve(outputs.size());
@@ -250,7 +250,7 @@ std::vector<std::string> Symbol::ListOutputNames() const {
     } else {
       const std::string& hname = head.node->attrs.name;
       std::string rname;
-      FListOutputNames fn = flist_ouputs.get(head.node->op(), nullptr);
+      FListOutputNames fn = flist_outputs.get(head.node->op(), nullptr);
       if (fn != nullptr) {
         rname = fn(head.node->attrs)[head.index];
       } else {
diff --git a/python/tvm/micro/contrib/zephyr.py b/python/tvm/micro/contrib/zephyr.py
index 29bb5ecdbe6f..cd9c23cd2f9d 100644
--- a/python/tvm/micro/contrib/zephyr.py
+++ b/python/tvm/micro/contrib/zephyr.py
@@ -537,7 +537,7 @@ class QemuStartupFailureError(Exception):
 
 
 class QemuFdTransport(file_descriptor.FdTransport):
-    """An FdTransport subclass that escapes written data to accomodate the QEMU monitor.
+    """An FdTransport subclass that escapes written data to accommodate the QEMU monitor.
 
     It's supposedly possible to disable the monitor, but Zephyr controls most of the command-line
     arguments for QEMU and there are too many options which implictly enable the monitor, so this
diff --git a/python/tvm/relay/frontend/coreml.py b/python/tvm/relay/frontend/coreml.py
index 4efe014b9ffd..f850750fad51 100644
--- a/python/tvm/relay/frontend/coreml.py
+++ b/python/tvm/relay/frontend/coreml.py
@@ -524,7 +524,7 @@ def coreml_op_to_relay(op, inname, outnames, etab):
             outname = outnames if isinstance(outnames, _base.string_types) else outnames[0]
             etab.set_expr(outname, outs, force_override=True)
         else:
-            # the number of ouputs from model op and tvm relay must be same
+            # the number of outputs from model op and tvm relay must be same
             assert len(outnames) == len(outs)
             for outname, out in zip(outnames, outs):
                 etab.set_expr(outname, out, force_override=True)
diff --git a/python/tvm/relay/testing/resnet.py b/python/tvm/relay/testing/resnet.py
index bc5f5c4eed3e..b35e01f6779b 100644
--- a/python/tvm/relay/testing/resnet.py
+++ b/python/tvm/relay/testing/resnet.py
@@ -177,7 +177,7 @@ def resnet(
         Channel size of each stage
 
     num_classes : int
-        Ouput size of symbol
+        Output size of symbol
 
     data_shape : tuple of int.
         The shape of input data.
diff --git a/python/tvm/relay/testing/resnet_3d.py b/python/tvm/relay/testing/resnet_3d.py
index 484f51dcac9b..715e3951b856 100644
--- a/python/tvm/relay/testing/resnet_3d.py
+++ b/python/tvm/relay/testing/resnet_3d.py
@@ -174,7 +174,7 @@ def resnet(
         Channel size of each stage
 
     num_classes : int
-        Ouput size of symbol
+        Output size of symbol
 
     data_shape : tuple of int.
         The shape of input data.
diff --git a/python/tvm/relay/transform/transform.py b/python/tvm/relay/transform/transform.py
index f02f8352de9e..0d078d39372d 100644
--- a/python/tvm/relay/transform/transform.py
+++ b/python/tvm/relay/transform/transform.py
@@ -985,7 +985,7 @@ def transform(func, mod, ctx):
     """
 
     if opt_level is None:
-        raise ValueError("Please provide opt_level for the funtion pass.")
+        raise ValueError("Please provide opt_level for the function pass.")
 
     required = required if required else []
     if not isinstance(required, (list, tuple)):
diff --git a/python/tvm/tir/stmt.py b/python/tvm/tir/stmt.py
index 9e1ef56cca58..5882dca5578e 100644
--- a/python/tvm/tir/stmt.py
+++ b/python/tvm/tir/stmt.py
@@ -109,7 +109,7 @@ class For(Stmt):
         The loop variable.
 
     min_val : PrimExpr
-        The begining value.
+        The beginning value.
 
     extent : PrimExpr
         The length of the loop.
diff --git a/python/tvm/tir/transform/function_pass.py b/python/tvm/tir/transform/function_pass.py
index 59b3ecd6237d..7cff1f66a625 100644
--- a/python/tvm/tir/transform/function_pass.py
+++ b/python/tvm/tir/transform/function_pass.py
@@ -130,7 +130,7 @@ def transform(func, mod, ctx):
     """
 
     if opt_level is None:
-        raise ValueError("Please provide opt_level for the funtion pass.")
+        raise ValueError("Please provide opt_level for the function pass.")
 
     required = required if required else []
     if not isinstance(required, (list, tuple)):
diff --git a/python/tvm/topi/arm_cpu/depthwise_conv2d.py b/python/tvm/topi/arm_cpu/depthwise_conv2d.py
index 441b0a5a3688..c21480724ae4 100644
--- a/python/tvm/topi/arm_cpu/depthwise_conv2d.py
+++ b/python/tvm/topi/arm_cpu/depthwise_conv2d.py
@@ -692,7 +692,7 @@ def _schedule_spatial_pack(cfg, s, data_vec, kernel_vec, conv, output, last):
     if kernel_vec.op.name == "kernel_vec":
         co, _, _, _, _ = s[kernel_vec].op.axis
         if autotvm.GLOBAL_SCOPE.in_tuning:
-            # kernel packing will be pre-computed during compliation, so we skip
+            # kernel packing will be pre-computed during compilation, so we skip
             # this part to make tuning records correct
             s[kernel_vec].pragma(co, "debug_skip_region")
         else:
diff --git a/python/tvm/topi/cuda/batch_matmul_tensorcore.py b/python/tvm/topi/cuda/batch_matmul_tensorcore.py
index 59b92ec9e623..962a8af7853b 100644
--- a/python/tvm/topi/cuda/batch_matmul_tensorcore.py
+++ b/python/tvm/topi/cuda/batch_matmul_tensorcore.py
@@ -291,7 +291,7 @@ def batch_matmul_tensorcore_cuda(x, y):
     x_shape = get_const_tuple(x.shape)
     y_shape = get_const_tuple(y.shape)
     assert x_shape[0] == y_shape[0], "batch dimension doesn't match"
-    assert x_shape[2] == y_shape[2], "shapes of x and y is inconsistant"
+    assert x_shape[2] == y_shape[2], "shapes of x and y is inconsistent"
     batch, M, K = x.shape
     N = y.shape[1]
     out_dtype = x.dtype
diff --git a/python/tvm/topi/cuda/rcnn/proposal.py b/python/tvm/topi/cuda/rcnn/proposal.py
index e5e83b4911a3..12f7a23abe35 100644
--- a/python/tvm/topi/cuda/rcnn/proposal.py
+++ b/python/tvm/topi/cuda/rcnn/proposal.py
@@ -203,7 +203,7 @@ def argsort_ir(data_buf, out_index_buf):
 
 
 def nms_ir(sorted_bbox_buf, out_buf, nms_threshold):
-    """Non-maximum supression.
+    """Non-maximum suppression.
 
     Parameters
     ----------
diff --git a/python/tvm/topi/nn/batch_matmul.py b/python/tvm/topi/nn/batch_matmul.py
index 9c5848129397..b6ed5a373e81 100644
--- a/python/tvm/topi/nn/batch_matmul.py
+++ b/python/tvm/topi/nn/batch_matmul.py
@@ -62,7 +62,7 @@ def batch_matmul(x, y, oshape=None, auto_scheduler_rewritten_layout=""):
     k = te.reduce_axis((0, K), name="k")
     if oshape is None:
         assert XB == YB or XB == 1 or YB == 1, "batch dimension doesn't match"
-        assert x_shape[2] == y_shape[2], "shapes of x and y is inconsistant"
+        assert x_shape[2] == y_shape[2], "shapes of x and y is inconsistent"
         batch = te.max(XB, YB)
         N = y.shape[1]
         oshape = (batch, M, N)
diff --git a/python/tvm/topi/random/kernel.py b/python/tvm/topi/random/kernel.py
index b21db3778744..728cd682fa42 100644
--- a/python/tvm/topi/random/kernel.py
+++ b/python/tvm/topi/random/kernel.py
@@ -121,7 +121,7 @@ def _threefry(
         Threefry will write to :code:`out_buf[out_offset:out_offset+4*product(out_shape)]`
 
     out_shape: number
-        Determines the number of ouput states to generate. :code:`state[i]` will correspond to
+        Determines the number of output states to generate. :code:`state[i]` will correspond to
         counter+i.
     """
     nrounds = 20
diff --git a/python/tvm/topi/testing/strided_slice_python.py b/python/tvm/topi/testing/strided_slice_python.py
index c5eb72396c4f..30466c785778 100644
--- a/python/tvm/topi/testing/strided_slice_python.py
+++ b/python/tvm/topi/testing/strided_slice_python.py
@@ -26,7 +26,7 @@ def strided_slice_python(data, begin, end, strides, slice_mode="end"):
         Input data
 
     begin : list
-        Begining of the slices.
+        Beginning of the slices.
 
     end : list
         End of the slices.
@@ -81,7 +81,7 @@ def strided_set_python(data, v, begin, end, strides):
         Value data
 
     begin : list
-        Begining of the slices.
+        Beginning of the slices.
 
     end : list
         End of the slices.
diff --git a/python/tvm/topi/utils.py b/python/tvm/topi/utils.py
index cd9f0c61c854..2e8528c5e76c 100644
--- a/python/tvm/topi/utils.py
+++ b/python/tvm/topi/utils.py
@@ -460,7 +460,7 @@ def make_idx(b, e, s, z, i):
 
     Returns
     -------
-    postion: Expr
+    position: Expr
         int expression that corresponds to an array position in the selection.
     """
     bc = tvm.tir.Select(s < 0, i <= e, i < b)
diff --git a/python/tvm/topi/vision/rcnn/proposal.py b/python/tvm/topi/vision/rcnn/proposal.py
index e15ba8cd27c7..12a0d6bcf0a0 100644
--- a/python/tvm/topi/vision/rcnn/proposal.py
+++ b/python/tvm/topi/vision/rcnn/proposal.py
@@ -231,7 +231,7 @@ def argsort_ir(data_buf, out_index_buf):
 
 
 def nms_ir(sorted_bbox_buf, out_buf, nms_threshold):
-    """Non-maximum supression.
+    """Non-maximum suppression.
 
     Parameters
     ----------
diff --git a/python/tvm/topi/x86/batch_matmul.py b/python/tvm/topi/x86/batch_matmul.py
index 79b38de8cf93..df480123375d 100644
--- a/python/tvm/topi/x86/batch_matmul.py
+++ b/python/tvm/topi/x86/batch_matmul.py
@@ -49,7 +49,7 @@ def batch_matmul(cfg, x, y, out_shape=None):
     XB, M, XK = get_const_tuple(x.shape)
     YB, N, YK = get_const_tuple(y.shape)
     assert (XB == YB) or (YB == 1) or (XB == 1), "batch dimension doesn't match"
-    assert XK == YK, "shapes of x and y is inconsistant"
+    assert XK == YK, "shapes of x and y is inconsistent"
     B = te.max(XB, YB)
     K = XK
     if out_shape is not None:
@@ -151,7 +151,7 @@ def batch_matmul_blas_common(cfg, x, y, out_shape, lib):
         3-D with shape [batch, N, K]
     out_shape : tuple or None
         Shape of the output
-    lib : A contrib module which implements batch_matmul funtion
+    lib : A contrib module which implements batch_matmul function
         cblas and mkl are supported
 
     Returns
@@ -163,7 +163,7 @@ def batch_matmul_blas_common(cfg, x, y, out_shape, lib):
     XB, M, XK = get_const_tuple(x.shape)
     YB, N, YK = get_const_tuple(y.shape)
     assert XB == YB, "batch dimension doesn't match"
-    assert XK == YK, "shapes of x and y is inconsistant"
+    assert XK == YK, "shapes of x and y is inconsistent"
     if out_shape is not None:
         assert out_shape[0] == XB, "got invalid output shape"
         assert out_shape[1] == M, "got invalid output shape"
diff --git a/python/tvm/topi/x86/conv2d_avx_1x1.py b/python/tvm/topi/x86/conv2d_avx_1x1.py
index afee03a9f6a0..32b06725cdc2 100644
--- a/python/tvm/topi/x86/conv2d_avx_1x1.py
+++ b/python/tvm/topi/x86/conv2d_avx_1x1.py
@@ -191,7 +191,7 @@ def _declaration_conv_nhwc_pack(cfg, Input, Filter, stride, padding, dilation, o
     pad_before = [0, pad_top, pad_left, 0]
     pad_after = [0, pad_down, pad_right, 0]
     PaddedInput = pad(Input, pad_before, pad_after, name="PaddedInput")
-    # todo: padding filter to accomodate the intrinsic
+    # todo: padding filter to accommodate the intrinsic
 
     # packing the Filter to let memory access be consecutive for AVX512 intrinsic
     # Done in pre-compute stage
diff --git a/src/relay/ir/dataflow_matcher.cc b/src/relay/ir/dataflow_matcher.cc
index ac716579f2ab..43a6473fb632 100644
--- a/src/relay/ir/dataflow_matcher.cc
+++ b/src/relay/ir/dataflow_matcher.cc
@@ -734,7 +734,7 @@ class PatternGrouper {
           // Exit due to overlapping partitions
           return;
         } else if (kv.second != body) {
-          // if the node isn't the ouput of the group
+          // if the node isn't the output of the group
           auto node = matcher_->expr_graph_.node_map_.at(kv.first);
           for (auto* output : node->outputs_) {
             // and the node is used by nodes outside of the group
diff --git a/src/relay/ir/indexed_graph.cc b/src/relay/ir/indexed_graph.cc
index 0f81c2360d0f..36789e6f808a 100644
--- a/src/relay/ir/indexed_graph.cc
+++ b/src/relay/ir/indexed_graph.cc
@@ -73,7 +73,7 @@ IndexedGraph<Expr> CreateIndexedGraph(const Expr& expr) {
       return std::move(graph_);
     }
 
-    /*! Default visitation pushes the parent to the child's ouputs and the child to the parent's
+    /*! Default visitation pushes the parent to the child's outputs and the child to the parent's
      * inputs*/
     void VisitExpr(const Expr& expr, NodePtr parent) override {
       auto current = graph_.node_map_[expr];
@@ -220,7 +220,7 @@ IndexedGraph<DFPattern> CreateIndexedGraph(const DFPattern& pattern) {
       return std::move(graph_);
     }
 
-    /*! Default visitation pushes the parent to the child's ouputs */
+    /*! Default visitation pushes the parent to the child's outputs */
     void VisitDFPattern(const DFPattern& pattern, NodePtr parent) override {
       auto current = graph_.node_map_[pattern];
       if (parent) {
diff --git a/src/relay/transforms/partition_graph.cc b/src/relay/transforms/partition_graph.cc
index 7508d4437c18..404c7efb10b0 100644
--- a/src/relay/transforms/partition_graph.cc
+++ b/src/relay/transforms/partition_graph.cc
@@ -177,7 +177,7 @@ class Partitioner : public MixedModeMutator {
       AnnotatedRegion region = GetRegion(GetRef<Call>(call));
 
       // TODO(@manupa-arm) : need to use the parent function (to which region
-      // belongs to) name/key for the funtions that are created
+      // belongs to) name/key for the functions that are created
       BaseFunc f = GetFunc(GetRef<Call>(call));
 
       // Traverse subgraph inputs.
diff --git a/src/runtime/c_runtime_api.cc b/src/runtime/c_runtime_api.cc
index 6ecc60a93dec..b4457bf66614 100644
--- a/src/runtime/c_runtime_api.cc
+++ b/src/runtime/c_runtime_api.cc
@@ -169,7 +169,7 @@ void DeviceAPI::SyncStreamFromTo(TVMContext ctx, TVMStreamHandle event_src,
 // {message1}
 // {message2}
 // {Stack trace:}    // stack traces follow by this line
-//   {trace 0}       // two spaces in the begining.
+//   {trace 0}       // two spaces in the beginning.
 //   {trace 1}
 //   {trace 2}
 //--------------------------------------------------------
diff --git a/src/tir/transforms/hoist_if_then_else.cc b/src/tir/transforms/hoist_if_then_else.cc
index 7bae0ce8ca75..4a11a7e90e30 100644
--- a/src/tir/transforms/hoist_if_then_else.cc
+++ b/src/tir/transforms/hoist_if_then_else.cc
@@ -168,7 +168,7 @@ class HoistCandidateSelector final : public StmtExprVisitor {
     // To stop hoisting if any of the block variables are used.
     //
     // In case we want to use hoisting in between certain passes
-    // which have interdependencies of the postioning of if nodes with scope var
+    // which have interdependencies of the positioning of if nodes with scope var
     // it is better to disable this section
     if (support_block_scope_hosting_) {
       if (IsRecordingOn()) {
diff --git a/tests/python/frontend/mxnet/model_zoo/resnet.py b/tests/python/frontend/mxnet/model_zoo/resnet.py
index 98cdce6b4ea7..00e68958b462 100644
--- a/tests/python/frontend/mxnet/model_zoo/resnet.py
+++ b/tests/python/frontend/mxnet/model_zoo/resnet.py
@@ -182,7 +182,7 @@ def resnet(
     filter_list : list
         Channel size of each stage
     num_classes : int
-        Ouput size of symbol
+        Output size of symbol
     dataset : str
         Dataset type, only cifar10 and imagenet supports
     workspace : int
diff --git a/tests/python/frontend/onnx/test_forward.py b/tests/python/frontend/onnx/test_forward.py
index 59ecffe829df..d6fe98d031fa 100644
--- a/tests/python/frontend/onnx/test_forward.py
+++ b/tests/python/frontend/onnx/test_forward.py
@@ -3502,7 +3502,7 @@ def verify_roi_align(
 # @tvm.testing.uses_gpu
 def test_non_max_suppression():
     def verify_nms(
-        boxes, scores, max_ouput_boxes_per_class, iou_threshold, score_threshold, output_dims
+        boxes, scores, max_output_boxes_per_class, iou_threshold, score_threshold, output_dims
     ):
         input_names = ["boxes", "scores", "max_output_boxes_per_class", "iou_threshold"]
         input_nodes = [

From 794f6c673c7aa71e3e7225d2dfb36d6545e10e9b Mon Sep 17 00:00:00 2001
From: Xiyou Zhou <xiyou@octoml.ai>
Date: Mon, 22 Feb 2021 16:54:55 -0800
Subject: [PATCH 221/357] [Target] Add target host field for target
 specification (#7462)

* Add target host field in Target

* Add host as a config field to target

* Add target host support for Python api

* Add unit tests

* Adjust format for cpplint

* Remove unnecessary  after  in Python file

* Remove redundancy and add param description

* Fix format issue

* Fix param description

* Add unit test for duplicate target hosts
---
 include/tvm/target/target.h                 | 15 ++++-
 include/tvm/target/target_kind.h            |  3 +-
 python/tvm/target/target.py                 | 16 ++++-
 src/target/target.cc                        | 29 +++++++-
 src/target/target_kind.cc                   |  1 -
 tests/python/unittest/test_target_target.py | 75 +++++++++++++++++++--
 6 files changed, 124 insertions(+), 15 deletions(-)

diff --git a/include/tvm/target/target.h b/include/tvm/target/target.h
index 2abdb558baf8..64bd251c0ded 100644
--- a/include/tvm/target/target.h
+++ b/include/tvm/target/target.h
@@ -44,6 +44,8 @@ class TargetNode : public Object {
  public:
   /*! \brief The kind of the target device */
   TargetKind kind;
+  /*! \brief Target host information, must be Target type */
+  Optional<ObjectRef> host;
   /*! \brief Tag of the the target, can be empty */
   String tag;
   /*! \brief Keys for this target */
@@ -64,6 +66,7 @@ class TargetNode : public Object {
     v->Visit("tag", &tag);
     v->Visit("keys", &keys);
     v->Visit("attrs", &attrs);
+    v->Visit("host", &host);
   }
 
   /*!
@@ -122,12 +125,12 @@ class Target : public ObjectRef {
   TVM_DLL explicit Target(std::nullptr_t) { data_ = nullptr; }
   /*!
    * \brief Construct a Target given a string
-   * \param tag_or_config_or_target_str the string to parse
+   * \param tag_or_config_or_target_str the string to parse for target
    */
   TVM_DLL explicit Target(const String& tag_or_config_or_target_str);
   /*!
    * \brief Construct a Target using a JSON-like configuration
-   * \param config The JSON-like configuration
+   * \param config The JSON-like configuration for target
    */
   TVM_DLL explicit Target(const Map<String, ObjectRef>& config);
   /*!
@@ -139,7 +142,13 @@ class Target : public ObjectRef {
    * allow_not_defined is true.
    */
   TVM_DLL static tvm::Target Current(bool allow_not_defined = true);
-
+  /*!
+   * \brief Construct a Target given target and host
+   * \param target The Target typed object with host field undefined for target
+   * \param host The Target typed object for target host
+   * \return The Target with given target and host context information
+   */
+  TVM_DLL explicit Target(Target target, Target host);
   TVM_DEFINE_OBJECT_REF_METHODS(Target, ObjectRef, TargetNode);
 
  private:
diff --git a/include/tvm/target/target_kind.h b/include/tvm/target/target_kind.h
index 72c41c6f4647..e7da2dd413a0 100644
--- a/include/tvm/target/target_kind.h
+++ b/include/tvm/target/target_kind.h
@@ -376,7 +376,8 @@ inline TargetKindRegEntry& TargetKindRegEntry::set_name() {
           .add_attr_option<String>("tag")                         \
           .add_attr_option<String>("device")                      \
           .add_attr_option<String>("model")                       \
-          .add_attr_option<Array<String>>("libs")
+          .add_attr_option<Array<String>>("libs")                 \
+          .add_attr_option<Target>("host")
 
 }  // namespace tvm
 
diff --git a/python/tvm/target/target.py b/python/tvm/target/target.py
index 8942957d32c9..8c60260e640a 100644
--- a/python/tvm/target/target.py
+++ b/python/tvm/target/target.py
@@ -46,7 +46,7 @@ class Target(Object):
     - :py:func:`tvm.target.intel_graphics` create Intel Graphics target
     """
 
-    def __init__(self, tag_or_str_or_dict):
+    def __init__(self, tag_or_str_or_dict, host_tag_or_str_or_dict=None):
         """Construct a TVM target object from
         1) Raw target string
         2) Target config dict
@@ -86,10 +86,22 @@ def __init__(self, tag_or_str_or_dict):
             mfloat-abi : str (optional)
                 An llvm setting that is one of 'hard' or 'soft' indicating whether to use
                 hardware or software floating-point operations.
+            host : Union[str, Dict[str, Any]] (optional)
+                Description for target host. Can be recursive. Similar to tag_or_str_or_dict.
+        host_tag_or_str_or_dict : Optional[Union[str, Dict[str, Any]]]
+            Similar to tag_or_str_or_dict but for target host. Can be one of a literal
+            target host string, a json string describing a configuration, or a dictionary of
+            configuration options. When using a dictionary or json string to configure target,
+            the possible values are same as tag_or_str_or_dict.
         """
         if not isinstance(tag_or_str_or_dict, (dict, str, Target)):
             raise ValueError("target has to be a string or dictionary.")
-        self.__init_handle_by_constructor__(_ffi_api.Target, tag_or_str_or_dict)
+        if host_tag_or_str_or_dict is not None:
+            self.__init_handle_by_constructor__(
+                _ffi_api.Target, Target(tag_or_str_or_dict), Target(host_tag_or_str_or_dict)
+            )
+        else:
+            self.__init_handle_by_constructor__(_ffi_api.Target, tag_or_str_or_dict)
 
     def __enter__(self):
         _ffi_api.TargetEnterScope(self)
diff --git a/src/target/target.cc b/src/target/target.cc
index e44a15c3ff59..b5ca4c38bbb9 100644
--- a/src/target/target.cc
+++ b/src/target/target.cc
@@ -373,6 +373,15 @@ Target::Target(const Map<String, ObjectRef>& config) {
   data_ = std::move(target);
 }
 
+Target::Target(Target target, Target host) {
+  ObjectPtr<TargetNode> n = make_object<TargetNode>(*target.get());
+  CHECK(!n->host.defined())
+      << "ValueError: Adding a host to a target whose host field has been defined";
+  // add target host into host field
+  n->host = std::move(host);
+  data_ = std::move(n);
+}
+
 std::vector<std::string> TargetNode::GetKeys() const {
   std::vector<std::string> result;
   for (auto& expr : keys) {
@@ -456,8 +465,18 @@ void TargetInternal::ConstructorDispatcher(TVMArgs args, TVMRetValue* rv) {
                  << runtime::ArgTypeCode2Str(arg.type_code());
     }
     return;
+  } else if (args.num_args == 2) {
+    if (args[0].IsObjectRef<Target>() && args[1].IsObjectRef<Target>()) {
+      Target target = args[0];
+      Target host = args[1];
+      *rv = Target(target, host);
+    } else {
+      LOG(FATAL) << "ValueError: Invalid type of arguments. Expect 2 Target arguments.";
+    }
+    return;
   }
-  LOG(FATAL) << "ValueError: Invalid number of arguments. Expect 1, but gets: " << args.num_args;
+  LOG(FATAL) << "ValueError: Invalid number of arguments. Expect 1 or 2, but gets: "
+             << args.num_args;
 }
 
 ObjectPtr<Object> TargetInternal::FromString(const String& tag_or_config_or_target_str) {
@@ -527,6 +546,7 @@ ObjectPtr<Object> TargetInternal::FromConfig(std::unordered_map<String, ObjectRe
   const String kTag = "tag";
   const String kKeys = "keys";
   const String kDeviceName = "device";
+  const String kHost = "host";
   ObjectPtr<TargetNode> target = make_object<TargetNode>();
   // parse 'kind'
   if (config.count(kKind)) {
@@ -599,6 +619,13 @@ ObjectPtr<Object> TargetInternal::FromConfig(std::unordered_map<String, ObjectRe
       throw dmlc::Error(": Error when parsing target[\"" + key + "\"]" + e.what());
     }
   }
+  // parse host
+  if (config.count(kHost)) {
+    target->host = PackedFunc(ConstructorDispatcher)(config[kHost]).AsObjectRef<Target>();
+    config.erase(kHost);
+  } else {
+    target->host = NullOpt;
+  }
   // set default attribute values if they do not exist
   for (const auto& kv : target->kind->key2default_) {
     if (!attrs.count(kv.first)) {
diff --git a/src/target/target_kind.cc b/src/target/target_kind.cc
index a3b1b207f290..863d99993f4a 100644
--- a/src/target/target_kind.cc
+++ b/src/target/target_kind.cc
@@ -309,7 +309,6 @@ TVM_REGISTER_TARGET_KIND("hybrid", kDLCPU)  // line break
     .add_attr_option<Bool>("system-lib");
 
 TVM_REGISTER_TARGET_KIND("composite", kDLCPU)
-    .add_attr_option<Target>("target_host")
     .add_attr_option<Array<Target>>("devices");
 
 /**********  Registry  **********/
diff --git a/tests/python/unittest/test_target_target.py b/tests/python/unittest/test_target_target.py
index a0a60cb0c4fd..7b998bef34a5 100644
--- a/tests/python/unittest/test_target_target.py
+++ b/tests/python/unittest/test_target_target.py
@@ -16,6 +16,7 @@
 # under the License.
 import json
 import tvm
+import pytest
 from tvm import te
 from tvm.target import cuda, rocm, mali, intel_graphics, arm_cpu, vta, bifrost, hexagon
 
@@ -113,18 +114,14 @@ def test_config_map():
     attributes fails as expected.
     """
     target_config = {"kind": "llvm", "libs": {"a": "b", "c": "d"}}
-    failed = False
-    try:
+    with pytest.raises(ValueError):
         tvm.target.Target(target_config)
-    except ValueError:
-        failed = True
-    assert failed
 
 
 def test_composite_target():
-    tgt = tvm.target.Target("composite --target_host=llvm --devices=cuda,opencl")
+    tgt = tvm.target.Target("composite --host=llvm --devices=cuda,opencl")
     assert tgt.kind.name == "composite"
-    assert tgt.attrs["target_host"].kind.name == "llvm"
+    assert tgt.attrs["host"].kind.name == "llvm"
     assert len(tgt.attrs["devices"]) == 2
     cuda_device, opencl_device = tgt.attrs["devices"]
     assert cuda_device.kind.name == "cuda"
@@ -158,6 +155,70 @@ def test_list_kinds():
     assert all(isinstance(target_name, str) for target_name in targets)
 
 
+def test_target_host_tags():
+    tgt = tvm.target.Target("nvidia/jetson-nano", "nvidia/geforce-rtx-2080-ti")
+    assert tgt.kind.name == "cuda"
+    assert tgt.attrs["arch"] == "sm_53"
+    assert tgt.attrs["shared_memory_per_block"] == 49152
+    assert tgt.attrs["max_threads_per_block"] == 1024
+    assert tgt.attrs["thread_warp_size"] == 32
+    assert tgt.attrs["registers_per_block"] == 32768
+    assert tgt.host.kind.name == "cuda"
+    assert tgt.host.attrs["arch"] == "sm_75"
+    assert tgt.host.attrs["shared_memory_per_block"] == 49152
+    assert tgt.host.attrs["max_threads_per_block"] == 1024
+    assert tgt.host.attrs["thread_warp_size"] == 32
+    assert tgt.host.attrs["registers_per_block"] == 65536
+
+
+def test_target_host_tag_dict():
+    tgt = tvm.target.Target("nvidia/jetson-nano", {"kind": "llvm"})
+    assert tgt.kind.name == "cuda"
+    assert tgt.attrs["arch"] == "sm_53"
+    assert tgt.attrs["shared_memory_per_block"] == 49152
+    assert tgt.attrs["max_threads_per_block"] == 1024
+    assert tgt.attrs["thread_warp_size"] == 32
+    assert tgt.attrs["registers_per_block"] == 32768
+    assert tgt.host.kind.name == "llvm"
+
+
+def test_target_host_single_dict():
+    tgt = tvm.target.Target({"kind": "llvm", "host": "nvidia/jetson-nano"})
+    assert tgt.kind.name == "llvm"
+    assert tgt.host.kind.name == "cuda"
+    assert tgt.host.attrs["arch"] == "sm_53"
+    assert tgt.host.attrs["shared_memory_per_block"] == 49152
+    assert tgt.host.attrs["max_threads_per_block"] == 1024
+    assert tgt.host.attrs["thread_warp_size"] == 32
+    assert tgt.host.attrs["registers_per_block"] == 32768
+
+
+def test_target_host_single_string():
+    tgt = tvm.target.Target("cuda --host llvm")
+    assert tgt.kind.name == "cuda"
+    assert tgt.host.kind.name == "llvm"
+
+
+def test_target_host_single_string_with_tag():
+    tgt = tvm.target.Target("cuda --host nvidia/jetson-nano")
+    assert tgt.kind.name == "cuda"
+    assert tgt.host.kind.name == "cuda"
+    assert tgt.host.attrs["arch"] == "sm_53"
+    assert tgt.host.attrs["shared_memory_per_block"] == 49152
+    assert tgt.host.attrs["max_threads_per_block"] == 1024
+    assert tgt.host.attrs["thread_warp_size"] == 32
+    assert tgt.host.attrs["registers_per_block"] == 32768
+
+
+def test_target_host_warning():
+    """
+    Confirm that constructing a target with invalid
+    attributes fails as expected.
+    """
+    with pytest.raises(ValueError):
+        tgt = tvm.target.Target("cuda --host nvidia/jetson-nano", "llvm")
+
+
 if __name__ == "__main__":
     test_target_dispatch()
     test_target_string_parse()

From a1118039ea80e4a4e79fb28012b375867b0a8613 Mon Sep 17 00:00:00 2001
From: Dongming Yang <dongming.yang@streamcomputing.com>
Date: Tue, 23 Feb 2021 13:53:55 +0800
Subject: [PATCH 222/357] [RELAY][Parser] Optimize relay parser to restore
 calls attrs (#7347)

* [RELAY][Parser] Optimize relay parser to restore attrs for non-Operator calls

* To avoid too much modification to the native class, only print out the attrs
  type key of non-Operator Call in relay printer. Then reconstruct the attrs object
  after parsing this attrs type key value in Relay parser.

* fix lint

* fix ci

* add test case
---
 src/parser/parser.cc                 | 22 ++++++++++++----
 src/printer/relay_text_printer.cc    |  5 ++++
 tests/python/relay/test_ir_parser.py | 39 ++++++++++++++++++++++++++++
 3 files changed, 61 insertions(+), 5 deletions(-)

diff --git a/src/parser/parser.cc b/src/parser/parser.cc
index afcf70737933..3061735eff7c 100644
--- a/src/parser/parser.cc
+++ b/src/parser/parser.cc
@@ -1334,6 +1334,8 @@ class Parser {
       case TokenType::kBoolean:
       case TokenType::kStringLiteral:
         return Match(next->token_type)->data;
+      case TokenType::kMetaReference:
+        return ParseMetaRef();
       case TokenType::kLSquare: {
         return ParseSequence<ObjectRef>(TokenType::kLSquare, TokenType::kComma, TokenType::kRSquare,
                                         [&]() { return ParseAttributeValue(); });
@@ -1408,7 +1410,7 @@ class Parser {
             auto last_meta = Lookahead(2)->token_type == TokenType::kCloseParen;
             auto is_meta_attrs = is_meta_next && last_meta;
 
-            if (is_op && (is_pretty_attrs || is_meta_attrs)) {
+            if (is_pretty_attrs || is_meta_attrs) {
               if (is_meta_attrs) {
                 auto meta_ref = ParseMetaRef();
                 if (meta_ref.as<BaseAttrsNode>()) {
@@ -1420,13 +1422,23 @@ class Parser {
                 }
               } else {
                 auto raw_attrs = ParseAttrs();
-                auto attr_obj = tvm::ReflectionVTable::Global()->CreateObject(op_key, raw_attrs);
-                ICHECK(attr_obj.defined());
-                attrs = Downcast<Attrs>(attr_obj);
+                if (is_op && op_key.size()) {
+                  auto attr_obj = tvm::ReflectionVTable::Global()->CreateObject(op_key, raw_attrs);
+                  ICHECK(attr_obj.defined());
+                  attrs = Downcast<Attrs>(attr_obj);
+                } else if (raw_attrs.count("attrs_type_key")) {
+                  String attr_key = Downcast<String>(raw_attrs["attrs_type_key"]);
+                  if (attr_key.size()) {
+                    raw_attrs.erase("attrs_type_key");
+                    auto tbl = tvm::ReflectionVTable::Global();
+                    auto attr_obj = tbl->CreateObject(attr_key, raw_attrs);
+                    ICHECK(attr_obj.defined());
+                    attrs = Downcast<Attrs>(attr_obj);
+                  }
+                }
               }
               return true;
             }
-
             return false;
           });
 
diff --git a/src/printer/relay_text_printer.cc b/src/printer/relay_text_printer.cc
index da4f8cadfb3d..cbee04f96096 100644
--- a/src/printer/relay_text_printer.cc
+++ b/src/printer/relay_text_printer.cc
@@ -827,6 +827,11 @@ std::vector<Doc> RelayTextPrinter::PrintCallAttrs(const Attrs& attrs, const Expr
   } else {
     AttrPrinter printer(&docs, this);
     const_cast<BaseAttrsNode*>(attrs.operator->())->VisitNonDefaultAttrs(&printer);
+    if (!op_node) {
+      // print call attr type key to restore expr for relay parser
+      std::string s = std::string(attrs->GetTypeKey());
+      printer.Visit("attrs_type_key", &s);
+    }
     return docs;
   }
 }
diff --git a/tests/python/relay/test_ir_parser.py b/tests/python/relay/test_ir_parser.py
index 70fb56049873..62e52abefeb4 100644
--- a/tests/python/relay/test_ir_parser.py
+++ b/tests/python/relay/test_ir_parser.py
@@ -910,6 +910,45 @@ def test_load_prelude():
     tvm.parser.parse(mod.astext())
 
 
+def test_call_attrs():
+    def get_func(shape, dtype):
+        x0 = relay.var("data", shape=shape, dtype=dtype)
+        w0 = relay.var("weight", shape=shape, dtype=dtype)
+        a = relay.nn.dense(x0, w0)
+        b = relay.nn.relu(a)
+        d = relay.add(b, relay.const(1.0, dtype=dtype))
+        return relay.Function([x0, w0], d)
+
+    # build relay graph
+    shape = (2, 4)
+    dtype = "float32"
+    sub_func = get_func(shape, dtype)
+    p0 = relay.var("p0", shape=shape, dtype=dtype)
+    p1 = relay.var("p1", shape=shape, dtype=dtype)
+    attr = tvm.ir.make_node("attrs.TestAttrs", name="func_call_attrs")
+    call = relay.Call(sub_func, [p0, p1], attrs=attr)
+    func = relay.Function([p0, p1], call)
+
+    # build relay module
+    mod = tvm.IRModule()
+    mod["main"] = func
+    mod = tvm.relay.transform.InferType()(mod)
+
+    # assert equal
+    program = """
+    def @main(%p0: Tensor[(2, 4), float32], %p1: Tensor[(2, 4), float32]) {
+    %2 = fn (%data: Tensor[(2, 4), float32], %weight: Tensor[(2, 4), float32]) {
+        %0 = nn.dense(%data, %weight, units=None);
+        %1 = nn.relu(%0);
+        add(%1, 1f)
+    };
+    %2(%p0, %p1, name="func_call_attrs", attrs_type_key="attrs.TestAttrs")
+    }
+    """
+    parsed = parse_module(program)
+    assert_graph_equal(parsed, mod)
+
+
 def test_tokenize_inf():
     x = relay.var("x", shape=(3, 4), dtype="float32")
     y = relay.clip(x, -np.inf, np.inf)

From d94cbbbed01467a28e6ff598568a1b7626625440 Mon Sep 17 00:00:00 2001
From: Tianming Xu <tianmingxu.tmxu@gmail.com>
Date: Tue, 23 Feb 2021 20:07:23 +0800
Subject: [PATCH 223/357] [Frontend]Make onnx gemm tensor C optional (#7489)

* Make onnx gemm tensor C optional

* fix codestyle

* add tests

* fix codestyle
---
 python/tvm/relay/frontend/onnx.py          | 13 +++++++----
 tests/python/frontend/onnx/test_forward.py | 26 ++++++++++++++++++++++
 2 files changed, 35 insertions(+), 4 deletions(-)

diff --git a/python/tvm/relay/frontend/onnx.py b/python/tvm/relay/frontend/onnx.py
index 109e80c99783..510c7eebaf46 100644
--- a/python/tvm/relay/frontend/onnx.py
+++ b/python/tvm/relay/frontend/onnx.py
@@ -513,7 +513,9 @@ class Gemm(OnnxOpConverter):
 
     @classmethod
     def _impl_v1(cls, inputs, attr, params):
-        assert len(inputs) == 3, "Gemm op take 3 inputs, {} given".format(len(inputs))
+        assert len(inputs) == 3 or len(inputs) == 2, "Gemm op take 2 or 3 inputs, {} given".format(
+            len(inputs)
+        )
         # Y = alpha * A * B + beta * C
         alpha = float(attr.get("alpha", 1.0))
         beta = float(attr.get("beta", 1.0))
@@ -531,9 +533,12 @@ def _impl_v1(cls, inputs, attr, params):
             inputs[0] *= _expr.const(alpha)
         out = _op.nn.dense(inputs[0], inputs[1], units=channels)
 
-        # skip (beta * C) if zero
-        C_array = params[inputs[2].name_hint].asnumpy()
-        if (beta == 0.0) or np.array_equal(C_array, np.array([0])):
+        if len(inputs) == 3:
+            # skip (beta * C) if zero
+            C_array = params[inputs[2].name_hint].asnumpy()
+            if (beta == 0.0) or np.array_equal(C_array, np.array([0])):
+                return out
+        else:
             return out
         return _op.nn.bias_add(out, _expr.const(beta) * inputs[2])
 
diff --git a/tests/python/frontend/onnx/test_forward.py b/tests/python/frontend/onnx/test_forward.py
index d6fe98d031fa..0c38dfd5c331 100644
--- a/tests/python/frontend/onnx/test_forward.py
+++ b/tests/python/frontend/onnx/test_forward.py
@@ -1008,6 +1008,31 @@ def test_onehot():
         tvm.testing.assert_allclose(out_np, tvm_out, rtol=1e-5, atol=1e-5)
 
 
+@tvm.testing.uses_gpu
+def test_gemm():
+    a_shape = (4, 3)
+    b_shape = (3, 4)
+    out_shape = [a_shape[0], b_shape[1]]
+
+    a_array = np.random.uniform(size=a_shape).astype("float32")
+    b_array = np.random.uniform(size=b_shape).astype("float32")
+
+    gemm_node = helper.make_node("Gemm", ["a", "b"], ["out"])
+
+    graph = helper.make_graph(
+        [gemm_node],
+        "gemm_test",
+        inputs=[
+            helper.make_tensor_value_info("a", TensorProto.FLOAT, list(a_shape)),
+            helper.make_tensor_value_info("b", TensorProto.FLOAT, list(b_shape)),
+        ],
+        outputs=[helper.make_tensor_value_info("out", TensorProto.FLOAT, list(out_shape))],
+    )
+
+    model = helper.make_model(graph, producer_name="gemm_test")
+    verify_with_ort_with_inputs(model, [a_array, b_array])
+
+
 @tvm.testing.uses_gpu
 def test_matmul():
     a_shape = (4, 3)
@@ -4065,6 +4090,7 @@ def verify_cumsum(indata, axis, exclusive=0, reverse=0, type="float32"):
     test_clip()
     test_clip_min_max_as_inputs()
     test_onehot()
+    test_gemm()
     test_matmul()
     test_gather()
     test_gatherelements()

From 0ba37411aa7d2fb3753e2403f8cbf0cf1e878ec6 Mon Sep 17 00:00:00 2001
From: Andrew Reusch <areusch@octoml.ai>
Date: Tue, 23 Feb 2021 09:54:43 -0800
Subject: [PATCH 224/357] [CRT] Create C-runtime-style metadata module for llvm
 builds (#7398)

* Create C-runtime-style metadata module for llvm builds.

* maybe address manupa's comment

* lint

* actually address manupa comments

* comment and rename

* git-clang-format

* pylint

* cpp warning

* try to fix apps/bundle_deploy

* black format

* build correct file

* Use save() for C++-runtime targeted artifacts.

* fix build_module LLVM metadata module conditions

* fix test comment

* black format

* further restrict CRT MetadataModule creation

* Fix test_link_params

* black format and address zhiics comments

* fix test_link_params, i think?
---
 apps/bundle_deploy/Makefile                   |  19 ++-
 apps/bundle_deploy/build_model.py             |  25 +++-
 python/tvm/driver/build_module.py             |  21 ++-
 python/tvm/micro/compiler.py                  |   6 +-
 src/target/llvm/codegen_cpu.cc                |  85 ++++++-----
 src/target/llvm/codegen_cpu.h                 |   6 +
 src/target/llvm/llvm_module.cc                |  53 +++++++
 src/target/llvm/llvm_module.h                 |  44 ++++++
 src/target/metadata_module.cc                 | 136 ++++++++++++++++++
 src/target/metadata_module.h                  |  46 ++++++
 src/target/source/codegen_source_base.h       |   5 +-
 src/target/source/source_module.cc            |  88 ++----------
 src/target/source/source_module.h             |  46 ++++++
 tests/python/unittest/test_crt.py             |  33 ++++-
 tests/python/unittest/test_link_params.py     |  30 ++--
 .../unittest/test_target_codegen_llvm.py      |  28 +---
 16 files changed, 498 insertions(+), 173 deletions(-)
 create mode 100644 src/target/llvm/llvm_module.h
 create mode 100644 src/target/metadata_module.cc
 create mode 100644 src/target/metadata_module.h
 create mode 100644 src/target/source/source_module.h

diff --git a/apps/bundle_deploy/Makefile b/apps/bundle_deploy/Makefile
index 8a5f1cf95571..38d9d3456d55 100644
--- a/apps/bundle_deploy/Makefile
+++ b/apps/bundle_deploy/Makefile
@@ -62,6 +62,9 @@ $(endif)
 
 CRT_SRCS = $(shell find $(CRT_ROOT))
 
+MODEL_OBJ = $(build_dir)/model_c/devc.o $(build_dir)/model_c/lib0.o $(build_dir)/model_c/lib1.o
+TEST_MODEL_OBJ = $(build_dir)/test_model_c/devc.o $(build_dir)/test_model_c/lib0.o $(build_dir)/test_model_c/lib1.o
+
 demo_dynamic: $(build_dir)/demo_dynamic $(build_dir)/bundle.so $(build_dir)/bundle_c.so $(build_dir)/bundle.so $(build_dir)/graph_cpp.json $(build_dir)/graph_c.json $(build_dir)/params_cpp.bin $(build_dir)/params_c.bin $(build_dir)/cat.bin
 	$(QUIET)TVM_NUM_THREADS=1 $(build_dir)/demo_dynamic $(build_dir)/bundle.so $(build_dir)/graph_cpp.json $(build_dir)/params_cpp.bin $(build_dir)/cat.bin
 	$(QUIET)TVM_NUM_THREADS=1 $(build_dir)/demo_dynamic $(build_dir)/bundle_c.so $(build_dir)/graph_c.json $(build_dir)/params_c.bin $(build_dir)/cat.bin
@@ -93,11 +96,11 @@ $(build_dir)/test_dynamic: test.cc ${build_dir}/test_graph_c.json ${build_dir}/t
 	$(QUIET)mkdir -p $(@D)
 	$(QUIET)g++ $(PKG_CXXFLAGS) -o $@ test.cc $(BACKTRACE_OBJS) $(BACKTRACE_LDFLAGS)
 
-$(build_dir)/demo_static: demo_static.c ${build_dir}/bundle_static.o ${build_dir}/model_c.o ${build_dir}/crt/libmemory.a ${build_dir}/crt/libgraph_runtime.a ${build_dir}/crt/libcommon.a ${build_dir}/graph_c.json.c ${build_dir}/params_c.bin.c $(BACKTRACE_OBJS)
+$(build_dir)/demo_static: demo_static.c ${build_dir}/bundle_static.o $(MODEL_OBJ) ${build_dir}/crt/libmemory.a ${build_dir}/crt/libgraph_runtime.a ${build_dir}/crt/libcommon.a ${build_dir}/graph_c.json.c ${build_dir}/params_c.bin.c $(BACKTRACE_OBJS)
 	$(QUIET)mkdir -p $(@D)
 	$(QUIET)gcc $(PKG_CFLAGS) -o $@ $^ $(PKG_LDFLAGS) $(BACKTRACE_LDFLAGS) $(BACKTRACE_CFLAGS)
 
-$(build_dir)/test_static: test_static.c ${build_dir}/bundle_static.o ${build_dir}/test_model_c.o ${build_dir}/crt/libmemory.a ${build_dir}/crt/libgraph_runtime.a ${build_dir}/crt/libcommon.a $(BACKTRACE_OBJS)
+$(build_dir)/test_static: test_static.c ${build_dir}/bundle_static.o $(TEST_MODEL_OBJ) ${build_dir}/crt/libmemory.a ${build_dir}/crt/libgraph_runtime.a ${build_dir}/crt/libcommon.a $(BACKTRACE_OBJS)
 	$(QUIET)mkdir -p $(@D)
 	$(QUIET)gcc $(PKG_CFLAGS) -o $@ $^ $(BACKTRACE_LDFLAGS)
 
@@ -119,11 +122,15 @@ $(build_dir)/params_c.bin.c: $(build_dir)/params_c.bin
 $(build_dir)/params_cpp.bin.c: $(build_dir)/params_cpp.bin
 	$(QUIET)xxd -i $^  > $@
 
-$(build_dir)/model_c.o $(build_dir)/graph_c.json $(build_dir)/model_cpp.o $(build_dir)/graph_cpp.json $(build_dir)/params.bin $(build_dir)/cat.bin: build_model.py
+$(MODEL_OBJ) $(build_dir)/graph_c.json $(build_dir)/model_cpp.o $(build_dir)/graph_cpp.json $(build_dir)/params.bin $(build_dir)/cat.bin: build_model.py
 	$(QUIET)python3 $< -o $(build_dir)
+	$(QUIET)mkdir -p build/model_c
+	$(QUIET)tar -C build/model_c -xvf build/model_c.tar
 
-$(build_dir)/test_model_c.o $(build_dir)/test_graph_c.json $(build_dir)/test_params_c.bin $(build_dir)/test_data_c.bin $(build_dir)/test_output_c.bin $(build_dir)/test_model_cpp.o $(build_dir)/test_graph_cpp.json $(build_dir)/test_params_cpp.bin $(build_dir)/test_data_cpp.bin $(build_dir)/test_output_cpp.bin: build_model.py
+$(TEST_MODEL_OBJ) $(build_dir)/test_graph_c.json $(build_dir)/test_params_c.bin $(build_dir)/test_data_c.bin $(build_dir)/test_output_c.bin $(build_dir)/test_model_cpp.o $(build_dir)/test_graph_cpp.json $(build_dir)/test_params_cpp.bin $(build_dir)/test_data_cpp.bin $(build_dir)/test_output_cpp.bin: build_model.py
 	$(QUIET)python3 $< -o $(build_dir) --test
+	$(QUIET)mkdir -p build/test_model_c
+	$(QUIET)tar -C build/test_model_c -xvf build/test_model_c.tar
 
 # Build our bundle against the serialized bundle.c API, the runtime.cc API, and
 # the serialized graph.json and params.bin
@@ -131,7 +138,7 @@ $(build_dir)/bundle.so: bundle.cc runtime.cc $(build_dir)/model_cpp.o
 	$(QUIET)mkdir -p $(@D)
 	$(QUIET)g++ -shared $(PKG_CXXFLAGS) -fvisibility=hidden -o $@  $^ $(PKG_LDFLAGS)
 
-$(build_dir)/bundle_c.so: bundle.c $(build_dir)/model_c.o ${build_dir}/crt/libmemory.a ${build_dir}/crt/libgraph_runtime.a ${build_dir}/crt/libcommon.a $(BACKTRACE_OBJS)
+$(build_dir)/bundle_c.so: bundle.c $(MODEL_OBJ) ${build_dir}/crt/libmemory.a ${build_dir}/crt/libgraph_runtime.a ${build_dir}/crt/libcommon.a $(BACKTRACE_OBJS)
 	$(QUIET)mkdir -p $(@D)
 	$(QUIET)gcc -shared $(PKG_CFLAGS) -fvisibility=hidden -o $@  $^ $(PKG_LDFLAGS) $(BACKTRACE_LDFLAGS) $(BACKTRACE_CFLAGS)
 
@@ -139,7 +146,7 @@ $(build_dir)/test_bundle.so: bundle.cc runtime.cc $(build_dir)/test_model_cpp.o
 	$(QUIET)mkdir -p $(@D)
 	$(QUIET)g++ -shared $(PKG_CXXFLAGS) -fvisibility=hidden -o $@  $^ $(PKG_LDFLAGS)
 
-$(build_dir)/test_bundle_c.so: bundle.c $(build_dir)/test_model_c.o ${build_dir}/crt/libmemory.a ${build_dir}/crt/libgraph_runtime.a ${build_dir}/crt/libcommon.a $(BACKTRACE_OBJS)
+$(build_dir)/test_bundle_c.so: bundle.c $(TEST_MODEL_OBJ) ${build_dir}/crt/libmemory.a ${build_dir}/crt/libgraph_runtime.a ${build_dir}/crt/libcommon.a $(BACKTRACE_OBJS)
 	$(QUIET)mkdir -p $(@D)
 	$(QUIET)gcc -shared $(PKG_CFLAGS) -fvisibility=hidden -o $@  $^ $(PKG_LDFLAGS) $(BACKTRACE_LDFLAGS) $(BACKTRACE_CFLAGS)
 
diff --git a/apps/bundle_deploy/build_model.py b/apps/bundle_deploy/build_model.py
index a2513c8a46d0..0991ac9ad94b 100644
--- a/apps/bundle_deploy/build_model.py
+++ b/apps/bundle_deploy/build_model.py
@@ -23,6 +23,7 @@
 from tvm import te
 import logging
 import json
+from tvm.contrib import cc as _cc
 
 RUNTIMES = {
     "c": "{name}_c.{ext}",
@@ -51,7 +52,17 @@ def build_module(opts):
         build_dir = os.path.abspath(opts.out_dir)
         if not os.path.isdir(build_dir):
             os.makedirs(build_dir)
-        lib.save(os.path.join(build_dir, file_format_str.format(name="model", ext="o")))
+        ext = "tar" if runtime_name == "c" else "o"
+        lib_file_name = os.path.join(build_dir, file_format_str.format(name="model", ext=ext))
+        if runtime_name == "c":
+            lib.export_library(lib_file_name)
+        else:
+            # NOTE: at present, export_libarary will always create _another_ shared object, and you
+            # can't stably combine two shared objects together (in this case, init_array is not
+            # populated correctly when you do that). So for now, must continue to use save() with the
+            # C++ library.
+            # TODO(areusch): Obliterate runtime.cc and replace with libtvm_runtime.so.
+            lib.save(lib_file_name)
         with open(
             os.path.join(build_dir, file_format_str.format(name="graph", ext="json")), "w"
         ) as f_graph_json:
@@ -84,7 +95,17 @@ def build_test_module(opts):
         build_dir = os.path.abspath(opts.out_dir)
         if not os.path.isdir(build_dir):
             os.makedirs(build_dir)
-        lib.save(os.path.join(build_dir, file_format_str.format(name="test_model", ext="o")))
+        ext = "tar" if runtime_name == "c" else "o"
+        lib_file_name = os.path.join(build_dir, file_format_str.format(name="test_model", ext=ext))
+        if runtime_name == "c":
+            lib.export_library(lib_file_name)
+        else:
+            # NOTE: at present, export_libarary will always create _another_ shared object, and you
+            # can't stably combine two shared objects together (in this case, init_array is not
+            # populated correctly when you do that). So for now, must continue to use save() with the
+            # C++ library.
+            # TODO(areusch): Obliterate runtime.cc and replace with libtvm_runtime.so.
+            lib.save(lib_file_name)
         with open(
             os.path.join(build_dir, file_format_str.format(name="test_graph", ext="json")), "w"
         ) as f_graph_json:
diff --git a/python/tvm/driver/build_module.py b/python/tvm/driver/build_module.py
index 7ad48e19a1db..5eaecb422163 100644
--- a/python/tvm/driver/build_module.py
+++ b/python/tvm/driver/build_module.py
@@ -428,12 +428,19 @@ def build(inputs, args=None, target=None, target_host=None, name="default_functi
     if not isinstance(target_host, Target):
         target_host = Target(target_host)
     if (
-        "system-lib" in target_host.attrs
-        and target_host.attrs["system-lib"].value == 1
-        and target_host.kind.name == "c"
+        target_host.attrs.get("runtime", tvm.runtime.String("c++")) == "c"
+        and target_host.attrs.get("system-lib", 0).value == 1
     ):
-        create_csource_metadata_module = tvm._ffi.get_global_func(
-            "runtime.CreateCSourceMetadataModule"
-        )
-        return create_csource_metadata_module([rt_mod_host], target_host)
+        if target_host.kind.name == "c":
+            create_csource_crt_metadata_module = tvm._ffi.get_global_func(
+                "runtime.CreateCSourceCrtMetadataModule"
+            )
+            return create_csource_crt_metadata_module([rt_mod_host], target_host)
+
+        if target_host.kind.name == "llvm":
+            create_llvm_crt_metadata_module = tvm._ffi.get_global_func(
+                "runtime.CreateLLVMCrtMetadataModule"
+            )
+            return create_llvm_crt_metadata_module([rt_mod_host], target_host)
+
     return rt_mod_host
diff --git a/python/tvm/micro/compiler.py b/python/tvm/micro/compiler.py
index d0431f42b01d..5bc5aba8a1be 100644
--- a/python/tvm/micro/compiler.py
+++ b/python/tvm/micro/compiler.py
@@ -81,6 +81,9 @@ def _target_from_sources(cls, sources):
         target_strs = set()
 
         for obj in sources:
+            if os.path.splitext(obj)[1] not in (".cc", ".c"):
+                continue
+
             with open(obj) as obj_f:
                 for line in obj_f:
                     m = cls.TVM_TARGET_RE.match(line)
@@ -246,7 +249,8 @@ def library(self, output, sources, options=None):
             )
 
         prefix = self._autodetect_toolchain_prefix(target)
-        outputs = []
+        outputs = [s for s in sources if os.path.splitext(s)[1] == ".o"]
+        sources = [s for s in sources if s not in outputs]
         for src in sources:
             src_base, src_ext = os.path.splitext(os.path.basename(src))
 
diff --git a/src/target/llvm/codegen_cpu.cc b/src/target/llvm/codegen_cpu.cc
index e2a8553199f0..b37cd73ece04 100644
--- a/src/target/llvm/codegen_cpu.cc
+++ b/src/target/llvm/codegen_cpu.cc
@@ -123,12 +123,6 @@ void CodeGenCPU::AddFunction(const PrimFunc& f) {
         << "CodeGenLLVM: Expect PrimFunc to have the global_symbol attribute";
     export_system_symbols_.emplace_back(
         std::make_pair(global_symbol.value().operator std::string(), function_));
-  } else if (target_c_runtime_) {
-    auto global_symbol = f->GetAttr<String>(tvm::attr::kGlobalSymbol);
-    ICHECK(global_symbol.defined())
-        << "CodeGenLLVM: Expect PrimFunc to have the global_symbol attribute";
-    registry_functions_.emplace_back(
-        std::make_pair(global_symbol.value().operator std::string(), function_));
   }
   AddDebugInformation(function_);
 }
@@ -791,47 +785,50 @@ llvm::Value* CodeGenCPU::RuntimeTVMParallelBarrier() {
   return GetContextPtr(gv_tvm_parallel_barrier_);
 }
 
-void CodeGenCPU::AddStartupFunction() {
-  if (registry_functions_.size() != 0) {
-    ICHECK(is_system_lib_) << "Loading of --system-lib modules is yet to be defined for C runtime";
-    Array<String> symbols;
-    std::vector<llvm::Constant*> funcs;
-    for (auto sym : registry_functions_) {
-      symbols.push_back(sym.first);
-      funcs.emplace_back(llvm::ConstantExpr::getBitCast(
-          sym.second, ftype_tvm_backend_packed_c_func_->getPointerTo()));
-    }
-    llvm::DataLayout layout(module_.get());
-    llvm::ArrayType* t_tvm_crt_func_ptrs =
-        llvm::ArrayType::get(ftype_tvm_backend_packed_c_func_->getPointerTo(), funcs.size());
-    llvm::GlobalVariable* func_registry_ptrs = new llvm::GlobalVariable(
-        *module_, t_tvm_crt_func_ptrs, true, llvm::GlobalValue::InternalLinkage,
-        llvm::ConstantArray::get(t_tvm_crt_func_ptrs, funcs), "_tvm_func_registry_ptrs");
-    uint64_t align = layout.getTypeAllocSize(ftype_tvm_backend_packed_c_func_->getPointerTo());
+void CodeGenCPU::DefineFunctionRegistry(Array<String> func_names) {
+  ICHECK(is_system_lib_) << "Loading of --system-lib modules is yet to be defined for C runtime";
+  Array<String> symbols;
+  std::vector<llvm::Constant*> funcs;
+  for (auto sym : func_names) {
+    symbols.push_back(sym);
+    llvm::GlobalVariable* sym_func = new llvm::GlobalVariable(
+        *module_, ftype_tvm_backend_packed_c_func_, true, llvm::GlobalValue::ExternalLinkage,
+        nullptr, sym.operator std::string());
+    funcs.emplace_back(sym_func);
+  }
+  llvm::DataLayout layout(module_.get());
+  llvm::ArrayType* t_tvm_crt_func_ptrs =
+      llvm::ArrayType::get(ftype_tvm_backend_packed_c_func_->getPointerTo(), funcs.size());
+  llvm::GlobalVariable* func_registry_ptrs = new llvm::GlobalVariable(
+      *module_, t_tvm_crt_func_ptrs, true, llvm::GlobalValue::InternalLinkage,
+      llvm::ConstantArray::get(t_tvm_crt_func_ptrs, funcs), "_tvm_func_registry_ptrs");
+  uint64_t align = layout.getTypeAllocSize(ftype_tvm_backend_packed_c_func_->getPointerTo());
 #if TVM_LLVM_VERSION >= 100
-    func_registry_ptrs->setAlignment(llvm::Align(align));
+  func_registry_ptrs->setAlignment(llvm::Align(align));
 #else
-    func_registry_ptrs->setAlignment(align);
+  func_registry_ptrs->setAlignment(align);
 #endif
-    llvm::GlobalVariable* func_registry = new llvm::GlobalVariable(
-        *module_, t_tvm_crt_func_registry_, true, llvm::GlobalVariable::InternalLinkage,
-        llvm::ConstantStruct::get(
-            t_tvm_crt_func_registry_,
-            {GetConstString(::tvm::target::GenerateFuncRegistryNames(symbols)),
-             func_registry_ptrs}),
-        "_tvm_crt_func_registry");
-    llvm::GlobalVariable* module = new llvm::GlobalVariable(
-        *module_, t_tvm_crt_module_, true, llvm::GlobalValue::InternalLinkage,
-        llvm::ConstantStruct::get(t_tvm_crt_module_, {func_registry}), "_tvm_crt_module");
-
-    // Now build TVMSystemLibEntryPoint.
-    llvm::FunctionType* ftype = llvm::FunctionType::get(t_void_p_, {}, false);
-    function_ = llvm::Function::Create(ftype, llvm::Function::ExternalLinkage,
-                                       "TVMSystemLibEntryPoint", module_.get());
-    llvm::BasicBlock* entry_point_entry = llvm::BasicBlock::Create(*ctx_, "entry", function_);
-    builder_->SetInsertPoint(entry_point_entry);
-    builder_->CreateRet(builder_->CreateBitCast(module, t_void_p_));
-  } else {
+  llvm::GlobalVariable* func_registry = new llvm::GlobalVariable(
+      *module_, t_tvm_crt_func_registry_, true, llvm::GlobalVariable::InternalLinkage,
+      llvm::ConstantStruct::get(
+          t_tvm_crt_func_registry_,
+          {GetConstString(::tvm::target::GenerateFuncRegistryNames(symbols)), func_registry_ptrs}),
+      "_tvm_crt_func_registry");
+  llvm::GlobalVariable* module = new llvm::GlobalVariable(
+      *module_, t_tvm_crt_module_, true, llvm::GlobalValue::InternalLinkage,
+      llvm::ConstantStruct::get(t_tvm_crt_module_, {func_registry}), "_tvm_crt_module");
+
+  // Now build TVMSystemLibEntryPoint.
+  llvm::FunctionType* ftype = llvm::FunctionType::get(t_void_p_, {}, false);
+  function_ = llvm::Function::Create(ftype, llvm::Function::ExternalLinkage,
+                                     "TVMSystemLibEntryPoint", module_.get());
+  llvm::BasicBlock* entry_point_entry = llvm::BasicBlock::Create(*ctx_, "entry", function_);
+  builder_->SetInsertPoint(entry_point_entry);
+  builder_->CreateRet(builder_->CreateBitCast(module, t_void_p_));
+}
+
+void CodeGenCPU::AddStartupFunction() {
+  if (!target_c_runtime_) {
     llvm::FunctionType* ftype = llvm::FunctionType::get(t_void_, {}, false);
     function_ = llvm::Function::Create(ftype, llvm::Function::InternalLinkage,
                                        "__tvm_module_startup", module_.get());
diff --git a/src/target/llvm/codegen_cpu.h b/src/target/llvm/codegen_cpu.h
index fc46dc53ce15..d08bd639e131 100644
--- a/src/target/llvm/codegen_cpu.h
+++ b/src/target/llvm/codegen_cpu.h
@@ -50,6 +50,12 @@ class CodeGenCPU : public CodeGenLLVM {
   llvm::Value* CreateCallExtern(Type ret_type, String global_symbol, const Array<PrimExpr>& args,
                                 bool skip_first_arg) override;
 
+  /*!
+   * \brief A CPU-specific function to create the FuncRegistry.
+   * \param func_names List of functions to be included, in order.
+   */
+  void DefineFunctionRegistry(Array<String> func_names);
+
  protected:
   void AddStartupFunction() final;
   // meta data
diff --git a/src/target/llvm/llvm_module.cc b/src/target/llvm/llvm_module.cc
index 43d20971404e..24fb3dc95819 100644
--- a/src/target/llvm/llvm_module.cc
+++ b/src/target/llvm/llvm_module.cc
@@ -34,6 +34,7 @@
 #include "../../runtime/library_module.h"
 #include "../func_registry_generator.h"
 #include "codegen_blob.h"
+#include "codegen_cpu.h"
 #include "codegen_llvm.h"
 #include "llvm_common.h"
 
@@ -445,6 +446,58 @@ TVM_REGISTER_GLOBAL("codegen.codegen_blob")
       return runtime::Module(n);
     });
 
+runtime::Module CreateLLVMCrtMetadataModule(const Array<runtime::Module>& modules, Target target) {
+  Array<String> func_names;
+  for (runtime::Module mod : modules) {
+    auto pf_funcs = mod.GetFunction("get_func_names");
+    if (pf_funcs != nullptr) {
+      Array<String> func_names_ = pf_funcs();
+      for (const auto& fname : func_names_) {
+        func_names.push_back(fname);
+      }
+    }
+  }
+
+  InitializeLLVM();
+  auto tm = GetLLVMTargetMachine(target);
+  bool system_lib = target->GetAttr<Bool>("system-lib").value_or(Bool(false));
+  bool target_c_runtime = (target->GetAttr<String>("runtime").value_or("") == kTvmRuntimeCrt);
+  ICHECK(system_lib && target_c_runtime)
+      << "For LLVM C-runtime metadata module, must include --system-lib and --runtime=c; "
+      << "got target: " << target->str();
+  auto ctx = std::make_shared<llvm::LLVMContext>();
+  std::unique_ptr<CodeGenCPU> cg{new CodeGenCPU()};
+  cg->Init("TVMMetadataMod", tm.get(), ctx.get(), system_lib, system_lib, target_c_runtime);
+
+  cg->DefineFunctionRegistry(func_names);
+  auto mod = cg->Finish();
+  mod->addModuleFlag(llvm::Module::Warning, "tvm_target",
+                     llvm::MDString::get(*ctx, LLVMTargetToString(target)));
+  mod->addModuleFlag(llvm::Module::Override, "Debug Info Version", llvm::DEBUG_METADATA_VERSION);
+
+  if (tm->getTargetTriple().isOSDarwin()) {
+    mod->addModuleFlag(llvm::Module::Override, "Dwarf Version", 2);
+  }
+
+  std::string verify_errors_storage;
+  llvm::raw_string_ostream verify_errors(verify_errors_storage);
+  LOG_IF(FATAL, llvm::verifyModule(*mod, &verify_errors))
+      << "LLVM module verification failed with the following errors: \n"
+      << verify_errors.str();
+
+  auto n = make_object<LLVMModuleNode>();
+  n->Init(std::move(mod), ctx);
+  for (auto m : modules) {
+    n->Import(m);
+  }
+  return runtime::Module(n);
+}
+
+TVM_REGISTER_GLOBAL("runtime.CreateLLVMCrtMetadataModule")
+    .set_body_typed([](const Array<runtime::Module>& modules, Target target) {
+      return CreateLLVMCrtMetadataModule(modules, target);
+    });
+
 }  // namespace codegen
 }  // namespace tvm
 #endif  // TVM_LLVM_VERSION
diff --git a/src/target/llvm/llvm_module.h b/src/target/llvm/llvm_module.h
new file mode 100644
index 000000000000..3eab00c643e5
--- /dev/null
+++ b/src/target/llvm/llvm_module.h
@@ -0,0 +1,44 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file llvm_module.h
+ * \brief Declares top-level shared functions related to the LLVM codegen.
+ */
+
+#ifndef TVM_TARGET_LLVM_LLVM_MODULE_H_
+#define TVM_TARGET_LLVM_LLVM_MODULE_H_
+
+#include <tvm/runtime/container.h>
+#include <tvm/runtime/module.h>
+#include <tvm/target/target.h>
+
+#ifdef TVM_LLVM_VERSION
+
+namespace tvm {
+namespace codegen {
+
+runtime::Module CreateLLVMCrtMetadataModule(const Array<runtime::Module>& modules, Target target);
+
+}  // namespace codegen
+}  // namespace tvm
+
+#endif  // TVM_LLVM_VERSION
+
+#endif  // TVM_TARGET_LLVM_LLVM_MODULE_H_
diff --git a/src/target/metadata_module.cc b/src/target/metadata_module.cc
new file mode 100644
index 000000000000..e2575c34d8f2
--- /dev/null
+++ b/src/target/metadata_module.cc
@@ -0,0 +1,136 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file metadata_module.cc
+ * \brief Defines functions that build MetadataModules for C++ and C runtimes.
+ */
+
+#include "metadata_module.h"
+
+#include <vector>
+
+#include "../runtime/meta_data.h"
+#include "llvm/llvm_module.h"
+#include "source/source_module.h"
+
+namespace tvm {
+namespace codegen {
+
+/*!
+ * \brief Create a metadata module wrapper. The helper is used by different
+ *        codegens, such as graph runtime codegen and the vm compiler.
+ *
+ * \param params The metadata for initialization of all modules.
+ * \param target_module the internal module that is compiled by tvm.
+ * \param ext_modules The external modules that needs to be imported inside the metadata
+ * module(s).
+ * \param target The target that all the modules are compiled for
+ * \return The created metadata module that manages initialization of metadata.
+ */
+runtime::Module CreateMetadataModule(
+    const std::unordered_map<std::string, runtime::NDArray>& params,
+    tvm::runtime::Module target_module, const Array<runtime::Module>& ext_modules, Target target) {
+  // Here we split modules into two groups:
+  //  1. Those modules which can be exported to C-runtime. These are DSO-exportable
+  //     (i.e. llvm or c) modules which return nothing from get_const_vars().
+  //  2. Other modules.
+  Array<runtime::Module> crt_exportable_modules;
+  Array<runtime::Module> non_crt_exportable_modules;
+
+  auto DSOExportable = [](tvm::runtime::Module& mod) {
+    return !std::strcmp(mod->type_key(), "llvm") || !std::strcmp(mod->type_key(), "c");
+  };
+
+  bool is_targeting_crt =
+      target.defined() && target->GetAttr<String>("runtime").value_or(String("")) == kTvmRuntimeCrt;
+
+  // Wrap all submodules in the initialization wrapper.
+  std::unordered_map<std::string, std::vector<std::string>> sym_metadata;
+  for (tvm::runtime::Module mod : ext_modules) {
+    auto pf_sym = mod.GetFunction("get_symbol");
+    auto pf_var = mod.GetFunction("get_const_vars");
+    std::vector<std::string> arrays;
+    if (pf_sym != nullptr && pf_var != nullptr) {
+      String symbol = pf_sym();
+      Array<String> variables = pf_var();
+      for (size_t i = 0; i < variables.size(); i++) {
+        arrays.push_back(variables[i].operator std::string());
+      }
+      ICHECK_EQ(sym_metadata.count(symbol), 0U) << "Found duplicated symbol: " << symbol;
+      sym_metadata[symbol] = arrays;
+    }
+    // We only need loading of serialized constant data
+    // if there are constants present and required by the
+    // runtime module to be initialized by the binary
+    // metadata module. If not rest of the modules are
+    // wrapped in c-source metadata module.
+
+    // TODO(@manupa-arm) : we should be able to use csource_metadata
+    // if the variables are empty when all the runtime modules implement get_func_names
+    if (arrays.empty() && is_targeting_crt && DSOExportable(mod) &&
+        (target->kind->name == "c" || target->kind->name == "llvm")) {
+      crt_exportable_modules.push_back(mod);
+    } else {
+      non_crt_exportable_modules.push_back(mod);
+    }
+  }
+
+  if (is_targeting_crt) {
+    if (!non_crt_exportable_modules.empty()) {
+      std::string non_exportable_modules;
+      for (unsigned int i = 0; i < non_crt_exportable_modules.size(); i++) {
+        if (i > 0) {
+          non_exportable_modules += ", ";
+        }
+        auto mod = non_crt_exportable_modules[i];
+        auto pf_sym = mod.GetFunction("get_symbol");
+        if (pf_sym != nullptr) {
+          non_exportable_modules += pf_sym().operator std::string();
+        } else {
+          non_exportable_modules +=
+              std::string{"(module type_key="} + mod->type_key() + std::string{")"};
+        }
+      }
+      CHECK(false) << "These " << non_crt_exportable_modules.size()
+                   << " modules are not exportable to C-runtime: " << non_exportable_modules;
+    }
+
+    if (target->kind->name == "c") {
+      crt_exportable_modules.push_back(target_module);
+      target_module = CreateCSourceCrtMetadataModule(crt_exportable_modules, target);
+    } else if (target->kind->name == "llvm") {
+      crt_exportable_modules.push_back(target_module);
+      target_module = CreateLLVMCrtMetadataModule(crt_exportable_modules, target);
+    }
+  } else {
+    if (!non_crt_exportable_modules.empty()) {
+      runtime::Module binary_meta_mod = runtime::MetadataModuleCreate(params, sym_metadata);
+      binary_meta_mod.Import(target_module);
+      for (const auto& it : non_crt_exportable_modules) {
+        binary_meta_mod.Import(it);
+      }
+      return binary_meta_mod;
+    }
+  }
+  return target_module;
+}
+
+}  // namespace codegen
+}  // namespace tvm
diff --git a/src/target/metadata_module.h b/src/target/metadata_module.h
new file mode 100644
index 000000000000..83cb29dd5a46
--- /dev/null
+++ b/src/target/metadata_module.h
@@ -0,0 +1,46 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file metadata_module.h
+ * \brief Declares functions that build MetadataModules for C++ and C runtimes.
+ */
+
+#ifndef TVM_TARGET_METADATA_MODULE_H_
+#define TVM_TARGET_METADATA_MODULE_H_
+
+#include <tvm/runtime/container.h>
+#include <tvm/runtime/module.h>
+#include <tvm/runtime/ndarray.h>
+#include <tvm/target/target.h>
+
+#include <string>
+#include <unordered_map>
+
+namespace tvm {
+namespace codegen {
+
+runtime::Module CreateMetadataModule(
+    const std::unordered_map<std::string, runtime::NDArray>& params,
+    tvm::runtime::Module target_module, const Array<runtime::Module>& ext_modules, Target target);
+
+}  // namespace codegen
+}  // namespace tvm
+
+#endif  // TVM_TARGET_METADATA_MODULE_H_
diff --git a/src/target/source/codegen_source_base.h b/src/target/source/codegen_source_base.h
index ed838f825812..3baa44eb639f 100644
--- a/src/target/source/codegen_source_base.h
+++ b/src/target/source/codegen_source_base.h
@@ -170,12 +170,13 @@ runtime::Module DeviceSourceModuleCreate(
     std::string type_key, std::function<std::string(const std::string&)> fget_source = nullptr);
 
 /*!
- * \brief Wrap the submodules that are to be wrapped in a c-source metadata module.
+ * \brief Wrap the submodules that are to be wrapped in a c-source metadata module for C runtime.
  * \param modules The modules to be wrapped.
  * \param target the target the modules are compiled for.
  * \return The wrapped module.
  */
-runtime::Module CreateCSourceMetadataModule(const Array<runtime::Module>& modules, Target target);
+runtime::Module CreateCSourceCrtMetadataModule(const Array<runtime::Module>& modules,
+                                               Target target);
 
 }  // namespace codegen
 }  // namespace tvm
diff --git a/src/target/source/source_module.cc b/src/target/source/source_module.cc
index 4b4770a79816..a7732719a699 100644
--- a/src/target/source/source_module.cc
+++ b/src/target/source/source_module.cc
@@ -21,12 +21,17 @@
  * \file source_module.cc
  * \brief Source code module, only for viewing
  */
+#include "source_module.h"
+
 #include <tvm/runtime/ndarray.h>
 #include <tvm/runtime/packed_func.h>
 #include <tvm/runtime/registry.h>
 
+#include <string>
+#include <unordered_map>
+#include <utility>
+
 #include "../../runtime/file_utils.h"
-#include "../../runtime/meta_data.h"
 #include "../../support/str_escape.h"
 #include "../func_registry_generator.h"
 #include "codegen_source_base.h"
@@ -43,73 +48,6 @@ using runtime::GetFileFormat;
 using runtime::GetMetaFilePath;
 using runtime::SaveBinaryToFile;
 
-/*!
- * \brief Create a metadata module wrapper. The helper is used by different
- *        codegens, such as graph runtime codegen and the vm compiler.
- *
- * \param params The metadata for initialization of all modules.
- * \param target_module the internal module that is compiled by tvm.
- * \param ext_modules The external modules that needs to be imported inside the metadata
- * module(s).
- * \param target The target that all the modules are compiled for
- * \return The created metadata module that manages initialization of metadata.
- */
-runtime::Module CreateMetadataModule(
-    const std::unordered_map<std::string, runtime::NDArray>& params,
-    tvm::runtime::Module target_module, const Array<runtime::Module>& ext_modules, Target target) {
-  Array<tvm::runtime::Module> csource_modules;
-  Array<tvm::runtime::Module> binary_modules;
-
-  auto DSOExportable = [](tvm::runtime::Module& mod) {
-    return !std::strcmp(mod->type_key(), "llvm") || !std::strcmp(mod->type_key(), "c");
-  };
-
-  // Wrap all submodules in the initialization wrapper.
-  std::unordered_map<std::string, std::vector<std::string>> sym_metadata;
-  for (tvm::runtime::Module mod : ext_modules) {
-    auto pf_sym = mod.GetFunction("get_symbol");
-    auto pf_var = mod.GetFunction("get_const_vars");
-    std::vector<std::string> arrays;
-    if (pf_sym != nullptr && pf_var != nullptr) {
-      String symbol = pf_sym();
-      Array<String> variables = pf_var();
-      for (size_t i = 0; i < variables.size(); i++) {
-        arrays.push_back(variables[i].operator std::string());
-      }
-      ICHECK_EQ(sym_metadata.count(symbol), 0U) << "Found duplicated symbol: " << symbol;
-      sym_metadata[symbol] = arrays;
-    }
-    // We only need loading of serialized constant data
-    // if there are constants present and required by the
-    // runtime module to be initialized by the binary
-    // metadata module. If not rest of the modules are
-    // wrapped in c-source metadata module.
-
-    // TODO(@manupa-arm) : we should be able to use csource_metadata
-    // if the variables are empty when all the runtime modules implement get_func_names
-    if (arrays.empty() && DSOExportable(mod) && target->kind->name == "c") {
-      csource_modules.push_back(mod);
-    } else {
-      binary_modules.push_back(mod);
-    }
-  }
-
-  if (target.defined() && target->kind->name == "c") {
-    csource_modules.push_back(target_module);
-    target_module = CreateCSourceMetadataModule(csource_modules, target);
-  }
-
-  if (!binary_modules.empty()) {
-    runtime::Module binary_meta_mod = runtime::MetadataModuleCreate(params, sym_metadata);
-    binary_meta_mod.Import(target_module);
-    for (const auto& it : binary_modules) {
-      binary_meta_mod.Import(it);
-    }
-    return binary_meta_mod;
-  }
-  return target_module;
-}
-
 // Simulator function
 class SourceModuleNode : public runtime::ModuleNode {
  public:
@@ -189,9 +127,10 @@ runtime::Module CSourceModuleCreate(const String& code, const String& fmt,
   return runtime::Module(n);
 }
 
-class CSourceMetadataModuleNode : public runtime::ModuleNode {
+class CSourceCrtMetadataModuleNode : public runtime::ModuleNode {
  public:
-  CSourceMetadataModuleNode(const Array<String>& func_names, const std::string& fmt, Target target)
+  CSourceCrtMetadataModuleNode(const Array<String>& func_names, const std::string& fmt,
+                               Target target)
       : fmt_(fmt), func_names_(func_names), target_(target) {
     CreateSource();
   }
@@ -261,7 +200,8 @@ class CSourceMetadataModuleNode : public runtime::ModuleNode {
   }
 };
 
-runtime::Module CreateCSourceMetadataModule(const Array<runtime::Module>& modules, Target target) {
+runtime::Module CreateCSourceCrtMetadataModule(const Array<runtime::Module>& modules,
+                                               Target target) {
   Array<String> func_names;
   for (runtime::Module mod : modules) {
     auto pf_funcs = mod.GetFunction("get_func_names");
@@ -272,7 +212,7 @@ runtime::Module CreateCSourceMetadataModule(const Array<runtime::Module>& module
       }
     }
   }
-  auto n = make_object<CSourceMetadataModuleNode>(func_names, "cc", target);
+  auto n = make_object<CSourceCrtMetadataModuleNode>(func_names, "cc", target);
   auto csrc_metadata_module = runtime::Module(n);
   for (const auto& mod : modules) {
     csrc_metadata_module.Import(mod);
@@ -341,9 +281,9 @@ TVM_REGISTER_GLOBAL("runtime.CSourceModuleCreate")
       return CSourceModuleCreate(code, fmt, func_names, const_vars);
     });
 
-TVM_REGISTER_GLOBAL("runtime.CreateCSourceMetadataModule")
+TVM_REGISTER_GLOBAL("runtime.CreateCSourceCrtMetadataModule")
     .set_body_typed([](const Array<runtime::Module>& modules, Target target) {
-      return CreateCSourceMetadataModule(modules, target);
+      return CreateCSourceCrtMetadataModule(modules, target);
     });
 
 }  // namespace codegen
diff --git a/src/target/source/source_module.h b/src/target/source/source_module.h
new file mode 100644
index 000000000000..45858b9f4ef2
--- /dev/null
+++ b/src/target/source/source_module.h
@@ -0,0 +1,46 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file source_module.h
+ * \brief Source code module
+ */
+
+#ifndef TVM_TARGET_SOURCE_SOURCE_MODULE_H_
+#define TVM_TARGET_SOURCE_SOURCE_MODULE_H_
+
+#include <tvm/runtime/container.h>
+#include <tvm/runtime/module.h>
+#include <tvm/target/target.h>
+
+namespace tvm {
+namespace codegen {
+
+/*!
+ * \brief Create C-runtime targeted metadata module for "c" backend.
+ * \param modules Array of modules included in the compilation output.
+ * \param target TVM target.
+ */
+runtime::Module CreateCSourceCrtMetadataModule(const Array<runtime::Module>& modules,
+                                               tvm::Target target);
+
+}  // namespace codegen
+}  // namespace tvm
+
+#endif  // TVM_TARGET_SOURCE_SOURCE_MODULE_H_
diff --git a/tests/python/unittest/test_crt.py b/tests/python/unittest/test_crt.py
index 3c68b4090309..4b744b8ee10a 100644
--- a/tests/python/unittest/test_crt.py
+++ b/tests/python/unittest/test_crt.py
@@ -28,7 +28,6 @@
 import pytest
 
 import tvm
-import tvm.testing
 import tvm.relay
 import tvm.testing
 
@@ -103,6 +102,23 @@ def test_compile_runtime():
         assert (C_data.asnumpy() == np.array([6, 7])).all()
 
 
+@tvm.testing.requires_micro
+def test_compile_runtime_llvm():
+    """Test targeting the on-device runtime with the llvm backend."""
+    global TARGET
+    old_target = TARGET
+    try:
+        # NOTE: test_compile_runtime uses the "c" backend--re run it using the llvm backend.
+        target_str = str(TARGET)
+        assert target_str.startswith("c ")
+        TARGET = tvm.target.Target("llvm " + str(TARGET)[len("c ") :])
+
+        test_compile_runtime()
+
+    finally:
+        TARGET = old_target
+
+
 @tvm.testing.requires_micro
 def test_reset():
     """Test when the remote end resets during a session."""
@@ -124,7 +140,7 @@ def test_graph_runtime():
     """Test use of the graph runtime with microTVM."""
     import tvm.micro
 
-    workspace = tvm.micro.Workspace()
+    workspace = tvm.micro.Workspace(debug=True)
     relay_mod = tvm.parser.fromtext(
         """
       #[version = "0.0.5"]
@@ -157,6 +173,19 @@ def test_std_math_functions():
     """Verify that standard math functions can be used."""
     import tvm.micro
 
+    workspace = tvm.micro.Workspace()
+
+    with _make_add_sess(workspace) as sess:
+        A_data = tvm.nd.array(np.array([2, 3], dtype="int8"), ctx=sess.context)
+        assert (A_data.asnumpy() == np.array([2, 3])).all()
+        B_data = tvm.nd.array(np.array([4], dtype="int8"), ctx=sess.context)
+        assert (B_data.asnumpy() == np.array([4])).all()
+        C_data = tvm.nd.array(np.array([0, 0], dtype="int8"), ctx=sess.context)
+        assert (C_data.asnumpy() == np.array([0, 0])).all()
+
+        system_lib = sess.get_system_lib()
+        system_lib.get_function("add")(A_data, B_data, C_data)
+
     workspace = tvm.micro.Workspace()
     A = tvm.te.placeholder((2,), dtype="float32", name="A")
     B = tvm.te.compute(A.shape, lambda i: tvm.te.exp(A[i]), name="B")
diff --git a/tests/python/unittest/test_link_params.py b/tests/python/unittest/test_link_params.py
index 80ea11f6d9aa..ffe859927ad7 100644
--- a/tests/python/unittest/test_link_params.py
+++ b/tests/python/unittest/test_link_params.py
@@ -21,6 +21,7 @@
 import re
 import struct
 import sys
+import tempfile
 
 import numpy as np
 import pytest
@@ -182,31 +183,38 @@ def _add_decl(name, dtype):
 @tvm.testing.requires_llvm
 def test_llvm_link_params():
     for dtype in LINKABLE_DTYPES:
-        mod, param_init = _make_mod_and_params(dtype)
+        ir_mod, param_init = _make_mod_and_params(dtype)
         rand_input = _make_random_tensor(dtype, INPUT_SHAPE)
-        main_func = mod["main"]
+        main_func = ir_mod["main"]
         target = "llvm --runtime=c --system-lib --link-params"
         with tvm.transform.PassContext(opt_level=3):
-            lib = tvm.relay.build(mod, target, params=param_init)
+            lib = tvm.relay.build(ir_mod, target, params=param_init)
+
+            # NOTE: Need to export_library() and load_library() to link all the Module(llvm, ...)
+            # against one another.
+            temp_dir = tempfile.mkdtemp()
+            export_file = os.path.join(temp_dir, "lib.so")
+            lib.lib.export_library(export_file)
+            mod = tvm.runtime.load_module(export_file)
             assert set(lib.params.keys()) == {"p0", "p1"}  # NOTE: op folded
+            assert mod.get_function("TVMSystemLibEntryPoint") != None
 
-            print("graph", lib.graph_json)
             graph = json.loads(lib.graph_json)
             for p in lib.params:
-                _verify_linked_param(dtype, lib, lib.lib, graph, p) or found_one
+                _verify_linked_param(dtype, lib, mod, graph, p) or found_one
 
             # Wrap in function to explicitly deallocate the runtime.
-            def _run_linked(lib):
-                graph_json, mod, _ = lib
+            def _run_linked(lib, mod):
+                graph_json, _, _ = lib
                 graph_rt = tvm.contrib.graph_runtime.create(graph_json, mod, tvm.cpu(0))
                 graph_rt.set_input("rand_input", rand_input)  # NOTE: params not required.
                 graph_rt.run()
                 return graph_rt.get_output(0)
 
-            linked_output = _run_linked(lib)
+            linked_output = _run_linked(lib, mod)
 
         with tvm.transform.PassContext(opt_level=3):
-            lib = tvm.relay.build(mod, "llvm --system-lib", params=param_init)
+            lib = tvm.relay.build(ir_mod, "llvm --system-lib", params=param_init)
 
             def _run_unlinked(lib):
                 graph_json, mod, lowered_params = lib
@@ -266,8 +274,8 @@ def test_c_link_params():
             lib = tvm.relay.build(mod, target, params=param_init)
             assert set(lib.params.keys()) == {"p0", "p1"}  # NOTE: op folded
 
-            src = lib.lib.imported_modules[0].get_source()
-            lib.lib.save("test.c", "cc")
+            src = lib.lib.get_source()
+            lib.lib.save("test.c", "c")
             c_dtype = _get_c_datatype(dtype)
             src_lines = src.split("\n")
             param = lib.params["p0"].asnumpy().reshape(np.prod(KERNEL_SHAPE))
diff --git a/tests/python/unittest/test_target_codegen_llvm.py b/tests/python/unittest/test_target_codegen_llvm.py
index 67c1f6bff429..ec7c5aea333f 100644
--- a/tests/python/unittest/test_target_codegen_llvm.py
+++ b/tests/python/unittest/test_target_codegen_llvm.py
@@ -17,6 +17,8 @@
 import collections
 import ctypes
 import json
+import sys
+
 import tvm
 import tvm.testing
 from tvm import te
@@ -26,6 +28,7 @@
 import ctypes
 import math
 import re
+import pytest
 
 
 @tvm.testing.requires_llvm
@@ -816,27 +819,4 @@ def do_atomic_add(A):
 
 
 if __name__ == "__main__":
-    test_multiple_func()
-    test_llvm_large_uintimm()
-    test_llvm_import()
-    test_alignment()
-    test_rank_zero()
-    test_rank_zero_bound_checkers()
-    test_llvm_bool()
-    test_llvm_persist_parallel()
-    test_llvm_condition()
-    test_llvm_vadd_pipeline()
-    test_llvm_add_pipeline()
-    test_llvm_intrin()
-    test_llvm_overloaded_intrin()
-    test_llvm_flip_pipeline()
-    test_llvm_madd_pipeline()
-    test_llvm_temp_space()
-    test_llvm_lookup_intrin()
-    test_llvm_div()
-    test_llvm_fp_math()
-    test_dwarf_debug_information()
-    test_llvm_shuffle()
-    test_llvm_bf16()
-    test_llvm_crt_static_lib()
-    test_llvm_gpu_lower_atomic()
+    sys.exit(pytest.main([__file__] + sys.argv[1:]))

From 929717a03a188daf29210ae6f78c48007dfbe154 Mon Sep 17 00:00:00 2001
From: Andrew Reusch <areusch@octoml.ai>
Date: Tue, 23 Feb 2021 15:29:07 -0800
Subject: [PATCH 225/357] Fix stack overflow when partially-__init__ Node
 raises exception. (#7481)

* Fix stack overflow when partially-__init__ Node raises exception.

 * If a Node subclass raises an exception and ctypes is in use before
   __init_handle_by_constructor__ is called (or self.handle is
   otherwise set), a Python stack overflow could result. This is
   because the unset handle slot causes self.handle accesses to
   fallback on the getattr(self, 'handle') method, invoking
   NodeGetAttr.
 * Then I believe this causes an infinite loop.
 * The fix is to make Node.__getattr__ raise AttributeError for all
   attributes in __slots__, then make __del__ tolerant to missing
   self.handle.
 * I don't believe cython is affected because it implements a
   descriptor to access its underlying chandle and that shouldn't be unset.

* black format

* actually use handle instead of self.handle
---
 python/tvm/_ffi/_ctypes/object.py | 7 ++++++-
 python/tvm/runtime/object.py      | 3 +++
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/python/tvm/_ffi/_ctypes/object.py b/python/tvm/_ffi/_ctypes/object.py
index d30026adf9cc..fc510b7b6504 100644
--- a/python/tvm/_ffi/_ctypes/object.py
+++ b/python/tvm/_ffi/_ctypes/object.py
@@ -106,7 +106,12 @@ class ObjectBase(object):
 
     def __del__(self):
         if _LIB is not None:
-            check_call(_LIB.TVMObjectFree(self.handle))
+            try:
+                handle = self.handle
+            except AttributeError:
+                return
+
+            check_call(_LIB.TVMObjectFree(handle))
 
     def __init_handle_by_constructor__(self, fconstructor, *args):
         """Initialize the handle by calling constructor function.
diff --git a/python/tvm/runtime/object.py b/python/tvm/runtime/object.py
index bfee7f544f9c..0c2abd296b42 100644
--- a/python/tvm/runtime/object.py
+++ b/python/tvm/runtime/object.py
@@ -56,6 +56,9 @@ def __dir__(self):
         return sorted([fnames(i) for i in range(size)] + class_names)
 
     def __getattr__(self, name):
+        if name in self.__slots__:
+            raise AttributeError(f"{name} is not set")
+
         try:
             return _ffi_node_api.NodeGetAttr(self, name)
         except AttributeError:

From e67f15b51619cebc6310e932bb7bbe5a098bc2fd Mon Sep 17 00:00:00 2001
From: Yizhi Liu <liuyizhi@apache.org>
Date: Tue, 23 Feb 2021 21:20:48 -0800
Subject: [PATCH 226/357] [COMMUNITY] @d-smirnov -> reviewer (#7510)

---
 CONTRIBUTORS.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index 42f7a8f81701..f72220d07f16 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -121,6 +121,7 @@ We do encourage everyone to work anything they are interested in.
 - [Josh Pollock](https://github.com/joshpoll): @joshpoll
 - [Jared Roesch](https://github.com/jroesch): @jroesch
 - [Andrew Reusch](https://github.com/areusch): @areusch
+- [Dmitriy Smirnov](https://github.com/d-smirnov): @d-smirnov
 - [Siva](https://github.com/srkreddy1238): @srkreddy1238
 - [Siju Samuel](https://github.com/siju-samuel): @siju-samuel
 - [Junru Shao](https://github.com/junrushao1994): @junrushao1994

From d425c144adb6e7a840da071b044773eb525e67c1 Mon Sep 17 00:00:00 2001
From: Josh Fromm <jwfromm@uw.edu>
Date: Tue, 23 Feb 2021 22:07:11 -0800
Subject: [PATCH 227/357] [Relay][Frontend][Onnx] Fix GEMM converter when C is
 not a parameter. (#7509)

* Fix onnx gemm with non parameter C.

* Add gemm tests for C.

* Fix formatting.
---
 python/tvm/relay/frontend/onnx.py          |  9 ++----
 tests/python/frontend/onnx/test_forward.py | 33 ++++++++++++++--------
 2 files changed, 24 insertions(+), 18 deletions(-)

diff --git a/python/tvm/relay/frontend/onnx.py b/python/tvm/relay/frontend/onnx.py
index 510c7eebaf46..1e5dad46782c 100644
--- a/python/tvm/relay/frontend/onnx.py
+++ b/python/tvm/relay/frontend/onnx.py
@@ -534,13 +534,8 @@ def _impl_v1(cls, inputs, attr, params):
         out = _op.nn.dense(inputs[0], inputs[1], units=channels)
 
         if len(inputs) == 3:
-            # skip (beta * C) if zero
-            C_array = params[inputs[2].name_hint].asnumpy()
-            if (beta == 0.0) or np.array_equal(C_array, np.array([0])):
-                return out
-        else:
-            return out
-        return _op.nn.bias_add(out, _expr.const(beta) * inputs[2])
+            return _op.nn.bias_add(out, _expr.const(beta) * inputs[2])
+        return out
 
 
 class MatMul(OnnxOpConverter):
diff --git a/tests/python/frontend/onnx/test_forward.py b/tests/python/frontend/onnx/test_forward.py
index 0c38dfd5c331..9e5911791481 100644
--- a/tests/python/frontend/onnx/test_forward.py
+++ b/tests/python/frontend/onnx/test_forward.py
@@ -1008,29 +1008,40 @@ def test_onehot():
         tvm.testing.assert_allclose(out_np, tvm_out, rtol=1e-5, atol=1e-5)
 
 
-@tvm.testing.uses_gpu
-def test_gemm():
-    a_shape = (4, 3)
-    b_shape = (3, 4)
+def verify_gemm(a_shape, b_shape, c_shape=None, freeze_params=False):
     out_shape = [a_shape[0], b_shape[1]]
-
     a_array = np.random.uniform(size=a_shape).astype("float32")
     b_array = np.random.uniform(size=b_shape).astype("float32")
+    input_names = ["a", "b"]
+    input_nodes = [
+        helper.make_tensor_value_info("a", TensorProto.FLOAT, list(a_shape)),
+        helper.make_tensor_value_info("b", TensorProto.FLOAT, list(b_shape)),
+    ]
+    input_values = [a_array, b_array]
+    if c_shape is not None:
+        c_array = np.random.uniform(size=c_shape).astype("float32")
+        input_names.append("c")
+        input_nodes.append(helper.make_tensor_value_info("c", TensorProto.FLOAT, list(c_shape)))
+        input_values.append(c_array)
 
-    gemm_node = helper.make_node("Gemm", ["a", "b"], ["out"])
+    gemm_node = helper.make_node("Gemm", input_names, ["out"])
 
     graph = helper.make_graph(
         [gemm_node],
         "gemm_test",
-        inputs=[
-            helper.make_tensor_value_info("a", TensorProto.FLOAT, list(a_shape)),
-            helper.make_tensor_value_info("b", TensorProto.FLOAT, list(b_shape)),
-        ],
+        inputs=input_nodes,
         outputs=[helper.make_tensor_value_info("out", TensorProto.FLOAT, list(out_shape))],
     )
 
     model = helper.make_model(graph, producer_name="gemm_test")
-    verify_with_ort_with_inputs(model, [a_array, b_array])
+    verify_with_ort_with_inputs(model, input_values, freeze_params=freeze_params)
+
+
+@tvm.testing.uses_gpu
+def test_gemm():
+    verify_gemm(a_shape=(4, 3), b_shape=(3, 4))
+    verify_gemm(a_shape=(4, 3), b_shape=(3, 4), c_shape=(4,))
+    verify_gemm(a_shape=(4, 3), b_shape=(3, 4), c_shape=(4,), freeze_params=True)
 
 
 @tvm.testing.uses_gpu

From 086dbfea76041bf12f5dc48220b0b10b1c5e1139 Mon Sep 17 00:00:00 2001
From: Thierry Moreau <tmoreau@octoml.ai>
Date: Wed, 24 Feb 2021 01:59:04 -0800
Subject: [PATCH 228/357]  [AutoScheduler] Fix the type inference for conv2d
 (#7501)

* fix type inference for conv2d

* fix
---
 src/relay/op/nn/convolution.h | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/src/relay/op/nn/convolution.h b/src/relay/op/nn/convolution.h
index 5b4850ec6653..2a49a2e251f8 100644
--- a/src/relay/op/nn/convolution.h
+++ b/src/relay/op/nn/convolution.h
@@ -226,7 +226,18 @@ bool Conv2DRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
   } else {
     // use weight to infer the conv shape.
     if (weight == nullptr) return false;
-    auto wshape = trans_kernel_layout.ForwardShape(weight->shape);
+
+    Array<PrimExpr> wshape;
+    if (param->auto_scheduler_rewritten_layout.size() == 0) {
+      wshape = weight->shape;
+    } else {
+      // works for the default kernel layout "HWIO"
+      ICHECK_EQ(param->kernel_layout, "HWIO");
+      wshape = auto_scheduler::GetShapeFromRewrittenLayout(param->auto_scheduler_rewritten_layout,
+                                                           {"ry", "rx", "rc", "ff"});
+    }
+
+    wshape = trans_kernel_layout.ForwardShape(wshape);
     if (param->kernel_size.defined()) {
       ICHECK_EQ(param->kernel_size.size(), 2);
 

From 9c5333e0387bdf011ba77d5fe55bfcb31454e7c0 Mon Sep 17 00:00:00 2001
From: Leandro Nunes <leandro.nunes@arm.com>
Date: Wed, 24 Feb 2021 18:06:41 +0000
Subject: [PATCH 229/357] [TVMC] rename composite target "acl" to
 "compute-library" (#7508)

* Renames the "acl" composite target to point to the specific
   library it represents
---
 python/tvm/driver/tvmc/composite_target.py        | 2 +-
 tests/python/driver/tvmc/test_common.py           | 4 ++--
 tests/python/driver/tvmc/test_composite_target.py | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/python/tvm/driver/tvmc/composite_target.py b/python/tvm/driver/tvmc/composite_target.py
index 7c08994e0e75..0a2592685646 100644
--- a/python/tvm/driver/tvmc/composite_target.py
+++ b/python/tvm/driver/tvmc/composite_target.py
@@ -32,7 +32,7 @@
 # to be used in the PassContext (if any), and a function
 # responsible for partitioning to that target.
 REGISTERED_CODEGEN = {
-    "acl": {
+    "compute-library": {
         "config_key": None,
         "pass_pipeline": partition_for_arm_compute_lib,
     },
diff --git a/tests/python/driver/tvmc/test_common.py b/tests/python/driver/tvmc/test_common.py
index 253f32d3f0aa..b272ceccea39 100644
--- a/tests/python/driver/tvmc/test_common.py
+++ b/tests/python/driver/tvmc/test_common.py
@@ -258,10 +258,10 @@ def test_parse_single_target_with_opts():
 
 
 def test_parse_multiple_target():
-    targets = tvmc.common.parse_target("acl, llvm -device=arm_cpu --system-lib")
+    targets = tvmc.common.parse_target("compute-library, llvm -device=arm_cpu --system-lib")
 
     assert len(targets) == 2
-    assert "acl" == targets[0]["name"]
+    assert "compute-library" == targets[0]["name"]
     assert "llvm" == targets[1]["name"]
 
 
diff --git a/tests/python/driver/tvmc/test_composite_target.py b/tests/python/driver/tvmc/test_composite_target.py
index eda7cd9224fd..cef8b117d989 100644
--- a/tests/python/driver/tvmc/test_composite_target.py
+++ b/tests/python/driver/tvmc/test_composite_target.py
@@ -38,7 +38,7 @@ def test_get_codegen_names():
 
 
 def test_valid_codegen():
-    codegen = tvmc.composite_target.get_codegen_by_target("acl")
+    codegen = tvmc.composite_target.get_codegen_by_target("compute-library")
 
     assert codegen is not None
     assert codegen["pass_pipeline"] is not None

From 88a4fdddc2bdd41a62baaaa55dbd4c524d25933d Mon Sep 17 00:00:00 2001
From: Matthew Brookhart <mbrookhart@octoml.ai>
Date: Wed, 24 Feb 2021 14:25:50 -0700
Subject: [PATCH 230/357] Support creating Bool constants in the pattern_utils
 (#7507)

---
 src/relay/transforms/pattern_utils.h          | 3 +++
 tests/python/relay/test_pass_simplify_expr.py | 2 +-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/relay/transforms/pattern_utils.h b/src/relay/transforms/pattern_utils.h
index bc0fcc9f2988..c1eebde15fba 100644
--- a/src/relay/transforms/pattern_utils.h
+++ b/src/relay/transforms/pattern_utils.h
@@ -86,6 +86,9 @@ namespace relay {
   } else if (type == DataType::UInt(8)) {                                             \
     typedef uint8_t DType;                                                            \
     { __VA_ARGS__ }                                                                   \
+  } else if (type == DataType::Bool()) {                                              \
+    typedef bool DType;                                                               \
+    { __VA_ARGS__ }                                                                   \
   } else if ((*tvm::runtime::Registry::Get("runtime._datatype_get_type_registered"))( \
                  static_cast<uint8_t>(type.code()))) {                                \
     typedef double DType;                                                             \
diff --git a/tests/python/relay/test_pass_simplify_expr.py b/tests/python/relay/test_pass_simplify_expr.py
index 3d925bcfc759..423f0a4f213d 100644
--- a/tests/python/relay/test_pass_simplify_expr.py
+++ b/tests/python/relay/test_pass_simplify_expr.py
@@ -117,7 +117,7 @@ def after_right(x, elem_op, value):
                 assert tvm.ir.structural_equal(zz, after)
 
     for shape in [[10], [10, 10], [10, 10, 10]]:
-        for dtype in ["float32", "int32"]:
+        for dtype in ["float32", "int32", "bool"]:
             for value in [0, 1, 2]:
                 validate(shape, value, dtype)
 

From 7f869879d27c0055168f25b41447f16acf8b58fd Mon Sep 17 00:00:00 2001
From: Xingyu Zhou <zhoxingy@amazon.com>
Date: Wed, 24 Feb 2021 15:25:44 -0800
Subject: [PATCH 231/357] [Frontend][Tensorflow] Support range like axis in
 tf.raw_ops.All for TF 2.x (#7502)

* add TF2.x raw_ops.all axis range support

* apply linting

* fix range() func input
---
 python/tvm/relay/frontend/tensorflow.py       | 10 +++++
 .../frontend/tensorflow/test_forward.py       | 39 +++++++++++++++++++
 2 files changed, 49 insertions(+)

diff --git a/python/tvm/relay/frontend/tensorflow.py b/python/tvm/relay/frontend/tensorflow.py
index ac52ab768066..3a3c5fcecd42 100644
--- a/python/tvm/relay/frontend/tensorflow.py
+++ b/python/tvm/relay/frontend/tensorflow.py
@@ -1976,6 +1976,16 @@ def _impl(inputs, attr, params, mod):
                 # Symbolic delta
                 delta = inputs[2]
 
+        # if all attributes are constant, evalute the range function and return relay.const
+        if all(
+            [
+                isinstance(start, (np.int32, np.int64, int, np.float32, np.float64, float)),
+                isinstance(limit, (np.int32, np.int64, int, np.float32, np.float64, float)),
+                isinstance(delta, (np.int32, np.int64, int, np.float32, np.float64, float)),
+            ]
+        ):
+            return tvm.relay.const(list(range(int(start), int(limit), int(delta))))
+
         dtype = attr["Tidx"].name if "Tidx" in attr else str(start.dtype)
         if isinstance(start, (np.int32, np.int64, int, np.float32, np.float64, float)):
             start = _expr.const(start, dtype=dtype)
diff --git a/tests/python/frontend/tensorflow/test_forward.py b/tests/python/frontend/tensorflow/test_forward.py
index ecf6441bc6b9..d0038caea09f 100644
--- a/tests/python/frontend/tensorflow/test_forward.py
+++ b/tests/python/frontend/tensorflow/test_forward.py
@@ -3948,6 +3948,45 @@ def _test_math_op(op, dtypes=["int32", "float32"]):
         _test_math_op(tf.math.reduce_euclidean_norm)
 
 
+#######################################################################
+# All, Max, Min
+# ------------------------------------------------------------------
+
+
+def test_forward_raw_reduce():
+    def _check_op(tf_op, ishape, axis, keepdims, range_axis=False, dtype="float32"):
+        tf.reset_default_graph()
+        if dtype == "bool":
+            np_data = np.random.choice([True, False], size=ishape)
+        else:
+            np_data = np.random.uniform(size=ishape).astype(dtype)
+        if tf_op == tf.math.reduce_prod:
+            axis = 1
+            np_data = np_data.reshape(1, -1)
+        with tf.Graph().as_default():
+            if range_axis:
+                axis = tf.range(axis[0], axis[1], axis[2], name="range", dtype="int32")
+            in_data = tf.placeholder(dtype, name="in_data")
+            reduce_op = tf_op(input=in_data, axis=axis, keep_dims=keepdims, name="reduce_std")
+            compare_tf_with_tvm([np_data], ["in_data:0"], reduce_op.name)
+
+    def _test_raw_reduce_op(op, dtypes=["int32", "float32"]):
+        for dtype in dtypes:
+            _check_op(op, (3, 10), axis=(-1), keepdims=False, dtype=dtype)
+            _check_op(op, (8, 16, 32), axis=(-1), keepdims=False, dtype=dtype)
+            _check_op(op, (1, 8, 8, 3), axis=(2, 3), keepdims=True, dtype=dtype)
+            _check_op(op, (2, 3, 10, 10), axis=(1, 2), keepdims=True, dtype=dtype)
+            _check_op(op, (1, 8, 8, 3), axis=(2, 4, 1), keepdims=True, range_axis=True, dtype=dtype)
+            _check_op(
+                op, (2, 3, 10, 10), axis=(1, 3, 1), keepdims=True, range_axis=True, dtype=dtype
+            )
+
+    if package_version.parse(tf.VERSION) >= package_version.parse("2.4.1"):
+        _test_raw_reduce_op(tf.raw_ops.All, dtypes=["bool"])
+        _test_raw_reduce_op(tf.raw_ops.Max)
+        _test_raw_reduce_op(tf.raw_ops.Min)
+
+
 #######################################################################
 # Relational operators
 # --------------------

From 1455536cc22465cd6b7f6dde4f235c89ed4c5c3c Mon Sep 17 00:00:00 2001
From: Jorn Tuyls <jtuyls@users.noreply.github.com>
Date: Wed, 24 Feb 2021 23:32:03 +0000
Subject: [PATCH 232/357] [BYOC][VitisAI] Fix issue in Vitis AI codegen out
 tensor names matching & update docs and docker (#7350)

* Fix bug in vitis ai codegen out tensor names matching & update docs & update docker

* Update vitis_ai.rst

* Move gpg-agent package installation to vitis ai core script

* Refactor install_vitis_ai_core script

* Update docs/deploy/vitis_ai.rst

Co-authored-by: Cody Yu <comaniac0422@gmail.com>

* Update docs/deploy/vitis_ai.rst

Co-authored-by: Cody Yu <comaniac0422@gmail.com>

* Update vitis-ai docs pynq/edge setup & adjustements for comments

* Update python/tvm/contrib/target/vitis_ai.py

Co-authored-by: Cody Yu <comaniac0422@gmail.com>

* Reorg Vitis AI dockerfile to make sure gpg-agent is installed before llvm

Co-authored-by: Jorn Tuyls <jornt.tuyls@gmail.com>
Co-authored-by: Cody Yu <comaniac0422@gmail.com>
---
 docker/Dockerfile.demo_vitis_ai               |  9 +-
 .../install/ubuntu_install_vitis_ai_core.sh   | 12 +--
 docs/deploy/vitis_ai.rst                      | 95 +++++++++++++++----
 python/tvm/contrib/target/vitis_ai.py         |  6 +-
 python/tvm/relay/op/contrib/vitis_ai.py       |  4 +
 5 files changed, 93 insertions(+), 33 deletions(-)

diff --git a/docker/Dockerfile.demo_vitis_ai b/docker/Dockerfile.demo_vitis_ai
index 58326b66bf0c..8cc623e2f38c 100644
--- a/docker/Dockerfile.demo_vitis_ai
+++ b/docker/Dockerfile.demo_vitis_ai
@@ -20,10 +20,13 @@ FROM xilinx/vitis-ai:latest
 
 RUN apt-get update --fix-missing
 
-
 COPY install/ubuntu_install_core.sh /install/ubuntu_install_core.sh
 RUN bash /install/ubuntu_install_core.sh
 
+# Install Vitis-AI ubuntu dependencies
+COPY install/ubuntu_install_vitis_ai_core.sh /install/ubuntu_install_vitis_ai_core.sh
+RUN bash /install/ubuntu_install_vitis_ai_core.sh
+
 COPY install/ubuntu_install_python.sh /install/ubuntu_install_python.sh
 RUN bash /install/ubuntu_install_python.sh
 
@@ -43,10 +46,6 @@ ENV PATH $PATH:$CARGO_HOME/bin:/usr/lib/go-1.10/bin
 COPY install/ubuntu_install_java.sh /install/ubuntu_install_java.sh
 RUN bash /install/ubuntu_install_java.sh
 
-# Install Vitis-AI ubuntu dependencies
-COPY install/ubuntu_install_vitis_ai_core.sh /install/ubuntu_install_vitis_ai_core.sh
-RUN bash /install/ubuntu_install_vitis_ai_core.sh
-
 # Install dependencies inside vitis-ai-tensorflow conda
 RUN . $VAI_ROOT/conda/etc/profile.d/conda.sh && \
     conda activate vitis-ai-tensorflow && \
diff --git a/docker/install/ubuntu_install_vitis_ai_core.sh b/docker/install/ubuntu_install_vitis_ai_core.sh
index ea05ffd170fe..a2d7c2ebe332 100644
--- a/docker/install/ubuntu_install_vitis_ai_core.sh
+++ b/docker/install/ubuntu_install_vitis_ai_core.sh
@@ -21,9 +21,9 @@ set -u
 set -o pipefail
 
 # install libraries for building Vitis-AI on ubuntu
-apt-get update && apt-get install -y --no-install-recommends \
-    graphviz\
-    gnupg2
-
-apt-get update && apt-get install -y gcc-aarch64-linux-gnu
-
+apt-get update && apt-get install -y \
+    graphviz \
+    gnupg2 \
+    gpg-agent \
+    gcc-aarch64-linux-gnu \
+    && rm -rf /var/lib/apt/lists/*
diff --git a/docs/deploy/vitis_ai.rst b/docs/deploy/vitis_ai.rst
index df29f16f9d8d..7de8f58ce54f 100755
--- a/docs/deploy/vitis_ai.rst
+++ b/docs/deploy/vitis_ai.rst
@@ -304,15 +304,22 @@ Edge hardware setup
   This section provides instructions for setting up with the `Pynq <http://www.pynq.io/>`__ platform but
   Petalinux based flows are also supported.
 
-1. Download the Pynq v2.5 image for your target (use Z1 or Z2 for
+1. Download the Pynq v2.6 image for your target (use Z1 or Z2 for
    Ultra96 target depending on board version) Link to image:
-   https://github.com/Xilinx/PYNQ/releases/tag/v2.5
+   https://github.com/Xilinx/PYNQ/releases/tag/v2.6.0
 2. Follow Pynq instructions for setting up the board: `pynq
    setup <https://pynq.readthedocs.io/en/latest/getting_started.html>`__
-3. After connecting to the board, make sure to run as root. Execute
+3. After connecting to the board, make sure to run as root. **Execute**
    ``su``
-4. Set up DPU on Pynq by following the steps here: `DPU Pynq
-   setup <https://github.com/Xilinx/DPU-PYNQ>`__
+4. Set up DPU on Pynq:
+
+    .. code:: bash
+
+     git clone --branch v1.2.0 --recursive --shallow-submodules https://github.com/Xilinx/DPU-PYNQ.git
+     cd DPU-PYNQ/upgrade
+     make
+     pip3 install pynq-dpu==1.2.0
+
 5. Run the following command to download the DPU bitstream:
 
    .. code:: bash
@@ -343,7 +350,7 @@ interface between TVM and Vitis-AI tools.
    .. code:: bash
 
       apt-get install libhdf5-dev
-      pip3 install pydot h5py
+      pip3 install pydot==1.4.1 h5py==2.8.0
 
 2. Install PyXIR
 
@@ -362,16 +369,17 @@ interface between TVM and Vitis-AI tools.
       mkdir build
       cp cmake/config.cmake build
       cd build
+      echo set\(USE_LLVM OFF\) >> config.cmake
       echo set\(USE_VITIS_AI ON\) >> config.cmake
       cmake ..
-      make
+      make tvm_runtime -j$(nproc)
 
 4. Install TVM
 
    .. code:: bash
 
       cd tvm/python
-      pip3 install -e . --user
+      pip3 install -e .
 
 5. Check whether the setup was successful in the Python shell:
 
@@ -441,7 +449,7 @@ TVM.
    import tvm
    import tvm.relay as relay
    from tvm.contrib.target import vitis_ai
-   from tvm.contrib import util, graph_runtime
+   from tvm.contrib import utils, graph_runtime
    from tvm.relay.build_module import bind_params_by_name
    from tvm.relay.op.contrib.vitis_ai import annotation
 
@@ -524,6 +532,8 @@ model in TVM with Vitis-AI at the edge. The first couple of steps will
 have to be run on the host machine and take care of quantization and
 compilation for deployment at the edge.
 
+A complete ResNet 18 example can be found `here <https://github.com/Xilinx/pyxir/tree/master/examples/tvm>`__.
+
 Host steps
 ^^^^^^^^^^
 
@@ -541,7 +551,7 @@ TVM.
    import tvm
    import tvm.relay as relay
    from tvm.contrib.target import vitis_ai
-   from tvm.contrib import util, graph_runtime
+   from tvm.contrib import utils, graph_runtime
    from tvm.relay.build_module import bind_params_by_name
    from tvm.relay.op.contrib.vitis_ai import annotation
 
@@ -549,12 +559,47 @@ After importing a convolutional neural network model using the usual
 Relay API's, annotate the Relay expression for the given Vitis-AI DPU
 target and partition the graph.
 
+.. note::
+
+    We recommend converting DPU convolutions' data layouts to NHWC and CPU convolutions'
+    data layouts to NCHW for best DPU and out of the box CPU performance. You can use the
+    ConvertLayout transformation pass two times to achieve this as demonstrated in the code
+    block underneath. You can also leave the CPU convolution layouts in NHWC and tune ARM CPU
+    performance for this data layout to avoid the layout transformation overheads introduced by
+    executing DPU convolutions in NHWC and CPU convolutions in NCHW
+    (check out the `AutoScheduling <https://tvm.apache.org/docs/tutorials/index.html#autoscheduler-template-free-auto-scheduling>`__
+    and `AutoTuning <https://tvm.apache.org/docs/tutorials/autotvm/tune_relay_arm.html>`__
+    tutorials for this).
+
 .. code:: python
 
    mod["main"] = bind_params_by_name(mod["main"], params)
+   
+   # For edge DPU we recommend converting the convolutions' data layout
+   #    to NHWC for best performance. Therefore, we first convert the layouts
+   #    of all convolutions to NHWC before partitioning. Afterwards, we can
+   #    convert any remaining convolutions (to be executed on CPU) back to NCHW.
+   desired_layouts = {'nn.conv2d': ['NHWC', 'default']}
+   seq = tvm.transform.Sequential([relay.transform.RemoveUnusedFunctions(),
+                                   relay.transform.ConvertLayout(desired_layouts),
+                                   relay.transform.FoldConstant()])
+   with tvm.transform.PassContext(opt_level=3):
+       mod = seq(mod)
+            
+   # Annotate and partition the Relay expression for the given target
    mod = annotation(mod, params, target)
    mod = relay.transform.MergeCompilerRegions()(mod)
    mod = relay.transform.PartitionGraph()(mod)
+   
+   # After partitioning we recommend transforming the remaining convolutions
+   #    (that will be executed on CPU, if any) back to NCHW data layout
+   #    for best CPU performance
+   desired_layouts = {'nn.conv2d': ['NCHW', 'default']}
+   seq = tvm.transform.Sequential([relay.transform.RemoveUnusedFunctions(),
+                                   relay.transform.ConvertLayout(desired_layouts),
+                                   relay.transform.FoldConstant()])
+   with tvm.transform.PassContext(opt_level=3):
+       mod = seq(mod)
 
 Now, we can build the TVM runtime library for executing the model. The
 TVM target is 'llvm' as the operations that can't be handled by the DPU
@@ -572,13 +617,9 @@ can be included.
 
 .. code:: python
 
-   from tvm.contrib import util
-
-   temp = util.tempdir()
-
    tvm_target = 'llvm'
    target='DPUCZDX8G-zcu104'
-   export_rt_mod_file = temp.relpath("vitis_ai.rtmod")
+   export_rt_mod_file = "vitis_ai.rtmod"
 
    with tvm.transform.PassContext(opt_level=3, config= {'relay.ext.vitis_ai.options.target': target,
    						        'relay.ext.vitis_ai.options.export_runtime_module': export_rt_mod_file}):
@@ -604,9 +645,9 @@ Save the TVM lib module so that the Vitis-AI runtime module will also be exporte
 
 .. code:: python
 
-   from tvm.contrib import util
+   from tvm.contrib import utils
 
-   temp = util.tempdir()
+   temp = utils.tempdir()
    lib.export_library(temp.relpath("tvm_lib.so"))
 
 After quantizing and compiling the model for Vitis-AI acceleration using the
@@ -638,15 +679,31 @@ Edge steps
 ^^^^^^^^^^
 
 After setting up TVM with Vitis-AI on the edge device, you can now load
-the TVM runtime module into memory and feed inputs for inference.
+the TVM runtime module into memory and feed inputs for inference. A nearly
+complete runtiem script can be found underneath. Make sure to run the script
+as root (execute ``su`` in terminal to log into root).
+
+
+.. note::
+
+    You will see a warning about the 'cpu-tf' runtime not being found. This warning is
+    expected on the board and can be ignored. Note also that you **shouldn't** import the
+    PyXIR targets in the run script (``import pyxir.contrib.target.DPUCZDX8G``).
 
 .. code:: python
 
+   import pyxir
+   import tvm
+   from tvm.contrib import graph_runtime
+
    ctx = tvm.cpu()
+   
+   # input_name = ...
+   # input_data = ...
 
    # load the module into memory
    lib = tvm.runtime.load_module("tvm_dpu_arm.so")
 
    module = graph_runtime.GraphModule(lib["default"](tvm.cpu()))
-   module.set_input(name, data)
+   module.set_input(input_name, input_data)
    module.run()
diff --git a/python/tvm/contrib/target/vitis_ai.py b/python/tvm/contrib/target/vitis_ai.py
index d4931d9e3f48..f319fd799829 100644
--- a/python/tvm/contrib/target/vitis_ai.py
+++ b/python/tvm/contrib/target/vitis_ai.py
@@ -132,14 +132,14 @@ def vitis_ai_compiler(ref):
         layers = xgraph.get_layers()
 
         # Get the output tensor names using XGraph and output Relay ids
-        out_tensor_names = []
+        out_tensor_names = ["unknown_name"] * len(output_relay_ids)
         for layer in layers:
             if not layer.internal:
                 for relay_id in layer.attrs["relay_id"]:
                     if relay_id in output_relay_ids:
-                        out_tensor_names.append(layer.name)
+                        out_tensor_names[output_relay_ids.index(relay_id)] = layer.name
                         break
-        if not out_tensor_names:
+        if any([name == "unkown_name" for name in out_tensor_names]):
             raise ValueError(
                 "During codegeneration the loading of subexpression \
                              failed due to output tensor name mismatch in Relay PyXIR interface."
diff --git a/python/tvm/relay/op/contrib/vitis_ai.py b/python/tvm/relay/op/contrib/vitis_ai.py
index fa17c63fc00a..aaa9f99e61ed 100644
--- a/python/tvm/relay/op/contrib/vitis_ai.py
+++ b/python/tvm/relay/op/contrib/vitis_ai.py
@@ -85,6 +85,10 @@ def visit_call(self, call):
 
 def annotation(mod, params, target):
     """Annotate Relay expression for Vitis-AI DPU accelerators"""
+    # We need type information for supporting models that contain operations that don't
+    #   have a Relay to XLayer translation
+    mod = relay.transform.InferType()(mod)
+
     xgraph = pyxir.frontend.tvm.from_relay(mod, params, postprocessing=None)
     xgraph = pyxir.partition(xgraph, targets=[target])
 

From 9e74f90c9438d181ef948cc8ccc6951f373885ad Mon Sep 17 00:00:00 2001
From: Trevor Morris <trevmorr@amazon.com>
Date: Wed, 24 Feb 2021 22:17:17 -0800
Subject: [PATCH 233/357] Support CombinedNMS in TF frontend. (#7520)

---
 python/tvm/relay/frontend/tensorflow.py       | 104 ++++++++++++++++++
 .../frontend/tensorflow/test_forward.py       |  49 +++++++++
 2 files changed, 153 insertions(+)

diff --git a/python/tvm/relay/frontend/tensorflow.py b/python/tvm/relay/frontend/tensorflow.py
index 3a3c5fcecd42..ab98cddd3835 100644
--- a/python/tvm/relay/frontend/tensorflow.py
+++ b/python/tvm/relay/frontend/tensorflow.py
@@ -773,6 +773,109 @@ def _impl(inputs, attr, params, mod):
     return _impl
 
 
+def _combined_nms():
+    def _impl(inputs, attr, params, mod):
+        # Get parameter values
+        boxes = inputs[0]
+        scores = inputs[1]
+        try:
+            max_output_size = int(np.atleast_1d(inputs[2].data.asnumpy().astype("int64"))[0])
+        except Exception:
+            try:
+                max_output_size = (
+                    _infer_value(inputs[2], params, mod).asnumpy().astype("int64").tolist()[0]
+                )
+            except Exception:
+                max_output_size = inputs[2]
+        max_total_size = inputs[3]
+        iou_threshold = np.atleast_1d(inputs[4].data.asnumpy())[0]
+        score_threshold = np.atleast_1d(inputs[5].data.asnumpy())[0]
+        if attr["pad_per_class"]:
+            raise tvm.error.OpAttributeUnImplemented(
+                "pad_per_class for CombinedNonMaxSuppression is not supported"
+            )
+        boxes_shape = _infer_shape(inputs[0], mod)
+        scores_shape = _infer_shape(inputs[1], mod)
+        batch_size = boxes_shape[0]
+        num_anchors = boxes_shape[1]
+        q = boxes_shape[2]
+        num_classes = scores_shape[2]
+
+        if q != num_classes:
+            # When q is 1, it means same box coords are used for all classes.
+            boxes = _op.broadcast_to(boxes, (batch_size, num_anchors, num_classes, 4))
+        boxes = _op.reshape(boxes, newshape=[batch_size, num_anchors * num_classes, 4])
+        scores = _op.reshape(scores, newshape=[batch_size, num_anchors * num_classes, 1])
+
+        # In TF, class is specified by memory layout only.
+        ids = _op.arange(_op.const(num_classes, dtype="float32"))
+        ids = _op.broadcast_to(ids, (batch_size, num_anchors, num_classes))
+        ids = _op.reshape(ids, newshape=[batch_size, num_anchors * num_classes, 1])
+
+        data = _op.concatenate([ids, scores, boxes], -1)
+        ct, data, indices = _op.vision.get_valid_counts(
+            data, score_threshold=score_threshold, id_index=0, score_index=1
+        )
+        nms_ret = _op.vision.non_max_suppression(
+            data=data,
+            valid_count=ct,
+            indices=indices,
+            max_output_size=max_output_size,
+            iou_threshold=iou_threshold,
+            force_suppress=False,
+            top_k=-1,
+            coord_start=2,
+            score_index=1,
+            id_index=0,
+            return_indices=False,
+            invalid_to_bottom=True,
+        )
+        # Dynamic slice to max_total_size
+        neg_one = _expr.const([-1])
+        slice_end = _op.concatenate(
+            [neg_one, _op.expand_dims(max_total_size, axis=0), neg_one], axis=0
+        )
+        nms_ret = _op.strided_slice(
+            nms_ret, begin=[0, 0, 0], end=slice_end, strides=[1, 1, 1], slice_mode="size"
+        )
+
+        # Slice output into boxes, scores, classes
+        nmsed_boxes = _op.strided_slice(
+            nms_ret, begin=[0, 0, 2], end=[-1, -1, 4], slice_mode="size"
+        )
+        if attr["clip_boxes"]:
+            nmsed_boxes = _op.maximum(nmsed_boxes, _expr.const(0, dtype="float32"))
+            nmsed_boxes = _op.minimum(nmsed_boxes, _expr.const(1, dtype="float32"))
+        nmsed_scores = _op.strided_slice(
+            nms_ret, begin=[0, 0, 1], end=[-1, -1, 1], slice_mode="size"
+        )
+        nmsed_scores = _op.squeeze(nmsed_scores, axis=[2])
+        nmsed_classes = _op.strided_slice(
+            nms_ret, begin=[0, 0, 0], end=[-1, -1, 1], slice_mode="size"
+        )
+        nmsed_classes = _op.squeeze(nmsed_classes, axis=[2])
+        # Get number of valid boxes
+        nms_count = _op.sum(
+            _op.cast(_op.greater(nmsed_scores, _expr.const(0, dtype="float32")), "int32"), axis=1
+        )
+
+        # TVM uses -1 for invalid outputs while TF uses 0
+        box_range = _op.arange(_expr.const(0, dtype="int32"), max_total_size, dtype="int32")
+        shape = _op.strided_slice(_op.shape_of(nmsed_boxes), begin=[0], end=[2])
+        box_range = _op.broadcast_to(box_range, shape)
+        valid_mask = _op.cast(_op.less(box_range, _op.expand_dims(nms_count, axis=1)), "float32")
+        nmsed_boxes = nmsed_boxes * _op.expand_dims(valid_mask, axis=2)
+        # Could instead use mask for scores, classes if negative values are possible.
+        nmsed_scores = _op.maximum(nmsed_scores, _expr.const(0, dtype="float32"))
+        nmsed_classes = _op.maximum(nmsed_classes, _expr.const(0, dtype="float32"))
+
+        return _expr.TupleWrapper(
+            _expr.Tuple([nmsed_boxes, nmsed_scores, nmsed_classes, nms_count]), 4
+        )
+
+    return _impl
+
+
 def _decode_image():
     def _impl(inputs, attr, params, mod):
         # Image decode wrapper: Expecting user to feed decoded input to next layer drop this layer.
@@ -2483,6 +2586,7 @@ def _impl(inputs, attr, params, mod):
     "NonMaxSuppressionV3": _nms(),
     "NonMaxSuppressionV4": _nms(),
     "NonMaxSuppressionV5": _nms(True),
+    "CombinedNonMaxSuppression": _combined_nms(),
     "NoOp": _no_op(),
     "NotEqual": _broadcast("not_equal"),
     "OneHot": _one_hot(),
diff --git a/tests/python/frontend/tensorflow/test_forward.py b/tests/python/frontend/tensorflow/test_forward.py
index d0038caea09f..51e26312f52f 100644
--- a/tests/python/frontend/tensorflow/test_forward.py
+++ b/tests/python/frontend/tensorflow/test_forward.py
@@ -2837,6 +2837,55 @@ def test_forward_nms():
         _test_forward_nms((2000, 4), (2000,), 0.4, 0.6, 7)
 
 
+def _test_forward_combined_nms(
+    bx_shape,
+    score_shape,
+    iou_threshold,
+    score_threshold,
+    out_size,
+    total_size,
+    clip_boxes=False,
+    dtype="float32",
+):
+    boxes = np.random.uniform(-1, 2, size=bx_shape).astype(dtype)
+    scores = np.random.uniform(size=score_shape).astype(dtype)
+    max_output_size = np.int32(out_size)
+    tf.reset_default_graph()
+    in_data_1 = tf.placeholder(dtype, boxes.shape, name="in_data_1")
+    in_data_2 = tf.placeholder(dtype, scores.shape, name="in_data_2")
+    in_data_3 = tf.placeholder(tf.int32, name="in_data_3")
+    tf.image.combined_non_max_suppression(
+        boxes=in_data_1,
+        scores=in_data_2,
+        max_output_size_per_class=in_data_3,
+        max_total_size=total_size,
+        iou_threshold=iou_threshold,
+        score_threshold=score_threshold,
+        pad_per_class=False,
+        clip_boxes=clip_boxes,
+        name="nms",
+    )
+    compare_tf_with_tvm(
+        [boxes, scores, max_output_size],
+        ["in_data_1:0", "in_data_2:0", "in_data_3:0"],
+        [
+            "nms/CombinedNonMaxSuppression:0",
+            "nms/CombinedNonMaxSuppression:1",
+            "nms/CombinedNonMaxSuppression:2",
+            "nms/CombinedNonMaxSuppression:3",
+        ],
+        mode="vm",
+    )
+
+
+def test_forward_combined_nms():
+    """ CombinedNonMaxSuppression """
+    _test_forward_combined_nms((1, 64, 1, 4), (1, 64, 1), 0.7, 0.5, 64, 64)
+    _test_forward_combined_nms((1, 64, 1, 4), (1, 64, 20), 0.7, 0.5, 64, 10)
+    _test_forward_combined_nms((1, 64, 20, 4), (1, 64, 20), 0.7, 0.5, 64, 64, clip_boxes=True)
+    _test_forward_combined_nms((2, 200, 1, 4), (2, 200, 1), 0.4, 0.6, 100, 100)
+
+
 #######################################################################
 # LSTM
 # ----

From fc6f08aca908f6bd98c4b2d41523180a70fb7f72 Mon Sep 17 00:00:00 2001
From: ANSHUMAN TRIPATHY <anshuman.t@huawei.com>
Date: Thu, 25 Feb 2021 15:23:12 +0530
Subject: [PATCH 234/357] [Frontend] TF V2 sparse.todense() test added (#7473)

* [Frontend] TF V2 sparse.todense() test added

* [1] Review comments handled
---
 .../frontend/tensorflow/test_forward.py       | 175 ++++++++++--------
 1 file changed, 98 insertions(+), 77 deletions(-)

diff --git a/tests/python/frontend/tensorflow/test_forward.py b/tests/python/frontend/tensorflow/test_forward.py
index 51e26312f52f..5ed3e72206e4 100644
--- a/tests/python/frontend/tensorflow/test_forward.py
+++ b/tests/python/frontend/tensorflow/test_forward.py
@@ -1955,6 +1955,104 @@ def test_forward_sparse_fill_empty_rows(
     )
 
 
+#######################################################################
+# tensorflow.compat.v1.sparse_to_dense
+# ---------------
+def _test_sparse_to_dense(sparse_indices, sparse_values, default_value, output_shape):
+    with tf.Graph().as_default():
+        indices = tf.placeholder(
+            shape=sparse_indices.shape, dtype=str(sparse_indices.dtype), name="indices"
+        )
+        values = tf.placeholder(
+            shape=sparse_values.shape, dtype=str(sparse_values.dtype), name="values"
+        )
+        oshape = tf.constant(output_shape, shape=output_shape.shape, dtype=str(output_shape.dtype))
+
+        if default_value == None:
+            output = tf.sparse_to_dense(indices, oshape, values)
+            compare_tf_with_tvm(
+                [sparse_indices, sparse_values], ["indices:0", "values:0"], output.name
+            )
+        else:
+            dv = tf.placeholder(shape=(), dtype=str(default_value.dtype), name="default_value")
+            output = tf.sparse_to_dense(indices, oshape, values, dv)
+            compare_tf_with_tvm(
+                [sparse_indices, sparse_values, default_value],
+                ["indices:0", "values:0", "default_value:0"],
+                output.name,
+            )
+
+
+def test_forward_sparse_to_dense():
+    # scalar
+    _test_sparse_to_dense(
+        sparse_indices=np.int32(1),
+        sparse_values=np.int32(3),
+        default_value=np.int32(0),
+        output_shape=np.array([5]).astype("int32"),
+    )
+
+    # vector
+    _test_sparse_to_dense(
+        sparse_indices=np.array([0, 1, 4]).astype("int32"),
+        sparse_values=np.array([3, 3, 3]).astype("int32"),
+        default_value=np.int32(0),
+        output_shape=np.array([5]).astype("int32"),
+    )
+
+    # vector nXd
+    _test_sparse_to_dense(
+        sparse_indices=np.array([[0, 0], [1, 2]]).astype("int32"),
+        sparse_values=np.array([1, 2]).astype("int32"),
+        default_value=np.int32(0),
+        output_shape=np.array([3, 4]).astype("int32"),
+    )
+
+    _test_sparse_to_dense(
+        sparse_indices=np.array([[0, 0, 0], [1, 2, 3]]).astype("int32"),
+        sparse_values=np.array([1, 2]).astype("int32"),
+        default_value=np.int32(4),
+        output_shape=np.array([2, 3, 4]).astype("int32"),
+    )
+
+    # floats
+    _test_sparse_to_dense(
+        sparse_indices=np.array([0, 1, 4]).astype("int32"),
+        sparse_values=np.array([3.1, 3.1, 3.1]).astype("float32"),
+        default_value=np.float32(3.5),
+        output_shape=np.array([5]).astype("int32"),
+    )
+
+    # default value not specified
+    _test_sparse_to_dense(
+        sparse_indices=np.array([0, 1, 4]).astype("int32"),
+        sparse_values=np.array([3.1, 3.1, 3.1]).astype("float32"),
+        default_value=None,
+        output_shape=np.array([5]).astype("int32"),
+    )
+
+
+#######################################################################
+# tensorflow.sparse.to_dense
+# ---------------
+def _test_sparse_to_dense_v2(indices, values, A_shape, dtype, default_value=None):
+    with tf.Graph().as_default():
+        A_sp = tf.sparse.SparseTensor(indices=indices, values=values, dense_shape=A_shape)
+
+        result = tf.sparse.to_dense(A_sp, default_value=default_value)
+
+        compare_tf_with_tvm([], [], result.name)
+
+
+def test_forward_sparse_to_dense_v2():
+    _test_sparse_to_dense_v2([[1]], [3.0], [5], "float32")
+    _test_sparse_to_dense_v2([[1]], [3.0], [5], "float32", 0.3)
+    _test_sparse_to_dense_v2([[0, 0], [1, 2]], [4.0, 8.0], [3, 4], "float32")
+    _test_sparse_to_dense_v2([[0, 0], [1, 2]], [4.0, 8.0], [3, 4], "float32", 1.3)
+    _test_sparse_to_dense_v2([[0, 0], [1, 3], [4, 3]], [3.0, 6.0, 9.0], [5, 5], "float32")
+    _test_sparse_to_dense_v2([[0, 0], [1, 3], [4, 3]], [3.0, 6.0, 9.0], [5, 5], "float32", 1.9)
+
+
 #######################################################################
 # StridedSlice
 # ------------
@@ -4355,83 +4453,6 @@ def test_forward_identityn(data_np_list):
     _test_identityn(data_np_list)
 
 
-#######################################################################
-# Sparse To Dense
-# ---------------
-def _test_sparse_to_dense(sparse_indices, sparse_values, default_value, output_shape):
-    with tf.Graph().as_default():
-        indices = tf.placeholder(
-            shape=sparse_indices.shape, dtype=str(sparse_indices.dtype), name="indices"
-        )
-        values = tf.placeholder(
-            shape=sparse_values.shape, dtype=str(sparse_values.dtype), name="values"
-        )
-        oshape = tf.constant(output_shape, shape=output_shape.shape, dtype=str(output_shape.dtype))
-
-        if default_value == None:
-            output = tf.sparse_to_dense(indices, oshape, values)
-            compare_tf_with_tvm(
-                [sparse_indices, sparse_values], ["indices:0", "values:0"], output.name
-            )
-        else:
-            dv = tf.placeholder(shape=(), dtype=str(default_value.dtype), name="default_value")
-            output = tf.sparse_to_dense(indices, oshape, values, dv)
-            compare_tf_with_tvm(
-                [sparse_indices, sparse_values, default_value],
-                ["indices:0", "values:0", "default_value:0"],
-                output.name,
-            )
-
-
-def test_forward_sparse_to_dense():
-    # scalar
-    _test_sparse_to_dense(
-        sparse_indices=np.int32(1),
-        sparse_values=np.int32(3),
-        default_value=np.int32(0),
-        output_shape=np.array([5]).astype("int32"),
-    )
-
-    # vector
-    _test_sparse_to_dense(
-        sparse_indices=np.array([0, 1, 4]).astype("int32"),
-        sparse_values=np.array([3, 3, 3]).astype("int32"),
-        default_value=np.int32(0),
-        output_shape=np.array([5]).astype("int32"),
-    )
-
-    # vector nXd
-    _test_sparse_to_dense(
-        sparse_indices=np.array([[0, 0], [1, 2]]).astype("int32"),
-        sparse_values=np.array([1, 2]).astype("int32"),
-        default_value=np.int32(0),
-        output_shape=np.array([3, 4]).astype("int32"),
-    )
-
-    _test_sparse_to_dense(
-        sparse_indices=np.array([[0, 0, 0], [1, 2, 3]]).astype("int32"),
-        sparse_values=np.array([1, 2]).astype("int32"),
-        default_value=np.int32(4),
-        output_shape=np.array([2, 3, 4]).astype("int32"),
-    )
-
-    # floats
-    _test_sparse_to_dense(
-        sparse_indices=np.array([0, 1, 4]).astype("int32"),
-        sparse_values=np.array([3.1, 3.1, 3.1]).astype("float32"),
-        default_value=np.float32(3.5),
-        output_shape=np.array([5]).astype("int32"),
-    )
-
-    # default value not specified
-    _test_sparse_to_dense(
-        sparse_indices=np.array([0, 1, 4]).astype("int32"),
-        sparse_values=np.array([3.1, 3.1, 3.1]).astype("float32"),
-        default_value=None,
-        output_shape=np.array([5]).astype("int32"),
-    )
-
-
 #######################################################################
 # infinity ops
 # ------------

From b77c019bcee21b4d5ac8601c0b1ed35613db8462 Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Thu, 25 Feb 2021 08:48:29 -0500
Subject: [PATCH 235/357] [DOCS] Remove incubating from docs (#7525)

---
 DISCLAIMER | 12 ------------
 NOTICE     |  4 ++--
 README.md  |  2 +-
 3 files changed, 3 insertions(+), 15 deletions(-)
 delete mode 100644 DISCLAIMER

diff --git a/DISCLAIMER b/DISCLAIMER
deleted file mode 100644
index 986b2c84f6b4..000000000000
--- a/DISCLAIMER
+++ /dev/null
@@ -1,12 +0,0 @@
-Apache TVM (incubating) is an effort undergoing incubation at The
-Apache Software Foundation (ASF), sponsored by the Apache Incubator PMC.
-
-Incubation is required of all newly accepted
-projects until a further review indicates that the
-infrastructure, communications, and decision making process have
-stabilized in a manner consistent with other successful ASF
-projects.
-
-While incubation status is not necessarily a reflection
-of the completeness or stability of the code, it does indicate
-that the project has yet to be fully endorsed by the ASF.
diff --git a/NOTICE b/NOTICE
index edb1bd250000..a4b747830dcf 100644
--- a/NOTICE
+++ b/NOTICE
@@ -1,5 +1,5 @@
-Apache TVM (incubating)
-Copyright 2019-2020 The Apache Software Foundation
+Apache TVM
+Copyright 2019-2021 The Apache Software Foundation
 
 This product includes software developed at
 The Apache Software Foundation (http://www.apache.org/).
diff --git a/README.md b/README.md
index 13a04f66d5aa..ac4ed62524b1 100644
--- a/README.md
+++ b/README.md
@@ -25,7 +25,7 @@
 [![Build Status](https://ci.tlcpack.ai/buildStatus/icon?job=tvm/main)](https://ci.tlcpack.ai/job/tvm/job/main/)
 [![WinMacBuild](https://github.com/apache/tvm/workflows/WinMacBuild/badge.svg)](https://github.com/apache/tvm/actions?query=workflow%3AWinMacBuild)
 
-Apache TVM (incubating) is a compiler stack for deep learning systems. It is designed to close the gap between the
+Apache TVM is a compiler stack for deep learning systems. It is designed to close the gap between the
 productivity-focused deep learning frameworks, and the performance- and efficiency-focused hardware backends.
 TVM works with deep learning frameworks to provide end to end compilation to different backends.
 

From 772fa6bda3a6244514d8a5d6637d43815d8f9a14 Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Thu, 25 Feb 2021 09:53:41 -0500
Subject: [PATCH 236/357] [PYTHON] Enable proper error message in python
 package (#7521)

---
 python/tvm/__init__.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/python/tvm/__init__.py b/python/tvm/__init__.py
index c2b4fdb2d00e..7a5f553ccdd5 100644
--- a/python/tvm/__init__.py
+++ b/python/tvm/__init__.py
@@ -68,6 +68,11 @@
 from .contrib import rocm as _rocm, nvcc as _nvcc, sdaccel as _sdaccel
 
 
+# NOTE: This file should be python2 compatible so we can
+# raise proper error message when user run the package using
+# an older version of the python
+
+
 def _should_print_backtrace():
     in_pytest = "PYTEST_CURRENT_TEST" in os.environ
     tvm_backtrace = os.environ.get("TVM_BACKTRACE", "0")
@@ -76,7 +81,7 @@ def _should_print_backtrace():
         tvm_backtrace = bool(int(tvm_backtrace))
     except ValueError:
         raise ValueError(
-            f"invalid value for TVM_BACKTRACE `{tvm_backtrace}`, please set to 0 or 1."
+            "invalid value for TVM_BACKTRACE {}, please set to 0 or 1.".format(tvm_backtrace)
         )
 
     return in_pytest or tvm_backtrace

From b1116954f532d869a7ce8d9eb24745f368b66e59 Mon Sep 17 00:00:00 2001
From: Andrew Reusch <areusch@octoml.ai>
Date: Thu, 25 Feb 2021 10:27:06 -0800
Subject: [PATCH 237/357] Introduce module_loader to AutoTVM. (#7337)

* Introduce code_loader to AutoTVM.

 * Prepares for autotuning with microTVM, and provides extension hook
   for VTA.

* add vta hook

* git-black

* pylint

* Add missing import

* Fix import problem

* add missing import

* rename code_loader to module_loader

* rename remote_kw to remote_kwargs

* black format
---
 python/tvm/autotvm/measure/__init__.py        |   8 +-
 python/tvm/autotvm/measure/measure_methods.py | 138 +++++++++++-------
 vta/python/vta/__init__.py                    |   1 +
 vta/python/vta/autotvm.py                     |  52 +++++++
 vta/tutorials/autotvm/tune_relay_vta.py       |   1 +
 5 files changed, 150 insertions(+), 50 deletions(-)
 create mode 100644 vta/python/vta/autotvm.py

diff --git a/python/tvm/autotvm/measure/__init__.py b/python/tvm/autotvm/measure/__init__.py
index 0c32ae0ca9bf..c4c0dc92b116 100644
--- a/python/tvm/autotvm/measure/__init__.py
+++ b/python/tvm/autotvm/measure/__init__.py
@@ -23,6 +23,12 @@
     measure_option,
     create_measure_batch,
 )
-from .measure_methods import LocalBuilder, LocalRunner, RPCRunner, request_remote
+from .measure_methods import (
+    LocalBuilder,
+    LocalRunner,
+    RPCRunner,
+    default_module_loader,
+    request_remote,
+)
 from .executor import Executor
 from .local_executor import LocalExecutor
diff --git a/python/tvm/autotvm/measure/measure_methods.py b/python/tvm/autotvm/measure/measure_methods.py
index ffe4b97e33db..62fd811dc1ec 100644
--- a/python/tvm/autotvm/measure/measure_methods.py
+++ b/python/tvm/autotvm/measure/measure_methods.py
@@ -22,11 +22,13 @@
 remote devices, recording the running time costs, and checking the correctness of the output.
 """
 
+import contextlib
 import logging
 import shutil
 import os
 import threading
 import time
+import typing
 from random import getrandbits
 from collections import namedtuple
 import tempfile
@@ -199,6 +201,9 @@ class RPCRunner(Runner):
         its actual latency during end-to-end inference.
         To make this option effective, the argument `number` should also be set to 1.
         This is only has effect on CPU task.
+    module_loader : ModuleLoader
+        If given, a context manager that loads the module to be timed into the remote runtime.
+        If not given, default_module_loader is used.
     """
 
     def __init__(
@@ -214,6 +219,7 @@ def __init__(
         min_repeat_ms=0,
         cooldown_interval=0.1,
         enable_cpu_cache_flush=False,
+        module_loader=None,
     ):
         super(RPCRunner, self).__init__(timeout, n_parallel)
 
@@ -229,6 +235,7 @@ def __init__(
 
         self.enable_cpu_cache_flush = enable_cpu_cache_flush
         self.cooldown_interval = cooldown_interval
+        self.module_loader = module_loader
 
         self.executor = LocalExecutor(timeout=timeout * (self.n_parallel + 1))
 
@@ -280,6 +287,11 @@ def run(self, measure_inputs, build_results):
             for measure_inp, build_res in zip(
                 measure_inputs[i : i + self.n_parallel], build_results[i : i + self.n_parallel]
             ):
+                module_loader = (
+                    self.module_loader
+                    if self.module_loader is not None
+                    else default_module_loader()
+                )
                 ret = self.executor.submit(
                     run_through_rpc,
                     measure_inp,
@@ -290,6 +302,7 @@ def run(self, measure_inputs, build_results):
                     self.cooldown_interval,
                     remote_args,
                     self.enable_cpu_cache_flush,
+                    module_loader,
                 )
                 futures.append(ret)
 
@@ -352,6 +365,7 @@ def __init__(
         min_repeat_ms=0,
         cooldown_interval=0.1,
         enable_cpu_cache_flush=False,
+        module_loader=None,
     ):
         super(LocalRunner, self).__init__(
             "",
@@ -365,6 +379,7 @@ def __init__(
             min_repeat_ms=min_repeat_ms,
             cooldown_interval=cooldown_interval,
             enable_cpu_cache_flush=enable_cpu_cache_flush,
+            module_loader=module_loader,
         )
         self.tracker = None
         self.server = None
@@ -473,6 +488,11 @@ def __call__(self, measure_input, tmp_dir, **kwargs):
         return BuildResult(filename, arg_info, None, time.time() - tic)
 
 
+ModuleLoader = typing.Callable[
+    [dict, dict], typing.ContextManager[typing.Tuple[tvm.rpc.RPCSession, tvm.runtime.Module]]
+]
+
+
 def run_through_rpc(
     measure_input,
     build_result,
@@ -480,8 +500,9 @@ def run_through_rpc(
     repeat,
     min_repeat_ms,
     cooldown_interval,
-    remote_args,
+    remote_kwargs,
     enable_cpu_cache_flush=False,
+    module_loader=None,
 ):
     """Run a generated library through rpc
 
@@ -509,14 +530,16 @@ def run_through_rpc(
         will be automatically increased.
     cooldown_interval: float
         The cool down interval between two measurements
-    remote_args: Tuple
-        The argument for request_remote
+    remote_kwargs: dict
+        Passed to module_loader(). Ultimately, keyword args to request_remote().
     enable_cpu_cache_flush: bool
         Whether to flush cache on CPU between repeated measurements.
         Flushing cache can make the measured latency of one operator closer to
         its actual latency during end-to-end inference.
         To make this option effective, the argument `number` should also be set to 1.
         This is only has effect on CPU task.
+    module_loader: ModuleLoader
+        A function that returns a ContextManager used to establish and teardown the remote session.
     """
     if isinstance(build_result, MeasureResult):
         return build_result
@@ -525,55 +548,38 @@ def run_through_rpc(
     errno = MeasureErrorNo.NO_ERROR
     try:
         # upload built module
-        remote = request_remote(*remote_args)
-        # Program the FPGA every single time when targeting VTA
-        if (
-            hasattr(measure_input.target, "device_name")
-            and measure_input.target.device_name == "vta"
-        ):
-            # pylint: disable=import-outside-toplevel
-            from vta import program_fpga, reconfig_runtime
-
-            program_fpga(remote, None)
-            reconfig_runtime(remote)
-        remote.upload(build_result.filename)
-        func = remote.load_module(os.path.split(build_result.filename)[1])
-        ctx = remote.context(str(measure_input.target), 0)
-
-        # Limitation:
-        # We can not get PackFunction directly in the remote mode as it is wrapped
-        # under the std::function. We could lift the restriction later once we fold
-        # the PackedFunc as an object. Currently, we pass function name to work
-        # around it.
-        f_prepare = "cache_flush_cpu_non_first_arg" if enable_cpu_cache_flush else ""
-        time_f = func.time_evaluator(
-            func.entry_name,
-            ctx,
-            number=number,
-            repeat=repeat,
-            min_repeat_ms=min_repeat_ms,
-            f_preproc=f_prepare,
-        )
-
-        try:
-            random_fill = remote.get_function("tvm.contrib.random.random_fill")
-        except AttributeError:
-            raise AttributeError(
-                "Please make sure USE_RANDOM is ON in the config.cmake " "on the remote devices"
+        with module_loader(remote_kwargs, build_result) as (remote, mod):
+            ctx = remote.context(str(measure_input.target), 0)
+
+            # Limitation:
+            # We can not get PackFunction directly in the remote mode as it is wrapped
+            # under the std::function. We could lift the restriction later once we fold
+            # the PackedFunc as an object. Currently, we pass function name to work
+            # around it.
+            f_prepare = "cache_flush_cpu_non_first_arg" if enable_cpu_cache_flush else ""
+            time_f = mod.time_evaluator(
+                mod.entry_name,
+                ctx,
+                number=number,
+                repeat=repeat,
+                min_repeat_ms=min_repeat_ms,
+                f_preproc=f_prepare,
             )
-        args = [nd.array(np.zeros(x[0], dtype=x[1]), ctx=ctx) for x in build_result.arg_info]
-        if "scatter" not in measure_input.task.name:
-            # the index tensor of scatter op cannot be randomly initialized
-            for arg in args:
-                random_fill(arg)
-        ctx.sync()
 
-        costs = time_f(*args).results
+            try:
+                random_fill = remote.get_function("tvm.contrib.random.random_fill")
+            except AttributeError:
+                raise AttributeError(
+                    "Please make sure USE_RANDOM is ON in the config.cmake " "on the remote devices"
+                )
+            args = [nd.array(np.zeros(x[0], dtype=x[1]), ctx=ctx) for x in build_result.arg_info]
+            if "scatter" not in measure_input.task.name:
+                # the index tensor of scatter op cannot be randomly initialized
+                for arg in args:
+                    random_fill(arg)
+            ctx.sync()
 
-        # clean up remote files
-        remote.remove(build_result.filename)
-        remote.remove(os.path.splitext(build_result.filename)[0] + ".so")
-        remote.remove("")
+            costs = time_f(*args).results
 
         if len(costs) > 2:  # remove largest and smallest value to reduce variance
             costs = list(costs)
@@ -592,6 +598,40 @@ def run_through_rpc(
     return MeasureResult(costs, errno, tstamp - tic + build_result.time_cost, tstamp)
 
 
+def default_module_loader(pre_load_function=None):
+    """Returns a default function that can be passed as module_loader to run_through_rpc.
+
+    Parameters
+    ----------
+    pre_load_function : Optional[Function[tvm.rpc.Session, tvm.runtime.Module]]
+        Invoked after a session is established and before the default code-loading RPC calls are
+        issued. Allows performing pre-upload actions, e.g. resetting the remote runtime environment.
+
+    Returns
+    -------
+    ModuleLoader :
+        A function that can be passed as module_loader to run_through_rpc.
+    """
+
+    @contextlib.contextmanager
+    def default_module_loader_mgr(remote_kwargs, build_result):
+        remote = request_remote(**remote_kwargs)
+        if pre_load_function is not None:
+            pre_load_function(remote, build_result)
+
+        remote.upload(build_result.filename)
+        try:
+            yield remote, remote.load_module(os.path.split(build_result.filename)[1])
+
+        finally:
+            # clean up remote files
+            remote.remove(build_result.filename)
+            remote.remove(os.path.splitext(build_result.filename)[0] + ".so")
+            remote.remove("")
+
+    return default_module_loader_mgr
+
+
 def request_remote(device_key, host=None, port=None, priority=1, timeout=60):
     """Request a remote session
 
diff --git a/vta/python/vta/__init__.py b/vta/python/vta/__init__.py
index d143c4db6884..5fce76808c45 100644
--- a/vta/python/vta/__init__.py
+++ b/vta/python/vta/__init__.py
@@ -22,6 +22,7 @@
 """
 import sys
 
+from .autotvm import module_loader
 from .bitstream import get_bitstream_path, download_bitstream
 from .environment import get_env, Environment
 from .rpc_client import reconfig_runtime, program_fpga
diff --git a/vta/python/vta/autotvm.py b/vta/python/vta/autotvm.py
new file mode 100644
index 000000000000..9aa7390f238f
--- /dev/null
+++ b/vta/python/vta/autotvm.py
@@ -0,0 +1,52 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""Defines AutoTVM components used with VTA."""
+
+from tvm.autotvm.measure import default_module_loader
+from . import rpc_client
+
+
+def module_loader(bitstream=None):
+    """Construct a ModuleLoader implementation specialized for VTA.
+
+    Parameters
+    ----------
+    bitsream : Optional[str]
+        Path to the bitstream to write prior to uploading code.
+
+    Returns
+    -------
+    ModuleLoader :
+        The ModuleLoader instance.
+    """
+
+    def reprogram_fpga(remote, _build_result):
+        """default_module_loader callback which reprograms the FPGA.
+
+        Parameters
+        ----------
+        remote : tvm.rpc.RPCSession
+            RPC session established to the remote device.
+
+        _build_result : tvm.autotvm.measure.measure_methods.BuildResult
+            Artifact from the build phase, unused here.
+        """
+        rpc_client.program_bitstream(remote, bitstream)
+        rpc_client.reconfig_runtime(remote)
+
+    return default_module_loader(reprogram_fpga)
diff --git a/vta/tutorials/autotvm/tune_relay_vta.py b/vta/tutorials/autotvm/tune_relay_vta.py
index c5885b65c0f3..ed2671c75ae8 100644
--- a/vta/tutorials/autotvm/tune_relay_vta.py
+++ b/vta/tutorials/autotvm/tune_relay_vta.py
@@ -215,6 +215,7 @@ def compile_network(env, target, model, start_pack, stop_pack):
             port=tracker_port,
             number=5,
             timeout=60,
+            module_loader=vta.module_loader(),
             # check_correctness=True, # TODO: re-enable when check_correctness works again.
         ),
     ),

From 43b15a8cafdea9378deb5ab879e1c7ac7e5f3336 Mon Sep 17 00:00:00 2001
From: Robert Kimball <bobkimball@gmail.com>
Date: Thu, 25 Feb 2021 11:27:49 -0800
Subject: [PATCH 238/357] Many fixes to get unit tests passing on Windows.
 (#7431)

---
 CMakeLists.txt                                |  6 ++
 apps/cpp_rpc/CMakeLists.txt                   | 15 +++--
 cmake/modules/LibInfo.cmake                   |  1 +
 cmake/utils/FindLLVM.cmake                    |  2 +-
 conda/build-environment.yaml                  |  1 +
 .../auto_scheduler/cost_model/xgb_model.py    |  4 +-
 python/tvm/contrib/cc.py                      |  9 ++-
 python/tvm/contrib/nvcc.py                    |  6 ++
 .../search_policy/sketch_policy.cc            |  2 +-
 src/support/libinfo.cc                        |  7 ++-
 src/target/source/codegen_c_host.cc           |  1 +
 src/target/source/codegen_cuda.cc             | 56 ++++++++++---------
 tests/python/conftest.py                      | 42 ++++++++++++++
 .../{test_common.py => test_tvmc_common.py}   |  0
 ...auto_scheduler_layout_rewrite_networks.py} |  0
 .../test_auto_scheduler_cost_model.py         | 13 +++--
 tests/python/unittest/test_crt.py             |  4 +-
 .../python/unittest/test_custom_datatypes.py  | 15 +++--
 tests/python/unittest/test_micro_artifact.py  |  3 +
 19 files changed, 135 insertions(+), 52 deletions(-)
 create mode 100644 tests/python/conftest.py
 rename tests/python/driver/tvmc/{test_common.py => test_tvmc_common.py} (100%)
 rename tests/python/relay/{test_auto_scheduler_layout_rewrite.py => test_auto_scheduler_layout_rewrite_networks.py} (100%)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 769a35318d9d..56170c693e3c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -130,6 +130,12 @@ if(MSVC)
   add_compile_options(/wd4180)
   # DLL interface warning in c++
   add_compile_options(/wd4251)
+  # destructor was implicitly defined as deleted
+  add_compile_options(/wd4624)
+  # unary minus operator applied to unsigned type, result still unsigned
+  add_compile_options(/wd4146)
+  # 'inline': used more than once
+  add_compile_options(/wd4141)
 else(MSVC)
   set(WARNING_FLAG -Wall)
   if ("${CMAKE_BUILD_TYPE}" STREQUAL "Debug")
diff --git a/apps/cpp_rpc/CMakeLists.txt b/apps/cpp_rpc/CMakeLists.txt
index ad8ae1488498..ccac53fc3ca0 100644
--- a/apps/cpp_rpc/CMakeLists.txt
+++ b/apps/cpp_rpc/CMakeLists.txt
@@ -1,4 +1,6 @@
-set(TVM_RPC_SOURCES 
+cmake_policy(SET CMP0069 NEW) # suppress cmake warning about IPO
+
+set(TVM_RPC_SOURCES
   main.cc
   rpc_env.cc
   rpc_server.cc
@@ -11,7 +13,12 @@ endif()
 # Set output to same directory as the other TVM libs
 set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})
 add_executable(tvm_rpc ${TVM_RPC_SOURCES})
-set_property(TARGET tvm_rpc PROPERTY INTERPROCEDURAL_OPTIMIZATION_RELEASE TRUE)
+
+include(CheckIPOSupported)
+check_ipo_supported(RESULT result OUTPUT output)
+if(result)
+  set_property(TARGET tvm_rpc PROPERTY INTERPROCEDURAL_OPTIMIZATION_RELEASE TRUE)
+endif()
 
 if(WIN32)
   target_compile_definitions(tvm_rpc PUBLIC -DNOMINMAX)
@@ -35,5 +42,5 @@ target_include_directories(
   PUBLIC DLPACK_PATH
   PUBLIC DMLC_PATH
 )
- 
-target_link_libraries(tvm_rpc tvm_runtime)
\ No newline at end of file
+
+target_link_libraries(tvm_rpc tvm_runtime)
diff --git a/cmake/modules/LibInfo.cmake b/cmake/modules/LibInfo.cmake
index deaa6d9d8362..131dceeb345d 100644
--- a/cmake/modules/LibInfo.cmake
+++ b/cmake/modules/LibInfo.cmake
@@ -75,6 +75,7 @@ function(add_lib_info src_file)
     TVM_INFO_USE_ARM_COMPUTE_LIB="${USE_ARM_COMPUTE_LIB}"
     TVM_INFO_USE_ARM_COMPUTE_LIB_GRAPH_RUNTIME="${USE_ARM_COMPUTE_LIB_GRAPH_RUNTIME}"
     TVM_INFO_INDEX_DEFAULT_I64="${INDEX_DEFAULT_I64}"
+    TVM_CXX_COMPILER_PATH="${CMAKE_CXX_COMPILER}"
   )
 
 endfunction()
diff --git a/cmake/utils/FindLLVM.cmake b/cmake/utils/FindLLVM.cmake
index b8c5bf815bf5..9fc4df24b813 100644
--- a/cmake/utils/FindLLVM.cmake
+++ b/cmake/utils/FindLLVM.cmake
@@ -120,7 +120,7 @@ macro(find_llvm use_llvm)
     string(STRIP ${TVM_LLVM_VERSION} TVM_LLVM_VERSION)
     # definitions
     string(REGEX MATCHALL "(^| )-D[A-Za-z0-9_]*" __llvm_defs ${__llvm_cxxflags})
-    set(LLVM_DEFINTIIONS "")
+    set(LLVM_DEFINITIONS "")
     foreach(__flag IN ITEMS ${__llvm_defs})
       string(STRIP "${__flag}" __llvm_def)
       list(APPEND LLVM_DEFINITIONS "${__llvm_def}")
diff --git a/conda/build-environment.yaml b/conda/build-environment.yaml
index 31b39bfafcd0..7c7831e25b1b 100644
--- a/conda/build-environment.yaml
+++ b/conda/build-environment.yaml
@@ -35,3 +35,4 @@ dependencies:
   - bzip2
   - make
   - scipy
+  - pillow
diff --git a/python/tvm/auto_scheduler/cost_model/xgb_model.py b/python/tvm/auto_scheduler/cost_model/xgb_model.py
index aab36c175c3c..3cf65954be7f 100644
--- a/python/tvm/auto_scheduler/cost_model/xgb_model.py
+++ b/python/tvm/auto_scheduler/cost_model/xgb_model.py
@@ -116,11 +116,13 @@ def __init__(
             if xgb is None:
                 xgb = __import__("xgboost")
         except ImportError:
+            # add "from Node" to silence
+            # "During handling of the above exception, another exception occurred"
             raise ImportError(
                 "XGBoost is required for XGBModel. "
                 "Please install its python package first. "
                 "Help: (https://xgboost.readthedocs.io/en/latest/) "
-            )
+            ) from None
 
         self.xgb_params = {
             "max_depth": 10,
diff --git a/python/tvm/contrib/cc.py b/python/tvm/contrib/cc.py
index 9643d9b650fd..59a1d11216ee 100644
--- a/python/tvm/contrib/cc.py
+++ b/python/tvm/contrib/cc.py
@@ -47,7 +47,7 @@ def create_shared(output, objects, options=None, cc="g++"):
     ):
         _linux_compile(output, objects, options, cc, compile_shared=True)
     elif sys.platform == "win32":
-        _windows_shared(output, objects, options)
+        _windows_compile(output, objects, options)
     else:
         raise ValueError("Unsupported platform")
 
@@ -71,6 +71,8 @@ def create_executable(output, objects, options=None, cc="g++"):
     """
     if sys.platform == "darwin" or sys.platform.startswith("linux"):
         _linux_compile(output, objects, options, cc)
+    elif sys.platform == "win32":
+        _windows_compile(output, objects, options)
     else:
         raise ValueError("Unsupported platform")
 
@@ -212,9 +214,9 @@ def _linux_compile(output, objects, options, compile_cmd="g++", compile_shared=F
         raise RuntimeError(msg)
 
 
-def _windows_shared(output, objects, options):
+def _windows_compile(output, objects, options):
     cmd = ["clang"]
-    cmd += ["-O2", "-flto=full", "-fuse-ld=lld-link"]
+    cmd += ["-O2"]
 
     if output.endswith(".so") or output.endswith(".dll"):
         cmd += ["-shared"]
@@ -240,6 +242,7 @@ def _windows_shared(output, objects, options):
         )
     if proc.returncode != 0:
         msg = "Compilation error:\n"
+        msg += " ".join(cmd) + "\n"
         msg += py_str(out)
 
         raise RuntimeError(msg)
diff --git a/python/tvm/contrib/nvcc.py b/python/tvm/contrib/nvcc.py
index 5886760934fb..2a97b0b31d1e 100644
--- a/python/tvm/contrib/nvcc.py
+++ b/python/tvm/contrib/nvcc.py
@@ -89,6 +89,12 @@ def compile_cuda(code, target="ptx", arch=None, options=None, path_target=None):
     cmd += ["-o", file_target]
     cmd += [temp_code]
 
+    cxx_compiler_path = tvm.support.libinfo().get("TVM_CXX_COMPILER_PATH")
+    if cxx_compiler_path != "":
+        # This tells nvcc where to find the c++ compiler just in case it is not in the path.
+        # On Windows it is not in the path by default.
+        cmd += ["-ccbin", cxx_compiler_path]
+
     proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
 
     (out, _) = proc.communicate()
diff --git a/src/auto_scheduler/search_policy/sketch_policy.cc b/src/auto_scheduler/search_policy/sketch_policy.cc
index 91721afdba74..4a4ab18b5eed 100644
--- a/src/auto_scheduler/search_policy/sketch_policy.cc
+++ b/src/auto_scheduler/search_policy/sketch_policy.cc
@@ -519,7 +519,7 @@ Array<State> SketchPolicyNode::EvolutionarySearch(const Array<State>& init_popul
   // auxiliary global variables
   std::vector<float> pop_scores;
   std::vector<double> pop_selection_probs;
-  float max_score = -1e-10;
+  float max_score = -1e-10f;
   pop_scores.reserve(population);
   pop_selection_probs.reserve(population);
   std::uniform_real_distribution<> dis(0.0, 1.0);
diff --git a/src/support/libinfo.cc b/src/support/libinfo.cc
index c8aa76b9d1f5..0f394f50fe71 100644
--- a/src/support/libinfo.cc
+++ b/src/support/libinfo.cc
@@ -208,6 +208,10 @@
 #define TVM_INFO_INDEX_DEFAULT_I64 "NOT-FOUND"
 #endif
 
+#ifndef TVM_CXX_COMPILER_PATH
+#define TVM_CXX_COMPILER_PATH ""
+#endif
+
 namespace tvm {
 
 /*!
@@ -262,7 +266,8 @@ TVM_DLL Map<String, String> GetLibInfo() {
       {"USE_TARGET_ONNX", TVM_INFO_USE_TARGET_ONNX},
       {"USE_ARM_COMPUTE_LIB", TVM_INFO_USE_ARM_COMPUTE_LIB},
       {"USE_ARM_COMPUTE_LIB_GRAPH_RUNTIME", TVM_INFO_USE_ARM_COMPUTE_LIB_GRAPH_RUNTIME},
-      {"INDEX_DEFAULT_I64", TVM_INFO_INDEX_DEFAULT_I64}};
+      {"INDEX_DEFAULT_I64", TVM_INFO_INDEX_DEFAULT_I64},
+      {"TVM_CXX_COMPILER_PATH", TVM_CXX_COMPILER_PATH}};
   return result;
 }
 
diff --git a/src/target/source/codegen_c_host.cc b/src/target/source/codegen_c_host.cc
index bee5441649c5..3ec64ed2ace9 100644
--- a/src/target/source/codegen_c_host.cc
+++ b/src/target/source/codegen_c_host.cc
@@ -44,6 +44,7 @@ void CodeGenCHost::Init(bool output_ssa, bool emit_asserts, std::string target_s
   emit_asserts_ = emit_asserts;
   declared_globals_.clear();
   decl_stream << "// tvm target: " << target_str << "\n";
+  decl_stream << "#define TVM_EXPORTS\n";
   decl_stream << "#include \"tvm/runtime/c_runtime_api.h\"\n";
   decl_stream << "#include \"tvm/runtime/c_backend_api.h\"\n";
   decl_stream << "#include <math.h>\n";
diff --git a/src/target/source/codegen_cuda.cc b/src/target/source/codegen_cuda.cc
index e5547315613f..35b94f55e4e4 100644
--- a/src/target/source/codegen_cuda.cc
+++ b/src/target/source/codegen_cuda.cc
@@ -79,6 +79,20 @@ std::string CodeGenCUDA::Finish() {
     decl_stream << "#include <mma.h>\n";
   }
 
+  decl_stream << "\n#ifdef _WIN32\n";
+  decl_stream << "  using uint = unsigned int;\n";
+  decl_stream << "  using uchar = unsigned char;\n";
+  decl_stream << "  using ushort = unsigned short;\n";
+  decl_stream << "  using int64_t = long long;\n";
+  decl_stream << "  using uint64_t = unsigned long long;\n";
+  decl_stream << "#else\n";
+  decl_stream << "  #define uint unsigned int\n";
+  decl_stream << "  #define uchar unsigned char\n";
+  decl_stream << "  #define ushort unsigned short\n";
+  decl_stream << "  #define int64_t long\n";
+  decl_stream << "  #define uint64_t ulong\n";
+  decl_stream << "#endif\n";
+
   return CodeGenC::Finish();
 }
 
@@ -99,7 +113,7 @@ void CodeGenCUDA::BindThreadIndex(const IterVar& iv) {
 void CodeGenCUDA::PrintType(DataType t, std::ostream& os) {  // NOLINT(*)
   int lanes = t.lanes();
   if (t.is_handle()) {
-    ICHECK_EQ(lanes, 1) << "do not yet support vector types";
+    ICHECK(t.is_scalar()) << "do not yet support vector types";
     os << "void*";
     return;
   }
@@ -108,7 +122,7 @@ void CodeGenCUDA::PrintType(DataType t, std::ostream& os) {  // NOLINT(*)
     switch (t.bits()) {
       case 16:
         enable_fp16_ = true;
-        if (lanes == 1) {
+        if (t.is_scalar()) {
           os << "half";
         } else if (lanes <= 8) {
           // Emit CUDA code to access fp16 vector elements.
@@ -136,7 +150,7 @@ void CodeGenCUDA::PrintType(DataType t, std::ostream& os) {  // NOLINT(*)
         fail = true;
         break;
     }
-    if (!fail && (lanes == 1 || t.bits() == 16)) return;
+    if (!fail && (t.is_scalar() || t.bits() == 16)) return;
     if (!fail && (lanes >= 2 && lanes <= 4)) {
       os << lanes;
       return;
@@ -154,15 +168,11 @@ void CodeGenCUDA::PrintType(DataType t, std::ostream& os) {  // NOLINT(*)
     }
   } else if (t.is_uint() || t.is_int()) {
     if (t.is_uint()) {
-      if (t.lanes() != 1) {
-        os << "u";
-      } else {
-        os << "unsigned ";
-      }
+      os << "u";
     }
     switch (t.bits()) {
       case 1: {
-        if (t.lanes() == 1) {
+        if (t.is_scalar()) {
           os << "int";
           return;
         } else if (t.lanes() == 8) {
@@ -179,7 +189,7 @@ void CodeGenCUDA::PrintType(DataType t, std::ostream& os) {  // NOLINT(*)
         }
       }
       case 4: {
-        if (t.lanes() == 1) {
+        if (t.is_scalar()) {
           os << "int";
           return;
         } else if (t.lanes() == 4) {
@@ -220,7 +230,7 @@ void CodeGenCUDA::PrintType(DataType t, std::ostream& os) {  // NOLINT(*)
           enable_int8_ = true;
           os << "int4";
           return;
-        } else if (!t.is_uint() && t.lanes() == 1) {
+        } else if (!t.is_uint() && t.is_scalar()) {
           os << "signed char";
           break;
         } else {
@@ -235,22 +245,16 @@ void CodeGenCUDA::PrintType(DataType t, std::ostream& os) {  // NOLINT(*)
         os << "int";
         break;
       case 64: {
-        if (sizeof(long) != 8) {  // NOLINT(*)
-          if (t.lanes() == 1) {
-            os << "long long";
-            break;
-          } else if (t.lanes() == 2) {
-            os << "longlong";
-            break;
-          } else {
-            // No longlong3, longlong4
-            LOG(FATAL) << "Cannot convert type " << t << " to CUDA type on a L32 platform";
-            break;
-          }
-        } else {
-          os << "long";
-          break;
+        if (t.is_scalar()) {
+          os << "int64_t";
+        } else if (t.lanes() == 2) {
+          os << "longlong2";
+        } else if (t.lanes() == 3) {
+          os << "longlong3";
+        } else if (t.lanes() == 4) {
+          os << "longlong4";
         }
+        return;
       }
       default:
         fail = true;
diff --git a/tests/python/conftest.py b/tests/python/conftest.py
new file mode 100644
index 000000000000..e8042c8f5095
--- /dev/null
+++ b/tests/python/conftest.py
@@ -0,0 +1,42 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import sys
+import tvm
+
+collect_ignore = []
+if sys.platform.startswith("win"):
+    collect_ignore.append("frontend/caffe")
+    collect_ignore.append("frontend/caffe2")
+    collect_ignore.append("frontend/coreml")
+    collect_ignore.append("frontend/darknet")
+    collect_ignore.append("frontend/keras")
+    collect_ignore.append("frontend/mxnet")
+    collect_ignore.append("frontend/pytorch")
+    collect_ignore.append("frontend/tensorflow")
+    collect_ignore.append("frontend/tflite")
+    collect_ignore.append("frontend/onnx")
+    collect_ignore.append("driver/tvmc/test_autoscheduler.py")
+    collect_ignore.append("unittest/test_auto_scheduler_cost_model.py")  # stack overflow
+    # collect_ignore.append("unittest/test_auto_scheduler_measure.py") # exception ignored
+    collect_ignore.append("unittest/test_auto_scheduler_search_policy.py")  # stack overflow
+    # collect_ignore.append("unittest/test_auto_scheduler_measure.py") # exception ignored
+
+    collect_ignore.append("unittest/test_tir_intrin.py")
+
+if tvm.support.libinfo().get("USE_MICRO", "OFF") != "ON":
+    collect_ignore.append("unittest/test_micro_transport.py")
diff --git a/tests/python/driver/tvmc/test_common.py b/tests/python/driver/tvmc/test_tvmc_common.py
similarity index 100%
rename from tests/python/driver/tvmc/test_common.py
rename to tests/python/driver/tvmc/test_tvmc_common.py
diff --git a/tests/python/relay/test_auto_scheduler_layout_rewrite.py b/tests/python/relay/test_auto_scheduler_layout_rewrite_networks.py
similarity index 100%
rename from tests/python/relay/test_auto_scheduler_layout_rewrite.py
rename to tests/python/relay/test_auto_scheduler_layout_rewrite_networks.py
diff --git a/tests/python/unittest/test_auto_scheduler_cost_model.py b/tests/python/unittest/test_auto_scheduler_cost_model.py
index 36360da45c8d..0b34615583db 100644
--- a/tests/python/unittest/test_auto_scheduler_cost_model.py
+++ b/tests/python/unittest/test_auto_scheduler_cost_model.py
@@ -68,14 +68,15 @@ def test_xgb_model():
     assert rmse <= 0.3
 
     # test loading a record file
-    with tempfile.NamedTemporaryFile() as fp:
-        auto_scheduler.save_records(fp.name, inputs, results)
-        model.update_from_file(fp.name)
+    tmpdir = tvm.contrib.utils.tempdir()
+    tmpfile = tmpdir.relpath("test1")
+    auto_scheduler.save_records(tmpfile, inputs, results)
+    model.update_from_file(tmpfile)
 
     # test model serialization
-    with tempfile.NamedTemporaryFile() as fp:
-        model.save(fp.name)
-        model.load(fp.name)
+    tmpfile = tmpdir.relpath("test2")
+    model.save(tmpfile)
+    model.load(tmpfile)
 
 
 if __name__ == "__main__":
diff --git a/tests/python/unittest/test_crt.py b/tests/python/unittest/test_crt.py
index 4b744b8ee10a..1bd24c931b72 100644
--- a/tests/python/unittest/test_crt.py
+++ b/tests/python/unittest/test_crt.py
@@ -19,7 +19,9 @@
 import copy
 import glob
 import os
-import pty
+import pytest
+
+pytest.importorskip("pty")
 import sys
 import subprocess
 import textwrap
diff --git a/tests/python/unittest/test_custom_datatypes.py b/tests/python/unittest/test_custom_datatypes.py
index 6aad93abd510..75e807456981 100644
--- a/tests/python/unittest/test_custom_datatypes.py
+++ b/tests/python/unittest/test_custom_datatypes.py
@@ -21,7 +21,6 @@
 import tvm.topi.testing
 import numpy as np
 import pytest
-from numpy.random import MT19937, RandomState, SeedSequence
 from tvm import relay
 from tvm.relay.testing.layers import batch_norm_infer
 from tvm.target.datatype import (
@@ -66,7 +65,7 @@ def get_cat_image(dimensions):
 
 # we use a random seed to generate input_data
 # to guarantee stable tests
-rs = RandomState(MT19937(SeedSequence(123456789)))
+np.random.seed(0)
 
 
 def convert_ndarray(dst_dtype, array):
@@ -341,7 +340,7 @@ def check_unary_op(op, src_dtype, dst_dtype, shape):
         t1 = relay.TensorType(shape, src_dtype)
         x = relay.var("x", t1)
         z = op(x)
-        x_data = rs.rand(*shape).astype(t1.dtype)
+        x_data = np.random.rand(*shape).astype(t1.dtype)
 
         module = tvm.IRModule.from_expr(relay.Function([x], z))
 
@@ -372,8 +371,8 @@ def check_binary_op(opfunc, src_dtype, dst_dtype):
         x = relay.var("x", t1)
         y = relay.var("y", t2)
         z = opfunc(x, y)
-        x_data = rs.rand(*shape1).astype(t1.dtype)
-        y_data = rs.rand(*shape2).astype(t2.dtype)
+        x_data = np.random.rand(*shape1).astype(t1.dtype)
+        y_data = np.random.rand(*shape2).astype(t2.dtype)
         module = tvm.IRModule.from_expr(relay.Function([x, y], z))
 
         compare(module, (x_data, y_data), src_dtype, dst_dtype, rtol, atol)
@@ -416,8 +415,8 @@ def run_test_conv2d(
         w = relay.var("w", shape=kshape, dtype=src_dtype)
         y = relay.nn.conv2d(x, w, padding=padding, dilation=dilation, groups=groups, **attrs)
         module = tvm.IRModule.from_expr(relay.Function([x, w], y))
-        data = rs.uniform(-scale, scale, size=dshape).astype(src_dtype)
-        kernel = rs.uniform(-scale, scale, size=kshape).astype(src_dtype)
+        data = np.random.uniform(-scale, scale, size=dshape).astype(src_dtype)
+        kernel = np.random.uniform(-scale, scale, size=kshape).astype(src_dtype)
 
         compare(module, (data, kernel), src_dtype, dst_dtype, rtol, atol)
 
@@ -497,7 +496,7 @@ def run_batchnorm(src_dtype, dst_dtype, rtol=1e-6, atol=1e-6):
     bn = batch_norm_infer(data=x, epsilon=2e-5, scale=False, name="bn_x")
     f = relay.Function(relay.analysis.free_vars(bn), bn)
 
-    x_data = rs.rand(*shape).astype(t.dtype)
+    x_data = np.random.rand(*shape).astype(t.dtype)
     module = tvm.IRModule.from_expr(f)
 
     zero_data = np.zeros((32), "float32")
diff --git a/tests/python/unittest/test_micro_artifact.py b/tests/python/unittest/test_micro_artifact.py
index d757f0956b81..fc180200720d 100644
--- a/tests/python/unittest/test_micro_artifact.py
+++ b/tests/python/unittest/test_micro_artifact.py
@@ -17,6 +17,7 @@
 
 """Unit tests for the artifact module."""
 
+import pytest
 import json
 import os
 import shutil
@@ -24,6 +25,8 @@
 
 from tvm.contrib import utils
 
+pytest.importorskip("tvm.micro")
+from tvm.micro import artifact
 
 FILE_LIST = ["label1", "label2", "label12", "unlabelled"]
 

From 6274a7f56c8966391b643e4c5f9377154205ae62 Mon Sep 17 00:00:00 2001
From: Matthew Brookhart <mbrookhart@octoml.ai>
Date: Thu, 25 Feb 2021 15:36:05 -0700
Subject: [PATCH 239/357] use checked_type instead of type_annotation (#7522)

---
 python/tvm/relay/frontend/onnx.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/tvm/relay/frontend/onnx.py b/python/tvm/relay/frontend/onnx.py
index 1e5dad46782c..58c2dbcad26a 100644
--- a/python/tvm/relay/frontend/onnx.py
+++ b/python/tvm/relay/frontend/onnx.py
@@ -1753,7 +1753,7 @@ def _impl_v7(cls, inputs, attr, params):
         P = inputs[7]
 
         num_directions = infer_shape(W)[0]
-        W_dtype = infer_type(W).type_annotation.dtype
+        W_dtype = infer_type(W).checked_type.dtype
 
         if num_directions != 1:
             raise NotImplementedError("Bidirectional LSTMs not yet supported.")
@@ -1865,7 +1865,7 @@ def _impl_v7(cls, inputs, attr, params):
         linear_before_reset = attr.get("linear_before_reset", 0)
 
         num_directions = infer_shape(W)[0]
-        W_dtype = infer_type(W).type_annotation.dtype
+        W_dtype = infer_type(W).checked_type.dtype
 
         if num_directions != 1:
             raise NotImplementedError("Bidirectional GRUs not yet supported.")

From e664b2ff7686e51468b38287ff92253fa9507bac Mon Sep 17 00:00:00 2001
From: masahi <masahi129@gmail.com>
Date: Fri, 26 Feb 2021 10:40:40 +0900
Subject: [PATCH 240/357] [Torch] Avoid adding unnecessary slicing (#7479)

* simplyfing

* improved fast path for slice

* update rewrite pattern for maskrcnn
---
 python/tvm/relay/frontend/pytorch.py       | 55 ++++++++++------------
 python/tvm/relay/frontend/pytorch_utils.py |  6 +--
 2 files changed, 25 insertions(+), 36 deletions(-)

diff --git a/python/tvm/relay/frontend/pytorch.py b/python/tvm/relay/frontend/pytorch.py
index 205b2aa779e6..931611274c20 100644
--- a/python/tvm/relay/frontend/pytorch.py
+++ b/python/tvm/relay/frontend/pytorch.py
@@ -385,23 +385,28 @@ def tensor_array_concat(lst, axis):
 
     def slice(self, inputs, input_types):
         axis_dtype = "int64"
-        index_size_limit = 2 ** 63 - 1
+        index_size_limit = sys.maxsize
         data = inputs[0]
         dshape = self.infer_shape(data)
         ndim = len(dshape)
-        end = []
-        for dim in dshape:
-            if isinstance(dim, tvm.tir.Any):
-                end = _op.shape_of(data)
-                break
-            end.append(int(dim))
-
-        begin = [0] * ndim
         dim = int(inputs[1])
-        stride = int(inputs[4])
-        begin[dim], _ = try_infer_value(inputs[2], lambda ret: np.asscalar(ret.astype(np.int)))
+        stride = inputs[4]
+
+        target_begin, is_begin_const = try_infer_value(
+            inputs[2], lambda ret: np.asscalar(ret.astype(np.int))
+        )
+        target_end, is_end_const = try_infer_value(
+            inputs[3], lambda ret: np.asscalar(ret.astype(np.int))
+        )
+
+        # A fast path when slicing is nop.
+        if target_begin == 0 and target_end >= index_size_limit and stride == 1:
+            return data
 
         # Process begin
+        begin = [0] * ndim
+        begin[dim] = target_begin
+
         if not isinstance(begin[dim], int):
             tmp = []
             for b in begin:
@@ -414,27 +419,15 @@ def slice(self, inputs, input_types):
             if str(btype) != axis_dtype:
                 begin = _op.cast(begin, axis_dtype)
 
-        if isinstance(inputs[3], str) and inputs[3].isdigit():
-            target_end = int(inputs[3])
+        # Process end
+        if isinstance(target_end, int) and target_end >= index_size_limit:
+            target_end = dshape[dim]
+
+        if any([isinstance(d, tvm.tir.Any) for d in dshape]):
+            end = _op.shape_of(data)
         else:
-            if isinstance(inputs[3], _expr.Expr):
-                target_end, _ = try_infer_value(
-                    inputs[3], lambda ret: np.asscalar(ret.astype(np.int))
-                )
-            else:
-                target_end = inputs[3]
-
-            if isinstance(target_end, int) and target_end >= index_size_limit:
-                # Quick path for original data.
-                if (
-                    isinstance(begin, _expr.Constant)
-                    and begin.data.asnumpy().tolist()[dim] == 0
-                    and stride == 1
-                ):
-                    return data
-                target_end = dshape[dim]
+            end = dshape
 
-        # Process end
         if isinstance(target_end, int):
             if isinstance(end, list):
                 end[dim] = target_end
@@ -474,7 +467,7 @@ def slice(self, inputs, input_types):
                 end = _op.cast(end, axis_dtype)
 
         strides = [1] * ndim
-        strides[dim] = int(inputs[4])
+        strides[dim] = stride
 
         return _op.transform.strided_slice(
             data, begin=begin, end=end, strides=strides, slice_mode="end"
diff --git a/python/tvm/relay/frontend/pytorch_utils.py b/python/tvm/relay/frontend/pytorch_utils.py
index 248f5354cfbb..02b2484d4fb7 100644
--- a/python/tvm/relay/frontend/pytorch_utils.py
+++ b/python/tvm/relay/frontend/pytorch_utils.py
@@ -97,15 +97,11 @@ def batched_nms(boxes, scores, idxs, iou_threshold):
     add = is_op("add")(mx, one)
     mul = is_op("multiply")(cast, add)
 
-    # The following doesn't appear in the above Relay snippet. It is required for dynamic
-    # stride_slice handling
     shape_of = is_op("shape_of")(mul)
     cast = is_op("cast")(shape_of)
-    # This corresponds to offsets[:, None], where offsets is the result of multiplication
-    dyn_strided_slice = dyn_strided_slice_pattern(mul, cast)
 
     # Add offsets to the boxes
-    expand_dims = is_op("expand_dims")(dyn_strided_slice)
+    expand_dims = is_op("expand_dims")(mul)
     add = is_op("add")(boxes, expand_dims)
 
     # The rest of patterns correspond to the PyTorch frontend conversion

From 63ea8e1fc934229a7fb56cac642a588ff3337e6e Mon Sep 17 00:00:00 2001
From: masahi <masahi129@gmail.com>
Date: Fri, 26 Feb 2021 10:41:13 +0900
Subject: [PATCH 241/357] [Relay] Enforce static dim for non-concat axis if one
 or more tensors have static dim (#7487)

* enforce static dim for non-concat axis

* assign any when all dims are dyn

* add missing case

* simplify

* add test

* only enforce static dim constraint if concat output is dynamic

* more update to concat type rel

* update tests

* fixed compile warning
---
 src/relay/op/tensor/transform.h | 69 +++++++++++++++++++++++++--------
 tests/python/relay/test_any.py  | 21 ++++++++++
 2 files changed, 73 insertions(+), 17 deletions(-)

diff --git a/src/relay/op/tensor/transform.h b/src/relay/op/tensor/transform.h
index 95a83a905908..dbf8537e0dad 100644
--- a/src/relay/op/tensor/transform.h
+++ b/src/relay/op/tensor/transform.h
@@ -101,29 +101,64 @@ bool ConcatenateRel(const Array<Type>& types, int num_inputs, const Attrs& attrs
   }
 
   // Calculate shape
-  std::vector<IndexExpr> oshape(first->shape.begin(), first->shape.end());
-  int data_length = static_cast<int>(tensor_tuple->fields.size());
+  std::vector<IndexExpr> oshape(ndim);
+  const size_t data_length = tensor_tuple->fields.size();
+
+  // Accumulate the concat axis output dim or decide if this is dynamic concat
+  bool is_dynamic_concat = false;
+  std::vector<TensorType> input_tensors;
+  IndexExpr concat_output_dim = first->shape[axis];
+  for (size_t i = 0; i < data_length; ++i) {
+    const auto& e = Downcast<TensorType>(tensor_tuple->fields[i]);
+    input_tensors.push_back(e);
+    if (e->shape[axis].as<AnyNode>()) {
+      is_dynamic_concat = true;
+      concat_output_dim = Any();
+    } else if (i > 0 && !is_dynamic_concat) {
+      // accumulate axis dimension
+      concat_output_dim += e->shape[axis];
+    }
+  }
+
+  oshape[axis] = concat_output_dim;
+
   for (int i = 0; i < ndim; ++i) {
+    if (i == axis) {
+      // The concat axis is already handled above.
+      // The rest of the body sets the output shape for non-concat axes
+      continue;
+    }
     std::vector<IndexExpr> non_any;
-    for (int j = 0; j < data_length; ++j) {
-      const auto& e = Downcast<TensorType>(tensor_tuple->fields[j]);
+    for (size_t j = 0; j < data_length; ++j) {
+      const auto& e = input_tensors[j];
       if (!e->shape[i].as<AnyNode>()) {
         non_any.push_back(e->shape[i]);
-        // accumulate axis dimension
-        if (j > 0 && i == axis && !oshape[i].as<AnyNode>()) {
-          oshape[i] += e->shape[i];
-        }
       }
     }
-    int non_any_size = static_cast<int>(non_any.size());
-    if (non_any_size != data_length) oshape[i] = Any();
-    if (i != axis) {
-      for (int k = 1; k < non_any_size; k++) {
-        if (reporter->AssertEQ(non_any[0], non_any[k])) continue;
-        throw Error(
-            "relay.concatenate requires all tensors have the same shape "
-            "on non-concatenating axes");
-      }
+    size_t non_any_size = non_any.size();
+    for (size_t k = 1; k < non_any_size; k++) {
+      if (reporter->AssertEQ(non_any[0], non_any[k])) continue;
+      throw Error(
+          "relay.concatenate requires all tensors have the same shape "
+          "on non-concatenating axes");
+    }
+
+    if (non_any_size == data_length) {
+      // All static case
+      oshape[i] = non_any[0];
+    } else if (non_any_size > 0 && is_dynamic_concat) {
+      // For non-concat axes, we want to enforce static shape constraint.
+      // However, if the concat axis is static, the output shape would become static while
+      // the input could be partially static/dynamic. To prevent runtime segfaults due to the lack
+      // of runtime input shape checking for such cases, static shape constraint is only enforced
+      // when the output concat axis is dynamic.
+      //
+      // Examples (both concat on the first axis):
+      // * [(?, 3), (?, ?)] -> (?, 3)
+      // * [(1, 3), (1, ?)] -> (2, ?)
+      oshape[i] = non_any[0];
+    } else {
+      oshape[i] = Any();
     }
   }
 
diff --git a/tests/python/relay/test_any.py b/tests/python/relay/test_any.py
index 9d05631a753a..b75cc5f5e750 100644
--- a/tests/python/relay/test_any.py
+++ b/tests/python/relay/test_any.py
@@ -208,6 +208,27 @@ def test_any_concat():
     ref = np.concatenate(x_np, axis=0)
     check_result(x_np, mod, ref)
 
+    def test_oshape(in_vars, axis, oshape):
+        z = relay.op.concatenate(in_vars, axis=axis)
+        mod = tvm.IRModule()
+        mod["main"] = relay.Function(in_vars, z)
+        typed_mod = relay.transform.InferType()(mod)
+        assert typed_mod["main"].body.checked_type == relay.TensorType(oshape, dtype="float32")
+
+    x = [relay.var("x", shape=(relay.Any(), 3), dtype="float32") for _ in range(3)]
+    x.append(relay.var("x", shape=(relay.Any(), relay.Any()), dtype="float32"))
+
+    test_oshape(x, 0, (relay.Any(), 3))
+    test_oshape(x, 1, (relay.Any(), relay.Any()))
+
+    # [(1, 3), (1, ?)] -> (2, ?)
+    x = [
+        relay.var("x", shape=(1, 3), dtype="float32"),
+        relay.var("x", shape=(1, relay.Any()), dtype="float32"),
+    ]
+    test_oshape(x, 0, (2, relay.Any()))
+    test_oshape(x, 1, (1, relay.Any()))
+
 
 def verify_any_reshape(x_shape, newshape, x_np_shape, out_shape, variable_newshape=False):
     x = relay.var("x", shape=x_shape, dtype="float32")

From 09b0c8e6f688d1c25734b6371426972ab1c37183 Mon Sep 17 00:00:00 2001
From: Yanming Wang <yanmingwang01@gmail.com>
Date: Thu, 25 Feb 2021 17:58:39 -0800
Subject: [PATCH 242/357] [Frontend][Tensorflow] Add unique operator (#7441)

* Initial commit of the unique operator

Add unit tests for unique operator

* Add tensorflow unique op

* Refactor unique to use sort-based algorithm

* Change relay.unique test to run only on cpu

* Change topi.unique test to run only on cpu

* Change range to parallel for parallelizable loops

* Add return_counts option for relay.unique and topi.unique, add pytorch frontend

* Fix pylint

* Patch pytorch frontend

* Initial support of topi.cuda.unique

* Refactor to use ir_builder directly

* Modularize adjacent difference

* Refactor to simplify

* Fix typo

* Combine _unique and _unique_with_counts

* Reuse indices_ptr to remove arange_ptr

Co-authored-by: Yanming Wang <yanmwang@amazon.com>
---
 include/tvm/relay/attrs/transform.h           |  12 +
 python/tvm/relay/frontend/pytorch.py          |  19 +
 python/tvm/relay/frontend/tensorflow.py       |  26 ++
 python/tvm/relay/op/_transform.py             |  44 ++
 python/tvm/relay/op/strategy/cuda.py          |  12 +
 python/tvm/relay/op/strategy/generic.py       |  21 +
 python/tvm/relay/op/transform.py              |  54 +++
 python/tvm/topi/__init__.py                   |   1 +
 python/tvm/topi/cuda/__init__.py              |   1 +
 python/tvm/topi/cuda/unique.py                | 396 ++++++++++++++++++
 python/tvm/topi/generic/search.py             |  16 +
 python/tvm/topi/unique.py                     | 297 +++++++++++++
 src/relay/op/tensor/transform.cc              |  47 +++
 tests/python/frontend/pytorch/test_forward.py |  25 +-
 .../frontend/tensorflow/test_forward.py       |  65 +++
 tests/python/relay/test_op_level3.py          |  53 +++
 tests/python/topi/python/test_topi_unique.py  | 111 +++++
 17 files changed, 1199 insertions(+), 1 deletion(-)
 create mode 100644 python/tvm/topi/cuda/unique.py
 create mode 100644 python/tvm/topi/unique.py
 create mode 100644 tests/python/topi/python/test_topi_unique.py

diff --git a/include/tvm/relay/attrs/transform.h b/include/tvm/relay/attrs/transform.h
index 24098b74f3b6..ff344f5e1a85 100644
--- a/include/tvm/relay/attrs/transform.h
+++ b/include/tvm/relay/attrs/transform.h
@@ -452,6 +452,18 @@ struct CumsumAttrs : public tvm::AttrsNode<CumsumAttrs> {
   }
 };
 
+/*! \brief Attributes used in unique operator */
+struct UniqueAttrs : public tvm::AttrsNode<UniqueAttrs> {
+  bool sorted;
+  bool return_counts;
+  TVM_DECLARE_ATTRS(UniqueAttrs, "relay.attrs.UniqueAttrs") {
+    TVM_ATTR_FIELD(sorted).describe("Whether the unique elements are sorted").set_default(true);
+    TVM_ATTR_FIELD(return_counts)
+        .describe("Whether to return an additional tensor with counts of each unique elements")
+        .set_default(false);
+  }
+};  // struct UniqueAttrs
+
 }  // namespace relay
 }  // namespace tvm
 #endif  // TVM_RELAY_ATTRS_TRANSFORM_H_
diff --git a/python/tvm/relay/frontend/pytorch.py b/python/tvm/relay/frontend/pytorch.py
index 931611274c20..679541051e75 100644
--- a/python/tvm/relay/frontend/pytorch.py
+++ b/python/tvm/relay/frontend/pytorch.py
@@ -2157,6 +2157,24 @@ def is_floating_point(self, inputs, input_types):
         is_float = input_type in ["float32", "float64", "float16", "bfloat16"]
         return _expr.const(is_float)
 
+    def unique(self, inputs, input_types):
+        assert len(inputs) == 4
+        [data, is_sorted, return_inverse, return_counts] = inputs
+        if not is_sorted:
+            logging.warning("TVM always assumes sorted=True for torch.unique")
+            is_sorted = True
+        if return_counts:
+            [unique, indices, num_uniq, counts] = _op.unique(
+                data, is_sorted=is_sorted, return_counts=True
+            )
+            unique_sliced = _op.strided_slice(unique, begin=[0], end=num_uniq, slice_mode="size")
+            counts_sliced = _op.strided_slice(counts, begin=[0], end=num_uniq, slice_mode="size")
+            return (unique_sliced, indices, counts_sliced)
+        else:
+            [unique, indices, num_uniq] = _op.unique(data, is_sorted=is_sorted, return_counts=False)
+            unique_sliced = _op.strided_slice(unique, begin=[0], end=num_uniq, slice_mode="size")
+            return (unique_sliced, indices)
+
     # Operator mappings
     def create_convert_map(self):
         self.convert_map = {
@@ -2363,6 +2381,7 @@ def create_convert_map(self):
             "aten::masked_select": self.masked_select,
             "aten::argsort": self.argsort,
             "aten::sort": self.sort,
+            "aten::_unique2": self.unique,
         }
 
     def update_convert_map(self, custom_map):
diff --git a/python/tvm/relay/frontend/tensorflow.py b/python/tvm/relay/frontend/tensorflow.py
index ab98cddd3835..65f18c029441 100644
--- a/python/tvm/relay/frontend/tensorflow.py
+++ b/python/tvm/relay/frontend/tensorflow.py
@@ -2471,6 +2471,30 @@ def _impl(inputs, attr, params, mod):
     return _impl
 
 
+def _unique(return_counts=True):
+    def _impl(inputs, attr, params, mod):
+        assert len(inputs) == 1
+        data = inputs[0]
+        if return_counts:
+            [unique, indices, num_uniq, counts] = _op.unique(
+                data, is_sorted=False, return_counts=True
+            )
+            unique_sliced = _op.strided_slice(unique, begin=[0], end=num_uniq, slice_mode="size")
+            counts_sliced = _op.strided_slice(counts, begin=[0], end=num_uniq, slice_mode="size")
+            return _expr.TupleWrapper(
+                _expr.Tuple([unique_sliced, indices, counts_sliced]),
+                3,
+            )
+        [unique, indices, num_uniq] = _op.unique(data, is_sorted=False, return_counts=False)
+        unique_sliced = _op.strided_slice(unique, begin=[0], end=num_uniq, slice_mode="size")
+        return _expr.TupleWrapper(
+            _expr.Tuple([unique_sliced, indices]),
+            2,
+        )
+
+    return _impl
+
+
 # compatible operators that do NOT require any conversion.
 _identity_list = []
 
@@ -2650,6 +2674,8 @@ def _impl(inputs, attr, params, mod):
     "TopKV2": _topk(),
     "Transpose": _transpose(),
     "TruncateMod": _elemwise("mod"),
+    "Unique": _unique(False),
+    "UniqueWithCounts": _unique(True),
     "Unpack": _unpack(),
     "UnravelIndex": _unravel_index(),
     "Where": _where(),
diff --git a/python/tvm/relay/op/_transform.py b/python/tvm/relay/op/_transform.py
index 01bcf4a6cf60..e9cf3d83eaeb 100644
--- a/python/tvm/relay/op/_transform.py
+++ b/python/tvm/relay/op/_transform.py
@@ -142,6 +142,15 @@ def compute_cumsum(attrs, inputs, output_type):
 _reg.register_strategy("cumsum", strategy.cumsum_strategy)
 _reg.register_shape_func("cumsum", False, elemwise_shape_func)
 
+
+@_reg.register_compute("unique")
+def compute_unique(attrs, inputs, output_type):
+    """Compute definition of unique"""
+    return topi.unique(inputs[0], attrs.sorted, attrs.return_counts)
+
+
+_reg.register_strategy("unique", strategy.unique_strategy)
+
 #####################
 #  Shape functions  #
 #####################
@@ -946,3 +955,38 @@ def where_shape_func(attrs, inputs, _):
     out_shape = _broadcast_shape_tensors(bcast_shape, cond_shape)
 
     return [out_shape]
+
+
+@script
+def _unique_shape(data_shape):
+    unique_shape = output_tensor((1,), "int64")
+    indices_shape = output_tensor((1,), "int64")
+    num_unique_shape = output_tensor((1,), "int64")
+    unique_shape[0] = data_shape[0]
+    indices_shape[0] = data_shape[0]
+    num_unique_shape[0] = int64(1)
+    return (unique_shape, indices_shape, num_unique_shape)
+
+
+@script
+def _unique_with_counts_shape(data_shape):
+    unique_shape = output_tensor((1,), "int64")
+    indices_shape = output_tensor((1,), "int64")
+    num_unique_shape = output_tensor((1,), "int64")
+    counts_shape = output_tensor((1,), "int64")
+    unique_shape[0] = data_shape[0]
+    indices_shape[0] = data_shape[0]
+    num_unique_shape[0] = int64(1)
+    counts_shape[0] = data_shape[0]
+    return (unique_shape, indices_shape, num_unique_shape, counts_shape)
+
+
+@_reg.register_shape_func("unique", False)
+def unique_shape_func(attrs, inputs, _):
+    """
+    Shape func for unique operator.
+    """
+    if attrs.return_counts:
+        return _unique_with_counts_shape(inputs[0])
+    else:
+        return _unique_shape(inputs[0])
diff --git a/python/tvm/relay/op/strategy/cuda.py b/python/tvm/relay/op/strategy/cuda.py
index 20c5f03b9b0b..3abc9c42b659 100644
--- a/python/tvm/relay/op/strategy/cuda.py
+++ b/python/tvm/relay/op/strategy/cuda.py
@@ -1009,3 +1009,15 @@ def cumsum_strategy_cuda(attrs, inputs, out_type, target):
         name="cumsum.cuda",
     )
     return strategy
+
+
+@unique_strategy.register(["cuda", "gpu"])
+def unique_strategy_cuda(attrs, inputs, out_type, target):
+    """unique cuda strategy"""
+    strategy = _op.OpStrategy()
+    strategy.add_implementation(
+        wrap_compute_unique(topi.cuda.unique),
+        wrap_topi_schedule(topi.cuda.schedule_scan),
+        name="unique.cuda",
+    )
+    return strategy
diff --git a/python/tvm/relay/op/strategy/generic.py b/python/tvm/relay/op/strategy/generic.py
index f076176c5d8a..8a2724dfb614 100644
--- a/python/tvm/relay/op/strategy/generic.py
+++ b/python/tvm/relay/op/strategy/generic.py
@@ -1432,3 +1432,24 @@ def cumsum_strategy(attrs, inputs, out_type, target):
         name="cumsum.generic",
     )
     return strategy
+
+
+def wrap_compute_unique(topi_compute):
+    """Wrap unique topi compute"""
+
+    def _compute_unique(attrs, inputs, _):
+        return topi_compute(inputs[0], attrs.sorted, attrs.return_counts)
+
+    return _compute_unique
+
+
+@override_native_generic_func("unique_strategy")
+def unique_strategy(attrs, inputs, out_type, target):
+    """unique generic strategy"""
+    strategy = _op.OpStrategy()
+    strategy.add_implementation(
+        wrap_compute_unique(topi.unique),
+        wrap_topi_schedule(topi.generic.schedule_unique),
+        name="unique.generic",
+    )
+    return strategy
diff --git a/python/tvm/relay/op/transform.py b/python/tvm/relay/op/transform.py
index b676fe742544..c0a0d31478ef 100644
--- a/python/tvm/relay/op/transform.py
+++ b/python/tvm/relay/op/transform.py
@@ -1463,3 +1463,57 @@ def cumsum(data, axis=None, dtype=None, exclusive=None):
         -> [1, 1, 2, 2, 3, 4, 4]
     """
     return _make.cumsum(data, axis, dtype, exclusive)
+
+
+def unique(data, is_sorted=True, return_counts=False):
+    """
+    Find the unique elements of a 1-D tensor. Please note `output` and `counts` are all padded to
+    have the same length of `data` and element with index >= num_unique[0] has undefined value.
+
+    Parameters
+    ----------
+    data : relay.Expr
+        A 1-D tensor of integers.
+
+    sorted : bool
+        Whether to sort the unique elements in ascending order before returning as output.
+
+    return_counts : bool
+        Whether to return the count of each unique element.
+
+    Returns
+    -------
+    output : relay.Expr
+        A 1-D tensor containing the unique elements of the input data tensor.
+
+    indices : relay.Expr
+        A 1-D tensor containing the index of each data element in the output tensor.
+
+    num_unique : relay.Expr
+        A 1-D tensor with size=1 containing the number of unique elements in the input data tensor.
+
+    counts (optional) : relay.Expr
+        A 1-D tensor containing the count of each unique element in the output.
+
+    Examples
+    --------
+    .. code-block:: python
+        [output, indices, num_unique] = unique([4, 5, 1, 2, 3, 3, 4, 5], False, False)
+        output         =  [4, 5, 1, 2, 3, ?, ?, ?]
+        indices        =  [0, 1, 2, 3, 4, 4, 0, 1]
+        num_unique     =  [5]
+
+        [output, indices, num_unique, counts] = unique([4, 5, 1, 2, 3, 3, 4, 5], False, True)
+        output         =  [4, 5, 1, 2, 3, ?, ?, ?]
+        indices        =  [0, 1, 2, 3, 4, 4, 0, 1]
+        num_unique     =  [5]
+        counts         =  [2, 2, 1, 1, 2, ?, ?, ?]
+
+        [output, indices, num_unique] = unique([4, 5, 1, 2, 3, 3, 4, 5], True)
+        output         =  [1, 2, 3, 4, 5, ?, ?, ?]
+        indices        =  [3, 4, 0, 1, 2, 2, 3, 4]
+        num_unique     =  [5]
+    """
+    if return_counts:
+        return TupleWrapper(_make.unique(data, is_sorted, return_counts), 4)
+    return TupleWrapper(_make.unique(data, is_sorted, return_counts), 3)
diff --git a/python/tvm/topi/__init__.py b/python/tvm/topi/__init__.py
index 2b17162048e0..63dc4bd4ab83 100644
--- a/python/tvm/topi/__init__.py
+++ b/python/tvm/topi/__init__.py
@@ -43,6 +43,7 @@
 from .argwhere import *
 from .cumsum import *
 from .einsum import *
+from .unique import *
 from . import generic
 from . import nn
 from . import x86
diff --git a/python/tvm/topi/cuda/__init__.py b/python/tvm/topi/cuda/__init__.py
index bf3582c01d4f..df75c676fad3 100644
--- a/python/tvm/topi/cuda/__init__.py
+++ b/python/tvm/topi/cuda/__init__.py
@@ -58,3 +58,4 @@
 from . import tensorcore_alter_op
 from .argwhere import *
 from .scan import *
+from .unique import *
diff --git a/python/tvm/topi/cuda/unique.py b/python/tvm/topi/cuda/unique.py
new file mode 100644
index 000000000000..02a5cf3bc592
--- /dev/null
+++ b/python/tvm/topi/cuda/unique.py
@@ -0,0 +1,396 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=invalid-name
+"""Unique operator"""
+import tvm
+from tvm import te, tir
+from ...te import hybrid
+from .scan import cumsum
+from .sort import sort, argsort
+from ..utils import ceil_div
+
+
+def _calc_adjacent_diff_ir(data, output, binop=tir.Sub):
+    """Low level IR to calculate adjacent difference in an 1-D array.
+
+    Parameters
+    ----------
+    data : Buffer
+        Input 1-D Buffer.
+
+    output: Buffer
+        A buffer to store adjacent difference, of the same shape as data. The adjacent difference
+        is defined as: output[0] = 0, output[i] = binop(data[i], data[i-1])
+        where i > 0 and i < len(data).
+
+    binop: function, optional
+        A binary associative op to use for calculating adjacent difference. The function takes two
+        TIR expressions and produce a new TIR expression. By default it uses tvm.tir.Sub to
+        compute the adjacent difference.
+    """
+    ib = tir.ir_builder.create()
+    data_ptr = ib.buffer_ptr(data)
+    output_ptr = ib.buffer_ptr(output)
+    batch_size = data.shape[0]
+    max_threads = tir.min(batch_size, tvm.target.Target.current(allow_none=False).max_num_threads)
+    with ib.new_scope():
+        nthread_tx = max_threads
+        nthread_bx = ceil_div(batch_size, max_threads)
+        tx = te.thread_axis("threadIdx.x")
+        bx = te.thread_axis("blockIdx.x")
+        ib.scope_attr(tx, "thread_extent", nthread_tx)
+        ib.scope_attr(bx, "thread_extent", nthread_bx)
+        tid = bx * max_threads + tx
+        with ib.if_scope(tid < batch_size):
+            with ib.if_scope(tid == 0):
+                output_ptr[tid] = 0
+            with ib.else_scope():
+                output_ptr[tid] = tir.Cast(output.dtype, binop(data_ptr[tid], data_ptr[tid - 1]))
+    return ib.get()
+
+
+def _calc_adjacent_diff(data, out_dtype="int32", binop=tir.Sub):
+    """Function calculate adjacent difference in an 1-D array.
+
+    Parameters
+    ----------
+    data : tvm.te.Tensor
+        Input 1-D tensor.
+
+    output_dtype : str
+        The output tensor data type.
+
+    binop: function, optional
+        A binary associative op to use for calculating difference. The function takes two
+        TIR expressions and produce a new TIR expression. By default it uses tvm.tir.Sub to
+        compute the adjacent difference.
+
+    Returns
+    -------
+    output : tvm.te.Tensor
+        1-D tensor storing the adjacent difference of the input tensor. The adjacent difference
+        is defined as: output[0] = 0, output[i] = binop(data[i], data[i-1])
+        where i > 0 and i < len(data).
+    """
+    data_buf = tir.decl_buffer(data.shape, data.dtype, "sorted_data_buf", data_alignment=8)
+    output_buf = tir.decl_buffer(data.shape, out_dtype, "output_buf", data_alignment=8)
+    return te.extern(
+        [data.shape],
+        [data],
+        lambda ins, outs: _calc_adjacent_diff_ir(ins[0], outs[0], binop=binop),
+        dtype=[out_dtype],
+        in_buffers=[data_buf],
+        out_buffers=[output_buf],
+        name="_calc_adjacent_diff",
+        tag="_calc_adjacent_diff_gpu",
+    )
+
+
+@hybrid.script
+def _calc_num_unique(inc_scan):
+    """Helper function to get the number of unique elements fron inc_scan tensor"""
+    output = output_tensor((1,), "int32")
+    for i in bind("threadIdx.x", 1):
+        output[i] = inc_scan[inc_scan.shape[0] - 1] + int32(1)
+    return output
+
+
+def _calc_unique_ir(
+    data, argsorted_indices, inc_scan, index_converter, unique_elements, indices, counts
+):
+    """Low level IR to calculate unique elements, inverse indices, and counts (optional) of
+    unique elements of 1-D array.
+
+    Parameters
+    ----------
+    data : Buffer
+        Input 1-D Buffer.
+
+    argsorted_indices : Buffer
+        A buffer that stores the argsorted indices of the input data.
+
+    inc_scan : Buffer
+        A buffer that stores the inclusive scan of the binary tir.NE adjacent difference
+        of the sorted data.
+
+    index_converter (optional) : Buffer
+        An optional index converter that transforms the unique element index
+        such that new_idx = index_converter[old_idx].
+
+    unique_elements : Buffer
+        A buffer that stores the unique elements.
+
+    indices : Buffer
+        A buffer that stores the the index of each input data element in the unique element array.
+
+    counts (optional) : Buffer
+        A buffer that stores the count of each unique element.
+    """
+    ib = tir.ir_builder.create()
+    data_ptr = ib.buffer_ptr(data)
+    argsorted_indices_ptr = ib.buffer_ptr(argsorted_indices)
+    inc_scan_ptr = ib.buffer_ptr(inc_scan)
+    unique_elements_ptr = ib.buffer_ptr(unique_elements)
+    indices_ptr = ib.buffer_ptr(indices)
+
+    index_converter_ptr = None
+    if isinstance(index_converter, tir.Buffer):
+        index_converter_ptr = ib.buffer_ptr(index_converter)
+
+    if isinstance(counts, tir.Buffer):
+        counts_ptr = ib.buffer_ptr(counts)
+        # use indices_ptr as a tmp buffer to store tids with inc_scan[tid] != inc_scan[tid-1]
+        unique_seq_indices_ptr = ib.buffer_ptr(indices)
+
+    batch_size = data.shape[0]
+    max_threads = tir.min(batch_size, tvm.target.Target.current(allow_none=False).max_num_threads)
+
+    # if need to return counts
+    if isinstance(counts, tir.Buffer):
+        num_unique = inc_scan_ptr[inc_scan.shape[0] - 1] + 1
+        num_elements = data.shape[0]
+        with ib.new_scope():
+            nthread_tx = max_threads
+            nthread_bx = ceil_div(batch_size, max_threads)
+            tx = te.thread_axis("threadIdx.x")
+            bx = te.thread_axis("blockIdx.x")
+            ib.scope_attr(tx, "thread_extent", nthread_tx)
+            ib.scope_attr(bx, "thread_extent", nthread_bx)
+            tid = bx * max_threads + tx
+            with ib.if_scope(tid < batch_size):
+                with ib.if_scope(tid == 0):
+                    unique_seq_indices_ptr[num_unique - 1] = num_elements
+                with ib.else_scope():
+                    with ib.if_scope(inc_scan_ptr[tid] != inc_scan_ptr[tid - 1]):
+                        unique_seq_indices_ptr[inc_scan_ptr[tid] - 1] = tid
+        with ib.new_scope():
+            nthread_tx = max_threads
+            nthread_bx = ceil_div(batch_size, max_threads)
+            tx = te.thread_axis("threadIdx.x")
+            bx = te.thread_axis("blockIdx.x")
+            ib.scope_attr(tx, "thread_extent", nthread_tx)
+            ib.scope_attr(bx, "thread_extent", nthread_bx)
+            tid = bx * max_threads + tx
+            with ib.if_scope(tid < num_unique):
+                unique_idx = tid if not index_converter_ptr else index_converter_ptr[tid]
+                with ib.if_scope(tid == 0):
+                    counts_ptr[unique_idx] = unique_seq_indices_ptr[tid]
+                with ib.else_scope():
+                    counts_ptr[unique_idx] = (
+                        unique_seq_indices_ptr[tid] - unique_seq_indices_ptr[tid - 1]
+                    )
+    # calculate unique elements and inverse indices
+    with ib.new_scope():
+        nthread_tx = max_threads
+        nthread_bx = ceil_div(batch_size, max_threads)
+        tx = te.thread_axis("threadIdx.x")
+        bx = te.thread_axis("blockIdx.x")
+        ib.scope_attr(tx, "thread_extent", nthread_tx)
+        ib.scope_attr(bx, "thread_extent", nthread_bx)
+        tid = bx * max_threads + tx
+        with ib.if_scope(tid < batch_size):
+            data_idx = argsorted_indices_ptr[tid]
+            unique_idx = (
+                inc_scan_ptr[tid]
+                if not index_converter_ptr
+                else index_converter_ptr[inc_scan_ptr[tid]]
+            )
+            indices_ptr[data_idx] = unique_idx
+            with ib.if_scope(tid == 0):
+                unique_elements_ptr[unique_idx] = data_ptr[data_idx]
+            with ib.else_scope():
+                with ib.if_scope(inc_scan_ptr[tid] != inc_scan_ptr[tid - 1]):
+                    unique_elements_ptr[unique_idx] = data_ptr[data_idx]
+    return ib.get()
+
+
+def _calc_first_occurence_ir(argsorted_indices, inc_scan, first_occurence):
+    """Low level IR to calculate the first occurence of each unique element in the input data.
+
+    Parameters
+    ----------
+    argsorted_indices : Buffer
+        A buffer that stores the argsorted indices of the input data.
+
+    inc_scan : Buffer
+        A buffer that stores the inclusive scan of the binary tir.NE adjacent difference
+        of the sorted data.
+
+    first_occurence : Buffer
+        A buffer that stores the first occurence of each unique element in the input data.
+    """
+    ib = tir.ir_builder.create()
+    argsorted_indices_ptr = ib.buffer_ptr(argsorted_indices)
+    inc_scan_ptr = ib.buffer_ptr(inc_scan)
+    first_occurence_ptr = ib.buffer_ptr(first_occurence)
+    batch_size = argsorted_indices.shape[0]
+    max_threads = tir.min(batch_size, tvm.target.Target.current(allow_none=False).max_num_threads)
+    with ib.new_scope():
+        nthread_tx = max_threads
+        nthread_bx = ceil_div(batch_size, max_threads)
+        tx = te.thread_axis("threadIdx.x")
+        bx = te.thread_axis("blockIdx.x")
+        ib.scope_attr(tx, "thread_extent", nthread_tx)
+        ib.scope_attr(bx, "thread_extent", nthread_bx)
+        tid = bx * max_threads + tx
+        with ib.if_scope(tid < batch_size):
+            first_occurence_ptr[tid] = batch_size
+    with ib.new_scope():
+        nthread_tx = max_threads
+        nthread_bx = ceil_div(batch_size, max_threads)
+        tx = te.thread_axis("threadIdx.x")
+        bx = te.thread_axis("blockIdx.x")
+        ib.scope_attr(tx, "thread_extent", nthread_tx)
+        ib.scope_attr(bx, "thread_extent", nthread_bx)
+        tid = bx * max_threads + tx
+        with ib.if_scope(tid < batch_size):
+            with ib.if_scope(tid == 0):
+                first_occurence_ptr[inc_scan_ptr[tid]] = argsorted_indices_ptr[tid]
+            with ib.else_scope():
+                with ib.if_scope(inc_scan_ptr[tid] != inc_scan_ptr[tid - 1]):
+                    first_occurence_ptr[inc_scan_ptr[tid]] = argsorted_indices_ptr[tid]
+    return ib.get()
+
+
+def unique(data, is_sorted=True, return_counts=False):
+    """
+    Find the unique elements of a 1-D tensor. Please note `output` and `counts` are all padded to
+    have the same length of `data` and element with index >= num_unique[0] has undefined value.
+
+    Parameters
+    ----------
+    data : tvm.te.Tensor
+        A 1-D tensor of integers.
+
+    sorted : bool
+        Whether to sort the unique elements in ascending order before returning as output.
+
+    return_counts : bool
+        Whether to return the count of each unique element.
+
+    Returns
+    -------
+    output : tvm.te.Tensor
+        A 1-D tensor containing the unique elements of the input data tensor.
+
+    indices : tvm.te.Tensor
+        A 1-D tensor containing the index of each data element in the output tensor.
+
+    num_unique : tvm.te.Tensor
+        A 1-D tensor with size=1 containing the number of unique elements in the input data tensor.
+
+    counts (optional) : tvm.te.Tensor
+        A 1-D tensor containing the count of each unique element in the output.
+
+    Examples
+    --------
+    .. code-block:: python
+        [output, indices, num_unique] = unique([4, 5, 1, 2, 3, 3, 4, 5], False, False)
+        output         =  [4, 5, 1, 2, 3, ?, ?, ?]
+        indices        =  [0, 1, 2, 3, 4, 4, 0, 1]
+        num_unique     =  [5]
+
+        [output, indices, num_unique, counts] = unique([4, 5, 1, 2, 3, 3, 4, 5], False, True)
+        output         =  [4, 5, 1, 2, 3, ?, ?, ?]
+        indices        =  [0, 1, 2, 3, 4, 4, 0, 1]
+        num_unique     =  [5]
+        counts         =  [2, 2, 1, 1, 2, ?, ?, ?]
+
+        [output, indices, num_unique] = unique([4, 5, 1, 2, 3, 3, 4, 5], True)
+        output         =  [1, 2, 3, 4, 5, ?, ?, ?]
+        indices        =  [3, 4, 0, 1, 2, 2, 3, 4]
+        num_unique     =  [5]
+    """
+    sorted_data = sort(data)
+    argsorted_indices = argsort(data, dtype="int32")
+    # adjacent difference
+    adjacent_diff = _calc_adjacent_diff(sorted_data, out_dtype="int32", binop=tir.NE)
+    # inclusive scan
+    inc_scan = cumsum(adjacent_diff, dtype="int32", exclusive=0)
+    # total number of unique elements
+    num_unique_elements = _calc_num_unique(inc_scan)
+    # buffers
+    data_buf = tir.decl_buffer(data.shape, data.dtype, "data_buf", data_alignment=8)
+    argsorted_indices_buf = tir.decl_buffer(
+        data.shape, "int32", "argsorted_indices_buf", data_alignment=8
+    )
+    inc_scan_buf = tvm.tir.decl_buffer(data.shape, "int32", "inc_scan_buf", data_alignment=8)
+    unique_elements_buf = tir.decl_buffer(
+        data.shape, data.dtype, "unique_elements_buf", data_alignment=8
+    )
+    inverse_indices_buf = tvm.tir.decl_buffer(
+        data.shape, "int32", "inverse_indices_buf", data_alignment=8
+    )
+    # prepare outputs
+    if return_counts:
+        counts_buf = tir.decl_buffer(data.shape, "int32", "counts_buf", data_alignment=8)
+        out_data_shape = [data.shape] * 3
+        out_buffers = [unique_elements_buf, inverse_indices_buf, counts_buf]
+        out_dtypes = [data.dtype, "int32", "int32"]
+    else:
+        out_data_shape = [data.shape] * 2
+        out_buffers = [unique_elements_buf, inverse_indices_buf]
+        out_dtypes = [data.dtype, "int32"]
+    # prepare inputs and fcompute
+    if is_sorted:
+        in_data = [data, argsorted_indices, inc_scan]
+        in_buffers = [data_buf, argsorted_indices_buf, inc_scan_buf]
+        if return_counts:
+            fcompute = lambda ins, outs: _calc_unique_ir(*ins, None, *outs)
+        else:
+            fcompute = lambda ins, outs: _calc_unique_ir(*ins, None, *outs, None)
+    else:
+        # calculate the index converter if the unique elements should not be sorted
+        # calculate first occurence
+        first_occurence_buf = tir.decl_buffer(
+            data.shape, "int32", "first_occurence_buf", data_alignment=8
+        )
+        first_occurence = te.extern(
+            [data.shape],
+            [argsorted_indices, inc_scan],
+            lambda ins, outs: _calc_first_occurence_ir(ins[0], ins[1], outs[0]),
+            dtype=["int32"],
+            in_buffers=[argsorted_indices_buf, inc_scan_buf],
+            out_buffers=[first_occurence_buf],
+            name="_calc_first_occurence",
+            tag="_calc_first_occurence_gpu",
+        )
+        # calculate index converter by sorting unique elements by their first occurence
+        argsorted_first_occurence = argsort(first_occurence, dtype="int32")
+        index_converter = argsort(argsorted_first_occurence, dtype="int32")
+        index_converter_buf = tir.decl_buffer(
+            data.shape, "int32", "index_converter_buf", data_alignment=8
+        )
+        in_data = [data, argsorted_indices, inc_scan, index_converter]
+        in_buffers = [data_buf, argsorted_indices_buf, inc_scan_buf, index_converter_buf]
+        if return_counts:
+            fcompute = lambda ins, outs: _calc_unique_ir(*ins, *outs)
+        else:
+            fcompute = lambda ins, outs: _calc_unique_ir(*ins, *outs, None)
+    outs = te.extern(
+        out_data_shape,
+        in_data,
+        fcompute,
+        dtype=out_dtypes,
+        in_buffers=in_buffers,
+        out_buffers=out_buffers,
+        name="_calc_unique",
+        tag="_calc_unique_gpu",
+    )
+    if return_counts:
+        return [outs[0], outs[1], num_unique_elements, outs[2]]
+    return [*outs, num_unique_elements]
diff --git a/python/tvm/topi/generic/search.py b/python/tvm/topi/generic/search.py
index 5924d35def73..f458ee7bc782 100644
--- a/python/tvm/topi/generic/search.py
+++ b/python/tvm/topi/generic/search.py
@@ -70,3 +70,19 @@ def schedule_scatter_add(outs):
 
 def schedule_sparse_fill_empty_rows(outs):
     return _default_schedule(outs, False)
+
+
+def schedule_unique(outs):
+    """Schedule for unique operator.
+
+    Parameters
+    ----------
+    outs: Array of Tensor
+      The computation graph description of unique.
+
+    Returns
+    -------
+    s: Schedule
+      The computation schedule for the op.
+    """
+    return _default_schedule(outs, False)
diff --git a/python/tvm/topi/unique.py b/python/tvm/topi/unique.py
new file mode 100644
index 000000000000..b4f27b38f65f
--- /dev/null
+++ b/python/tvm/topi/unique.py
@@ -0,0 +1,297 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=invalid-name
+"""Unique operator"""
+from tvm import te, tir
+from ..te import hybrid
+from .cumsum import cumsum
+from .sort import sort, argsort
+
+
+def _calc_adjacent_diff_ir(data, output, binop=tir.Sub):
+    """Low level IR to calculate adjacent difference in an 1-D array.
+
+    Parameters
+    ----------
+    data : Buffer
+        Input 1-D Buffer.
+
+    output: Buffer
+        A buffer to store adjacent difference, of the same shape as data. The adjacent difference
+        is defined as: output[0] = 0, output[i] = binop(data[i], data[i-1])
+        where i > 0 and i < len(data).
+
+    binop: function, optional
+        A binary associative op to use for calculating adjacent difference. The function takes two
+        TIR expressions and produce a new TIR expression. By default it uses tvm.tir.Sub to
+        compute the adjacent difference.
+    """
+    ib = tir.ir_builder.create()
+    data_ptr = ib.buffer_ptr(data)
+    output_ptr = ib.buffer_ptr(output)
+    with ib.for_range(0, data.shape[0], kind="parallel") as i:
+        with ib.if_scope(i == 0):
+            output_ptr[0] = 0
+        with ib.else_scope():
+            output_ptr[i] = tir.Cast(output.dtype, binop(data_ptr[i], data_ptr[i - 1]))
+    return ib.get()
+
+
+def _calc_adjacent_diff(data, out_dtype="int32", binop=tir.Sub):
+    """Function calculate adjacent difference in an 1-D array.
+
+    Parameters
+    ----------
+    data : tvm.te.Tensor
+        Input 1-D tensor.
+
+    output_dtype : str
+        The output tensor data type.
+
+    binop: function, optional
+        A binary associative op to use for calculating difference. The function takes two
+        TIR expressions and produce a new TIR expression. By default it uses tvm.tir.Sub to
+        compute the adjacent difference.
+
+    Returns
+    -------
+    output : tvm.te.Tensor
+        1-D tensor storing the adjacent difference of the input tensor. The adjacent difference
+        is defined as: output[0] = 0, output[i] = binop(data[i], data[i-1])
+        where i > 0 and i < len(data).
+    """
+    return te.extern(
+        [data.shape],
+        [data],
+        lambda ins, outs: _calc_adjacent_diff_ir(ins[0], outs[0], binop=binop),
+        dtype=[out_dtype],
+        name="_calc_adjacent_diff",
+        tag="_calc_adjacent_diff_cpu",
+    )
+
+
+@hybrid.script
+def _calc_num_unique(inc_scan):
+    """Helper function to get the number of unique elements fron inc_scan tensor"""
+    output = output_tensor((1,), "int32")
+    output[0] = inc_scan[inc_scan.shape[0] - 1] + int32(1)
+    return output
+
+
+def _calc_unique_ir(
+    data, argsorted_indices, inc_scan, index_converter, unique_elements, indices, counts
+):
+    """Low level IR to calculate unique elements, inverse indices, and counts (optional) of
+    unique elements of 1-D array.
+
+    Parameters
+    ----------
+    data : Buffer
+        Input 1-D Buffer.
+
+    argsorted_indices : Buffer
+        A buffer that stores the argsorted indices of the input data.
+
+    inc_scan : Buffer
+        A buffer that stores the inclusive scan of the binary tir.NE adjacent difference
+        of the sorted data.
+
+    index_converter (optional) : Buffer
+        An optional index converter that transforms the unique element index
+        such that new_idx = index_converter[old_idx].
+
+    unique_elements : Buffer
+        A buffer that stores the unique elements.
+
+    indices : Buffer
+        A buffer that stores the the index of each input data element in the unique element array.
+
+    counts (optional) : Buffer
+        A buffer that stores the count of each unique element.
+    """
+    ib = tir.ir_builder.create()
+    data_ptr = ib.buffer_ptr(data)
+    argsorted_indices_ptr = ib.buffer_ptr(argsorted_indices)
+    inc_scan_ptr = ib.buffer_ptr(inc_scan)
+    unique_elements_ptr = ib.buffer_ptr(unique_elements)
+    indices_ptr = ib.buffer_ptr(indices)
+
+    index_converter_ptr = None
+    if isinstance(index_converter, tir.Buffer):
+        index_converter_ptr = ib.buffer_ptr(index_converter)
+
+    if isinstance(counts, tir.Buffer):
+        counts_ptr = ib.buffer_ptr(counts)
+        # use indices_ptr as a tmp buffer to store tids with inc_scan[tid] != inc_scan[tid-1]
+        unique_seq_indices_ptr = ib.buffer_ptr(indices)
+
+    data_length = data.shape[0]
+
+    # if need to return counts
+    if isinstance(counts, tir.Buffer):
+        num_unique = inc_scan_ptr[inc_scan.shape[0] - 1] + 1
+        num_elements = data.shape[0]
+        unique_seq_indices_ptr[num_unique - 1] = num_elements
+        with ib.new_scope():
+            with ib.for_range(0, data_length, kind="parallel") as i:
+                with ib.if_scope(i > 0):
+                    with ib.if_scope(inc_scan_ptr[i] != inc_scan_ptr[i - 1]):
+                        unique_seq_indices_ptr[inc_scan_ptr[i] - 1] = i
+        with ib.new_scope():
+            with ib.for_range(0, num_unique, kind="parallel") as i:
+                unique_idx = i if not index_converter_ptr else index_converter_ptr[i]
+                with ib.if_scope(i == 0):
+                    counts_ptr[unique_idx] = unique_seq_indices_ptr[i]
+                with ib.else_scope():
+                    counts_ptr[unique_idx] = (
+                        unique_seq_indices_ptr[i] - unique_seq_indices_ptr[i - 1]
+                    )
+    # calculate unique elements and inverse indices
+    with ib.new_scope():
+        with ib.for_range(0, data_length, kind="parallel") as i:
+            data_idx = argsorted_indices_ptr[i]
+            unique_idx = (
+                inc_scan_ptr[i] if not index_converter_ptr else index_converter_ptr[inc_scan_ptr[i]]
+            )
+            indices_ptr[data_idx] = unique_idx
+            with ib.if_scope(i == 0):
+                unique_elements_ptr[unique_idx] = data_ptr[data_idx]
+            with ib.else_scope():
+                with ib.if_scope(inc_scan_ptr[i] != inc_scan_ptr[i - 1]):
+                    unique_elements_ptr[unique_idx] = data_ptr[data_idx]
+    return ib.get()
+
+
+@hybrid.script
+def _calc_first_occurence(argsorted_indices, inc_scan):
+    """Hybrid script to calculate the first occurence of each unique element in the input data.
+
+    Parameters
+    ----------
+    argsorted_indices : tvm.te.Tensor
+        A tensor that stores the argsorted indices of the input data.
+
+    inc_scan : tvm.te.Tensor
+        A tensor that stores the inclusive scan of the binary tir.NE adjacent difference
+        of the sorted data.
+
+    first_occurence : tvm.te.Tensor
+        A tensor that stores the first occurence of each unique element in the input data.
+    """
+    first_occurence = output_tensor(argsorted_indices.shape, "int32")
+    for i in parallel(argsorted_indices.shape[0]):
+        first_occurence[i] = argsorted_indices.shape[0]
+    for i in parallel(argsorted_indices.shape[0]):
+        if i == 0 or inc_scan[i] != inc_scan[i - 1]:
+            first_occurence[inc_scan[i]] = argsorted_indices[i]
+    return first_occurence
+
+
+def unique(data, is_sorted=True, return_counts=False):
+    """
+    Find the unique elements of a 1-D tensor. Please note `output` and `counts` are all padded to
+    have the same length of `data` and element with index >= num_unique[0] has undefined value.
+
+    Parameters
+    ----------
+    data : tvm.te.Tensor
+        A 1-D tensor of integers.
+
+    sorted : bool
+        Whether to sort the unique elements in ascending order before returning as output.
+
+    return_counts : bool
+        Whether to return the count of each unique element.
+
+    Returns
+    -------
+    output : tvm.te.Tensor
+        A 1-D tensor containing the unique elements of the input data tensor.
+
+    indices : tvm.te.Tensor
+        A 1-D tensor containing the index of each data element in the output tensor.
+
+    num_unique : tvm.te.Tensor
+        A 1-D tensor with size=1 containing the number of unique elements in the input data tensor.
+
+    counts (optional) : tvm.te.Tensor
+        A 1-D tensor containing the count of each unique element in the output.
+
+    Examples
+    --------
+    .. code-block:: python
+        [output, indices, num_unique] = unique([4, 5, 1, 2, 3, 3, 4, 5], False, False)
+        output         =  [4, 5, 1, 2, 3, ?, ?, ?]
+        indices        =  [0, 1, 2, 3, 4, 4, 0, 1]
+        num_unique     =  [5]
+
+        [output, indices, num_unique, counts] = unique([4, 5, 1, 2, 3, 3, 4, 5], False, True)
+        output         =  [4, 5, 1, 2, 3, ?, ?, ?]
+        indices        =  [0, 1, 2, 3, 4, 4, 0, 1]
+        num_unique     =  [5]
+        counts         =  [2, 2, 1, 1, 2, ?, ?, ?]
+
+        [output, indices, num_unique] = unique([4, 5, 1, 2, 3, 3, 4, 5], True)
+        output         =  [1, 2, 3, 4, 5, ?, ?, ?]
+        indices        =  [3, 4, 0, 1, 2, 2, 3, 4]
+        num_unique     =  [5]
+    """
+    sorted_data = sort(data)
+    argsorted_indices = argsort(data, dtype="int32")
+    # adjacent difference
+    adjacent_diff = _calc_adjacent_diff(sorted_data, "int32", tir.NE)
+    # inclusive scan
+    inc_scan = cumsum(adjacent_diff, dtype="int32", exclusive=0)
+    # total number of unique elements
+    num_unique_elements = _calc_num_unique(inc_scan)
+    # prepare outputs
+    if return_counts:
+        out_data_shape = [data.shape] * 3
+        out_dtypes = [data.dtype, "int32", "int32"]
+    else:
+        out_data_shape = [data.shape] * 2
+        out_dtypes = [data.dtype, "int32"]
+    # prepare inputs and fcompute
+    if is_sorted:
+        in_data = [data, argsorted_indices, inc_scan]
+        if return_counts:
+            fcompute = lambda ins, outs: _calc_unique_ir(*ins, None, *outs)
+        else:
+            fcompute = lambda ins, outs: _calc_unique_ir(*ins, None, *outs, None)
+    else:
+        # calculate the index converter if the unique elements should not be sorted
+        # calculate first occurence
+        first_occurence = _calc_first_occurence(argsorted_indices, inc_scan)
+        # calculate index converter by sorting unique elements by their first occurence
+        argsorted_first_occurence = argsort(first_occurence, dtype="int32")
+        index_converter = argsort(argsorted_first_occurence, dtype="int32")
+        in_data = [data, argsorted_indices, inc_scan, index_converter]
+        if return_counts:
+            fcompute = lambda ins, outs: _calc_unique_ir(*ins, *outs)
+        else:
+            fcompute = lambda ins, outs: _calc_unique_ir(*ins, *outs, None)
+    outs = te.extern(
+        out_data_shape,
+        in_data,
+        fcompute,
+        dtype=out_dtypes,
+        name="_calc_unique",
+        tag="_calc_unique_cpu",
+    )
+    if return_counts:
+        return [outs[0], outs[1], num_unique_elements, outs[2]]
+    return [*outs, num_unique_elements]
diff --git a/src/relay/op/tensor/transform.cc b/src/relay/op/tensor/transform.cc
index 12db859d1ae1..eae231fd8d06 100644
--- a/src/relay/op/tensor/transform.cc
+++ b/src/relay/op/tensor/transform.cc
@@ -3772,5 +3772,52 @@ RELAY_REGISTER_OP("cumsum")
     .add_type_rel("Cumsum", CumsumRel)
     .set_attr<TOpPattern>("TOpPattern", kOpaque);
 
+TVM_REGISTER_NODE_TYPE(UniqueAttrs);
+
+bool UniqueRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
+               const TypeReporter& reporter) {
+  // types: [data, result]
+  ICHECK_EQ(types.size(), 2) << "Unique: expect 2 types but " << types.size() << " provided";
+  ICHECK_EQ(num_inputs, 1) << "Unique: expect 1 inputs but " << num_inputs << " provided";
+  auto data = types[0].as<TensorTypeNode>();
+  if (data == nullptr) {
+    ICHECK(types[0].as<IncompleteTypeNode>())
+        << "Unique: expect input type to be TensorType but get " << types[0];
+    return false;
+  }
+  const int ndim = static_cast<int>(data->shape.size());
+  ICHECK_EQ(ndim, 1) << "Unique: input must be 1-D tensor";
+  ICHECK_EQ(data->dtype.is_int(), true) << "Unique: input must have int32 or int64 dtype";
+  std::vector<Type> fields;
+  fields.push_back(TensorType(data->shape, data->dtype));               // unique
+  fields.push_back(TensorType(data->shape, DataType::Int(32)));         // indices
+  fields.push_back(TensorType(Array<PrimExpr>{1}, DataType::Int(32)));  // num_unique
+  const auto* param = attrs.as<UniqueAttrs>();
+  if (param->return_counts) {
+    fields.push_back(TensorType(data->shape, DataType::Int(32)));  // counts
+  }
+  reporter->Assign(types[1], TupleType(Array<Type>(fields)));
+  return true;
+}
+
+Expr MakeUnique(Expr data, bool sorted, bool return_counts) {
+  auto attrs = make_object<UniqueAttrs>();
+  attrs->sorted = sorted;
+  attrs->return_counts = return_counts;
+  static const Op& op = Op::Get("unique");
+  return Call(op, {data}, Attrs(attrs), {});
+}
+
+TVM_REGISTER_GLOBAL("relay.op._make.unique").set_body_typed(MakeUnique);
+
+RELAY_REGISTER_OP("unique")
+    .describe(
+        R"code(This operation returns the unique elements and the new index of each item in a given 1-D array.
+    )code" TVM_ADD_FILELINE)
+    .set_num_inputs(1)
+    .add_argument("data", "Tensor", "The input tensor")
+    .add_type_rel("unique", UniqueRel)
+    .set_support_level(3)
+    .set_attr<TOpPattern>("TOpPattern", kOpaque);
 }  // namespace relay
 }  // namespace tvm
diff --git a/tests/python/frontend/pytorch/test_forward.py b/tests/python/frontend/pytorch/test_forward.py
index aa42b0fb84e4..0cf4839c6ebb 100644
--- a/tests/python/frontend/pytorch/test_forward.py
+++ b/tests/python/frontend/pytorch/test_forward.py
@@ -2064,7 +2064,12 @@ def verify_model_vm(input_model, ishapes, idtype=None, idata=None, targets=["llv
             pt_result = input_model(*input_data)
 
         # Verify the accuracy
-        if not isinstance(pt_result, torch.Tensor):
+        if isinstance(pt_result, tuple):
+            # handle multiple outputs
+            for i in range(len(pt_result)):
+                tvm_res = vm_res[i].asnumpy()
+                tvm.testing.assert_allclose(tvm_res, pt_result[i].numpy(), rtol=1e-5, atol=1e-5)
+        elif not isinstance(pt_result, torch.Tensor):
             tvm_res = vm_res.asnumpy().item()
             assert pt_result == tvm_res
         else:
@@ -3654,6 +3659,23 @@ def test_fn(x, mask):
         verify_trace_model(test_fn, [x, mask], ["llvm", "cuda", "nvptx"])
 
 
+def test_unique():
+    def test_fn(is_sorted, return_inverse, return_counts):
+        return lambda x: torch.unique(x, is_sorted, return_inverse, return_counts)
+
+    in_data = torch.randint(0, 20, (10,), dtype=torch.int32)
+    targets = ["llvm", "cuda", "nvptx"]
+    verify_trace_model(test_fn(True, True, True), [in_data], targets)
+    verify_trace_model(test_fn(True, False, True), [in_data], targets)
+    verify_trace_model(test_fn(True, True, False), [in_data], targets)
+    verify_trace_model(test_fn(True, False, True), [in_data], targets)
+    in_data = torch.randint(0, 20, (20,), dtype=torch.int64)
+    verify_trace_model(test_fn(True, True, True), [in_data], targets)
+    verify_trace_model(test_fn(True, False, True), [in_data], targets)
+    verify_trace_model(test_fn(True, True, False), [in_data], targets)
+    verify_trace_model(test_fn(True, False, True), [in_data], targets)
+
+
 if __name__ == "__main__":
     # some structural tests
     test_forward_traced_function()
@@ -3789,6 +3811,7 @@ def test_fn(x, mask):
     test_argsort()
     test_logical_and()
     test_masked_select()
+    test_unique()
 
     # Model tests
     test_resnet18()
diff --git a/tests/python/frontend/tensorflow/test_forward.py b/tests/python/frontend/tensorflow/test_forward.py
index 5ed3e72206e4..8b146b6511ce 100644
--- a/tests/python/frontend/tensorflow/test_forward.py
+++ b/tests/python/frontend/tensorflow/test_forward.py
@@ -4988,5 +4988,70 @@ def lstm_cell():
             tvm.testing.assert_allclose(tf_output[i], tvm_output[i], atol=1e-5, rtol=1e-5)
 
 
+#######################################################################
+# Unique
+# ------------
+
+
+def _test_unique(n, dtype, is_dyn):
+    tf.reset_default_graph()
+    np_data = np.random.randint(100, size=n).astype(dtype)
+    with tf.Graph().as_default():
+        if is_dyn:
+            in_data = tf.placeholder(dtype, [n], name="in_data")
+        else:
+            in_data = tf.constant(np_data, dtype, name="in_data")
+        tf.unique(in_data)
+        if is_dyn:
+            compare_tf_with_tvm(np_data, "in_data:0", ["Unique:0", "Unique:1"], mode="vm")
+        else:
+            compare_tf_with_tvm(None, "", ["Unique:0", "Unique:1"])
+
+
+def test_forward_unique():
+    """test Unique"""
+
+    for dtype in ["int32", "int64"]:
+        for is_dyn in [False, True]:
+            _test_unique(50, dtype, is_dyn)
+            _test_unique(100, dtype, is_dyn)
+
+
+#######################################################################
+# Unique with counts
+# ------------
+
+
+def _test_unique_with_counts(n, dtype, is_dyn):
+    tf.reset_default_graph()
+    np_data = np.random.randint(100, size=n).astype(dtype)
+    with tf.Graph().as_default():
+        if is_dyn:
+            in_data = tf.placeholder(dtype, [n], name="in_data")
+        else:
+            in_data = tf.constant(np_data, dtype, name="in_data")
+        tf.unique_with_counts(in_data)
+        if is_dyn:
+            compare_tf_with_tvm(
+                np_data,
+                "in_data:0",
+                ["UniqueWithCounts:0", "UniqueWithCounts:1", "UniqueWithCounts:2"],
+                mode="vm",
+            )
+        else:
+            compare_tf_with_tvm(
+                None, "", ["UniqueWithCounts:0", "UniqueWithCounts:1", "UniqueWithCounts:2"]
+            )
+
+
+def test_forward_unique_with_counts():
+    """test UniqueWithCounts"""
+
+    for dtype in ["int32", "int64"]:
+        for is_dyn in [False, True]:
+            _test_unique_with_counts(10, dtype, is_dyn)
+            _test_unique_with_counts(20, dtype, is_dyn)
+
+
 if __name__ == "__main__":
     pytest.main([__file__])
diff --git a/tests/python/relay/test_op_level3.py b/tests/python/relay/test_op_level3.py
index 94fac3ba1264..ee55b532218d 100644
--- a/tests/python/relay/test_op_level3.py
+++ b/tests/python/relay/test_op_level3.py
@@ -1453,5 +1453,58 @@ def verify_scatter_nd_with_stack(data_np, indices_np, shape, ref_res, rtol=1e-5,
     verify_scatter_nd_with_stack(data, indices, shape, out)
 
 
+def test_unique():
+    def calc_numpy_unique(data, is_sorted=False):
+        uniq, index, inverse, counts = np.unique(
+            data, return_index=True, return_inverse=True, return_counts=True
+        )
+        num_uniq = np.array([len(uniq)]).astype("int32")
+        if not is_sorted:
+            order = np.argsort(index)
+            reverse_order = np.argsort(order)
+            uniq = uniq[order].astype(data.dtype)
+            inverse = np.array([reverse_order[i] for i in inverse]).astype("int32")
+            counts = counts[order].astype("int32")
+        return [uniq.astype(data.dtype), inverse.astype("int32"), counts, num_uniq]
+
+    def verify_unique(n, dtype, is_dyn=False, is_sorted=False, return_counts=False):
+        if is_dyn:
+            x = relay.var("x", relay.TensorType([relay.Any()], dtype))
+        else:
+            x = relay.var("x", relay.TensorType([n], dtype))
+        outs = relay.unique(x, is_sorted, return_counts)
+        outs = outs.astuple()
+        func = relay.Function([x], outs)
+        x_data = np.random.randint(50, size=n).astype(dtype)
+
+        if is_dyn:
+            backends = ["vm", "debug"]
+        else:
+            backends = ["graph", "debug"]
+
+        for target, ctx in tvm.testing.enabled_targets():
+            for kind in backends:
+                mod = tvm.ir.IRModule.from_expr(func)
+                intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target)
+                tvm_res = intrp.evaluate()(x_data)
+                np_res = calc_numpy_unique(x_data, is_sorted)
+                num_unique = np_res[3][0]
+                assert num_unique == tvm_res[2].asnumpy()[0]
+                # unique
+                tvm.testing.assert_allclose(tvm_res[0].asnumpy()[:num_unique], np_res[0], rtol=1e-5)
+                # inverse_indices
+                tvm.testing.assert_allclose(tvm_res[1].asnumpy(), np_res[1], rtol=1e-5)
+                # counts
+                if return_counts:
+                    tvm.testing.assert_allclose(
+                        tvm_res[3].asnumpy()[:num_unique], np_res[2], rtol=1e-5
+                    )
+
+    for dtype in ["int32", "int64"]:
+        for i in range(8):
+            is_dyn, is_sorted, return_counts = bool(i & 1), bool(i & 2), bool(i & 4)
+            verify_unique(10, dtype, is_dyn, is_sorted, return_counts)
+
+
 if __name__ == "__main__":
     pytest.main([__file__])
diff --git a/tests/python/topi/python/test_topi_unique.py b/tests/python/topi/python/test_topi_unique.py
new file mode 100644
index 000000000000..d7ee74282922
--- /dev/null
+++ b/tests/python/topi/python/test_topi_unique.py
@@ -0,0 +1,111 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+import numpy as np
+import tvm
+import tvm.testing
+from tvm import topi
+import tvm.topi.testing
+
+
+@tvm.testing.parametrize_targets
+def test_unique(ctx, target):
+    def calc_numpy_unique(data, is_sorted=False):
+        uniq, index, inverse, counts = np.unique(
+            data, return_index=True, return_inverse=True, return_counts=True
+        )
+        num_uniq = np.array([len(uniq)]).astype("int32")
+        if not is_sorted:
+            order = np.argsort(index)
+            reverse_order = np.argsort(order)
+            uniq = uniq[order].astype(data.dtype)
+            inverse = np.array([reverse_order[i] for i in inverse]).astype("int32")
+            counts = counts[order].astype("int32")
+        return [uniq.astype(data.dtype), inverse.astype("int32"), counts, num_uniq]
+
+    def check_unique(data, is_sorted=False):
+        # numpy reference
+        np_unique, np_indices, np_counts, np_num_unique = calc_numpy_unique(data, is_sorted)
+        num_unique = np_num_unique[0]
+
+        implementations = {
+            "generic": (
+                lambda x, return_counts: topi.unique(x, is_sorted, return_counts),
+                topi.generic.schedule_unique,
+            ),
+            "cuda": (
+                lambda x, return_counts: topi.cuda.unique(x, is_sorted, return_counts),
+                topi.cuda.schedule_scan,
+            ),
+            "nvptx": (
+                lambda x, return_counts: topi.cuda.unique(x, is_sorted, return_counts),
+                topi.cuda.schedule_scan,
+            ),
+        }
+        fcompute, fschedule = tvm.topi.testing.dispatch(target, implementations)
+        tvm_data = tvm.nd.array(data, ctx=ctx)
+        tvm_unique = tvm.nd.array(np.zeros(data.shape).astype(data.dtype), ctx=ctx)
+        tvm_indices = tvm.nd.array(np.zeros(data.shape).astype("int32"), ctx=ctx)
+        tvm_num_unique = tvm.nd.array(np.zeros([1]).astype("int32"), ctx=ctx)
+
+        # without counts
+        with tvm.target.Target(target):
+            te_input = tvm.te.placeholder(shape=data.shape, dtype=str(data.dtype))
+            outs = fcompute(te_input, False)
+            s = fschedule(outs)
+            func = tvm.build(s, [te_input, *outs])
+            func(tvm_data, tvm_unique, tvm_indices, tvm_num_unique)
+
+        assert tvm_num_unique.asnumpy()[0] == np_num_unique
+        np.testing.assert_allclose(
+            tvm_unique.asnumpy()[:num_unique], np_unique, atol=1e-5, rtol=1e-5
+        )
+        np.testing.assert_allclose(tvm_indices.asnumpy(), np_indices, atol=1e-5, rtol=1e-5)
+
+        # with counts
+        tvm_counts = tvm.nd.array(np.zeros(data.shape).astype("int32"), ctx=ctx)
+        with tvm.target.Target(target):
+            te_input = tvm.te.placeholder(shape=data.shape, dtype=str(data.dtype))
+            outs = fcompute(te_input, True)
+            s = fschedule(outs)
+            func = tvm.build(s, [te_input, *outs])
+            func(tvm_data, tvm_unique, tvm_indices, tvm_num_unique, tvm_counts)
+
+        np_unique, np_indices, _, np_num_unique = calc_numpy_unique(data, is_sorted)
+        num_unique = np_num_unique[0]
+        assert tvm_num_unique.asnumpy()[0] == np_num_unique
+        np.testing.assert_allclose(
+            tvm_unique.asnumpy()[:num_unique], np_unique, atol=1e-5, rtol=1e-5
+        )
+        np.testing.assert_allclose(tvm_indices.asnumpy(), np_indices, atol=1e-5, rtol=1e-5)
+        np.testing.assert_allclose(
+            tvm_counts.asnumpy()[:num_unique], np_counts, atol=1e-5, rtol=1e-5
+        )
+
+    for in_dtype in ["int32", "int64"]:
+        for is_sorted in [True, False]:
+            data = np.random.randint(0, 100, size=(1)).astype(in_dtype)
+            check_unique(data, is_sorted)
+            data = np.random.randint(0, 10, size=(10)).astype(in_dtype)
+            check_unique(data, is_sorted)
+            data = np.random.randint(0, 100, size=(10000)).astype(in_dtype)
+            check_unique(data, is_sorted)
+
+
+if __name__ == "__main__":
+    test_unique(tvm.context("cpu"), tvm.target.Target("llvm"))
+    test_unique(tvm.context("cuda"), tvm.target.Target("cuda"))
+    test_unique(tvm.context("nvptx"), tvm.target.Target("nvptx"))

From c46b1876f5bba743c864d3fbeeb3d8df4664aa5f Mon Sep 17 00:00:00 2001
From: Alex Wong <11878166+alexwong@users.noreply.github.com>
Date: Thu, 25 Feb 2021 21:37:11 -0800
Subject: [PATCH 243/357] [Torch] Pool ops, convert strides and pool_size to
 int (#7517)

* Convert strides and pool_size to int

* Make helper function, add test

* Fix lint
---
 python/tvm/relay/frontend/pytorch.py          | 16 ++++++++++++----
 tests/python/frontend/pytorch/test_forward.py |  9 +++++++++
 2 files changed, 21 insertions(+), 4 deletions(-)

diff --git a/python/tvm/relay/frontend/pytorch.py b/python/tvm/relay/frontend/pytorch.py
index 679541051e75..fdebd2f50e68 100644
--- a/python/tvm/relay/frontend/pytorch.py
+++ b/python/tvm/relay/frontend/pytorch.py
@@ -825,11 +825,19 @@ def adaptive_avg_pool_3d(self, inputs, input_types):
         output_size = inputs[1]
         return _op.nn.adaptive_avg_pool3d(data, output_size=output_size)
 
+    @staticmethod
+    def convert_const_list(data):
+        if isinstance(data, list):
+            for i, _ in enumerate(data):
+                if isinstance(data[i], _expr.Expr):
+                    data[i] = int(_infer_value_simulated(data[i], {}).asnumpy())
+        return data
+
     def maxpool_2d(self, inputs, input_types):
         data = inputs[0]
 
-        pool_size = inputs[1]
-        strides = inputs[2] if inputs[2] else pool_size
+        pool_size = self.convert_const_list(inputs[1])
+        strides = self.convert_const_list(inputs[2] if inputs[2] else pool_size)
         padding = inputs[3]
         dilation = inputs[4]
         ceil_mode = int(inputs[5])
@@ -1309,8 +1317,8 @@ def softplus(self, inputs, input_types):
     def avg_pool2d(self, inputs, input_types):
         data = inputs[0]
 
-        pool_size = inputs[1]
-        strides = inputs[2] if inputs[2] else pool_size
+        pool_size = self.convert_const_list(inputs[1])
+        strides = self.convert_const_list(inputs[2] if inputs[2] else pool_size)
         padding = inputs[3]
         ceil_mode = int(inputs[4])
         count_include_pad = int(inputs[5])
diff --git a/tests/python/frontend/pytorch/test_forward.py b/tests/python/frontend/pytorch/test_forward.py
index 0cf4839c6ebb..90604751d4f1 100644
--- a/tests/python/frontend/pytorch/test_forward.py
+++ b/tests/python/frontend/pytorch/test_forward.py
@@ -736,7 +736,16 @@ def forward(self, *args):
             output, indices = self.pool(args[0])
             return output
 
+    class MaxPool2DWithIntStrides(Module):
+        def forward(self, *args):
+            # Makes kernel_size and strides a Relay expr to test converting back to int
+            x_shape = args[0].shape
+            kernel_size = [torch.tensor(x_shape[1]).int(), torch.tensor(x_shape[1]).int()]
+            strides = [torch.tensor(x_shape[0]).int(), torch.tensor(x_shape[0]).int()]
+            return torch.nn.functional.max_pool2d(args[0], kernel_size=[4, 4], stride=strides)
+
     verify_model(MaxPool2DWithIndices().float().eval(), input_data=input_data)
+    verify_model(MaxPool2DWithIntStrides().float().eval(), input_data=input_data)
 
 
 @tvm.testing.uses_gpu

From 56ac8927247d2a96d305caa6d4125769b61371c9 Mon Sep 17 00:00:00 2001
From: Ritwik Das <ritwikdas54@gmail.com>
Date: Fri, 26 Feb 2021 09:31:28 -0800
Subject: [PATCH 244/357] SparseReshape Op (#7477)

* SparseReshape Inital Code

* Done

* Format

* Add empty tests

* Formatting

* SanityCheck

* formatting documentation

* Documentation

* Only Enable CPU

* Add support for CUDA

* Stuff

* Add Dynamic Support

* Parallelize GPU Impl

* Documentation

* Documentation

* Import

* Import

* Remove unnecessary code

* PR Comments

* Schedules

* Tests

* Dtypes

* Black

* Parallelize CPU

* CI error

Co-authored-by: Ubuntu <ubuntu@ip-172-31-42-251.us-east-2.compute.internal>
---
 python/tvm/relay/frontend/tensorflow.py       |  10 +
 python/tvm/relay/op/_transform.py             |  35 +++
 python/tvm/relay/op/strategy/cuda.py          |  11 +
 python/tvm/relay/op/strategy/generic.py       |  27 +++
 python/tvm/relay/op/transform.py              |  40 ++++
 python/tvm/topi/__init__.py                   |   1 +
 python/tvm/topi/cuda/__init__.py              |   1 +
 python/tvm/topi/cuda/sparse_reshape.py        | 209 ++++++++++++++++
 python/tvm/topi/sparse_reshape.py             | 185 +++++++++++++++
 src/relay/op/tensor/transform.cc              |  50 ++++
 .../frontend/tensorflow/test_forward.py       | 124 ++++++++++
 tests/python/relay/test_op_level3.py          | 223 ++++++++++++++++++
 12 files changed, 916 insertions(+)
 create mode 100644 python/tvm/topi/cuda/sparse_reshape.py
 create mode 100644 python/tvm/topi/sparse_reshape.py

diff --git a/python/tvm/relay/frontend/tensorflow.py b/python/tvm/relay/frontend/tensorflow.py
index 65f18c029441..20eb95ba7c00 100644
--- a/python/tvm/relay/frontend/tensorflow.py
+++ b/python/tvm/relay/frontend/tensorflow.py
@@ -1157,6 +1157,15 @@ def _impl(inputs, attr, params, mod):
     return _impl
 
 
+def _sparse_reshape():
+    def _impl(inputs, attr, params, mod):
+        assert len(inputs) == 3, "There should be 3 input tensors"
+        new_indices, new_shape = get_relay_op("sparse_reshape")(inputs[0], inputs[1], inputs[2])
+        return _expr.TupleWrapper(_expr.Tuple([new_indices, new_shape]), 2)
+
+    return _impl
+
+
 def _identity():
     def _impl(inputs, attr, params, mod):
         return inputs[0]
@@ -2650,6 +2659,7 @@ def _impl(inputs, attr, params, mod):
     "SparseToDense": _sparse_to_dense(),
     "SparseTensorDenseMatMul": _sparse_tensor_dense_matmul(),
     "SparseFillEmptyRows": _sparse_fill_empty_rows(),
+    "SparseReshape": _sparse_reshape(),
     "Split": _split(False),
     "SplitV": _split(True),
     "Sqrt": AttrCvt("sqrt"),
diff --git a/python/tvm/relay/op/_transform.py b/python/tvm/relay/op/_transform.py
index e9cf3d83eaeb..97f45278f073 100644
--- a/python/tvm/relay/op/_transform.py
+++ b/python/tvm/relay/op/_transform.py
@@ -66,6 +66,7 @@
 _reg.register_injective_schedule("matrix_set_diag")
 _reg.register_injective_schedule("adv_index")
 
+
 # concatenate
 _reg.register_schedule("concatenate", strategy.schedule_concatenate)
 
@@ -114,6 +115,22 @@ def compute_sparse_fill_empty_rows(attrs, inputs, output_type):
 
 _reg.register_strategy("sparse_fill_empty_rows", strategy.sparse_fill_empty_rows_strategy)
 
+# sparse_reshape
+@_reg.register_compute("sparse_reshape")
+def compute_reshape(attrs, inputs, output_type):
+    """Compute definition of sparse_reshape"""
+
+    return topi.sparse_reshape(
+        inputs[0],
+        inputs[1],
+        inputs[2],
+        output_type.fields[0].shape,
+        output_type.fields[1].shape,
+    )
+
+
+_reg.register_strategy("sparse_reshape", strategy.sparse_reshape_strategy)
+
 # scatter_add
 @_reg.register_compute("scatter_add")
 def compute_scatter_add(attrs, inputs, output_type):
@@ -515,6 +532,24 @@ def sparse_fill_empty_rows_func(attrs, inputs, _):
     return _sparse_fill_empty_rows_shape_func(inputs[0], inputs[2])
 
 
+@script
+def _sparse_reshape_shape_func(sparse_indices_shape, prev_shape_shape, new_shape_shape):
+    indices_shape = output_tensor((2,), "int64")
+    indices_shape[0] = int64(sparse_indices_shape[0])
+    indices_shape[1] = int64(new_shape_shape[0])
+    shape_tensor = output_tensor((1,), "int64")
+    shape_tensor[0] = int64(new_shape_shape[0])
+    return (indices_shape, shape_tensor)
+
+
+@_reg.register_shape_func("sparse_reshape", False)
+def sparse_reshape_shape_func(attrs, inputs, _):
+    """
+    Shape func for sparse_reshape.
+    """
+    return _sparse_reshape_shape_func(inputs[0], inputs[1], inputs[2])
+
+
 @script
 def _layout_transform_shape_func(
     data_shape, out_layout_len, dst_equal_list, dst_mul_list, dst_div_list, dst_mix_list
diff --git a/python/tvm/relay/op/strategy/cuda.py b/python/tvm/relay/op/strategy/cuda.py
index 3abc9c42b659..85bbab692574 100644
--- a/python/tvm/relay/op/strategy/cuda.py
+++ b/python/tvm/relay/op/strategy/cuda.py
@@ -764,6 +764,17 @@ def sparse_dense_strategy_cuda(attrs, inputs, out_type, target):
     return strategy
 
 
+@sparse_reshape_strategy.register(["cuda", "gpu"])
+def sparse_reshape_strategy_cuda(attrs, inputs, out_type, target):
+    strategy = _op.OpStrategy()
+    strategy.add_implementation(
+        wrap_compute_sparse_reshape(topi.cuda.sparse_reshape),
+        wrap_topi_schedule(topi.generic.schedule_extern),
+        name="sparse_reshape.cuda",
+    )
+    return strategy
+
+
 @sparse_dense_padded_strategy.register(["cuda", "gpu"])
 def sparse_dense_padded_strategy_cuda(attrs, inputs, out_type, target):
     """sparse dense cuda strategy"""
diff --git a/python/tvm/relay/op/strategy/generic.py b/python/tvm/relay/op/strategy/generic.py
index 8a2724dfb614..be86ea9d9184 100644
--- a/python/tvm/relay/op/strategy/generic.py
+++ b/python/tvm/relay/op/strategy/generic.py
@@ -1105,6 +1105,33 @@ def _compute_sparse_fill_empty_rows(attrs, inputs, output_type):
     return _compute_sparse_fill_empty_rows
 
 
+# sparse_reshape
+@override_native_generic_func("sparse_reshape_strategy")
+def sparse_reshape_strategy(attrs, outs, out_type, target):
+    strategy = _op.OpStrategy()
+    strategy.add_implementation(
+        wrap_compute_sparse_reshape(topi.sparse_reshape),
+        wrap_topi_schedule(topi.generic.schedule_extern),
+        name="sparse_reshape.generic",
+    )
+    return strategy
+
+
+def wrap_compute_sparse_reshape(topi_compute):
+    """Wrap sparse_reshape compute"""
+
+    def _compute_sparse_reshape(attrs, inputs, output_type):
+        return topi_compute(
+            inputs[0],
+            inputs[1],
+            inputs[2],
+            output_type.fields[0].shape,
+            output_type.fields[1].shape,
+        )
+
+    return _compute_sparse_reshape
+
+
 # roi_pool
 @generic_func
 def schedule_roi_pool(attrs, outs, target):
diff --git a/python/tvm/relay/op/transform.py b/python/tvm/relay/op/transform.py
index c0a0d31478ef..73508ddd2603 100644
--- a/python/tvm/relay/op/transform.py
+++ b/python/tvm/relay/op/transform.py
@@ -1410,6 +1410,46 @@ def sparse_fill_empty_rows(sparse_indices, sparse_values, dense_shape, default_v
     return Tuple((new_sparse_indices, new_sparse_values, empty_row_indicator))
 
 
+def sparse_reshape(sparse_indices, prev_shape, new_shape):
+    """
+    Reshape a Sparse Tensor. The sparse array is in COO format.
+
+    Parameters
+    ----------
+    sparse_indices : relay.Expr
+        A 2-D tensor[N, n_dim] of integers containing location of sparse values, where N is the
+        number of sparse values and n_dim is the number of dimensions of the dense_shape
+    prev_shape : relay.Expr
+        A 1-D tensor containing the previous shape of the dense tensor
+    new_shape : relay.Expr
+        A 1-D tensor containing the new shape of the dense tensor
+    Returns
+    -------
+    result: relay.Expr
+        Output tensor.
+    Examples
+    --------
+    .. code-block:: python
+        sparse_indices = [[0, 0, 0],
+                            [0, 0, 1],
+                            [0, 1, 0],
+                            [1, 0, 0],
+                            [1, 2, 3]]
+        prev_shape = [2, 3, 4]
+        new_shape = [9, -1]
+        new_sparse_indices, new_shape = relay.sparse_reshape(sparse_indices,
+                            prev_shape,
+                            new_shape)
+        new_sparse_indices = [[0, 0],
+                              [0, 1],
+                              [1, 2],
+                              [4, 2],
+                              [8, 1]]
+        new_shape = [9, 4]
+    """
+    return TupleWrapper(_make.sparse_reshape(sparse_indices, prev_shape, new_shape), 2)
+
+
 def cumsum(data, axis=None, dtype=None, exclusive=None):
     """Numpy style cumsum op. Return the cumulative inclusive sum of the elements along
     a given axis.
diff --git a/python/tvm/topi/__init__.py b/python/tvm/topi/__init__.py
index 63dc4bd4ab83..c196b33cf880 100644
--- a/python/tvm/topi/__init__.py
+++ b/python/tvm/topi/__init__.py
@@ -39,6 +39,7 @@
 from .sort import *
 from .scatter import *
 from .sparse_fill_empty_rows import *
+from .sparse_reshape import *
 from .scatter_add import *
 from .argwhere import *
 from .cumsum import *
diff --git a/python/tvm/topi/cuda/__init__.py b/python/tvm/topi/cuda/__init__.py
index df75c676fad3..52e64804d692 100644
--- a/python/tvm/topi/cuda/__init__.py
+++ b/python/tvm/topi/cuda/__init__.py
@@ -58,4 +58,5 @@
 from . import tensorcore_alter_op
 from .argwhere import *
 from .scan import *
+from .sparse_reshape import *
 from .unique import *
diff --git a/python/tvm/topi/cuda/sparse_reshape.py b/python/tvm/topi/cuda/sparse_reshape.py
new file mode 100644
index 000000000000..4476648e0aa4
--- /dev/null
+++ b/python/tvm/topi/cuda/sparse_reshape.py
@@ -0,0 +1,209 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=invalid-name, too-many-arguments, too-many-nested-blocks
+"""Sparse_Reshape operator"""
+import tvm
+from tvm import te
+from ...tir import decl_buffer, ir_builder, Cast
+from ...te import extern, div, floordiv, floormod
+from ..utils import ceil_div
+
+
+def sparse_reshape(
+    sparse_indices,
+    prev_shape,
+    new_shape,
+    new_sparse_indices_shape,
+    new_shape_shape,
+):
+    """
+    Reshape a Sparse Tensor
+    Parameters
+    ----------
+    sparse_indices : relay.Expr
+        A 2-D tensor[N, n_dim] of integers containing location of sparse values, where N is the
+        number of sparse values and n_dim is the number of dimensions of the dense_shape
+    prev_shape : relay.Expr
+        A 1-D tensor containing the previous shape of the dense tensor
+    new_shape : relay.Expr
+        A 1-D tensor containing the new shape of the dense tensor
+    Returns
+    -------
+    result: relay.Expr
+        Output tensor.
+    Examples
+    --------
+    .. code-block:: python
+        sparse_indices = [[0, 0, 0],
+                            [0, 0, 1],
+                            [0, 1, 0],
+                            [1, 0, 0],
+                            [1, 2, 3]]
+        prev_shape = [2, 3, 4]
+        new_shape = [9, -1]
+        new_sparse_indices, new_shape = relay.sparse_reshape(sparse_indices,
+                            prev_shape,
+                            new_shape)
+        new_sparse_indices = [[0, 0],
+                              [0, 1],
+                              [1, 2],
+                              [4, 2],
+                              [8, 1]]
+        new_shape = [9, 4]
+    """
+
+    def gen_ir(
+        sparse_indices_ptr,
+        prev_shape_ptr,
+        new_shape_ptr,
+        new_sparse_indices_ptr,
+        out_new_shape_ptr,
+    ):
+        ib = ir_builder.create()
+
+        sparse_indices = ib.buffer_ptr(sparse_indices_ptr)
+        prev_shape = ib.buffer_ptr(prev_shape_ptr)
+
+        new_shape = ib.buffer_ptr(new_shape_ptr)
+        out_new_shape = ib.buffer_ptr(out_new_shape_ptr)
+        new_sparse_indices = ib.buffer_ptr(new_sparse_indices_ptr)
+        out_new_shape = ib.buffer_ptr(out_new_shape_ptr)
+
+        prev_shape_size = prev_shape_ptr.shape[0]
+        new_shape_size = new_shape_ptr.shape[0]
+
+        multipliers = ib.allocate(
+            new_shape_ptr.dtype, (prev_shape_size,), name="multipliers", scope="global"
+        )
+        dividers = ib.allocate(
+            new_shape_ptr.dtype, (new_shape_size,), name="dividers", scope="global"
+        )
+        flattened_indices = ib.allocate(
+            new_shape_ptr.dtype,
+            (sparse_indices_ptr.shape[0],),
+            name="flattened_indices",
+            scope="global",
+        )
+        total_ele = ib.allocate(new_shape_ptr.dtype, (1,), name="total_ele", scope="global")
+        division_total_ele = ib.allocate(
+            new_shape_ptr.dtype, (1,), name="division_total_ele", scope="global"
+        )
+        equal_shape = ib.allocate("bool", (1,), name="equal_shape", scope="global")
+        max_threads = int(tvm.target.Target.current(allow_none=False).max_num_threads)
+        with ib.new_scope():
+            # The computation in this block is very very miniscule since we are just iterating over
+            # shape tensors which are very small (< 10) and there is no need of parallelization
+            nthread_tx = 1
+            nthread_bx = 1
+            tx = te.thread_axis("threadIdx.x")
+            bx = te.thread_axis("blockIdx.x")
+            ib.scope_attr(tx, "thread_extent", nthread_tx)
+            ib.scope_attr(bx, "thread_extent", nthread_bx)
+
+            total_ele[0] = prev_shape[0]
+
+            # Cumulative Reverse Exclusive Multiply
+            multipliers[prev_shape_size - 1] = Cast(new_shape_ptr.dtype, 1)
+            with ib.for_range(0, prev_shape_size - 1) as i_:
+                i = i_ + 1
+                multipliers[prev_shape_size - 1 - i] = (
+                    prev_shape[prev_shape_size - i] * multipliers[prev_shape_size - i]
+                )
+                total_ele[0] *= prev_shape[prev_shape_size - i]
+
+            division_total_ele[0] = Cast(new_shape_ptr.dtype, 1)
+            with ib.for_range(0, new_shape_size) as i:
+                with ib.if_scope(new_shape[i] != -1):
+                    division_total_ele[0] *= new_shape[i]
+
+            # Compute true output shape (replace negative ones)
+            with ib.for_range(0, new_shape_size) as i:
+                with ib.if_scope(new_shape[i] == -1):
+                    out_new_shape[i] = Cast(
+                        new_shape_ptr.dtype, div(total_ele[0], division_total_ele[0])
+                    )
+                with ib.else_scope():
+                    out_new_shape[i] = new_shape[i]
+
+            # Check if prev_shape and new_shape are equal
+            equal_shape[0] = True
+            with ib.if_scope(prev_shape_size == new_shape_size):
+                with ib.for_range(0, prev_shape_size) as i:
+                    with ib.if_scope(prev_shape[i] != out_new_shape[i]):
+                        equal_shape[0] = False
+            with ib.else_scope():
+                equal_shape[0] = False
+
+        with ib.new_scope():
+            nthread_tx = max_threads
+            nthread_bx = ceil_div(sparse_indices_ptr.shape[0], max_threads)
+            tx = te.thread_axis("threadIdx.x")
+            bx = te.thread_axis("blockIdx.x")
+            ib.scope_attr(tx, "thread_extent", nthread_tx)
+            ib.scope_attr(bx, "thread_extent", nthread_bx)
+
+            row_number = bx * max_threads + tx
+
+            # Return same inputs if shapes are equal
+            with ib.if_scope(equal_shape[0]):
+                with ib.if_scope(row_number < sparse_indices_ptr.shape[0]):
+                    with ib.for_range(0, sparse_indices_ptr.shape[1]) as j:
+                        new_sparse_indices[row_number, j] = sparse_indices[row_number, j]
+
+            # Else compute new_sparse_indices
+            with ib.else_scope():
+                dividers[new_shape_size - 1] = Cast(new_shape_ptr.dtype, 1)
+                with ib.for_range(0, new_shape_size - 1) as i_:
+                    i = i_ + 1
+                    dividers[new_shape_size - 1 - i] = (
+                        dividers[new_shape_size - i] * out_new_shape[new_shape_size - i]
+                    )
+
+                with ib.if_scope(row_number < sparse_indices_ptr.shape[0]):
+                    flattened_indices[row_number] = Cast(new_shape_ptr.dtype, 0)
+                    with ib.for_range(0, sparse_indices_ptr.shape[1]) as j:
+                        flattened_indices[row_number] += (
+                            sparse_indices[row_number, j] * multipliers[j]
+                        )
+
+                with ib.if_scope(row_number < sparse_indices_ptr.shape[0]):
+                    current_element = ib.allocate(
+                        new_shape_ptr.dtype, (1,), name="current_element", scope="local"
+                    )
+                    current_element[0] = flattened_indices[row_number]
+
+                    with ib.for_range(0, new_sparse_indices_ptr.shape[1]) as j:
+                        new_sparse_indices[row_number, j] = Cast(
+                            sparse_indices_ptr.dtype, floordiv(current_element[0], dividers[j])
+                        )
+                        current_element[0] = floormod(current_element[0], dividers[j])
+
+        return ib.get()
+
+    new_sparse_indices_buf = decl_buffer(
+        new_sparse_indices_shape, sparse_indices.dtype, "new_sparse_indices_buf"
+    )
+    new_shape_buf = decl_buffer(new_shape_shape, prev_shape.dtype, "new_shape_buf")
+
+    return extern(
+        [new_sparse_indices_shape, new_shape_shape],
+        [sparse_indices, prev_shape, new_shape],
+        lambda ins, outs: gen_ir(ins[0], ins[1], ins[2], outs[0], outs[1]),
+        out_buffers=[new_sparse_indices_buf, new_shape_buf],
+        name="sparse_reshape_cuda",
+        tag="sparse_reshape_cuda",
+    )
diff --git a/python/tvm/topi/sparse_reshape.py b/python/tvm/topi/sparse_reshape.py
new file mode 100644
index 000000000000..5535477e17c8
--- /dev/null
+++ b/python/tvm/topi/sparse_reshape.py
@@ -0,0 +1,185 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=invalid-name, too-many-arguments, too-many-nested-blocks
+"""Sparse_Reshape operator"""
+from ..tir import decl_buffer, ir_builder, Cast
+from ..te import extern, div, floordiv, floormod
+
+
+def sparse_reshape(
+    sparse_indices,
+    prev_shape,
+    new_shape,
+    new_sparse_indices_shape,
+    new_shape_shape,
+):
+    """
+    Reshape a Sparse Tensor
+    Parameters
+    ----------
+    sparse_indices : relay.Expr
+        A 2-D tensor[N, n_dim] of integers containing location of sparse values, where N is the
+        number of sparse values and n_dim is the number of dimensions of the dense_shape
+    prev_shape : relay.Expr
+        A 1-D tensor containing the previous shape of the dense tensor
+    new_shape : relay.Expr
+        A 1-D tensor containing the new shape of the dense tensor
+    Returns
+    -------
+    result: relay.Expr
+        Output tensor.
+    Examples
+    --------
+    .. code-block:: python
+        sparse_indices = [[0, 0, 0],
+                            [0, 0, 1],
+                            [0, 1, 0],
+                            [1, 0, 0],
+                            [1, 2, 3]]
+        prev_shape = [2, 3, 4]
+        new_shape = [9, -1]
+        new_sparse_indices, new_shape = relay.sparse_reshape(sparse_indices,
+                            prev_shape,
+                            new_shape)
+        new_sparse_indices = [[0, 0],
+                              [0, 1],
+                              [1, 2],
+                              [4, 2],
+                              [8, 1]]
+        new_shape = [9, 4]
+    """
+
+    def gen_ir(
+        sparse_indices_ptr,
+        prev_shape_ptr,
+        new_shape_ptr,
+        new_sparse_indices_ptr,
+        out_new_shape_ptr,
+    ):
+        ib = ir_builder.create()
+
+        sparse_indices = ib.buffer_ptr(sparse_indices_ptr)
+        prev_shape = ib.buffer_ptr(prev_shape_ptr)
+
+        new_shape = ib.buffer_ptr(new_shape_ptr)
+        out_new_shape = ib.buffer_ptr(out_new_shape_ptr)
+        new_sparse_indices = ib.buffer_ptr(new_sparse_indices_ptr)
+        out_new_shape = ib.buffer_ptr(out_new_shape_ptr)
+
+        prev_shape_size = prev_shape_ptr.shape[0]
+        new_shape_size = new_shape_ptr.shape[0]
+
+        multipliers = ib.allocate(
+            new_shape_ptr.dtype, (prev_shape_size,), name="multipliers", scope="local"
+        )
+        dividers = ib.allocate(
+            new_shape_ptr.dtype, (new_shape_size,), name="dividers", scope="local"
+        )
+        flattened_indices = ib.allocate(
+            new_shape_ptr.dtype,
+            (sparse_indices_ptr.shape[0],),
+            name="flattened_indices",
+            scope="local",
+        )
+
+        total_ele = ib.allocate(new_shape_ptr.dtype, (1,), name="total_ele", scope="local")
+        total_ele[0] = prev_shape[0]
+
+        # Cumulative Reverse Exclusive Multiply
+        multipliers[prev_shape_size - 1] = Cast(new_shape_ptr.dtype, 1)
+        with ib.for_range(0, prev_shape_size - 1) as i_:
+            i = i_ + 1
+            multipliers[prev_shape_size - 1 - i] = (
+                prev_shape[prev_shape_size - i] * multipliers[prev_shape_size - i]
+            )
+            total_ele[0] *= prev_shape[prev_shape_size - i]
+
+        division_total_ele = ib.allocate(
+            new_shape_ptr.dtype, (1,), name="division_total_ele", scope="local"
+        )
+        division_total_ele[0] = Cast(new_shape_ptr.dtype, 1)
+        with ib.for_range(0, new_shape_size) as i:
+            with ib.if_scope(new_shape[i] != -1):
+                division_total_ele[0] *= new_shape[i]
+
+        # Compute true output shape (replace negative ones)
+        with ib.for_range(0, new_shape_size) as i:
+            with ib.if_scope(new_shape[i] == -1):
+                out_new_shape[i] = Cast(
+                    new_shape_ptr.dtype, div(total_ele[0], division_total_ele[0])
+                )
+            with ib.else_scope():
+                out_new_shape[i] = new_shape[i]
+
+        equal_shape = ib.allocate("bool", (1,), name="equal_shape", scope="local")
+
+        # Check if prev_shape and new_shape are equal
+        equal_shape[0] = True
+        with ib.if_scope(prev_shape_size == new_shape_size):
+            with ib.for_range(0, prev_shape_size) as i:
+                with ib.if_scope(prev_shape[i] != out_new_shape[i]):
+                    equal_shape[0] = False
+        with ib.else_scope():
+            equal_shape[0] = False
+
+        # Return same inputs if shapes are equal
+        with ib.if_scope(equal_shape[0]):
+            with ib.for_range(0, sparse_indices_ptr.shape[0], kind="parallel") as i:
+                with ib.for_range(0, sparse_indices_ptr.shape[1]) as j:
+                    new_sparse_indices[i, j] = sparse_indices[i, j]
+
+        # Else compute new_sparse_indices
+        with ib.else_scope():
+            dividers[new_shape_size - 1] = Cast(new_shape_ptr.dtype, 1)
+            with ib.for_range(0, new_shape_size - 1) as i_:
+                i = i_ + 1
+                dividers[new_shape_size - 1 - i] = (
+                    dividers[new_shape_size - i] * out_new_shape[new_shape_size - i]
+                )
+
+            with ib.for_range(0, sparse_indices_ptr.shape[0], kind="parallel") as i:
+                flattened_indices[i] = Cast(new_shape_ptr.dtype, 0)
+                with ib.for_range(0, sparse_indices_ptr.shape[1]) as j:
+                    flattened_indices[i] += sparse_indices[i, j] * multipliers[j]
+
+            with ib.for_range(0, new_sparse_indices_ptr.shape[0], kind="parallel") as i:
+                current_element = ib.allocate(
+                    new_shape_ptr.dtype, (1,), name="current_element", scope="local"
+                )
+                current_element[0] = flattened_indices[i]
+
+                with ib.for_range(0, new_sparse_indices_ptr.shape[1]) as j:
+                    new_sparse_indices[i, j] = Cast(
+                        sparse_indices_ptr.dtype, floordiv(current_element[0], dividers[j])
+                    )
+                    current_element[0] = floormod(current_element[0], dividers[j])
+
+        return ib.get()
+
+    new_sparse_indices_buf = decl_buffer(
+        new_sparse_indices_shape, sparse_indices.dtype, "new_sparse_indices_buf"
+    )
+    new_shape_buf = decl_buffer(new_shape_shape, prev_shape.dtype, "new_shape_buf")
+
+    return extern(
+        [new_sparse_indices_shape, new_shape_shape],
+        [sparse_indices, prev_shape, new_shape],
+        lambda ins, outs: gen_ir(ins[0], ins[1], ins[2], outs[0], outs[1]),
+        out_buffers=[new_sparse_indices_buf, new_shape_buf],
+        name="sparse_reshape_cpu",
+        tag="sparse_reshape_cpu",
+    )
diff --git a/src/relay/op/tensor/transform.cc b/src/relay/op/tensor/transform.cc
index eae231fd8d06..941f43a5a2c4 100644
--- a/src/relay/op/tensor/transform.cc
+++ b/src/relay/op/tensor/transform.cc
@@ -1631,6 +1631,56 @@ RELAY_REGISTER_OP("sparse_fill_empty_rows")
     .set_support_level(3)
     .set_attr<TOpPattern>("TOpPattern", kOpaque);
 
+bool SparseReshapeRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
+                      const TypeReporter& reporter) {
+  // types: [sparse_indices, prev_shape, new_shape, result]
+  ICHECK_EQ(types.size(), 4) << "SparseReshapeRel expects 4 types but " << types.size()
+                             << " provided";
+  ICHECK_EQ(num_inputs, 3) << "SparseReshapeRel expects 4 inputs but " << num_inputs << " provided";
+  auto sparse_indices = types[0].as<TensorTypeNode>();
+  auto prev_shape = types[1].as<TensorTypeNode>();
+  auto new_shape = types[2].as<TensorTypeNode>();
+  if (sparse_indices == nullptr || prev_shape == nullptr || new_shape == nullptr) {
+    return false;
+  }
+  CHECK(sparse_indices->dtype.is_int()) << "sparse_indices must be tensor of integers";
+  CHECK(prev_shape->dtype.is_int()) << "prev_shape must be tensor of integers";
+  CHECK(new_shape->dtype.is_int()) << "new_shape must be tensor of integers";
+  ICHECK_EQ(sparse_indices->shape.size(), 2) << "sparse_indices must be 2-D tensor";
+  ICHECK_EQ(prev_shape->shape.size(), 1) << "prev_shape must be 1-D tensor";
+  ICHECK_EQ(new_shape->shape.size(), 1) << "new_shape must be 1-D tensor";
+  std::vector<Type> fields;
+  Array<PrimExpr> new_sparse_indices_shape{sparse_indices->shape[0], new_shape->shape[0]};
+  fields.push_back(TensorType(new_sparse_indices_shape, sparse_indices->dtype));
+  fields.push_back(TensorType(new_shape->shape, new_shape->dtype));
+  reporter->Assign(types[3], TupleType(Array<Type>(fields)));
+  return true;
+}
+
+Expr MakeSparseReshape(Expr sparse_indices, Expr prev_shape, Expr new_shape) {
+  static const Op& op = Op::Get("sparse_reshape");
+  return Call(op, {sparse_indices, prev_shape, new_shape}, Attrs(), {});
+}
+
+TVM_REGISTER_GLOBAL("relay.op._make.sparse_reshape").set_body_typed(MakeSparseReshape);
+
+RELAY_REGISTER_OP("sparse_reshape")
+    .describe(R"code(Return new sparse indices of the reshaped tensor
+)code" TVM_ADD_FILELINE)
+    .set_num_inputs(3)
+    .add_argument("sparse_indices", "Tensor",
+                  "A 2-D tensor of shape [N, ndims], which specifies the indices of the"
+                  "elements in the sparse tensor that contain nonzero values.  COO Format")
+    .add_argument("prev_shape", "Tensor",
+                  "A 1-D tensor of shape [ndims], which specifies the previous dense shape of the"
+                  "sparse tensor")
+    .add_argument("new_shape", "Tensor",
+                  "A 1-D tensor of shape [ndims], which specifies the desired dense shape of the"
+                  "sparse tensor")
+    .add_type_rel("sparse_reshape", SparseReshapeRel)
+    .set_attr<TOpPattern>("TOpPattern", kInjective)
+    .set_support_level(3);
+
 // meshgrid operator
 TVM_REGISTER_NODE_TYPE(MeshgridAttrs);
 
diff --git a/tests/python/frontend/tensorflow/test_forward.py b/tests/python/frontend/tensorflow/test_forward.py
index 8b146b6511ce..41145bf77218 100644
--- a/tests/python/frontend/tensorflow/test_forward.py
+++ b/tests/python/frontend/tensorflow/test_forward.py
@@ -1956,6 +1956,130 @@ def test_forward_sparse_fill_empty_rows(
 
 
 #######################################################################
+# SparseReshape
+# ------------
+
+
+def _test_sparse_reshape(indices_np, values_np, prev_shape_np, new_shape_np, use_dyn=False):
+    with tf.Graph().as_default():
+        if use_dyn:
+            indices = tf.placeholder(shape=(None, None), dtype=indices_np.dtype, name="indices")
+            values = tf.placeholder(shape=(None), dtype=values_np.dtype, name="values")
+            prev_shape = tf.placeholder(shape=(None), dtype=prev_shape_np.dtype, name="prev_shape")
+            new_shape = tf.placeholder(shape=(None), dtype=new_shape_np.dtype, name="new_shape")
+        else:
+            indices = tf.placeholder(shape=indices_np.shape, dtype=indices_np.dtype, name="indices")
+            values = tf.placeholder(shape=values_np.shape, dtype=values_np.dtype, name="values")
+            prev_shape = tf.placeholder(
+                shape=prev_shape_np.shape, dtype=prev_shape_np.dtype, name="prev_shape"
+            )
+            new_shape = tf.placeholder(
+                shape=new_shape_np.shape, dtype=new_shape_np.dtype, name="new_shape"
+            )
+        sp_input = tf.sparse.SparseTensor(indices=indices, values=values, dense_shape=prev_shape)
+
+        _ = tf.sparse.reshape(sp_input, new_shape, name="sparse_reshape")
+        compare_tf_with_tvm(
+            [indices_np, values_np, prev_shape_np, new_shape_np],
+            [indices.name, values.name, prev_shape.name, new_shape.name],
+            ["sparse_reshape:0", "sparse_reshape:1", "sparse_reshape/Identity:0"],
+            mode="vm",
+        )
+
+
+@pytest.mark.parametrize(
+    "sparse_indices_np, sparse_values_np, prev_shape_np, new_shape_np",
+    [
+        (
+            np.ones((0, 1), dtype=np.int64),
+            np.array([], dtype=np.int64),
+            np.array([4], dtype=np.int64),
+            np.array([2, -1], dtype=np.int64),
+        ),
+        (
+            np.ones((0, 1), dtype=np.int64),
+            np.array([], dtype=np.int64),
+            np.array([4], dtype=np.int64),
+            np.array([2, 2], dtype=np.int64),
+        ),
+        (
+            np.ones((0, 2), dtype=np.int64),
+            np.array([], dtype=np.int64),
+            np.array([3, 6], dtype=np.int64),
+            np.array([-1, 2], dtype=np.int64),
+        ),
+        (
+            np.array([[0, 0, 0], [0, 0, 1], [0, 1, 0], [1, 0, 0], [1, 2, 3]], dtype=np.int64),
+            np.array([7, 5, 6, 3, 9], dtype=np.int64),
+            np.array([2, 3, 6], dtype=np.int64),
+            np.array([-1, 9], dtype=np.int64),
+        ),
+        (
+            np.array(
+                [
+                    [0, 0, 0, 0, 0],
+                    [0, 0, 1, 2, 3],
+                    [0, 1, 0, 3, 5],
+                    [1, 0, 0, 4, 6],
+                    [1, 2, 3, 6, 8],
+                ],
+                dtype=np.int64,
+            ),
+            np.array([7, 5, 6, 3, 9], dtype=np.int64),
+            np.array([2, 3, 6, 7, 9], dtype=np.int64),
+            np.array([9, -1, 7], dtype=np.int64),
+        ),
+        (
+            np.array([[0, 0], [0, 1], [3, 4], [4, 3], [7, 3]], dtype=np.int64),
+            np.array([7, 5, 6, 3, 9], dtype=np.int64),
+            np.array([9, 4], dtype=np.int64),
+            np.array([-1], dtype=np.int64),
+        ),
+        (
+            np.array([[0], [5], [10], [20], [24]], dtype=np.int64),
+            np.array([7, 5, 6, 3, 9], dtype=np.int64),
+            np.array([25], dtype=np.int64),
+            np.array([5, 5], dtype=np.int64),
+        ),
+        (
+            np.array([[0, 100], [200, 100], [300, 400], [50, 20], [400, 50]], dtype=np.int64),
+            np.array([7, 5, 6, 3, 9], dtype=np.int64),
+            np.array([500, 20], dtype=np.int64),
+            np.array([500, 20], dtype=np.int64),
+        ),
+        (
+            np.array([[0, 100], [200, 100], [300, 400], [50, 20], [400, 50]], dtype=np.int64),
+            np.array([7, 5, 6, 3, 9], dtype=np.int64),
+            np.array([500, 20], dtype=np.int64),
+            np.array([500, -1], dtype=np.int64),
+        ),
+        (
+            np.array([[0, 100], [200, 100], [300, 400], [50, 20], [400, 50]], dtype=np.int64),
+            np.array([7, 5, 6, 3, 9], dtype=np.int64),
+            np.array([500, 20], dtype=np.int64),
+            np.array([250, 40], dtype=np.int64),
+        ),
+    ],
+)
+@pytest.mark.parametrize("use_dyn", [True, False])
+def test_forward_sparse_reshape(
+    sparse_indices_np, sparse_values_np, prev_shape_np, new_shape_np, use_dyn
+):
+    """ sparse_reshape op test"""
+    ###################################################################
+    #
+    # In order to create a SparseTensor, it requires 3 input as below:
+    #    SparseTensor(indices=[[0, 0], [1, 2]], values=[1, 2], dense_shape=[3, 4])
+    #
+    # Above Sparse can be represented in Dense as below :
+    #    [[1, 0, 0, 0]
+    #     [0, 0, 2, 0]
+    #     [0, 0, 0, 0]]
+    #
+    # ------------------------------------------------------------------
+    _test_sparse_reshape(sparse_indices_np, sparse_values_np, prev_shape_np, new_shape_np, use_dyn)
+
+
 # tensorflow.compat.v1.sparse_to_dense
 # ---------------
 def _test_sparse_to_dense(sparse_indices, sparse_values, default_value, output_shape):
diff --git a/tests/python/relay/test_op_level3.py b/tests/python/relay/test_op_level3.py
index ee55b532218d..c9ed975c3b9b 100644
--- a/tests/python/relay/test_op_level3.py
+++ b/tests/python/relay/test_op_level3.py
@@ -1311,6 +1311,229 @@ def verify_sparse_to_dense(sparse_indices, sparse_values, default_value, output_
     # verify_sparse_to_dense([[[[0, 1, 4], [0, 2, 4]]]], [[[[3.1, 3.1, 3.1]]]], 3.5, [5], [3.1, 3.1, 3.5, 3.5, 3.1])
 
 
+@tvm.testing.uses_gpu
+@pytest.mark.parametrize(
+    "sparse_indices_np, sparse_values_np, prev_shape_np, new_shape_np",
+    [
+        (
+            np.array([[0, 0, 0], [0, 0, 1], [0, 1, 0], [1, 0, 0], [1, 2, 3]], dtype=np.int32),
+            np.array([7, 5, 6, 3, 9], dtype=np.int32),
+            np.array([2, 3, 6], dtype=np.int32),
+            np.array([9, -1], dtype=np.int32),
+        ),
+        (
+            np.array(
+                [[0, 0, 0, 0], [0, 0, 1, 2], [0, 1, 0, 3], [1, 0, 0, 4], [1, 2, 3, 6]],
+                dtype=np.int64,
+            ),
+            np.array([7, 5, 6, 3, 9], dtype=np.int64),
+            np.array([2, 3, 6, 7], dtype=np.int64),
+            np.array([9, -1, 7], dtype=np.int64),
+        ),
+        (
+            np.array(
+                [
+                    [0, 0, 0, 0, 0],
+                    [0, 0, 1, 2, 3],
+                    [0, 1, 0, 3, 5],
+                    [1, 0, 0, 4, 6],
+                    [1, 2, 3, 6, 8],
+                ],
+                dtype=np.int64,
+            ),
+            np.array([7, 5, 6, 3, 9], dtype=np.int64),
+            np.array([2, 3, 6, 7, 9], dtype=np.int64),
+            np.array([9, -1, 7], dtype=np.int64),
+        ),
+        (
+            np.array([[0, 0], [0, 1], [3, 4], [4, 3], [7, 3]], dtype=np.int32),
+            np.array([7, 5, 6, 3, 9], dtype=np.int32),
+            np.array([9, 4], dtype=np.int32),
+            np.array([2, -1, 6], dtype=np.int32),
+        ),
+        (
+            np.array([[0, 0], [0, 1], [3, 4], [4, 3], [7, 3]], dtype=np.int64),
+            np.array([7, 5, 6, 3, 9], dtype=np.int64),
+            np.array([9, 4], dtype=np.int64),
+            np.array([-1], dtype=np.int64),
+        ),
+        (
+            np.array([[0], [5], [10], [20], [24]], dtype=np.int32),
+            np.array([7, 5, 6, 3, 9], dtype=np.int32),
+            np.array([25], dtype=np.int32),
+            np.array([5, 5], dtype=np.int32),
+        ),
+        (
+            np.array([[0, 100], [200, 100], [300, 400], [50, 20], [400, 50]], dtype=np.int64),
+            np.array([7, 5, 6, 3, 9], dtype=np.int64),
+            np.array([500, 20], dtype=np.int64),
+            np.array([500, 20], dtype=np.int64),
+        ),
+        (
+            np.array([[0, 100], [200, 100], [300, 400], [50, 20], [400, 50]], dtype=np.int32),
+            np.array([7, 5, 6, 3, 9], dtype=np.int32),
+            np.array([500, 20], dtype=np.int32),
+            np.array([500, -1], dtype=np.int32),
+        ),
+        (
+            np.array([[0, 100], [200, 100], [300, 400], [50, 20], [400, 50]], dtype=np.int64),
+            np.array([7, 5, 6, 3, 9], dtype=np.int64),
+            np.array([500, 20], dtype=np.int64),
+            np.array([250, 40], dtype=np.int64),
+        ),
+        (
+            np.ones((0, 1), dtype=np.int32),
+            np.array([], dtype=np.int32),
+            np.array([4], dtype=np.int32),
+            np.array([2, -1], dtype=np.int32),
+        ),
+        (
+            np.ones((0, 1), dtype=np.int64),
+            np.array([], dtype=np.int64),
+            np.array([4], dtype=np.int64),
+            np.array([2, 2], dtype=np.int64),
+        ),
+        (
+            np.ones((0, 2), dtype=np.int32),
+            np.array([], dtype=np.int32),
+            np.array([3, 6], dtype=np.int32),
+            np.array([-1, 2], dtype=np.int32),
+        ),
+    ],
+)
+@pytest.mark.parametrize("use_dyn", [True, False])
+def test_sparse_reshape(sparse_indices_np, sparse_values_np, prev_shape_np, new_shape_np, use_dyn):
+    def ref_sparse_reshape(
+        sparse_indices: np.ndarray,
+        prev_shape: np.ndarray,
+        new_shape: np.ndarray,
+    ):
+        """
+        This function calculates the expected output of sparseshape operator given the inputs.
+        """
+
+        new_sparse_indices = np.ones(
+            (sparse_indices.shape[0], new_shape.shape[0]), dtype=sparse_indices.dtype
+        )
+        multipliers = np.ones(prev_shape.shape[0])
+        dividers = np.ones(new_shape.shape[0])
+        total_ele = np.prod(prev_shape)
+        division_total_ele = 1
+        for i in range(new_shape.shape[0]):
+            if new_shape[i] == -1:
+                continue
+            division_total_ele *= new_shape[i]
+        for i in range(prev_shape.shape[0] - 2, -1, -1):
+            multipliers[i] = prev_shape[i + 1] * multipliers[i + 1]
+
+        for i in range(len(new_shape)):
+            if new_shape[i] == -1:
+                new_shape[i] = total_ele // division_total_ele
+
+        if np.array_equal(prev_shape, new_shape):
+            return sparse_indices, prev_shape
+
+        for i in range(new_shape.shape[0] - 2, -1, -1):
+            dividers[i] = new_shape[i + 1] * dividers[i + 1]
+
+        for row_num, sparse_row in enumerate(sparse_indices):
+            flat_idx = 0
+            if len(sparse_indices.shape) != 1:
+                for i, ele in enumerate(sparse_row):
+                    flat_idx += sparse_row[i] * multipliers[i]
+            else:
+                flat_idx += sparse_row
+            if len(new_sparse_indices.shape) != 1:
+                for i in range(new_sparse_indices.shape[1]):
+                    new_sparse_indices[row_num][i] = flat_idx // dividers[i]
+                    flat_idx = flat_idx % dividers[i]
+            else:
+                new_sparse_indices[row_num] = flat_idx
+
+        return new_sparse_indices, new_shape
+
+    def verify_sparse_reshape(
+        sparse_indices_np: np.ndarray,
+        sparse_values_np: np.ndarray,
+        prev_shape_np: np.ndarray,
+        new_shape_np: np.ndarray,
+    ):
+        """
+        This function verifies the relay output of sparse_reshape with its expected output.
+        """
+        if use_dyn:
+            sparse_indices = relay.var(
+                "sparse_indices",
+                shape=[relay.Any(), relay.Any()],
+                dtype=str(sparse_indices_np.dtype),
+            )
+            prev_shape = relay.var(
+                "prev_shape",
+                shape=[relay.Any()],
+                dtype=str(prev_shape_np.dtype),
+            )
+            new_shape = relay.var(
+                "new_shape",
+                shape=[relay.Any()],
+                dtype=str(new_shape_np.dtype),
+            )
+        else:
+            sparse_indices = relay.var(
+                "sparse_indices",
+                relay.TensorType(sparse_indices_np.shape, str(sparse_indices_np.dtype)),
+            )
+            prev_shape = relay.var(
+                "prev_shape", relay.TensorType(prev_shape_np.shape, str(prev_shape_np.dtype))
+            )
+            new_shape = relay.var(
+                "new_shape", relay.TensorType(new_shape_np.shape, str(new_shape_np.dtype))
+            )
+        z = relay.op.sparse_reshape(sparse_indices, prev_shape, new_shape).astuple()
+
+        func = relay.Function([sparse_indices, prev_shape, new_shape], z)
+
+        ref_res = ref_sparse_reshape(sparse_indices_np, prev_shape_np, new_shape_np)
+        outputs = run_infer_type(z)
+        new_sparse_indices_infer_type, new_shape_infer_type = (
+            outputs.checked_type.fields[0].dtype,
+            outputs.checked_type.fields[1].dtype,
+        )
+
+        assert new_sparse_indices_infer_type == sparse_indices_np.dtype
+        assert new_shape_infer_type == new_shape_np.dtype
+        verify_func(
+            func,
+            [sparse_indices_np, prev_shape_np, new_shape_np],
+            ref_res,
+        )
+
+    verify_sparse_reshape(
+        sparse_indices_np,
+        sparse_values_np,
+        prev_shape_np,
+        new_shape_np,
+    )
+
+
+def verify_func(func, data, ref_res, target_ctx=tvm.testing.enabled_targets()):
+    assert isinstance(data, list)
+    for target, ctx in target_ctx:
+        for kind in ["vm"]:
+            mod = tvm.ir.IRModule.from_expr(func)
+            intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target)
+            op_res = intrp.evaluate()(*data)
+            if isinstance(op_res, tvm.runtime.container.ADT):
+                assert len(op_res) == len(
+                    ref_res
+                ), "Outputs from TVM and Python implementation must be equal "
+
+                for op_result, ref_result in zip(op_res, ref_res):
+                    tvm.testing.assert_allclose(op_result.asnumpy(), ref_result, rtol=1e-5)
+            else:
+                tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5)
+            relay.backend.compile_engine.get().clear()
+
+
 @tvm.testing.uses_gpu
 def test_adv_index():
     def verify_adv_index(data_shape, index_shapes):

From 5c5aea620bb940fd4fb7106602ae51111c7af03a Mon Sep 17 00:00:00 2001
From: Lily Orth-Smith <lilyorthsmith@gmail.com>
Date: Fri, 26 Feb 2021 16:00:54 -0500
Subject: [PATCH 245/357] [BUG_FIX][TOPI] Allow topi resize to accept more
 options (#7532)

* Make topi more permissive

* Remove testing stuff

* lint

* Downsampling tests
---
 python/tvm/topi/image/resize.py             |  6 +-----
 tests/python/frontend/onnx/test_forward.py  | 12 ++++++++++++
 tests/python/relay/test_op_level5.py        | 20 +++++++++++++-------
 tests/python/topi/python/test_topi_image.py | 20 +++++++++++---------
 4 files changed, 37 insertions(+), 21 deletions(-)

diff --git a/python/tvm/topi/image/resize.py b/python/tvm/topi/image/resize.py
index 103850de4923..433a92008b6e 100644
--- a/python/tvm/topi/image/resize.py
+++ b/python/tvm/topi/image/resize.py
@@ -653,11 +653,7 @@ def resize(
         or 5-D with shape [batch, channel-major, in_height*scale, in_width*scale, channel-minor]
     """
     method = method.lower()
-    if method == "nearest_neighbor" and coordinate_transformation_mode != "asymmetric":
-        raise ValueError(
-            "Topi Resize does not support the combination of method %s "
-            "and coordinate_transformation_mode %s" % (method, coordinate_transformation_mode)
-        )
+
     if layout == "NHWC":
         in_n, in_h, in_w, in_c = data.shape
         if output_shape is None:
diff --git a/tests/python/frontend/onnx/test_forward.py b/tests/python/frontend/onnx/test_forward.py
index 9e5911791481..8dbd04951f21 100644
--- a/tests/python/frontend/onnx/test_forward.py
+++ b/tests/python/frontend/onnx/test_forward.py
@@ -3356,15 +3356,27 @@ def verify(ishape, oshape, scales, mode, coord_trans):
 
     # upsampling
     verify([1, 16, 32, 32], [1, 16, 64, 64], [], "nearest", "asymmetric")
+    verify([1, 16, 32, 32], [1, 16, 64, 64], [], "linear", "asymmetric")
+    verify([1, 16, 32, 32], [1, 16, 64, 64], [], "nearest", "align_corners")
     verify([1, 16, 32, 32], [1, 16, 64, 64], [], "linear", "align_corners")
+    verify([1, 16, 32, 32], [1, 16, 64, 64], [], "nearest", "half_pixel")
     verify([1, 16, 32, 32], [1, 16, 64, 64], [], "linear", "half_pixel")
+
     # downsampling
     verify([1, 16, 32, 32], [1, 16, 16, 16], [], "nearest", "asymmetric")
+    verify([1, 16, 32, 32], [1, 16, 16, 16], [], "linear", "asymmetric")
+    verify([1, 16, 32, 32], [1, 16, 16, 16], [], "nearest", "align_corners")
     verify([1, 16, 32, 32], [1, 16, 16, 16], [], "linear", "align_corners")
+    verify([1, 16, 32, 32], [1, 16, 16, 16], [], "nearest", "half_pixel")
     verify([1, 16, 32, 32], [1, 16, 16, 16], [], "linear", "half_pixel")
+
     # scales are specified instead of sizes
     verify([1, 16, 32, 32], [], [1, 1, 2, 2], "nearest", "asymmetric")
+    verify([1, 16, 32, 32], [], [1, 1, 2, 2], "linear", "asymmetric")
+    verify([1, 16, 32, 32], [], [1, 1, 2, 2], "nearest", "align_corners")
+    verify([1, 16, 32, 32], [], [1, 1, 2, 2], "linear", "align_corners")
     verify([1, 16, 32, 32], [], [1, 1, 0.5, 0.5], "linear", "half_pixel")
+    verify([1, 16, 32, 32], [], [1, 1, 0.5, 0.5], "nearest", "half_pixel")
 
     def verify_opset_10(ishape, scales, mode):
         nodes = [
diff --git a/tests/python/relay/test_op_level5.py b/tests/python/relay/test_op_level5.py
index 87f3ab87989b..929764b6e40a 100644
--- a/tests/python/relay/test_op_level5.py
+++ b/tests/python/relay/test_op_level5.py
@@ -67,13 +67,19 @@ def verify_resize(dshape, scale, method, layout, coord_trans):
             for kind in ["graph", "debug"]:
                 intrp = relay.create_executor(kind, ctx=ctx, target=target)
                 op_res = intrp.evaluate(func)(x_data)
-                tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-4, atol=1e-5)
-
-    for layout in ["NHWC", "NCHW"]:
-        verify_resize((1, 4, 4, 4), 2, "bilinear", layout, "align_corners")
-        verify_resize((2, 8, 17, 20), 3, "bilinear", layout, "half_pixel")
-        verify_resize((2, 8, 17, 20), 3, "bilinear", layout, "asymmetric")
-        verify_resize((3, 4, 5, 6), 5, "nearest_neighbor", layout, "asymmetric")
+                tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-3, atol=1e-4)
+
+    for method in ["nearest_neighbor", "bilinear"]:
+        for coord_trans in ["asymmetric", "half_pixel", "align_corners"]:
+            for layout in ["NHWC", "NCHW"]:
+                # TODO: Topi test does not have a function to produce numpy output for resize with
+                # nearest_neighbors and align_corners. Enable when topi test has this option
+                if coord_trans == "align_corners" and method == "nearest_neighbor":
+                    continue
+                verify_resize((1, 4, 4, 4), 2, method, layout, coord_trans)
+                verify_resize((2, 8, 17, 20), 3, method, layout, coord_trans)
+                verify_resize((2, 8, 17, 20), 3, method, layout, coord_trans)
+                verify_resize((3, 4, 5, 6), 5, method, layout, coord_trans)
 
 
 def test_resize3d_infer_type():
diff --git a/tests/python/topi/python/test_topi_image.py b/tests/python/topi/python/test_topi_image.py
index 518ee1f32676..c605df7037e4 100644
--- a/tests/python/topi/python/test_topi_image.py
+++ b/tests/python/topi/python/test_topi_image.py
@@ -59,6 +59,9 @@ def verify_resize(
             a_np, (out_height, out_width), layout, coord_trans
         )
     else:
+        # TODO: Nearest neighbor case doesn't do anything with coordinate transform mode, and also
+        # nearest_neighbors and align_corners combination in topi doesn't match the output of this
+        # function.
         scale_h = out_height / in_height
         scale_w = out_width / in_width
         b_np = tvm.topi.testing.upsampling_python(a_np, (scale_h, scale_w), layout)
@@ -88,15 +91,14 @@ def test_resize():
     verify_resize(4, 16, 32, 32, 50, 50, "NHWC")
     # Scale NHWC + Align Corners
     verify_resize(6, 32, 64, 64, 20, 20, "NHWC")
-    # Nearest + Fractional
-    verify_resize(4, 16, 32, 32, 50, 50, "NCHW", "asymmetric", method="nearest_neighbor")
-    verify_resize(4, 16, 32, 32, 50, 50, "NHWC", "asymmetric", method="nearest_neighbor")
-    # half_pixel
-    verify_resize(4, 16, 16, 16, 32, 32, "NCHW", "half_pixel", method="bilinear")
-    verify_resize(4, 16, 16, 16, 32, 32, "NHWC", "half_pixel", method="bilinear")
-    # Bilinear + Fractional
-    verify_resize(4, 16, 32, 32, 50, 50, "NCHW", "asymmetric", method="bilinear")
-    verify_resize(4, 16, 32, 32, 50, 50, "NHWC", "asymmetric", method="bilinear")
+    for method in ["nearest_neighbor", "bilinear"]:
+        for coord_trans in ["asymmetric", "half_pixel", "align_corners"]:
+            for layout in ["NCHW", "NHWC"]:
+                # TODO: When topi test has an option for align corners and nearest neighbor that
+                # produces correct results, re-enable it.
+                if coord_trans == "align_corners" and method == "nearest_neighbor":
+                    continue
+                verify_resize(4, 16, 32, 32, 50, 50, layout, coord_trans, method=method)
 
 
 def verify_resize3d(

From 2d5747054ca05a0863236b317e2fed281b455a00 Mon Sep 17 00:00:00 2001
From: Matthew Brookhart <mbrookhart@octoml.ai>
Date: Fri, 26 Feb 2021 14:05:22 -0700
Subject: [PATCH 246/357] [ONNX]fix datatype on Reciprocal op (#7519)

* fix datatype on Reciprocal op

* clean up test case
---
 python/tvm/relay/frontend/onnx.py          |  3 ++-
 tests/python/frontend/onnx/test_forward.py | 11 +++++++----
 2 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/python/tvm/relay/frontend/onnx.py b/python/tvm/relay/frontend/onnx.py
index 58c2dbcad26a..860753d6cd0b 100644
--- a/python/tvm/relay/frontend/onnx.py
+++ b/python/tvm/relay/frontend/onnx.py
@@ -839,7 +839,8 @@ class Reciprocal(OnnxOpConverter):
 
     @classmethod
     def _impl_v1(cls, inputs, attr, params):
-        return _expr.const(1.0) / inputs[0]
+        dtype = infer_type(inputs[0]).checked_type.dtype
+        return _expr.const(1.0, dtype=dtype) / inputs[0]
 
 
 class Flatten(OnnxOpConverter):
diff --git a/tests/python/frontend/onnx/test_forward.py b/tests/python/frontend/onnx/test_forward.py
index 8dbd04951f21..1e1341640ea0 100644
--- a/tests/python/frontend/onnx/test_forward.py
+++ b/tests/python/frontend/onnx/test_forward.py
@@ -1830,23 +1830,26 @@ def test_unary_ops():
     dtype = "float32"
     out_shape = in_shape
 
-    def verify_unary_ops(op, x, rtol=1e-5, atol=1e-5):
+    def verify_unary_ops(op, x, rtol=1e-5, atol=1e-5, dtype="float32"):
+        x = x.astype(dtype)
+        ONNX_DTYPE = mapping.NP_TYPE_TO_TENSOR_TYPE[np.dtype(dtype)]
         z = helper.make_node(op, ["in1"], ["out"])
         graph = helper.make_graph(
             [z],
             "_test",
             inputs=[
-                helper.make_tensor_value_info("in1", TensorProto.FLOAT, list(in_shape)),
+                helper.make_tensor_value_info("in1", ONNX_DTYPE, list(in_shape)),
             ],
-            outputs=[helper.make_tensor_value_info("out", TensorProto.FLOAT, list(out_shape))],
+            outputs=[helper.make_tensor_value_info("out", ONNX_DTYPE, list(out_shape))],
         )
         model = helper.make_model(graph, producer_name="_test")
         verify_with_ort_with_inputs(model, [x], rtol=rtol, atol=atol)
 
-    x = np.random.uniform(size=in_shape).astype(dtype)
+    x = np.random.uniform(size=in_shape)
     verify_unary_ops("Neg", x)
     verify_unary_ops("Abs", x)
     verify_unary_ops("Reciprocal", x)
+    verify_unary_ops("Reciprocal", x, dtype="float16")
     verify_unary_ops("Sqrt", x)
     verify_unary_ops("Relu", x)
     verify_unary_ops("Exp", x)

From 74ca8f03f4cdffa8f4d31719942f5a48988ef906 Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Fri, 26 Feb 2021 17:30:58 -0500
Subject: [PATCH 247/357] [CI] Move ci-cpu to use llvm-11 (#7541)

* [CI] Move ci-cpu to use llvm-11

* Fix the testcase of x86 codegen by relax the register names.
---
 .../unittest/test_target_codegen_x86.py       | 21 +++++++------------
 tests/scripts/task_config_build_cpu.sh        |  2 +-
 2 files changed, 8 insertions(+), 15 deletions(-)

diff --git a/tests/python/unittest/test_target_codegen_x86.py b/tests/python/unittest/test_target_codegen_x86.py
index b581f72ec763..ec42e0a4d749 100644
--- a/tests/python/unittest/test_target_codegen_x86.py
+++ b/tests/python/unittest/test_target_codegen_x86.py
@@ -52,21 +52,14 @@ def fp16_to_fp32(target, width, match=None, not_match=None):
             not_matches = [l for l in assembly if re.search(not_match, l)]
             assert not not_matches
 
-    fp16_to_fp32(
-        "llvm -mcpu=skylake-avx512", 15, match="vcvtph2ps.*ymm", not_match="vcvtph2ps.*zmm"
-    )
-    fp16_to_fp32("llvm -mcpu=skylake-avx512", 16, match="vcvtph2ps.*zmm")
-    fp16_to_fp32("llvm -mcpu=skylake-avx512", 17, match="vcvtph2ps.*zmm")
-    fp16_to_fp32("llvm -mcpu=skylake-avx512", 49, match="vcvtph2ps.*zmm")
-    fp16_to_fp32(
-        "llvm -mcpu=skylake-avx512 -mattr=-avx512f",
-        49,
-        match="vcvtph2ps.*ymm",
-        not_match="vcvtph2ps.*zmm",
-    )
+    fp16_to_fp32("llvm -mcpu=skylake-avx512", 15, match="vcvtph2ps.*mm")
+    fp16_to_fp32("llvm -mcpu=skylake-avx512", 16, match="vcvtph2ps.*mm")
+    fp16_to_fp32("llvm -mcpu=skylake-avx512", 17, match="vcvtph2ps.*mm")
+    fp16_to_fp32("llvm -mcpu=skylake-avx512", 49, match="vcvtph2ps.*mm")
+    fp16_to_fp32("llvm -mcpu=skylake-avx512 -mattr=-avx512f", 49, match="vcvtph2ps.*mm")
     fp16_to_fp32("llvm -mcpu=skylake-avx512 -mattr=-f16c,-avx512f", 49, not_match="vcvtph2ps")
-    fp16_to_fp32("llvm -mcpu=core-avx2", 8, match="vcvtph2ps.*ymm")
-    fp16_to_fp32("llvm -mcpu=core-avx2", 9, match="vcvtph2ps.*ymm")
+    fp16_to_fp32("llvm -mcpu=core-avx2", 8, match="vcvtph2ps.*mm")
+    fp16_to_fp32("llvm -mcpu=core-avx2", 9, match="vcvtph2ps.*mm")
     fp16_to_fp32("llvm", 9, not_match="vcvtph2ps")
 
 
diff --git a/tests/scripts/task_config_build_cpu.sh b/tests/scripts/task_config_build_cpu.sh
index 9ddf1778ff9f..db636063b9e3 100755
--- a/tests/scripts/task_config_build_cpu.sh
+++ b/tests/scripts/task_config_build_cpu.sh
@@ -30,7 +30,7 @@ echo set\(USE_GRAPH_RUNTIME_DEBUG ON\) >> config.cmake
 echo set\(USE_VM_PROFILER ON\) >> config.cmake
 echo set\(USE_DNNL_CODEGEN ON\) >> config.cmake
 echo set\(USE_ARM_COMPUTE_LIB ON\) >> config.cmake
-echo set\(USE_LLVM llvm-config-10\) >> config.cmake
+echo set\(USE_LLVM llvm-config-11\) >> config.cmake
 echo set\(USE_NNPACK ON\) >> config.cmake
 echo set\(NNPACK_PATH /NNPACK/build/\) >> config.cmake
 echo set\(USE_ANTLR ON\) >> config.cmake

From 0758337b2be12d9f839e0799795050d4c1c33545 Mon Sep 17 00:00:00 2001
From: Andrew Reusch <areusch@octoml.ai>
Date: Fri, 26 Feb 2021 17:31:18 -0800
Subject: [PATCH 248/357] Add create_local_debug_runtime to micro exports
 (#7528)

* Add create_local_debug_runtime to micro exports.

* retrigger CI
---
 python/tvm/micro/__init__.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/python/tvm/micro/__init__.py b/python/tvm/micro/__init__.py
index 299b143eb5de..8e5807acec94 100644
--- a/python/tvm/micro/__init__.py
+++ b/python/tvm/micro/__init__.py
@@ -23,5 +23,10 @@
 from .debugger import GdbRemoteDebugger
 from .micro_library import MicroLibrary
 from .micro_binary import MicroBinary
-from .session import create_local_graph_runtime, Session, SessionTerminatedError
+from .session import (
+    create_local_graph_runtime,
+    create_local_debug_runtime,
+    Session,
+    SessionTerminatedError,
+)
 from .transport import TransportLogger, DebugWrapperTransport, SubprocessTransport

From f6d0feef52f7b4c037150f405c37e693b4884f7f Mon Sep 17 00:00:00 2001
From: Andrew Reusch <areusch@octoml.ai>
Date: Fri, 26 Feb 2021 17:32:43 -0800
Subject: [PATCH 249/357] Don't run non-tvm_op GraphRuntime nodes in Debug
 Runtime over RPC. (#7512)

* Don't run non-tvm_op GraphRuntime nodes in Debug Runtime over RPC.

 * These are filtered out in SetupOpExecs for normal debug runtime
 operation.

* retrigger CI

* retrigger CI

* address tkonolige comment
---
 src/runtime/graph/debug/graph_runtime_debug.cc | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/src/runtime/graph/debug/graph_runtime_debug.cc b/src/runtime/graph/debug/graph_runtime_debug.cc
index 0b8f39dd9f94..93bdd065c9d9 100644
--- a/src/runtime/graph/debug/graph_runtime_debug.cc
+++ b/src/runtime/graph/debug/graph_runtime_debug.cc
@@ -110,6 +110,19 @@ class GraphRuntimeDebug : public GraphRuntime {
   }
 
   double RunOpRPC(int index, int number, int repeat, int min_repeat_ms) {
+    // Right now we expect either "tvm_op" for nodes which run PackedFunc or "null" for nodes which
+    // represent inputs/parameters to the graph. Other types may be supported in the future, but
+    // consideration would be needed as to how to do that over RPC before we support it here.
+    if (nodes_[index].op_type != "tvm_op") {
+      CHECK_EQ(nodes_[index].op_type, "null")
+          << "Don't know how to run op type " << nodes_[index].op_type
+          << " remotely over RPC right now";
+
+      // NOTE: GraphRuntimeDebug expects graph nodes to have an "op" attribute of "tvm_op" or "null"
+      // and "null" is a placeholder node for a parameter or input.
+      return 0;
+    }
+
     const TVMContext& ctx = data_entry_[entry_id(index, 0)]->ctx;
     TVMOpParam param = nodes_[index].param;
     std::string name = param.func_name;

From e7f0a11f8fb2a9dd8afc45dfb02e19d564f05de7 Mon Sep 17 00:00:00 2001
From: Alexander Pivovarov <pivovaa@amazon.com>
Date: Fri, 26 Feb 2021 18:28:02 -0800
Subject: [PATCH 250/357] Add test_forward_index_put to __main__ (#7542)

---
 python/tvm/auto_scheduler/feature.py          | 2 +-
 tests/python/frontend/pytorch/test_forward.py | 5 +++--
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/python/tvm/auto_scheduler/feature.py b/python/tvm/auto_scheduler/feature.py
index bd6526187581..ec7cf6334f98 100644
--- a/python/tvm/auto_scheduler/feature.py
+++ b/python/tvm/auto_scheduler/feature.py
@@ -120,7 +120,7 @@ def unpack_feature(byte_arr: bytearray) -> Tuple[np.ndarray, np.ndarray, np.ndar
             tmp_vec_len = (size - 1) // n_stmts
             assert (
                 tmp_vec_len == vec_len
-            ), "The lenght of feature vector is wrong. " "Expected %d but got %d." % (
+            ), "The length of feature vector is wrong. Expected %d but got %d." % (
                 vec_len,
                 tmp_vec_len,
             )
diff --git a/tests/python/frontend/pytorch/test_forward.py b/tests/python/frontend/pytorch/test_forward.py
index 90604751d4f1..6acd8b299920 100644
--- a/tests/python/frontend/pytorch/test_forward.py
+++ b/tests/python/frontend/pytorch/test_forward.py
@@ -1957,7 +1957,7 @@ def _impl(inputs, input_types):
 
 
 @tvm.testing.uses_gpu
-def test_segmentaton_models():
+def test_segmentation_models():
     class SegmentationModelWrapper(Module):
         def __init__(self, model):
             super().__init__()
@@ -3811,6 +3811,7 @@ def test_fn(is_sorted, return_inverse, return_counts):
     test_forward_unbind()
     test_forward_nonzero()
     test_forward_scatter()
+    test_forward_index_put()
     test_numel()
     test_bincount()
     test_cumsum()
@@ -3836,7 +3837,7 @@ def test_fn(is_sorted, return_inverse, return_counts):
 
     test_custom_conversion_map()
 
-    test_segmentaton_models()
+    test_segmentation_models()
     test_3d_models()
 
     # Quantization test

From 0bbc205c16daa5ef79ef05c10faabf7666ab6309 Mon Sep 17 00:00:00 2001
From: Alexander Pivovarov <pivovaa@amazon.com>
Date: Fri, 26 Feb 2021 19:41:17 -0800
Subject: [PATCH 251/357] [torch] Add narrow operator (#7535)

---
 python/tvm/relay/frontend/pytorch.py          | 14 ++++++++++
 tests/python/frontend/pytorch/test_forward.py | 26 +++++++++++++++++++
 2 files changed, 40 insertions(+)

diff --git a/python/tvm/relay/frontend/pytorch.py b/python/tvm/relay/frontend/pytorch.py
index fdebd2f50e68..a471639da623 100644
--- a/python/tvm/relay/frontend/pytorch.py
+++ b/python/tvm/relay/frontend/pytorch.py
@@ -473,6 +473,19 @@ def slice(self, inputs, input_types):
             data, begin=begin, end=end, strides=strides, slice_mode="end"
         )
 
+    def narrow(self, inputs, input_types):
+        # Inputs are:
+        # 0 - the tensor to narrow
+        # 1 - the dimension along which to narrow
+        # 2 - the starting dimension
+        # 3 - the distance to the ending dimension
+        # Lets find the ending dimension
+        end = self.add(inputs[2:4], input_types[2:4])
+        stride = 1
+        slice_input = inputs[:3] + [end, stride]
+        slice_types = input_types + ["int32"]
+        return self.slice(slice_input, slice_types)
+
     def split(self, inputs, input_types):
         data = inputs[0]
         split_size = int(inputs[1])
@@ -2222,6 +2235,7 @@ def create_convert_map(self):
             "aten::unsqueeze_": self.unsqueeze,
             "aten::cat": self.concatenate,
             "aten::slice": self.slice,
+            "aten::narrow": self.narrow,
             "aten::split": self.split,
             "aten::split_with_sizes": self.split_with_sizes,
             "aten::select": self.select,
diff --git a/tests/python/frontend/pytorch/test_forward.py b/tests/python/frontend/pytorch/test_forward.py
index 6acd8b299920..826edd051544 100644
--- a/tests/python/frontend/pytorch/test_forward.py
+++ b/tests/python/frontend/pytorch/test_forward.py
@@ -1505,6 +1505,31 @@ def forward(self, x):
     verify_model(SliceWithStride2(), input_data=torch.randn(4, 4))
 
 
+@tvm.testing.uses_gpu
+def test_forward_narrow():
+    torch.set_grad_enabled(False)
+    input_shape = [3, 3]
+
+    class Narrow1(Module):
+        def forward(self, *args):
+            return torch.narrow(args[0], 0, 0, 2)
+
+    class Narrow2(Module):
+        def forward(self, *args):
+            return torch.narrow(args[0], 1, 1, 2)
+
+    class Narrow3(Module):
+        def forward(self, *args):
+            begin = torch.tensor(2) - torch.tensor(1)
+            length = torch.tensor(1) * torch.tensor(2)
+            return torch.narrow(args[0], 1, begin, length)
+
+    input_data = torch.rand(input_shape).float()
+    verify_model(Narrow1(), input_data=input_data)
+    verify_model(Narrow2(), input_data=input_data)
+    verify_model(Narrow3(), input_data=input_data)
+
+
 @tvm.testing.uses_gpu
 def test_forward_mean():
     torch.set_grad_enabled(False)
@@ -3758,6 +3783,7 @@ def test_fn(is_sorted, return_inverse, return_counts):
     test_forward_avgpool3d()
     test_forward_dropout()
     test_forward_slice()
+    test_forward_narrow()
     test_forward_mean()
     test_forward_expand()
     test_forward_pow()

From 22ba2c42ac3f35f9366327197448cfc62a3dabd3 Mon Sep 17 00:00:00 2001
From: Cody Yu <comaniac0422@gmail.com>
Date: Fri, 26 Feb 2021 22:21:15 -0800
Subject: [PATCH 252/357] [Torch] Simplify contiguous (#7544)

---
 python/tvm/relay/frontend/pytorch.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/python/tvm/relay/frontend/pytorch.py b/python/tvm/relay/frontend/pytorch.py
index a471639da623..31c78cfdea84 100644
--- a/python/tvm/relay/frontend/pytorch.py
+++ b/python/tvm/relay/frontend/pytorch.py
@@ -1004,8 +1004,7 @@ def threshold(self, inputs, input_types):
         return _op.nn.relu(data)
 
     def contiguous(self, inputs, input_types):
-        data = inputs[0]
-        return _op.tensor.copy(data)
+        return inputs[0]
 
     def batch_norm(self, inputs, input_types):
         data = inputs[0]

From b1b3823e1d63912b3ae9b3931cdde7884c155dba Mon Sep 17 00:00:00 2001
From: Luis Vega <vegaluisjose@users.noreply.github.com>
Date: Sat, 27 Feb 2021 00:31:02 -0800
Subject: [PATCH 253/357] add missing equal sign (#7531)

---
 tutorials/frontend/from_tflite.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tutorials/frontend/from_tflite.py b/tutorials/frontend/from_tflite.py
index a3014f9d2ea8..f7e8422c37b6 100644
--- a/tutorials/frontend/from_tflite.py
+++ b/tutorials/frontend/from_tflite.py
@@ -26,7 +26,7 @@
 .. code-block:: bash
 
     # install tflite
-    pip install tflite=2.1.0 --user
+    pip install tflite==2.1.0 --user
 
 
 or you could generate TFLite package yourself. The steps are the following:

From 485dfd6a3cbda41e8046eb89896bbc16195b26ab Mon Sep 17 00:00:00 2001
From: Yanming Wang <yanmingwang01@gmail.com>
Date: Sat, 27 Feb 2021 12:31:29 -0800
Subject: [PATCH 254/357] Fix typo in relay.vm.Executable (#7543)

Co-authored-by: Yanming Wang <yanmwang@amazon.com>
---
 python/tvm/runtime/vm.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/tvm/runtime/vm.py b/python/tvm/runtime/vm.py
index 448cb137cc9b..d641e52d7184 100644
--- a/python/tvm/runtime/vm.py
+++ b/python/tvm/runtime/vm.py
@@ -113,7 +113,7 @@ def save(self):
             # define a simple network.
             x = relay.var('x', shape=(10, 10))
             f = relay.Function([x], x + x)
-            mod = relay.Module({"main": f})
+            mod = tvm.IRModule({"main": f})
             # create a Relay VM.
             ctx = tvm.cpu()
             target = "llvm"
@@ -128,7 +128,7 @@ def save(self):
             loaded_lib = tvm.runtime.load_module(path_lib)
             loaded_code = bytearray(open(tmp.relpath("code.ro"), "rb").read())
             # deserialize.
-            des_exec = tvm.runtime.vm.Executable.load_exec(loaded_code, loaded_code)
+            des_exec = tvm.runtime.vm.Executable.load_exec(loaded_code, loaded_lib)
             # execute the deserialized executable.
             x_data = np.random.rand(10, 10).astype('float32')
             des_vm = tvm.runtime.vm.VirtualMachine(des_exec, ctx)

From 26733095f5a1e0887c32d644429d430bc1f51c91 Mon Sep 17 00:00:00 2001
From: ziheng <ziheng@apache.org>
Date: Sun, 28 Feb 2021 23:52:16 +0800
Subject: [PATCH 255/357] [Runtime] Special Memory Scope Support (#7488)

---
 include/tvm/runtime/c_runtime_api.h       |  35 ++--
 include/tvm/runtime/device_api.h          |  42 +++--
 include/tvm/runtime/ndarray.h             |   7 +-
 python/tvm/runtime/ndarray.py             |  38 ++--
 src/runtime/c_runtime_api.cc              |  64 ++++++-
 src/runtime/cpu_device_api.cc             |  13 +-
 src/runtime/crt/common/crt_runtime_api.c  |  39 ++++-
 src/runtime/cuda/cuda_device_api.cc       |   2 +
 src/runtime/hexagon/hexagon_device_api.cc |   8 +-
 src/runtime/metal/metal_common.h          |   8 +-
 src/runtime/minrpc/minrpc_server.h        | 137 +++++++++------
 src/runtime/minrpc/rpc_reference.h        |  76 ++++----
 src/runtime/ndarray.cc                    |  65 ++++---
 src/runtime/opencl/opencl_common.h        |   8 +-
 src/runtime/rpc/rpc_device_api.cc         |  63 +++++--
 src/runtime/rpc/rpc_endpoint.cc           | 201 +++++++++++-----------
 src/runtime/rpc/rpc_endpoint.h            |   6 +-
 src/runtime/rpc/rpc_local_session.cc      |  40 +++--
 src/runtime/rpc/rpc_local_session.h       |   6 +-
 src/runtime/rpc/rpc_session.cc            |  15 +-
 src/runtime/rpc/rpc_session.h             |  43 +----
 src/runtime/vulkan/vulkan.cc              |   2 +
 web/emcc/tvmjs_support.cc                 |  44 ++---
 web/emcc/webgpu_runtime.cc                |   2 +
 24 files changed, 577 insertions(+), 387 deletions(-)

diff --git a/include/tvm/runtime/c_runtime_api.h b/include/tvm/runtime/c_runtime_api.h
index 467e69a60827..59316a0bace0 100644
--- a/include/tvm/runtime/c_runtime_api.h
+++ b/include/tvm/runtime/c_runtime_api.h
@@ -559,6 +559,23 @@ TVM_DLL int TVMByteArrayFree(TVMByteArray* arr);
 TVM_DLL int TVMDeviceAllocDataSpace(DLContext ctx, size_t nbytes, size_t alignment,
                                     DLDataType type_hint, void** out_data);
 
+/*!
+ * \brief Allocate a data space on device with special memory scope.
+ * \note The memory could use a special multi-dimensional memory layout.
+ *       That is why we pass shape and dtype instead of raw number of bytes.
+ * \param ctx The device context to perform operation.
+ * \param ndim The number of dimension of the tensor.
+ * \param shape The shape of the tensor.
+ * \param dtype The type of elements.
+ * \param mem_scope The memory scope of the tensor,
+ *        can be nullptr, which indicate the default global DRAM
+ * \param out_data The allocated device pointer.
+ * \return 0 when success, -1 when failure happens
+ */
+TVM_DLL int TVMDeviceAllocDataSpaceWithScope(DLContext ctx, int ndim, const int64_t* shape,
+                                             DLDataType dtype, const char* mem_scope,
+                                             void** out_data);
+
 /*!
  * \brief Free a data space on device.
  * \param ctx The device context to perform operation.
@@ -569,22 +586,14 @@ TVM_DLL int TVMDeviceFreeDataSpace(TVMContext ctx, void* ptr);
 
 /*!
  * \brief Copy data from one place to another.
- * \param from The source array.
- * \param from_offset The byte offeset in the from.
- * \param to The target array.
- * \param to_offset The byte offset in the to.
- * \param num_bytes The size of the memory in bytes
- * \param ctx_from The source context
- * \param ctx_to The target context
- * \param type_hint The type of elements, only neded by certain backends.
- *                  can be useful for cross device endian converison.
+ * \note This API is designed to support special memory with shape dependent layout.
+ *       We pass in DLTensor* with shape information to support these cases.
+ * \param from The source tensor.
+ * \param to The target tensor.
  * \param stream Optional stream object.
  * \return 0 when success, -1 when failure happens.
  */
-TVM_DLL int TVMDeviceCopyDataFromTo(const void* from, size_t from_offset, void* to,
-                                    size_t to_offset, size_t num_bytes, TVMContext ctx_from,
-                                    TVMContext ctx_to, DLDataType type_hint,
-                                    TVMStreamHandle stream);
+TVM_DLL int TVMDeviceCopyDataFromTo(DLTensor* from, DLTensor* to, TVMStreamHandle stream);
 
 /*!
  * \brief Check that an object is derived from another.
diff --git a/include/tvm/runtime/device_api.h b/include/tvm/runtime/device_api.h
index a6f5624de084..1276663a2bc3 100644
--- a/include/tvm/runtime/device_api.h
+++ b/include/tvm/runtime/device_api.h
@@ -90,6 +90,17 @@ class TVM_DLL DeviceAPI {
    */
   virtual void* AllocDataSpace(TVMContext ctx, size_t nbytes, size_t alignment,
                                DLDataType type_hint) = 0;
+  /*!
+   * \brief Allocate a data space on device with memory scope support.
+   * \param ctx The device context to perform operation.
+   * \param ndim The number of dimension of allocated tensor.
+   * \param shape The shape of allocated tensor.
+   * \param dtype The type of elements.
+   * \param mem_scope The memory scope of allocated tensor.
+   * \return The allocated device pointer.
+   */
+  virtual void* AllocDataSpace(TVMContext ctx, int ndim, const int64_t* shape, DLDataType dtype,
+                               Optional<String> mem_scope = NullOpt);
   /*!
    * \brief Free a data space on device.
    * \param ctx The device context to perform operation.
@@ -98,20 +109,13 @@ class TVM_DLL DeviceAPI {
   virtual void FreeDataSpace(TVMContext ctx, void* ptr) = 0;
   /*!
    * \brief copy data from one place to another
+   * \note This API is designed to support special memory with shape dependent layout.
+   *       We pass in DLTensor* with shape information to support these cases.
    * \param from The source array.
-   * \param from_offset The byte offeset in the from.
    * \param to The target array.
-   * \param to_offset The byte offset in the to.
-   * \param num_bytes The size of the memory in bytes
-   * \param ctx_from The source context
-   * \param ctx_to The target context
-   * \param type_hint The type of elements, only neded by certain backends.
-   *                  can be useful for cross device endian converison.
    * \param stream Optional stream object.
    */
-  virtual void CopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset,
-                              size_t num_bytes, TVMContext ctx_from, TVMContext ctx_to,
-                              DLDataType type_hint, TVMStreamHandle stream) = 0;
+  virtual void CopyDataFromTo(DLTensor* from, DLTensor* to, TVMStreamHandle stream);
   /*!
    * \brief Create a new stream of execution.
    *
@@ -194,6 +198,24 @@ class TVM_DLL DeviceAPI {
   static bool NeedSetDeviceContext(int device_type) {
     return device_type != kDLCPU && device_type != kDLMicroDev;
   }
+
+ protected:
+  /*!
+   * \brief copy data from one place to another
+   * \param from The source array.
+   * \param from_offset The byte offeset in the from.
+   * \param to The target array.
+   * \param to_offset The byte offset in the to.
+   * \param num_bytes The size of the memory in bytes
+   * \param ctx_from The source context
+   * \param ctx_to The target context
+   * \param type_hint The type of elements, only neded by certain backends.
+   *                  can be useful for cross device endian converison.
+   * \param stream Optional stream object.
+   */
+  virtual void CopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset,
+                              size_t num_bytes, TVMContext ctx_from, TVMContext ctx_to,
+                              DLDataType type_hint, TVMStreamHandle stream);
 };
 
 /*! \brief The device type bigger than this is RPC device */
diff --git a/include/tvm/runtime/ndarray.h b/include/tvm/runtime/ndarray.h
index 0ff171d4821f..a884b5c6838f 100644
--- a/include/tvm/runtime/ndarray.h
+++ b/include/tvm/runtime/ndarray.h
@@ -25,6 +25,7 @@
 #define TVM_RUNTIME_NDARRAY_H_
 
 #include <tvm/runtime/c_runtime_api.h>
+#include <tvm/runtime/container.h>
 #include <tvm/runtime/data_type.h>
 #include <tvm/runtime/object.h>
 #include <tvm/runtime/serializer.h>
@@ -133,10 +134,12 @@ class NDArray : public ObjectRef {
    * \brief Create an empty NDArray.
    * \param shape The shape of the new array.
    * \param dtype The data type of the new array.
-   * \param ctx The context of the Array.
+   * \param ctx The context of the array.
+   * \param mem_scope The memory scope of the array.
    * \return The created Array
    */
-  TVM_DLL static NDArray Empty(std::vector<int64_t> shape, DLDataType dtype, DLContext ctx);
+  TVM_DLL static NDArray Empty(std::vector<int64_t> shape, DLDataType dtype, DLContext ctx,
+                               Optional<String> mem_scope = NullOpt);
   /*!
    * \brief Create a NDArray backed by a dlpack tensor.
    *
diff --git a/python/tvm/runtime/ndarray.py b/python/tvm/runtime/ndarray.py
index 2f616ce879c9..75da3d4a5c17 100644
--- a/python/tvm/runtime/ndarray.py
+++ b/python/tvm/runtime/ndarray.py
@@ -23,6 +23,7 @@
 from tvm._ffi.base import _LIB, check_call, c_array, string_types, _FFI_MODE
 from tvm._ffi.runtime_ctypes import DataType, TVMContext, TVMArray, TVMArrayHandle
 from tvm._ffi.runtime_ctypes import DataTypeCode, tvm_shape_index_t
+from . import _ffi_api
 
 try:
     # pylint: disable=wrong-import-position
@@ -253,42 +254,41 @@ def numpyasarray(np_data):
     return arr, shape
 
 
-def empty(shape, dtype="float32", ctx=context(1, 0)):
+def empty(shape, dtype="float32", ctx=context(1, 0), mem_scope=None):
     """Create an empty array given shape and device
 
     Parameters
     ----------
     shape : tuple of int
-        The shape of the array
+        The shape of the array.
 
     dtype : type or str
         The data type of the array.
 
     ctx : TVMContext
-        The context of the array
+        The context of the array.
+
+    mem_scope : Optional[str]
+        The memory scope of the array.
 
     Returns
     -------
     arr : tvm.nd.NDArray
         The array tvm supported.
     """
-    shape = c_array(tvm_shape_index_t, shape)
-    ndim = ctypes.c_int(len(shape))
-    handle = TVMArrayHandle()
+    shape_imm = []
+    for s in shape:
+        if isinstance(s, tvm.tir.IntImm):
+            shape_imm.append(s.value)
+        else:
+            shape_imm.append(int(s))
+    arr = np.array(shape_imm, "int64")
+    ptr = arr.ctypes.data_as(ctypes.POINTER(ctypes.c_int64))
+    shape_ptr = ctypes.cast(ptr, ctypes.c_void_p)
+    ndim = len(shape_imm)
     dtype = DataType(dtype)
-    check_call(
-        _LIB.TVMArrayAlloc(
-            shape,
-            ndim,
-            ctypes.c_int(dtype.type_code),
-            ctypes.c_int(dtype.bits),
-            ctypes.c_int(dtype.lanes),
-            ctx.device_type,
-            ctx.device_id,
-            ctypes.byref(handle),
-        )
-    )
-    return _make_array(handle, False, False)
+    arr = _ffi_api.TVMArrayAllocWithScope(shape_ptr, ndim, dtype, ctx, mem_scope)
+    return arr
 
 
 def from_dlpack(dltensor):
diff --git a/src/runtime/c_runtime_api.cc b/src/runtime/c_runtime_api.cc
index b4457bf66614..7fd27cba6136 100644
--- a/src/runtime/c_runtime_api.cc
+++ b/src/runtime/c_runtime_api.cc
@@ -144,6 +144,50 @@ void* DeviceAPI::AllocWorkspace(TVMContext ctx, size_t size, DLDataType type_hin
   return AllocDataSpace(ctx, size, kTempAllocaAlignment, type_hint);
 }
 
+static size_t GetDataAlignment(const DLDataType dtype) {
+  size_t align = (dtype.bits / 8) * dtype.lanes;
+  if (align < kAllocAlignment) return kAllocAlignment;
+  return align;
+}
+
+void* DeviceAPI::AllocDataSpace(TVMContext ctx, int ndim, const int64_t* shape, DLDataType dtype,
+                                Optional<String> mem_scope) {
+  if (!mem_scope.defined() || mem_scope.value() == "global") {
+    // by default, we can always redirect to the flat memory allocations
+    DLTensor temp;
+    temp.data = nullptr;
+    temp.ctx = ctx;
+    temp.ndim = ndim;
+    temp.dtype = dtype;
+    temp.shape = const_cast<int64_t*>(shape);
+    temp.strides = nullptr;
+    temp.byte_offset = 0;
+    size_t size = GetDataSize(temp);
+    size_t alignment = GetDataAlignment(temp.dtype);
+    return AllocDataSpace(ctx, size, alignment, dtype);
+  }
+  LOG(FATAL) << "Device does not support allocate data space with "
+             << "specified memory scope: " << mem_scope.value();
+  return nullptr;
+}
+
+void DeviceAPI::CopyDataFromTo(DLTensor* from, DLTensor* to, TVMStreamHandle stream) {
+  // by default, we can always redirect to the flat memory copy operation.
+  size_t nbytes = GetDataSize(*from);
+  ICHECK_EQ(nbytes, GetDataSize(*to));
+
+  ICHECK(IsContiguous(*from) && IsContiguous(*to))
+      << "CopyDataFromTo only support contiguous array for now";
+  CopyDataFromTo(from->data, from->byte_offset, to->data, to->byte_offset, nbytes, from->ctx,
+                 to->ctx, from->dtype, stream);
+}
+
+void DeviceAPI::CopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset,
+                               size_t num_bytes, TVMContext ctx_from, TVMContext ctx_to,
+                               DLDataType type_hint, TVMStreamHandle stream) {
+  LOG(FATAL) << "Device does not support CopyDataFromTo.";
+}
+
 void DeviceAPI::FreeWorkspace(TVMContext ctx, void* ptr) { FreeDataSpace(ctx, ptr); }
 
 TVMStreamHandle DeviceAPI::CreateStream(TVMContext ctx) {
@@ -553,19 +597,29 @@ int TVMDeviceAllocDataSpace(DLContext ctx, size_t nbytes, size_t alignment, DLDa
   API_END();
 }
 
+int TVMDeviceAllocDataSpaceWithScope(DLContext ctx, int ndim, const int64_t* shape,
+                                     DLDataType dtype, const char* mem_scope, void** out_data) {
+  API_BEGIN();
+  Optional<String> scope;
+  if (mem_scope != nullptr) {
+    scope = String(std::string(mem_scope));
+  }
+  out_data[0] = DeviceAPIManager::Get(ctx)->AllocDataSpace(ctx, ndim, shape, dtype, scope);
+  API_END();
+}
+
 int TVMDeviceFreeDataSpace(DLContext ctx, void* ptr) {
   API_BEGIN();
   DeviceAPIManager::Get(ctx)->FreeDataSpace(ctx, ptr);
   API_END();
 }
 
-int TVMDeviceCopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset,
-                            size_t num_bytes, TVMContext ctx_from, TVMContext ctx_to,
-                            DLDataType type_hint, TVMStreamHandle stream) {
+int TVMDeviceCopyDataFromTo(DLTensor* from, DLTensor* to, TVMStreamHandle stream) {
   API_BEGIN();
+  TVMContext ctx_from = from->ctx;
+  TVMContext ctx_to = to->ctx;
   TVMContext ctx = ctx_from.device_type != kDLCPU ? ctx_from : ctx_to;
-  DeviceAPIManager::Get(ctx)->CopyDataFromTo(from, from_offset, to, to_offset, num_bytes, ctx_from,
-                                             ctx_to, type_hint, stream);
+  DeviceAPIManager::Get(ctx)->CopyDataFromTo(from, to, stream);
   API_END();
 }
 
diff --git a/src/runtime/cpu_device_api.cc b/src/runtime/cpu_device_api.cc
index 146bfa804785..b745be33b456 100644
--- a/src/runtime/cpu_device_api.cc
+++ b/src/runtime/cpu_device_api.cc
@@ -69,12 +69,6 @@ class CPUDeviceAPI final : public DeviceAPI {
 #endif
   }
 
-  void CopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset, size_t size,
-                      TVMContext ctx_from, TVMContext ctx_to, DLDataType type_hint,
-                      TVMStreamHandle stream) final {
-    memcpy(static_cast<char*>(to) + to_offset, static_cast<const char*>(from) + from_offset, size);
-  }
-
   void StreamSync(TVMContext ctx, TVMStreamHandle stream) final {}
 
   void* AllocWorkspace(TVMContext ctx, size_t size, DLDataType type_hint) final;
@@ -86,6 +80,13 @@ class CPUDeviceAPI final : public DeviceAPI {
     static auto* inst = new CPUDeviceAPI();
     return inst;
   }
+
+ protected:
+  void CopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset, size_t size,
+                      TVMContext ctx_from, TVMContext ctx_to, DLDataType type_hint,
+                      TVMStreamHandle stream) final {
+    memcpy(static_cast<char*>(to) + to_offset, static_cast<const char*>(from) + from_offset, size);
+  }
 };
 
 struct CPUWorkspacePool : public WorkspacePool {
diff --git a/src/runtime/crt/common/crt_runtime_api.c b/src/runtime/crt/common/crt_runtime_api.c
index bc47f995eac0..c2eb1ff903e3 100644
--- a/src/runtime/crt/common/crt_runtime_api.c
+++ b/src/runtime/crt/common/crt_runtime_api.c
@@ -22,6 +22,7 @@
 #include <assert.h>
 #include <inttypes.h>
 #include <stdarg.h>
+#include <stdbool.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
@@ -87,16 +88,44 @@ int TVMDeviceAllocDataSpace(DLContext ctx, size_t nbytes, size_t alignment, DLDa
   if (alignment != 1) {
     nbytes = (nbytes + alignment - 1) / alignment * alignment;
   }
-
   return TVMPlatformMemoryAllocate(nbytes, ctx, out_data);
 }
 
+int TVMDeviceAllocDataSpaceWithScope(DLContext ctx, int ndim, const int64_t* shape,
+                                     DLDataType dtype, const char* mem_scope, void** out_data) {
+  size_t nbytes = 1;
+  for (int i = 0; i < ndim; ++i) {
+    nbytes *= shape[i];
+  }
+  nbytes *= (dtype.bits * dtype.lanes + 7) / 8;
+
+  int kAllocAlignment = 128;
+  size_t align = (dtype.bits / 8) * dtype.lanes;
+  if (align < kAllocAlignment) align = kAllocAlignment;
+  return TVMDeviceAllocDataSpace(ctx, nbytes, align, dtype, out_data);
+}
+
 int TVMDeviceFreeDataSpace(TVMContext ctx, void* ptr) { return TVMPlatformMemoryFree(ptr, ctx); }
 
-int TVMDeviceCopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset,
-                            size_t num_bytes, TVMContext ctx_from, TVMContext ctx_to,
-                            DLDataType type_hint, TVMStreamHandle stream) {
-  memcpy(((uint8_t*)to) + to_offset, ((uint8_t*)from) + from_offset, num_bytes);
+static bool IsContiguous(const DLTensor* arr) {
+  if (arr->strides == NULL) return true;
+  int64_t expected_stride = 1;
+  for (int32_t i = arr->ndim; i != 0; --i) {
+    int32_t k = i - 1;
+    if (arr->strides[k] != expected_stride) return false;
+    expected_stride *= arr->shape[k];
+  }
+  return true;
+}
+
+int TVMDeviceCopyDataFromTo(DLTensor* from, DLTensor* to, TVMStreamHandle stream) {
+  assert(IsContiguous(from) && IsContiguous(to));
+  size_t size = 1;
+  for (int i = 0; i < from->ndim; ++i) {
+    size *= from->shape[i];
+  }
+  size *= (from->dtype.bits * from->dtype.lanes + 7) / 8;
+  memcpy(((uint8_t*)to->data) + to->byte_offset, ((uint8_t*)from->data) + from->byte_offset, size);
   return 0;
 }
 
diff --git a/src/runtime/cuda/cuda_device_api.cc b/src/runtime/cuda/cuda_device_api.cc
index 30abfc8dc559..c77395422e87 100644
--- a/src/runtime/cuda/cuda_device_api.cc
+++ b/src/runtime/cuda/cuda_device_api.cc
@@ -127,6 +127,7 @@ class CUDADeviceAPI final : public DeviceAPI {
     }
   }
 
+ protected:
   void CopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset, size_t size,
                       TVMContext ctx_from, TVMContext ctx_to, DLDataType type_hint,
                       TVMStreamHandle stream) final {
@@ -166,6 +167,7 @@ class CUDADeviceAPI final : public DeviceAPI {
     }
   }
 
+ public:
   TVMStreamHandle CreateStream(TVMContext ctx) {
     CUDA_CALL(cudaSetDevice(ctx.device_id));
     cudaStream_t retval;
diff --git a/src/runtime/hexagon/hexagon_device_api.cc b/src/runtime/hexagon/hexagon_device_api.cc
index 605c55eb89b9..70cebf5afa44 100644
--- a/src/runtime/hexagon/hexagon_device_api.cc
+++ b/src/runtime/hexagon/hexagon_device_api.cc
@@ -35,9 +35,6 @@ class HexagonDeviceAPI : public DeviceAPI {
   void GetAttr(TVMContext ctx, DeviceAttrKind kind, TVMRetValue* rv) final;
   void* AllocDataSpace(TVMContext ctx, size_t nbytes, size_t alignment, DLDataType type_hint) final;
   void FreeDataSpace(TVMContext ctx, void* ptr) final;
-  void CopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset,
-                      size_t num_bytes, TVMContext ctx_from, TVMContext ctx_to,
-                      DLDataType type_hint, TVMStreamHandle stream) final;
   void StreamSync(TVMContext ctx, TVMStreamHandle stream) final;
   void* AllocWorkspace(TVMContext ctx, size_t nbytes, DLDataType type_hint = {}) final;
   void FreeWorkspace(TVMContext ctx, void* ptr) final;
@@ -48,6 +45,11 @@ class HexagonDeviceAPI : public DeviceAPI {
     static HexagonDeviceAPI* inst = new HexagonDeviceAPI();
     return inst;
   }
+
+ protected:
+  void CopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset,
+                      size_t num_bytes, TVMContext ctx_from, TVMContext ctx_to,
+                      DLDataType type_hint, TVMStreamHandle stream) final;
 };
 
 // HexagonDeviceAPI.
diff --git a/src/runtime/metal/metal_common.h b/src/runtime/metal/metal_common.h
index d13ac7e78982..bd07dbfde9d0 100644
--- a/src/runtime/metal/metal_common.h
+++ b/src/runtime/metal/metal_common.h
@@ -84,14 +84,16 @@ class MetalWorkspace final : public DeviceAPI {
   void GetAttr(TVMContext ctx, DeviceAttrKind kind, TVMRetValue* rv) final;
   void* AllocDataSpace(TVMContext ctx, size_t nbytes, size_t alignment, DLDataType type_hint) final;
   void FreeDataSpace(TVMContext ctx, void* ptr) final;
-  void CopyDataFromTo(const void* from, size_t from_size, void* to, size_t to_size, size_t size,
-                      TVMContext ctx_from, TVMContext ctx_to, DLDataType type_hint,
-                      TVMStreamHandle stream) final;
   void StreamSync(TVMContext ctx, TVMStreamHandle stream) final;
   void* AllocWorkspace(TVMContext ctx, size_t size, DLDataType type_hint) final;
   void FreeWorkspace(TVMContext ctx, void* data) final;
   // get the global workspace
   static MetalWorkspace* Global();
+
+ protected:
+  void CopyDataFromTo(const void* from, size_t from_size, void* to, size_t to_size, size_t size,
+                      TVMContext ctx_from, TVMContext ctx_to, DLDataType type_hint,
+                      TVMStreamHandle stream) final;
 };
 
 /*! \brief Thread local workspace */
diff --git a/src/runtime/minrpc/minrpc_server.h b/src/runtime/minrpc/minrpc_server.h
index d28e0c396e36..d5c61eccfd6d 100644
--- a/src/runtime/minrpc/minrpc_server.h
+++ b/src/runtime/minrpc/minrpc_server.h
@@ -169,28 +169,39 @@ class MinRPCServer {
   }
 
   void HandleCopyFromRemote() {
-    uint64_t handle, offset, num_bytes;
-    TVMContext ctx;
-    DLDataType type_hint;
-
-    this->Read(&handle);
-    this->Read(&offset);
+    DLTensor* arr = this->ArenaAlloc<DLTensor>(1);
+    uint64_t data_handle;
+    this->Read(&data_handle);
+    arr->data = reinterpret_cast<void*>(data_handle);
+    this->Read(&(arr->ctx));
+    this->Read(&(arr->ndim));
+    this->Read(&(arr->dtype));
+    arr->shape = this->ArenaAlloc<int64_t>(arr->ndim);
+    this->ReadArray(arr->shape, arr->ndim);
+    arr->strides = nullptr;
+    this->Read(&(arr->byte_offset));
+
+    uint64_t num_bytes;
     this->Read(&num_bytes);
-    this->Read(&ctx);
-    this->Read(&type_hint);
 
     uint8_t* data_ptr;
     int call_ecode = 0;
-    if (ctx.device_type == kDLCPU) {
-      data_ptr = reinterpret_cast<uint8_t*>(handle) + offset;
+    if (arr->ctx.device_type == kDLCPU) {
+      data_ptr = reinterpret_cast<uint8_t*>(data_handle) + arr->byte_offset;
     } else {
       data_ptr = this->ArenaAlloc<uint8_t>(num_bytes);
-      call_ecode =
-          TVMDeviceCopyDataFromTo(reinterpret_cast<void*>(handle), offset, data_ptr, 0, num_bytes,
-                                  ctx, DLContext{kDLCPU, 0}, type_hint, nullptr);
+      DLTensor temp;
+      temp.data = reinterpret_cast<void*>(data_ptr);
+      temp.ctx = arr->ctx;
+      temp.ndim = arr->ndim;
+      temp.dtype = arr->dtype;
+      temp.shape = arr->shape;
+      temp.strides = nullptr;
+      temp.byte_offset = 0;
+      call_ecode = TVMDeviceCopyDataFromTo(arr, &temp, nullptr);
       // need sync to make sure that the copy is completed.
       if (call_ecode == 0) {
-        call_ecode = TVMSynchronize(ctx.device_type, ctx.device_id, nullptr);
+        call_ecode = TVMSynchronize(arr->ctx.device_type, arr->ctx.device_id, nullptr);
       }
     }
 
@@ -209,30 +220,39 @@ class MinRPCServer {
   }
 
   void HandleCopyToRemote() {
-    uint64_t handle, offset, num_bytes;
-    TVMContext ctx;
-    DLDataType type_hint;
-
-    this->Read(&handle);
-    this->Read(&offset);
+    DLTensor* arr = this->ArenaAlloc<DLTensor>(1);
+    uint64_t data_handle;
+    this->Read(&data_handle);
+    arr->data = reinterpret_cast<void*>(data_handle);
+    this->Read(&(arr->ctx));
+    this->Read(&(arr->ndim));
+    this->Read(&(arr->dtype));
+    arr->shape = this->ArenaAlloc<int64_t>(arr->ndim);
+    this->ReadArray(arr->shape, arr->ndim);
+    arr->strides = nullptr;
+    this->Read(&(arr->byte_offset));
+    uint64_t num_bytes;
     this->Read(&num_bytes);
-    this->Read(&ctx);
-    this->Read(&type_hint);
-    int call_ecode = 0;
 
-    if (ctx.device_type == kDLCPU) {
-      uint8_t* dptr = reinterpret_cast<uint8_t*>(handle) + offset;
+    int call_ecode = 0;
+    if (arr->ctx.device_type == kDLCPU) {
+      uint8_t* dptr = reinterpret_cast<uint8_t*>(data_handle) + arr->byte_offset;
       this->ReadArray(dptr, num_bytes);
     } else {
       uint8_t* temp_data = this->ArenaAlloc<uint8_t>(num_bytes);
       this->ReadArray(temp_data, num_bytes);
-
-      call_ecode =
-          TVMDeviceCopyDataFromTo(temp_data, 0, reinterpret_cast<void*>(handle), offset, num_bytes,
-                                  DLContext{kDLCPU, 0}, ctx, type_hint, nullptr);
+      DLTensor temp;
+      temp.data = temp_data;
+      temp.ctx = DLContext{kDLCPU, 0};
+      temp.ndim = arr->ndim;
+      temp.dtype = arr->dtype;
+      temp.shape = arr->shape;
+      temp.strides = nullptr;
+      temp.byte_offset = 0;
+      call_ecode = TVMDeviceCopyDataFromTo(&temp, arr, nullptr);
       // need sync to make sure that the copy is completed.
       if (call_ecode == 0) {
-        call_ecode = TVMSynchronize(ctx.device_type, ctx.device_id, nullptr);
+        call_ecode = TVMSynchronize(arr->ctx.device_type, arr->ctx.device_id, nullptr);
       }
     }
 
@@ -269,6 +289,10 @@ class MinRPCServer {
         this->SyscallDevAllocData(values, tcodes, num_args);
         break;
       }
+      case RPCCode::kDevAllocDataWithScope: {
+        this->SyscallDevAllocDataWithScope(values, tcodes, num_args);
+        break;
+      }
       case RPCCode::kDevFreeData: {
         this->SyscallDevFreeData(values, tcodes, num_args);
         break;
@@ -342,34 +366,20 @@ class MinRPCServer {
   }
 
   void SyscallCopyAmongRemote(TVMValue* values, int* tcodes, int num_args) {
-    MINRPC_CHECK(num_args == 9);
-    // from, from_offset
-    MINRPC_CHECK(tcodes[0] == kTVMOpaqueHandle);
-    MINRPC_CHECK(tcodes[1] == kDLInt);
-    // to, to_offset
+    MINRPC_CHECK(num_args == 3);
+    // from dltensor
+    MINRPC_CHECK(tcodes[0] == kTVMDLTensorHandle);
+    // to dltensor
+    MINRPC_CHECK(tcodes[1] == kTVMDLTensorHandle);
+    // stream
     MINRPC_CHECK(tcodes[2] == kTVMOpaqueHandle);
-    MINRPC_CHECK(tcodes[3] == kDLInt);
-    // size
-    MINRPC_CHECK(tcodes[4] == kDLInt);
-    // ctx_from, ctx_to
-    MINRPC_CHECK(tcodes[5] == kTVMContext);
-    MINRPC_CHECK(tcodes[6] == kTVMContext);
-    // type_hint, stream
-    MINRPC_CHECK(tcodes[7] == kTVMDataType);
-    MINRPC_CHECK(tcodes[8] == kTVMOpaqueHandle);
 
     void* from = values[0].v_handle;
-    int64_t from_offset = values[1].v_int64;
-    void* to = values[2].v_handle;
-    int64_t to_offset = values[3].v_int64;
-    int64_t size = values[4].v_int64;
-    TVMContext ctx_from = values[5].v_ctx;
-    TVMContext ctx_to = values[6].v_ctx;
-    DLDataType type_hint = values[7].v_type;
-    TVMStreamHandle stream = values[8].v_handle;
-
-    int call_ecode = TVMDeviceCopyDataFromTo(from, from_offset, to, to_offset, size, ctx_from,
-                                             ctx_to, type_hint, stream);
+    void* to = values[1].v_handle;
+    TVMStreamHandle stream = values[2].v_handle;
+
+    int call_ecode = TVMDeviceCopyDataFromTo(reinterpret_cast<DLTensor*>(from),
+                                             reinterpret_cast<DLTensor*>(to), stream);
 
     if (call_ecode == 0) {
       this->ReturnVoid();
@@ -400,6 +410,23 @@ class MinRPCServer {
     }
   }
 
+  void SyscallDevAllocDataWithScope(TVMValue* values, int* tcodes, int num_args) {
+    MINRPC_CHECK(num_args == 2);
+    MINRPC_CHECK(tcodes[0] == kTVMDLTensorHandle);
+    MINRPC_CHECK(tcodes[1] == kTVMNullptr || tcodes[1] == kTVMStr);
+
+    DLTensor* arr = reinterpret_cast<DLTensor*>(values[0].v_handle);
+    const char* mem_scope = (tcodes[1] == kTVMNullptr ? nullptr : values[1].v_str);
+    void* handle;
+    int call_ecode = TVMDeviceAllocDataSpaceWithScope(arr->ctx, arr->ndim, arr->shape, arr->dtype,
+                                                      mem_scope, &handle);
+    if (call_ecode == 0) {
+      this->ReturnHandle(handle);
+    } else {
+      this->ReturnLastTVMError();
+    }
+  }
+
   void SyscallDevFreeData(TVMValue* values, int* tcodes, int num_args) {
     MINRPC_CHECK(num_args == 2);
     MINRPC_CHECK(tcodes[0] == kTVMContext);
diff --git a/src/runtime/minrpc/rpc_reference.h b/src/runtime/minrpc/rpc_reference.h
index e195b9ca9e89..07d13a7ff67b 100644
--- a/src/runtime/minrpc/rpc_reference.h
+++ b/src/runtime/minrpc/rpc_reference.h
@@ -28,7 +28,7 @@ namespace tvm {
 namespace runtime {
 
 /*! \brief The current RPC procotol version. */
-constexpr const char* kRPCProtocolVer = "0.7.0";
+constexpr const char* kRPCProtocolVer = "0.8.0";
 
 /*! \brief The RPC code */
 enum class RPCCode : int {
@@ -51,6 +51,7 @@ enum class RPCCode : int {
   kDevFreeData,
   kDevStreamSync,
   kCopyAmongRemote,
+  kDevAllocDataWithScope,
 };
 
 /*!
@@ -107,6 +108,8 @@ inline const char* RPCCodeToString(RPCCode code) {
       return "kDevStreamSync";
     case RPCCode::kCopyAmongRemote:
       return "kCopyAmongRemote";
+    case RPCCode::kDevAllocDataWithScope:
+      return "kDevAllocDataWithScope";
     default:
       return "";
   }
@@ -218,6 +221,44 @@ struct RPCReference {
     return getter.num_bytes();
   }
 
+  template <typename TChannelPtr>
+  static void SendDLTensor(TChannelPtr channel, DLTensor* arr) {
+    TVMContext ctx;
+    uint64_t data;
+    // When we return NDArray, we directly return
+    // the space and the context
+    // The client will be further wrapping
+    ctx = arr->ctx;
+    data = reinterpret_cast<uint64_t>(arr->data);
+    channel->Write(data);
+    channel->Write(ctx);
+    channel->Write(arr->ndim);
+    channel->Write(arr->dtype);
+    channel->WriteArray(arr->shape, arr->ndim);
+    if (arr->strides != nullptr) {
+      channel->ThrowError(RPCServerStatus::kInvalidDLTensorFieldStride);
+    }
+    channel->Write(arr->byte_offset);
+    return;
+  }
+
+  template <typename TChannelPtr>
+  static DLTensor* ReceiveDLTensor(TChannelPtr channel) {
+    uint64_t handle;
+    channel->Read(&handle);
+    DLTensor* arr = channel->template ArenaAlloc<DLTensor>(1);
+    DLTensor& tensor = *arr;
+    tensor.data = reinterpret_cast<void*>(handle);
+    channel->Read(&(tensor.ctx));
+    channel->Read(&(tensor.ndim));
+    channel->Read(&(tensor.dtype));
+    tensor.shape = channel->template ArenaAlloc<int64_t>(tensor.ndim);
+    channel->ReadArray(tensor.shape, tensor.ndim);
+    tensor.strides = nullptr;
+    channel->Read(&(tensor.byte_offset));
+    return arr;
+  }
+
   /*!
    * \brief Send packed argument sequnce to the other peer.
    *
@@ -292,24 +333,7 @@ struct RPCReference {
         }
         case kTVMDLTensorHandle: {
           DLTensor* arr = static_cast<DLTensor*>(value.v_handle);
-          TVMContext ctx;
-          uint64_t data;
-          // When we return NDArray, we directly return
-          // the space and the context
-          // The client will be further wrapping
-          ctx = arr->ctx;
-          data = reinterpret_cast<uint64_t>(arr->data);
-          channel->Write(data);
-          channel->Write(ctx);
-          channel->Write(arr->ndim);
-          channel->Write(arr->dtype);
-          channel->WriteArray(arr->shape, arr->ndim);
-          if (arr->strides != nullptr) {
-            channel->ThrowError(RPCServerStatus::kInvalidDLTensorFieldStride);
-          }
-          if (arr->byte_offset != 0) {
-            channel->ThrowError(RPCServerStatus::kInvalidDLTensorFieldByteOffset);
-          }
+          SendDLTensor(channel, arr);
           break;
         }
         case kTVMNullptr:
@@ -422,19 +446,7 @@ struct RPCReference {
           break;
         }
         case kTVMDLTensorHandle: {
-          uint64_t handle;
-          channel->Read(&handle);
-          DLTensor* arr = channel->template ArenaAlloc<DLTensor>(1);
-          DLTensor& tensor = *arr;
-          tensor.data = reinterpret_cast<void*>(handle);
-          channel->Read(&(tensor.ctx));
-          channel->Read(&(tensor.ndim));
-          channel->Read(&(tensor.dtype));
-          tensor.shape = channel->template ArenaAlloc<int64_t>(tensor.ndim);
-          channel->ReadArray(tensor.shape, tensor.ndim);
-          tensor.strides = nullptr;
-          tensor.byte_offset = 0;
-          value.v_handle = arr;
+          value.v_handle = ReceiveDLTensor(channel);
           break;
         }
         default: {
diff --git a/src/runtime/ndarray.cc b/src/runtime/ndarray.cc
index dae775606a7e..d3ddbf8c0229 100644
--- a/src/runtime/ndarray.cc
+++ b/src/runtime/ndarray.cc
@@ -24,6 +24,7 @@
 #include <tvm/runtime/c_runtime_api.h>
 #include <tvm/runtime/device_api.h>
 #include <tvm/runtime/ndarray.h>
+#include <tvm/runtime/registry.h>
 #include <tvm/support/logging.h>
 
 #include "runtime_base.h"
@@ -58,36 +59,39 @@ inline void VerifyDataType(DLDataType dtype) {
   ICHECK_EQ(dtype.bits & (dtype.bits - 1), 0);
 }
 
-inline size_t GetDataAlignment(const DLTensor& arr) {
-  size_t align = (arr.dtype.bits / 8) * arr.dtype.lanes;
-  if (align < kAllocAlignment) return kAllocAlignment;
-  return align;
-}
-
 void ArrayCopyFromBytes(DLTensor* handle, const void* data, size_t nbytes) {
-  TVMContext cpu_ctx;
-  cpu_ctx.device_type = kDLCPU;
-  cpu_ctx.device_id = 0;
   size_t arr_size = GetDataSize(*handle);
   ICHECK_EQ(arr_size, nbytes) << "ArrayCopyFromBytes: size mismatch";
   ICHECK(IsContiguous(*handle)) << "ArrayCopyFromBytes only support contiguous array for now";
-  DeviceAPI::Get(handle->ctx)
-      ->CopyDataFromTo(data, 0, handle->data, static_cast<size_t>(handle->byte_offset), nbytes,
-                       cpu_ctx, handle->ctx, handle->dtype, nullptr);
+
+  DLTensor from;
+  from.data = const_cast<void*>(data);
+  from.ctx = DLContext{kDLCPU, 0};
+  from.ndim = handle->ndim;
+  from.dtype = handle->dtype;
+  from.shape = handle->shape;
+  from.strides = nullptr;
+  from.byte_offset = 0;
+  DeviceAPI::Get(handle->ctx)->CopyDataFromTo(&from, handle, nullptr);
   // Synchronize in case data become unavailable later.
   DeviceAPI::Get(handle->ctx)->StreamSync(handle->ctx, nullptr);
 }
 
 void ArrayCopyToBytes(const DLTensor* handle, void* data, size_t nbytes) {
-  TVMContext cpu_ctx;
-  cpu_ctx.device_type = kDLCPU;
-  cpu_ctx.device_id = 0;
   size_t arr_size = GetDataSize(*handle);
   ICHECK_EQ(arr_size, nbytes) << "ArrayCopyToBytes: size mismatch";
   ICHECK(IsContiguous(*handle)) << "ArrayCopyToBytes only support contiguous array for now";
-  DeviceAPI::Get(handle->ctx)
-      ->CopyDataFromTo(handle->data, static_cast<size_t>(handle->byte_offset), data, 0, nbytes,
-                       handle->ctx, cpu_ctx, handle->dtype, nullptr);
+
+  DLTensor to;
+  to.data = const_cast<void*>(data);
+  to.ctx = DLContext{kDLCPU, 0};
+  to.ndim = handle->ndim;
+  to.dtype = handle->dtype;
+  to.shape = handle->shape;
+  to.strides = nullptr;
+  to.byte_offset = 0;
+
+  DeviceAPI::Get(handle->ctx)->CopyDataFromTo(const_cast<DLTensor*>(handle), &to, nullptr);
   // Synchronize in case data become unavailable later.
   DeviceAPI::Get(handle->ctx)->StreamSync(handle->ctx, nullptr);
 }
@@ -186,13 +190,11 @@ NDArray NDArray::CreateView(std::vector<int64_t> shape, DLDataType dtype) {
 
 DLManagedTensor* NDArray::ToDLPack() const { return Internal::ToDLPack(get_mutable()); }
 
-NDArray NDArray::Empty(std::vector<int64_t> shape, DLDataType dtype, DLContext ctx) {
+NDArray NDArray::Empty(std::vector<int64_t> shape, DLDataType dtype, DLContext ctx,
+                       Optional<String> mem_scope) {
   NDArray ret = Internal::Create(shape, dtype, ctx);
-  // setup memory content
-  size_t size = GetDataSize(ret.get_mutable()->dl_tensor);
-  size_t alignment = GetDataAlignment(ret.get_mutable()->dl_tensor);
-  ret.get_mutable()->dl_tensor.data =
-      DeviceAPI::Get(ret->ctx)->AllocDataSpace(ret->ctx, size, alignment, ret->dtype);
+  ret.get_mutable()->dl_tensor.data = DeviceAPI::Get(ret->ctx)->AllocDataSpace(
+      ret->ctx, shape.size(), shape.data(), ret->dtype, mem_scope);
   return ret;
 }
 
@@ -236,9 +238,7 @@ void NDArray::CopyFromTo(const DLTensor* from, DLTensor* to, TVMStreamHandle str
   // api manager.
   TVMContext ctx = from->ctx.device_type != kDLCPU ? from->ctx : to->ctx;
 
-  DeviceAPI::Get(ctx)->CopyDataFromTo(from->data, static_cast<size_t>(from->byte_offset), to->data,
-                                      static_cast<size_t>(to->byte_offset), from_size, from->ctx,
-                                      to->ctx, from->dtype, stream);
+  DeviceAPI::Get(ctx)->CopyDataFromTo(const_cast<DLTensor*>(from), to, stream);
 }
 
 std::vector<int64_t> NDArray::Shape() const { return get_mutable()->shape_; }
@@ -279,6 +279,17 @@ int TVMArrayAlloc(const tvm_index_t* shape, int ndim, int dtype_code, int dtype_
   API_END();
 }
 
+TVM_REGISTER_GLOBAL("runtime.TVMArrayAllocWithScope").set_body([](TVMArgs args, TVMRetValue* ret) {
+  int64_t* shape_ptr = static_cast<int64_t*>(static_cast<void*>(args[0]));
+  int ndim = args[1];
+  std::vector<int64_t> shape(shape_ptr, shape_ptr + ndim);
+  DataType dtype = args[2];
+  TVMContext ctx = args[3];
+  Optional<String> mem_scope = args[4];
+  auto ndarray = NDArray::Empty(shape, dtype, ctx, mem_scope);
+  *ret = ndarray;
+});
+
 int TVMArrayFree(TVMArrayHandle handle) {
   API_BEGIN();
   NDArray::Internal::FFIDecRef(handle);
diff --git a/src/runtime/opencl/opencl_common.h b/src/runtime/opencl/opencl_common.h
index fa118ed9525b..2e7f05f91020 100644
--- a/src/runtime/opencl/opencl_common.h
+++ b/src/runtime/opencl/opencl_common.h
@@ -232,9 +232,6 @@ class OpenCLWorkspace : public DeviceAPI {
   void GetAttr(TVMContext ctx, DeviceAttrKind kind, TVMRetValue* rv) final;
   void* AllocDataSpace(TVMContext ctx, size_t size, size_t alignment, DLDataType type_hint) final;
   void FreeDataSpace(TVMContext ctx, void* ptr) final;
-  void CopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset, size_t size,
-                      TVMContext ctx_from, TVMContext ctx_to, DLDataType type_hint,
-                      TVMStreamHandle stream) final;
   void StreamSync(TVMContext ctx, TVMStreamHandle stream) final;
   void* AllocWorkspace(TVMContext ctx, size_t size, DLDataType type_hint) final;
   void FreeWorkspace(TVMContext ctx, void* data) final;
@@ -246,6 +243,11 @@ class OpenCLWorkspace : public DeviceAPI {
 
   // get the global workspace
   static OpenCLWorkspace* Global();
+
+ protected:
+  void CopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset, size_t size,
+                      TVMContext ctx_from, TVMContext ctx_to, DLDataType type_hint,
+                      TVMStreamHandle stream) final;
 };
 
 /*! \brief Thread local workspace */
diff --git a/src/runtime/rpc/rpc_device_api.cc b/src/runtime/rpc/rpc_device_api.cc
index a1e96e92b4e0..06737f99a4de 100644
--- a/src/runtime/rpc/rpc_device_api.cc
+++ b/src/runtime/rpc/rpc_device_api.cc
@@ -43,6 +43,18 @@ class RPCDeviceAPI final : public DeviceAPI {
     GetSess(ctx)->GetDeviceAPI(remote_ctx)->GetAttr(remote_ctx, kind, rv);
   }
 
+  void* AllocDataSpace(TVMContext ctx, int ndim, const int64_t* shape, DLDataType dtype,
+                       Optional<String> mem_scope) final {
+    auto sess = GetSess(ctx);
+    auto remote_ctx = RemoveRPCSessionMask(ctx);
+    void* data =
+        sess->GetDeviceAPI(remote_ctx)->AllocDataSpace(remote_ctx, ndim, shape, dtype, mem_scope);
+    RemoteSpace* space = new RemoteSpace();
+    space->data = data;
+    space->sess = std::move(sess);
+    return space;
+  }
+
   void* AllocDataSpace(TVMContext ctx, size_t nbytes, size_t alignment,
                        DLDataType type_hint) final {
     auto sess = GetSess(ctx);
@@ -65,30 +77,36 @@ class RPCDeviceAPI final : public DeviceAPI {
     }
     delete space;
   }
-  void CopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset, size_t size,
-                      TVMContext ctx_from, TVMContext ctx_to, DLDataType type_hint,
-                      TVMStreamHandle stream) final {
+
+  void CopyDataFromTo(DLTensor* from, DLTensor* to, TVMStreamHandle stream) final {
+    DLContext ctx_from = from->ctx;
+    DLContext ctx_to = to->ctx;
     if (IsRPCSessionContext(ctx_from) && IsRPCSessionContext(ctx_to)) {
       ICHECK(ctx_from.device_type == ctx_to.device_type)
           << "Cannot copy across two different remote session";
-      auto remote_ctx_from = RemoveRPCSessionMask(ctx_from);
-      auto remote_ctx_to = RemoveRPCSessionMask(ctx_to);
-      auto remote_ctx = remote_ctx_from;
-      if (remote_ctx.device_type == kDLCPU) remote_ctx = remote_ctx_to;
-      GetSess(ctx_from)
-          ->GetDeviceAPI(remote_ctx)
-          ->CopyDataFromTo(static_cast<const RemoteSpace*>(from)->data, from_offset,
-                           static_cast<const RemoteSpace*>(to)->data, to_offset, size,
-                           remote_ctx_from, remote_ctx_to, type_hint, stream);
+      DLTensor from_tensor = *from;
+      from_tensor.ctx = RemoveRPCSessionMask(ctx_from);
+      from_tensor.data = static_cast<const RemoteSpace*>(from->data)->data;
+      DLTensor to_tensor = *to;
+      to_tensor.ctx = RemoveRPCSessionMask(ctx_to);
+      to_tensor.data = static_cast<const RemoteSpace*>(to->data)->data;
+      auto remote_ctx = from_tensor.ctx;
+      if (remote_ctx.device_type == kDLCPU) remote_ctx = to_tensor.ctx;
+      GetSess(ctx_from)->GetDeviceAPI(remote_ctx)->CopyDataFromTo(&from_tensor, &to_tensor, stream);
     } else if (IsRPCSessionContext(ctx_from) && ctx_to.device_type == kDLCPU) {
-      auto remote_ctx_from = RemoveRPCSessionMask(ctx_from);
-      GetSess(ctx_from)->CopyFromRemote(static_cast<const RemoteSpace*>(from)->data, from_offset,
-                                        to, to_offset, size, remote_ctx_from, type_hint);
+      DLTensor from_tensor = *from;
+      from_tensor.ctx = RemoveRPCSessionMask(ctx_from);
+      from_tensor.data = static_cast<const RemoteSpace*>(from->data)->data;
+      void* to_bytes = static_cast<char*>(to->data) + to->byte_offset;
+      size_t nbytes = GetDataSize(*to);
+      GetSess(ctx_from)->CopyFromRemote(&from_tensor, to_bytes, nbytes);
     } else if (ctx_from.device_type == kDLCPU && IsRPCSessionContext(ctx_to)) {
-      auto remote_ctx_to = RemoveRPCSessionMask(ctx_to);
-      GetSess(ctx_to)->CopyToRemote(const_cast<void*>(from), from_offset,
-                                    static_cast<const RemoteSpace*>(to)->data, to_offset, size,
-                                    remote_ctx_to, type_hint);
+      DLTensor to_tensor = *to;
+      to_tensor.ctx = RemoveRPCSessionMask(ctx_to);
+      to_tensor.data = static_cast<const RemoteSpace*>(to->data)->data;
+      void* from_bytes = static_cast<char*>(from->data) + from->byte_offset;
+      size_t nbytes = GetDataSize(*from);
+      GetSess(ctx_to)->CopyToRemote(from_bytes, &to_tensor, nbytes);
     } else {
       LOG(FATAL) << "expect copy from/to remote or between remote";
     }
@@ -99,6 +117,13 @@ class RPCDeviceAPI final : public DeviceAPI {
     GetSess(ctx)->GetDeviceAPI(remote_ctx)->StreamSync(remote_ctx, stream);
   }
 
+ protected:
+  void CopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset,
+                      size_t num_bytes, TVMContext ctx_from, TVMContext ctx_to,
+                      DLDataType type_hint, TVMStreamHandle stream) final {
+    LOG(FATAL) << "Not implemented.";
+  }
+
  private:
   std::shared_ptr<RPCSession> GetSess(TVMContext ctx) {
     int tbl_index = GetRPCSessionIndex(ctx);
diff --git a/src/runtime/rpc/rpc_endpoint.cc b/src/runtime/rpc/rpc_endpoint.cc
index fbdd93fb4f62..8716355fd68f 100644
--- a/src/runtime/rpc/rpc_endpoint.cc
+++ b/src/runtime/rpc/rpc_endpoint.cc
@@ -387,88 +387,72 @@ class RPCEndpoint::EventHandler : public dmlc::Stream {
   void HandleSyscall(RPCCode code);
 
   void HandleCopyFromRemote() {
-    uint64_t handle, offset, num_bytes;
-    TVMContext ctx;
-    DLDataType type_hint;
-    this->Read(&handle);
-    this->Read(&offset);
-    this->Read(&num_bytes);
-    this->Read(&ctx);
-    this->Read(&type_hint);
-    size_t elem_bytes = (type_hint.bits * type_hint.lanes + 7) / 8;
-
+    DLTensor* arr = RPCReference::ReceiveDLTensor(this);
+    uint64_t data_bytes;
+    this->Read(&data_bytes);
+    size_t elem_bytes = (arr->dtype.bits * arr->dtype.lanes + 7) / 8;
     auto* sess = GetServingSession();
-
     // Return Copy Ack with the given data
-    auto fcopyack = [this](char* data_ptr, size_t num_bytes) {
+    auto fcopyack = [this](char* dptr, size_t num_bytes) {
       RPCCode code = RPCCode::kCopyAck;
       uint64_t packet_nbytes = sizeof(code) + num_bytes;
 
       this->Write(packet_nbytes);
       this->Write(code);
-      this->WriteArray(data_ptr, num_bytes);
+      this->WriteArray(dptr, num_bytes);
       this->SwitchToState(kRecvPacketNumBytes);
     };
 
     // When session is local, we can directly treat handle
     // as the cpu pointer without allocating a temp space.
-    if (ctx.device_type == kDLCPU && sess->IsLocalSession() && DMLC_IO_NO_ENDIAN_SWAP) {
-      char* data_ptr = reinterpret_cast<char*>(handle) + offset;
-      fcopyack(data_ptr, num_bytes);
+    if (arr->ctx.device_type == kDLCPU && sess->IsLocalSession() && DMLC_IO_NO_ENDIAN_SWAP) {
+      char* data_ptr = reinterpret_cast<char*>(arr->data) + arr->byte_offset;
+      fcopyack(data_ptr, data_bytes);
     } else {
-      char* data_ptr = this->ArenaAlloc<char>(num_bytes);
-
-      auto on_copy_complete = [this, elem_bytes, num_bytes, data_ptr, fcopyack](RPCCode status,
-                                                                                TVMArgs args) {
+      char* temp_data = this->ArenaAlloc<char>(data_bytes);
+      auto on_copy_complete = [this, elem_bytes, data_bytes, temp_data, fcopyack](RPCCode status,
+                                                                                  TVMArgs args) {
         if (status == RPCCode::kException) {
           this->ReturnException(args.values[0].v_str);
           this->SwitchToState(kRecvPacketNumBytes);
         } else {
           // endian aware handling
           if (!DMLC_IO_NO_ENDIAN_SWAP) {
-            dmlc::ByteSwap(data_ptr, elem_bytes, num_bytes / elem_bytes);
+            dmlc::ByteSwap(temp_data, elem_bytes, data_bytes / elem_bytes);
           }
-          fcopyack(data_ptr, num_bytes);
+          fcopyack(temp_data, data_bytes);
         }
       };
 
       this->SwitchToState(kWaitForAsyncCallback);
-      sess->AsyncCopyFromRemote(reinterpret_cast<void*>(handle), offset, data_ptr, 0, num_bytes,
-                                ctx, type_hint, on_copy_complete);
+      sess->AsyncCopyFromRemote(arr, static_cast<void*>(temp_data), data_bytes, on_copy_complete);
     }
   }
 
   void HandleCopyToRemote() {
-    uint64_t handle, offset, num_bytes;
-    TVMContext ctx;
-    DLDataType type_hint;
-
-    this->Read(&handle);
-    this->Read(&offset);
-    this->Read(&num_bytes);
-    this->Read(&ctx);
-    this->Read(&type_hint);
-
-    size_t elem_bytes = (type_hint.bits * type_hint.lanes + 7) / 8;
+    DLTensor* arr = RPCReference::ReceiveDLTensor(this);
+    uint64_t data_bytes;
+    this->Read(&data_bytes);
+    size_t elem_bytes = (arr->dtype.bits * arr->dtype.lanes + 7) / 8;
     auto* sess = GetServingSession();
 
     // When session is local, we can directly treat handle
     // as the cpu pointer without allocating a temp space.
-    if (ctx.device_type == kDLCPU && sess->IsLocalSession()) {
-      char* dptr = reinterpret_cast<char*>(handle) + offset;
-      this->ReadArray(dptr, num_bytes);
+    if (arr->ctx.device_type == kDLCPU && sess->IsLocalSession()) {
+      char* dptr = reinterpret_cast<char*>(arr->data) + arr->byte_offset;
+      this->ReadArray(dptr, data_bytes);
 
       if (!DMLC_IO_NO_ENDIAN_SWAP) {
-        dmlc::ByteSwap(dptr, elem_bytes, num_bytes / elem_bytes);
+        dmlc::ByteSwap(dptr, elem_bytes, data_bytes / elem_bytes);
       }
       this->ReturnVoid();
       this->SwitchToState(kRecvPacketNumBytes);
     } else {
-      char* temp_data = this->ArenaAlloc<char>(num_bytes);
-      this->ReadArray(temp_data, num_bytes);
+      char* temp_data = this->ArenaAlloc<char>(data_bytes);
+      this->ReadArray(temp_data, data_bytes);
 
       if (!DMLC_IO_NO_ENDIAN_SWAP) {
-        dmlc::ByteSwap(temp_data, elem_bytes, num_bytes / elem_bytes);
+        dmlc::ByteSwap(temp_data, elem_bytes, data_bytes / elem_bytes);
       }
 
       auto on_copy_complete = [this](RPCCode status, TVMArgs args) {
@@ -482,8 +466,7 @@ class RPCEndpoint::EventHandler : public dmlc::Stream {
       };
 
       this->SwitchToState(kWaitForAsyncCallback);
-      sess->AsyncCopyToRemote(temp_data, 0, reinterpret_cast<void*>(handle), offset, num_bytes, ctx,
-                              type_hint, on_copy_complete);
+      sess->AsyncCopyToRemote(static_cast<void*>(temp_data), arr, data_bytes, on_copy_complete);
     }
   }
 
@@ -815,51 +798,47 @@ void RPCEndpoint::CallFunc(RPCSession::PackedFuncHandle h, const TVMValue* arg_v
   ICHECK(code == RPCCode::kReturn) << "code=" << static_cast<int>(code);
 }
 
-void RPCEndpoint::CopyToRemote(void* from, size_t from_offset, void* to, size_t to_offset,
-                               size_t data_size, TVMContext ctx_to, DLDataType type_hint) {
+void RPCEndpoint::CopyToRemote(void* from_bytes, DLTensor* to, uint64_t nbytes) {
   std::lock_guard<std::mutex> lock(mutex_);
   RPCCode code = RPCCode::kCopyToRemote;
-  uint64_t handle = reinterpret_cast<uint64_t>(to);
-  uint64_t offset = static_cast<uint64_t>(to_offset);
-  uint64_t size = static_cast<uint64_t>(data_size);
 
-  uint64_t packet_nbytes = sizeof(code) + sizeof(handle) + sizeof(offset) + sizeof(size) +
-                           sizeof(ctx_to) + sizeof(type_hint) + data_size;
+  uint64_t num_data_bytes = static_cast<uint64_t>(GetDataSize(*to));
+  ICHECK_EQ(nbytes, num_data_bytes);
+
+  uint64_t to_data = reinterpret_cast<uint64_t>(to->data);
+  uint64_t shape_bytes = to->ndim * sizeof(int64_t);
+  uint64_t packet_nbytes = sizeof(code) + sizeof(to_data) + sizeof(to->ctx) + sizeof(to->ndim) +
+                           sizeof(to->dtype) + sizeof(to->byte_offset) + shape_bytes +
+                           sizeof(nbytes) + num_data_bytes;
 
   handler_->Write(packet_nbytes);
   handler_->Write(code);
-  handler_->Write(handle);
-  handler_->Write(offset);
-  handler_->Write(size);
-  handler_->Write(ctx_to);
-  handler_->Write(type_hint);
-  handler_->WriteArray(reinterpret_cast<char*>(from) + from_offset, data_size);
-
+  RPCReference::SendDLTensor(handler_, to);
+  handler_->Write(nbytes);
+  handler_->WriteArray(reinterpret_cast<char*>(from_bytes), nbytes);
   ICHECK(HandleUntilReturnEvent(true, [](TVMArgs) {}) == RPCCode::kReturn);
 }
 
-void RPCEndpoint::CopyFromRemote(void* from, size_t from_offset, void* to, size_t to_offset,
-                                 size_t data_size, TVMContext ctx_from, DLDataType type_hint) {
+void RPCEndpoint::CopyFromRemote(DLTensor* from, void* to_bytes, uint64_t nbytes) {
   std::lock_guard<std::mutex> lock(mutex_);
   RPCCode code = RPCCode::kCopyFromRemote;
-  uint64_t handle = reinterpret_cast<uint64_t>(from);
-  uint64_t offset = static_cast<uint64_t>(from_offset);
-  uint64_t size = static_cast<uint64_t>(data_size);
 
-  uint64_t packet_nbytes = sizeof(code) + sizeof(handle) + sizeof(offset) + sizeof(size) +
-                           sizeof(ctx_from) + sizeof(type_hint);
+  uint64_t num_data_bytes = static_cast<uint64_t>(GetDataSize(*from));
+  CHECK_EQ(nbytes, num_data_bytes);
+
+  uint64_t from_data = reinterpret_cast<uint64_t>(from->data);
+  uint64_t shape_bytes = from->ndim * sizeof(int64_t);
+  uint64_t packet_nbytes = sizeof(code) + sizeof(from_data) + sizeof(from->ctx) +
+                           sizeof(from->ndim) + sizeof(from->dtype) + sizeof(from->byte_offset) +
+                           shape_bytes + sizeof(nbytes);
 
   handler_->Write(packet_nbytes);
   handler_->Write(code);
-  handler_->Write(handle);
-  handler_->Write(offset);
-  handler_->Write(size);
-  handler_->Write(ctx_from);
-  handler_->Write(type_hint);
-
-  TVMRetValue rv;
+  RPCReference::SendDLTensor(handler_, from);
+  handler_->Write(nbytes);
   ICHECK(HandleUntilReturnEvent(true, [](TVMArgs) {}) == RPCCode::kCopyAck);
-  handler_->ReadArray(reinterpret_cast<char*>(to) + to_offset, data_size);
+
+  handler_->ReadArray(reinterpret_cast<char*>(to_bytes), nbytes);
   handler_->FinishCopyAck();
 }
 
@@ -904,6 +883,23 @@ void RPCDevAllocData(RPCSession* handler, TVMArgs args, TVMRetValue* rv) {
   *rv = data;
 }
 
+void RPCDevAllocDataWithScope(RPCSession* handler, TVMArgs args, TVMRetValue* rv) {
+  DLTensor* arr = args[0];
+  TVMContext ctx = arr->ctx;
+  int ndim = arr->ndim;
+  int64_t* shape = arr->shape;
+  DLDataType dtype = arr->dtype;
+  int tcode = args[1].type_code();
+  Optional<String> mem_scope = NullOpt;
+  if (tcode == kTVMStr) {
+    mem_scope = args[1].operator String();
+  } else {
+    ICHECK_EQ(tcode, kTVMNullptr);
+  }
+  void* data = handler->GetDeviceAPI(ctx)->AllocDataSpace(ctx, ndim, shape, dtype, mem_scope);
+  *rv = data;
+}
+
 void RPCDevFreeData(RPCSession* handler, TVMArgs args, TVMRetValue* rv) {
   TVMContext ctx = args[0];
   void* ptr = args[1];
@@ -911,25 +907,18 @@ void RPCDevFreeData(RPCSession* handler, TVMArgs args, TVMRetValue* rv) {
 }
 
 void RPCCopyAmongRemote(RPCSession* handler, TVMArgs args, TVMRetValue* rv) {
-  void* from = args[0];
-  uint64_t from_offset = args[1];
-  void* to = args[2];
-  uint64_t to_offset = args[3];
-  uint64_t size = args[4];
-  TVMContext ctx_from = args[5];
-  TVMContext ctx_to = args[6];
-  DLDataType type_hint = args[7];
-  TVMStreamHandle stream = args[8];
-  TVMContext ctx = ctx_from;
+  DLTensor* from = args[0];
+  DLTensor* to = args[1];
+  TVMStreamHandle stream = args[2];
 
+  TVMContext ctx = from->ctx;
   if (ctx.device_type == kDLCPU) {
-    ctx = ctx_to;
+    ctx = to->ctx;
   } else {
-    ICHECK(ctx_to.device_type == kDLCPU || ctx_to.device_type == ctx_from.device_type)
+    ICHECK(to->ctx.device_type == kDLCPU || to->ctx.device_type == from->ctx.device_type)
         << "Can not copy across different ctx types directly";
   }
-  handler->GetDeviceAPI(ctx)->CopyDataFromTo(from, from_offset, to, to_offset, size, ctx_from,
-                                             ctx_to, type_hint, stream);
+  handler->GetDeviceAPI(ctx)->CopyDataFromTo(from, to, stream);
 }
 
 void RPCEndpoint::EventHandler::HandleSyscall(RPCCode code) {
@@ -951,6 +940,9 @@ void RPCEndpoint::EventHandler::HandleSyscall(RPCCode code) {
     case RPCCode::kDevAllocData:
       SysCallHandler(RPCDevAllocData);
       break;
+    case RPCCode::kDevAllocDataWithScope:
+      SysCallHandler(RPCDevAllocDataWithScope);
+      break;
     case RPCCode::kDevFreeData:
       SysCallHandler(RPCDevFreeData);
       break;
@@ -989,14 +981,12 @@ class RPCClientSession : public RPCSession, public DeviceAPI {
     endpoint_->CallFunc(func, arg_values, arg_type_codes, num_args, fencode_return);
   }
 
-  void CopyToRemote(void* from, size_t from_offset, void* to, size_t to_offset, size_t nbytes,
-                    TVMContext ctx_to, DLDataType type_hint) final {
-    endpoint_->CopyToRemote(from, from_offset, to, to_offset, nbytes, ctx_to, type_hint);
+  void CopyToRemote(void* local_from_bytes, DLTensor* remote_to, uint64_t nbytes) final {
+    endpoint_->CopyToRemote(local_from_bytes, remote_to, nbytes);
   }
 
-  void CopyFromRemote(void* from, size_t from_offset, void* to, size_t to_offset, size_t nbytes,
-                      TVMContext ctx_from, DLDataType type_hint) final {
-    endpoint_->CopyFromRemote(from, from_offset, to, to_offset, nbytes, ctx_from, type_hint);
+  void CopyFromRemote(DLTensor* remote_from, void* local_to_bytes, uint64_t nbytes) final {
+    endpoint_->CopyFromRemote(remote_from, local_to_bytes, nbytes);
   }
 
   void FreeHandle(void* handle, int type_code) final {
@@ -1019,15 +1009,30 @@ class RPCClientSession : public RPCSession, public DeviceAPI {
     return endpoint_->SysCallRemote(RPCCode::kDevAllocData, ctx, nbytes, alignment, type_hint);
   }
 
+  void* AllocDataSpace(TVMContext ctx, int ndim, const int64_t* shape, DLDataType dtype,
+                       Optional<String> mem_scope) final {
+    DLTensor temp;
+    temp.data = nullptr;
+    temp.ctx = ctx;
+    temp.ndim = ndim;
+    temp.dtype = dtype;
+    temp.shape = const_cast<int64_t*>(shape);
+    temp.strides = nullptr;
+    temp.byte_offset = 0;
+    if (mem_scope.defined()) {
+      return endpoint_->SysCallRemote(RPCCode::kDevAllocDataWithScope, &temp,
+                                      static_cast<std::string>(mem_scope.value()));
+    } else {
+      return endpoint_->SysCallRemote(RPCCode::kDevAllocDataWithScope, &temp, nullptr);
+    }
+  }
+
   void FreeDataSpace(TVMContext ctx, void* ptr) final {
     endpoint_->SysCallRemote(RPCCode::kDevFreeData, ctx, ptr);
   }
 
-  void CopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset, size_t size,
-                      TVMContext ctx_from, TVMContext ctx_to, DLDataType type_hint,
-                      TVMStreamHandle stream) final {
-    endpoint_->SysCallRemote(RPCCode::kCopyAmongRemote, const_cast<void*>(from), from_offset, to,
-                             to_offset, size, ctx_from, ctx_to, type_hint, stream);
+  void CopyDataFromTo(DLTensor* from, DLTensor* to, TVMStreamHandle stream) final {
+    endpoint_->SysCallRemote(RPCCode::kCopyAmongRemote, from, to, stream);
   }
 
   void StreamSync(TVMContext ctx, TVMStreamHandle stream) final {
diff --git a/src/runtime/rpc/rpc_endpoint.h b/src/runtime/rpc/rpc_endpoint.h
index 031435fc8ef9..8e08bfa75623 100644
--- a/src/runtime/rpc/rpc_endpoint.h
+++ b/src/runtime/rpc/rpc_endpoint.h
@@ -135,8 +135,7 @@ class RPCEndpoint {
    * \param ctx_to The target context.
    * \param type_hint Hint of content data type.
    */
-  void CopyToRemote(void* from, size_t from_offset, void* to, size_t to_offset, size_t nbytes,
-                    TVMContext ctx_to, DLDataType type_hint);
+  void CopyToRemote(void* from_bytes, DLTensor* to, uint64_t nbytes);
   /*!
    * \brief Copy bytes from remote array content.
    * \param from The source host data.
@@ -147,8 +146,7 @@ class RPCEndpoint {
    * \param ctx_from The source context.
    * \param type_hint Hint of content data type.
    */
-  void CopyFromRemote(void* from, size_t from_offset, void* to, size_t to_offset, size_t nbytes,
-                      TVMContext ctx_from, DLDataType type_hint);
+  void CopyFromRemote(DLTensor* from, void* to_bytes, uint64_t nbytes);
 
   /*!
    * \brief Call a remote defined system function with arguments.
diff --git a/src/runtime/rpc/rpc_local_session.cc b/src/runtime/rpc/rpc_local_session.cc
index b35c62d255fc..0650b55d0d7c 100644
--- a/src/runtime/rpc/rpc_local_session.cc
+++ b/src/runtime/rpc/rpc_local_session.cc
@@ -87,26 +87,36 @@ void LocalSession::CallFunc(RPCSession::PackedFuncHandle func, const TVMValue* a
   this->EncodeReturn(std::move(rv), encode_return);
 }
 
-void LocalSession::CopyToRemote(void* from, size_t from_offset, void* to, size_t to_offset,
-                                size_t nbytes, TVMContext ctx_to, DLDataType type_hint) {
-  TVMContext cpu_ctx;
-  cpu_ctx.device_type = kDLCPU;
-  cpu_ctx.device_id = 0;
-  this->GetDeviceAPI(ctx_to)->CopyDataFromTo(from, from_offset, to, to_offset, nbytes, cpu_ctx,
-                                             ctx_to, type_hint, nullptr);
+void LocalSession::CopyToRemote(void* from_bytes, DLTensor* to, uint64_t nbytes) {
+  ICHECK_EQ(nbytes, GetDataSize(*to));
+  DLTensor from;
+  from.data = from_bytes;
+  from.ctx = {kDLCPU, 0};
+  from.ndim = to->ndim;
+  from.shape = to->shape;
+  from.dtype = to->dtype;
+  from.strides = nullptr;
+  from.byte_offset = 0;
+  TVMContext ctx_to = to->ctx;
+  this->GetDeviceAPI(ctx_to)->CopyDataFromTo(&from, to, nullptr);
   // Copy can happen asynchrously
   // synchronize to make sure that copy is completed
   this->GetDeviceAPI(ctx_to)->StreamSync(ctx_to, nullptr);
 }
 
-void LocalSession::CopyFromRemote(void* from, size_t from_offset, void* to, size_t to_offset,
-                                  size_t nbytes, TVMContext ctx_from, DLDataType type_hint) {
-  TVMContext cpu_ctx;
-  cpu_ctx.device_type = kDLCPU;
-  cpu_ctx.device_id = 0;
-
-  this->GetDeviceAPI(ctx_from)->CopyDataFromTo(from, from_offset, to, to_offset, nbytes, ctx_from,
-                                               cpu_ctx, type_hint, nullptr);
+void LocalSession::CopyFromRemote(DLTensor* from, void* to_bytes, uint64_t nbytes) {
+  ICHECK_EQ(nbytes, GetDataSize(*from));
+  DLTensor to;
+  to.data = to_bytes;
+  to.ctx = {kDLCPU, 0};
+  to.ndim = from->ndim;
+  to.shape = from->shape;
+  to.dtype = from->dtype;
+  to.strides = nullptr;
+  to.byte_offset = 0;
+
+  TVMContext ctx_from = from->ctx;
+  this->GetDeviceAPI(ctx_from)->CopyDataFromTo(from, &to, nullptr);
   // Copy can happen asynchrously
   // synchronize to make sure that copy is completed
   this->GetDeviceAPI(ctx_from)->StreamSync(ctx_from, nullptr);
diff --git a/src/runtime/rpc/rpc_local_session.h b/src/runtime/rpc/rpc_local_session.h
index 7a67ce86bf80..ea070e34bd35 100644
--- a/src/runtime/rpc/rpc_local_session.h
+++ b/src/runtime/rpc/rpc_local_session.h
@@ -48,11 +48,9 @@ class LocalSession : public RPCSession {
   void CallFunc(PackedFuncHandle func, const TVMValue* arg_values, const int* arg_type_codes,
                 int num_args, const FEncodeReturn& fencode_return) override;
 
-  void CopyToRemote(void* from, size_t from_offset, void* to, size_t to_offset, size_t nbytes,
-                    TVMContext ctx_to, DLDataType type_hint) override;
+  void CopyToRemote(void* from_bytes, DLTensor* to, uint64_t nbytes) override;
 
-  void CopyFromRemote(void* from, size_t from_offset, void* to, size_t to_offset, size_t nbytes,
-                      TVMContext ctx_from, DLDataType type_hint) override;
+  void CopyFromRemote(DLTensor* from, void* to_bytes, uint64_t nbytes) override;
 
   void FreeHandle(void* handle, int type_code) override;
 
diff --git a/src/runtime/rpc/rpc_session.cc b/src/runtime/rpc/rpc_session.cc
index f5405f0c2fa0..0ac5b8dc74ef 100644
--- a/src/runtime/rpc/rpc_session.cc
+++ b/src/runtime/rpc/rpc_session.cc
@@ -51,33 +51,28 @@ void RPCSession::AsyncCallFunc(PackedFuncHandle func, const TVMValue* arg_values
   }
 }
 
-void RPCSession::AsyncCopyToRemote(void* local_from, size_t local_from_offset, void* remote_to,
-                                   size_t remote_to_offset, size_t nbytes, TVMContext remote_ctx_to,
-                                   DLDataType type_hint, RPCSession::FAsyncCallback callback) {
+void RPCSession::AsyncCopyToRemote(void* local_from_bytes, DLTensor* remote_to, uint64_t nbytes,
+                                   RPCSession::FAsyncCallback callback) {
   TVMValue value;
   int32_t tcode = kTVMNullptr;
   value.v_handle = nullptr;
 
   try {
-    this->CopyToRemote(local_from, local_from_offset, remote_to, remote_to_offset, nbytes,
-                       remote_ctx_to, type_hint);
+    this->CopyToRemote(local_from_bytes, remote_to, nbytes);
     callback(RPCCode::kReturn, TVMArgs(&value, &tcode, 1));
   } catch (const std::runtime_error& e) {
     this->SendException(callback, e.what());
   }
 }
 
-void RPCSession::AsyncCopyFromRemote(void* remote_from, size_t remote_from_offset, void* local_to,
-                                     size_t local_to_offset, size_t nbytes,
-                                     TVMContext remote_ctx_from, DLDataType type_hint,
+void RPCSession::AsyncCopyFromRemote(DLTensor* remote_from, void* local_to_bytes, uint64_t nbytes,
                                      RPCSession::FAsyncCallback callback) {
   TVMValue value;
   int32_t tcode = kTVMNullptr;
   value.v_handle = nullptr;
 
   try {
-    this->CopyFromRemote(remote_from, remote_from_offset, local_to, local_to_offset, nbytes,
-                         remote_ctx_from, type_hint);
+    this->CopyFromRemote(remote_from, local_to_bytes, nbytes);
     callback(RPCCode::kReturn, TVMArgs(&value, &tcode, 1));
   } catch (const std::runtime_error& e) {
     this->SendException(callback, e.what());
diff --git a/src/runtime/rpc/rpc_session.h b/src/runtime/rpc/rpc_session.h
index 4ea937acc6ef..4b942f2230ba 100644
--- a/src/runtime/rpc/rpc_session.h
+++ b/src/runtime/rpc/rpc_session.h
@@ -127,30 +127,18 @@ class RPCSession {
 
   /*!
    * \brief Copy bytes into remote array content.
-   * \param local_from The source host data.
-   * \param local_from_offset The byte offeset in the from.
+   * \param local_from_bytes The source host data.
    * \param remote_to The target array.
-   * \param remote_to_offset The byte offset in the to.
    * \param nbytes The size of the memory in bytes.
-   * \param remote_ctx_to The target context.
-   * \param type_hint Hint of content data type.
    */
-  virtual void CopyToRemote(void* local_from, size_t local_from_offset, void* remote_to,
-                            size_t remote_to_offset, size_t nbytes, TVMContext remote_ctx_to,
-                            DLDataType type_hint) = 0;
+  virtual void CopyToRemote(void* local_from_bytes, DLTensor* remote_to, uint64_t nbytes) = 0;
   /*!
    * \brief Copy bytes from remote array content.
    * \param remote_from The source host data.
-   * \param remote_from_offset The byte offeset in the from.
-   * \param to The target array.
-   * \param to_offset The byte offset in the to.
+   * \param local_to_bytes The target array.
    * \param nbytes The size of the memory in bytes.
-   * \param remote_ctx_from The source context in the remote.
-   * \param type_hint Hint of content data type.
    */
-  virtual void CopyFromRemote(void* remote_from, size_t remote_from_offset, void* local_to,
-                              size_t local_to_offset, size_t nbytes, TVMContext remote_ctx_from,
-                              DLDataType type_hint) = 0;
+  virtual void CopyFromRemote(DLTensor* remote_from, void* local_to_bytes, uint64_t nbytes) = 0;
 
   /*!
    * \brief Free a remote function.
@@ -223,40 +211,27 @@ class RPCSession {
   /*!
    * \brief Asynchrous version of CopyToRemote.
    *
-   * \param local_from The source host data.
-   * \param local_from_offset The byte offeset in the from.
+   * \param local_from_bytes The source host data.
    * \param remote_to The target array.
-   * \param remote_to_offset The byte offset in the to.
    * \param nbytes The size of the memory in bytes.
-   * \param remote_ctx_to The target context.
-   * \param type_hint Hint of content data type.
-   *
    * \param on_complete The callback to signal copy complete.
    * \note All the allocated memory in local_from, and remote_to
    *       must stay alive until on_compelete is called.
    */
-  virtual void AsyncCopyToRemote(void* local_from, size_t local_from_offset, void* remote_to,
-                                 size_t remote_to_offset, size_t nbytes, TVMContext remote_ctx_to,
-                                 DLDataType type_hint, FAsyncCallback on_complete);
+  virtual void AsyncCopyToRemote(void* local_from_bytes, DLTensor* remote_to, uint64_t nbytes,
+                                 FAsyncCallback on_complete);
 
   /*!
    * \brief Asynchrous version of CopyFromRemote.
    *
    * \param remote_from The source host data.
-   * \param remote_from_offset The byte offeset in the from.
-   * \param to The target array.
-   * \param to_offset The byte offset in the to.
+   * \param local_to_bytes The target array.
    * \param nbytes The size of the memory in bytes.
-   * \param remote_ctx_from The source context in the remote.
-   * \param type_hint Hint of content data type.
-   *
    * \param on_complete The callback to signal copy complete.
    * \note All the allocated memory in remote_from, and local_to
    *       must stay alive until on_compelete is called.
    */
-  virtual void AsyncCopyFromRemote(void* remote_from, size_t remote_from_offset, void* local_to,
-                                   size_t local_to_offset, size_t nbytes,
-                                   TVMContext remote_ctx_from, DLDataType type_hint,
+  virtual void AsyncCopyFromRemote(DLTensor* remote_from, void* local_to_bytes, uint64_t nbytes,
                                    FAsyncCallback on_complete);
   /*!
    * \brief Asynchrously wait for all events in ctx, stream compeletes.
diff --git a/src/runtime/vulkan/vulkan.cc b/src/runtime/vulkan/vulkan.cc
index cbf1974ee3c7..f40fd80f38b5 100644
--- a/src/runtime/vulkan/vulkan.cc
+++ b/src/runtime/vulkan/vulkan.cc
@@ -199,6 +199,7 @@ class VulkanDeviceAPI final : public DeviceAPI {
     delete pbuf;
   }
 
+ protected:
   void CopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset, size_t size,
                       TVMContext ctx_from, TVMContext ctx_to, DLDataType type_hint,
                       TVMStreamHandle stream) final {
@@ -307,6 +308,7 @@ class VulkanDeviceAPI final : public DeviceAPI {
     }
   }
 
+ public:
   // Always use the default stream
   TVMStreamHandle CreateStream(TVMContext ctx) {
     LOG(FATAL) << "Not implemented";
diff --git a/web/emcc/tvmjs_support.cc b/web/emcc/tvmjs_support.cc
index 6abd12252d1d..b72caad1e3df 100644
--- a/web/emcc/tvmjs_support.cc
+++ b/web/emcc/tvmjs_support.cc
@@ -177,33 +177,37 @@ class AsyncLocalSession : public LocalSession {
     }
   }
 
-  void AsyncCopyToRemote(void* local_from, size_t local_from_offset, void* remote_to,
-                         size_t remote_to_offset, size_t nbytes, TVMContext remote_ctx_to,
-                         DLDataType type_hint, FAsyncCallback on_complete) final {
-    TVMContext cpu_ctx;
-    cpu_ctx.device_type = kDLCPU;
-    cpu_ctx.device_id = 0;
+  void AsyncCopyToRemote(void* local_from_bytes, DLTensor* remote_to, uint64_t nbytes,
+                         FAsyncCallback on_complete) final {
     try {
-      this->GetDeviceAPI(remote_ctx_to)
-          ->CopyDataFromTo(local_from, local_from_offset, remote_to, remote_to_offset, nbytes,
-                           cpu_ctx, remote_ctx_to, type_hint, nullptr);
-      this->AsyncStreamWait(remote_ctx_to, nullptr, on_complete);
+      DLTensor local_from;
+      local_from.data = local_from_bytes;
+      local_from.ctx = TVMContext{kDLCPU, 0};
+      local_from.ndim = remote_to->ndim;
+      local_from.shape = remote_to->shape;
+      local_from.dtype = remote_to->dtype;
+      local_from.strides = nullptr;
+      local_from.byte_offset = 0;
+      this->GetDeviceAPI(remote_to->ctx)->CopyDataFromTo(&local_from, remote_to, nullptr);
+      this->AsyncStreamWait(remote_to->ctx, nullptr, on_complete);
     } catch (const std::runtime_error& e) {
       this->SendException(on_complete, e.what());
     }
   }
 
-  void AsyncCopyFromRemote(void* remote_from, size_t remote_from_offset, void* local_to,
-                           size_t local_to_offset, size_t nbytes, TVMContext remote_ctx_from,
-                           DLDataType type_hint, FAsyncCallback on_complete) final {
-    TVMContext cpu_ctx;
-    cpu_ctx.device_type = kDLCPU;
-    cpu_ctx.device_id = 0;
+  void AsyncCopyFromRemote(DLTensor* remote_from, void* local_to_bytes, uint64_t nbytes,
+                           FAsyncCallback on_complete) final {
     try {
-      this->GetDeviceAPI(remote_ctx_from)
-          ->CopyDataFromTo(remote_from, remote_from_offset, local_to, local_to_offset, nbytes,
-                           remote_ctx_from, cpu_ctx, type_hint, nullptr);
-      this->AsyncStreamWait(remote_ctx_from, nullptr, on_complete);
+      DLTensor local_to;
+      local_to.data = local_to_bytes;
+      local_to.ctx = TVMContext{kDLCPU, 0};
+      local_to.ndim = remote_from->ndim;
+      local_to.shape = remote_from->shape;
+      local_to.dtype = remote_from->dtype;
+      local_to.strides = nullptr;
+      local_to.byte_offset = 0;
+      this->GetDeviceAPI(remote_from->ctx)->CopyDataFromTo(&local_to, remote_from, nullptr);
+      this->AsyncStreamWait(remote_from->ctx, nullptr, on_complete);
     } catch (const std::runtime_error& e) {
       this->SendException(on_complete, e.what());
     }
diff --git a/web/emcc/webgpu_runtime.cc b/web/emcc/webgpu_runtime.cc
index 54601e37d037..62b87af01774 100644
--- a/web/emcc/webgpu_runtime.cc
+++ b/web/emcc/webgpu_runtime.cc
@@ -82,6 +82,7 @@ class WebGPUDeviceAPI : public DeviceAPI {
 
   void FreeDataSpace(TVMContext ctx, void* ptr) final { return free_space_(ptr); }
 
+ protected:
   void CopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset, size_t size,
                       TVMContext ctx_from, TVMContext ctx_to, DLDataType type_hint,
                       TVMStreamHandle stream) final {
@@ -102,6 +103,7 @@ class WebGPUDeviceAPI : public DeviceAPI {
     }
   }
 
+ public:
   TVMStreamHandle CreateStream(TVMContext ctx) final {
     LOG(FATAL) << "Not implemented";
     return nullptr;

From 51dc332646d90b77ae29e2e2dbe21f40008a0082 Mon Sep 17 00:00:00 2001
From: masahi <masahi129@gmail.com>
Date: Mon, 1 Mar 2021 17:12:25 +0900
Subject: [PATCH 256/357] Fix foldconstant involving dropout (#7550)

Co-authored-by: masa <masa@pop-os.localdomain>
---
 src/relay/op/nn/nn.cc                         |  3 ++-
 tests/python/relay/test_pass_fold_constant.py | 26 +++++++++++++++++--
 2 files changed, 26 insertions(+), 3 deletions(-)

diff --git a/src/relay/op/nn/nn.cc b/src/relay/op/nn/nn.cc
index 97460ba4a98b..38c33b45936e 100644
--- a/src/relay/op/nn/nn.cc
+++ b/src/relay/op/nn/nn.cc
@@ -591,7 +591,8 @@ The whole array is rescaled by ``1/(1-p)`` to keep the expected sum of the input
     .add_argument("data", "Tensor", "Input to which dropout will be applied.")
     .set_support_level(1)
     .set_attr<FInferCorrectLayout>("FInferCorrectLayout", ElemwiseArbitraryLayout)
-    .add_type_rel("Dropout", DropoutRel);
+    .add_type_rel("Dropout", DropoutRel)
+    .set_attr<TOpIsStateful>("TOpIsStateful", true);
 
 // batch_norm
 TVM_REGISTER_NODE_TYPE(BatchNormAttrs);
diff --git a/tests/python/relay/test_pass_fold_constant.py b/tests/python/relay/test_pass_fold_constant.py
index 14ad419e80c6..7b4eb5231a2c 100644
--- a/tests/python/relay/test_pass_fold_constant.py
+++ b/tests/python/relay/test_pass_fold_constant.py
@@ -16,7 +16,6 @@
 # under the License.
 import numpy as np
 import tvm
-from tvm import te
 from tvm import relay
 from tvm.relay import transform
 from tvm.relay.build_module import bind_params_by_name
@@ -276,12 +275,35 @@ def initializer(_, param):
     assert tvm.ir.structural_equal(mod["main"], expect)
 
 
+def test_fold_dropout():
+    def before():
+        # A constant graph to fire fold constant
+        data = relay.const(np.arange(10).astype(np.float32))
+        dropout = relay.nn.dropout(data)
+        add = dropout + relay.const(1.0)
+        return relay.Function(relay.analysis.free_vars(add), add)
+
+    passes = tvm.transform.Sequential(
+        [
+            relay.transform.InferType(),
+            relay.transform.FoldConstant(),
+        ]
+    )
+
+    before_mod = tvm.IRModule.from_expr(before())
+
+    with tvm.transform.PassContext(opt_level=3):
+        after_mod = passes(before_mod)
+
+    assert tvm.ir.structural_equal(run_infer_type(before_mod["main"]), after_mod["main"])
+
+
 if __name__ == "__main__":
     test_fold_const()
     test_fold_let()
     test_fold_tuple()
     test_fold_concat()
     test_fold_shape_of()
-    test_fold_full()
     test_fold_batch_norm()
     test_fold_ndarray_size()
+    test_fold_dropout()

From 057a673986f0ab50c6be8335339b2beb01e3a1f4 Mon Sep 17 00:00:00 2001
From: Siyuan Feng <Hzfengsy@sjtu.edu.cn>
Date: Tue, 2 Mar 2021 04:59:32 +0800
Subject: [PATCH 257/357] [TensorIR] introduce Block and BlockRealize (#312)
 (#7553)

Co-authored-by: Bohan Hou <32121147+spectrometerHBH@users.noreply.github.com>
Co-authored-by: Junru Shao <junrushao1994@gmail.com>
Co-authored-by: Tianqi Chen <tqchen@users.noreply.github.com>
Co-authored-by: Ruihang Lai <lairuihangdongdong@qq.com>
Co-authored-by: Hongyi Jin <3231950289@qq.com>
Co-authored-by: Wuwei Lin <wuwei@apache.org>

Co-authored-by: Bohan Hou <32121147+spectrometerHBH@users.noreply.github.com>
Co-authored-by: Junru Shao <junrushao1994@gmail.com>
Co-authored-by: Tianqi Chen <tqchen@users.noreply.github.com>
Co-authored-by: Ruihang Lai <lairuihangdongdong@qq.com>
Co-authored-by: Hongyi Jin <3231950289@qq.com>
Co-authored-by: Wuwei Lin <wuwei@apache.org>
---
 include/tvm/tir/stmt.h                  | 248 +++++++++++++++++++++++-
 include/tvm/tir/stmt_functor.h          |   8 +
 python/tvm/tir/__init__.py              |   1 +
 python/tvm/tir/stmt.py                  | 162 ++++++++++++++++
 src/tir/ir/stmt.cc                      | 219 +++++++++++++++++++++
 src/tir/ir/stmt_functor.cc              | 109 +++++++++++
 tests/cpp/ir_functor_test.cc            |  41 ++++
 tests/python/unittest/test_tir_nodes.py |  82 ++++++++
 8 files changed, 869 insertions(+), 1 deletion(-)

diff --git a/include/tvm/tir/stmt.h b/include/tvm/tir/stmt.h
index 093d49ca2dd4..074bcdd3f533 100644
--- a/include/tvm/tir/stmt.h
+++ b/include/tvm/tir/stmt.h
@@ -862,7 +862,7 @@ class For : public Stmt {
 };
 
 /*!
- * \brief A prefetch hint for abuffer
+ * \brief A prefetch hint for a buffer
  */
 class PrefetchNode : public StmtNode {
  public:
@@ -905,6 +905,252 @@ class Prefetch : public Stmt {
   TVM_DEFINE_NOTNULLABLE_OBJECT_REF_METHODS(Prefetch, Stmt, PrefetchNode);
 };
 
+/*!
+ * \brief Representing the region of multi-dimensional buffer access.
+ */
+class BufferRegionNode : public Object {
+ public:
+  /*! \brief The buffer of the buffer region. */
+  Buffer buffer;
+  /*! \brief The region array of the buffer region. */
+  Array<Range> region;
+
+  void VisitAttrs(AttrVisitor* v) {
+    v->Visit("buffer", &buffer);
+    v->Visit("region", &region);
+  }
+
+  bool SEqualReduce(const BufferRegionNode* other, SEqualReducer equal) const {
+    return equal(buffer, other->buffer) && equal(region, other->region);
+  }
+
+  void SHashReduce(SHashReducer hash_reduce) const {
+    hash_reduce(buffer);
+    hash_reduce(region);
+  }
+
+  static constexpr const char* _type_key = "tir.BufferRegion";
+  static constexpr const bool _type_has_method_sequal_reduce = true;
+  static constexpr const bool _type_has_method_shash_reduce = true;
+  TVM_DECLARE_FINAL_OBJECT_INFO(BufferRegionNode, Object);
+};
+
+/*!
+ * \brief Managed reference to BufferRegionNode.
+ * \sa BufferRegionNode
+ */
+class BufferRegion : public ObjectRef {
+ public:
+  TVM_DLL explicit BufferRegion(Buffer buffer, Array<Range> region);
+
+  /*!
+   * \brief Create a BufferRegion which is full region of the given buffer..
+   * \param buffer The buffer to generate full BufferRegion.
+   * \return The BufferRegion which covers all region of the given buffer
+   */
+  TVM_DLL static BufferRegion FullRegion(Buffer buffer);
+
+  TVM_DEFINE_OBJECT_REF_METHODS(BufferRegion, ObjectRef, BufferRegionNode);
+};
+
+/*!
+ * \brief Match introduces a constraint that the source buffer region can be remapped to the data
+ * layout specified by the buffer field. The constraint can be checked in later part of lowering (or
+ * optionally during runtime).
+ *
+ * MatchBufferRegion provides a mechanism to represent data layout and compactness constraints in
+ * low-level hardware primitives in the IR and defer the check after the sequence of
+ * transformations.
+ */
+class MatchBufferRegionNode : public Object {
+ public:
+  /*! \brief The target buffer. */
+  Buffer buffer;
+  /*! \brief The source buffer region. */
+  BufferRegion source;
+
+  void VisitAttrs(AttrVisitor* v) {
+    v->Visit("buffer", &buffer);
+    v->Visit("source", &source);
+  }
+
+  bool SEqualReduce(const MatchBufferRegionNode* other, SEqualReducer equal) const {
+    return equal(buffer, other->buffer) && equal(source, other->source);
+  }
+
+  void SHashReduce(SHashReducer hash_reduce) const {
+    hash_reduce(buffer);
+    hash_reduce(source);
+  }
+
+  static constexpr const char* _type_key = "tir.MatchBufferRegion";
+  static constexpr const bool _type_has_method_sequal_reduce = true;
+  static constexpr const bool _type_has_method_shash_reduce = true;
+  TVM_DECLARE_FINAL_OBJECT_INFO(MatchBufferRegionNode, Object);
+};
+
+/*!
+ * \brief Managed reference to MatchBufferRegionNode.
+ * \sa MatchBufferRegionNode
+ */
+class MatchBufferRegion : public ObjectRef {
+ public:
+  TVM_DLL explicit MatchBufferRegion(Buffer buffer, BufferRegion source);
+
+  TVM_DEFINE_OBJECT_REF_METHODS(MatchBufferRegion, ObjectRef, MatchBufferRegionNode);
+};
+
+/*!
+ * \brief A block is a basic schedule unit in TIR.
+ * \note Block's body is parameterized by iter vars.
+ * \code
+ *
+ *  with tir.block([extent0, extent1, ...], name) as [v0, v1, ...]:
+ *      tir.bind(v0, value0)
+ *      tir.bind(v1, value1)
+ *      ...
+ *      tir.reads([buffer0[start:end, ...], ...])
+ *      tir.writes([buffer1[start:end, ...], ...])
+ *      tir.where(predicate)
+ *      buffer2 = tir.alloc_buffer(shape, dtype)
+ *      buffer3 = tir.match_buffer(source_buffer[start:end, ...])
+ *      tir.attr({attr_key: attr_value, ...})
+ *      with tir.init():
+ *          // init body
+ *      // body
+ *
+ * \endcode
+ */
+class BlockNode : public StmtNode {
+ public:
+  /*! \brief The variables of the block. */
+  Array<IterVar> iter_vars;
+  /*! \brief The read buffer regions of the block. */
+  Array<BufferRegion> reads;
+  /*! \brief The write buffer regions of the block. */
+  Array<BufferRegion> writes;
+  /*! \brief The name_hint of the block. */
+  String name_hint;
+  /*! \brief The body of the block. */
+  Stmt body;
+  /*!
+   * \brief The init statement is executed during the first iteration of reduction loops in a
+   *  reduction block. The optional init field allows us to represent initialization and
+   *  reduction update in a single block and transform them collectively.
+   *  We also provide primitives to decompose the init into a separate block during scheduling.
+   *  Init field is `NullOpt` if there is no reduction iter_vars
+   */
+  Optional<Stmt> init;
+  /*! \brief The buffer allocated in the block. */
+  Array<Buffer> alloc_buffers;
+  /*! \brief The match buffer regions. */
+  Array<MatchBufferRegion> match_buffers;
+  /*! \brief The annotation of the block. */
+  Map<String, ObjectRef> annotations;
+
+  void VisitAttrs(AttrVisitor* v) {
+    v->Visit("iter_vars", &iter_vars);
+    v->Visit("reads", &reads);
+    v->Visit("writes", &writes);
+    v->Visit("name_hint", &name_hint);
+    v->Visit("body", &body);
+    v->Visit("init", &init);
+    v->Visit("alloc_buffers", &alloc_buffers);
+    v->Visit("match_buffers", &match_buffers);
+    v->Visit("annotations", &annotations);
+  }
+
+  bool SEqualReduce(const BlockNode* other, SEqualReducer equal) const {
+    // Need first reduce iter_vars, alloc_buffers and match_buffers to define new vars
+    return equal.DefEqual(iter_vars, other->iter_vars) &&
+           equal(alloc_buffers, other->alloc_buffers) &&
+           equal(match_buffers, other->match_buffers) && equal(reads, other->reads) &&
+           equal(writes, other->writes) && equal(body, other->body) && equal(init, other->init) &&
+           equal(annotations, other->annotations);
+  }
+
+  void SHashReduce(SHashReducer hash_reduce) const {
+    hash_reduce.DefHash(iter_vars);
+    hash_reduce(alloc_buffers);
+    hash_reduce(match_buffers);
+    hash_reduce(reads);
+    hash_reduce(writes);
+    hash_reduce(body);
+    hash_reduce(init);
+    hash_reduce(annotations);
+  }
+
+  static constexpr const char* _type_key = "tir.Block";
+  TVM_DECLARE_FINAL_OBJECT_INFO(BlockNode, StmtNode);
+};
+
+/*!
+ * \brief Managed reference to BlockNode.
+ * \sa BlockNode
+ */
+class Block : public Stmt {
+ public:
+  TVM_DLL explicit Block(Array<IterVar> iter_vars, Array<BufferRegion> reads,
+                         Array<BufferRegion> writes, String name_hint, Stmt body,
+                         Optional<Stmt> init = NullOpt,
+                         Array<Buffer> alloc_buffers = Array<Buffer>(),
+                         Array<MatchBufferRegion> match_buffers = Array<MatchBufferRegion>(),
+                         Map<String, ObjectRef> annotations = Map<String, ObjectRef>(),
+                         Span span = Span());
+
+  TVM_DEFINE_OBJECT_REF_METHODS(Block, Stmt, BlockNode);
+  TVM_DEFINE_OBJECT_REF_COW_METHOD(BlockNode);
+};
+
+/*!
+ * \brief A block realization node represents execution of the block at the binding values.
+ */
+class BlockRealizeNode : public StmtNode {
+ public:
+  /*! \brief The corresponding values of the iter vars. */
+  Array<PrimExpr> iter_values;
+  /*!
+   * \brief The predicate of the block realization, the block will only be executed when the
+   * predicate is true.
+   */
+  PrimExpr predicate;
+  /*! \brief The block to be realized. */
+  Block block;
+
+  void VisitAttrs(AttrVisitor* v) {
+    v->Visit("iter_values", &iter_values);
+    v->Visit("predicate", &predicate);
+    v->Visit("block", &block);
+  }
+
+  bool SEqualReduce(const BlockRealizeNode* other, SEqualReducer equal) const {
+    return equal(iter_values, other->iter_values) && equal(predicate, other->predicate) &&
+           equal(block, other->block);
+  }
+
+  void SHashReduce(SHashReducer hash_reduce) const {
+    hash_reduce(iter_values);
+    hash_reduce(predicate);
+    hash_reduce(block);
+  }
+
+  static constexpr const char* _type_key = "tir.BlockRealize";
+  TVM_DECLARE_FINAL_OBJECT_INFO(BlockRealizeNode, StmtNode);
+};
+
+/*!
+ * \brief Managed reference to BlockRealizeNode
+ * \sa BlockRealizeNode
+ */
+class BlockRealize : public Stmt {
+ public:
+  TVM_DLL explicit BlockRealize(Array<PrimExpr> iter_values, PrimExpr predicate, Block block,
+                                Span span = Span());
+
+  TVM_DEFINE_OBJECT_REF_METHODS(BlockRealize, Stmt, BlockRealizeNode);
+  TVM_DEFINE_OBJECT_REF_COW_METHOD(BlockRealizeNode);
+};
+
 /*! \brief namespace of possible attribute sin AttrStmt.attr_key */
 namespace attr {
 // The above attr does not pass to ir stage.
diff --git a/include/tvm/tir/stmt_functor.h b/include/tvm/tir/stmt_functor.h
index 0f4238deeebd..e53b02d73e1d 100644
--- a/include/tvm/tir/stmt_functor.h
+++ b/include/tvm/tir/stmt_functor.h
@@ -96,6 +96,8 @@ class StmtFunctor<R(const Stmt& n, Args... args)> {
   virtual R VisitStmt_(const PrefetchNode* op, Args... args) STMT_FUNCTOR_DEFAULT;
   virtual R VisitStmt_(const SeqStmtNode* op, Args... args) STMT_FUNCTOR_DEFAULT;
   virtual R VisitStmt_(const EvaluateNode* op, Args... args) STMT_FUNCTOR_DEFAULT;
+  virtual R VisitStmt_(const BlockNode* op, Args... args) STMT_FUNCTOR_DEFAULT;
+  virtual R VisitStmt_(const BlockRealizeNode* op, Args... args) STMT_FUNCTOR_DEFAULT;
   virtual R VisitStmtDefault_(const Object* op, Args...) {
     LOG(FATAL) << "Do not have a default for " << op->GetTypeKey();
     return R();
@@ -119,6 +121,8 @@ class StmtFunctor<R(const Stmt& n, Args... args)> {
     IR_STMT_FUNCTOR_DISPATCH(EvaluateNode);
     IR_STMT_FUNCTOR_DISPATCH(BufferStoreNode);
     IR_STMT_FUNCTOR_DISPATCH(BufferRealizeNode);
+    IR_STMT_FUNCTOR_DISPATCH(BlockNode);
+    IR_STMT_FUNCTOR_DISPATCH(BlockRealizeNode);
     return vtable;
   }
 };
@@ -158,6 +162,8 @@ class TVM_DLL StmtVisitor : protected StmtFunctor<void(const Stmt&)> {
   void VisitStmt_(const PrefetchNode* op) override;
   void VisitStmt_(const SeqStmtNode* op) override;
   void VisitStmt_(const EvaluateNode* op) override;
+  void VisitStmt_(const BlockNode* op) override;
+  void VisitStmt_(const BlockRealizeNode* op) override;
 };
 
 /*!
@@ -249,6 +255,8 @@ class TVM_DLL StmtMutator : protected StmtFunctor<Stmt(const Stmt&)> {
   Stmt VisitStmt_(const PrefetchNode* op) override;
   Stmt VisitStmt_(const SeqStmtNode* op) override;
   Stmt VisitStmt_(const EvaluateNode* op) override;
+  Stmt VisitStmt_(const BlockNode* op) override;
+  Stmt VisitStmt_(const BlockRealizeNode* op) override;
   /*!
    * \brief Alternative advance method for SeqStmtNode.
    *
diff --git a/python/tvm/tir/__init__.py b/python/tvm/tir/__init__.py
index 324c4daf19ba..ad91eab64b52 100644
--- a/python/tvm/tir/__init__.py
+++ b/python/tvm/tir/__init__.py
@@ -31,6 +31,7 @@
 from .stmt import BufferStore, BufferRealize, Store, ProducerStore, Allocate, AttrStmt
 from .stmt import ProducerRealize, SeqStmt
 from .stmt import IfThenElse, Evaluate, Prefetch, stmt_seq, stmt_list
+from .stmt import BufferRegion, MatchBufferRegion, Block, BlockRealize
 
 from .function import PrimFunc
 
diff --git a/python/tvm/tir/stmt.py b/python/tvm/tir/stmt.py
index 5882dca5578e..e4f1ac924a83 100644
--- a/python/tvm/tir/stmt.py
+++ b/python/tvm/tir/stmt.py
@@ -26,11 +26,15 @@
     assert isinstance(st, tvm.tir.stmt.Store)
     assert(st.buffer_var == a)
 """
+from typing import List, Optional, Mapping
 from enum import IntEnum
 import tvm._ffi
 
 from tvm.runtime import Object
+from tvm.ir import Span, PrimExpr, Range
 from . import _ffi_api
+from .buffer import Buffer
+from .expr import IterVar
 
 
 class Stmt(Object):
@@ -429,6 +433,164 @@ def __init__(self, buffer, bounds, span=None):
         self.__init_handle_by_constructor__(_ffi_api.Prefetch, buffer, bounds, span)
 
 
+@tvm._ffi.register_object("tir.BufferRegion")
+class BufferRegion(Object):
+    """BufferRegion node.
+
+    Parameters
+    ----------
+    buffer : Buffer
+        The buffer of the buffer region
+
+    region : List[Range]
+        The region array of the buffer region
+    """
+
+    buffer: Buffer
+    region: List[Range]
+
+    def __init__(self, buffer: Buffer, region: List[Range]):
+        self.__init_handle_by_constructor__(_ffi_api.BufferRegion, buffer, region)
+
+
+@tvm._ffi.register_object("tir.MatchBufferRegion")
+class MatchBufferRegion(Object):
+    """MatchBufferRegion node.
+
+    Parameters
+    ----------
+    buffer : Buffer
+        The target buffer
+
+    source : BufferRegion
+        The region of source buffer
+    """
+
+    buffer: Buffer
+    source: BufferRegion
+
+    def __init__(self, buffer: Buffer, source: BufferRegion):
+        self.__init_handle_by_constructor__(_ffi_api.MatchBufferRegion, buffer, source)
+
+
+@tvm._ffi.register_object("tir.Block")
+class Block(Stmt):
+    """Block node.
+
+    Parameters
+    ----------
+    iter_vars : List[IterVar]
+        The block Variable.
+
+    reads : List[BufferRegion]
+        The read buffer regions of the block.
+
+    writes: List[BufferRegion]
+        The write buffer regions of the block.
+
+    name_hint: str
+        the name_hint of the block.
+
+    body: Stmt
+        The body of the block.
+
+    init: Optional[Stmt]
+        The init block of the reduction block
+
+    alloc_buffers: Optional[list[Buffer]]
+        The buffer allocations
+
+    match_buffers: Optional[List[MatchBufferRegion]]
+        The subregion buffer match
+
+    annotations: Optional[Mapping[str, Object]]
+        Additional annotation hints.
+
+    span : Optional[Span]
+        The location of this block in the source code.
+    """
+
+    iter_vars: List[IterVar]
+    reads: List[BufferRegion]
+    writes: List[BufferRegion]
+    name_hint: str
+    body: Stmt
+    init: Optional[Stmt]
+    alloc_buffers: Optional[List[Buffer]]
+    match_buffers: Optional[List[MatchBufferRegion]]
+    annotations: Optional[Mapping[str, Object]]
+    span: Optional[Span]
+
+    def __init__(
+        self,
+        iter_vars: List[IterVar],
+        reads: List[BufferRegion],
+        writes: List[BufferRegion],
+        name_hint: str,
+        body: Stmt,
+        init: Optional[Stmt] = None,
+        alloc_buffers: Optional[List[Buffer]] = None,
+        match_buffers: Optional[List[MatchBufferRegion]] = None,
+        annotations: Optional[Mapping[str, Object]] = None,
+        span: Optional[Span] = None,
+    ):
+        if alloc_buffers is None:
+            alloc_buffers = []
+        if match_buffers is None:
+            match_buffers = []
+        if annotations is None:
+            annotations = {}
+        self.__init_handle_by_constructor__(
+            _ffi_api.Block,
+            iter_vars,
+            reads,
+            writes,
+            name_hint,
+            body,
+            init,
+            alloc_buffers,
+            match_buffers,
+            annotations,
+            span,
+        )
+
+
+@tvm._ffi.register_object("tir.BlockRealize")
+class BlockRealize(Stmt):
+    """BlockRealize node.
+
+    Parameters
+    ----------
+    iter_values : List[PrimExpr]
+        The binding values of the block var.
+
+    predicate : PrimExpr
+        The predicate of the block.
+
+    block : Block
+        The block to realize
+
+    span : Optional[Span]
+        The location of this block_realize in the source code.
+    """
+
+    iter_values: List[PrimExpr]
+    predicate: PrimExpr
+    block: Block
+    span: Optional[Span]
+
+    def __init__(
+        self,
+        iter_values: List[PrimExpr],
+        predicate: PrimExpr,
+        block: Block,
+        span: Optional[Span] = None,
+    ):
+        self.__init_handle_by_constructor__(
+            _ffi_api.BlockRealize, iter_values, predicate, block, span
+        )
+
+
 def stmt_seq(*args):
     """Make sequence of statements
 
diff --git a/src/tir/ir/stmt.cc b/src/tir/ir/stmt.cc
index 92dc38797544..e54be4347c8e 100644
--- a/src/tir/ir/stmt.cc
+++ b/src/tir/ir/stmt.cc
@@ -598,6 +598,225 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable)
       p->stream << "}\n";
     });
 
+// BufferRegion
+BufferRegion::BufferRegion(Buffer buffer, Array<Range> region) {
+  ObjectPtr<BufferRegionNode> node = make_object<BufferRegionNode>();
+  node->buffer = std::move(buffer);
+  node->region = std::move(region);
+  data_ = std::move(node);
+}
+
+BufferRegion BufferRegion::FullRegion(Buffer buffer) {
+  Array<Range> region;
+  for (PrimExpr extent : buffer->shape) {
+    region.push_back(Range::FromMinExtent(0, extent));
+  }
+  return BufferRegion(buffer, region);
+}
+
+TVM_REGISTER_GLOBAL("tir.BufferRegion").set_body_typed([](Buffer buffer, Array<Range> region) {
+  return BufferRegion(buffer, region);
+});
+
+TVM_REGISTER_NODE_TYPE(BufferRegionNode);
+
+TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable)
+    .set_dispatch<BufferRegionNode>([](const ObjectRef& node, ReprPrinter* p) {
+      auto* op = static_cast<const BufferRegionNode*>(node.get());
+      p->stream << op->buffer->name;
+      p->stream << "[";
+      for (size_t i = 0; i < op->region.size(); ++i) {
+        const auto& range = op->region[i];
+        p->Print(range->min);
+        if (!is_one(range->extent)) {
+          p->stream << ":";
+          p->Print(range->min + range->extent);
+        }
+        if (i != op->region.size() - 1) p->stream << ", ";
+      }
+      p->stream << "]";
+    });
+
+// MatchBufferRegion
+MatchBufferRegion::MatchBufferRegion(Buffer buffer, BufferRegion source) {
+  ObjectPtr<MatchBufferRegionNode> node = make_object<MatchBufferRegionNode>();
+  node->buffer = std::move(buffer);
+  node->source = std::move(source);
+  data_ = std::move(node);
+}
+
+TVM_REGISTER_GLOBAL("tir.MatchBufferRegion").set_body_typed([](Buffer buffer, BufferRegion source) {
+  return MatchBufferRegion(buffer, source);
+});
+
+TVM_REGISTER_NODE_TYPE(MatchBufferRegionNode);
+
+TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable)
+    .set_dispatch<MatchBufferRegionNode>([](const ObjectRef& node, ReprPrinter* p) {
+      auto* op = static_cast<const MatchBufferRegionNode*>(node.get());
+      p->PrintIndent();
+      p->stream << op->buffer->name << " = match_buffer_region(";
+      p->Print(op->source);
+      p->stream << ")\n";
+    });
+
+// Block
+Block::Block(Array<IterVar> iter_vars, Array<BufferRegion> reads, Array<BufferRegion> writes,
+             String name_hint, Stmt body, Optional<Stmt> init, Array<Buffer> alloc_buffers,
+             Array<MatchBufferRegion> match_buffers, Map<String, ObjectRef> annotations,
+             Span span) {
+  ObjectPtr<BlockNode> node = make_object<BlockNode>();
+  node->iter_vars = std::move(iter_vars);
+  node->reads = std::move(reads);
+  node->writes = std::move(writes);
+  node->name_hint = std::move(name_hint);
+  node->body = std::move(body);
+  node->init = std::move(init);
+  node->alloc_buffers = std::move(alloc_buffers);
+  node->match_buffers = std::move(match_buffers);
+  node->annotations = std::move(annotations);
+  node->span = std::move(span);
+  data_ = std::move(node);
+}
+
+TVM_REGISTER_GLOBAL("tir.Block")
+    .set_body_typed([](Array<IterVar> iter_vars, Array<BufferRegion> reads,
+                       Array<BufferRegion> writes, String name_hint, Stmt body, Optional<Stmt> init,
+                       Array<Buffer> alloc_buffers, Array<MatchBufferRegion> match_buffers,
+                       Map<String, ObjectRef> annotations, Span span) {
+      return Block(iter_vars, reads, writes, name_hint, body, init, alloc_buffers, match_buffers,
+                   annotations, span);
+    });
+
+TVM_REGISTER_NODE_TYPE(BlockNode);
+
+void PrintBlockTitle(const BlockNode* op, ReprPrinter* p) {
+  p->stream << "block " << op->name_hint << "(";
+  for (size_t i = 0; i < op->iter_vars.size(); i++) {
+    p->Print(op->iter_vars[i]);
+    if (i < op->iter_vars.size() - 1) p->stream << ", ";
+  }
+  p->stream << ")";
+}
+
+void PrintBlockSignature(const BlockNode* op, ReprPrinter* p) {
+  // print read/write regions
+  p->PrintIndent();
+  p->stream << "reads(";
+  p->Print(op->reads);
+  p->stream << ")\n";
+  p->PrintIndent();
+  p->stream << "writes(";
+  p->Print(op->writes);
+  p->stream << ")\n";
+  // Print alloc_buffers
+  for (const auto& alloc_buf : op->alloc_buffers) {
+    p->PrintIndent();
+    p->stream << alloc_buf->name << " = alloc_buffer(" << alloc_buf->dtype << "[";
+    for (size_t i = 0; i < alloc_buf->shape.size(); ++i) {
+      if (i > 0) p->stream << ", ";
+      p->Print(alloc_buf->shape[i]);
+    }
+    p->stream << "])\n";
+  }
+  // Print match_buffer_regions
+  for (const auto& match_buf : op->match_buffers) {
+    p->Print(match_buf);
+  }
+  if (!op->annotations.empty()) {
+    p->PrintIndent();
+    p->stream << "annotations(" << op->annotations << ")\n";
+  }
+}
+
+void PrintBlockBody(const BlockNode* op, ReprPrinter* p) {
+  // Print init
+  if (op->init.defined()) {
+    p->PrintIndent();
+    p->stream << "with init() {\n";
+    p->indent += 2;
+    p->Print(op->init.value());
+    p->indent -= 2;
+    p->PrintIndent();
+    p->stream << "}\n";
+  }
+  // Print body
+  p->Print(op->body);
+}
+
+TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable)
+    .set_dispatch<BlockNode>([](const ObjectRef& node, ReprPrinter* p) {
+      auto* op = static_cast<const BlockNode*>(node.get());
+      p->PrintIndent();
+      PrintBlockTitle(op, p);
+      p->stream << "{\n";
+      p->indent += 2;
+
+      // Print block elements (e.g. reads/writes, etc)
+      PrintBlockSignature(op, p);
+      // Print block init and body
+      PrintBlockBody(op, p);
+
+      p->indent -= 2;
+      p->PrintIndent();
+      p->stream << "}\n";
+    });
+
+// BlockRealize
+BlockRealize::BlockRealize(Array<PrimExpr> values, PrimExpr predicate, Block block, Span span) {
+  CHECK_EQ(block->iter_vars.size(), values.size())
+      << "ValueError: BlockRealize needs to have the same number of iter_vars and binding values";
+  CHECK(predicate.dtype().is_bool()) << "TypeError: Expect Block.predicate to be a bool expression";
+  ObjectPtr<BlockRealizeNode> node = make_object<BlockRealizeNode>();
+  node->iter_values = std::move(values);
+  node->predicate = std::move(predicate);
+  node->block = std::move(block);
+  node->span = std::move(span);
+  data_ = std::move(node);
+}
+
+TVM_REGISTER_GLOBAL("tir.BlockRealize")
+    .set_body_typed([](Array<PrimExpr> iter_values, PrimExpr predicate, Block block, Span span) {
+      return BlockRealize(iter_values, predicate, block, span);
+    });
+
+TVM_REGISTER_NODE_TYPE(BlockRealizeNode);
+
+TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable)
+    .set_dispatch<BlockRealizeNode>([](const ObjectRef& node, ReprPrinter* p) {
+      auto* op = static_cast<const BlockRealizeNode*>(node.get());
+      auto* block_op = op->block.get();
+      p->PrintIndent();
+      PrintBlockTitle(block_op, p);
+      p->stream << "{\n";
+      p->indent += 2;
+
+      // Print binding iter_values
+      for (size_t i = 0; i < block_op->iter_vars.size(); ++i) {
+        p->PrintIndent();
+        p->stream << "bind(";
+        p->Print(block_op->iter_vars[i]->var);
+        p->stream << ", ";
+        p->Print(op->iter_values[i]);
+        p->stream << ")\n";
+      }
+      // Print predicate
+      if (!is_one(op->predicate)) {
+        p->PrintIndent();
+        p->stream << "where(";
+        p->Print(op->predicate);
+        p->stream << ")\n";
+      }
+      // Print block elements (e.g. reads/writes, etc)
+      PrintBlockSignature(block_op, p);
+      // Print block init and body
+      PrintBlockBody(block_op, p);
+
+      p->indent -= 2;
+      p->PrintIndent();
+      p->stream << "}\n";
+    });
+
 PrimExpr TypeAnnotation(DataType dtype, Span span) {
   static auto op = Op::Get("tir.type_annotation");
   return tir::Call(dtype, op, {}, span);
diff --git a/src/tir/ir/stmt_functor.cc b/src/tir/ir/stmt_functor.cc
index e4cc1b7e4275..f05dc7116494 100644
--- a/src/tir/ir/stmt_functor.cc
+++ b/src/tir/ir/stmt_functor.cc
@@ -112,6 +112,35 @@ void StmtVisitor::VisitStmt_(const SeqStmtNode* op) {
 
 void StmtVisitor::VisitStmt_(const EvaluateNode* op) { this->VisitExpr(op->value); }
 
+void StmtVisitor::VisitStmt_(const BlockNode* op) {
+  auto fvisit_buffer_region = [this](const BufferRegion& s) {
+    for (const auto& range : s->region) {
+      this->VisitExpr(range->min);
+      this->VisitExpr(range->extent);
+    }
+  };
+  VisitArray(op->iter_vars, [this](const IterVar& iter_var) {
+    this->VisitExpr(iter_var->dom->min);
+    this->VisitExpr(iter_var->dom->extent);
+  });
+  VisitArray(op->reads, fvisit_buffer_region);
+  VisitArray(op->writes, fvisit_buffer_region);
+  VisitArray(op->match_buffers,
+             [fvisit_buffer_region](const MatchBufferRegion& match_buffer_region) {
+               fvisit_buffer_region(match_buffer_region->source);
+             });
+  if (op->init.defined()) {
+    this->VisitStmt(op->init.value());
+  }
+  this->VisitStmt(op->body);
+}
+
+void StmtVisitor::VisitStmt_(const BlockRealizeNode* op) {
+  VisitArray(op->iter_values, [this](const PrimExpr& e) { this->VisitExpr(e); });
+  this->VisitExpr(op->predicate);
+  this->VisitStmt(op->block);
+}
+
 class StmtMutator::Internal {
  public:
   /*!
@@ -150,6 +179,20 @@ class StmtMutator::Internal {
     }
   }
 
+  static Array<IterVar> Mutate(StmtMutator* self, const Array<IterVar>& arr) {
+    auto fmutate = [self](const IterVar& iter_var) {
+      PrimExpr min = self->VisitExpr(iter_var->dom->min);
+      PrimExpr extent = self->VisitExpr(iter_var->dom->extent);
+      if (min.same_as(iter_var->dom->min) && extent.same_as(iter_var->dom->extent)) {
+        return iter_var;
+      } else {
+        return IterVar(Range(min, extent), iter_var->var, iter_var->iter_type,
+                       iter_var->thread_tag);
+      }
+    };
+    return MutateArray(self, arr, fmutate);
+  }
+
   static Array<PrimExpr> Mutate(StmtMutator* self, const Array<PrimExpr>& arr) {
     auto fmutate = [self](const PrimExpr& e) { return self->VisitExpr(e); };
     return MutateArray(self, arr, fmutate);
@@ -172,6 +215,31 @@ class StmtMutator::Internal {
     };
     return MutateArray(self, arr, fmutate);
   }
+
+  static Array<BufferRegion> Mutate(StmtMutator* self, const Array<BufferRegion>& arr) {
+    auto fmutate = [self](const BufferRegion& buffer_region) {
+      Array<Range> region = Mutate(self, buffer_region->region);
+      if (region.same_as(buffer_region->region)) {
+        return buffer_region;
+      } else {
+        return BufferRegion(buffer_region->buffer, region);
+      }
+    };
+    return MutateArray(self, arr, fmutate);
+  }
+
+  static Array<MatchBufferRegion> Mutate(StmtMutator* self, const Array<MatchBufferRegion>& arr) {
+    auto fmutate = [self](const MatchBufferRegion& match_buffer_region) {
+      Array<Range> region = Mutate(self, match_buffer_region->source->region);
+      if (region.same_as(match_buffer_region->source->region)) {
+        return match_buffer_region;
+      } else {
+        return MatchBufferRegion(match_buffer_region->buffer,
+                                 BufferRegion(match_buffer_region->source->buffer, region));
+      }
+    };
+    return MutateArray(self, arr, fmutate);
+  }
 };
 
 Stmt StmtMutator::VisitStmt_(const AttrStmtNode* op) {
@@ -415,6 +483,47 @@ Stmt StmtMutator::VisitStmt_(const EvaluateNode* op) {
   }
 }
 
+Stmt StmtMutator::VisitStmt_(const BlockNode* op) {
+  Array<IterVar> iter_vars = Internal::Mutate(this, op->iter_vars);
+  Array<BufferRegion> reads = Internal::Mutate(this, op->reads);
+  Array<BufferRegion> writes = Internal::Mutate(this, op->writes);
+  Array<MatchBufferRegion> match_buffers = Internal::Mutate(this, op->match_buffers);
+  Optional<Stmt> init = NullOpt;
+  if (op->init.defined()) {
+    init = VisitStmt(op->init.value());
+  }
+  Stmt body = VisitStmt(op->body);
+  if (iter_vars.same_as(op->iter_vars) && reads.same_as(op->reads) && writes.same_as(op->writes) &&
+      body.same_as(op->body) && init.same_as(op->init) &&
+      match_buffers.same_as(op->match_buffers)) {
+    return GetRef<Block>(op);
+  } else {
+    auto n = CopyOnWrite(op);
+    n->iter_vars = std::move(iter_vars);
+    n->reads = std::move(reads);
+    n->writes = std::move(writes);
+    n->body = std::move(body);
+    n->init = std::move(init);
+    n->match_buffers = std::move(match_buffers);
+    return Stmt(n);
+  }
+}
+
+Stmt StmtMutator::VisitStmt_(const BlockRealizeNode* op) {
+  Array<PrimExpr> v = Internal::Mutate(this, op->iter_values);
+  PrimExpr pred = this->VisitExpr(op->predicate);
+  Stmt block = this->VisitStmt(op->block);
+  if (v.same_as(op->iter_values) && pred.same_as(op->predicate) && block.same_as(op->block)) {
+    return GetRef<Stmt>(op);
+  } else {
+    auto n = CopyOnWrite(op);
+    n->iter_values = std::move(v);
+    n->predicate = std::move(pred);
+    n->block = Downcast<Block>(block);
+    return Stmt(n);
+  }
+}
+
 // Implementations of IRTransform, PostOrderVisit and Substitute
 class IRApplyVisit : public StmtExprVisitor {
  public:
diff --git a/tests/cpp/ir_functor_test.cc b/tests/cpp/ir_functor_test.cc
index d242b20f1ba7..237dc46b99ca 100644
--- a/tests/cpp/ir_functor_test.cc
+++ b/tests/cpp/ir_functor_test.cc
@@ -120,6 +120,25 @@ TEST(IRF, StmtVisitor) {
   };
   v(fmaketest());
   ICHECK_EQ(v.count, 3);
+
+  {
+    // tests for block and block_realize
+    Stmt body = fmaketest();
+    DataType dtype = DataType::Float(32);
+    Var buf_var("b", PointerType(PrimType(dtype)));
+    Buffer buffer = decl_buffer({16});
+    BufferRegion buffer_region(buffer, {Range::FromMinExtent(x + 1, 1)});
+    MatchBufferRegion match_buffer_region(decl_buffer({1}), buffer_region);
+
+    // construct block and block_realize
+    Block block =
+        Block({}, {buffer_region}, {buffer_region}, "block", body, body, {}, {match_buffer_region});
+    Stmt block_realize = BlockRealize({}, const_true(), block);
+
+    v.count = 0;
+    v(block_realize);
+    ICHECK_EQ(v.count, 9);
+  }
 }
 
 TEST(IRF, StmtMutator) {
@@ -229,6 +248,28 @@ TEST(IRF, StmtMutator) {
     // the seq get flattened
     ICHECK(body.as<SeqStmtNode>()->seq[0].as<AllocateNode>()->extents.get() != extentptr);
   }
+
+  {
+    // tests for block and block_realize
+    Stmt body = fmakealloc();
+    DataType dtype = DataType::Float(32);
+    Var buf_var("b", PointerType(PrimType(dtype)));
+    Buffer buffer = decl_buffer({16});
+    BufferRegion buffer_region(buffer, {Range::FromMinExtent(x + 1, 1)});
+    MatchBufferRegion match_buffer_region(decl_buffer({1}), buffer_region);
+    // construct block and block_realize
+    Block block =
+        Block({}, {buffer_region}, {buffer_region}, "block", body, body, {}, {match_buffer_region});
+    Stmt block_realize = BlockRealize({}, const_true(), block);
+    body = v(std::move(block_realize));
+    // the body should be changed
+    Block new_block = body.as<BlockRealizeNode>()->block;
+    ICHECK(new_block->body.as<AllocateNode>()->extents[1].same_as(x));
+    ICHECK(new_block->init.as<AllocateNode>()->extents[1].same_as(x));
+    ICHECK(new_block->reads[0]->region[0]->min.same_as(x));
+    ICHECK(new_block->writes[0]->region[0]->min.same_as(x));
+    ICHECK(new_block->match_buffers[0]->source->region[0]->min.same_as(x));
+  }
 }
 
 int main(int argc, char** argv) {
diff --git a/tests/python/unittest/test_tir_nodes.py b/tests/python/unittest/test_tir_nodes.py
index bff60f70f53b..6e338d64a61c 100644
--- a/tests/python/unittest/test_tir_nodes.py
+++ b/tests/python/unittest/test_tir_nodes.py
@@ -364,6 +364,87 @@ def test_intimm_cond():
     assert x == 1
 
 
+def test_block_blockrealize():
+    x = tvm.tir.Var("x", "int32")
+    y = tvm.tir.Var("y", "int32")
+    vx = tvm.tir.IterVar((16, 16), "vx", 0)
+    vx_var = vx.var
+    vy = tvm.tir.IterVar((16, 16), "vy", 2)
+    vy_var = vy.var
+    A = tvm.tir.decl_buffer((16), "float32")
+    B = tvm.tir.decl_buffer((16, 16), "float32")
+    alloc_buffer = tvm.tir.decl_buffer((16, 16), "float32")
+    match_buffer = tvm.tir.decl_buffer((16, 16), "float32")
+    init_body = tvm.tir.BufferStore(A, 0.0, [vx_var])
+    body = tvm.tir.BufferStore(
+        A,
+        tvm.tir.BufferLoad(A, [vx_var]) + tvm.tir.BufferLoad(B, [vx_var, vy_var]),
+        [vx_var],
+    )
+    reads = [
+        tvm.tir.BufferRegion(
+            B, [tvm.ir.Range.from_min_extent(vx_var, 1), tvm.ir.Range.from_min_extent(vy_var, 1)]
+        )
+    ]
+    writes = [tvm.tir.BufferRegion(A, [tvm.ir.Range.from_min_extent(vx_var, 1)])]
+    match_buffer_region = tvm.tir.MatchBufferRegion(
+        match_buffer, tvm.tir.BufferRegion(B, [tvm.ir.Range(0, 16), tvm.ir.Range(0, 16)])
+    )
+
+    block = tvm.tir.Block(
+        [vx, vy],
+        reads,
+        writes,
+        "block",
+        body,
+        init=init_body,
+        alloc_buffers=[alloc_buffer],
+        match_buffers=[match_buffer_region],
+        annotations={"attr_key": "attr_value"},
+    )
+
+    # Checking Block
+    assert isinstance(block, tvm.tir.Block)
+    # Checking iter_vars
+    assert block.iter_vars[0] == vx
+    assert block.iter_vars[1] == vy
+    # Checking reads/writes region
+    assert isinstance(block.reads[0], tvm.tir.BufferRegion)
+    assert block.reads[0].buffer == B
+    assert block.reads[0].region[0].min == vx_var
+    assert block.reads[0].region[1].min == vy_var
+    assert isinstance(block.writes[0], tvm.tir.BufferRegion)
+    assert block.writes[0].buffer == A
+    assert block.writes[0].region[0].min == vx_var
+    assert block.writes[0].region[0].extent == 1
+    # Checking name_hint
+    assert block.name_hint == "block"
+    # Checking body
+    assert block.body == body
+    # Checking init
+    assert block.init == init_body
+    # Checking alloc_buffers
+    assert block.alloc_buffers[0] == alloc_buffer
+    # Checking match_buffers
+    assert block.match_buffers[0].buffer == match_buffer
+    assert isinstance(block.match_buffers[0].source, tvm.tir.BufferRegion)
+    assert block.match_buffers[0].source.buffer == B
+    assert block.match_buffers[0].source.region[0].min == 0
+    assert block.match_buffers[0].source.region[0].extent == 16
+
+    # Checking BlockRealize
+    block_realize = tvm.tir.BlockRealize([x, y], tvm.tir.const(True, "bool"), block)
+    assert isinstance(block_realize, tvm.tir.BlockRealize)
+    assert block_realize.iter_values[0] == x
+    assert block_realize.iter_values[1] == y
+    assert block_realize.predicate == tvm.tir.const(True, "bool")
+    assert block_realize.block == block
+
+    # make sure we can print
+    str(block)
+    str(block_realize)
+
+
 if __name__ == "__main__":
     test_intimm_cond()
     test_buffer_load_store()
@@ -389,3 +470,4 @@ def test_intimm_cond():
     test_isnan()
     test_equality()
     test_equality_string_imm()
+    test_block_blockrealize()

From a1d43c15ac6382831370c6de141bf80888761e70 Mon Sep 17 00:00:00 2001
From: Thierry Moreau <tmoreau@octoml.ai>
Date: Mon, 1 Mar 2021 17:31:57 -0800
Subject: [PATCH 258/357] [Autoscheduler][VM] Autoscheduler layout rewrite pass
 to VM (#7516)

* fix type inference for conv2d

* fix

* adding the autoscheduler layout rewrite pass to VM compiler passes

* revert edits applied in other PR

* minor fix

* fix

* formatting fix

* lint
---
 src/relay/backend/vm/compiler.cc | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/src/relay/backend/vm/compiler.cc b/src/relay/backend/vm/compiler.cc
index 7697b59437f0..0718191a2ff6 100644
--- a/src/relay/backend/vm/compiler.cc
+++ b/src/relay/backend/vm/compiler.cc
@@ -1066,6 +1066,23 @@ IRModule VMCompiler::OptimizeModule(IRModule mod, const TargetsMap& targets,
   }
 
   pass_seqs.push_back(transform::FuseOps());
+  // Do layout rewrite for auto-scheduler.
+  transform::PassContext pass_ctx = PassContext::Current();
+  if (backend::IsAutoSchedulerEnabled() && targets.size() == 1) {
+    const auto& target = (*targets.begin()).second;
+    Pass major_pass = transform::AutoSchedulerLayoutRewrite();
+    bool enable_layout_rewrite_targets =
+        target->kind->device_type == kDLCPU || target->GetAttr<String>("device", "") == "mali";
+    if (enable_layout_rewrite_targets && pass_ctx.PassEnabled(major_pass->Info())) {
+      With<Target> tctx(target);
+      pass_seqs.push_back(major_pass);
+      // Defuse ops to fold constants, then fuse them again
+      pass_seqs.push_back(transform::DefuseOps());
+      pass_seqs.push_back(transform::FoldConstant());
+      pass_seqs.push_back(transform::FuseOps());
+    }
+  }
+
   pass_seqs.push_back(transform::ToANormalForm());
   pass_seqs.push_back(transform::InferType());
   pass_seqs.push_back(transform::LambdaLift());
@@ -1082,7 +1099,6 @@ IRModule VMCompiler::OptimizeModule(IRModule mod, const TargetsMap& targets,
   pass_seqs.push_back(transform::InferType());
 
   transform::Sequential seq(pass_seqs);
-  transform::PassContext pass_ctx = PassContext::Current();
   tvm::With<relay::transform::PassContext> ctx(pass_ctx);
   if (targets.size() == 1) {
     const auto& it = targets.begin();

From 633ee118efecd04efb4be9bf6053deae6e8fac3b Mon Sep 17 00:00:00 2001
From: Matthew Brookhart <mbrookhart@octoml.ai>
Date: Mon, 1 Mar 2021 23:56:50 -0700
Subject: [PATCH 259/357] fuse constant padding into conv kernels (#7515)

* fuse constant padding into conv kernels

* change the kernel to support other layouts

* add channel-last test

* add a comment about bailing early
---
 src/relay/transforms/simplify_expr.cc         | 116 ++++++++++++++++++
 tests/python/relay/test_pass_simplify_expr.py |  78 ++++++++++++
 2 files changed, 194 insertions(+)

diff --git a/src/relay/transforms/simplify_expr.cc b/src/relay/transforms/simplify_expr.cc
index 74e48dc4bc54..bfe04e10a9d0 100644
--- a/src/relay/transforms/simplify_expr.cc
+++ b/src/relay/transforms/simplify_expr.cc
@@ -82,6 +82,121 @@ class SimplifyReshape : public SimplifyPattern {
   DFPattern x_;
 };
 
+/*!
+ * \brief SimplifyConvPad matches a pad followed by a conv/convtranspose/pool/etc
+ * with a pad attribute and merges the padding into the kernel.
+ */
+class SimplifyConvPad : public SimplifyPattern {
+ public:
+  SimplifyConvPad() {
+    x_ = IsWildcard();
+    w_ = IsWildcard();
+    pad_ = IsOp("nn.pad")({x_});
+    conv1d_ = IsOp("nn.conv1d");
+    conv2d_ = IsOp("nn.conv2d");
+    conv3d_ = IsOp("nn.conv3d");
+    conv_ = (conv1d_ || conv2d_ || conv3d_)({pad_, w_});
+    pattern_ = conv_;
+  }
+  template <typename T>
+  Attrs MakeConvAttrs(const T* old_attrs, const Array<PrimExpr> padding) const {
+    ICHECK(old_attrs);
+    ICHECK(padding.size() == old_attrs->padding.size())
+        << "Number of dimensions to pad and convolution padding attributes should have the same "
+           "extent";
+
+    auto new_attrs = make_object<T>();
+    Array<PrimExpr> combined_padding;
+    for (size_t i = 0; i < padding.size(); ++i) {
+      combined_padding.push_back(padding[i] + old_attrs->padding[i]);
+    }
+    new_attrs->strides = old_attrs->strides;
+    new_attrs->padding = combined_padding;
+    new_attrs->dilation = old_attrs->dilation;
+    new_attrs->groups = old_attrs->groups;
+    new_attrs->channels = old_attrs->channels;
+    new_attrs->kernel_size = old_attrs->kernel_size;
+    new_attrs->data_layout = old_attrs->data_layout;
+    new_attrs->kernel_layout = old_attrs->kernel_layout;
+    new_attrs->out_layout = old_attrs->out_layout;
+    new_attrs->out_dtype = old_attrs->out_dtype;
+    return Attrs(new_attrs);
+  }
+  template <typename T>
+  Attrs GetAttrs(const PadAttrs* param, const T* attrs) const {
+    ICHECK(param);
+    ICHECK(attrs);
+    ICHECK(attrs->data_layout.size() == param->pad_width.size())
+        << "Data Layout and padding attributes should have the same extent";
+
+    std::string data_layout = attrs->data_layout;
+    std::set<char> image_dims({'H', 'W', 'D'});
+    Array<PrimExpr> padding;
+    // If we're padding a non-spatial dimension, don't simplify
+    // Convolution can only pad on spatial axes
+    for (size_t i = 0; i < param->pad_width.size(); ++i) {
+      if (!image_dims.count(data_layout[i])) {
+        for (size_t j = 0; j < param->pad_width[i].size(); ++j) {
+          if (param->pad_width[i][j] != 0) {
+            return Attrs();
+          }
+        }
+      }
+    }
+    for (size_t j = 0; j < param->pad_width[0].size(); ++j) {
+      for (size_t i = 0; i < param->pad_width.size(); ++i) {
+        if (image_dims.count(data_layout[i])) {
+          padding.push_back(param->pad_width[i][j]);
+        }
+      }
+    }
+
+    return MakeConvAttrs(attrs, padding);
+  }
+  Expr callback(const Expr& pre, const Expr& post,
+                const Map<DFPattern, Array<Expr>>& node_map) const override {
+    const CallNode* call_node = post.as<CallNode>();
+    ICHECK(call_node);
+    auto pad = node_map[pad_][0];
+    const CallNode* pad_node = pad.as<CallNode>();
+    ICHECK(pad_node);
+    const PadAttrs* param = pad_node->attrs.as<PadAttrs>();
+    ICHECK(param);
+    if (param->pad_mode == "constant" && param->pad_value == 0.0) {
+      Attrs attrs;
+      if (node_map.count(conv1d_)) {
+        attrs = GetAttrs(param, call_node->attrs.as<Conv1DAttrs>());
+      } else if (node_map.count(conv2d_)) {
+        attrs = GetAttrs(param, call_node->attrs.as<Conv2DAttrs>());
+      } else if (node_map.count(conv3d_)) {
+        attrs = GetAttrs(param, call_node->attrs.as<Conv3DAttrs>());
+      } else {
+        return post;
+      }
+      if (!attrs.defined()) {
+        return post;
+      }
+      auto x = node_map[x_][0];
+      auto w = node_map[w_][0];
+      return Call(call_node->op, {x, w}, attrs, call_node->type_args, call_node->span);
+    }
+    return post;
+  }
+
+ private:
+  /*! \brief Pattern input */
+  DFPattern x_;
+  /*! \brief Pattern input weight */
+  DFPattern w_;
+  /*! \brief Pattern pad */
+  DFPattern pad_;
+  /*! \brief Pattern conv */
+  DFPattern conv_;
+  DFPattern conv1d_;
+  DFPattern conv2d_;
+  DFPattern conv3d_;
+};
+
 /*!
  * \brief FullArgwhere finds full followed by argwhere and turns it into an Arange op
  */
@@ -163,6 +278,7 @@ class ExprSimplifier {
   explicit ExprSimplifier(IRModule mod) : mod_(mod) {
     CreateCallback(SimplifyReshape());
     CreateCallback(FullElementwise());
+    CreateCallback(SimplifyConvPad());
   }
   template <typename T>
   void CreateCallback(const T& pattern) {
diff --git a/tests/python/relay/test_pass_simplify_expr.py b/tests/python/relay/test_pass_simplify_expr.py
index 423f0a4f213d..e3e497e930f9 100644
--- a/tests/python/relay/test_pass_simplify_expr.py
+++ b/tests/python/relay/test_pass_simplify_expr.py
@@ -19,6 +19,8 @@
 from tvm.relay import transform
 from tvm.relay.testing import run_opt_pass
 
+import numpy as np
+
 
 def test_simplify_reshape():
     def before():
@@ -122,6 +124,82 @@ def after_right(x, elem_op, value):
                 validate(shape, value, dtype)
 
 
+def test_simplify_conv_pad():
+    convs = [relay.nn.conv1d, relay.nn.conv2d, relay.nn.conv3d]
+
+    def validate(ndim, pad_width, pad_value, pad_mode, orig_padding, layout):
+        if layout[1] == "C":
+            shape = [1, 3] + [10] * ndim
+            wshape = [8, 3] + [3] * ndim
+        elif layout[-1] == "C":
+            shape = [1] + [10] * ndim + [3]
+            wshape = [8] + [3] * ndim + [3]
+        else:
+            raise ValueError("This test only supports NC* and N*C")
+
+        x = relay.var("x", shape=shape, dtype="float32")
+        w = relay.var("w", shape=wshape, dtype="float32")
+        pad = relay.nn.pad(x, pad_width, pad_value, pad_mode)
+        if layout[1] == "C":
+            conv = convs[ndim - 1](pad, w, padding=orig_padding)
+        else:
+            conv = convs[ndim - 1](
+                pad, w, padding=orig_padding, data_layout=layout, kernel_layout="DHWIO"[3 - ndim :]
+            )
+
+        if pad_mode == "constant" and pad_value == 0:
+            new_padding = []
+            for j in range(2):
+                for i in range(len(pad_width)):
+                    if layout[i] in ["D", "H", "W"]:
+                        new_padding.append(pad_width[i][j])
+            for i in range(len(new_padding)):
+                new_padding[i] += orig_padding[i]
+            if layout[1] == "C":
+                after = convs[ndim - 1](x, w, padding=new_padding)
+            else:
+                after = convs[ndim - 1](
+                    x, w, padding=new_padding, data_layout=layout, kernel_layout="DHWIO"[3 - ndim :]
+                )
+        else:
+            after = conv
+
+        zz = run_opt_pass(conv, transform.SimplifyExpr())
+        expected = run_opt_pass(after, transform.InferType())
+        assert tvm.ir.structural_equal(zz, expected)
+
+        mod1 = tvm.IRModule.from_expr(conv)
+        mod2 = tvm.IRModule.from_expr(zz)
+
+        with tvm.transform.PassContext(disabled_pass="SimplifyExpr"):
+            ex1 = relay.create_executor("vm", mod=mod1, ctx=tvm.cpu(), target="llvm")
+        ex2 = relay.create_executor("vm", mod=mod2, ctx=tvm.cpu(), target="llvm")
+        x_np = np.random.rand(*shape).astype("float32")
+        w_np = np.random.rand(*wshape).astype("float32")
+        result1 = ex1.evaluate()(x_np, w_np)
+        result2 = ex2.evaluate()(x_np, w_np)
+
+        tvm.testing.assert_allclose(result1.asnumpy(), result2.asnumpy())
+
+    for orig_pad in [[0, 0], [2, 0], [0, 2]]:
+        for i_pad in [[0, 0], [1, 1], [1, 0]]:
+            for ndim in [1, 2, 3]:
+                for channels_last in [0, 1]:
+                    if channels_last:
+                        layout = "NDHWC"
+                        layout = layout[0:1] + layout[4 - ndim : 4] + layout[-1:]
+                        padding = [[0, 0]] + [i_pad] * ndim + [[0, 0]]
+                    else:
+                        layout = "NCDHW"
+                        layout = layout[0:2] + layout[5 - ndim :]
+                        padding = [[0, 0]] * 2 + [i_pad] * ndim
+
+                    validate(ndim, padding, 0, "constant", orig_pad * ndim, layout)
+    ndim = 2
+    validate(ndim, [[0, 0]] * 2 + [i_pad] * ndim, 1, "constant", orig_pad * ndim, "NCHW")
+    validate(ndim, [[0, 0]] * 2 + [i_pad] * ndim, 0, "edge", orig_pad * ndim, "NCHW")
+
+
 if __name__ == "__main__":
     test_simplify_reshape()
     test_simplify_full_elementwise()

From 5d354e4dfecce0cf3852edec7abd5d821fc10998 Mon Sep 17 00:00:00 2001
From: kongroo <imjcqt@gmail.com>
Date: Wed, 3 Mar 2021 02:45:50 +0800
Subject: [PATCH 260/357] [Codegen][CUDA] Fix: cuda codegen vectorize cast
 (#7561)

* fix: cuda codegen vectorize cast

* style: fix python coding style

* fix: missing break

* refactor: directly split by factor

Co-authored-by: jiangchengquan <jiangchengquan@bytedance.com>
---
 src/target/source/codegen_cuda.cc             | 133 +++++++++++++++---
 .../unittest/test_target_codegen_cuda.py      |  30 +++-
 2 files changed, 136 insertions(+), 27 deletions(-)

diff --git a/src/target/source/codegen_cuda.cc b/src/target/source/codegen_cuda.cc
index 35b94f55e4e4..2e9babacc441 100644
--- a/src/target/source/codegen_cuda.cc
+++ b/src/target/source/codegen_cuda.cc
@@ -89,8 +89,8 @@ std::string CodeGenCUDA::Finish() {
   decl_stream << "  #define uint unsigned int\n";
   decl_stream << "  #define uchar unsigned char\n";
   decl_stream << "  #define ushort unsigned short\n";
-  decl_stream << "  #define int64_t long\n";
-  decl_stream << "  #define uint64_t ulong\n";
+  decl_stream << "  #define int64_t long long\n";
+  decl_stream << "  #define uint64_t unsigned long long\n";
   decl_stream << "#endif\n";
 
   return CodeGenC::Finish();
@@ -141,7 +141,21 @@ void CodeGenCUDA::PrintType(DataType t, std::ostream& os) {  // NOLINT(*)
         }
         break;
       case 32:
-        os << "float";
+        if (lanes <= 4) {
+          os << "float";
+        } else if (lanes <= 8) {
+          // Emit CUDA code to access fp32 vector elements for 4 < lanes <= 8.
+          //
+          // float8 is stored as ulonglong4
+          //
+          // f8.v1 is emitted as *(float2*)(&(ul4.x)).x
+          // f8.v2 is emitted as *(float2*)(&(ul4.x)).y
+          //
+          ICHECK_EQ(lanes % 2, 0) << "only support even lane for float type with lanes > 4";
+          os << "ulonglong" << lanes / 2;
+        } else {
+          fail = true;
+        }
         break;
       case 64:
         os << "double";
@@ -151,6 +165,7 @@ void CodeGenCUDA::PrintType(DataType t, std::ostream& os) {  // NOLINT(*)
         break;
     }
     if (!fail && (t.is_scalar() || t.bits() == 16)) return;
+    if (!fail && (lanes > 4 && lanes <= 8 && t.bits() == 32)) return;
     if (!fail && (lanes >= 2 && lanes <= 4)) {
       os << lanes;
       return;
@@ -238,12 +253,54 @@ void CodeGenCUDA::PrintType(DataType t, std::ostream& os) {  // NOLINT(*)
           break;
         }
       }
-      case 16:
-        os << "short";
+      case 16: {
+        if (t.is_scalar()) {
+          os << "short";
+        } else if (t.lanes() <= 4) {
+          os << "short" << lanes;
+        } else if (t.lanes() <= 8) {
+          // Emit CUDA code to access int16 vector elements.
+          //
+          // short4 is stored as int2
+          //
+          // s4.x is emitted as *(short2*)(&(i2.x)).x
+          // s4.y is emitted as *(short2*)(&(i2.x)).y
+          // s4.z is emitted as *(short2*)(&(i2.y)).x
+          // s4.w is emitted as *(short2*)(&(i2.y)).y
+          //
+          ICHECK_EQ(t.lanes() % 2, 0) << "only support even lane for shorT type with lanes > 4";
+          os << "int" << t.lanes() / 2;
+        } else {
+          fail = true;
+        }
+        if (!fail) {
+          return;
+        }
         break;
-      case 32:
-        os << "int";
+      }
+      case 32: {
+        if (t.is_scalar()) {
+          os << "int";
+        } else if (t.lanes() <= 4) {
+          os << "int" << t.lanes();
+        } else if (t.lanes() <= 8) {
+          // Emit CUDA code to access int32 vector elements for 4 < lanes <= 8.
+          //
+          // int8 is stored as longlong4
+          //
+          // i8.v1 is emitted as *(int2*)(&(l4.x)).x
+          // i8.v2 is emitted as *(int2*)(&(l4.x)).y
+          //
+          ICHECK_EQ(lanes % 2, 0) << "only support even lane for int32 type with lanes > 4";
+          os << "longlong" << lanes / 2;
+        } else {
+          fail = true;
+        }
+        if (!fail) {
+          return;
+        }
         break;
+      }
       case 64: {
         if (t.is_scalar()) {
           os << "int64_t";
@@ -314,21 +371,36 @@ void CodeGenCUDA::PrintVecElemLoad(const std::string& vec, DataType t, int i,
   }
 
   static const char access[] = {'x', 'y', 'z', 'w'};
-  ICHECK(i >= 0 && i < (t.is_float16() ? 8 : 4));
-  if ((t.is_int()) && t.bits() == 8) {
-    if (t.lanes() == 2 || t.lanes() == 3) {
-      os << vec << "." << access[i % t.lanes()];
-    } else {
-      os << "((char)(" << vec << " >> " << i * 8 << "))";
-    }
-  } else if ((t.is_uint()) && t.bits() == 8) {
+  ICHECK(i >= 0 && i < (t.bits() == 8 ? 16 : (t.bits() == 16 || t.bits() == 32) ? 8 : 4));
+  if (t.bits() == 8 && (t.is_int() || t.is_uint())) {
+    std::string type_name = t.is_int() ? "char" : "unsigned char";
     if (t.lanes() == 2 || t.lanes() == 3) {
       os << vec << "." << access[i % t.lanes()];
     } else {
-      os << "((unsigned char)(" << vec << " >> " << i * 8 << "))";
+      std::string ac = t.lanes() == 4 ? vec : (vec + "." + access[i / 4]);
+      os << "((" << type_name << ")(" << ac << " >> " << i % 4 * 8 << "))";
     }
   } else if (t.is_float16()) {
     os << "((half2*)(&(" << vec << "." << access[i / 2] << ")))->" << access[i % 2];
+  } else if (t.lanes() > 4 && t.lanes() <= 8) {
+    std::string type_name;
+    if (t.bits() == 16) {
+      if (t.is_int()) {
+        type_name = "short";
+      } else if (t.is_uint()) {
+        type_name = "ushort";
+      }
+    } else if (t.bits() == 32) {
+      if (t.is_int()) {
+        type_name = "int";
+      } else if (t.is_uint()) {
+        type_name = "uint";
+      } else if (t.is_float()) {
+        type_name = "float";
+      }
+    }
+    ICHECK(!type_name.empty());
+    os << "((" << type_name << "2*)(&(" << vec << "." << access[i / 2] << ")))->" << access[i % 2];
   } else {
     os << vec << "." << access[i];
   }
@@ -338,22 +410,43 @@ void CodeGenCUDA::PrintVecElemStore(const std::string& vec, DataType t, int i,
                                     const std::string& value) {
   this->PrintIndent();
   static const char access[] = {'x', 'y', 'z', 'w'};
-  ICHECK(i >= 0 && i < (t.is_float16() ? 8 : 4));
+  ICHECK(i >= 0 && i < (t.bits() == 8 ? 16 : (t.bits() == 16 || t.bits() == 32) ? 8 : 4));
   if (t.bits() == 8 && (t.is_int() || t.is_uint())) {
     if (t.lanes() == 2 || t.lanes() == 3) {
       stream << vec << '.' << access[i % t.lanes()] << "="
              << "(" << value << ");\n";
     } else {
-      stream << vec << "=";
+      std::string ac = t.lanes() == 4 ? vec : (vec + "." + access[i / 4]);
+      stream << ac << "=";
       // Do not read the first undef lane.
       if (i != 0) {
-        stream << vec << " & ~(0x000000ff << " << i * 8 << ") |";
+        stream << ac << " & ~(0x000000ff << " << i % 4 * 8 << ") |";
       }
-      stream << "(" << value << " << " << i * 8 << ");\n";
+      stream << "(" << value << " << " << i % 4 * 8 << ");\n";
     }
   } else if (t.is_float16()) {
     stream << "((half2*)(&(" << vec << "." << access[i / 2] << ")))->" << access[i % 2] << " = "
            << value << ";\n";
+  } else if (t.lanes() > 4 && t.lanes() <= 8) {
+    std::string type_name;
+    if (t.bits() == 16) {
+      if (t.is_int()) {
+        type_name = "short";
+      } else if (t.is_uint()) {
+        type_name = "ushort";
+      }
+    } else if (t.bits() == 32) {
+      if (t.is_int()) {
+        type_name = "int";
+      } else if (t.is_uint()) {
+        type_name = "uint";
+      } else if (t.is_float()) {
+        type_name = "float";
+      }
+    }
+    ICHECK(!type_name.empty());
+    stream << "((" << type_name << "2*)(&(" << vec << "." << access[i / 2] << ")))->"
+           << access[i % 2] << " = " << value << ";\n";
   } else {
     stream << vec << "." << access[i] << " = " << value << ";\n";
   }
diff --git a/tests/python/unittest/test_target_codegen_cuda.py b/tests/python/unittest/test_target_codegen_cuda.py
index a22fe10c1321..a228a640f108 100644
--- a/tests/python/unittest/test_target_codegen_cuda.py
+++ b/tests/python/unittest/test_target_codegen_cuda.py
@@ -498,7 +498,7 @@ def test_cuda_floormod_with_vectorization():
 @tvm.testing.requires_gpu
 @tvm.testing.requires_cuda
 def test_vectorized_casts():
-    def check(t0, t1):
+    def check(t0, t1, factor):
         if (t0 == "float16" or t1 == "float16") and not have_fp16(tvm.gpu(0).compute_version):
             print("Skip because gpu does not have fp16 support")
             return
@@ -511,9 +511,8 @@ def check(t0, t1):
 
         # schedule
         s = tvm.te.create_schedule(C.op)
-        ob, ib = s[C].split(s[C].op.axis[0], nparts=32)
-        _, iib = s[C].split(ib, factor=4)
-        s[C].vectorize(iib)
+        ob, ib = s[C].split(s[C].op.axis[0], factor=factor)
+        s[C].vectorize(ib)
         s[C].bind(ob, tx)
         func = tvm.build(s, [A, B, C], "cuda")
 
@@ -538,9 +537,26 @@ def skip(t0, t1):
             return True
         return False
 
-    types = ["float16", "float32", "int8", "uint8", "int16", "uint16", "int32", "uint32"]
-    for t0, t1 in [(x, y) for x in types for y in types if not skip(x, y)]:
-        check(t0, t1)
+    types_4 = [
+        "float16",
+        "float32",
+        "int8",
+        "uint8",
+        "int16",
+        "uint16",
+        "int32",
+        "uint32",
+        "float64",
+        "int64",
+        "uint64",
+    ]
+    types_8 = ["float16", "float32", "int8", "uint8", "int16", "uint16", "int32", "uint32"]
+    for t0, t1 in [(x, y) for x in types_4 for y in types_4 if not skip(x, y)]:
+        check(t0, t1, 4)
+    for t0, t1 in [(x, y) for x in types_8 for y in types_8 if not skip(x, y)]:
+        check(t0, t1, 8)
+    check("int8", "uint8", 16)
+    check("uint8", "int8", 16)
 
 
 def sched(B):

From 08ea9612dc58ffb11660b167e69543792ce88c22 Mon Sep 17 00:00:00 2001
From: masahi <masahi129@gmail.com>
Date: Wed, 3 Mar 2021 04:59:08 +0900
Subject: [PATCH 261/357] [Torch] Fix converting torch slice op with dynamic
 slice length (#7549)

* Fix converting torch slice op with dynamic slice length

* use isinstance

Co-authored-by: masa <masa@pop-os.localdomain>
---
 python/tvm/relay/frontend/pytorch.py          | 8 +++++++-
 tests/python/frontend/pytorch/test_forward.py | 9 +++++++++
 2 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/python/tvm/relay/frontend/pytorch.py b/python/tvm/relay/frontend/pytorch.py
index 31c78cfdea84..3c61749fc203 100644
--- a/python/tvm/relay/frontend/pytorch.py
+++ b/python/tvm/relay/frontend/pytorch.py
@@ -400,7 +400,13 @@ def slice(self, inputs, input_types):
         )
 
         # A fast path when slicing is nop.
-        if target_begin == 0 and target_end >= index_size_limit and stride == 1:
+        if (
+            isinstance(target_begin, int)
+            and isinstance(target_end, int)
+            and target_begin == 0
+            and target_end >= index_size_limit
+            and stride == 1
+        ):
             return data
 
         # Process begin
diff --git a/tests/python/frontend/pytorch/test_forward.py b/tests/python/frontend/pytorch/test_forward.py
index 826edd051544..9f035ade7a21 100644
--- a/tests/python/frontend/pytorch/test_forward.py
+++ b/tests/python/frontend/pytorch/test_forward.py
@@ -1497,6 +1497,10 @@ class SliceWithStride2(torch.nn.Module):
         def forward(self, x):
             return x[0::2, 0::2] + x[1::2, 1::2]
 
+    class DynamicLengthSlice(torch.nn.Module):
+        def forward(self, values, length):
+            return values[0:length]
+
     input_data = torch.rand(input_shape).float()
     verify_model(Slice1(), input_data=input_data)
     verify_model(Slice2(), input_data=input_data)
@@ -1504,6 +1508,11 @@ def forward(self, x):
     verify_model(SliceWithStride(), input_data=torch.randn(1, 4))
     verify_model(SliceWithStride2(), input_data=torch.randn(4, 4))
 
+    inp = torch.tensor([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
+    slice_len = torch.tensor(2)
+    targets = ["llvm", "cuda"]
+    verify_trace_model(DynamicLengthSlice(), [inp, slice_len], targets)
+
 
 @tvm.testing.uses_gpu
 def test_forward_narrow():

From 3a02e0b4438614b94594c8f8996161f548438d62 Mon Sep 17 00:00:00 2001
From: Altan Haan <ahaan@octoml.ai>
Date: Tue, 2 Mar 2021 16:44:00 -0800
Subject: [PATCH 262/357] [Pass] Profiling TVM compiler passes (#7500)

* basic pass profiler prototype

* allow enable/disable of pass profiling

* lint

* add example pass profiler usage as test

* render pass profiles to String instead of stdout
---
 include/tvm/ir/transform.h               |  13 +-
 python/tvm/ir/transform.py               |  23 ++++
 src/ir/transform.cc                      | 157 +++++++++++++++++++++++
 tests/python/relay/test_pass_profiler.py |  41 ++++++
 4 files changed, 224 insertions(+), 10 deletions(-)
 create mode 100644 tests/python/relay/test_pass_profiler.py

diff --git a/include/tvm/ir/transform.h b/include/tvm/ir/transform.h
index 56905ded5201..6557bbe31b8e 100644
--- a/include/tvm/ir/transform.h
+++ b/include/tvm/ir/transform.h
@@ -349,11 +349,8 @@ class Pass : public ObjectRef {
    *
    * \return The transformed module.
    */
-  IRModule operator()(IRModule mod) const {
-    const PassNode* node = operator->();
-    ICHECK(node != nullptr);
-    return node->operator()(std::move(mod));
-  }
+  IRModule operator()(IRModule mod) const;
+
   /*!
    * \brief Transform mod using a functor under a given pass context.
    *
@@ -362,11 +359,7 @@ class Pass : public ObjectRef {
    *
    * \return The transformed module.
    */
-  IRModule operator()(IRModule mod, const PassContext& pass_ctx) const {
-    const PassNode* node = operator->();
-    ICHECK(node != nullptr);
-    return node->operator()(std::move(mod), pass_ctx);
-  }
+  IRModule operator()(IRModule mod, const PassContext& pass_ctx) const;
 
   TVM_DEFINE_OBJECT_REF_METHODS(Pass, ObjectRef, PassNode);
 };
diff --git a/python/tvm/ir/transform.py b/python/tvm/ir/transform.py
index bb230cad0c9c..36e06eeb8b23 100644
--- a/python/tvm/ir/transform.py
+++ b/python/tvm/ir/transform.py
@@ -330,3 +330,26 @@ def PrintIR(header="", show_meta_data=False):
     The pass
     """
     return _ffi_transform_api.PrintIR(header, show_meta_data)
+
+
+def render_pass_profiles():
+    """Returns a string render of the pass profiling data. The format of each output line is
+    `{name}: {time} [{time excluding sub-passes}] ({% of total}; {% of parent})`.
+    The indentation of each line corresponds to nesting of passes.
+    """
+    return _ffi_transform_api.render_pass_profiles()
+
+
+def clear_pass_profiles():
+    """Clears all stored pass profiling data."""
+    _ffi_transform_api.clear_pass_profiles()
+
+
+def enable_pass_profiling():
+    """Enables pass profiling."""
+    _ffi_transform_api.enable_pass_profiling()
+
+
+def disable_pass_profiling():
+    """Disables pass profiling."""
+    _ffi_transform_api.disable_pass_profiling()
diff --git a/src/ir/transform.cc b/src/ir/transform.cc
index f4516d5e57c5..48f13bc81df4 100644
--- a/src/ir/transform.cc
+++ b/src/ir/transform.cc
@@ -28,6 +28,8 @@
 #include <tvm/runtime/device_api.h>
 #include <tvm/runtime/registry.h>
 
+#include <chrono>
+#include <iomanip>
 #include <stack>
 #include <unordered_set>
 
@@ -169,6 +171,161 @@ void PassContext::Trace(const IRModule& module, const PassInfo& info, bool is_be
 
 class ModulePass;
 
+/*! \brief PassProfile stores profiling information for a given pass and its sub-passes. */
+struct PassProfile {
+  // TODO(@altanh): expose PassProfile through TVM Object API
+  using Clock = std::chrono::steady_clock;
+  using Duration = std::chrono::duration<double, std::micro>;
+  using Time = std::chrono::time_point<Clock>;
+
+  /*! \brief The name of the pass being profiled. */
+  String name;
+  /*! \brief The time when the pass was entered. */
+  Time start;
+  /*! \brief The time when the pass completed. */
+  Time end;
+  /*! \brief The total duration of the pass, i.e. end - start. */
+  Duration duration;
+  /*! \brief PassProfiles for all sub-passes invoked during the execution of the pass. */
+  std::vector<PassProfile> children;
+
+  explicit PassProfile(String name)
+      : name(name), start(Clock::now()), end(Clock::now()), children() {}
+
+  /*! \brief Gets the PassProfile of the currently executing pass. */
+  static PassProfile* Current();
+  /*! \brief Pushes a new PassProfile with the given pass name. */
+  static void EnterPass(String name);
+  /*! \brief Pops the current PassProfile. */
+  static void ExitPass();
+};
+
+struct PassProfileThreadLocalEntry {
+  /*! \brief The placeholder top-level PassProfile. */
+  PassProfile root;
+  /*! \brief The stack of PassProfiles for nested passes currently running. */
+  std::stack<PassProfile*> profile_stack;
+  /*! \brief Whether or not pass profiling is active. */
+  bool active;
+
+  PassProfileThreadLocalEntry() : root("root"), active(false) {}
+};
+
+/*! \brief Thread local store to hold the pass profiling data. */
+typedef dmlc::ThreadLocalStore<PassProfileThreadLocalEntry> PassProfileThreadLocalStore;
+
+void PassProfile::EnterPass(String name) {
+  if (!PassProfileThreadLocalStore::Get()->active) return;
+  PassProfile* cur = PassProfile::Current();
+  cur->children.emplace_back(name);
+  PassProfileThreadLocalStore::Get()->profile_stack.push(&cur->children.back());
+}
+
+void PassProfile::ExitPass() {
+  if (!PassProfileThreadLocalStore::Get()->active) return;
+  PassProfile* cur = PassProfile::Current();
+  ICHECK_NE(cur->name, "root") << "mismatched enter/exit for pass profiling";
+  cur->end = std::move(PassProfile::Clock::now());
+  cur->duration = std::chrono::duration_cast<PassProfile::Duration>(cur->end - cur->start);
+  PassProfileThreadLocalStore::Get()->profile_stack.pop();
+}
+
+PassProfile* PassProfile::Current() {
+  PassProfileThreadLocalEntry* entry = PassProfileThreadLocalStore::Get();
+  if (!entry->profile_stack.empty()) {
+    return entry->profile_stack.top();
+  } else {
+    return &entry->root;
+  }
+}
+
+IRModule Pass::operator()(IRModule mod) const {
+  const PassNode* node = operator->();
+  ICHECK(node != nullptr);
+  PassProfile::EnterPass(node->Info()->name);
+  auto ret = node->operator()(std::move(mod));
+  PassProfile::ExitPass();
+  return std::move(ret);
+}
+
+IRModule Pass::operator()(IRModule mod, const PassContext& pass_ctx) const {
+  const PassNode* node = operator->();
+  ICHECK(node != nullptr);
+  PassProfile::EnterPass(node->Info()->name);
+  auto ret = node->operator()(std::move(mod), pass_ctx);
+  PassProfile::ExitPass();
+  return std::move(ret);
+}
+
+String RenderPassProfiles() {
+  PassProfileThreadLocalEntry* entry = PassProfileThreadLocalStore::Get();
+  CHECK(entry->profile_stack.empty()) << "cannot print pass profile while still in a pass!";
+
+  if (entry->root.children.empty()) {
+    LOG(WARNING) << "no passes have been profiled, did you enable pass profiling?";
+    return String();
+  }
+
+  // (depth, parent_duration, pass)
+  std::stack<std::tuple<size_t, PassProfile::Duration, PassProfile*>> profiles;
+
+  // push top level passes
+  PassProfile::Duration top_dur(0);
+  for (auto it = entry->root.children.begin(); it != entry->root.children.end(); ++it) {
+    top_dur += it->duration;
+  }
+  for (auto it = entry->root.children.rbegin(); it != entry->root.children.rend(); ++it) {
+    profiles.push(std::make_tuple(0, top_dur, &*it));
+  }
+
+  std::ostringstream os;
+  os << std::fixed;
+
+  while (profiles.size() > 0) {
+    size_t depth;
+    PassProfile::Duration parent_duration;
+    PassProfile* profile;
+    std::tie(depth, parent_duration, profile) = profiles.top();
+    profiles.pop();
+
+    // indent depth
+    for (size_t i = 0; i < depth; ++i) {
+      os << "\t";
+    }
+
+    // calculate time spent in pass itself (excluding sub-passes), and push children
+    PassProfile::Duration self_duration = profile->duration;
+    for (auto it = profile->children.rbegin(); it != profile->children.rend(); ++it) {
+      self_duration -= it->duration;
+      profiles.push(std::make_tuple(depth + 1, profile->duration, &*it));
+    }
+
+    double parent_pct = profile->duration.count() / parent_duration.count() * 100.0;
+    double total_pct = profile->duration.count() / top_dur.count() * 100.0;
+
+    os << profile->name << ": ";
+    os << std::setprecision(0);
+    os << profile->duration.count() << "us [" << self_duration.count() << "us] ";
+    os << std::setprecision(2) << "(" << total_pct << "%; " << parent_pct << "%)\n";
+  }
+
+  return os.str();
+}
+
+TVM_REGISTER_GLOBAL("transform.render_pass_profiles").set_body_typed(RenderPassProfiles);
+
+TVM_REGISTER_GLOBAL("transform.clear_pass_profiles").set_body_typed([]() {
+  PassProfileThreadLocalStore::Get()->root.children.clear();
+});
+
+TVM_REGISTER_GLOBAL("transform.enable_pass_profiling").set_body_typed([]() {
+  PassProfileThreadLocalStore::Get()->active = true;
+});
+
+TVM_REGISTER_GLOBAL("transform.disable_pass_profiling").set_body_typed([]() {
+  PassProfileThreadLocalStore::Get()->active = false;
+});
+
 /*!
  * \brief Module-level passes are designed to implement global
  * analysis/optimizations, i.e. interprocedural optimizations (IPO), etc. Passes
diff --git a/tests/python/relay/test_pass_profiler.py b/tests/python/relay/test_pass_profiler.py
new file mode 100644
index 000000000000..acf6c8c50aff
--- /dev/null
+++ b/tests/python/relay/test_pass_profiler.py
@@ -0,0 +1,41 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+import tvm
+import tvm.relay
+from tvm.relay import op
+
+
+def test_pass_profiler():
+    x, y, z = [tvm.relay.var(c, shape=(3, 4), dtype="float32") for c in "xyz"]
+    e1 = op.add(x, y)
+    e2 = op.subtract(x, z)
+    e3 = op.multiply(e1, e1 / e2)
+    mod = tvm.IRModule.from_expr(e3 + e2)
+
+    tvm.transform.enable_pass_profiling()
+
+    mod = tvm.relay.transform.AnnotateSpans()(mod)
+    mod = tvm.relay.transform.ToANormalForm()(mod)
+    mod = tvm.relay.transform.InferType()(mod)
+
+    profiles = tvm.transform.render_pass_profiles()
+    assert "AnnotateSpans" in profiles
+    assert "ToANormalForm" in profiles
+    assert "InferType" in profiles
+
+    tvm.transform.clear_pass_profiles()
+    tvm.transform.disable_pass_profiling()

From cf36aa65de80efd84a906058d66b72fb4c23ab16 Mon Sep 17 00:00:00 2001
From: masahi <masahi129@gmail.com>
Date: Wed, 3 Mar 2021 10:31:05 +0900
Subject: [PATCH 263/357] [TIR] Add TIR While node (#7425)

* add while node

* update visitors

* binary search lowering works

* llvm codegen working

* cuda codegen working

* nms updated to use while loop

* add missing upper bound check too

* add mandelbrot test

* add gpu mandel

commit ee2363bf8131830cf0fb112890befd6be6a03f36
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Fri Jan 29 11:44:02 2021 +0900

    enable extern lib offload for nvptx

* rename test

* run black

* add doc

* add collatz test

* add while + vectorize test

* simplify bin search

* Add special case visit method to storage_access.cc

* disallow while loop inside vectorized loop

* disallow trivial condition since we do not have break

* error out in CoprocSync for now

* error out LiftAttrScope for now

* add placeholder to inject_vpthread

* refactor to use MakeAttach

* handle WhileNode in InplaceOpVerifier

* error out in InjectVirtualThread

* try handle WhileNode in StoragePlanRewriter

* remove WhileNode visitor from storage rewrite

* add while loop storage rewrite test

* update tests

* move test_vectorize_while_fail to  test_tir_transform_vectorize.py
---
 include/tvm/tir/stmt.h                        |  47 +++
 include/tvm/tir/stmt_functor.h                |   4 +
 python/tvm/tir/ir_builder.py                  |  29 ++
 python/tvm/tir/stmt.py                        |  25 ++
 python/tvm/topi/cuda/nms.py                   |  28 +-
 src/printer/text_printer.h                    |   1 +
 src/printer/tir_text_printer.cc               |   7 +
 src/target/llvm/codegen_llvm.cc               |  14 +
 src/target/llvm/codegen_llvm.h                |   1 +
 src/target/source/codegen_c.cc                |  11 +-
 src/target/source/codegen_c.h                 |   1 +
 src/tir/ir/stmt.cc                            |  32 ++
 src/tir/ir/stmt_functor.cc                    |  18 +
 src/tir/transforms/coproc_sync.cc             |   5 +
 src/tir/transforms/inject_virtual_thread.cc   |   7 +
 src/tir/transforms/lift_attr_scope.cc         |   6 +
 src/tir/transforms/storage_access.cc          |  13 +
 src/tir/transforms/storage_access.h           |   1 +
 src/tir/transforms/storage_rewrite.cc         |  16 +-
 src/tir/transforms/vectorize_loop.cc          |   7 +-
 tests/python/unittest/test_tir_ir_builder.py  | 326 ++++++++++++++++++
 .../test_tir_transform_storage_rewrite.py     |  71 ++++
 .../unittest/test_tir_transform_vectorize.py  |  48 +++
 23 files changed, 695 insertions(+), 23 deletions(-)

diff --git a/include/tvm/tir/stmt.h b/include/tvm/tir/stmt.h
index 074bcdd3f533..ac660bfb7461 100644
--- a/include/tvm/tir/stmt.h
+++ b/include/tvm/tir/stmt.h
@@ -861,6 +861,53 @@ class For : public Stmt {
   TVM_DEFINE_OBJECT_REF_METHODS(For, Stmt, ForNode);
 };
 
+/*!
+ * \brief A While loop
+ *
+ * \code
+ *
+ *  while (condition)
+ *    body
+ *
+ * \endcode
+ */
+class WhileNode : public StmtNode {
+ public:
+  /*! \brief The termination condition. */
+  PrimExpr condition;
+  /*! \brief The body of the while loop. */
+  Stmt body;
+
+  void VisitAttrs(AttrVisitor* v) {
+    v->Visit("condition", &condition);
+    v->Visit("body", &body);
+    v->Visit("span", &span);
+  }
+
+  bool SEqualReduce(const WhileNode* other, SEqualReducer equal) const {
+    return equal.DefEqual(condition, other->condition) && equal.DefEqual(body, other->body);
+  }
+
+  void SHashReduce(SHashReducer hash_reduce) const {
+    hash_reduce.DefHash(condition);
+    hash_reduce.DefHash(body);
+  }
+
+  static constexpr const char* _type_key = "tir.While";
+  TVM_DECLARE_FINAL_OBJECT_INFO(WhileNode, StmtNode);
+};
+
+/*!
+ * \brief Managed reference to WhileNode.
+ * \sa WhileNode
+ */
+class While : public Stmt {
+ public:
+  TVM_DLL While(PrimExpr condition, Stmt body, Span span = Span());
+
+  TVM_DEFINE_OBJECT_REF_METHODS(While, Stmt, WhileNode);
+};
+
 /*!
  * \brief A prefetch hint for a buffer
  */
diff --git a/include/tvm/tir/stmt_functor.h b/include/tvm/tir/stmt_functor.h
index e53b02d73e1d..ceebbbb305ce 100644
--- a/include/tvm/tir/stmt_functor.h
+++ b/include/tvm/tir/stmt_functor.h
@@ -86,6 +86,7 @@ class StmtFunctor<R(const Stmt& n, Args... args)> {
   virtual R VisitStmt_(const AttrStmtNode* op, Args... args) STMT_FUNCTOR_DEFAULT;
   virtual R VisitStmt_(const IfThenElseNode* op, Args... args) STMT_FUNCTOR_DEFAULT;
   virtual R VisitStmt_(const ForNode* op, Args... args) STMT_FUNCTOR_DEFAULT;
+  virtual R VisitStmt_(const WhileNode* op, Args... args) STMT_FUNCTOR_DEFAULT;
   virtual R VisitStmt_(const AllocateNode* op, Args... args) STMT_FUNCTOR_DEFAULT;
   virtual R VisitStmt_(const StoreNode* op, Args... args) STMT_FUNCTOR_DEFAULT;
   virtual R VisitStmt_(const BufferStoreNode* op, Args... args) STMT_FUNCTOR_DEFAULT;
@@ -111,6 +112,7 @@ class StmtFunctor<R(const Stmt& n, Args... args)> {
     IR_STMT_FUNCTOR_DISPATCH(AttrStmtNode);
     IR_STMT_FUNCTOR_DISPATCH(IfThenElseNode);
     IR_STMT_FUNCTOR_DISPATCH(ForNode);
+    IR_STMT_FUNCTOR_DISPATCH(WhileNode);
     IR_STMT_FUNCTOR_DISPATCH(AllocateNode);
     IR_STMT_FUNCTOR_DISPATCH(StoreNode);
     IR_STMT_FUNCTOR_DISPATCH(AssertStmtNode);
@@ -152,6 +154,7 @@ class TVM_DLL StmtVisitor : protected StmtFunctor<void(const Stmt&)> {
   void VisitStmt_(const IfThenElseNode* op) override;
   void VisitStmt_(const LetStmtNode* op) override;
   void VisitStmt_(const ForNode* op) override;
+  void VisitStmt_(const WhileNode* op) override;
   void VisitStmt_(const AllocateNode* op) override;
   void VisitStmt_(const StoreNode* op) override;
   void VisitStmt_(const BufferStoreNode* op) override;
@@ -245,6 +248,7 @@ class TVM_DLL StmtMutator : protected StmtFunctor<Stmt(const Stmt&)> {
   Stmt VisitStmt_(const IfThenElseNode* op) override;
   Stmt VisitStmt_(const LetStmtNode* op) override;
   Stmt VisitStmt_(const ForNode* op) override;
+  Stmt VisitStmt_(const WhileNode* op) override;
   Stmt VisitStmt_(const AllocateNode* op) override;
   Stmt VisitStmt_(const StoreNode* op) override;
   Stmt VisitStmt_(const BufferStoreNode* op) override;
diff --git a/python/tvm/tir/ir_builder.py b/python/tvm/tir/ir_builder.py
index 437e8f6610f4..2ecbdeda8371 100644
--- a/python/tvm/tir/ir_builder.py
+++ b/python/tvm/tir/ir_builder.py
@@ -263,6 +263,35 @@ def _exit_cb():
 
         return WithScope(loop_var, _exit_cb)
 
+    def while_loop(self, condition):
+        """Create a while loop scope.
+
+        Parameters
+        ----------
+        condition : Expr
+            The termination condition.
+
+        Returns
+        -------
+        loop_scope : With.Scope of Var
+            The while scope.
+
+        Examples
+        --------
+        .. code-block:: python
+
+            ib = tvm.tir.ir_builder.create()
+            iterations = ib.allocate("int32", (1,), name="iterations", scope="local")
+            with ib.while_loop(iterations[0] < 10):
+                iterations[0] += 1
+        """
+        self._seq_stack.append([])
+
+        def _exit_cb():
+            self.emit(_stmt.While(condition, self._pop_seq()))
+
+        return WithScope(None, _exit_cb)
+
     def if_scope(self, cond):
         """Create an if scope.
 
diff --git a/python/tvm/tir/stmt.py b/python/tvm/tir/stmt.py
index e4f1ac924a83..47462066c364 100644
--- a/python/tvm/tir/stmt.py
+++ b/python/tvm/tir/stmt.py
@@ -159,6 +159,31 @@ def __init__(
         )
 
 
+@tvm._ffi.register_object("tir.While")
+class While(Stmt):
+    """While node.
+
+    Parameters
+    ----------
+    condition : PrimExpr
+        The termination condition.
+
+    body : Stmt
+        The body statement.
+
+    span : Optional[Span]
+        The location of this itervar in the source code.
+    """
+
+    def __init__(self, condition, body, span=None):
+        self.__init_handle_by_constructor__(
+            _ffi_api.While,
+            condition,
+            body,
+            span,
+        )
+
+
 @tvm._ffi.register_object("tir.Store")
 class Store(Stmt):
     """Store node.
diff --git a/python/tvm/topi/cuda/nms.py b/python/tvm/topi/cuda/nms.py
index 152b1bd15987..83b538554ed4 100644
--- a/python/tvm/topi/cuda/nms.py
+++ b/python/tvm/topi/cuda/nms.py
@@ -521,7 +521,7 @@ def nms_inner_loop(ib, j):
             offset_j = j * 4
             num_iter_per_thread = ceil_div(nkeep - (j + 1), nthread_tx)
 
-            with ib.for_range(0, num_iter_per_thread) as _k:
+            with ib.for_range(0, num_iter_per_thread, name="_k") as _k:
                 k = j + 1 + _k * nthread_tx + tx
                 offset_k = k * 4
 
@@ -555,16 +555,22 @@ def nms_inner_loop(ib, j):
 
         with ib.if_scope(tvm.tir.all(iou_threshold > 0, valid_count[i] > 0)):
             # Apply nms
-            with ib.for_range(0, nkeep) as j:
-                # Proceed to the inner loop if the box j is still valid
-                with ib.if_scope(out_scores[i, j] > -1.0):
-                    with ib.if_scope(max_output_size > 0):
-                        # No need to do more iteration if we have already reached max_output_size
-                        # boxes
-                        # TODO(masahi): Add TIR while loop to realize early exit from the outer loop
-                        with ib.if_scope(num_valid_boxes_local[0] < max_output_size):
-                            nms_inner_loop(ib, j)
-                    with ib.else_scope():
+            with ib.if_scope(max_output_size > 0):
+                # No need to do more iteration if we have already reached max_output_size boxes
+                box_idx = ib.allocate("int32", (1,), name="box_idx", scope="local")
+                box_idx[0] = 0
+                with ib.while_loop(
+                    tvm.tir.all(box_idx[0] < nkeep, num_valid_boxes_local[0] < max_output_size)
+                ):
+                    # Proceed to the inner loop if the box with id box_idx is still valid
+                    with ib.if_scope(out_scores[i, box_idx[0]] > -1.0):
+                        nms_inner_loop(ib, box_idx[0])
+                    box_idx[0] += 1
+
+            with ib.else_scope():
+                with ib.for_range(0, nkeep, name="j") as j:
+                    # Proceed to the inner loop if the box j is still valid
+                    with ib.if_scope(out_scores[i, j] > -1.0):
                         nms_inner_loop(ib, j)
 
             with ib.if_scope(tx + 0 == 0):
diff --git a/src/printer/text_printer.h b/src/printer/text_printer.h
index 9a24fe65b4b1..6ec32a9e104c 100644
--- a/src/printer/text_printer.h
+++ b/src/printer/text_printer.h
@@ -308,6 +308,7 @@ class TIRTextPrinter : public StmtFunctor<Doc(const Stmt&)>,
   Doc VisitStmt_(const SeqStmtNode* op) override;
   Doc VisitStmt_(const EvaluateNode* op) override;
   Doc VisitStmt_(const ForNode* op) override;
+  Doc VisitStmt_(const WhileNode* op) override;
   Doc VisitStmt_(const PrefetchNode* op) override;
   Doc VisitStmtDefault_(const Object* op) override;
 
diff --git a/src/printer/tir_text_printer.cc b/src/printer/tir_text_printer.cc
index 711af2a8fd08..8d5bba5e5bb0 100644
--- a/src/printer/tir_text_printer.cc
+++ b/src/printer/tir_text_printer.cc
@@ -494,6 +494,13 @@ Doc TIRTextPrinter::VisitStmt_(const ForNode* op) {
   return doc;
 }
 
+Doc TIRTextPrinter::VisitStmt_(const WhileNode* op) {
+  Doc doc;
+  doc << "while (" << Print(op->condition) << ")";
+  doc << PrintBody(op->body);
+  return doc;
+}
+
 Doc TIRTextPrinter::VisitStmt_(const PrefetchNode* op) {
   Doc doc;
   doc << "prefetch(" << Print(op->buffer) << ", " << Print(op->bounds) << ")";
diff --git a/src/target/llvm/codegen_llvm.cc b/src/target/llvm/codegen_llvm.cc
index 1dd76f6b9d51..d5140677d45a 100644
--- a/src/target/llvm/codegen_llvm.cc
+++ b/src/target/llvm/codegen_llvm.cc
@@ -1328,6 +1328,20 @@ void CodeGenLLVM::VisitStmt_(const ForNode* op) {
                   llvm::ConstantInt::getSigned(GetLLVMType(op->extent), 1), op->loop_var, op->body);
 }
 
+void CodeGenLLVM::VisitStmt_(const WhileNode* op) {
+  using llvm::BasicBlock;
+  BasicBlock* while_cond = BasicBlock::Create(*ctx_, "while_cond", function_);
+  BasicBlock* while_body = BasicBlock::Create(*ctx_, "while_body", function_);
+  BasicBlock* while_merge = BasicBlock::Create(*ctx_, "while_merge", function_);
+  builder_->CreateBr(while_cond);
+  builder_->SetInsertPoint(while_cond);
+  builder_->CreateCondBr(MakeValue(op->condition), while_body, while_merge);
+  builder_->SetInsertPoint(while_body);
+  this->VisitStmt(op->body);
+  builder_->CreateBr(while_cond);
+  builder_->SetInsertPoint(while_merge);
+}
+
 void CodeGenLLVM::VisitStmt_(const IfThenElseNode* op) {
   using llvm::BasicBlock;
   llvm::Value* cond = MakeValue(op->condition);
diff --git a/src/target/llvm/codegen_llvm.h b/src/target/llvm/codegen_llvm.h
index 71583708da2c..e56a6de6d914 100644
--- a/src/target/llvm/codegen_llvm.h
+++ b/src/target/llvm/codegen_llvm.h
@@ -152,6 +152,7 @@ class CodeGenLLVM : public ExprFunctor<llvm::Value*(const PrimExpr&)>,
   // stmt
   void VisitStmt_(const StoreNode* op) override;
   void VisitStmt_(const ForNode* op) override;
+  void VisitStmt_(const WhileNode* op) override;
   void VisitStmt_(const IfThenElseNode* op) override;
   void VisitStmt_(const AllocateNode* op) override;
   void VisitStmt_(const AttrStmtNode* op) override;
diff --git a/src/target/source/codegen_c.cc b/src/target/source/codegen_c.cc
index af175c7f2208..55db59f8d842 100644
--- a/src/target/source/codegen_c.cc
+++ b/src/target/source/codegen_c.cc
@@ -728,7 +728,6 @@ void CodeGenC::VisitStmt_(const StoreNode* op) {
     ICHECK(is_one(op->predicate)) << "Predicated store is not supported";
     arith::PVar<PrimExpr> base;
 
-
     if (arith::ramp(base, 1, t.lanes()).Match(op->index)) {
       std::string value = this->PrintExpr(op->value);
       this->PrintVecStore(op->buffer_var.get(), t, base.Eval(), value);
@@ -899,6 +898,16 @@ void CodeGenC::VisitStmt_(const ForNode* op) {
   stream << "}\n";
 }
 
+void CodeGenC::VisitStmt_(const WhileNode* op) {
+  PrintIndent();
+  stream << "while (" << PrintExpr(op->condition) << ") {\n";
+  int while_scope = BeginScope();
+  PrintStmt(op->body);
+  this->EndScope(while_scope);
+  PrintIndent();
+  stream << "}\n";
+}
+
 void CodeGenC::VisitStmt_(const IfThenElseNode* op) {
   std::string cond = PrintExpr(op->condition);
   PrintIndent();
diff --git a/src/target/source/codegen_c.h b/src/target/source/codegen_c.h
index c1b566c064a4..76e6a9bc7197 100644
--- a/src/target/source/codegen_c.h
+++ b/src/target/source/codegen_c.h
@@ -150,6 +150,7 @@ class CodeGenC : public ExprFunctor<void(const PrimExpr&, std::ostream&)>,
   void VisitStmt_(const LetStmtNode* op) override;
   void VisitStmt_(const StoreNode* op) override;
   void VisitStmt_(const ForNode* op) override;
+  void VisitStmt_(const WhileNode* op) override;
   void VisitStmt_(const IfThenElseNode* op) override;
   void VisitStmt_(const AllocateNode* op) override;
   void VisitStmt_(const AttrStmtNode* op) override;
diff --git a/src/tir/ir/stmt.cc b/src/tir/ir/stmt.cc
index e54be4347c8e..2aeaae3eb592 100644
--- a/src/tir/ir/stmt.cc
+++ b/src/tir/ir/stmt.cc
@@ -197,6 +197,38 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable)
       p->stream << "}\n";
     });
 
+// While
+While::While(PrimExpr condition, Stmt body, Span span) {
+  ICHECK(condition.defined());
+  ICHECK(condition.dtype().is_scalar());
+  ICHECK(condition.as<tir::IntImmNode>() == nullptr) << "The condition should not be trivial.";
+  ICHECK(body.defined());
+
+  ObjectPtr<WhileNode> node = make_object<WhileNode>();
+  node->condition = std::move(condition);
+  node->body = std::move(body);
+  node->span = std::move(span);
+  data_ = std::move(node);
+}
+
+TVM_REGISTER_GLOBAL("tir.While").set_body_typed([](PrimExpr condition, Stmt body, Span span) {
+  return While(condition, body, span);
+});
+
+TVM_REGISTER_NODE_TYPE(WhileNode);
+
+TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable)
+    .set_dispatch<WhileNode>([](const ObjectRef& node, ReprPrinter* p) {
+      auto* op = static_cast<const WhileNode*>(node.get());
+      p->PrintIndent();
+      p->stream << "while(" << op->condition << "){\n";
+      p->indent += 2;
+      p->Print(op->body);
+      p->indent -= 2;
+      p->PrintIndent();
+      p->stream << "}\n";
+    });
+
 // Store
 Store::Store(Var buffer_var, PrimExpr value, PrimExpr index, PrimExpr predicate, Span span) {
   ICHECK(value.defined());
diff --git a/src/tir/ir/stmt_functor.cc b/src/tir/ir/stmt_functor.cc
index f05dc7116494..639d38db0a81 100644
--- a/src/tir/ir/stmt_functor.cc
+++ b/src/tir/ir/stmt_functor.cc
@@ -45,6 +45,11 @@ void StmtVisitor::VisitStmt_(const ForNode* op) {
   this->VisitStmt(op->body);
 }
 
+void StmtVisitor::VisitStmt_(const WhileNode* op) {
+  this->VisitExpr(op->condition);
+  this->VisitStmt(op->body);
+}
+
 void StmtVisitor::VisitStmt_(const AllocateNode* op) {
   VisitArray(op->extents, [this](const PrimExpr& e) { this->VisitExpr(e); });
   this->VisitStmt(op->body);
@@ -283,6 +288,19 @@ Stmt StmtMutator::VisitStmt_(const ForNode* op) {
   }
 }
 
+Stmt StmtMutator::VisitStmt_(const WhileNode* op) {
+  PrimExpr condition = this->VisitExpr(op->condition);
+  Stmt body = this->VisitStmt(op->body);
+  if (condition.same_as(op->condition) && body.same_as(op->body)) {
+    return GetRef<Stmt>(op);
+  } else {
+    auto n = CopyOnWrite(op);
+    n->condition = std::move(condition);
+    n->body = std::move(body);
+    return Stmt(n);
+  }
+}
+
 Stmt StmtMutator::VisitStmt_(const AllocateNode* op) {
   Array<PrimExpr> extents = Internal::Mutate(this, op->extents);
   Stmt body = this->VisitStmt(op->body);
diff --git a/src/tir/transforms/coproc_sync.cc b/src/tir/transforms/coproc_sync.cc
index f9245442d268..424a1bbb0ae6 100644
--- a/src/tir/transforms/coproc_sync.cc
+++ b/src/tir/transforms/coproc_sync.cc
@@ -429,6 +429,11 @@ class CoProcInstDepDetector : public StmtVisitor {
     }
   }
 
+  void VisitStmt_(const WhileNode* op) final {
+    // TODO(masahi): Do we need a special handling for While nodes?
+    LOG(FATAL) << "WhileNode not supported in CoProcSync.";
+  }
+
   // insert before is stored in reverse order
   // the first element is closest to the node.
   std::unordered_map<const Object*, std::vector<Stmt> > insert_before_;
diff --git a/src/tir/transforms/inject_virtual_thread.cc b/src/tir/transforms/inject_virtual_thread.cc
index b24a0e95cd53..4ef10f326bb0 100644
--- a/src/tir/transforms/inject_virtual_thread.cc
+++ b/src/tir/transforms/inject_virtual_thread.cc
@@ -333,6 +333,13 @@ class VTInjector : public StmtExprMutator {
     }
   }
 
+  // While
+  Stmt VisitStmt_(const WhileNode* op) final {
+    // TODO(masahi): What should we do for While nodes?
+    LOG(FATAL) << "WhileNode in InjectVirtualThread not supported yet";
+    return Stmt();
+  }
+
   // Seq
   Stmt VisitStmt_(const SeqStmtNode* op) final {
     ICHECK_EQ(max_loop_depth_, 0);
diff --git a/src/tir/transforms/lift_attr_scope.cc b/src/tir/transforms/lift_attr_scope.cc
index 27dd583b8b42..40d152b3b3b6 100644
--- a/src/tir/transforms/lift_attr_scope.cc
+++ b/src/tir/transforms/lift_attr_scope.cc
@@ -157,6 +157,12 @@ class AttrScopeLifter : public StmtMutator {
     }
   }
 
+  Stmt VisitStmt_(const WhileNode* op) final {
+    // TODO(masahi): Do we need a special handling for While nodes?
+    LOG(FATAL) << "WhileNode not supported in LiftAttrScope.";
+    return Stmt();
+  }
+
  private:
   // value comparison that also compares content of int constant
   static bool ValueSame(const PrimExpr& a, const PrimExpr& b) {
diff --git a/src/tir/transforms/storage_access.cc b/src/tir/transforms/storage_access.cc
index be20724ae207..38143c14b021 100644
--- a/src/tir/transforms/storage_access.cc
+++ b/src/tir/transforms/storage_access.cc
@@ -180,6 +180,19 @@ void StorageAccessVisitor::VisitStmt_(const IfThenElseNode* op) {
   --condition_counter_;
 }
 
+void StorageAccessVisitor::VisitStmt_(const WhileNode* op) {
+  ++condition_counter_;
+  this->VisitExpr(op->condition);
+  scope_.push_back(std::vector<StmtEntry>());
+  this->VisitStmt(op->body);
+  StmtEntry s;
+  s.stmt = op;
+  s.access = Summarize(std::move(scope_.back()), nullptr);
+  scope_.pop_back();
+  scope_.back().emplace_back(std::move(s));
+  --condition_counter_;
+}
+
 void StorageAccessVisitor::VisitExpr_(const CallNode* op) {
   if (op->op.same_as(builtin::address_of())) {
     const LoadNode* l = op->args[0].as<LoadNode>();
diff --git a/src/tir/transforms/storage_access.h b/src/tir/transforms/storage_access.h
index 80bbff4c1fe4..663c570fd15c 100644
--- a/src/tir/transforms/storage_access.h
+++ b/src/tir/transforms/storage_access.h
@@ -84,6 +84,7 @@ class StorageAccessVisitor : public StmtExprVisitor {
   void VisitStmt_(const AttrStmtNode* op) final;
   void VisitStmt_(const ForNode* op) final;
   void VisitStmt_(const IfThenElseNode* op) final;
+  void VisitStmt_(const WhileNode* op) final;
   void VisitExpr_(const CallNode* op) final;
 
  protected:
diff --git a/src/tir/transforms/storage_rewrite.cc b/src/tir/transforms/storage_rewrite.cc
index 0b1429ca7efa..36eeddb17d89 100644
--- a/src/tir/transforms/storage_rewrite.cc
+++ b/src/tir/transforms/storage_rewrite.cc
@@ -192,6 +192,8 @@ class LinearAccessPatternFinder final : public StmtExprVisitor {
 
   void VisitStmt_(const ForNode* op) final { VisitNewScope(op); }
 
+  void VisitStmt_(const WhileNode* op) final { VisitNewScope(op); }
+
   void VisitStmt_(const AssertStmtNode* op) final { VisitNewScope(op); }
 
   // linearized access sequence.
@@ -244,6 +246,8 @@ class InplaceOpVerifier : public StmtExprVisitor {
       VisitStmt_(static_cast<const ForNode*>(stmt));
     } else if (stmt->IsInstance<IfThenElseNode>()) {
       VisitStmt_(static_cast<const IfThenElseNode*>(stmt));
+    } else if (stmt->IsInstance<WhileNode>()) {
+      VisitStmt_(static_cast<const WhileNode*>(stmt));
     } else if (stmt->IsInstance<StoreNode>()) {
       VisitStmt_(static_cast<const StoreNode*>(stmt));
     } else {
@@ -350,16 +354,7 @@ class StoragePlanRewriter : public StmtExprMutator {
     // start rewrite
     stmt = operator()(std::move(stmt));
     if (attach_map_.count(nullptr)) {
-      std::vector<Stmt> nest;
-      for (StorageEntry* e : attach_map_.at(nullptr)) {
-        // ICHECK_EQ(e->scope.rank, 0);
-        if (e->new_alloc.defined()) {
-          nest.emplace_back(AttrStmt(e->alloc_var, attr::storage_scope,
-                                     StringImm(e->scope.to_string()), Evaluate(0)));
-          nest.push_back(e->new_alloc);
-        }
-      }
-      stmt = MergeNest(nest, stmt);
+      return MakeAttach(attach_map_.at(nullptr), stmt);
     }
     return stmt;
   }
@@ -437,6 +432,7 @@ class StoragePlanRewriter : public StmtExprMutator {
       return StmtExprMutator::VisitStmt_(op);
     }
   }
+
   Stmt VisitStmt_(const ForNode* op) final {
     ICHECK(op->kind != ForKind::kVectorized) << "VectorizeLoop before LiftStorageAlloc";
     // remake all the allocation at the attach scope.
diff --git a/src/tir/transforms/vectorize_loop.cc b/src/tir/transforms/vectorize_loop.cc
index 66f4ae329f69..64956bc8ee54 100644
--- a/src/tir/transforms/vectorize_loop.cc
+++ b/src/tir/transforms/vectorize_loop.cc
@@ -388,6 +388,11 @@ class Vectorizer : public StmtMutator, public ExprFunctor<PrimExpr(const PrimExp
       return IfThenElse(condition, then_case, else_case);
     }
   }
+  // While
+  Stmt VisitStmt_(const WhileNode* op) final {
+    LOG(FATAL) << "A while loop inside a vectorized loop not supported.";
+    return Stmt();
+  }
   // LetStmt
   Stmt VisitStmt_(const LetStmtNode* op) final {
     PrimExpr value = this->VisitExpr(op->value);
@@ -441,7 +446,7 @@ class Vectorizer : public StmtMutator, public ExprFunctor<PrimExpr(const PrimExp
   }
   // ProducerStore
   Stmt VisitStmt_(const ProducerStoreNode* op) final {
-    LOG(FATAL) << "ProducerProvide is cannot appear in a TIR PrimFunc";
+    LOG(FATAL) << "ProducerProvide cannot appear in a TIR PrimFunc";
     return Stmt();
   }
 
diff --git a/tests/python/unittest/test_tir_ir_builder.py b/tests/python/unittest/test_tir_ir_builder.py
index b84ee09b9fd9..46bc500fc503 100644
--- a/tests/python/unittest/test_tir_ir_builder.py
+++ b/tests/python/unittest/test_tir_ir_builder.py
@@ -173,9 +173,335 @@ def check_target(target):
     check_target("cuda")
 
 
+def test_while_vectorize():
+    """Test while loop + vectorized inner loop"""
+
+    n = 64
+    num_iter = 10
+
+    def test_ir(A, B, C):
+        ib = tvm.tir.ir_builder.create()
+        n = C.shape[0]
+        A = ib.buffer_ptr(A)
+        B = ib.buffer_ptr(B)
+        C = ib.buffer_ptr(C)
+        i = ib.allocate("int32", (1,), name="i", scope="local")
+        i[0] = 0
+
+        with ib.for_range(0, n) as j:
+            C[j] = 0.0
+
+        with ib.while_loop(i[0] < num_iter):
+            with ib.for_range(0, n, kind="vectorize") as j:
+                C[j] += A[j] + B[j]
+            i[0] += 1
+
+        return ib.get()
+
+    def check_target(target, ir):
+        dtype = "float32"
+        A = te.placeholder((n,), name="A", dtype=dtype)
+        B = te.placeholder((n,), name="B", dtype=dtype)
+
+        C = te.extern(
+            (n,),
+            [A, B],
+            lambda ins, outs: ir(ins[0], ins[1], outs[0]),
+            name="while_vectorize",
+            dtype=dtype,
+        )
+        s = te.create_schedule(C.op)
+
+        with tvm.transform.PassContext(opt_level=3):
+            func = tvm.build(s, [A, B, C], target)
+
+        ctx = tvm.context(target, 0)
+        a_np = np.random.uniform(size=n).astype(A.dtype)
+        b_np = np.random.uniform(size=n).astype(B.dtype)
+        a = tvm.nd.array(a_np, ctx)
+        b = tvm.nd.array(b_np, ctx)
+        c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)
+        func(a, b, c)
+        ref = num_iter * (a_np + b_np)
+        tvm.testing.assert_allclose(c.asnumpy(), ref, rtol=1e-5, atol=1e-5)
+
+    check_target("llvm", test_ir)
+
+
+def test_while_collatz():
+    """Test while loop + if"""
+
+    def collatz_ref(n):
+        a = n
+        i = 0
+        while a > 1:
+            if a % 2 == 1:
+                a = 3 * a + 1
+            else:
+                a = a >> 1
+            i += 1
+        return i
+
+    def collatz(ib, n, C):
+        i = ib.allocate("int32", (1,), name="i", scope="local")
+        a = ib.allocate("int32", (1,), name="a", scope="local")
+        i[0] = 0
+        a[0] = n
+        with ib.while_loop(a[0] > 1):
+            with ib.if_scope(tvm.tir.floormod(a[0], 2) == 1):
+                a[0] = 3 * a[0] + 1
+            with ib.else_scope():
+                a[0] = a[0] >> 1
+            i[0] += 1
+
+        C[n] = i[0]
+
+    def collatz_ir_cpu(C):
+        ib = tvm.tir.ir_builder.create()
+        n = C.shape[0]
+        C = ib.buffer_ptr(C)
+
+        with ib.for_range(0, n, name="i", kind="parallel") as i:
+            collatz(ib, i, C)
+
+        body = ib.get()
+
+        return body
+
+    n = 30
+
+    def check_target(target, ir):
+        C = te.extern(
+            (n,),
+            [],
+            lambda ins, outs: ir(outs[0]),
+            name="collatz",
+            dtype="int32",
+        )
+        s = te.create_schedule(C.op)
+
+        with tvm.transform.PassContext(opt_level=3):
+            func = tvm.build(s, [C], target)
+
+        ctx = tvm.context(target, 0)
+        c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)
+        func(c)
+        ref = np.array([collatz_ref(i) for i in range(n)])
+        tvm.testing.assert_allclose(c.asnumpy(), ref)
+
+    check_target("llvm", collatz_ir_cpu)
+
+
+def test_while_mandel():
+    n = 160
+    shape = (n * 2, n)
+    t = 300
+
+    def mandel_ref():
+        def complex_sqr(z):
+            return np.array([z[0] ** 2 - z[1] ** 2, z[1] * z[0] * 2])
+
+        pixels = np.zeros(shape)
+
+        for i in range(pixels.shape[0]):
+            for j in range(pixels.shape[1]):
+                c = np.array([-0.8, np.cos(t) * 0.2])
+                z = np.array([i / n - 1, j / n - 0.5]) * 2
+                iterations = 0
+
+                while np.linalg.norm(z) < 20 and iterations < 50:
+                    z = complex_sqr(z) + c
+                    iterations += 1
+
+                pixels[i, j] = 1 - iterations * 0.02
+
+        return pixels
+
+    def mandel(ib, i, j, pixels):
+        z = ib.allocate("float32", (2,), name="z", scope="local")
+        tmp = ib.allocate("float32", (1,), name="tmp", scope="local")
+        iterations = ib.allocate("int32", (1,), name="iterations", scope="local")
+
+        z[0] = (i / float(n) - 1) * 2
+        z[1] = (j / float(n) - 0.5) * 2
+        iterations[0] = 0
+        c = [-0.8, float(np.cos(t)) * 0.2]
+
+        def norm(z):
+            return tvm.tir.sqrt(z[0] * z[0] + z[1] * z[1])
+
+        with ib.while_loop(tvm.tir.all(norm(z) < 20, iterations[0] < 50)):
+            tmp[0] = z[0]
+            z[0] = z[0] * z[0] - z[1] * z[1] + c[0]
+            z[1] = z[1] * tmp[0] * 2 + c[1]
+            iterations[0] += 1
+
+        pixels[i, j] = 1 - iterations[0] * 0.02
+
+    def mandel_ir_cpu(C):
+        ib = tvm.tir.ir_builder.create()
+        ny = C.shape[0]
+        nx = C.shape[1]
+        C = ib.buffer_ptr(C)
+
+        with ib.for_range(0, ny, name="i", kind="parallel") as i:
+            with ib.for_range(0, nx, name="j") as j:
+                mandel(ib, i, j, C)
+
+        body = ib.get()
+
+        return body
+
+    def mandel_ir_gpu(C):
+        ib = tvm.tir.ir_builder.create()
+        ny = C.shape[0]
+        nx = C.shape[1]
+        C = ib.buffer_ptr(C)
+
+        bx = te.thread_axis("blockIdx.x")
+        tx = te.thread_axis("threadIdx.x")
+        by = te.thread_axis("blockIdx.y")
+        ty = te.thread_axis("threadIdx.y")
+
+        max_threads = 16
+        ib.scope_attr(bx, "thread_extent", tvm.tir.indexdiv(nx + max_threads - 1, max_threads))
+        ib.scope_attr(tx, "thread_extent", max_threads)
+        ib.scope_attr(by, "thread_extent", tvm.tir.indexdiv(ny + max_threads - 1, max_threads))
+        ib.scope_attr(ty, "thread_extent", max_threads)
+
+        tidx = bx * max_threads + tx
+        tidy = by * max_threads + ty
+
+        with ib.if_scope(tvm.tir.all(tidx < nx, tidy < ny)):
+            mandel(ib, tidy, tidx, C)
+
+        body = ib.get()
+
+        return body
+
+    ref = mandel_ref()
+
+    def check_target(target, ir):
+        if not tvm.testing.device_enabled(target):
+            return
+
+        C = te.extern(
+            shape,
+            [],
+            lambda ins, outs: ir(outs[0]),
+            name="mandel_ir",
+            dtype="float32",
+        )
+        s = te.create_schedule(C.op)
+
+        with tvm.transform.PassContext(opt_level=3):
+            func = tvm.build(s, [C], target)
+
+        ctx = tvm.context(target, 0)
+        c = tvm.nd.array(np.zeros(shape, dtype=C.dtype), ctx)
+        func(c)
+        tvm.testing.assert_allclose(c.asnumpy(), ref, rtol=1e-5, atol=1e-5)
+
+    check_target("llvm", mandel_ir_cpu)
+    check_target("npvtx", mandel_ir_gpu)
+    check_target("cuda", mandel_ir_gpu)
+
+
+def test_while_binary_search():
+    def binary_search(ib, n, i, Aptr, Bptr, Cptr):
+        lo = ib.allocate("int32", (1,), name="lo", scope="local")
+        hi = ib.allocate("int32", (1,), name="hi", scope="local")
+
+        lo[0] = 0
+        hi[0] = n
+        v = Bptr[i]
+
+        with ib.while_loop(lo[0] < hi[0]):
+            mid = lo[0] + (hi[0] - lo[0] >> 1)
+            with ib.if_scope(Aptr[mid] < v):
+                lo[0] = mid + 1
+            with ib.else_scope():
+                hi[0] = mid
+
+        Cptr[i] = lo[0]
+
+    def searchsorted_ir_cpu(A, B, C, n):
+        ib = tvm.tir.ir_builder.create()
+        Aptr = ib.buffer_ptr(A)
+        Bptr = ib.buffer_ptr(B)
+        Cptr = ib.buffer_ptr(C)
+
+        with ib.for_range(0, n, name="i", kind="parallel") as i:
+            binary_search(ib, n, i, Aptr, Bptr, Cptr)
+
+        body = ib.get()
+
+        return body
+
+    def searchsorted_ir_gpu(A, B, C, n):
+        ib = tvm.tir.ir_builder.create()
+        Aptr = ib.buffer_ptr(A)
+        Bptr = ib.buffer_ptr(B)
+        Cptr = ib.buffer_ptr(C)
+
+        bx = te.thread_axis("blockIdx.x")
+        tx = te.thread_axis("threadIdx.x")
+        max_threads = 32
+        ib.scope_attr(bx, "thread_extent", tvm.tir.indexdiv(n + max_threads - 1, max_threads))
+        ib.scope_attr(tx, "thread_extent", max_threads)
+        tid = bx * max_threads + tx
+
+        with ib.if_scope(tid < n):
+            binary_search(ib, n, tid, Aptr, Bptr, Cptr)
+
+        body = ib.get()
+
+        return body
+
+    n = 1024
+    dtype = "float32"
+    A = te.placeholder((n,), name="A", dtype=dtype)
+    B = te.placeholder((n,), name="B", dtype=dtype)
+
+    def check_target(target, ir):
+        if not tvm.testing.device_enabled(target):
+            return
+
+        C = te.extern(
+            A.shape,
+            [A, B],
+            lambda ins, outs: ir(ins[0], ins[1], outs[0], n),
+            name="searchsorted_ir",
+            dtype="int32",
+        )
+        s = te.create_schedule(C.op)
+
+        with tvm.transform.PassContext(opt_level=3):
+            func = tvm.build(s, [A, B, C], target)
+
+        ctx = tvm.context(target, 0)
+        a_np = np.random.uniform(size=n).astype(A.dtype)
+        b_np = np.random.uniform(size=n).astype(B.dtype)
+        a_np = np.sort(a_np)
+        a = tvm.nd.array(a_np, ctx)
+        b = tvm.nd.array(b_np, ctx)
+        c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)
+        func(a, b, c)
+        ref = np.searchsorted(a_np, b_np)
+        tvm.testing.assert_allclose(c.asnumpy(), ref)
+
+    check_target("llvm", searchsorted_ir_cpu)
+    check_target("cuda", searchsorted_ir_gpu)
+    check_target("nvptx", searchsorted_ir_gpu)
+
+
 if __name__ == "__main__":
     test_prefetch()
     test_if()
     test_for()
     test_cpu()
     test_gpu()
+    test_while_vectorize()
+    test_while_collatz()
+    test_while_mandel()
+    test_while_binary_search()
diff --git a/tests/python/unittest/test_tir_transform_storage_rewrite.py b/tests/python/unittest/test_tir_transform_storage_rewrite.py
index 49adcfb568a7..dbe7e04700d9 100644
--- a/tests/python/unittest/test_tir_transform_storage_rewrite.py
+++ b/tests/python/unittest/test_tir_transform_storage_rewrite.py
@@ -298,6 +298,76 @@ def test_parallel_alloc():
     assert isinstance(body.body.body.body.body, tvm.tir.Allocate)
 
 
+def test_while_alloc():
+    def get_mod(kind="serial"):
+        ib = tvm.tir.ir_builder.create()
+        n = te.var("n")
+        with ib.for_range(0, n, name="i", kind=kind) as i:
+            j = ib.allocate("int32", 1, name="j", scope="global")
+            j[0] = 0
+            with ib.while_loop(j[0] < 10):
+                A = ib.allocate("float32", n, name="A", scope="global")
+                A[j[0]] = A[j[0]] + 2
+                j[0] += j[0] + 1
+
+        body = ib.get()
+        return tvm.IRModule.from_expr(tvm.tir.PrimFunc([n], body))
+
+    mod = get_mod(kind="parallel")
+    # parallel (i, 0, n) {
+    #   // attr [j] storage_scope = "global"
+    #   allocate j[int32 * 1]
+    #   j[0] = 0
+    #   while((j[0] < 10)){
+    #     // attr [A] storage_scope = "global"
+    #     allocate A[float32 * n]
+    #     A[j[0]] = (A[j[0]] + 2f)
+    #     j[0] = (j[0] + (j[0] + 1))
+    #   }
+    # }
+    body = tvm.tir.transform.StorageRewrite()(mod)["main"].body
+    # parallel (i, 0, n) {
+    #   // attr [j] storage_scope = "global"
+    #   allocate j[int32 * 1]
+    #   // attr [A] storage_scope = "global"
+    #   allocate A[float32 * n]
+    #   j[0] = 0
+    #   while((j[0] < 10)){
+    #     A[j[0]] = (A[j[0]] + 2f)
+    #     j[0] = (j[0] + (j[0] + 1))
+    #   }
+    # }
+    assert isinstance(body.body.body, tvm.tir.Allocate)  # j
+    assert isinstance(body.body.body.body.body, tvm.tir.Allocate)  # A
+
+    mod = get_mod(kind="serial")
+    # for (i, 0, n) {
+    #   // attr [j] storage_scope = "global"
+    #   allocate j[int32 * 1]
+    #   j[0] = 0
+    #   while((j[0] < 10)){
+    #     // attr [A] storage_scope = "global"
+    #     allocate A[float32 * n]
+    #     A[j[0]] = (A[j[0]] + 2f)
+    #     j[0] = (j[0] + (j[0] + 1))
+    #   }
+    # }
+    body = tvm.tir.transform.StorageRewrite()(mod)["main"].body
+    # // attr [j] storage_scope = "global"
+    # allocate j[int32 * 1]
+    # // attr [A] storage_scope = "global"
+    # allocate A[float32 * n]
+    # for (i, 0, n) {
+    #   j[0] = 0
+    #   while((j[0] < 10)){
+    #     A[j[0]] = (A[j[0]] + 2f)
+    #     j[0] = (j[0] + (j[0] + 1))
+    #   }
+    # }
+    assert isinstance(body.body, tvm.tir.Allocate)  # j
+    assert isinstance(body.body.body.body, tvm.tir.Allocate)  # A
+
+
 def test_inplace_rule2(scope_tb="local_TB2", max_bits=1024 * 1024 * 1024):
     # Test Buffer
     register_mem(scope_tb, max_bits)
@@ -576,6 +646,7 @@ def verify(n):
     test_alloc_different_dtypes()
     test_inplace_rule()
     test_parallel_alloc()
+    test_while_alloc()
     test_storage_combine()
     test_storage_share_gpu()
     test_inplace_rule2()
diff --git a/tests/python/unittest/test_tir_transform_vectorize.py b/tests/python/unittest/test_tir_transform_vectorize.py
index 5ae47e01f681..b1e580957b24 100644
--- a/tests/python/unittest/test_tir_transform_vectorize.py
+++ b/tests/python/unittest/test_tir_transform_vectorize.py
@@ -158,6 +158,53 @@ def test_vectorize_if_then_else():
     assert isinstance(stmt.body.value.args[2], tvm.tir.Broadcast)
 
 
+def test_vectorize_while_fail():
+    """A while loop inside a vectorized loop should fail."""
+
+    n = 64
+    num_iter = 10
+
+    def test_ir(A, B, C):
+        ib = tvm.tir.ir_builder.create()
+        n = C.shape[0]
+        A = ib.buffer_ptr(A)
+        B = ib.buffer_ptr(B)
+        C = ib.buffer_ptr(C)
+        i = ib.allocate("int32", (1,), name="i", scope="local")
+        i[0] = 0
+
+        with ib.for_range(0, n) as j:
+            C[j] = 0.0
+
+        with ib.for_range(0, n, kind="vectorize") as j:
+            with ib.while_loop(i[0] < num_iter):
+                C[j] += A[j] + B[j]
+                i[0] += 1
+
+        return ib.get()
+
+    dtype = "float32"
+    A = te.placeholder((n,), name="A", dtype=dtype)
+    B = te.placeholder((n,), name="B", dtype=dtype)
+
+    C = te.extern(
+        (n,),
+        [A, B],
+        lambda ins, outs: test_ir(ins[0], ins[1], outs[0]),
+        name="while_vectorize",
+        dtype=dtype,
+    )
+    s = te.create_schedule(C.op)
+
+    try:
+        tvm.lower(s, [A, B, C], "llvm")
+        assert False
+    except tvm.error.TVMError as e:
+        error_msg = str(e).split("\n")[-1]
+        expected = "A while loop inside a vectorized loop not supported"
+        assert expected in error_msg
+
+
 if __name__ == "__main__":
     test_vectorize_vector()
     test_vectorize_with_if()
@@ -166,3 +213,4 @@ def test_vectorize_if_then_else():
     test_vectorize_with_le_cond()
     test_vectorize_with_ge_cond()
     test_vectorize_let()
+    test_vectorize_while_fail()

From 91b6b3f389097aa7c7073e36b97892d52efcd1be Mon Sep 17 00:00:00 2001
From: Tristan Konolige <tristan.konolige@gmail.com>
Date: Tue, 2 Mar 2021 20:02:57 -0800
Subject: [PATCH 264/357] [RELAY] Modify some passes to not stack overflow on
 many lets. (#7558)

* [RELAY] Modify some passes to not stack overflow on many lets.

Passes modified:
- inline primitives
- dead code
- lambda lift

* one fix

* small fix

* .at -> []

* fix
---
 include/tvm/relay/expr_functor.h          |  3 +-
 src/relay/backend/vm/compiler.cc          | 15 ++++---
 src/relay/backend/vm/inline_primitives.cc | 15 ++++++-
 src/relay/backend/vm/lambda_lift.cc       | 35 +++++++++++------
 src/relay/transforms/dead_code.cc         | 48 +++++++++++++++++------
 5 files changed, 85 insertions(+), 31 deletions(-)

diff --git a/include/tvm/relay/expr_functor.h b/include/tvm/relay/expr_functor.h
index d53658f87f40..e6eec61a7e9d 100644
--- a/include/tvm/relay/expr_functor.h
+++ b/include/tvm/relay/expr_functor.h
@@ -88,7 +88,8 @@ class ExprFunctor<R(const Expr& n, Args...)> {
    * \return The result of the call
    */
   virtual R VisitExpr(const Expr& n, Args... args) {
-    ICHECK(n.defined());
+    ICHECK(n.defined()) << "Found null pointer node while traversing AST. The previous pass may "
+                           "have generated invalid data.";
     static FType vtable = InitVTable();
     return vtable(n, this, std::forward<Args>(args)...);
   }
diff --git a/src/relay/backend/vm/compiler.cc b/src/relay/backend/vm/compiler.cc
index 0718191a2ff6..251a55f10b72 100644
--- a/src/relay/backend/vm/compiler.cc
+++ b/src/relay/backend/vm/compiler.cc
@@ -376,11 +376,16 @@ class VMFunctionCompiler : ExprFunctor<void(const Expr& expr)> {
     CompileMatch(match);
   }
 
-  void VisitExpr_(const LetNode* let_node) {
-    DLOG(INFO) << PrettyPrint(let_node->value);
-    this->VisitExpr(let_node->value);
-    var_register_map_.insert({let_node->var, this->last_register_});
-    this->VisitExpr(let_node->body);
+  void VisitExpr_(const LetNode* l) final {
+    Expr let_binding = GetRef<Expr>(l);
+    const LetNode* let;
+    while ((let = let_binding.as<LetNode>())) {
+      VisitExpr(let->value);
+      var_register_map_.insert({let->var, this->last_register_});
+      let_binding = let->body;
+    }
+
+    VisitExpr(let_binding);
   }
 
   void VisitExpr_(const TupleGetItemNode* get_node) {
diff --git a/src/relay/backend/vm/inline_primitives.cc b/src/relay/backend/vm/inline_primitives.cc
index 650df99645e7..eb848eb7a828 100644
--- a/src/relay/backend/vm/inline_primitives.cc
+++ b/src/relay/backend/vm/inline_primitives.cc
@@ -58,8 +58,19 @@ struct PrimitiveInliner : ExprMutator {
   explicit PrimitiveInliner(const IRModule& module) : module_(module) {}
 
   Expr VisitExpr_(const LetNode* let_node) {
-    var_map.insert({let_node->var, VisitExpr(let_node->value)});
-    return ExprMutator::VisitExpr_(let_node);
+    auto pre_visit = [this](const LetNode* op) {
+      var_map.insert({op->var, this->VisitExpr(op->value)});
+    };
+    auto post_visit = [this](const LetNode* op) {
+      // Rely on the Memoizer to cache pre-visit values
+      Expr value = this->VisitExpr(op->value);
+      // Visit body and cache the op
+      Expr body = this->VisitExpr(op->body);
+      auto expr = GetRef<Expr>(op);
+      this->memo_[expr] = Let(op->var, value, body);
+    };
+    ExpandANormalForm(let_node, pre_visit, post_visit);
+    return memo_[GetRef<Expr>(let_node)];
   }
 
   Expr VisitExpr_(const CallNode* call) {
diff --git a/src/relay/backend/vm/lambda_lift.cc b/src/relay/backend/vm/lambda_lift.cc
index fe9a544a719e..cc530a10188e 100644
--- a/src/relay/backend/vm/lambda_lift.cc
+++ b/src/relay/backend/vm/lambda_lift.cc
@@ -61,19 +61,30 @@ class LambdaLifter : public ExprMutator {
   explicit LambdaLifter(const IRModule& module) : module_(module) {}
 
   Expr VisitExpr_(const LetNode* let_node) final {
-    bool is_lambda = false;
-    if (auto func = let_node->value.as<FunctionNode>()) {
-      if (!func->HasNonzeroAttr(attr::kPrimitive)) {
-        is_lambda = true;
-        letrec_.push_back(let_node->var);
+    auto pre_visit = [this](const LetNode* op) {
+      bool is_lambda = false;
+      if (auto func = op->value.as<FunctionNode>()) {
+        if (!func->HasNonzeroAttr(attr::kPrimitive)) {
+          is_lambda = true;
+          this->letrec_.push_back(op->var);
+        }
       }
-    }
-    auto value = VisitExpr(let_node->value);
-    if (is_lambda) {
-      letrec_.pop_back();
-    }
-    auto body = VisitExpr(let_node->body);
-    return Let(let_node->var, value, body);
+      Expr value = this->VisitExpr(op->value);
+
+      if (is_lambda) {
+        this->letrec_.pop_back();
+      }
+    };
+    auto post_visit = [this](const LetNode* op) {
+      // Rely on the Memoizer to cache pre-visit values
+      Expr value = this->VisitExpr(op->value);
+      // Visit body and cache the op
+      Expr body = this->VisitExpr(op->body);
+      auto expr = GetRef<Expr>(op);
+      this->memo_[expr] = Let(op->var, value, body);
+    };
+    ExpandANormalForm(let_node, pre_visit, post_visit);
+    return memo_[GetRef<Expr>(let_node)];
   }
 
   Expr VisitExpr_(const CallNode* call_node) final {
diff --git a/src/relay/transforms/dead_code.cc b/src/relay/transforms/dead_code.cc
index 2e7c08a684dc..26624e438b8a 100644
--- a/src/relay/transforms/dead_code.cc
+++ b/src/relay/transforms/dead_code.cc
@@ -46,10 +46,16 @@ class FindDef : private ExprVisitor {
   VarMap<Expr> expr_map_;
 
   void VisitExpr_(const LetNode* l) final {
-    ICHECK_EQ(expr_map_.count(l->var), 0);
-    expr_map_[l->var] = l->value;
-    VisitExpr(l->value);
-    VisitExpr(l->body);
+    auto pre_visit = [this](const LetNode* op) {
+      ICHECK_EQ(expr_map_.count(op->var), 0);
+      expr_map_[op->var] = op->value;
+      this->VisitExpr(op->value);
+    };
+    auto post_visit = [this](const LetNode* op) {
+      this->VisitExpr(op->body);
+      this->visit_counter_[op] += 1;
+    };
+    ExpandANormalForm(l, pre_visit, post_visit);
   }
 
   friend CalcDep;
@@ -81,12 +87,24 @@ class Eliminator : private ExprMutator {
   }
 
   Expr VisitExpr_(const LetNode* op) final {
-    Var v = op->var;
-    if (HasLet(v)) {
-      return Let(v, VisitExpr(op->value), VisitExpr(op->body));
-    } else {
-      return VisitExpr(op->body);
-    }
+    auto pre_visit = [this](const LetNode* op) {
+      if (HasLet(op->var)) {
+        Expr value = this->VisitExpr(op->value);
+      }
+    };
+    auto post_visit = [this](const LetNode* op) {
+      Expr body = this->VisitExpr(op->body);
+      auto expr = GetRef<Expr>(op);
+      Var v = op->var;
+      if (HasLet(v)) {
+        Expr value = this->VisitExpr(op->value);
+        this->memo_[expr] = Let(v, value, body);
+      } else {
+        this->memo_[expr] = body;
+      }
+    };
+    ExpandANormalForm(op, pre_visit, post_visit);
+    return memo_[GetRef<Expr>(op)];
   }
 };
 
@@ -121,7 +139,15 @@ class CalcDep : protected MixedModeVisitor {
     }
   }
 
-  void VisitExpr_(const LetNode* l) final { VisitExpr(l->body); }
+  void VisitExpr_(const LetNode* l) final {
+    Expr let_binding = GetRef<Expr>(l);
+    const LetNode* let;
+    while ((let = let_binding.as<LetNode>())) {
+      let_binding = let->body;
+      visit_counter_[l] += 1;
+    }
+    VisitExpr(let_binding);
+  }
 
   void VisitExpr_(const VarNode* v) final {
     Var var = GetRef<Var>(v);

From 67bba9032577025419dc0e110fdf4b08c5f66895 Mon Sep 17 00:00:00 2001
From: Alexander Pivovarov <pivovaa@amazon.com>
Date: Tue, 2 Mar 2021 21:45:51 -0800
Subject: [PATCH 265/357] [torch] Add linear operator support (#7569)

---
 python/tvm/relay/frontend/pytorch.py          | 15 ++++++++
 tests/python/frontend/pytorch/test_forward.py | 34 +++++++++++++++++++
 2 files changed, 49 insertions(+)

diff --git a/python/tvm/relay/frontend/pytorch.py b/python/tvm/relay/frontend/pytorch.py
index 3c61749fc203..dcf2f08caeef 100644
--- a/python/tvm/relay/frontend/pytorch.py
+++ b/python/tvm/relay/frontend/pytorch.py
@@ -1374,6 +1374,20 @@ def avg_pool3d(self, inputs, input_types):
             count_include_pad=count_include_pad,
         )
 
+    def linear(self, inputs, input_types):
+        # https://pytorch.org/docs/stable/nn.functional.html#linear
+        # 0 - input
+        # 1 - weight
+        bias = inputs[2]
+        mm_out = self.matmul(inputs[:2], input_types[:2])
+        if isinstance(bias, _expr.Expr):
+            bias_ndims = len(self.infer_shape_with_prelude(bias))
+            if bias_ndims == 1:
+                return _op.nn.bias_add(mm_out, bias)
+            mm_dtype = self.infer_type_with_prelude(mm_out).dtype
+            return self.add([mm_out, bias], [mm_dtype, input_types[2]])
+        return mm_out
+
     def dropout(self, inputs, input_types):
         data = inputs[0]
         rate = float(inputs[1])
@@ -2289,6 +2303,7 @@ def create_convert_map(self):
             "aten::softplus": self.softplus,
             "aten::avg_pool2d": self.avg_pool2d,
             "aten::avg_pool3d": self.avg_pool3d,
+            "aten::linear": self.linear,
             "aten::dropout": self.dropout,
             "aten::dropout_": self.dropout,
             "aten::feature_dropout": self.dropout,
diff --git a/tests/python/frontend/pytorch/test_forward.py b/tests/python/frontend/pytorch/test_forward.py
index 9f035ade7a21..54bf2fd49acb 100644
--- a/tests/python/frontend/pytorch/test_forward.py
+++ b/tests/python/frontend/pytorch/test_forward.py
@@ -24,6 +24,7 @@
 import torch
 import torchvision
 from torch.nn import Module
+from torch.nn import functional as F
 import tvm
 from tvm import relay
 from tvm.contrib import graph_runtime
@@ -1459,6 +1460,39 @@ def forward(self, *args):
     assert not any([op.name == "multiply" for op in list_ops(mod["main"])])
 
 
+@tvm.testing.uses_gpu
+def test_forward_linear():
+    torch.set_grad_enabled(False)
+
+    class Linear(Module):
+        def forward(self, input, weight, bias):
+            return F.linear(input, weight, bias)
+
+    class LinearNoBias(Module):
+        def forward(self, input, weight):
+            return F.linear(input, weight)
+
+    input2d = torch.rand([2, 2]).float()
+    weight1d = torch.rand([2]).float()
+    weight2d = torch.rand([2, 2]).float()
+    bias1d = torch.rand([2]).float()
+    bias2d = torch.rand([2, 2]).float()
+    # 2D input, 2D weight, 1D bias
+    verify_model(Linear(), input_data=[input2d, weight2d, bias1d])
+    # 2D input, 2D weight, 2D bias
+    verify_model(Linear(), input_data=[input2d, weight2d, bias2d])
+    # 2D input, 2D weight, no bias
+    verify_model(LinearNoBias(), input_data=[input2d, weight2d])
+    # 2D input, 1D weight, 1D bias is not supported by torch.linear()
+    # 2D input, 1D weight, no bias
+    verify_model(LinearNoBias(), input_data=[input2d, weight1d])
+    # TODO: Add the following cases when matmul(1D, _) is supported by TVM
+    # 1D input, 2D weight, 1D bias
+    # 1D input, 2D weight, no bias
+    # 1D input, 1D weight, scalar bias
+    # 1D input, 1D weight, no bias
+
+
 @tvm.testing.uses_gpu
 def test_forward_dropout():
     torch.set_grad_enabled(False)

From 37053e1708c6565c8a82c31c0ffc78e594bfe3b0 Mon Sep 17 00:00:00 2001
From: lee <cy-l@live.com>
Date: Thu, 4 Mar 2021 01:30:31 +0800
Subject: [PATCH 266/357] [Tensorize] Support conds depend on outer loop vars
 inside tensorize scope (#7497)

* [Tensorize] Support conds depend on outer loop vars inside tensorize scope

* Reformat
---
 src/te/operation/op_utils.cc                  |  8 +++
 src/te/operation/op_utils.h                   | 10 +++-
 src/te/operation/tensorize.cc                 |  6 +-
 .../unittest/test_te_schedule_tensorize.py    | 57 ++++++++++++++++---
 4 files changed, 69 insertions(+), 12 deletions(-)

diff --git a/src/te/operation/op_utils.cc b/src/te/operation/op_utils.cc
index 32ffccbbec1f..b3897e142545 100644
--- a/src/te/operation/op_utils.cc
+++ b/src/te/operation/op_utils.cc
@@ -243,6 +243,14 @@ Stmt Substitute(Stmt s, const std::unordered_map<IterVar, PrimExpr>& value_map)
   return tir::Substitute(s, init);
 }
 
+PrimExpr Substitute(PrimExpr s, const std::unordered_map<IterVar, PrimExpr>& value_map) {
+  std::unordered_map<const VarNode*, PrimExpr> init;
+  for (const auto& kv : value_map) {
+    init[kv.first->var.get()] = kv.second;
+  }
+  return tir::Substitute(s, init);
+}
+
 IterVarType ForKindToIterVarType(tir::ForKind kind) {
   switch (kind) {
     case ForKind::kSerial:
diff --git a/src/te/operation/op_utils.h b/src/te/operation/op_utils.h
index e6bf2caae6e0..02f4a860a01d 100644
--- a/src/te/operation/op_utils.h
+++ b/src/te/operation/op_utils.h
@@ -73,7 +73,7 @@ std::vector<Stmt> MakeIfNest(const std::vector<PrimExpr>& predicates);
  */
 Stmt ReplaceTensor(Stmt stmt, const std::unordered_map<Tensor, Tensor>& replace);
 /*!
- * \brief Replace the tensor reference (especially in Call's) in stmt by the replace map.
+ * \brief Replace the tensor reference (especially in Call's) in primExpr by the replace map.
  * \param expr The expression to be processed.
  * \param replace The replacement rule.
  */
@@ -87,6 +87,14 @@ PrimExpr ReplaceTensor(PrimExpr expr, const std::unordered_map<Tensor, Tensor>&
  */
 Stmt Substitute(Stmt stmt, const std::unordered_map<IterVar, PrimExpr>& value_map);
 
+/*!
+ * \brief Substitute the variables of primExpr by value map.
+ * \param expr the expression to be processed.
+ * \param value_map The value map.
+ * \return Substituted result.
+ */
+PrimExpr Substitute(PrimExpr expr, const std::unordered_map<IterVar, PrimExpr>& value_map);
+
 /*!
  * \brief Converts Halide ForKind to its corresponding IterVarType
  * \param kind The ForKind to be converted
diff --git a/src/te/operation/tensorize.cc b/src/te/operation/tensorize.cc
index bfd1ec579818..ea713220eddd 100644
--- a/src/te/operation/tensorize.cc
+++ b/src/te/operation/tensorize.cc
@@ -311,6 +311,7 @@ Array<PrimExpr> MatchTensorizeBody(const ComputeOpNode* self, const Stage& stage
 }
 
 void VerifyTensorizeBody(const ComputeOpNode* self, const Stage& stage,
+                         const std::unordered_map<IterVar, PrimExpr>& value_map,
                          const std::unordered_map<IterVar, Range>& dom_map,
                          const std::unordered_map<IterVar, Range>& out_dom,
                          const std::unordered_map<Tensor, Array<Range> >& in_region,
@@ -327,7 +328,8 @@ void VerifyTensorizeBody(const ComputeOpNode* self, const Stage& stage,
 
   for (size_t i = 0; i < body.size(); ++i) {
     PrimExpr lhs = ana.Simplify(body[i]);
-    PrimExpr rhs = ana.Simplify(intrin_compute->body[i]);
+    // run substitution because the intrin body could depend on outer loop vars.
+    PrimExpr rhs = ana.Simplify(Substitute(intrin_compute->body[i], value_map));
     if (lhs.dtype() != rhs.dtype()) {
       LOG(FATAL) << "Failed to match the data type with TensorIntrin " << intrin->name
                  << "'s declaration "
@@ -349,7 +351,7 @@ Stmt MakeTensorize(const ComputeOpNode* self, const Stage& stage,
   ICHECK(intrin.defined());
   ComputeLoopNest n = ComputeLoopNest::Create(self, stage, dom_map, debug_keep_trivial_loop);
   VerifyTensorizeLoopNest(self, stage, n, tloc);
-  VerifyTensorizeBody(self, stage, dom_map, out_dom, in_region, intrin);
+  VerifyTensorizeBody(self, stage, n.main_vmap, dom_map, out_dom, in_region, intrin);
   // Start bind data.
   Stmt nop = Evaluate(0);
   std::vector<Stmt> input_bind_nest, output_bind_nest;
diff --git a/tests/python/unittest/test_te_schedule_tensorize.py b/tests/python/unittest/test_te_schedule_tensorize.py
index 83a5d30bb90d..fdafdb74fc0b 100644
--- a/tests/python/unittest/test_te_schedule_tensorize.py
+++ b/tests/python/unittest/test_te_schedule_tensorize.py
@@ -18,14 +18,22 @@
 from tvm import te
 
 
-def intrin_vadd(n):
+def intrin_vadd(xo, m, n):
     x = te.placeholder((n,), name="vx")
     y = te.placeholder((n,), name="vy")
-    z = te.compute(x.shape, lambda i: x[i] + y[i], name="z")
+    if m % n == 0:
+        body = lambda i: x[i] + y[i]
+    else:
+        body = lambda i: tvm.tir.Select(
+            xo * n + i < m, x[i] + y[i], tvm.tir.const(0, dtype=x.dtype)
+        )
+    z = te.compute(x.shape, body, name="z")
 
     def intrin_func(ins, outs):
         xx, yy = ins
         zz = outs[0]
+        # special handle needed to tackle tail loop part when m % n != 0
+        # here is tvm.min(n, m - xo * n)
         return tvm.tir.call_packed("vadd", xx, yy, zz)
 
     buffer_params = {"offset_factor": 16}
@@ -84,15 +92,17 @@ def intrin_func(ins, outs):
 
 
 def test_tensorize_vadd():
-    m = 128
-    x = te.placeholder((m,), name="x")
-    y = te.placeholder((m,), name="y")
-    z = te.compute(x.shape, lambda i: x[i] + y[i], name="z")
+    def add(m):
+        x = te.placeholder((m,), name="x")
+        y = te.placeholder((m,), name="y")
+        z = te.compute(x.shape, lambda i: x[i] + y[i], name="z")
+        return x, y, z
 
-    def check(factor):
+    def check(m, factor):
+        x, y, z = add(m)
         s = te.create_schedule(z.op)
         xo, xi = s[z].split(z.op.axis[0], factor=factor)
-        vadd = intrin_vadd(factor)
+        vadd = intrin_vadd(xo, m, factor)
         s[z].tensorize(xi, vadd)
         s = s.normalize()
         dom_map = tvm.te.schedule.InferBound(s)
@@ -108,7 +118,36 @@ def check(factor):
         stmt = tvm.te.schedule.ScheduleOps(s, dom_map)
         tvm.lower(s, [x, y, z])
 
-    check(16)
+    def check_cache_write(m, factor):
+        x, y, z = add(m)
+        s = te.create_schedule(z.op)
+        _, _ = s[z].split(z.op.axis[0], factor=factor)
+
+        z_global = s.cache_write(z, "global")
+        xo, xi = z_global.op.axis
+
+        vadd = intrin_vadd(xo, m, factor)
+        s[z_global].tensorize(xi, vadd)
+        s = s.normalize()
+        dom_map = tvm.te.schedule.InferBound(s)
+        finfer = tvm.get_global_func("test.op.InferTensorizeRegion")
+        out_dom, in_dom = finfer(s[z_global], dom_map)
+        # outer loop var will be rebased, so min value is the new loop var and extent is 1
+        assert tvm.ir.structural_equal(out_dom[xo].extent, 1)
+        assert isinstance(out_dom[xo].min, tvm.tir.Var)
+        assert xo.var.name == out_dom[xo].min.name
+
+        fmatch = tvm.get_global_func("test.op.MatchTensorizeBody")
+        body = fmatch(s[z_global], out_dom, in_dom, vadd)[0]
+        ana = tvm.arith.Analyzer()
+        vars = tvm.runtime.convert({xo.var: out_dom[xo].min})
+        vadd_body = tvm.tir.stmt_functor.substitute(vadd.op.body[0], vars)
+        assert tvm.ir.structural_equal(ana.simplify(body), ana.simplify(vadd_body))
+        stmt = tvm.te.schedule.ScheduleOps(s, dom_map)
+        tvm.lower(s, [x, y, z])
+
+    check(128, 16)
+    check_cache_write(129, 16)
 
 
 def test_tensorize_matmul():

From 296700e6de889874f736daa426f6488bfcd8a453 Mon Sep 17 00:00:00 2001
From: Jorn Tuyls <jtuyls@users.noreply.github.com>
Date: Wed, 3 Mar 2021 17:34:59 +0000
Subject: [PATCH 267/357] [CI][VitisAI] Update CI Vitis AI PyXIR version
 (#7575)

* Update Vitis AI CI PyXIR version to v0.1.6

* Add --depth 1 to PyXIR clone command
---
 docker/install/ubuntu_install_vitis_ai_packages_ci.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docker/install/ubuntu_install_vitis_ai_packages_ci.sh b/docker/install/ubuntu_install_vitis_ai_packages_ci.sh
index c34ed3addce2..774d85dcf68a 100644
--- a/docker/install/ubuntu_install_vitis_ai_packages_ci.sh
+++ b/docker/install/ubuntu_install_vitis_ai_packages_ci.sh
@@ -23,7 +23,7 @@ set -o pipefail
 export PYXIR_HOME=/opt/pyxir
 mkdir "$PYXIR_HOME"
 
-pip3 install progressbar
+pip3 install progressbar h5py==2.10.0
 
-git clone --recursive --branch v0.1.3 https://github.com/Xilinx/pyxir.git "${PYXIR_HOME}"
+git clone --recursive --branch v0.1.6 --depth 1 https://github.com/Xilinx/pyxir.git "${PYXIR_HOME}"
 cd "${PYXIR_HOME}" && python3 setup.py install

From dccc1ab6ca5b98e0f9b12542e250abcd959ff36d Mon Sep 17 00:00:00 2001
From: masahi <masahi129@gmail.com>
Date: Thu, 4 Mar 2021 03:33:34 +0900
Subject: [PATCH 268/357] [SPIR-V] Add SPIR-V lowering for While node (#7574)

* Add SPIR-V lowering for WhileNode

* test vulkan in while loop tests
---
 src/target/spirv/codegen_spirv.cc            | 28 ++++++++++++++++++++
 src/target/spirv/codegen_spirv.h             |  1 +
 tests/python/unittest/test_tir_ir_builder.py |  2 ++
 3 files changed, 31 insertions(+)

diff --git a/src/target/spirv/codegen_spirv.cc b/src/target/spirv/codegen_spirv.cc
index 51d136d5510e..6311b435f197 100644
--- a/src/target/spirv/codegen_spirv.cc
+++ b/src/target/spirv/codegen_spirv.cc
@@ -514,6 +514,34 @@ void CodeGenSPIRV::VisitStmt_(const ForNode* op) {
   builder_->StartLabel(merge_label);
 }
 
+void CodeGenSPIRV::VisitStmt_(const WhileNode* op) {
+  spirv::Label head_label = builder_->NewLabel();
+  spirv::Label body_label = builder_->NewLabel();
+  spirv::Label continue_label = builder_->NewLabel();
+  spirv::Label merge_label = builder_->NewLabel();
+  builder_->MakeInst(spv::OpBranch, head_label);
+
+  // Loop head
+  builder_->StartLabel(head_label);
+  spirv::Value loop_cond = MakeValue(op->condition);
+  uint32_t control = spv::LoopControlMaskNone;
+  builder_->MakeInst(spv::OpLoopMerge, merge_label, continue_label, control);
+  builder_->MakeInst(spv::OpBranchConditional, loop_cond, body_label, merge_label,
+                     weight_likely_branch_, 1);
+
+  // loop body
+  builder_->StartLabel(body_label);
+  this->VisitStmt(op->body);
+  builder_->MakeInst(spv::OpBranch, continue_label);
+
+  // loop continue
+  builder_->StartLabel(continue_label);
+  builder_->MakeInst(spv::OpBranch, head_label);
+
+  // loop merge
+  builder_->StartLabel(merge_label);
+}
+
 void CodeGenSPIRV::VisitStmt_(const IfThenElseNode* op) {
   spirv::Value cond = MakeValue(op->condition);
   spirv::Label then_label = builder_->NewLabel();
diff --git a/src/target/spirv/codegen_spirv.h b/src/target/spirv/codegen_spirv.h
index be755641c8a5..1e80fcc4a931 100644
--- a/src/target/spirv/codegen_spirv.h
+++ b/src/target/spirv/codegen_spirv.h
@@ -93,6 +93,7 @@ class CodeGenSPIRV : public ExprFunctor<spirv::Value(const PrimExpr&)>,
   // stmt
   void VisitStmt_(const StoreNode* op) override;
   void VisitStmt_(const ForNode* op) override;
+  void VisitStmt_(const WhileNode* op) override;
   void VisitStmt_(const IfThenElseNode* op) override;
   void VisitStmt_(const AllocateNode* op) override;
   void VisitStmt_(const AttrStmtNode* op) override;
diff --git a/tests/python/unittest/test_tir_ir_builder.py b/tests/python/unittest/test_tir_ir_builder.py
index 46bc500fc503..8ad5cb63924e 100644
--- a/tests/python/unittest/test_tir_ir_builder.py
+++ b/tests/python/unittest/test_tir_ir_builder.py
@@ -405,6 +405,7 @@ def check_target(target, ir):
     check_target("llvm", mandel_ir_cpu)
     check_target("npvtx", mandel_ir_gpu)
     check_target("cuda", mandel_ir_gpu)
+    check_target("vulkan", mandel_ir_gpu)
 
 
 def test_while_binary_search():
@@ -493,6 +494,7 @@ def check_target(target, ir):
     check_target("llvm", searchsorted_ir_cpu)
     check_target("cuda", searchsorted_ir_gpu)
     check_target("nvptx", searchsorted_ir_gpu)
+    check_target("vulkan", searchsorted_ir_gpu)
 
 
 if __name__ == "__main__":

From 25bf44998ca261e50e6cd25021ecccd8a5a11762 Mon Sep 17 00:00:00 2001
From: Qiang Zhang <johnson9009@163.com>
Date: Thu, 4 Mar 2021 06:14:15 +0800
Subject: [PATCH 269/357] [Relay][Quantization] Fix Bug Which Cause Negative
 Left Shift Op (#7432)

---
 src/relay/quantize/realize.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/relay/quantize/realize.cc b/src/relay/quantize/realize.cc
index 2716c6e65f65..d77ede3acbf9 100644
--- a/src/relay/quantize/realize.cc
+++ b/src/relay/quantize/realize.cc
@@ -165,7 +165,7 @@ Expr QuantizeRealize(const Call& ref_call, const Array<Expr>& new_args, const Ob
                           MakeConstantScalar(cfg->dtype_activation, static_cast<int>(shift_nbit)));
       } else {
         data = LeftShift(data,
-                         MakeConstantScalar(cfg->dtype_activation, static_cast<int>(shift_nbit)));
+                         MakeConstantScalar(cfg->dtype_activation, static_cast<int>(-shift_nbit)));
       }
       data = Clip(data, clip_min_imm, clip_max_imm);
       return QRealizeIntExpr(data, dom_scale, n->dtype);

From 1d5f334a47ae675f20d53d6fb3b11832db75bd9c Mon Sep 17 00:00:00 2001
From: "Steven S. Lyubomirsky" <sslyu@cs.washington.edu>
Date: Wed, 3 Mar 2021 17:24:01 -0500
Subject: [PATCH 270/357] [Relay][bugfix][error reporting] BiasAddRel does not
 check for a negative index being out of bounds (#7554)

---
 src/relay/op/nn/nn.cc                |  4 ++--
 tests/python/relay/test_op_level1.py | 18 ++++++++++--------
 2 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/src/relay/op/nn/nn.cc b/src/relay/op/nn/nn.cc
index 38c33b45936e..0ea71de367fa 100644
--- a/src/relay/op/nn/nn.cc
+++ b/src/relay/op/nn/nn.cc
@@ -61,10 +61,10 @@ bool BiasAddRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
   if (axis < 0) {
     axis = data->shape.size() + axis;
   }
-  if (axis >= static_cast<int>(data->shape.size())) {
+  if (axis >= static_cast<int>(data->shape.size()) || axis < 0) {
     reporter->GetDiagCtx().EmitFatal(Diagnostic::Error(reporter->GetSpan())
                                      << "The axis in bias_add must be in range for the shape; "
-                                     << "attempted to access index " << axis << " of "
+                                     << "attempted to access index " << param->axis << " of "
                                      << PrettyPrint(data->shape));
     return false;
   }
diff --git a/tests/python/relay/test_op_level1.py b/tests/python/relay/test_op_level1.py
index ea5dd6948b11..dfd350486c3b 100644
--- a/tests/python/relay/test_op_level1.py
+++ b/tests/python/relay/test_op_level1.py
@@ -202,14 +202,16 @@ def test_bias_add():
 
 
 def test_bias_add_type_failure():
-    # the axis is out of range
-    try:
-        b_add = relay.nn.bias_add(relay.const(1), relay.const(2), axis=0)
-        run_infer_type(b_add)
-    except tvm._ffi.base.TVMError:
-        pass
-    else:
-        assert False
+    def assert_failure(expr):
+        try:
+            run_infer_type(expr)
+        except tvm._ffi.base.TVMError:
+            return
+        else:
+            assert False
+
+    for axis in (0, -1, -3, 1):
+        assert_failure(relay.nn.bias_add(relay.const(1), relay.const(2), axis=axis))
 
 
 def test_expand_dims_infer_type():

From 980cf803242405d7965034bfa064dd8b82ee2387 Mon Sep 17 00:00:00 2001
From: wangxiang2713 <49302617+wangxiang2713@users.noreply.github.com>
Date: Thu, 4 Mar 2021 06:24:52 +0800
Subject: [PATCH 271/357] compile engine dump tir and shape funcs (#7552)

---
 python/tvm/relay/backend/compile_engine.py | 33 ++++++++++++++++++++++
 src/relay/backend/compile_engine.cc        | 18 ++++++++++++
 2 files changed, 51 insertions(+)

diff --git a/python/tvm/relay/backend/compile_engine.py b/python/tvm/relay/backend/compile_engine.py
index a39f72e2e61f..68397cc0cef6 100644
--- a/python/tvm/relay/backend/compile_engine.py
+++ b/python/tvm/relay/backend/compile_engine.py
@@ -386,6 +386,18 @@ def items(self):
         assert len(res) % 2 == 0
         return [(res[2 * i], res[2 * i + 1]) for i in range(len(res) // 2)]
 
+    def shape_func_items(self):
+        """List items in the shape_func_cache.
+
+        Returns
+        -------
+        item_list : List[Tuple[CCacheKey, CCacheValue]]
+            The list of shape_func_items.
+        """
+        res = _backend._CompileEngineListShapeFuncItems(self)
+        assert len(res) % 2 == 0
+        return [(res[2 * i], res[2 * i + 1]) for i in range(len(res) // 2)]
+
     def get_current_ccache_key(self):
         return _backend._CompileEngineGetCurrentCCacheKey(self)
 
@@ -405,7 +417,28 @@ def dump(self):
             res += "target={}\n".format(k.target)
             res += "use_count={}\n".format(v.use_count)
             res += "func_name={}\n".format(v.cached_func.func_name)
+            res += "----relay function----\n"
+            res += k.source_func.astext() + "\n"
+            res += "----tir function----- \n"
+            res += "inputs={}\n".format(v.cached_func.inputs)
+            res += "outputs={}\n".format(v.cached_func.outputs)
+            res += "function: \n"
+            res += v.cached_func.funcs.astext() + "\n"
+        res += "===================================\n"
+        shape_func_items = self.shape_func_items()
+        res += "%d shape_func_items cached\n" % len(shape_func_items)
+        for k, v in shape_func_items:
+            res += "------------------------------------\n"
+            res += "target={}\n".format(k.target)
+            res += "use_count={}\n".format(v.use_count)
+            res += "func_name={}\n".format(v.cached_func.func_name)
+            res += "----relay function----\n"
             res += k.source_func.astext() + "\n"
+            res += "----tir function----- \n"
+            res += "inputs={}\n".format(v.cached_func.inputs)
+            res += "outputs={}\n".format(v.cached_func.outputs)
+            res += "function: \n"
+            res += v.cached_func.funcs.astext() + "\n"
         res += "===================================\n"
         return res
 
diff --git a/src/relay/backend/compile_engine.cc b/src/relay/backend/compile_engine.cc
index ed09e4f6eb32..ae975a5f3240 100644
--- a/src/relay/backend/compile_engine.cc
+++ b/src/relay/backend/compile_engine.cc
@@ -692,6 +692,17 @@ class CompileEngineImpl : public CompileEngineNode {
     return items;
   }
 
+  // List all items in the shape_func_cache.
+  Array<ObjectRef> ListShapeFuncItems() {
+    std::lock_guard<std::mutex> lock(mutex_);
+    Array<ObjectRef> items;
+    for (auto& kv : shape_func_cache_) {
+      items.push_back(kv.first);
+      items.push_back(kv.second);
+    }
+    return items;
+  }
+
   /*!
    * \brief Get the cache key of the function that is being lowered currently
    * \return the cache key
@@ -882,6 +893,13 @@ TVM_REGISTER_GLOBAL("relay.backend._CompileEngineListItems").set_body_typed([](C
   return ptr->ListItems();
 });
 
+TVM_REGISTER_GLOBAL("relay.backend._CompileEngineListShapeFuncItems")
+    .set_body_typed([](CompileEngine self) {
+      CompileEngineImpl* ptr = dynamic_cast<CompileEngineImpl*>(self.operator->());
+      ICHECK(ptr != nullptr);
+      return ptr->ListShapeFuncItems();
+    });
+
 TVM_REGISTER_GLOBAL("relay.backend._CompileEngineGetCurrentCCacheKey")
     .set_body_typed([](CompileEngine self) {
       CompileEngineImpl* ptr = dynamic_cast<CompileEngineImpl*>(self.operator->());

From 73a0b96c646137d92539661671c9c7743e96ae64 Mon Sep 17 00:00:00 2001
From: Tristan Konolige <tristan.konolige@gmail.com>
Date: Wed, 3 Mar 2021 14:52:54 -0800
Subject: [PATCH 272/357] [RUNTIME] Move Map into runtime (#7570)

* [RUNTIME] Move Map into runtime

This allows us to use Map to store parameters needed at runtime.

* node.{Array|Map} -> runtime.{Array|Map}

* missed some renames
---
 include/tvm/arith/bound.h                     |    2 +-
 include/tvm/arith/pattern.h                   |    2 +-
 include/tvm/ir/adt.h                          |    2 +-
 include/tvm/ir/expr.h                         |    2 +-
 include/tvm/ir/module.h                       |    2 +-
 include/tvm/ir/transform.h                    |    1 -
 include/tvm/ir/type.h                         |    2 +-
 include/tvm/node/attr_registry_map.h          |    2 +-
 include/tvm/node/container.h                  | 1486 -----------------
 include/tvm/node/node.h                       |    1 -
 include/tvm/node/structural_equal.h           |    2 +-
 include/tvm/node/structural_hash.h            |    2 +-
 include/tvm/relay/feature.h                   |    2 +-
 include/tvm/runtime/container.h               | 1408 ++++++++++++++++
 include/tvm/runtime/packed_func.h             |   34 +
 include/tvm/te/tensor.h                       |    2 +-
 include/tvm/tir/buffer.h                      |    2 +-
 include/tvm/tir/expr.h                        |    2 +-
 include/tvm/tir/stmt_functor.h                |    2 +-
 python/tvm/ir/container.py                    |   14 +-
 python/tvm/runtime/object_generic.py          |    4 +-
 rust/tvm-rt/src/array.rs                      |    8 +-
 rust/tvm-rt/src/map.rs                        |   12 +-
 rust/tvm/src/ir/diagnostics/mod.rs            |    2 +-
 src/node/container.cc                         |  363 ----
 src/node/reflection.cc                        |    2 +-
 src/node/serialization.cc                     |    2 +-
 src/node/structural_hash.cc                   |  238 +++
 src/printer/meta_data.h                       |    2 +-
 .../contrib/codegen_json/codegen_json.h       |    1 -
 src/relay/op/nn/nn.h                          |    2 +-
 src/runtime/container.cc                      |   95 ++
 src/runtime/file_utils.cc                     |    1 +
 src/runtime/file_utils.h                      |    2 +
 src/runtime/graph/graph_runtime_factory.cc    |    2 +-
 src/runtime/metadata_module.cc                |    2 +-
 src/support/libinfo.cc                        |    2 +-
 37 files changed, 1819 insertions(+), 1893 deletions(-)
 delete mode 100644 include/tvm/node/container.h
 delete mode 100644 src/node/container.cc

diff --git a/include/tvm/arith/bound.h b/include/tvm/arith/bound.h
index 12b91cc033e5..f8e63ed5857a 100644
--- a/include/tvm/arith/bound.h
+++ b/include/tvm/arith/bound.h
@@ -25,7 +25,7 @@
 
 #include <tvm/arith/int_set.h>
 #include <tvm/ir/expr.h>
-#include <tvm/node/container.h>
+#include <tvm/runtime/container.h>
 #include <tvm/tir/expr.h>
 #include <tvm/tir/stmt.h>
 
diff --git a/include/tvm/arith/pattern.h b/include/tvm/arith/pattern.h
index 301d95636ca4..3f1096b10a8b 100644
--- a/include/tvm/arith/pattern.h
+++ b/include/tvm/arith/pattern.h
@@ -25,7 +25,7 @@
 #define TVM_ARITH_PATTERN_H_
 
 #include <tvm/ir/expr.h>
-#include <tvm/node/container.h>
+#include <tvm/runtime/container.h>
 #include <tvm/tir/expr.h>
 
 namespace tvm {
diff --git a/include/tvm/ir/adt.h b/include/tvm/ir/adt.h
index 466a4f00fd5f..231c04e69821 100644
--- a/include/tvm/ir/adt.h
+++ b/include/tvm/ir/adt.h
@@ -29,8 +29,8 @@
 
 #include <tvm/ir/expr.h>
 #include <tvm/ir/type.h>
-#include <tvm/node/container.h>
 #include <tvm/node/node.h>
+#include <tvm/runtime/container.h>
 #include <tvm/runtime/object.h>
 
 #include <string>
diff --git a/include/tvm/ir/expr.h b/include/tvm/ir/expr.h
index 5302a55bfff3..2295baa0297b 100644
--- a/include/tvm/ir/expr.h
+++ b/include/tvm/ir/expr.h
@@ -26,8 +26,8 @@
 
 #include <tvm/ir/span.h>
 #include <tvm/ir/type.h>
-#include <tvm/node/container.h>
 #include <tvm/node/node.h>
+#include <tvm/runtime/container.h>
 #include <tvm/runtime/object.h>
 
 #include <algorithm>
diff --git a/include/tvm/ir/module.h b/include/tvm/ir/module.h
index d6fb6a20b58a..07d582a298e4 100644
--- a/include/tvm/ir/module.h
+++ b/include/tvm/ir/module.h
@@ -28,8 +28,8 @@
 #include <tvm/ir/expr.h>
 #include <tvm/ir/function.h>
 #include <tvm/ir/type.h>
-#include <tvm/node/container.h>
 #include <tvm/parser/source_map.h>
+#include <tvm/runtime/container.h>
 
 #include <string>
 #include <unordered_map>
diff --git a/include/tvm/ir/transform.h b/include/tvm/ir/transform.h
index 6557bbe31b8e..50c6f8dd8c3a 100644
--- a/include/tvm/ir/transform.h
+++ b/include/tvm/ir/transform.h
@@ -59,7 +59,6 @@
 #include <tvm/ir/diagnostic.h>
 #include <tvm/ir/error.h>
 #include <tvm/ir/module.h>
-#include <tvm/node/container.h>
 #include <tvm/runtime/container.h>
 #include <tvm/support/with.h>
 
diff --git a/include/tvm/ir/type.h b/include/tvm/ir/type.h
index 19b1ad0a0d83..b93a41e0c098 100644
--- a/include/tvm/ir/type.h
+++ b/include/tvm/ir/type.h
@@ -50,8 +50,8 @@
 #define TVM_IR_TYPE_H_
 
 #include <tvm/ir/span.h>
-#include <tvm/node/container.h>
 #include <tvm/node/node.h>
+#include <tvm/runtime/container.h>
 #include <tvm/runtime/data_type.h>
 #include <tvm/runtime/object.h>
 
diff --git a/include/tvm/node/attr_registry_map.h b/include/tvm/node/attr_registry_map.h
index 552aa7114657..6acd2e7dbdd8 100644
--- a/include/tvm/node/attr_registry_map.h
+++ b/include/tvm/node/attr_registry_map.h
@@ -23,7 +23,7 @@
 #ifndef TVM_NODE_ATTR_REGISTRY_MAP_H_
 #define TVM_NODE_ATTR_REGISTRY_MAP_H_
 
-#include <tvm/node/container.h>
+#include <tvm/runtime/container.h>
 
 #include <utility>
 #include <vector>
diff --git a/include/tvm/node/container.h b/include/tvm/node/container.h
deleted file mode 100644
index 10b47a92bdcf..000000000000
--- a/include/tvm/node/container.h
+++ /dev/null
@@ -1,1486 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-/*!
- * \file tvm/node/container.h
- * \brief Array/Map container in the DSL graph.
- */
-#ifndef TVM_NODE_CONTAINER_H_
-#define TVM_NODE_CONTAINER_H_
-
-#ifndef USE_FALLBACK_STL_MAP
-#define USE_FALLBACK_STL_MAP 0
-#endif
-
-#include <tvm/runtime/container.h>
-#include <tvm/runtime/memory.h>
-#include <tvm/runtime/object.h>
-#include <tvm/runtime/packed_func.h>
-
-#include <algorithm>
-#include <string>
-#include <utility>
-
-namespace tvm {
-
-using runtime::Array;
-using runtime::ArrayNode;
-using runtime::Downcast;
-using runtime::IterAdapter;
-using runtime::make_object;
-using runtime::Object;
-using runtime::ObjectEqual;
-using runtime::ObjectHash;
-using runtime::ObjectPtr;
-using runtime::ObjectPtrEqual;
-using runtime::ObjectPtrHash;
-using runtime::ObjectRef;
-using runtime::String;
-using runtime::StringObj;
-
-#if (USE_FALLBACK_STL_MAP != 0)
-
-/*! \brief Shared content of all specializations of hash map */
-class MapNode : public Object {
- public:
-  /*! \brief Type of the keys in the hash map */
-  using key_type = ObjectRef;
-  /*! \brief Type of the values in the hash map */
-  using mapped_type = ObjectRef;
-  /*! \brief Type of the actual underlying container */
-  using ContainerType = std::unordered_map<ObjectRef, ObjectRef, ObjectHash, ObjectEqual>;
-  /*! \brief Iterator class */
-  using iterator = ContainerType::iterator;
-  /*! \brief Iterator class */
-  using const_iterator = ContainerType::const_iterator;
-  /*! \brief Type of value stored in the hash map */
-  using KVType = ContainerType::value_type;
-
-  static_assert(std::is_standard_layout<KVType>::value, "KVType is not standard layout");
-  static_assert(sizeof(KVType) == 16 || sizeof(KVType) == 8, "sizeof(KVType) incorrect");
-
-  static constexpr const uint32_t _type_index = runtime::TypeIndex::kRuntimeMap;
-  static constexpr const char* _type_key = "Map";
-  TVM_DECLARE_FINAL_OBJECT_INFO(MapNode, Object);
-
-  /*!
-   * \brief Number of elements in the SmallMapNode
-   * \return The result
-   */
-  size_t size() const { return data_.size(); }
-  /*!
-   * \brief Count the number of times a key exists in the hash map
-   * \param key The indexing key
-   * \return The result, 0 or 1
-   */
-  size_t count(const key_type& key) const { return data_.count(key); }
-  /*!
-   * \brief Index value associated with a key, throw exception if the key does not exist
-   * \param key The indexing key
-   * \return The const reference to the value
-   */
-  const mapped_type& at(const key_type& key) const { return data_.at(key); }
-  /*!
-   * \brief Index value associated with a key, throw exception if the key does not exist
-   * \param key The indexing key
-   * \return The mutable reference to the value
-   */
-  mapped_type& at(const key_type& key) { return data_.at(key); }
-  /*! \return begin iterator */
-  iterator begin() { return data_.begin(); }
-  /*! \return const begin iterator */
-  const_iterator begin() const { return data_.begin(); }
-  /*! \return end iterator */
-  iterator end() { return data_.end(); }
-  /*! \return end iterator */
-  const_iterator end() const { return data_.end(); }
-  /*!
-   * \brief Index value associated with a key
-   * \param key The indexing key
-   * \return The iterator of the entry associated with the key, end iterator if not exists
-   */
-  const_iterator find(const key_type& key) const { return data_.find(key); }
-  /*!
-   * \brief Index value associated with a key
-   * \param key The indexing key
-   * \return The iterator of the entry associated with the key, end iterator if not exists
-   */
-  iterator find(const key_type& key) { return data_.find(key); }
-  /*!
-   * \brief Erase the entry associated with the iterator
-   * \param position The iterator
-   */
-  void erase(const iterator& position) { data_.erase(position); }
-  /*!
-   * \brief Erase the entry associated with the key, do nothing if not exists
-   * \param key The indexing key
-   */
-  void erase(const key_type& key) { data_.erase(key); }
-  /*!
-   * \brief Create an empty container
-   * \return The object created
-   */
-  static ObjectPtr<MapNode> Empty() { return make_object<MapNode>(); }
-
- protected:
-  /*!
-   * \brief Create the map using contents from the given iterators.
-   * \param first Begin of iterator
-   * \param last End of iterator
-   * \tparam IterType The type of iterator
-   * \return ObjectPtr to the map created
-   */
-  template <typename IterType>
-  static ObjectPtr<Object> CreateFromRange(IterType first, IterType last) {
-    ObjectPtr<MapNode> p = make_object<MapNode>();
-    p->data_ = ContainerType(first, last);
-    return p;
-  }
-  /*!
-   * \brief InsertMaybeReHash an entry into the given hash map
-   * \param kv The entry to be inserted
-   * \param map The pointer to the map, can be changed if re-hashing happens
-   */
-  static void InsertMaybeReHash(const KVType& kv, ObjectPtr<Object>* map) {
-    MapNode* map_node = static_cast<MapNode*>(map->get());
-    map_node->data_[kv.first] = kv.second;
-  }
-  /*!
-   * \brief Create an empty container with elements copying from another MapNode
-   * \param from The source container
-   * \return The object created
-   */
-  static ObjectPtr<MapNode> CopyFrom(MapNode* from) {
-    ObjectPtr<MapNode> p = make_object<MapNode>();
-    p->data_ = ContainerType(from->data_.begin(), from->data_.end());
-    return p;
-  }
-  /*! \brief The real container storing data */
-  ContainerType data_;
-  template <typename, typename, typename, typename>
-  friend class Map;
-};
-
-#else
-
-/*! \brief Shared content of all specializations of hash map */
-class MapNode : public Object {
- public:
-  /*! \brief Type of the keys in the hash map */
-  using key_type = ObjectRef;
-  /*! \brief Type of the values in the hash map */
-  using mapped_type = ObjectRef;
-  /*! \brief Type of value stored in the hash map */
-  using KVType = std::pair<ObjectRef, ObjectRef>;
-  /*! \brief Iterator class */
-  class iterator;
-
-  static_assert(std::is_standard_layout<KVType>::value, "KVType is not standard layout");
-  static_assert(sizeof(KVType) == 16 || sizeof(KVType) == 8, "sizeof(KVType) incorrect");
-
-  static constexpr const uint32_t _type_index = runtime::TypeIndex::kRuntimeMap;
-  static constexpr const char* _type_key = "Map";
-  TVM_DECLARE_FINAL_OBJECT_INFO(MapNode, Object);
-
-  /*!
-   * \brief Number of elements in the SmallMapNode
-   * \return The result
-   */
-  size_t size() const { return size_; }
-  /*!
-   * \brief Count the number of times a key exists in the hash map
-   * \param key The indexing key
-   * \return The result, 0 or 1
-   */
-  size_t count(const key_type& key) const;
-  /*!
-   * \brief Index value associated with a key, throw exception if the key does not exist
-   * \param key The indexing key
-   * \return The const reference to the value
-   */
-  const mapped_type& at(const key_type& key) const;
-  /*!
-   * \brief Index value associated with a key, throw exception if the key does not exist
-   * \param key The indexing key
-   * \return The mutable reference to the value
-   */
-  mapped_type& at(const key_type& key);
-  /*! \return begin iterator */
-  iterator begin() const;
-  /*! \return end iterator */
-  iterator end() const;
-  /*!
-   * \brief Index value associated with a key
-   * \param key The indexing key
-   * \return The iterator of the entry associated with the key, end iterator if not exists
-   */
-  iterator find(const key_type& key) const;
-  /*!
-   * \brief Erase the entry associated with the iterator
-   * \param position The iterator
-   */
-  void erase(const iterator& position);
-  /*!
-   * \brief Erase the entry associated with the key, do nothing if not exists
-   * \param key The indexing key
-   */
-  void erase(const key_type& key) { erase(find(key)); }
-
-  class iterator {
-   public:
-    using iterator_category = std::forward_iterator_tag;
-    using difference_type = int64_t;
-    using value_type = KVType;
-    using pointer = KVType*;
-    using reference = KVType&;
-    /*! \brief Default constructor */
-    iterator() : index(0), self(nullptr) {}
-    /*! \brief Compare iterators */
-    bool operator==(const iterator& other) const {
-      return index == other.index && self == other.self;
-    }
-    /*! \brief Compare iterators */
-    bool operator!=(const iterator& other) const { return !(*this == other); }
-    /*! \brief De-reference iterators */
-    pointer operator->() const;
-    /*! \brief De-reference iterators */
-    reference operator*() const { return *((*this).operator->()); }
-    /*! \brief Prefix self increment, e.g. ++iter */
-    iterator& operator++();
-    /*! \brief Prefix self decrement, e.g. --iter */
-    iterator& operator--();
-    /*! \brief Suffix self increment */
-    iterator operator++(int) {
-      iterator copy = *this;
-      ++(*this);
-      return copy;
-    }
-    /*! \brief Suffix self decrement */
-    iterator operator--(int) {
-      iterator copy = *this;
-      --(*this);
-      return copy;
-    }
-
-   protected:
-    /*! \brief Construct by value */
-    iterator(uint64_t index, const MapNode* self) : index(index), self(self) {}
-    /*! \brief The position on the array */
-    uint64_t index;
-    /*! \brief The container it points to */
-    const MapNode* self;
-
-    friend class DenseMapNode;
-    friend class SmallMapNode;
-  };
-  /*!
-   * \brief Create an empty container
-   * \return The object created
-   */
-  static inline ObjectPtr<MapNode> Empty();
-
- protected:
-  /*!
-   * \brief Create the map using contents from the given iterators.
-   * \param first Begin of iterator
-   * \param last End of iterator
-   * \tparam IterType The type of iterator
-   * \return ObjectPtr to the map created
-   */
-  template <typename IterType>
-  static inline ObjectPtr<Object> CreateFromRange(IterType first, IterType last);
-  /*!
-   * \brief InsertMaybeReHash an entry into the given hash map
-   * \param kv The entry to be inserted
-   * \param map The pointer to the map, can be changed if re-hashing happens
-   */
-  static inline void InsertMaybeReHash(const KVType& kv, ObjectPtr<Object>* map);
-  /*!
-   * \brief Create an empty container with elements copying from another SmallMapNode
-   * \param from The source container
-   * \return The object created
-   */
-  static inline ObjectPtr<MapNode> CopyFrom(MapNode* from);
-  /*! \brief number of slots minus 1 */
-  uint64_t slots_;
-  /*! \brief number of entries in the container */
-  uint64_t size_;
-  // Reference class
-  template <typename, typename, typename, typename>
-  friend class Map;
-};
-
-/*! \brief A specialization of small-sized hash map */
-class SmallMapNode : public MapNode,
-                     public runtime::InplaceArrayBase<SmallMapNode, MapNode::KVType> {
- private:
-  static constexpr uint64_t kInitSize = 2;
-  static constexpr uint64_t kMaxSize = 4;
-
- public:
-  using MapNode::iterator;
-  using MapNode::KVType;
-
-  /*! \brief Defaults to the destructor of InplaceArrayBase */
-  ~SmallMapNode() = default;
-  /*!
-   * \brief Count the number of times a key exists in the SmallMapNode
-   * \param key The indexing key
-   * \return The result, 0 or 1
-   */
-  size_t count(const key_type& key) const { return find(key).index < size_; }
-  /*!
-   * \brief Index value associated with a key, throw exception if the key does not exist
-   * \param key The indexing key
-   * \return The const reference to the value
-   */
-  const mapped_type& at(const key_type& key) const {
-    iterator itr = find(key);
-    ICHECK(itr.index < size_) << "IndexError: key is not in Map";
-    return itr->second;
-  }
-  /*!
-   * \brief Index value associated with a key, throw exception if the key does not exist
-   * \param key The indexing key
-   * \return The mutable reference to the value
-   */
-  mapped_type& at(const key_type& key) {
-    iterator itr = find(key);
-    ICHECK(itr.index < size_) << "IndexError: key is not in Map";
-    return itr->second;
-  }
-  /*! \return begin iterator */
-  iterator begin() const { return iterator(0, this); }
-  /*! \return end iterator */
-  iterator end() const { return iterator(size_, this); }
-  /*!
-   * \brief Index value associated with a key
-   * \param key The indexing key
-   * \return The iterator of the entry associated with the key, end iterator if not exists
-   */
-  iterator find(const key_type& key) const {
-    KVType* ptr = static_cast<KVType*>(AddressOf(0));
-    for (uint64_t i = 0; i < size_; ++i, ++ptr) {
-      if (ObjectEqual()(ptr->first, key)) {
-        return iterator(i, this);
-      }
-    }
-    return iterator(size_, this);
-  }
-  /*!
-   * \brief Erase the entry associated with the iterator
-   * \param position The iterator
-   */
-  void erase(const iterator& position) { Erase(position.index); }
-
- private:
-  /*!
-   * \brief Remove a position in SmallMapNode
-   * \param index The position to be removed
-   */
-  void Erase(const uint64_t index) {
-    if (index >= size_) {
-      return;
-    }
-    KVType* begin = static_cast<KVType*>(AddressOf(0));
-    KVType* last = begin + (size_ - 1);
-    if (index + 1 == size_) {
-      last->first.ObjectRef::~ObjectRef();
-      last->second.ObjectRef::~ObjectRef();
-    } else {
-      *(begin + index) = std::move(*last);
-    }
-    size_ -= 1;
-  }
-  /*!
-   * \brief Create an empty container
-   * \param n Number of empty slots
-   * \return The object created
-   */
-  static ObjectPtr<SmallMapNode> Empty(uint64_t n = kInitSize) {
-    using ::tvm::runtime::make_inplace_array_object;
-    ObjectPtr<SmallMapNode> p = make_inplace_array_object<SmallMapNode, KVType>(n);
-    p->size_ = 0;
-    p->slots_ = n;
-    return p;
-  }
-  /*!
-   * \brief Create an empty container initialized with a given range
-   * \param n Number of empty slots
-   * \param first begin of iterator
-   * \param last end of iterator
-   * \tparam IterType The type of iterator
-   * \return The object created
-   */
-  template <typename IterType>
-  static ObjectPtr<SmallMapNode> CreateFromRange(uint64_t n, IterType first, IterType last) {
-    ObjectPtr<SmallMapNode> p = Empty(n);
-    KVType* ptr = static_cast<KVType*>(p->AddressOf(0));
-    for (; first != last; ++first, ++p->size_) {
-      new (ptr++) KVType(*first);
-    }
-    return p;
-  }
-  /*!
-   * \brief Create an empty container with elements copying from another SmallMapNode
-   * \param from The source container
-   * \return The object created
-   */
-  static ObjectPtr<SmallMapNode> CopyFrom(SmallMapNode* from) {
-    KVType* first = static_cast<KVType*>(from->AddressOf(0));
-    KVType* last = first + from->size_;
-    return CreateFromRange(from->size_, first, last);
-  }
-  /*!
-   * \brief InsertMaybeReHash an entry into the given hash map
-   * \param kv The entry to be inserted
-   * \param map The pointer to the map, can be changed if re-hashing happens
-   */
-  static void InsertMaybeReHash(const KVType& kv, ObjectPtr<Object>* map) {
-    SmallMapNode* map_node = static_cast<SmallMapNode*>(map->get());
-    iterator itr = map_node->find(kv.first);
-    if (itr.index < map_node->size_) {
-      itr->second = kv.second;
-      return;
-    }
-    if (map_node->size_ < map_node->slots_) {
-      KVType* ptr = static_cast<KVType*>(map_node->AddressOf(map_node->size_));
-      new (ptr) KVType(kv);
-      ++map_node->size_;
-      return;
-    }
-    uint64_t next_size = std::max(map_node->slots_ * 2, uint64_t(kInitSize));
-    next_size = std::min(next_size, uint64_t(kMaxSize));
-    ICHECK_GT(next_size, map_node->slots_);
-    ObjectPtr<Object> new_map = CreateFromRange(next_size, map_node->begin(), map_node->end());
-    InsertMaybeReHash(kv, &new_map);
-    *map = std::move(new_map);
-  }
-  /*!
-   * \brief Increment the pointer
-   * \param index The pointer to be incremented
-   * \return The increased pointer
-   */
-  uint64_t IncItr(uint64_t index) const { return index + 1 < size_ ? index + 1 : size_; }
-  /*!
-   * \brief Decrement the pointer
-   * \param index The pointer to be decremented
-   * \return The decreased pointer
-   */
-  uint64_t DecItr(uint64_t index) const { return index > 0 ? index - 1 : size_; }
-  /*!
-   * \brief De-reference the pointer
-   * \param index The pointer to be dereferenced
-   * \return The result
-   */
-  KVType* DeRefItr(uint64_t index) const { return static_cast<KVType*>(AddressOf(index)); }
-  /*! \brief A size function used by InplaceArrayBase */
-  uint64_t GetSize() const { return size_; }
-
- protected:
-  friend class MapNode;
-  friend class DenseMapNode;
-  friend class runtime::InplaceArrayBase<SmallMapNode, MapNode::KVType>;
-};
-
-/*! \brief A specialization of hash map that implements the idea of array-based hash map.
- * Another reference implementation can be found [1].
- *
- * A. Overview
- *
- * DenseMapNode did several improvements over traditional separate chaining hash,
- * in terms of cache locality, memory footprints and data organization.
- *
- * A1. Implicit linked list. For better cache locality, instead of using linked list
- * explicitly for each bucket, we store list data into a single array that spans contiguously
- * in memory, and then carefully design access patterns to make sure most of them fall into
- * a single cache line.
- *
- * A2. 1-byte metadata. There is only 1 byte overhead for each slot in the array to indexing and
- * traversal. This can be divided in 3 parts.
- * 1) Reserved code: (0b11111111)_2 indicates a slot is empty; (0b11111110)_2 indicates protected,
- * which means the slot is empty but not allowed to be written.
- * 2) If not empty or protected, the highest bit is used to indicate whether data in the slot is
- * head of a linked list.
- * 3) The rest 7 bits are used as the "next pointer" (i.e. pointer to the next element). On 64-bit
- * architecture, an ordinary pointer can take up to 8 bytes, which is not acceptable overhead when
- * dealing with 16-byte ObjectRef pairs. Based on a commonly noticed fact that the lists are
- * relatively short (length <= 3) in hash maps, we follow [1]'s idea that only allows the pointer to
- * be one of the 126 possible values, i.e. if the next element of i-th slot is (i + x)-th element,
- * then x must be one of the 126 pre-defined values.
- *
- * A3. Data blocking. We organize the array in the way that every 16 elements forms a data block.
- * The 16-byte metadata of those 16 elements are stored together, followed by the real data, i.e.
- * 16 key-value pairs.
- *
- * B. Implementation details
- *
- * B1. Power-of-2 table size and Fibonacci Hashing. We use power-of-two as table size to avoid
- * modulo for more efficient arithmetics. To make the hash-to-slot mapping distribute more evenly,
- * we use the Fibonacci Hashing [2] trick.
- *
- * B2. Traverse a linked list in the array.
- * 1) List head. Assume Fibonacci Hashing maps a given key to slot i, if metadata at slot i
- * indicates that it is list head, then we found the head; otherwise the list is empty. No probing
- * is done in this procedure. 2) Next element. To find the next element of a non-empty slot i, we
- * look at the last 7 bits of the metadata at slot i. If they are all zeros, then it is the end of
- * list; otherwise, we know that the next element is (i + candidates[the-last-7-bits]).
- *
- * B3. InsertMaybeReHash an element. Following B2, we first traverse the linked list to see if this
- * element is in the linked list, and if not, we put it at the end by probing the next empty
- * position in one of the 126 candidate positions. If the linked list does not even exist, but the
- * slot for list head has been occupied by another linked list, we should find this intruder another
- * place.
- *
- * B4. Quadratic probing with triangle numbers. In open address hashing, it is provable that probing
- * with triangle numbers can traverse power-of-2-sized table [3]. In our algorithm, we follow the
- * suggestion in [1] that also use triangle numbers for "next pointer" as well as sparing for list
- * head.
- *
- * [1] https://github.com/skarupke/flat_hash_map
- * [2] https://programmingpraxis.com/2018/06/19/fibonacci-hash/
- * [3] https://fgiesen.wordpress.com/2015/02/22/triangular-numbers-mod-2n/
- */
-class DenseMapNode : public MapNode {
- private:
-  /*! \brief The number of elements in a memory block */
-  static constexpr int kBlockCap = 16;
-  /*! \brief Maximum load factor of the hash map */
-  static constexpr double kMaxLoadFactor = 0.99;
-  /*! \brief Binary representation of the metadata of an empty slot */
-  static constexpr uint8_t kEmptySlot = uint8_t(0b11111111);
-  /*! \brief Binary representation of the metadata of a protected slot */
-  static constexpr uint8_t kProtectedSlot = uint8_t(0b11111110);
-  /*! \brief Number of probing choices available */
-  static constexpr int kNumJumpDists = 126;
-  /*! \brief Head of the implicit linked list */
-  struct ListNode;
-  /*! \brief POD type of a block of memory */
-  struct Block {
-    uint8_t bytes[kBlockCap + kBlockCap * sizeof(KVType)];
-  };
-  static_assert(sizeof(Block) == kBlockCap * (sizeof(KVType) + 1), "sizeof(Block) incorrect");
-  static_assert(std::is_standard_layout<Block>::value, "Block is not standard layout");
-
- public:
-  using MapNode::iterator;
-
-  /*!
-   * \brief Destroy the DenseMapNode
-   */
-  ~DenseMapNode() { this->Reset(); }
-  /*! \return The number of elements of the key */
-  size_t count(const key_type& key) const { return !Search(key).IsNone(); }
-  /*!
-   * \brief Index value associated with a key, throw exception if the key does not exist
-   * \param key The indexing key
-   * \return The const reference to the value
-   */
-  const mapped_type& at(const key_type& key) const { return At(key); }
-  /*!
-   * \brief Index value associated with a key, throw exception if the key does not exist
-   * \param key The indexing key
-   * \return The mutable reference to the value
-   */
-  mapped_type& at(const key_type& key) { return At(key); }
-  /*!
-   * \brief Index value associated with a key
-   * \param key The indexing key
-   * \return The iterator of the entry associated with the key, end iterator if not exists
-   */
-  iterator find(const key_type& key) const {
-    ListNode node = Search(key);
-    return node.IsNone() ? end() : iterator(node.index, this);
-  }
-  /*!
-   * \brief Erase the entry associated with the iterator
-   * \param position The iterator
-   */
-  void erase(const iterator& position) {
-    uint64_t index = position.index;
-    if (position.self != nullptr && index <= this->slots_) {
-      Erase(ListNode(index, this));
-    }
-  }
-  /*! \return begin iterator */
-  iterator begin() const {
-    if (slots_ == 0) {
-      return iterator(0, this);
-    }
-    for (uint64_t index = 0; index <= slots_; ++index) {
-      if (!ListNode(index, this).IsEmpty()) {
-        return iterator(index, this);
-      }
-    }
-    return iterator(slots_ + 1, this);
-  }
-  /*! \return end iterator */
-  iterator end() const { return slots_ == 0 ? iterator(0, this) : iterator(slots_ + 1, this); }
-
- private:
-  /*!
-   * \brief Search for the given key
-   * \param key The key
-   * \return ListNode that associated with the key
-   */
-  ListNode Search(const key_type& key) const {
-    if (this->size_ == 0) {
-      return ListNode();
-    }
-    for (ListNode iter = GetListHead(ObjectHash()(key)); !iter.IsNone(); iter.MoveToNext(this)) {
-      if (ObjectEqual()(key, iter.Key())) {
-        return iter;
-      }
-    }
-    return ListNode();
-  }
-  /*!
-   * \brief Search for the given key, throw exception if not exists
-   * \param key The key
-   * \return ListNode that associated with the key
-   */
-  mapped_type& At(const key_type& key) const {
-    ListNode iter = Search(key);
-    ICHECK(!iter.IsNone()) << "IndexError: key is not in Map";
-    return iter.Val();
-  }
-  /*!
-   * \brief Try to insert a key, or do nothing if already exists
-   * \param key The indexing key
-   * \param result The linked-list entry found or just constructed
-   * \return A boolean, indicating if actual insertion happens
-   */
-  bool TryInsert(const key_type& key, ListNode* result) {
-    if (slots_ == 0) {
-      return false;
-    }
-    // required that `iter` to be the head of a linked list through which we can iterator
-    ListNode iter = IndexFromHash(ObjectHash()(key));
-    // `iter` can be: 1) empty; 2) body of an irrelevant list; 3) head of the relevant list
-    // Case 1: empty
-    if (iter.IsEmpty()) {
-      iter.NewHead(KVType(key, ObjectRef(nullptr)));
-      this->size_ += 1;
-      *result = iter;
-      return true;
-    }
-    // Case 2: body of an irrelevant list
-    if (!iter.IsHead()) {
-      // we move the elements around and construct the single-element linked list
-      return IsFull() ? false : TrySpareListHead(iter, key, result);
-    }
-    // Case 3: head of the relevant list
-    // we iterate through the linked list until the end
-    // make sure `iter` is the previous element of `next`
-    ListNode next = iter;
-    do {
-      // find equal item, do not insert
-      if (ObjectEqual()(key, next.Key())) {
-        *result = next;
-        return true;
-      }
-      // make sure `iter` is the previous element of `next`
-      iter = next;
-    } while (next.MoveToNext(this));
-    // `iter` is the tail of the linked list
-    // always check capacity before insertion
-    if (IsFull()) {
-      return false;
-    }
-    // find the next empty slot
-    uint8_t jump;
-    if (!iter.GetNextEmpty(this, &jump, result)) {
-      return false;
-    }
-    result->NewTail(KVType(key, ObjectRef(nullptr)));
-    // link `iter` to `empty`, and move forward
-    iter.SetJump(jump);
-    this->size_ += 1;
-    return true;
-  }
-  /*!
-   * \brief Spare an entry to be the head of a linked list.
-   * As described in B3, during insertion, it is possible that the entire linked list does not
-   * exist, but the slot of its head has been occupied by other linked lists. In this case, we need
-   * to spare the slot by moving away the elements to another valid empty one to make insertion
-   * possible.
-   * \param target The given entry to be spared
-   * \param key The indexing key
-   * \param result The linked-list entry constructed as the head
-   * \return A boolean, if actual insertion happens
-   */
-  bool TrySpareListHead(ListNode target, const key_type& key, ListNode* result) {
-    // `target` is not the head of the linked list
-    // move the original item of `target` (if any)
-    // and construct new item on the position `target`
-    // To make `target` empty, we
-    // 1) find `w` the previous element of `target` in the linked list
-    // 2) copy the linked list starting from `r = target`
-    // 3) paste them after `w`
-    // read from the linked list after `r`
-    ListNode r = target;
-    // write to the tail of `w`
-    ListNode w = target.FindPrev(this);
-    // after `target` is moved, we disallow writing to the slot
-    bool is_first = true;
-    uint8_t r_meta, jump;
-    ListNode empty;
-    do {
-      // `jump` describes how `w` is jumped to `empty`
-      // rehash if there is no empty space after `w`
-      if (!w.GetNextEmpty(this, &jump, &empty)) {
-        return false;
-      }
-      // move `r` to `empty`
-      empty.NewTail(std::move(r.Data()));
-      // clear the metadata of `r`
-      r_meta = r.Meta();
-      if (is_first) {
-        is_first = false;
-        r.SetProtected();
-      } else {
-        r.SetEmpty();
-      }
-      // link `w` to `empty`, and move forward
-      w.SetJump(jump);
-      w = empty;
-      // move `r` forward as well
-    } while (r.MoveToNext(this, r_meta));
-    // finally we have done moving the linked list
-    // fill data_ into `target`
-    target.NewHead(KVType(key, ObjectRef(nullptr)));
-    this->size_ += 1;
-    *result = target;
-    return true;
-  }
-  /*!
-   * \brief Remove a ListNode
-   * \param iter The node to be removed
-   */
-  void Erase(const ListNode& iter) {
-    this->size_ -= 1;
-    if (!iter.HasNext()) {
-      // `iter` is the last
-      if (!iter.IsHead()) {
-        // cut the link if there is any
-        iter.FindPrev(this).SetJump(0);
-      }
-      iter.Data().KVType::~KVType();
-      iter.SetEmpty();
-    } else {
-      ListNode last = iter, prev = iter;
-      for (last.MoveToNext(this); last.HasNext(); prev = last, last.MoveToNext(this)) {
-      }
-      iter.Data() = std::move(last.Data());
-      last.SetEmpty();
-      prev.SetJump(0);
-    }
-  }
-  /*! \brief Clear the container to empty, release all entries and memory acquired */
-  void Reset() {
-    uint64_t n_blocks = CalcNumBlocks(this->slots_);
-    for (uint64_t bi = 0; bi < n_blocks; ++bi) {
-      uint8_t* meta_ptr = data_[bi].bytes;
-      KVType* data_ptr = reinterpret_cast<KVType*>(data_[bi].bytes + kBlockCap);
-      for (int j = 0; j < kBlockCap; ++j, ++meta_ptr, ++data_ptr) {
-        uint8_t& meta = *meta_ptr;
-        if (meta != uint8_t(kProtectedSlot) && meta != uint8_t(kEmptySlot)) {
-          meta = uint8_t(kEmptySlot);
-          data_ptr->KVType::~KVType();
-        }
-      }
-    }
-    ReleaseMemory();
-  }
-  /*! \brief Release the memory acquired by the container without deleting its entries stored inside
-   */
-  void ReleaseMemory() {
-    delete[] data_;
-    data_ = nullptr;
-    slots_ = 0;
-    size_ = 0;
-    fib_shift_ = 63;
-  }
-  /*!
-   * \brief Create an empty container
-   * \param fib_shift The fib shift provided
-   * \param n_slots Number of slots required, should be power-of-two
-   * \return The object created
-   */
-  static ObjectPtr<DenseMapNode> Empty(uint32_t fib_shift, uint64_t n_slots) {
-    ICHECK_GT(n_slots, uint64_t(SmallMapNode::kMaxSize));
-    ObjectPtr<DenseMapNode> p = make_object<DenseMapNode>();
-    uint64_t n_blocks = CalcNumBlocks(n_slots - 1);
-    Block* block = p->data_ = new Block[n_blocks];
-    p->slots_ = n_slots - 1;
-    p->size_ = 0;
-    p->fib_shift_ = fib_shift;
-    for (uint64_t i = 0; i < n_blocks; ++i, ++block) {
-      std::fill(block->bytes, block->bytes + kBlockCap, uint8_t(kEmptySlot));
-    }
-    return p;
-  }
-  /*!
-   * \brief Create an empty container with elements copying from another DenseMapNode
-   * \param from The source container
-   * \return The object created
-   */
-  static ObjectPtr<DenseMapNode> CopyFrom(DenseMapNode* from) {
-    ObjectPtr<DenseMapNode> p = make_object<DenseMapNode>();
-    uint64_t n_blocks = CalcNumBlocks(from->slots_);
-    p->data_ = new Block[n_blocks];
-    p->slots_ = from->slots_;
-    p->size_ = from->size_;
-    p->fib_shift_ = from->fib_shift_;
-    for (uint64_t bi = 0; bi < n_blocks; ++bi) {
-      uint8_t* meta_ptr_from = from->data_[bi].bytes;
-      KVType* data_ptr_from = reinterpret_cast<KVType*>(from->data_[bi].bytes + kBlockCap);
-      uint8_t* meta_ptr_to = p->data_[bi].bytes;
-      KVType* data_ptr_to = reinterpret_cast<KVType*>(p->data_[bi].bytes + kBlockCap);
-      for (int j = 0; j < kBlockCap;
-           ++j, ++meta_ptr_from, ++data_ptr_from, ++meta_ptr_to, ++data_ptr_to) {
-        uint8_t& meta = *meta_ptr_to = *meta_ptr_from;
-        ICHECK(meta != kProtectedSlot);
-        if (meta != uint8_t(kEmptySlot)) {
-          new (data_ptr_to) KVType(*data_ptr_from);
-        }
-      }
-    }
-    return p;
-  }
-  /*!
-   * \brief InsertMaybeReHash an entry into the given hash map
-   * \param kv The entry to be inserted
-   * \param map The pointer to the map, can be changed if re-hashing happens
-   */
-  static void InsertMaybeReHash(const KVType& kv, ObjectPtr<Object>* map) {
-    DenseMapNode* map_node = static_cast<DenseMapNode*>(map->get());
-    ListNode iter;
-    // Try to insert. If succeed, we simply return
-    if (map_node->TryInsert(kv.first, &iter)) {
-      iter.Val() = kv.second;
-      return;
-    }
-    ICHECK_GT(map_node->slots_, uint64_t(SmallMapNode::kMaxSize));
-    // Otherwise, start rehash
-    ObjectPtr<Object> p = Empty(map_node->fib_shift_ - 1, map_node->slots_ * 2 + 2);
-    // Insert the given `kv` into the new hash map
-    InsertMaybeReHash(kv, &p);
-    uint64_t n_blocks = CalcNumBlocks(map_node->slots_);
-    // Then Insert data from the original block.
-    for (uint64_t bi = 0; bi < n_blocks; ++bi) {
-      uint8_t* meta_ptr = map_node->data_[bi].bytes;
-      KVType* data_ptr = reinterpret_cast<KVType*>(map_node->data_[bi].bytes + kBlockCap);
-      for (int j = 0; j < kBlockCap; ++j, ++meta_ptr, ++data_ptr) {
-        uint8_t& meta = *meta_ptr;
-        if (meta != uint8_t(kProtectedSlot) && meta != uint8_t(kEmptySlot)) {
-          meta = uint8_t(kEmptySlot);
-          KVType kv = std::move(*data_ptr);
-          InsertMaybeReHash(kv, &p);
-        }
-      }
-    }
-    map_node->ReleaseMemory();
-    *map = p;
-  }
-  /*!
-   * \brief Check whether the hash table is full
-   * \return A boolean indicating whether hash table is full
-   */
-  bool IsFull() const { return size_ + 1 > (slots_ + 1) * kMaxLoadFactor; }
-  /*!
-   * \brief Increment the pointer
-   * \param index The pointer to be incremented
-   * \return The increased pointer
-   */
-  uint64_t IncItr(uint64_t index) const {
-    for (++index; index <= slots_; ++index) {
-      if (!ListNode(index, this).IsEmpty()) {
-        return index;
-      }
-    }
-    return slots_ + 1;
-  }
-  /*!
-   * \brief Decrement the pointer
-   * \param index The pointer to be decremented
-   * \return The decreased pointer
-   */
-  uint64_t DecItr(uint64_t index) const {
-    while (index != 0) {
-      index -= 1;
-      if (!ListNode(index, this).IsEmpty()) {
-        return index;
-      }
-    }
-    return slots_ + 1;
-  }
-  /*!
-   * \brief De-reference the pointer
-   * \param index The pointer to be dereferenced
-   * \return The result
-   */
-  KVType* DeRefItr(uint64_t index) const { return &ListNode(index, this).Data(); }
-  /*! \brief Construct from hash code */
-  ListNode IndexFromHash(uint64_t hash_value) const {
-    return ListNode(FibHash(hash_value, fib_shift_), this);
-  }
-  /*! \brief Construct from hash code if the position is head of list */
-  ListNode GetListHead(uint64_t hash_value) const {
-    ListNode node = IndexFromHash(hash_value);
-    return node.IsHead() ? node : ListNode();
-  }
-  /*! \brief Construct the number of blocks in the hash table */
-  static uint64_t CalcNumBlocks(uint64_t n_slots_m1) {
-    uint64_t n_slots = n_slots_m1 > 0 ? n_slots_m1 + 1 : 0;
-    return (n_slots + kBlockCap - 1) / kBlockCap;
-  }
-  /*!
-   * \brief Calculate the power-of-2 table size given the lower-bound of required capacity.
-   * \param cap The lower-bound of the required capacity
-   * \param fib_shift The result shift for Fibonacci Hashing
-   * \param n_slots The result number of slots
-   */
-  static void CalcTableSize(uint64_t cap, uint32_t* fib_shift, uint64_t* n_slots) {
-    uint32_t shift = 64;
-    uint64_t slots = 1;
-    for (uint64_t c = cap; c; c >>= 1) {
-      shift -= 1;
-      slots <<= 1;
-    }
-    ICHECK_GT(slots, cap);
-    if (slots < cap * 2) {
-      *fib_shift = shift - 1;
-      *n_slots = slots << 1;
-    } else {
-      *fib_shift = shift;
-      *n_slots = slots;
-    }
-  }
-  /*!
-   * \brief Fibonacci Hashing, maps a hash code to an index in a power-of-2-sized table.
-   * See also: https://programmingpraxis.com/2018/06/19/fibonacci-hash/.
-   * \param hash_value The raw hash value
-   * \param fib_shift The shift in Fibonacci Hashing
-   * \return An index calculated using Fibonacci Hashing
-   */
-  static uint64_t FibHash(uint64_t hash_value, uint32_t fib_shift) {
-    constexpr uint64_t coeff = 11400714819323198485ull;
-    return (coeff * hash_value) >> fib_shift;
-  }
-  /*! \brief The implicit in-place linked list used to index a chain */
-  struct ListNode {
-    /*! \brief Construct None */
-    ListNode() : index(0), block(nullptr) {}
-    /*! \brief Construct from position */
-    ListNode(uint64_t index, const DenseMapNode* self)
-        : index(index), block(self->data_ + (index / kBlockCap)) {}
-    /*! \brief Metadata on the entry */
-    uint8_t& Meta() const { return *(block->bytes + index % kBlockCap); }
-    /*! \brief Data on the entry */
-    KVType& Data() const {
-      return *(reinterpret_cast<KVType*>(block->bytes + kBlockCap +
-                                         (index % kBlockCap) * sizeof(KVType)));
-    }
-    /*! \brief Key on the entry */
-    key_type& Key() const { return Data().first; }
-    /*! \brief Value on the entry */
-    mapped_type& Val() const { return Data().second; }
-    /*! \brief If the entry is head of linked list */
-    bool IsHead() const { return (Meta() & 0b10000000) == 0b00000000; }
-    /*! \brief If the entry is none */
-    bool IsNone() const { return block == nullptr; }
-    /*! \brief If the entry is empty slot */
-    bool IsEmpty() const { return Meta() == uint8_t(kEmptySlot); }
-    /*! \brief If the entry is protected slot */
-    bool IsProtected() const { return Meta() == uint8_t(kProtectedSlot); }
-    /*! \brief Set the entry to be empty */
-    void SetEmpty() const { Meta() = uint8_t(kEmptySlot); }
-    /*! \brief Set the entry to be protected */
-    void SetProtected() const { Meta() = uint8_t(kProtectedSlot); }
-    /*! \brief Set the entry's jump to its next entry */
-    void SetJump(uint8_t jump) const { (Meta() &= 0b10000000) |= jump; }
-    /*! \brief Construct a head of linked list in-place */
-    void NewHead(KVType v) const {
-      Meta() = 0b00000000;
-      new (&Data()) KVType(std::move(v));
-    }
-    /*! \brief Construct a tail of linked list in-place */
-    void NewTail(KVType v) const {
-      Meta() = 0b10000000;
-      new (&Data()) KVType(std::move(v));
-    }
-    /*! \brief If the entry has next entry on the linked list */
-    bool HasNext() const { return kNextProbeLocation[Meta() & 0b01111111] != 0; }
-    /*! \brief Move the entry to the next entry on the linked list */
-    bool MoveToNext(const DenseMapNode* self, uint8_t meta) {
-      uint64_t offset = kNextProbeLocation[meta & 0b01111111];
-      if (offset == 0) {
-        index = 0;
-        block = nullptr;
-        return false;
-      }
-      index = (index + offset) & (self->slots_);
-      block = self->data_ + (index / kBlockCap);
-      return true;
-    }
-    /*! \brief Move the entry to the next entry on the linked list */
-    bool MoveToNext(const DenseMapNode* self) { return MoveToNext(self, Meta()); }
-    /*! \brief Get the previous entry on the linked list */
-    ListNode FindPrev(const DenseMapNode* self) const {
-      // start from the head of the linked list, which must exist
-      ListNode next = self->IndexFromHash(ObjectHash()(Key()));
-      // `prev` is always the previous item of `next`
-      ListNode prev = next;
-      for (next.MoveToNext(self); index != next.index; prev = next, next.MoveToNext(self)) {
-      }
-      return prev;
-    }
-    /*! \brief Get the next empty jump */
-    bool GetNextEmpty(const DenseMapNode* self, uint8_t* jump, ListNode* result) const {
-      for (uint8_t idx = 1; idx < kNumJumpDists; ++idx) {
-        ListNode candidate((index + kNextProbeLocation[idx]) & (self->slots_), self);
-        if (candidate.IsEmpty()) {
-          *jump = idx;
-          *result = candidate;
-          return true;
-        }
-      }
-      return false;
-    }
-    /*! \brief Index on the real array */
-    uint64_t index;
-    /*! \brief Pointer to the actual block */
-    Block* block;
-  };
-
- protected:
-  /*! \brief fib shift in Fibonacci Hashing */
-  uint32_t fib_shift_;
-  /*! \brief array of data blocks */
-  Block* data_;
-  /* clang-format off */
-  /*! \brief Candidates of probing distance */
-  TVM_DLL static constexpr uint64_t kNextProbeLocation[kNumJumpDists] {
-    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
-    // Quadratic probing with triangle numbers. See also:
-    // 1) https://en.wikipedia.org/wiki/Quadratic_probing
-    // 2) https://fgiesen.wordpress.com/2015/02/22/triangular-numbers-mod-2n/
-    // 3) https://github.com/skarupke/flat_hash_map
-    21, 28, 36, 45, 55, 66, 78, 91, 105, 120,
-    136, 153, 171, 190, 210, 231, 253, 276, 300, 325,
-    351, 378, 406, 435, 465, 496, 528, 561, 595, 630,
-    666, 703, 741, 780, 820, 861, 903, 946, 990, 1035,
-    1081, 1128, 1176, 1225, 1275, 1326, 1378, 1431, 1485, 1540,
-    1596, 1653, 1711, 1770, 1830, 1891, 1953, 2016, 2080, 2145,
-    2211, 2278, 2346, 2415, 2485, 2556, 2628,
-    // larger triangle numbers
-    8515, 19110, 42778, 96141, 216153,
-    486591, 1092981, 2458653, 5532801, 12442566,
-    27993903, 62983476, 141717030, 318844378, 717352503,
-    1614057336, 3631522476, 8170957530, 18384510628, 41364789378,
-    93070452520, 209408356380, 471168559170, 1060128894105, 2385289465695,
-    5366898840628, 12075518705635, 27169915244790, 61132312065111, 137547689707000,
-    309482283181501, 696335127828753, 1566753995631385, 3525196511162271, 7931691992677701,
-    17846306936293605, 40154190677507445, 90346928918121501, 203280589587557251, 457381325854679626,
-    1029107982097042876, 2315492959180353330, 5209859154120846435,
-  };
-  /* clang-format on */
-  friend class MapNode;
-};
-
-#define TVM_DISPATCH_MAP(base, var, body)     \
-  {                                           \
-    using TSmall = SmallMapNode*;             \
-    using TDense = DenseMapNode*;             \
-    uint64_t slots = base->slots_;            \
-    if (slots <= SmallMapNode::kMaxSize) {    \
-      TSmall var = static_cast<TSmall>(base); \
-      body;                                   \
-    } else {                                  \
-      TDense var = static_cast<TDense>(base); \
-      body;                                   \
-    }                                         \
-  }
-
-#define TVM_DISPATCH_MAP_CONST(base, var, body) \
-  {                                             \
-    using TSmall = const SmallMapNode*;         \
-    using TDense = const DenseMapNode*;         \
-    uint64_t slots = base->slots_;              \
-    if (slots <= SmallMapNode::kMaxSize) {      \
-      TSmall var = static_cast<TSmall>(base);   \
-      body;                                     \
-    } else {                                    \
-      TDense var = static_cast<TDense>(base);   \
-      body;                                     \
-    }                                           \
-  }
-
-inline MapNode::iterator::pointer MapNode::iterator::operator->() const {
-  TVM_DISPATCH_MAP_CONST(self, p, { return p->DeRefItr(index); });
-}
-
-inline MapNode::iterator& MapNode::iterator::operator++() {
-  TVM_DISPATCH_MAP_CONST(self, p, {
-    index = p->IncItr(index);
-    return *this;
-  });
-}
-
-inline MapNode::iterator& MapNode::iterator::operator--() {
-  TVM_DISPATCH_MAP_CONST(self, p, {
-    index = p->DecItr(index);
-    return *this;
-  });
-}
-
-inline size_t MapNode::count(const key_type& key) const {
-  TVM_DISPATCH_MAP_CONST(this, p, { return p->count(key); });
-}
-
-inline const MapNode::mapped_type& MapNode::at(const MapNode::key_type& key) const {
-  TVM_DISPATCH_MAP_CONST(this, p, { return p->at(key); });
-}
-
-inline MapNode::mapped_type& MapNode::at(const MapNode::key_type& key) {
-  TVM_DISPATCH_MAP(this, p, { return p->at(key); });
-}
-
-inline MapNode::iterator MapNode::begin() const {
-  TVM_DISPATCH_MAP_CONST(this, p, { return p->begin(); });
-}
-
-inline MapNode::iterator MapNode::end() const {
-  TVM_DISPATCH_MAP_CONST(this, p, { return p->end(); });
-}
-
-inline MapNode::iterator MapNode::find(const MapNode::key_type& key) const {
-  TVM_DISPATCH_MAP_CONST(this, p, { return p->find(key); });
-}
-
-inline void MapNode::erase(const MapNode::iterator& position) {
-  TVM_DISPATCH_MAP(this, p, { return p->erase(position); });
-}
-
-#undef TVM_DISPATCH_MAP
-#undef TVM_DISPATCH_MAP_CONST
-
-inline ObjectPtr<MapNode> MapNode::Empty() { return SmallMapNode::Empty(); }
-
-inline ObjectPtr<MapNode> MapNode::CopyFrom(MapNode* from) {
-  if (from->slots_ <= SmallMapNode::kMaxSize) {
-    return SmallMapNode::CopyFrom(static_cast<SmallMapNode*>(from));
-  } else {
-    return DenseMapNode::CopyFrom(static_cast<DenseMapNode*>(from));
-  }
-}
-
-template <typename IterType>
-inline ObjectPtr<Object> MapNode::CreateFromRange(IterType first, IterType last) {
-  int64_t _cap = std::distance(first, last);
-  if (_cap < 0) {
-    return SmallMapNode::Empty();
-  }
-  uint64_t cap = static_cast<uint64_t>(_cap);
-  if (cap < SmallMapNode::kMaxSize) {
-    return SmallMapNode::CreateFromRange(cap, first, last);
-  }
-  uint32_t fib_shift;
-  uint64_t n_slots;
-  DenseMapNode::CalcTableSize(cap, &fib_shift, &n_slots);
-  ObjectPtr<Object> obj = DenseMapNode::Empty(fib_shift, n_slots);
-  for (; first != last; ++first) {
-    KVType kv(*first);
-    DenseMapNode::InsertMaybeReHash(kv, &obj);
-  }
-  return obj;
-}
-
-inline void MapNode::InsertMaybeReHash(const KVType& kv, ObjectPtr<Object>* map) {
-  constexpr uint64_t kSmallMapMaxSize = SmallMapNode::kMaxSize;
-  MapNode* base = static_cast<MapNode*>(map->get());
-  if (base->slots_ < kSmallMapMaxSize) {
-    SmallMapNode::InsertMaybeReHash(kv, map);
-  } else if (base->slots_ == kSmallMapMaxSize) {
-    if (base->size_ < base->slots_) {
-      SmallMapNode::InsertMaybeReHash(kv, map);
-    } else {
-      ObjectPtr<Object> new_map = MapNode::CreateFromRange(base->begin(), base->end());
-      DenseMapNode::InsertMaybeReHash(kv, &new_map);
-      *map = std::move(new_map);
-    }
-  } else {
-    DenseMapNode::InsertMaybeReHash(kv, map);
-  }
-}
-
-namespace runtime {
-template <>
-inline ObjectPtr<MapNode> make_object<>() = delete;
-}  // namespace runtime
-
-#endif
-
-/*!
- * \brief Map container of NodeRef->NodeRef in DSL graph.
- *  Map implements copy on write semantics, which means map is mutable
- *  but copy will happen when array is referenced in more than two places.
- *
- * operator[] only provide const acces, use Set to mutate the content.
- * \tparam K The key NodeRef type.
- * \tparam V The value NodeRef type.
- */
-template <typename K, typename V,
-          typename = typename std::enable_if<std::is_base_of<ObjectRef, K>::value>::type,
-          typename = typename std::enable_if<std::is_base_of<ObjectRef, V>::value>::type>
-class Map : public ObjectRef {
- public:
-  using key_type = K;
-  using mapped_type = V;
-  class iterator;
-  /*!
-   * \brief default constructor
-   */
-  Map() { data_ = MapNode::Empty(); }
-  /*!
-   * \brief move constructor
-   * \param other source
-   */
-  Map(Map<K, V>&& other) { data_ = std::move(other.data_); }
-  /*!
-   * \brief copy constructor
-   * \param other source
-   */
-  Map(const Map<K, V>& other) : ObjectRef(other.data_) {}
-  /*!
-   * \brief copy assign operator
-   * \param other The source of assignment
-   * \return reference to self.
-   */
-  Map<K, V>& operator=(Map<K, V>&& other) {
-    data_ = std::move(other.data_);
-    return *this;
-  }
-  /*!
-   * \brief move assign operator
-   * \param other The source of assignment
-   * \return reference to self.
-   */
-  Map<K, V>& operator=(const Map<K, V>& other) {
-    data_ = other.data_;
-    return *this;
-  }
-  /*!
-   * \brief constructor from pointer
-   * \param n the container pointer
-   */
-  explicit Map(ObjectPtr<Object> n) : ObjectRef(n) {}
-  /*!
-   * \brief constructor from iterator
-   * \param begin begin of iterator
-   * \param end end of iterator
-   * \tparam IterType The type of iterator
-   */
-  template <typename IterType>
-  Map(IterType begin, IterType end) {
-    data_ = MapNode::CreateFromRange(begin, end);
-  }
-  /*!
-   * \brief constructor from initializer list
-   * \param init The initalizer list
-   */
-  Map(std::initializer_list<std::pair<K, V>> init) {
-    data_ = MapNode::CreateFromRange(init.begin(), init.end());
-  }
-  /*!
-   * \brief constructor from unordered_map
-   * \param init The unordered_map
-   */
-  template <typename Hash, typename Equal>
-  Map(const std::unordered_map<K, V, Hash, Equal>& init) {  // NOLINT(*)
-    data_ = MapNode::CreateFromRange(init.begin(), init.end());
-  }
-  /*!
-   * \brief Read element from map.
-   * \param key The key
-   * \return the corresonding element.
-   */
-  const V at(const K& key) const { return DowncastNoCheck<V>(GetMapNode()->at(key)); }
-  /*!
-   * \brief Read element from map.
-   * \param key The key
-   * \return the corresonding element.
-   */
-  const V operator[](const K& key) const { return this->at(key); }
-  /*! \return The size of the array */
-  size_t size() const {
-    MapNode* n = GetMapNode();
-    return n == nullptr ? 0 : n->size();
-  }
-  /*! \return The number of elements of the key */
-  size_t count(const K& key) const {
-    MapNode* n = GetMapNode();
-    return n == nullptr ? 0 : GetMapNode()->count(key);
-  }
-  /*! \return whether array is empty */
-  bool empty() const { return size() == 0; }
-  /*!
-   * \brief set the Map.
-   * \param key The index key.
-   * \param value The value to be setted.
-   */
-  void Set(const K& key, const V& value) {
-    CopyOnWrite();
-    MapNode::InsertMaybeReHash(MapNode::KVType(key, value), &data_);
-  }
-  /*! \return begin iterator */
-  iterator begin() const { return iterator(GetMapNode()->begin()); }
-  /*! \return end iterator */
-  iterator end() const { return iterator(GetMapNode()->end()); }
-  /*! \return find the key and returns the associated iterator */
-  iterator find(const K& key) const { return iterator(GetMapNode()->find(key)); }
-
-  void erase(const K& key) { CopyOnWrite()->erase(key); }
-
-  /*!
-   * \brief copy on write semantics
-   *  Do nothing if current handle is the unique copy of the array.
-   *  Otherwise make a new copy of the array to ensure the current handle
-   *  hold a unique copy.
-   *
-   * \return Handle to the internal node container(which ganrantees to be unique)
-   */
-  MapNode* CopyOnWrite() {
-    if (data_.get() == nullptr) {
-      data_ = MapNode::Empty();
-    } else if (!data_.unique()) {
-      data_ = MapNode::CopyFrom(GetMapNode());
-    }
-    return GetMapNode();
-  }
-  /*! \brief specify container node */
-  using ContainerType = MapNode;
-
-  /*! \brief Iterator of the hash map */
-  class iterator {
-   public:
-    using iterator_category = std::bidirectional_iterator_tag;
-    using difference_type = int64_t;
-    using value_type = const std::pair<K, V>;
-    using pointer = value_type*;
-    using reference = value_type;
-
-    iterator() : itr() {}
-
-    /*! \brief Compare iterators */
-    bool operator==(const iterator& other) const { return itr == other.itr; }
-    /*! \brief Compare iterators */
-    bool operator!=(const iterator& other) const { return itr != other.itr; }
-    /*! \brief De-reference iterators is not allowed */
-    pointer operator->() const = delete;
-    /*! \brief De-reference iterators */
-    reference operator*() const {
-      auto& kv = *itr;
-      return std::make_pair(DowncastNoCheck<K>(kv.first), DowncastNoCheck<V>(kv.second));
-    }
-    /*! \brief Prefix self increment, e.g. ++iter */
-    iterator& operator++() {
-      ++itr;
-      return *this;
-    }
-    /*! \brief Suffix self increment */
-    iterator operator++(int) {
-      iterator copy = *this;
-      ++(*this);
-      return copy;
-    }
-
-   private:
-    iterator(const MapNode::iterator& itr)  // NOLINT(*)
-        : itr(itr) {}
-
-    template <typename, typename, typename, typename>
-    friend class Map;
-
-    MapNode::iterator itr;
-  };
-
- private:
-  /*! \brief Return data_ as type of pointer of MapNode */
-  MapNode* GetMapNode() const { return static_cast<MapNode*>(data_.get()); }
-};
-
-/*!
- * \brief Merge two Maps.
- * \param lhs the first Map to merge.
- * \param rhs the second Map to merge.
- * @return The merged Array. Original Maps are kept unchanged.
- */
-template <typename K, typename V,
-          typename = typename std::enable_if<std::is_base_of<ObjectRef, K>::value>::type,
-          typename = typename std::enable_if<std::is_base_of<ObjectRef, V>::value>::type>
-inline Map<K, V> Merge(Map<K, V> lhs, const Map<K, V>& rhs) {
-  for (const auto& p : rhs) {
-    lhs.Set(p.first, p.second);
-  }
-  return std::move(lhs);
-}
-
-}  // namespace tvm
-
-namespace tvm {
-namespace runtime {
-// Additional overloads for PackedFunc checking.
-template <typename K, typename V>
-struct ObjectTypeChecker<Map<K, V>> {
-  static Optional<String> CheckAndGetMismatch(const Object* ptr) {
-    if (ptr == nullptr) return NullOpt;
-    if (!ptr->IsInstance<MapNode>()) return String(ptr->GetTypeKey());
-    const MapNode* n = static_cast<const MapNode*>(ptr);
-    for (const auto& kv : *n) {
-      Optional<String> key_type = ObjectTypeChecker<K>::CheckAndGetMismatch(kv.first.get());
-      Optional<String> value_type = ObjectTypeChecker<K>::CheckAndGetMismatch(kv.first.get());
-      if (key_type.defined() || value_type.defined()) {
-        std::string key_name =
-            key_type.defined() ? std::string(key_type.value()) : ObjectTypeChecker<K>::TypeName();
-        std::string value_name = value_type.defined() ? std::string(value_type.value())
-                                                      : ObjectTypeChecker<V>::TypeName();
-        return String("Map[" + key_name + ", " + value_name + "]");
-      }
-    }
-    return NullOpt;
-  }
-  static bool Check(const Object* ptr) {
-    if (ptr == nullptr) return true;
-    if (!ptr->IsInstance<MapNode>()) return false;
-    const MapNode* n = static_cast<const MapNode*>(ptr);
-    for (const auto& kv : *n) {
-      if (!ObjectTypeChecker<K>::Check(kv.first.get())) return false;
-      if (!ObjectTypeChecker<V>::Check(kv.second.get())) return false;
-    }
-    return true;
-  }
-  static std::string TypeName() {
-    return "Map[" + ObjectTypeChecker<K>::TypeName() + ", " + ObjectTypeChecker<V>::TypeName() +
-           ']';
-  }
-};
-}  // namespace runtime
-}  // namespace tvm
-#endif  // TVM_NODE_CONTAINER_H_
diff --git a/include/tvm/node/node.h b/include/tvm/node/node.h
index 59295c2ce427..7b2a9f8061b4 100644
--- a/include/tvm/node/node.h
+++ b/include/tvm/node/node.h
@@ -34,7 +34,6 @@
 #ifndef TVM_NODE_NODE_H_
 #define TVM_NODE_NODE_H_
 
-#include <tvm/node/container.h>
 #include <tvm/node/reflection.h>
 #include <tvm/node/repr_printer.h>
 #include <tvm/node/structural_equal.h>
diff --git a/include/tvm/node/structural_equal.h b/include/tvm/node/structural_equal.h
index 9424f6dc30f2..d5309bca894d 100644
--- a/include/tvm/node/structural_equal.h
+++ b/include/tvm/node/structural_equal.h
@@ -23,8 +23,8 @@
 #ifndef TVM_NODE_STRUCTURAL_EQUAL_H_
 #define TVM_NODE_STRUCTURAL_EQUAL_H_
 
-#include <tvm/node/container.h>
 #include <tvm/node/functor.h>
+#include <tvm/runtime/container.h>
 #include <tvm/runtime/data_type.h>
 
 #include <string>
diff --git a/include/tvm/node/structural_hash.h b/include/tvm/node/structural_hash.h
index ed89d841cd65..a661a852780d 100644
--- a/include/tvm/node/structural_hash.h
+++ b/include/tvm/node/structural_hash.h
@@ -23,8 +23,8 @@
 #ifndef TVM_NODE_STRUCTURAL_HASH_H_
 #define TVM_NODE_STRUCTURAL_HASH_H_
 
-#include <tvm/node/container.h>
 #include <tvm/node/functor.h>
+#include <tvm/runtime/container.h>
 #include <tvm/runtime/data_type.h>
 
 #include <functional>
diff --git a/include/tvm/relay/feature.h b/include/tvm/relay/feature.h
index 7df881938f50..4a5de33af4b9 100644
--- a/include/tvm/relay/feature.h
+++ b/include/tvm/relay/feature.h
@@ -25,8 +25,8 @@
 #define TVM_RELAY_FEATURE_H_
 
 #include <tvm/ir/module.h>
-#include <tvm/node/container.h>
 #include <tvm/relay/expr.h>
+#include <tvm/runtime/container.h>
 
 #include <bitset>
 #include <string>
diff --git a/include/tvm/runtime/container.h b/include/tvm/runtime/container.h
index 796ab7b113c1..336fef21ab88 100644
--- a/include/tvm/runtime/container.h
+++ b/include/tvm/runtime/container.h
@@ -24,7 +24,12 @@
 #ifndef TVM_RUNTIME_CONTAINER_H_
 #define TVM_RUNTIME_CONTAINER_H_
 
+#ifndef USE_FALLBACK_STL_MAP
+#define USE_FALLBACK_STL_MAP 0
+#endif
+
 #include <dmlc/logging.h>
+#include <tvm/runtime/container.h>
 #include <tvm/runtime/memory.h>
 #include <tvm/runtime/object.h>
 
@@ -34,6 +39,7 @@
 #include <memory>
 #include <string>
 #include <unordered_map>
+#include <utility>
 // We use c++14 std::experimental::string_view for optimizing hash computation
 // only right now, its usage is limited in this file. Any broader usage of
 // std::experiment in our core codebase is discouraged and needs community
@@ -1688,11 +1694,1413 @@ class Closure : public ObjectRef {
   TVM_DEFINE_OBJECT_REF_METHODS(Closure, ObjectRef, ClosureObj);
 };
 
+#if (USE_FALLBACK_STL_MAP != 0)
+
+/*! \brief Shared content of all specializations of hash map */
+class MapNode : public Object {
+ public:
+  /*! \brief Type of the keys in the hash map */
+  using key_type = ObjectRef;
+  /*! \brief Type of the values in the hash map */
+  using mapped_type = ObjectRef;
+  /*! \brief Type of the actual underlying container */
+  using ContainerType = std::unordered_map<ObjectRef, ObjectRef, ObjectHash, ObjectEqual>;
+  /*! \brief Iterator class */
+  using iterator = ContainerType::iterator;
+  /*! \brief Iterator class */
+  using const_iterator = ContainerType::const_iterator;
+  /*! \brief Type of value stored in the hash map */
+  using KVType = ContainerType::value_type;
+
+  static_assert(std::is_standard_layout<KVType>::value, "KVType is not standard layout");
+  static_assert(sizeof(KVType) == 16 || sizeof(KVType) == 8, "sizeof(KVType) incorrect");
+
+  static constexpr const uint32_t _type_index = runtime::TypeIndex::kRuntimeMap;
+  static constexpr const char* _type_key = "Map";
+  TVM_DECLARE_FINAL_OBJECT_INFO(MapNode, Object);
+
+  /*!
+   * \brief Number of elements in the SmallMapNode
+   * \return The result
+   */
+  size_t size() const { return data_.size(); }
+  /*!
+   * \brief Count the number of times a key exists in the hash map
+   * \param key The indexing key
+   * \return The result, 0 or 1
+   */
+  size_t count(const key_type& key) const { return data_.count(key); }
+  /*!
+   * \brief Index value associated with a key, throw exception if the key does not exist
+   * \param key The indexing key
+   * \return The const reference to the value
+   */
+  const mapped_type& at(const key_type& key) const { return data_.at(key); }
+  /*!
+   * \brief Index value associated with a key, throw exception if the key does not exist
+   * \param key The indexing key
+   * \return The mutable reference to the value
+   */
+  mapped_type& at(const key_type& key) { return data_.at(key); }
+  /*! \return begin iterator */
+  iterator begin() { return data_.begin(); }
+  /*! \return const begin iterator */
+  const_iterator begin() const { return data_.begin(); }
+  /*! \return end iterator */
+  iterator end() { return data_.end(); }
+  /*! \return end iterator */
+  const_iterator end() const { return data_.end(); }
+  /*!
+   * \brief Index value associated with a key
+   * \param key The indexing key
+   * \return The iterator of the entry associated with the key, end iterator if not exists
+   */
+  const_iterator find(const key_type& key) const { return data_.find(key); }
+  /*!
+   * \brief Index value associated with a key
+   * \param key The indexing key
+   * \return The iterator of the entry associated with the key, end iterator if not exists
+   */
+  iterator find(const key_type& key) { return data_.find(key); }
+  /*!
+   * \brief Erase the entry associated with the iterator
+   * \param position The iterator
+   */
+  void erase(const iterator& position) { data_.erase(position); }
+  /*!
+   * \brief Erase the entry associated with the key, do nothing if not exists
+   * \param key The indexing key
+   */
+  void erase(const key_type& key) { data_.erase(key); }
+  /*!
+   * \brief Create an empty container
+   * \return The object created
+   */
+  static ObjectPtr<MapNode> Empty() { return make_object<MapNode>(); }
+
+ protected:
+  /*!
+   * \brief Create the map using contents from the given iterators.
+   * \param first Begin of iterator
+   * \param last End of iterator
+   * \tparam IterType The type of iterator
+   * \return ObjectPtr to the map created
+   */
+  template <typename IterType>
+  static ObjectPtr<Object> CreateFromRange(IterType first, IterType last) {
+    ObjectPtr<MapNode> p = make_object<MapNode>();
+    p->data_ = ContainerType(first, last);
+    return p;
+  }
+  /*!
+   * \brief InsertMaybeReHash an entry into the given hash map
+   * \param kv The entry to be inserted
+   * \param map The pointer to the map, can be changed if re-hashing happens
+   */
+  static void InsertMaybeReHash(const KVType& kv, ObjectPtr<Object>* map) {
+    MapNode* map_node = static_cast<MapNode*>(map->get());
+    map_node->data_[kv.first] = kv.second;
+  }
+  /*!
+   * \brief Create an empty container with elements copying from another MapNode
+   * \param from The source container
+   * \return The object created
+   */
+  static ObjectPtr<MapNode> CopyFrom(MapNode* from) {
+    ObjectPtr<MapNode> p = make_object<MapNode>();
+    p->data_ = ContainerType(from->data_.begin(), from->data_.end());
+    return p;
+  }
+  /*! \brief The real container storing data */
+  ContainerType data_;
+  template <typename, typename, typename, typename>
+  friend class Map;
+};
+
+#else
+
+/*! \brief Shared content of all specializations of hash map */
+class MapNode : public Object {
+ public:
+  /*! \brief Type of the keys in the hash map */
+  using key_type = ObjectRef;
+  /*! \brief Type of the values in the hash map */
+  using mapped_type = ObjectRef;
+  /*! \brief Type of value stored in the hash map */
+  using KVType = std::pair<ObjectRef, ObjectRef>;
+  /*! \brief Iterator class */
+  class iterator;
+
+  static_assert(std::is_standard_layout<KVType>::value, "KVType is not standard layout");
+  static_assert(sizeof(KVType) == 16 || sizeof(KVType) == 8, "sizeof(KVType) incorrect");
+
+  static constexpr const uint32_t _type_index = runtime::TypeIndex::kRuntimeMap;
+  static constexpr const char* _type_key = "Map";
+  TVM_DECLARE_FINAL_OBJECT_INFO(MapNode, Object);
+
+  /*!
+   * \brief Number of elements in the SmallMapNode
+   * \return The result
+   */
+  size_t size() const { return size_; }
+  /*!
+   * \brief Count the number of times a key exists in the hash map
+   * \param key The indexing key
+   * \return The result, 0 or 1
+   */
+  size_t count(const key_type& key) const;
+  /*!
+   * \brief Index value associated with a key, throw exception if the key does not exist
+   * \param key The indexing key
+   * \return The const reference to the value
+   */
+  const mapped_type& at(const key_type& key) const;
+  /*!
+   * \brief Index value associated with a key, throw exception if the key does not exist
+   * \param key The indexing key
+   * \return The mutable reference to the value
+   */
+  mapped_type& at(const key_type& key);
+  /*! \return begin iterator */
+  iterator begin() const;
+  /*! \return end iterator */
+  iterator end() const;
+  /*!
+   * \brief Index value associated with a key
+   * \param key The indexing key
+   * \return The iterator of the entry associated with the key, end iterator if not exists
+   */
+  iterator find(const key_type& key) const;
+  /*!
+   * \brief Erase the entry associated with the iterator
+   * \param position The iterator
+   */
+  void erase(const iterator& position);
+  /*!
+   * \brief Erase the entry associated with the key, do nothing if not exists
+   * \param key The indexing key
+   */
+  void erase(const key_type& key) { erase(find(key)); }
+
+  class iterator {
+   public:
+    using iterator_category = std::forward_iterator_tag;
+    using difference_type = int64_t;
+    using value_type = KVType;
+    using pointer = KVType*;
+    using reference = KVType&;
+    /*! \brief Default constructor */
+    iterator() : index(0), self(nullptr) {}
+    /*! \brief Compare iterators */
+    bool operator==(const iterator& other) const {
+      return index == other.index && self == other.self;
+    }
+    /*! \brief Compare iterators */
+    bool operator!=(const iterator& other) const { return !(*this == other); }
+    /*! \brief De-reference iterators */
+    pointer operator->() const;
+    /*! \brief De-reference iterators */
+    reference operator*() const { return *((*this).operator->()); }
+    /*! \brief Prefix self increment, e.g. ++iter */
+    iterator& operator++();
+    /*! \brief Prefix self decrement, e.g. --iter */
+    iterator& operator--();
+    /*! \brief Suffix self increment */
+    iterator operator++(int) {
+      iterator copy = *this;
+      ++(*this);
+      return copy;
+    }
+    /*! \brief Suffix self decrement */
+    iterator operator--(int) {
+      iterator copy = *this;
+      --(*this);
+      return copy;
+    }
+
+   protected:
+    /*! \brief Construct by value */
+    iterator(uint64_t index, const MapNode* self) : index(index), self(self) {}
+    /*! \brief The position on the array */
+    uint64_t index;
+    /*! \brief The container it points to */
+    const MapNode* self;
+
+    friend class DenseMapNode;
+    friend class SmallMapNode;
+  };
+  /*!
+   * \brief Create an empty container
+   * \return The object created
+   */
+  static inline ObjectPtr<MapNode> Empty();
+
+ protected:
+  /*!
+   * \brief Create the map using contents from the given iterators.
+   * \param first Begin of iterator
+   * \param last End of iterator
+   * \tparam IterType The type of iterator
+   * \return ObjectPtr to the map created
+   */
+  template <typename IterType>
+  static inline ObjectPtr<Object> CreateFromRange(IterType first, IterType last);
+  /*!
+   * \brief InsertMaybeReHash an entry into the given hash map
+   * \param kv The entry to be inserted
+   * \param map The pointer to the map, can be changed if re-hashing happens
+   */
+  static inline void InsertMaybeReHash(const KVType& kv, ObjectPtr<Object>* map);
+  /*!
+   * \brief Create an empty container with elements copying from another SmallMapNode
+   * \param from The source container
+   * \return The object created
+   */
+  static inline ObjectPtr<MapNode> CopyFrom(MapNode* from);
+  /*! \brief number of slots minus 1 */
+  uint64_t slots_;
+  /*! \brief number of entries in the container */
+  uint64_t size_;
+  // Reference class
+  template <typename, typename, typename, typename>
+  friend class Map;
+};
+
+/*! \brief A specialization of small-sized hash map */
+class SmallMapNode : public MapNode,
+                     public runtime::InplaceArrayBase<SmallMapNode, MapNode::KVType> {
+ private:
+  static constexpr uint64_t kInitSize = 2;
+  static constexpr uint64_t kMaxSize = 4;
+
+ public:
+  using MapNode::iterator;
+  using MapNode::KVType;
+
+  /*! \brief Defaults to the destructor of InplaceArrayBase */
+  ~SmallMapNode() = default;
+  /*!
+   * \brief Count the number of times a key exists in the SmallMapNode
+   * \param key The indexing key
+   * \return The result, 0 or 1
+   */
+  size_t count(const key_type& key) const { return find(key).index < size_; }
+  /*!
+   * \brief Index value associated with a key, throw exception if the key does not exist
+   * \param key The indexing key
+   * \return The const reference to the value
+   */
+  const mapped_type& at(const key_type& key) const {
+    iterator itr = find(key);
+    ICHECK(itr.index < size_) << "IndexError: key is not in Map";
+    return itr->second;
+  }
+  /*!
+   * \brief Index value associated with a key, throw exception if the key does not exist
+   * \param key The indexing key
+   * \return The mutable reference to the value
+   */
+  mapped_type& at(const key_type& key) {
+    iterator itr = find(key);
+    ICHECK(itr.index < size_) << "IndexError: key is not in Map";
+    return itr->second;
+  }
+  /*! \return begin iterator */
+  iterator begin() const { return iterator(0, this); }
+  /*! \return end iterator */
+  iterator end() const { return iterator(size_, this); }
+  /*!
+   * \brief Index value associated with a key
+   * \param key The indexing key
+   * \return The iterator of the entry associated with the key, end iterator if not exists
+   */
+  iterator find(const key_type& key) const {
+    KVType* ptr = static_cast<KVType*>(AddressOf(0));
+    for (uint64_t i = 0; i < size_; ++i, ++ptr) {
+      if (ObjectEqual()(ptr->first, key)) {
+        return iterator(i, this);
+      }
+    }
+    return iterator(size_, this);
+  }
+  /*!
+   * \brief Erase the entry associated with the iterator
+   * \param position The iterator
+   */
+  void erase(const iterator& position) { Erase(position.index); }
+
+ private:
+  /*!
+   * \brief Remove a position in SmallMapNode
+   * \param index The position to be removed
+   */
+  void Erase(const uint64_t index) {
+    if (index >= size_) {
+      return;
+    }
+    KVType* begin = static_cast<KVType*>(AddressOf(0));
+    KVType* last = begin + (size_ - 1);
+    if (index + 1 == size_) {
+      last->first.ObjectRef::~ObjectRef();
+      last->second.ObjectRef::~ObjectRef();
+    } else {
+      *(begin + index) = std::move(*last);
+    }
+    size_ -= 1;
+  }
+  /*!
+   * \brief Create an empty container
+   * \param n Number of empty slots
+   * \return The object created
+   */
+  static ObjectPtr<SmallMapNode> Empty(uint64_t n = kInitSize) {
+    using ::tvm::runtime::make_inplace_array_object;
+    ObjectPtr<SmallMapNode> p = make_inplace_array_object<SmallMapNode, KVType>(n);
+    p->size_ = 0;
+    p->slots_ = n;
+    return p;
+  }
+  /*!
+   * \brief Create an empty container initialized with a given range
+   * \param n Number of empty slots
+   * \param first begin of iterator
+   * \param last end of iterator
+   * \tparam IterType The type of iterator
+   * \return The object created
+   */
+  template <typename IterType>
+  static ObjectPtr<SmallMapNode> CreateFromRange(uint64_t n, IterType first, IterType last) {
+    ObjectPtr<SmallMapNode> p = Empty(n);
+    KVType* ptr = static_cast<KVType*>(p->AddressOf(0));
+    for (; first != last; ++first, ++p->size_) {
+      new (ptr++) KVType(*first);
+    }
+    return p;
+  }
+  /*!
+   * \brief Create an empty container with elements copying from another SmallMapNode
+   * \param from The source container
+   * \return The object created
+   */
+  static ObjectPtr<SmallMapNode> CopyFrom(SmallMapNode* from) {
+    KVType* first = static_cast<KVType*>(from->AddressOf(0));
+    KVType* last = first + from->size_;
+    return CreateFromRange(from->size_, first, last);
+  }
+  /*!
+   * \brief InsertMaybeReHash an entry into the given hash map
+   * \param kv The entry to be inserted
+   * \param map The pointer to the map, can be changed if re-hashing happens
+   */
+  static void InsertMaybeReHash(const KVType& kv, ObjectPtr<Object>* map) {
+    SmallMapNode* map_node = static_cast<SmallMapNode*>(map->get());
+    iterator itr = map_node->find(kv.first);
+    if (itr.index < map_node->size_) {
+      itr->second = kv.second;
+      return;
+    }
+    if (map_node->size_ < map_node->slots_) {
+      KVType* ptr = static_cast<KVType*>(map_node->AddressOf(map_node->size_));
+      new (ptr) KVType(kv);
+      ++map_node->size_;
+      return;
+    }
+    uint64_t next_size = std::max(map_node->slots_ * 2, uint64_t(kInitSize));
+    next_size = std::min(next_size, uint64_t(kMaxSize));
+    ICHECK_GT(next_size, map_node->slots_);
+    ObjectPtr<Object> new_map = CreateFromRange(next_size, map_node->begin(), map_node->end());
+    InsertMaybeReHash(kv, &new_map);
+    *map = std::move(new_map);
+  }
+  /*!
+   * \brief Increment the pointer
+   * \param index The pointer to be incremented
+   * \return The increased pointer
+   */
+  uint64_t IncItr(uint64_t index) const { return index + 1 < size_ ? index + 1 : size_; }
+  /*!
+   * \brief Decrement the pointer
+   * \param index The pointer to be decremented
+   * \return The decreased pointer
+   */
+  uint64_t DecItr(uint64_t index) const { return index > 0 ? index - 1 : size_; }
+  /*!
+   * \brief De-reference the pointer
+   * \param index The pointer to be dereferenced
+   * \return The result
+   */
+  KVType* DeRefItr(uint64_t index) const { return static_cast<KVType*>(AddressOf(index)); }
+  /*! \brief A size function used by InplaceArrayBase */
+  uint64_t GetSize() const { return size_; }
+
+ protected:
+  friend class MapNode;
+  friend class DenseMapNode;
+  friend class runtime::InplaceArrayBase<SmallMapNode, MapNode::KVType>;
+};
+
+/*! \brief A specialization of hash map that implements the idea of array-based hash map.
+ * Another reference implementation can be found [1].
+ *
+ * A. Overview
+ *
+ * DenseMapNode did several improvements over traditional separate chaining hash,
+ * in terms of cache locality, memory footprints and data organization.
+ *
+ * A1. Implicit linked list. For better cache locality, instead of using linked list
+ * explicitly for each bucket, we store list data into a single array that spans contiguously
+ * in memory, and then carefully design access patterns to make sure most of them fall into
+ * a single cache line.
+ *
+ * A2. 1-byte metadata. There is only 1 byte overhead for each slot in the array to indexing and
+ * traversal. This can be divided in 3 parts.
+ * 1) Reserved code: (0b11111111)_2 indicates a slot is empty; (0b11111110)_2 indicates protected,
+ * which means the slot is empty but not allowed to be written.
+ * 2) If not empty or protected, the highest bit is used to indicate whether data in the slot is
+ * head of a linked list.
+ * 3) The rest 7 bits are used as the "next pointer" (i.e. pointer to the next element). On 64-bit
+ * architecture, an ordinary pointer can take up to 8 bytes, which is not acceptable overhead when
+ * dealing with 16-byte ObjectRef pairs. Based on a commonly noticed fact that the lists are
+ * relatively short (length <= 3) in hash maps, we follow [1]'s idea that only allows the pointer to
+ * be one of the 126 possible values, i.e. if the next element of i-th slot is (i + x)-th element,
+ * then x must be one of the 126 pre-defined values.
+ *
+ * A3. Data blocking. We organize the array in the way that every 16 elements forms a data block.
+ * The 16-byte metadata of those 16 elements are stored together, followed by the real data, i.e.
+ * 16 key-value pairs.
+ *
+ * B. Implementation details
+ *
+ * B1. Power-of-2 table size and Fibonacci Hashing. We use power-of-two as table size to avoid
+ * modulo for more efficient arithmetics. To make the hash-to-slot mapping distribute more evenly,
+ * we use the Fibonacci Hashing [2] trick.
+ *
+ * B2. Traverse a linked list in the array.
+ * 1) List head. Assume Fibonacci Hashing maps a given key to slot i, if metadata at slot i
+ * indicates that it is list head, then we found the head; otherwise the list is empty. No probing
+ * is done in this procedure. 2) Next element. To find the next element of a non-empty slot i, we
+ * look at the last 7 bits of the metadata at slot i. If they are all zeros, then it is the end of
+ * list; otherwise, we know that the next element is (i + candidates[the-last-7-bits]).
+ *
+ * B3. InsertMaybeReHash an element. Following B2, we first traverse the linked list to see if this
+ * element is in the linked list, and if not, we put it at the end by probing the next empty
+ * position in one of the 126 candidate positions. If the linked list does not even exist, but the
+ * slot for list head has been occupied by another linked list, we should find this intruder another
+ * place.
+ *
+ * B4. Quadratic probing with triangle numbers. In open address hashing, it is provable that probing
+ * with triangle numbers can traverse power-of-2-sized table [3]. In our algorithm, we follow the
+ * suggestion in [1] that also use triangle numbers for "next pointer" as well as sparing for list
+ * head.
+ *
+ * [1] https://github.com/skarupke/flat_hash_map
+ * [2] https://programmingpraxis.com/2018/06/19/fibonacci-hash/
+ * [3] https://fgiesen.wordpress.com/2015/02/22/triangular-numbers-mod-2n/
+ */
+class DenseMapNode : public MapNode {
+ private:
+  /*! \brief The number of elements in a memory block */
+  static constexpr int kBlockCap = 16;
+  /*! \brief Maximum load factor of the hash map */
+  static constexpr double kMaxLoadFactor = 0.99;
+  /*! \brief Binary representation of the metadata of an empty slot */
+  static constexpr uint8_t kEmptySlot = uint8_t(0b11111111);
+  /*! \brief Binary representation of the metadata of a protected slot */
+  static constexpr uint8_t kProtectedSlot = uint8_t(0b11111110);
+  /*! \brief Number of probing choices available */
+  static constexpr int kNumJumpDists = 126;
+  /*! \brief Head of the implicit linked list */
+  struct ListNode;
+  /*! \brief POD type of a block of memory */
+  struct Block {
+    uint8_t bytes[kBlockCap + kBlockCap * sizeof(KVType)];
+  };
+  static_assert(sizeof(Block) == kBlockCap * (sizeof(KVType) + 1), "sizeof(Block) incorrect");
+  static_assert(std::is_standard_layout<Block>::value, "Block is not standard layout");
+
+ public:
+  using MapNode::iterator;
+
+  /*!
+   * \brief Destroy the DenseMapNode
+   */
+  ~DenseMapNode() { this->Reset(); }
+  /*! \return The number of elements of the key */
+  size_t count(const key_type& key) const { return !Search(key).IsNone(); }
+  /*!
+   * \brief Index value associated with a key, throw exception if the key does not exist
+   * \param key The indexing key
+   * \return The const reference to the value
+   */
+  const mapped_type& at(const key_type& key) const { return At(key); }
+  /*!
+   * \brief Index value associated with a key, throw exception if the key does not exist
+   * \param key The indexing key
+   * \return The mutable reference to the value
+   */
+  mapped_type& at(const key_type& key) { return At(key); }
+  /*!
+   * \brief Index value associated with a key
+   * \param key The indexing key
+   * \return The iterator of the entry associated with the key, end iterator if not exists
+   */
+  iterator find(const key_type& key) const {
+    ListNode node = Search(key);
+    return node.IsNone() ? end() : iterator(node.index, this);
+  }
+  /*!
+   * \brief Erase the entry associated with the iterator
+   * \param position The iterator
+   */
+  void erase(const iterator& position) {
+    uint64_t index = position.index;
+    if (position.self != nullptr && index <= this->slots_) {
+      Erase(ListNode(index, this));
+    }
+  }
+  /*! \return begin iterator */
+  iterator begin() const {
+    if (slots_ == 0) {
+      return iterator(0, this);
+    }
+    for (uint64_t index = 0; index <= slots_; ++index) {
+      if (!ListNode(index, this).IsEmpty()) {
+        return iterator(index, this);
+      }
+    }
+    return iterator(slots_ + 1, this);
+  }
+  /*! \return end iterator */
+  iterator end() const { return slots_ == 0 ? iterator(0, this) : iterator(slots_ + 1, this); }
+
+ private:
+  /*!
+   * \brief Search for the given key
+   * \param key The key
+   * \return ListNode that associated with the key
+   */
+  ListNode Search(const key_type& key) const {
+    if (this->size_ == 0) {
+      return ListNode();
+    }
+    for (ListNode iter = GetListHead(ObjectHash()(key)); !iter.IsNone(); iter.MoveToNext(this)) {
+      if (ObjectEqual()(key, iter.Key())) {
+        return iter;
+      }
+    }
+    return ListNode();
+  }
+  /*!
+   * \brief Search for the given key, throw exception if not exists
+   * \param key The key
+   * \return ListNode that associated with the key
+   */
+  mapped_type& At(const key_type& key) const {
+    ListNode iter = Search(key);
+    ICHECK(!iter.IsNone()) << "IndexError: key is not in Map";
+    return iter.Val();
+  }
+  /*!
+   * \brief Try to insert a key, or do nothing if already exists
+   * \param key The indexing key
+   * \param result The linked-list entry found or just constructed
+   * \return A boolean, indicating if actual insertion happens
+   */
+  bool TryInsert(const key_type& key, ListNode* result) {
+    if (slots_ == 0) {
+      return false;
+    }
+    // required that `iter` to be the head of a linked list through which we can iterator
+    ListNode iter = IndexFromHash(ObjectHash()(key));
+    // `iter` can be: 1) empty; 2) body of an irrelevant list; 3) head of the relevant list
+    // Case 1: empty
+    if (iter.IsEmpty()) {
+      iter.NewHead(KVType(key, ObjectRef(nullptr)));
+      this->size_ += 1;
+      *result = iter;
+      return true;
+    }
+    // Case 2: body of an irrelevant list
+    if (!iter.IsHead()) {
+      // we move the elements around and construct the single-element linked list
+      return IsFull() ? false : TrySpareListHead(iter, key, result);
+    }
+    // Case 3: head of the relevant list
+    // we iterate through the linked list until the end
+    // make sure `iter` is the previous element of `next`
+    ListNode next = iter;
+    do {
+      // find equal item, do not insert
+      if (ObjectEqual()(key, next.Key())) {
+        *result = next;
+        return true;
+      }
+      // make sure `iter` is the previous element of `next`
+      iter = next;
+    } while (next.MoveToNext(this));
+    // `iter` is the tail of the linked list
+    // always check capacity before insertion
+    if (IsFull()) {
+      return false;
+    }
+    // find the next empty slot
+    uint8_t jump;
+    if (!iter.GetNextEmpty(this, &jump, result)) {
+      return false;
+    }
+    result->NewTail(KVType(key, ObjectRef(nullptr)));
+    // link `iter` to `empty`, and move forward
+    iter.SetJump(jump);
+    this->size_ += 1;
+    return true;
+  }
+  /*!
+   * \brief Spare an entry to be the head of a linked list.
+   * As described in B3, during insertion, it is possible that the entire linked list does not
+   * exist, but the slot of its head has been occupied by other linked lists. In this case, we need
+   * to spare the slot by moving away the elements to another valid empty one to make insertion
+   * possible.
+   * \param target The given entry to be spared
+   * \param key The indexing key
+   * \param result The linked-list entry constructed as the head
+   * \return A boolean, if actual insertion happens
+   */
+  bool TrySpareListHead(ListNode target, const key_type& key, ListNode* result) {
+    // `target` is not the head of the linked list
+    // move the original item of `target` (if any)
+    // and construct new item on the position `target`
+    // To make `target` empty, we
+    // 1) find `w` the previous element of `target` in the linked list
+    // 2) copy the linked list starting from `r = target`
+    // 3) paste them after `w`
+    // read from the linked list after `r`
+    ListNode r = target;
+    // write to the tail of `w`
+    ListNode w = target.FindPrev(this);
+    // after `target` is moved, we disallow writing to the slot
+    bool is_first = true;
+    uint8_t r_meta, jump;
+    ListNode empty;
+    do {
+      // `jump` describes how `w` is jumped to `empty`
+      // rehash if there is no empty space after `w`
+      if (!w.GetNextEmpty(this, &jump, &empty)) {
+        return false;
+      }
+      // move `r` to `empty`
+      empty.NewTail(std::move(r.Data()));
+      // clear the metadata of `r`
+      r_meta = r.Meta();
+      if (is_first) {
+        is_first = false;
+        r.SetProtected();
+      } else {
+        r.SetEmpty();
+      }
+      // link `w` to `empty`, and move forward
+      w.SetJump(jump);
+      w = empty;
+      // move `r` forward as well
+    } while (r.MoveToNext(this, r_meta));
+    // finally we have done moving the linked list
+    // fill data_ into `target`
+    target.NewHead(KVType(key, ObjectRef(nullptr)));
+    this->size_ += 1;
+    *result = target;
+    return true;
+  }
+  /*!
+   * \brief Remove a ListNode
+   * \param iter The node to be removed
+   */
+  void Erase(const ListNode& iter) {
+    this->size_ -= 1;
+    if (!iter.HasNext()) {
+      // `iter` is the last
+      if (!iter.IsHead()) {
+        // cut the link if there is any
+        iter.FindPrev(this).SetJump(0);
+      }
+      iter.Data().KVType::~KVType();
+      iter.SetEmpty();
+    } else {
+      ListNode last = iter, prev = iter;
+      for (last.MoveToNext(this); last.HasNext(); prev = last, last.MoveToNext(this)) {
+      }
+      iter.Data() = std::move(last.Data());
+      last.SetEmpty();
+      prev.SetJump(0);
+    }
+  }
+  /*! \brief Clear the container to empty, release all entries and memory acquired */
+  void Reset() {
+    uint64_t n_blocks = CalcNumBlocks(this->slots_);
+    for (uint64_t bi = 0; bi < n_blocks; ++bi) {
+      uint8_t* meta_ptr = data_[bi].bytes;
+      KVType* data_ptr = reinterpret_cast<KVType*>(data_[bi].bytes + kBlockCap);
+      for (int j = 0; j < kBlockCap; ++j, ++meta_ptr, ++data_ptr) {
+        uint8_t& meta = *meta_ptr;
+        if (meta != uint8_t(kProtectedSlot) && meta != uint8_t(kEmptySlot)) {
+          meta = uint8_t(kEmptySlot);
+          data_ptr->KVType::~KVType();
+        }
+      }
+    }
+    ReleaseMemory();
+  }
+  /*! \brief Release the memory acquired by the container without deleting its entries stored inside
+   */
+  void ReleaseMemory() {
+    delete[] data_;
+    data_ = nullptr;
+    slots_ = 0;
+    size_ = 0;
+    fib_shift_ = 63;
+  }
+  /*!
+   * \brief Create an empty container
+   * \param fib_shift The fib shift provided
+   * \param n_slots Number of slots required, should be power-of-two
+   * \return The object created
+   */
+  static ObjectPtr<DenseMapNode> Empty(uint32_t fib_shift, uint64_t n_slots) {
+    ICHECK_GT(n_slots, uint64_t(SmallMapNode::kMaxSize));
+    ObjectPtr<DenseMapNode> p = make_object<DenseMapNode>();
+    uint64_t n_blocks = CalcNumBlocks(n_slots - 1);
+    Block* block = p->data_ = new Block[n_blocks];
+    p->slots_ = n_slots - 1;
+    p->size_ = 0;
+    p->fib_shift_ = fib_shift;
+    for (uint64_t i = 0; i < n_blocks; ++i, ++block) {
+      std::fill(block->bytes, block->bytes + kBlockCap, uint8_t(kEmptySlot));
+    }
+    return p;
+  }
+  /*!
+   * \brief Create an empty container with elements copying from another DenseMapNode
+   * \param from The source container
+   * \return The object created
+   */
+  static ObjectPtr<DenseMapNode> CopyFrom(DenseMapNode* from) {
+    ObjectPtr<DenseMapNode> p = make_object<DenseMapNode>();
+    uint64_t n_blocks = CalcNumBlocks(from->slots_);
+    p->data_ = new Block[n_blocks];
+    p->slots_ = from->slots_;
+    p->size_ = from->size_;
+    p->fib_shift_ = from->fib_shift_;
+    for (uint64_t bi = 0; bi < n_blocks; ++bi) {
+      uint8_t* meta_ptr_from = from->data_[bi].bytes;
+      KVType* data_ptr_from = reinterpret_cast<KVType*>(from->data_[bi].bytes + kBlockCap);
+      uint8_t* meta_ptr_to = p->data_[bi].bytes;
+      KVType* data_ptr_to = reinterpret_cast<KVType*>(p->data_[bi].bytes + kBlockCap);
+      for (int j = 0; j < kBlockCap;
+           ++j, ++meta_ptr_from, ++data_ptr_from, ++meta_ptr_to, ++data_ptr_to) {
+        uint8_t& meta = *meta_ptr_to = *meta_ptr_from;
+        ICHECK(meta != kProtectedSlot);
+        if (meta != uint8_t(kEmptySlot)) {
+          new (data_ptr_to) KVType(*data_ptr_from);
+        }
+      }
+    }
+    return p;
+  }
+  /*!
+   * \brief InsertMaybeReHash an entry into the given hash map
+   * \param kv The entry to be inserted
+   * \param map The pointer to the map, can be changed if re-hashing happens
+   */
+  static void InsertMaybeReHash(const KVType& kv, ObjectPtr<Object>* map) {
+    DenseMapNode* map_node = static_cast<DenseMapNode*>(map->get());
+    ListNode iter;
+    // Try to insert. If succeed, we simply return
+    if (map_node->TryInsert(kv.first, &iter)) {
+      iter.Val() = kv.second;
+      return;
+    }
+    ICHECK_GT(map_node->slots_, uint64_t(SmallMapNode::kMaxSize));
+    // Otherwise, start rehash
+    ObjectPtr<Object> p = Empty(map_node->fib_shift_ - 1, map_node->slots_ * 2 + 2);
+    // Insert the given `kv` into the new hash map
+    InsertMaybeReHash(kv, &p);
+    uint64_t n_blocks = CalcNumBlocks(map_node->slots_);
+    // Then Insert data from the original block.
+    for (uint64_t bi = 0; bi < n_blocks; ++bi) {
+      uint8_t* meta_ptr = map_node->data_[bi].bytes;
+      KVType* data_ptr = reinterpret_cast<KVType*>(map_node->data_[bi].bytes + kBlockCap);
+      for (int j = 0; j < kBlockCap; ++j, ++meta_ptr, ++data_ptr) {
+        uint8_t& meta = *meta_ptr;
+        if (meta != uint8_t(kProtectedSlot) && meta != uint8_t(kEmptySlot)) {
+          meta = uint8_t(kEmptySlot);
+          KVType kv = std::move(*data_ptr);
+          InsertMaybeReHash(kv, &p);
+        }
+      }
+    }
+    map_node->ReleaseMemory();
+    *map = p;
+  }
+  /*!
+   * \brief Check whether the hash table is full
+   * \return A boolean indicating whether hash table is full
+   */
+  bool IsFull() const { return size_ + 1 > (slots_ + 1) * kMaxLoadFactor; }
+  /*!
+   * \brief Increment the pointer
+   * \param index The pointer to be incremented
+   * \return The increased pointer
+   */
+  uint64_t IncItr(uint64_t index) const {
+    for (++index; index <= slots_; ++index) {
+      if (!ListNode(index, this).IsEmpty()) {
+        return index;
+      }
+    }
+    return slots_ + 1;
+  }
+  /*!
+   * \brief Decrement the pointer
+   * \param index The pointer to be decremented
+   * \return The decreased pointer
+   */
+  uint64_t DecItr(uint64_t index) const {
+    while (index != 0) {
+      index -= 1;
+      if (!ListNode(index, this).IsEmpty()) {
+        return index;
+      }
+    }
+    return slots_ + 1;
+  }
+  /*!
+   * \brief De-reference the pointer
+   * \param index The pointer to be dereferenced
+   * \return The result
+   */
+  KVType* DeRefItr(uint64_t index) const { return &ListNode(index, this).Data(); }
+  /*! \brief Construct from hash code */
+  ListNode IndexFromHash(uint64_t hash_value) const {
+    return ListNode(FibHash(hash_value, fib_shift_), this);
+  }
+  /*! \brief Construct from hash code if the position is head of list */
+  ListNode GetListHead(uint64_t hash_value) const {
+    ListNode node = IndexFromHash(hash_value);
+    return node.IsHead() ? node : ListNode();
+  }
+  /*! \brief Construct the number of blocks in the hash table */
+  static uint64_t CalcNumBlocks(uint64_t n_slots_m1) {
+    uint64_t n_slots = n_slots_m1 > 0 ? n_slots_m1 + 1 : 0;
+    return (n_slots + kBlockCap - 1) / kBlockCap;
+  }
+  /*!
+   * \brief Calculate the power-of-2 table size given the lower-bound of required capacity.
+   * \param cap The lower-bound of the required capacity
+   * \param fib_shift The result shift for Fibonacci Hashing
+   * \param n_slots The result number of slots
+   */
+  static void CalcTableSize(uint64_t cap, uint32_t* fib_shift, uint64_t* n_slots) {
+    uint32_t shift = 64;
+    uint64_t slots = 1;
+    for (uint64_t c = cap; c; c >>= 1) {
+      shift -= 1;
+      slots <<= 1;
+    }
+    ICHECK_GT(slots, cap);
+    if (slots < cap * 2) {
+      *fib_shift = shift - 1;
+      *n_slots = slots << 1;
+    } else {
+      *fib_shift = shift;
+      *n_slots = slots;
+    }
+  }
+  /*!
+   * \brief Fibonacci Hashing, maps a hash code to an index in a power-of-2-sized table.
+   * See also: https://programmingpraxis.com/2018/06/19/fibonacci-hash/.
+   * \param hash_value The raw hash value
+   * \param fib_shift The shift in Fibonacci Hashing
+   * \return An index calculated using Fibonacci Hashing
+   */
+  static uint64_t FibHash(uint64_t hash_value, uint32_t fib_shift) {
+    constexpr uint64_t coeff = 11400714819323198485ull;
+    return (coeff * hash_value) >> fib_shift;
+  }
+  /*! \brief The implicit in-place linked list used to index a chain */
+  struct ListNode {
+    /*! \brief Construct None */
+    ListNode() : index(0), block(nullptr) {}
+    /*! \brief Construct from position */
+    ListNode(uint64_t index, const DenseMapNode* self)
+        : index(index), block(self->data_ + (index / kBlockCap)) {}
+    /*! \brief Metadata on the entry */
+    uint8_t& Meta() const { return *(block->bytes + index % kBlockCap); }
+    /*! \brief Data on the entry */
+    KVType& Data() const {
+      return *(reinterpret_cast<KVType*>(block->bytes + kBlockCap +
+                                         (index % kBlockCap) * sizeof(KVType)));
+    }
+    /*! \brief Key on the entry */
+    key_type& Key() const { return Data().first; }
+    /*! \brief Value on the entry */
+    mapped_type& Val() const { return Data().second; }
+    /*! \brief If the entry is head of linked list */
+    bool IsHead() const { return (Meta() & 0b10000000) == 0b00000000; }
+    /*! \brief If the entry is none */
+    bool IsNone() const { return block == nullptr; }
+    /*! \brief If the entry is empty slot */
+    bool IsEmpty() const { return Meta() == uint8_t(kEmptySlot); }
+    /*! \brief If the entry is protected slot */
+    bool IsProtected() const { return Meta() == uint8_t(kProtectedSlot); }
+    /*! \brief Set the entry to be empty */
+    void SetEmpty() const { Meta() = uint8_t(kEmptySlot); }
+    /*! \brief Set the entry to be protected */
+    void SetProtected() const { Meta() = uint8_t(kProtectedSlot); }
+    /*! \brief Set the entry's jump to its next entry */
+    void SetJump(uint8_t jump) const { (Meta() &= 0b10000000) |= jump; }
+    /*! \brief Construct a head of linked list in-place */
+    void NewHead(KVType v) const {
+      Meta() = 0b00000000;
+      new (&Data()) KVType(std::move(v));
+    }
+    /*! \brief Construct a tail of linked list in-place */
+    void NewTail(KVType v) const {
+      Meta() = 0b10000000;
+      new (&Data()) KVType(std::move(v));
+    }
+    /*! \brief If the entry has next entry on the linked list */
+    bool HasNext() const { return kNextProbeLocation[Meta() & 0b01111111] != 0; }
+    /*! \brief Move the entry to the next entry on the linked list */
+    bool MoveToNext(const DenseMapNode* self, uint8_t meta) {
+      uint64_t offset = kNextProbeLocation[meta & 0b01111111];
+      if (offset == 0) {
+        index = 0;
+        block = nullptr;
+        return false;
+      }
+      index = (index + offset) & (self->slots_);
+      block = self->data_ + (index / kBlockCap);
+      return true;
+    }
+    /*! \brief Move the entry to the next entry on the linked list */
+    bool MoveToNext(const DenseMapNode* self) { return MoveToNext(self, Meta()); }
+    /*! \brief Get the previous entry on the linked list */
+    ListNode FindPrev(const DenseMapNode* self) const {
+      // start from the head of the linked list, which must exist
+      ListNode next = self->IndexFromHash(ObjectHash()(Key()));
+      // `prev` is always the previous item of `next`
+      ListNode prev = next;
+      for (next.MoveToNext(self); index != next.index; prev = next, next.MoveToNext(self)) {
+      }
+      return prev;
+    }
+    /*! \brief Get the next empty jump */
+    bool GetNextEmpty(const DenseMapNode* self, uint8_t* jump, ListNode* result) const {
+      for (uint8_t idx = 1; idx < kNumJumpDists; ++idx) {
+        ListNode candidate((index + kNextProbeLocation[idx]) & (self->slots_), self);
+        if (candidate.IsEmpty()) {
+          *jump = idx;
+          *result = candidate;
+          return true;
+        }
+      }
+      return false;
+    }
+    /*! \brief Index on the real array */
+    uint64_t index;
+    /*! \brief Pointer to the actual block */
+    Block* block;
+  };
+
+ protected:
+  /*! \brief fib shift in Fibonacci Hashing */
+  uint32_t fib_shift_;
+  /*! \brief array of data blocks */
+  Block* data_;
+  /* clang-format off */
+  /*! \brief Candidates of probing distance */
+  TVM_DLL static constexpr uint64_t kNextProbeLocation[kNumJumpDists] {
+    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+    // Quadratic probing with triangle numbers. See also:
+    // 1) https://en.wikipedia.org/wiki/Quadratic_probing
+    // 2) https://fgiesen.wordpress.com/2015/02/22/triangular-numbers-mod-2n/
+    // 3) https://github.com/skarupke/flat_hash_map
+    21, 28, 36, 45, 55, 66, 78, 91, 105, 120,
+    136, 153, 171, 190, 210, 231, 253, 276, 300, 325,
+    351, 378, 406, 435, 465, 496, 528, 561, 595, 630,
+    666, 703, 741, 780, 820, 861, 903, 946, 990, 1035,
+    1081, 1128, 1176, 1225, 1275, 1326, 1378, 1431, 1485, 1540,
+    1596, 1653, 1711, 1770, 1830, 1891, 1953, 2016, 2080, 2145,
+    2211, 2278, 2346, 2415, 2485, 2556, 2628,
+    // larger triangle numbers
+    8515, 19110, 42778, 96141, 216153,
+    486591, 1092981, 2458653, 5532801, 12442566,
+    27993903, 62983476, 141717030, 318844378, 717352503,
+    1614057336, 3631522476, 8170957530, 18384510628, 41364789378,
+    93070452520, 209408356380, 471168559170, 1060128894105, 2385289465695,
+    5366898840628, 12075518705635, 27169915244790, 61132312065111, 137547689707000,
+    309482283181501, 696335127828753, 1566753995631385, 3525196511162271, 7931691992677701,
+    17846306936293605, 40154190677507445, 90346928918121501, 203280589587557251, 457381325854679626,
+    1029107982097042876, 2315492959180353330, 5209859154120846435,
+  };
+  /* clang-format on */
+  friend class MapNode;
+};
+
+#define TVM_DISPATCH_MAP(base, var, body)     \
+  {                                           \
+    using TSmall = SmallMapNode*;             \
+    using TDense = DenseMapNode*;             \
+    uint64_t slots = base->slots_;            \
+    if (slots <= SmallMapNode::kMaxSize) {    \
+      TSmall var = static_cast<TSmall>(base); \
+      body;                                   \
+    } else {                                  \
+      TDense var = static_cast<TDense>(base); \
+      body;                                   \
+    }                                         \
+  }
+
+#define TVM_DISPATCH_MAP_CONST(base, var, body) \
+  {                                             \
+    using TSmall = const SmallMapNode*;         \
+    using TDense = const DenseMapNode*;         \
+    uint64_t slots = base->slots_;              \
+    if (slots <= SmallMapNode::kMaxSize) {      \
+      TSmall var = static_cast<TSmall>(base);   \
+      body;                                     \
+    } else {                                    \
+      TDense var = static_cast<TDense>(base);   \
+      body;                                     \
+    }                                           \
+  }
+
+inline MapNode::iterator::pointer MapNode::iterator::operator->() const {
+  TVM_DISPATCH_MAP_CONST(self, p, { return p->DeRefItr(index); });
+}
+
+inline MapNode::iterator& MapNode::iterator::operator++() {
+  TVM_DISPATCH_MAP_CONST(self, p, {
+    index = p->IncItr(index);
+    return *this;
+  });
+}
+
+inline MapNode::iterator& MapNode::iterator::operator--() {
+  TVM_DISPATCH_MAP_CONST(self, p, {
+    index = p->DecItr(index);
+    return *this;
+  });
+}
+
+inline size_t MapNode::count(const key_type& key) const {
+  TVM_DISPATCH_MAP_CONST(this, p, { return p->count(key); });
+}
+
+inline const MapNode::mapped_type& MapNode::at(const MapNode::key_type& key) const {
+  TVM_DISPATCH_MAP_CONST(this, p, { return p->at(key); });
+}
+
+inline MapNode::mapped_type& MapNode::at(const MapNode::key_type& key) {
+  TVM_DISPATCH_MAP(this, p, { return p->at(key); });
+}
+
+inline MapNode::iterator MapNode::begin() const {
+  TVM_DISPATCH_MAP_CONST(this, p, { return p->begin(); });
+}
+
+inline MapNode::iterator MapNode::end() const {
+  TVM_DISPATCH_MAP_CONST(this, p, { return p->end(); });
+}
+
+inline MapNode::iterator MapNode::find(const MapNode::key_type& key) const {
+  TVM_DISPATCH_MAP_CONST(this, p, { return p->find(key); });
+}
+
+inline void MapNode::erase(const MapNode::iterator& position) {
+  TVM_DISPATCH_MAP(this, p, { return p->erase(position); });
+}
+
+#undef TVM_DISPATCH_MAP
+#undef TVM_DISPATCH_MAP_CONST
+
+inline ObjectPtr<MapNode> MapNode::Empty() { return SmallMapNode::Empty(); }
+
+inline ObjectPtr<MapNode> MapNode::CopyFrom(MapNode* from) {
+  if (from->slots_ <= SmallMapNode::kMaxSize) {
+    return SmallMapNode::CopyFrom(static_cast<SmallMapNode*>(from));
+  } else {
+    return DenseMapNode::CopyFrom(static_cast<DenseMapNode*>(from));
+  }
+}
+
+template <typename IterType>
+inline ObjectPtr<Object> MapNode::CreateFromRange(IterType first, IterType last) {
+  int64_t _cap = std::distance(first, last);
+  if (_cap < 0) {
+    return SmallMapNode::Empty();
+  }
+  uint64_t cap = static_cast<uint64_t>(_cap);
+  if (cap < SmallMapNode::kMaxSize) {
+    return SmallMapNode::CreateFromRange(cap, first, last);
+  }
+  uint32_t fib_shift;
+  uint64_t n_slots;
+  DenseMapNode::CalcTableSize(cap, &fib_shift, &n_slots);
+  ObjectPtr<Object> obj = DenseMapNode::Empty(fib_shift, n_slots);
+  for (; first != last; ++first) {
+    KVType kv(*first);
+    DenseMapNode::InsertMaybeReHash(kv, &obj);
+  }
+  return obj;
+}
+
+inline void MapNode::InsertMaybeReHash(const KVType& kv, ObjectPtr<Object>* map) {
+  constexpr uint64_t kSmallMapMaxSize = SmallMapNode::kMaxSize;
+  MapNode* base = static_cast<MapNode*>(map->get());
+  if (base->slots_ < kSmallMapMaxSize) {
+    SmallMapNode::InsertMaybeReHash(kv, map);
+  } else if (base->slots_ == kSmallMapMaxSize) {
+    if (base->size_ < base->slots_) {
+      SmallMapNode::InsertMaybeReHash(kv, map);
+    } else {
+      ObjectPtr<Object> new_map = MapNode::CreateFromRange(base->begin(), base->end());
+      DenseMapNode::InsertMaybeReHash(kv, &new_map);
+      *map = std::move(new_map);
+    }
+  } else {
+    DenseMapNode::InsertMaybeReHash(kv, map);
+  }
+}
+
+template <>
+inline ObjectPtr<MapNode> make_object<>() = delete;
+
+#endif
+
+/*!
+ * \brief Map container of NodeRef->NodeRef in DSL graph.
+ *  Map implements copy on write semantics, which means map is mutable
+ *  but copy will happen when array is referenced in more than two places.
+ *
+ * operator[] only provide const acces, use Set to mutate the content.
+ * \tparam K The key NodeRef type.
+ * \tparam V The value NodeRef type.
+ */
+template <typename K, typename V,
+          typename = typename std::enable_if<std::is_base_of<ObjectRef, K>::value>::type,
+          typename = typename std::enable_if<std::is_base_of<ObjectRef, V>::value>::type>
+class Map : public ObjectRef {
+ public:
+  using key_type = K;
+  using mapped_type = V;
+  class iterator;
+  /*!
+   * \brief default constructor
+   */
+  Map() { data_ = MapNode::Empty(); }
+  /*!
+   * \brief move constructor
+   * \param other source
+   */
+  Map(Map<K, V>&& other) { data_ = std::move(other.data_); }
+  /*!
+   * \brief copy constructor
+   * \param other source
+   */
+  Map(const Map<K, V>& other) : ObjectRef(other.data_) {}
+  /*!
+   * \brief copy assign operator
+   * \param other The source of assignment
+   * \return reference to self.
+   */
+  Map<K, V>& operator=(Map<K, V>&& other) {
+    data_ = std::move(other.data_);
+    return *this;
+  }
+  /*!
+   * \brief move assign operator
+   * \param other The source of assignment
+   * \return reference to self.
+   */
+  Map<K, V>& operator=(const Map<K, V>& other) {
+    data_ = other.data_;
+    return *this;
+  }
+  /*!
+   * \brief constructor from pointer
+   * \param n the container pointer
+   */
+  explicit Map(ObjectPtr<Object> n) : ObjectRef(n) {}
+  /*!
+   * \brief constructor from iterator
+   * \param begin begin of iterator
+   * \param end end of iterator
+   * \tparam IterType The type of iterator
+   */
+  template <typename IterType>
+  Map(IterType begin, IterType end) {
+    data_ = MapNode::CreateFromRange(begin, end);
+  }
+  /*!
+   * \brief constructor from initializer list
+   * \param init The initalizer list
+   */
+  Map(std::initializer_list<std::pair<K, V>> init) {
+    data_ = MapNode::CreateFromRange(init.begin(), init.end());
+  }
+  /*!
+   * \brief constructor from unordered_map
+   * \param init The unordered_map
+   */
+  template <typename Hash, typename Equal>
+  Map(const std::unordered_map<K, V, Hash, Equal>& init) {  // NOLINT(*)
+    data_ = MapNode::CreateFromRange(init.begin(), init.end());
+  }
+  /*!
+   * \brief Read element from map.
+   * \param key The key
+   * \return the corresonding element.
+   */
+  const V at(const K& key) const { return DowncastNoCheck<V>(GetMapNode()->at(key)); }
+  /*!
+   * \brief Read element from map.
+   * \param key The key
+   * \return the corresonding element.
+   */
+  const V operator[](const K& key) const { return this->at(key); }
+  /*! \return The size of the array */
+  size_t size() const {
+    MapNode* n = GetMapNode();
+    return n == nullptr ? 0 : n->size();
+  }
+  /*! \return The number of elements of the key */
+  size_t count(const K& key) const {
+    MapNode* n = GetMapNode();
+    return n == nullptr ? 0 : GetMapNode()->count(key);
+  }
+  /*! \return whether array is empty */
+  bool empty() const { return size() == 0; }
+  /*!
+   * \brief set the Map.
+   * \param key The index key.
+   * \param value The value to be setted.
+   */
+  void Set(const K& key, const V& value) {
+    CopyOnWrite();
+    MapNode::InsertMaybeReHash(MapNode::KVType(key, value), &data_);
+  }
+  /*! \return begin iterator */
+  iterator begin() const { return iterator(GetMapNode()->begin()); }
+  /*! \return end iterator */
+  iterator end() const { return iterator(GetMapNode()->end()); }
+  /*! \return find the key and returns the associated iterator */
+  iterator find(const K& key) const { return iterator(GetMapNode()->find(key)); }
+
+  void erase(const K& key) { CopyOnWrite()->erase(key); }
+
+  /*!
+   * \brief copy on write semantics
+   *  Do nothing if current handle is the unique copy of the array.
+   *  Otherwise make a new copy of the array to ensure the current handle
+   *  hold a unique copy.
+   *
+   * \return Handle to the internal node container(which ganrantees to be unique)
+   */
+  MapNode* CopyOnWrite() {
+    if (data_.get() == nullptr) {
+      data_ = MapNode::Empty();
+    } else if (!data_.unique()) {
+      data_ = MapNode::CopyFrom(GetMapNode());
+    }
+    return GetMapNode();
+  }
+  /*! \brief specify container node */
+  using ContainerType = MapNode;
+
+  /*! \brief Iterator of the hash map */
+  class iterator {
+   public:
+    using iterator_category = std::bidirectional_iterator_tag;
+    using difference_type = int64_t;
+    using value_type = const std::pair<K, V>;
+    using pointer = value_type*;
+    using reference = value_type;
+
+    iterator() : itr() {}
+
+    /*! \brief Compare iterators */
+    bool operator==(const iterator& other) const { return itr == other.itr; }
+    /*! \brief Compare iterators */
+    bool operator!=(const iterator& other) const { return itr != other.itr; }
+    /*! \brief De-reference iterators is not allowed */
+    pointer operator->() const = delete;
+    /*! \brief De-reference iterators */
+    reference operator*() const {
+      auto& kv = *itr;
+      return std::make_pair(DowncastNoCheck<K>(kv.first), DowncastNoCheck<V>(kv.second));
+    }
+    /*! \brief Prefix self increment, e.g. ++iter */
+    iterator& operator++() {
+      ++itr;
+      return *this;
+    }
+    /*! \brief Suffix self increment */
+    iterator operator++(int) {
+      iterator copy = *this;
+      ++(*this);
+      return copy;
+    }
+
+   private:
+    iterator(const MapNode::iterator& itr)  // NOLINT(*)
+        : itr(itr) {}
+
+    template <typename, typename, typename, typename>
+    friend class Map;
+
+    MapNode::iterator itr;
+  };
+
+ private:
+  /*! \brief Return data_ as type of pointer of MapNode */
+  MapNode* GetMapNode() const { return static_cast<MapNode*>(data_.get()); }
+};
+
+/*!
+ * \brief Merge two Maps.
+ * \param lhs the first Map to merge.
+ * \param rhs the second Map to merge.
+ * @return The merged Array. Original Maps are kept unchanged.
+ */
+template <typename K, typename V,
+          typename = typename std::enable_if<std::is_base_of<ObjectRef, K>::value>::type,
+          typename = typename std::enable_if<std::is_base_of<ObjectRef, V>::value>::type>
+inline Map<K, V> Merge(Map<K, V> lhs, const Map<K, V>& rhs) {
+  for (const auto& p : rhs) {
+    lhs.Set(p.first, p.second);
+  }
+  return std::move(lhs);
+}
+
 }  // namespace runtime
 
 // expose the functions to the root namespace.
+using runtime::Array;
+using runtime::ArrayNode;
+using runtime::Downcast;
+using runtime::IterAdapter;
+using runtime::make_object;
+using runtime::Map;
+using runtime::MapNode;
+using runtime::Object;
+using runtime::ObjectEqual;
+using runtime::ObjectHash;
+using runtime::ObjectPtr;
+using runtime::ObjectPtrEqual;
+using runtime::ObjectPtrHash;
+using runtime::ObjectRef;
 using runtime::Optional;
 using runtime::String;
+using runtime::StringObj;
 constexpr runtime::NullOptType NullOpt{};
 }  // namespace tvm
 
diff --git a/include/tvm/runtime/packed_func.h b/include/tvm/runtime/packed_func.h
index cf30923aacb0..751a435c734a 100644
--- a/include/tvm/runtime/packed_func.h
+++ b/include/tvm/runtime/packed_func.h
@@ -450,6 +450,40 @@ struct ObjectTypeChecker<Array<T>> {
   }
   static std::string TypeName() { return "Array[" + ObjectTypeChecker<T>::TypeName() + "]"; }
 };
+template <typename K, typename V>
+struct ObjectTypeChecker<Map<K, V>> {
+  static Optional<String> CheckAndGetMismatch(const Object* ptr) {
+    if (ptr == nullptr) return NullOpt;
+    if (!ptr->IsInstance<MapNode>()) return String(ptr->GetTypeKey());
+    const MapNode* n = static_cast<const MapNode*>(ptr);
+    for (const auto& kv : *n) {
+      Optional<String> key_type = ObjectTypeChecker<K>::CheckAndGetMismatch(kv.first.get());
+      Optional<String> value_type = ObjectTypeChecker<K>::CheckAndGetMismatch(kv.first.get());
+      if (key_type.defined() || value_type.defined()) {
+        std::string key_name =
+            key_type.defined() ? std::string(key_type.value()) : ObjectTypeChecker<K>::TypeName();
+        std::string value_name = value_type.defined() ? std::string(value_type.value())
+                                                      : ObjectTypeChecker<V>::TypeName();
+        return String("Map[" + key_name + ", " + value_name + "]");
+      }
+    }
+    return NullOpt;
+  }
+  static bool Check(const Object* ptr) {
+    if (ptr == nullptr) return true;
+    if (!ptr->IsInstance<MapNode>()) return false;
+    const MapNode* n = static_cast<const MapNode*>(ptr);
+    for (const auto& kv : *n) {
+      if (!ObjectTypeChecker<K>::Check(kv.first.get())) return false;
+      if (!ObjectTypeChecker<V>::Check(kv.second.get())) return false;
+    }
+    return true;
+  }
+  static std::string TypeName() {
+    return "Map[" + ObjectTypeChecker<K>::TypeName() + ", " + ObjectTypeChecker<V>::TypeName() +
+           ']';
+  }
+};
 
 /*!
  * \brief Internal base class to
diff --git a/include/tvm/te/tensor.h b/include/tvm/te/tensor.h
index 2f9fa2f534c5..401ba102c2f4 100644
--- a/include/tvm/te/tensor.h
+++ b/include/tvm/te/tensor.h
@@ -25,7 +25,7 @@
 #define TVM_TE_TENSOR_H_
 
 #include <tvm/arith/bound.h>
-#include <tvm/node/container.h>
+#include <tvm/runtime/container.h>
 #include <tvm/tir/expr.h>
 #include <tvm/tir/op.h>
 
diff --git a/include/tvm/tir/buffer.h b/include/tvm/tir/buffer.h
index 839e7c1b7c1c..83f228da9475 100644
--- a/include/tvm/tir/buffer.h
+++ b/include/tvm/tir/buffer.h
@@ -25,7 +25,7 @@
 #define TVM_TIR_BUFFER_H_
 
 #include <tvm/ir/expr.h>
-#include <tvm/node/container.h>
+#include <tvm/runtime/container.h>
 #include <tvm/tir/var.h>
 
 #include <string>
diff --git a/include/tvm/tir/expr.h b/include/tvm/tir/expr.h
index c7ff9e19014c..7cab1970f478 100644
--- a/include/tvm/tir/expr.h
+++ b/include/tvm/tir/expr.h
@@ -26,10 +26,10 @@
 #define TVM_TIR_EXPR_H_
 
 #include <tvm/ir/expr.h>
-#include <tvm/node/container.h>
 #include <tvm/node/functor.h>
 #include <tvm/node/node.h>
 #include <tvm/runtime/c_runtime_api.h>
+#include <tvm/runtime/container.h>
 #include <tvm/runtime/data_type.h>
 #include <tvm/tir/buffer.h>
 #include <tvm/tir/var.h>
diff --git a/include/tvm/tir/stmt_functor.h b/include/tvm/tir/stmt_functor.h
index ceebbbb305ce..d6303ae266e1 100644
--- a/include/tvm/tir/stmt_functor.h
+++ b/include/tvm/tir/stmt_functor.h
@@ -26,8 +26,8 @@
 #ifndef TVM_TIR_STMT_FUNCTOR_H_
 #define TVM_TIR_STMT_FUNCTOR_H_
 
-#include <tvm/node/container.h>
 #include <tvm/node/functor.h>
+#include <tvm/runtime/container.h>
 #include <tvm/tir/expr.h>
 #include <tvm/tir/expr_functor.h>
 #include <tvm/tir/stmt.h>
diff --git a/python/tvm/ir/container.py b/python/tvm/ir/container.py
index a87d67992953..5222f7a97a7c 100644
--- a/python/tvm/ir/container.py
+++ b/python/tvm/ir/container.py
@@ -19,7 +19,7 @@
 
 from tvm.runtime import Object
 from tvm.runtime.container import getitem_helper
-from tvm.runtime import _ffi_node_api
+from tvm.runtime import _ffi_api
 
 
 @tvm._ffi.register_object("Array")
@@ -33,10 +33,10 @@ class Array(Object):
     """
 
     def __getitem__(self, idx):
-        return getitem_helper(self, _ffi_node_api.ArrayGetItem, len(self), idx)
+        return getitem_helper(self, _ffi_api.ArrayGetItem, len(self), idx)
 
     def __len__(self):
-        return _ffi_node_api.ArraySize(self)
+        return _ffi_api.ArraySize(self)
 
 
 @tvm._ffi.register_object
@@ -49,18 +49,18 @@ class Map(Object):
     """
 
     def __getitem__(self, k):
-        return _ffi_node_api.MapGetItem(self, k)
+        return _ffi_api.MapGetItem(self, k)
 
     def __contains__(self, k):
-        return _ffi_node_api.MapCount(self, k) != 0
+        return _ffi_api.MapCount(self, k) != 0
 
     def items(self):
         """Get the items from the map"""
-        akvs = _ffi_node_api.MapItems(self)
+        akvs = _ffi_api.MapItems(self)
         return [(akvs[i], akvs[i + 1]) for i in range(0, len(akvs), 2)]
 
     def __len__(self):
-        return _ffi_node_api.MapSize(self)
+        return _ffi_api.MapSize(self)
 
     def get(self, key, default=None):
         """Get an element with a default value.
diff --git a/python/tvm/runtime/object_generic.py b/python/tvm/runtime/object_generic.py
index 4aa83c17d178..974523d1eb1a 100644
--- a/python/tvm/runtime/object_generic.py
+++ b/python/tvm/runtime/object_generic.py
@@ -64,7 +64,7 @@ def convert_to_object(value, span=None):
         return _ffi_api.String(value)
     if isinstance(value, (list, tuple)):
         value = [convert_to_object(x) for x in value]
-        return _ffi_node_api.Array(*value)
+        return _ffi_api.Array(*value)
     if isinstance(value, dict):
         vlist = []
         for item in value.items():
@@ -72,7 +72,7 @@ def convert_to_object(value, span=None):
                 raise ValueError("key of map must already been a container type")
             vlist.append(item[0])
             vlist.append(convert_to_object(item[1]))
-        return _ffi_node_api.Map(*vlist)
+        return _ffi_api.Map(*vlist)
     if isinstance(value, ObjectGeneric):
         return value.asobject()
     if value is None:
diff --git a/rust/tvm-rt/src/array.rs b/rust/tvm-rt/src/array.rs
index 5abf66708f45..e8902b54f6ef 100644
--- a/rust/tvm-rt/src/array.rs
+++ b/rust/tvm-rt/src/array.rs
@@ -39,9 +39,9 @@ pub struct Array<T: IsObjectRef> {
 // TODO(@jroesch): convert to use generics instead of casting inside
 // the implementation.
 external! {
-    #[name("node.ArrayGetItem")]
+    #[name("runtime.ArrayGetItem")]
     fn array_get_item(array: ObjectRef, index: isize) -> ObjectRef;
-    #[name("node.ArraySize")]
+    #[name("runtime.ArraySize")]
     fn array_size(array: ObjectRef) -> i64;
 }
 
@@ -69,8 +69,8 @@ impl<T: IsObjectRef> Array<T> {
     pub fn from_vec(data: Vec<T>) -> Result<Array<T>> {
         let iter = data.into_iter().map(T::into_arg_value).collect();
 
-        let func = Function::get("node.Array").expect(
-            "node.Array function is not registered, this is most likely a build or linking error",
+        let func = Function::get("runtime.Array").expect(
+            "runtime.Array function is not registered, this is most likely a build or linking error",
         );
 
         // let array_data = func.invoke(iter)?;
diff --git a/rust/tvm-rt/src/map.rs b/rust/tvm-rt/src/map.rs
index b8bfb4e5e644..5ea48893d86b 100644
--- a/rust/tvm-rt/src/map.rs
+++ b/rust/tvm-rt/src/map.rs
@@ -48,13 +48,13 @@ where
 // TODO(@jroesch): convert to use generics instead of casting inside
 // the implementation.
 external! {
-   #[name("node.MapSize")]
+   #[name("runtime.MapSize")]
    fn map_size(map: ObjectRef) -> i64;
-   #[name("node.MapGetItem")]
+   #[name("runtime.MapGetItem")]
    fn map_get_item(map_object: ObjectRef, key: ObjectRef) -> ObjectRef;
-   #[name("node.MapCount")]
+   #[name("runtime.MapCount")]
    fn map_count(map: ObjectRef, key: ObjectRef) -> ObjectRef;
-   #[name("node.MapItems")]
+   #[name("runtime.MapItems")]
    fn map_items(map: ObjectRef) -> Array<ObjectRef>;
 }
 
@@ -81,8 +81,8 @@ where
     V: IsObjectRef,
 {
     pub fn from_data(data: Vec<ArgValue>) -> Result<Map<K, V>> {
-        let func = Function::get("node.Map").expect(
-            "node.Map function is not registered, this is most likely a build or linking error",
+        let func = Function::get("runtime.Map").expect(
+            "runtime.Map function is not registered, this is most likely a build or linking error",
         );
 
         let map_data: ObjectPtr<Object> = func.invoke(data)?.try_into()?;
diff --git a/rust/tvm/src/ir/diagnostics/mod.rs b/rust/tvm/src/ir/diagnostics/mod.rs
index 8bcdf8f51e60..182ffd4d9081 100644
--- a/rust/tvm/src/ir/diagnostics/mod.rs
+++ b/rust/tvm/src/ir/diagnostics/mod.rs
@@ -35,7 +35,7 @@ use tvm_macros::{external, Object};
 pub mod codespan;
 
 external! {
-    #[name("node.ArrayGetItem")]
+    #[name("runtime.ArrayGetItem")]
     fn get_renderer() -> DiagnosticRenderer;
 
     #[name("diagnostics.DiagnosticRenderer")]
diff --git a/src/node/container.cc b/src/node/container.cc
deleted file mode 100644
index b72d5a4cd736..000000000000
--- a/src/node/container.cc
+++ /dev/null
@@ -1,363 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-/*!
- *  Expose container API to frontend.
- * \file src/node/container.cc
- */
-#include <tvm/node/container.h>
-#include <tvm/runtime/container.h>
-#include <tvm/runtime/registry.h>
-#include <tvm/tir/expr.h>
-
-#include "../support/str_escape.h"
-
-namespace tvm {
-
-// SEQualReduce traits for runtime containers.
-struct StringObjTrait {
-  static constexpr const std::nullptr_t VisitAttrs = nullptr;
-
-  static void SHashReduce(const runtime::StringObj* key, SHashReducer hash_reduce) {
-    hash_reduce->SHashReduceHashedValue(runtime::String::HashBytes(key->data, key->size));
-  }
-
-  static bool SEqualReduce(const runtime::StringObj* lhs, const runtime::StringObj* rhs,
-                           SEqualReducer equal) {
-    if (lhs == rhs) return true;
-    if (lhs->size != rhs->size) return false;
-    if (lhs->data == rhs->data) return true;
-    return std::memcmp(lhs->data, rhs->data, lhs->size) == 0;
-  }
-};
-
-struct RefToObjectPtr : public ObjectRef {
-  static ObjectPtr<Object> Get(const ObjectRef& ref) { return GetDataPtr<Object>(ref); }
-};
-
-TVM_REGISTER_REFLECTION_VTABLE(runtime::StringObj, StringObjTrait)
-    .set_creator([](const std::string& bytes) {
-      return RefToObjectPtr::Get(runtime::String(bytes));
-    })
-    .set_repr_bytes([](const Object* n) -> std::string {
-      return GetRef<runtime::String>(static_cast<const runtime::StringObj*>(n))
-          .
-          operator std::string();
-    });
-
-TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable)
-    .set_dispatch<runtime::StringObj>([](const ObjectRef& node, ReprPrinter* p) {
-      auto* op = static_cast<const runtime::StringObj*>(node.get());
-      p->stream << '"' << support::StrEscape(op->data, op->size) << '"';
-    });
-
-struct ADTObjTrait {
-  static constexpr const std::nullptr_t VisitAttrs = nullptr;
-
-  static void SHashReduce(const runtime::ADTObj* key, SHashReducer hash_reduce) {
-    hash_reduce(key->tag);
-    hash_reduce(static_cast<uint64_t>(key->size));
-    for (uint32_t i = 0; i < key->size; ++i) {
-      hash_reduce((*key)[i]);
-    }
-  }
-
-  static bool SEqualReduce(const runtime::ADTObj* lhs, const runtime::ADTObj* rhs,
-                           SEqualReducer equal) {
-    if (lhs == rhs) return true;
-    if (lhs->tag != rhs->tag) return false;
-    if (lhs->size != rhs->size) return false;
-
-    for (uint32_t i = 0; i < lhs->size; ++i) {
-      if (!equal((*lhs)[i], (*rhs)[i])) return false;
-    }
-    return true;
-  }
-};
-
-TVM_REGISTER_REFLECTION_VTABLE(runtime::ADTObj, ADTObjTrait);
-
-struct NDArrayContainerTrait {
-  static constexpr const std::nullptr_t VisitAttrs = nullptr;
-
-  static void SHashReduce(const runtime::NDArray::Container* key, SHashReducer hash_reduce) {
-    ICHECK_EQ(key->dl_tensor.ctx.device_type, kDLCPU) << "can only compare CPU tensor";
-    ICHECK(runtime::IsContiguous(key->dl_tensor)) << "Can only hash contiguous tensor";
-    hash_reduce(runtime::DataType(key->dl_tensor.dtype));
-    hash_reduce(key->dl_tensor.ndim);
-    for (int i = 0; i < key->dl_tensor.ndim; ++i) {
-      hash_reduce(key->dl_tensor.shape[i]);
-    }
-    hash_reduce->SHashReduceHashedValue(runtime::String::HashBytes(
-        static_cast<const char*>(key->dl_tensor.data), runtime::GetDataSize(key->dl_tensor)));
-  }
-
-  static bool SEqualReduce(const runtime::NDArray::Container* lhs,
-                           const runtime::NDArray::Container* rhs, SEqualReducer equal) {
-    if (lhs == rhs) return true;
-
-    auto ldt = lhs->dl_tensor.dtype;
-    auto rdt = rhs->dl_tensor.dtype;
-    ICHECK_EQ(lhs->dl_tensor.ctx.device_type, kDLCPU) << "can only compare CPU tensor";
-    ICHECK_EQ(rhs->dl_tensor.ctx.device_type, kDLCPU) << "can only compare CPU tensor";
-    ICHECK(runtime::IsContiguous(lhs->dl_tensor)) << "Can only compare contiguous tensor";
-    ICHECK(runtime::IsContiguous(rhs->dl_tensor)) << "Can only compare contiguous tensor";
-
-    if (lhs->dl_tensor.ndim != rhs->dl_tensor.ndim) return false;
-    for (int i = 0; i < lhs->dl_tensor.ndim; ++i) {
-      if (!equal(lhs->dl_tensor.shape[i], rhs->dl_tensor.shape[i])) return false;
-    }
-    if (ldt.code == rdt.code && ldt.lanes == rdt.lanes && ldt.bits == rdt.bits) {
-      size_t data_size = runtime::GetDataSize(lhs->dl_tensor);
-      return std::memcmp(lhs->dl_tensor.data, rhs->dl_tensor.data, data_size) == 0;
-    } else {
-      return false;
-    }
-  }
-};
-
-TVM_REGISTER_REFLECTION_VTABLE(runtime::NDArray::Container, NDArrayContainerTrait);
-
-struct ArrayNodeTrait {
-  static constexpr const std::nullptr_t VisitAttrs = nullptr;
-
-  static void SHashReduce(const ArrayNode* key, SHashReducer hash_reduce) {
-    hash_reduce(static_cast<uint64_t>(key->size()));
-    for (size_t i = 0; i < key->size(); ++i) {
-      hash_reduce(key->at(i));
-    }
-  }
-
-  static bool SEqualReduce(const ArrayNode* lhs, const ArrayNode* rhs, SEqualReducer equal) {
-    if (lhs->size() != rhs->size()) return false;
-    for (size_t i = 0; i < lhs->size(); ++i) {
-      if (!equal(lhs->at(i), rhs->at(i))) return false;
-    }
-    return true;
-  }
-};
-
-TVM_REGISTER_OBJECT_TYPE(ArrayNode);
-TVM_REGISTER_REFLECTION_VTABLE(ArrayNode, ArrayNodeTrait)
-    .set_creator([](const std::string&) -> ObjectPtr<Object> {
-      return ::tvm::runtime::make_object<ArrayNode>();
-    });
-
-TVM_REGISTER_GLOBAL("node.Array").set_body([](TVMArgs args, TVMRetValue* ret) {
-  std::vector<ObjectRef> data;
-  for (int i = 0; i < args.size(); ++i) {
-    if (args[i].type_code() != kTVMNullptr) {
-      data.push_back(args[i].operator ObjectRef());
-    } else {
-      data.push_back(ObjectRef(nullptr));
-    }
-  }
-  *ret = Array<ObjectRef>(data);
-});
-
-TVM_REGISTER_GLOBAL("node.ArrayGetItem").set_body([](TVMArgs args, TVMRetValue* ret) {
-  int64_t i = args[1];
-  ICHECK_EQ(args[0].type_code(), kTVMObjectHandle);
-  Object* ptr = static_cast<Object*>(args[0].value().v_handle);
-  ICHECK(ptr->IsInstance<ArrayNode>());
-  auto* n = static_cast<const ArrayNode*>(ptr);
-  ICHECK_LT(static_cast<size_t>(i), n->size()) << "out of bound of array";
-  *ret = n->at(i);
-});
-
-TVM_REGISTER_GLOBAL("node.ArraySize").set_body([](TVMArgs args, TVMRetValue* ret) {
-  ICHECK_EQ(args[0].type_code(), kTVMObjectHandle);
-  Object* ptr = static_cast<Object*>(args[0].value().v_handle);
-  ICHECK(ptr->IsInstance<ArrayNode>());
-  *ret = static_cast<int64_t>(static_cast<const ArrayNode*>(ptr)->size());
-});
-
-struct MapNodeTrait {
-  static constexpr const std::nullptr_t VisitAttrs = nullptr;
-
-  static void SHashReduceForOMap(const MapNode* key, SHashReducer hash_reduce) {
-    // SHash's var handling depends on the determinism of traversal.
-    // NOTE: only book-keep the mapped hash keys.
-    // This resolves common use cases where we want to store
-    // Map<Var, Value> where Var is defined in the function
-    // parameters.
-    using KV = std::pair<size_t, ObjectRef>;
-    std::vector<KV> temp;
-    for (const auto& kv : *key) {
-      size_t hashed_value;
-      if (hash_reduce->LookupHashedValue(kv.first, &hashed_value)) {
-        temp.emplace_back(hashed_value, kv.second);
-      }
-    }
-    // sort by the hash key of the keys.
-    std::sort(temp.begin(), temp.end(),
-              [](const KV& lhs, const KV& rhs) { return lhs.first < rhs.first; });
-    // add size to the hash
-    hash_reduce(static_cast<uint64_t>(key->size()));
-    // hash the content
-    for (size_t i = 0; i < temp.size();) {
-      size_t k = i + 1;
-      for (; k < temp.size() && temp[k].first == temp[i].first; ++k) {
-      }
-      // ties are rare, but we need to skip them to make the hash determinsitic
-      if (k == i + 1) {
-        hash_reduce->SHashReduceHashedValue(temp[i].first);
-        hash_reduce(temp[i].second);
-      }
-      i = k;
-    }
-  }
-
-  static void SHashReduceForSMap(const MapNode* key, SHashReducer hash_reduce) {
-    // NOTE: only book-keep the mapped hash keys.
-    // This resolves common use cases where we want to store
-    // Map<Var, Value> where Var is defined in the function
-    // parameters.
-    using KV = std::pair<String, ObjectRef>;
-    std::vector<KV> temp;
-    for (const auto& kv : *key) {
-      temp.push_back(std::make_pair(Downcast<String>(kv.first), kv.second));
-    }
-    // sort by the hash key of the keys.
-    std::sort(temp.begin(), temp.end(),
-              [](const KV& lhs, const KV& rhs) { return lhs.first < rhs.first; });
-    // NOTE: we won't have ties
-    // add size to the hash after sorting.
-    hash_reduce(static_cast<uint64_t>(key->size()));
-    // hash the content
-    for (size_t i = 0; i < temp.size(); ++i) {
-      hash_reduce(temp[i].first);
-      hash_reduce(temp[i].second);
-    }
-  }
-
-  static void SHashReduce(const MapNode* key, SHashReducer hash_reduce) {
-    bool is_str_map = std::all_of(key->begin(), key->end(), [](const auto& v) {
-      return v.first->template IsInstance<StringObj>();
-    });
-    if (is_str_map) {
-      SHashReduceForSMap(key, hash_reduce);
-    } else {
-      SHashReduceForOMap(key, hash_reduce);
-    }
-  }
-
-  static bool SEqualReduceForOMap(const MapNode* lhs, const MapNode* rhs, SEqualReducer equal) {
-    for (const auto& kv : *lhs) {
-      // Only allow equal checking if the keys are already mapped
-      // This resolves common use cases where we want to store
-      // Map<Var, Value> where Var is defined in the function
-      // parameters.
-      ObjectRef rhs_key = equal->MapLhsToRhs(kv.first);
-      if (!rhs_key.defined()) return false;
-      auto it = rhs->find(rhs_key);
-      if (it == rhs->end()) return false;
-      if (!equal(kv.second, it->second)) return false;
-    }
-    return true;
-  }
-
-  static bool SEqualReduceForSMap(const MapNode* lhs, const MapNode* rhs, SEqualReducer equal) {
-    for (const auto& kv : *lhs) {
-      auto it = rhs->find(kv.first);
-      if (it == rhs->end()) return false;
-      if (!equal(kv.second, it->second)) return false;
-    }
-    return true;
-  }
-
-  static bool SEqualReduce(const MapNode* lhs, const MapNode* rhs, SEqualReducer equal) {
-    if (rhs->size() != lhs->size()) return false;
-    if (rhs->size() == 0) return true;
-    bool ls = std::all_of(lhs->begin(), lhs->end(),
-                          [](const auto& v) { return v.first->template IsInstance<StringObj>(); });
-    bool rs = std::all_of(rhs->begin(), rhs->end(),
-                          [](const auto& v) { return v.first->template IsInstance<StringObj>(); });
-    if (ls != rs) {
-      return false;
-    }
-    return (ls && rs) ? SEqualReduceForSMap(lhs, rhs, equal) : SEqualReduceForOMap(lhs, rhs, equal);
-  }
-};
-
-TVM_REGISTER_OBJECT_TYPE(MapNode);
-TVM_REGISTER_REFLECTION_VTABLE(MapNode, MapNodeTrait)
-    .set_creator([](const std::string&) -> ObjectPtr<Object> { return MapNode::Empty(); });
-
-TVM_REGISTER_GLOBAL("node.Map").set_body([](TVMArgs args, TVMRetValue* ret) {
-  ICHECK_EQ(args.size() % 2, 0);
-  std::unordered_map<ObjectRef, ObjectRef, ObjectPtrHash, ObjectPtrEqual> data;
-  for (int i = 0; i < args.num_args; i += 2) {
-    ObjectRef k =
-        String::CanConvertFrom(args[i]) ? args[i].operator String() : args[i].operator ObjectRef();
-    ObjectRef v = args[i + 1];
-    data.emplace(std::move(k), std::move(v));
-  }
-  *ret = Map<ObjectRef, ObjectRef>(std::move(data));
-});
-
-TVM_REGISTER_GLOBAL("node.MapSize").set_body([](TVMArgs args, TVMRetValue* ret) {
-  ICHECK_EQ(args[0].type_code(), kTVMObjectHandle);
-  Object* ptr = static_cast<Object*>(args[0].value().v_handle);
-  ICHECK(ptr->IsInstance<MapNode>());
-  auto* n = static_cast<const MapNode*>(ptr);
-  *ret = static_cast<int64_t>(n->size());
-});
-
-TVM_REGISTER_GLOBAL("node.MapGetItem").set_body([](TVMArgs args, TVMRetValue* ret) {
-  ICHECK_EQ(args[0].type_code(), kTVMObjectHandle);
-  Object* ptr = static_cast<Object*>(args[0].value().v_handle);
-  ICHECK(ptr->IsInstance<MapNode>());
-
-  auto* n = static_cast<const MapNode*>(ptr);
-  auto it = n->find(String::CanConvertFrom(args[1]) ? args[1].operator String()
-                                                    : args[1].operator ObjectRef());
-  ICHECK(it != n->end()) << "cannot find the corresponding key in the Map";
-  *ret = (*it).second;
-});
-
-TVM_REGISTER_GLOBAL("node.MapCount").set_body([](TVMArgs args, TVMRetValue* ret) {
-  ICHECK_EQ(args[0].type_code(), kTVMObjectHandle);
-  Object* ptr = static_cast<Object*>(args[0].value().v_handle);
-  ICHECK(ptr->IsInstance<MapNode>());
-  const MapNode* n = static_cast<const MapNode*>(ptr);
-  int64_t cnt = n->count(String::CanConvertFrom(args[1]) ? args[1].operator String()
-                                                         : args[1].operator ObjectRef());
-  *ret = cnt;
-});
-
-TVM_REGISTER_GLOBAL("node.MapItems").set_body([](TVMArgs args, TVMRetValue* ret) {
-  ICHECK_EQ(args[0].type_code(), kTVMObjectHandle);
-  Object* ptr = static_cast<Object*>(args[0].value().v_handle);
-  auto* n = static_cast<const MapNode*>(ptr);
-  Array<ObjectRef> rkvs;
-  for (const auto& kv : *n) {
-    if (kv.first->IsInstance<StringObj>()) {
-      rkvs.push_back(Downcast<String>(kv.first));
-    } else {
-      rkvs.push_back(kv.first);
-    }
-    rkvs.push_back(kv.second);
-  }
-  *ret = std::move(rkvs);
-});
-
-#if (USE_FALLBACK_STL_MAP == 0)
-TVM_DLL constexpr uint64_t DenseMapNode::kNextProbeLocation[];
-#endif
-}  // namespace tvm
diff --git a/src/node/reflection.cc b/src/node/reflection.cc
index 9dc9d330bb77..79a53aa26440 100644
--- a/src/node/reflection.cc
+++ b/src/node/reflection.cc
@@ -22,9 +22,9 @@
  * \file node/reflection.cc
  */
 #include <tvm/ir/attrs.h>
-#include <tvm/node/container.h>
 #include <tvm/node/node.h>
 #include <tvm/node/reflection.h>
+#include <tvm/runtime/container.h>
 #include <tvm/runtime/registry.h>
 
 namespace tvm {
diff --git a/src/node/serialization.cc b/src/node/serialization.cc
index c7e4d27c8b2c..ad42799b55e5 100644
--- a/src/node/serialization.cc
+++ b/src/node/serialization.cc
@@ -24,9 +24,9 @@
 #include <dmlc/json.h>
 #include <dmlc/memory_io.h>
 #include <tvm/ir/attrs.h>
-#include <tvm/node/container.h>
 #include <tvm/node/reflection.h>
 #include <tvm/node/serialization.h>
+#include <tvm/runtime/container.h>
 #include <tvm/runtime/ndarray.h>
 #include <tvm/runtime/packed_func.h>
 #include <tvm/runtime/registry.h>
diff --git a/src/node/structural_hash.cc b/src/node/structural_hash.cc
index e0b729d3f103..efedd1b99d6d 100644
--- a/src/node/structural_hash.cc
+++ b/src/node/structural_hash.cc
@@ -28,6 +28,7 @@
 #include <algorithm>
 #include <unordered_map>
 
+#include "../support/str_escape.h"
 #include "../support/utils.h"
 
 namespace tvm {
@@ -260,4 +261,241 @@ size_t StructuralHash::operator()(const ObjectRef& object) const {
   return VarCountingSHashHandler().Hash(object, false);
 }
 
+// SEQualReduce traits for runtime containers.
+struct StringObjTrait {
+  static constexpr const std::nullptr_t VisitAttrs = nullptr;
+
+  static void SHashReduce(const runtime::StringObj* key, SHashReducer hash_reduce) {
+    hash_reduce->SHashReduceHashedValue(runtime::String::HashBytes(key->data, key->size));
+  }
+
+  static bool SEqualReduce(const runtime::StringObj* lhs, const runtime::StringObj* rhs,
+                           SEqualReducer equal) {
+    if (lhs == rhs) return true;
+    if (lhs->size != rhs->size) return false;
+    if (lhs->data == rhs->data) return true;
+    return std::memcmp(lhs->data, rhs->data, lhs->size) == 0;
+  }
+};
+
+struct RefToObjectPtr : public ObjectRef {
+  static ObjectPtr<Object> Get(const ObjectRef& ref) { return GetDataPtr<Object>(ref); }
+};
+
+TVM_REGISTER_REFLECTION_VTABLE(runtime::StringObj, StringObjTrait)
+    .set_creator([](const std::string& bytes) {
+      return RefToObjectPtr::Get(runtime::String(bytes));
+    })
+    .set_repr_bytes([](const Object* n) -> std::string {
+      return GetRef<runtime::String>(static_cast<const runtime::StringObj*>(n))
+          .
+          operator std::string();
+    });
+
+TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable)
+    .set_dispatch<runtime::StringObj>([](const ObjectRef& node, ReprPrinter* p) {
+      auto* op = static_cast<const runtime::StringObj*>(node.get());
+      p->stream << '"' << support::StrEscape(op->data, op->size) << '"';
+    });
+
+struct ADTObjTrait {
+  static constexpr const std::nullptr_t VisitAttrs = nullptr;
+
+  static void SHashReduce(const runtime::ADTObj* key, SHashReducer hash_reduce) {
+    hash_reduce(key->tag);
+    hash_reduce(static_cast<uint64_t>(key->size));
+    for (uint32_t i = 0; i < key->size; ++i) {
+      hash_reduce((*key)[i]);
+    }
+  }
+
+  static bool SEqualReduce(const runtime::ADTObj* lhs, const runtime::ADTObj* rhs,
+                           SEqualReducer equal) {
+    if (lhs == rhs) return true;
+    if (lhs->tag != rhs->tag) return false;
+    if (lhs->size != rhs->size) return false;
+
+    for (uint32_t i = 0; i < lhs->size; ++i) {
+      if (!equal((*lhs)[i], (*rhs)[i])) return false;
+    }
+    return true;
+  }
+};
+
+TVM_REGISTER_REFLECTION_VTABLE(runtime::ADTObj, ADTObjTrait);
+
+struct NDArrayContainerTrait {
+  static constexpr const std::nullptr_t VisitAttrs = nullptr;
+
+  static void SHashReduce(const runtime::NDArray::Container* key, SHashReducer hash_reduce) {
+    ICHECK_EQ(key->dl_tensor.ctx.device_type, kDLCPU) << "can only compare CPU tensor";
+    ICHECK(runtime::IsContiguous(key->dl_tensor)) << "Can only hash contiguous tensor";
+    hash_reduce(runtime::DataType(key->dl_tensor.dtype));
+    hash_reduce(key->dl_tensor.ndim);
+    for (int i = 0; i < key->dl_tensor.ndim; ++i) {
+      hash_reduce(key->dl_tensor.shape[i]);
+    }
+    hash_reduce->SHashReduceHashedValue(runtime::String::HashBytes(
+        static_cast<const char*>(key->dl_tensor.data), runtime::GetDataSize(key->dl_tensor)));
+  }
+
+  static bool SEqualReduce(const runtime::NDArray::Container* lhs,
+                           const runtime::NDArray::Container* rhs, SEqualReducer equal) {
+    if (lhs == rhs) return true;
+
+    auto ldt = lhs->dl_tensor.dtype;
+    auto rdt = rhs->dl_tensor.dtype;
+    ICHECK_EQ(lhs->dl_tensor.ctx.device_type, kDLCPU) << "can only compare CPU tensor";
+    ICHECK_EQ(rhs->dl_tensor.ctx.device_type, kDLCPU) << "can only compare CPU tensor";
+    ICHECK(runtime::IsContiguous(lhs->dl_tensor)) << "Can only compare contiguous tensor";
+    ICHECK(runtime::IsContiguous(rhs->dl_tensor)) << "Can only compare contiguous tensor";
+
+    if (lhs->dl_tensor.ndim != rhs->dl_tensor.ndim) return false;
+    for (int i = 0; i < lhs->dl_tensor.ndim; ++i) {
+      if (!equal(lhs->dl_tensor.shape[i], rhs->dl_tensor.shape[i])) return false;
+    }
+    if (ldt.code == rdt.code && ldt.lanes == rdt.lanes && ldt.bits == rdt.bits) {
+      size_t data_size = runtime::GetDataSize(lhs->dl_tensor);
+      return std::memcmp(lhs->dl_tensor.data, rhs->dl_tensor.data, data_size) == 0;
+    } else {
+      return false;
+    }
+  }
+};
+
+TVM_REGISTER_REFLECTION_VTABLE(runtime::NDArray::Container, NDArrayContainerTrait);
+
+struct ArrayNodeTrait {
+  static constexpr const std::nullptr_t VisitAttrs = nullptr;
+
+  static void SHashReduce(const ArrayNode* key, SHashReducer hash_reduce) {
+    hash_reduce(static_cast<uint64_t>(key->size()));
+    for (size_t i = 0; i < key->size(); ++i) {
+      hash_reduce(key->at(i));
+    }
+  }
+
+  static bool SEqualReduce(const ArrayNode* lhs, const ArrayNode* rhs, SEqualReducer equal) {
+    if (lhs->size() != rhs->size()) return false;
+    for (size_t i = 0; i < lhs->size(); ++i) {
+      if (!equal(lhs->at(i), rhs->at(i))) return false;
+    }
+    return true;
+  }
+};
+TVM_REGISTER_REFLECTION_VTABLE(ArrayNode, ArrayNodeTrait)
+    .set_creator([](const std::string&) -> ObjectPtr<Object> {
+      return ::tvm::runtime::make_object<ArrayNode>();
+    });
+
+struct MapNodeTrait {
+  static constexpr const std::nullptr_t VisitAttrs = nullptr;
+
+  static void SHashReduceForOMap(const MapNode* key, SHashReducer hash_reduce) {
+    // SHash's var handling depends on the determinism of traversal.
+    // NOTE: only book-keep the mapped hash keys.
+    // This resolves common use cases where we want to store
+    // Map<Var, Value> where Var is defined in the function
+    // parameters.
+    using KV = std::pair<size_t, ObjectRef>;
+    std::vector<KV> temp;
+    for (const auto& kv : *key) {
+      size_t hashed_value;
+      if (hash_reduce->LookupHashedValue(kv.first, &hashed_value)) {
+        temp.emplace_back(hashed_value, kv.second);
+      }
+    }
+    // sort by the hash key of the keys.
+    std::sort(temp.begin(), temp.end(),
+              [](const KV& lhs, const KV& rhs) { return lhs.first < rhs.first; });
+    // add size to the hash
+    hash_reduce(static_cast<uint64_t>(key->size()));
+    // hash the content
+    for (size_t i = 0; i < temp.size();) {
+      size_t k = i + 1;
+      for (; k < temp.size() && temp[k].first == temp[i].first; ++k) {
+      }
+      // ties are rare, but we need to skip them to make the hash determinsitic
+      if (k == i + 1) {
+        hash_reduce->SHashReduceHashedValue(temp[i].first);
+        hash_reduce(temp[i].second);
+      }
+      i = k;
+    }
+  }
+
+  static void SHashReduceForSMap(const MapNode* key, SHashReducer hash_reduce) {
+    // NOTE: only book-keep the mapped hash keys.
+    // This resolves common use cases where we want to store
+    // Map<Var, Value> where Var is defined in the function
+    // parameters.
+    using KV = std::pair<String, ObjectRef>;
+    std::vector<KV> temp;
+    for (const auto& kv : *key) {
+      temp.push_back(std::make_pair(Downcast<String>(kv.first), kv.second));
+    }
+    // sort by the hash key of the keys.
+    std::sort(temp.begin(), temp.end(),
+              [](const KV& lhs, const KV& rhs) { return lhs.first < rhs.first; });
+    // NOTE: we won't have ties
+    // add size to the hash after sorting.
+    hash_reduce(static_cast<uint64_t>(key->size()));
+    // hash the content
+    for (size_t i = 0; i < temp.size(); ++i) {
+      hash_reduce(temp[i].first);
+      hash_reduce(temp[i].second);
+    }
+  }
+
+  static void SHashReduce(const MapNode* key, SHashReducer hash_reduce) {
+    bool is_str_map = std::all_of(key->begin(), key->end(), [](const auto& v) {
+      return v.first->template IsInstance<StringObj>();
+    });
+    if (is_str_map) {
+      SHashReduceForSMap(key, hash_reduce);
+    } else {
+      SHashReduceForOMap(key, hash_reduce);
+    }
+  }
+
+  static bool SEqualReduceForOMap(const MapNode* lhs, const MapNode* rhs, SEqualReducer equal) {
+    for (const auto& kv : *lhs) {
+      // Only allow equal checking if the keys are already mapped
+      // This resolves common use cases where we want to store
+      // Map<Var, Value> where Var is defined in the function
+      // parameters.
+      ObjectRef rhs_key = equal->MapLhsToRhs(kv.first);
+      if (!rhs_key.defined()) return false;
+      auto it = rhs->find(rhs_key);
+      if (it == rhs->end()) return false;
+      if (!equal(kv.second, it->second)) return false;
+    }
+    return true;
+  }
+
+  static bool SEqualReduceForSMap(const MapNode* lhs, const MapNode* rhs, SEqualReducer equal) {
+    for (const auto& kv : *lhs) {
+      auto it = rhs->find(kv.first);
+      if (it == rhs->end()) return false;
+      if (!equal(kv.second, it->second)) return false;
+    }
+    return true;
+  }
+
+  static bool SEqualReduce(const MapNode* lhs, const MapNode* rhs, SEqualReducer equal) {
+    if (rhs->size() != lhs->size()) return false;
+    if (rhs->size() == 0) return true;
+    bool ls = std::all_of(lhs->begin(), lhs->end(),
+                          [](const auto& v) { return v.first->template IsInstance<StringObj>(); });
+    bool rs = std::all_of(rhs->begin(), rhs->end(),
+                          [](const auto& v) { return v.first->template IsInstance<StringObj>(); });
+    if (ls != rs) {
+      return false;
+    }
+    return (ls && rs) ? SEqualReduceForSMap(lhs, rhs, equal) : SEqualReduceForOMap(lhs, rhs, equal);
+  }
+};
+TVM_REGISTER_REFLECTION_VTABLE(MapNode, MapNodeTrait)
+    .set_creator([](const std::string&) -> ObjectPtr<Object> { return MapNode::Empty(); });
+
 }  // namespace tvm
diff --git a/src/printer/meta_data.h b/src/printer/meta_data.h
index 233da1baffd8..f76c32d353cf 100644
--- a/src/printer/meta_data.h
+++ b/src/printer/meta_data.h
@@ -24,8 +24,8 @@
 #ifndef TVM_PRINTER_META_DATA_H_
 #define TVM_PRINTER_META_DATA_H_
 
-#include <tvm/node/container.h>
 #include <tvm/node/serialization.h>
+#include <tvm/runtime/container.h>
 
 #include <string>
 #include <unordered_map>
diff --git a/src/relay/backend/contrib/codegen_json/codegen_json.h b/src/relay/backend/contrib/codegen_json/codegen_json.h
index 859ef8c9bdb2..192e09140375 100644
--- a/src/relay/backend/contrib/codegen_json/codegen_json.h
+++ b/src/relay/backend/contrib/codegen_json/codegen_json.h
@@ -26,7 +26,6 @@
 
 #include <dmlc/any.h>
 #include <dmlc/json.h>
-#include <tvm/node/container.h>
 #include <tvm/node/reflection.h>
 #include <tvm/runtime/container.h>
 #include <tvm/tir/op.h>
diff --git a/src/relay/op/nn/nn.h b/src/relay/op/nn/nn.h
index c00e2e02b369..8802cd903b01 100644
--- a/src/relay/op/nn/nn.h
+++ b/src/relay/op/nn/nn.h
@@ -26,8 +26,8 @@
 
 #include <tvm/ir/attrs.h>
 #include <tvm/ir/expr.h>
-#include <tvm/node/container.h>
 #include <tvm/relay/type.h>
+#include <tvm/runtime/container.h>
 
 #include <utility>
 
diff --git a/src/runtime/container.cc b/src/runtime/container.cc
index 916a912b3c5e..3d9b1481f6e6 100644
--- a/src/runtime/container.cc
+++ b/src/runtime/container.cc
@@ -79,5 +79,100 @@ TVM_REGISTER_OBJECT_TYPE(ADTObj);
 TVM_REGISTER_OBJECT_TYPE(StringObj);
 TVM_REGISTER_OBJECT_TYPE(ClosureObj);
 
+TVM_REGISTER_OBJECT_TYPE(ArrayNode);
+
+TVM_REGISTER_GLOBAL("runtime.Array").set_body([](TVMArgs args, TVMRetValue* ret) {
+  std::vector<ObjectRef> data;
+  for (int i = 0; i < args.size(); ++i) {
+    if (args[i].type_code() != kTVMNullptr) {
+      data.push_back(args[i].operator ObjectRef());
+    } else {
+      data.push_back(ObjectRef(nullptr));
+    }
+  }
+  *ret = Array<ObjectRef>(data);
+});
+
+TVM_REGISTER_GLOBAL("runtime.ArrayGetItem").set_body([](TVMArgs args, TVMRetValue* ret) {
+  int64_t i = args[1];
+  ICHECK_EQ(args[0].type_code(), kTVMObjectHandle);
+  Object* ptr = static_cast<Object*>(args[0].value().v_handle);
+  ICHECK(ptr->IsInstance<ArrayNode>());
+  auto* n = static_cast<const ArrayNode*>(ptr);
+  ICHECK_LT(static_cast<size_t>(i), n->size()) << "out of bound of array";
+  *ret = n->at(i);
+});
+
+TVM_REGISTER_GLOBAL("runtime.ArraySize").set_body([](TVMArgs args, TVMRetValue* ret) {
+  ICHECK_EQ(args[0].type_code(), kTVMObjectHandle);
+  Object* ptr = static_cast<Object*>(args[0].value().v_handle);
+  ICHECK(ptr->IsInstance<ArrayNode>());
+  *ret = static_cast<int64_t>(static_cast<const ArrayNode*>(ptr)->size());
+});
+
+TVM_REGISTER_OBJECT_TYPE(MapNode);
+
+TVM_REGISTER_GLOBAL("runtime.Map").set_body([](TVMArgs args, TVMRetValue* ret) {
+  ICHECK_EQ(args.size() % 2, 0);
+  std::unordered_map<ObjectRef, ObjectRef, ObjectPtrHash, ObjectPtrEqual> data;
+  for (int i = 0; i < args.num_args; i += 2) {
+    ObjectRef k =
+        String::CanConvertFrom(args[i]) ? args[i].operator String() : args[i].operator ObjectRef();
+    ObjectRef v = args[i + 1];
+    data.emplace(std::move(k), std::move(v));
+  }
+  *ret = Map<ObjectRef, ObjectRef>(std::move(data));
+});
+
+TVM_REGISTER_GLOBAL("runtime.MapSize").set_body([](TVMArgs args, TVMRetValue* ret) {
+  ICHECK_EQ(args[0].type_code(), kTVMObjectHandle);
+  Object* ptr = static_cast<Object*>(args[0].value().v_handle);
+  ICHECK(ptr->IsInstance<MapNode>());
+  auto* n = static_cast<const MapNode*>(ptr);
+  *ret = static_cast<int64_t>(n->size());
+});
+
+TVM_REGISTER_GLOBAL("runtime.MapGetItem").set_body([](TVMArgs args, TVMRetValue* ret) {
+  ICHECK_EQ(args[0].type_code(), kTVMObjectHandle);
+  Object* ptr = static_cast<Object*>(args[0].value().v_handle);
+  ICHECK(ptr->IsInstance<MapNode>());
+
+  auto* n = static_cast<const MapNode*>(ptr);
+  auto it = n->find(String::CanConvertFrom(args[1]) ? args[1].operator String()
+                                                    : args[1].operator ObjectRef());
+  ICHECK(it != n->end()) << "cannot find the corresponding key in the Map";
+  *ret = (*it).second;
+});
+
+TVM_REGISTER_GLOBAL("runtime.MapCount").set_body([](TVMArgs args, TVMRetValue* ret) {
+  ICHECK_EQ(args[0].type_code(), kTVMObjectHandle);
+  Object* ptr = static_cast<Object*>(args[0].value().v_handle);
+  ICHECK(ptr->IsInstance<MapNode>());
+  const MapNode* n = static_cast<const MapNode*>(ptr);
+  int64_t cnt = n->count(String::CanConvertFrom(args[1]) ? args[1].operator String()
+                                                         : args[1].operator ObjectRef());
+  *ret = cnt;
+});
+
+TVM_REGISTER_GLOBAL("runtime.MapItems").set_body([](TVMArgs args, TVMRetValue* ret) {
+  ICHECK_EQ(args[0].type_code(), kTVMObjectHandle);
+  Object* ptr = static_cast<Object*>(args[0].value().v_handle);
+  auto* n = static_cast<const MapNode*>(ptr);
+  Array<ObjectRef> rkvs;
+  for (const auto& kv : *n) {
+    if (kv.first->IsInstance<StringObj>()) {
+      rkvs.push_back(Downcast<String>(kv.first));
+    } else {
+      rkvs.push_back(kv.first);
+    }
+    rkvs.push_back(kv.second);
+  }
+  *ret = std::move(rkvs);
+});
+
+#if (USE_FALLBACK_STL_MAP == 0)
+TVM_DLL constexpr uint64_t DenseMapNode::kNextProbeLocation[];
+#endif
+
 }  // namespace runtime
 }  // namespace tvm
diff --git a/src/runtime/file_utils.cc b/src/runtime/file_utils.cc
index 42cbfdc3b1ed..3957505a7c7d 100644
--- a/src/runtime/file_utils.cc
+++ b/src/runtime/file_utils.cc
@@ -23,6 +23,7 @@
 #include "file_utils.h"
 
 #include <dmlc/json.h>
+#include <dmlc/memory_io.h>
 #include <tvm/runtime/serializer.h>
 #include <tvm/support/logging.h>
 
diff --git a/src/runtime/file_utils.h b/src/runtime/file_utils.h
index 696a9760c2e1..dfa7d67f1bfe 100644
--- a/src/runtime/file_utils.h
+++ b/src/runtime/file_utils.h
@@ -24,6 +24,8 @@
 #ifndef TVM_RUNTIME_FILE_UTILS_H_
 #define TVM_RUNTIME_FILE_UTILS_H_
 
+#include <tvm/runtime/container.h>
+
 #include <string>
 #include <unordered_map>
 
diff --git a/src/runtime/graph/graph_runtime_factory.cc b/src/runtime/graph/graph_runtime_factory.cc
index 2c055e16cc9f..4d3993a9a36f 100644
--- a/src/runtime/graph/graph_runtime_factory.cc
+++ b/src/runtime/graph/graph_runtime_factory.cc
@@ -24,7 +24,7 @@
 
 #include "./graph_runtime_factory.h"
 
-#include <tvm/node/container.h>
+#include <tvm/runtime/container.h>
 #include <tvm/runtime/device_api.h>
 #include <tvm/runtime/registry.h>
 
diff --git a/src/runtime/metadata_module.cc b/src/runtime/metadata_module.cc
index acef9d4736fd..665c72cc5e0d 100644
--- a/src/runtime/metadata_module.cc
+++ b/src/runtime/metadata_module.cc
@@ -27,7 +27,7 @@
  * code and metadata significantly reduces the efforts for handling external
  * codegen and runtimes.
  */
-#include <tvm/node/container.h>
+#include <tvm/runtime/container.h>
 #include <tvm/runtime/ndarray.h>
 #include <tvm/runtime/packed_func.h>
 #include <tvm/runtime/registry.h>
diff --git a/src/support/libinfo.cc b/src/support/libinfo.cc
index 0f394f50fe71..d6c8f1799596 100644
--- a/src/support/libinfo.cc
+++ b/src/support/libinfo.cc
@@ -16,7 +16,7 @@
  * specific language governing permissions and limitations
  * under the License.
  */
-#include <tvm/node/container.h>
+#include <tvm/runtime/container.h>
 #include <tvm/runtime/object.h>
 #include <tvm/runtime/registry.h>
 

From 41c059178005d8e0df03f4f25cdc631baba62ef1 Mon Sep 17 00:00:00 2001
From: Cody Yu <comaniac0422@gmail.com>
Date: Wed, 3 Mar 2021 16:08:29 -0800
Subject: [PATCH 273/357] [AutoSchedule] Fix a flaky test (#7580)

---
 tests/python/unittest/test_auto_scheduler_layout_rewrite.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tests/python/unittest/test_auto_scheduler_layout_rewrite.py b/tests/python/unittest/test_auto_scheduler_layout_rewrite.py
index 2fae7b838143..795c3cb3b0a2 100644
--- a/tests/python/unittest/test_auto_scheduler_layout_rewrite.py
+++ b/tests/python/unittest/test_auto_scheduler_layout_rewrite.py
@@ -66,7 +66,7 @@ def test_apply_steps_with_layout_rewrite_corner_case():
 
 @tvm.testing.requires_llvm
 def test_correctness_layout_rewrite_rewrite_for_preTransformed():
-    N = 128
+    N = 16
     target = tvm.target.Target("llvm")
     task = auto_scheduler.SearchTask(func=matmul_auto_scheduler_test, args=(N, N, N), target=target)
     dag = task.compute_dag
@@ -78,9 +78,10 @@ def test_correctness_layout_rewrite_rewrite_for_preTransformed():
 
         measure_ctx = auto_scheduler.LocalRPCMeasureContext()
         tuning_options = auto_scheduler.TuningOptions(
-            num_measure_trials=2,
+            num_measure_trials=100,
             runner=measure_ctx.runner,
             verbose=2,
+            early_stopping=1,
             measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
         )
         task.tune(tuning_options, search_policy=search_policy)

From 3f5f84d2e27225a188588450fd744516122d9a67 Mon Sep 17 00:00:00 2001
From: Cody Yu <comaniac0422@gmail.com>
Date: Wed, 3 Mar 2021 16:09:38 -0800
Subject: [PATCH 274/357] [AutoScheduler] Querying and sampling in task
 extraction (#7571)

* [AutoScheduler] Query in task extraction

* trigger ci
---
 python/tvm/auto_scheduler/relay_integration.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/python/tvm/auto_scheduler/relay_integration.py b/python/tvm/auto_scheduler/relay_integration.py
index b39aba227a88..68f53125c7ae 100644
--- a/python/tvm/auto_scheduler/relay_integration.py
+++ b/python/tvm/auto_scheduler/relay_integration.py
@@ -283,10 +283,13 @@ def auto_schedule_topi(outs):
     key = register_workload_tensors(dag.workload_key(), io_tensors)
     target = tvm.target.Target.current()
 
+    dispatch_ctx = DispatchContext.current
+    state = dispatch_ctx.query(target, key, has_complex_op, dag)
+    schedule = None
+
     env = TracingEnvironment.current
     if env is None:
         # in the final build mode
-        state = DispatchContext.current.query(target, key, has_complex_op, dag)
         if state is None:
             return None
 
@@ -303,8 +306,6 @@ def auto_schedule_topi(outs):
             LayoutRewriteOption.get_target_default(target, True) != LayoutRewriteOption.NO_REWRITE
             and has_layout_free
         ):
-            dispatch_ctx = DispatchContext.current
-            state = dispatch_ctx.query(target, key, has_complex_op, dag)
             if state is None:
                 return None
 
@@ -316,7 +317,7 @@ def auto_schedule_topi(outs):
     else:
         raise ValueError("Invalid tracing mode: " + env.tracing_mode)
 
-    return None
+    return schedule
 
 
 def tensor_no_check_call(self, *indices):

From 66f91394fe9ecc18217f1f9737aa2f39455b86c2 Mon Sep 17 00:00:00 2001
From: Rambo Lan <brianlanbo@gmail.com>
Date: Thu, 4 Mar 2021 23:33:37 +0800
Subject: [PATCH 275/357] [DOCKER] Fix: install script regarding get-pip.py
 during docker build (#7579)

---
 docker/install/ubuntu_install_python.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docker/install/ubuntu_install_python.sh b/docker/install/ubuntu_install_python.sh
index 58d72f327aa6..d3af336491cc 100755
--- a/docker/install/ubuntu_install_python.sh
+++ b/docker/install/ubuntu_install_python.sh
@@ -34,7 +34,7 @@ apt-get install -y python-pip python-dev python3.6 python3.6-dev
 rm -f /usr/bin/python3 && ln -s /usr/bin/python3.6 /usr/bin/python3
 
 # Install pip
-cd /tmp && wget -q https://bootstrap.pypa.io/get-pip.py && python2 get-pip.py && python3.6 get-pip.py
+cd /tmp && wget -q https://bootstrap.pypa.io/get-pip.py && python3.6 get-pip.py
 
 # Pin pip version
 pip3 install pip==19.3.1

From 02a648317305546b2927e877ac37b600d450b136 Mon Sep 17 00:00:00 2001
From: tristan-arm <tristan.oconnor@arm.com>
Date: Thu, 4 Mar 2021 18:29:05 +0000
Subject: [PATCH 276/357] [ETHOSN] Add support for 20.11 Ethos-N driver stack
 release (#7506)

- Updated ethosn relay backend to support 20.11 api changes.
 - Removed legacy support for 20.05.
 - Added a mechanism to specify the ethosn driver stack version.
---
 cmake/utils/FindEthosN.cmake                  |   3 +-
 .../backend/contrib/ethosn/capabilities.h     |  28 ++-
 src/relay/backend/contrib/ethosn/codegen.cc   | 192 +++++++++++++++++-
 .../backend/contrib/ethosn/codegen_ethosn.h   |  16 +-
 .../backend/contrib/ethosn/ethosn_api.cc      | 116 -----------
 .../contrib/ethosn/ethosn_api_version.h       |   8 +-
 .../contrib/test_ethosn/test_networks.py      |  22 +-
 .../contrib/test_ethosn/test_reshape.py       |   4 +-
 8 files changed, 241 insertions(+), 148 deletions(-)

diff --git a/cmake/utils/FindEthosN.cmake b/cmake/utils/FindEthosN.cmake
index d33b55f0c7a9..26d00a462b39 100644
--- a/cmake/utils/FindEthosN.cmake
+++ b/cmake/utils/FindEthosN.cmake
@@ -59,6 +59,7 @@ macro(find_ethosn use_ethosn)
     find_library(ETHOSN_COMPILER_LIBRARY NAMES EthosNSupport)
 
     set(ETHOSN_PACKAGE_VERSION "0.1.1")
+    set(ETHOSN_DEFINITIONS -DETHOSN_API_VERSION=${USE_ETHOSN_API_VERSION})
 
     if(${USE_ETHOSN_HW} MATCHES ${IS_TRUE_PATTERN})
       # Runtime hardware support
@@ -70,7 +71,7 @@ macro(find_ethosn use_ethosn)
       find_library(ETHOSN_RUNTIME_LIBRARY NAMES EthosNDriver
         PATHS ${__ethosn_stack}/lib)
       find_library(ETHOSN_RUNTIME_LIBRARY NAMES EthosNDriver)
-      set(ETHOSN_DEFINITIONS -DETHOSN_HW)
+      set(ETHOSN_DEFINITIONS -DETHOSN_HW -DETHOSN_API_VERSION=${USE_ETHOSN_API_VERSION})
     endif ()
 
     if(ETHOSN_COMPILER_LIBRARY)
diff --git a/src/relay/backend/contrib/ethosn/capabilities.h b/src/relay/backend/contrib/ethosn/capabilities.h
index 8c7ee6a0d009..cc14ca101da6 100644
--- a/src/relay/backend/contrib/ethosn/capabilities.h
+++ b/src/relay/backend/contrib/ethosn/capabilities.h
@@ -45,7 +45,7 @@ namespace ethosn {
  * variant[2] - Ethos-N37
  * variant[3] - Ethos-N78
  */
-#if _ETHOSN_API_VERSION_ == 2008
+#if _ETHOSN_API_VERSION_ == 2011
 static std::vector<char> variants[4] = {
     {
         0x03, 0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
@@ -84,40 +84,50 @@ static std::vector<char> variants[4] = {
         0x10, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00,
         0x01, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00,
         0x00, 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00,
-        0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x01,
+        0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x02,
         0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
     }};
 #else
-static std::vector<char> variants[3] = {
+static std::vector<char> variants[4] = {
     {
-        0x02, 0x00, 0x00, 0x00, 0x74, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+        0x03, 0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
         0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, 0x00, 0x10, 0x00,
         0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00,
         0x10, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00,
         0x01, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00,
         0x00, 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00,
         0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x01,
-        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
     },
     {
-        0x02, 0x00, 0x00, 0x00, 0x74, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+        0x03, 0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
         0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, 0x00, 0x10, 0x00,
         0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00,
         0x10, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00,
         0x01, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00,
         0x00, 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00,
         0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x01,
-        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
     },
     {
-        0x02, 0x00, 0x00, 0x00, 0x74, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+        0x03, 0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
         0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, 0x00, 0x10, 0x00,
         0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00,
         0x10, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00,
         0x01, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00,
         0x00, 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00,
         0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x01,
-        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    },
+    {
+        0x03, 0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+        0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x07, 0x00, 0x02, 0x00,
+        0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00,
+        0x10, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00,
+        0x01, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00,
+        0x00, 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00,
+        0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x01,
+        0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
     }};
 #endif
 }  // namespace ethosn
diff --git a/src/relay/backend/contrib/ethosn/codegen.cc b/src/relay/backend/contrib/ethosn/codegen.cc
index 3097a300a0d9..5e052b3e4fd6 100644
--- a/src/relay/backend/contrib/ethosn/codegen.cc
+++ b/src/relay/backend/contrib/ethosn/codegen.cc
@@ -198,8 +198,19 @@ sl::TensorsAndId MakeOps(const sl::TensorAndId<sl::Operand>& op) {
 
 NetworkWithIDs ConstructNetworkVisitor::Construct(const Function& func) {
   // Initialise everything
+#if _ETHOSN_API_VERSION_ == 2011
+  auto ctx = transform::PassContext::Current();
+  auto cfg = ctx->GetConfig<EthosnCompilerConfig>("relay.ext.ethos-n.options");
+  if (!cfg.defined()) {
+    cfg = AttrsWithDefaultValues<EthosnCompilerConfig>();
+  }
+#endif
   NetworkWithIDs network_with_ids;
+#if _ETHOSN_API_VERSION_ == 2011
+  network_ = sl::CreateNetwork(variants[cfg.value()->variant]);
+#else
   network_ = sl::CreateNetwork();
+#endif
   network_with_ids.network = network_;
   operand_table_.clear();
 
@@ -561,7 +572,11 @@ sl::CompilationOptions EthosnCompiler::CreateOptions() {
     cfg = AttrsWithDefaultValues<EthosnCompilerConfig>();
   }
 
+#if _ETHOSN_API_VERSION_ == 2011
+  sl::CompilationOptions options;
+#else
   sl::CompilationOptions options(variants[cfg.value()->variant]);
+#endif
   options.m_Strategy0 = cfg.value()->strategy0;
   options.m_Strategy1 = cfg.value()->strategy1;
   options.m_Strategy3 = cfg.value()->strategy3;
@@ -575,15 +590,13 @@ sl::CompilationOptions EthosnCompiler::CreateOptions() {
   options.m_BlockConfig8x32 = cfg.value()->block_config_8x32;
   options.m_BlockConfig8x8 = cfg.value()->block_config_8x8;
   options.m_EnableIntermediateCompression = cfg.value()->enable_intermediate_compression;
-  options.m_DisableWinograd = cfg.value()->disable_winograd;
+#if _ETHOSN_API_VERSION_ == 2008
   options.m_DebugInfo.m_DumpDebugFiles = cfg.value()->dump_debug_files;
+#endif
+  options.m_DisableWinograd = cfg.value()->disable_winograd;
   options.m_DebugInfo.m_DebugDir = cfg.value()->debug_dir;
-#if _ETHOSN_API_VERSION_ == 2008
   options.m_CompilerAlgorithm =
       sl::EthosNCompilerAlgorithmFromString(cfg.value()->compiler_algorithm.c_str());
-#else
-  options.m_EnableCascading = cfg.value()->enable_cascading;
-#endif
   return options;
 }
 
@@ -606,6 +619,175 @@ std::pair<std::vector<uint32_t>, std::vector<uint32_t>> EthosnCompiler::GetInput
   return std::make_pair(input_order, output_order);
 }
 
+#if _ETHOSN_API_VERSION_ == 2011
+auto ctx = transform::PassContext::Current();
+auto cfg = ctx -> GetConfig<EthosnCompilerConfig>("relay.ext.ethos-n.options").defined()
+               ? ctx -> GetConfig<EthosnCompilerConfig>("relay.ext.ethos-n.options")
+               : AttrsWithDefaultValues<EthosnCompilerConfig>();
+auto m_Queries = sl::SupportQueries(variants[cfg.value()->variant]);
+#endif
+
+TVM_REGISTER_GLOBAL("relay.ethos-n.support.conv2d")
+    .set_body([](tvm::TVMArgs args, tvm::TVMRetValue* rv) {
+      Call call = args[0];
+      ConvolutionParams params;
+      auto err = EthosnAPI::QnnConv2d(call, &params);
+#if _ETHOSN_API_VERSION_ == 2011
+      if (params.is_depthwise) {
+        *rv = !err &&
+              m_Queries.IsDepthwiseConvolutionSupported(params.bias_info, params.weights_info,
+                                                        params.conv_info, params.activation_info);
+      } else {
+        *rv = !err && m_Queries.IsConvolutionSupported(params.bias_info, params.weights_info,
+                                                       params.conv_info, params.activation_info);
+      }
+#else
+      if (params.is_depthwise) {
+        *rv = !err && sl::IsDepthwiseConvolutionSupported(params.bias_info, params.weights_info,
+                                                          params.conv_info, params.activation_info);
+      } else {
+        *rv = !err && sl::IsConvolutionSupported(params.bias_info, params.weights_info,
+                                                 params.conv_info, params.activation_info);
+      }
+#endif
+    });
+
+TVM_REGISTER_GLOBAL("relay.ethos-n.support.fc")
+    .set_body([](tvm::TVMArgs args, tvm::TVMRetValue* rv) {
+      Call call = args[0];
+      FullyConnectedParams params;
+      auto err = EthosnAPI::QnnFullyConnected(call, &params);
+#if _ETHOSN_API_VERSION_ == 2011
+      *rv = !err && m_Queries.IsFullyConnectedSupported(params.bias_info, params.weights_info,
+                                                        params.fc_info, params.input_info);
+#else
+      *rv = !err && sl::IsFullyConnectedSupported(params.bias_info, params.weights_info,
+                                                  params.fc_info, params.input_info);
+#endif
+    });
+
+TVM_REGISTER_GLOBAL("relay.ethos-n.support.max_pool2d")
+    .set_body([](tvm::TVMArgs args, tvm::TVMRetValue* rv) {
+      Call call = args[0];
+      MaxPool2DParams params;
+      auto err = EthosnAPI::MaxPool2D(call, &params);
+#if _ETHOSN_API_VERSION_ == 2011
+      *rv = !err && m_Queries.IsPoolingSupported(params.pool_info, params.input_info);
+#else
+      *rv = !err && sl::IsPoolingSupported(params.pool_info, params.input_info);
+#endif
+    });
+
+TVM_REGISTER_GLOBAL("relay.ethos-n.support.avg_pool2d")
+    .set_body([](tvm::TVMArgs args, tvm::TVMRetValue* rv) {
+      Call call = args[0];
+      AvgPool2DParams params;
+      auto err = EthosnAPI::AvgPool2D(call, &params);
+#if _ETHOSN_API_VERSION_ == 2011
+      *rv = !err && m_Queries.IsPoolingSupported(params.pool_info, params.input_info);
+#else
+      *rv = !err && sl::IsPoolingSupported(params.pool_info, params.input_info);
+#endif
+    });
+
+TVM_REGISTER_GLOBAL("relay.ethos-n.support.reshape")
+    .set_body([](tvm::TVMArgs args, tvm::TVMRetValue* rv) {
+      Call call = args[0];
+      ReshapeParams params;
+      auto err = EthosnAPI::Reshape(call, &params);
+#if _ETHOSN_API_VERSION_ == 2011
+      *rv = !err && m_Queries.IsReshapeSupported(params.new_shape, params.input_info);
+#else
+      *rv = !err && sl::IsReshapeSupported(params.new_shape, params.input_info);
+#endif
+    });
+
+TVM_REGISTER_GLOBAL("relay.ethos-n.support.addition")
+    .set_body([](tvm::TVMArgs args, tvm::TVMRetValue* rv) {
+      Call call = args[0];
+      AdditionParams params;
+      auto err = EthosnAPI::Addition(call, &params);
+#if _ETHOSN_API_VERSION_ == 2011
+      *rv = !err && m_Queries.IsAdditionSupported(params.lhs_info, params.rhs_info,
+                                                  params.output_quantization_info);
+#else
+      *rv = !err && sl::IsAdditionSupported(params.lhs_info, params.rhs_info,
+                                            params.output_quantization_info);
+#endif
+    });
+
+TVM_REGISTER_GLOBAL("relay.ethos-n.support.sigmoid")
+    .set_body([](tvm::TVMArgs args, tvm::TVMRetValue* rv) {
+      Call call = args[0];
+      SigmoidParams params;
+      auto err = EthosnAPI::Sigmoid(call, &params);
+#if _ETHOSN_API_VERSION_ == 2011
+      *rv = !err && m_Queries.IsSigmoidSupported(params.input_info);
+#else
+      *rv = !err && sl::IsSigmoidSupported(params.input_info);
+#endif
+    });
+
+TVM_REGISTER_GLOBAL("relay.ethos-n.support.concatenate")
+    .set_body([](tvm::TVMArgs args, tvm::TVMRetValue* rv) {
+      Call call = args[0];
+      ConcatenateParams params;
+      auto err = EthosnAPI::Concatenate(call, &params);
+#if _ETHOSN_API_VERSION_ == 2011
+      *rv = !err && m_Queries.IsConcatenationSupported(params.input_infos, params.concat_info);
+#else
+      *rv = !err && sl::IsConcatenationSupported(params.input_infos, params.concat_info);
+#endif
+    });
+
+TVM_REGISTER_GLOBAL("relay.ethos-n.support.split")
+    .set_body([](tvm::TVMArgs args, tvm::TVMRetValue* rv) {
+      Call call = args[0];
+      SplitParams params;
+      auto err = EthosnAPI::Split(call, &params);
+#if _ETHOSN_API_VERSION_ == 2011
+      *rv = !err && m_Queries.IsSplitSupported(params.input_info, params.split_info);
+#else
+      *rv = !err && sl::IsSplitSupported(params.input_info, params.split_info);
+#endif
+    });
+
+TVM_REGISTER_GLOBAL("relay.ethos-n.support.depth_to_space")
+    .set_body([](tvm::TVMArgs args, tvm::TVMRetValue* rv) {
+      Call call = args[0];
+      DepthToSpaceParams params;
+      auto err = EthosnAPI::DepthToSpace(call, &params);
+#if _ETHOSN_API_VERSION_ == 2011
+      *rv = !err && m_Queries.IsDepthToSpaceSupported(params.input_info, params.depth_info);
+#else
+      *rv = !err && sl::IsDepthToSpaceSupported(params.input_info, params.depth_info);
+#endif
+    });
+
+TVM_REGISTER_GLOBAL("relay.ethos-n.support.relu")
+    .set_body([](tvm::TVMArgs args, tvm::TVMRetValue* rv) {
+      Call call = args[0];
+      ReluParams params;
+      auto err = EthosnAPI::Relu(call, &params);
+#if _ETHOSN_API_VERSION_ == 2011
+      *rv = !err && m_Queries.IsReluSupported(params.relu_info, params.input_info);
+#else
+      *rv = !err && sl::IsReluSupported(params.relu_info, params.input_info);
+#endif
+    });
+
+TVM_REGISTER_GLOBAL("relay.ethos-n.query").set_body([](tvm::TVMArgs args, tvm::TVMRetValue* rv) {
+#if defined ETHOSN_HW
+  *rv = true;
+#else
+  *rv = false;
+#endif
+});
+
+TVM_REGISTER_GLOBAL("relay.ethos-n.api.version").set_body_typed([]() -> int {
+  return _ETHOSN_API_VERSION_;
+});
+
 }  // namespace ethosn
 }  // namespace contrib
 }  // namespace relay
diff --git a/src/relay/backend/contrib/ethosn/codegen_ethosn.h b/src/relay/backend/contrib/ethosn/codegen_ethosn.h
index 9887a2b3ad78..e44aa31d6b13 100644
--- a/src/relay/backend/contrib/ethosn/codegen_ethosn.h
+++ b/src/relay/backend/contrib/ethosn/codegen_ethosn.h
@@ -240,14 +240,12 @@ struct EthosnCompilerConfigNode : public tvm::AttrsNode<EthosnCompilerConfigNode
   bool block_config_8x32;
   bool block_config_8x8;
   bool enable_intermediate_compression;
-  bool disable_winograd;
+#if _ETHOSN_API_VERSION_ == 2008
   bool dump_debug_files;
+#endif
+  bool disable_winograd;
   String debug_dir;
-#if _ETHOSN_API_VERSION_ == 2008
   String compiler_algorithm;
-#else
-  bool enable_cascading;
-#endif
 
   TVM_DECLARE_ATTRS(EthosnCompilerConfigNode, "ext.attrs.EthosnCompilerConfigNode") {
     TVM_ATTR_FIELD(variant)
@@ -268,14 +266,12 @@ struct EthosnCompilerConfigNode : public tvm::AttrsNode<EthosnCompilerConfigNode
     TVM_ATTR_FIELD(block_config_8x32).set_default(true);
     TVM_ATTR_FIELD(block_config_8x8).set_default(true);
     TVM_ATTR_FIELD(enable_intermediate_compression).set_default(true);
-    TVM_ATTR_FIELD(disable_winograd).set_default(false);
+#if _ETHOSN_API_VERSION_ == 2008
     TVM_ATTR_FIELD(dump_debug_files).set_default(false);
+#endif
+    TVM_ATTR_FIELD(disable_winograd).set_default(false);
     TVM_ATTR_FIELD(debug_dir).set_default(".");
-#if _ETHOSN_API_VERSION_ == 2008
     TVM_ATTR_FIELD(compiler_algorithm).set_default("NonCascadingOnly");
-#else
-    TVM_ATTR_FIELD(enable_cascading).set_default(false);
-#endif
   }
 };
 
diff --git a/src/relay/backend/contrib/ethosn/ethosn_api.cc b/src/relay/backend/contrib/ethosn/ethosn_api.cc
index 6cc03ac7b8f0..4587cdbfed5a 100644
--- a/src/relay/backend/contrib/ethosn/ethosn_api.cc
+++ b/src/relay/backend/contrib/ethosn/ethosn_api.cc
@@ -72,11 +72,7 @@ EthosnError EthosnAPI::QnnConv2d(const Expr& expr, ConvolutionParams* params) {
   sl::QuantizationInfo output_q_info;
   err += Tvm2Npu(input_zero_point, input_scale, &data_q_info);
   err += Tvm2Npu(kernel_zero_point, kernel_scale, &weights_q_info);
-#if _ETHOSN_API_VERSION_ == 2008
   err += Tvm2Npu(0, data_q_info.GetScale() * weights_q_info.GetScale(), &bias_q_info);
-#else
-  err += Tvm2Npu(0, data_q_info.m_Scale * weights_q_info.m_Scale, &bias_q_info);
-#endif
   err += Tvm2Npu(output_zero_point, output_scale, &output_q_info);
 
   // Convert convolution attributes
@@ -170,11 +166,7 @@ EthosnError EthosnAPI::QnnFullyConnected(const Expr& expr, FullyConnectedParams*
   sl::QuantizationInfo output_q_info;
   err += Tvm2Npu(input_zero_point, input_scale, &data_q_info);
   err += Tvm2Npu(kernel_zero_point, kernel_scale, &weights_q_info);
-#if _ETHOSN_API_VERSION_ == 2008
   err += Tvm2Npu(0, data_q_info.GetScale() * weights_q_info.GetScale(), &bias_q_info);
-#else
-  err += Tvm2Npu(0, data_q_info.m_Scale * weights_q_info.m_Scale, &bias_q_info);
-#endif
   err += Tvm2Npu(output_zero_point, output_scale, &output_q_info);
 
   // Create fc info
@@ -629,114 +621,6 @@ EthosnError EthosnAPI::AsConstant(const Expr& expr, T* out) {
   return EthosnError();
 }
 
-TVM_REGISTER_GLOBAL("relay.ethos-n.support.conv2d")
-    .set_body([](tvm::TVMArgs args, tvm::TVMRetValue* rv) {
-      Call call = args[0];
-      ConvolutionParams params;
-      auto err = EthosnAPI::QnnConv2d(call, &params);
-      if (params.is_depthwise) {
-        *rv = !err && sl::IsDepthwiseConvolutionSupported(params.bias_info, params.weights_info,
-                                                          params.conv_info, params.activation_info);
-      } else {
-        *rv = !err && sl::IsConvolutionSupported(params.bias_info, params.weights_info,
-                                                 params.conv_info, params.activation_info);
-      }
-    });
-
-TVM_REGISTER_GLOBAL("relay.ethos-n.support.fc")
-    .set_body([](tvm::TVMArgs args, tvm::TVMRetValue* rv) {
-      Call call = args[0];
-      FullyConnectedParams params;
-      auto err = EthosnAPI::QnnFullyConnected(call, &params);
-      *rv = !err && sl::IsFullyConnectedSupported(params.bias_info, params.weights_info,
-                                                  params.fc_info, params.input_info);
-    });
-
-TVM_REGISTER_GLOBAL("relay.ethos-n.support.max_pool2d")
-    .set_body([](tvm::TVMArgs args, tvm::TVMRetValue* rv) {
-      Call call = args[0];
-      MaxPool2DParams params;
-      auto err = EthosnAPI::MaxPool2D(call, &params);
-      *rv = !err && sl::IsPoolingSupported(params.pool_info, params.input_info);
-    });
-
-TVM_REGISTER_GLOBAL("relay.ethos-n.support.avg_pool2d")
-    .set_body([](tvm::TVMArgs args, tvm::TVMRetValue* rv) {
-      Call call = args[0];
-      AvgPool2DParams params;
-      auto err = EthosnAPI::AvgPool2D(call, &params);
-      *rv = !err && sl::IsPoolingSupported(params.pool_info, params.input_info);
-    });
-
-TVM_REGISTER_GLOBAL("relay.ethos-n.support.reshape")
-    .set_body([](tvm::TVMArgs args, tvm::TVMRetValue* rv) {
-      Call call = args[0];
-      ReshapeParams params;
-      auto err = EthosnAPI::Reshape(call, &params);
-      *rv = !err && sl::IsReshapeSupported(params.new_shape, params.input_info);
-    });
-
-TVM_REGISTER_GLOBAL("relay.ethos-n.support.addition")
-    .set_body([](tvm::TVMArgs args, tvm::TVMRetValue* rv) {
-      Call call = args[0];
-      AdditionParams params;
-      auto err = EthosnAPI::Addition(call, &params);
-      *rv = !err && sl::IsAdditionSupported(params.lhs_info, params.rhs_info,
-                                            params.output_quantization_info);
-    });
-
-TVM_REGISTER_GLOBAL("relay.ethos-n.support.sigmoid")
-    .set_body([](tvm::TVMArgs args, tvm::TVMRetValue* rv) {
-      Call call = args[0];
-      SigmoidParams params;
-      auto err = EthosnAPI::Sigmoid(call, &params);
-      *rv = !err && sl::IsSigmoidSupported(params.input_info);
-    });
-
-TVM_REGISTER_GLOBAL("relay.ethos-n.support.concatenate")
-    .set_body([](tvm::TVMArgs args, tvm::TVMRetValue* rv) {
-      Call call = args[0];
-      ConcatenateParams params;
-      auto err = EthosnAPI::Concatenate(call, &params);
-      *rv = !err && sl::IsConcatenationSupported(params.input_infos, params.concat_info);
-    });
-
-TVM_REGISTER_GLOBAL("relay.ethos-n.support.split")
-    .set_body([](tvm::TVMArgs args, tvm::TVMRetValue* rv) {
-      Call call = args[0];
-      SplitParams params;
-      auto err = EthosnAPI::Split(call, &params);
-      *rv = !err && sl::IsSplitSupported(params.input_info, params.split_info);
-    });
-
-TVM_REGISTER_GLOBAL("relay.ethos-n.support.depth_to_space")
-    .set_body([](tvm::TVMArgs args, tvm::TVMRetValue* rv) {
-      Call call = args[0];
-      DepthToSpaceParams params;
-      auto err = EthosnAPI::DepthToSpace(call, &params);
-      *rv = !err && sl::IsDepthToSpaceSupported(params.input_info, params.depth_info);
-    });
-
-TVM_REGISTER_GLOBAL("relay.ethos-n.support.relu")
-    .set_body([](tvm::TVMArgs args, tvm::TVMRetValue* rv) {
-      Call call = args[0];
-      ReluParams params;
-      auto err = EthosnAPI::Relu(call, &params);
-      *rv = !err && sl::IsReluSupported(params.relu_info, params.input_info);
-    });
-
-TVM_REGISTER_GLOBAL("relay.ethos-n.query").set_body([](tvm::TVMArgs args, tvm::TVMRetValue* rv) {
-#if defined ETHOSN_HW
-  *rv = true;
-#else
-  *rv = false;
-#endif
-});
-
-TVM_REGISTER_GLOBAL("relay.ethos-n.api.version").set_body_typed([]() -> int {
-  return _ETHOSN_API_VERSION_;
-});
-
 }  // namespace ethosn
 }  // namespace contrib
 }  // namespace relay
diff --git a/src/relay/backend/contrib/ethosn/ethosn_api_version.h b/src/relay/backend/contrib/ethosn/ethosn_api_version.h
index 618b702da333..78f08950bb48 100644
--- a/src/relay/backend/contrib/ethosn/ethosn_api_version.h
+++ b/src/relay/backend/contrib/ethosn/ethosn_api_version.h
@@ -29,10 +29,12 @@
  * along with associated compatibility measures when no
  * longer necessary.
  */
+#ifndef ETHOSN_API_VERSION
 #define _ETHOSN_API_VERSION_ 2008
-#ifndef COMPILER_ALGORITHM_MODE
-#undef _ETHOSN_API_VERSION_
-#define _ETHOSN_API_VERSION_ 2005
+#elif ~(~ETHOSN_API_VERSION + 0) == 0 && ~(~ETHOSN_API_VERSION + 1) == 1
+#define _ETHOSN_API_VERSION_ 2008
+#else
+#define _ETHOSN_API_VERSION_ ETHOSN_API_VERSION
 #endif
 
 #endif  // TVM_RELAY_BACKEND_CONTRIB_ETHOSN_ETHOSN_API_VERSION_H_
diff --git a/tests/python/contrib/test_ethosn/test_networks.py b/tests/python/contrib/test_ethosn/test_networks.py
index e0eccdfb30f5..06ce93b2aba5 100644
--- a/tests/python/contrib/test_ethosn/test_networks.py
+++ b/tests/python/contrib/test_ethosn/test_networks.py
@@ -127,6 +127,10 @@ def test_mobilenet_v1():
         _compile_hash = {"47e216d8ab2bf491708ccf5620bc0d02"}
         if tei.get_ethosn_variant() == 3:
             _compile_hash = {"2436f523e263f66a063cef902f2f43d7"}
+    if tei.get_ethosn_api_version() == 2011:
+        _compile_hash = {"9298b6c51e2a82f70e91dd11dd6af412"}
+        if tei.get_ethosn_variant() == 3:
+            _compile_hash = {"407eb47346c8afea2d15e8f0d1c079f2"}
     _test_image_network(
         model_url="https://storage.googleapis.com/download.tensorflow.org/"
         "models/mobilenet_v1_2018_08_02/mobilenet_v1_1.0_224_quant.tgz",
@@ -151,6 +155,10 @@ def test_inception_v3():
         _compile_hash = {"8c9d75659cd7bc9ff6dd6d490d28f9b2"}
         if tei.get_ethosn_variant() == 3:
             _compile_hash = {"cdd4d7f6453d722ea73224ff9d6a115a"}
+    if tei.get_ethosn_api_version() == 2011:
+        _compile_hash = {"d44eece5027ff56e5e7fcf014367378d"}
+        if tei.get_ethosn_variant() == 3:
+            _compile_hash = {"1ba555b4bc60c428018a0f2de9d90532"}
     _test_image_network(
         model_url="https://storage.googleapis.com/download.tensorflow.org/"
         "models/tflite_11_05_08/inception_v3_quant.tgz",
@@ -169,11 +177,17 @@ def test_inception_v4():
     # codegen, which could come about from either a change in Support Library
     # version or a change in the Ethos-N codegen. To update this requires running
     # on hardware that isn't available in CI.
-    if not tei.get_ethosn_variant() == 0:
-        pytest.skip("Ethos-N78 20.08 does not support inception_v4 in the default configuration.")
     _compile_hash = {"06bf6cb56344f3904bcb108e54edfe87"}
     if tei.get_ethosn_api_version() == 2008:
+        if not tei.get_ethosn_variant() == 0:
+            pytest.skip(
+                "Ethos-N78 20.08 does not support inception_v4 in the default configuration."
+            )
         _compile_hash = {"798292bfa596ca7c32086396b494b46c"}
+    if tei.get_ethosn_api_version() == 2011:
+        _compile_hash = {"53f126cf654d4cf61ebb23c767f6740b"}
+        if tei.get_ethosn_variant() == 3:
+            _compile_hash = {"851665c060cf4719248919d17325ae02"}
     _test_image_network(
         model_url="https://storage.googleapis.com/download.tensorflow.org/"
         "models/inception_v4_299_quant_20181026.tgz",
@@ -197,6 +211,10 @@ def test_ssd_mobilenet_v1():
         _compile_hash = {"5999f26e140dee0d7866491997ef78c5", "24e3a690a7e95780052792d5626c85be"}
         if tei.get_ethosn_variant() == 3:
             _compile_hash = {"da871b3f03a93df69d704ed44584d6cd", "9f52411d301f3cba3f6e4c0f1c558e87"}
+    if tei.get_ethosn_api_version() == 2011:
+        _compile_hash = {"6e8c4586bdd26527c642a4f016f52284", "057c5efb094c79fbe4483b561147f1d2"}
+        if tei.get_ethosn_variant() == 3:
+            _compile_hash = {"dc687e60a4b6750fe740853f22aeb2dc", "1949d86100004eca41099c8e6fa919ab"}
     _test_image_network(
         model_url="https://storage.googleapis.com/download.tensorflow.org/"
         "models/tflite/coco_ssd_mobilenet_v1_1.0_quant_2018_06_29.zip",
diff --git a/tests/python/contrib/test_ethosn/test_reshape.py b/tests/python/contrib/test_ethosn/test_reshape.py
index 4afec557e569..20df5f9bd288 100644
--- a/tests/python/contrib/test_ethosn/test_reshape.py
+++ b/tests/python/contrib/test_ethosn/test_reshape.py
@@ -37,8 +37,8 @@ def test_reshape():
         return
 
     trials = [
-        ((1, 15, 4, 1), (60,)),
-        ((1, 15, 4, 1), (30, 2)),
+        ((1, 15, 4, 1), (1, 60)),
+        ((1, 15, 4, 1), (1, 30, 2)),
         ((1, 15, 4, 1), (1, 4, 15, 1)),
         ((1, 15, 4, 1), (1, 12, 5, 1)),
         ((1, 15, 4, 1), (1, -1, 2, 1)),

From 6aae48b3785aeba565138f4bb63884ae9930b2aa Mon Sep 17 00:00:00 2001
From: Jared Roesch <jroesch@octoml.ai>
Date: Thu, 4 Mar 2021 10:43:14 -0800
Subject: [PATCH 277/357] Fixes for using Python APIs from Rust. (#7085)

* Rewrite the Rust Module API and change some imports causing crashes.

This commit also updates the docs to remove outdated information.

* Renable Python test and remove warnings

* Python test still flaky

* Fix broken module test

* Fix broken test

* Reset test file
---
 python/tvm/relay/analysis/analysis.py         |   6 +-
 .../tvm/relay/analysis/annotated_regions.py   |   2 +-
 python/tvm/relay/analysis/call_graph.py       |   4 +-
 .../relay/backend/graph_runtime_factory.py    |   2 +-
 python/tvm/relay/build_module.py              |  16 +-
 python/tvm/relay/frontend/__init__.py         |   3 -
 python/tvm/relay/frontend/tensorflow.py       |   7 +-
 python/tvm/topi/cuda/__init__.py              |   2 -
 rust/tvm-rt/README.md                         |   4 +-
 rust/tvm-rt/src/lib.rs                        |  28 ++-
 rust/tvm-rt/src/map.rs                        |  12 +
 rust/tvm-rt/src/module.rs                     |  58 ++---
 rust/tvm-rt/src/object/object_ptr.rs          |  13 +-
 rust/tvm-rt/src/to_function.rs                |   1 +
 rust/tvm-rt/src/value.rs                      | 106 --------
 rust/tvm/Cargo.toml                           |   3 +-
 rust/tvm/README.md                            | 233 ++----------------
 rust/tvm/src/compiler/graph_rt.rs             | 124 ++++++++++
 rust/tvm/src/compiler/mod.rs                  |  20 ++
 rust/tvm/src/ir/expr.rs                       |  12 +-
 rust/tvm/src/ir/function.rs                   |  10 +-
 rust/tvm/src/ir/module.rs                     |   4 +-
 rust/tvm/src/ir/relay/mod.rs                  |  85 +++----
 rust/tvm/src/ir/tir.rs                        |   7 +-
 rust/tvm/src/ir/ty.rs                         |  19 +-
 rust/tvm/src/lib.rs                           |   4 +-
 rust/tvm/src/python.rs                        |  24 +-
 rust/tvm/src/runtime/graph_rt.rs              |  12 +-
 rust/tvm/tests/basics/src/main.rs             |   5 +-
 rust/tvm/tests/basics/src/tvm_add.py          |   1 -
 src/runtime/module.cc                         |   2 +-
 31 files changed, 392 insertions(+), 437 deletions(-)
 delete mode 100644 rust/tvm-rt/src/value.rs
 create mode 100644 rust/tvm/src/compiler/graph_rt.rs
 create mode 100644 rust/tvm/src/compiler/mod.rs

diff --git a/python/tvm/relay/analysis/analysis.py b/python/tvm/relay/analysis/analysis.py
index 7e49461dff52..48e9ce0643a9 100644
--- a/python/tvm/relay/analysis/analysis.py
+++ b/python/tvm/relay/analysis/analysis.py
@@ -20,9 +20,9 @@
 This file contains the set of passes for Relay, which exposes an interface for
 configuring the passes and scripting them in Python.
 """
-from tvm.ir import IRModule
-from tvm.relay import transform, build_module
-from tvm.runtime.ndarray import cpu
+from ...ir import IRModule
+from ...relay import transform, build_module
+from ...runtime.ndarray import cpu
 
 from . import _ffi_api
 from .feature import Feature
diff --git a/python/tvm/relay/analysis/annotated_regions.py b/python/tvm/relay/analysis/annotated_regions.py
index 437b97b0fa16..a18ccb97836b 100644
--- a/python/tvm/relay/analysis/annotated_regions.py
+++ b/python/tvm/relay/analysis/annotated_regions.py
@@ -17,7 +17,7 @@
 # pylint: disable=no-else-return, unidiomatic-typecheck, invalid-name, unused-import
 """Regions used in Relay."""
 
-from tvm.runtime import Object
+from ...runtime import Object
 from . import _ffi_api
 
 
diff --git a/python/tvm/relay/analysis/call_graph.py b/python/tvm/relay/analysis/call_graph.py
index 966659aac494..fd9704d0af1f 100644
--- a/python/tvm/relay/analysis/call_graph.py
+++ b/python/tvm/relay/analysis/call_graph.py
@@ -17,8 +17,8 @@
 # pylint: disable=no-else-return, unidiomatic-typecheck, invalid-name, unused-import
 """Call graph used in Relay."""
 
-from tvm.ir import IRModule
-from tvm.runtime import Object
+from ...ir import IRModule
+from ...runtime import Object
 from ..expr import GlobalVar
 from . import _ffi_api
 
diff --git a/python/tvm/relay/backend/graph_runtime_factory.py b/python/tvm/relay/backend/graph_runtime_factory.py
index 4c6ac47b71b4..3427a62cd491 100644
--- a/python/tvm/relay/backend/graph_runtime_factory.py
+++ b/python/tvm/relay/backend/graph_runtime_factory.py
@@ -21,7 +21,7 @@
 from tvm.runtime import ndarray
 
 
-class GraphRuntimeFactoryModule(object):
+class GraphRuntimeFactoryModule:
     """Graph runtime factory module.
     This is a module of graph runtime factory
 
diff --git a/python/tvm/relay/build_module.py b/python/tvm/relay/build_module.py
index f05e105ed2a2..79eb7e4f19ff 100644
--- a/python/tvm/relay/build_module.py
+++ b/python/tvm/relay/build_module.py
@@ -25,7 +25,7 @@
 
 from tvm.ir.transform import PassContext
 from tvm.tir import expr as tvm_expr
-from .. import nd as _nd, autotvm
+from .. import nd as _nd, autotvm, register_func
 from ..target import Target
 from ..contrib import graph_runtime as _graph_rt
 from . import _build_module
@@ -194,6 +194,20 @@ def get_params(self):
         return ret
 
 
+@register_func("tvm.relay.module_export_library")
+def _module_export(module, file_name):  # fcompile, addons, kwargs?
+    return module.export_library(file_name)
+
+
+@register_func("tvm.relay.build")
+def _build_module_no_factory(mod, target=None, target_host=None, params=None, mod_name="default"):
+    """A wrapper around build which discards the Python GraphFactoryRuntime.
+    This wrapper is suitable to be used from other programming languages as
+    the runtime::Module can be freely passed between language boundaries.
+    """
+    return build(mod, target, target_host, params, mod_name).module
+
+
 def build(mod, target=None, target_host=None, params=None, mod_name="default"):
     # fmt: off
     # pylint: disable=line-too-long
diff --git a/python/tvm/relay/frontend/__init__.py b/python/tvm/relay/frontend/__init__.py
index 7e16499ccc44..aa8ac4fc7434 100644
--- a/python/tvm/relay/frontend/__init__.py
+++ b/python/tvm/relay/frontend/__init__.py
@@ -20,9 +20,6 @@
 Contains the model importers currently defined
 for Relay.
 """
-
-from __future__ import absolute_import
-
 from .mxnet import from_mxnet
 from .mxnet_qnn_op_utils import quantize_conv_bias_mkldnn_from_var
 from .keras import from_keras
diff --git a/python/tvm/relay/frontend/tensorflow.py b/python/tvm/relay/frontend/tensorflow.py
index 20eb95ba7c00..b75331a4f9a2 100644
--- a/python/tvm/relay/frontend/tensorflow.py
+++ b/python/tvm/relay/frontend/tensorflow.py
@@ -1051,10 +1051,11 @@ def _impl(inputs, attr, params, mod):
 
 
 def _sparse_tensor_dense_matmul():
-    # Sparse utility from scipy
-    from scipy.sparse import csr_matrix
-
     def _impl(inputs, attr, params, mod):
+        # Loading this by default causes TVM to not be loadable from other languages.
+        # Sparse utility from scipy
+        from scipy.sparse import csr_matrix
+
         assert len(inputs) == 4, "There should be 4 input tensors"
 
         indices_tensor = _infer_value(inputs[0], params, mod).asnumpy()
diff --git a/python/tvm/topi/cuda/__init__.py b/python/tvm/topi/cuda/__init__.py
index 52e64804d692..c2f55668d2e2 100644
--- a/python/tvm/topi/cuda/__init__.py
+++ b/python/tvm/topi/cuda/__init__.py
@@ -17,8 +17,6 @@
 
 # pylint: disable=redefined-builtin, wildcard-import
 """CUDA specific declaration and schedules."""
-from __future__ import absolute_import as _abs
-
 from .conv1d import *
 from .conv1d_transpose_ncw import *
 from .conv2d import *
diff --git a/rust/tvm-rt/README.md b/rust/tvm-rt/README.md
index a99eeaa578dd..58b1f8a30a39 100644
--- a/rust/tvm-rt/README.md
+++ b/rust/tvm-rt/README.md
@@ -17,8 +17,8 @@
 
 # TVM Runtime Support
 
-This crate provides an idiomatic Rust API for [TVM](https://github.com/apache/tvm) runtime.
-Currently this is tested on `1.42.0` and above.
+This crate provides an idiomatic Rust API for [TVM](https://github.com/apache/tvm) runtime,
+see [here](https://github.com/apache/tvm/blob/main/rust/tvm/README.md) for more details.
 
 ## What Does This Crate Offer?
 
diff --git a/rust/tvm-rt/src/lib.rs b/rust/tvm-rt/src/lib.rs
index 4b163eff9c8f..5f9ab1617378 100644
--- a/rust/tvm-rt/src/lib.rs
+++ b/rust/tvm-rt/src/lib.rs
@@ -99,7 +99,6 @@ pub mod map;
 pub mod module;
 pub mod ndarray;
 mod to_function;
-pub mod value;
 
 /// Outputs the current TVM version.
 pub fn version() -> &'static str {
@@ -112,6 +111,8 @@ pub fn version() -> &'static str {
 #[cfg(test)]
 mod tests {
     use super::*;
+    use crate::{ByteArray, Context, DataType};
+    use std::{convert::TryInto, str::FromStr};
 
     #[test]
     fn print_version() {
@@ -127,4 +128,29 @@ mod tests {
             errors::NDArrayError::EmptyArray.to_string()
         );
     }
+
+    #[test]
+    fn bytearray() {
+        let w = vec![1u8, 2, 3, 4, 5];
+        let v = ByteArray::from(w.as_slice());
+        let tvm: ByteArray = RetValue::from(v).try_into().unwrap();
+        assert_eq!(
+            tvm.data(),
+            w.iter().copied().collect::<Vec<u8>>().as_slice()
+        );
+    }
+
+    #[test]
+    fn ty() {
+        let t = DataType::from_str("int32").unwrap();
+        let tvm: DataType = RetValue::from(t).try_into().unwrap();
+        assert_eq!(tvm, t);
+    }
+
+    #[test]
+    fn ctx() {
+        let c = Context::from_str("gpu").unwrap();
+        let tvm: Context = RetValue::from(c).try_into().unwrap();
+        assert_eq!(tvm, c);
+    }
 }
diff --git a/rust/tvm-rt/src/map.rs b/rust/tvm-rt/src/map.rs
index 5ea48893d86b..d6dfaf3641b8 100644
--- a/rust/tvm-rt/src/map.rs
+++ b/rust/tvm-rt/src/map.rs
@@ -107,6 +107,18 @@ where
         let oref: ObjectRef = map_get_item(self.object.clone(), key.upcast())?;
         oref.downcast()
     }
+
+    pub fn empty() -> Self {
+        Self::from_iter(vec![].into_iter())
+    }
+
+    //(@jroesch): I don't think this is a correct implementation.
+    pub fn null() -> Self {
+        Map {
+            object: ObjectRef::null(),
+            _data: PhantomData,
+        }
+    }
 }
 
 pub struct IntoIter<K, V> {
diff --git a/rust/tvm-rt/src/module.rs b/rust/tvm-rt/src/module.rs
index c0822a5045e6..6109819939af 100644
--- a/rust/tvm-rt/src/module.rs
+++ b/rust/tvm-rt/src/module.rs
@@ -26,21 +26,24 @@ use std::{
     ptr,
 };
 
+use crate::object::Object;
+use tvm_macros::Object;
 use tvm_sys::ffi;
 
 use crate::errors::Error;
+use crate::String as TString;
 use crate::{errors, function::Function};
 
-const ENTRY_FUNC: &str = "__tvm_main__";
-
 /// Wrapper around TVM module handle which contains an entry function.
 /// The entry function can be applied to an imported module through [`entry_func`].
 ///
 /// [`entry_func`]:struct.Module.html#method.entry_func
-#[derive(Debug, Clone)]
-pub struct Module {
-    pub(crate) handle: ffi::TVMModuleHandle,
-    entry_func: Option<Function>,
+#[repr(C)]
+#[derive(Object, Debug)]
+#[ref_name = "Module"]
+#[type_key = "runtime.Module"]
+pub struct ModuleNode {
+    base: Object,
 }
 
 crate::external! {
@@ -49,21 +52,18 @@ crate::external! {
 
     #[name("runtime.ModuleLoadFromFile")]
     fn load_from_file(file_name: CString, format: CString) -> Module;
+
+    #[name("runtime.ModuleSaveToFile")]
+    fn save_to_file(module: Module, name: TString, fmt: TString);
+
+    // TODO(@jroesch): we need to refactor this
+    #[name("tvm.relay.module_export_library")]
+    fn export_library(module: Module, file_name: TString);
 }
 
 impl Module {
-    pub(crate) fn new(handle: ffi::TVMModuleHandle) -> Self {
-        Self {
-            handle,
-            entry_func: None,
-        }
-    }
-
-    pub fn entry(&mut self) -> Option<Function> {
-        if self.entry_func.is_none() {
-            self.entry_func = self.get_function(ENTRY_FUNC, false).ok();
-        }
-        self.entry_func.clone()
+    pub fn default_fn(&mut self) -> Result<Function, Error> {
+        self.get_function("default", true)
     }
 
     /// Gets a function by name from a registered module.
@@ -72,7 +72,7 @@ impl Module {
         let mut fhandle = ptr::null_mut() as ffi::TVMFunctionHandle;
 
         check_call!(ffi::TVMModGetFunction(
-            self.handle,
+            self.handle(),
             name.as_ptr() as *const c_char,
             query_import as c_int,
             &mut fhandle as *mut _
@@ -87,7 +87,7 @@ impl Module {
 
     /// Imports a dependent module such as `.ptx` for gpu.
     pub fn import_module(&self, dependent_module: Module) {
-        check_call!(ffi::TVMModImport(self.handle, dependent_module.handle))
+        check_call!(ffi::TVMModImport(self.handle(), dependent_module.handle()))
     }
 
     /// Loads a module shared library from path.
@@ -110,6 +110,14 @@ impl Module {
         Ok(module)
     }
 
+    pub fn save_to_file(&self, name: String, fmt: String) -> Result<(), Error> {
+        save_to_file(self.clone(), name.into(), fmt.into())
+    }
+
+    pub fn export_library(&self, name: String) -> Result<(), Error> {
+        export_library(self.clone(), name.into())
+    }
+
     /// Checks if a target device is enabled for a module.
     pub fn enabled(&self, target: &str) -> bool {
         let target = CString::new(target).unwrap();
@@ -118,13 +126,7 @@ impl Module {
     }
 
     /// Returns the underlying module handle.
-    pub fn handle(&self) -> ffi::TVMModuleHandle {
-        self.handle
-    }
-}
-
-impl Drop for Module {
-    fn drop(&mut self) {
-        check_call!(ffi::TVMModFree(self.handle));
+    pub unsafe fn handle(&self) -> ffi::TVMModuleHandle {
+        self.0.clone().unwrap().into_raw() as *mut _
     }
 }
diff --git a/rust/tvm-rt/src/object/object_ptr.rs b/rust/tvm-rt/src/object/object_ptr.rs
index 8df6041956b8..264d5febd103 100644
--- a/rust/tvm-rt/src/object/object_ptr.rs
+++ b/rust/tvm-rt/src/object/object_ptr.rs
@@ -267,6 +267,10 @@ impl<T: IsObject> ObjectPtr<T> {
             Err(Error::downcast("TODOget_type_key".into(), U::TYPE_KEY))
         }
     }
+
+    pub unsafe fn into_raw(self) -> *mut T {
+        self.ptr.as_ptr()
+    }
 }
 
 impl<T: IsObject> std::ops::Deref for ObjectPtr<T> {
@@ -300,7 +304,7 @@ impl<'a, T: IsObject> TryFrom<RetValue> for ObjectPtr<T> {
         use crate::ndarray::NDArrayContainer;
 
         match ret_value {
-            RetValue::ObjectHandle(handle) => {
+            RetValue::ObjectHandle(handle) | RetValue::ModuleHandle(handle) => {
                 let optr = ObjectPtr::from_raw(handle as *mut Object).ok_or(Error::Null)?;
                 debug_assert!(optr.count() >= 1);
                 optr.downcast()
@@ -329,6 +333,11 @@ impl<'a, T: IsObject> From<ObjectPtr<T>> for ArgValue<'a> {
                 assert!(!raw_ptr.is_null());
                 ArgValue::NDArrayHandle(raw_ptr)
             }
+            "runtime.Module" => {
+                let raw_ptr = ObjectPtr::leak(object_ptr) as *mut Object as *mut std::ffi::c_void;
+                assert!(!raw_ptr.is_null());
+                ArgValue::ModuleHandle(raw_ptr)
+            }
             _ => {
                 let raw_ptr = ObjectPtr::leak(object_ptr) as *mut Object as *mut std::ffi::c_void;
                 assert!(!raw_ptr.is_null());
@@ -346,7 +355,7 @@ impl<'a, T: IsObject> TryFrom<ArgValue<'a>> for ObjectPtr<T> {
         use crate::ndarray::NDArrayContainer;
 
         match arg_value {
-            ArgValue::ObjectHandle(handle) => {
+            ArgValue::ObjectHandle(handle) | ArgValue::ModuleHandle(handle) => {
                 let optr = ObjectPtr::from_raw(handle as *mut Object).ok_or(Error::Null)?;
                 debug_assert!(optr.count() >= 1);
                 optr.downcast()
diff --git a/rust/tvm-rt/src/to_function.rs b/rust/tvm-rt/src/to_function.rs
index affd81b0e7ed..c5ede7d224ce 100644
--- a/rust/tvm-rt/src/to_function.rs
+++ b/rust/tvm-rt/src/to_function.rs
@@ -255,6 +255,7 @@ impl_typed_and_to_function!(2; A, B);
 impl_typed_and_to_function!(3; A, B, C);
 impl_typed_and_to_function!(4; A, B, C, D);
 impl_typed_and_to_function!(5; A, B, C, D, E);
+impl_typed_and_to_function!(6; A, B, C, D, E, G);
 
 #[cfg(test)]
 mod tests {
diff --git a/rust/tvm-rt/src/value.rs b/rust/tvm-rt/src/value.rs
deleted file mode 100644
index b8cd190176c4..000000000000
--- a/rust/tvm-rt/src/value.rs
+++ /dev/null
@@ -1,106 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-//! This module implements [`ArgValue`] and [`RetValue`] types
-//! and their conversions needed for the types used in frontend crate.
-//! `RetValue` is the owned version of `TVMPODValue`.
-
-use std::convert::TryFrom;
-
-use crate::{ArgValue, Module, RetValue};
-use tvm_sys::{errors::ValueDowncastError, ffi::TVMModuleHandle, try_downcast};
-
-macro_rules! impl_handle_val {
-    ($type:ty, $variant:ident, $inner_type:ty, $ctor:path) => {
-        impl<'a> From<&'a $type> for ArgValue<'a> {
-            fn from(arg: &'a $type) -> Self {
-                ArgValue::$variant(arg.handle() as $inner_type)
-            }
-        }
-
-        impl<'a> From<&'a mut $type> for ArgValue<'a> {
-            fn from(arg: &'a mut $type) -> Self {
-                ArgValue::$variant(arg.handle() as $inner_type)
-            }
-        }
-
-        impl<'a> TryFrom<ArgValue<'a>> for $type {
-            type Error = ValueDowncastError;
-            fn try_from(val: ArgValue<'a>) -> Result<$type, Self::Error> {
-                try_downcast!(val -> $type, |ArgValue::$variant(val)| { $ctor(val) })
-            }
-        }
-
-        impl<'a, 'v> TryFrom<&'a ArgValue<'v>> for $type {
-            type Error = ValueDowncastError;
-            fn try_from(val: &'a ArgValue<'v>) -> Result<$type, Self::Error> {
-                try_downcast!(val -> $type, |ArgValue::$variant(val)| { $ctor(*val) })
-            }
-        }
-
-        impl From<$type> for RetValue {
-            fn from(val: $type) -> RetValue {
-                RetValue::$variant(val.handle() as $inner_type)
-            }
-        }
-
-        impl TryFrom<RetValue> for $type {
-            type Error = ValueDowncastError;
-            fn try_from(val: RetValue) -> Result<$type, Self::Error> {
-                try_downcast!(val -> $type, |RetValue::$variant(val)| { $ctor(val) })
-            }
-        }
-    };
-}
-
-impl_handle_val!(Module, ModuleHandle, TVMModuleHandle, Module::new);
-
-#[cfg(test)]
-mod tests {
-    use std::{convert::TryInto, str::FromStr};
-
-    use crate::{ByteArray, Context, DataType};
-
-    use super::*;
-
-    #[test]
-    fn bytearray() {
-        let w = vec![1u8, 2, 3, 4, 5];
-        let v = ByteArray::from(w.as_slice());
-        let tvm: ByteArray = RetValue::from(v).try_into().unwrap();
-        assert_eq!(
-            tvm.data(),
-            w.iter().copied().collect::<Vec<u8>>().as_slice()
-        );
-    }
-
-    #[test]
-    fn ty() {
-        let t = DataType::from_str("int32").unwrap();
-        let tvm: DataType = RetValue::from(t).try_into().unwrap();
-        assert_eq!(tvm, t);
-    }
-
-    #[test]
-    fn ctx() {
-        let c = Context::from_str("gpu").unwrap();
-        let tvm: Context = RetValue::from(c).try_into().unwrap();
-        assert_eq!(tvm, c);
-    }
-}
diff --git a/rust/tvm/Cargo.toml b/rust/tvm/Cargo.toml
index 29d2003b5089..9438f340f78f 100644
--- a/rust/tvm/Cargo.toml
+++ b/rust/tvm/Cargo.toml
@@ -50,9 +50,10 @@ tvm-macros = { version = "*", path = "../tvm-macros/" }
 paste = "0.1"
 mashup = "0.1"
 once_cell = "^1.3.1"
-pyo3 = { version = "0.11.1", optional = true }
+pyo3 = { version = "^0.13", optional = true }
 codespan-reporting = "0.9.5"
 structopt = { version = "0.3" }
+tracing = "^0.1"
 
 [[bin]]
 name = "tyck"
diff --git a/rust/tvm/README.md b/rust/tvm/README.md
index 26f9f1fbedfd..75fabe7d9a1b 100644
--- a/rust/tvm/README.md
+++ b/rust/tvm/README.md
@@ -15,221 +15,40 @@
 <!--- specific language governing permissions and limitations -->
 <!--- under the License. -->
 
-# TVM Runtime Frontend Support
+# TVM
 
-This crate provides an idiomatic Rust API for [TVM](https://github.com/apache/tvm) runtime frontend. Currently this requires **Nightly Rust** and tested on `rustc 1.32.0-nightly`
+This crate provides an idiomatic Rust API for [TVM](https://github.com/apache/tvm).
+The code works on **Stable Rust** and is tested against `rustc 1.47`.
 
-## What Does This Crate Offer?
-
-Here is a major workflow
-
-1. Train your **Deep Learning** model using any major framework such as [PyTorch](https://pytorch.org/), [Apache MXNet](https://mxnet.apache.org/) or [TensorFlow](https://www.tensorflow.org/)
-2. Use **TVM** to build optimized model artifacts on a supported context such as CPU, GPU, OpenCL and specialized accelerators.
-3. Deploy your models using **Rust** :heart:
-
-### Example: Deploy Image Classification from Pretrained Resnet18 on ImageNet1k
-
-Please checkout [examples/resnet](examples/resnet) for the complete end-to-end example.
-
-Here's a Python snippet for downloading and building a pretrained Resnet18 via Apache MXNet and TVM
-
-```python
-block = get_model('resnet18_v1', pretrained=True)
-
-sym, params = relay.frontend.from_mxnet(block, shape_dict)
-# compile the model
-with relay.build_config(opt_level=opt_level):
-    graph, lib, params = relay.build(
-        net, target, params=params)
-# same the model artifacts
-lib.save(os.path.join(target_dir, "deploy_lib.o"))
-cc.create_shared(os.path.join(target_dir, "deploy_lib.so"),
-                [os.path.join(target_dir, "deploy_lib.o")])
-
-with open(os.path.join(target_dir, "deploy_graph.json"), "w") as fo:
-    fo.write(graph.json())
-with open(os.path.join(target_dir,"deploy_param.params"), "wb") as fo:
-    fo.write(relay.save_param_dict(params))
-```
+You can find the API Documentation [here](https://tvm.apache.org/docs/api/rust/tvm/index.html).
 
-Now, we need to input the artifacts to create and run the *Graph Runtime* to detect our input cat image
-
-![cat](https://github.com/dmlc/mxnet.js/blob/main/data/cat.png?raw=true)
+## What Does This Crate Offer?
 
-as demostrated in the following Rust snippet
+The goal of this crate is to provide bindings to both the TVM compiler and runtime
+APIs. First train your **Deep Learning** model using any major framework such as
+[PyTorch](https://pytorch.org/), [Apache MXNet](https://mxnet.apache.org/) or [TensorFlow](https://www.tensorflow.org/).
+Then use **TVM** to build and deploy optimized model artifacts on a supported devices such as CPU, GPU, OpenCL and specialized accelerators.
 
-```rust
-    let graph = fs::read_to_string("deploy_graph.json")?;
-    // load the built module
-    let lib = Module::load(&Path::new("deploy_lib.so"))?;
-    // get the global TVM graph runtime function
-    let runtime_create_fn = Function::get("tvm.graph_runtime.create", true).unwrap();
-    let runtime_create_fn_ret = call_packed!(
-        runtime_create_fn,
-        &graph,
-        &lib,
-        &ctx.device_type,
-        &ctx.device_id
-    )?;
-    // get graph runtime module
-    let graph_runtime_module: Module = runtime_create_fn_ret.try_into()?;
-    // get the registered `load_params` from runtime module
-    let ref load_param_fn = graph_runtime_module
-        .get_function("load_params", false)
-        .unwrap();
-    // parse parameters and convert to TVMByteArray
-    let params: Vec<u8> = fs::read("deploy_param.params")?;
-    let barr = TVMByteArray::from(&params);
-    // load the parameters
-    call_packed!(load_param_fn, &barr)?;
-    // get the set_input function
-    let ref set_input_fn = graph_runtime_module
-        .get_function("set_input", false)
-        .unwrap();
+The Rust bindings are composed of a few crates:
+- The [tvm](https://tvm.apache.org/docs/api/rust/tvm/index.html) crate which exposes Rust bindings to
+  both the compiler and runtime.
+- The [tvm_macros](https://tvm.apache.org/docs/api/rust/tvm/index.html) crate which provides macros
+  which generate unsafe boilerplate for TVM's data structures.
+- The [tvm_rt](https://tvm.apache.org/docs/api/rust/tvm_rt/index.html) crate which exposes Rust
+  bindings to the TVM runtime APIs.
+- The [tvm_sys] crate which provides raw bindings and linkage to the TVM C++ library.
+- The [tvm_graph_rt] crate which implements a version of the TVM graph runtime in Rust vs. C++.
 
-    call_packed!(set_input_fn, "data", &input)?;
-    // get `run` function from runtime module
-    let ref run_fn = graph_runtime_module.get_function("run", false).unwrap();
-    // execute the run function. Note that it has no argument
-    call_packed!(run_fn,)?;
-    // prepare to get the output
-    let output_shape = &mut [1, 1000];
-    let output = empty(output_shape, TVMContext::cpu(0), TVMType::from("float32"));
-    // get the `get_output` function from runtime module
-    let ref get_output_fn = graph_runtime_module
-        .get_function("get_output", false)
-        .unwrap();
-    // execute the get output function
-    call_packed!(get_output_fn, &0, &output)?;
-    // flatten the output as Vec<f32>
-    let output = output.to_vec::<f32>()?;
-```
+These crates have been recently refactored and reflect a much different philosophy than
+previous bindings, as well as much increased support for more of the TVM API including
+exposing all of the compiler internals.
 
-and the model correctly predicts the input image as **tiger cat**.
+These are still very much in development and should not be considered stable, but contributions
+and usage is welcome and encouraged. If you want to discuss design issues check our Discourse
+[forum](https://discuss.tvm.ai) and for bug reports check our GitHub [repository](https://github.com/apache/tvm).
 
-## Installations
+## Install
 
-Please follow TVM [installations](https://tvm.apache.org/docs/install/index.html), `export TVM_HOME=/path/to/tvm` and add `libtvm_runtime` to your `LD_LIBRARY_PATH`.
+Please follow the TVM [install](https://tvm.apache.org/docs/install/index.html) instructions, `export TVM_HOME=/path/to/tvm` and add `libtvm_runtime` to your `LD_LIBRARY_PATH`.
 
 *Note:* To run the end-to-end examples and tests, `tvm` and `topi` need to be added to your `PYTHONPATH` or it's automatic via an Anaconda environment when it is installed individually.
-
-## Supported TVM Functionalities
-
-### Use TVM to Generate Shared Library
-
-One can use the following Python snippet to generate `add_gpu.so` which add two vectors on GPU.
-
-```python
-import os
-import tvm
-from tvm import te
-from tvm.contrib import cc
-
-def test_add(target_dir):
-    if not tvm.runtime.enabled("cuda"):
-        print("skip {__file__} because cuda is not enabled...".format(__file__=__file__))
-        return
-    n = te.var("n")
-    A = te.placeholder((n,), name='A')
-    B = te.placeholder((n,), name='B')
-    C = te.compute(A.shape, lambda i: A[i] + B[i], name="C")
-    s = te.create_schedule(C.op)
-    bx, tx = s[C].split(C.op.axis[0], factor=64)
-    s[C].bind(bx, tvm.thread_axis("blockIdx.x"))
-    s[C].bind(tx, tvm.thread_axis("threadIdx.x"))
-    fadd_cuda = tvm.build(s, [A, B, C], "cuda", target_host="llvm", name="myadd")
-
-    fadd_cuda.save(os.path.join(target_dir, "add_gpu.o"))
-    fadd_cuda.imported_modules[0].save(os.path.join(target_dir, "add_gpu.ptx"))
-    cc.create_shared(os.path.join(target_dir, "add_gpu.so"),
-            [os.path.join(target_dir, "add_gpu.o")])
-
-
-if __name__ == "__main__":
-    import sys
-    if len(sys.argv) != 2:
-        sys.exit(-1)
-    test_add(sys.argv[1])
-```
-
-### Run the Generated Shared Library
-
-The following code snippet demonstrates how to load and test the generated shared library (`add_gpu.so`) in Rust.
-
-```rust
-extern crate tvm_frontend as tvm;
-
-use tvm::*;
-
-fn main() {
-    let shape = &mut [2];
-    let mut data = vec![3f32, 4.0];
-    let mut arr = empty(shape, TVMContext::gpu(0), TVMType::from("float32"));
-    arr.copy_from_buffer(data.as_mut_slice());
-    let mut ret = empty(shape, TVMContext::gpu(0), TVMType::from("float32"));
-    let mut fadd = Module::load(&Path::new("add_gpu.so")).unwrap();
-    let fadd_dep = Module::load(&Path::new("add_gpu.ptx")).unwrap();
-    assert!(fadd.enabled("gpu"));
-    fadd.import_module(fadd_dep);
-    fadd.entry();
-    function::Builder::from(&mut fadd)
-        .arg(&arr)
-        .arg(&arr)
-        .set_output(&mut ret)?
-        .invoke()
-        .unwrap();
-
-    assert_eq!(ret.to_vec::<f32>().unwrap(), vec![6f32, 8.0]);
-}
-```
-
-**Note:** it is required to instruct the `rustc` to link to the generated `add_gpu.so` in runtime, for example by
-`cargo:rustc-link-search=native=add_gpu`.
-
-See the tests and examples custom `build.rs` for more details.
-
-### Convert and Register a Rust Function as a TVM Packed Function
-
-One can use `register_global_func!` macro to convert and register a Rust
-function of type `fn(&[TVMArgValue]) -> Result<TVMRetValue>` to a global TVM **packed function** as follows
-
-```rust
-#[macro_use]
-extern crate tvm_frontend as tvm;
-use std::convert::TryInto;
-use tvm::*;
-
-fn main() {
-    register_global_func! {
-        fn sum(args: &[TVMArgValue]) -> Result<TVMRetValue> {
-            let mut ret = 0f32;
-            let shape = &mut [2];
-            for arg in args.iter() {
-                let e = empty(shape, TVMContext::cpu(0), TVMType::from("float32"));
-                let arg: NDArray = arg.try_into()?;
-                let arr = arg.copy_to_ndarray(e).unwrap();
-                let rnd: ArrayD<f32> = ArrayD::try_from(&arr).unwrap();
-                ret += rnd.scalar_sum();
-            }
-            let ret_val = TVMRetValue::from(&ret);
-            Ok(ret_val)
-        }
-    }
-
-    let shape = &mut [2];
-    let mut data = vec![3f32, 4.0];
-    let mut arr = empty(shape, TVMContext::cpu(0), TVMType::from("float32"));
-    arr.copy_from_buffer(data.as_mut_slice());
-    let mut registered = function::Builder::default();
-    let ret: f64 = registered
-        .get_function("sum", true)
-        .arg(&arr)
-        .arg(&arr)
-        .invoke()
-        .unwrap()
-        .try_into()
-        .unwrap();
-
-    assert_eq!(ret, 14f64);
-}
-```
diff --git a/rust/tvm/src/compiler/graph_rt.rs b/rust/tvm/src/compiler/graph_rt.rs
new file mode 100644
index 000000000000..6b5873398cab
--- /dev/null
+++ b/rust/tvm/src/compiler/graph_rt.rs
@@ -0,0 +1,124 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+use std::convert::TryInto;
+use std::io::Read;
+use std::path::Path;
+
+use once_cell::sync::Lazy;
+use thiserror::Error;
+
+use crate::ir::IRModule;
+use crate::python;
+use crate::runtime::{map::Map, Function, Module as RtModule, NDArray, String};
+
+#[derive(Error, Debug)]
+pub enum Error {
+    #[error("{0}")]
+    IO(#[from] std::io::Error),
+    #[error("{0}")]
+    TVM(#[from] crate::errors::Error),
+}
+
+static TVM_BUILD: Lazy<Function> = Lazy::new(|| {
+    python::import("tvm").unwrap();
+    python::import("tvm.relay").unwrap();
+    Function::get("tvm.relay.build").unwrap()
+});
+
+fn _compile_module(
+    module: IRModule,
+    target: String,
+    target_host: String,
+    params: Map<String, NDArray>,
+    module_name: String,
+) -> Result<RtModule, Error> {
+    // The RAW API is Fn(IRModule, String, String, Map<String, NDArray>, String);
+    let module = TVM_BUILD.invoke(vec![
+        module.into(),
+        target.into(),
+        target_host.into(),
+        params.into(),
+        module_name.into(),
+    ])?;
+    let module: RtModule = module.try_into().unwrap();
+    Ok(module)
+}
+
+#[derive(Debug)]
+pub struct CompilerConfig {
+    target: Option<String>,
+    target_host: Option<String>,
+    params: Map<String, NDArray>,
+    module_name: Option<String>,
+}
+
+impl Default for CompilerConfig {
+    fn default() -> Self {
+        CompilerConfig {
+            target: None,
+            target_host: None,
+            params: Map::empty(),
+            module_name: None,
+        }
+    }
+}
+
+/// Compile a module from a configuration and IRModule.
+///
+/// # Arguments
+///
+/// * `config` - The configuration for the compiler.
+/// * `module` - The IRModule to compile.
+pub fn compile_module(config: CompilerConfig, module: IRModule) -> Result<RtModule, Error> {
+    let target = config.target.unwrap_or("llvm".into());
+    _compile_module(
+        module,
+        target,
+        "llvm".into(),
+        Map::<String, NDArray>::empty(),
+        "default".into(),
+    )
+}
+
+/// Compile an IRModule on disk and output a runtime module to disk.
+///
+/// # Arguments
+/// * `config` - The configuration for the compiler.
+/// * `ir_mod_path` - The path the serialized IRModule.
+//
+/// * `output_rt_mod_path` - The path to the output runtime module.
+pub fn compile_from_disk<P1, P2>(
+    config: CompilerConfig,
+    ir_mod_path: P1,
+    output_rt_mod_path: P2,
+) -> Result<(), Error>
+where
+    P1: AsRef<Path>,
+    P2: AsRef<Path>,
+{
+    let mut input_file = std::fs::File::open(ir_mod_path.as_ref())?;
+    let mut input_module_text = std::string::String::new();
+    input_file.read_to_string(&mut input_module_text)?;
+    let input_module = IRModule::parse("name", input_module_text)?;
+    let rt_module = compile_module(config, input_module)?;
+    let output_path_str = output_rt_mod_path.as_ref().display().to_string();
+    rt_module.export_library(output_path_str)?;
+    Ok(())
+}
diff --git a/rust/tvm/src/compiler/mod.rs b/rust/tvm/src/compiler/mod.rs
new file mode 100644
index 000000000000..ed8b47edbad4
--- /dev/null
+++ b/rust/tvm/src/compiler/mod.rs
@@ -0,0 +1,20 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+pub mod graph_rt;
diff --git a/rust/tvm/src/ir/expr.rs b/rust/tvm/src/ir/expr.rs
index 653169def3a4..03d8a4920718 100644
--- a/rust/tvm/src/ir/expr.rs
+++ b/rust/tvm/src/ir/expr.rs
@@ -32,12 +32,14 @@ use super::span::Span;
 #[type_key = "Expr"]
 pub struct BaseExprNode {
     pub base: Object,
+    pub span: Span,
 }
 
 impl BaseExprNode {
-    pub fn base<T: IsObject>() -> BaseExprNode {
+    pub fn base<T: IsObject>(span: Span) -> BaseExprNode {
         BaseExprNode {
             base: Object::base::<T>(),
+            span,
         }
     }
 }
@@ -52,9 +54,9 @@ pub struct PrimExprNode {
 }
 
 impl PrimExprNode {
-    pub fn base<T: IsObject>(datatype: DataType) -> PrimExprNode {
+    pub fn base<T: IsObject>(datatype: DataType, span: Span) -> PrimExprNode {
         PrimExprNode {
-            base: BaseExprNode::base::<T>(),
+            base: BaseExprNode::base::<T>(span),
             datatype,
         }
     }
@@ -70,9 +72,9 @@ pub struct GlobalVarNode {
 }
 
 impl GlobalVar {
-    pub fn new(name_hint: String, _span: Span) -> GlobalVar {
+    pub fn new(name_hint: String, span: Span) -> GlobalVar {
         let node = GlobalVarNode {
-            base: relay::ExprNode::base::<GlobalVarNode>(),
+            base: relay::ExprNode::base::<GlobalVarNode>(span),
             name_hint: name_hint.into(),
         };
         GlobalVar(Some(ObjectPtr::new(node)))
diff --git a/rust/tvm/src/ir/function.rs b/rust/tvm/src/ir/function.rs
index 14c00ea02bf6..43aca869f385 100644
--- a/rust/tvm/src/ir/function.rs
+++ b/rust/tvm/src/ir/function.rs
@@ -17,12 +17,12 @@
  * under the License.
  */
 
-use crate::ir::relay::ExprNode;
-use crate::runtime::{IsObject, IsObjectRef, ObjectRef};
-
 use tvm_macros::Object;
 
-// Define Calling Convention.
+use super::span::Span;
+
+use crate::ir::relay::ExprNode;
+use crate::runtime::{IsObject, IsObjectRef, ObjectRef};
 
 // TODO(@jroesch): define DictAttrs
 pub type DictAttrs = ObjectRef;
@@ -39,7 +39,7 @@ pub struct BaseFuncNode {
 impl BaseFuncNode {
     pub fn base<T: IsObject>() -> BaseFuncNode {
         BaseFuncNode {
-            base: ExprNode::base::<T>(),
+            base: ExprNode::base::<T>(Span::null()),
             attrs: <ObjectRef as IsObjectRef>::null(),
         }
     }
diff --git a/rust/tvm/src/ir/module.rs b/rust/tvm/src/ir/module.rs
index a09f70dc25b9..513a906f6db4 100644
--- a/rust/tvm/src/ir/module.rs
+++ b/rust/tvm/src/ir/module.rs
@@ -279,8 +279,8 @@ mod tests {
         let name = GlobalTypeVar::new("my_type", TypeKind::Type, Span::null());
         let type_data = TypeData::new(name.clone(), vec![], vec![], Span::null());
         module.add_def(name.clone(), type_data, true)?;
-        let by_gtv = module.lookup_def(name)?;
-        let by_gv = module.lookup_def_str("my_type")?;
+        let _by_gtv = module.lookup_def(name)?;
+        let _by_gv = module.lookup_def_str("my_type")?;
         Ok(())
     }
 
diff --git a/rust/tvm/src/ir/relay/mod.rs b/rust/tvm/src/ir/relay/mod.rs
index 9d2983237acb..f43967f28d60 100644
--- a/rust/tvm/src/ir/relay/mod.rs
+++ b/rust/tvm/src/ir/relay/mod.rs
@@ -23,7 +23,7 @@ use super::attrs::Attrs;
 use super::expr::BaseExprNode;
 use super::function::BaseFuncNode;
 use super::span::Span;
-use super::ty::{Type, TypeNode};
+use super::ty::Type;
 
 use tvm_macros::Object;
 use tvm_rt::NDArray;
@@ -39,19 +39,14 @@ pub mod attrs;
 #[type_key = "RelayExpr"]
 pub struct ExprNode {
     pub base: BaseExprNode,
-    pub span: ObjectRef,
     pub checked_type: Type,
 }
 
 impl ExprNode {
-    pub fn base<T: IsObject>() -> ExprNode {
+    pub fn base<T: IsObject>(span: Span) -> ExprNode {
         ExprNode {
-            base: BaseExprNode::base::<T>(),
-            span: ObjectRef::null(),
-            checked_type: Type::from(TypeNode {
-                base: Object::base::<TypeNode>(),
-                span: Span::null(),
-            }),
+            base: BaseExprNode::base::<T>(span.clone()),
+            checked_type: Type::null(),
         }
     }
 }
@@ -85,9 +80,9 @@ pub struct ConstantNode {
 }
 
 impl Constant {
-    pub fn new(data: NDArray, _span: ObjectRef) -> Constant {
+    pub fn new(data: NDArray, span: Span) -> Constant {
         let node = ConstantNode {
-            base: ExprNode::base::<ConstantNode>(),
+            base: ExprNode::base::<ConstantNode>(span),
             data: data,
         };
         Constant(Some(ObjectPtr::new(node)))
@@ -104,9 +99,9 @@ pub struct TupleNode {
 }
 
 impl Tuple {
-    pub fn new(fields: Array<Expr>, _span: ObjectRef) -> Tuple {
+    pub fn new(fields: Array<Expr>, span: Span) -> Tuple {
         let node = TupleNode {
-            base: ExprNode::base::<TupleNode>(),
+            base: ExprNode::base::<TupleNode>(span),
             fields,
         };
         Tuple(Some(ObjectPtr::new(node)))
@@ -124,9 +119,9 @@ pub struct VarNode {
 }
 
 impl Var {
-    pub fn new(name_hint: String, type_annotation: Type, _span: Span) -> Var {
+    pub fn new(name_hint: String, type_annotation: Type, span: Span) -> Var {
         let node = VarNode {
-            base: ExprNode::base::<VarNode>(),
+            base: ExprNode::base::<VarNode>(span),
             vid: Id::new(name_hint.into()),
             type_annotation: type_annotation,
         };
@@ -165,10 +160,10 @@ impl Call {
         args: Array<Expr>,
         attrs: Attrs,
         type_args: Array<Type>,
-        _span: ObjectRef,
+        span: Span,
     ) -> Call {
         let node = CallNode {
-            base: ExprNode::base::<VarNode>(),
+            base: ExprNode::base::<VarNode>(span),
             op: op,
             args: args,
             attrs: attrs,
@@ -190,9 +185,9 @@ pub struct LetNode {
 }
 
 impl Let {
-    pub fn new(var: Var, value: Expr, body: Expr, _span: ObjectRef) -> Let {
+    pub fn new(var: Var, value: Expr, body: Expr, span: Span) -> Let {
         let node = LetNode {
-            base: ExprNode::base::<LetNode>(),
+            base: ExprNode::base::<LetNode>(span),
             var,
             value,
             body,
@@ -213,9 +208,9 @@ pub struct IfNode {
 }
 
 impl If {
-    pub fn new(cond: Expr, true_branch: Expr, false_branch: Expr, _span: ObjectRef) -> If {
+    pub fn new(cond: Expr, true_branch: Expr, false_branch: Expr, span: Span) -> If {
         let node = IfNode {
-            base: ExprNode::base::<IfNode>(),
+            base: ExprNode::base::<IfNode>(span),
             cond,
             true_branch,
             false_branch,
@@ -235,9 +230,9 @@ pub struct TupleGetItemNode {
 }
 
 impl TupleGetItem {
-    pub fn new(tuple: Expr, index: i32, _span: ObjectRef) -> TupleGetItem {
+    pub fn new(tuple: Expr, index: i32, span: Span) -> TupleGetItem {
         let node = TupleGetItemNode {
-            base: ExprNode::base::<TupleGetItemNode>(),
+            base: ExprNode::base::<TupleGetItemNode>(span),
             tuple,
             index,
         };
@@ -255,9 +250,9 @@ pub struct RefCreateNode {
 }
 
 impl RefCreate {
-    pub fn new(value: Expr, _span: ObjectRef) -> RefCreate {
+    pub fn new(value: Expr, span: Span) -> RefCreate {
         let node = RefCreateNode {
-            base: ExprNode::base::<RefCreateNode>(),
+            base: ExprNode::base::<RefCreateNode>(span),
             value,
         };
         RefCreate(Some(ObjectPtr::new(node)))
@@ -274,9 +269,9 @@ pub struct RefReadNode {
 }
 
 impl RefRead {
-    pub fn new(ref_value: Expr, _span: ObjectRef) -> RefRead {
+    pub fn new(ref_value: Expr, span: Span) -> RefRead {
         let node = RefReadNode {
-            base: ExprNode::base::<RefReadNode>(),
+            base: ExprNode::base::<RefReadNode>(span),
             ref_value,
         };
         RefRead(Some(ObjectPtr::new(node)))
@@ -294,9 +289,9 @@ pub struct RefWriteNode {
 }
 
 impl RefWrite {
-    pub fn new(ref_value: Expr, value: Expr, _span: ObjectRef) -> RefWrite {
+    pub fn new(ref_value: Expr, value: Expr, span: Span) -> RefWrite {
         let node = RefWriteNode {
-            base: ExprNode::base::<RefWriteNode>(),
+            base: ExprNode::base::<RefWriteNode>(span),
             ref_value,
             value,
         };
@@ -316,9 +311,9 @@ pub struct ConstructorNode {
 }
 
 impl Constructor {
-    pub fn new(name_hint: String, inputs: Array<Type>, tag: i32, _span: ObjectRef) -> Constructor {
+    pub fn new(name_hint: String, inputs: Array<Type>, tag: i32, span: Span) -> Constructor {
         let node = ConstructorNode {
-            base: ExprNode::base::<ConstructorNode>(),
+            base: ExprNode::base::<ConstructorNode>(span),
             name_hint,
             inputs,
             tag,
@@ -335,14 +330,14 @@ impl Constructor {
 #[type_key = "relay.Pattern"]
 pub struct PatternNode {
     pub base: Object,
-    pub span: ObjectRef,
+    pub span: Span,
 }
 
 impl PatternNode {
-    pub fn base<T: IsObject>() -> PatternNode {
+    pub fn base<T: IsObject>(span: Span) -> PatternNode {
         PatternNode {
             base: Object::base::<T>(),
-            span: ObjectRef::null(),
+            span: span,
         }
     }
 }
@@ -356,9 +351,9 @@ pub struct PatternWildcardNode {
 }
 
 impl PatternWildcard {
-    pub fn new(_span: ObjectRef) -> PatternWildcard {
+    pub fn new(span: Span) -> PatternWildcard {
         let node = PatternWildcardNode {
-            base: PatternNode::base::<PatternWildcardNode>(),
+            base: PatternNode::base::<PatternWildcardNode>(span),
         };
         PatternWildcard(Some(ObjectPtr::new(node)))
     }
@@ -374,9 +369,9 @@ pub struct PatternVarNode {
 }
 
 impl PatternVar {
-    pub fn new(var: Var, _span: ObjectRef) -> PatternVar {
+    pub fn new(var: Var, span: Span) -> PatternVar {
         let node = PatternVarNode {
-            base: PatternNode::base::<PatternVarNode>(),
+            base: PatternNode::base::<PatternVarNode>(span),
             var: var,
         };
         PatternVar(Some(ObjectPtr::new(node)))
@@ -397,10 +392,10 @@ impl PatternConstructor {
     pub fn new(
         constructor: Constructor,
         patterns: Array<Pattern>,
-        _span: ObjectRef,
+        span: Span,
     ) -> PatternConstructor {
         let node = PatternConstructorNode {
-            base: PatternNode::base::<PatternConstructorNode>(),
+            base: PatternNode::base::<PatternConstructorNode>(span),
             constructor,
             patterns,
         };
@@ -418,9 +413,9 @@ pub struct PatternTupleNode {
 }
 
 impl PatternTuple {
-    pub fn new(patterns: Array<Pattern>, _span: ObjectRef) -> PatternTuple {
+    pub fn new(patterns: Array<Pattern>, span: Span) -> PatternTuple {
         let node = PatternTupleNode {
-            base: PatternNode::base::<PatternTupleNode>(),
+            base: PatternNode::base::<PatternTupleNode>(span),
             patterns,
         };
         PatternTuple(Some(ObjectPtr::new(node)))
@@ -438,7 +433,7 @@ pub struct ClauseNode {
 }
 
 impl Clause {
-    pub fn new(lhs: Pattern, rhs: Expr, _span: ObjectRef) -> Clause {
+    pub fn new(lhs: Pattern, rhs: Expr, _span: Span) -> Clause {
         let node = ClauseNode {
             base: Object::base::<ClauseNode>(),
             lhs,
@@ -460,9 +455,9 @@ pub struct MatchNode {
 }
 
 impl Match {
-    pub fn new(data: Expr, clauses: Array<Clause>, complete: bool, _span: ObjectRef) -> Match {
+    pub fn new(data: Expr, clauses: Array<Clause>, complete: bool, span: Span) -> Match {
         let node = MatchNode {
-            base: ExprNode::base::<MatchNode>(),
+            base: ExprNode::base::<MatchNode>(span),
             data,
             clauses,
             complete,
diff --git a/rust/tvm/src/ir/tir.rs b/rust/tvm/src/ir/tir.rs
index ccbe30c95820..dcbec520d3b6 100644
--- a/rust/tvm/src/ir/tir.rs
+++ b/rust/tvm/src/ir/tir.rs
@@ -18,7 +18,9 @@
  */
 
 use super::{PrimExpr, PrimExprNode};
-use crate::runtime::String as TVMString;
+
+use crate::ir::span::Span;
+use crate::runtime::{IsObjectRef, String as TVMString};
 use crate::DataType;
 
 use tvm_macros::Object;
@@ -36,7 +38,7 @@ macro_rules! define_node {
 
         impl $name {
             pub fn new(datatype: DataType, $($id : $t,)*) -> $name {
-                let base = PrimExprNode::base::<$node>(datatype);
+                let base = PrimExprNode::base::<$node>(datatype, Span::null());
                 let node = $node { base, $($id),* };
                 node.into()
             }
@@ -56,7 +58,6 @@ impl From<i32> for IntImm {
 
 impl From<i32> for PrimExpr {
     fn from(i: i32) -> PrimExpr {
-        use crate::runtime::IsObjectRef;
         IntImm::from(i).upcast()
     }
 }
diff --git a/rust/tvm/src/ir/ty.rs b/rust/tvm/src/ir/ty.rs
index f7c52b51f332..83fdbfeb66aa 100644
--- a/rust/tvm/src/ir/ty.rs
+++ b/rust/tvm/src/ir/ty.rs
@@ -23,7 +23,7 @@ use tvm_rt::{array::Array, DataType};
 use crate::ir::relay::Constructor;
 use crate::ir::span::Span;
 use crate::ir::PrimExpr;
-use crate::runtime::{string::String as TString, IsObject, Object, ObjectPtr};
+use crate::runtime::{string::String as TString, IsObject, IsObjectRef, Object, ObjectPtr};
 
 #[repr(C)]
 #[derive(Object, Debug)]
@@ -147,8 +147,17 @@ pub struct TupleTypeNode {
 }
 
 impl TupleType {
+    // todo add coercion
+    pub fn new(fields: Vec<Type>, span: Span) -> Self {
+        let node = TupleTypeNode {
+            base: TypeNode::base::<TupleTypeNode>(span),
+            fields: Array::from_vec(fields).unwrap(),
+        };
+        ObjectPtr::new(node).into()
+    }
+
     pub fn empty() -> TupleType {
-        todo!()
+        TupleType::new(vec![], Span::null())
     }
 }
 
@@ -236,7 +245,13 @@ impl TensorType {
         };
         ObjectPtr::new(node).into()
     }
+
+    pub fn static_sh(shape: Vec<i32>, dtype: DataType, span: Span) -> TensorType {
+        let sh = Array::from_vec(shape.into_iter().map(Into::into).collect()).unwrap();
+        Self::new(sh, dtype, span)
+    }
 }
+
 // TODO(@jroesch): implement these in future.
 //
 // using TypeCall = tvm::TypeCall;
diff --git a/rust/tvm/src/lib.rs b/rust/tvm/src/lib.rs
index e86420eb70c9..caae07775d21 100644
--- a/rust/tvm/src/lib.rs
+++ b/rust/tvm/src/lib.rs
@@ -39,7 +39,9 @@ pub use tvm_rt::errors;
 pub use tvm_rt::function;
 pub use tvm_rt::module;
 pub use tvm_rt::ndarray;
-pub use tvm_rt::value;
+
+#[cfg(feature = "python")]
+pub mod compiler;
 pub mod ir;
 #[cfg(feature = "python")]
 pub mod python;
diff --git a/rust/tvm/src/python.rs b/rust/tvm/src/python.rs
index 89558af733b3..c224fb4db372 100644
--- a/rust/tvm/src/python.rs
+++ b/rust/tvm/src/python.rs
@@ -29,6 +29,8 @@ use pyo3::prelude::*;
 pub fn load() -> Result<String, ()> {
     let gil = Python::acquire_gil();
     let py = gil.python();
+    // let main_mod = initialize();
+    //let main_mod = main_mod.as_ref(py);
     load_python_tvm_(py).map_err(|e| {
         // We can't display Python exceptions via std::fmt::Display,
         // so print the error here manually.
@@ -36,25 +38,33 @@ pub fn load() -> Result<String, ()> {
     })
 }
 
-// const TVMC_CODE: &'static str = include_str!("tvmc.py");
+pub fn import(mod_to_import: &str) -> PyResult<()> {
+    let gil = Python::acquire_gil();
+    let py = gil.python();
+    import_python(py, mod_to_import)?;
+    Ok(())
+}
+
+fn import_python<'p, 'b: 'p>(py: Python<'p>, to_import: &'b str) -> PyResult<&'p PyModule> {
+    let imported_mod = py.import(to_import)?;
+    Ok(imported_mod)
+}
 
 fn load_python_tvm_(py: Python) -> PyResult<String> {
-    let sys = py.import("tvm")?;
-    let version: String = sys.get("__version__")?.extract()?;
-    // py.run(TVMC_CODE, None, None)?;
+    let imported_mod = import_python(py, "tvm")?;
+    let version: String = imported_mod.get("__version__")?.extract()?;
     Ok(version)
 }
 
 #[cfg(test)]
 mod tests {
-    use super::load_python_tvm_;
+    use super::*;
     use anyhow::Result;
-    use pyo3::prelude::*;
 
     #[ignore]
     #[test]
     fn test_run() -> Result<()> {
-        load_python_tvm_(Python::acquire_gil().python()).unwrap();
+        load().unwrap();
         Ok(())
     }
 }
diff --git a/rust/tvm/src/runtime/graph_rt.rs b/rust/tvm/src/runtime/graph_rt.rs
index 8b26ebb4ca22..fcc41aca560f 100644
--- a/rust/tvm/src/runtime/graph_rt.rs
+++ b/rust/tvm/src/runtime/graph_rt.rs
@@ -34,13 +34,23 @@ pub struct GraphRt {
 }
 
 impl GraphRt {
+    /// Create a graph runtime directly from a runtime module.
+    pub fn from_module(module: Module, ctx: Context) -> Result<GraphRt> {
+        let default: Box<dyn Fn(Context) -> Result<Module>> =
+            module.get_function("default", false)?.into();
+
+        Ok(Self {
+            module: default(ctx)?,
+        })
+    }
+
     /// Create a graph runtime from the deprecated graph, lib, ctx triple.
     pub fn create_from_parts(graph: &str, lib: Module, ctx: Context) -> Result<Self> {
         let runtime_create_fn = Function::get("tvm.graph_runtime.create").unwrap();
 
         let runtime_create_fn_ret = runtime_create_fn.invoke(vec![
             graph.into(),
-            (&lib).into(),
+            lib.into(),
             (&ctx.device_type).into(),
             // NOTE you must pass the device id in as i32 because that's what TVM expects
             (ctx.device_id as i32).into(),
diff --git a/rust/tvm/tests/basics/src/main.rs b/rust/tvm/tests/basics/src/main.rs
index e4249a491746..450ab48dc1b2 100644
--- a/rust/tvm/tests/basics/src/main.rs
+++ b/rust/tvm/tests/basics/src/main.rs
@@ -30,6 +30,7 @@ fn main() {
     } else {
         (Context::gpu(0), "gpu")
     };
+
     let dtype = DataType::from_str("float32").unwrap();
     let mut arr = NDArray::empty(shape, ctx, dtype);
     arr.copy_from_buffer(data.as_mut_slice());
@@ -38,11 +39,13 @@ fn main() {
     if !fadd.enabled(ctx_name) {
         return;
     }
+
     if cfg!(feature = "gpu") {
         fadd.import_module(Module::load(&concat!(env!("OUT_DIR"), "/test_add.ptx")).unwrap());
     }
 
-    fadd.entry()
+    // todo(@jroesch): fix the entry_name
+    fadd.get_function("__tvm_main__", false)
         .expect("module must have entry point")
         .invoke(vec![(&arr).into(), (&arr).into(), (&ret).into()])
         .unwrap();
diff --git a/rust/tvm/tests/basics/src/tvm_add.py b/rust/tvm/tests/basics/src/tvm_add.py
index b9672fbf4aaf..3c1fc64d3e36 100755
--- a/rust/tvm/tests/basics/src/tvm_add.py
+++ b/rust/tvm/tests/basics/src/tvm_add.py
@@ -37,7 +37,6 @@ def main(target, out_dir):
         s[C].bind(tx, te.thread_axis("threadIdx.x"))
 
     fadd = tvm.build(s, [A, B, C], target, target_host="llvm", name="myadd")
-
     fadd.save(osp.join(out_dir, "test_add.o"))
     if target == "cuda":
         fadd.imported_modules[0].save(osp.join(out_dir, "test_add.ptx"))
diff --git a/src/runtime/module.cc b/src/runtime/module.cc
index 4cec5e3643c1..d84a8215421f 100644
--- a/src/runtime/module.cc
+++ b/src/runtime/module.cc
@@ -178,7 +178,7 @@ TVM_REGISTER_GLOBAL("runtime.ModuleGetTypeKey").set_body_typed([](Module mod) {
 TVM_REGISTER_GLOBAL("runtime.ModuleLoadFromFile").set_body_typed(Module::LoadFromFile);
 
 TVM_REGISTER_GLOBAL("runtime.ModuleSaveToFile")
-    .set_body_typed([](Module mod, std::string name, std::string fmt) {
+    .set_body_typed([](Module mod, tvm::String name, tvm::String fmt) {
       mod->SaveToFile(name, fmt);
     });
 

From 83ab23460e326e2cd56a54e81c723ca122c71de2 Mon Sep 17 00:00:00 2001
From: Ritwik Das <ritwikdas54@gmail.com>
Date: Thu, 4 Mar 2021 11:37:43 -0800
Subject: [PATCH 278/357] Add segment sum Op to relay and 7 corresponding TF
 Ops , fix scatter_add dynamic bug  (#7562)

* Add segment sum Op

* Remove unnecessary

* Documentation

* Black

* Add GPU

* Uncomment

* Add documentation

* Add dynamic tests

* Add TF Op

* Add Sparse Segment Sum

* Add test coverage

* PR Comments

* Int64 tests

* Add SparseSegmentSqrtN

* Add SparseSegmentSqrtNOp

* Deduplicate code

* Add SparseSegmentMean

* Parametrize Tests

* Remove

* Modularize

* Black

* Modularize Code

* Pylint

* PR Comments

* Add scatter add tests

* Remove Test

Co-authored-by: Ubuntu <ubuntu@ip-172-31-42-251.us-east-2.compute.internal>
---
 python/tvm/relay/frontend/tensorflow.py       | 126 +++++++++++++
 python/tvm/relay/op/transform.py              |  69 +++++++
 python/tvm/topi/scatter_add.py                |  40 ++--
 .../frontend/tensorflow/test_forward.py       | 175 ++++++++++++++++++
 tests/python/relay/test_op_level3.py          | 157 +++++++++++++---
 5 files changed, 519 insertions(+), 48 deletions(-)

diff --git a/python/tvm/relay/frontend/tensorflow.py b/python/tvm/relay/frontend/tensorflow.py
index b75331a4f9a2..c79c495b0360 100644
--- a/python/tvm/relay/frontend/tensorflow.py
+++ b/python/tvm/relay/frontend/tensorflow.py
@@ -1167,6 +1167,125 @@ def _impl(inputs, attr, params, mod):
     return _impl
 
 
+def _math_segment_sum():
+    def _impl(inputs, attr, params, mod):
+        assert len(inputs) == 2, "There should be 2 input tensors"
+        return get_relay_op("segment_sum")(inputs[0], inputs[1])
+
+    return _impl
+
+
+def _sparse_segment_sum():
+    def _impl(inputs, attr, params, mod):
+        assert len(inputs) == 3, "There should be 3 input tensors"
+        data = _op.take(inputs[0], inputs[1], axis=0)
+        return _op.segment_sum(data, inputs[2])
+
+    return _impl
+
+
+def _sparse_segment_sum_with_num_segments():
+    def _impl(inputs, attr, params, mod):
+        assert len(inputs) == 4, "There should be 4 input tensors"
+        data = _op.take(inputs[0], inputs[1], axis=0)
+        num_segments = int(inputs[3].data.asnumpy().item())
+        return _op.segment_sum(data, inputs[2], num_segments)
+
+    return _impl
+
+
+def row_wise_divide(multi_dim_tensor, one_dim_vector):
+    """
+    This function enables row-wise division of multi_dim_tensor and one_dim_vector.
+    To achieve this, it is first tiled to the appropriate shape and then elemwise_division
+    """
+    multi_dim_tensor_offrow_shape = _op.strided_slice(
+        _op.shape_of(multi_dim_tensor, "int32"), [1], [-1], slice_mode="size"
+    )
+    one_dim_vector_tiled_shape = _op.concatenate(
+        [_op.reverse(multi_dim_tensor_offrow_shape, 0), _expr.const([1])], axis=0
+    )
+    one_dim_vector_tiled = _op.transpose(_op.tile(one_dim_vector, one_dim_vector_tiled_shape))
+    return _op.divide(multi_dim_tensor, one_dim_vector_tiled)
+
+
+def count_all_indices(segment_ids, counts_dtype, num_segments=None):
+    """
+    This snippet calculates the sqrt count of each index among all valid indices
+    Valid indices are from 0 to max of [segment ids, num_segments]
+    """
+
+    max_segments = _op.reshape(_op.max(segment_ids), -1) + _expr.const([1])
+    if num_segments:
+        max_segments = _op.maximum(max_segments, _expr.const([num_segments]))
+    max_ones = _op.maximum(max_segments, _op.shape_of(segment_ids))
+    counts = _op.segment_sum(
+        _op.ones(max_ones, counts_dtype), segment_ids, num_segments=num_segments
+    )
+    real_counts = _op.clip(counts, 1, 2147483647)  # Clip max doesn't work over int32
+    return real_counts
+
+
+def _sparse_segment_sum_sqrtn():
+    def _impl(inputs, attr, params, mod):
+        assert len(inputs) == 3, "There should be 3 input tensors"
+        data = _op.take(inputs[0], inputs[1], axis=0)
+        real_counts = count_all_indices(inputs[2], attr["T"].name)
+        real_sqrt_counts = _op.sqrt(_op.cast_like(real_counts, data))
+
+        # Calculate regular segment sum
+        segment_sum = _op.segment_sum(data, inputs[2])
+
+        return row_wise_divide(segment_sum, real_sqrt_counts)
+
+    return _impl
+
+
+def _sparse_segment_sum_sqrtn_with_num_segments():
+    def _impl(inputs, attr, params, mod):
+        assert len(inputs) == 4, "There should be 4 input tensors"
+        data = _op.take(inputs[0], inputs[1], axis=0)
+        num_segments = int(inputs[3].data.asnumpy().item())
+        real_counts = count_all_indices(inputs[2], attr["T"].name, num_segments=num_segments)
+        real_sqrt_counts = _op.sqrt(_op.cast_like(real_counts, data))
+
+        # Calculate regular segment sum
+        segment_sum = _op.segment_sum(data, inputs[2], num_segments=num_segments)
+
+        return row_wise_divide(segment_sum, real_sqrt_counts)
+
+    return _impl
+
+
+def _sparse_segment_mean():
+    def _impl(inputs, attr, params, mod):
+        assert len(inputs) == 3, "There should be 3 input tensors"
+        data = _op.take(inputs[0], inputs[1], axis=0)
+        real_counts = count_all_indices(inputs[2], attr["T"].name)
+
+        # Calculate regular segment sum
+        segment_sum = _op.segment_sum(data, inputs[2])
+
+        return row_wise_divide(segment_sum, real_counts)
+
+    return _impl
+
+
+def _sparse_segment_mean_with_num_segments():
+    def _impl(inputs, attr, params, mod):
+        assert len(inputs) == 4, "There should be 4 input tensors"
+        data = _op.take(inputs[0], inputs[1], axis=0)
+        num_segments = int(inputs[3].data.asnumpy().item())
+        real_counts = count_all_indices(inputs[2], attr["T"].name, num_segments=num_segments)
+
+        # Calculate regular segment sum
+        segment_sum = _op.segment_sum(data, inputs[2], num_segments=num_segments)
+
+        return row_wise_divide(segment_sum, real_counts)
+
+    return _impl
+
+
 def _identity():
     def _impl(inputs, attr, params, mod):
         return inputs[0]
@@ -2661,6 +2780,13 @@ def _impl(inputs, attr, params, mod):
     "SparseTensorDenseMatMul": _sparse_tensor_dense_matmul(),
     "SparseFillEmptyRows": _sparse_fill_empty_rows(),
     "SparseReshape": _sparse_reshape(),
+    "SegmentSum": _math_segment_sum(),
+    "SparseSegmentSum": _sparse_segment_sum(),
+    "SparseSegmentSumWithNumSegments": _sparse_segment_sum_with_num_segments(),
+    "SparseSegmentSqrtN": _sparse_segment_sum_sqrtn(),
+    "SparseSegmentSqrtNWithNumSegments": _sparse_segment_sum_sqrtn_with_num_segments(),
+    "SparseSegmentMean": _sparse_segment_mean(),
+    "SparseSegmentMeanWithNumSegments": _sparse_segment_mean_with_num_segments(),
     "Split": _split(False),
     "SplitV": _split(True),
     "Sqrt": AttrCvt("sqrt"),
diff --git a/python/tvm/relay/op/transform.py b/python/tvm/relay/op/transform.py
index 73508ddd2603..f2e3850a8f67 100644
--- a/python/tvm/relay/op/transform.py
+++ b/python/tvm/relay/op/transform.py
@@ -1450,6 +1450,75 @@ def sparse_reshape(sparse_indices, prev_shape, new_shape):
     return TupleWrapper(_make.sparse_reshape(sparse_indices, prev_shape, new_shape), 2)
 
 
+def segment_sum(data, segment_ids, num_segments=None):
+    """
+    Computes the sum along segment_ids along axis 0. If multiple segment_ids reference the same
+    location their contributions add up.
+    result[index, j, k, ...] = Σi... data[i, j, k,..] where index = segment_ids[i]
+    This op is much better understood with visualization articulated in the following links and
+    examples at the end of this docstring.
+
+    https://www.tensorflow.org/api_docs/python/tf/math/unsorted_segment_sum
+    https://caffe2.ai/docs/sparse-operations.html#null__unsorted-segment-reduction-ops
+
+    Parameters
+    ----------
+    data : relay.Expr
+        Input Tensor. It can be of any type and multi-dimensional
+    segment_ids : relay.Expr
+        A 1-D int32/int64 tensor containing the segment_ids of the rows to calculate the output
+        sum upon. It defines a mapping from the zeroth dimension of data onto segment_ids. The
+        segment_ids tensor should be the size of the first dimension, d0, with consecutive IDs
+        in the range 0 to k, where k<d0. In particular, a segmentation of a matrix tensor is a
+        mapping of rows to segments. This tensor doesn't need to be sorted
+    num_segments : Optional[int]
+        An integer describing the shape of the zeroth dimension. If unspecified, its calculated
+        equivalent to the number of unique segment_ids
+    Returns
+    -------
+    result: relay.Expr
+        Output tensor.
+    Examples
+    --------
+    .. code-block:: python
+        data = [[1, 2, 3, 4],
+                [4, -3, 2, -1],
+                [5, 6, 7, 8]]
+        segment_ids = [0, 0, 1]
+        result = segment_sum(data, segment_ids)
+        result = [[5, -1, 5, 3],[5, 6, 7, 8]]
+
+        data = [[1, 2, 3, 4],
+                [4, -3, 2, -1],
+                [5, 6, 7, 8]]
+        segment_ids = [2, 0, 0]
+        num_segments = 3
+        result = segment_sum(data, segment_ids, num_segments)
+        result = [[5, 6, 7, 8],[0, 0, 0, 0], [5, -1, 5, 3]]
+    """
+
+    one_tensor = cast_like(const([1]), segment_ids)
+    if num_segments:
+        if isinstance(num_segments, int):
+            max_segments = const([num_segments])
+            max_segments = cast_like(max_segments, segment_ids)
+        else:
+            max_segments = cast_like(num_segments, segment_ids)
+    else:
+        max_segments = _make.add(reshape(_make.max(segment_ids, [0], False, False), -1), one_tensor)
+
+    data_offrow_shape = strided_slice(_make.shape_of(data, "int32"), [1], [-1], slice_mode="size")
+    data_offrow_shape = cast_like(data_offrow_shape, max_segments)
+    new_shape = _make.concatenate(Tuple([max_segments, data_offrow_shape]), 0)
+    segment_ids_tiled_shape = _make.concatenate(
+        Tuple([reverse(data_offrow_shape, 0), one_tensor]), 0
+    )
+    expanded_segment_ids = tile(segment_ids, segment_ids_tiled_shape)
+    scatter_add_segment_ids = transpose(expanded_segment_ids)
+    src = cast_like(_dyn_make.zeros(new_shape, "float64"), data)
+    return scatter_add(src, scatter_add_segment_ids, data, axis=0)
+
+
 def cumsum(data, axis=None, dtype=None, exclusive=None):
     """Numpy style cumsum op. Return the cumulative inclusive sum of the elements along
     a given axis.
diff --git a/python/tvm/topi/scatter_add.py b/python/tvm/topi/scatter_add.py
index 4c77a0767785..6b04837b7766 100644
--- a/python/tvm/topi/scatter_add.py
+++ b/python/tvm/topi/scatter_add.py
@@ -32,8 +32,8 @@ def _scatter_add_1d(data, indices, updates):
 @hybrid.script
 def _scatter_add_2d(data, indices, updates, axis):
     out = output_tensor(data.shape, data.dtype)
-    for i in const_range(data.shape[0]):
-        for j in const_range(data.shape[1]):
+    for i in range(data.shape[0]):
+        for j in range(data.shape[1]):
             out[i, j] = data[i, j]
     if axis == 0:
         for i in range(indices.shape[0]):
@@ -54,14 +54,14 @@ def _scatter_add_2d(data, indices, updates, axis):
 @hybrid.script
 def _scatter_add_3d(data, indices, updates, axis):
     out = output_tensor(data.shape, data.dtype)
-    for i in const_range(data.shape[0]):
-        for j in const_range(data.shape[1]):
-            for k in const_range(data.shape[2]):
+    for i in range(data.shape[0]):
+        for j in range(data.shape[1]):
+            for k in range(data.shape[2]):
                 out[i, j, k] = data[i, j, k]
     if axis == 0:
         for i in range(indices.shape[0]):
             for j in range(indices.shape[1]):
-                for k in const_range(indices.shape[2]):
+                for k in range(indices.shape[2]):
                     out[
                         indices[i, j, k]
                         if indices[i, j, k] >= 0
@@ -72,7 +72,7 @@ def _scatter_add_3d(data, indices, updates, axis):
     elif axis == 1:
         for i in range(indices.shape[0]):
             for j in range(indices.shape[1]):
-                for k in const_range(indices.shape[2]):
+                for k in range(indices.shape[2]):
                     out[
                         i,
                         indices[i, j, k]
@@ -83,7 +83,7 @@ def _scatter_add_3d(data, indices, updates, axis):
     else:
         for i in range(indices.shape[0]):
             for j in range(indices.shape[1]):
-                for k in const_range(indices.shape[2]):
+                for k in range(indices.shape[2]):
                     out[
                         i,
                         j,
@@ -98,17 +98,17 @@ def _scatter_add_3d(data, indices, updates, axis):
 @hybrid.script
 def _scatter_add_4d(data, indices, updates, axis):
     out = output_tensor(data.shape, data.dtype)
-    for i in const_range(data.shape[0]):
-        for j in const_range(data.shape[1]):
-            for k in const_range(data.shape[2]):
-                for l in const_range(data.shape[3]):
+    for i in range(data.shape[0]):
+        for j in range(data.shape[1]):
+            for k in range(data.shape[2]):
+                for l in range(data.shape[3]):
                     out[i, j, k, l] = data[i, j, k, l]
 
     if axis == 0:
         for i in range(indices.shape[0]):
             for j in range(indices.shape[1]):
-                for k in const_range(indices.shape[2]):
-                    for l in const_range(indices.shape[3]):
+                for k in range(indices.shape[2]):
+                    for l in range(indices.shape[3]):
                         out[
                             indices[i, j, k, l]
                             if indices[i, j, k, l] >= 0
@@ -120,8 +120,8 @@ def _scatter_add_4d(data, indices, updates, axis):
     elif axis == 1:
         for i in range(indices.shape[0]):
             for j in range(indices.shape[1]):
-                for k in const_range(indices.shape[2]):
-                    for l in const_range(indices.shape[3]):
+                for k in range(indices.shape[2]):
+                    for l in range(indices.shape[3]):
                         out[
                             i,
                             indices[i, j, k, l]
@@ -133,8 +133,8 @@ def _scatter_add_4d(data, indices, updates, axis):
     elif axis == 2:
         for i in range(indices.shape[0]):
             for j in range(indices.shape[1]):
-                for k in const_range(indices.shape[2]):
-                    for l in const_range(indices.shape[3]):
+                for k in range(indices.shape[2]):
+                    for l in range(indices.shape[3]):
                         out[
                             i,
                             j,
@@ -146,8 +146,8 @@ def _scatter_add_4d(data, indices, updates, axis):
     else:
         for i in range(indices.shape[0]):
             for j in range(indices.shape[1]):
-                for k in const_range(indices.shape[2]):
-                    for l in const_range(indices.shape[3]):
+                for k in range(indices.shape[2]):
+                    for l in range(indices.shape[3]):
                         out[
                             i,
                             j,
diff --git a/tests/python/frontend/tensorflow/test_forward.py b/tests/python/frontend/tensorflow/test_forward.py
index 41145bf77218..81aeb5ef886c 100644
--- a/tests/python/frontend/tensorflow/test_forward.py
+++ b/tests/python/frontend/tensorflow/test_forward.py
@@ -2080,6 +2080,181 @@ def test_forward_sparse_reshape(
     _test_sparse_reshape(sparse_indices_np, sparse_values_np, prev_shape_np, new_shape_np, use_dyn)
 
 
+#######################################################################
+# Sparse Segment Variants
+# ------------
+
+
+def _test_sparse_segment_variant(
+    tf_op, data_np, indices_np, segment_ids_np, num_segments, use_dyn=False
+):
+    with tf.Graph().as_default():
+        if use_dyn:
+            data = tf.placeholder(
+                shape=[None for _ in data_np.shape], dtype=data_np.dtype, name="data"
+            )
+            indices = tf.placeholder(shape=[None], dtype=indices_np.dtype, name="indices")
+            segment_ids = tf.placeholder(
+                shape=(None), dtype=segment_ids_np.dtype, name="segment_ids"
+            )
+        else:
+            data = tf.placeholder(shape=data_np.shape, dtype=data_np.dtype, name="data")
+            indices = tf.placeholder(shape=indices_np.shape, dtype=indices_np.dtype, name="indices")
+            segment_ids = tf.placeholder(
+                shape=segment_ids_np.shape, dtype=segment_ids_np.dtype, name="segment_ids"
+            )
+
+        _ = tf_op(
+            data, indices, segment_ids, num_segments=num_segments, name="sparse_segment_variant"
+        )
+        compare_tf_with_tvm(
+            [data_np, indices_np, segment_ids_np],
+            [data.name, indices.name, segment_ids.name],
+            ["sparse_segment_variant:0"],
+            mode="vm",
+        )
+
+
+@pytest.mark.parametrize(
+    "data_np, indices_np, segment_ids_np, num_segments",
+    [
+        (
+            np.array([5, 1, 7, 2, 3, 4], dtype=np.float32),
+            np.array([0, 3, 4], dtype=np.int32),
+            np.array([0, 1, 1], dtype=np.int32),
+            None,
+        ),
+        (
+            np.array([[1, 2, 3, 4], [-1, -2, -3, -4], [5, 6, 7, 8]], dtype=np.float64),
+            np.array([0, 1], dtype=np.int32),
+            np.array([0, 2], dtype=np.int32),
+            4,
+        ),
+        (
+            np.random.random((6, 4, 5)),
+            np.array([0, 2, 4, 3, 1], dtype=np.int32),
+            np.array([0, 0, 1, 5, 5], dtype=np.int32),
+            100,
+        ),
+        (
+            np.random.random((6, 4, 5)),
+            np.array([0, 2, 4, 3, 1], dtype=np.int32),
+            np.array([0, 0, 1, 5, 5], dtype=np.int32),
+            None,
+        ),
+        (
+            np.array([[[1, 7]], [[3, 8]], [[2, 9]]], dtype=np.float64),
+            np.array([0, 1, 2], dtype=np.int32),
+            np.array([0, 0, 1], dtype=np.int32),
+            None,
+        ),
+        (
+            np.random.random((9, 4, 5, 7)),
+            np.array([0, 1, 2, 3, 4, 5, 6, 7, 8], dtype=np.int32),
+            np.array([0, 0, 1, 3, 5, 6, 7, 7, 8], dtype=np.int32),
+            9,
+        ),
+        (
+            np.random.random((9, 4, 5, 7)),
+            np.array([0, 1, 2, 3, 4, 5, 6, 7, 8], dtype=np.int32),
+            np.array([0, 0, 1, 3, 5, 6, 7, 7, 8], dtype=np.int32),
+            None,
+        ),
+        (
+            np.array([[1, 2, 3, 4], [-1, -2, -3, -4], [5, 6, 7, 8]], dtype=np.float64),
+            np.array([0, 1], dtype=np.int32),
+            np.array([0, 2], dtype=np.int32),
+            None,
+        ),
+        (
+            np.random.random((9, 4, 5, 7)),
+            np.array([0, 1, 2, 3, 4, 5, 6, 7, 8], dtype=np.int32),
+            np.array([0, 0, 1, 3, 5, 5, 5, 5, 5], dtype=np.int32),
+            6,
+        ),
+    ],
+)
+@pytest.mark.parametrize("use_dyn", [True, False])
+@pytest.mark.parametrize(
+    "tf_op",
+    [
+        tf.sparse.segment_sum,
+        tf.sparse.segment_sqrt_n,
+        tf.sparse.segment_mean,
+    ],
+)
+def test_forward_sparse_segment_sum_variants(
+    tf_op,
+    data_np,
+    indices_np,
+    segment_ids_np,
+    num_segments,
+    use_dyn,
+):
+    """sparse segment sum variants tests"""
+    _test_sparse_segment_variant(tf_op, data_np, indices_np, segment_ids_np, num_segments, use_dyn)
+
+
+#######################################################################
+# Math SegmentSum
+# ------------
+
+
+def _test_math_segment_sum(data_np, segment_ids_np, use_dyn=False):
+    with tf.Graph().as_default():
+        if use_dyn:
+            data = tf.placeholder(
+                shape=[None for _ in data_np.shape], dtype=data_np.dtype, name="data"
+            )
+            segment_ids = tf.placeholder(
+                shape=(None), dtype=segment_ids_np.dtype, name="segment_ids"
+            )
+        else:
+            data = tf.placeholder(shape=data_np.shape, dtype=data_np.dtype, name="data")
+            segment_ids = tf.placeholder(
+                shape=segment_ids_np.shape, dtype=segment_ids_np.dtype, name="segment_ids"
+            )
+
+        _ = tf.math.segment_sum(data, segment_ids, name="segment_sum")
+        compare_tf_with_tvm(
+            [data_np, segment_ids_np],
+            [data.name, segment_ids.name],
+            ["segment_sum:0"],
+            mode="vm",
+        )
+
+
+@pytest.mark.parametrize(
+    "data_np, segment_ids_np",
+    [
+        (
+            np.array([5, 1, 7, 2, 3, 4], dtype=np.float32),
+            np.array([0, 0, 0, 1, 1, 1], dtype=np.int32),
+        ),
+        (
+            np.array([[1, 2, 3, 4], [-1, -2, -3, -4], [5, 6, 7, 8]], dtype=np.float64),
+            np.array([0, 0, 1], dtype=np.int32),
+        ),
+        (
+            np.random.random((6, 4, 5)),
+            np.array([0, 0, 1, 2, 2, 3], dtype=np.int64),
+        ),
+        (
+            np.array([[[1, 7]], [[3, 8]], [[2, 9]]], dtype=np.float32),
+            np.array([0, 0, 1], dtype=np.int32),
+        ),
+        (
+            np.random.random((9, 4, 5, 7)),
+            np.array([0, 0, 0, 1, 2, 3, 4, 4, 5], dtype=np.int64),
+        ),
+    ],
+)
+@pytest.mark.parametrize("use_dyn", [True, False])
+def test_forward_math_segment_sum(data_np, segment_ids_np, use_dyn):
+    """math segment sum test"""
+    _test_math_segment_sum(data_np, segment_ids_np, use_dyn)
+
+
 # tensorflow.compat.v1.sparse_to_dense
 # ---------------
 def _test_sparse_to_dense(sparse_indices, sparse_values, default_value, output_shape):
diff --git a/tests/python/relay/test_op_level3.py b/tests/python/relay/test_op_level3.py
index c9ed975c3b9b..31b95b0b49ae 100644
--- a/tests/python/relay/test_op_level3.py
+++ b/tests/python/relay/test_op_level3.py
@@ -24,6 +24,7 @@
 from tvm.error import TVMError
 from tvm.relay import create_executor, transform
 from tvm.relay.testing import check_grad, run_infer_type
+from typing import Optional
 import tvm.testing
 
 
@@ -1023,7 +1024,25 @@ def verify_dynamic_scatter(dshape, ishape, axis=0):
 
 
 @tvm.testing.uses_gpu
-def test_scatter_add():
+@pytest.mark.parametrize(
+    "dshape, ishape, axis, dtype",
+    [
+        ((10,), (10,), 0, "int32"),
+        ((1000,), (1000,), 0, "int32"),
+        ((10, 5), (10, 5), -2, "float32"),
+        ((10, 5), (10, 5), -1, "float32"),
+        ((10, 5), (3, 5), 0, "float32"),
+        ((12, 4), (7, 2), 1, "float32"),
+        ((2, 3, 4), (1, 3, 4), 0, "float32"),
+        ((2, 3, 4), (2, 1, 4), 1, "float32"),
+        ((2, 3, 4), (2, 3, 1), 2, "float32"),
+        ((2, 3, 4, 5), (1, 3, 4, 5), 0, "float32"),
+        ((6, 3, 4, 5), (2, 3, 4, 5), 1, "float32"),
+        ((2, 3, 8, 5), (2, 3, 1, 1), 2, "float32"),
+        ((16, 16, 4, 5), (16, 16, 4, 5), 3, "float32"),
+    ],
+)
+def test_scatter_add(dshape, ishape, axis, dtype):
     def ref_scatter_add(data, indices, updates, axis=0):
         output = np.copy(data)
         for index in np.ndindex(*indices.shape):
@@ -1033,9 +1052,9 @@ def ref_scatter_add(data, indices, updates, axis=0):
         return output
 
     def verify_scatter_add(dshape, ishape, axis=0, dtype="float32"):
-        d = relay.var("d", relay.TensorType(dshape, dtype))
-        i = relay.var("i", relay.TensorType(ishape, "int64"))
-        u = relay.var("u", relay.TensorType(ishape, dtype))
+        d = relay.var("d", relay.TensorType(shape=[relay.Any() for _ in dshape], dtype=dtype))
+        i = relay.var("i", relay.TensorType(shape=[relay.Any() for _ in ishape], dtype="int64"))
+        u = relay.var("u", relay.TensorType(shape=[relay.Any() for _ in ishape], dtype=dtype))
         z = relay.op.scatter_add(d, i, u, axis)
 
         func = relay.Function([d, i, u], z)
@@ -1045,31 +1064,14 @@ def verify_scatter_add(dshape, ishape, axis=0, dtype="float32"):
         indices_np = np.random.randint(-dshape[axis], dshape[axis] - 1, ishape).astype("int64")
 
         ref_res = ref_scatter_add(data_np, indices_np, updates_np, axis)
-        for target, ctx in tvm.testing.enabled_targets():
-            for kind in ["graph", "debug"]:
-                if target == "nvptx" and dtype == "float32" and len(dshape) == 1:
-                    # scatter_add 1D on GPU is implemented via atomic.
-                    # Floating point atomic requires LLVM 9 or newer for nvptx backend.
-                    # But LLVM on CI is LLVM 8.
-                    continue
-                intrp = relay.create_executor(kind, ctx=ctx, target=target)
-                op_res = intrp.evaluate(func)(data_np, indices_np, updates_np)
-                tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5)
 
-    verify_scatter_add((10,), (10,), 0, dtype="int32")
-    verify_scatter_add((1000,), (1000,))
-    verify_scatter_add((1000,), (1000,), 0, dtype="int32")
-    verify_scatter_add((10, 5), (10, 5), -2)
-    verify_scatter_add((10, 5), (10, 5), -1)
-    verify_scatter_add((10, 5), (3, 5), 0)
-    verify_scatter_add((12, 4), (7, 2), 1)
-    verify_scatter_add((2, 3, 4), (1, 3, 4), 0)
-    verify_scatter_add((2, 3, 4), (2, 1, 4), 1)
-    verify_scatter_add((2, 3, 4), (2, 3, 1), 2)
-    verify_scatter_add((2, 3, 4, 5), (1, 3, 4, 5), 0)
-    verify_scatter_add((6, 3, 4, 5), (2, 3, 4, 5), 1)
-    verify_scatter_add((2, 3, 8, 5), (2, 3, 1, 1), 2)
-    verify_scatter_add((16, 16, 4, 5), (16, 16, 4, 5), 3)
+        verify_func(
+            func,
+            [data_np, indices_np, updates_np],
+            ref_res,
+        )
+
+    verify_scatter_add(dshape, ishape, axis, dtype)
 
 
 @tvm.testing.uses_gpu
@@ -1515,6 +1517,105 @@ def verify_sparse_reshape(
     )
 
 
+@tvm.testing.uses_gpu
+@pytest.mark.parametrize(
+    "data_np, segment_ids_np, num_segments",
+    [
+        (
+            np.array([5, 1, 7, 2, 3, 4], dtype=np.float32),
+            np.array([0, 0, 1, 1, 0, 1], dtype=np.int32),
+            None,
+        ),
+        (
+            np.array([[1, 2, 3, 4], [-1, -2, -3, -4], [5, 6, 7, 8]], dtype=np.float64),
+            np.array([0, 0, 1], dtype=np.int32),
+            None,
+        ),
+        (
+            np.random.random((6, 4, 5)),
+            np.array([2, 0, 1, 0, 3, 2], dtype=np.int64),
+            None,
+        ),
+        (
+            np.array([[[1, 7]], [[3, 8]], [[2, 9]]], dtype=np.float32),
+            np.array([0, 0, 1], dtype=np.int32),
+            None,
+        ),
+        (
+            np.random.random((9, 4, 5, 7)),
+            np.array([5, 0, 1, 0, 3, 6, 8, 7, 7], dtype=np.int64),
+            9,
+        ),
+        (
+            np.array([[1, 2, 3, 4], [-1, -2, -3, -4], [5, 6, 7, 8]], dtype=np.float64),
+            np.array([0, 2], dtype=np.int32),
+            4,
+        ),
+        (
+            np.random.random((6, 4, 5)),
+            np.array([0, 0, 1, 5, 5], dtype=np.int32),
+            100,
+        ),
+    ],
+)
+@pytest.mark.parametrize("use_dyn", [True, False])
+def test_segment_sum(data_np, segment_ids_np, num_segments, use_dyn):
+    def ref_segment_sum(
+        data: np.ndarray,
+        segment_ids: np.ndarray,
+        num_segments: Optional[int] = None,
+    ):
+        """
+        This function calculates the expected output of segment_sum operator given the inputs.
+        """
+        if not num_segments:
+            num_segments = np.unique(segment_ids).shape[0]
+
+        result = np.zeros((num_segments,) + data.shape[1:], data.dtype)
+        for i, index in enumerate(segment_ids):
+            result[index] += data[i]
+        return result
+
+    def verify_segment_sum(
+        data_np: np.ndarray, segment_ids_np: np.ndarray, num_segments: Optional[int]
+    ):
+        """
+        This function verifies the relay output of segment_sum with its expected output.
+        """
+        if use_dyn:
+            data = relay.var(
+                "data",
+                shape=[relay.Any() for _ in data_np.shape],
+                dtype=str(data_np.dtype),
+            )
+            segment_ids = relay.var(
+                "segment_ids",
+                shape=[relay.Any()],
+                dtype=str(segment_ids_np.dtype),
+            )
+        else:
+            data = relay.var(
+                "data",
+                relay.TensorType(data_np.shape, str(data_np.dtype)),
+            )
+            segment_ids = relay.var(
+                "segment_ids", relay.TensorType(segment_ids_np.shape, str(segment_ids_np.dtype))
+            )
+        z = relay.op.segment_sum(data, segment_ids, num_segments)
+
+        func = relay.Function([data, segment_ids], z)
+        ref_res = ref_segment_sum(data_np, segment_ids_np, num_segments=num_segments)
+        segment_sum_result = run_infer_type(z)
+        assert segment_sum_result.checked_type.dtype == data_np.dtype
+        verify_func(
+            func,
+            [data_np, segment_ids_np],
+            ref_res,
+        )
+
+    verify_segment_sum(data_np, segment_ids_np, num_segments)
+
+
 def verify_func(func, data, ref_res, target_ctx=tvm.testing.enabled_targets()):
     assert isinstance(data, list)
     for target, ctx in target_ctx:

From 3fbb0a3d749c45d121ed213d4741c4e8e8041320 Mon Sep 17 00:00:00 2001
From: Trevor Morris <trevmorr@amazon.com>
Date: Thu, 4 Mar 2021 14:10:15 -0800
Subject: [PATCH 279/357] [BYOC][TensorRT] Make TRT runtime robust to empty or
 weird subgraphs (#7581)

* Prevent TRT runtime crash for duplicate inputs and outputs

* Add empty subgraph unit test
---
 .../contrib/tensorrt/tensorrt_builder.cc      |  8 ++++
 tests/python/contrib/test_tensorrt.py         | 42 +++++++++++++++----
 2 files changed, 43 insertions(+), 7 deletions(-)

diff --git a/src/runtime/contrib/tensorrt/tensorrt_builder.cc b/src/runtime/contrib/tensorrt/tensorrt_builder.cc
index ee47e67001f3..09b36d720877 100644
--- a/src/runtime/contrib/tensorrt/tensorrt_builder.cc
+++ b/src/runtime/contrib/tensorrt/tensorrt_builder.cc
@@ -99,6 +99,14 @@ void TensorRTBuilder::AddOutput(const JSONGraphNodeEntry& node, uint32_t entry_i
   ICHECK(it != node_output_map_.end()) << "Output was not found.";
   auto out_tensor = it->second[node.index_].tensor;
   std::string name = "tensorrt_output_" + std::to_string(network_output_names_.size());
+  // If the network is already marked as an input or output, make a copy to avoid TRT crash.
+  if (out_tensor->isNetworkOutput()) {
+    LOG(WARNING) << name << " is a duplicate output.";
+    out_tensor = network_->addIdentity(*out_tensor)->getOutput(0);
+  } else if (out_tensor->isNetworkInput()) {
+    LOG(WARNING) << name << " is both an input and an output.";
+    out_tensor = network_->addIdentity(*out_tensor)->getOutput(0);
+  }
   out_tensor->setName(name.c_str());
   network_->markOutput(*out_tensor);
   network_output_names_.push_back(name);
diff --git a/tests/python/contrib/test_tensorrt.py b/tests/python/contrib/test_tensorrt.py
index 7ddc4e762cfd..60d6b2aa7571 100644
--- a/tests/python/contrib/test_tensorrt.py
+++ b/tests/python/contrib/test_tensorrt.py
@@ -71,6 +71,14 @@ def assert_result_dict_holds(result_dict):
             tvm.testing.assert_allclose(r1, r2, rtol=1e-3, atol=1e-3)
 
 
+def set_func_attr(func, compile_name, symbol_name):
+    func = func.with_attr("Primitive", tvm.tir.IntImm("int32", 1))
+    func = func.with_attr("Inline", tvm.tir.IntImm("int32", 1))
+    func = func.with_attr("Compiler", compile_name)
+    func = func.with_attr("global_symbol", symbol_name)
+    return func
+
+
 def run_and_verify_func(config, target="cuda"):
     """Test a Relay func by compiling, running, and comparing TVM and TRT outputs.
 
@@ -1109,13 +1117,6 @@ def test_dynamic_offload():
     kernel = relay.var("kernel", shape=(k_shape), dtype="float32")
 
     def get_expected():
-        def set_func_attr(func, compile_name, symbol_name):
-            func = func.with_attr("Primitive", tvm.tir.IntImm("int32", 1))
-            func = func.with_attr("Inline", tvm.tir.IntImm("int32", 1))
-            func = func.with_attr("Compiler", compile_name)
-            func = func.with_attr("global_symbol", symbol_name)
-            return func
-
         # Create a nested TRT function that matches the expected output
         mod = tvm.IRModule()
         var1 = relay.var("tensorrt_0_i0", shape=(data_shape), dtype="float32")
@@ -1331,5 +1332,32 @@ def get_maskrcnn_input(in_size: int) -> np.ndarray:
         )
 
 
+def test_empty_subgraph():
+    if skip_codegen_test():
+        return
+    x_shape = (1, 3, 5)
+    mod = tvm.IRModule()
+    # Empty tensorrt subgraph.
+    var1 = relay.var("tensorrt_0_i0", shape=(x_shape), dtype="float32")
+    f1 = GlobalVar("tensorrt_0")
+    func = relay.Function([var1], var1)
+    func = set_func_attr(func, "tensorrt", "tensorrt_0")
+    mod[f1] = func
+    mod = relay.transform.InferType()(mod)
+
+    # Create the main function
+    x = relay.var("x", shape=x_shape, dtype="float32")
+    out = f1(relay.nn.relu(x))
+    f = relay.Function([x], out)
+    mod["main"] = f
+
+    x_data = np.random.uniform(-1, 1, x_shape).astype("float32")
+    for mode in ["graph", "vm"]:
+        with tvm.transform.PassContext(opt_level=3):
+            exec = relay.create_executor(mode, mod=mod, ctx=tvm.gpu(0), target="cuda")
+            if not skip_runtime_test():
+                results = exec.evaluate()(x_data)
+
+
 if __name__ == "__main__":
     pytest.main([__file__])

From d7f57532746680732e58ab028d8c3129b9140d3d Mon Sep 17 00:00:00 2001
From: masahi <masahi129@gmail.com>
Date: Fri, 5 Mar 2021 09:55:09 +0900
Subject: [PATCH 280/357] [SPIRV] Support Bool buffer argument (#7591)

---
 src/target/spirv/codegen_spirv.cc             | 24 ++++--
 .../unittest/test_target_codegen_spirv.py     | 75 +++++++++++++++++++
 2 files changed, 93 insertions(+), 6 deletions(-)
 create mode 100644 tests/python/unittest/test_target_codegen_spirv.py

diff --git a/src/target/spirv/codegen_spirv.cc b/src/target/spirv/codegen_spirv.cc
index 6311b435f197..24608ebc93f4 100644
--- a/src/target/spirv/codegen_spirv.cc
+++ b/src/target/spirv/codegen_spirv.cc
@@ -45,10 +45,15 @@ std::vector<uint32_t> CodeGenSPIRV::BuildFunction(const PrimFunc& f, const std::
       if (auto* ptr = arg->type_annotation.as<PointerTypeNode>()) {
         auto* prim = ptr->element_type.as<PrimTypeNode>();
         ICHECK(prim);
-        DataType value_type = prim->dtype;
+        DataType value_storage_type = prim->dtype;
+        if (value_storage_type == DataType::UInt(1)) {
+          // We need a physically addressable buffer type to support boolean tensors.
+          // The loaded byte is cast to bool inside the LoadNode visitor below.
+          value_storage_type = DataType::UInt(8);
+        }
         spirv::Value arg_value =
-            builder_->BufferArgument(builder_->GetSType(value_type), 0, num_buffer);
-        storage_info_[arg.get()].UpdateContentType(value_type);
+            builder_->BufferArgument(builder_->GetSType(value_storage_type), 0, num_buffer);
+        storage_info_[arg.get()].UpdateContentType(value_storage_type);
         var_map_[arg.get()] = arg_value;
       } else {
         LOG(FATAL) << "require all handles to be typed";
@@ -369,11 +374,18 @@ spirv::Value CodeGenSPIRV::VisitExpr_(const LoadNode* op) {
     mask |= spv::MemoryAccessVolatileMask;
   }
   if (op->dtype.lanes() == 1) {
-    ICHECK_EQ(info.content_type, op->dtype)
-        << "Vulkan only allow one type access to the same buffer";
     spirv::Value index = MakeValue(op->index);
     spirv::Value ptr = builder_->StructArrayAccess(ptr_type, buffer, index);
-    return builder_->MakeValue(spv::OpLoad, content_type, ptr, mask);
+    spirv::Value loaded = builder_->MakeValue(spv::OpLoad, content_type, ptr, mask);
+    if (op->dtype == DataType::UInt(1)) {
+      // A bool tensor is backed by a byte buffer, we cast to bool here.
+      auto bool_ty = builder_->GetSType(DataType::UInt(1));
+      return builder_->Cast(bool_ty, loaded);
+    } else {
+      ICHECK_EQ(info.content_type, op->dtype)
+          << "Vulkan only allow one type access to the same buffer";
+      return loaded;
+    }
   } else {
     if (op->dtype.element_of() == info.content_type) {
       // because content type is element type, we can only do scalarize load.
diff --git a/tests/python/unittest/test_target_codegen_spirv.py b/tests/python/unittest/test_target_codegen_spirv.py
new file mode 100644
index 000000000000..2cbf0bea9257
--- /dev/null
+++ b/tests/python/unittest/test_target_codegen_spirv.py
@@ -0,0 +1,75 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+import tvm
+import tvm.testing
+from tvm import te
+from tvm.topi.math import cast
+import numpy as np
+
+
+def test_bool_load():
+    def do_copy(A, B, n):
+        ib = tvm.tir.ir_builder.create()
+        A = ib.buffer_ptr(A)
+        B = ib.buffer_ptr(B)
+
+        tx = te.thread_axis("threadIdx.x")
+        bx = te.thread_axis("blockIdx.x")
+
+        max_threads = 32
+        ib.scope_attr(bx, "thread_extent", tvm.tir.indexdiv(n + max_threads - 1, max_threads))
+        ib.scope_attr(tx, "thread_extent", max_threads)
+        tid = bx * max_threads + tx
+
+        with ib.if_scope(tid < n):
+            B[tid] = cast(A[tid], "int32")
+
+        return ib.get()
+
+    n = 1024
+    A = te.placeholder((n,), name="A", dtype="bool")
+    B = te.placeholder((n,), name="B", dtype="int32")
+
+    target = "vulkan"
+
+    if not tvm.testing.device_enabled(target):
+        return
+
+    B = te.extern(
+        A.shape,
+        [A],
+        lambda ins, outs: do_copy(ins[0], outs[0], n),
+        name="bool_copy_ir",
+        dtype="int32",
+    )
+    s = te.create_schedule(B.op)
+
+    with tvm.transform.PassContext(opt_level=3):
+        func = tvm.build(s, [A, B], target)
+
+    ctx = tvm.context(target, 0)
+    a_np = np.random.uniform(size=n) > 0.5
+    b_np = np.zeros((n,), dtype="int32")
+    a = tvm.nd.array(a_np, ctx)
+    b = tvm.nd.array(b_np, ctx)
+    func(a, b)
+    ref = a_np.astype(np.int32)
+    tvm.testing.assert_allclose(b.asnumpy(), ref)
+
+
+if __name__ == "__main__":
+    test_bool_load()

From d5cb3cbdfca5a6c8db78b96d3e96eca948406510 Mon Sep 17 00:00:00 2001
From: Cody Yu <comaniac0422@gmail.com>
Date: Thu, 4 Mar 2021 21:34:18 -0800
Subject: [PATCH 281/357] [PyTorch] Guarantee data input is the first argument
 (#7592)

---
 python/tvm/relay/frontend/pytorch.py          | 13 ++++++++++++-
 tests/python/frontend/pytorch/test_forward.py |  2 ++
 2 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/python/tvm/relay/frontend/pytorch.py b/python/tvm/relay/frontend/pytorch.py
index dcf2f08caeef..e5ad57c6b87a 100644
--- a/python/tvm/relay/frontend/pytorch.py
+++ b/python/tvm/relay/frontend/pytorch.py
@@ -3216,5 +3216,16 @@ def from_pytorch(script_module, input_infos, custom_convert_map=None, default_dt
         # ListConstruct kept original python list. Convert to tuple.
         ret = _expr.Tuple(ret)
 
-    mod["main"] = tvm.relay.Function(_analysis.free_vars(ret), ret)
+    # Separate data inputs and parameters to make sure data inputs are always in the beginning.
+    func_args = []
+    data_inputs = []
+    for arg in _analysis.free_vars(ret):
+        if arg.name_hint not in tvm_params.keys():
+            data_inputs.append(arg)
+        else:
+            func_args.append(arg)
+    func_args = data_inputs + func_args
+
+    mod["main"] = tvm.relay.Function(func_args, ret)
+
     return transform.RemoveUnusedFunctions()(mod), tvm_params
diff --git a/tests/python/frontend/pytorch/test_forward.py b/tests/python/frontend/pytorch/test_forward.py
index 54bf2fd49acb..41679bf16c5d 100644
--- a/tests/python/frontend/pytorch/test_forward.py
+++ b/tests/python/frontend/pytorch/test_forward.py
@@ -201,6 +201,8 @@ def verify_model(model_name, input_data=[], custom_convert_map={}, rtol=1e-5, at
     input_names = ["input{}".format(idx) for idx, inp in enumerate(baseline_input)]
     input_shapes = list(zip(input_names, [inp.shape for inp in baseline_input]))
     mod, params = relay.frontend.from_pytorch(trace, input_shapes, custom_convert_map)
+    for arg in mod["main"].params[: len(input_names)]:
+        assert arg.name_hint in input_names
     compiled_input = dict(zip(input_names, [inp.clone().cpu().numpy() for inp in baseline_input]))
 
     with tvm.transform.PassContext(opt_level=3):

From 61e799cb85c90a963e7e493709eee2c8a327e8d3 Mon Sep 17 00:00:00 2001
From: Zhi <5145158+zhiics@users.noreply.github.com>
Date: Thu, 4 Mar 2021 23:17:41 -0800
Subject: [PATCH 282/357] [CI] Bump arm version (#7584)

---
 Jenkinsfile              | 2 +-
 docker/Dockerfile.ci_arm | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/Jenkinsfile b/Jenkinsfile
index bba3950aea87..506dcab4e306 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -50,7 +50,7 @@ ci_cpu = "tlcpack/ci-cpu:v0.72-t0"
 ci_wasm = "tlcpack/ci-wasm:v0.70"
 ci_i386 = "tlcpack/ci-i386:v0.72-t0"
 ci_qemu = "tlcpack/ci-qemu:v0.01"
-ci_arm = "tlcpack/ci-arm:v0.01"
+ci_arm = "tlcpack/ci-arm:v0.02"
 // <--- End of regex-scanned config.
 
 // tvm libraries
diff --git a/docker/Dockerfile.ci_arm b/docker/Dockerfile.ci_arm
index 020792700ee9..671ce04e8c1d 100644
--- a/docker/Dockerfile.ci_arm
+++ b/docker/Dockerfile.ci_arm
@@ -16,7 +16,7 @@
 # under the License.
 
 # CI docker arm env
-# tag: v0.10
+# tag: v0.02
 
 FROM ubuntu:18.04
 

From fb06fd8a6d7d6580e99e75ae5deae5e57ad02388 Mon Sep 17 00:00:00 2001
From: masahi <masahi129@gmail.com>
Date: Fri, 5 Mar 2021 21:18:51 +0900
Subject: [PATCH 283/357] Fix for dynamic batch size conv2d nhwc (#7598)

---
 python/tvm/relay/op/strategy/cuda.py | 2 ++
 python/tvm/topi/cuda/conv2d_nhwc.py  | 4 +++-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/python/tvm/relay/op/strategy/cuda.py b/python/tvm/relay/op/strategy/cuda.py
index 85bbab692574..3b498a99495e 100644
--- a/python/tvm/relay/op/strategy/cuda.py
+++ b/python/tvm/relay/op/strategy/cuda.py
@@ -354,6 +354,8 @@ def judge_winograd(
     OH = (H + pt + pb - KH) // stride_h + 1
     OW = (W + pl + pr - KW) // stride_w + 1
     nH, nW = (OH + tile_size - 1) // tile_size, (OW + tile_size - 1) // tile_size
+    if not isinstance(N, int):
+        return False, False, False
     P = N * nH * nW
 
     judge_winograd_tensorcore = (
diff --git a/python/tvm/topi/cuda/conv2d_nhwc.py b/python/tvm/topi/cuda/conv2d_nhwc.py
index a08d217696e2..991585587bbf 100644
--- a/python/tvm/topi/cuda/conv2d_nhwc.py
+++ b/python/tvm/topi/cuda/conv2d_nhwc.py
@@ -129,4 +129,6 @@ def schedule_conv2d_nhwc_direct(cfg, s, Conv):
 
     N, OH, OW, CO = get_const_tuple(output.shape)
     KH, KW, CI, _ = get_const_tuple(kernel.shape)
-    cfg.add_flop(2 * N * OH * OW * CO * CI * KH * KW)
+
+    if isinstance(N, int):
+        cfg.add_flop(2 * N * OH * OW * CO * CI * KH * KW)

From b9adce2c8006095eb0747ee991a9475f43b46264 Mon Sep 17 00:00:00 2001
From: Trevor Morris <trevmorr@amazon.com>
Date: Fri, 5 Mar 2021 04:46:43 -0800
Subject: [PATCH 284/357] [Frontend][MXNet] Fix default value for is_ascend in
 topk (#7568)

* Use correct default value of False for is_ascend

* Add unit test for default topk is_ascend value
---
 python/tvm/relay/frontend/mxnet.py          |  2 +-
 tests/python/frontend/mxnet/test_forward.py | 25 ++++++++++++++-------
 2 files changed, 18 insertions(+), 9 deletions(-)

diff --git a/python/tvm/relay/frontend/mxnet.py b/python/tvm/relay/frontend/mxnet.py
index 0c9d2c4381ac..5415c77097a2 100644
--- a/python/tvm/relay/frontend/mxnet.py
+++ b/python/tvm/relay/frontend/mxnet.py
@@ -1234,7 +1234,7 @@ def _mx_topk(inputs, attrs):
     new_attrs = {}
     new_attrs["k"] = attrs.get_int("k", 1)
     new_attrs["axis"] = attrs.get_int("axis", -1)
-    new_attrs["is_ascend"] = attrs.get_bool("is_ascend", True)
+    new_attrs["is_ascend"] = attrs.get_bool("is_ascend", False)
     ret_type = attrs.get_str("ret_typ", "indices")
     if ret_type == "mask":
         raise tvm.error.OpAttributeUnimplemented(
diff --git a/tests/python/frontend/mxnet/test_forward.py b/tests/python/frontend/mxnet/test_forward.py
index 3e652cfc69e3..4eb7f6139e8f 100644
--- a/tests/python/frontend/mxnet/test_forward.py
+++ b/tests/python/frontend/mxnet/test_forward.py
@@ -1064,14 +1064,23 @@ def verify(shape, axis, is_ascend, dtype="float32"):
 
 @tvm.testing.uses_gpu
 def test_forward_topk():
-    def verify(shape, k, axis, ret_type, is_ascend=False, dtype="float32"):
+    def verify(shape, k, axis, ret_type, is_ascend=None, dtype="float32"):
         x_np = np.random.uniform(size=shape).astype("float32")
-        ref_res = mx.nd.topk(
-            mx.nd.array(x_np), k=k, axis=axis, ret_typ=ret_type, is_ascend=is_ascend, dtype=dtype
-        )
-        mx_sym = mx.sym.topk(
-            mx.sym.var("x"), k=k, axis=axis, ret_typ=ret_type, is_ascend=is_ascend, dtype=dtype
-        )
+        if is_ascend is None:
+            ref_res = mx.nd.topk(mx.nd.array(x_np), k=k, axis=axis, ret_typ=ret_type, dtype=dtype)
+            mx_sym = mx.sym.topk(mx.sym.var("x"), k=k, axis=axis, ret_typ=ret_type, dtype=dtype)
+        else:
+            ref_res = mx.nd.topk(
+                mx.nd.array(x_np),
+                k=k,
+                axis=axis,
+                ret_typ=ret_type,
+                is_ascend=is_ascend,
+                dtype=dtype,
+            )
+            mx_sym = mx.sym.topk(
+                mx.sym.var("x"), k=k, axis=axis, ret_typ=ret_type, is_ascend=is_ascend, dtype=dtype
+            )
         mod, _ = relay.frontend.from_mxnet(mx_sym, {"x": shape})
         for target, ctx in tvm.testing.enabled_targets():
             for kind in ["graph", "debug"]:
@@ -1086,7 +1095,7 @@ def verify(shape, k, axis, ret_type, is_ascend=False, dtype="float32"):
 
     verify((3, 4), k=1, axis=0, ret_type="both")
     verify((3, 4), k=1, axis=-1, ret_type="indices")
-    verify((3, 5, 6), k=2, axis=2, ret_type="value")
+    verify((3, 5, 6), k=2, axis=2, ret_type="value", is_ascend=False)
     verify((3, 5, 6), k=2, axis=1, ret_type="value", is_ascend=True)
     verify((3, 5, 6), k=0, axis=2, ret_type="both", dtype="int32")
 

From c5f608f08861e2b4e869e0db0f68def7bcf65338 Mon Sep 17 00:00:00 2001
From: Leyuan Wang <laurawly@gmail.com>
Date: Fri, 5 Mar 2021 04:47:07 -0800
Subject: [PATCH 285/357] [BYOC][TRT]Fix groups cannot divide output channel
 count error for deconv when groups>1 (#7595)

* trt num_outputs

* asdf

* fix lint

Co-authored-by: Leyuan Wang <leyuan.wang@bytedance.com>
---
 src/runtime/contrib/tensorrt/tensorrt_ops.cc | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/runtime/contrib/tensorrt/tensorrt_ops.cc b/src/runtime/contrib/tensorrt/tensorrt_ops.cc
index 824178eaa619..04b1e838ee8e 100644
--- a/src/runtime/contrib/tensorrt/tensorrt_ops.cc
+++ b/src/runtime/contrib/tensorrt/tensorrt_ops.cc
@@ -309,8 +309,8 @@ class Conv3DOpConverter : public TensorRTOpConverter {
     bool use_asymmetric_padding;
     GetPadding3D(str_padding, &use_asymmetric_padding, &prepadding, &postpadding);
 
-    // Could use attrs->channels.as<IntImmNode>()->value
-    const int num_outputs = weight_shape[0];
+    const int num_outputs =
+        std::stoi(params->node.GetAttr<std::vector<std::string>>("channels")[0]);
     const auto kernel_size = nvinfer1::Dims3(weight_shape[2], weight_shape[3], weight_shape[4]);
     nvinfer1::Weights bias{nvinfer1::DataType::kFLOAT, nullptr, 0};
     auto conv_layer = params->network->addConvolutionNd(*input_tensor, num_outputs, kernel_size,
@@ -788,8 +788,8 @@ class Conv2DTransposeOpConverter : public TensorRTOpConverter {
     }
 #endif
 
-    // Could use conv2d_attr->channels.as<IntImmNode>()->value
-    const int num_outputs = weight_shape[1];
+    const int num_outputs =
+        std::stoi(params->node.GetAttr<std::vector<std::string>>("channels")[0]);
     const auto kernel_size = nvinfer1::DimsHW(weight_shape[2], weight_shape[3]);
     nvinfer1::Weights bias{nvinfer1::DataType::kFLOAT, nullptr, 0};
     auto deconv_layer = params->network->addDeconvolution(*input_tensor, num_outputs, kernel_size,
@@ -846,8 +846,8 @@ class Conv3DTransposeOpConverter : public TensorRTOpConverter {
     bool use_asymmetric_padding;
     GetPadding3D(str_padding, &use_asymmetric_padding, &prepadding, &postpadding);
 
-    // Could use attrs->channels.as<IntImmNode>()->value
-    const int num_outputs = weight_shape[1];
+    const int num_outputs =
+        std::stoi(params->node.GetAttr<std::vector<std::string>>("channels")[0]);
     const auto kernel_size = nvinfer1::Dims3(weight_shape[2], weight_shape[3], weight_shape[4]);
     nvinfer1::Weights bias{nvinfer1::DataType::kFLOAT, nullptr, 0};
     auto deconv_layer = params->network->addDeconvolutionNd(*input_tensor, num_outputs, kernel_size,

From 5d5bbfb1ed2281c3e9b2719be6643281d60022fb Mon Sep 17 00:00:00 2001
From: Ritwik Das <ritwikdas54@gmail.com>
Date: Fri, 5 Mar 2021 09:16:06 -0800
Subject: [PATCH 286/357] Support negative axis for gather (#7600)

* Fix negative axis in gather

* Clang Format

* Black

* Empty Commit

Co-authored-by: Ubuntu <ubuntu@ip-172-31-42-251.us-east-2.compute.internal>
---
 include/tvm/topi/transform.h                  |   3 +
 python/tvm/relay/op/transform.py              |   2 +-
 src/relay/op/tensor/transform.cc              |   3 +
 tests/python/frontend/pytorch/test_forward.py |   6 +-
 tests/python/relay/test_op_level3.py          | 223 ++++++++++++------
 5 files changed, 167 insertions(+), 70 deletions(-)

diff --git a/include/tvm/topi/transform.h b/include/tvm/topi/transform.h
index 261fdf9970a3..3ad230560f3a 100644
--- a/include/tvm/topi/transform.h
+++ b/include/tvm/topi/transform.h
@@ -1134,6 +1134,9 @@ inline Tensor gather(const Tensor& data, int axis, const Tensor& indices,
   size_t ndim_i = indices->shape.size();
   ICHECK_GE(ndim_d, 1) << "Cannot gather from a scalar.";
   ICHECK_EQ(ndim_d, ndim_i);
+  if (axis < 0) {
+    axis += ndim_d;
+  }
   ICHECK_GE(axis, 0);
   ICHECK_LT(axis, ndim_d);
   size_t indices_dim_i = static_cast<size_t>(GetConstInt(indices->shape[axis]));
diff --git a/python/tvm/relay/op/transform.py b/python/tvm/relay/op/transform.py
index f2e3850a8f67..4129b610cb7c 100644
--- a/python/tvm/relay/op/transform.py
+++ b/python/tvm/relay/op/transform.py
@@ -1047,7 +1047,7 @@ def gather(data, axis, indices):
         The input data to the operator.
 
     axis: int
-        The axis along which to index.
+        The axis along which to index. negative axis is supported.
 
     indices: relay.Expr
         The indices of values to gather.
diff --git a/src/relay/op/tensor/transform.cc b/src/relay/op/tensor/transform.cc
index 941f43a5a2c4..e3929bf8b77e 100644
--- a/src/relay/op/tensor/transform.cc
+++ b/src/relay/op/tensor/transform.cc
@@ -3179,6 +3179,9 @@ bool GatherRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
   const auto ndim_indices = indices->shape.size();
   int axis = param->axis->value;
   ICHECK_EQ(ndim_data, ndim_indices);
+  if (axis < 0) {
+    axis += ndim_data;
+  }
   ICHECK_GE(axis, 0);
   ICHECK_LT(axis, ndim_data);
 
diff --git a/tests/python/frontend/pytorch/test_forward.py b/tests/python/frontend/pytorch/test_forward.py
index 41679bf16c5d..24f8edab7d98 100644
--- a/tests/python/frontend/pytorch/test_forward.py
+++ b/tests/python/frontend/pytorch/test_forward.py
@@ -3706,13 +3706,13 @@ def test_fn(dim, descending):
 
     inp = torch.randn(100)
     verify_model(test_fn(0, True), [inp])
-    verify_model(test_fn(0, False), [inp])
+    verify_model(test_fn(-1, False), [inp])
 
     inp = torch.randn(100, 100)
     verify_model(test_fn(0, True), [inp])
-    verify_model(test_fn(0, False), [inp])
+    verify_model(test_fn(-2, False), [inp])
     verify_model(test_fn(1, True), [inp])
-    verify_model(test_fn(1, False), [inp])
+    verify_model(test_fn(-1, False), [inp])
 
 
 def test_logical_and():
diff --git a/tests/python/relay/test_op_level3.py b/tests/python/relay/test_op_level3.py
index 31b95b0b49ae..d2a5090943c3 100644
--- a/tests/python/relay/test_op_level3.py
+++ b/tests/python/relay/test_op_level3.py
@@ -1075,12 +1075,166 @@ def verify_scatter_add(dshape, ishape, axis=0, dtype="float32"):
 
 
 @tvm.testing.uses_gpu
-def test_gather():
+@pytest.mark.parametrize(
+    "data, axis, indices, ref_res",
+    [
+        ([[1, 2], [3, 4]], 1, [[0, 0], [1, 0]], [[1, 1], [4, 3]]),
+        ([[1, 2], [3, 4]], -1, [[0, 0], [1, 0]], [[1, 1], [4, 3]]),
+        (
+            [[[0, 1, 2], [3, 4, 5]], [[6, 7, 8], [9, 10, 11]]],
+            0,
+            [[[1, 0, 1], [1, 1, 0]]],
+            [[[6, 1, 8], [9, 10, 5]]],
+        ),
+        (
+            [[[0, 1, 2], [3, 4, 5]], [[6, 7, 8], [9, 10, 11]]],
+            -3,
+            [[[1, 0, 1], [1, 1, 0]]],
+            [[[6, 1, 8], [9, 10, 5]]],
+        ),
+        (
+            [
+                [
+                    [-0.2321, -0.2024, -1.7624],
+                    [-0.3829, -0.4246, 0.2448],
+                    [0.1822, 0.2360, -0.8965],
+                    [0.4497, -0.2224, 0.6103],
+                ],
+                [
+                    [0.0408, -0.7667, -0.4303],
+                    [-0.3216, 0.7489, -0.1502],
+                    [0.0144, -0.4699, -0.0064],
+                    [-0.0768, -1.6064, 1.3390],
+                ],
+            ],
+            1,
+            [[[2, 2, 0], [1, 0, 3]], [[3, 2, 0], [1, 0, 0]]],
+            [
+                [[0.1822, 0.2360, -1.7624], [-0.3829, -0.2024, 0.6103]],
+                [[-0.0768, -0.4699, -0.4303], [-0.3216, -0.7667, -0.4303]],
+            ],
+        ),
+        (
+            [
+                [
+                    [-0.2321, -0.2024, -1.7624],
+                    [-0.3829, -0.4246, 0.2448],
+                    [0.1822, 0.2360, -0.8965],
+                    [0.4497, -0.2224, 0.6103],
+                ],
+                [
+                    [0.0408, -0.7667, -0.4303],
+                    [-0.3216, 0.7489, -0.1502],
+                    [0.0144, -0.4699, -0.0064],
+                    [-0.0768, -1.6064, 1.3390],
+                ],
+            ],
+            -2,
+            [[[2, 2, 0], [1, 0, 3]], [[3, 2, 0], [1, 0, 0]]],
+            [
+                [[0.1822, 0.2360, -1.7624], [-0.3829, -0.2024, 0.6103]],
+                [[-0.0768, -0.4699, -0.4303], [-0.3216, -0.7667, -0.4303]],
+            ],
+        ),
+        (
+            [
+                [
+                    [-0.2321, -0.2024, -1.7624],
+                    [-0.3829, -0.4246, 0.2448],
+                    [0.1822, 0.2360, -0.8965],
+                    [0.4497, -0.2224, 0.6103],
+                ],
+                [
+                    [0.0408, -0.7667, -0.4303],
+                    [-0.3216, 0.7489, -0.1502],
+                    [0.0144, -0.4699, -0.0064],
+                    [-0.0768, -1.6064, 1.3390],
+                ],
+            ],
+            -2,
+            [[[2, 2, 0], [1, 0, 3]], [[3, 2, 0], [1, 0, 0]]],
+            [
+                [[0.1822, 0.2360, -1.7624], [-0.3829, -0.2024, 0.6103]],
+                [[-0.0768, -0.4699, -0.4303], [-0.3216, -0.7667, -0.4303]],
+            ],
+        ),
+        (
+            [
+                [
+                    [0.3050, 1.6986, 1.1034],
+                    [0.7020, -0.6960, -2.1818],
+                    [0.3116, -0.5773, -0.9912],
+                    [0.0835, -1.3915, -1.0720],
+                ],
+                [
+                    [0.1694, -0.6091, -0.6539],
+                    [-0.5234, -0.1218, 0.5084],
+                    [0.2374, -1.9537, -2.0078],
+                    [-0.5700, -1.0302, 0.1558],
+                ],
+            ],
+            2,
+            [
+                [[1, 1, 0, 1], [0, 0, 2, 2], [1, 2, 1, 2], [2, 2, 1, 0]],
+                [[0, 0, 1, 2], [2, 2, 1, 0], [1, 2, 0, 0], [0, 2, 0, 2]],
+            ],
+            [
+                [
+                    [1.6986, 1.6986, 0.3050, 1.6986],
+                    [0.7020, 0.7020, -2.1818, -2.1818],
+                    [-0.5773, -0.9912, -0.5773, -0.9912],
+                    [-1.0720, -1.0720, -1.3915, 0.0835],
+                ],
+                [
+                    [0.1694, 0.1694, -0.6091, -0.6539],
+                    [0.5084, 0.5084, -0.1218, -0.5234],
+                    [-1.9537, -2.0078, 0.2374, 0.2374],
+                    [-0.5700, 0.1558, -0.5700, 0.1558],
+                ],
+            ],
+        ),
+        (
+            [
+                [
+                    [0.3050, 1.6986, 1.1034],
+                    [0.7020, -0.6960, -2.1818],
+                    [0.3116, -0.5773, -0.9912],
+                    [0.0835, -1.3915, -1.0720],
+                ],
+                [
+                    [0.1694, -0.6091, -0.6539],
+                    [-0.5234, -0.1218, 0.5084],
+                    [0.2374, -1.9537, -2.0078],
+                    [-0.5700, -1.0302, 0.1558],
+                ],
+            ],
+            -1,
+            [
+                [[1, 1, 0, 1], [0, 0, 2, 2], [1, 2, 1, 2], [2, 2, 1, 0]],
+                [[0, 0, 1, 2], [2, 2, 1, 0], [1, 2, 0, 0], [0, 2, 0, 2]],
+            ],
+            [
+                [
+                    [1.6986, 1.6986, 0.3050, 1.6986],
+                    [0.7020, 0.7020, -2.1818, -2.1818],
+                    [-0.5773, -0.9912, -0.5773, -0.9912],
+                    [-1.0720, -1.0720, -1.3915, 0.0835],
+                ],
+                [
+                    [0.1694, 0.1694, -0.6091, -0.6539],
+                    [0.5084, 0.5084, -0.1218, -0.5234],
+                    [-1.9537, -2.0078, 0.2374, 0.2374],
+                    [-0.5700, 0.1558, -0.5700, 0.1558],
+                ],
+            ],
+        ),
+    ],
+)
+def test_gather(data, axis, indices, ref_res):
     def verify_gather(data, axis, indices, ref_res):
         data = np.asarray(data, dtype="float32")
         indices = np.asarray(indices, dtype="int32")
         ref_res = np.asarray(ref_res)
-
         d = relay.var("x", relay.TensorType(data.shape, "float32"))
         i = relay.var("y", relay.TensorType(indices.shape, "int32"))
         z = relay.gather(d, axis, i)
@@ -1093,70 +1247,7 @@ def verify_gather(data, axis, indices, ref_res):
                 op_res = intrp.evaluate(func)(data, indices)
                 tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5)
 
-    verify_gather([[1, 2], [3, 4]], 1, [[0, 0], [1, 0]], [[1, 1], [4, 3]])
-    verify_gather(
-        [[[0, 1, 2], [3, 4, 5]], [[6, 7, 8], [9, 10, 11]]],
-        0,
-        [[[1, 0, 1], [1, 1, 0]]],
-        [[[6, 1, 8], [9, 10, 5]]],
-    )
-    verify_gather(
-        [
-            [
-                [-0.2321, -0.2024, -1.7624],
-                [-0.3829, -0.4246, 0.2448],
-                [0.1822, 0.2360, -0.8965],
-                [0.4497, -0.2224, 0.6103],
-            ],
-            [
-                [0.0408, -0.7667, -0.4303],
-                [-0.3216, 0.7489, -0.1502],
-                [0.0144, -0.4699, -0.0064],
-                [-0.0768, -1.6064, 1.3390],
-            ],
-        ],
-        1,
-        [[[2, 2, 0], [1, 0, 3]], [[3, 2, 0], [1, 0, 0]]],
-        [
-            [[0.1822, 0.2360, -1.7624], [-0.3829, -0.2024, 0.6103]],
-            [[-0.0768, -0.4699, -0.4303], [-0.3216, -0.7667, -0.4303]],
-        ],
-    )
-    verify_gather(
-        [
-            [
-                [0.3050, 1.6986, 1.1034],
-                [0.7020, -0.6960, -2.1818],
-                [0.3116, -0.5773, -0.9912],
-                [0.0835, -1.3915, -1.0720],
-            ],
-            [
-                [0.1694, -0.6091, -0.6539],
-                [-0.5234, -0.1218, 0.5084],
-                [0.2374, -1.9537, -2.0078],
-                [-0.5700, -1.0302, 0.1558],
-            ],
-        ],
-        2,
-        [
-            [[1, 1, 0, 1], [0, 0, 2, 2], [1, 2, 1, 2], [2, 2, 1, 0]],
-            [[0, 0, 1, 2], [2, 2, 1, 0], [1, 2, 0, 0], [0, 2, 0, 2]],
-        ],
-        [
-            [
-                [1.6986, 1.6986, 0.3050, 1.6986],
-                [0.7020, 0.7020, -2.1818, -2.1818],
-                [-0.5773, -0.9912, -0.5773, -0.9912],
-                [-1.0720, -1.0720, -1.3915, 0.0835],
-            ],
-            [
-                [0.1694, 0.1694, -0.6091, -0.6539],
-                [0.5084, 0.5084, -0.1218, -0.5234],
-                [-1.9537, -2.0078, 0.2374, 0.2374],
-                [-0.5700, 0.1558, -0.5700, 0.1558],
-            ],
-        ],
-    )
+    verify_gather(data, axis, indices, ref_res)
 
 
 @tvm.testing.uses_gpu

From c0b9688869f26794765fb26a8d9b51191f60d6b2 Mon Sep 17 00:00:00 2001
From: masahi <masahi129@gmail.com>
Date: Sat, 6 Mar 2021 03:53:48 +0900
Subject: [PATCH 287/357] [Vulkan] Support passing 64 bit scalar  (#7572)

Co-authored-by: Wuwei Lin <wuwei@apache.org>
---
 src/runtime/metal/metal_module.mm            |  4 +--
 src/runtime/pack_args.h                      | 36 ++++++++++++++------
 src/runtime/vulkan/vulkan.cc                 | 14 ++++----
 src/target/source/codegen_metal.cc           |  7 +++-
 src/target/spirv/intrin_rule_spirv.cc        |  6 ++++
 tests/python/topi/python/test_topi_cumsum.py |  7 ++++
 tests/python/topi/python/test_topi_vision.py |  2 +-
 7 files changed, 55 insertions(+), 21 deletions(-)

diff --git a/src/runtime/metal/metal_module.mm b/src/runtime/metal/metal_module.mm
index 981dd6129f9e..8f1fde86f074 100644
--- a/src/runtime/metal/metal_module.mm
+++ b/src/runtime/metal/metal_module.mm
@@ -180,7 +180,7 @@ void Init(MetalModuleNode* m, ObjectPtr<Object> sptr, const std::string& func_na
     scache_[dev_id] = m->GetPipelineState(dev_id, func_name);
   }
   // invoke the function with void arguments
-  void operator()(TVMArgs args, TVMRetValue* rv, const ArgUnion* pack_args) const {
+  void operator()(TVMArgs args, TVMRetValue* rv, const ArgUnion64* pack_args) const {
     metal::MetalThreadEntry* t = metal::MetalThreadEntry::ThreadLocal();
     int device_id = t->context.device_id;
     if (scache_[device_id] == nil) {
@@ -197,7 +197,7 @@ void operator()(TVMArgs args, TVMRetValue* rv, const ArgUnion* pack_args) const
     }
     if (num_pack_args_ != 0) {
       [encoder setBytes:pack_args
-                 length:num_pack_args_ * sizeof(ArgUnion)
+                 length:num_pack_args_ * sizeof(ArgUnion64)
                 atIndex:num_buffer_args_];
     }
     // launch
diff --git a/src/runtime/pack_args.h b/src/runtime/pack_args.h
index 45cde22bda08..7c852da77df6 100644
--- a/src/runtime/pack_args.h
+++ b/src/runtime/pack_args.h
@@ -40,13 +40,24 @@ namespace tvm {
 namespace runtime {
 /*!
  * \brief argument union type of 32bit.
- * Choose 32 bit because most GPU API do not work well with 64 bit.
  */
-union ArgUnion {
+union ArgUnion32 {
   int32_t v_int32;
   uint32_t v_uint32;
   float v_float32;
 };
+
+/*!
+ * \brief argument union type of 64 bit, for use by Vulkan and Metal runtime.
+ */
+union ArgUnion64 {
+  int32_t v_int32[2];
+  uint32_t v_uint32[2];
+  float v_float32[2];
+  int64_t v_int64;
+  uint64_t v_uint64;
+  double v_float64;
+};
 /*!
  * \brief Create a packed function from void addr types.
  *
@@ -140,9 +151,9 @@ inline PackedFunc PackFuncVoidAddr_(F f, const std::vector<ArgConvertCode>& code
   int num_args = static_cast<int>(codes.size());
   auto ret = [f, codes, num_args](TVMArgs args, TVMRetValue* ret) {
     TempArray<void*, N> addr_(num_args);
-    TempArray<ArgUnion, N> holder_(num_args);
+    TempArray<ArgUnion32, N> holder_(num_args);
     void** addr = addr_.data();
-    ArgUnion* holder = holder_.data();
+    ArgUnion32* holder = holder_.data();
     for (int i = 0; i < num_args; ++i) {
       switch (codes[i]) {
         case INT64_TO_INT64:
@@ -177,25 +188,28 @@ template <int N, typename F>
 inline PackedFunc PackFuncNonBufferArg_(F f, int base, const std::vector<ArgConvertCode>& codes) {
   int num_args = static_cast<int>(codes.size());
   auto ret = [f, codes, base, num_args](TVMArgs args, TVMRetValue* ret) {
-    TempArray<ArgUnion, N> holder_(num_args);
-    ArgUnion* holder = holder_.data();
+    TempArray<ArgUnion64, N> holder_(num_args);
+    ArgUnion64* holder = holder_.data();
     for (int i = 0; i < num_args; ++i) {
       switch (codes[i]) {
-        case INT64_TO_INT64:
+        case INT64_TO_INT64: {
+          holder[i].v_int64 = args.values[base + i].v_int64;
+          break;
+        }
         case FLOAT64_TO_FLOAT64: {
-          LOG(FATAL) << "Do not support 64bit argument to device function";
+          holder[i].v_float64 = args.values[base + i].v_float64;
           break;
         }
         case INT64_TO_INT32: {
-          holder[i].v_int32 = static_cast<int32_t>(args.values[base + i].v_int64);
+          holder[i].v_int32[0] = static_cast<int32_t>(args.values[base + i].v_int64);
           break;
         }
         case INT64_TO_UINT32: {
-          holder[i].v_uint32 = static_cast<uint32_t>(args.values[base + i].v_int64);
+          holder[i].v_uint32[0] = static_cast<uint32_t>(args.values[base + i].v_int64);
           break;
         }
         case FLOAT64_TO_FLOAT32: {
-          holder[i].v_float32 = static_cast<float>(args.values[base + i].v_float64);
+          holder[i].v_float32[0] = static_cast<float>(args.values[base + i].v_float64);
           break;
         }
         case HANDLE_TO_HANDLE: {
diff --git a/src/runtime/vulkan/vulkan.cc b/src/runtime/vulkan/vulkan.cc
index f40fd80f38b5..794f3c570f96 100644
--- a/src/runtime/vulkan/vulkan.cc
+++ b/src/runtime/vulkan/vulkan.cc
@@ -711,7 +711,7 @@ class VulkanWrappedFunc {
     thread_axis_cfg_.Init(num_buffer_args + num_pack_args, thread_axis_tags);
   }
 
-  void operator()(TVMArgs args, TVMRetValue* rv, const ArgUnion* pack_args) const;
+  void operator()(TVMArgs args, TVMRetValue* rv, const ArgUnion64* pack_args) const;
 
  private:
   // internal module
@@ -875,7 +875,7 @@ class VulkanModuleNode final : public runtime::ModuleNode {
     VkPushConstantRange crange;
     crange.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT;
     crange.offset = 0;
-    crange.size = sizeof(ArgUnion) * num_pack_args;
+    crange.size = sizeof(ArgUnion64) * num_pack_args;
 
     VkPipelineLayoutCreateInfo playout_cinfo;
     playout_cinfo.sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO;
@@ -1046,7 +1046,8 @@ VulkanStream* VulkanThreadEntry::Stream(size_t device_id) {
   return streams_[device_id].get();
 }
 
-void VulkanWrappedFunc::operator()(TVMArgs args, TVMRetValue* rv, const ArgUnion* pack_args) const {
+void VulkanWrappedFunc::operator()(TVMArgs args, TVMRetValue* rv,
+                                   const ArgUnion64* pack_args) const {
   int device_id = VulkanThreadEntry::ThreadLocal()->ctx.device_id;
   ICHECK_LT(device_id, kVulkanMaxNumDevice);
   const auto& vctx = VulkanDeviceAPI::Global()->context(device_id);
@@ -1075,7 +1076,7 @@ void VulkanWrappedFunc::operator()(TVMArgs args, TVMRetValue* rv, const ArgUnion
           descriptor_buffers.data());
       if (num_pack_args_ != 0) {
         vkCmdPushConstants(state->cmd_buffer_, pipeline->pipeline_layout,
-                           VK_SHADER_STAGE_COMPUTE_BIT, 0, num_pack_args_ * sizeof(ArgUnion),
+                           VK_SHADER_STAGE_COMPUTE_BIT, 0, num_pack_args_ * sizeof(ArgUnion64),
                            pack_args);
       }
       vkCmdDispatch(state->cmd_buffer_, wl.grid_dim(0), wl.grid_dim(1), wl.grid_dim(2));
@@ -1093,7 +1094,7 @@ void VulkanWrappedFunc::operator()(TVMArgs args, TVMRetValue* rv, const ArgUnion
   }
 
   // Otherwise, the more expensive deferred path.
-  std::vector<ArgUnion> pack_args_storage(pack_args, pack_args + num_pack_args_);
+  std::vector<ArgUnion64> pack_args_storage(pack_args, pack_args + num_pack_args_);
   const auto& deferred_initializer = [&vctx, pipeline, descriptor_buffers]() {
     std::vector<VkWriteDescriptorSet> write_descriptor_sets;
     write_descriptor_sets.resize(descriptor_buffers.size());
@@ -1119,7 +1120,8 @@ void VulkanWrappedFunc::operator()(TVMArgs args, TVMRetValue* rv, const ArgUnion
                             nullptr);
     if (pack_args_storage.size() != 0) {
       vkCmdPushConstants(state->cmd_buffer_, pipeline->pipeline_layout, VK_SHADER_STAGE_COMPUTE_BIT,
-                         0, pack_args_storage.size() * sizeof(ArgUnion), pack_args_storage.data());
+                         0, pack_args_storage.size() * sizeof(ArgUnion64),
+                         pack_args_storage.data());
     }
     vkCmdDispatch(state->cmd_buffer_, wl.grid_dim(0), wl.grid_dim(1), wl.grid_dim(2));
     VkMemoryBarrier barrier_info;
diff --git a/src/target/source/codegen_metal.cc b/src/target/source/codegen_metal.cc
index baa30065a7f9..c95d578df686 100644
--- a/src/target/source/codegen_metal.cc
+++ b/src/target/source/codegen_metal.cc
@@ -47,7 +47,7 @@ CodeGenMetal::CodeGenMetal() {
   decl_stream << "#include <metal_stdlib>\n";
   decl_stream << "using namespace metal;\n\n";
   decl_stream << "union __TVMArgUnion {\n"
-              << " int v_int;\n"
+              << " int v_int[2];\n"
               << "};\n\n";
 }
 
@@ -102,6 +102,11 @@ void CodeGenMetal::AddFunction(const PrimFunc& f) {
       std::string vid = AllocVarID(v.get());
       std::ostringstream vref;
       if (v.dtype().bits() == 32) {
+        decl_stream << "  ";
+        PrintType(v.dtype(), decl_stream);
+        decl_stream << " " << vid << "[2];\n";
+        vref << varg << "." << vid << "[0]";
+      } else if (v.dtype().bits() == 64) {
         decl_stream << "  ";
         PrintType(v.dtype(), decl_stream);
         decl_stream << " " << vid << ";\n";
diff --git a/src/target/spirv/intrin_rule_spirv.cc b/src/target/spirv/intrin_rule_spirv.cc
index 90b2eb2a671f..b75fb53b150d 100644
--- a/src/target/spirv/intrin_rule_spirv.cc
+++ b/src/target/spirv/intrin_rule_spirv.cc
@@ -62,8 +62,14 @@ TVM_REGISTER_GLOBAL("tvm.intrin.rule.vulkan.fabs").set_body(DispatchGLSLPureIntr
 
 TVM_REGISTER_GLOBAL("tvm.intrin.rule.vulkan.exp").set_body(DispatchGLSLPureIntrin<GLSLstd450Exp>);
 
+TVM_REGISTER_GLOBAL("tvm.intrin.rule.vulkan.sin").set_body(DispatchGLSLPureIntrin<GLSLstd450Sin>);
+
+TVM_REGISTER_GLOBAL("tvm.intrin.rule.vulkan.cos").set_body(DispatchGLSLPureIntrin<GLSLstd450Cos>);
+
 TVM_REGISTER_GLOBAL("tvm.intrin.rule.vulkan.log").set_body(DispatchGLSLPureIntrin<GLSLstd450Log>);
 
+TVM_REGISTER_GLOBAL("tvm.intrin.rule.vulkan.log2").set_body(DispatchGLSLPureIntrin<GLSLstd450Log2>);
+
 TVM_REGISTER_GLOBAL("tvm.intrin.rule.vulkan.sqrt").set_body(DispatchGLSLPureIntrin<GLSLstd450Sqrt>);
 
 TVM_REGISTER_GLOBAL("tvm.intrin.rule.vulkan.pow").set_body(DispatchGLSLPureIntrin<GLSLstd450Pow>);
diff --git a/tests/python/topi/python/test_topi_cumsum.py b/tests/python/topi/python/test_topi_cumsum.py
index a01a496f92e9..cfe5130643c5 100644
--- a/tests/python/topi/python/test_topi_cumsum.py
+++ b/tests/python/topi/python/test_topi_cumsum.py
@@ -28,6 +28,8 @@ def check_cumsum(np_ref, data, axis=None, dtype=None):
             "generic": (lambda x: topi.cumsum(x, axis, dtype), topi.generic.schedule_extern),
             "cuda": (lambda x: topi.cuda.cumsum(x, axis, dtype), topi.cuda.schedule_scan),
             "nvptx": (lambda x: topi.cuda.cumsum(x, axis, dtype), topi.cuda.schedule_scan),
+            "vulkan": (lambda x: topi.cuda.cumsum(x, axis, dtype), topi.cuda.schedule_scan),
+            "metal": (lambda x: topi.cuda.cumsum(x, axis, dtype), topi.cuda.schedule_scan),
         }
         fcompute, fschedule = tvm.topi.testing.dispatch(target, implementations)
         tvm.topi.testing.compare_numpy_tvm([data], np_ref, target, ctx, fcompute, fschedule)
@@ -44,6 +46,9 @@ def check_cumsum(np_ref, data, axis=None, dtype=None):
     check_cumsum(np.cumsum(data, dtype=np.int32), data, dtype="int32")
 
     for in_dtype in ["float32", "float64"]:
+        if target == "metal" and in_dtype == "float64":
+            # float64 is not supported in metal
+            continue
         data = np.random.randn(10, 10).astype(in_dtype)
         check_cumsum(np.cumsum(data), data)
         check_cumsum(np.cumsum(data, axis=0), data, axis=0)
@@ -70,3 +75,5 @@ def check_cumsum(np_ref, data, axis=None, dtype=None):
     test_cumsum(tvm.context("cpu"), tvm.target.Target("llvm"))
     test_cumsum(tvm.context("cuda"), tvm.target.Target("cuda"))
     test_cumsum(tvm.context("nvptx"), tvm.target.Target("nvptx"))
+    test_cumsum(tvm.context("vulkan"), tvm.target.Target("vulkan"))
+    test_cumsum(tvm.context("metal"), tvm.target.Target("metal"))
diff --git a/tests/python/topi/python/test_topi_vision.py b/tests/python/topi/python/test_topi_vision.py
index 839356892ab1..2fdf3cf4b170 100644
--- a/tests/python/topi/python/test_topi_vision.py
+++ b/tests/python/topi/python/test_topi_vision.py
@@ -112,7 +112,7 @@ def check_device(device):
         tvm.testing.assert_allclose(tvm_out2.asnumpy(), np_out2, rtol=1e-3)
         tvm.testing.assert_allclose(tvm_out3.asnumpy(), np_out3, rtol=1e-3)
 
-    for device in ["llvm", "cuda", "opencl"]:
+    for device in ["llvm", "cuda", "opencl", "vulkan"]:
         check_device(device)
 
 

From 7344b6666e76bb69fa1bf727c25074071fa522fb Mon Sep 17 00:00:00 2001
From: Andrew Reusch <areusch@octoml.ai>
Date: Fri, 5 Mar 2021 10:55:05 -0800
Subject: [PATCH 288/357] Fix autotuning, broken in #7337 (#7566)

* Fix autotuning, broken in #7337

* retrigger CI, because I don't understand how it passed
---
 python/tvm/autotvm/measure/measure_methods.py |  10 +-
 tests/python/integration/test_tuning.py       | 244 +++++++++++-------
 2 files changed, 157 insertions(+), 97 deletions(-)

diff --git a/python/tvm/autotvm/measure/measure_methods.py b/python/tvm/autotvm/measure/measure_methods.py
index 62fd811dc1ec..b68767bd0528 100644
--- a/python/tvm/autotvm/measure/measure_methods.py
+++ b/python/tvm/autotvm/measure/measure_methods.py
@@ -280,7 +280,13 @@ def get_build_kwargs(self):
 
     def run(self, measure_inputs, build_results):
         results = []
-        remote_args = (self.key, self.host, self.port, self.priority, self.timeout)
+        remote_kwargs = dict(
+            device_key=self.key,
+            host=self.host,
+            port=self.port,
+            priority=self.priority,
+            timeout=self.timeout,
+        )
 
         for i in range(0, len(measure_inputs), self.n_parallel):
             futures = []
@@ -300,7 +306,7 @@ def run(self, measure_inputs, build_results):
                     self.repeat,
                     self.min_repeat_ms,
                     self.cooldown_interval,
-                    remote_args,
+                    remote_kwargs,
                     self.enable_cpu_cache_flush,
                     module_loader,
                 )
diff --git a/tests/python/integration/test_tuning.py b/tests/python/integration/test_tuning.py
index 64b2c16e155e..813352c52096 100644
--- a/tests/python/integration/test_tuning.py
+++ b/tests/python/integration/test_tuning.py
@@ -18,9 +18,14 @@
 Test the tuner
 """
 import logging
+import sys
+import textwrap
 import time
 
+import pytest
+
 import tvm
+import tvm.relay
 from tvm import te
 
 from tvm import autotvm
@@ -29,94 +34,100 @@
 import tvm.testing
 
 
-@autotvm.template("testing/conv2d_no_batching")
-def conv2d_no_batching(N, H, W, CI, CO, KH, KW):
-    """An example template for testing"""
-    assert N == 1, "Only consider batch_size = 1 in this template"
-
-    data = te.placeholder((N, CI, H, W), name="data")
-    kernel = te.placeholder((CO, CI, KH, KW), name="kernel")
-
-    rc = te.reduce_axis((0, CI), name="rc")
-    ry = te.reduce_axis((0, KH), name="ry")
-    rx = te.reduce_axis((0, KW), name="rx")
-
-    conv = te.compute(
-        (N, CO, H - KH + 1, W - KW + 1),
-        lambda nn, ff, yy, xx: te.sum(
-            data[nn, rc, yy + ry, xx + rx] * kernel[ff, rc, ry, rx], axis=[rc, ry, rx]
-        ),
-        tag="conv2d_nchw",
-    )
-
-    s = te.create_schedule([conv.op])
-
-    output = conv
-    OL = s.cache_write(conv, "local")
-
-    # create cache stage
-    AA = s.cache_read(data, "shared", [OL])
-    WW = s.cache_read(kernel, "shared", [OL])
-    AL = s.cache_read(AA, "local", [OL])
-    WL = s.cache_read(WW, "local", [OL])
-
-    # tile and bind spatial axes
-    n, f, y, x = s[output].op.axis
-    cfg = autotvm.get_config()
-    cfg.define_split("tile_f", cfg.axis(f), num_outputs=4)
-    cfg.define_split("tile_y", cfg.axis(y), num_outputs=4)
-    cfg.define_split("tile_x", cfg.axis(x), num_outputs=4)
-    bf, vf, tf, fi = cfg["tile_f"].apply(s, output, f)
-    by, vy, ty, yi = cfg["tile_y"].apply(s, output, y)
-    bx, vx, tx, xi = cfg["tile_x"].apply(s, output, x)
-    kernel_scope = n  # this is the scope to attach global config inside this kernel
-
-    s[output].bind(bf, te.thread_axis("blockIdx.z"))
-    s[output].bind(by, te.thread_axis("blockIdx.y"))
-    s[output].bind(bx, te.thread_axis("blockIdx.x"))
-    s[output].bind(vf, te.thread_axis("vthread"))
-    s[output].bind(vy, te.thread_axis("vthread"))
-    s[output].bind(vx, te.thread_axis("vthread"))
-    s[output].bind(tf, te.thread_axis("threadIdx.z"))
-    s[output].bind(ty, te.thread_axis("threadIdx.y"))
-    s[output].bind(tx, te.thread_axis("threadIdx.x"))
-    s[output].reorder(n, bf, by, bx, vf, vy, vx, tf, ty, tx, fi, yi, xi)
-    s[OL].compute_at(s[output], tx)
-
-    # tile and bind reduction axes
-    n, f, y, x = s[OL].op.axis
-    rc, ry, rx = s[OL].op.reduce_axis
-    cfg.define_split("tile_rc", cfg.axis(rc), num_outputs=3)
-    cfg.define_split("tile_ry", cfg.axis(ry), num_outputs=3)
-    cfg.define_split("tile_rx", cfg.axis(rx), num_outputs=3)
-    rco, rcm, rci = cfg["tile_rc"].apply(s, OL, rc)
-    ryo, rym, ryi = cfg["tile_rx"].apply(s, OL, ry)
-    rxo, rxm, rxi = cfg["tile_ry"].apply(s, OL, rx)
-    s[OL].reorder(rco, ryo, rxo, rcm, rym, rxm, rci, ryi, rxi, n, f, y, x)
-
-    s[AA].compute_at(s[OL], rxo)
-    s[WW].compute_at(s[OL], rxo)
-    s[AL].compute_at(s[OL], rxm)
-    s[WL].compute_at(s[OL], rxm)
-
-    # cooperative fetching
-    for load in [AA, WW]:
-        n, f, y, x = s[load].op.axis
-        fused = s[load].fuse(n, f, y, x)
-        tz, fused = s[load].split(fused, nparts=cfg["tile_f"].size[2])
-        ty, fused = s[load].split(fused, nparts=cfg["tile_y"].size[2])
-        tx, fused = s[load].split(fused, nparts=cfg["tile_x"].size[2])
-        s[load].bind(tz, te.thread_axis("threadIdx.z"))
-        s[load].bind(ty, te.thread_axis("threadIdx.y"))
-        s[load].bind(tx, te.thread_axis("threadIdx.x"))
-
-    # tune unroll
-    cfg.define_knob("auto_unroll_max_step", [0, 512, 1500])
-    cfg.define_knob("unroll_explicit", [0, 1])
-    s[output].pragma(kernel_scope, "auto_unroll_max_step", cfg["auto_unroll_max_step"].val)
-    s[output].pragma(kernel_scope, "unroll_explicit", cfg["unroll_explicit"].val)
-
-    return s, [data, kernel, conv]
+def setup_module():
+    @autotvm.template("testing/conv2d_no_batching")
+    def conv2d_no_batching(N, H, W, CI, CO, KH, KW):
+        """An example template for testing"""
+        assert N == 1, "Only consider batch_size = 1 in this template"
+
+        data = te.placeholder((N, CI, H, W), name="data")
+        kernel = te.placeholder((CO, CI, KH, KW), name="kernel")
+
+        rc = te.reduce_axis((0, CI), name="rc")
+        ry = te.reduce_axis((0, KH), name="ry")
+        rx = te.reduce_axis((0, KW), name="rx")
+
+        conv = te.compute(
+            (N, CO, H - KH + 1, W - KW + 1),
+            lambda nn, ff, yy, xx: te.sum(
+                data[nn, rc, yy + ry, xx + rx] * kernel[ff, rc, ry, rx], axis=[rc, ry, rx]
+            ),
+            tag="conv2d_nchw",
+        )
+
+        s = te.create_schedule([conv.op])
+
+        output = conv
+        OL = s.cache_write(conv, "local")
+
+        # create cache stage
+        AA = s.cache_read(data, "shared", [OL])
+        WW = s.cache_read(kernel, "shared", [OL])
+        AL = s.cache_read(AA, "local", [OL])
+        WL = s.cache_read(WW, "local", [OL])
+
+        # tile and bind spatial axes
+        n, f, y, x = s[output].op.axis
+        cfg = autotvm.get_config()
+        cfg.define_split("tile_f", cfg.axis(f), num_outputs=4)
+        cfg.define_split("tile_y", cfg.axis(y), num_outputs=4)
+        cfg.define_split("tile_x", cfg.axis(x), num_outputs=4)
+        bf, vf, tf, fi = cfg["tile_f"].apply(s, output, f)
+        by, vy, ty, yi = cfg["tile_y"].apply(s, output, y)
+        bx, vx, tx, xi = cfg["tile_x"].apply(s, output, x)
+        kernel_scope = n  # this is the scope to attach global config inside this kernel
+
+        s[output].bind(bf, te.thread_axis("blockIdx.z"))
+        s[output].bind(by, te.thread_axis("blockIdx.y"))
+        s[output].bind(bx, te.thread_axis("blockIdx.x"))
+        s[output].bind(vf, te.thread_axis("vthread"))
+        s[output].bind(vy, te.thread_axis("vthread"))
+        s[output].bind(vx, te.thread_axis("vthread"))
+        s[output].bind(tf, te.thread_axis("threadIdx.z"))
+        s[output].bind(ty, te.thread_axis("threadIdx.y"))
+        s[output].bind(tx, te.thread_axis("threadIdx.x"))
+        s[output].reorder(n, bf, by, bx, vf, vy, vx, tf, ty, tx, fi, yi, xi)
+        s[OL].compute_at(s[output], tx)
+
+        # tile and bind reduction axes
+        n, f, y, x = s[OL].op.axis
+        rc, ry, rx = s[OL].op.reduce_axis
+        cfg.define_split("tile_rc", cfg.axis(rc), num_outputs=3)
+        cfg.define_split("tile_ry", cfg.axis(ry), num_outputs=3)
+        cfg.define_split("tile_rx", cfg.axis(rx), num_outputs=3)
+        rco, rcm, rci = cfg["tile_rc"].apply(s, OL, rc)
+        ryo, rym, ryi = cfg["tile_rx"].apply(s, OL, ry)
+        rxo, rxm, rxi = cfg["tile_ry"].apply(s, OL, rx)
+        s[OL].reorder(rco, ryo, rxo, rcm, rym, rxm, rci, ryi, rxi, n, f, y, x)
+
+        s[AA].compute_at(s[OL], rxo)
+        s[WW].compute_at(s[OL], rxo)
+        s[AL].compute_at(s[OL], rxm)
+        s[WL].compute_at(s[OL], rxm)
+
+        # cooperative fetching
+        for load in [AA, WW]:
+            n, f, y, x = s[load].op.axis
+            fused = s[load].fuse(n, f, y, x)
+            tz, fused = s[load].split(fused, nparts=cfg["tile_f"].size[2])
+            ty, fused = s[load].split(fused, nparts=cfg["tile_y"].size[2])
+            tx, fused = s[load].split(fused, nparts=cfg["tile_x"].size[2])
+            s[load].bind(tz, te.thread_axis("threadIdx.z"))
+            s[load].bind(ty, te.thread_axis("threadIdx.y"))
+            s[load].bind(tx, te.thread_axis("threadIdx.x"))
+
+        # tune unroll
+        cfg.define_knob("auto_unroll_max_step", [0, 512, 1500])
+        cfg.define_knob("unroll_explicit", [0, 1])
+        s[output].pragma(kernel_scope, "auto_unroll_max_step", cfg["auto_unroll_max_step"].val)
+        s[output].pragma(kernel_scope, "unroll_explicit", cfg["unroll_explicit"].val)
+
+        return s, [data, kernel, conv]
+
+
+def teardown_module():
+    # TODO(areusch): Tasks should not be registered into a global.
+    del autotvm.task.task.TASK_TABLE["testing/conv2d_no_batching"]
 
 
 def get_sample_task(target=tvm.target.cuda(), target_host=None):
@@ -131,19 +142,62 @@ def get_sample_task(target=tvm.target.cuda(), target_host=None):
 
 
 @tvm.testing.parametrize_targets("cuda", "opencl")
-def test_tuning(target, ctx):
+def test_tuning_gpu(target, ctx):
     # init task
     task, target = get_sample_task(target, None)
-    logging.info("%s", task.config_space)
+    logging.info("task config space: %s", task.config_space)
 
     measure_option = autotvm.measure_option(autotvm.LocalBuilder(), autotvm.LocalRunner())
 
+    results = []
+
     tuner = RandomTuner(task)
-    tuner.tune(n_trial=20, measure_option=measure_option)
+    tuner.tune(
+        n_trial=20,
+        measure_option=measure_option,
+        callbacks=(lambda _tuner, _inputs, rs: results.extend(rs),),
+    )
 
+    assert len(results) == 20
 
-if __name__ == "__main__":
-    # only print log when invoked from main
-    logging.basicConfig(level=logging.DEBUG)
+    successful_results = [r for r in results if r.error_no == autotvm.MeasureErrorNo.NO_ERROR]
+    assert len(successful_results) > 0, f"No successful tuning runs: {results!r}"
+
+
+def test_tuning_cpu():
+    ir_mod = tvm.parser.fromtext(
+        textwrap.dedent(
+            """
+        #[version = "0.0.5"]
+        def @main(%a : Tensor[(1, 3, 32, 32), float32], %b : Tensor[(3, 3, 5, 5), float32]) {
+               nn.conv2d(%a, %b, data_layout="NCHW", kernel_layout="OIHW")
+        }
+        """
+        )
+    )
+    tasks = autotvm.task.relay_integration.extract_from_program(
+        ir_mod, {}, tvm.target.create("llvm")
+    )
+    assert len(tasks) == 1, f"Extracted != 1 task from program: {tasks!r}"
+
+    task = tasks[0]
+
+    measure_option = autotvm.measure_option(autotvm.LocalBuilder(), autotvm.LocalRunner())
+
+    results = []
+
+    tuner = RandomTuner(task)
+    tuner.tune(
+        n_trial=20,
+        measure_option=measure_option,
+        callbacks=(lambda _tuner, _inputs, rs: results.extend(rs),),
+    )
+
+    assert len(results) == 20
 
-    test_tuning()
+    successful_results = [r for r in results if r.error_no == autotvm.MeasureErrorNo.NO_ERROR]
+    assert len(successful_results) > 0, f"No successful tuning runs: {results!r}"
+
+
+if __name__ == "__main__":
+    sys.exit(pytest.main([__file__] + sys.argv[1:]))

From d6c0cea8f46b1ba241e43d2453a096ec30b3e210 Mon Sep 17 00:00:00 2001
From: Tristan Konolige <tristan.konolige@gmail.com>
Date: Fri, 5 Mar 2021 10:55:59 -0800
Subject: [PATCH 289/357] [RUNTIME] Add device specific timers (#7472)

---
 include/tvm/runtime/profiling.h               | 150 ++++++++++++++++++
 src/runtime/cuda/cuda_device_api.cc           |  36 +++++
 .../graph/debug/graph_runtime_debug.cc        |  27 ++--
 src/runtime/profiling.cc                      |  97 +++++++++++
 src/runtime/rocm/rocm_device_api.cc           |  37 +++++
 src/runtime/vm/profiler/vm.cc                 |  25 +--
 src/runtime/vm/profiler/vm.h                  |   3 +-
 tests/cpp/profiling.cc                        |  47 ++++++
 8 files changed, 400 insertions(+), 22 deletions(-)
 create mode 100644 include/tvm/runtime/profiling.h
 create mode 100644 src/runtime/profiling.cc
 create mode 100644 tests/cpp/profiling.cc

diff --git a/include/tvm/runtime/profiling.h b/include/tvm/runtime/profiling.h
new file mode 100644
index 000000000000..45b60ea18acc
--- /dev/null
+++ b/include/tvm/runtime/profiling.h
@@ -0,0 +1,150 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file include/tvm/runtime/profiling.h
+ * \brief Runtime profiling including timers.
+ */
+#ifndef TVM_RUNTIME_PROFILING_H_
+#define TVM_RUNTIME_PROFILING_H_
+
+#include <tvm/runtime/c_runtime_api.h>
+#include <tvm/runtime/device_api.h>
+#include <tvm/runtime/object.h>
+#include <tvm/runtime/registry.h>
+
+#include <string>
+
+namespace tvm {
+namespace runtime {
+
+/*! \brief Base class for all implementations.
+ *
+ * New implementations of this interface should make sure that `Start` and `Stop`
+ * are as lightweight as possible. Expensive state synchronization should be
+ * done in `SyncAndGetElapsedNanos`.
+ */
+class TimerNode : public Object {
+ public:
+  /*! \brief Start the timer.
+   *
+   * Note: this function should only be called once per object.
+   */
+  virtual void Start() = 0;
+  /*! \brief Stop the timer.
+   *
+   * Note: this function should only be called once per object.
+   */
+  virtual void Stop() = 0;
+  /*! \brief Synchronize timer state and return elapsed time between `Start` and `Stop`.
+   * \return The time in nanoseconds between `Start` and `Stop`.
+   *
+   * This function is necessary because we want to avoid timing the overhead of
+   * doing timing. When using multiple timers, it is recommended to stop all of
+   * them before calling `SyncAndGetElapsedNanos` on any of them.
+   *
+   * Note: this function should be only called once per object. It may incur
+   * a large synchronization overhead (for example, with GPUs).
+   */
+  virtual int64_t SyncAndGetElapsedNanos() = 0;
+
+  virtual ~TimerNode() {}
+
+  static constexpr const char* _type_key = "TimerNode";
+  TVM_DECLARE_BASE_OBJECT_INFO(TimerNode, Object);
+};
+
+/*! \brief Timer for a specific device.
+ *
+ * This is a managed reference to a TimerNode.
+ *
+ * \sa TimerNode
+ */
+class Timer : public ObjectRef {
+ public:
+  /*!
+   * \brief Get a device specific timer.
+   * \param ctx The device context to time.
+   * \return A `Timer` that has already been started.
+   *
+   * Use this function to time runtime of arbitrary regions of code on a specific
+   * device. The code that you want to time should be running on the device
+   * otherwise the timer will not return correct results. This is a lower level
+   * interface than TimeEvaluator and only runs the timed code once
+   * (TimeEvaluator runs the code multiple times).
+   *
+   * A default timer is used if a device specific one does not exist. This
+   * timer performs synchronization between the device and CPU, which can lead
+   * to overhead in the reported results.
+   *
+   * Example usage:
+   * \code{.cpp}
+   * Timer t = Timer::Start(TVMContext::cpu());
+   * my_long_running_function();
+   * t->Stop();
+   * ... // some more computation
+   * int64_t nanosecs = t->SyncAndGetElapsedNanos() // elapsed time in nanoseconds
+   * \endcode
+   *
+   * To add a new device-specific timer, register a new function
+   * "profiler.timer.my_device" (where `my_device` is the `DeviceName` of your
+   * device). This function should accept a `TVMContext` and return a new `Timer`
+   * that has already been started.
+   *
+   * For example, this is how the CPU timer is implemented:
+   * \code{.cpp}
+   *  class CPUTimerNode : public TimerNode {
+   *   public:
+   *    virtual void Start() { start_ = std::chrono::high_resolution_clock::now(); }
+   *    virtual void Stop() { duration_ = std::chrono::high_resolution_clock::now() - start_; }
+   *    virtual int64_t SyncAndGetElapsedNanos() { return duration_.count(); }
+   *    virtual ~CPUTimerNode() {}
+   *
+   *    static constexpr const char* _type_key = "CPUTimerNode";
+   *    TVM_DECLARE_FINAL_OBJECT_INFO(CPUTimerNode, TimerNode);
+   *
+   *   private:
+   *    std::chrono::high_resolution_clock::time_point start_;
+   *    std::chrono::duration<int64_t, std::nano> duration_;
+   *  };
+   *  TVM_REGISTER_OBJECT_TYPE(CPUTimerNode);
+   *
+   *  TVM_REGISTER_GLOBAL("profiling.timer.cpu").set_body_typed([](TVMContext ctx) {
+   *    return Timer(make_object<CPUTimerNode>());
+   *  });
+   * \endcode
+   */
+  static TVM_DLL Timer Start(TVMContext ctx);
+
+  TVM_DEFINE_MUTABLE_OBJECT_REF_METHODS(Timer, ObjectRef, TimerNode);
+};
+
+/*!
+ * \brief Default timer if one does not exist for the context.
+ * \param ctx The context to time on.
+ *
+ * Note that this timer performs synchronization between the device and CPU,
+ * which can lead to overhead in the reported results.
+ */
+Timer DefaultTimer(TVMContext ctx);
+
+}  // namespace runtime
+}  // namespace tvm
+
+#endif  // TVM_RUNTIME_PROFILING_H_
diff --git a/src/runtime/cuda/cuda_device_api.cc b/src/runtime/cuda/cuda_device_api.cc
index c77395422e87..f156d68d283e 100644
--- a/src/runtime/cuda/cuda_device_api.cc
+++ b/src/runtime/cuda/cuda_device_api.cc
@@ -25,6 +25,7 @@
 #include <cuda_runtime.h>
 #include <dmlc/thread_local.h>
 #include <tvm/runtime/device_api.h>
+#include <tvm/runtime/profiling.h>
 #include <tvm/runtime/registry.h>
 
 #include <cstring>
@@ -243,5 +244,40 @@ TVM_REGISTER_GLOBAL("device_api.cpu_pinned").set_body([](TVMArgs args, TVMRetVal
   *rv = static_cast<void*>(ptr);
 });
 
+class GPUTimerNode : public TimerNode {
+ public:
+  virtual void Start() {
+    CUDA_CALL(cudaEventRecord(start_, CUDAThreadEntry::ThreadLocal()->stream));
+  }
+  virtual void Stop() { CUDA_CALL(cudaEventRecord(stop_, CUDAThreadEntry::ThreadLocal()->stream)); }
+  virtual int64_t SyncAndGetElapsedNanos() {
+    CUDA_CALL(cudaEventSynchronize(stop_));
+    float milliseconds = 0;
+    CUDA_CALL(cudaEventElapsedTime(&milliseconds, start_, stop_));
+    return milliseconds * 1e6;
+  }
+  virtual ~GPUTimerNode() {
+    CUDA_CALL(cudaEventDestroy(start_));
+    CUDA_CALL(cudaEventDestroy(stop_));
+  }
+  GPUTimerNode() {
+    CUDA_CALL(cudaEventCreate(&start_));
+    CUDA_CALL(cudaEventCreate(&stop_));
+  }
+
+  static constexpr const char* _type_key = "GPUTimerNode";
+  TVM_DECLARE_FINAL_OBJECT_INFO(GPUTimerNode, TimerNode);
+
+ private:
+  cudaEvent_t start_;
+  cudaEvent_t stop_;
+};
+
+TVM_REGISTER_OBJECT_TYPE(GPUTimerNode);
+
+TVM_REGISTER_GLOBAL("profiling.timer.gpu").set_body_typed([](TVMContext ctx) {
+  return Timer(make_object<GPUTimerNode>());
+});
+
 }  // namespace runtime
 }  // namespace tvm
diff --git a/src/runtime/graph/debug/graph_runtime_debug.cc b/src/runtime/graph/debug/graph_runtime_debug.cc
index 93bdd065c9d9..0e3003aa42c3 100644
--- a/src/runtime/graph/debug/graph_runtime_debug.cc
+++ b/src/runtime/graph/debug/graph_runtime_debug.cc
@@ -23,6 +23,7 @@
 #include <tvm/runtime/container.h>
 #include <tvm/runtime/ndarray.h>
 #include <tvm/runtime/packed_func.h>
+#include <tvm/runtime/profiling.h>
 #include <tvm/runtime/registry.h>
 
 #include <chrono>
@@ -77,16 +78,25 @@ class GraphRuntimeDebug : public GraphRuntime {
                                                number * 1.618));  // 1.618 is chosen by random
           }
           tbegin = std::chrono::high_resolution_clock::now();
+          std::vector<std::vector<Timer>> op_timers;
+          for (size_t index = 0; index < op_execs_.size(); index++) {
+            op_timers.push_back({});
+          }
           for (int k = 0; k < number; k++) {
             for (size_t index = 0; index < op_execs_.size(); ++index) {
               if (op_execs_[index]) {
-                time_sec_per_op[index] += RunOpHost(index);
+                op_timers[index].push_back(RunOpHost(index));
               }
             }
           }
+          for (size_t index = 0; index < op_execs_.size(); ++index) {
+            for (auto t : op_timers[index]) {
+              time_sec_per_op[index] += t->SyncAndGetElapsedNanos() / 1e9;
+            }
+          }
           tend = std::chrono::high_resolution_clock::now();
           duration_ms =
-              std::chrono::duration_cast<std::chrono::duration<double> >(tend - tbegin).count() *
+              std::chrono::duration_cast<std::chrono::duration<double>>(tend - tbegin).count() *
               1000;
         } while (duration_ms < min_repeat_ms);
 
@@ -160,15 +170,12 @@ class GraphRuntimeDebug : public GraphRuntime {
     return results_arr[0];
   }
 
-  double RunOpHost(int index) {
-    auto op_tbegin = std::chrono::high_resolution_clock::now();
-    op_execs_[index]();
+  Timer RunOpHost(int index) {
     const TVMContext& ctx = data_entry_[entry_id(index, 0)]->ctx;
-    TVMSynchronize(ctx.device_type, ctx.device_id, nullptr);
-    auto op_tend = std::chrono::high_resolution_clock::now();
-    double op_duration =
-        std::chrono::duration_cast<std::chrono::duration<double> >(op_tend - op_tbegin).count();
-    return op_duration;
+    Timer t = Timer::Start(ctx);
+    op_execs_[index]();
+    t->Stop();
+    return t;
   }
 
   /*!
diff --git a/src/runtime/profiling.cc b/src/runtime/profiling.cc
new file mode 100644
index 000000000000..3d204166986d
--- /dev/null
+++ b/src/runtime/profiling.cc
@@ -0,0 +1,97 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file src/runtime/profiling.cc
+ * \brief Runtime profiling including timers.
+ */
+
+#include <tvm/runtime/packed_func.h>
+#include <tvm/runtime/profiling.h>
+
+#include <chrono>
+#include <map>
+
+namespace tvm {
+namespace runtime {
+
+class DefaultTimerNode : public TimerNode {
+ public:
+  virtual void Start() {
+    TVMSynchronize(ctx_.device_type, ctx_.device_id, nullptr);
+    start_ = std::chrono::high_resolution_clock::now();
+  }
+  virtual void Stop() {
+    TVMSynchronize(ctx_.device_type, ctx_.device_id, nullptr);
+    duration_ = std::chrono::high_resolution_clock::now() - start_;
+  }
+  virtual int64_t SyncAndGetElapsedNanos() { return duration_.count(); }
+  virtual ~DefaultTimerNode() {}
+
+  explicit DefaultTimerNode(TVMContext ctx) : ctx_(ctx) {}
+  static constexpr const char* _type_key = "DefaultTimerNode";
+  TVM_DECLARE_FINAL_OBJECT_INFO(DefaultTimerNode, TimerNode);
+
+ private:
+  std::chrono::high_resolution_clock::time_point start_;
+  std::chrono::duration<int64_t, std::nano> duration_;
+  TVMContext ctx_;
+};
+
+TVM_REGISTER_OBJECT_TYPE(DefaultTimerNode);
+TVM_REGISTER_OBJECT_TYPE(TimerNode);
+
+Timer DefaultTimer(TVMContext ctx) { return Timer(make_object<DefaultTimerNode>(ctx)); }
+
+class CPUTimerNode : public TimerNode {
+ public:
+  virtual void Start() { start_ = std::chrono::high_resolution_clock::now(); }
+  virtual void Stop() { duration_ = std::chrono::high_resolution_clock::now() - start_; }
+  virtual int64_t SyncAndGetElapsedNanos() { return duration_.count(); }
+  virtual ~CPUTimerNode() {}
+
+  static constexpr const char* _type_key = "CPUTimerNode";
+  TVM_DECLARE_FINAL_OBJECT_INFO(CPUTimerNode, TimerNode);
+
+ private:
+  std::chrono::high_resolution_clock::time_point start_;
+  std::chrono::duration<int64_t, std::nano> duration_;
+};
+TVM_REGISTER_OBJECT_TYPE(CPUTimerNode);
+
+TVM_REGISTER_GLOBAL("profiling.timer.cpu").set_body_typed([](TVMContext ctx) {
+  return Timer(make_object<CPUTimerNode>());
+});
+
+Timer Timer::Start(TVMContext ctx) {
+  auto f = Registry::Get(std::string("profiling.timer.") + DeviceName(ctx.device_type));
+  if (f == nullptr) {
+    Timer t = DefaultTimer(ctx);
+    t->Start();
+    return t;
+  } else {
+    Timer t = f->operator()(ctx);
+    t->Start();
+    return t;
+  }
+}
+
+TVM_REGISTER_GLOBAL("profiling.start_timer").set_body_typed(Timer::Start);
+}  // namespace runtime
+}  // namespace tvm
diff --git a/src/runtime/rocm/rocm_device_api.cc b/src/runtime/rocm/rocm_device_api.cc
index 26e44eca0d12..5f24ce0eec48 100644
--- a/src/runtime/rocm/rocm_device_api.cc
+++ b/src/runtime/rocm/rocm_device_api.cc
@@ -25,6 +25,7 @@
 #include <hip/hip_runtime_api.h>
 #include <hsa/hsa.h>
 #include <tvm/runtime/device_api.h>
+#include <tvm/runtime/profiling.h>
 #include <tvm/runtime/registry.h>
 #include <tvm/support/logging.h>
 
@@ -200,5 +201,41 @@ TVM_REGISTER_GLOBAL("device_api.rocm").set_body([](TVMArgs args, TVMRetValue* rv
   DeviceAPI* ptr = ROCMDeviceAPI::Global();
   *rv = static_cast<void*>(ptr);
 });
+
+class ROCMTimerNode : public TimerNode {
+ public:
+  virtual void Start() {
+    ROCM_CALL(hipEventRecord(start_, ROCMThreadEntry::ThreadLocal()->stream));
+  }
+  virtual void Stop() { ROCM_CALL(hipEventRecord(stop_, ROCMThreadEntry::ThreadLocal()->stream)); }
+  virtual int64_t SyncAndGetElapsedNanos() {
+    ROCM_CALL(hipEventSynchronize(stop_));
+    float milliseconds = 0;
+    ROCM_CALL(hipEventElapsedTime(&milliseconds, start_, stop_));
+    return milliseconds * 1e6;
+  }
+  virtual ~ROCMTimerNode() {
+    ROCM_CALL(hipEventDestroy(start_));
+    ROCM_CALL(hipEventDestroy(stop_));
+  }
+  ROCMTimerNode() {
+    ROCM_CALL(hipEventCreate(&start_));
+    ROCM_CALL(hipEventCreate(&stop_));
+  }
+
+  static constexpr const char* _type_key = "ROCMTimerNode";
+  TVM_DECLARE_FINAL_OBJECT_INFO(ROCMTimerNode, TimerNode);
+
+ private:
+  hipEvent_t start_;
+  hipEvent_t stop_;
+};
+
+TVM_REGISTER_OBJECT_TYPE(ROCMTimerNode);
+
+TVM_REGISTER_GLOBAL("profiling.timer.rocm").set_body_typed([](TVMContext ctx) {
+  return Timer(make_object<ROCMTimerNode>());
+});
+
 }  // namespace runtime
 }  // namespace tvm
diff --git a/src/runtime/vm/profiler/vm.cc b/src/runtime/vm/profiler/vm.cc
index 94d827893b92..fc01a754ca50 100644
--- a/src/runtime/vm/profiler/vm.cc
+++ b/src/runtime/vm/profiler/vm.cc
@@ -45,7 +45,15 @@ PackedFunc VirtualMachineDebug::GetFunction(const std::string& name,
     return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
       ICHECK_EQ(args.size(), 1U);
       std::vector<std::pair<Index, double>> op_acc_time;
-      for (auto kv : op_durations_) {
+      std::unordered_map<Index, std::vector<double>> op_durations;
+      for (auto kv : op_timers_) {
+        std::vector<double> durations_us;
+        for (auto t : kv.second) {
+          durations_us.push_back(t->SyncAndGetElapsedNanos() / 1e3);
+        }
+        op_durations[kv.first] = durations_us;
+      }
+      for (auto kv : op_durations) {
         auto val =
             std::make_pair(kv.first, std::accumulate(kv.second.begin(), kv.second.end(), 0.0));
         op_acc_time.push_back(val);
@@ -66,7 +74,7 @@ PackedFunc VirtualMachineDebug::GetFunction(const std::string& name,
          << "#Duration(us): Sum/Mean/Min/Max" << std::endl;
 
       for (auto kv : op_acc_time) {
-        auto vals = op_durations_[kv.first];
+        auto vals = op_durations[kv.first];
         auto sum = kv.second;
         auto mean = sum / static_cast<double>(vals.size());
         auto min_value = *std::min_element(vals.begin(), vals.end());
@@ -85,7 +93,7 @@ PackedFunc VirtualMachineDebug::GetFunction(const std::string& name,
     });
   } else if (name == "reset") {
     return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
-      op_durations_.clear();
+      op_timers_.clear();
       op_invokes_.clear();
     });
   } else {
@@ -118,16 +126,11 @@ void VirtualMachineDebug::InvokePacked(Index packed_index, const PackedFunc& fun
   auto nd_array = Downcast<NDArray>(arg);
   auto ctx = nd_array->ctx;
 
-  TVMSynchronize(ctx.device_type, ctx.device_id, nullptr);
-
-  auto op_begin = std::chrono::high_resolution_clock::now();
+  Timer t = Timer::Start(ctx);
   VirtualMachine::InvokePacked(packed_index, func, arg_count, output_size, args);
-  TVMSynchronize(ctx.device_type, ctx.device_id, nullptr);
-  auto op_end = std::chrono::high_resolution_clock::now();
-  double op_duration =
-      std::chrono::duration_cast<std::chrono::duration<double>>(op_end - op_begin).count();
+  t->Stop();
 
-  op_durations_[packed_index].push_back(op_duration * 1e6);
+  op_timers_[packed_index].push_back(t);
   op_invokes_[packed_index] += 1;
 }
 
diff --git a/src/runtime/vm/profiler/vm.h b/src/runtime/vm/profiler/vm.h
index 797d414fe8f3..9f5ce87bcf47 100644
--- a/src/runtime/vm/profiler/vm.h
+++ b/src/runtime/vm/profiler/vm.h
@@ -25,6 +25,7 @@
 #ifndef TVM_RUNTIME_VM_PROFILER_VM_H_
 #define TVM_RUNTIME_VM_PROFILER_VM_H_
 
+#include <tvm/runtime/profiling.h>
 #include <tvm/runtime/vm/vm.h>
 
 #include <memory>
@@ -51,7 +52,7 @@ class VirtualMachineDebug : public VirtualMachine {
                     const std::vector<ObjectRef>& args) final;
 
   std::unordered_map<Index, std::string> packed_index_map_;
-  std::unordered_map<Index, std::vector<double>> op_durations_;
+  std::unordered_map<Index, std::vector<Timer>> op_timers_;
   std::unordered_map<Index, int> op_invokes_;
 };
 
diff --git a/tests/cpp/profiling.cc b/tests/cpp/profiling.cc
new file mode 100644
index 000000000000..6ec2fc060f9f
--- /dev/null
+++ b/tests/cpp/profiling.cc
@@ -0,0 +1,47 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include <gtest/gtest.h>
+#include <tvm/runtime/profiling.h>
+
+#include <chrono>
+#include <thread>
+
+namespace tvm {
+namespace runtime {
+TEST(DefaultTimer, Basic) {
+  using namespace tvm::runtime;
+  DLContext ctx;
+  ctx.device_type = kDLCPU;
+  ctx.device_id = 0;
+
+  Timer t = Timer::Start(ctx);
+  std::this_thread::sleep_for(std::chrono::milliseconds(10));
+  t->Stop();
+  int64_t elapsed = t->SyncAndGetElapsedNanos();
+  CHECK_GT(elapsed, 9 * 1e6);
+}
+}  // namespace runtime
+}  // namespace tvm
+
+int main(int argc, char** argv) {
+  testing::InitGoogleTest(&argc, argv);
+  testing::FLAGS_gtest_death_test_style = "threadsafe";
+  return RUN_ALL_TESTS();
+}

From 1ae469789342e12b685f216f4e64e199accb0f47 Mon Sep 17 00:00:00 2001
From: "Huang, Guangtai" <hgt312@foxmail.com>
Date: Sat, 6 Mar 2021 05:47:08 +0800
Subject: [PATCH 290/357] [Relay][Pass] Avoid stack overflow when using
 PostOrderRewrite (#7588)

* init

* fix

* fix
---
 src/relay/ir/expr_functor.cc | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/src/relay/ir/expr_functor.cc b/src/relay/ir/expr_functor.cc
index d70c6fe2dd1f..5984a208efe0 100644
--- a/src/relay/ir/expr_functor.cc
+++ b/src/relay/ir/expr_functor.cc
@@ -103,11 +103,41 @@ Expr MixedModeMutator::VisitExpr(const Expr& expr) {
 class PostOrderRewriter : public MixedModeMutator {
  public:
   explicit PostOrderRewriter(ExprRewriter* rewriter) : rewriter_(rewriter) {}
+
   Expr DispatchVisitExpr(const Expr& expr) final {
     auto post = ExprFunctor::VisitExpr(expr);
     return rewriter_->Rewrite(expr, post);
   }
 
+  using MixedModeMutator::VisitExpr_;
+
+  Expr VisitExpr_(const LetNode* node) final {
+    auto pre_visit = [this](const LetNode* op) {
+      Expr var = this->Mutate(op->var);
+      Expr value = this->Mutate(op->value);
+    };
+    auto post_visit = [this, node](const LetNode* op) {
+      Var var = Downcast<Var>(this->Mutate(op->var));
+      Expr value = this->Mutate(op->value);
+      Expr body = this->Mutate(op->body);
+      Expr expr = GetRef<Expr>(op);
+      Expr post;
+      if (var.same_as(op->var) && value.same_as(op->value) && body.same_as(op->body)) {
+        post = expr;
+      } else {
+        post = Let(var, value, body);
+      }
+      //  avoid rewriting the first LetNode twice
+      if (op == node) {
+        this->memo_[expr] = post;
+      } else {
+        this->memo_[expr] = this->rewriter_->Rewrite(expr, post);
+      }
+    };
+    ExpandANormalForm(node, pre_visit, post_visit);
+    return memo_[GetRef<Expr>(node)];
+  }
+
  protected:
   ExprRewriter* rewriter_;
 };

From 783be9df5ecbc7e47770b439c57afaeeb890e9a3 Mon Sep 17 00:00:00 2001
From: mesauser <infidragon@126.com>
Date: Sat, 6 Mar 2021 06:42:43 +0800
Subject: [PATCH 291/357] [TOPI] disable test_shift with i8 datatype (#7597)

https://github.com/apache/tvm/issues/7539

Co-authored-by: guoweijun <guoweijun@baidu.com>
---
 tests/python/topi/python/test_topi_broadcast.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/python/topi/python/test_topi_broadcast.py b/tests/python/topi/python/test_topi_broadcast.py
index 44be28c318e4..ada03ea5377b 100644
--- a/tests/python/topi/python/test_topi_broadcast.py
+++ b/tests/python/topi/python/test_topi_broadcast.py
@@ -284,7 +284,7 @@ def test_shift():
     )
 
     verify_broadcast_binary_ele(
-        (1, 2, 2), (2,), topi.left_shift, np.left_shift, dtype="int8", rhs_min=0, rhs_max=32
+        (1, 2, 2), (2,), topi.left_shift, np.left_shift, dtype="int32", rhs_min=0, rhs_max=32
     )
 
 

From 0b4f669ed488c182f13cfa6dd788dc5e78eb8f15 Mon Sep 17 00:00:00 2001
From: Chenfan <jcf94@outlook.com>
Date: Sat, 6 Mar 2021 12:53:23 +0800
Subject: [PATCH 292/357] [AutoSchedule] Sparse dense tuning support with
 custom sketch rule (#7313)

* Add sparse dense tuning tutorial

* Add sparse input fusion

* Update the dag to support output fusion

* Update

* Add task input to search_task

* Update

* Add search_inputs to measure

* Lint fix

* Lint fix

* Update

* Update

* Update

* Update

* Add file save load support

* Update

* Update

* Update

* Remove add_task_inputs API

* Update

* Update

* Update

* Lint fix

* Lint fix

* Lint fix

* Lint fix

* Update

* Add example ci_log

* Update

* retrigger ci

* Update

* Update

* Update

* Lint fix

* Lint fix

* Lint fix
---
 include/tvm/auto_scheduler/measure_record.h   |   2 +-
 include/tvm/auto_scheduler/search_task.h      |   8 +-
 python/tvm/auto_scheduler/__init__.py         |   1 +
 python/tvm/auto_scheduler/measure.py          | 166 ++++++++-
 python/tvm/auto_scheduler/search_task.py      | 191 +++++++++-
 python/tvm/auto_scheduler/utils.py            |   3 +
 python/tvm/topi/nn/sparse.py                  | 118 +++++-
 src/auto_scheduler/feature.cc                 |   9 +-
 src/auto_scheduler/measure_record.cc          |  34 ++
 src/auto_scheduler/search_policy/utils.cc     |  16 +
 src/auto_scheduler/search_task.cc             |   7 +-
 .../unittest/test_auto_scheduler_measure.py   |  32 +-
 .../test_auto_scheduler_search_task.py        | 207 +++++++++++
 .../auto_scheduler/ci_logs/sparse_dense.json  |   2 +
 tutorials/auto_scheduler/tune_sparse_x86.py   | 339 ++++++++++++++++++
 15 files changed, 1109 insertions(+), 26 deletions(-)
 create mode 100644 tests/python/unittest/test_auto_scheduler_search_task.py
 create mode 100644 tutorials/auto_scheduler/ci_logs/sparse_dense.json
 create mode 100644 tutorials/auto_scheduler/tune_sparse_x86.py

diff --git a/include/tvm/auto_scheduler/measure_record.h b/include/tvm/auto_scheduler/measure_record.h
index ec40611d49b4..c82ed076eca7 100755
--- a/include/tvm/auto_scheduler/measure_record.h
+++ b/include/tvm/auto_scheduler/measure_record.h
@@ -34,7 +34,7 @@
 namespace tvm {
 namespace auto_scheduler {
 
-const std::string AUTO_SCHEDULER_LOG_VERSION = "v0.5";  // NOLINT(*)
+const std::string AUTO_SCHEDULER_LOG_VERSION = "v0.6";  // NOLINT(*)
 
 /*! \brief Callback for logging the input and results of measurements to file */
 class RecordToFileNode : public MeasureCallbackNode {
diff --git a/include/tvm/auto_scheduler/search_task.h b/include/tvm/auto_scheduler/search_task.h
index 9e7d3aa2cd32..14bf55abb447 100755
--- a/include/tvm/auto_scheduler/search_task.h
+++ b/include/tvm/auto_scheduler/search_task.h
@@ -26,6 +26,7 @@
 #define TVM_AUTO_SCHEDULER_SEARCH_TASK_H_
 
 #include <tvm/auto_scheduler/compute_dag.h>
+#include <tvm/runtime/ndarray.h>
 #include <tvm/target/target.h>
 
 namespace tvm {
@@ -120,6 +121,8 @@ class SearchTaskNode : public Object {
   HardwareParams hardware_params;
   /*! \brief The layout rewrite option used for measuring programs. */
   LayoutRewriteOption layout_rewrite_option;
+  /*! \brief Names of some user defined input data used in program measuring. */
+  Array<String> task_input_names;
 
   void VisitAttrs(tvm::AttrVisitor* v) {
     v->Visit("compute_dag", &compute_dag);
@@ -128,6 +131,7 @@ class SearchTaskNode : public Object {
     v->Visit("target_host", &target_host);
     v->Visit("hardware_params", &hardware_params);
     v->Visit("layout_rewrite_option", &layout_rewrite_option);
+    v->Visit("task_input_names", &task_input_names);
   }
 
   static constexpr const char* _type_key = "auto_scheduler.SearchTask";
@@ -148,9 +152,11 @@ class SearchTask : public ObjectRef {
    * \param target_host The target host device of this search task.
    * \param hardware_params Hardware parameters used in this search task.
    * \param layout_rewrite_option The layout rewrite option used for measuring programs.
+   * \param task_input_names Names of some user defined input data used in program measuring.
    */
   SearchTask(ComputeDAG compute_dag, String workload_key, Target target, Target target_host,
-             Optional<HardwareParams> hardware_params, LayoutRewriteOption layout_rewrite_option);
+             Optional<HardwareParams> hardware_params, LayoutRewriteOption layout_rewrite_option,
+             Array<String> task_input_names);
 
   TVM_DEFINE_OBJECT_REF_METHODS(SearchTask, ObjectRef, SearchTaskNode);
 };
diff --git a/python/tvm/auto_scheduler/__init__.py b/python/tvm/auto_scheduler/__init__.py
index 06ca44d997e5..ff6d82a0242c 100644
--- a/python/tvm/auto_scheduler/__init__.py
+++ b/python/tvm/auto_scheduler/__init__.py
@@ -41,6 +41,7 @@
     LocalRunner,
     RPCRunner,
     LocalRPCMeasureContext,
+    register_task_input_check_func,
 )
 from .measure_record import RecordToFile, RecordReader, load_best_record, load_records, save_records
 from .relay_integration import (
diff --git a/python/tvm/auto_scheduler/measure.py b/python/tvm/auto_scheduler/measure.py
index 47ffde4327c4..959a9c5da82a 100644
--- a/python/tvm/auto_scheduler/measure.py
+++ b/python/tvm/auto_scheduler/measure.py
@@ -36,6 +36,7 @@
 import shutil
 import tempfile
 import multiprocessing
+import logging
 
 import tvm._ffi
 from tvm.runtime import Object, module, ndarray
@@ -50,6 +51,7 @@
     call_func_with_timeout,
     check_remote,
     get_const_tuple,
+    get_func_name,
     make_traceback_info,
     request_remote,
 )
@@ -58,6 +60,8 @@
     deserialize_workload_registry_entry,
 )
 
+# pylint: disable=invalid-name
+logger = logging.getLogger("auto_scheduler")
 
 # The time cost for measurements with errors
 # We use 1e10 instead of sys.float_info.max for better readability in log
@@ -223,6 +227,7 @@ def recover_measure_input(inp, rebuild_state=False):
         target_host=task.target_host,
         hardware_params=task.hardware_params,
         layout_rewrite_option=task.layout_rewrite_option,
+        task_inputs=list(task.task_input_names),
     )
 
     if rebuild_state:
@@ -719,6 +724,97 @@ def local_builder_build(inputs, timeout, n_parallel, build_func="default", verbo
     return results
 
 
+TASK_INPUT_CHECK_FUNC_REGISTRY = {}
+
+
+def register_task_input_check_func(func_name, f=None, override=False):
+    """Register a function that checks the input buffer map.
+
+    The input function should take a list of Tensor wich indicate the Input/output Tensor of a TVM
+    subgraph and return a Map from the input Tensor to its buffer name.
+
+    Parameters
+    ----------
+    func_name : Union[Function, str]
+        The check function that returns the compute declaration Tensors or its function name.
+    f : Optional[Function]
+        The check function to be registered.
+    override : boolean = False
+        Whether to override existing entry.
+
+    Examples
+    --------
+    .. code-block:: python
+
+      @auto_scheduler.register_task_input_check_func
+      def check_task_input_by_placeholder_name(args : List[Tensor]):
+          tensor_input_map = {}
+          for arg in args:
+              if isinstance(arg.op, tvm.te.PlaceholderOp):
+                  if arg.op.name != "placeholder":
+                      tensor_input_map[arg] = arg.op.name
+          return tensor_input_map
+    """
+    global TASK_INPUT_CHECK_FUNC_REGISTRY
+
+    if callable(func_name):
+        f = func_name
+        func_name = get_func_name(f)
+    if not isinstance(func_name, str):
+        raise ValueError("expect string function name")
+
+    def register(myf):
+        """internal register function"""
+        if func_name in TASK_INPUT_CHECK_FUNC_REGISTRY and not override:
+            raise RuntimeError("%s has been registered already" % func_name)
+        TASK_INPUT_CHECK_FUNC_REGISTRY[func_name] = myf
+        return myf
+
+    if f:
+        return register(f)
+    return register
+
+
+def _prepare_input_map(args):
+    """This function deals with special task inputs. Map the input Tensor of a TVM subgraph
+    to a specific buffer name in the global buffer map.
+
+    Parameters
+    ----------
+    args : List[Tensor]
+        Input/output Tensor of a TVM subgraph.
+
+    Returns
+    -------
+    Dict[Tensor, str] :
+        Map from the input Tensor to its buffer name.
+
+    Notes
+    -----
+    The buffer name is specially designed, and these buffer should be provided in
+    `SearchTask(..., task_inputs={...})`.
+    """
+    # pylint: disable=import-outside-toplevel
+
+    global TASK_INPUT_CHECK_FUNC_REGISTRY
+
+    # A dict that maps the input tensor arg to a buffer name
+    tensor_input_map = {}
+
+    # Case 0: Check placeholder name
+    for arg in args:
+        if isinstance(arg.op, tvm.te.PlaceholderOp):
+            if arg.op.name != "placeholder":
+                tensor_input_map[arg] = arg.op.name
+
+    # Case 1: Check specific tensor inputs
+    for func_name in TASK_INPUT_CHECK_FUNC_REGISTRY:
+        func = TASK_INPUT_CHECK_FUNC_REGISTRY[func_name]
+        tensor_input_map.update(func(args))
+
+    return tensor_input_map
+
+
 def _timed_eval_func(
     inp_serialized,
     build_res,
@@ -729,7 +825,11 @@ def _timed_eval_func(
     enable_cpu_cache_flush,
     verbose,
 ):
+    # pylint: disable=import-outside-toplevel
+    from .search_task import get_task_input_buffer  # lazily import to avoid recursive dependency
+
     inp = MeasureInput.deserialize(inp_serialized)
+    task_input_names = inp.task.task_input_names
     tic = time.time()
     error_no = 0
     error_msg = None
@@ -758,11 +858,31 @@ def _timed_eval_func(
 
     if error_no == 0:
         try:
-            args = [ndarray.empty(get_const_tuple(x.shape), x.dtype, ctx) for x in build_res.args]
             random_fill = tvm.get_global_func("tvm.contrib.random.random_fill", True)
             assert random_fill, "Please make sure USE_RANDOM is ON in the config.cmake"
-            for arg in args:
-                random_fill(arg)
+
+            tensor_input_map = _prepare_input_map(build_res.args) if task_input_names else {}
+            args = []
+            task_inputs_count = 0
+            for arg in build_res.args:
+                if arg in tensor_input_map:
+                    tensor_name = tensor_input_map[arg]
+                    if tensor_name in task_input_names:
+                        args.append(get_task_input_buffer(inp.task.workload_key, tensor_name))
+                        task_inputs_count += 1
+                    else:
+                        raise ValueError(
+                            "%s not found in task_inputs, " % (tensor_name)
+                            + "should provide with `SearchTask(..., task_inputs={...})`"
+                        )
+                else:
+                    empty_array = ndarray.empty(get_const_tuple(arg.shape), arg.dtype, ctx)
+                    random_fill(empty_array)
+                    args.append(empty_array)
+            if task_inputs_count != len(task_input_names):
+                logger.warning(
+                    "task_inputs not fully matched, check if there's any unexpected error"
+                )
             ctx.sync()
             costs = time_f(*args).results
         # pylint: disable=broad-except
@@ -911,7 +1031,11 @@ def _timed_rpc_run(
     enable_cpu_cache_flush,
     verbose,
 ):
+    # pylint: disable=import-outside-toplevel
+    from .search_task import get_task_input_buffer  # lazily import to avoid recursive dependency
+
     inp = MeasureInput.deserialize(inp_serialized)
+    task_input_names = inp.task.task_input_names
     tic = time.time()
     error_no = 0
     error_msg = None
@@ -943,18 +1067,36 @@ def _timed_rpc_run(
 
     if error_no == 0:
         try:
-            args = [ndarray.empty(get_const_tuple(x.shape), x.dtype, ctx) for x in build_res.args]
-            try:
-                random_fill = remote.get_function("tvm.contrib.random.random_fill")
-            except AttributeError:
-                raise AttributeError(
-                    "Please make sure USE_RANDOM is ON in the config.cmake " "on the remote devices"
+            random_fill = remote.get_function("tvm.contrib.random.random_fill")
+            assert (
+                random_fill
+            ), "Please make sure USE_RANDOM is ON in the config.cmake on the remote devices"
+
+            tensor_input_map = _prepare_input_map(build_res.args) if task_input_names else {}
+            args = []
+            task_inputs_count = 0
+            for arg in build_res.args:
+                if arg in tensor_input_map:
+                    tensor_name = tensor_input_map[arg]
+                    if tensor_name in task_input_names:
+                        args.append(get_task_input_buffer(inp.task.workload_key, tensor_name))
+                        task_inputs_count += 1
+                    else:
+                        raise ValueError(
+                            "%s not found in task_inputs, " % (tensor_name)
+                            + "should provide with `SearchTask(..., task_inputs={...})`"
+                        )
+                else:
+                    empty_array = ndarray.empty(get_const_tuple(arg.shape), arg.dtype, ctx)
+                    random_fill(empty_array)
+                    args.append(empty_array)
+            if task_inputs_count != len(task_input_names):
+                logger.warning(
+                    "task_inputs not fully matched, check if there's any unexpected error"
                 )
-            for arg in args:
-                random_fill(arg)
             ctx.sync()
-
             costs = time_f(*args).results
+
             # clean up remote files
             remote.remove(build_res.filename)
             remote.remove(os.path.splitext(build_res.filename)[0] + ".so")
diff --git a/python/tvm/auto_scheduler/search_task.py b/python/tvm/auto_scheduler/search_task.py
index 175c2fa06c39..57e239cf79e8 100644
--- a/python/tvm/auto_scheduler/search_task.py
+++ b/python/tvm/auto_scheduler/search_task.py
@@ -19,8 +19,12 @@
 
 import json
 
+import os
+import logging
+import numpy as np
+
 import tvm._ffi
-from tvm.runtime import Object
+from tvm.runtime import Object, ndarray
 
 from tvm.driver.build_module import build
 from tvm.target import Target
@@ -33,6 +37,9 @@
 from .workload_registry import WORKLOAD_FUNC_REGISTRY, register_workload_tensors
 from . import _ffi_api
 
+# pylint: disable=invalid-name
+logger = logging.getLogger("auto_scheduler")
+
 
 @tvm._ffi.register_object("auto_scheduler.HardwareParams")
 class HardwareParams(Object):
@@ -157,6 +164,156 @@ def __init__(
         )
 
 
+# The map stores special registered buffer for measurement.
+# This can be used for sparse workloads when we cannot use random tensors for measurment.
+# {
+#     "workload_key_0": {
+#         "task_input_0": Tensor(...),
+#         "task_input_1": Tensor(...)
+#     },
+#     "workload_key_1": {
+#         "task_input_2": Tensor(...),
+#         "task_input_3": Tensor(...)
+#     },
+#     ...
+# }
+TASK_INPUT_BUFFER_TABLE = {}
+
+
+def _save_buffer_to_file(buffer_name, buffer_data):
+    """Save the current Tensor buffer to a numpy file.
+
+    File name will be: {buffer_name}.{buffer_shape}_{buffer_data_type}.npy
+    """
+    np_data = buffer_data.asnumpy()
+
+    buffer_name += "."
+    for i in np_data.shape:
+        buffer_name += "%d_" % (i)
+    buffer_name += "%s" % (np_data.dtype)
+    buffer_name += ".npy"
+
+    np_data.tofile(buffer_name, " ")
+
+
+def _try_load_buffer_from_file(buffer_name):
+    """Try to load buffer from a numpy file, if not found, return None.
+
+    File name has a same format as `_save_buffer_to_file`.
+    """
+    filelist = os.listdir()
+
+    for file in filelist:
+        if file.startswith(buffer_name + "."):
+            meta_info = file.split(".")[-2].split("_")
+            shape = [int(i) for i in meta_info[:-1]]
+            dtype = meta_info[-1]
+            buffer_data = np.fromfile(file, dtype=dtype, sep=" ")
+            buffer_data = buffer_data.reshape(shape)
+            return ndarray.array(buffer_data)
+
+    return None
+
+
+def register_task_input_buffer(
+    workload_key,
+    input_name,
+    input_data,
+    overwrite=False,
+    save_to_file=False,
+):
+    """Register special buffer for measurement.
+
+    Parameters
+    ----------
+    workload_key : str
+        The workload key of the SearchTask.
+
+    input_name : str
+        The name of input buffer.
+
+    input_data : tvm.nd.NDArray
+        The input Tensor data.
+
+    overwrite : bool = False
+        Whether to overwrite the data if a name has already registered.
+
+    save_to_file : bool = False
+        Whether to save the data to a local file as well. This can be reused to resume the last
+        tuning process.
+
+    Returns
+    -------
+    tvm.nd.NDArray
+        The actual registered Tensor data of this input_name. With `overwrite` set to False, will
+        return the original one if the name has already registered before.
+    """
+    global TASK_INPUT_BUFFER_TABLE
+
+    if workload_key not in TASK_INPUT_BUFFER_TABLE:
+        TASK_INPUT_BUFFER_TABLE[workload_key] = {}
+    input_table = TASK_INPUT_BUFFER_TABLE[workload_key]
+
+    if not overwrite:
+        if input_name not in input_table.keys():
+            # Try to load buffer data from local file
+            tensor_from_file = _try_load_buffer_from_file(input_name)
+            if tensor_from_file:
+                input_table[input_name] = tensor_from_file
+
+        if input_name in input_table.keys():
+            logger.warning(
+                "Tensor %s exists in TASK_INPUT_BUFFER_TABLE, %s",
+                input_name,
+                "set overwrite to True or this Tensor will not be registered",
+            )
+            return input_table[input_name]
+
+    input_table[input_name] = input_data
+    if save_to_file:
+        _save_buffer_to_file(input_name, input_data)
+    return input_data
+
+
+def get_task_input_buffer(workload_key, input_name):
+    """Get special buffer for measurement.
+
+    The buffers are registered by `register_task_input_buffer`.
+
+    Parameters
+    ----------
+    workload_key : str
+        The workload key of the SearchTask.
+
+    input_name : str
+        The name of input buffer.
+
+    Returns
+    -------
+    tvm.nd.NDArray
+        The registered input buffer.
+    """
+    global TASK_INPUT_BUFFER_TABLE
+
+    if workload_key not in TASK_INPUT_BUFFER_TABLE:
+        TASK_INPUT_BUFFER_TABLE[workload_key] = {}
+    input_table = TASK_INPUT_BUFFER_TABLE[workload_key]
+
+    if input_name not in input_table.keys():
+        # Try to load buffer data from local file
+        tensor_from_file = _try_load_buffer_from_file(input_name)
+        if tensor_from_file:
+            input_table[input_name] = tensor_from_file
+
+    if input_name in input_table.keys():
+        return input_table[input_name]
+
+    raise ValueError(
+        "%s not found in TASK_INPUT_BUFFER_TABLE, " % (input_name)
+        + "should provide with `SearchTask(..., task_inputs={...})`"
+    )
+
+
 @tvm._ffi.register_object("auto_scheduler.SearchTask")
 class SearchTask(Object):
     """The computation information and hardware parameters for a schedule search task.
@@ -185,6 +342,16 @@ class SearchTask(Object):
         The NO_REWRITE and INSERT_TRANSFORM_STAGE are expected to be used when tuning a standalone
         op, and the REWRITE_FOR_PRE_TRANSFORMED is expected to be used when tuning ops inside a
         network.
+    task_inputs : Union[Dict[str, tvm.nd.NDArray], List[str]]
+        A dict maps the input names to input tensors or a list of input names.
+        Some special Tensor used as inputs in program measuring. Usually we do not need to care
+        about it, but for special workloads like Sparse computation the Sparse Tensor input are
+        meaningful that we cannot use random input directly.
+    task_inputs_overwrite : bool = False
+        Whether to overwrite the data if a name has already in the global table.
+    task_inputs_save_to_file : bool = False
+        Whether to save the data to a local file as well. This can be reused to resume the last
+        tuning process.
 
     Examples
     --------
@@ -212,6 +379,9 @@ def __init__(
         target_host=None,
         hardware_params=None,
         layout_rewrite_option=None,
+        task_inputs=None,
+        task_inputs_overwrite=False,
+        task_inputs_save_to_file=False,
     ):
         assert (
             func is not None or workload_key is not None
@@ -231,6 +401,22 @@ def __init__(
         if layout_rewrite_option is None:
             layout_rewrite_option = LayoutRewriteOption.get_target_default(target)
 
+        task_input_names = []
+        if isinstance(task_inputs, list):
+            task_input_names = task_inputs
+        elif isinstance(task_inputs, dict):
+            for input_name in task_inputs:
+                register_task_input_buffer(
+                    workload_key,
+                    input_name,
+                    task_inputs[input_name],
+                    task_inputs_overwrite,
+                    task_inputs_save_to_file,
+                )
+                task_input_names.append(input_name)
+        elif task_inputs is not None:
+            raise ValueError("task_inputs should be a dict or a list.")
+
         self.__init_handle_by_constructor__(
             _ffi_api.SearchTask,
             compute_dag,
@@ -239,6 +425,7 @@ def __init__(
             target_host,
             hardware_params,
             layout_rewrite_option,
+            task_input_names,
         )
 
     def tune(self, tuning_options, search_policy=None):
@@ -326,6 +513,7 @@ def __getstate__(self):
             "target_host": self.target_host,
             "hardware_params": self.hardware_params,
             "layout_rewrite_option": self.layout_rewrite_option,
+            "task_input_names": self.task_input_names,
         }
 
     def __setstate__(self, state):
@@ -350,6 +538,7 @@ def __setstate__(self, state):
             state["target_host"],
             state["hardware_params"],
             state["layout_rewrite_option"],
+            state["task_input_names"],
         )
 
 
diff --git a/python/tvm/auto_scheduler/utils.py b/python/tvm/auto_scheduler/utils.py
index 8aa33e6775f8..14dc5b8984c3 100644
--- a/python/tvm/auto_scheduler/utils.py
+++ b/python/tvm/auto_scheduler/utils.py
@@ -201,6 +201,9 @@ def serialize_args(args):
     Currently this is mainly used for tvm.tensor.Tensor
     """
     ret = []
+    if args is None:
+        return tuple(ret)
+
     for t in args:
         if isinstance(t, Tensor):
             t = ("TENSOR", get_const_tuple(t.shape), t.dtype)
diff --git a/python/tvm/topi/nn/sparse.py b/python/tvm/topi/nn/sparse.py
index 8145ed80af47..1bf18df09da3 100644
--- a/python/tvm/topi/nn/sparse.py
+++ b/python/tvm/topi/nn/sparse.py
@@ -18,7 +18,7 @@
 """Sparse operators"""
 from __future__ import absolute_import
 import tvm
-from tvm import te
+from tvm import te, auto_scheduler
 
 from ..utils import get_const_tuple
 
@@ -197,7 +197,7 @@ def _compute_block(nb_j, j, i):
 
 
 def _sparse_dense_sp_rhs_bsrmm(data, weight_data, weight_indices, weight_indptr):
-    (m, _) = get_const_tuple(data.shape)
+    (m, k) = get_const_tuple(data.shape)
     (_, bs_r, bs_c) = get_const_tuple(weight_data.shape)
     (num_blocks_plus_1,) = get_const_tuple(weight_indptr.shape)
     num_blocks = num_blocks_plus_1 - 1
@@ -218,7 +218,10 @@ def _compute_block(i, nb_j, j):
     idxm = tvm.tir.indexmod
 
     bsrmm_block = te.compute(
-        (m, num_blocks, bs_r), _compute_block, tag="sparse_dense_sp_rhs_bsrmm_block"
+        (m, num_blocks, bs_r),
+        _compute_block,
+        tag="sparse_dense_sp_rhs_bsrmm_block",
+        attrs={"FLOP": 2 * m * num_blocks * bs_r * k},
     )
     return te.compute(
         (m, num_blocks * bs_r),
@@ -356,3 +359,112 @@ def sparse_dense_alter_layout(_attrs, _inputs, _tinfos, _out_type):
     Unlike other TOPI functions, this function operates on both graph level and operator level.
     """
     return None
+
+
+@auto_scheduler.register_task_input_check_func
+def try_get_sparse_input(args):
+    """Analyze the input data from the given args.
+
+    Parameters
+    ----------
+    args : List[Tensor]
+        Input/output Tensor of a TVM subgraph.
+
+    Returns
+    -------
+    Dict[Tensor, str] :
+        Map from the input Tensor to its buffer name.
+
+    Notes
+    -----
+    The buffer name is specially designed, and these buffer should be provided in
+    `SearchTask(..., task_inputs={...})`.
+    """
+    sparse_prefix = sparse_data = sparse_indices = sparse_indptr = None
+
+    def _process_inputs(input_tensors, m, n, prefix_init):
+        nonlocal sparse_prefix
+        nonlocal sparse_data
+        nonlocal sparse_indices
+        nonlocal sparse_indptr
+
+        assert len(input_tensors) == 4
+        unsure_tensors = list(input_tensors)
+        # Get the Dense data
+        dense_data = None
+        for tensor in unsure_tensors:
+            if len(tensor.shape) == 2:
+                assert dense_data is None
+                dense_data = tensor
+                assert m == dense_data.shape[0]
+                k = dense_data.shape[1]
+        unsure_tensors.remove(dense_data)
+
+        # Get the Sparse data
+        sparse_data = None
+        for tensor in unsure_tensors:
+            if len(tensor.shape) == 3:
+                assert sparse_data is None
+                sparse_data = tensor
+                block_size, bs_r, bs_c = sparse_data.shape
+        unsure_tensors.remove(sparse_data)
+
+        # Get the Sparse indptr & indices
+        sparse_indices = None
+        for tensor in unsure_tensors:
+            assert len(tensor.shape) == 1
+            if tensor.shape[0] == block_size:
+                assert sparse_indices is None
+                sparse_indices = tensor
+        unsure_tensors.remove(sparse_indices)
+        assert len(unsure_tensors) == 1
+        sparse_indptr = unsure_tensors[0]
+
+        # Generate the sparse_prefix
+        density = 1.0
+        for i in sparse_data.shape:
+            density *= i
+        density /= k * n
+        density = density.value
+        sparse_prefix = "%s_%d_%d_%d_%d_%d_%.2f_" % (prefix_init, m, n, k, bs_r, bs_c, density)
+
+    visited = set()
+
+    def _traverse(t):
+        # We cannot directly add tensors to the set, because the comparison of
+        # two tensors with ndim=0 is ambiguous.
+        assert t.handle is not None
+        if t.handle.value in visited:
+            return
+
+        if isinstance(t.op, te.ComputeOp):
+            # TODO(jcf94): Currently only support to one sparse op, add more support here
+            if t.op.tag == "sparse_dense_sp_rhs_bsrmm":
+                m, n = t.shape
+                assert len(t.op.input_tensors) == 1
+                block_tensor = t.op.input_tensors[0]
+                _process_inputs(block_tensor.op.input_tensors, m, n, "sparse_dense_bsr")
+            if sparse_prefix is not None:
+                # Early stop if we find a sparse_prefix
+                # Notice: If any workload has more than one sparse input, this may get problem
+                return
+            for x in t.op.input_tensors:
+                _traverse(x)
+        visited.add(t.handle.value)
+
+    try:
+        for arg in args:
+            _traverse(arg)
+    # pylint: disable=broad-except
+    except Exception:
+        return {}
+
+    if sparse_data is None or sparse_indices is None or sparse_indptr is None:
+        return {}
+
+    sparse_input_map = {}
+    sparse_input_map[sparse_data] = sparse_prefix + "W_data"
+    sparse_input_map[sparse_indices] = sparse_prefix + "W_indices"
+    sparse_input_map[sparse_indptr] = sparse_prefix + "W_indptr"
+
+    return sparse_input_map
diff --git a/src/auto_scheduler/feature.cc b/src/auto_scheduler/feature.cc
index cf516d8452e2..d93218c0208c 100755
--- a/src/auto_scheduler/feature.cc
+++ b/src/auto_scheduler/feature.cc
@@ -1399,7 +1399,7 @@ void GetPerStoreFeaturesFromFile(const std::string& filename, int max_lines, int
       Array<te::Tensor> tensors = (*workload_key_to_tensors)(workload_key);
       task = SearchTask(ComputeDAG(tensors), workload_key, cur_inp->task->target,
                         cur_inp->task->target_host, cur_inp->task->hardware_params,
-                        cur_inp->task->layout_rewrite_option);
+                        cur_inp->task->layout_rewrite_option, cur_inp->task->task_input_names);
       task_id = task_cache.size();
 
       // compute min cost for each task
@@ -1466,9 +1466,10 @@ void GetPerStoreFeaturesFromMeasurePairs(const Array<MeasureInput>& inputs,
         // The measure input is incomplete, rebuild task for incomplete measure pairs read from file
         try {
           Array<te::Tensor> tensors = (*workload_key_to_tensors)(workload_key);
-          task = SearchTask(ComputeDAG(tensors), workload_key, inputs[i]->task->target,
-                            inputs[i]->task->target_host, inputs[i]->task->hardware_params,
-                            inputs[i]->task->layout_rewrite_option);
+          task =
+              SearchTask(ComputeDAG(tensors), workload_key, inputs[i]->task->target,
+                         inputs[i]->task->target_host, inputs[i]->task->hardware_params,
+                         inputs[i]->task->layout_rewrite_option, inputs[i]->task->task_input_names);
         } catch (std::exception& e) {
           // Cannot build ComputeDAG from workload key, the task may have not been registered in
           // this search round
diff --git a/src/auto_scheduler/measure_record.cc b/src/auto_scheduler/measure_record.cc
index 1120f437b176..5dafa8d98702 100644
--- a/src/auto_scheduler/measure_record.cc
+++ b/src/auto_scheduler/measure_record.cc
@@ -169,6 +169,12 @@ struct Handler<::tvm::auto_scheduler::SearchTaskNode> {
       writer->WriteArrayItem(std::string(""));
     }
     writer->WriteArrayItem(static_cast<int>(data.layout_rewrite_option));
+    writer->WriteArraySeperator();
+    writer->BeginArray(false);
+    for (const auto& i : data.task_input_names) {
+      writer->WriteArrayItem(std::string(i));
+    }
+    writer->EndArray();
     writer->EndArray();
   }
   inline static void Read(dmlc::JSONReader* reader, ::tvm::auto_scheduler::SearchTaskNode* data) {
@@ -200,6 +206,17 @@ struct Handler<::tvm::auto_scheduler::SearchTaskNode> {
         reader->Read(&int_value);
         data->layout_rewrite_option = ::tvm::auto_scheduler::LayoutRewriteOption(int_value);
         s = reader->NextArrayItem();
+        if (s) {
+          reader->BeginArray();
+          s = reader->NextArrayItem();
+          while (s) {
+            reader->Read(&str_value);
+            data->task_input_names.push_back(str_value);
+            s = reader->NextArrayItem();
+          }
+          // Process the end of array
+          s = reader->NextArrayItem();
+        }
         ICHECK(!s);
       }
     }
@@ -444,5 +461,22 @@ TVM_REGISTER_GLOBAL("auto_scheduler.DeserializeMeasureInput").set_body_typed([](
   reader.Read(inp.get());
   return ObjectRef(inp);
 });
+
+TVM_REGISTER_GLOBAL("auto_scheduler.SerializeSearchTask")
+    .set_body_typed([](const SearchTask& search_task) {
+      std::ostringstream os;
+      dmlc::JSONWriter writer(&os);
+      writer.Write(*search_task.get());
+      return os.str();
+    });
+
+TVM_REGISTER_GLOBAL("auto_scheduler.DeserializeSearchTask").set_body_typed([](String json) {
+  std::istringstream ss(json);
+  dmlc::JSONReader reader(&ss);
+  auto search_task = make_object<SearchTaskNode>();
+  reader.Read(search_task.get());
+  return ObjectRef(search_task);
+});
+
 }  // namespace auto_scheduler
 }  // namespace tvm
diff --git a/src/auto_scheduler/search_policy/utils.cc b/src/auto_scheduler/search_policy/utils.cc
index d59df6965776..ce8dc39922e0 100644
--- a/src/auto_scheduler/search_policy/utils.cc
+++ b/src/auto_scheduler/search_policy/utils.cc
@@ -465,6 +465,22 @@ const std::vector<int>& SplitFactorizationMemo::GetFactors(int n) {
 
 /********** Utils interface API for ffi **********/
 
+TVM_REGISTER_GLOBAL("auto_scheduler.SearchPolicyUtilsGetConsumers")
+    .set_body_typed([](const SearchTask& task, const State& state, int stage_id) {
+      const std::set<int>& consumers = GetConsumers(task, state, stage_id);
+      tvm::Map<IntImm, IntImm> ret;
+      for (const auto& i : consumers) {
+        ret.Set(Integer(i), Integer(i));
+      }
+      return ret;
+    });
+
+TVM_REGISTER_GLOBAL("auto_scheduler.SearchPolicyUtilsIsElementwiseMatch")
+    .set_body_typed([](const SearchTask& task, const State& state, int stage_id,
+                       int target_stage_id) {
+      return ElementwiseMatch(task, state, stage_id, target_stage_id);
+    });
+
 TVM_REGISTER_GLOBAL("auto_scheduler.SearchPolicyUtilsIsTiled")
     .set_body_typed([](const Stage& stage) { return IsTiled(stage); });
 
diff --git a/src/auto_scheduler/search_task.cc b/src/auto_scheduler/search_task.cc
index 0abee16fceab..22c2893141cf 100755
--- a/src/auto_scheduler/search_task.cc
+++ b/src/auto_scheduler/search_task.cc
@@ -114,7 +114,7 @@ HardwareParams HardwareParamsNode::GetDefaultHardwareParams(const Target& target
 
 SearchTask::SearchTask(ComputeDAG compute_dag, String workload_key, Target target,
                        Target target_host, Optional<HardwareParams> hardware_params,
-                       LayoutRewriteOption layout_rewrite_option) {
+                       LayoutRewriteOption layout_rewrite_option, Array<String> task_input_names) {
   auto node = make_object<SearchTaskNode>();
   node->compute_dag = std::move(compute_dag);
   node->workload_key = std::move(workload_key);
@@ -127,6 +127,7 @@ SearchTask::SearchTask(ComputeDAG compute_dag, String workload_key, Target targe
         HardwareParamsNode::GetDefaultHardwareParams(node->target, node->target_host);
   }
   node->layout_rewrite_option = layout_rewrite_option;
+  node->task_input_names = std::move(task_input_names);
   data_ = std::move(node);
 }
 
@@ -142,9 +143,9 @@ TVM_REGISTER_GLOBAL("auto_scheduler.HardwareParams")
 TVM_REGISTER_GLOBAL("auto_scheduler.SearchTask")
     .set_body_typed([](ComputeDAG compute_dag, String workload_key, Target target,
                        Target target_host, Optional<HardwareParams> hardware_params,
-                       int layout_rewrite_option) {
+                       int layout_rewrite_option, Array<String> task_input_names) {
       return SearchTask(compute_dag, workload_key, target, target_host, hardware_params,
-                        LayoutRewriteOption(layout_rewrite_option));
+                        LayoutRewriteOption(layout_rewrite_option), task_input_names);
     });
 
 }  // namespace auto_scheduler
diff --git a/tests/python/unittest/test_auto_scheduler_measure.py b/tests/python/unittest/test_auto_scheduler_measure.py
index cc9d7a41548d..116981028cc9 100644
--- a/tests/python/unittest/test_auto_scheduler_measure.py
+++ b/tests/python/unittest/test_auto_scheduler_measure.py
@@ -19,6 +19,7 @@
 import json
 
 import multiprocessing
+import numpy as np
 import tvm
 from tvm import topi
 from tvm import te, auto_scheduler
@@ -26,7 +27,7 @@
 import tvm.testing
 import pickle
 
-from test_auto_scheduler_common import matmul_auto_scheduler_test, get_tiled_matmul
+from test_auto_scheduler_common import matmul_auto_scheduler_test
 from tvm.auto_scheduler import workload_registry
 
 
@@ -355,6 +356,34 @@ def test_measure_target_host():
         assert str(recovered_inp.task.target_host) == str(inp.task.target_host)
 
 
+@tvm.testing.requires_llvm
+def test_measure_special_inputs_map_by_name():
+    @auto_scheduler.register_workload
+    def foo():
+        X = te.placeholder(shape=[10], dtype="int32")
+        Index = te.placeholder(shape=[1], dtype="int32", name="Index")
+        Y = te.compute((1,), lambda i: X[Index[i]])
+        return [X, Index, Y]
+
+    # This workload cannot use random input for the `Index` input
+    task = auto_scheduler.SearchTask(
+        func=foo,
+        target="llvm",
+        task_inputs={
+            "Index": tvm.nd.array(np.array([5], dtype="int32")),
+        },
+    )
+
+    minp = auto_scheduler.MeasureInput(task, task.compute_dag.init_state)
+    local_builder = auto_scheduler.LocalBuilder()
+    local_runner = auto_scheduler.LocalRunner(timeout=10)
+
+    bress = local_builder.build([minp])
+    assert bress[0].error_no == 0
+    mress = local_runner.run([minp], bress)
+    assert mress[0].error_no == 0
+
+
 if __name__ == "__main__":
     test_record_split_reorder_fuse_annotation()
     test_record_compute_at_root_inline_cache_read_write()
@@ -366,3 +395,4 @@ def test_measure_target_host():
     test_dag_measure_local_builder_runner()
     test_measure_local_builder_rpc_runner()
     test_measure_target_host()
+    test_measure_special_inputs_map_by_name()
diff --git a/tests/python/unittest/test_auto_scheduler_search_task.py b/tests/python/unittest/test_auto_scheduler_search_task.py
new file mode 100644
index 000000000000..78e85dc213e0
--- /dev/null
+++ b/tests/python/unittest/test_auto_scheduler_search_task.py
@@ -0,0 +1,207 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""Test search policy"""
+
+import numpy as np
+import tempfile
+
+import tvm
+import tvm.testing
+from tvm import auto_scheduler
+from tvm.auto_scheduler.utils import get_const_tuple
+from test_auto_scheduler_common import (
+    matmul_auto_scheduler_test,
+    zero_rank_compute_auto_scheduler_test,
+    zero_rank_reduce_auto_scheduler_test,
+)
+
+
+def test_search_task_add_task_input():
+    auto_scheduler.search_task.TASK_INPUT_BUFFER_TABLE.clear()
+    N = 64
+    target = "llvm"
+    test_input_0 = tvm.runtime.ndarray.empty((64, 64))
+    test_input_1 = tvm.runtime.ndarray.empty((10, 20))
+    test_input_2 = tvm.runtime.ndarray.empty((30, 40, 50))
+    task = auto_scheduler.SearchTask(
+        func="matmul_auto_scheduler_test",
+        args=(N, N, N),
+        target=target,
+        task_inputs={
+            "test_input_0": test_input_0,
+            "test_input_1": test_input_1,
+            "test_input_2": test_input_2,
+        },
+        task_inputs_overwrite=True,
+    )
+
+    assert len(task.task_input_names) == 3
+    assert task.task_input_names[0] == "test_input_0"
+    assert task.task_input_names[1] == "test_input_1"
+    assert task.task_input_names[2] == "test_input_2"
+
+
+def test_search_task_record():
+    auto_scheduler.search_task.TASK_INPUT_BUFFER_TABLE.clear()
+    N = 64
+    target = "llvm"
+
+    # Log with no task input
+    task = auto_scheduler.SearchTask(
+        func="matmul_auto_scheduler_test", args=(N, N, N), target=target
+    )
+    task_record = auto_scheduler._ffi_api.SerializeSearchTask(task)
+    new_task = auto_scheduler._ffi_api.DeserializeSearchTask(task_record)
+    # TODO(jcf94): Check the compute dag & hardware parameter
+    assert task.workload_key == new_task.workload_key
+    assert str(task.target) == str(new_task.target)
+    assert str(task.target_host) == str(new_task.target_host)
+    assert task.layout_rewrite_option == new_task.layout_rewrite_option
+
+    # Log with 1 task input
+    test_input_0 = tvm.runtime.ndarray.empty((64, 64))
+    task = auto_scheduler.SearchTask(
+        func="matmul_auto_scheduler_test",
+        args=(N, N, N),
+        target=target,
+        task_inputs={"test_input_0": test_input_0},
+        task_inputs_overwrite=True,
+    )
+    task_record = auto_scheduler._ffi_api.SerializeSearchTask(task)
+    new_task = auto_scheduler._ffi_api.DeserializeSearchTask(task_record)
+    assert task.workload_key == new_task.workload_key
+    assert str(task.target) == str(new_task.target)
+    assert str(task.target_host) == str(new_task.target_host)
+    assert task.layout_rewrite_option == new_task.layout_rewrite_option
+    assert len(new_task.task_input_names) == 1
+    assert new_task.task_input_names[0] == "test_input_0"
+
+    # Log with multiple task inputs
+    test_input_1 = tvm.runtime.ndarray.empty((64, 64))
+    task = auto_scheduler.SearchTask(
+        func="matmul_auto_scheduler_test",
+        args=(N, N, N),
+        target=target,
+        task_inputs={
+            "test_input_0": test_input_0,
+            "test_input_1": test_input_1,
+        },
+        task_inputs_overwrite=True,
+    )
+    task_record = auto_scheduler._ffi_api.SerializeSearchTask(task)
+    new_task = auto_scheduler._ffi_api.DeserializeSearchTask(task_record)
+    assert task.workload_key == new_task.workload_key
+    assert str(task.target) == str(new_task.target)
+    assert str(task.target_host) == str(new_task.target_host)
+    assert task.layout_rewrite_option == new_task.layout_rewrite_option
+    assert len(new_task.task_input_names) == 2
+    assert new_task.task_input_names[0] == "test_input_0"
+    assert new_task.task_input_names[1] == "test_input_1"
+
+    # Log with version 0.5
+    v5_log = """["[\\\"matmul_auto_scheduler_test\\\", 64, 64, 64]", "llvm -keys=cpu -link-params=0", [6, 64, 64, 0, 0, 0, 0, 0], "", 1]"""
+    new_task = auto_scheduler._ffi_api.DeserializeSearchTask(v5_log)
+    assert task.workload_key == new_task.workload_key
+    assert str(task.target) == str(new_task.target)
+    assert str(task.target_host) == str(new_task.target_host)
+    assert task.layout_rewrite_option == new_task.layout_rewrite_option
+    assert len(new_task.task_input_names) == 0
+
+
+def test_recover_measure_input_with_task_input():
+    auto_scheduler.search_task.TASK_INPUT_BUFFER_TABLE.clear()
+
+    # Since this file is tests for search_task, we only check the search_task here
+
+    # Log with no task input
+    task = auto_scheduler.SearchTask(
+        func=matmul_auto_scheduler_test, args=(512, 512, 512), target="llvm"
+    )
+    inp = auto_scheduler.measure.MeasureInput(task, task.compute_dag.init_state)
+    res = auto_scheduler.measure.MeasureResult([0.1], 0, "", 0.2, 1)
+    measure_record = auto_scheduler.measure_record.dump_record_to_string(inp, res)
+    measure_log = auto_scheduler.measure_record.load_record_from_string(measure_record)
+    new_task = measure_log[0].task
+    assert task.workload_key == new_task.workload_key
+    assert str(task.target) == str(new_task.target)
+    assert str(task.target_host) == str(new_task.target_host)
+    assert task.layout_rewrite_option == new_task.layout_rewrite_option
+
+    # Log with 1 task input
+    test_input_0 = tvm.runtime.ndarray.empty((64, 64))
+    task = auto_scheduler.SearchTask(
+        func=matmul_auto_scheduler_test,
+        args=(512, 512, 512),
+        target="llvm",
+        task_inputs={
+            "test_input_0": test_input_0,
+        },
+        task_inputs_overwrite=True,
+    )
+    inp = auto_scheduler.measure.MeasureInput(task, task.compute_dag.init_state)
+    res = auto_scheduler.measure.MeasureResult([0.1], 0, "", 0.2, 1)
+    measure_record = auto_scheduler.measure_record.dump_record_to_string(inp, res)
+    measure_log = auto_scheduler.measure_record.load_record_from_string(measure_record)
+    new_task = measure_log[0].task
+    assert task.workload_key == new_task.workload_key
+    assert str(task.target) == str(new_task.target)
+    assert str(task.target_host) == str(new_task.target_host)
+    assert task.layout_rewrite_option == new_task.layout_rewrite_option
+    assert len(new_task.task_input_names) == 1
+    assert new_task.task_input_names[0] == "test_input_0"
+
+    # Log with multiple task inputs
+    test_input_1 = tvm.runtime.ndarray.empty((64, 64))
+    task = auto_scheduler.SearchTask(
+        func=matmul_auto_scheduler_test,
+        args=(512, 512, 512),
+        target="llvm",
+        task_inputs={
+            "test_input_0": test_input_0,
+            "test_input_1": test_input_1,
+        },
+        task_inputs_overwrite=True,
+    )
+    inp = auto_scheduler.measure.MeasureInput(task, task.compute_dag.init_state)
+    res = auto_scheduler.measure.MeasureResult([0.1], 0, "", 0.2, 1)
+    measure_record = auto_scheduler.measure_record.dump_record_to_string(inp, res)
+    measure_log = auto_scheduler.measure_record.load_record_from_string(measure_record)
+    new_task = measure_log[0].task
+    assert task.workload_key == new_task.workload_key
+    assert str(task.target) == str(new_task.target)
+    assert str(task.target_host) == str(new_task.target_host)
+    assert task.layout_rewrite_option == new_task.layout_rewrite_option
+    assert len(new_task.task_input_names) == 2
+    assert new_task.task_input_names[0] == "test_input_0"
+    assert new_task.task_input_names[1] == "test_input_1"
+
+    # Log with version 0.5
+    v5_log = """{"i": [["[\\\"matmul_auto_scheduler_test\\\", 512, 512, 512]", "llvm -keys=cpu -link-params=0", [6, 64, 64, 0, 0, 0, 0, 0], "", 1], [[], []]], "r": [[0.1], 0, 0.2, 1], "v": "v0.6"}"""
+    measure_log = auto_scheduler.measure_record.load_record_from_string(v5_log)
+    new_task = measure_log[0].task
+    assert task.workload_key == new_task.workload_key
+    assert str(task.target) == str(new_task.target)
+    assert str(task.target_host) == str(new_task.target_host)
+    assert task.layout_rewrite_option == new_task.layout_rewrite_option
+    assert len(new_task.task_input_names) == 0
+
+
+if __name__ == "__main__":
+    test_search_task_add_task_input()
+    test_search_task_record()
+    test_recover_measure_input_with_task_input()
diff --git a/tutorials/auto_scheduler/ci_logs/sparse_dense.json b/tutorials/auto_scheduler/ci_logs/sparse_dense.json
new file mode 100644
index 000000000000..7c1c100124dc
--- /dev/null
+++ b/tutorials/auto_scheduler/ci_logs/sparse_dense.json
@@ -0,0 +1,2 @@
+# Keep a valid schedule for demonstraction. This is used to prevent flasky errors in CI.
+{"i": [["[\"sparse_dense\", 512, 512, 512, [9831, 16, 1], [9831], [33], \"float32\"]", "llvm -keys=cpu -link-params=0", [6, 64, 64, 0, 0, 0, 0, 0], "", 1, ["sparse_dense_bsr_512_512_512_16_1_0.60_W_data", "sparse_dense_bsr_512_512_512_16_1_0.60_W_indices", "sparse_dense_bsr_512_512_512_16_1_0.60_W_indptr"]], [[], [["CI", 8], ["CI", 6], ["SP", 5, 0, 512, [1, 8], 1], ["FSP", 9, 0, 2, 1], ["SP", 5, 3, 32, [32], 1], ["FSP", 9, 2, 4, 1], ["RE", 5, [0, 3, 1, 4, 6, 2, 5, 7]], ["RE", 9, [0, 2, 1, 3]], ["CA", 5, 9, 1], ["CI", 4], ["FU", 9, [0, 1]], ["AN", 9, 0, 3], ["PR", 5, 0, "auto_unroll_max_step$0"], ["AN", 9, 2, 2]]]], "r": [[0.000957008], 0, 0.605709, 1614689820], "v": "v0.6"}
diff --git a/tutorials/auto_scheduler/tune_sparse_x86.py b/tutorials/auto_scheduler/tune_sparse_x86.py
new file mode 100644
index 000000000000..ced416f6c500
--- /dev/null
+++ b/tutorials/auto_scheduler/tune_sparse_x86.py
@@ -0,0 +1,339 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""
+Auto-scheduling Sparse Matrix Multiplication on CPU with Custom Sketch Rule
+===========================================================================
+**Author**: `Chengfan Jia <https://github.com/jcf94/>`_
+
+This is a tutorial on how to use the auto-scheduler to tune a sparse matrix multiplication for
+CPUs.
+
+Auto-scheduler is designed to explore the schedule with best performance for a given computation
+declaration automatically. While sometimes, we may have a demand to try some special ops which may
+not been well-supported by auto-scheduler's default sketch rules and result in poor performance.
+Fortunately, auto-scheduler currently allows user to provide a CustomSketch to cover these cases.
+
+We use sparse matrix multiplication as an example in this tutorial to demonstrate how to implement
+and plug a custom sketch rule to the auto-scheduler's search policy.
+
+Note that this tutorial will not run on Windows or recent versions of macOS. To
+get it to run, you will need to wrap the body of this tutorial in a :code:`if
+__name__ == "__main__":` block.
+"""
+
+import os
+import itertools
+
+import numpy as np
+import tvm
+from tvm import te, auto_scheduler, runtime, topi
+from tvm.auto_scheduler import _ffi_api
+from tvm.topi.utils import get_const_tuple
+
+import scipy.sparse as sp
+
+######################################################################
+# Define the computation
+# ^^^^^^^^^^^^^^^^^^^^^^
+# To begin with, let us define the computation of a sparse matmul with several relu and bias add.
+# The function should return the list of input/output tensors.
+# From these tensors, the auto-scheduler can get the whole computational graph.
+
+# We use this function to generate a random bsr matrix
+def random_bsr_matrix(M, N, BS_R, BS_C, density, dtype):
+    import itertools
+
+    Y = np.zeros((M, N), dtype=dtype)
+    assert M % BS_R == 0
+    assert N % BS_C == 0
+    nnz = int(density * M * N)
+    num_blocks = int(nnz / (BS_R * BS_C)) + 1
+    candidate_blocks = np.asarray(list(itertools.product(range(0, M, BS_R), range(0, N, BS_C))))
+    assert candidate_blocks.shape[0] == M // BS_R * N // BS_C
+    chosen_blocks = candidate_blocks[
+        np.random.choice(candidate_blocks.shape[0], size=num_blocks, replace=False)
+    ]
+    for i in range(len(chosen_blocks)):
+        r, c = chosen_blocks[i]
+        Y[r : r + BS_R, c : c + BS_C] = np.random.randn(BS_R, BS_C)
+    s = sp.bsr_matrix(Y, blocksize=(BS_R, BS_C))
+    assert s.data.shape == (num_blocks, BS_R, BS_C)
+    assert s.indices.shape == (num_blocks,)
+    assert s.indptr.shape == (M // BS_R + 1,)
+    return s
+
+
+@auto_scheduler.register_workload
+def sparse_dense(M, N, K, w_data_shape, w_indices_shape, w_indptr_shape, dtype):
+    X = te.placeholder(shape=(M, K), dtype=dtype)
+    W_data = te.placeholder(shape=w_data_shape, dtype=dtype)
+    W_indices = te.placeholder(shape=w_indices_shape, dtype="int32")
+    W_indptr = te.placeholder(shape=w_indptr_shape, dtype="int32")
+    B = te.placeholder(shape=(M, N), dtype=dtype)
+
+    out = topi.nn.sparse_dense(topi.nn.relu(X), W_data, W_indices, W_indptr)
+    out = te.compute((M, N), lambda i, j: out[i, j] + B[i, j], name="BiasAdd")
+    out = topi.nn.relu(out)
+
+    return [X, W_data, W_indices, W_indptr, B, out]
+
+
+######################################################################
+# Special step for sparse workload
+# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+# During schedule tuning, auto-scheduler will use random inputs to measure the performance of a
+# generated schedule. While we cannot directly use a random array as the input of a sparse op, for
+# the "indices" and "indptr" array are meaningful for the computation.
+#
+# To solve this problem, we register these as special buffers, and load them when process program
+# measuring.
+# See the `tvm.auto_scheduler.measure.py` for more details.
+
+# Define the basic shapes of this sparse computation
+M = K = N = 512
+BS_R = 16
+BS_C = 1
+density = 0.6
+
+# Generate the test data with numpy
+X_np = np.random.randn(M, K).astype("float32")
+X_np = np.maximum(np.zeros((M, K), dtype="float32"), X_np)  # Relu
+W_sp_np = random_bsr_matrix(N, K, BS_R, BS_C, density=density, dtype="float32")
+W_np = W_sp_np.todense()
+Y_np = X_np @ W_np.T  # Process the matrix multiplication
+B_np = np.random.randn(M, N).astype("float32")
+Y_np = Y_np + B_np  # Bias add
+Y_np = np.maximum(np.zeros((M, N), dtype="float32"), Y_np)  # Relu
+
+######################################################################
+# Create the search task
+# ^^^^^^^^^^^^^^^^^^^^^^
+# We then create a search task with M=N=K=512 and dtype="float32"
+# If your machine supports avx instructions, you can
+#
+#   - replace "llvm" below with "llvm -mcpu=core-avx2" to enable AVX2
+#   - replace "llvm" below with "llvm -mcpu=skylake-avx512" to enable AVX-512
+
+target = tvm.target.Target("llvm")
+
+# Register the sparse data to task inputs
+prefix = "sparse_dense_bsr_%d_%d_%d_%d_%d_%.2f_" % (M, N, K, BS_R, BS_C, density)
+task = tvm.auto_scheduler.SearchTask(
+    func=sparse_dense,
+    args=(M, N, K, W_sp_np.data.shape, W_sp_np.indices.shape, W_sp_np.indptr.shape, "float32"),
+    target=target,
+    task_inputs={
+        prefix + "W_data": runtime.ndarray.array(W_sp_np.data),
+        prefix + "W_indices": runtime.ndarray.array(W_sp_np.indices),
+        prefix + "W_indptr": runtime.ndarray.array(W_sp_np.indptr),
+    },
+    task_inputs_save_to_file=True,
+)
+
+# Inspect the computational graph
+print("Computational DAG:")
+print(task.compute_dag)
+
+######################################################################
+# Write the custom sketch for sparse dense op
+# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+# Before tuning, we will need to define the CustomSketchRule for the sparse dense op.
+#
+# CustomSketchRule consists of two parts: the condition function and the apply function.
+#
+#   - condition function: describe when to apply this sketch rule. For example, we can only apply
+#     the rule to the sparse ops by matching their name and tag.
+#   - apply function: describe how to generate the initial sketch. You can implement it using
+#     auto-scheduler provided loop state APIs.
+
+
+def meet_condition_func(search_policy, state, stage_id):
+    state = auto_scheduler.loop_state.State(state, search_policy.search_task.compute_dag)
+    if state.stages[stage_id].op.tag in [
+        "sparse_dense_sp_rhs_bsrmm",
+        "sparse_dense_sp_rhs_bsrmm_block",
+    ]:
+        return auto_scheduler.PreloadCustomSketchRule.APPLY_AND_SKIP_REST
+    else:
+        return auto_scheduler.PreloadCustomSketchRule.PASS
+
+
+def apply_func(search_policy, state, stage_id):
+    ret = []
+    s0 = auto_scheduler.loop_state.State(state, search_policy.search_task.compute_dag)
+    if s0.stages[stage_id].op.tag == "sparse_dense_sp_rhs_bsrmm_block":
+        return [s0.state_object, stage_id - 1]
+
+    sparse_dense = s0.stages[stage_id].op
+    sparse_dense_block = s0.stages[stage_id - 1].op
+    assert sparse_dense.tag == "sparse_dense_sp_rhs_bsrmm"
+    assert sparse_dense_block.tag == "sparse_dense_sp_rhs_bsrmm_block"
+
+    # Set the default consumer of compute block
+    consumer = sparse_dense
+
+    # If sparse dense has a single elementwise consumer
+    # We can compute inline the sparse_dense output stage
+    consumers = _ffi_api.SearchPolicyUtilsGetConsumers(
+        search_policy.search_task, s0.state_object, stage_id
+    )
+    if len(consumers) == 1:
+        consumer_id = int(consumers.items()[0][0])
+        if _ffi_api.SearchPolicyUtilsIsElementwiseMatch(
+            search_policy.search_task, s0.state_object, stage_id, consumer_id
+        ):
+            consumer = s0.stages[consumer_id].op
+            s0.compute_inline(sparse_dense)
+
+    i, nb_j, j, row_offset, c = s0[sparse_dense_block].iters
+    m, n = s0[consumer].iters
+    i0, i1, i2 = s0.split(sparse_dense_block, i, [None, None])
+    m0, m1 = s0.follow_split(consumer, m, len(s0.transform_steps) - 1, 1)
+    j0, j1 = s0.split(sparse_dense_block, nb_j, [None])
+    n0, n1 = s0.follow_split(consumer, n, len(s0.transform_steps) - 1, 1)
+    s0.reorder(sparse_dense_block, [i0, j0, i1, j1, row_offset, i2, j, c])
+    s0.reorder(consumer, [m0, n0, m1, n1])
+    s0.compute_at(sparse_dense_block, consumer, n0)
+
+    ret.append([s0.state_object, stage_id - 2])
+
+    return ret
+
+
+######################################################################
+# Next, we set parameters for the auto-scheduler with the custom sketch plugged in.
+#
+# * :code:`num_measure_trials` is the number of measurement trials we can use during the search.
+#   We only make 10 trials in this tutorial for a fast demonstration. In practice, 1000 is a
+#   good value for the search to converge. You can do more trials according to your time budget.
+# * In addition, we use :code:`RecordToFile` to dump measurement records into a file
+#   `sparse_dense.json`.
+#   The measurement records can be used to query the history best, resume the search,
+#   and do more analyses later.
+# * see :any:`auto_scheduler.TuningOptions` for more parameters
+# * Here, we need to create a :code:`auto_scheduler.SketchPolicy` object, and add the custom sketch
+#   rule as a `init_search_callbacks`.
+
+log_file = "sparse_dense.json"
+tune_option = auto_scheduler.TuningOptions(
+    num_measure_trials=10,
+    measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
+    verbose=2,
+)
+
+search_policy = auto_scheduler.SketchPolicy(
+    task,
+    program_cost_model=auto_scheduler.XGBModel(),
+    init_search_callbacks=[
+        auto_scheduler.PreloadCustomSketchRule(meet_condition_func, apply_func, "SparseDense")
+    ],
+)
+
+######################################################################
+# Run the search
+# ^^^^^^^^^^^^^^
+# Now we get all inputs ready.
+# We can kick off the search and let the auto-scheduler do its magic.
+# After some measurement trials, we can load the best schedule from the log
+# file and apply it.
+
+# Run auto-tuning (search)
+# Notice: We do not run the tuning in our webpage server since it takes too long.
+# Uncomment the following line to run it by yourself.
+task.tune(tune_option, search_policy)
+
+# Apply the best schedule
+sch, args = task.apply_best(log_file)
+
+######################################################################
+# We can lower the schedule to see the IR after auto-scheduling.
+# The auto-scheduler correctly performs optimizations including multi-level tiling,
+# layout transformation, parallelization, vectorization, unrolling, and operator fusion.
+
+print("Lowered TIR:")
+print(tvm.lower(sch, args, simple_mode=True))
+
+######################################################################
+# Check correctness and evaluate performance
+# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+# We build the binary and check its correctness and performance.
+
+func = tvm.build(sch, args, target)
+
+ctx = tvm.cpu()
+
+X_tvm = tvm.nd.array(X_np, ctx=ctx)
+W_data_tvm = tvm.nd.array(W_sp_np.data, ctx=ctx)
+W_indices_tvm = tvm.nd.array(W_sp_np.indices, ctx=ctx)
+W_indptr_tvm = tvm.nd.array(W_sp_np.indptr, ctx=ctx)
+B_tvm = tvm.nd.array(B_np, ctx=ctx)
+Y_tvm = tvm.nd.empty(Y_np.shape, ctx=ctx)
+
+func(X_tvm, W_data_tvm, W_indices_tvm, W_indptr_tvm, B_tvm, Y_tvm)
+
+# Check results
+tvm.testing.assert_allclose(Y_np, Y_tvm.asnumpy(), atol=1e-4, rtol=1e-4)
+
+# Evaluate execution time.
+evaluator = func.time_evaluator(func.entry_name, ctx, min_repeat_ms=500)
+print(
+    "Execution time of this operator: %.3f ms"
+    % (
+        np.median(evaluator(X_tvm, W_data_tvm, W_indices_tvm, W_indptr_tvm, B_tvm, Y_tvm).results)
+        * 1000
+    )
+)
+
+######################################################################
+# .. note:: Tuning result example
+#
+#   .. code-block:: c
+#
+#    ----------------------------------------------------------------------
+#    Lowered TIR:
+#    primfn(placeholder_5: handle, placeholder_6: handle, placeholder_7: handle, placeholder_8: handle, placeholder_9: handle, compute_1: handle) -> ()
+#      attr = {"global_symbol": "main", "tir.noalias": True}
+#      buffers = {placeholder_2: Buffer(placeholder_10: Pointer(float32), float32, [9831, 16, 1], []),
+#                 placeholder_4: Buffer(placeholder_11: Pointer(int32), int32, [33], []),
+#                 placeholder_3: Buffer(placeholder_12: Pointer(float32), float32, [512, 512], []),
+#                 compute: Buffer(compute_2: Pointer(float32), float32, [512, 512], []),
+#                 placeholder_1: Buffer(placeholder_13: Pointer(float32), float32, [512, 512], []),
+#                 placeholder: Buffer(placeholder_14: Pointer(int32), int32, [9831], [])}
+#      buffer_map = {placeholder_7: placeholder, placeholder_9: placeholder_1, placeholder_6: placeholder_2, compute_1: compute, placeholder_5: placeholder_3, placeholder_8: placeholder_4} {
+#      for (i0.outer.i1.outer.fused: int32, 0, 1024) "parallel" {
+#        attr [compute_3: Pointer(float32)] "storage_scope" = "global";
+#        allocate(compute_3, float32, [256]) {
+#          for (nb_j.inner: int32, 0, 2) {
+#            for (i.inner.init: int32, 0, 8) {
+#              for (j.init: int32, 0, 16) {
+#                compute_3[(((i.inner.init*32) + (nb_j.inner*16)) + j.init)] = 0f32
+#              }
+#            }
+#            for (elem_idx: int32, 0, ((int32*)placeholder_11[(((floormod(i0.outer.i1.outer.fused, 16)*2) + nb_j.inner) + 1)] - (int32*)placeholder_11[((floormod(i0.outer.i1.outer.fused, 16)*2) + nb_j.inner)])) {
+#              for (i.inner: int32, 0, 8) {
+#                for (j: int32, 0, 16) {
+#                  compute_3[(((i.inner*32) + (nb_j.inner*16)) + j)] = ((float32*)compute_3[(((i.inner*32) + (nb_j.inner*16)) + j)] + ((float32*)placeholder_10[((((int32*)placeholder_11[((floormod(i0.outer.i1.outer.fused, 16)*2) + nb_j.inner)]*16) + (elem_idx*16)) + j)]*max((float32*)placeholder_12[(((floordiv(i0.outer.i1.outer.fused, 16)*4096) + (i.inner*512)) + (int32*)placeholder_14[((int32*)placeholder_11[((floormod(i0.outer.i1.outer.fused, 16)*2) + nb_j.inner)] + elem_idx)])], 0f32)))
+#                }
+#              }
+#            }
+#          }
+#          for (i0.inner: int32, 0, 8) {
+#            compute_2[ramp((((floordiv(i0.outer.i1.outer.fused, 16)*4096) + (i0.inner*512)) + (floormod(i0.outer.i1.outer.fused, 16)*32)), 1, 32)] = max(((float32x32*)compute_3[ramp((i0.inner*32), 1, 32)] + (float32x32*)placeholder_13[ramp((((floordiv(i0.outer.i1.outer.fused, 16)*4096) + (i0.inner*512)) + (floormod(i0.outer.i1.outer.fused, 16)*32)), 1, 32)]), broadcast(0f32, 32))
+#          }
+#        }
+#      }
+#    }

From 69c1c6d92e6cf28f57035b5386126493c85dd117 Mon Sep 17 00:00:00 2001
From: Matthew Brookhart <mbrookhart@octoml.ai>
Date: Fri, 5 Mar 2021 22:36:13 -0700
Subject: [PATCH 293/357] Move SimplifyConvPad to a new pass and don't enable
 it by default (#7603)

* Move SimplifyConvPad to a new pass and don't enable it by default

* rename pass

* move files

* fix lint

* adjust test tolerance
---
 python/tvm/relay/transform/transform.py       |  13 ++
 src/relay/transforms/fold_explicit_padding.cc | 207 ++++++++++++++++++
 src/relay/transforms/simplify_expr.cc         | 116 ----------
 .../relay/test_pass_fold_explicit_padding.py  | 102 +++++++++
 tests/python/relay/test_pass_simplify_expr.py |  76 -------
 5 files changed, 322 insertions(+), 192 deletions(-)
 create mode 100644 src/relay/transforms/fold_explicit_padding.cc
 create mode 100644 tests/python/relay/test_pass_fold_explicit_padding.py

diff --git a/python/tvm/relay/transform/transform.py b/python/tvm/relay/transform/transform.py
index 0d078d39372d..b61f209505d8 100644
--- a/python/tvm/relay/transform/transform.py
+++ b/python/tvm/relay/transform/transform.py
@@ -1099,6 +1099,19 @@ def SimplifyExpr():
     return _ffi_api.SimplifyExpr()
 
 
+def FoldExplicitPadding():
+    """
+    FoldExplicitPadding finds explict padding before an op that can support
+    implicit padding and fuses them.
+
+    Returns
+    -------
+    ret : tvm.transform.Pass
+        The registered ImplicitPadding pass.
+    """
+    return _ffi_api.FoldExplicitPadding()
+
+
 def AnnotateSpans():
     """
     Annotate a program with span information by first generating its textual
diff --git a/src/relay/transforms/fold_explicit_padding.cc b/src/relay/transforms/fold_explicit_padding.cc
new file mode 100644
index 000000000000..d606eb445a79
--- /dev/null
+++ b/src/relay/transforms/fold_explicit_padding.cc
@@ -0,0 +1,207 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file src/relay/transforms/fold_explicit_padding.cc
+ * \brief A pass for folding explicit pads into other ops.
+ */
+
+#include <tvm/relay/dataflow_matcher.h>
+#include <tvm/relay/expr.h>
+#include <tvm/relay/expr_functor.h>
+#include <tvm/relay/transform.h>
+#include <tvm/support/logging.h>
+
+#include "../op/tensor/transform.h"
+#include "pattern_utils.h"
+
+namespace tvm {
+namespace relay {
+
+/*!
+ * \brief SimplifyConvPad matches a pad followed by a conv/convtranspose/pool/etc
+ * with a pad attribute and merges the padding into the kernel.
+ */
+class SimplifyConvPad {
+ public:
+  DFPattern pattern() const { return pattern_; }
+
+  SimplifyConvPad() {
+    x_ = IsWildcard();
+    w_ = IsWildcard();
+    pad_ = IsOp("nn.pad")({x_});
+    conv1d_ = IsOp("nn.conv1d");
+    conv2d_ = IsOp("nn.conv2d");
+    conv3d_ = IsOp("nn.conv3d");
+    conv_ = (conv1d_ || conv2d_ || conv3d_)({pad_, w_});
+    pattern_ = conv_;
+  }
+
+  template <typename T>
+  Attrs MakeConvAttrs(const T* old_attrs, const Array<PrimExpr> padding) const {
+    ICHECK(old_attrs);
+    ICHECK(padding.size() == old_attrs->padding.size())
+        << "Number of dimensions to pad and convolution padding attributes should have the same "
+           "extent";
+
+    auto new_attrs = make_object<T>();
+    Array<PrimExpr> combined_padding;
+    for (size_t i = 0; i < padding.size(); ++i) {
+      combined_padding.push_back(padding[i] + old_attrs->padding[i]);
+    }
+    new_attrs->strides = old_attrs->strides;
+    new_attrs->padding = combined_padding;
+    new_attrs->dilation = old_attrs->dilation;
+    new_attrs->groups = old_attrs->groups;
+    new_attrs->channels = old_attrs->channels;
+    new_attrs->kernel_size = old_attrs->kernel_size;
+    new_attrs->data_layout = old_attrs->data_layout;
+    new_attrs->kernel_layout = old_attrs->kernel_layout;
+    new_attrs->out_layout = old_attrs->out_layout;
+    new_attrs->out_dtype = old_attrs->out_dtype;
+    return Attrs(new_attrs);
+  }
+
+  template <typename T>
+  Attrs GetAttrs(const PadAttrs* param, const T* attrs) const {
+    ICHECK(param);
+    ICHECK(attrs);
+    ICHECK(attrs->data_layout.size() == param->pad_width.size())
+        << "Data Layout and padding attributes should have the same extent";
+
+    std::string data_layout = attrs->data_layout;
+    std::set<char> image_dims({'H', 'W', 'D'});
+    Array<PrimExpr> padding;
+    // If we're padding a non-spatial dimension, don't simplify
+    // Convolution can only pad on spatial axes
+    for (size_t i = 0; i < param->pad_width.size(); ++i) {
+      if (!image_dims.count(data_layout[i])) {
+        for (size_t j = 0; j < param->pad_width[i].size(); ++j) {
+          if (param->pad_width[i][j] != 0) {
+            return Attrs();
+          }
+        }
+      }
+    }
+    for (size_t j = 0; j < param->pad_width[0].size(); ++j) {
+      for (size_t i = 0; i < param->pad_width.size(); ++i) {
+        if (image_dims.count(data_layout[i])) {
+          padding.push_back(param->pad_width[i][j]);
+        }
+      }
+    }
+
+    return MakeConvAttrs(attrs, padding);
+  }
+
+  Expr callback(const Expr& pre, const Expr& post,
+                const Map<DFPattern, Array<Expr>>& node_map) const {
+    const CallNode* call_node = post.as<CallNode>();
+    ICHECK(call_node);
+    auto pad = node_map[pad_][0];
+    const CallNode* pad_node = pad.as<CallNode>();
+    ICHECK(pad_node);
+    const PadAttrs* param = pad_node->attrs.as<PadAttrs>();
+    ICHECK(param);
+    if (param->pad_mode == "constant" && param->pad_value == 0.0) {
+      Attrs attrs;
+      if (node_map.count(conv1d_)) {
+        attrs = GetAttrs(param, call_node->attrs.as<Conv1DAttrs>());
+      } else if (node_map.count(conv2d_)) {
+        attrs = GetAttrs(param, call_node->attrs.as<Conv2DAttrs>());
+      } else if (node_map.count(conv3d_)) {
+        attrs = GetAttrs(param, call_node->attrs.as<Conv3DAttrs>());
+      } else {
+        return post;
+      }
+      if (!attrs.defined()) {
+        return post;
+      }
+      auto x = node_map[x_][0];
+      auto w = node_map[w_][0];
+      return Call(call_node->op, {x, w}, attrs, call_node->type_args, call_node->span);
+    }
+    return post;
+  }
+
+ private:
+  /*! \brief Pattern for rewriting */
+  DFPattern pattern_;
+  /*! \brief Pattern input */
+  DFPattern x_;
+  /*! \brief Pattern input weight */
+  DFPattern w_;
+  /*! \brief Pattern pad */
+  DFPattern pad_;
+  /*! \brief Pattern conv */
+  DFPattern conv_;
+  DFPattern conv1d_;
+  DFPattern conv2d_;
+  DFPattern conv3d_;
+};
+
+class SimplifyExplicitPadding {
+ public:
+  explicit SimplifyExplicitPadding(IRModule mod) : mod_(mod) {
+    CreateCallback(SimplifyConvPad());
+    // TODO(mbrookhart): ConvTranspose(Pad(x)), Pool(Pad(x))
+  }
+  template <typename T>
+  void CreateCallback(const T& pattern) {
+    auto func = [pattern](TVMArgs args, TVMRetValue* rv) {
+      Expr pre = args[0];
+      Expr post = args[1];
+      Map<DFPattern, Array<Expr>> node_map = args[2];
+      *rv = pattern.callback(pre, post, node_map);
+    };
+    callbacks_.push_back(DFPatternCallback(pattern.pattern(), PackedFunc(func), true));
+  }
+
+  Expr Simplify(const Expr& expr) { return RewritePatterns(callbacks_, expr, mod_); }
+
+ private:
+  IRModule mod_;
+  /*! \brief Callbacks for expr simplification */
+  Array<DFPatternCallback> callbacks_;
+};
+
+/*!
+ * \brief ImplicitPadding finds explict padding before an op that can
+ * support implicit padding and fuses them.
+ */
+Expr FoldExplicitPadding(const Expr& expr, const IRModule& mod) {
+  return SimplifyExplicitPadding(mod).Simplify(expr);
+}
+
+namespace transform {
+
+Pass FoldExplicitPadding() {
+  runtime::TypedPackedFunc<Function(Function, IRModule, PassContext)> pass_func =
+      [=](Function f, IRModule m, PassContext pc) {
+        return Downcast<Function>(FoldExplicitPadding(f, m));
+      };
+  return CreateFunctionPass(pass_func, 0, " FoldExplicitPadding", {"InferType"});
+}
+
+TVM_REGISTER_GLOBAL("relay._transform.FoldExplicitPadding").set_body_typed(FoldExplicitPadding);
+
+}  // namespace transform
+
+}  // namespace relay
+}  // namespace tvm
diff --git a/src/relay/transforms/simplify_expr.cc b/src/relay/transforms/simplify_expr.cc
index bfe04e10a9d0..74e48dc4bc54 100644
--- a/src/relay/transforms/simplify_expr.cc
+++ b/src/relay/transforms/simplify_expr.cc
@@ -82,121 +82,6 @@ class SimplifyReshape : public SimplifyPattern {
   DFPattern x_;
 };
 
-/*!
- * \brief SimplifyConvPad matches a pad followed by a conv/convtranspose/pool/etc
- * with a pad attribute and merges the padding into the kernel.
- */
-class SimplifyConvPad : public SimplifyPattern {
- public:
-  SimplifyConvPad() {
-    x_ = IsWildcard();
-    w_ = IsWildcard();
-    pad_ = IsOp("nn.pad")({x_});
-    conv1d_ = IsOp("nn.conv1d");
-    conv2d_ = IsOp("nn.conv2d");
-    conv3d_ = IsOp("nn.conv3d");
-    conv_ = (conv1d_ || conv2d_ || conv3d_)({pad_, w_});
-    pattern_ = conv_;
-  }
-  template <typename T>
-  Attrs MakeConvAttrs(const T* old_attrs, const Array<PrimExpr> padding) const {
-    ICHECK(old_attrs);
-    ICHECK(padding.size() == old_attrs->padding.size())
-        << "Number of dimensions to pad and convolution padding attributes should have the same "
-           "extent";
-
-    auto new_attrs = make_object<T>();
-    Array<PrimExpr> combined_padding;
-    for (size_t i = 0; i < padding.size(); ++i) {
-      combined_padding.push_back(padding[i] + old_attrs->padding[i]);
-    }
-    new_attrs->strides = old_attrs->strides;
-    new_attrs->padding = combined_padding;
-    new_attrs->dilation = old_attrs->dilation;
-    new_attrs->groups = old_attrs->groups;
-    new_attrs->channels = old_attrs->channels;
-    new_attrs->kernel_size = old_attrs->kernel_size;
-    new_attrs->data_layout = old_attrs->data_layout;
-    new_attrs->kernel_layout = old_attrs->kernel_layout;
-    new_attrs->out_layout = old_attrs->out_layout;
-    new_attrs->out_dtype = old_attrs->out_dtype;
-    return Attrs(new_attrs);
-  }
-  template <typename T>
-  Attrs GetAttrs(const PadAttrs* param, const T* attrs) const {
-    ICHECK(param);
-    ICHECK(attrs);
-    ICHECK(attrs->data_layout.size() == param->pad_width.size())
-        << "Data Layout and padding attributes should have the same extent";
-
-    std::string data_layout = attrs->data_layout;
-    std::set<char> image_dims({'H', 'W', 'D'});
-    Array<PrimExpr> padding;
-    // If we're padding a non-spatial dimension, don't simplify
-    // Convolution can only pad on spatial axes
-    for (size_t i = 0; i < param->pad_width.size(); ++i) {
-      if (!image_dims.count(data_layout[i])) {
-        for (size_t j = 0; j < param->pad_width[i].size(); ++j) {
-          if (param->pad_width[i][j] != 0) {
-            return Attrs();
-          }
-        }
-      }
-    }
-    for (size_t j = 0; j < param->pad_width[0].size(); ++j) {
-      for (size_t i = 0; i < param->pad_width.size(); ++i) {
-        if (image_dims.count(data_layout[i])) {
-          padding.push_back(param->pad_width[i][j]);
-        }
-      }
-    }
-
-    return MakeConvAttrs(attrs, padding);
-  }
-  Expr callback(const Expr& pre, const Expr& post,
-                const Map<DFPattern, Array<Expr>>& node_map) const override {
-    const CallNode* call_node = post.as<CallNode>();
-    ICHECK(call_node);
-    auto pad = node_map[pad_][0];
-    const CallNode* pad_node = pad.as<CallNode>();
-    ICHECK(pad_node);
-    const PadAttrs* param = pad_node->attrs.as<PadAttrs>();
-    ICHECK(param);
-    if (param->pad_mode == "constant" && param->pad_value == 0.0) {
-      Attrs attrs;
-      if (node_map.count(conv1d_)) {
-        attrs = GetAttrs(param, call_node->attrs.as<Conv1DAttrs>());
-      } else if (node_map.count(conv2d_)) {
-        attrs = GetAttrs(param, call_node->attrs.as<Conv2DAttrs>());
-      } else if (node_map.count(conv3d_)) {
-        attrs = GetAttrs(param, call_node->attrs.as<Conv3DAttrs>());
-      } else {
-        return post;
-      }
-      if (!attrs.defined()) {
-        return post;
-      }
-      auto x = node_map[x_][0];
-      auto w = node_map[w_][0];
-      return Call(call_node->op, {x, w}, attrs, call_node->type_args, call_node->span);
-    }
-    return post;
-  }
-
- private:
-  /*! \brief Pattern input */
-  DFPattern x_;
-  /*! \brief Pattern input weight */
-  DFPattern w_;
-  /*! \brief Pattern pad */
-  DFPattern pad_;
-  /*! \brief Pattern conv */
-  DFPattern conv_;
-  DFPattern conv1d_;
-  DFPattern conv2d_;
-  DFPattern conv3d_;
-};
-
 /*!
  * \brief FullArgwhere finds full followed by argwhere and turns it into an Arange op
  */
@@ -278,7 +163,6 @@ class ExprSimplifier {
   explicit ExprSimplifier(IRModule mod) : mod_(mod) {
     CreateCallback(SimplifyReshape());
     CreateCallback(FullElementwise());
-    CreateCallback(SimplifyConvPad());
   }
   template <typename T>
   void CreateCallback(const T& pattern) {
diff --git a/tests/python/relay/test_pass_fold_explicit_padding.py b/tests/python/relay/test_pass_fold_explicit_padding.py
new file mode 100644
index 000000000000..302a2b91bb8f
--- /dev/null
+++ b/tests/python/relay/test_pass_fold_explicit_padding.py
@@ -0,0 +1,102 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+import tvm
+from tvm import relay
+from tvm.relay import transform
+from tvm.relay.testing import run_opt_pass
+
+import numpy as np
+
+
+def test_simplify_conv_pad():
+    convs = [relay.nn.conv1d, relay.nn.conv2d, relay.nn.conv3d]
+
+    def validate(ndim, pad_width, pad_value, pad_mode, orig_padding, layout):
+        if layout[1] == "C":
+            shape = [1, 3] + [10] * ndim
+            wshape = [8, 3] + [3] * ndim
+        elif layout[-1] == "C":
+            shape = [1] + [10] * ndim + [3]
+            wshape = [8] + [3] * ndim + [3]
+        else:
+            raise ValueError("This test only supports NC* and N*C")
+
+        x = relay.var("x", shape=shape, dtype="float32")
+        w = relay.var("w", shape=wshape, dtype="float32")
+        pad = relay.nn.pad(x, pad_width, pad_value, pad_mode)
+        if layout[1] == "C":
+            conv = convs[ndim - 1](pad, w, padding=orig_padding)
+        else:
+            conv = convs[ndim - 1](
+                pad, w, padding=orig_padding, data_layout=layout, kernel_layout="DHWIO"[3 - ndim :]
+            )
+
+        if pad_mode == "constant" and pad_value == 0:
+            new_padding = []
+            for j in range(2):
+                for i in range(len(pad_width)):
+                    if layout[i] in ["D", "H", "W"]:
+                        new_padding.append(pad_width[i][j])
+            for i in range(len(new_padding)):
+                new_padding[i] += orig_padding[i]
+            if layout[1] == "C":
+                after = convs[ndim - 1](x, w, padding=new_padding)
+            else:
+                after = convs[ndim - 1](
+                    x, w, padding=new_padding, data_layout=layout, kernel_layout="DHWIO"[3 - ndim :]
+                )
+        else:
+            after = conv
+
+        zz = run_opt_pass(conv, transform.FoldExplicitPadding())
+        expected = run_opt_pass(after, transform.InferType())
+        assert tvm.ir.structural_equal(zz, expected)
+
+        mod1 = tvm.IRModule.from_expr(conv)
+        mod2 = tvm.IRModule.from_expr(zz)
+
+        with tvm.transform.PassContext():
+            ex1 = relay.create_executor("vm", mod=mod1, ctx=tvm.cpu(), target="llvm")
+        ex2 = relay.create_executor("vm", mod=mod2, ctx=tvm.cpu(), target="llvm")
+        x_np = np.random.rand(*shape).astype("float32")
+        w_np = np.random.rand(*wshape).astype("float32")
+        result1 = ex1.evaluate()(x_np, w_np)
+        result2 = ex2.evaluate()(x_np, w_np)
+
+        tvm.testing.assert_allclose(result1.asnumpy(), result2.asnumpy(), rtol=1e-5, atol=1e-5)
+
+    for orig_pad in [[0, 0], [2, 0], [0, 2]]:
+        for i_pad in [[0, 0], [1, 1], [1, 0]]:
+            for ndim in [1, 2, 3]:
+                for channels_last in [0, 1]:
+                    if channels_last:
+                        layout = "NDHWC"
+                        layout = layout[0:1] + layout[4 - ndim : 4] + layout[-1:]
+                        padding = [[0, 0]] + [i_pad] * ndim + [[0, 0]]
+                    else:
+                        layout = "NCDHW"
+                        layout = layout[0:2] + layout[5 - ndim :]
+                        padding = [[0, 0]] * 2 + [i_pad] * ndim
+
+                    validate(ndim, padding, 0, "constant", orig_pad * ndim, layout)
+    ndim = 2
+    validate(ndim, [[0, 0]] * 2 + [i_pad] * ndim, 1, "constant", orig_pad * ndim, "NCHW")
+    validate(ndim, [[0, 0]] * 2 + [i_pad] * ndim, 0, "edge", orig_pad * ndim, "NCHW")
+
+
+if __name__ == "__main__":
+    test_simplify_conv_pad()
diff --git a/tests/python/relay/test_pass_simplify_expr.py b/tests/python/relay/test_pass_simplify_expr.py
index e3e497e930f9..9531d896b2ed 100644
--- a/tests/python/relay/test_pass_simplify_expr.py
+++ b/tests/python/relay/test_pass_simplify_expr.py
@@ -124,82 +124,6 @@ def after_right(x, elem_op, value):
                 validate(shape, value, dtype)
 
 
-def test_simplify_conv_pad():
-    convs = [relay.nn.conv1d, relay.nn.conv2d, relay.nn.conv3d]
-
-    def validate(ndim, pad_width, pad_value, pad_mode, orig_padding, layout):
-        if layout[1] == "C":
-            shape = [1, 3] + [10] * ndim
-            wshape = [8, 3] + [3] * ndim
-        elif layout[-1] == "C":
-            shape = [1] + [10] * ndim + [3]
-            wshape = [8] + [3] * ndim + [3]
-        else:
-            raise ValueError("This test only supports NC* and N*C")
-
-        x = relay.var("x", shape=shape, dtype="float32")
-        w = relay.var("w", shape=wshape, dtype="float32")
-        pad = relay.nn.pad(x, pad_width, pad_value, pad_mode)
-        if layout[1] == "C":
-            conv = convs[ndim - 1](pad, w, padding=orig_padding)
-        else:
-            conv = convs[ndim - 1](
-                pad, w, padding=orig_padding, data_layout=layout, kernel_layout="DHWIO"[3 - ndim :]
-            )
-
-        if pad_mode == "constant" and pad_value == 0:
-            new_padding = []
-            for j in range(2):
-                for i in range(len(pad_width)):
-                    if layout[i] in ["D", "H", "W"]:
-                        new_padding.append(pad_width[i][j])
-            for i in range(len(new_padding)):
-                new_padding[i] += orig_padding[i]
-            if layout[1] == "C":
-                after = convs[ndim - 1](x, w, padding=new_padding)
-            else:
-                after = convs[ndim - 1](
-                    x, w, padding=new_padding, data_layout=layout, kernel_layout="DHWIO"[3 - ndim :]
-                )
-        else:
-            after = conv
-
-        zz = run_opt_pass(conv, transform.SimplifyExpr())
-        expected = run_opt_pass(after, transform.InferType())
-        assert tvm.ir.structural_equal(zz, expected)
-
-        mod1 = tvm.IRModule.from_expr(conv)
-        mod2 = tvm.IRModule.from_expr(zz)
-
-        with tvm.transform.PassContext(disabled_pass="SimplifyExpr"):
-            ex1 = relay.create_executor("vm", mod=mod1, ctx=tvm.cpu(), target="llvm")
-        ex2 = relay.create_executor("vm", mod=mod2, ctx=tvm.cpu(), target="llvm")
-        x_np = np.random.rand(*shape).astype("float32")
-        w_np = np.random.rand(*wshape).astype("float32")
-        result1 = ex1.evaluate()(x_np, w_np)
-        result2 = ex2.evaluate()(x_np, w_np)
-
-        tvm.testing.assert_allclose(result1.asnumpy(), result2.asnumpy())
-
-    for orig_pad in [[0, 0], [2, 0], [0, 2]]:
-        for i_pad in [[0, 0], [1, 1], [1, 0]]:
-            for ndim in [1, 2, 3]:
-                for channels_last in [0, 1]:
-                    if channels_last:
-                        layout = "NDHWC"
-                        layout = layout[0:1] + layout[4 - ndim : 4] + layout[-1:]
-                        padding = [[0, 0]] + [i_pad] * ndim + [[0, 0]]
-                    else:
-                        layout = "NCDHW"
-                        layout = layout[0:2] + layout[5 - ndim :]
-                        padding = [[0, 0]] * 2 + [i_pad] * ndim
-
-                    validate(ndim, padding, 0, "constant", orig_pad * ndim, layout)
-    ndim = 2
-    validate(ndim, [[0, 0]] * 2 + [i_pad] * ndim, 1, "constant", orig_pad * ndim, "NCHW")
-    validate(ndim, [[0, 0]] * 2 + [i_pad] * ndim, 0, "edge", orig_pad * ndim, "NCHW")
-
-
 if __name__ == "__main__":
     test_simplify_reshape()
     test_simplify_full_elementwise()

From 875f8ee2a704d1de6d6a681ac2c7b7073b73e79c Mon Sep 17 00:00:00 2001
From: Altan Haan <ahaan@octoml.ai>
Date: Sat, 6 Mar 2021 01:27:49 -0800
Subject: [PATCH 294/357] [Executor][Bugfix] Properly return and unflatten
 outputs from GraphExecutor (#7604)

* properly return and unflatten outputs from GraphExecutor

* lint

* cleaner approach, not sure what I was thinking before

* remove unused import

* forgot copyto cpu

* make solution even cleaner using iterator
---
 python/tvm/relay/build_module.py              | 24 ++++++++++++-------
 .../relay/test_backend_graph_runtime.py       | 21 ++++++++++++++++
 2 files changed, 37 insertions(+), 8 deletions(-)

diff --git a/python/tvm/relay/build_module.py b/python/tvm/relay/build_module.py
index 79eb7e4f19ff..4c9a898f2374 100644
--- a/python/tvm/relay/build_module.py
+++ b/python/tvm/relay/build_module.py
@@ -391,10 +391,20 @@ def _make_executor(self, expr=None):
         ret_type = self.mod["main"].checked_type.ret_type
         if _ty.is_dynamic(ret_type):
             raise ValueError("Graph Runtime only supports static graphs, got output type", ret_type)
-        num_outputs = len(ret_type.fields) if isinstance(ret_type, _ty.TupleType) else 1
         mod = build(self.mod, target=self.target)
         gmodule = _graph_rt.GraphModule(mod["default"](self.ctx))
 
+        def _unflatten(flat_iter, cur_type):
+            if isinstance(cur_type, _ty.TensorType):
+                return next(flat_iter)
+            if isinstance(cur_type, _ty.TupleType):
+                fields = []
+                for field_type in cur_type.fields:
+                    field = _unflatten(flat_iter, field_type)
+                    fields.append(field)
+                return fields
+            raise ValueError("Return type", ret_type, "contains unsupported type", cur_type)
+
         def _graph_wrapper(*args, **kwargs):
             args = self._convert_args(self.mod["main"], args, kwargs)
             # Create map of inputs.
@@ -402,13 +412,11 @@ def _graph_wrapper(*args, **kwargs):
                 gmodule.set_input(i, arg)
             # Run the module, and fetch the output.
             gmodule.run()
-            # make a copy so multiple invocation won't hurt perf.
-            if num_outputs == 1:
-                return gmodule.get_output(0).copyto(_nd.cpu(0))
-            outputs = []
-            for i in range(num_outputs):
-                outputs.append(gmodule.get_output(i).copyto(_nd.cpu(0)))
-            return outputs
+            flattened = []
+            for i in range(gmodule.get_num_outputs()):
+                flattened.append(gmodule.get_output(i).copyto(_nd.cpu(0)))
+            unflattened = _unflatten(iter(flattened), ret_type)
+            return unflattened
 
         return _graph_wrapper
 
diff --git a/tests/python/relay/test_backend_graph_runtime.py b/tests/python/relay/test_backend_graph_runtime.py
index 3c42b7b4196f..68708aaeb413 100644
--- a/tests/python/relay/test_backend_graph_runtime.py
+++ b/tests/python/relay/test_backend_graph_runtime.py
@@ -209,6 +209,27 @@ def test_compile_nested_tuples():
         ref = ref + 1
 
 
+def test_graph_executor_nested_tuples():
+    x, y, z, w = [relay.var(c, shape=(2, 3), dtype="float32") for c in "xyzw"]
+    out = relay.Tuple([x, relay.Tuple([y, relay.Tuple([z, w])])])
+    func = relay.Function([x, y, z, w], out)
+
+    exe = relay.create_executor(
+        kind="graph", mod=tvm.IRModule.from_expr(func), ctx=tvm.cpu(0), target="llvm"
+    )
+    f = exe.evaluate()
+
+    data = [np.random.uniform(size=(2, 3)).astype("float32") for _ in "xyzw"]
+    out = f(*data)
+    assert len(out) == 2
+    tvm.testing.assert_allclose(out[0].asnumpy(), data[0])
+    assert len(out[1]) == 2
+    tvm.testing.assert_allclose(out[1][0].asnumpy(), data[1])
+    assert len(out[1][1]) == 2
+    tvm.testing.assert_allclose(out[1][1][0].asnumpy(), data[2])
+    tvm.testing.assert_allclose(out[1][1][1].asnumpy(), data[3])
+
+
 if __name__ == "__main__":
     test_plan_memory()
     test_with_params()

From 8aa2a7cdbc81a0633b1f78ab28f31921e9fa9e98 Mon Sep 17 00:00:00 2001
From: "Huang, Guangtai" <hgt312@foxmail.com>
Date: Sat, 6 Mar 2021 18:24:49 +0800
Subject: [PATCH 295/357] [CUDA] BF16 support (#7014)

---
 include/tvm/runtime/data_type.h               |  9 ++-
 python/tvm/contrib/nvcc.py                    | 16 ++++-
 python/tvm/runtime/ndarray.py                 |  4 +-
 src/target/source/codegen_cuda.cc             | 72 ++++++++++++++++++-
 src/target/source/codegen_cuda.h              |  4 +-
 src/target/source/intrin_rule_cuda.cc         |  2 +
 src/target/source/literal/cuda_half_t.h       | 24 +++++++
 .../unittest/test_target_codegen_cuda.py      | 50 ++++++++++++-
 8 files changed, 174 insertions(+), 7 deletions(-)

diff --git a/include/tvm/runtime/data_type.h b/include/tvm/runtime/data_type.h
index d705be6c4deb..7d914ce6bff9 100644
--- a/include/tvm/runtime/data_type.h
+++ b/include/tvm/runtime/data_type.h
@@ -160,12 +160,19 @@ class DataType {
    */
   static DataType UInt(int bits, int lanes = 1) { return DataType(kDLUInt, bits, lanes); }
   /*!
-   * \brief Construct an uint type.
+   * \brief Construct an float type.
    * \param bits The number of bits in the type.
    * \param lanes The number of lanes
    * \return The constructed data type.
    */
   static DataType Float(int bits, int lanes = 1) { return DataType(kDLFloat, bits, lanes); }
+  /*!
+   * \brief Construct an bfloat type.
+   * \param bits The number of bits in the type.
+   * \param lanes The number of lanes
+   * \return The constructed data type.
+   */
+  static DataType BFloat(int bits, int lanes = 1) { return DataType(kDLBfloat, bits, lanes); }
   /*!
    * \brief Construct a bool type.
    * \param lanes The number of lanes
diff --git a/python/tvm/contrib/nvcc.py b/python/tvm/contrib/nvcc.py
index 2a97b0b31d1e..f33603b923a5 100644
--- a/python/tvm/contrib/nvcc.py
+++ b/python/tvm/contrib/nvcc.py
@@ -302,8 +302,22 @@ def have_tensorcore(compute_version=None, target=None):
             major, minor = compute_version.split("_")[1]
             compute_version = major + "." + minor
     major, _ = parse_compute_version(compute_version)
+    if major >= 7:
+        return True
+
+    return False
+
+
+def have_bf16(compute_version):
+    """Either bf16 support is provided in the compute capability or not
 
-    if major == 7:
+    Parameters
+    ----------
+    compute_version : str
+        compute capability of a GPU (e.g. "8.0")
+    """
+    major, _ = parse_compute_version(compute_version)
+    if major >= 8:
         return True
 
     return False
diff --git a/python/tvm/runtime/ndarray.py b/python/tvm/runtime/ndarray.py
index 75da3d4a5c17..5c60515e3448 100644
--- a/python/tvm/runtime/ndarray.py
+++ b/python/tvm/runtime/ndarray.py
@@ -148,7 +148,9 @@ def copyfrom(self, source_array):
                     source_array.shape, shape
                 )
             )
-        source_array = np.ascontiguousarray(source_array, dtype=dtype)
+        source_array = np.ascontiguousarray(
+            source_array, dtype="uint16" if dtype == "bfloat16" else dtype
+        )
         assert source_array.flags["C_CONTIGUOUS"]
         data = source_array.ctypes.data_as(ctypes.c_void_p)
         nbytes = ctypes.c_size_t(source_array.size * source_array.dtype.itemsize)
diff --git a/src/target/source/codegen_cuda.cc b/src/target/source/codegen_cuda.cc
index 2e9babacc441..e54acd2221d1 100644
--- a/src/target/source/codegen_cuda.cc
+++ b/src/target/source/codegen_cuda.cc
@@ -61,6 +61,18 @@ std::string CodeGenCUDA::Finish() {
     decl_stream << _cuda_half_util;
   }
 
+  if (enable_bf16_) {
+    decl_stream << "#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)\n";
+    decl_stream << "#include <cuda_bf16.h>\n";
+    decl_stream << "__device__ nv_bfloat16 max"
+                << "(nv_bfloat16 a, nv_bfloat16 b)\n"
+                << "{\n  return __hgt(a, b) ? a : b;\n}\n";
+    decl_stream << "__device__ nv_bfloat16 min(nv_bfloat16 a, nv_bfloat16 b)\n"
+                << "{\n  return __hlt(a, b) ? a : b;\n}\n";
+    decl_stream << "#endif\n\n";
+    decl_stream << _cuda_bfloat16_util;
+  }
+
   if (enable_warp_shuffle_) {
     decl_stream << _cuda_warp_intrinsic_util;
   }
@@ -170,6 +182,17 @@ void CodeGenCUDA::PrintType(DataType t, std::ostream& os) {  // NOLINT(*)
       os << lanes;
       return;
     }
+  } else if (t.is_bfloat16()) {
+    enable_bf16_ = true;
+    if (t.is_scalar()) {
+      os << "nv_bfloat16";
+    } else if (lanes <= 8) {
+      ICHECK_EQ(lanes % 2, 0) << "only support even lane for half type";
+      os << "uint" << lanes / 2;
+    } else {
+      fail = true;
+    }
+    if (!fail) return;
   } else if (t == DataType::Bool()) {
     os << "bool";
     return;
@@ -382,6 +405,8 @@ void CodeGenCUDA::PrintVecElemLoad(const std::string& vec, DataType t, int i,
     }
   } else if (t.is_float16()) {
     os << "((half2*)(&(" << vec << "." << access[i / 2] << ")))->" << access[i % 2];
+  } else if (t.is_bfloat16()) {
+    os << "((nv_bfloat162*)(&(" << vec << "." << access[i / 2] << ")))->" << access[i % 2];
   } else if (t.lanes() > 4 && t.lanes() <= 8) {
     std::string type_name;
     if (t.bits() == 16) {
@@ -427,6 +452,9 @@ void CodeGenCUDA::PrintVecElemStore(const std::string& vec, DataType t, int i,
   } else if (t.is_float16()) {
     stream << "((half2*)(&(" << vec << "." << access[i / 2] << ")))->" << access[i % 2] << " = "
            << value << ";\n";
+  } else if (t.is_bfloat16()) {
+    stream << "((nv_bfloat162*)(&(" << vec << "." << access[i / 2] << ")))->" << access[i % 2]
+           << " = " << value << ";\n";
   } else if (t.lanes() > 4 && t.lanes() <= 8) {
     std::string type_name;
     if (t.bits() == 16) {
@@ -687,7 +715,8 @@ void CodeGenCUDA::VisitStmt_(const AllocateNode* op) {
     if (scope == "wmma.matrix_a" || scope == "wmma.matrix_b") {
       ICHECK(op->dtype == DataType::Float(16) || op->dtype == DataType::Int(8) ||
              op->dtype == DataType::UInt(8) || op->dtype == DataType::Int(4) ||
-             op->dtype == DataType::UInt(4) || op->dtype == DataType::Int(1))
+             op->dtype == DataType::UInt(4) || op->dtype == DataType::Int(1) ||
+             op->dtype == DataType::BFloat(16))
           << "Matrix_a and matrix_b only support half or char or unsigned char "
           << "or uint4 or int4 or int1 type for now";
     } else {
@@ -767,6 +796,19 @@ void CodeGenCUDA::VisitExpr_(const BroadcastNode* op, std::ostream& os) {  // NO
     return;
   }
 
+  if (op->dtype.is_bfloat16()) {
+    std::string v = PrintExpr(op->value);
+    os << "make_";
+    PrintType(op->dtype, os);
+    os << '(';
+    for (int i = 0; i < op->lanes / 2; ++i) {
+      if (i != 0) os << ", ";
+      os << "__pack_nv_bfloat162(" << v << ", " << v << ")";
+    }
+    os << ')';
+    return;
+  }
+
   std::string v = PrintExpr(op->value);
   os << "make_";
   PrintType(op->dtype, os);
@@ -836,6 +878,13 @@ void CodeGenCUDA::VisitExpr_(const SelectNode* op, std::ostream& os) {
 }
 
 inline void PrintConst(const FloatImmNode* op, std::ostream& os, CodeGenCUDA* p) {  // NOLINT(*)
+  // Type code is kBFloat
+  if (op->dtype.is_bfloat16()) {
+    os << "__float2bfloat16_rn";
+    os << '(' << std::scientific << op->value << 'f' << ')';
+    return;
+  }
+  // Type code is kFloat
   switch (op->dtype.bits()) {
     case 64:
     case 32: {
@@ -938,7 +987,7 @@ void CodeGenCUDA::HandleVolatileLoads(const std::string& value, const LoadNode*
   // Cast away volatile qualifier for fp16 types. That is, only loads and
   // stores are volatile. The loaded objects are not marked as volatile.
   //
-  if (op->dtype.is_float16() && IsVolatile(op->buffer_var.get())) {
+  if ((op->dtype.is_float16() || op->dtype.is_bfloat16()) && IsVolatile(op->buffer_var.get())) {
     os << "(";
     PrintType(op->dtype, os);
     os << ")(" << value << ")";
@@ -979,6 +1028,25 @@ void CodeGenCUDA::PrintVecElemLoadExpr(DataType t, int i, const std::string& val
     return;
   }
 
+  if (t.is_bfloat16()) {
+    if (i == 0) {
+      os << "make_";
+      PrintType(t, os);
+      os << '(';
+    }
+    if (i % 2 == 0) {
+      os << "__pack_bfloat162(" << value;
+    } else {
+      os << "," << value << ")";
+      if (i != t.lanes() - 1) {
+        os << ",";
+      } else {
+        os << ")";
+      }
+    }
+    return;
+  }
+
   if (i == 0) {
     os << "make_";
     PrintType(t, os);
diff --git a/src/target/source/codegen_cuda.h b/src/target/source/codegen_cuda.h
index 3cde8e379eb4..2098b8ac8344 100644
--- a/src/target/source/codegen_cuda.h
+++ b/src/target/source/codegen_cuda.h
@@ -42,7 +42,7 @@ class CodeGenCUDA final : public CodeGenC {
   void Init(bool output_ssa);
   std::string Finish();
   bool need_include_path() {
-    return (enable_fp16_ || enable_int8_ || need_math_constants_h_ || need_mma_h_);
+    return (enable_fp16_ || enable_bf16_ || enable_int8_ || need_math_constants_h_ || need_mma_h_);
   }
   // override behavior
   void PrintFuncPrefix() final;
@@ -88,6 +88,8 @@ class CodeGenCUDA final : public CodeGenC {
   std::string vid_global_barrier_expect_;
   // whether enable fp16
   bool enable_fp16_{false};
+  // whether enable bf16
+  bool enable_bf16_{false};
   // whether enable int8
   bool enable_int8_{false};
   // whether enable warp shuffle intrinsics
diff --git a/src/target/source/intrin_rule_cuda.cc b/src/target/source/intrin_rule_cuda.cc
index 5c562f7b1643..965b86c24d9e 100644
--- a/src/target/source/intrin_rule_cuda.cc
+++ b/src/target/source/intrin_rule_cuda.cc
@@ -43,6 +43,8 @@ struct CUDAMath {
         default:
           return "";
       }
+    } else if (t.is_bfloat16()) {
+      return 'h' + name;
     }
     return "";
   }
diff --git a/src/target/source/literal/cuda_half_t.h b/src/target/source/literal/cuda_half_t.h
index f8e92d508d88..3888f3a4fb07 100644
--- a/src/target/source/literal/cuda_half_t.h
+++ b/src/target/source/literal/cuda_half_t.h
@@ -311,6 +311,30 @@ static inline __device__ __host__ half htanh(half x) {
 #endif
 )";
 
+static constexpr const char* _cuda_bfloat16_util = R"(
+// Pack two bfloat16 values.
+static inline __device__ __host__ unsigned
+__pack_nv_bfloat162(const nv_bfloat16 x, const nv_bfloat16 y) {
+  unsigned v0 = *((unsigned short *)&x);
+  unsigned v1 = *((unsigned short *)&y);
+  return (v1 << 16) | v0;
+}
+
+// fix undefined fp16 match function
+static inline __device__ __host__ nv_bfloat16 hpow(nv_bfloat16 x, nv_bfloat16 y) {
+  float tmp_x = __bfloat162float(x);
+  float tmp_y = __bfloat162float(y);
+  float result = powf(tmp_x, tmp_y);
+  return __float2bfloat16(result);
+}
+
+static inline __device__ __host__ nv_bfloat16 htanh(nv_bfloat16 x) {
+  float tmp_x = __bfloat162float(x);
+  float result = tanhf(tmp_x);
+  return __float2bfloat16(result);
+}
+)";
+
 static constexpr const char* _cuda_warp_intrinsic_util = R"(
 #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 700)
 #define __shfl_sync(mask, var, lane, width) \
diff --git a/tests/python/unittest/test_target_codegen_cuda.py b/tests/python/unittest/test_target_codegen_cuda.py
index a228a640f108..06d7cb4bb7bb 100644
--- a/tests/python/unittest/test_target_codegen_cuda.py
+++ b/tests/python/unittest/test_target_codegen_cuda.py
@@ -19,7 +19,7 @@
 import numpy as np
 from tvm import topi
 import unittest
-from tvm.contrib.nvcc import have_fp16, have_int8
+from tvm.contrib.nvcc import have_fp16, have_int8, have_bf16
 from tvm.contrib import nvcc
 import tvm.testing
 
@@ -67,6 +67,53 @@ def check_cuda(dtype, n, lanes):
     check_cuda("float16", 64, 8)
 
 
+@tvm.testing.requires_gpu
+@tvm.testing.requires_cuda
+def test_cuda_bf16_vectorize_add():
+    if not have_bf16(tvm.gpu(0).compute_version):
+        print("skip because gpu does not support bf16")
+        return
+    num_thread = 8
+
+    def np_float2np_bf16(arr):
+        """Convert a numpy array of float to a numpy array
+        of bf16 in uint16"""
+        orig = arr.view("<u4")
+        bias = np.bitwise_and(np.right_shift(orig, 16), 1) + 0x7FFF
+        return np.right_shift(orig + bias, 16).astype("uint16")
+
+    def np_bf162np_float(arr):
+        """Convert a numpy array of bf16 (uint16) to a numpy array
+        of float"""
+        u32 = np.left_shift(arr.astype("uint32"), 16)
+        return u32.view("<f4")
+
+    def check_cuda(n, lanes):
+        A = te.placeholder((n,), name="A", dtype="bfloat16x%d" % lanes)
+        B = te.compute((n,), lambda i: A[i] + tvm.tir.const(1, A.dtype), name="B")
+        s = te.create_schedule(B.op)
+        xo, xi = s[B].split(B.op.axis[0], factor=num_thread)
+        s[B].bind(xo, bx)
+        s[B].bind(xi, tx)
+        with tvm.transform.PassContext(
+            disabled_pass=["tir.BF16Promote", "tir.BF16CastElimination", "tir.BF16TypeLowering"]
+        ):
+            fun = tvm.build(s, [A, B], "cuda")
+        ctx = tvm.gpu(0)
+        np_a = np.random.uniform(size=(n, lanes)).astype("float32")
+        np_a = np_bf162np_float(np_float2np_bf16(np_a))
+        a = tvm.nd.empty((n,), A.dtype, ctx).copyfrom(np_float2np_bf16(np_a))
+        c = tvm.nd.empty((n,), B.dtype, ctx)
+        fun(a, c)
+        c = tvm.nd.empty((n, lanes), "uint16", ctx).copyfrom(c)
+        tvm.testing.assert_allclose(c.asnumpy(), np_float2np_bf16(np_a + 1))
+
+    check_cuda(64, 2)
+    check_cuda(64, 4)
+    check_cuda(64, 6)
+    check_cuda(64, 8)
+
+
 @tvm.testing.requires_gpu
 @tvm.testing.requires_cuda
 def test_cuda_multiply_add():
@@ -922,6 +969,7 @@ def test_unrolled_vectorization():
 
 if __name__ == "__main__":
     test_cuda_vectorize_add()
+    test_cuda_bf16_vectorize_add()
     test_cuda_multiply_add()
     test_cuda_vectorize_load()
     test_cuda_make_int8()

From 760e9b2b0423f5b66f206d9a9195a1b2b5a007a3 Mon Sep 17 00:00:00 2001
From: masahi <masahi129@gmail.com>
Date: Mon, 8 Mar 2021 16:07:23 +0900
Subject: [PATCH 296/357] [Torch, QNN] Support quantized mobilenet v3 from
 torch 1.8 (#7606)

* [Torch] support hardsigmoid

* qhswish first impl

* add qhardsigmoid but the result is not correct

* add qmv3 to test

* comment fix
---
 python/tvm/relay/frontend/pytorch.py          | 36 ++++++++++++---
 python/tvm/relay/frontend/qnn_torch.py        | 34 ++++++++++++++
 src/relay/transforms/fold_explicit_padding.cc |  2 +-
 tests/python/frontend/pytorch/qnn_test.py     | 44 +++++++++----------
 tests/python/frontend/pytorch/test_forward.py | 10 ++++-
 5 files changed, 96 insertions(+), 30 deletions(-)

diff --git a/python/tvm/relay/frontend/pytorch.py b/python/tvm/relay/frontend/pytorch.py
index e5ad57c6b87a..c709e2b4e7bd 100644
--- a/python/tvm/relay/frontend/pytorch.py
+++ b/python/tvm/relay/frontend/pytorch.py
@@ -34,6 +34,7 @@
 from .. import expr as _expr
 from .. import function as _function
 from .. import op as _op
+from .. import qnn
 from ..ty import TupleType, TensorType, Any
 from ..loops import while_loop
 from .. import transform
@@ -805,14 +806,35 @@ def log_sigmoid(self, inputs, input_types):
         data = inputs[0]
         return _op.log(_op.tensor.sigmoid(data))
 
-    def hard_swish(self, inputs, input_types):
-        data = inputs[0]
-        dtype = input_types[0]
+    def hard_sigmoid(self, inputs, input_types):
+        def _relu6(x):
+            return _op.tensor.clip(x, 0.0, 6.0)
 
-        def _relu6(input_tensor):
-            return _op.tensor.clip(input_tensor, 0.0, 6.0)
+        def func(x):
+            return _relu6(x + _expr.const(3.0)) / _expr.const(6.0)
+
+        if self.is_quantized_tensor(inputs[0]):
+            input_scale = _expr.const(inputs[1])
+            input_zero_point = _expr.const(inputs[2])
+            # PyTorch seems to use the following output qparams, but accuracy
+            # is broken if we use this.
+            # TODO(masahi): Revisit this parameter choice
+            #
+            # Taken from src/ATen/native/quantized/cpu/kernels/QuantizedOpKernels.cpp
+            # output_scale = _expr.const(0.00390625)  # 1.0 / 2^8
+            # output_zero_point = _expr.const(-128)
+            output_scale = input_scale
+            output_zero_point = input_zero_point
+
+            data = qnn.op.dequantize(inputs[0], input_scale, input_zero_point, axis=1)
+            out = func(data)
+            return qnn.op.quantize(out, output_scale, output_zero_point, out_dtype="uint8")
+
+        return func(inputs[0])
 
-        return data * _relu6(data + _expr.const(3.0, dtype=dtype)) / _expr.const(6.0, dtype=dtype)
+    def hard_swish(self, inputs, input_types):
+        data = inputs[0]
+        return data * self.hard_sigmoid(inputs, input_types)
 
     def adaptive_avg_pool_2d(self, inputs, input_types):
         data = inputs[0]
@@ -2418,6 +2440,8 @@ def create_convert_map(self):
             "aten::__not__": self.logical_not,
             "aten::hardswish_": self.hard_swish,
             "aten::hardswish": self.hard_swish,
+            "aten::hardsigmoid_": self.hard_sigmoid,
+            "aten::hardsigmoid": self.hard_sigmoid,
             "aten::cumsum": self.cumsum,
             "aten::masked_fill": self.masked_fill,
             "aten::masked_select": self.masked_select,
diff --git a/python/tvm/relay/frontend/qnn_torch.py b/python/tvm/relay/frontend/qnn_torch.py
index e3431043bc86..2b85a1f3a1be 100644
--- a/python/tvm/relay/frontend/qnn_torch.py
+++ b/python/tvm/relay/frontend/qnn_torch.py
@@ -191,6 +191,7 @@ def _get_quant_param_for_input(input_value):
         "quantized::cat": (2, 3),
         "quantized::mul_scalar": (2, 3),
         "quantized::add_scalar": (2, 3),
+        "quantized::hardswish": (1, 2),
     }
 
     def dfs(current_node):
@@ -358,6 +359,8 @@ def add_input_quant_params_to_op_inputs(graph):
         "quantized::add_scalar": 1,
         "quantized::mul_scalar": 1,
         "quantized::relu6": 1,
+        "quantized::hardswish": 1,
+        "aten::hardsigmoid": 1,
     }
 
     need_input_quant_param = set(num_quantized_inputs.keys())
@@ -765,6 +768,7 @@ def _impl(inputs, _):
         out_zp = _expr.const(inputs[3])
 
         if q_min > z - c_q or q_max < z - c_q:
+            # TODO(masahi): Replace this with integer only compute
             dequant = relay.qnn.op.dequantize(inputs[0], _expr.const(s), _expr.const(z))
             dequantized_add = _op.tensor.add(dequant, _expr.const(c_q * s))
             return relay.qnn.op.quantize(
@@ -820,6 +824,35 @@ def _impl(inputs, _):
     return _impl
 
 
+def _hswish():
+    # refer to src/ATen/native/quantized/cpu/kernels/QuantizedOpKernels.cpp
+    # They fallback to fp32
+    def _impl(inputs, _):
+        assert len(inputs) == 5, "Input quant params not found in op inputs"
+        # TODO(masahi): Replace this with integer only compute.
+        # We do not have to strictly follow how PyTorch does it.
+
+        def relu6(x):
+            return _op.tensor.clip(x, 0.0, 6.0)
+
+        def hardsigmoid(x):
+            dtype = "float32"
+            return relu6(x + _expr.const(3.0, dtype=dtype)) / _expr.const(6.0, dtype=dtype)
+
+        output_scale = _expr.const(inputs[1])
+        output_zero_point = _expr.const(inputs[2])
+        input_scale = _expr.const(inputs[3])
+        input_zero_point = _expr.const(inputs[4])
+
+        dequant = relay.qnn.op.dequantize(inputs[0], input_scale, input_zero_point, axis=1)
+        dequantized_hswish = dequant * hardsigmoid(dequant)
+        return relay.qnn.op.quantize(
+            dequantized_hswish, output_scale, output_zero_point, out_dtype="uint8"
+        )
+
+    return _impl
+
+
 def _linear_dynamic():
     def _calculate_qparam(inp):
         # reference ATen/native/quantized/cpu/qlinear_dynamic.cpp
@@ -906,4 +939,5 @@ def _impl(inputs, _):
     "quantized::mul_scalar": _mul_scalar(),
     "quantized::relu6": _relu6(),
     "quantized::linear_dynamic": _linear_dynamic(),
+    "quantized::hardswish": _hswish(),
 }
diff --git a/src/relay/transforms/fold_explicit_padding.cc b/src/relay/transforms/fold_explicit_padding.cc
index d606eb445a79..bab8b814df05 100644
--- a/src/relay/transforms/fold_explicit_padding.cc
+++ b/src/relay/transforms/fold_explicit_padding.cc
@@ -182,7 +182,7 @@ class SimplifyExplicitPadding {
 };
 
 /*!
- * \brief ImplicitPadding finds explict padding before an op that can
+ * \brief FoldExplicitPadding finds explict padding before an op that can
  * support implicit padding and fuses them.
  */
 Expr FoldExplicitPadding(const Expr& expr, const IRModule& mod) {
diff --git a/tests/python/frontend/pytorch/qnn_test.py b/tests/python/frontend/pytorch/qnn_test.py
index 07e52b7079e8..29c69abba542 100644
--- a/tests/python/frontend/pytorch/qnn_test.py
+++ b/tests/python/frontend/pytorch/qnn_test.py
@@ -41,7 +41,6 @@ def torch_version_check():
 
 
 def get_tvm_runtime(script_module, input_name, ishape):
-
     input_shapes = [(input_name, ishape)]
     mod, params = relay.frontend.from_pytorch(script_module, input_shapes)
 
@@ -125,43 +124,40 @@ def fuse_model(self):
 
 # Mobilenet V3 related modules
 class Hsigmoid(nn.Module):
-    def __init__(self, inplace=True, add_stub=False):
+    def __init__(self, add_stub=False):
         super().__init__()
-        self.float_op = nn.quantized.FloatFunctional()
-        self.relu6 = nn.ReLU6(inplace=inplace)
         self.quant = QuantStub()
         self.dequant = DeQuantStub()
         self.add_stub = add_stub
+        self.hsigmoid = nn.Hardsigmoid()
 
     def forward(self, x):
         if self.add_stub:
             x = self.quant(x)
-        relu6 = self.relu6(self.float_op.add_scalar(x, 3.0))
-        mul = self.float_op.mul_scalar(relu6, 1 / 6.0)
+        x = self.hsigmoid(x)
         if self.add_stub:
-            mul = self.dequant(mul)
-        return mul
+            x = self.dequant(x)
+        return x
 
     def fuse_model(self):
         pass
 
 
 class Hswish(nn.Module):
-    def __init__(self, inplace=True, add_stub=False):
-        super(Hswish, self).__init__()
-        self.float_op = nn.quantized.FloatFunctional()
-        self.hsigmoid = Hsigmoid(inplace, add_stub=False)
+    def __init__(self, add_stub=False):
+        super().__init__()
         self.quant = QuantStub()
         self.dequant = DeQuantStub()
         self.add_stub = add_stub
+        self.hswish = nn.Hardswish()
 
     def forward(self, x):
         if self.add_stub:
             x = self.quant(x)
-        mul = self.float_op.mul(x, self.hsigmoid(x))
+        x = self.hswish(x)
         if self.add_stub:
-            mul = self.dequant(mul)
-        return mul
+            x = self.dequant(x)
+        return x
 
     def fuse_model(self):
         pass
@@ -274,18 +270,12 @@ def test_quantized_modules():
             ("conv_bn_relu" + postfix, imagenet_ishape, ConvBn(with_relu=True), per_channel),
             ("linear" + postfix, (16, 16), Linear(), per_channel),
             ("linear_relu" + postfix, (16, 16), Linear(with_relu=True), per_channel),
-        ]
-
-    if torch_version_check():
-        qmodules += [
             ("hsigmoid", imagenet_ishape, Hsigmoid(add_stub=True), False),
             ("hswish", imagenet_ishape, Hswish(add_stub=True), False),
             ("semodule", (1, 16, 64, 64), SqueezeExcite(16, add_stub=True), False),
             ("semodule, per_channel", (1, 16, 64, 64), SqueezeExcite(16, add_stub=True), True),
             ("mul_scalar negative", imagenet_ishape, MulScalarNegative(), False),
         ]
-    else:
-        print("Skipping tests that require torch > 1.4")
 
     for (module_name, ishape, raw_module, per_channel) in qmodules:
         raw_module.eval()
@@ -372,6 +362,13 @@ def get_imagenet_input():
             # ("googlenet", qgooglenet(pretrained=True), per_channel),
         ]
 
+    if is_version_greater_than("1.7.1"):
+        from torchvision.models.quantization import mobilenet_v3_large as qmobilenet_v3_large
+
+        qmodels.append(
+            ("mobilenet_v3_large", qmobilenet_v3_large(pretrained=True, quantize=True).eval(), True)
+        )
+
     results = []
 
     for (model_name, raw_model, per_channel) in qmodels:
@@ -385,7 +382,10 @@ def get_imagenet_input():
         inp = get_imagenet_input()
         pt_inp = torch.from_numpy(inp)
 
-        quantize_model(raw_model, pt_inp, per_channel=per_channel)
+        if "mobilenet_v3_large" not in model_name:
+            # mv3 was qat-ed, quantize=True option above makes it already quantized
+            quantize_model(raw_model, pt_inp, per_channel=per_channel)
+
         script_module = torch.jit.trace(raw_model, pt_inp).eval()
 
         with torch.no_grad():
diff --git a/tests/python/frontend/pytorch/test_forward.py b/tests/python/frontend/pytorch/test_forward.py
index 24f8edab7d98..83c1698799c7 100644
--- a/tests/python/frontend/pytorch/test_forward.py
+++ b/tests/python/frontend/pytorch/test_forward.py
@@ -3651,6 +3651,13 @@ def test_hard_swish():
         verify_model(torch.nn.Hardswish(inplace=True).eval(), input_data=input)
 
 
+def test_hard_sigmoid():
+    examples = [torch.rand(8).float(), torch.rand(8, 10).float(), torch.rand(1, 1, 10).float()]
+    for input in examples:
+        verify_model(torch.nn.Hardsigmoid().eval(), input_data=input)
+        verify_model(torch.nn.Hardsigmoid(inplace=True).eval(), input_data=input)
+
+
 def test_cumsum():
     def test_fn(dim, dtype=None):
         return lambda x: torch.cumsum(x, dim=dim, dtype=dtype)
@@ -3893,6 +3900,8 @@ def test_fn(is_sorted, return_inverse, return_counts):
     test_logical_and()
     test_masked_select()
     test_unique()
+    test_hard_swish()
+    test_hard_sigmoid()
 
     # Model tests
     test_resnet18()
@@ -3931,4 +3940,3 @@ def test_fn(is_sorted, return_inverse, return_counts):
 
     # Test convert torch script(jit) with specific inputs' types
     test_convert_torch_script_with_input_types()
-    test_hard_swish()

From cc7f8dc2f4b2492872a3e56365a3cca07997ed97 Mon Sep 17 00:00:00 2001
From: Y <cy-l@live.com>
Date: Tue, 9 Mar 2021 01:50:57 +0800
Subject: [PATCH 297/357] [TE] Fix bug in AutoInlineElemWise and implement
 AutoInlineBroadcast (#7602)

* [TE] Fix bug in AutoInlineElemWise and implement AutoInlineBroadcast

* [TE] Add AutoInlineBroadcast API to schedule_pass.h
---
 include/tvm/te/schedule_pass.h                |  7 +++
 src/te/schedule/auto_inline_elem_wise.cc      | 16 +++--
 tests/python/unittest/test_te_schedule_ops.py | 60 +++++++++++++++----
 3 files changed, 65 insertions(+), 18 deletions(-)

diff --git a/include/tvm/te/schedule_pass.h b/include/tvm/te/schedule_pass.h
index a4efa7a94990..32e74f6ef9d5 100644
--- a/include/tvm/te/schedule_pass.h
+++ b/include/tvm/te/schedule_pass.h
@@ -41,6 +41,13 @@ namespace te {
  */
 void AutoInlineElemWise(Schedule sch);
 
+/*!
+ * \brief To automatically inline the broadcast operations.
+ *
+ * \param sch The schedule to be inlined.
+ */
+void AutoInlineBroarcast(Schedule sch);
+
 /*!
  * \brief To automatically inline operations with injective writes
  *   (i.e. writes without reduction or sequential loops). Note
diff --git a/src/te/schedule/auto_inline_elem_wise.cc b/src/te/schedule/auto_inline_elem_wise.cc
index e2b7215158b2..bf584df25825 100644
--- a/src/te/schedule/auto_inline_elem_wise.cc
+++ b/src/te/schedule/auto_inline_elem_wise.cc
@@ -39,15 +39,15 @@ class ElemWiseDetector : public tir::ExprVisitor {
     ExprVisitor::VisitExpr(e);
   }
 
-  void VisitExpr_(const CallNode* op) final {
-    Array<PrimExpr> axis = op->args;
-    if (axis_.size() != axis.size()) {
+  void VisitExpr_(const ProducerLoadNode* op) final {
+    Array<PrimExpr> indices = op->indices;
+    if (axis_.size() != indices.size()) {
       is_elem_wise_ = false;
       return;
     }
 
     for (size_t i = 0; i < axis_.size(); ++i) {
-      if (!axis[i].same_as(axis_[i]->var)) {
+      if (!indices[i].same_as(axis_[i]->var)) {
         is_elem_wise_ = false;
         return;
       }
@@ -83,7 +83,11 @@ bool IsBroadcast(const Operation& op) {
     if (compute->reduce_axis.size()) {
       return false;
     }
-    // TODO(nicolasvasilache): Implement Me
+    constexpr auto kBroadcast = "broadcast";
+    // broadcast op in topi has tag `broadcast`
+    if (op->tag == kBroadcast) {
+      return true;
+    }
   }
   return false;
 }
@@ -113,6 +117,8 @@ void AutoInlineInjective(Schedule sch) {
 
 TVM_REGISTER_GLOBAL("schedule.AutoInlineElemWise").set_body_typed(AutoInlineElemWise);
 
+TVM_REGISTER_GLOBAL("schedule.AutoInlineBroadcast").set_body_typed(AutoInlineBroadcast);
+
 TVM_REGISTER_GLOBAL("schedule.AutoInlineInjective").set_body_typed(AutoInlineInjective);
 
 }  // namespace te
diff --git a/tests/python/unittest/test_te_schedule_ops.py b/tests/python/unittest/test_te_schedule_ops.py
index 1555974169fc..255e0cdb1f21 100644
--- a/tests/python/unittest/test_te_schedule_ops.py
+++ b/tests/python/unittest/test_te_schedule_ops.py
@@ -110,19 +110,53 @@ def argmax_init(idx_typ, val_typ):
 
 
 def test_auto_inline():
-    m = te.var("m")
-    n = te.var("n")
-    A = te.placeholder((m, n), name="A")
-    B = te.placeholder((m, n), name="B")
-    C = te.placeholder((m, n), name="C")
-    T1 = te.compute((m, n), lambda i, j: A(i, j) * B(i, j), name="T1")
-    T2 = te.compute((m, n), lambda i, j: T1(i, j) + C(i, j), name="T2")
-
-    s = te.create_schedule(T2.op)
-    tvm.te.schedule.AutoInlineElemWise(s)
-    s = s.normalize()
-    bounds = tvm.te.schedule.InferBound(s)
-    stmt = tvm.te.schedule.ScheduleOps(s, bounds)
+    def elemwise():
+        m = te.var("m")
+        n = te.var("n")
+        A = te.placeholder((m, n), name="A")
+        B = te.placeholder((m, n), name="B")
+        C = te.placeholder((m, n), name="C")
+        T1 = te.compute((m, n), lambda i, j: A(i, j) * B(i, j), name="T1")
+        T2 = te.compute((m, n), lambda i, j: T1(i, j) + C(i, j), name="T2")
+
+        return te.create_schedule(T2.op), T1
+
+    def broadcast():
+        m = te.var("m")
+        n = te.var("n")
+        A = te.placeholder((1,), name="A")
+        B = te.placeholder((m, n), name="B")
+        C = te.placeholder((m, n), name="C")
+        T1 = te.compute((m, n), lambda i, j: A(0) * B(i, j), name="T1", tag="broadcast")
+        T2 = te.compute((m, n), lambda i, j: T1(i, j) + C(i, j), name="T2")
+
+        return te.create_schedule(T2.op), T1
+
+    def injective():
+        m = te.var("m")
+        n = te.var("n")
+        A = te.placeholder((m,), name="A")
+        B = te.placeholder((m, n), name="B")
+        C = te.placeholder((m, n), name="C")
+        T1 = te.compute((m, n), lambda i, j: A(i) * B(i, j), name="T1")
+        T2 = te.compute((m, n), lambda i, j: T1(i, j) + C(i, j), name="T2")
+
+        return te.create_schedule(T2.op), T1
+
+    def check_auto_inline(schedule_func, auto_inline_func):
+        s, T1 = schedule_func()
+        # before auto inline the attach type is AttachType.kGroupRoot
+        assert s[T1].attach_type == 1
+        auto_inline_func(s)
+        # after auto inline the attach type is AttachType.kInline
+        assert s[T1].attach_type == 2
+        s = s.normalize()
+        bounds = tvm.te.schedule.InferBound(s)
+        stmt = tvm.te.schedule.ScheduleOps(s, bounds)
+
+    check_auto_inline(elemwise, tvm.te.schedule.AutoInlineElemWise)
+    check_auto_inline(broadcast, tvm.te.schedule.AutoInlineBroadcast)
+    check_auto_inline(injective, tvm.te.schedule.AutoInlineInjective)
 
 
 def test_schedule_const_bound():

From ca303aa11cec3449f09eb45043bf15134ecacc46 Mon Sep 17 00:00:00 2001
From: Chao Liu <monklof@gmail.com>
Date: Tue, 9 Mar 2021 02:50:56 +0800
Subject: [PATCH 298/357] [Relay] add ShapeFunc for tanh (#6898)

* add ShapeFunc for tanh

* _schedule_dense_small_batch turn autotvm off when dense's inner dim is unknown

* fix CI pylint
---
 python/tvm/relay/op/_tensor.py |  1 +
 python/tvm/topi/cuda/dense.py  | 25 +++++++++++++++++++------
 2 files changed, 20 insertions(+), 6 deletions(-)

diff --git a/python/tvm/relay/op/_tensor.py b/python/tvm/relay/op/_tensor.py
index 7728d6e3efa4..5f68be84d46a 100644
--- a/python/tvm/relay/op/_tensor.py
+++ b/python/tvm/relay/op/_tensor.py
@@ -281,3 +281,4 @@ def elemwise_shape_func(attrs, inputs, _):
 register_shape_func("clip", False, elemwise_shape_func)
 register_shape_func("log2", False, elemwise_shape_func)
 register_shape_func("sigmoid", False, elemwise_shape_func)
+register_shape_func("tanh", False, elemwise_shape_func)
diff --git a/python/tvm/topi/cuda/dense.py b/python/tvm/topi/cuda/dense.py
index f8abe4d4d799..ad4882ab09f2 100644
--- a/python/tvm/topi/cuda/dense.py
+++ b/python/tvm/topi/cuda/dense.py
@@ -81,13 +81,26 @@ def _callback(op):
 
 
 def _schedule_dense_small_batch(cfg, s, C):
-    A, _ = C.op.input_tensors
-    _, in_dim = get_const_tuple(A.shape)
-    cfg.define_split("tile_k", in_dim, num_outputs=2)
-    if cfg.is_fallback:
-        cfg["tile_k"] = SplitEntity([-1, 64] if in_dim > 64 else [1, 64])
+    A, weights = C.op.input_tensors
+    _, in_dim_weights = get_const_tuple(weights.shape)
+    _, in_dim_A = get_const_tuple(A.shape)
+
+    if isinstance(in_dim_A, int):
+        in_dim = in_dim_A
+    elif isinstance(in_dim_weights, int):
+        in_dim = in_dim_weights
+    else:
+        in_dim = None
+
+    if in_dim is not None:
+        cfg.define_split("tile_k", in_dim, num_outputs=2)
+        if cfg.is_fallback:
+            cfg["tile_k"] = SplitEntity([-1, 64] if in_dim > 64 else [1, 64])
+        _, kf = cfg["tile_k"].apply(s, C, C.op.reduce_axis[0])
+    else:
+        tile_k = 64
+        _, kf = s[C].split(C.op.reduce_axis[0], tile_k)
 
-    _, kf = cfg["tile_k"].apply(s, C, C.op.reduce_axis[0])
     CF = s.rfactor(C, kf)
 
     if C.op in s.outputs:

From 8d1f5b238a08ff7425900a11dd7d9304664b955f Mon Sep 17 00:00:00 2001
From: LiangLiu <liangliu@buaa.edu.cn>
Date: Tue, 9 Mar 2021 02:54:06 +0800
Subject: [PATCH 299/357] [Relay] Fix relay op strategy for cuda dense int8
 (#7586)

* [Relay] Fix relay op strategy for cuda dense int8

* Remove uint8 && Add autotvm task extraction test for relay graph that contains dense op (int8 * int8 -> int32)

* Reformat the code of test case
---
 python/tvm/relay/op/strategy/cuda.py          |  2 +-
 .../relay/test_autotvm_task_extraction.py     | 21 +++++++++++++++++++
 2 files changed, 22 insertions(+), 1 deletion(-)

diff --git a/python/tvm/relay/op/strategy/cuda.py b/python/tvm/relay/op/strategy/cuda.py
index 3b498a99495e..e0d0f165219e 100644
--- a/python/tvm/relay/op/strategy/cuda.py
+++ b/python/tvm/relay/op/strategy/cuda.py
@@ -657,7 +657,7 @@ def dense_strategy_cuda(attrs, inputs, out_type, target):
     data, weights = inputs
     b, i = get_const_tuple(data.shape)
     o, _ = get_const_tuple(weights.shape)
-    if out_type.dtype == "int8":
+    if data.dtype == "int8" and weights.dtype == "int8" and out_type.dtype == "int32":
         strategy.add_implementation(
             wrap_compute_dense(topi.cuda.dense_int8),
             wrap_topi_schedule(topi.cuda.schedule_dense_int8),
diff --git a/tests/python/relay/test_autotvm_task_extraction.py b/tests/python/relay/test_autotvm_task_extraction.py
index d6bfd8d0ec11..b3f1868969cc 100644
--- a/tests/python/relay/test_autotvm_task_extraction.py
+++ b/tests/python/relay/test_autotvm_task_extraction.py
@@ -102,5 +102,26 @@ def test_task_extraction():
     assert len(tasks) == 31
 
 
+def test_task_extraction_for_dense_int8_cuda():
+    target = "cuda"
+    dense = relay.op.get("nn.dense")
+
+    def get_net(batch, in_dim, out_dim, dtype, out_dtype):
+        data = tvm.relay.var("data", shape=[batch, in_dim], dtype=dtype)
+        weight = tvm.relay.var("weight", shape=[out_dim, in_dim], dtype=dtype)
+        out = relay.nn.dense(data, weight, out_dtype=out_dtype)
+        mod, params = relay.testing.create_workload(out)
+        return mod, params
+
+    mod, params = get_net(1, 16, 32, "float32", "float32")
+    tasks = autotvm.task.extract_from_program(mod, target=target, params=params, ops=(dense,))
+    assert len(tasks) == 1 and tasks[0].name == "dense_small_batch.cuda"
+
+    mod, params = get_net(1, 16, 32, "int8", "int32")
+    tasks = autotvm.task.extract_from_program(mod, target=target, params=params, ops=(dense,))
+    assert len(tasks) == 1 and tasks[0].name == "dense_int8.cuda"
+
+
 if __name__ == "__main__":
     test_task_extraction()
+    test_task_extraction_for_dense_int8_cuda()

From 717c5e0f6ee7a1ac25ee02b5ab66ae819a8a13f6 Mon Sep 17 00:00:00 2001
From: Andrew Reusch <areusch@octoml.ai>
Date: Mon, 8 Mar 2021 11:37:52 -0800
Subject: [PATCH 300/357] Add logging to diagnose flaky ci-qemu test (#7610)

---
 tests/scripts/task_python_microtvm.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/scripts/task_python_microtvm.sh b/tests/scripts/task_python_microtvm.sh
index ba8018667895..2e06932ba536 100755
--- a/tests/scripts/task_python_microtvm.sh
+++ b/tests/scripts/task_python_microtvm.sh
@@ -18,6 +18,7 @@
 
 set -e
 set -u
+set -x  # NOTE(areusch): Adding to diagnose flaky timeouts
 
 source tests/scripts/setup-pytest-env.sh
 

From b827845b11a5ace277c319a50081789e940d6b84 Mon Sep 17 00:00:00 2001
From: Chao Liu <monklof@gmail.com>
Date: Tue, 9 Mar 2021 04:37:36 +0800
Subject: [PATCH 301/357] [Relay] add ShapeFunc for one_hot op (#7490)

* [Relay] add ShapeFunc for one_hot op

* fix pylint

* add test for shapefunc of one_hot op
---
 3rdparty/vta-hw                   |  2 +-
 python/tvm/relay/op/_transform.py | 25 +++++++++++++++++++++++++
 tests/python/relay/test_any.py    | 22 ++++++++++++++++++++++
 3 files changed, 48 insertions(+), 1 deletion(-)

diff --git a/3rdparty/vta-hw b/3rdparty/vta-hw
index 57db5a718c74..87ce9acfae55 160000
--- a/3rdparty/vta-hw
+++ b/3rdparty/vta-hw
@@ -1 +1 @@
-Subproject commit 57db5a718c74a788c98120ebbe1230797be698c8
+Subproject commit 87ce9acfae550d1a487746e9d06c2e250076e54c
diff --git a/python/tvm/relay/op/_transform.py b/python/tvm/relay/op/_transform.py
index 97f45278f073..e90263d794bc 100644
--- a/python/tvm/relay/op/_transform.py
+++ b/python/tvm/relay/op/_transform.py
@@ -247,6 +247,31 @@ def strided_slice_shape_func(attrs, inputs, _):
     ]
 
 
+@script
+def _one_hot_shape_func(indices_shape, depth, axis):
+    in_ndim = indices_shape.shape[0]
+    out_ndim = in_ndim + 1
+    true_axis = in_ndim if axis == -1 else axis
+    indices_i = 0
+    out = output_tensor((out_ndim,), "int64")
+    for i in range(out_ndim):
+        if i == true_axis:
+            out[i] = int64(depth)
+        else:
+            out[i] = int64(indices_shape[indices_i])
+            indices_i += 1
+    return out
+
+
+@_reg.register_shape_func("one_hot", False)
+def one_hot_shape_func(attrs, inputs, _):
+    """
+    Shape func for one_hot
+    """
+    shape_func = [_one_hot_shape_func(inputs[0], convert(attrs.depth), convert(attrs.axis))]
+    return shape_func
+
+
 @script
 def _concatenate_shape_func(inputs, axis):
     ndim = inputs[0].shape[0]
diff --git a/tests/python/relay/test_any.py b/tests/python/relay/test_any.py
index b75cc5f5e750..32292de4c8ea 100644
--- a/tests/python/relay/test_any.py
+++ b/tests/python/relay/test_any.py
@@ -260,6 +260,28 @@ def test_any_reshape():
     verify_any_reshape(any_dims(3), (-4, 2, -1, -2), (6, 3, 4), (2, 3, 3, 4))
 
 
+def verify_any_one_hot(indices_shape, indices_np_shape, depth, on_value, off_value, axis, dtype):
+    indices = relay.var("indices", shape=indices_shape, dtype="int32")
+    on_value_const = relay.const(on_value, dtype)
+    off_value_const = relay.const(off_value, dtype)
+    y = relay.one_hot(indices, on_value_const, off_value_const, depth, axis=axis, dtype=dtype)
+    params = [indices]
+    mod = tvm.IRModule()
+    mod["main"] = relay.Function(params, y)
+
+    indices_npy = np.random.randint(0, depth, size=indices_np_shape).astype("int32")
+    out_npy = tvm.topi.testing.one_hot(indices_npy, on_value, off_value, depth, axis, dtype)
+    args = [indices_npy]
+    check_result(args, mod, out_npy)
+
+
+@tvm.testing.uses_gpu
+def test_any_one_hot():
+    verify_any_one_hot(any_dims(1), (3,), 3, 1, 0, -1, "int32")
+    verify_any_one_hot(any_dims(2), (2, 2), 5, 0.5, -0.5, 1, "float32")
+    verify_any_one_hot(any_dims(4), (3, 2, 4, 5), 6, 1.0, 0.0, 0, "float32")
+
+
 def verify_any_argwhere(x_shape, x_np_shape, dtype="bool"):
     x = relay.var("x", shape=x_shape, dtype=dtype)
     y = relay.argwhere(x)

From 89bafd58c27e1dff670dddb3fcf7b4f84dc4eedc Mon Sep 17 00:00:00 2001
From: Tristan Konolige <tristan.konolige@gmail.com>
Date: Mon, 8 Mar 2021 21:40:38 -0800
Subject: [PATCH 302/357] [RUNTIME] Unify load params interface (#7559)

---
 apps/android_camera/models/prepare_model.py   |  2 +-
 apps/bundle_deploy/build_model.py             |  6 +-
 apps/bundle_deploy/runtime.cc                 |  1 +
 apps/sgx/src/build_model.py                   |  4 +-
 .../wasm-graph/tools/build_graph_lib.py       |  4 +-
 docs/deploy/android.rst                       |  2 +-
 golang/sample/gen_mobilenet_lib.py            |  4 +-
 python/tvm/contrib/debugger/debug_result.py   |  6 +-
 python/tvm/driver/tvmc/compiler.py            |  4 +-
 python/tvm/driver/tvmc/runner.py              |  7 +-
 python/tvm/relay/param_dict.py                | 28 +++-----
 python/tvm/runtime/__init__.py                |  1 +
 python/tvm/runtime/params.py                  | 69 ++++++++++++++++++
 rust/tvm-graph-rt/src/graph.rs                |  2 +-
 rust/tvm-graph-rt/tests/build_model.py        |  4 +-
 .../tests/test_nn/src/build_test_graph.py     |  4 +-
 rust/tvm/examples/resnet/src/build_resnet.py  |  4 +-
 src/relay/backend/param_dict.cc               | 70 ++++---------------
 src/relay/backend/param_dict.h                | 27 +------
 src/runtime/file_utils.cc                     | 67 ++++++++++++++++++
 src/runtime/file_utils.h                      | 26 +++++++
 src/runtime/graph/graph_runtime.cc            | 31 ++------
 src/runtime/graph/graph_runtime.h             |  3 -
 src/runtime/vm/vm.cc                          |  2 +
 tests/python/contrib/test_tensorrt.py         |  4 +-
 tests/python/relay/test_cpp_build_module.py   |  4 +-
 tests/python/relay/test_param_dict.py         |  8 +--
 tests/python/unittest/test_runtime_graph.py   |  6 +-
 .../test_runtime_module_based_interface.py    | 10 +--
 tutorials/frontend/deploy_sparse.py           |  4 +-
 30 files changed, 238 insertions(+), 176 deletions(-)
 create mode 100644 python/tvm/runtime/params.py

diff --git a/apps/android_camera/models/prepare_model.py b/apps/android_camera/models/prepare_model.py
index ab20e028c2ad..f155d46c31a4 100644
--- a/apps/android_camera/models/prepare_model.py
+++ b/apps/android_camera/models/prepare_model.py
@@ -106,7 +106,7 @@ def main(model_str, output_path):
         f.write(graph)
     print("dumping params...")
     with open(output_path_str + "/" + "deploy_param.params", "wb") as f:
-        f.write(relay.save_param_dict(params))
+        f.write(runtime.save_param_dict(params))
     print("dumping labels...")
     synset_url = "".join(
         [
diff --git a/apps/bundle_deploy/build_model.py b/apps/bundle_deploy/build_model.py
index 0991ac9ad94b..8fbc01bcf4a6 100644
--- a/apps/bundle_deploy/build_model.py
+++ b/apps/bundle_deploy/build_model.py
@@ -20,7 +20,7 @@
 import os
 from tvm import relay
 import tvm
-from tvm import te
+from tvm import te, runtime
 import logging
 import json
 from tvm.contrib import cc as _cc
@@ -70,7 +70,7 @@ def build_module(opts):
         with open(
             os.path.join(build_dir, file_format_str.format(name="params", ext="bin")), "wb"
         ) as f_params:
-            f_params.write(relay.save_param_dict(params))
+            f_params.write(runtime.save_param_dict(params))
 
 
 def build_test_module(opts):
@@ -113,7 +113,7 @@ def build_test_module(opts):
         with open(
             os.path.join(build_dir, file_format_str.format(name="test_params", ext="bin")), "wb"
         ) as f_params:
-            f_params.write(relay.save_param_dict(lowered_params))
+            f_params.write(runtime.save_param_dict(lowered_params))
         with open(
             os.path.join(build_dir, file_format_str.format(name="test_data", ext="bin")), "wb"
         ) as fp:
diff --git a/apps/bundle_deploy/runtime.cc b/apps/bundle_deploy/runtime.cc
index 3224028b60a1..2f7e3848b4bf 100644
--- a/apps/bundle_deploy/runtime.cc
+++ b/apps/bundle_deploy/runtime.cc
@@ -23,6 +23,7 @@
 #include <tvm/runtime/registry.h>
 
 #include "../../src/runtime/c_runtime_api.cc"
+#include "../../src/runtime/container.cc"
 #include "../../src/runtime/cpu_device_api.cc"
 #include "../../src/runtime/file_utils.cc"
 #include "../../src/runtime/graph/graph_runtime.cc"
diff --git a/apps/sgx/src/build_model.py b/apps/sgx/src/build_model.py
index 868d3bcb9fc4..1fc297d8a094 100755
--- a/apps/sgx/src/build_model.py
+++ b/apps/sgx/src/build_model.py
@@ -23,7 +23,7 @@
 from os import path as osp
 import sys
 
-from tvm import relay
+from tvm import relay, runtime
 from tvm.relay import testing
 import tvm
 from tvm import te
@@ -49,7 +49,7 @@ def main():
     with open(osp.join(build_dir, "graph.json"), "w") as f_graph_json:
         f_graph_json.write(graph)
         with open(osp.join(build_dir, "params.bin"), "wb") as f_params:
-            f_params.write(relay.save_param_dict(params))
+            f_params.write(runtime.save_param_dict(params))
 
 
 if __name__ == "__main__":
diff --git a/apps/wasm-standalone/wasm-graph/tools/build_graph_lib.py b/apps/wasm-standalone/wasm-graph/tools/build_graph_lib.py
index 42695d28fadb..3d8a349b8744 100644
--- a/apps/wasm-standalone/wasm-graph/tools/build_graph_lib.py
+++ b/apps/wasm-standalone/wasm-graph/tools/build_graph_lib.py
@@ -24,7 +24,7 @@
 
 import onnx
 import tvm
-from tvm import relay
+from tvm import relay, runtime
 
 
 def _get_mod_and_params(model_file):
@@ -60,7 +60,7 @@ def build_graph_lib(model_file, opt_level):
         f_graph.write(graph_json)
 
     with open(os.path.join(out_dir, "graph.params"), "wb") as f_params:
-        f_params.write(relay.save_param_dict(params))
+        f_params.write(runtime.save_param_dict(params))
 
 
 if __name__ == "__main__":
diff --git a/docs/deploy/android.rst b/docs/deploy/android.rst
index 8c8fcfb49679..256978d00607 100644
--- a/docs/deploy/android.rst
+++ b/docs/deploy/android.rst
@@ -31,7 +31,7 @@ The code below will save the compilation output which is required on android tar
     with open("deploy_graph.json", "w") as fo:
         fo.write(graph.json())
     with open("deploy_param.params", "wb") as fo:
-        fo.write(relay.save_param_dict(params))
+        fo.write(runtime.save_param_dict(params))
 
 deploy_lib.so, deploy_graph.json, deploy_param.params will go to android target.
 
diff --git a/golang/sample/gen_mobilenet_lib.py b/golang/sample/gen_mobilenet_lib.py
index b82e0c476b9f..12f215b4fd9c 100644
--- a/golang/sample/gen_mobilenet_lib.py
+++ b/golang/sample/gen_mobilenet_lib.py
@@ -16,7 +16,7 @@
 # under the License.
 
 import os
-from tvm import relay, transform
+from tvm import relay, transform, runtime
 from tvm.contrib.download import download_testdata
 
 
@@ -94,4 +94,4 @@ def extract(path):
     fo.write(graph)
 
 with open("./mobilenet.params", "wb") as fo:
-    fo.write(relay.save_param_dict(params))
+    fo.write(runtime.save_param_dict(params))
diff --git a/python/tvm/contrib/debugger/debug_result.py b/python/tvm/contrib/debugger/debug_result.py
index 3159ab34397a..f58947f0766f 100644
--- a/python/tvm/contrib/debugger/debug_result.py
+++ b/python/tvm/contrib/debugger/debug_result.py
@@ -264,8 +264,4 @@ def save_tensors(params):
     """
     _save_tensors = tvm.get_global_func("tvm.relay._save_param_dict")
 
-    args = []
-    for k, v in params.items():
-        args.append(k)
-        args.append(tvm.nd.array(v))
-    return _save_tensors(*args)
+    return _save_tensors(params)
diff --git a/python/tvm/driver/tvmc/compiler.py b/python/tvm/driver/tvmc/compiler.py
index fc1805ee0ab4..83791e50f6d5 100644
--- a/python/tvm/driver/tvmc/compiler.py
+++ b/python/tvm/driver/tvmc/compiler.py
@@ -24,7 +24,7 @@
 
 import tvm
 from tvm import autotvm, auto_scheduler
-from tvm import relay
+from tvm import relay, runtime
 from tvm.contrib import cc
 from tvm.contrib import utils
 
@@ -282,7 +282,7 @@ def save_module(module_path, graph, lib, params, cross=None):
 
     with open(temp.relpath(param_name), "wb") as params_file:
         logger.debug("writing params to file to %s", params_file.name)
-        params_file.write(relay.save_param_dict(params))
+        params_file.write(runtime.save_param_dict(params))
 
     logger.debug("saving module as tar file to %s", module_path)
     with tarfile.open(module_path, "w") as tar:
diff --git a/python/tvm/driver/tvmc/runner.py b/python/tvm/driver/tvmc/runner.py
index 87ea3be1436a..1d23ccfb0c00 100644
--- a/python/tvm/driver/tvmc/runner.py
+++ b/python/tvm/driver/tvmc/runner.py
@@ -24,11 +24,11 @@
 import tempfile
 
 import numpy as np
-import tvm
 from tvm import rpc
 from tvm.autotvm.measure import request_remote
 from tvm.contrib import graph_runtime as runtime
 from tvm.contrib.debugger import debug_runtime
+from tvm.relay import load_param_dict
 
 from . import common
 from .common import TVMCException
@@ -163,9 +163,8 @@ def get_input_info(graph_str, params):
 
     shape_dict = {}
     dtype_dict = {}
-    # Use a special function to load the binary params back into a dict
-    load_arr = tvm.get_global_func("tvm.relay._load_param_dict")(params)
-    param_names = [v.name for v in load_arr]
+    params_dict = load_param_dict(params)
+    param_names = [k for (k, v) in params_dict.items()]
     graph = json.loads(graph_str)
     for node_id in graph["arg_nodes"]:
         node = graph["nodes"][node_id]
diff --git a/python/tvm/relay/param_dict.py b/python/tvm/relay/param_dict.py
index 2d0398e20486..2714607947f3 100644
--- a/python/tvm/relay/param_dict.py
+++ b/python/tvm/relay/param_dict.py
@@ -16,12 +16,7 @@
 # under the License.
 # pylint: disable=invalid-name
 """Helper utility to save parameter dicts."""
-import tvm
-import tvm._ffi
-
-
-_save_param_dict = tvm._ffi.get_global_func("tvm.relay._save_param_dict")
-_load_param_dict = tvm._ffi.get_global_func("tvm.relay._load_param_dict")
+import tvm.runtime
 
 
 def save_param_dict(params):
@@ -30,6 +25,9 @@ def save_param_dict(params):
     The result binary bytes can be loaded by the
     GraphModule with API "load_params".
 
+    .. deprecated:: 0.9.0
+        Use :py:func:`tvm.runtime.save_param_dict` instead.
+
     Parameters
     ----------
     params : dict of str to NDArray
@@ -47,21 +45,20 @@ def save_param_dict(params):
        # set up the parameter dict
        params = {"param0": arr0, "param1": arr1}
        # save the parameters as byte array
-       param_bytes = tvm.relay.save_param_dict(params)
+       param_bytes = tvm.runtime.save_param_dict(params)
        # We can serialize the param_bytes and load it back later.
        # Pass in byte array to module to directly set parameters
-       graph_runtime_mod.load_params(param_bytes)
+       tvm.runtime.load_param_dict(param_bytes)
     """
-    args = []
-    for k, v in params.items():
-        args.append(k)
-        args.append(tvm.nd.array(v))
-    return _save_param_dict(*args)
+    return tvm.runtime.save_param_dict(params)
 
 
 def load_param_dict(param_bytes):
     """Load parameter dictionary to binary bytes.
 
+    .. deprecated:: 0.9.0
+        Use :py:func:`tvm.runtime.load_param_dict` instead.
+
     Parameters
     ----------
     param_bytes: bytearray
@@ -72,7 +69,4 @@ def load_param_dict(param_bytes):
     params : dict of str to NDArray
         The parameter dictionary.
     """
-    if isinstance(param_bytes, (bytes, str)):
-        param_bytes = bytearray(param_bytes)
-    load_arr = _load_param_dict(param_bytes)
-    return {v.name: v.array for v in load_arr}
+    return tvm.runtime.load_param_dict(param_bytes)
diff --git a/python/tvm/runtime/__init__.py b/python/tvm/runtime/__init__.py
index 21c06c517bd7..7d58af70afe1 100644
--- a/python/tvm/runtime/__init__.py
+++ b/python/tvm/runtime/__init__.py
@@ -29,3 +29,4 @@
 from .ndarray import vpi, rocm, ext_dev, micro_dev
 from .module import load_module, enabled, system_lib
 from .container import String
+from .params import save_param_dict, load_param_dict
diff --git a/python/tvm/runtime/params.py b/python/tvm/runtime/params.py
new file mode 100644
index 000000000000..78e745686c95
--- /dev/null
+++ b/python/tvm/runtime/params.py
@@ -0,0 +1,69 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=invalid-name
+"""Helper utility to save and load parameter dicts."""
+from . import _ffi_api, ndarray
+
+
+def save_param_dict(params):
+    """Save parameter dictionary to binary bytes.
+
+    The result binary bytes can be loaded by the
+    GraphModule with API "load_params".
+
+    Parameters
+    ----------
+    params : dict of str to NDArray
+        The parameter dictionary.
+
+    Returns
+    -------
+    param_bytes: bytearray
+        Serialized parameters.
+
+    Examples
+    --------
+    .. code-block:: python
+
+       # set up the parameter dict
+       params = {"param0": arr0, "param1": arr1}
+       # save the parameters as byte array
+       param_bytes = tvm.runtime.save_param_dict(params)
+       # We can serialize the param_bytes and load it back later.
+       # Pass in byte array to module to directly set parameters
+       tvm.runtime.load_param_dict(param_bytes)
+    """
+    transformed = {k: ndarray.array(v) for (k, v) in params.items()}
+    return _ffi_api.SaveParams(transformed)
+
+
+def load_param_dict(param_bytes):
+    """Load parameter dictionary to binary bytes.
+
+    Parameters
+    ----------
+    param_bytes: bytearray
+        Serialized parameters.
+
+    Returns
+    -------
+    params : dict of str to NDArray
+        The parameter dictionary.
+    """
+    if isinstance(param_bytes, (bytes, str)):
+        param_bytes = bytearray(param_bytes)
+    return _ffi_api.LoadParams(param_bytes)
diff --git a/rust/tvm-graph-rt/src/graph.rs b/rust/tvm-graph-rt/src/graph.rs
index 646a20daaf5b..83fe37ea7970 100644
--- a/rust/tvm-graph-rt/src/graph.rs
+++ b/rust/tvm-graph-rt/src/graph.rs
@@ -483,7 +483,7 @@ named! {
     )
 }
 
-/// Loads a param dict saved using `relay.save_param_dict`.
+/// Loads a param dict saved using `runtime.save_param_dict`.
 pub fn load_param_dict(bytes: &[u8]) -> Result<HashMap<String, Tensor>, GraphFormatError> {
     match parse_param_dict(bytes) {
         Ok((remaining_bytes, param_dict)) => {
diff --git a/rust/tvm-graph-rt/tests/build_model.py b/rust/tvm-graph-rt/tests/build_model.py
index d34b4403c936..969075929a42 100755
--- a/rust/tvm-graph-rt/tests/build_model.py
+++ b/rust/tvm-graph-rt/tests/build_model.py
@@ -23,7 +23,7 @@
 import numpy as np
 import tvm
 from tvm import te
-from tvm import relay
+from tvm import relay, runtime
 from tvm.relay import testing
 
 CWD = osp.dirname(osp.abspath(osp.expanduser(__file__)))
@@ -47,7 +47,7 @@ def main():
     with open(osp.join(CWD, "graph.json"), "w") as f_resnet:
         f_resnet.write(graph)
     with open(osp.join(CWD, "graph.params"), "wb") as f_params:
-        f_params.write(relay.save_param_dict(params))
+        f_params.write(runtime.save_param_dict(params))
 
 
 if __name__ == "__main__":
diff --git a/rust/tvm-graph-rt/tests/test_nn/src/build_test_graph.py b/rust/tvm-graph-rt/tests/test_nn/src/build_test_graph.py
index e743e48b01f8..0045b3b0557d 100755
--- a/rust/tvm-graph-rt/tests/test_nn/src/build_test_graph.py
+++ b/rust/tvm-graph-rt/tests/test_nn/src/build_test_graph.py
@@ -23,7 +23,7 @@
 
 import numpy as np
 import tvm
-from tvm import te
+from tvm import te, runtime
 from tvm import relay
 from tvm.relay import testing
 
@@ -49,7 +49,7 @@ def main():
         f_resnet.write(graph)
 
     with open(osp.join(out_dir, "graph.params"), "wb") as f_params:
-        f_params.write(relay.save_param_dict(params))
+        f_params.write(runtime.save_param_dict(params))
 
 
 if __name__ == "__main__":
diff --git a/rust/tvm/examples/resnet/src/build_resnet.py b/rust/tvm/examples/resnet/src/build_resnet.py
index 03ac611a191a..fdacb5bb1fca 100644
--- a/rust/tvm/examples/resnet/src/build_resnet.py
+++ b/rust/tvm/examples/resnet/src/build_resnet.py
@@ -27,7 +27,7 @@
 
 import tvm
 from tvm import te
-from tvm import relay
+from tvm import relay, runtime
 from tvm.relay import testing
 from tvm.contrib import graph_runtime, cc
 from PIL import Image
@@ -88,7 +88,7 @@ def build(target_dir):
         fo.write(graph)
 
     with open(osp.join(target_dir, "deploy_param.params"), "wb") as fo:
-        fo.write(relay.save_param_dict(params))
+        fo.write(runtime.save_param_dict(params))
 
 
 def download_img_labels():
diff --git a/src/relay/backend/param_dict.cc b/src/relay/backend/param_dict.cc
index 1d7e08abcdde..bb0fad9142c1 100644
--- a/src/relay/backend/param_dict.cc
+++ b/src/relay/backend/param_dict.cc
@@ -31,70 +31,24 @@
 #include <utility>
 #include <vector>
 
+#include "../../runtime/file_utils.h"
+
 namespace tvm {
 namespace relay {
 
 using namespace runtime;
 
-TVM_REGISTER_GLOBAL("tvm.relay._save_param_dict").set_body([](TVMArgs args, TVMRetValue* rv) {
-  ICHECK_EQ(args.size() % 2, 0u);
-  // `args` is in the form "key, value, key, value, ..."
-  size_t num_params = args.size() / 2;
-  std::vector<std::string> names;
-  names.reserve(num_params);
-  std::vector<DLTensor*> arrays;
-  arrays.reserve(num_params);
-  for (size_t i = 0; i < num_params * 2; i += 2) {
-    names.emplace_back(args[i].operator String());
-    arrays.emplace_back(args[i + 1].operator DLTensor*());
-  }
-  std::string bytes;
-  dmlc::MemoryStringStream strm(&bytes);
-  dmlc::Stream* fo = &strm;
-  uint64_t header = kTVMNDArrayListMagic, reserved = 0;
-  fo->Write(header);
-  fo->Write(reserved);
-  fo->Write(names);
-  {
-    uint64_t sz = static_cast<uint64_t>(arrays.size());
-    fo->Write(sz);
-    for (size_t i = 0; i < sz; ++i) {
-      tvm::runtime::SaveDLTensor(fo, arrays[i]);
-    }
-  }
-  TVMByteArray arr;
-  arr.data = bytes.c_str();
-  arr.size = bytes.length();
-  *rv = arr;
-});
-
-TVM_REGISTER_GLOBAL("tvm.relay._load_param_dict").set_body([](TVMArgs args, TVMRetValue* rv) {
-  std::string bytes = args[0];
-  std::vector<std::string> names;
-  dmlc::MemoryStringStream memstrm(&bytes);
-  dmlc::Stream* strm = &memstrm;
-  uint64_t header, reserved;
-  ICHECK(strm->Read(&header)) << "Invalid parameters file format";
-  ICHECK(header == kTVMNDArrayListMagic) << "Invalid parameters file format";
-  ICHECK(strm->Read(&reserved)) << "Invalid parameters file format";
-  ICHECK(strm->Read(&names)) << "Invalid parameters file format";
-  uint64_t sz;
-  strm->Read(&sz, sizeof(sz));
-  size_t size = static_cast<size_t>(sz);
-  ICHECK(size == names.size()) << "Invalid parameters file format";
-  tvm::Array<NamedNDArray> ret;
-  for (size_t i = 0; i < size; ++i) {
-    tvm::runtime::NDArray temp;
-    temp.Load(strm);
-    auto n = tvm::make_object<NamedNDArrayNode>();
-    n->name = std::move(names[i]);
-    n->array = temp;
-    ret.push_back(NamedNDArray(n));
-  }
-  *rv = ret;
+TVM_REGISTER_GLOBAL("tvm.relay._save_param_dict")
+    .set_body_typed([](const Map<String, NDArray>& params) {
+      std::string s = ::tvm::runtime::SaveParams(params);
+      // copy return array so it is owned by the ret value
+      TVMRetValue rv;
+      rv = TVMByteArray{s.data(), s.size()};
+      return rv;
+    });
+TVM_REGISTER_GLOBAL("tvm.relay._load_param_dict").set_body_typed([](const String& s) {
+  return ::tvm::runtime::LoadParams(s);
 });
 
-TVM_REGISTER_NODE_TYPE(NamedNDArrayNode);
-
 }  // namespace relay
 }  // namespace tvm
diff --git a/src/relay/backend/param_dict.h b/src/relay/backend/param_dict.h
index 384201f94648..96e17a9da07b 100644
--- a/src/relay/backend/param_dict.h
+++ b/src/relay/backend/param_dict.h
@@ -32,32 +32,7 @@
 #include <string>
 
 namespace tvm {
-namespace relay {
-
-/*! \brief Magic number for NDArray list file  */
-constexpr uint64_t kTVMNDArrayListMagic = 0xF7E58D4F05049CB7;
-
-/*!
- * \brief Wrapper node for naming `NDArray`s.
- */
-struct NamedNDArrayNode : public ::tvm::Object {
-  std::string name;
-  tvm::runtime::NDArray array;
-
-  void VisitAttrs(tvm::AttrVisitor* v) {
-    v->Visit("name", &name);
-    v->Visit("array", &array);
-  }
-
-  static constexpr const char* _type_key = "NamedNDArray";
-  TVM_DECLARE_FINAL_OBJECT_INFO(NamedNDArrayNode, Object);
-};
-
-class NamedNDArray : public ObjectRef {
- public:
-  TVM_DEFINE_OBJECT_REF_METHODS(NamedNDArray, ObjectRef, NamedNDArrayNode);
-};
-}  // namespace relay
+namespace relay {}  // namespace relay
 }  // namespace tvm
 
 #endif  // TVM_RELAY_BACKEND_PARAM_DICT_H_
diff --git a/src/runtime/file_utils.cc b/src/runtime/file_utils.cc
index 3957505a7c7d..92c398b559d2 100644
--- a/src/runtime/file_utils.cc
+++ b/src/runtime/file_utils.cc
@@ -24,6 +24,7 @@
 
 #include <dmlc/json.h>
 #include <dmlc/memory_io.h>
+#include <tvm/runtime/registry.h>
 #include <tvm/runtime/serializer.h>
 #include <tvm/support/logging.h>
 
@@ -158,5 +159,71 @@ void LoadMetaDataFromFile(const std::string& file_name,
 
 void RemoveFile(const std::string& file_name) { std::remove(file_name.c_str()); }
 
+Map<String, NDArray> LoadParams(const std::string& param_blob) {
+  dmlc::MemoryStringStream strm(const_cast<std::string*>(&param_blob));
+  return LoadParams(&strm);
+}
+Map<String, NDArray> LoadParams(dmlc::Stream* strm) {
+  Map<String, NDArray> params;
+  uint64_t header, reserved;
+  ICHECK(strm->Read(&header)) << "Invalid parameters file format";
+  ICHECK(header == kTVMNDArrayListMagic) << "Invalid parameters file format";
+  ICHECK(strm->Read(&reserved)) << "Invalid parameters file format";
+
+  std::vector<std::string> names;
+  ICHECK(strm->Read(&names)) << "Invalid parameters file format";
+  uint64_t sz;
+  strm->Read(&sz);
+  size_t size = static_cast<size_t>(sz);
+  ICHECK(size == names.size()) << "Invalid parameters file format";
+  for (size_t i = 0; i < size; ++i) {
+    // The data_entry is allocated on device, NDArray.load always load the array into CPU.
+    NDArray temp;
+    temp.Load(strm);
+    params.Set(names[i], temp);
+  }
+  return params;
+}
+
+void SaveParams(dmlc::Stream* strm, const Map<String, NDArray>& params) {
+  std::vector<std::string> names;
+  std::vector<const DLTensor*> arrays;
+  for (auto& p : params) {
+    names.push_back(p.first);
+    arrays.push_back(p.second.operator->());
+  }
+
+  uint64_t header = kTVMNDArrayListMagic, reserved = 0;
+  strm->Write(header);
+  strm->Write(reserved);
+  strm->Write(names);
+  {
+    uint64_t sz = static_cast<uint64_t>(arrays.size());
+    strm->Write(sz);
+    for (size_t i = 0; i < sz; ++i) {
+      tvm::runtime::SaveDLTensor(strm, arrays[i]);
+    }
+  }
+}
+
+std::string SaveParams(const Map<String, NDArray>& params) {
+  std::string bytes;
+  dmlc::MemoryStringStream strm(&bytes);
+  dmlc::Stream* fo = &strm;
+  SaveParams(fo, params);
+  return bytes;
+}
+
+TVM_REGISTER_GLOBAL("runtime.SaveParams").set_body_typed([](const Map<String, NDArray>& params) {
+  std::string s = ::tvm::runtime::SaveParams(params);
+  // copy return array so it is owned by the ret value
+  TVMRetValue rv;
+  rv = TVMByteArray{s.data(), s.size()};
+  return rv;
+});
+TVM_REGISTER_GLOBAL("runtime.LoadParams").set_body_typed([](const String& s) {
+  return ::tvm::runtime::LoadParams(s);
+});
+
 }  // namespace runtime
 }  // namespace tvm
diff --git a/src/runtime/file_utils.h b/src/runtime/file_utils.h
index dfa7d67f1bfe..718d10d5df70 100644
--- a/src/runtime/file_utils.h
+++ b/src/runtime/file_utils.h
@@ -94,6 +94,32 @@ void LoadMetaDataFromFile(const std::string& file_name,
  * \param file_name The file name.
  */
 void RemoveFile(const std::string& file_name);
+
+constexpr uint64_t kTVMNDArrayListMagic = 0xF7E58D4F05049CB7;
+/*!
+ * \brief Load parameters from a string.
+ * \param param_blob Serialized string of parameters.
+ * \return Map of parameter name to parameter value.
+ */
+Map<String, NDArray> LoadParams(const std::string& param_blob);
+/*!
+ * \brief Load parameters from a stream.
+ * \param strm Stream to load parameters from.
+ * \return Map of parameter name to parameter value.
+ */
+Map<String, NDArray> LoadParams(dmlc::Stream* strm);
+/*!
+ * \brief Serialize parameters to a byte array.
+ * \param params Parameters to save.
+ * \return String containing binary parameter data.
+ */
+std::string SaveParams(const Map<String, NDArray>& params);
+/*!
+ * \brief Serialize parameters to a stream.
+ * \param strm Stream to write to.
+ * \param params Parameters to save.
+ */
+void SaveParams(dmlc::Stream* strm, const Map<String, NDArray>& params);
 }  // namespace runtime
 }  // namespace tvm
 #endif  // TVM_RUNTIME_FILE_UTILS_H_
diff --git a/src/runtime/graph/graph_runtime.cc b/src/runtime/graph/graph_runtime.cc
index 6d586cfdd042..6c51e711aef1 100644
--- a/src/runtime/graph/graph_runtime.cc
+++ b/src/runtime/graph/graph_runtime.cc
@@ -38,6 +38,8 @@
 #include <utility>
 #include <vector>
 
+#include "../file_utils.h"
+
 namespace tvm {
 namespace runtime {
 namespace details {
@@ -196,31 +198,10 @@ void GraphRuntime::LoadParams(const std::string& param_blob) {
 }
 
 void GraphRuntime::LoadParams(dmlc::Stream* strm) {
-  uint64_t header, reserved;
-  ICHECK(strm->Read(&header)) << "Invalid parameters file format";
-  ICHECK(header == kTVMNDArrayListMagic) << "Invalid parameters file format";
-  ICHECK(strm->Read(&reserved)) << "Invalid parameters file format";
-
-  std::vector<std::string> names;
-  ICHECK(strm->Read(&names)) << "Invalid parameters file format";
-  uint64_t sz;
-  strm->Read(&sz);
-  size_t size = static_cast<size_t>(sz);
-  ICHECK(size == names.size()) << "Invalid parameters file format";
-  for (size_t i = 0; i < size; ++i) {
-    int in_idx = GetInputIndex(names[i]);
-    if (in_idx < 0) {
-      NDArray temp;
-      temp.Load(strm);
-      continue;
-    }
-    uint32_t eid = this->entry_id(input_nodes_[in_idx], 0);
-    ICHECK_LT(eid, data_entry_.size());
-
-    // The data_entry is allocated on device, NDArray.load always load the array into CPU.
-    NDArray temp;
-    temp.Load(strm);
-    data_entry_[eid].CopyFrom(temp);
+  Map<String, NDArray> params = ::tvm::runtime::LoadParams(strm);
+  for (auto& p : params) {
+    uint32_t eid = this->entry_id(input_nodes_[GetInputIndex(p.first)], 0);
+    data_entry_[eid].CopyFrom(p.second);
   }
 }
 
diff --git a/src/runtime/graph/graph_runtime.h b/src/runtime/graph/graph_runtime.h
index 627911883dfb..a1e2ee3b5d74 100644
--- a/src/runtime/graph/graph_runtime.h
+++ b/src/runtime/graph/graph_runtime.h
@@ -47,9 +47,6 @@ namespace runtime {
     ICHECK_EQ(ret, 0) << TVMGetLastError(); \
   }
 
-/*! \brief Magic number for NDArray list file  */
-constexpr uint64_t kTVMNDArrayListMagic = 0xF7E58D4F05049CB7;
-
 /*! \brief operator attributes about tvm op */
 struct TVMOpParam {
   std::string func_name;
diff --git a/src/runtime/vm/vm.cc b/src/runtime/vm/vm.cc
index 3f890baf52c0..6d121aa67733 100644
--- a/src/runtime/vm/vm.cc
+++ b/src/runtime/vm/vm.cc
@@ -35,6 +35,8 @@
 #include <stdexcept>
 #include <vector>
 
+#include "../file_utils.h"
+
 using namespace tvm::runtime;
 
 namespace tvm {
diff --git a/tests/python/contrib/test_tensorrt.py b/tests/python/contrib/test_tensorrt.py
index 60d6b2aa7571..ae8214d6463c 100644
--- a/tests/python/contrib/test_tensorrt.py
+++ b/tests/python/contrib/test_tensorrt.py
@@ -22,7 +22,7 @@
 import tvm
 import tvm.relay.testing
 
-from tvm import relay
+from tvm import relay, runtime
 from tvm.relay.op.contrib import tensorrt
 from tvm.contrib import graph_runtime, utils
 from tvm.runtime.vm import VirtualMachine
@@ -265,7 +265,7 @@ def test_tensorrt_serialize_graph_runtime():
     def compile_graph(mod, params):
         with tvm.transform.PassContext(opt_level=3, config={"relay.ext.tensorrt.options": config}):
             graph, lib, params = relay.build(mod, params=params, target="cuda")
-            params = relay.save_param_dict(params)
+            params = runtime.save_param_dict(params)
         return graph, lib, params
 
     def run_graph(graph, lib, params):
diff --git a/tests/python/relay/test_cpp_build_module.py b/tests/python/relay/test_cpp_build_module.py
index 67f0621ef273..60f3dfa76e38 100644
--- a/tests/python/relay/test_cpp_build_module.py
+++ b/tests/python/relay/test_cpp_build_module.py
@@ -18,7 +18,7 @@
 
 import tvm
 from tvm import te
-from tvm import relay
+from tvm import relay, runtime
 from tvm.contrib.nvcc import have_fp16
 import tvm.testing
 
@@ -86,7 +86,7 @@ def test_fp16_build():
 
     # test
     rt = tvm.contrib.graph_runtime.create(g_json, mmod, ctx)
-    rt.load_params(relay.save_param_dict(params))
+    rt.load_params(runtime.save_param_dict(params))
     rt.run()
     out = rt.get_output(0)
 
diff --git a/tests/python/relay/test_param_dict.py b/tests/python/relay/test_param_dict.py
index 74c9ebcaa355..29e0b5c0463b 100644
--- a/tests/python/relay/test_param_dict.py
+++ b/tests/python/relay/test_param_dict.py
@@ -17,7 +17,7 @@
 import os
 import numpy as np
 import tvm
-from tvm import te
+from tvm import te, runtime
 import json
 import base64
 from tvm._ffi.base import py_str
@@ -31,7 +31,7 @@ def test_save_load():
     x = np.ones((10, 2)).astype("float32")
     y = np.ones((1, 2, 3)).astype("float32")
     params = {"x": x, "y": y}
-    param_bytes = relay.save_param_dict(params)
+    param_bytes = runtime.save_param_dict(params)
     assert isinstance(param_bytes, bytearray)
     param2 = relay.load_param_dict(param_bytes)
     assert len(param2) == 2
@@ -46,7 +46,7 @@ def test_ndarray_reflection():
     param_dict = {"x": tvm_array, "y": tvm_array}
     assert param_dict["x"].same_as(param_dict["y"])
     # Serialize then deserialize `param_dict`.
-    deser_param_dict = relay.load_param_dict(relay.save_param_dict(param_dict))
+    deser_param_dict = relay.load_param_dict(runtime.save_param_dict(param_dict))
     # Make sure the data matches the original data and `x` and `y` contain the same data.
     np.testing.assert_equal(deser_param_dict["x"].asnumpy(), tvm_array.asnumpy())
     # Make sure `x` and `y` contain the same data.
@@ -77,7 +77,7 @@ def verify_graph_runtime(remote, target, shape, dtype):
         lib = remote.load_module("dev_lib.o")
         ctx = remote.cpu(0)
         mod = graph_runtime.create(graph, lib, ctx)
-        mod.load_params(relay.save_param_dict(params))
+        mod.load_params(runtime.save_param_dict(params))
         mod.run()
         out = mod.get_output(0, tvm.nd.empty(shape, dtype=dtype, ctx=ctx))
         tvm.testing.assert_allclose(x_in + 1, out.asnumpy())
diff --git a/tests/python/unittest/test_runtime_graph.py b/tests/python/unittest/test_runtime_graph.py
index c43a35924420..16e9db42cba3 100644
--- a/tests/python/unittest/test_runtime_graph.py
+++ b/tests/python/unittest/test_runtime_graph.py
@@ -16,7 +16,7 @@
 # under the License.
 import tvm
 import tvm.testing
-from tvm import te
+from tvm import te, runtime
 import numpy as np
 import json
 from tvm import rpc
@@ -94,12 +94,12 @@ def check_sharing():
         graph, lib, params = relay.build(func, target="llvm", params=params)
 
         mod_shared = graph_runtime.create(graph, lib, tvm.cpu(0))
-        mod_shared.load_params(relay.save_param_dict(params))
+        mod_shared.load_params(runtime.save_param_dict(params))
         num_mods = 10
         mods = [graph_runtime.create(graph, lib, tvm.cpu(0)) for _ in range(num_mods)]
 
         for mod in mods:
-            mod.share_params(mod_shared, relay.save_param_dict(params))
+            mod.share_params(mod_shared, runtime.save_param_dict(params))
 
         a = np.random.uniform(size=(1, 10)).astype("float32")
         for mod in mods:
diff --git a/tests/python/unittest/test_runtime_module_based_interface.py b/tests/python/unittest/test_runtime_module_based_interface.py
index 51a587242ae3..a34fe4a062cb 100644
--- a/tests/python/unittest/test_runtime_module_based_interface.py
+++ b/tests/python/unittest/test_runtime_module_based_interface.py
@@ -15,7 +15,7 @@
 # specific language governing permissions and limitations
 # under the License.
 import numpy as np
-from tvm import relay
+from tvm import relay, runtime
 from tvm.relay import testing
 import tvm
 from tvm.contrib import graph_runtime
@@ -314,7 +314,7 @@ def verify_cpu_remove_package_params(obj_format):
         complied_graph_lib_no_params = complied_graph_lib["remove_params"]()
         complied_graph_lib_no_params.export_library(path_lib)
         with open(temp.relpath("deploy_param.params"), "wb") as fo:
-            fo.write(relay.save_param_dict(complied_graph_lib.get_params()))
+            fo.write(runtime.save_param_dict(complied_graph_lib.get_params()))
         loaded_lib = tvm.runtime.load_module(path_lib)
         data = np.random.uniform(-1, 1, size=input_shape(mod)).astype("float32")
         ctx = tvm.cpu(0)
@@ -361,7 +361,7 @@ def verify_gpu_remove_package_params(obj_format):
         complied_graph_lib_no_params = complied_graph_lib["remove_params"]()
         complied_graph_lib_no_params.export_library(path_lib)
         with open(temp.relpath("deploy_param.params"), "wb") as fo:
-            fo.write(relay.save_param_dict(complied_graph_lib.get_params()))
+            fo.write(runtime.save_param_dict(complied_graph_lib.get_params()))
         loaded_lib = tvm.runtime.load_module(path_lib)
         data = np.random.uniform(-1, 1, size=input_shape(mod)).astype("float32")
         ctx = tvm.gpu(0)
@@ -409,7 +409,7 @@ def verify_rpc_cpu_remove_package_params(obj_format):
         complied_graph_lib_no_params.export_library(path_lib)
         path_params = temp.relpath("deploy_param.params")
         with open(path_params, "wb") as fo:
-            fo.write(relay.save_param_dict(complied_graph_lib.get_params()))
+            fo.write(runtime.save_param_dict(complied_graph_lib.get_params()))
 
         from tvm import rpc
 
@@ -462,7 +462,7 @@ def verify_rpc_gpu_remove_package_params(obj_format):
         complied_graph_lib_no_params.export_library(path_lib)
         path_params = temp.relpath("deploy_param.params")
         with open(path_params, "wb") as fo:
-            fo.write(relay.save_param_dict(complied_graph_lib.get_params()))
+            fo.write(runtime.save_param_dict(complied_graph_lib.get_params()))
 
         from tvm import rpc
 
diff --git a/tutorials/frontend/deploy_sparse.py b/tutorials/frontend/deploy_sparse.py
index 9641fb8fd14c..98004a93c74f 100644
--- a/tutorials/frontend/deploy_sparse.py
+++ b/tutorials/frontend/deploy_sparse.py
@@ -81,7 +81,7 @@
 import itertools
 import numpy as np
 import tensorflow as tf
-from tvm import relay
+from tvm import relay, runtime
 from tvm.contrib import graph_runtime
 from tvm.relay import data_dep_optimization as ddo
 from tensorflow.python.framework.convert_to_constants import (
@@ -196,7 +196,7 @@ def import_graphdef(
             with open(os.path.join(abs_path, relay_file), "w") as fo:
                 fo.write(tvm.ir.save_json(mod))
             with open(os.path.join(abs_path, relay_params), "wb") as fo:
-                fo.write(relay.save_param_dict(params))
+                fo.write(runtime.save_param_dict(params))
 
     return mod, params, shape_dict
 

From a8d1055208bd4412018685609c81bb76e107aeef Mon Sep 17 00:00:00 2001
From: Tristan Konolige <tristan.konolige@gmail.com>
Date: Mon, 8 Mar 2021 21:40:57 -0800
Subject: [PATCH 303/357] [FIX] Fix clang12 warnings (#7593)

---
 src/relay/backend/contrib/tensorrt/codegen.cc | 3 +++
 src/runtime/contrib/cublas/cublas.cc          | 4 ++--
 src/runtime/contrib/cublas/cublas_utils.cc    | 2 +-
 src/runtime/contrib/json/json_runtime.h       | 6 +++---
 src/runtime/micro/micro_session.cc            | 2 +-
 5 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/src/relay/backend/contrib/tensorrt/codegen.cc b/src/relay/backend/contrib/tensorrt/codegen.cc
index cb648333df8d..059dbc192a04 100644
--- a/src/relay/backend/contrib/tensorrt/codegen.cc
+++ b/src/relay/backend/contrib/tensorrt/codegen.cc
@@ -156,6 +156,9 @@ class TensorRTJSONSerializer : public backend::contrib::JSONSerializer {
         // with slice_mode = "size", attrs->end_value mean the size of the slice
         int end_value = attrs->end.value()[i].as<IntImmNode>()->value;
         size_value = (end_value == -1) ? ishape[i] - begin_value : end_value;
+      } else {
+        LOG(FATAL) << "Unexpected slice_mode " << attrs->slice_mode << ", expected end or size";
+        throw;
       }
       ICHECK_GT(size_value, 0);
       size.push_back(std::to_string(size_value));
diff --git a/src/runtime/contrib/cublas/cublas.cc b/src/runtime/contrib/cublas/cublas.cc
index ce69d4ca7bde..b12992f57159 100644
--- a/src/runtime/contrib/cublas/cublas.cc
+++ b/src/runtime/contrib/cublas/cublas.cc
@@ -167,7 +167,7 @@ inline void CallLtIgemm(TVMArgs args, TVMRetValue* ret, cublasLtHandle_t hdl) {
   ICHECK(CheckMixPrecisionType(A->dtype, C->dtype)) << "Unsupported data type";
   int32_t alpha = args.size() > 5 ? args[5] : 1;
   int32_t beta = args.size() > 6 ? args[6] : 0;
-  cublasLtMatrixLayout_t Adesc = NULL, Bdesc = NULL, Cdesc = NULL;
+  cublasLtMatrixLayout_t Adesc = nullptr, Bdesc = nullptr, Cdesc = nullptr;
   auto A_data = reinterpret_cast<void*>(static_cast<char*>(A->data) + A->byte_offset);
   auto B_data = reinterpret_cast<void*>(static_cast<char*>(B->data) + B->byte_offset);
   auto C_data = reinterpret_cast<void*>(static_cast<char*>(C->data) + C->byte_offset);
@@ -204,7 +204,7 @@ inline void CallLtIgemm(TVMArgs args, TVMRetValue* ret, cublasLtHandle_t hdl) {
                                                       &order_COL32, sizeof(order_COL32)));
 
   CHECK_CUBLAS_ERROR(cublasLtMatmul(hdl, operationDesc, &alpha, B_data, Adesc, A_data, Bdesc, &beta,
-                                    C_data, Cdesc, C_data, Cdesc, NULL, NULL, 0, 0));
+                                    C_data, Cdesc, C_data, Cdesc, nullptr, nullptr, 0, nullptr));
 }
 #endif
 
diff --git a/src/runtime/contrib/cublas/cublas_utils.cc b/src/runtime/contrib/cublas/cublas_utils.cc
index d4ec08770723..4b4a1b755e66 100644
--- a/src/runtime/contrib/cublas/cublas_utils.cc
+++ b/src/runtime/contrib/cublas/cublas_utils.cc
@@ -35,7 +35,7 @@ CuBlasThreadEntry::CuBlasThreadEntry() { CHECK_CUBLAS_ERROR(cublasCreate(&handle
 CuBlasThreadEntry::~CuBlasThreadEntry() {
   if (handle) {
     cublasDestroy(handle);
-    handle = 0;
+    handle = nullptr;
   }
 }
 
diff --git a/src/runtime/contrib/json/json_runtime.h b/src/runtime/contrib/json/json_runtime.h
index 3ae652ccaf24..55f16635b9e6 100644
--- a/src/runtime/contrib/json/json_runtime.h
+++ b/src/runtime/contrib/json/json_runtime.h
@@ -55,7 +55,7 @@ class JSONRuntimeBase : public ModuleNode {
     LoadGraph(graph_json_);
   }
 
-  const char* type_key() const { return "json"; }
+  const char* type_key() const override { return "json"; }
 
   /*! \brief Initialize a specific json runtime. */
   virtual void Init(const Array<NDArray>& consts) = 0;
@@ -69,7 +69,7 @@ class JSONRuntimeBase : public ModuleNode {
    * \param sptr_to_self The pointer to the module node.
    * \return The packed function.
    */
-  virtual PackedFunc GetFunction(const std::string& name, const ObjectPtr<Object>& sptr_to_self) {
+  PackedFunc GetFunction(const std::string& name, const ObjectPtr<Object>& sptr_to_self) override {
     if (name == "get_symbol") {
       return PackedFunc(
           [sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { *rv = this->symbol_name_; });
@@ -98,7 +98,7 @@ class JSONRuntimeBase : public ModuleNode {
     }
   }
 
-  virtual void SaveToBinary(dmlc::Stream* stream) {
+  void SaveToBinary(dmlc::Stream* stream) override {
     // Save the symbol
     stream->Write(symbol_name_);
     // Save the graph
diff --git a/src/runtime/micro/micro_session.cc b/src/runtime/micro/micro_session.cc
index f26a717dae33..6c0d0c4c40fe 100644
--- a/src/runtime/micro/micro_session.cc
+++ b/src/runtime/micro/micro_session.cc
@@ -172,7 +172,7 @@ class MicroTransportChannel : public RPCChannel {
     // confusion.
     unsigned int seed = random_seed.load();
     if (seed == 0) {
-      seed = (unsigned int)time(NULL);
+      seed = (unsigned int)time(nullptr);
     }
     uint8_t initial_nonce = 0;
     for (int i = 0; i < kNumRandRetries && initial_nonce == 0; ++i) {

From d830f2cd9b5825da93ba49353f374d773356c798 Mon Sep 17 00:00:00 2001
From: Junru Shao <junrushao1994@gmail.com>
Date: Tue, 9 Mar 2021 05:37:18 -0800
Subject: [PATCH 304/357] [Runtime][Object] Add Object::unique() (#7615)

---
 include/tvm/runtime/object.h | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/include/tvm/runtime/object.h b/include/tvm/runtime/object.h
index b5cf77d590f6..70ab7688c450 100644
--- a/include/tvm/runtime/object.h
+++ b/include/tvm/runtime/object.h
@@ -185,7 +185,11 @@ class TVM_DLL Object {
    */
   template <typename TargetType>
   inline bool IsInstance() const;
-
+  /*!
+   * \return Weather the cell has only one reference
+   * \note We use stl style naming to be consistent with known API in shared_ptr.
+   */
+  inline bool unique() const;
   /*!
    * \brief Get the type key of the corresponding index from runtime.
    * \param tindex The type index.
@@ -831,6 +835,8 @@ inline bool Object::IsInstance() const {
   }
 }
 
+inline bool Object::unique() const { return use_count() == 1; }
+
 template <typename ObjectType>
 inline const ObjectType* ObjectRef::as() const {
   if (data_ != nullptr && data_->IsInstance<ObjectType>()) {

From dfc231c97b6fa211256719894f23fa9a80d7326f Mon Sep 17 00:00:00 2001
From: Cody Yu <comaniac0422@gmail.com>
Date: Tue, 9 Mar 2021 05:38:13 -0800
Subject: [PATCH 305/357] [Bugfix][AutoScheduler] Correctly resume status
 (#7614)

---
 python/tvm/auto_scheduler/task_scheduler.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/tvm/auto_scheduler/task_scheduler.py b/python/tvm/auto_scheduler/task_scheduler.py
index b6b05298aef7..0221870badcf 100644
--- a/python/tvm/auto_scheduler/task_scheduler.py
+++ b/python/tvm/auto_scheduler/task_scheduler.py
@@ -516,9 +516,9 @@ def _restore_status(self, log_file, num_measures_per_round):
 
             if res.error_no == 0:
                 cost = array_mean(res.costs)
-                if self.best_costs[task_idx] < cost:
+                if cost < self.best_costs[task_idx]:
                     self.best_costs[task_idx] = cost
-                    self.task_best_cts = self.task_cts[task_idx]
+                    self.task_best_cts[task_idx] = self.task_cts[task_idx]
 
         for idx in range(len(self.tasks)):
             if self.task_cts[idx] - self.task_best_cts[idx] > self.early_stopping_task:

From 8f9e5a48ec5887776b0c8d72401c926a13a81264 Mon Sep 17 00:00:00 2001
From: LuukOddity <72092094+LuukOddity@users.noreply.github.com>
Date: Tue, 9 Mar 2021 15:05:24 +0100
Subject: [PATCH 306/357] Added MaybeAlign to CreateAtomicRMW calls to fix
 build for LLVM13 (#7617)

---
 src/target/llvm/codegen_amdgpu.cc | 12 ++++++++++++
 src/target/llvm/codegen_nvptx.cc  | 12 ++++++++++++
 2 files changed, 24 insertions(+)

diff --git a/src/target/llvm/codegen_amdgpu.cc b/src/target/llvm/codegen_amdgpu.cc
index 605870f48c52..ca21892ccc5f 100644
--- a/src/target/llvm/codegen_amdgpu.cc
+++ b/src/target/llvm/codegen_amdgpu.cc
@@ -190,14 +190,26 @@ class CodeGenAMDGPU : public CodeGenLLVM {
       llvm::Value* v1 = MakeValue(op->args[1]);
       if (op->args[1]->dtype.is_float()) {
 #if TVM_LLVM_VERSION >= 90
+#if TVM_LLVM_VERSION >= 130
         return builder_->CreateAtomicRMW(llvm::AtomicRMWInst::FAdd, v0, v1,
+                                         llvm::MaybeAlign::MaybeAlign(),
                                          llvm::AtomicOrdering::Monotonic);
+#else
+        return builder_->CreateAtomicRMW(llvm::AtomicRMWInst::FAdd, v0, v1,
+                                         llvm::AtomicOrdering::Monotonic);
+#endif
 #else
         LOG(FATAL) << "Floating point atomic requires LLVM 9 or newer";
 #endif
       }
+#if TVM_LLVM_VERSION >= 130
       return builder_->CreateAtomicRMW(llvm::AtomicRMWInst::Add, v0, v1,
+                                       llvm::MaybeAlign::MaybeAlign(),
                                        llvm::AtomicOrdering::Monotonic);
+#else
+      return builder_->CreateAtomicRMW(llvm::AtomicRMWInst::Add, v0, v1,
+                                       llvm::AtomicOrdering::Monotonic);
+#endif
     }
     return CodeGenLLVM::CreateIntrinsic(op);
   }
diff --git a/src/target/llvm/codegen_nvptx.cc b/src/target/llvm/codegen_nvptx.cc
index d8002a2b58a6..05d017862516 100644
--- a/src/target/llvm/codegen_nvptx.cc
+++ b/src/target/llvm/codegen_nvptx.cc
@@ -238,14 +238,26 @@ llvm::Value* CodeGenNVPTX::CreateIntrinsic(const CallNode* op) {
     llvm::Value* v1 = MakeValue(op->args[1]);
     if (op->args[1]->dtype.is_float()) {
 #if TVM_LLVM_VERSION >= 90
+#if TVM_LLVM_VERSION >= 130
       return builder_->CreateAtomicRMW(llvm::AtomicRMWInst::FAdd, v0, v1,
+                                       llvm::MaybeAlign::MaybeAlign(),
                                        llvm::AtomicOrdering::Monotonic);
+#else
+      return builder_->CreateAtomicRMW(llvm::AtomicRMWInst::FAdd, v0, v1,
+                                       llvm::AtomicOrdering::Monotonic);
+#endif
 #else
       LOG(FATAL) << "Floating point atomic requires LLVM 9 or newer";
 #endif
     }
+#if TVM_LLVM_VERSION >= 130
     return builder_->CreateAtomicRMW(llvm::AtomicRMWInst::Add, v0, v1,
+                                     llvm::MaybeAlign::MaybeAlign(),
                                      llvm::AtomicOrdering::Monotonic);
+#else
+    return builder_->CreateAtomicRMW(llvm::AtomicRMWInst::Add, v0, v1,
+                                     llvm::AtomicOrdering::Monotonic);
+#endif
   }
   return CodeGenLLVM::CreateIntrinsic(op);
 }

From 12c3b3d096c0bd509199b9ecdd0f36107950cc9e Mon Sep 17 00:00:00 2001
From: PENGUINLIONG <admin@penguinliong.moe>
Date: Tue, 9 Mar 2021 22:15:59 +0800
Subject: [PATCH 307/357] Prevent host Vulkan SDK blocking cross-compilation
 (#7609)

---
 cmake/modules/Vulkan.cmake | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/cmake/modules/Vulkan.cmake b/cmake/modules/Vulkan.cmake
index 4df8986c800c..095790f08547 100644
--- a/cmake/modules/Vulkan.cmake
+++ b/cmake/modules/Vulkan.cmake
@@ -26,16 +26,11 @@ IF USE_VULKAN)
 tvm_option(USE_VULKAN_VALIDATION "Enable Vulkan API validation layers" OFF
   IF USE_VULKAN)
 
-if(Vulkan_FOUND)
-  # always set the includedir
-  # avoid global retrigger of cmake
-  include_directories(SYSTEM ${Vulkan_INCLUDE_DIRS})
-endif(Vulkan_FOUND)
-
 if(USE_VULKAN)
   if(NOT Vulkan_FOUND)
     message(FATAL_ERROR "Cannot find Vulkan, USE_VULKAN=" ${USE_VULKAN})
   endif()
+  include_directories(SYSTEM ${Vulkan_INCLUDE_DIRS})
   message(STATUS "Build with Vulkan support")
   file(GLOB RUNTIME_VULKAN_SRCS src/runtime/vulkan/vulkan.cc)
   file(GLOB COMPILER_VULKAN_SRCS src/target/spirv/*.cc)

From a0656f5c40d3db4ed593b6165b06fb72b7638de0 Mon Sep 17 00:00:00 2001
From: masahi <masahi129@gmail.com>
Date: Wed, 10 Mar 2021 01:27:02 +0900
Subject: [PATCH 308/357] [SPIRV] Minor update to TIR sort to make it work on
 VK/SPIR-V (#7607)

* sort started to working

* static size sort seems to be working

* test sort on vulkan

* add nvptx to sort test too
---
 python/tvm/topi/cuda/sort.py               | 21 ++++++++++-----------
 tests/python/topi/python/test_topi_sort.py |  6 +++---
 2 files changed, 13 insertions(+), 14 deletions(-)

diff --git a/python/tvm/topi/cuda/sort.py b/python/tvm/topi/cuda/sort.py
index ff5cc0681ad2..ca832ef0ef36 100644
--- a/python/tvm/topi/cuda/sort.py
+++ b/python/tvm/topi/cuda/sort.py
@@ -23,6 +23,7 @@
 from ..transform import strided_slice, transpose
 from .. import tag
 from ..utils import ceil_div, swap
+from ..math import cast
 
 
 def _schedule_sort(outs):
@@ -142,6 +143,8 @@ def bottom_up_merge(source, dest, source_idx, dest_idx, start, middle, end, even
         """
         # pylint: disable=arguments-out-of-order
         # initialize iterators
+        i = ib.allocate("int64", (1,), name="i", scope="local")
+        j = ib.allocate("int64", (1,), name="j", scope="local")
         i[0] = start
         j[0] = middle
         # set up indexes
@@ -189,12 +192,13 @@ def assign_j():
 
     def mergesort(source, dest, source_idx, dest_idx, size, width, even):
         # calculate the start, mid, and end points of this section
-        start[0] = width * tid
-        with ib.if_scope(start[0] < size):
-            middle[0] = tvm.te.min(start[0] + tvm.tir.indexdiv(width, 2), size)
-            end[0] = tvm.te.min(start[0] + width, size)
-            ## merge the start->middle and middle->end arrays
-            bottom_up_merge(source, dest, source_idx, dest_idx, start[0], middle[0], end[0], even)
+        start = width * tid
+
+        with ib.if_scope(start < size):
+            middle = cast(tvm.te.min(start + tvm.tir.indexdiv(width, 2), size), "int64")
+            end = cast(tvm.te.min(start + width, size), "int64")
+            # merge the start->middle and middle->end arrays
+            bottom_up_merge(source, dest, source_idx, dest_idx, start, middle, end, even)
 
     lim = tvm.tir.generic.cast(
         tvm.tir.ceil(tvm.tir.log2(tvm.tir.generic.cast(size, "float64"))), "int64"
@@ -203,11 +207,6 @@ def mergesort(source, dest, source_idx, dest_idx, size, width, even):
         width = 2 << l2_width
         # Define and launch the cuda kernel
         with ib.new_scope():
-            i = ib.allocate("int64", (1,), name="i", scope="local")
-            j = ib.allocate("int64", (1,), name="j", scope="local")
-            start = ib.allocate("int64", (1,), name="start", scope="local")
-            middle = ib.allocate("int64", (1,), name="middle", scope="local")
-            end = ib.allocate("int64", (1,), name="end", scope="local")
             tx = te.thread_axis("threadIdx.x")
             bx = te.thread_axis("blockIdx.x")
             ib.scope_attr(tx, "thread_extent", nthread_tx)
diff --git a/tests/python/topi/python/test_topi_sort.py b/tests/python/topi/python/test_topi_sort.py
index 626218f30144..85a35488ab22 100644
--- a/tests/python/topi/python/test_topi_sort.py
+++ b/tests/python/topi/python/test_topi_sort.py
@@ -75,7 +75,7 @@ def check_device(device):
         f(tvm_data, tvm_out)
         tvm.testing.assert_allclose(tvm_out.asnumpy(), np_sort, rtol=1e0)
 
-    for device in ["llvm", "cuda", "opencl"]:
+    for device in ["llvm", "cuda", "opencl", "vulkan", "nvptx"]:
         check_device(device)
 
 
@@ -115,7 +115,7 @@ def check_device(device):
         f(tvm_data, tvm_out)
         tvm.testing.assert_allclose(tvm_out.asnumpy(), np_indices.astype(data_dtype), rtol=1e0)
 
-    for device in ["llvm", "cuda", "opencl"]:
+    for device in ["llvm", "cuda", "opencl", "vulkan", "nvptx"]:
         check_device(device)
 
 
@@ -167,7 +167,7 @@ def check_device(device):
         else:
             tvm.testing.assert_allclose(tvm_res[0].asnumpy(), np_indices)
 
-    for device in ["llvm", "cuda", "opencl"]:
+    for device in ["llvm", "cuda", "opencl", "vulkan", "nvptx"]:
         check_device(device)
 
 

From 92ab9e45049663ac031d22d088dd430e49fc5827 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=8E=8B=E6=8C=AF=E5=8D=8E=20=28Zhenhua=20WANG=29?=
 <zhenhuaw@nvidia.com>
Date: Wed, 10 Mar 2021 05:32:14 +0800
Subject: [PATCH 309/357] Allow cuDNN in non-CUDA non-system dir (#7608)

cuDNN is not a builtin library of the CUDA toolkit package.
The user can install it in the CUDA directory, the system
directory, or anywhere else. This patch relax the restriction
of locating cuDNN in the CUDA directory. This is helpfull
when trying out different versions of cuDNN.
---
 cmake/config.cmake         |  5 ++++-
 cmake/modules/CUDA.cmake   |  5 +++--
 cmake/utils/FindCUDA.cmake | 44 ++++++++++++++++++++++++++++----------
 3 files changed, 40 insertions(+), 14 deletions(-)

diff --git a/cmake/config.cmake b/cmake/config.cmake
index 872feb918a4f..30c21f707c08 100644
--- a/cmake/config.cmake
+++ b/cmake/config.cmake
@@ -174,7 +174,10 @@ set(USE_FLATBUFFERS_PATH none)
 # - /path/to/edgetpu: use specific path to edgetpu library
 set(USE_EDGETPU OFF)
 
-# Whether use CuDNN
+# Possible values:
+# - ON: enable cuDNN with cmake's auto search in CUDA directory
+# - OFF: disable cuDNN
+# - /path/to/cudnn: use specific path to cuDNN path
 set(USE_CUDNN OFF)
 
 # Whether use cuBLAS
diff --git a/cmake/modules/CUDA.cmake b/cmake/modules/CUDA.cmake
index 1e104218a456..0ec2f1466bd1 100644
--- a/cmake/modules/CUDA.cmake
+++ b/cmake/modules/CUDA.cmake
@@ -16,12 +16,12 @@
 # under the License.
 
 # CUDA Module
-find_cuda(${USE_CUDA})
+find_cuda(${USE_CUDA} ${USE_CUDNN})
 
 if(CUDA_FOUND)
   # always set the includedir when cuda is available
   # avoid global retrigger of cmake
-	include_directories(SYSTEM ${CUDA_INCLUDE_DIRS})
+  include_directories(SYSTEM ${CUDA_INCLUDE_DIRS})
 endif(CUDA_FOUND)
 
 if(USE_CUDA)
@@ -40,6 +40,7 @@ if(USE_CUDA)
 
   if(USE_CUDNN)
     message(STATUS "Build with cuDNN support")
+    include_directories(SYSTEM ${CUDA_CUDNN_INCLUDE_DIRS})
     file(GLOB CONTRIB_CUDNN_SRCS src/runtime/contrib/cudnn/*.cc)
     list(APPEND RUNTIME_SRCS ${CONTRIB_CUDNN_SRCS})
     list(APPEND TVM_RUNTIME_LINKER_LIBS ${CUDA_CUDNN_LIBRARY})
diff --git a/cmake/utils/FindCUDA.cmake b/cmake/utils/FindCUDA.cmake
index 564b837515a7..aaddfb054366 100644
--- a/cmake/utils/FindCUDA.cmake
+++ b/cmake/utils/FindCUDA.cmake
@@ -19,10 +19,12 @@
 # Enhanced version of find CUDA.
 #
 # Usage:
-#   find_cuda(${USE_CUDA})
+#   find_cuda(${USE_CUDA} ${USE_CUDNN})
 #
 # - When USE_CUDA=ON, use auto search
 # - When USE_CUDA=/path/to/cuda-path, use the cuda path
+# - When USE_CUDNN=ON, use auto search
+# - When USE_CUDNN=/path/to/cudnn-path, use the cudnn path
 #
 # Provide variables:
 #
@@ -32,10 +34,11 @@
 # - CUDA_CUDA_LIBRARY
 # - CUDA_CUDART_LIBRARY
 # - CUDA_NVRTC_LIBRARY
+# - CUDA_CUDNN_INCLUDE_DIRS
 # - CUDA_CUDNN_LIBRARY
 # - CUDA_CUBLAS_LIBRARY
 #
-macro(find_cuda use_cuda)
+macro(find_cuda use_cuda use_cudnn)
   set(__use_cuda ${use_cuda})
   if(${__use_cuda} MATCHES ${IS_TRUE_PATTERN})
     find_package(CUDA QUIET)
@@ -64,9 +67,6 @@ macro(find_cuda use_cuda)
       find_library(CUDA_NVRTC_LIBRARY nvrtc
         ${CUDA_TOOLKIT_ROOT_DIR}/lib/x64
         ${CUDA_TOOLKIT_ROOT_DIR}/lib/Win32)
-      find_library(CUDA_CUDNN_LIBRARY cudnn
-        ${CUDA_TOOLKIT_ROOT_DIR}/lib/x64
-        ${CUDA_TOOLKIT_ROOT_DIR}/lib/Win32)
       find_library(CUDA_CUBLAS_LIBRARY cublas
         ${CUDA_TOOLKIT_ROOT_DIR}/lib/x64
         ${CUDA_TOOLKIT_ROOT_DIR}/lib/Win32)
@@ -85,12 +85,6 @@ macro(find_cuda use_cuda)
         PATHS ${CUDA_TOOLKIT_ROOT_DIR}
         PATH_SUFFIXES lib lib64 targets/x86_64-linux/lib targets/x86_64-linux/lib/stubs lib64/stubs lib/x86_64-linux-gnu
         NO_DEFAULT_PATH)
-      find_library(CUDA_CUDNN_LIBRARY cudnn
-        ${CUDA_TOOLKIT_ROOT_DIR}/lib64
-        ${CUDA_TOOLKIT_ROOT_DIR}/lib
-        NO_DEFAULT_PATH)
-      # search default path if cannot find cudnn in non-default
-      find_library(CUDA_CUDNN_LIBRARY cudnn)
       find_library(CUDA_CUBLAS_LIBRARY cublas
         ${CUDA_TOOLKIT_ROOT_DIR}/lib64
         ${CUDA_TOOLKIT_ROOT_DIR}/lib
@@ -102,10 +96,38 @@ macro(find_cuda use_cuda)
         ${CUDA_TOOLKIT_ROOT_DIR}/lib
         NO_DEFAULT_PATH)
     endif(MSVC)
+
+    # find cuDNN
+    set(__use_cudnn ${use_cudnn})
+    if(${__use_cudnn} MATCHES ${IS_TRUE_PATTERN})
+      set(CUDA_CUDNN_INCLUDE_DIRS ${CUDA_INCLUDE_DIRS})
+      if(MSVC)
+        find_library(CUDA_CUDNN_LIBRARY cudnn
+          ${CUDA_TOOLKIT_ROOT_DIR}/lib/x64
+          ${CUDA_TOOLKIT_ROOT_DIR}/lib/Win32)
+      else(MSVC)
+        find_library(CUDA_CUDNN_LIBRARY cudnn
+          ${CUDA_TOOLKIT_ROOT_DIR}/lib64
+          ${CUDA_TOOLKIT_ROOT_DIR}/lib
+          NO_DEFAULT_PATH)
+        # search default path if cannot find cudnn in non-default
+        find_library(CUDA_CUDNN_LIBRARY cudnn)
+      endif(MSVC)
+    elseif(IS_DIRECTORY ${__use_cudnn})
+      # cuDNN doesn't necessarily live in the CUDA dir
+      set(CUDA_CUDNN_ROOT_DIR ${__use_cudnn})
+      set(CUDA_CUDNN_INCLUDE_DIRS ${CUDA_CUDNN_ROOT_DIR}/include)
+      find_library(CUDA_CUDNN_LIBRARY cudnn
+        ${CUDA_CUDNN_ROOT_DIR}/lib64
+        ${CUDA_CUDNN_ROOT_DIR}/lib
+        NO_DEFAULT_PATH)
+    endif()
+
     message(STATUS "Found CUDA_TOOLKIT_ROOT_DIR=" ${CUDA_TOOLKIT_ROOT_DIR})
     message(STATUS "Found CUDA_CUDA_LIBRARY=" ${CUDA_CUDA_LIBRARY})
     message(STATUS "Found CUDA_CUDART_LIBRARY=" ${CUDA_CUDART_LIBRARY})
     message(STATUS "Found CUDA_NVRTC_LIBRARY=" ${CUDA_NVRTC_LIBRARY})
+    message(STATUS "Found CUDA_CUDNN_INCLUDE_DIRS=" ${CUDA_CUDNN_INCLUDE_DIRS})
     message(STATUS "Found CUDA_CUDNN_LIBRARY=" ${CUDA_CUDNN_LIBRARY})
     message(STATUS "Found CUDA_CUBLAS_LIBRARY=" ${CUDA_CUBLAS_LIBRARY})
     message(STATUS "Found CUDA_CUBLASLT_LIBRARY=" ${CUDA_CUBLASLT_LIBRARY})

From 85832a2abbbe4bfcdcdd6bebb2b0b95943464fd5 Mon Sep 17 00:00:00 2001
From: Alexander Pivovarov <pivovaa@amazon.com>
Date: Tue, 9 Mar 2021 15:32:12 -0800
Subject: [PATCH 310/357] Fix RelayVM for 32-bit platforms (#7605)

---
 src/runtime/vm/executable.cc     | 12 +++---------
 src/runtime/vm/serialize_utils.h |  9 +++++----
 src/support/utils.h              |  9 +++++++++
 3 files changed, 17 insertions(+), 13 deletions(-)

diff --git a/src/runtime/vm/executable.cc b/src/runtime/vm/executable.cc
index eb1707b25aa3..6992097e8d69 100644
--- a/src/runtime/vm/executable.cc
+++ b/src/runtime/vm/executable.cc
@@ -252,11 +252,7 @@ void Executable::SaveConstantSection(dmlc::Stream* strm) {
   }
 
   // Save the const to device mapping.
-  std::vector<size_t> const_device_type;
-  for (auto dev_type : this->const_device_type) {
-    const_device_type.push_back(static_cast<size_t>(dev_type));
-  }
-  strm->Write(const_device_type);
+  strm->Write(this->const_device_type);
 }
 
 void Executable::SavePrimitiveOpNames(dmlc::Stream* strm) {
@@ -525,12 +521,10 @@ void Executable::LoadConstantSection(dmlc::Stream* strm) {
   }
 
   // Load the const to device mapping.
-  std::vector<size_t> const_device_type;
+  std::vector<Index> const_device_type;
   STREAM_CHECK(strm->Read(&const_device_type), "constant");
   ICHECK_EQ(size, const_device_type.size());
-  for (auto dev : const_device_type) {
-    this->const_device_type.push_back(static_cast<Index>(dev));
-  }
+  this->const_device_type = const_device_type;
 }
 
 void Executable::LoadPrimitiveOpNames(dmlc::Stream* strm) {
diff --git a/src/runtime/vm/serialize_utils.h b/src/runtime/vm/serialize_utils.h
index 990da31750d4..b4a10806caaf 100644
--- a/src/runtime/vm/serialize_utils.h
+++ b/src/runtime/vm/serialize_utils.h
@@ -24,7 +24,6 @@
 #ifndef TVM_RUNTIME_VM_SERIALIZE_UTILS_H_
 #define TVM_RUNTIME_VM_SERIALIZE_UTILS_H_
 
-#include <dmlc/common.h>
 #include <dmlc/memory_io.h>
 #include <tvm/runtime/vm/executable.h>
 
@@ -32,6 +31,8 @@
 #include <string>
 #include <vector>
 
+#include "../../support/utils.h"
+
 namespace tvm {
 namespace runtime {
 namespace vm {
@@ -40,9 +41,9 @@ namespace vm {
 constexpr uint64_t kTVMVMBytecodeMagic = 0xD225DE2F4214151D;
 
 template <typename T>
-static inline size_t VectorHash(size_t key, const std::vector<T>& values) {
+static inline uint64_t VectorHash(uint64_t key, const std::vector<T>& values) {
   for (const auto& it : values) {
-    key = dmlc::HashCombine(key, it);
+    key = support::HashCombine(key, it);
   }
   return key;
 }
@@ -122,7 +123,7 @@ struct VMInstructionSerializer {
    * instruction.
    */
   Index Hash() const {
-    size_t key = static_cast<size_t>(opcode);
+    uint64_t key = static_cast<uint64_t>(opcode);
     key = VectorHash(key, fields);
     return key;
   }
diff --git a/src/support/utils.h b/src/support/utils.h
index ce1f2bed43f9..c51b7b966478 100644
--- a/src/support/utils.h
+++ b/src/support/utils.h
@@ -162,6 +162,15 @@ inline size_t HashCombine(size_t key, size_t value) {
   return key ^ (value + 0x9e3779b9 + (key << 6) + (key >> 2));
 }
 
+/*!
+ * \brief hash an object and combines uint64_t key with previous keys
+ */
+template <typename T>
+inline uint64_t HashCombine(uint64_t key, const T& value) {
+  std::hash<T> hash_func;
+  return key ^ (hash_func(value) + 0x9e3779b9 + (key << 6) + (key >> 2));
+}
+
 }  // namespace support
 }  // namespace tvm
 #endif  // TVM_SUPPORT_UTILS_H_

From f3d9cc1066e8fb13bc9da1ce6899ad6c09893bcb Mon Sep 17 00:00:00 2001
From: Trevor Morris <trevmorr@amazon.com>
Date: Tue, 9 Mar 2021 16:29:09 -0800
Subject: [PATCH 311/357] Fix TVM compile without LLVM (#7621)

* Fix TVM compile without LLVM

* Fix formatting
---
 src/target/metadata_module.cc | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/target/metadata_module.cc b/src/target/metadata_module.cc
index e2575c34d8f2..0b30d42c876c 100644
--- a/src/target/metadata_module.cc
+++ b/src/target/metadata_module.cc
@@ -116,8 +116,12 @@ runtime::Module CreateMetadataModule(
       crt_exportable_modules.push_back(target_module);
       target_module = CreateCSourceCrtMetadataModule(crt_exportable_modules, target);
     } else if (target->kind->name == "llvm") {
+#ifdef TVM_LLVM_VERSION
       crt_exportable_modules.push_back(target_module);
       target_module = CreateLLVMCrtMetadataModule(crt_exportable_modules, target);
+#else   // TVM_LLVM_VERSION
+      LOG(FATAL) << "TVM was not built with LLVM enabled.";
+#endif  // TVM_LLVM_VERSION
     }
   } else {
     if (!non_crt_exportable_modules.empty()) {

From 3a0e3a5bbf6271201438c22cebd72581a6545024 Mon Sep 17 00:00:00 2001
From: masahi <masahi129@gmail.com>
Date: Wed, 10 Mar 2021 14:25:16 +0900
Subject: [PATCH 312/357] [SPIR-V] Fix pushconstants offset calculation for 32
 bit values (#7620)

* Fix push constant offset for 32 bit value

* add test

* remove unused function from test

* add dynamic cumsum test

* skip if vulkan is not enabled

* replace dynamic cumsum test with dynamic argsort for now

Co-authored-by: Masahiro Masuda <masahi@129@gmail.com>
---
 src/target/spirv/ir_builder.cc                |  9 ++++-
 .../unittest/test_target_codegen_spirv.py     | 34 +++++++++++++++++++
 2 files changed, 42 insertions(+), 1 deletion(-)

diff --git a/src/target/spirv/ir_builder.cc b/src/target/spirv/ir_builder.cc
index 273fc48c3e30..3a9de4e077dc 100644
--- a/src/target/spirv/ir_builder.cc
+++ b/src/target/spirv/ir_builder.cc
@@ -222,7 +222,14 @@ Value IRBuilder::DeclarePushConstant(const std::vector<SType>& value_types) {
     DataType t = value_types[i].type;
     uint32_t nbits = t.bits() * t.lanes();
     ICHECK_EQ(nbits % 8, 0);
-    offset += nbits / 8;
+    uint32_t bytes = (nbits / 8);
+    if (t.bits() == 32) {
+      // In our Vulkan runtime, each push constant always occupies 64 bit.
+      offset += bytes * 2;
+    } else {
+      ICHECK_EQ(t.bits(), 64);
+      offset += bytes;
+    }
   }
   // Decorate push constants as UBO
   this->Decorate(spv::OpDecorate, struct_type, spv::DecorationBlock);
diff --git a/tests/python/unittest/test_target_codegen_spirv.py b/tests/python/unittest/test_target_codegen_spirv.py
index 2cbf0bea9257..68be5c480358 100644
--- a/tests/python/unittest/test_target_codegen_spirv.py
+++ b/tests/python/unittest/test_target_codegen_spirv.py
@@ -17,6 +17,7 @@
 import tvm
 import tvm.testing
 from tvm import te
+from tvm import relay
 from tvm.topi.math import cast
 import numpy as np
 
@@ -71,5 +72,38 @@ def do_copy(A, B, n):
     tvm.testing.assert_allclose(b.asnumpy(), ref)
 
 
+def test_pushconstants():
+    if not tvm.testing.device_enabled("vulkan"):
+        return
+
+    def check_mod(mod, x_np, res_np):
+        target = "vulkan"
+        ctx = tvm.context(target, 0)
+        ex = relay.create_executor("vm", mod=mod, ctx=ctx, target=target)
+        res = ex.evaluate()(x_np).asnumpy()
+        tvm.testing.assert_allclose(res, res_np, atol=1e-5)
+
+    # Three 32 bit pushconstants: any_dim, stride, stride
+    dtype = "float32"
+    x = relay.var("x", shape=(relay.Any(),), dtype=dtype)
+    mod = tvm.IRModule()
+    mod["main"] = relay.Function([x], relay.sqrt(x))
+    x_np = np.random.uniform(size=(10,)).astype(dtype)
+    res_np = np.sqrt(x_np)
+
+    check_mod(mod, x_np, res_np)
+
+    # One 64 bit and one 32 bit constants
+    dtype = "int32"
+    x = relay.var("x", shape=(relay.Any(),), dtype=dtype)
+    mod = tvm.IRModule()
+    mod["main"] = relay.Function([x], relay.argsort(x))
+    x_np = np.random.randint(0, high=10, size=(10,)).astype(dtype)
+    res_np = np.argsort(x_np)
+
+    check_mod(mod, x_np, res_np)
+
+
 if __name__ == "__main__":
     test_bool_load()
+    test_pushconstants()

From ee052dd6425ca889cb33948826f96fdcc37ff4e4 Mon Sep 17 00:00:00 2001
From: Andrew Reusch <areusch@octoml.ai>
Date: Wed, 10 Mar 2021 07:22:20 -0800
Subject: [PATCH 313/357] Introduce Model Library Format export format (#7533)

* Introduce Model Library Format export format.

 * This function produces a stable on-disk representation of TVM's
   compiler output.
 * It's intended just for use with the C runtime for microTVM right
   now. It could be expanded for other use cases.
 * This PR implements the Model Library Format RFC, which ultimately
   is intended to support the Project Generator API (RFC
   forthcoming).
 * There may be some changes to the format without revving the version
   number until downstream consumers are known. The Project Generator
   API is the first such known downstream consumer.
 * There are no plans currently to support generating old Model
   Library Format from TVM. The version number is intended as a
   compatibility check between the generator and downstream consumers.
---
 python/tvm/micro/__init__.py                  |   1 +
 python/tvm/micro/model_library_format.py      | 171 ++++++++++++++++
 .../relay/backend/graph_runtime_factory.py    |  12 +-
 python/tvm/relay/build_module.py              |  20 +-
 python/tvm/runtime/module.py                  |  26 ++-
 src/runtime/graph/graph_runtime_factory.cc    |   3 +-
 .../test_micro_model_library_format.py        | 190 ++++++++++++++++++
 7 files changed, 404 insertions(+), 19 deletions(-)
 create mode 100644 python/tvm/micro/model_library_format.py
 create mode 100644 tests/python/unittest/test_micro_model_library_format.py

diff --git a/python/tvm/micro/__init__.py b/python/tvm/micro/__init__.py
index 8e5807acec94..ade63f2da9e4 100644
--- a/python/tvm/micro/__init__.py
+++ b/python/tvm/micro/__init__.py
@@ -23,6 +23,7 @@
 from .debugger import GdbRemoteDebugger
 from .micro_library import MicroLibrary
 from .micro_binary import MicroBinary
+from .model_library_format import export_model_library_format, UnsupportedInModelLibraryFormatError
 from .session import (
     create_local_graph_runtime,
     create_local_debug_runtime,
diff --git a/python/tvm/micro/model_library_format.py b/python/tvm/micro/model_library_format.py
new file mode 100644
index 000000000000..4ce80be647c1
--- /dev/null
+++ b/python/tvm/micro/model_library_format.py
@@ -0,0 +1,171 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""Defines functions for exporting to Model Library Format."""
+
+import datetime
+import json
+import os
+import re
+import tarfile
+
+from ..contrib import utils
+from ..relay.backend import graph_runtime_factory
+from ..relay import param_dict
+
+
+class UnsupportedInModelLibraryFormatError(Exception):
+    """Raised when export_model_library_format does not support the given Module tree."""
+
+
+def _populate_codegen_dir(mod, codegen_dir: str):
+    """Populate the codegen sub-directory as part of a Model Library Format export.
+
+    Parameters
+    ----------
+    mod : tvm.runtime.Module
+        Module which should be written to codegen_dir.
+    codegen_dir : str
+        Path to the codegen directory on disk.
+    """
+    dso_modules = mod._collect_dso_modules()
+    dso_module_handles = [m.handle.value for m in dso_modules]
+    non_dso_modules = mod._collect_from_import_tree(lambda m: m not in dso_modules)
+    if non_dso_modules:
+        raise UnsupportedInModelLibraryFormatError(
+            f"Don't know how to export non-c or non-llvm modules; found: {non_dso_modules!r}"
+        )
+
+    mod_indices = {"lib": 0, "src": 0}
+    host_codegen_dir = os.path.join(codegen_dir, "host")
+    for dso_mod in dso_modules:
+        if dso_mod.type_key == "c":
+            index = mod_indices["src"]
+            mod_indices["src"] += 1
+            parent_dir = os.path.join(host_codegen_dir, "src")
+            file_name = os.path.join(parent_dir, f"lib{index}.c")
+        elif dso_mod.type_key == "llvm":
+            index = mod_indices["lib"]
+            mod_indices["lib"] += 1
+            parent_dir = os.path.join(host_codegen_dir, "lib")
+            file_name = os.path.join(parent_dir, f"lib{index}.o")
+        else:
+            assert (
+                False
+            ), f"do not expect module with type_key={mod.type_key} from _collect_dso_modules"
+
+        if not os.path.exists(parent_dir):
+            os.makedirs(parent_dir)
+        dso_mod.save(file_name)
+
+
+def _build_memory_map(graph_json):
+    """Build a simpler memory map from graph JSON.
+
+    Parameters
+    ----------
+    graph_json : str
+        String representation of the graph_json created from tvm.relay.build().
+
+    Returns
+    -------
+    list :
+        A list with one entry per storage id describing that memory.
+    """
+    graph = json.loads(graph_json)
+
+    seen_storage_ids = set()
+    memory_map = []
+    for node_id, storage_id in enumerate(graph["attrs"]["storage_id"][1]):
+        if storage_id in seen_storage_ids:
+            continue
+
+        seen_storage_ids.add(storage_id)
+        num_elements = 1
+        for dim in graph["attrs"]["shape"][1][storage_id]:
+            num_elements *= dim
+
+        dltype = graph["attrs"]["dltype"][1][storage_id]
+        m = re.match(r"^[a-zA-Z]+([0-9]+)$", dltype)
+        assert m, f"Exported graph contains unknown dltype {dltype}"
+
+        elem_bits = int(m.group(1))
+
+        map_entry = {
+            "storage_id": storage_id,
+            "size_bytes": (num_elements * elem_bits + 7) // 8,
+        }
+        if node_id in graph["arg_nodes"]:
+            map_entry["input_binding"] = graph["nodes"][node_id]["name"]
+
+        memory_map.append(map_entry)
+
+    return memory_map
+
+
+def export_model_library_format(mod: graph_runtime_factory.GraphRuntimeFactoryModule, file_name):
+    """Export the build artifact in Model Library Format.
+
+    This function creates a .tar archive containing the build artifacts in a standardized
+    layout. It's intended to allow downstream automation to build TVM artifacts against the C
+    runtime.
+
+    Parameters
+    ----------
+    mod : tvm.relay.backend.graph_runtime_factory.GraphRuntimeFactoryModule
+        The return value of tvm.relay.build, which will be exported into Model Library Format.
+    file_name : str
+        Path to the .tar archive to generate.
+    """
+    tempdir = utils.tempdir()
+    metadata = {
+        "version": 1,
+        "model_name": mod.libmod_name,
+        "export_datetime": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%SZ"),
+        "memory": _build_memory_map(mod.graph_json),
+        "target": {int(k): str(v) for k, v in mod.target.items()},
+        "runtimes": ["graph"],
+    }
+    with open(tempdir.relpath("metadata.json"), "w") as json_f:
+        json.dump(metadata, json_f, indent=2, sort_keys=True)
+
+    codegen_dir_path = tempdir.relpath("codegen")
+    os.mkdir(codegen_dir_path)
+    _populate_codegen_dir(mod.lib, codegen_dir_path)
+
+    parameters_dir_path = tempdir.relpath("parameters")
+    os.mkdir(parameters_dir_path)
+    param_filename = os.path.join(parameters_dir_path, f"{mod.libmod_name}.params")
+    with open(param_filename, "wb") as f:
+        f.write(param_dict.save_param_dict(mod.params))
+
+    with open(tempdir.relpath("relay.txt"), "w") as f:
+        f.write(str(mod.ir_mod))
+
+    graph_config_dir_path = tempdir.relpath(os.path.join("runtime-config", "graph"))
+    os.makedirs(graph_config_dir_path)
+    with open(os.path.join(graph_config_dir_path, "graph.json"), "w") as f:
+        f.write(mod.graph_json)
+
+    with tarfile.open(file_name, "w") as tar_f:
+
+        def reset(tarinfo):
+            tarinfo.uid = tarinfo.gid = 0
+            tarinfo.uname = tarinfo.gname = "root"
+            return tarinfo
+
+        tar_f.add(tempdir.temp_dir, arcname=".", filter=reset)
diff --git a/python/tvm/relay/backend/graph_runtime_factory.py b/python/tvm/relay/backend/graph_runtime_factory.py
index 3427a62cd491..e92ae710ca0b 100644
--- a/python/tvm/relay/backend/graph_runtime_factory.py
+++ b/python/tvm/relay/backend/graph_runtime_factory.py
@@ -16,9 +16,9 @@
 # under the License.
 """Graph runtime factory."""
 import warnings
-from tvm._ffi.base import string_types
-from tvm._ffi.registry import get_global_func
-from tvm.runtime import ndarray
+from ..._ffi.base import string_types
+from ..._ffi.registry import get_global_func
+from ...runtime import ndarray
 
 
 class GraphRuntimeFactoryModule:
@@ -31,6 +31,8 @@ class GraphRuntimeFactoryModule:
         The graph to be deployed in json format output by graph compiler.
         The graph can contain operator(tvm_op) that points to the name of
         PackedFunc in the libmod.
+    target : tvm.Target
+        The Target used to build this module.
     libmod : tvm.Module
         The module of the corresponding function
     libmod_name: str
@@ -39,13 +41,15 @@ class GraphRuntimeFactoryModule:
         The parameters of module
     """
 
-    def __init__(self, graph_json_str, libmod, libmod_name, params):
+    def __init__(self, ir_mod, target, graph_json_str, libmod, libmod_name, params):
         assert isinstance(graph_json_str, string_types)
         fcreate = get_global_func("tvm.graph_runtime_factory.create")
         args = []
         for k, v in params.items():
             args.append(k)
             args.append(ndarray.array(v))
+        self.ir_mod = ir_mod
+        self.target = target
         self.module = fcreate(graph_json_str, libmod, libmod_name, *args)
         self.graph_json = graph_json_str
         self.lib = libmod
diff --git a/python/tvm/relay/build_module.py b/python/tvm/relay/build_module.py
index 4c9a898f2374..8e69d288df12 100644
--- a/python/tvm/relay/build_module.py
+++ b/python/tvm/relay/build_module.py
@@ -208,14 +208,14 @@ def _build_module_no_factory(mod, target=None, target_host=None, params=None, mo
     return build(mod, target, target_host, params, mod_name).module
 
 
-def build(mod, target=None, target_host=None, params=None, mod_name="default"):
+def build(ir_mod, target=None, target_host=None, params=None, mod_name="default"):
     # fmt: off
     # pylint: disable=line-too-long
     """Helper function that builds a Relay function to run on TVM graph runtime.
 
     Parameters
     ----------
-    mod : :py:class:`~tvm.IRModule`
+    ir_mod : :py:class:`~tvm.IRModule`
         The IR module to build. Using relay.Function is deprecated.
 
     target : str, :any:`tvm.target.Target`, or dict of str(i.e. device/context name) to str/tvm.target.Target, optional
@@ -251,13 +251,13 @@ def build(mod, target=None, target_host=None, params=None, mod_name="default"):
     """
     # pylint: enable=line-too-long
     # fmt: on
-    if not isinstance(mod, (IRModule, _function.Function)):
+    if not isinstance(ir_mod, (IRModule, _function.Function)):
         raise ValueError("Type of input parameter mod must be tvm.IRModule")
 
-    if isinstance(mod, _function.Function):
+    if isinstance(ir_mod, _function.Function):
         if params:
-            mod = bind_params_by_name(mod, params)
-        mod = IRModule.from_expr(mod)
+            ir_mod = bind_params_by_name(ir_mod, params)
+        ir_mod = IRModule.from_expr(ir_mod)
         warnings.warn(
             "Please use input parameter mod (tvm.IRModule) "
             "instead of deprecated parameter mod (tvm.relay.function.Function)",
@@ -280,9 +280,11 @@ def build(mod, target=None, target_host=None, params=None, mod_name="default"):
 
     with tophub_context:
         bld_mod = BuildModule()
-        graph_json, mod, params = bld_mod.build(mod, target, target_host, params)
-        mod = _graph_runtime_factory.GraphRuntimeFactoryModule(graph_json, mod, mod_name, params)
-        return mod
+        graph_json, runtime_mod, params = bld_mod.build(ir_mod, target, target_host, params)
+        runtime_mod = _graph_runtime_factory.GraphRuntimeFactoryModule(
+            ir_mod, target, graph_json, runtime_mod, mod_name, params
+        )
+        return runtime_mod
 
 
 def optimize(mod, target=None, params=None):
diff --git a/python/tvm/runtime/module.py b/python/tvm/runtime/module.py
index 63267969ab4e..53576a60f32f 100644
--- a/python/tvm/runtime/module.py
+++ b/python/tvm/runtime/module.py
@@ -105,6 +105,9 @@ def __getitem__(self, name):
             raise ValueError("Can only take string as function name")
         return self.get_function(name)
 
+    def __eq__(self, other):
+        return self.handle.value == other.handle.value
+
     def __call__(self, *args):
         if self._entry:
             return self._entry(*args)
@@ -233,15 +236,27 @@ def evaluator(*args):
         except NameError:
             raise NameError("time_evaluate is only supported when RPC is enabled")
 
-    def _collect_dso_modules(self):
-        """Helper function to collect dso modules, then return it."""
+    def _collect_from_import_tree(self, filter_func):
+        """Helper function to collect modules from the tree matching a filter_func, then return it.
+
+        Parameters
+        ----------
+        filter_func : Callable[[Module], bool]
+            A function which is invoked for each Module discovered in the import tree (including
+            self).
+
+        Returns
+        -------
+        list[Module] :
+            A list of matching Module.
+        """
         visited, stack, dso_modules = set(), [], []
         # append root module
         visited.add(self)
         stack.append(self)
         while stack:
             module = stack.pop()
-            if module._dso_exportable():
+            if filter_func(module):
                 dso_modules.append(module)
             for m in module.imported_modules:
                 if m not in visited:
@@ -249,8 +264,9 @@ def _collect_dso_modules(self):
                     stack.append(m)
         return dso_modules
 
-    def _dso_exportable(self):
-        return self.type_key == "llvm" or self.type_key == "c"
+    def _collect_dso_modules(self):
+        is_dso_exportable = lambda m: (m.type_key == "llvm" or m.type_key == "c")
+        return self._collect_from_import_tree(is_dso_exportable)
 
     def export_library(self, file_name, fcompile=None, addons=None, workspace_dir=None, **kwargs):
         """Export the module and its imported device code one library.
diff --git a/src/runtime/graph/graph_runtime_factory.cc b/src/runtime/graph/graph_runtime_factory.cc
index 4d3993a9a36f..605d6b0ce892 100644
--- a/src/runtime/graph/graph_runtime_factory.cc
+++ b/src/runtime/graph/graph_runtime_factory.cc
@@ -156,7 +156,8 @@ TVM_REGISTER_GLOBAL("tvm.graph_runtime_factory.create").set_body([](TVMArgs args
                                  "graph_runtime_factory.create needs at least 3, "
                                  "but it has "
                               << args.num_args;
-  // The argument order is graph_json, module, module_name, params.
+  // The argument order is graph_json, module, module_name, param0_name, param0_tensor,
+  // [param1_name, param1_tensor], ...
   ICHECK_EQ((args.size() - 3) % 2, 0);
   std::unordered_map<std::string, tvm::runtime::NDArray> params;
   for (size_t i = 3; i < static_cast<size_t>(args.size()); i += 2) {
diff --git a/tests/python/unittest/test_micro_model_library_format.py b/tests/python/unittest/test_micro_model_library_format.py
new file mode 100644
index 000000000000..c999091cc3cc
--- /dev/null
+++ b/tests/python/unittest/test_micro_model_library_format.py
@@ -0,0 +1,190 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import datetime
+import json
+import os
+import sys
+import tarfile
+
+import numpy
+import pytest
+
+import tvm
+import tvm.relay
+from tvm.relay.backend import graph_runtime_factory
+import tvm.runtime.module
+import tvm.testing
+from tvm.contrib import utils
+
+
+def validate_graph_json(extract_dir, factory):
+    with open(os.path.join(extract_dir, "runtime-config", "graph", "graph.json")) as graph_f:
+        graph_json = graph_f.read()
+        assert graph_json == factory.graph_json
+
+        # Just check it parses and looks roughly right.
+        graph = json.loads(graph_json)
+        assert "nodes" in graph
+        assert len(graph["nodes"]) == 4
+        assert "attrs" in graph
+
+
+@tvm.testing.requires_micro
+def test_export_model_library_format_c():
+    with utils.TempDirectory.set_keep_for_debug(True):
+        target = tvm.target.target.micro("host")
+        with tvm.transform.PassContext(opt_level=3, config={"tir.disable_vectorize": True}):
+            relay_mod = tvm.parser.fromtext(
+                """
+            #[version = "0.0.5"]
+            def @main(%a : Tensor[(1, 2), uint8], %b : Tensor[(1, 2), float32], %c : Tensor[(1, 2), float32]) {
+            %0 = cast(%a, dtype="float32") + %b * %c;
+            %0
+            }"""
+            )
+            factory = tvm.relay.build(
+                relay_mod,
+                target,
+                target_host=target,
+                mod_name="add",
+                params={"c": numpy.array([[2.0, 4.0]], dtype="float32")},
+            )
+
+        temp_dir = utils.tempdir()
+        mlf_tar_path = temp_dir.relpath("lib.tar")
+        import tvm.micro as micro
+
+        micro.export_model_library_format(factory, mlf_tar_path)
+        tf = tarfile.open(mlf_tar_path)
+
+        extract_dir = temp_dir.relpath("extract")
+        os.mkdir(extract_dir)
+        tf.extractall(extract_dir)
+
+        with open(os.path.join(extract_dir, "metadata.json")) as json_f:
+            metadata = json.load(json_f)
+            assert metadata["version"] == 1
+            assert metadata["model_name"] == "add"
+            export_datetime = datetime.datetime.strptime(
+                metadata["export_datetime"], "%Y-%m-%d %H:%M:%SZ"
+            )
+            assert (datetime.datetime.now() - export_datetime) < datetime.timedelta(seconds=60 * 5)
+            assert metadata["target"] == {"1": str(target)}
+            assert metadata["memory"] == [
+                {"storage_id": 0, "size_bytes": 2, "input_binding": "a"},
+                {"storage_id": 1, "size_bytes": 8, "input_binding": "b"},
+                {"storage_id": 2, "size_bytes": 8, "input_binding": "p0"},
+                {"storage_id": 3, "size_bytes": 8},
+            ]
+
+        assert os.path.exists(os.path.join(extract_dir, "codegen", "host", "src", "lib0.c"))
+        assert os.path.exists(os.path.join(extract_dir, "codegen", "host", "src", "lib1.c"))
+
+        validate_graph_json(extract_dir, factory)
+
+        with open(os.path.join(extract_dir, "relay.txt")) as relay_f:
+            assert relay_f.read() == str(relay_mod)
+
+        with open(os.path.join(extract_dir, "parameters", "add.params"), "rb") as params_f:
+            params = tvm.relay.load_param_dict(params_f.read())
+            assert "p0" in params
+
+
+@tvm.testing.requires_micro
+def test_export_model_library_format_llvm():
+    with utils.TempDirectory.set_keep_for_debug(True):
+        target = tvm.target.target.micro("host")
+        assert str(target)[:2] == "c "
+        target = tvm.target.Target("llvm " + str(target)[2:])
+        with tvm.transform.PassContext(opt_level=3):
+            relay_mod = tvm.parser.fromtext(
+                """
+            #[version = "0.0.5"]
+            def @main(%a : Tensor[(1, 2), uint8], %b : Tensor[(1, 2), float32], %c : Tensor[(1, 2), float32]) {
+            %0 = cast(%a, dtype="float32") + %b * %c;
+            %0
+            }"""
+            )
+            factory = tvm.relay.build(
+                relay_mod,
+                target,
+                target_host=target,
+                mod_name="add",
+                params={"c": numpy.array([[2.0, 4.0]], dtype="float32")},
+            )
+
+        temp_dir = utils.tempdir()
+        mlf_tar_path = temp_dir.relpath("lib.tar")
+        import tvm.micro as micro
+
+        micro.export_model_library_format(factory, mlf_tar_path)
+        tf = tarfile.open(mlf_tar_path)
+
+        extract_dir = temp_dir.relpath("extract")
+        os.mkdir(extract_dir)
+        tf.extractall(extract_dir)
+
+        with open(os.path.join(extract_dir, "metadata.json")) as json_f:
+            metadata = json.load(json_f)
+            assert metadata["version"] == 1
+            assert metadata["model_name"] == "add"
+            export_datetime = datetime.datetime.strptime(
+                metadata["export_datetime"], "%Y-%m-%d %H:%M:%SZ"
+            )
+            assert (datetime.datetime.now() - export_datetime) < datetime.timedelta(seconds=60 * 5)
+            assert metadata["target"] == {"1": str(target)}
+            assert metadata["memory"] == [
+                {"storage_id": 0, "size_bytes": 2, "input_binding": "a"},
+                {"storage_id": 1, "size_bytes": 8, "input_binding": "b"},
+                {"storage_id": 2, "size_bytes": 8, "input_binding": "p0"},
+                {"storage_id": 3, "size_bytes": 8},
+            ]
+
+        assert os.path.exists(os.path.join(extract_dir, "codegen", "host", "lib", "lib0.o"))
+
+        validate_graph_json(extract_dir, factory)
+
+        with open(os.path.join(extract_dir, "relay.txt")) as relay_f:
+            assert relay_f.read() == str(relay_mod)
+
+        with open(os.path.join(extract_dir, "parameters", "add.params"), "rb") as params_f:
+            params = tvm.relay.load_param_dict(params_f.read())
+            assert "p0" in params
+
+
+@tvm.testing.requires_micro
+def test_export_model():
+    module = tvm.support.FrontendTestModule()
+    factory = graph_runtime_factory.GraphRuntimeFactoryModule(
+        None, tvm.target.target.micro("host"), '"graph_json"', module, "test_module", {}
+    )
+
+    temp_dir = utils.tempdir()
+    import tvm.micro as micro
+    import tvm.micro.model_library_format as model_library_format
+
+    with pytest.raises(micro.UnsupportedInModelLibraryFormatError) as exc:
+        model_library_format._populate_codegen_dir(module, temp_dir.relpath("codegen"))
+
+        assert str(exc.exception) == (
+            "Don't know how to export non-c or non-llvm modules; found: ffi_testing"
+        )
+
+
+if __name__ == "__main__":
+    sys.exit(pytest.main([__file__] + sys.argv[1:]))

From 8a9aebca13f819eb9233509fa53974da08db9b70 Mon Sep 17 00:00:00 2001
From: Luis Vega <vegaluisjose@users.noreply.github.com>
Date: Wed, 10 Mar 2021 07:22:57 -0800
Subject: [PATCH 314/357] [Runtime][Contrib][Verilator] remove explicit
 destructor call (#7485)

---
 src/runtime/contrib/verilator/verilator_runtime.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/runtime/contrib/verilator/verilator_runtime.cc b/src/runtime/contrib/verilator/verilator_runtime.cc
index bc96b69f2ffe..5dfb8441c864 100644
--- a/src/runtime/contrib/verilator/verilator_runtime.cc
+++ b/src/runtime/contrib/verilator/verilator_runtime.cc
@@ -80,7 +80,7 @@ VerilatorRuntime::~VerilatorRuntime() {
   auto dealloc = reinterpret_cast<VerilatorDeallocFunc>(lib_->GetSymbol("VerilatorDealloc"));
   ICHECK(dealloc != nullptr);
   dealloc(device_);
-  lib_->~VerilatorLibrary();
+  delete lib_;
 }
 
 void VerilatorRuntime::SetLibrary(const std::string& lib_path) { lib_path_ = lib_path; }
@@ -108,7 +108,7 @@ void VerilatorRuntime::Init(const Array<NDArray>& consts) {
   // enable profiler
   if (prof_enable_) prof_ = VerilatorProfiler::ThreadLocal();
 
-  // reset verilator device
+  // reset verilator device.
   reset(device_, reset_cycles_);
 
   CHECK_EQ(consts.size(), const_idx_.size())

From a877d53828defe89dda721479f9e8aa1ed1d8884 Mon Sep 17 00:00:00 2001
From: Eye <380614540@qq.com>
Date: Wed, 10 Mar 2021 23:24:26 +0800
Subject: [PATCH 315/357] fix:getcwd not work on android platform (#7390)

* fix:getcwd not work on android platform

* replace `exit()` with `_exit()` on subprocess in `cpp_rpc`

Co-authored-by: rqg <ranqingguo318@gmail.com>
---
 apps/cpp_rpc/rpc_env.cc    | 8 +++++++-
 apps/cpp_rpc/rpc_server.cc | 4 ++--
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/apps/cpp_rpc/rpc_env.cc b/apps/cpp_rpc/rpc_env.cc
index 5b351725b1f1..ea19cfa3979d 100644
--- a/apps/cpp_rpc/rpc_env.cc
+++ b/apps/cpp_rpc/rpc_env.cc
@@ -86,7 +86,13 @@ void CleanDir(const std::string& dirname);
 std::string BuildSharedLibrary(std::string file_in);
 
 RPCEnv::RPCEnv() {
-#ifndef _WIN32
+#if defined(ANDROID) || defined(__ANDROID__)
+  char cwd[PATH_MAX];
+  auto cmdline = fopen("/proc/self/cmdline", "r");
+  fread(cwd, 1, sizeof(cwd), cmdline);
+  fclose(cmdline);
+  base_ = "/data/data/" + std::string(cwd) + "/cache/rpc";
+#elif !defined(_WIN32)
   char cwd[PATH_MAX];
   if (getcwd(cwd, sizeof(cwd))) {
     base_ = std::string(cwd) + "/rpc";
diff --git a/apps/cpp_rpc/rpc_server.cc b/apps/cpp_rpc/rpc_server.cc
index 83b9a18c5f21..a4028ff61eca 100644
--- a/apps/cpp_rpc/rpc_server.cc
+++ b/apps/cpp_rpc/rpc_server.cc
@@ -168,14 +168,14 @@ class RPCServer {
         if (timer_pid == 0) {
           // Timer process
           sleep(timeout);
-          exit(0);
+          _exit(0);
         }
 
         const pid_t worker_pid = fork();
         if (worker_pid == 0) {
           // Worker process
           ServerLoopProc(conn, addr);
-          exit(0);
+          _exit(0);
         }
 
         int status = 0;

From 6b1f18089ed55b105ab8a6b1a59de0a73e595709 Mon Sep 17 00:00:00 2001
From: Tom Tan <lilotom@gmail.com>
Date: Wed, 10 Mar 2021 07:26:14 -0800
Subject: [PATCH 316/357] Improve tensor mismatch ICHECK message (#7335)

* Improve tensor mismatch assert message
---
 src/te/tensor.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/te/tensor.cc b/src/te/tensor.cc
index 18d4947cdddc..b48f39a38627 100644
--- a/src/te/tensor.cc
+++ b/src/te/tensor.cc
@@ -46,7 +46,7 @@ PrimExpr Tensor::operator()(Array<Var> indices) const {
 
 PrimExpr Tensor::operator()(Array<PrimExpr> indices) const {
   if (ndim() != 0) {
-    ICHECK_EQ(ndim(), indices.size()) << "Tensor dimension mismatch in read"
+    ICHECK_EQ(ndim(), indices.size()) << "Tensor dimension mismatch in read "
                                       << "ndim = " << ndim() << ", indices.size=" << indices.size();
   }
 

From 829f44c9c838c7ad5c6344754e45bccefc545b2c Mon Sep 17 00:00:00 2001
From: Trevor Morris <trevmorr@amazon.com>
Date: Wed, 10 Mar 2021 10:53:07 -0800
Subject: [PATCH 317/357] [CUDA][TOPI] Fix CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES
 with NMS for certain GPUs (#7623)

* Use less threads for certain GPUs to avoid register limit

* Move util function to nvcc.py

* Fix lint
---
 python/tvm/contrib/nvcc.py  | 41 +++++++++++++++++++++++++++++++++++++
 python/tvm/topi/cuda/nms.py |  9 ++++++++
 2 files changed, 50 insertions(+)

diff --git a/python/tvm/contrib/nvcc.py b/python/tvm/contrib/nvcc.py
index f33603b923a5..7e49f55e8d32 100644
--- a/python/tvm/contrib/nvcc.py
+++ b/python/tvm/contrib/nvcc.py
@@ -216,6 +216,47 @@ def callback_libdevice_path(arch):
         return ""
 
 
+def get_target_compute_version(target=None):
+    """Utility function to get compute capability of compilation target.
+
+    Looks for the arch in three different places, first in the target attributes, then the global
+    scope, and finally the GPU device (if it exists).
+
+    Parameters
+    ----------
+    target : tvm.target.Target, optional
+        The compilation target
+
+    Returns
+    -------
+    compute_version : str
+        compute capability of a GPU (e.g. "8.0")
+    """
+    # 1. Target
+    if target:
+        if "arch" in target.attrs:
+            compute_version = target.attrs["arch"]
+            major, minor = compute_version.split("_")[1]
+            return major + "." + minor
+
+    # 2. Global scope
+    from tvm.autotvm.env import AutotvmGlobalScope  # pylint: disable=import-outside-toplevel
+
+    if AutotvmGlobalScope.current.cuda_target_arch:
+        major, minor = AutotvmGlobalScope.current.cuda_target_arch.split("_")[1]
+        return major + "." + minor
+
+    # 3. GPU
+    if tvm.gpu(0).exist:
+        return tvm.gpu(0).compute_version
+
+    warnings.warn(
+        "No CUDA architecture was specified or GPU detected."
+        "Try specifying it by adding '-arch=sm_xx' to your target."
+    )
+    return None
+
+
 def parse_compute_version(compute_version):
     """Parse compute capability string to divide major and minor version
 
diff --git a/python/tvm/topi/cuda/nms.py b/python/tvm/topi/cuda/nms.py
index 83b538554ed4..ccc2ec9d0c21 100644
--- a/python/tvm/topi/cuda/nms.py
+++ b/python/tvm/topi/cuda/nms.py
@@ -19,6 +19,7 @@
 """Non-maximum suppression operator"""
 import tvm
 from tvm import te
+from tvm.contrib import nvcc
 from tvm.contrib.thrust import can_use_thrust, can_use_rocthrust
 from tvm.tir import if_then_else
 from .sort import argsort, argsort_thrust
@@ -493,6 +494,14 @@ def calculate_overlap(out_tensor, box_a_idx, box_b_idx):
         nthread_by = batch_size
         nthread_tx = max_threads
 
+        # Some cuda architectures have smaller limit of 32K for cudaDevAttrMaxRegistersPerBlock
+        # vs 64K for most GPUs. Since this kernel uses many registers (around 35), the limit will
+        # be exceeded with 1024 threads.
+        target = tvm.target.Target.current(allow_none=False)
+        if target.kind.name == "cuda":
+            if nvcc.get_target_compute_version(target) in ["3.2", "5.3", "6.2"]:
+                nthread_tx = 512
+
         by = te.thread_axis("blockIdx.y")
         tx = te.thread_axis("threadIdx.x")
         ib.scope_attr(by, "thread_extent", nthread_by)

From cf2abc8e08dcb9427feadc809a5a148d77b40418 Mon Sep 17 00:00:00 2001
From: fredster33 <64927044+fredster33@users.noreply.github.com>
Date: Wed, 10 Mar 2021 15:46:04 -0800
Subject: [PATCH 318/357] Grammar fix (#7622)

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index ac4ed62524b1..eec5bfd5797d 100644
--- a/README.md
+++ b/README.md
@@ -36,7 +36,7 @@ License
 Contribute to TVM
 -----------------
 TVM adopts apache committer model, we aim to create an open source project that is maintained and owned by the community.
-Checkout the [Contributor Guide](https://tvm.apache.org/docs/contribute/)
+Check out the [Contributor Guide](https://tvm.apache.org/docs/contribute/).
 
 Acknowledgement
 ---------------

From dd61b049b57c70a7d1a824420855436850d6cb2d Mon Sep 17 00:00:00 2001
From: Junru Shao <junrushao1994@gmail.com>
Date: Wed, 10 Mar 2021 18:51:54 -0800
Subject: [PATCH 319/357] [TIR] Add PreOrderVisit and VisitPrimFuncs (#7627)

* [TIR] Add PreOrderVisit and VisitPrimFuncs

* Update stmt_functor.h

* address comments

* fix lint
---
 include/tvm/runtime/object.h             | 20 ++++-----
 include/tvm/tir/analysis.h               | 16 +++++++
 include/tvm/tir/stmt_functor.h           |  9 ++++
 include/tvm/topi/detail/constant_utils.h | 11 +++--
 src/tir/ir/stmt_functor.cc               | 54 +++++++++++++++++++++---
 tests/cpp/ir_functor_test.cc             | 53 +++++++++++++++++++++++
 6 files changed, 142 insertions(+), 21 deletions(-)

diff --git a/include/tvm/runtime/object.h b/include/tvm/runtime/object.h
index 70ab7688c450..47788394126e 100644
--- a/include/tvm/runtime/object.h
+++ b/include/tvm/runtime/object.h
@@ -186,7 +186,7 @@ class TVM_DLL Object {
   template <typename TargetType>
   inline bool IsInstance() const;
   /*!
-   * \return Weather the cell has only one reference
+   * \return Whether the cell has only one reference
    * \note We use stl style naming to be consistent with known API in shared_ptr.
    */
   inline bool unique() const;
@@ -337,7 +337,7 @@ inline RelayRefType GetRef(const ObjectType* ptr);
 /*!
  * \brief Downcast a base reference type to a more specific type.
  *
- * \param ref The inptut reference
+ * \param ref The input reference
  * \return The corresponding SubRef.
  * \tparam SubRef The target specific reference type.
  * \tparam BaseRef the current reference type.
@@ -416,7 +416,7 @@ class ObjectPtr {
     return *get();
   }
   /*!
-   * \brief copy assignmemt
+   * \brief copy assignment
    * \param other The value to be assigned.
    * \return reference to self.
    */
@@ -427,7 +427,7 @@ class ObjectPtr {
     return *this;
   }
   /*!
-   * \brief move assignmemt
+   * \brief move assignment
    * \param other The value to be assigned.
    * \return reference to self.
    */
@@ -632,7 +632,7 @@ struct ObjectPtrEqual {
 };
 
 /*!
- * \brief helper macro to declare a base object type that can be inheritated.
+ * \brief helper macro to declare a base object type that can be inherited.
  * \param TypeName The name of the current type.
  * \param ParentType The name of the ParentType
  */
@@ -648,10 +648,10 @@ struct ObjectPtrEqual {
     return _GetOrAllocRuntimeTypeIndex();                                                      \
   }                                                                                            \
   static uint32_t _GetOrAllocRuntimeTypeIndex() {                                              \
-    static uint32_t tidx = Object::GetOrAllocRuntimeTypeIndex(                                 \
+    static uint32_t tindex = Object::GetOrAllocRuntimeTypeIndex(                               \
         TypeName::_type_key, TypeName::_type_index, ParentType::_GetOrAllocRuntimeTypeIndex(), \
         TypeName::_type_child_slots, TypeName::_type_child_slots_can_overflow);                \
-    return tidx;                                                                               \
+    return tindex;                                                                             \
   }
 
 /*!
@@ -664,7 +664,7 @@ struct ObjectPtrEqual {
   static const constexpr int _type_child_slots = 0;         \
   TVM_DECLARE_BASE_OBJECT_INFO(TypeName, ParentType)
 
-/*! \brief helper macro to supress unused warning */
+/*! \brief helper macro to suppress unused warning */
 #if defined(__GNUC__)
 #define TVM_ATTRIBUTE_UNUSED __attribute__((unused))
 #else
@@ -686,7 +686,7 @@ struct ObjectPtrEqual {
   TVM_STR_CONCAT(TVM_OBJECT_REG_VAR_DEF, __COUNTER__) = TypeName::_GetOrAllocRuntimeTypeIndex()
 
 /*
- * \brief Define the default copy/move constructor and assign opeator
+ * \brief Define the default copy/move constructor and assign operator
  * \param TypeName The class typename.
  */
 #define TVM_DEFINE_DEFAULT_COPY_MOVE_AND_ASSIGN(TypeName) \
@@ -827,7 +827,7 @@ inline bool Object::IsInstance() const {
       if (!TargetType::_type_child_slots_can_overflow) return false;
       // Invariance: parent index is always smaller than the child.
       if (self->type_index_ < TargetType::RuntimeTypeIndex()) return false;
-      // The rare slower-path, check type hierachy.
+      // The rare slower-path, check type hierarchy.
       return self->DerivedFrom(TargetType::RuntimeTypeIndex());
     }
   } else {
diff --git a/include/tvm/tir/analysis.h b/include/tvm/tir/analysis.h
index e5b2c2b6957c..1ad78596586a 100644
--- a/include/tvm/tir/analysis.h
+++ b/include/tvm/tir/analysis.h
@@ -56,6 +56,22 @@ struct ExprDeepEqual {
   TVM_DLL bool operator()(const PrimExpr& lhs, const PrimExpr& rhs) const;
 };
 
+/*!
+ * \brief Visit the PrimFuncs in the IRModule
+ * \tparam FLambda The type of the PrimFunc visitor
+ * \param mod The IRModule to be visited
+ * \param fvisit The visitor to the PrimFuncs in the IRModule
+ */
+template <class FLambda>
+inline void VisitPrimFuncs(const IRModule& mod, FLambda fvisit) {
+  for (const auto& kv : mod->functions) {
+    const BaseFunc& base_func = kv.second;
+    if (const auto* prim_func = base_func.as<PrimFuncNode>()) {
+      fvisit(prim_func);
+    }
+  }
+}
+
 /*!
  * \brief Find undefined vars in the statement.
  * \param stmt The function to be checked.
diff --git a/include/tvm/tir/stmt_functor.h b/include/tvm/tir/stmt_functor.h
index d6303ae266e1..c1c618f0c22f 100644
--- a/include/tvm/tir/stmt_functor.h
+++ b/include/tvm/tir/stmt_functor.h
@@ -386,6 +386,15 @@ inline T Substitute(T input, const std::unordered_map<const VarNode*, PrimExpr>&
   return Substitute(std::move(input), vmap);
 }
 
+/*!
+ * \brief Recursively visit the IR in pre DFS order node, apply fvisit.
+ * If fvisit returns false, it won't visit the children of the node.
+ * \param stmt_or_expr The ir to be visited.
+ * \param fvisit The visitor function to be applied. If fvisit returns false, it won't visit the
+ * children of the node
+ */
+TVM_DLL void PreOrderVisit(const ObjectRef& stmt_or_expr,
+                           const std::function<bool(const ObjectRef&)>& fvisit);
 }  // namespace tir
 }  // namespace tvm
 
diff --git a/include/tvm/topi/detail/constant_utils.h b/include/tvm/topi/detail/constant_utils.h
index 92ff3a4e3804..95e68f5f6d61 100644
--- a/include/tvm/topi/detail/constant_utils.h
+++ b/include/tvm/topi/detail/constant_utils.h
@@ -119,12 +119,11 @@ inline std::vector<int64_t> GetConstInt64Values(Array<PrimExpr> exprs,
 }
 
 /*!
- * \brief Check weather the two expressions are equal or not, if not simplify the expressions and
- * check again \note This is stronger equality check than tvm::tir::Equal
- *
- * \param lhs First expreesion
- * \param rhs Second expreesion
- *
+ * \brief Check whether the two expressions are equal or not, if not simplify the expressions and
+ * check again
+ * \note This is stronger equality check than tvm::tir::Equal
+ * \param lhs First expression
+ * \param rhs Second expression
  * \return result True if both expressions are equal, else false
  */
 inline bool EqualCheck(PrimExpr lhs, PrimExpr rhs) {
diff --git a/src/tir/ir/stmt_functor.cc b/src/tir/ir/stmt_functor.cc
index 639d38db0a81..07574e4fb2f1 100644
--- a/src/tir/ir/stmt_functor.cc
+++ b/src/tir/ir/stmt_functor.cc
@@ -19,12 +19,14 @@
 /*!
  * \file stmt_functor.cc
  */
+#include <tvm/ir/module.h>
 #include <tvm/runtime/registry.h>
+#include <tvm/tir/function.h>
 #include <tvm/tir/stmt_functor.h>
 
 #include <functional>
 
-#include "functor_common.h"
+#include "./functor_common.h"
 
 namespace tvm {
 namespace tir {
@@ -631,9 +633,9 @@ Stmt IRTransform(Stmt ir_node, const runtime::PackedFunc& f_preorder,
   return transform(std::move(ir_node));
 }
 
-class IRSubstitue : public StmtExprMutator {
+class IRSubstitute : public StmtExprMutator {
  public:
-  explicit IRSubstitue(std::function<Optional<PrimExpr>(const Var&)> vmap) : vmap_(vmap) {}
+  explicit IRSubstitute(std::function<Optional<PrimExpr>(const Var&)> vmap) : vmap_(vmap) {}
 
   PrimExpr VisitExpr_(const VarNode* op) final {
     Var var = GetRef<Var>(op);
@@ -679,11 +681,53 @@ class IRSubstitue : public StmtExprMutator {
 };
 
 Stmt Substitute(Stmt stmt, std::function<Optional<PrimExpr>(const Var&)> vmap) {
-  return IRSubstitue(vmap)(std::move(stmt));
+  return IRSubstitute(vmap)(std::move(stmt));
 }
 
 PrimExpr Substitute(PrimExpr expr, std::function<Optional<PrimExpr>(const Var&)> vmap) {
-  return IRSubstitue(vmap)(std::move(expr));
+  return IRSubstitute(vmap)(std::move(expr));
+}
+
+void PreOrderVisit(const ObjectRef& stmt_or_expr,
+                   const std::function<bool(const ObjectRef&)>& fvisit) {
+  class PreOrderVisitor : public StmtExprVisitor {
+   public:
+    explicit PreOrderVisitor(const std::function<bool(const ObjectRef&)>& f) : f_(f) {}
+
+   private:
+    void VisitExpr(const PrimExpr& expr) final {
+      const PrimExprNode* p_expr = expr.get();
+      if (visited_.count(p_expr) == 0) {
+        visited_.insert(p_expr);
+        if (f_(expr)) {
+          ExprVisitor::VisitExpr(expr);
+        }
+      }
+    }
+
+    void VisitStmt(const Stmt& stmt) final {
+      const StmtNode* p_stmt = stmt.get();
+      if (visited_.count(p_stmt) == 0) {
+        visited_.insert(p_stmt);
+        if (f_(stmt)) {
+          StmtVisitor::VisitStmt(stmt);
+        }
+      }
+    }
+
+    const std::function<bool(const ObjectRef&)>& f_;
+    std::unordered_set<const Object*> visited_;
+  };
+
+  PreOrderVisitor visitor(fvisit);
+  if (const auto* stmt = stmt_or_expr.as<StmtNode>()) {
+    visitor(GetRef<Stmt>(stmt));
+  } else if (const auto* expr = stmt_or_expr.as<PrimExprNode>()) {
+    visitor(GetRef<PrimExpr>(expr));
+  } else {
+    LOG(FATAL) << "InternalError: PreOrderVisit does not accept object with type: "
+               << stmt_or_expr->GetTypeKey();
+  }
 }
 
 TVM_REGISTER_GLOBAL("tir.IRTransform").set_body_typed(IRTransform);
diff --git a/tests/cpp/ir_functor_test.cc b/tests/cpp/ir_functor_test.cc
index 237dc46b99ca..1f7d18f747ea 100644
--- a/tests/cpp/ir_functor_test.cc
+++ b/tests/cpp/ir_functor_test.cc
@@ -19,10 +19,14 @@
 
 #include <dmlc/logging.h>
 #include <gtest/gtest.h>
+#include <tvm/ir/module.h>
 #include <tvm/node/functor.h>
+#include <tvm/relay/function.h>
+#include <tvm/tir/analysis.h>
 #include <tvm/tir/builtin.h>
 #include <tvm/tir/expr.h>
 #include <tvm/tir/expr_functor.h>
+#include <tvm/tir/function.h>
 #include <tvm/tir/op.h>
 #include <tvm/tir/stmt_functor.h>
 
@@ -52,6 +56,55 @@ TEST(IRF, CountVar) {
   ICHECK_EQ(n_var, 2);
 }
 
+TEST(IRF, VisitPrimFuncs) {
+  using namespace tvm;
+  using namespace tvm::tir;
+  PrimFunc prim_func(/*params=*/{}, /*body=*/Evaluate(Integer(0)));
+  relay::Function relay_func(/*params=*/{}, /*body=*/relay::Expr(nullptr),
+                             /*ret_type=*/relay::Type{nullptr}, /*ty_params=*/{});
+  IRModule mod({
+      {GlobalVar("main"), prim_func},
+      {GlobalVar("main2"), relay_func},
+  });
+  int n_visited = 0;
+  VisitPrimFuncs(mod, [&](const PrimFuncNode* func) { ++n_visited; });
+  ASSERT_EQ(n_visited, 1);
+}
+
+TEST(IRF, PreOrderVisit) {
+  using namespace tvm;
+  using namespace tvm::tir;
+  Stmt init = IfThenElse(const_true(), Evaluate(Integer(0)), Evaluate(Integer(0)));
+  Stmt body = Evaluate(Integer(1));
+  Block block(/*iter_vars=*/{}, /*reads=*/{},
+              /*writes=*/{}, /*name_hint=*/"block", /*body=*/body,
+              /*init=*/init);
+  bool init_visited = false;
+  bool stopped_at_if = true;
+  bool body_visited = false;
+  PreOrderVisit(block, [&](const ObjectRef& n) -> bool {
+    if (n->IsInstance<IfThenElseNode>()) {
+      init_visited = true;
+      return false;
+    }
+    if (const auto* eval = n.as<EvaluateNode>()) {
+      if (const auto* int_imm = eval->value.as<IntImmNode>()) {
+        if (int_imm->value == 0) {
+          stopped_at_if = false;
+        } else if (int_imm->value == 1) {
+          body_visited = true;
+        } else {
+          LOG(FATAL) << "Unreachable";
+        }
+      }
+    }
+    return true;
+  });
+  ASSERT_EQ(init_visited, true);
+  ASSERT_EQ(stopped_at_if, true);
+  ASSERT_EQ(body_visited, true);
+}
+
 TEST(IRF, ExprTransform) {
   using namespace tvm;
   using namespace tvm::tir;

From c5198632cf583b2d6c357b862d210cac7d6df37b Mon Sep 17 00:00:00 2001
From: liyuchao <xiamenlyc@163.com>
Date: Thu, 11 Mar 2021 17:02:39 +0800
Subject: [PATCH 320/357] [AutoScheduler] Fix incorrectly array context device
 and hide info at the beginning (#7632)

* [AutoScheduler] Fix incorrectly array context device and hide info at the beginning

* Lint fix
---
 python/tvm/auto_scheduler/measure.py          | 12 +++++-
 .../tvm/auto_scheduler/relay_integration.py   |  5 +++
 .../unittest/test_auto_scheduler_measure.py   | 37 ++++++++++++++++++-
 3 files changed, 50 insertions(+), 4 deletions(-)

diff --git a/python/tvm/auto_scheduler/measure.py b/python/tvm/auto_scheduler/measure.py
index 959a9c5da82a..d02dcff3bba0 100644
--- a/python/tvm/auto_scheduler/measure.py
+++ b/python/tvm/auto_scheduler/measure.py
@@ -868,7 +868,11 @@ def _timed_eval_func(
                 if arg in tensor_input_map:
                     tensor_name = tensor_input_map[arg]
                     if tensor_name in task_input_names:
-                        args.append(get_task_input_buffer(inp.task.workload_key, tensor_name))
+                        args.append(
+                            ndarray.array(
+                                get_task_input_buffer(inp.task.workload_key, tensor_name), ctx
+                            )
+                        )
                         task_inputs_count += 1
                     else:
                         raise ValueError(
@@ -1079,7 +1083,11 @@ def _timed_rpc_run(
                 if arg in tensor_input_map:
                     tensor_name = tensor_input_map[arg]
                     if tensor_name in task_input_names:
-                        args.append(get_task_input_buffer(inp.task.workload_key, tensor_name))
+                        args.append(
+                            ndarray.array(
+                                get_task_input_buffer(inp.task.workload_key, tensor_name), ctx
+                            )
+                        )
                         task_inputs_count += 1
                     else:
                         raise ValueError(
diff --git a/python/tvm/auto_scheduler/relay_integration.py b/python/tvm/auto_scheduler/relay_integration.py
index 68f53125c7ae..6cce30f2f559 100644
--- a/python/tvm/auto_scheduler/relay_integration.py
+++ b/python/tvm/auto_scheduler/relay_integration.py
@@ -117,12 +117,17 @@ def extract_tasks(
     env = TracingEnvironment(
         TracingMode.EXTRACT_TASK if include_simple_tasks else TracingMode.EXTRACT_COMPLEX_TASK_ONLY
     )
+
+    dispatch_ctx = DispatchContext.current
+    old_verbose = dispatch_ctx.verbose
+    dispatch_ctx.verbose = 0
     with env:
         # Wrap build call in a new thread to avoid the conflict
         # between python's multiprocessing and tvm's thread pool
         build_thread = threading.Thread(target=call_all_topi_funcs, args=(mod, params, target))
         build_thread.start()
         build_thread.join()
+    dispatch_ctx.verbose = old_verbose
 
     # create search tasks
     tasks = []
diff --git a/tests/python/unittest/test_auto_scheduler_measure.py b/tests/python/unittest/test_auto_scheduler_measure.py
index 116981028cc9..7605b70be6f4 100644
--- a/tests/python/unittest/test_auto_scheduler_measure.py
+++ b/tests/python/unittest/test_auto_scheduler_measure.py
@@ -357,7 +357,7 @@ def test_measure_target_host():
 
 
 @tvm.testing.requires_llvm
-def test_measure_special_inputs_map_by_name():
+def test_measure_special_inputs_map_by_name_local_runner():
     @auto_scheduler.register_workload
     def foo():
         X = te.placeholder(shape=[10], dtype="int32")
@@ -384,6 +384,38 @@ def foo():
     assert mress[0].error_no == 0
 
 
+@tvm.testing.requires_llvm
+def test_measure_special_inputs_map_by_name_rpc_runner():
+    @auto_scheduler.register_workload
+    def foo():
+        X = te.placeholder(shape=[10], dtype="int32")
+        Index = te.placeholder(shape=[1], dtype="int32", name="Index")
+        Y = te.compute((1,), lambda i: X[Index[i]])
+        return [X, Index, Y]
+
+    # This workload cannot use random input for the `Index` input
+    task = auto_scheduler.SearchTask(
+        func=foo,
+        target="llvm",
+        task_inputs={
+            "Index": tvm.nd.array(np.array([5], dtype="int32")),
+        },
+    )
+
+    for enable_cpu_cache_flush in [True, False]:
+        minp = auto_scheduler.MeasureInput(task, task.compute_dag.init_state)
+        local_builder = auto_scheduler.LocalBuilder()
+        measure_ctx = auto_scheduler.LocalRPCMeasureContext(
+            timeout=60, enable_cpu_cache_flush=enable_cpu_cache_flush
+        )
+        rpc_runner = measure_ctx.runner
+
+        bress = local_builder.build([minp])
+        assert bress[0].error_no == 0
+        mress = rpc_runner.run([minp], bress)
+        assert mress[0].error_no == 0
+
+
 if __name__ == "__main__":
     test_record_split_reorder_fuse_annotation()
     test_record_compute_at_root_inline_cache_read_write()
@@ -395,4 +427,5 @@ def foo():
     test_dag_measure_local_builder_runner()
     test_measure_local_builder_rpc_runner()
     test_measure_target_host()
-    test_measure_special_inputs_map_by_name()
+    test_measure_special_inputs_map_by_name_local_runner()
+    test_measure_special_inputs_map_by_name_rpc_runner()

From df6fb6938e1a816739565c03d8e079a811515031 Mon Sep 17 00:00:00 2001
From: Alexander Pivovarov <pivovaa@amazon.com>
Date: Thu, 11 Mar 2021 06:36:29 -0800
Subject: [PATCH 321/357] [MIPS] Fix CALL16 reloc at 0x290 not against global
 symbol (#7634)

---
 src/target/llvm/codegen_cpu.cc        | 7 +++++--
 src/tir/transforms/make_packed_api.cc | 2 +-
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/src/target/llvm/codegen_cpu.cc b/src/target/llvm/codegen_cpu.cc
index b37cd73ece04..b49f850b2d90 100644
--- a/src/target/llvm/codegen_cpu.cc
+++ b/src/target/llvm/codegen_cpu.cc
@@ -437,11 +437,14 @@ void CodeGenCPU::CreateComputeScope(const AttrStmtNode* op) {
     arg_types.push_back(value->getType());
   }
   llvm::FunctionType* ftype = llvm::FunctionType::get(t_int_, arg_types, false);
+  // $xxx_compute_ functions are not global. They should be marked as static (via InternalLinkage)
+  // to call them correctly on MIPS platform (CALL16 issue)
+  // Linkage ld Error: CALL16 reloc at 0x290 not against global symbol
   llvm::Function* fcompute = llvm::Function::Create(
-      ftype, llvm::Function::PrivateLinkage,
+      ftype, llvm::Function::InternalLinkage,
       op->value.as<StringImmNode>()->value.operator llvm::StringRef(), module_.get());
   BasicBlock* compute_call_end = CheckCallSuccess(builder_->CreateCall(fcompute, arg_values));
-  // setup compute fuinction.
+  // setup compute function.
   std::unordered_map<const VarNode*, llvm::Value*> new_vmap;
   size_t idx = 0;
   for (auto it = fcompute->arg_begin(); it != fcompute->arg_end(); ++it, ++idx) {
diff --git a/src/tir/transforms/make_packed_api.cc b/src/tir/transforms/make_packed_api.cc
index 0946af6f640a..3842f3e9a8ee 100644
--- a/src/tir/transforms/make_packed_api.cc
+++ b/src/tir/transforms/make_packed_api.cc
@@ -229,7 +229,7 @@ PrimFunc MakePackedAPI(PrimFunc&& func, int num_unpacked_args) {
   //
   // For example, for auto broadcasting, checks are required to guarantee that
   // either 0 or the original stride will be correctly used. Checks here have
-  // to use the args that may have no let bining yet. Therefore, hoisting let
+  // to use the args that may have no let binding yet. Therefore, hoisting let
   // binding for args before buffer declaration is needed.
   for (const auto& kv : var_def) {
     binder.Bind(kv.second, kv.first, kv.first->name_hint, true);

From 56feab9f4d97f310018d6a1df6ed4d5dd75e9178 Mon Sep 17 00:00:00 2001
From: Qiang Zhang <johnson9009@163.com>
Date: Thu, 11 Mar 2021 22:41:50 +0800
Subject: [PATCH 322/357] [Test] Add Test Case to Cover Bug Fix by PR#7432
 (#7601)

---
 tests/python/relay/test_pass_auto_quantize.py | 34 +++++++++++++++++++
 1 file changed, 34 insertions(+)

diff --git a/tests/python/relay/test_pass_auto_quantize.py b/tests/python/relay/test_pass_auto_quantize.py
index 8a7c4cbfbbd6..31f5ac6e71b1 100644
--- a/tests/python/relay/test_pass_auto_quantize.py
+++ b/tests/python/relay/test_pass_auto_quantize.py
@@ -307,6 +307,39 @@ def @main(
     verify_partition_fails(mod, params)
 
 
+def test_left_shift_negative():
+    data = relay.var("data", shape=(1, 16, 64, 64))
+    weight = relay.const(np.full((16, 16, 3, 3), 256.0))
+    conv2d = relay.nn.conv2d(data, weight, kernel_size=(3, 3), padding=(1, 1), channels=16)
+    relu = relay.nn.relu(conv2d)
+
+    mod = tvm.IRModule.from_expr(relu)
+
+    with tvm.transform.PassContext(opt_level=3):
+        with relay.quantize.qconfig(
+            calibrate_mode="global_scale", global_scale=8.0, skip_conv_layers=None
+        ):
+            qnn_mod = relay.quantize.quantize(mod)
+
+    class OpFinder(relay.ExprVisitor):
+        def __init__(self, op_name):
+            super(OpFinder, self).__init__()
+            self._op_name = op_name
+            self.ops = list()
+
+        def visit_call(self, call):
+            super().visit_call(call)
+            if call.op.name == self._op_name:
+                self.ops.append(call)
+
+    opf = OpFinder("left_shift")
+    opf.visit(qnn_mod["main"])
+    assert len(opf.ops) > 0, 'Broken case, can\'t find any "left_shift" operators.'
+    for left_shift_op in opf.ops:
+        shift_amount = left_shift_op.args[1].data.asnumpy()
+        assert shift_amount >= 0, "Shift amount must be non-negative."
+
+
 if __name__ == "__main__":
     test_mul_rewrite()
     test_batch_flatten_rewrite()
@@ -320,3 +353,4 @@ def @main(
     test_unquantizable_prefix_partition()
     test_unquantizable_core_partition()
     test_unquantizable_suffix_partition()
+    test_left_shift_negative()

From f8596b5228346eae18f25f22e3096fef758b1166 Mon Sep 17 00:00:00 2001
From: masahi <masahi129@gmail.com>
Date: Fri, 12 Mar 2021 00:56:28 +0900
Subject: [PATCH 323/357] [ONNX] Use take instead of min in NMS conditions
 (#7633)

---
 python/tvm/relay/frontend/onnx.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/python/tvm/relay/frontend/onnx.py b/python/tvm/relay/frontend/onnx.py
index 860753d6cd0b..f31b8c927f8f 100644
--- a/python/tvm/relay/frontend/onnx.py
+++ b/python/tvm/relay/frontend/onnx.py
@@ -2453,7 +2453,7 @@ def _first_cond(
             nms_size_out,
         ):
             # Loop over classes, end when i == C
-            return _op.min(_op.less(i, C))
+            return _op.take(_op.less(i, C), _expr.const(0))
 
         def _first_body(
             i,
@@ -2561,7 +2561,7 @@ def _first_body(
 
         def _inner_cond(i, j, C, onnx_out, nms_size, out):
             # inner loop over number of classes
-            return _op.min(_op.less(j, C))
+            return _op.take(_op.less(j, C), _expr.const(0))
 
         def _inner_body(i, j, C, onnx_out, nms_size, out):
             # slice to get current batch and class for valid box indicator
@@ -2591,7 +2591,7 @@ def _inner_body(i, j, C, onnx_out, nms_size, out):
 
         def _outer_cond(i, B, C, onnx_out, nms_size_out, out):
             # Outer loop is over batch size
-            return _op.min(_op.less(i, B))
+            return _op.take(_op.less(i, B), _expr.const(0))
 
         def _outer_body(i, B, C, onnx_out, nms_size_out, out):
             # Outer loop just calls inner loop

From b2a3c481ebbb7cfbd5335fb11cd516ae5f348406 Mon Sep 17 00:00:00 2001
From: masahi <masahi129@gmail.com>
Date: Fri, 12 Mar 2021 04:42:47 +0900
Subject: [PATCH 324/357] [Ansor] Add HW param for Vulkan tuning (#7626)

* add HW param for VK

* query warp size properly

* guard against warp_size < 4 case

Co-authored-by: Masahiro Masuda <masahi@129@gmail.com>
---
 src/auto_scheduler/search_task.cc | 23 +++++++++++++++++++++++
 src/runtime/vulkan/vulkan.cc      | 25 ++++++++++++++++---------
 2 files changed, 39 insertions(+), 9 deletions(-)

diff --git a/src/auto_scheduler/search_task.cc b/src/auto_scheduler/search_task.cc
index 22c2893141cf..f25e581dbf24 100755
--- a/src/auto_scheduler/search_task.cc
+++ b/src/auto_scheduler/search_task.cc
@@ -106,6 +106,29 @@ HardwareParams HardwareParamsNode::GetDefaultHardwareParams(const Target& target
       auto target_device = target->GetAttr<String>("device", "");
       LOG(FATAL) << "No default hardware parameters for opencl target device: " << target_device;
     }
+  } else if (device_type == kDLVulkan) {
+    auto ctx = TVMContext{static_cast<DLDeviceType>(device_type), 0};
+    auto device_name = "device_api.vulkan";
+    auto func = tvm::runtime::Registry::Get(device_name);
+    ICHECK(func != nullptr) << "Cannot find Vulkan device_api in registry";
+    auto device_api = static_cast<tvm::runtime::DeviceAPI*>(((*func)()).operator void*());
+
+    tvm::runtime::TVMRetValue ret;
+    device_api->GetAttr(ctx, tvm::runtime::DeviceAttrKind::kMaxSharedMemoryPerBlock, &ret);
+    int max_shared_memory_per_block = ret;
+
+    int max_local_memory_per_block = INT32_MAX;
+
+    device_api->GetAttr(ctx, tvm::runtime::DeviceAttrKind::kMaxThreadsPerBlock, &ret);
+    int max_threads_per_block = ret;
+
+    device_api->GetAttr(ctx, tvm::runtime::DeviceAttrKind::kWarpSize, &ret);
+    int warp_size = ret;
+
+    int max_vthread_extent = std::max(1, warp_size / 4);
+
+    return HardwareParams(-1, 16, 64, max_shared_memory_per_block, max_local_memory_per_block,
+                          max_threads_per_block, max_vthread_extent, warp_size);
   } else {
     LOG(FATAL) << "No default hardware parameters for target: " << target;
   }
diff --git a/src/runtime/vulkan/vulkan.cc b/src/runtime/vulkan/vulkan.cc
index 794f3c570f96..ff1b82f930d7 100644
--- a/src/runtime/vulkan/vulkan.cc
+++ b/src/runtime/vulkan/vulkan.cc
@@ -367,28 +367,37 @@ void VulkanDeviceAPI::GetAttr(TVMContext ctx, DeviceAttrKind kind, TVMRetValue*
   }
   ICHECK_LT(index, context_.size()) << "Invalid device id " << index;
   const auto& vctx = context(index);
+  VkPhysicalDeviceProperties phy_prop;
+  vkGetPhysicalDeviceProperties(vctx.phy_device, &phy_prop);
+
   switch (kind) {
     case kMaxThreadsPerBlock: {
-      VkPhysicalDeviceProperties phy_prop;
-      vkGetPhysicalDeviceProperties(vctx.phy_device, &phy_prop);
       int64_t value = phy_prop.limits.maxComputeWorkGroupInvocations;
       *rv = value;
       break;
     }
     case kMaxSharedMemoryPerBlock: {
-      VkPhysicalDeviceProperties phy_prop;
-      vkGetPhysicalDeviceProperties(vctx.phy_device, &phy_prop);
       int64_t value = phy_prop.limits.maxComputeSharedMemorySize;
       *rv = value;
       break;
     }
     case kWarpSize: {
-      *rv = 1;
+      VkPhysicalDeviceSubgroupProperties subgroup_prop;
+      subgroup_prop.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SUBGROUP_PROPERTIES;
+      subgroup_prop.pNext = NULL;
+
+      VkPhysicalDeviceProperties2 phy_prop2;
+      phy_prop2.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2;
+      phy_prop2.pNext = &subgroup_prop;
+
+      vkGetPhysicalDeviceProperties2(vctx.phy_device, &phy_prop2);
+      int64_t subgroup_size = subgroup_prop.subgroupSize;
+      ICHECK(subgroup_size >= 1);
+
+      *rv = subgroup_size;
       break;
     }
     case kComputeVersion: {
-      VkPhysicalDeviceProperties phy_prop;
-      vkGetPhysicalDeviceProperties(vctx.phy_device, &phy_prop);
       int64_t value = phy_prop.apiVersion;
       std::ostringstream os;
       os << VK_VERSION_MAJOR(value) << "." << VK_VERSION_MINOR(value) << "."
@@ -405,8 +414,6 @@ void VulkanDeviceAPI::GetAttr(TVMContext ctx, DeviceAttrKind kind, TVMRetValue*
     case kExist:
       break;
     case kMaxThreadDimensions: {
-      VkPhysicalDeviceProperties phy_prop;
-      vkGetPhysicalDeviceProperties(vctx.phy_device, &phy_prop);
       int64_t dims[3];
       dims[0] = phy_prop.limits.maxComputeWorkGroupSize[0];
       dims[1] = phy_prop.limits.maxComputeWorkGroupSize[1];

From 017ff94d15df85ea8476f8ad3ce234470072ae84 Mon Sep 17 00:00:00 2001
From: ANSHUMAN TRIPATHY <anshuman.t@huawei.com>
Date: Fri, 12 Mar 2021 02:31:59 +0530
Subject: [PATCH 325/357] [TOPI] Sparse Add Op added (#7435)

* [TOPI] Sparse Add Op added

* lint resolved

* TF frontend support added

* Test case added

* [1] Review comment handled

* [2] Review comment handled

* [3] Review comment handled

* [4] Review comment handled

* [5] Review comment handled
---
 python/tvm/relay/frontend/tensorflow.py       | 35 ++++++++++
 python/tvm/relay/op/nn/_nn.py                 |  5 ++
 python/tvm/relay/op/nn/nn.py                  | 47 +++++++++++++
 python/tvm/relay/op/strategy/generic.py       | 23 +++++++
 python/tvm/topi/nn/sparse.py                  | 69 +++++++++++++++++++
 src/relay/op/nn/sparse.cc                     | 41 +++++++++++
 .../frontend/tensorflow/test_forward.py       | 48 +++++++++++++
 tests/python/topi/python/test_topi_sparse.py  | 28 ++++++++
 8 files changed, 296 insertions(+)

diff --git a/python/tvm/relay/frontend/tensorflow.py b/python/tvm/relay/frontend/tensorflow.py
index c79c495b0360..f56d187b6a63 100644
--- a/python/tvm/relay/frontend/tensorflow.py
+++ b/python/tvm/relay/frontend/tensorflow.py
@@ -1286,6 +1286,40 @@ def _impl(inputs, attr, params, mod):
     return _impl
 
 
+def _sparse_tensor_dense_add():
+    # Sparse utility from scipy
+    from scipy.sparse import csr_matrix
+
+    def _impl(inputs, attr, params, mod):
+        assert (
+            len(inputs) == 4
+        ), "There should be 4 input tensors [sparse_indices, sparse_values, sparse_shape, dense]."
+
+        indices_tensor = _infer_value(inputs[0], params, mod).asnumpy()
+        values_tensor = _infer_value(inputs[1], params, mod).asnumpy()
+        dense_shape_tensor = _infer_value(inputs[2], params, mod).asnumpy()
+
+        data = inputs[3]
+
+        rows = [x[0] for x in indices_tensor]
+        cols = [x[1] for x in indices_tensor]
+
+        # Create scipy sparse Tensor(CSR)
+        weight_sp = csr_matrix(
+            (values_tensor, (rows, cols)), shape=tuple(dense_shape_tensor.tolist())
+        )
+
+        weight_data = _expr.const(weight_sp.data, weight_sp.data.dtype)
+        weight_indptrs = _expr.const(weight_sp.indptr, weight_sp.indptr.dtype)
+        weight_indices = _expr.const(weight_sp.indices, weight_sp.indices.dtype)
+
+        ret = _op.nn.sparse_add(data, [weight_data, weight_indices, weight_indptrs])
+
+        return ret
+
+    return _impl
+
+
 def _identity():
     def _impl(inputs, attr, params, mod):
         return inputs[0]
@@ -2787,6 +2821,7 @@ def _impl(inputs, attr, params, mod):
     "SparseSegmentSqrtNWithNumSegments": _sparse_segment_sum_sqrtn_with_num_segments(),
     "SparseSegmentMean": _sparse_segment_mean(),
     "SparseSegmentMeanWithNumSegments": _sparse_segment_mean_with_num_segments(),
+    "SparseTensorDenseAdd": _sparse_tensor_dense_add(),
     "Split": _split(False),
     "SplitV": _split(True),
     "Sqrt": AttrCvt("sqrt"),
diff --git a/python/tvm/relay/op/nn/_nn.py b/python/tvm/relay/op/nn/_nn.py
index 6ae86c0786e5..af64873ee904 100644
--- a/python/tvm/relay/op/nn/_nn.py
+++ b/python/tvm/relay/op/nn/_nn.py
@@ -142,6 +142,11 @@ def alter_op_layout_sparse_dense(attrs, inputs, tinfos, out_type):
     return topi.nn.sparse_dense_alter_layout(attrs, inputs, tinfos, out_type)
 
 
+# sparse_add
+reg.register_strategy("nn.sparse_add", strategy.sparse_add_strategy)
+reg.register_pattern("nn.sparse_add", reg.OpPattern.OPAQUE)
+
+
 @reg.register_compute("nn.internal.sparse_dense_padded")
 def compute_sparse_dense_padded(attrs, inputs, out_type):
     """Compute definition of sparse_dense_padded"""
diff --git a/python/tvm/relay/op/nn/nn.py b/python/tvm/relay/op/nn/nn.py
index 5135ac74de25..a1147fec4d7e 100644
--- a/python/tvm/relay/op/nn/nn.py
+++ b/python/tvm/relay/op/nn/nn.py
@@ -2148,6 +2148,53 @@ def sparse_transpose(x):
     return expr.TupleWrapper(_make.sparse_transpose(x[0], x[1], x[2]), 3)
 
 
+# pylint: disable=no-else-return,inconsistent-return-statements
+def sparse_add(dense_mat, sparse_mat):
+    r"""
+    Computes the matrix addition of `dense_mat` and `sparse_mat`, where `dense_mat` is
+    a dense matrix and `sparse_mat` is a sparse (CSR) namedtuple with
+    fields `data`, `indices`, and `indptr`.
+
+    .. math::
+
+        \mbox{sparse_add}(dense_mat, sparse_mat)[m, n] = \mbox{add}(\mbox{as_dense}(S), (D))[m, n]
+
+    where `as_dense` returns dense equivalent of the given S(sparse matrix)
+    while performing addition with given D(dense matrix).
+
+    Parameters
+    ----------
+    dense_mat : tvm.relay.Expr
+        The input dense matrix for the matrix addition
+
+    sparse_mat : Union[namedtuple, Tuple[ndarray, ndarray, ndarray]].
+        The input sparse matrix(CSR) for the matrix addition.
+
+    Returns
+    -------
+    result: tvm.relay.Expr
+        The computed result.
+
+    Examples
+    -------
+    .. code-block:: python
+        dense_data = [[ 3.,   4.,   4. ]
+                      [ 4.,  2.,  5. ]]
+        sparse_data = [4., 8.]
+        sparse_indices =[0, 2]
+        sparse_indptr =[0, 1, 2]
+
+        output = relay.sparse_add(dense_data, sparse_data, sparse_indices, sparse_indptr)
+
+        output = [[ 7.,   4.,   4. ]
+                  [ 4.,  2.,  13. ]]
+    """
+    if hasattr(sparse_mat, "indices"):
+        return _make.sparse_add(dense_mat, sparse_mat.data, sparse_mat.indices, sparse_mat.indptr)
+    else:
+        return _make.sparse_add(dense_mat, sparse_mat[0], sparse_mat[1], sparse_mat[2])
+
+
 def contrib_conv2d_winograd_without_weight_transform(
     data,
     weight,
diff --git a/python/tvm/relay/op/strategy/generic.py b/python/tvm/relay/op/strategy/generic.py
index be86ea9d9184..04f25640574a 100644
--- a/python/tvm/relay/op/strategy/generic.py
+++ b/python/tvm/relay/op/strategy/generic.py
@@ -799,6 +799,29 @@ def sparse_dense_padded_strategy(attrs, inputs, out_type, target):
     raise NotImplementedError("sparse_dense_padded is only implemented for cuda")
 
 
+# sparse_add
+def wrap_compute_sparse_add(topi_compute):
+    """wrap sparse add topi compute"""
+
+    def _compute_sparse_add(attrs, inputs, out_type):
+        return [topi_compute(inputs[0], inputs[1], inputs[2], inputs[3])]
+
+    return _compute_sparse_add
+
+
+@override_native_generic_func("sparse_add_strategy")
+def sparse_add_strategy(attrs, inputs, out_type, target):
+    """sparse add generic strategy"""
+    logger.warning("sparse add is not optimized for this platform.")
+    strategy = _op.OpStrategy()
+    strategy.add_implementation(
+        wrap_compute_sparse_add(topi.nn.sparse_add),
+        wrap_topi_schedule(topi.generic.schedule_extern),
+        name="sparse_add.generic",
+    )
+    return strategy
+
+
 # sparse_transpose
 @generic_func
 def schedule_sparse_transpose(attrs, outs, target):
diff --git a/python/tvm/topi/nn/sparse.py b/python/tvm/topi/nn/sparse.py
index 1bf18df09da3..756110624aa1 100644
--- a/python/tvm/topi/nn/sparse.py
+++ b/python/tvm/topi/nn/sparse.py
@@ -468,3 +468,72 @@ def _traverse(t):
     sparse_input_map[sparse_indptr] = sparse_prefix + "W_indptr"
 
     return sparse_input_map
+
+
+def sparse_add(dense_data, sparse_data, sparse_indices, sparse_indptr):
+    """
+    Computes sparse-dense addition
+
+    Parameters
+    ----------
+    dense_data : tvm.te.Tensor
+        2-D with shape [M, N]
+
+    sparse_data : tvm.te.Tensor
+        1-D with shape [nnz] (CSR)
+
+    sparse_indices : tvm.te.Tensor
+        1-D with shape [nnz] (CSR)
+
+    sparse_indptr : tvm.te.Tensor
+        1-D with shape [M + 1] (CSR)
+
+    Returns
+    -------
+    output : tvm.te.Tensor
+        2-D with shape [M, N]
+    """
+    # TODO(ANSHUMAN87): support BSR format too
+    assert len(sparse_data.shape) == 1, "only CSR format is supported"
+    return _sparse_add_csr(dense_data, sparse_data, sparse_indices, sparse_indptr)
+
+
+def _sparse_add_csr(dense_data_inp, sparse_data_inp, sparse_indices_inp, sparse_indptr_inp):
+    oshape = get_const_tuple(dense_data_inp.shape)
+
+    def _csr_add_ir(dense_data, sparse_data, sparse_indices, sparse_indptr, out_data):
+        irb = tvm.tir.ir_builder.create()
+        dense_data_ptr = irb.buffer_ptr(dense_data)
+        sparse_data_ptr = irb.buffer_ptr(sparse_data)
+        sparse_indices_ptr = irb.buffer_ptr(sparse_indices)
+        sparse_indptr_ptr = irb.buffer_ptr(sparse_indptr)
+
+        out_data_ptr = irb.buffer_ptr(out_data)
+
+        with irb.for_range(0, oshape[0], kind="vectorize", name="row") as row:
+            with irb.for_range(0, oshape[1], kind="parallel", name="col") as col:
+                out_data_ptr[row, col] = dense_data_ptr[row, col]
+
+        with irb.for_range(0, oshape[0], kind="parallel", name="row") as row:
+            offset = sparse_indptr_ptr[row]
+            diff = sparse_indptr_ptr[row + 1] - sparse_indptr_ptr[row]
+            with irb.for_range(0, diff, kind="serial", name="idx") as idx:
+                real_idx = offset + idx
+                col = sparse_indices_ptr[real_idx]
+                out_data_ptr[row, col] = sparse_data_ptr[real_idx] + out_data_ptr[row, col]
+
+        return irb.get()
+
+    return te.extern(
+        shape=oshape,
+        inputs=[dense_data_inp, sparse_data_inp, sparse_indices_inp, sparse_indptr_inp],
+        fcompute=lambda ins, outs: _csr_add_ir(ins[0], ins[1], ins[2], ins[3], outs[0]),
+        tag="sparse_add_csr",
+        dtype=[
+            dense_data_inp.dtype,
+            sparse_data_inp.dtype,
+            sparse_indices_inp.dtype,
+            sparse_indptr_inp.dtype,
+        ],
+        name="sparse_add_csr_output",
+    )
diff --git a/src/relay/op/nn/sparse.cc b/src/relay/op/nn/sparse.cc
index 6322cfffd7c2..b1a16f18b623 100644
--- a/src/relay/op/nn/sparse.cc
+++ b/src/relay/op/nn/sparse.cc
@@ -196,5 +196,46 @@ RELAY_REGISTER_OP("nn.sparse_transpose")
     .set_support_level(1)
     .add_type_rel("SparseTranspose", SparseTransposeRel);
 
+// relay.nn.sparse_add
+bool SparseAddRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
+                  const TypeReporter& reporter) {
+  ICHECK_EQ(types.size(), 5) << "expecting 4 inputs and 1 output.";
+  const auto* dense_data = types[0].as<TensorTypeNode>();
+  const auto* sparse_data = types[1].as<TensorTypeNode>();
+  ICHECK(reporter->Assert(sparse_data->dtype == dense_data->dtype))
+      << "sparse tensor and dense tensor datatype should match.";
+  ICHECK(reporter->Assert(sparse_data->shape.size() == 1)) << "sparse data tensor should be 1D.";
+  const auto* sparse_indices = types[2].as<TensorTypeNode>();
+  ICHECK(reporter->Assert(sparse_indices->shape.size() == 1))
+      << "sparse indices tensor should be 1D.";
+
+  reporter->Assign(types[4], TensorType(dense_data->shape, dense_data->dtype));
+  return true;
+}
+
+Expr MakeSparseAdd(Expr dense_data, Expr sparse_data, Expr sparse_indices, Expr sparse_indptr) {
+  static const Op& op = Op::Get("nn.sparse_add");
+  return Call(op, {dense_data, sparse_data, sparse_indices, sparse_indptr}, Attrs(), {});
+}
+
+TVM_REGISTER_GLOBAL("relay.op.nn._make.sparse_add").set_body_typed(MakeSparseAdd);
+
+RELAY_REGISTER_OP("nn.sparse_add")
+    .describe(R"code(Add a dense matrix X with sparse matrix Y.
+
+- **dense**: `(M, N)`
+- **sparse**: `(M, N)`
+
+- **out**: `(M, N)`.
+
+)code" TVM_ADD_FILELINE)
+    .set_num_inputs(4)
+    .add_argument("dense_data", "2D Tensor", "Dense data matrix.")
+    .add_argument("sparse_data", "1D Tensor", "Sparse data vector.")
+    .add_argument("sparse_indices", "1D Tensor", "Sparse indices vector.")
+    .add_argument("sparse_indptr", "1D Tensor", "Sparse index pointer vector.")
+    .set_support_level(1)
+    .add_type_rel("SparseAdd", SparseAddRel);
+
 }  // namespace relay
 }  // namespace tvm
diff --git a/tests/python/frontend/tensorflow/test_forward.py b/tests/python/frontend/tensorflow/test_forward.py
index 81aeb5ef886c..fa27dee37699 100644
--- a/tests/python/frontend/tensorflow/test_forward.py
+++ b/tests/python/frontend/tensorflow/test_forward.py
@@ -2352,6 +2352,54 @@ def test_forward_sparse_to_dense_v2():
     _test_sparse_to_dense_v2([[0, 0], [1, 3], [4, 3]], [3.0, 6.0, 9.0], [5, 5], "float32", 1.9)
 
 
+#######################################################################
+# tensorflow.sparse.add
+# ----------------------------------
+
+
+def _test_sparse_add(indices, values, A_shape, B_shape, dtype, flip=False):
+    """ One iteration of tf.sparse.add """
+
+    # TODO(ANSHUMAN87): support cuda
+    # TODO(ANSHUMAN87): support both sparse input case
+
+    with tf.Graph().as_default():
+        A_sp = tf.sparse.SparseTensor(
+            indices=indices, values=np.array(values).astype(dtype), dense_shape=A_shape
+        )
+        B = tf.placeholder(shape=B_shape, dtype=dtype, name="B")
+
+        # TODO(ANSHUMAN87): support user input threashold values
+        if flip:
+            result = tf.sparse.add(B, A_sp, threshold=0)
+        else:
+            result = tf.sparse.add(A_sp, B, threshold=0)
+
+        B_np = np.random.uniform(high=5.0, size=B_shape).astype(dtype)
+
+        compare_tf_with_tvm([B_np], [B.name], result.name, no_gpu=True)
+
+
+def test_sparse_add():
+    """ sparse.add op test"""
+    ###################################################################
+    #
+    # In order to create a SparseTensor, it requires 3 input as below:
+    #    SparseTensor(indices=[[0, 0], [1, 2]], values=[1, 2], dense_shape=[3, 4])
+    #
+    # Above Sparse can be represented in Dense as below :
+    #    [[1, 0, 0, 0]
+    #     [0, 0, 2, 0]
+    #     [0, 0, 0, 0]]
+    #
+    # ------------------------------------------------------------------
+    for dtype_inp in ["float32", "float64", "int32"]:
+        _test_sparse_add([[0, 0], [1, 2]], [4.0, 8.0], [3, 4], [3, 4], dtype_inp)
+        _test_sparse_add([[0, 0], [1, 2]], [4.0, 8.0], [3, 4], [3, 4], dtype_inp, True)
+        _test_sparse_add([[0, 0], [1, 3], [4, 3]], [3.0, 6.0, 9.0], [5, 5], [5, 5], dtype_inp)
+        _test_sparse_add([[0, 0], [1, 3], [4, 3]], [3.0, 6.0, 9.0], [5, 5], [5, 5], dtype_inp, True)
+
+
 #######################################################################
 # StridedSlice
 # ------------
diff --git a/tests/python/topi/python/test_topi_sparse.py b/tests/python/topi/python/test_topi_sparse.py
index d5bd7aa1a21e..d84bd1530587 100644
--- a/tests/python/topi/python/test_topi_sparse.py
+++ b/tests/python/topi/python/test_topi_sparse.py
@@ -526,6 +526,33 @@ def test_sparse_dense_padded_alter_op():
             x = relay.build(tvm.IRModule.from_expr(f), target=tvm.target.Target("cuda"))
 
 
+def test_sparse_add_csr():
+    for indices_dtype in ["int32", "int64"]:
+        for data_dtype in ["float32", "float64"]:
+            M, K, density = 3, 49, 0.2
+            X_np = np.random.randn(M, K).astype(data_dtype)
+            Y_sp_np = sp.random(M, K, density=density, format="csr", dtype=data_dtype)
+            Y_np = Y_sp_np.todense()
+            Z_np = X_np + Y_np
+
+            Y_data = te.placeholder(shape=Y_sp_np.data.shape, dtype=data_dtype)
+            Y_indices = te.placeholder(shape=Y_sp_np.indices.shape, dtype=indices_dtype)
+            Y_indptr = te.placeholder(shape=Y_sp_np.indptr.shape, dtype=indices_dtype)
+            X = te.placeholder(shape=X_np.shape, dtype=data_dtype)
+            Z = topi.nn.sparse_add(X, Y_data, Y_indices, Y_indptr)
+            s = te.create_schedule(Z.op)
+            func = tvm.build(s, [X, Y_data, Y_indices, Y_indptr, Z])
+            Z_tvm = tvm.nd.array(np.zeros(Z_np.shape, dtype=Z_np.dtype))
+            func(
+                tvm.nd.array(X_np.astype(data_dtype)),
+                tvm.nd.array(Y_sp_np.data.astype(data_dtype)),
+                tvm.nd.array(Y_sp_np.indices.astype(indices_dtype)),
+                tvm.nd.array(Y_sp_np.indptr.astype(indices_dtype)),
+                Z_tvm,
+            )
+            tvm.testing.assert_allclose(Z_tvm.asnumpy(), Z_np, atol=1e-4, rtol=1e-4)
+
+
 if __name__ == "__main__":
     test_csrmv()
     test_csrmm()
@@ -537,3 +564,4 @@ def test_sparse_dense_padded_alter_op():
     test_sparse_dense_padded_alter_op()
     test_sparse_dense_csr_reverse()
     test_sparse_dense_bsr_reverse()
+    test_sparse_add_csr()

From e9e014b4685ea05885cb321f1e42d2d3f71a0407 Mon Sep 17 00:00:00 2001
From: Josh Fromm <jwfromm@uw.edu>
Date: Thu, 11 Mar 2021 14:53:48 -0800
Subject: [PATCH 326/357] [Relay][QNN] Simulated Quantize and Dequantize
 (#7613)

* Add initial implementation of flexible simulated qnn ops.

* Added proper topi testing and fixed qnn axis bug.

* Add injective schedule wrapping.

* Stuck on typerel problem.

* Relay integration fully working.

* Simulated quantize totally finished.

* Change dtype to be a scalar rather than tensor.

* Undo change to quantize.

* formatting.

* Fix attritubes.

* Fix negative axis dequantize bug.

* Add topi simulated dequantize.

* Add simulated_dequantize op to topi and relay.

* Formatting.

* Test negative axis perchannel dequantization.

* Lint formatting.

* Change import order to make lint happy.

* Fix pytest.

* Directly return make call.

* Clarify disable mode for simulated qnn ops and fix typos.

* Line too long oops.

Co-authored-by: Ubuntu <jwfromm@jwfromm-cpu-dev.itxhlkosmouevgkdrmwxfbs5qh.xx.internal.cloudapp.net>
---
 include/tvm/relay/qnn/attrs.h                 |  12 ++
 python/tvm/relay/qnn/op/__init__.py           |   2 +-
 python/tvm/relay/qnn/op/_qnn.py               |  52 +++++
 python/tvm/relay/qnn/op/qnn.py                |  72 ++++++-
 python/tvm/topi/nn/__init__.py                |   1 +
 python/tvm/topi/nn/qnn.py                     | 190 ++++++++++++++++++
 src/relay/qnn/op/dequantize.cc                |   9 +-
 src/relay/qnn/op/quantize.cc                  |  13 +-
 src/relay/qnn/op/simulated_dequantize.cc      |  80 ++++++++
 src/relay/qnn/op/simulated_quantize.cc        |  82 ++++++++
 tests/python/relay/test_op_qnn_dequantize.py  |   2 +-
 tests/python/relay/test_op_qnn_quantize.py    |   2 +-
 .../relay/test_op_qnn_simulated_dequantize.py | 177 ++++++++++++++++
 .../relay/test_op_qnn_simulated_quantize.py   | 178 ++++++++++++++++
 tests/python/topi/python/test_topi_qnn.py     | 157 +++++++++++++++
 15 files changed, 1019 insertions(+), 10 deletions(-)
 create mode 100644 python/tvm/relay/qnn/op/_qnn.py
 create mode 100644 python/tvm/topi/nn/qnn.py
 create mode 100644 src/relay/qnn/op/simulated_dequantize.cc
 create mode 100644 src/relay/qnn/op/simulated_quantize.cc
 create mode 100644 tests/python/relay/test_op_qnn_simulated_dequantize.py
 create mode 100644 tests/python/relay/test_op_qnn_simulated_quantize.py
 create mode 100644 tests/python/topi/python/test_topi_qnn.py

diff --git a/include/tvm/relay/qnn/attrs.h b/include/tvm/relay/qnn/attrs.h
index c5213fe07471..f0280a90c604 100644
--- a/include/tvm/relay/qnn/attrs.h
+++ b/include/tvm/relay/qnn/attrs.h
@@ -75,6 +75,18 @@ struct QuantizeAttrs : public tvm::AttrsNode<QuantizeAttrs> {
   }
 };
 
+struct SimulatedQuantizeAttrs : public tvm::AttrsNode<SimulatedQuantizeAttrs> {
+  int axis;
+
+  TVM_DECLARE_ATTRS(SimulatedQuantizeAttrs, "relay.attrs.SimulatedQuantizeAttrs") {
+    TVM_ATTR_FIELD(axis)
+        .describe(
+            "The output channel axis for channel wise quantization. Default value is -1,"
+            "which corresponds to the last axis.")
+        .set_default(-1);
+  }
+};
+
 /*! \brief Attribute for dequantize operator */
 struct DequantizeAttrs : public tvm::AttrsNode<DequantizeAttrs> {
   int axis;
diff --git a/python/tvm/relay/qnn/op/__init__.py b/python/tvm/relay/qnn/op/__init__.py
index 6d66e12eeafc..848409360a9d 100644
--- a/python/tvm/relay/qnn/op/__init__.py
+++ b/python/tvm/relay/qnn/op/__init__.py
@@ -19,4 +19,4 @@
 from __future__ import absolute_import as _abs
 from .qnn import *
 from .op import register_qnn_legalize
-from . import legalizations, layout_conversions
+from . import _qnn, legalizations, layout_conversions
diff --git a/python/tvm/relay/qnn/op/_qnn.py b/python/tvm/relay/qnn/op/_qnn.py
new file mode 100644
index 000000000000..a059c293a0f8
--- /dev/null
+++ b/python/tvm/relay/qnn/op/_qnn.py
@@ -0,0 +1,52 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=invalid-name, unused-argument, len-as-condition
+"""QNN operator feature registration"""
+
+from tvm import topi
+
+from ...op.op import register_compute
+from ...op.op import register_injective_schedule
+from ...op.op import register_pattern, OpPattern
+
+
+@register_compute("qnn.simulated_quantize")
+def simulated_quantize_compute(attrs, inputs, output_type):
+    assert len(inputs) == 4
+    return [
+        topi.nn.simulated_quantize(
+            inputs[0], inputs[1], inputs[2], inputs[3], axis=attrs.get_int("axis")
+        )
+    ]
+
+
+register_injective_schedule("qnn.simulated_quantize")
+register_pattern("qnn.simulated_quantize", OpPattern.ELEMWISE)
+
+
+@register_compute("qnn.simulated_dequantize")
+def simulated_dequantize_compute(attrs, inputs, output_type):
+    assert len(inputs) == 4
+    return [
+        topi.nn.simulated_dequantize(
+            inputs[0], inputs[1], inputs[2], inputs[3], axis=attrs.get_int("axis")
+        )
+    ]
+
+
+register_injective_schedule("qnn.simulated_dequantize")
+register_pattern("qnn.simulated_dequantize", OpPattern.ELEMWISE)
diff --git a/python/tvm/relay/qnn/op/qnn.py b/python/tvm/relay/qnn/op/qnn.py
index a5892f331f06..f02f8227e14a 100644
--- a/python/tvm/relay/qnn/op/qnn.py
+++ b/python/tvm/relay/qnn/op/qnn.py
@@ -18,8 +18,10 @@
 """QNN dialect operators."""
 
 from __future__ import absolute_import as _abs
+from tvm import relay
 from tvm.relay.expr import Tuple, TupleWrapper
 from tvm.relay.op.nn.utils import get_pad_tuple2d
+from tvm.topi.nn.qnn import SQNN_DTYPE_TO_CODE
 from . import _make
 from ... import op as reg
 from ...op import OpPattern
@@ -118,6 +120,40 @@ def quantize(data, output_scale, output_zero_point, axis=-1, out_dtype="int8"):
     return _make.quantize(data, output_scale, output_zero_point, axis, out_dtype)
 
 
+def simulated_quantize(data, output_scale, output_zero_point, axis=-1, out_dtype="int8"):
+    r"""Simulated Quantize op
+    Mimics the quantize op but has more flexibility in valid inputs and always
+    outputs the same type as the input. This can be useful for
+    calibrating or training a quantized network.
+
+    Parameters
+    ----------
+    data : tvm.relay.Expr
+        The input tensor to be quantized. Can be of type float32.
+    output_zero_point : tvm.relay.Expr
+        The output zero_point.
+    output_scale : tvm.relay.Expr
+        The output scale.
+    axis : int
+        The channel axis for quantization. Default value is -1 which corresponds to the last axis.
+    out_dtype : string or tvm.relay.Expr
+        A string or tensor indicating which datatype to quantize to.
+
+    Returns
+    -------
+    result : tvm.relay.Expr
+        The computed result.
+    """
+    # Convert string dtype to a constant if needed.
+    if isinstance(out_dtype, str):
+        type_code = SQNN_DTYPE_TO_CODE[out_dtype]
+        out_dtype = relay.const(type_code, dtype="int32")
+    # Wrap reshapes around qnn parameter tensors to guarantee shape compatibility.
+    output_scale = relay.op.reshape(output_scale, [-1])
+    output_zero_point = relay.op.reshape(output_zero_point, [-1])
+    return _make.simulated_quantize(data, out_dtype, output_scale, output_zero_point, axis)
+
+
 def dequantize(data, input_scale, input_zero_point, axis=-1):
     r"""Dequantize op
     This operator takes quantized int8 and unit8 as input and produces
@@ -127,7 +163,7 @@ def dequantize(data, input_scale, input_zero_point, axis=-1):
     Parameters
     ----------
     data : tvm.relay.Expr
-        The input tensor to be dequantized. Can be of type [int8, uint8].
+        The input tensor to be dequantized. Can be of type [int8, uint8, int32].
     input_zero_point : tvm.relay.Expr
         The input zero_point.
     input_scale : tvm.relay.Expr
@@ -143,6 +179,40 @@ def dequantize(data, input_scale, input_zero_point, axis=-1):
     return _make.dequantize(data, input_scale, input_zero_point, axis)
 
 
+def simulated_dequantize(data, input_scale, input_zero_point, axis=-1, in_dtype="int8"):
+    r"""Simulated Dequantize op
+    Mimics the dequantize op but has more flexibility in valid inputs and always
+    outputs the same type as the input. This can be useful for calibrating or
+    training a quantized network.
+
+    Parameters
+    ----------
+    data : tvm.relay.Expr
+        The input tensor to be dequantized.
+    input_zero_point : tvm.relay.Expr
+        The input zero_point.
+    input_scale : tvm.relay.Expr
+        The input scale.
+    axis : int
+        The channel axis for quantization. Default value is -1 which corresponds to the last axis.
+    in_dtype : string or tvm.relay.Expr
+        A string or tensor indicating which datatype to dequantize from.
+
+    Returns
+    -------
+    result : tvm.relay.Expr
+        The computed result.
+    """
+    # Convert string dtype to a constant if needed.
+    if isinstance(in_dtype, str):
+        type_code = SQNN_DTYPE_TO_CODE[in_dtype]
+        in_dtype = relay.const(type_code, dtype="int32")
+    # Wrap reshapes around qnn parameter tensors to guarantee shape compatibility.
+    input_scale = relay.op.reshape(input_scale, [-1])
+    input_zero_point = relay.op.reshape(input_zero_point, [-1])
+    return _make.simulated_dequantize(data, in_dtype, input_scale, input_zero_point, axis)
+
+
 def concatenate(data, input_scales, input_zero_points, output_scale, output_zero_point, axis):
     """Concatenate the quantized input tensors along the given axis.
 
diff --git a/python/tvm/topi/nn/__init__.py b/python/tvm/topi/nn/__init__.py
index 2ebbd1d67bd1..94a5b30c9b76 100644
--- a/python/tvm/topi/nn/__init__.py
+++ b/python/tvm/topi/nn/__init__.py
@@ -36,6 +36,7 @@
 from .conv2d_transpose import *
 from .conv1d_transpose import *
 from .bnn import *
+from .qnn import *
 from .upsampling import *
 from .local_response_norm import *
 from .bitserial_conv2d import *
diff --git a/python/tvm/topi/nn/qnn.py b/python/tvm/topi/nn/qnn.py
new file mode 100644
index 000000000000..caed28580037
--- /dev/null
+++ b/python/tvm/topi/nn/qnn.py
@@ -0,0 +1,190 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Quantized Neural Network (QNN) Operators"""
+import tvm
+from tvm import te, tir, topi
+
+SQNN_DISABLE = 0
+SQNN_INT8 = 1
+SQNN_UINT8 = 2
+SQNN_INT32 = 3
+
+SQNN_DTYPE_TO_CODE = {
+    "disable": SQNN_DISABLE,
+    "int8": SQNN_INT8,
+    "uint8": SQNN_UINT8,
+    "int32": SQNN_INT32,
+}
+
+SQNN_CODE_TO_DTYPE = {v: k for k, v in SQNN_DTYPE_TO_CODE.items()}
+
+
+@tvm.te.tag_scope(tag=topi.tag.ELEMWISE)
+def simulated_quantize(data, out_dtype, output_scale=None, output_zero_point=None, axis=-1):
+    """Simulated QNN quantize operator that mimics QNN outputs without changing datatype.
+    The benefit of this operator over true QNN quantize is that this operator allows dynamic
+    datatype selection and can operate on both per-channel and scalar scales and zero points while
+    QNN quantize requires both of these to be fixed at compile time.
+
+    Parameters
+    ----------
+    data: tvm.te.Tensor
+        An N-D input tensor to the operator.
+
+    out_dtype: tvm.te.Tensor
+        A scalar variable that indicates which datatype to simulate quantization with. Use
+        SQNN_DTYPE_TO_CODE to convert a dtype string into the corresponding variable
+        value.
+
+    output_scale: tvm.te.Tensor, optional
+        A scalar tensor representing the scale to use when quantizing to integer datatypes.
+        When it contains more than a single value, N must match the number of channels in data.
+
+    output_zero_point: tvm.te.Tensor, optional
+        A 1-D tensor representing the zero point to use when quantizing to integer datatypes.
+        When it contains more than a single value, N must match the number of channels in data.
+
+    axis: int, optional
+        The channel axis for quantization. Default value is -1 which corresponds to the last axis.
+
+    """
+    # When disabled, just pass through the input values.
+    def _compute_pass_through(value, *indices):
+        return value[indices]
+
+    # Simulate quantization for arbitrary integer datatypes. The computation for all datatypes is:
+    # Q_output = clip((round(input_tensor/output_scale) + output_zero_point),
+    #                 out_dtype::min,
+    #                 out_dtype::max)
+    def _compute_intn(dtype, value, *indices):
+        assert output_scale is not None and output_zero_point is not None
+        const_min = tvm.tir.min_value(dtype)
+        const_max = tvm.tir.max_value(dtype)
+        # Use indexmod to handle both scalar and per-channel QNN parameters.
+        scale_idx = tir.indexmod(indices[axis], topi.shape(output_scale)[0])
+        zp_idx = tir.indexmod(indices[axis], topi.shape(output_zero_point)[0])
+        return te.max(
+            te.min(
+                te.round(value[indices] / output_scale[scale_idx]) + output_zero_point[zp_idx],
+                const_max,
+            ),
+            const_min,
+        )
+
+    # Use an if chain to dynamically return the proper quantization based on the input datatype.
+    # This allows the op to compile once but apply different quantization approaches
+    # using a variable datatype input.
+    def _dispatch_sim_quantize(value):
+        pass_through_value = te.compute(
+            data.shape, lambda *indices: _compute_pass_through(value, *indices)
+        )
+        int8_value = te.compute(
+            data.shape,
+            lambda *indices: tir.if_then_else(
+                out_dtype.equal(SQNN_DTYPE_TO_CODE["int8"]),
+                _compute_intn("int8", value, *indices),
+                pass_through_value[indices],
+            ),
+        )
+        uint8_value = te.compute(
+            data.shape,
+            lambda *indices: tir.if_then_else(
+                out_dtype.equal(SQNN_DTYPE_TO_CODE["uint8"]),
+                _compute_intn("uint8", value, *indices),
+                int8_value[indices],
+            ),
+        )
+        int32_value = te.compute(
+            data.shape,
+            lambda *indices: tir.if_then_else(
+                out_dtype.equal(SQNN_DTYPE_TO_CODE["int32"]),
+                _compute_intn("int32", value, *indices),
+                uint8_value[indices],
+            ),
+        )
+
+        return int32_value
+
+    return te.compute(data.shape, lambda *indices: _dispatch_sim_quantize(data)[indices])
+
+
+@tvm.te.tag_scope(tag=topi.tag.ELEMWISE)
+def simulated_dequantize(data, in_dtype, input_scale=None, input_zero_point=None, axis=-1):
+    """Simulated QNN dequantize operator that mimics QNN outputs without changing datatype.
+    The benefit of this operator over true QNN dequantize is that this operator allows dynamic
+    datatype selection and can operate on both per-channel and scalar scales and zero points while
+    QNN dequantize requires both of these to be fixed at compile time.
+
+    Parameters
+    ----------
+    data: tvm.te.Tensor
+        An N-D input tensor to the operator.
+
+    in_dtype: tvm.te.Tensor
+        A scalar variable that indicates which datatype to simulate dequantization with. Use
+        SQNN_DTYPE_TO_CODE to convert a dtype string into the corresponding variable
+        value.
+
+    input_scale: tvm.te.Tensor, optional
+        A scalar tensor representing the scale to use when dequantizing from integer datatypes.
+        When it contains more than a single value, N must match the number of channels in data.
+
+    input_zero_point: tvm.te.Tensor, optional
+        A 1-D tensor representing the zero point to use when dequantizing from integer datatypes.
+        When it contains more than a single value, N must match the number of channels in data.
+
+    axis: int, optional
+        The channel axis for quantization. Default value is -1 which corresponds to the last axis.
+
+    """
+    # When disabled simply return the input tensor.
+    def _compute_pass_through(value, *indices):
+        return value[indices]
+
+    # Simulate dequantization for arbitrary integer datatypes. The computation for all datatypes is:
+    # DQ_output = (input - zero_point) * scale
+    def _compute_intn(value, *indices):
+        assert input_scale is not None and input_zero_point is not None
+        # Use indexmod to handle both scalar and per-channel QNN parameters.
+        scale_idx = tir.indexmod(indices[axis], topi.shape(input_scale)[0])
+        zp_idx = tir.indexmod(indices[axis], topi.shape(input_zero_point)[0])
+        return (value[indices] - input_zero_point[zp_idx]) * input_scale[scale_idx]
+
+    # Use an if chain to dynamically return the proper dequantization based on the input datatype.
+    # This allows the op to compile once but apply different quantization approaches
+    # using a variable datatype input.
+    def _dispatch_sim_dequantize(value):
+        pass_through_value = te.compute(
+            data.shape, lambda *indices: _compute_pass_through(value, *indices)
+        )
+        intn_condition = tvm.te.any(
+            in_dtype.equal(SQNN_DTYPE_TO_CODE["int8"]),
+            in_dtype.equal(SQNN_DTYPE_TO_CODE["uint8"]),
+            in_dtype.equal(SQNN_DTYPE_TO_CODE["int32"]),
+        )
+        intn_value = te.compute(
+            data.shape,
+            lambda *indices: tir.if_then_else(
+                intn_condition,
+                _compute_intn(value, *indices),
+                pass_through_value[indices],
+            ),
+        )
+
+        return intn_value
+
+    return te.compute(data.shape, lambda *indices: _dispatch_sim_dequantize(data)[indices])
diff --git a/src/relay/qnn/op/dequantize.cc b/src/relay/qnn/op/dequantize.cc
index 724441e0c523..b0fe9356a758 100644
--- a/src/relay/qnn/op/dequantize.cc
+++ b/src/relay/qnn/op/dequantize.cc
@@ -53,7 +53,7 @@ bool DequantizeRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
 
   const auto* dequantize_attrs = attrs.as<DequantizeAttrs>();
   int axis = dequantize_attrs->axis;
-  axis = (axis == -1) ? data->shape.size() - 1 : axis;
+  axis = (axis < 0) ? data->shape.size() + axis : axis;
   ICHECK_LT(axis, static_cast<int>(data->shape.size()))
       << "axis " << dequantize_attrs->axis << " is out of range";
   ICHECK_GE(axis, 0) << "axis " << dequantize_attrs->axis << " is out of range";
@@ -81,7 +81,7 @@ Expr MakeDequantize(Expr data, Expr input_scale, Expr input_zero_point, int axis
 Expr DequantizeLower(const Expr& input_tensor, const Expr& input_scale,
                      const Expr& input_zero_point, const Array<tvm::relay::Type>& types,
                      const DequantizeAttrs* attrs) {
-  const auto axis = attrs->axis;
+  auto axis = attrs->axis;
 
   ICHECK_EQ(types.size(), 4);
   auto in_type = types[0];
@@ -92,6 +92,11 @@ Expr DequantizeLower(const Expr& input_tensor, const Expr& input_scale,
 
   size_t n_dim = input_shape.size();
 
+  // Wrap axis from negative to positive if needed.
+  if (axis < 0) {
+    axis = static_cast<int>(n_dim) + axis;
+  }
+
   // Expand scale and zero point if the input tensor is channel quantized
   auto expanded_input_scale = input_scale;
   if (!IsConstScalar(input_scale) && !IsScalarType(types[1])) {
diff --git a/src/relay/qnn/op/quantize.cc b/src/relay/qnn/op/quantize.cc
index 9829834f43a3..751abfc5ca81 100644
--- a/src/relay/qnn/op/quantize.cc
+++ b/src/relay/qnn/op/quantize.cc
@@ -19,8 +19,8 @@
 
 /*!
  * \file src/relay/qnn/op/quantize.cc
- * \brief QNN dequantize operator. Dequantize operator converts from quantized
- * domain to unquantized domain.
+ * \brief QNN quantize operator. Quantize operator converts from unquantized
+ * domain to quantized domain.
  */
 
 #include <tvm/relay/analysis.h>
@@ -51,7 +51,7 @@ bool QuantizeRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
 
   const auto* quantize_attrs = attrs.as<QuantizeAttrs>();
   int axis = quantize_attrs->axis;
-  axis = (axis == -1) ? data->shape.size() - 1 : axis;
+  axis = (axis < 0) ? data->shape.size() + axis : axis;
   ICHECK_LT(axis, static_cast<int>(data->shape.size()))
       << "axis " << quantize_attrs->axis << " is out of range";
   ICHECK_GE(axis, 0) << "axis " << quantize_attrs->axis << " is out of range";
@@ -93,10 +93,15 @@ Expr QuantizeLower(const Expr& input_tensor, const Expr& output_scale,
   Array<IndexExpr> input_shape = in_tensor_type->shape;
 
   const auto out_dtype = attrs->out_dtype;
-  const auto axis = attrs->axis;
+  auto axis = attrs->axis;
 
   size_t n_dim = input_shape.size();
 
+  // Wrap axis from negative to positive if needed.
+  if (axis < 0) {
+    axis = static_cast<int>(n_dim) + axis;
+  }
+
   auto expanded_output_scale = output_scale;
   if (!IsConstScalar(output_scale) && !IsScalarType(types[1])) {
     expanded_output_scale = ExpandBiasToMatchAxis(output_scale, n_dim, {axis});
diff --git a/src/relay/qnn/op/simulated_dequantize.cc b/src/relay/qnn/op/simulated_dequantize.cc
new file mode 100644
index 000000000000..e1fc47d700c9
--- /dev/null
+++ b/src/relay/qnn/op/simulated_dequantize.cc
@@ -0,0 +1,80 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file src/relay/qnn/op/simulated_dequantize.cc
+ * \brief QNN simulated dequantize operator. Mimics the behavior
+ * of QNN dequantize in floating point with added flexibility.
+ */
+
+#include <tvm/relay/analysis.h>
+#include <tvm/relay/op_attr_types.h>
+#include <tvm/relay/qnn/attrs.h>
+
+#include "../../transforms/pattern_utils.h"
+#include "../utils.h"
+
+namespace tvm {
+namespace relay {
+namespace qnn {
+
+bool SimulatedDequantizeRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
+                            const TypeReporter& reporter) {
+  // types = [data_type, datatype_type, scale_type, zp_type, ret_type]
+  ICHECK_EQ(types.size(), 5);
+  const auto* data = types[0].as<TensorTypeNode>();
+  const auto* dtype = types[1].as<TensorTypeNode>();
+
+  if ((data == nullptr) || (dtype == nullptr)) {
+    return false;
+  }
+
+  // assign output type
+  reporter->Assign(types[4], TensorType(data->shape, data->dtype));
+  return true;
+}
+
+Expr MakeSimulatedDequantize(Expr data, Expr in_dtype, Expr input_scale, Expr input_zero_point,
+                             int axis) {
+  auto attrs = make_object<DequantizeAttrs>();
+  attrs->axis = axis;
+  static const Op& op = Op::Get("qnn.simulated_dequantize");
+  return Call(op, {data, in_dtype, input_scale, input_zero_point}, Attrs(attrs), {});
+}
+
+RELAY_REGISTER_OP("qnn.simulated_dequantize")
+    .describe(R"code(Simulates the functionality of qnn.dequantize but allows more flexible
+    dynamic input type conversion and always operates on float values.
+)code" TVM_ADD_FILELINE)
+    .set_attrs_type<DequantizeAttrs>()
+    .set_num_inputs(4)
+    .add_argument("data", "Tensor", "The tensor to dequantize.")
+    .add_argument("in_dtype", "Tensor",
+                  "A code corresponding to the type of quantization to convert from.")
+    .add_argument("input_scale", "Tensor", "The quantization scale of the input tensor.")
+    .add_argument("input_zero_point", "Tensor", "The quantization zero_point of the input tensor.")
+    .set_support_level(11)
+    .add_type_rel("QNNSimulatedDequantize", SimulatedDequantizeRel);
+
+TVM_REGISTER_GLOBAL("relay.qnn.op._make.simulated_dequantize")
+    .set_body_typed(MakeSimulatedDequantize);
+
+}  // namespace qnn
+}  // namespace relay
+}  // namespace tvm
diff --git a/src/relay/qnn/op/simulated_quantize.cc b/src/relay/qnn/op/simulated_quantize.cc
new file mode 100644
index 000000000000..089762a6ade0
--- /dev/null
+++ b/src/relay/qnn/op/simulated_quantize.cc
@@ -0,0 +1,82 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file src/relay/qnn/op/simulated_quantize.cc
+ * \brief QNN simulated quantize operator. Mimics the behavior
+ * of QNN quantize in floating point with added flexibility.
+ */
+
+#include <tvm/relay/analysis.h>
+#include <tvm/relay/op_attr_types.h>
+#include <tvm/relay/qnn/attrs.h>
+
+#include "../../transforms/pattern_utils.h"
+#include "../utils.h"
+
+namespace tvm {
+namespace relay {
+namespace qnn {
+
+TVM_REGISTER_NODE_TYPE(SimulatedQuantizeAttrs);
+
+bool SimulatedQuantizeRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
+                          const TypeReporter& reporter) {
+  // types = [data_type, datatype_type, scale_type, zp_type, ret_type]
+  ICHECK_EQ(types.size(), 5);
+  const auto* data = types[0].as<TensorTypeNode>();
+  const auto* dtype = types[1].as<TensorTypeNode>();
+
+  if ((data == nullptr) || (dtype == nullptr)) {
+    return false;
+  }
+
+  // assign output type
+  reporter->Assign(types[4], TensorType(data->shape, data->dtype));
+  return true;
+}
+
+Expr MakeSimulatedQuantize(Expr data, Expr out_dtype, Expr output_scale, Expr output_zero_point,
+                           int axis) {
+  auto attrs = make_object<SimulatedQuantizeAttrs>();
+  attrs->axis = axis;
+  static const Op& op = Op::Get("qnn.simulated_quantize");
+  return Call(op, {data, out_dtype, output_scale, output_zero_point}, Attrs(attrs), {});
+}
+
+RELAY_REGISTER_OP("qnn.simulated_quantize")
+    .describe(R"code(Simulates the functionality of qnn.quantize but allows more flexible
+    dynamic input type conversion and always outputs float values.
+)code" TVM_ADD_FILELINE)
+    .set_attrs_type<SimulatedQuantizeAttrs>()
+    .set_num_inputs(4)
+    .add_argument("data", "Tensor", "The tensor to quantize.")
+    .add_argument("out_dtype", "Tensor",
+                  "A code corresponding to the type of quantization to apply.")
+    .add_argument("output_scale", "Tensor", "The quantization scale of the output tensor.")
+    .add_argument("output_zero_point", "Tensor",
+                  "The quantization zero_point of the output tensor.")
+    .set_support_level(11)
+    .add_type_rel("QNNSimulatedQuantize", SimulatedQuantizeRel);
+
+TVM_REGISTER_GLOBAL("relay.qnn.op._make.simulated_quantize").set_body_typed(MakeSimulatedQuantize);
+
+}  // namespace qnn
+}  // namespace relay
+}  // namespace tvm
diff --git a/tests/python/relay/test_op_qnn_dequantize.py b/tests/python/relay/test_op_qnn_dequantize.py
index e7fb161a13cb..1833458fdb75 100644
--- a/tests/python/relay/test_op_qnn_dequantize.py
+++ b/tests/python/relay/test_op_qnn_dequantize.py
@@ -98,7 +98,7 @@ def test_channelwise_axis_1():
     }
 
     dequantize_test_driver(
-        in_dtype="uint8", quant_args=quant_args, in_data=data, verify_output_data=output, axis=1
+        in_dtype="uint8", quant_args=quant_args, in_data=data, verify_output_data=output, axis=-1
     )
 
 
diff --git a/tests/python/relay/test_op_qnn_quantize.py b/tests/python/relay/test_op_qnn_quantize.py
index 2ef298679904..b300c5612174 100644
--- a/tests/python/relay/test_op_qnn_quantize.py
+++ b/tests/python/relay/test_op_qnn_quantize.py
@@ -127,7 +127,7 @@ def test_channelwise_axis_1():
     quantize_test_driver(
         in_dtype="float32",
         quant_args=quant_args,
-        axis=1,
+        axis=-1,
         out_dtype="uint8",
         in_data=data,
         verify_output_data=output,
diff --git a/tests/python/relay/test_op_qnn_simulated_dequantize.py b/tests/python/relay/test_op_qnn_simulated_dequantize.py
new file mode 100644
index 000000000000..0cc04e4998eb
--- /dev/null
+++ b/tests/python/relay/test_op_qnn_simulated_dequantize.py
@@ -0,0 +1,177 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import tvm
+from tvm import te
+import numpy as np
+from tvm import relay
+from tvm.contrib import graph_runtime
+from tvm.runtime.vm import VirtualMachine
+from tvm.topi.nn.qnn import SQNN_DTYPE_TO_CODE
+
+
+def dequantize_test_driver(in_dtype, quant_args, axis, in_data):
+    shape = in_data.shape
+    input_data = relay.var("input_data", shape=shape, dtype=in_dtype)
+    input_zero_point = relay.const(quant_args["in_zero_point"])
+    input_scale = relay.const(quant_args["in_scale"])
+    dequantized_output = relay.qnn.op.dequantize(
+        input_data,
+        input_scale=input_scale,
+        input_zero_point=input_zero_point,
+        axis=axis,
+    )
+    mod = relay.Function(relay.analysis.free_vars(dequantized_output), dequantized_output)
+    mod = tvm.IRModule.from_expr(mod)
+    with tvm.transform.PassContext(opt_level=3):
+        graph, lib, params = relay.build(mod, "llvm", params=None)
+    rt_mod = graph_runtime.create(graph, lib, ctx=tvm.cpu(0))
+    rt_mod.set_input(input_data=in_data)
+    rt_mod.set_input(**params)
+    rt_mod.run()
+    res = rt_mod.get_output(0).asnumpy()
+    return res
+
+
+def build_simulated_dequantize(input_data, scale, zp, dtype, axis=-1):
+    sim_q = relay.qnn.op.simulated_dequantize(
+        input_data,
+        scale,
+        zp,
+        axis=axis,
+        in_dtype=dtype,
+    )
+    mod = tvm.IRModule.from_expr(sim_q)
+    with tvm.transform.PassContext(opt_level=3):
+        vm_exec = relay.vm.compile(mod, "llvm", params=None)
+    vm = VirtualMachine(vm_exec, tvm.cpu(0))
+    return vm
+
+
+def verify_simulated_dequantize_simple(dtype):
+    data = np.random.uniform(low=-128, high=127, size=[2, 5]).astype(dtype)
+    data_fp = data.astype("float32")
+    scale_np = np.float32(0.5)
+    zp_np = np.int32(127)
+    dtype_np = np.int32(SQNN_DTYPE_TO_CODE[dtype])
+    quant_args = {"in_zero_point": zp_np, "in_scale": scale_np}
+    dq_out = dequantize_test_driver(
+        in_dtype=dtype,
+        quant_args=quant_args,
+        axis=-1,
+        in_data=data,
+    )
+    input_data = relay.var("input_data", shape=data.shape, dtype="float32")
+    scale = relay.var("scale", shape=[])
+    zp = relay.var("zp", shape=[])
+    dtype = relay.var("dtype", shape=[])
+    vm = build_simulated_dequantize(input_data, scale, zp, dtype)
+    sim_dq_out = vm.invoke("main", input_data=data_fp, scale=scale_np, zp=zp_np, dtype=dtype_np)
+    np.testing.assert_equal(sim_dq_out.asnumpy(), dq_out)
+
+
+def test_simulated_dequantize():
+    verify_simulated_dequantize_simple("uint8")
+    verify_simulated_dequantize_simple("int8")
+    verify_simulated_dequantize_simple("int32")
+
+
+def test_dynamic_channels():
+    # Compile simulated quantize once but support either per-channel or scalar params.
+    data = np.random.uniform(low=-64, high=64, size=[2, 5]).astype("int8")
+    data_fp = data.astype("float32")
+    # Test scalar qnn params.
+    scale_np = np.asarray([0.5]).astype("float32")
+    zp_np = np.asarray([0]).astype("int32")
+    dtype_np = np.int32(SQNN_DTYPE_TO_CODE["int8"])
+    quant_args = {"in_zero_point": zp_np[0], "in_scale": scale_np[0]}
+    dq_out = dequantize_test_driver(
+        in_dtype="int8",
+        quant_args=quant_args,
+        axis=0,
+        in_data=data,
+    )
+    # Create variables with undefined shape and run with scalar inputs.
+    input_data = relay.var("input_data", shape=data.shape, dtype="float32")
+    scale = relay.var("scale", shape=[relay.Any()], dtype="float32")
+    zp = relay.var("zp", shape=[relay.Any()], dtype="int32")
+    dtype = relay.var("dtype", shape=[])
+    vm = build_simulated_dequantize(input_data, scale, zp, dtype, axis=0)
+    sim_dq_out = vm.invoke("main", input_data=data_fp, scale=scale_np, zp=zp_np, dtype=dtype_np)
+    np.testing.assert_equal(sim_dq_out.asnumpy(), dq_out)
+
+    # Now get the perchannel quantize output and compare without recompiling.
+    scale_np = np.array([0.5, 0.25]).astype("float32")
+    zp_np = np.array([127, 123]).astype("int32")
+
+    # Get the reference quantize output.
+    quant_args = {"in_zero_point": zp_np, "in_scale": scale_np}
+    dq_out = dequantize_test_driver(
+        in_dtype="int8",
+        quant_args=quant_args,
+        axis=0,
+        in_data=data,
+    )
+    # Run the simulated quantize without recompiling and confirm results match.
+    sim_dq_out = vm.invoke("main", input_data=data_fp, scale=scale_np, zp=zp_np, dtype=dtype_np)
+    np.testing.assert_equal(sim_dq_out.asnumpy(), dq_out)
+
+
+def test_dynamic_dtype():
+    # Compile simulated quantize once but support any type of quantization.
+    data = np.random.uniform(low=0, high=255, size=[2, 5]).astype("uint8")
+    data_fp = data.astype("float32")
+    # Test scalar uint8 to fp32.
+    scale_np = np.asarray([0.5]).astype("float32")
+    zp_np = np.asarray([127]).astype("int32")
+    dtype_np = np.int32(SQNN_DTYPE_TO_CODE["uint8"])
+    quant_args = {"in_zero_point": zp_np[0], "in_scale": scale_np[0]}
+    dq_out = dequantize_test_driver(
+        in_dtype="uint8",
+        quant_args=quant_args,
+        axis=-1,
+        in_data=data,
+    )
+    # Create variables with undefined shape and run with scalar inputs.
+    input_data = relay.var("input_data", shape=data.shape, dtype="float32")
+    scale = relay.var("scale", shape=[relay.Any()], dtype="float32")
+    zp = relay.var("zp", shape=[relay.Any()], dtype="int32")
+    dtype = relay.var("dtype", shape=[])
+    vm = build_simulated_dequantize(input_data, scale, zp, dtype)
+    sim_dq_out = vm.invoke("main", input_data=data_fp, scale=scale_np, zp=zp_np, dtype=dtype_np)
+    np.testing.assert_equal(sim_dq_out.asnumpy(), dq_out)
+
+    # Now test int8 to float32 compilation.
+    data = np.random.uniform(low=0, high=255, size=[2, 5]).astype("int8")
+    data_fp = data.astype("float32")
+    # Get the reference quantize output.
+    dq_out = dequantize_test_driver(
+        in_dtype="int8",
+        quant_args=quant_args,
+        axis=-1,
+        in_data=data,
+    )
+    # Run the simulated quantize without recompiling and confirm results match.
+    dtype_np = np.int32(SQNN_DTYPE_TO_CODE["int8"])
+    sim_dq_out = vm.invoke("main", input_data=data_fp, scale=scale_np, zp=zp_np, dtype=dtype_np)
+    np.testing.assert_equal(sim_dq_out.asnumpy(), dq_out)
+
+
+if __name__ == "__main__":
+    test_simulated_dequantize()
+    test_dynamic_channels()
+    test_dynamic_dtype()
diff --git a/tests/python/relay/test_op_qnn_simulated_quantize.py b/tests/python/relay/test_op_qnn_simulated_quantize.py
new file mode 100644
index 000000000000..ee4ba209dcb8
--- /dev/null
+++ b/tests/python/relay/test_op_qnn_simulated_quantize.py
@@ -0,0 +1,178 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import tvm
+from tvm import te
+import numpy as np
+from tvm import relay
+from tvm.contrib import graph_runtime
+from tvm.runtime.vm import VirtualMachine
+from tvm.topi.nn.qnn import SQNN_DTYPE_TO_CODE
+
+
+def quantize_test_driver(in_dtype, quant_args, axis, out_dtype, in_data):
+    shape = in_data.shape
+    input_data = relay.var("input_data", shape=shape, dtype=in_dtype)
+    output_zero_point = relay.const(quant_args["out_zero_point"])
+    output_scale = relay.const(quant_args["out_scale"])
+    quantized_output = relay.qnn.op.quantize(
+        input_data,
+        output_scale=output_scale,
+        output_zero_point=output_zero_point,
+        axis=axis,
+        out_dtype=out_dtype,
+    )
+    mod = relay.Function(relay.analysis.free_vars(quantized_output), quantized_output)
+    mod = tvm.IRModule.from_expr(mod)
+    with tvm.transform.PassContext(opt_level=3):
+        graph, lib, params = relay.build(mod, "llvm", params=None)
+    rt_mod = graph_runtime.create(graph, lib, ctx=tvm.cpu(0))
+    rt_mod.set_input(input_data=in_data)
+    rt_mod.set_input(**params)
+    rt_mod.run()
+    res = rt_mod.get_output(0).asnumpy()
+    return res
+
+
+def build_simulated_quantize(input_data, scale, zp, dtype, axis=-1):
+    sim_q = relay.qnn.op.simulated_quantize(
+        input_data,
+        scale,
+        zp,
+        axis=axis,
+        out_dtype=dtype,
+    )
+    mod = tvm.IRModule.from_expr(sim_q)
+    with tvm.transform.PassContext(opt_level=3):
+        vm_exec = relay.vm.compile(mod, "llvm", params=None)
+    vm = VirtualMachine(vm_exec, tvm.cpu(0))
+    return vm
+
+
+def verify_simulated_quantize_simple(dtype):
+    data = np.random.uniform(low=-128, high=127, size=[2, 5]).astype("float32")
+    scale_np = np.float32(0.5)
+    zp_np = np.int32(127)
+    dtype_np = np.int32(SQNN_DTYPE_TO_CODE[dtype])
+    quant_args = {"out_zero_point": zp_np, "out_scale": scale_np}
+    q_out = quantize_test_driver(
+        in_dtype="float32",
+        quant_args=quant_args,
+        axis=-1,
+        out_dtype=dtype,
+        in_data=data,
+    )
+    input_data = relay.var("input_data", shape=data.shape, dtype="float32")
+    scale = relay.var("scale", shape=[])
+    zp = relay.var("zp", shape=[])
+    dtype = relay.var("dtype", shape=[])
+    vm = build_simulated_quantize(input_data, scale, zp, dtype)
+    sim_q_out = vm.invoke("main", input_data=data, scale=scale_np, zp=zp_np, dtype=dtype_np)
+    np.testing.assert_equal(sim_q_out.asnumpy(), q_out)
+
+
+def test_simulated_quantize():
+    verify_simulated_quantize_simple("uint8")
+    verify_simulated_quantize_simple("int8")
+    verify_simulated_quantize_simple("int32")
+
+
+def test_dynamic_channels():
+    # Compile simulated quantize once but support either per-channel or scalar params.
+    data = np.random.uniform(low=-64, high=64, size=[2, 5]).astype("float32")
+    # Test scalar qnn params.
+    scale_np = np.asarray([0.5]).astype("float32")
+    zp_np = np.asarray([127]).astype("int32")
+    dtype_np = np.int32(SQNN_DTYPE_TO_CODE["uint8"])
+    quant_args = {"out_zero_point": zp_np[0], "out_scale": scale_np[0]}
+    q_out = quantize_test_driver(
+        in_dtype="float32",
+        quant_args=quant_args,
+        axis=0,
+        out_dtype="uint8",
+        in_data=data,
+    )
+    # Create variables with undefined shape and run with scalar inputs.
+    input_data = relay.var("input_data", shape=data.shape, dtype="float32")
+    scale = relay.var("scale", shape=[relay.Any()], dtype="float32")
+    zp = relay.var("zp", shape=[relay.Any()], dtype="int32")
+    dtype = relay.var("dtype", shape=[])
+    vm = build_simulated_quantize(input_data, scale, zp, dtype, axis=0)
+    sim_q_out = vm.invoke("main", input_data=data, scale=scale_np, zp=zp_np, dtype=dtype_np)
+    np.testing.assert_equal(sim_q_out.asnumpy(), q_out)
+
+    # Now get the perchannel quantize output and compare without recompiling.
+    scale_np = np.array([0.5, 0.25]).astype("float32")
+    zp_np = np.array([127, 123]).astype("int32")
+
+    # Get the reference quantize output.
+    quant_args = {"out_zero_point": zp_np, "out_scale": scale_np}
+    q_out = quantize_test_driver(
+        in_dtype="float32",
+        quant_args=quant_args,
+        axis=0,
+        out_dtype="uint8",
+        in_data=data,
+    )
+    # Run the simulated quantize without recompiling and confirm results match.
+    sim_q_out = vm.invoke("main", input_data=data, scale=scale_np, zp=zp_np, dtype=dtype_np)
+    np.testing.assert_equal(sim_q_out.asnumpy(), q_out)
+
+
+def test_dynamic_dtype():
+    # Compile simulated quantize once but support any type of quantization.
+    data = np.random.uniform(low=-64, high=64, size=[2, 5]).astype("float32")
+    # Test scalar float32 to uint8.
+    scale_np = np.asarray([0.5]).astype("float32")
+    zp_np = np.asarray([127]).astype("int32")
+    dtype_np = np.int32(SQNN_DTYPE_TO_CODE["uint8"])
+    quant_args = {"out_zero_point": zp_np[0], "out_scale": scale_np[0]}
+    q_out = quantize_test_driver(
+        in_dtype="float32",
+        quant_args=quant_args,
+        axis=-1,
+        out_dtype="uint8",
+        in_data=data,
+    )
+    # Create variables with undefined shape and run with scalar inputs.
+    input_data = relay.var("input_data", shape=data.shape, dtype="float32")
+    scale = relay.var("scale", shape=[relay.Any()], dtype="float32")
+    zp = relay.var("zp", shape=[relay.Any()], dtype="int32")
+    dtype = relay.var("dtype", shape=[])
+    vm = build_simulated_quantize(input_data, scale, zp, dtype)
+    sim_q_out = vm.invoke("main", input_data=data, scale=scale_np, zp=zp_np, dtype=dtype_np)
+    np.testing.assert_equal(sim_q_out.asnumpy(), q_out)
+
+    # Now test float32 to int32 compilation.
+    # Get the reference quantize output.
+    q_out = quantize_test_driver(
+        in_dtype="float32",
+        quant_args=quant_args,
+        axis=-1,
+        out_dtype="int32",
+        in_data=data,
+    )
+    # Run the simulated quantize without recompiling and confirm results match.
+    dtype_np = np.int32(SQNN_DTYPE_TO_CODE["int32"])
+    sim_q_out = vm.invoke("main", input_data=data, scale=scale_np, zp=zp_np, dtype=dtype_np)
+    np.testing.assert_equal(sim_q_out.asnumpy(), q_out)
+
+
+if __name__ == "__main__":
+    test_simulated_quantize()
+    test_dynamic_channels()
+    test_dynamic_dtype()
diff --git a/tests/python/topi/python/test_topi_qnn.py b/tests/python/topi/python/test_topi_qnn.py
new file mode 100644
index 000000000000..a63f34fe08d0
--- /dev/null
+++ b/tests/python/topi/python/test_topi_qnn.py
@@ -0,0 +1,157 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Test code for QNN operators."""
+import numpy as np
+import tvm
+from tvm import topi, relay, te
+from tvm.contrib import graph_runtime
+import tvm.topi.testing
+
+
+def verify_simulated_quantize(data_shape, out_dtype, channels, axis):
+    # Create placeholder variables for all qnn inputs.
+    A = te.placeholder(data_shape, name="value", dtype="float32")
+    D = te.placeholder([], name="dtype", dtype="int32")
+    S = te.placeholder([te.size_var("scale_dim")], name="scale", dtype="float32")
+    Z = te.placeholder([te.size_var("zp_dim")], name="zp", dtype="int32")
+    SIM_Q = topi.nn.simulated_quantize(A, D, output_scale=S, output_zero_point=Z, axis=axis)
+
+    # Create random numpy values to assign to inputs.
+    a_np = np.random.uniform(size=data_shape).astype("float32")
+    d_np = np.int32(topi.nn.SQNN_DTYPE_TO_CODE[out_dtype])
+    s_np = np.random.uniform(low=1e-4, high=0.1, size=channels).astype("float32")
+    z_np = np.random.uniform(low=-10, high=10, size=channels).astype("int32")
+    q_np = np.zeros(shape=data_shape, dtype="float32")
+
+    def check_device(device, ctx):
+        # Wrap the numpy arrays in nd arrays.
+        a = tvm.nd.array(a_np, ctx)
+        d = tvm.nd.array(d_np, ctx)
+        s = tvm.nd.array(s_np, ctx)
+        z = tvm.nd.array(z_np, ctx)
+        q = tvm.nd.array(q_np, ctx)
+
+        # Construct equivalent relay graph.
+        per_channel = channels[0] != 1
+        a_var = relay.var("a", shape=data_shape, dtype="float32")
+        if per_channel:
+            s_var = relay.const(s_np)
+            z_var = relay.const(z_np)
+        else:
+            s_var = relay.const(s_np[0])
+            z_var = relay.const(z_np[0])
+        real_q_op = relay.qnn.op.quantize(a_var, s_var, z_var, axis=axis, out_dtype=out_dtype)
+        with tvm.transform.PassContext(opt_level=3):
+            lib = relay.build(tvm.IRModule.from_expr(real_q_op), target=device)
+
+        # Get real qnn quantize output.
+        m = graph_runtime.GraphModule(lib["default"](ctx))
+        m.set_input("a", a_np)
+
+        m.run()
+        real_q_out = m.get_output(0)
+
+        # Compile the simulated quantize function.
+        with tvm.target.Target(device):
+            sched = tvm.topi.testing.get_injective_schedule(device)(SIM_Q)
+        func = tvm.build(sched, [A, D, S, Z, SIM_Q], device, name="sim_quantize")
+        func(a, d, s, z, q)
+
+        # Check correctness against the true qnn output.
+        tvm.testing.assert_allclose(q.asnumpy(), real_q_out.asnumpy().astype("float32"))
+
+    for target, ctx in tvm.testing.enabled_targets():
+        check_device(target, ctx)
+
+
+def test_simulated_quantize():
+    verify_simulated_quantize([1], "int8", [1], -1)
+    verify_simulated_quantize([2, 5], "int8", [5], 1)
+    verify_simulated_quantize([1, 32, 32, 32], "int8", [32], -1)
+    verify_simulated_quantize([1, 32, 32, 32], "uint8", [32], -2)
+    verify_simulated_quantize([2, 5], "int32", [5], 1)
+
+
+def verify_simulated_dequantize(data_shape, in_dtype, channels, axis):
+    # Create placeholder variables for all qnn inputs.
+    A = te.placeholder(data_shape, name="value", dtype="float32")
+    D = te.placeholder([], name="dtype", dtype="int32")
+    S = te.placeholder([te.size_var("scale_dim")], name="scale", dtype="float32")
+    Z = te.placeholder([te.size_var("zp_dim")], name="zp", dtype="int32")
+    SIM_DQ = topi.nn.simulated_dequantize(A, D, input_scale=S, input_zero_point=Z, axis=axis)
+
+    # Create random numpy values to assign to inputs.
+    a_np = np.random.uniform(low=-128, high=127, size=data_shape).astype(in_dtype)
+    a_np_f = a_np.astype("float32")
+    d_np = np.int32(topi.nn.SQNN_DTYPE_TO_CODE[in_dtype])
+    s_np = np.random.uniform(low=1e-4, high=0.1, size=channels).astype("float32")
+    z_np = np.random.uniform(low=-10, high=10, size=channels).astype("int32")
+    dq_np = np.zeros(shape=data_shape, dtype="float32")
+
+    def check_device(device, ctx):
+        # Wrap the numpy arrays in nd arrays.
+        a = tvm.nd.array(a_np_f, ctx)
+        d = tvm.nd.array(d_np, ctx)
+        s = tvm.nd.array(s_np, ctx)
+        z = tvm.nd.array(z_np, ctx)
+        dq = tvm.nd.array(dq_np, ctx)
+
+        # Construct equivalent relay graph.
+        per_channel = channels[0] != 1
+        a_var = relay.var("a", shape=data_shape, dtype=in_dtype)
+        if per_channel:
+            s_var = relay.const(s_np)
+            z_var = relay.const(z_np)
+        else:
+            s_var = relay.const(s_np[0])
+            z_var = relay.const(z_np[0])
+        real_dq_op = relay.qnn.op.dequantize(a_var, s_var, z_var, axis=axis)
+        with tvm.transform.PassContext(opt_level=3):
+            lib = relay.build(tvm.IRModule.from_expr(real_dq_op), target=device)
+
+        # Get real qnn quantize output.
+        m = graph_runtime.GraphModule(lib["default"](ctx))
+        m.set_input("a", a_np)
+
+        m.run()
+        real_dq_out = m.get_output(0)
+
+        # Compile the simulated quantize function.
+        with tvm.target.Target(device):
+            sched = tvm.topi.testing.get_injective_schedule(device)(SIM_DQ)
+        func = tvm.build(sched, [A, D, S, Z, SIM_DQ], device, name="sim_quantize")
+        func(a, d, s, z, dq)
+
+        # Check correctness against the true qnn output.
+        tvm.testing.assert_allclose(dq.asnumpy(), real_dq_out.asnumpy().astype("float32"))
+
+    for target, ctx in tvm.testing.enabled_targets():
+        check_device(target, ctx)
+
+
+def test_simulated_dequantize():
+    verify_simulated_dequantize([1], "int8", [1], -1)
+    verify_simulated_dequantize([2, 5], "int8", [5], 1)
+    verify_simulated_dequantize([2, 5], "int8", [2], 0)
+    verify_simulated_dequantize([1, 32, 32, 32], "int8", [32], -1)
+    verify_simulated_dequantize([1, 32, 32, 32], "uint8", [32], -2)
+    verify_simulated_dequantize([2, 5], "int32", [5], 1)
+
+
+if __name__ == "__main__":
+    test_simulated_quantize()
+    test_simulated_dequantize()

From 68b81ade90d8011d4f2cdbdaf0230f4f9d49196f Mon Sep 17 00:00:00 2001
From: apeskov <peskovnn@gmail.com>
Date: Fri, 12 Mar 2021 04:54:35 +0300
Subject: [PATCH 327/357] Introduce Apple BNNS backend (#7299)

* Introduce Apple BNNS backend

This is simple JSON based runtime which offload execution of
some operation into Accelerate frameworks via BNNS api.

Works only for:
 * macOS 11.0 and later
 * iOS 14.0 and later

Supported primitives:
 * conv2d and fusing with bias and relu
 * dense and fusing with bias and relu/gelu
 * batch_matmul

Signed-off-by: Alexander Peskov <peskovnn@gmail.com>

* [BNNS] Add conv2d DW test

Also fix some pylint issues

Signed-off-by: Alexander Peskov <peskovnn@gmail.com>

* [BNNS] Fix clang-format issues

Signed-off-by: Alexander Peskov <peskovnn@gmail.com>

* [BNNS] Refactoring. Add TView abstraction

Signed-off-by: Alexander Peskov <peskovnn@gmail.com>

* [BNNS] Add several more onnx topologies into tests

Signed-off-by: Alexander Peskov <peskovnn@gmail.com>

* [BNNS] Avoid redundant tensor allocation

Signed-off-by: Alexander Peskov <peskovnn@gmail.com>

* [BNNS] Fix conv_splitter issue

Signed-off-by: Alexander Peskov <peskovnn@gmail.com>

* [BNNS] Fix isse with bias {1,1,1,1}

Signed-off-by: Alexander Peskov <peskovnn@gmail.com>

* [BNNS] Min. Rename file

Signed-off-by: Alexander Peskov <peskovnn@gmail.com>

* Fix review comments. Initial

Signed-off-by: Alexander Peskov <peskovnn@gmail.com>

* [BNNS] test refactoring

Signed-off-by: Alexander Peskov <peskovnn@gmail.com>

* [BNNS] Fix cpplint issues

Signed-off-by: Alexander Peskov <peskovnn@gmail.com>

* [BNNS] Fix clang-format issues

Signed-off-by: Alexander Peskov <peskovnn@gmail.com>

* Fix python format

Signed-off-by: Alexander Peskov <peskovnn@gmail.com>

* Fix pylint issues

Signed-off-by: Alexander Peskov <peskovnn@gmail.com>

* [BNNS] Fix pylint. Second attempt

Signed-off-by: Alexander Peskov <peskovnn@gmail.com>

* [BNNS] Add integration documentation

* Check onnx import before use

Signed-off-by: Alexander Peskov <peskovnn@gmail.com>

* [BNNS] Add instance normalization operator

* Add fusing sigmoid activation after conv2d

* min changes

Signed-off-by: Alexander Peskov <peskovnn@gmail.com>

* Add pooling operations to BNNS runtime

Supports `nn.max_pool2d`, `nn.avg_pool2d`, `nn.global_max_pool2d` and
`nn.global_avg_pool2d` operations

* Fix lint

* Fix lint

* Apply comments

* Fix documentation

* Fix comment to refer to BNNS

Co-authored-by: dlexplorer <elvin.nnov@gmail.com>
Co-authored-by: Egor Churaev <egor.churaev@gmail.com>
---
 CMakeLists.txt                                |   2 +
 cmake/config.cmake                            |   3 +
 cmake/modules/contrib/BNNS.cmake              |  30 +
 docs/deploy/bnns.rst                          | 183 ++++++
 docs/deploy/index.rst                         |   1 +
 python/tvm/driver/tvmc/composite_target.py    |   5 +
 python/tvm/relay/op/contrib/__init__.py       |   1 +
 python/tvm/relay/op/contrib/bnns.py           | 327 ++++++++++
 src/relay/backend/contrib/bnns/codegen.cc     | 215 +++++++
 src/runtime/contrib/bnns/bnns_json_runtime.cc | 573 ++++++++++++++++++
 src/runtime/contrib/bnns/bnns_wrp.h           | 495 +++++++++++++++
 tests/cpp/contrib/bnns.cc                     | 307 ++++++++++
 tests/python/contrib/test_bnns/__init__.py    |  17 +
 .../contrib/test_bnns/infrastructure.py       | 330 ++++++++++
 tests/python/contrib/test_bnns/test_conv2d.py | 177 ++++++
 .../contrib/test_bnns/test_conv2d_patterns.py | 107 ++++
 tests/python/contrib/test_bnns/test_dense.py  | 190 ++++++
 tests/python/contrib/test_bnns/test_matmul.py | 113 ++++
 .../contrib/test_bnns/test_normalization.py   | 201 ++++++
 .../contrib/test_bnns/test_onnx_topologies.py | 140 +++++
 .../python/contrib/test_bnns/test_pooling.py  | 289 +++++++++
 21 files changed, 3706 insertions(+)
 create mode 100644 cmake/modules/contrib/BNNS.cmake
 create mode 100644 docs/deploy/bnns.rst
 create mode 100644 python/tvm/relay/op/contrib/bnns.py
 create mode 100644 src/relay/backend/contrib/bnns/codegen.cc
 create mode 100644 src/runtime/contrib/bnns/bnns_json_runtime.cc
 create mode 100644 src/runtime/contrib/bnns/bnns_wrp.h
 create mode 100644 tests/cpp/contrib/bnns.cc
 create mode 100644 tests/python/contrib/test_bnns/__init__.py
 create mode 100644 tests/python/contrib/test_bnns/infrastructure.py
 create mode 100644 tests/python/contrib/test_bnns/test_conv2d.py
 create mode 100644 tests/python/contrib/test_bnns/test_conv2d_patterns.py
 create mode 100644 tests/python/contrib/test_bnns/test_dense.py
 create mode 100644 tests/python/contrib/test_bnns/test_matmul.py
 create mode 100644 tests/python/contrib/test_bnns/test_normalization.py
 create mode 100644 tests/python/contrib/test_bnns/test_onnx_topologies.py
 create mode 100644 tests/python/contrib/test_bnns/test_pooling.py

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 56170c693e3c..1d2e2bcb68c4 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -74,6 +74,7 @@ tvm_option(USE_CPP_RPC "Build CPP RPC" OFF)
 tvm_option(USE_TFLITE "Build with tflite support" OFF)
 tvm_option(USE_TENSORFLOW_PATH "TensorFlow root path when use TFLite" none)
 tvm_option(USE_COREML "Build with coreml support" OFF)
+tvm_option(USE_BNNS "Build with BNNS support" OFF)
 tvm_option(USE_TARGET_ONNX "Build with ONNX Codegen support" OFF)
 tvm_option(USE_ARM_COMPUTE_LIB "Build with Arm Compute Library" OFF)
 tvm_option(USE_ARM_COMPUTE_LIB_GRAPH_RUNTIME "Build with Arm Compute Library graph runtime" OFF)
@@ -354,6 +355,7 @@ include(cmake/modules/contrib/HybridDump.cmake)
 include(cmake/modules/contrib/TFLite.cmake)
 include(cmake/modules/contrib/TF_TVMDSOOP.cmake)
 include(cmake/modules/contrib/CoreML.cmake)
+include(cmake/modules/contrib/BNNS.cmake)
 include(cmake/modules/contrib/ONNX.cmake)
 include(cmake/modules/contrib/ArmComputeLib.cmake)
 include(cmake/modules/contrib/TensorRT.cmake)
diff --git a/cmake/config.cmake b/cmake/config.cmake
index 30c21f707c08..67370c635209 100644
--- a/cmake/config.cmake
+++ b/cmake/config.cmake
@@ -272,3 +272,6 @@ set(USE_HEXAGON_SDK /path/to/sdk)
 
 # Whether to use ONNX codegen
 set(USE_TARGET_ONNX OFF)
+
+# Whether enable BNNS runtime
+set(USE_BNNS OFF)
diff --git a/cmake/modules/contrib/BNNS.cmake b/cmake/modules/contrib/BNNS.cmake
new file mode 100644
index 000000000000..e14aa2857ebc
--- /dev/null
+++ b/cmake/modules/contrib/BNNS.cmake
@@ -0,0 +1,30 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+if(USE_BNNS STREQUAL "ON")
+  add_definitions(-DUSE_JSON_RUNTIME=1)
+  file(GLOB BNNS_RELAY_CONTRIB_SRC src/relay/backend/contrib/bnns/*.cc)
+  list(APPEND COMPILER_SRCS ${BNNS_RELAY_CONTRIB_SRC})
+  list(APPEND COMPILER_SRCS ${JSON_RELAY_CONTRIB_SRC})
+
+  list(APPEND TVM_RUNTIME_LINKER_LIBS "-framework Accelerate")
+
+  file(GLOB BNNS_CONTRIB_SRC src/runtime/contrib/bnns/*.cc)
+  list(APPEND RUNTIME_SRCS ${BNNS_CONTRIB_SRC})
+  message(STATUS "Build with BNNS JSON runtime: " ${EXTERN_LIBRARY_BNNS})
+endif()
+
diff --git a/docs/deploy/bnns.rst b/docs/deploy/bnns.rst
new file mode 100644
index 000000000000..cb15a4f3bd54
--- /dev/null
+++ b/docs/deploy/bnns.rst
@@ -0,0 +1,183 @@
+..  Licensed to the Apache Software Foundation (ASF) under one
+    or more contributor license agreements.  See the NOTICE file
+    distributed with this work for additional information
+    regarding copyright ownership.  The ASF licenses this file
+    to you under the Apache License, Version 2.0 (the
+    "License"); you may not use this file except in compliance
+    with the License.  You may obtain a copy of the License at
+
+..    http://www.apache.org/licenses/LICENSE-2.0
+
+..  Unless required by applicable law or agreed to in writing,
+    software distributed under the License is distributed on an
+    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+    KIND, either express or implied.  See the License for the
+    specific language governing permissions and limitations
+    under the License.
+
+Relay BNNS Integration
+======================
+**Author**: `Egor Churaev <https://github.com/echuraev>`_
+
+Introduction
+------------
+
+Apple BNNS library is a collection of functions that can be used to construct neural networks
+for inference (and train). It’s supported in macOS, iOS, tvOS, and watchOS. BNNS provides
+primitives executed on all CPU supported on those platforms and optimized for high performance
+and low-energy consumption. This integration will offload as many operators as possible from Relay to BNNS.
+
+BNNS runtime is a part of platform API and available on all modern Apple operating systems.
+Application using BNNS will not depends on any additional external dependencies.
+
+BNNS functions uses Apple private hardware capabilities which are not exposed yet by Apple. Example
+of such capabilities can be AMX Apple cpu extension.
+
+This guide will demonstrate how to build TVM with BNNS codegen and runtime enabled. It will also provide example
+code to compile and run models using BNNS runtime. Finally, we document the supported operators.
+
+Building TVM with BNNS support
+------------------------------
+
+To turn on TVM BNNS codegen and TVM BNNS runtime you need to turn on the only USE_BNNS flag
+
+* USE_BNNS=ON/OFF - This flag will enable compiling a network with offloading subgraphs to BNNS primitives
+  and will link tvm library to the BNNS runtime module.
+
+Enabling of this flag will cause to search the default Accelerate Frameworks on current target SDK.
+The minimal versions of required SDK is macOS 11.0, iOS 14.0, tvOS 14.0 and watchOS 7.0.
+
+Example setting in config.cmake file:
+
+.. code:: cmake
+
+    set(USE_BNNS ON)
+
+BNNS partitioning of Relay graph
+--------------------------------
+
+Operations to be offloaded on BNNS execution must be annotated before passing of module for compilation.
+All ops annotated by `partition_for_bnns` will be offloaded for BNNS execution. The rest of the ops
+will go through the LLVM compilation and code generation.
+
+Important note: BNNS support primitives only with constant weights. To satisfy this requirements we have
+to map constants to related tensor abstraction in relay representation. To freeze tensors and operate
+with them as constants you may need to call ONNX importer with special flag "freeze_params=True"
+or performer binding manually. In general cases all relay importers don't do that by default.
+For your convenience "partition_for_bnns" can do this for you if params dictionary is passed as the argument.
+
+.. code:: python
+
+    from tvm.relay.op.contrib.bnns import partition_for_bnns
+    model = partition_for_bnns(model, params=params)
+
+
+Input data layout for operations to be offloaded to BNNS execution
+------------------------------------------------------------------
+
+BNNS kernels support only planar format of input data. The partitioner will require to have NCHW input
+layout for conv2d input.
+
+To use BNNS integration for models with interleave input layout, they should be converted before
+passing of module to `partition_for_bnns`. The layout conversion will happen only for explicitly
+enumerated types of ops. It might happen that depending on topology there might be regular data reorder
+around conv2d to interleave and planar layout. This will be reflected in performance penalties and affect
+execution time. It is recommended to analyze the whole topology and extend below list to convert all
+intermediate tensors to NCHW data layout.
+
+Example of input layouts change:
+
+.. code:: python
+
+    # For models with NHWC input layout
+    with tvm.transform.PassContext(opt_level=3):
+        mod = relay.transform.InferType()(mod)
+        mod = relay.transform.ConvertLayout({"nn.conv2d": ["NCHW", "default"],
+                                            "nn.bias_add": ["NCHW", "default"],
+                                            "nn.relu": ["NCHW"]})(mod)
+
+
+Example: Build and Deploy Mobilenet v2 1.0 with BNNS
+----------------------------------------------------
+
+Create a Relay graph from a MXNet Mobilenet v2 1.0 model.
+
+.. code:: python
+
+    import tvm
+    from tvm import relay
+    import mxnet
+    from mxnet.gluon.model_zoo.vision import get_model
+
+    dtype = "float32"
+    input_shape = (1, 3, 224, 224)
+    block = get_model('mobilenetv2_1.0', pretrained=True)
+    module, params = relay.frontend.from_mxnet(block, shape={'data': input_shape}, dtype=dtype)
+
+
+Markup the parts of graphs to be offloaded to BNNS primitives. All ops which are supported by the BNNS
+integration will be handled by BNNS invocations, the rest of the ops will go through the
+regular TVM llvm compilation and code generation.
+
+After that you need to compile new module with target corresponding to required Apple platform
+
+.. code:: python
+
+    from tvm.relay.op.contrib.bnns import partition_for_bnns
+
+    # target for macOS Big Sur 11.1:
+    target = "llvm -mtriple=x86_64-apple-darwin20.2.0"
+
+    model = partition_for_bnns(model, params=params)  # to markup operations to be offloaded to BNNS
+    with tvm.transform.PassContext(opt_level=3):
+        lib = relay.build(model, target=target, target_host=target, params=params)
+
+Export the module.
+
+.. code:: python
+
+    lib.export_library('compiled.dylib')
+
+
+Load module and run inference on the target machine with TVM  built with ``USE_BNNS`` enabled
+
+.. code:: python
+
+    import tvm
+    import numpy as np
+    from tvm.contrib import graph_runtime
+
+    ctx = tvm.cpu(0)
+    loaded_lib = tvm.runtime.load_module('compiled.dylib')
+    gen_module = tvm.contrib.graph_runtime.GraphModule(loaded_lib['default'](ctx))
+
+    dtype = "float32"
+    input_shape = (1, 3, 224, 224)
+    input_data = np.random.uniform(0, 1, input_shape).astype(dtype)
+    gen_module.run(data=input_data)
+
+
+
+Operator support
+----------------
+
++------------------------+------------------------------------------------------------------------------+
+|       Relay Node       |              Remarks                                                         |
++========================+==============================================================================+
+| nn.conv2d              |                                                                              |
++------------------------+------------------------------------------------------------------------------+
+| nn.batch_norm          | Supported by BNNS integration only in nn.conv2d-batch_norm pattern           |
++------------------------+------------------------------------------------------------------------------+
+| nn.dense               |                                                                              |
++------------------------+------------------------------------------------------------------------------+
+| nn.batch_matmul        |                                                                              |
++------------------------+------------------------------------------------------------------------------+
+| nn.bias_add            | Supported by BNNS integration only as a bias part of nn.conv2d or nn.dense   |
+|                        | fusion                                                                       |
++------------------------+------------------------------------------------------------------------------+
+| add                    | Supported by BNNS integration only as a bias part of nn.conv2d or nn.dense fusion |
++------------------------+------------------------------------------------------------------------------+
+| nn.relu                | Supported by BNNS integration only as a part of nn.conv2d or nn.dense fusion |
++------------------------+------------------------------------------------------------------------------+
+| nn.gelu                | Supported by BNNS integration only as a part of nn.conv2d or nn.dense fusion |
++------------------------+------------------------------------------------------------------------------+
diff --git a/docs/deploy/index.rst b/docs/deploy/index.rst
index 2b37f734c3c3..3cbbb10bd74b 100644
--- a/docs/deploy/index.rst
+++ b/docs/deploy/index.rst
@@ -71,3 +71,4 @@ target device without relying on RPC. see the following resources on how to do s
    arm_compute_lib
    tensorrt
    vitis_ai
+   bnns
diff --git a/python/tvm/driver/tvmc/composite_target.py b/python/tvm/driver/tvmc/composite_target.py
index 0a2592685646..886160ad000c 100644
--- a/python/tvm/driver/tvmc/composite_target.py
+++ b/python/tvm/driver/tvmc/composite_target.py
@@ -21,6 +21,7 @@
 
 from tvm.relay.op.contrib.arm_compute_lib import partition_for_arm_compute_lib
 from tvm.relay.op.contrib.ethosn import partition_for_ethosn
+from tvm.relay.op.contrib.bnns import partition_for_bnns
 
 from .common import TVMCException
 
@@ -40,6 +41,10 @@
         "config_key": "relay.ext.ethos-n.options",
         "pass_pipeline": partition_for_ethosn,
     },
+    "bnns": {
+        "config_key": None,
+        "pass_pipeline": partition_for_bnns,
+    },
 }
 
 
diff --git a/python/tvm/relay/op/contrib/__init__.py b/python/tvm/relay/op/contrib/__init__.py
index 49abf36134b4..30c2db0ddf0b 100644
--- a/python/tvm/relay/op/contrib/__init__.py
+++ b/python/tvm/relay/op/contrib/__init__.py
@@ -20,6 +20,7 @@
 
 from .arm_compute_lib import *
 from .dnnl import *
+from .bnns import *
 from .coreml import *
 from .ethosn import *
 from .tensorrt import *
diff --git a/python/tvm/relay/op/contrib/bnns.py b/python/tvm/relay/op/contrib/bnns.py
new file mode 100644
index 000000000000..2ace502e6528
--- /dev/null
+++ b/python/tvm/relay/op/contrib/bnns.py
@@ -0,0 +1,327 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=invalid-name, unused-argument
+"""BNNS library supported operators.
+Is a part of Accelerate framework on macOS/iOS platforms. Apple provide several APIs
+to handle tensor processing. Particularly:
+ * BNNS (basic neural )
+ * vDSP (1D and 2D tensor processing)
+"""
+import math
+import tvm.ir
+
+from tvm.relay import transform
+from tvm.relay.expr import const
+from tvm.relay.build_module import bind_params_by_name
+
+from .register import register_pattern_table, get_pattern_table
+from ...dataflow_pattern import wildcard, is_op, is_expr
+
+
+def partition_for_bnns(mod, params=None):
+    """Partition the graph greedily offloading supported
+    operators to BNNS.
+
+    Parameters
+    ----------
+    mod : Module
+        The module to run passes on.
+    params : Optional[Dict[str, NDArray]]
+        Constant input parameters.
+
+    Returns
+    -------
+    ret : annotated and partitioned module.
+    """
+    if params:
+        mod["main"] = bind_params_by_name(mod["main"], params)
+
+    seq = tvm.transform.Sequential(
+        [
+            transform.InferType(),
+            transform.FoldConstant(),
+            transform.FoldScaleAxis(),
+            transform.DynamicToStatic(),
+            transform.AlterOpLayout(),
+            # TODO(apeskov): WA. AlterOpLayout call lead to constants shape transformation
+            #   Some expand_dims op may appears after constants. It breaks BNNS fusing.
+            #   So we have to call FoldConstant right before bnns composite passes.
+            transform.FoldConstant(),
+            transform.MergeComposite(get_pattern_table("bnns")),
+            transform.AnnotateTarget("bnns"),
+            #   If you no need in per layer performance statistic you can
+            #   uncomment next line
+            # transform.MergeCompilerRegions(),
+            transform.PartitionGraph(),
+        ]
+    )
+
+    return seq(mod)
+
+
+def _register_external_op_helper(op_name, supported=True):
+    """The helper function to indicate that a given operator can be supported
+    by BNNS.
+
+    Parameters
+    ----------
+    op_name : Str
+        The name of supported operator that will be registered.
+
+    Returns
+    -------
+    f : callable
+        A function that returns if the operator is supported by BNNS.
+    """
+
+    @tvm.ir.register_op_attr(op_name, "target.bnns")
+    def _func_wrapper(expr):
+        return supported
+
+    return _func_wrapper
+
+
+_register_external_op_helper("nn.batch_matmul")
+
+
+@tvm.ir.register_op_attr("nn.max_pool2d", "target.bnns")
+def max_pool2d_check(expr):
+    """Check if the nn.max_pool2d can be executed in BNNS"""
+    attrs, args = expr.attrs, expr.args
+    data_typ = args[0].checked_type
+    rank = len(data_typ.shape)
+    if rank < 3 or rank > 4 or data_typ.dtype != "float32":
+        return False
+    if attrs.layout != "NCHW":
+        return False
+    return True
+
+
+@tvm.ir.register_op_attr("nn.avg_pool2d", "target.bnns")
+def avg_pool2d_check(expr):
+    """Check if the nn.avg_pool2d can be executed in BNNS"""
+    attrs, args = expr.attrs, expr.args
+    data_typ = args[0].checked_type
+    rank = len(data_typ.shape)
+    if rank < 3 or rank > 4 or data_typ.dtype != "float32":
+        return False
+    if attrs.layout != "NCHW":
+        return False
+    return True
+
+
+@tvm.ir.register_op_attr("nn.global_max_pool2d", "target.bnns")
+def global_max_pool2d_check(expr):
+    """Check if the nn.global_max_pool2d can be executed in BNNS"""
+    attrs, args = expr.attrs, expr.args
+    data_typ = args[0].checked_type
+    rank = len(data_typ.shape)
+    if rank < 3 or rank > 4 or data_typ.dtype != "float32":
+        return False
+    if attrs.layout != "NCHW":
+        return False
+    return True
+
+
+@tvm.ir.register_op_attr("nn.global_avg_pool2d", "target.bnns")
+def global_avg_pool2d_check(expr):
+    """Check if the nn.global_avg_pool2d can be executed in BNNS"""
+    attrs, args = expr.attrs, expr.args
+    data_typ = args[0].checked_type
+    rank = len(data_typ.shape)
+    if rank < 3 or rank > 4 or data_typ.dtype != "float32":
+        return False
+    if attrs.layout != "NCHW":
+        return False
+    return True
+
+
+def dtype_is_supported(dtype):
+    """Check if data type is supported by BNNS backend"""
+    return dtype in ("", "float32")
+
+
+@tvm.ir.register_op_attr("nn.conv2d", "target.bnns")
+def conv2d_check(expr):
+    """Check if the conv2d can be executed in BNNS"""
+    attrs, args = expr.attrs, expr.args
+    data_typ = args[0].checked_type
+    if len(data_typ.shape) != 4 or data_typ.dtype != "float32":
+        return False
+    if not isinstance(args[1], tvm.relay.expr.Constant):
+        return False
+    kernel_typ = args[1].checked_type
+    if len(kernel_typ.shape) != 4 or kernel_typ.dtype != "float32":
+        return False
+    if attrs.data_layout != "NCHW":
+        return False
+    if not dtype_is_supported(attrs.out_dtype):
+        return False
+    return True
+
+
+def bias_check(expr):
+    """Check is bias added through the correct dimension"""
+    attrs, args = expr.attrs, expr.args
+    if not isinstance(args[1], tvm.relay.expr.Constant):
+        return False
+    if expr.op.name == "nn.bias_add":
+        return attrs.axis == 1
+    if expr.op.name == "add":
+        b_shape = args[1].checked_type.shape
+        if len(b_shape) == 4:
+            return bool(b_shape[0] == 1 and b_shape[2] == 1 and b_shape[3] == 1)
+        if len(b_shape) == 3:
+            return bool(b_shape[1] == 1 and b_shape[2] == 1)
+
+    return False
+
+
+@tvm.ir.register_op_attr("nn.dense", "target.bnns")
+def dense(expr):
+    """Check if the dense can be used in BNNS."""
+    attrs, args = expr.attrs, expr.args
+    data_typ = args[0].checked_type
+    if data_typ.dtype != "float32":
+        return False
+    if not isinstance(args[1], tvm.relay.expr.Constant):
+        return False
+    kernel_typ = args[1].checked_type
+    if len(kernel_typ.shape) != 2 or kernel_typ.dtype != "float32":
+        return False
+    if attrs.out_dtype != "float32" and attrs.out_dtype != "":
+        return False
+    return True
+
+
+def make_conv_pattern(with_bias=True, activation="none"):
+    """Make pattern for bnns.conv2d primitive"""
+    data = wildcard()
+    weight = wildcard()
+    bias = wildcard()
+    pat = is_op("nn.conv2d")(data, weight)
+    if with_bias:
+        pat = is_op("add")(pat, bias) | is_op("nn.bias_add")(pat, bias)
+    if activation == "relu":
+        pat = is_op("nn.relu")(pat)
+    elif activation == "sigmoid":
+        pat = is_op("sigmoid")(pat)
+    return pat
+
+
+def check_conv(extract):
+    """Check conv pattern is supported by BNNS."""
+    bias_is_ok = True
+    call = extract
+    while call.op.name != "nn.conv2d":
+        if call.op.name in ("nn.bias_add", "add"):
+            bias_is_ok &= bias_check(call)
+        call = call.args[0]
+    return conv2d_check(call) and bias_is_ok
+
+
+def make_dense_bias_pattern():
+    """Make pattern for bnns.dense primitive"""
+    data = wildcard()
+    weight = wildcard()
+    bias = wildcard()
+    d = is_op("nn.dense")(data, weight)
+    return is_op("add")(d, bias)
+
+
+def make_dense_bias_gelu_pattern():
+    """Make pattern for bnns.dense primitive with fused bias and gelu activation"""
+    dense_bias = make_dense_bias_pattern()
+    const1 = is_expr(const(0.044715))
+    const2 = is_expr(const(math.sqrt(2 / math.pi)))
+
+    gelu = is_op("power")(dense_bias, is_expr(const(3, dtype="float32")))
+    gelu = is_op("multiply")(gelu, const1)
+    gelu = is_op("add")(gelu, dense_bias)
+    gelu = is_op("multiply")(gelu, const2)
+    gelu = is_op("tanh")(gelu)
+    gelu = is_op("add")(gelu, is_expr(const(1, dtype="float32")))
+    gelu = is_op("multiply")(gelu, is_expr(const(0.5)))
+    gelu = is_op("multiply")(gelu, dense_bias)
+    return gelu
+
+
+def check_dense(extract):
+    """Check dense pattern is supported by BNNS."""
+    call = extract
+    while call.op.name != "nn.dense":
+        call = call.args[0]
+    return dense(call)
+
+
+@tvm.ir.register_op_attr("nn.instance_norm", "target.bnns")
+def instance_norm_check(expr):
+    """Check if the nn.instance_norm can be executed in BNNS"""
+    attrs, args = expr.attrs, expr.args
+    data_typ = args[0].checked_type
+    rank = len(data_typ.shape)
+    if rank < 3 or rank > 4 or data_typ.dtype != "float32":
+        return False
+    if not isinstance(args[1], tvm.relay.expr.Constant) or not isinstance(
+        args[2], tvm.relay.expr.Constant
+    ):
+        return False
+    if attrs.axis == 0 and rank == 3 or attrs.axis == 1 and rank == 4:
+        return True
+    return False
+
+
+@register_pattern_table("bnns")
+def pattern_table():
+    """Get BNNS specific fusing patterns collection"""
+    conv2d_bias_pat = (
+        "bnns.conv2d_bias",
+        make_conv_pattern(with_bias=True),
+        check_conv,
+    )
+    conv2d_bias_relu_pat = (
+        "bnns.conv2d_bias_relu",
+        make_conv_pattern(with_bias=True, activation="relu"),
+        check_conv,
+    )
+    conv2d_relu_pat = (
+        "bnns.conv2d_relu",
+        make_conv_pattern(with_bias=False, activation="relu"),
+        check_conv,
+    )
+    conv2d_bias_sigmoid_pat = (
+        "bnns.conv2d_bias_sigmoid",
+        make_conv_pattern(with_bias=True, activation="sigmoid"),
+        check_conv,
+    )
+    conv2d_sigmoid_pat = (
+        "bnns.conv2d_sigmoid",
+        make_conv_pattern(with_bias=False, activation="sigmoid"),
+        check_conv,
+    )
+    dense_bias_gelu = ("bnns.dense_bias_gelu", make_dense_bias_gelu_pattern(), check_dense)
+    dense_bias = ("bnns.dense_bias", make_dense_bias_pattern(), check_dense)
+    bnns_patterns = [
+        conv2d_bias_relu_pat,
+        conv2d_relu_pat,
+        conv2d_bias_sigmoid_pat,
+        conv2d_sigmoid_pat,
+        conv2d_bias_pat,
+        dense_bias_gelu,
+        dense_bias,
+    ]
+    return bnns_patterns
diff --git a/src/relay/backend/contrib/bnns/codegen.cc b/src/relay/backend/contrib/bnns/codegen.cc
new file mode 100644
index 000000000000..72c32fb5b19e
--- /dev/null
+++ b/src/relay/backend/contrib/bnns/codegen.cc
@@ -0,0 +1,215 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file
+ * \brief Implementation of BNNS codegen APIs.
+ */
+
+#include <tvm/relay/attrs/nn.h>
+#include <tvm/relay/expr_functor.h>
+#include <tvm/runtime/module.h>
+#include <tvm/runtime/registry.h>
+
+#include <numeric>
+#include <sstream>
+
+#include "../../../../runtime/contrib/json/json_node.h"
+#include "../../utils.h"
+#include "../codegen_json/codegen_json.h"
+
+namespace tvm {
+namespace relay {
+namespace contrib {
+
+using namespace backend;
+
+/*!
+ * \brief Retrieve the expected "root" op nested inside a fused call, such as conv2d in
+ *        relu(add(conv2d))
+ * \param call A Relay call node. Typically nn.relu when called the first time.
+ * \param max_depth The maximum number of calls before the root op, counting from current_call.
+ * \param root_name The name of expected "root" op in this fused call.
+ * \return A CallNode corresponding to the root op
+ */
+inline const CallNode* FindCallWithName(const CallNode* current_call, int max_depth,
+                                        const std::string& root_name) {
+  ICHECK(current_call && max_depth >= 0);
+
+  if (max_depth == 0) {
+    ICHECK(current_call && IsOp(current_call, root_name));
+    return current_call;
+  }
+  if (IsOp(current_call, root_name)) {
+    return current_call;
+  }
+
+  ICHECK_GT(current_call->args.size(), 0);
+
+  const auto* next_call = current_call->args[0].as<CallNode>();
+  return FindCallWithName(next_call, max_depth - 1, root_name);
+}
+
+class BNNSJSONSerializer : public backend::contrib::JSONSerializer {
+  using JSONGraphNode = tvm::runtime::json::JSONGraphNode;
+  using JSONGraphNodeEntry = tvm::runtime::json::JSONGraphNodeEntry;
+
+ public:
+  BNNSJSONSerializer(const std::string& symbol, const Expr& expr) : JSONSerializer(symbol, expr) {}
+
+  std::vector<JSONGraphNodeEntry> VisitExpr_(const CallNode* cn) override {
+    Expr expr = GetRef<Expr>(cn);
+    std::string name;
+    const CallNode* call = cn;
+    if (const auto* op_node = cn->op.as<OpNode>()) {
+      name = op_node->name;
+    } else if (const auto* fn = cn->op.as<FunctionNode>()) {
+      auto comp = fn->GetAttr<String>(attr::kComposite);
+      ICHECK(comp.defined()) << "BNNS JSON runtime only supports composite functions.";
+      name = comp.value();
+
+      auto body = fn->body.as<CallNode>();
+      if (name == "bnns.conv2d_bias_relu") {
+        auto add_op_type = IsOp(body->args[0].as<CallNode>(), "add") ? "add" : "nn.bias_add";
+        call = GetRootCall(body, 2, {"nn.conv2d", add_op_type, "nn.relu"});
+      } else if (name == "bnns.conv2d_bias") {
+        auto add_op_type = IsOp(body, "add") ? "add" : "nn.bias_add";
+        call = GetRootCall(body, 1, {"nn.conv2d", add_op_type});
+      } else if (name == "bnns.conv2d_relu") {
+        call = GetRootCall(body, 1, {"nn.conv2d", "nn.relu"});
+        ICHECK(call->op.as<OpNode>()) << "Not op node";
+      } else if (name == "bnns.conv2d_bias_sigmoid") {
+        auto add_op_type = IsOp(body->args[0].as<CallNode>(), "add") ? "add" : "nn.bias_add";
+        call = GetRootCall(body, 2, {"nn.conv2d", add_op_type, "sigmoid"});
+        ICHECK(call->op.as<OpNode>()) << "Not op node";
+      } else if (name == "bnns.conv2d_sigmoid") {
+        call = GetRootCall(body, 1, {"nn.conv2d", "sigmoid"});
+        ICHECK(call->op.as<OpNode>()) << "Not op node";
+      } else if (name == "bnns.dense_bias") {
+        call = GetRootCall(fn->body.as<CallNode>(), 1, {"nn.dense", "add"});
+      } else if (name == "bnns.dense_bias_gelu") {
+        call = FindCallWithName(fn->body.as<CallNode>(), 10, "nn.dense");
+      } else {
+        LOG(FATAL) << "Unrecognized BNNS pattern: " << name;
+      }
+    } else {
+      LOG(FATAL) << "BNNS JSON runtime does not support calls to " << cn->op->GetTypeKey();
+    }
+
+    std::vector<JSONGraphNodeEntry> inputs;
+    for (const auto& arg : cn->args) {
+      auto res = VisitExpr(arg);
+      inputs.insert(inputs.end(), res.begin(), res.end());
+    }
+    auto node = std::make_shared<JSONGraphNode>(name,     /* name_ */
+                                                "kernel", /* op_type_ */
+                                                inputs, 1 /* num_outputs_ */);
+    SetCallNodeAttribute(node, call);
+    return AddNode(node, GetRef<Expr>(cn));
+  }
+};
+
+/*!
+ * \brief The external compiler/codegen tool. It takes a Relay expression/module and
+ * compile it into a runtime module.
+ */
+runtime::Module BNNSCompiler(const ObjectRef& ref) {
+  ICHECK(ref->IsInstance<FunctionNode>());
+  auto func = Downcast<Function>(ref);
+  auto func_name = GetExtSymbol(func);
+  BNNSJSONSerializer serializer(func_name, func);
+  serializer.serialize();
+  std::string graph_json = serializer.GetJSON();
+  auto params = serializer.GetParams();
+
+  const auto* pf = runtime::Registry::Get("runtime.BNNSJSONRuntimeCreate");
+  ICHECK(pf != nullptr) << "Cannot find JSON runtime module to create";
+  auto mod = (*pf)(func_name, graph_json, params);
+  return mod;
+}
+
+TVM_REGISTER_GLOBAL("relay.ext.bnns").set_body_typed(BNNSCompiler);
+
+/**
+ * \brief A helper to expand the params by adding ones which used by BNNS runtime
+ * for a given expression. Same as default ConstantUpdater but skip constant from
+ * essential BNNS composed function ops.
+ */
+struct BNNSConstantUpdater : public ConstantUpdater {
+ public:
+  BNNSConstantUpdater(const std::string& symbol,
+                      std::unordered_map<std::string, runtime::NDArray>* params,
+                      const std::vector<std::string>& skip_mask)
+      : ConstantUpdater(symbol, params), skip_mask_(skip_mask) {}
+  using ConstantUpdater::VisitExpr_;
+
+  /**!
+   * Like an original implementation but avoid visiting of body nodes
+   * for BNNS specific composite primitives.
+   */
+  void VisitExpr_(const FunctionNode* op) final {
+    this->VisitSpan(op->span);
+    for (auto param : op->params) {
+      this->VisitExpr(param);
+    }
+
+    if (!isBNNSSpecificCompositeFunc(op)) {
+      this->VisitExpr(op->body);
+    }
+  }
+
+ private:
+  bool isBNNSSpecificCompositeFunc(const FunctionNode* op) {
+    auto comp = op->GetAttr<String>(attr::kComposite);
+    if (!comp) return false;
+
+    auto comp_name = comp.value();
+
+    bool is_match = false;
+    for (const auto& mask : skip_mask_) {
+      if (std::string(comp_name).substr(0, mask.size()) == mask) {
+        is_match = true;
+        break;
+      }
+    }
+    return is_match;
+  }
+
+  std::vector<std::string> skip_mask_;
+};
+
+Map<String, runtime::NDArray> BNNSConstantUpdaterFunc(Expr expr, std::string symbol) {
+  std::vector<std::string> bnns_composite_filter = {"bnns."};
+
+  // Visit all suitable constant nodes
+  std::unordered_map<std::string, runtime::NDArray> res;
+  BNNSConstantUpdater const_updater(symbol, &res, bnns_composite_filter);
+  const_updater(expr);
+
+  // Convert to tvm::Map
+  Map<String, runtime::NDArray> ret;
+  for (const auto& kvp : res) ret.Set(kvp.first, kvp.second);
+  return ret;
+}
+
+TVM_REGISTER_GLOBAL("relay.ext.bnns.constant_updater").set_body_typed(BNNSConstantUpdaterFunc);
+
+}  // namespace contrib
+}  // namespace relay
+}  // namespace tvm
diff --git a/src/runtime/contrib/bnns/bnns_json_runtime.cc b/src/runtime/contrib/bnns/bnns_json_runtime.cc
new file mode 100644
index 000000000000..87b01567cd30
--- /dev/null
+++ b/src/runtime/contrib/bnns/bnns_json_runtime.cc
@@ -0,0 +1,573 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/**
+ * \file
+ * \brief Simple JSON runtime for Apple BNNS primitives
+ */
+
+#include <tvm/runtime/c_backend_api.h>
+#include <tvm/runtime/ndarray.h>
+#include <tvm/runtime/registry.h>
+
+#include <cstddef>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "../json/json_node.h"
+#include "../json/json_runtime.h"
+#include "bnns_wrp.h"
+
+namespace tvm {
+namespace runtime {
+namespace contrib {
+
+using namespace ::tvm::runtime;
+using namespace ::tvm::runtime::json;
+using namespace ::tvm::runtime::contrib::BNNS;
+
+struct ThreadingConfig {
+  /**
+   * Internal parallelism level ov BNNS primitive specified via BNNSFilterParameters
+   * struct. BNNS doesn't provide real control of internal threading, so it may be
+   * ignored by BNNS implementation.
+   *
+   * Valid values:
+   *   0  use default num of threads suggested by BNNS implementation
+   *  >0  suggests to use this num of internal BNNS threads
+   */
+  size_t internalConcurrency = 0;
+
+  /**
+   * TVM level parallelism for BNNS runtime.
+   * BNNS runtime will split primitive into set of independent sub primitives which
+   * can be executed in parallel. As a rule the splitting are performed through output
+   * channels, so the effective shape of executed primitive is changed.
+   *
+   * Valid values:
+   *   0  do not use graph level treading
+   *  >0  split into this num of primitives
+   */
+  size_t externalConcurrency = 0;
+};
+
+/**
+ * Depends on platform hardware the optimal ThreadingConfig may differ.
+ * This function contains a priori knowledge about some Apple platforms
+ * and their specific.
+ *
+ * @return default ThreadingConfig suggested for this platform
+ */
+ThreadingConfig getDefaultThreadingConfig() {
+  // TODO(apeskov): have to implement CPU/iOS version check.
+  //  meanwhile will use {0, 2} stub to utilize big cores of A13/A14 CPU.
+  return {0, 2};
+}
+
+/**
+ * Main entry point to BNNS runtime
+ */
+class BNNSJSONRuntime : public JSONRuntimeBase {
+ public:
+  BNNSJSONRuntime(const std::string& symbol_name, const std::string& graph_json,
+                  const Array<String> const_names)
+      : JSONRuntimeBase(symbol_name, graph_json, const_names) {}
+
+  const char* type_key() const override { return "bnns_json"; }
+
+  void Init(const Array<NDArray>& consts) override {
+    ICHECK_EQ(consts.size(), const_idx_.size())
+        << "The number of input constants must match the number of required.";
+
+    SetupConstants(consts);
+    BindInputsAndOutputs();
+    AllocateIntermediateTensors();
+    BuildEngine();
+  }
+
+  void Run() override {
+    // Wrap external handler into BNNS tensor representation
+    auto bind_ext_hdl_to_tensor = [this](uint32_t eid) {
+      const auto& ext_dlt = *data_entry_[eid];
+      auto& bnns_tensor = tensors_eid_[eid];
+      bnns_tensor->set_data_hdl(ext_dlt.data);
+    };
+
+    // Bind all input/output external data object into internal abstractions
+    for (const auto& eid : input_var_eid_) bind_ext_hdl_to_tensor(eid);
+    for (const auto& out_entity : outputs_) bind_ext_hdl_to_tensor(EntryID(out_entity));
+
+    // Invoke primitives in topological order
+    for (const auto& prim : primitives_) prim->execute();
+  }
+
+ private:
+  /** Make corresponding input/output tensor stubs */
+  void BindInputsAndOutputs() {
+    tensors_eid_.resize(data_entry_.size());
+    auto createTensor = [&](JSONGraphNodeEntry entry) {
+      auto node = nodes_[entry.id_];
+      auto dlshape = node.GetOpShape()[entry.index_];
+      auto dltype = node.GetOpDataType()[entry.index_];
+      void* data = nullptr;
+      if (data_entry_[entry.id_] != nullptr) data = data_entry_[entry.id_]->data;
+      tensors_eid_[entry.id_] = std::make_shared<BNNS::Tensor>(
+          BNNS::Shape{dlshape.begin(), dlshape.end()}, convertToBNNS(dltype), data);
+    };
+
+    for (auto& id : input_nodes_) {
+      auto eid = JSONGraphNodeEntry(id, 0);
+      createTensor(eid);
+    }
+
+    for (auto entry : outputs_) {
+      createTensor(entry);
+    }
+  }
+
+  /** Allocate intermediate tensors */
+  void AllocateIntermediateTensors() {
+    for (int i = 0; i < nodes_.size(); ++i) {
+      auto eid = JSONGraphNodeEntry(i, 0);
+      if (tensors_eid_[eid.id_] != nullptr) continue;
+      auto node = nodes_[i];
+      auto dlshape = node.GetOpShape()[0];
+      auto dltype = node.GetOpDataType()[0];
+      tensors_eid_[eid.id_] = std::make_shared<BNNS::Tensor>(
+          BNNS::Shape{dlshape.begin(), dlshape.end()}, convertToBNNS(dltype), nullptr);
+      tensors_eid_[eid.id_]->allocate_memory();
+    }
+  }
+
+  // Build up the engine based on the input graph.
+  void BuildEngine() {
+    // Build subgraph engine.
+    for (size_t nid = 0; nid < nodes_.size(); ++nid) {
+      const auto& node = nodes_[nid];
+      if (node.GetOpType() == "kernel") {
+        ICHECK_EQ(node.GetOpType(), "kernel");
+        auto op_name = node.GetOpName();
+        if ("nn.conv2d" == op_name) {
+          Conv2d(nid);
+        } else if ("bnns.conv2d_relu" == op_name) {
+          Conv2d(nid, false, "relu");
+        } else if ("bnns.conv2d_bias_relu" == op_name) {
+          Conv2d(nid, true, "relu");
+        } else if ("bnns.conv2d_sigmoid" == op_name) {
+          Conv2d(nid, false, "sigmoid");
+        } else if ("bnns.conv2d_bias_sigmoid" == op_name) {
+          Conv2d(nid, true, "sigmoid");
+        } else if ("bnns.conv2d_bias" == op_name) {
+          Conv2d(nid, true);
+        } else if ("nn.dense" == op_name) {
+          Dense(nid);
+        } else if ("bnns.dense_bias" == op_name) {
+          Dense(nid, true);
+        } else if ("bnns.dense_bias_gelu" == op_name) {
+          Dense(nid, true, true);
+        } else if ("nn.batch_matmul" == op_name) {
+          MatMul(nid);
+        } else if ("nn.instance_norm" == op_name) {
+          InstanceNormalization(nid);
+        } else if ("nn.max_pool2d" == op_name) {
+          Pooling(nid, false);
+        } else if ("nn.avg_pool2d" == op_name) {
+          Pooling(nid, true);
+        } else if ("nn.global_max_pool2d" == op_name) {
+          Pooling(nid, false, true);
+        } else if ("nn.global_avg_pool2d" == op_name) {
+          Pooling(nid, true, true);
+        } else {
+          LOG(FATAL) << "Unsupported op: " << op_name;
+        }
+      }
+    }
+  }
+
+  // Get BNNS tensor.
+  std::shared_ptr<BNNS::Tensor> GetBNNSTensor(const JSONGraphNodeEntry& entry) {
+    auto eid = EntryID(entry);
+    ICHECK(eid < tensors_eid_.size());
+    return tensors_eid_[eid];
+  }
+
+  void Conv2d(const size_t& nid, const bool has_bias = false,
+              const std::string activation_type = "none") {
+    auto node = nodes_[nid];
+
+    // Setup attributes.
+    auto src_entry = node.GetInputs()[0];
+    auto wgh_entry = node.GetInputs()[1];
+    auto dst_entry = JSONGraphNodeEntry(nid, 0);
+
+    auto dl_input_shape = nodes_[src_entry.id_].GetOpShape()[src_entry.index_];
+    auto dl_weight_shape = nodes_[wgh_entry.id_].GetOpShape()[wgh_entry.index_];
+    BNNS::Shape input_shape{dl_input_shape.begin(), dl_input_shape.end()};
+    BNNS::Shape weight_shape{dl_weight_shape.begin(), dl_weight_shape.end()};
+    std::vector<std::string> str_strides = node.GetAttr<std::vector<std::string>>("strides");
+    std::vector<std::string> str_dilation = node.GetAttr<std::vector<std::string>>("dilation");
+    std::vector<std::string> str_padding = node.GetAttr<std::vector<std::string>>("padding");
+    BNNS::Dim groups = std::stoi(node.GetAttr<std::vector<std::string>>("groups")[0]);
+
+    BNNS::Dim PH_L = std::stoi(str_padding[0]),  // height padding: left
+        PH_R = std::stoi(str_padding[2]),        // height padding: right
+        PW_L = std::stoi(str_padding[1]),        // width padding: left
+        PW_R = std::stoi(str_padding[3]),        // width padding: right
+        SH = std::stoi(str_strides[0]),          // height-wise stride
+        SW = std::stoi(str_strides[1]),          // weight-wise stride
+        DH = std::stoi(str_dilation[0]),         // height kernel dilation
+        DW = std::stoi(str_dilation[1]);         // width kernel dilation
+
+    // Memory descriptions.
+    const auto& src_t = GetBNNSTensor(src_entry);
+    const auto& wgh_t = GetBNNSTensor(wgh_entry);
+    const auto& dst_t = GetBNNSTensor(dst_entry);
+
+    auto src_view = TView::as_is(src_t).extract_outer_dim().with_layout(BNNSDataLayoutImageCHW);
+    auto wgh_view = TView::as_is(wgh_t).with_layout(BNNSDataLayoutConvolutionWeightsOIHW);
+    auto dst_view = TView::as_is(dst_t).extract_outer_dim().with_layout(BNNSDataLayoutImageCHW);
+    TView bias_view;
+
+    if (has_bias) {
+      auto bias_entry = node.GetInputs()[2];
+
+      auto bias_t = GetBNNSTensor(bias_entry);
+      bias_view = TView::as_is(bias_t).squeeze().with_layout(BNNSDataLayoutVector);
+    }
+
+    BNNSActivation activation = {BNNSActivationFunctionIdentity};
+    if (activation_type == "relu")
+      activation = {BNNSActivationFunctionRectifiedLinear};
+    else if (activation_type == "sigmoid")
+      activation = {BNNSActivationFunctionSigmoid};
+
+    BNNSLayerParametersConvolution conv_param = {
+        src_view.get_bnns_view(),
+        wgh_view.get_bnns_view(),
+        dst_view.get_bnns_view(),
+        bias_view.get_bnns_view(),
+        activation,
+        SW,                      /* x_stride */
+        SH,                      /* y_stride */
+        DW,                      /* x_dilation_stride */
+        DH,                      /* y_dilation_stride */
+        0,                       /* x_padding, explicit pads will be used */
+        0,                       /* y_padding, explicit pads will be used */
+        groups,                  /* groups */
+        {PW_L, PW_R, PH_L, PH_R} /* explicit pad values */
+    };
+
+    size_t num_sub_prim = default_thread_config.externalConcurrency;
+    std::vector<BNNSLayerParametersConvolution> params;
+    std::tie(params, src_view, dst_view) =
+        split_to_n(num_sub_prim, conv_param, src_view, wgh_view, bias_view, dst_view);
+
+    std::vector<BNNSFilter> filters(params.size(), nullptr);
+    for (int i = 0; i < params.size(); i++) {
+      auto common_filter_param = getCommonFilterParams();
+      filters[i] = BNNSFilterCreateLayerConvolution(&params[i], &common_filter_param);
+      ICHECK(filters[i]) << "BNNS primitive was not created. Unsupported attributes configuration";
+    }
+
+    primitives_.emplace_back(std::make_shared<BNNS::Primitive>(filters, src_view, dst_view));
+  }
+
+  void Dense(const size_t& nid, const bool has_bias = false, const bool has_gelu = false) {
+    auto node = nodes_[nid];
+
+    // Setup attributes.
+    auto src_entry = node.GetInputs()[0];
+    auto weight_entry = node.GetInputs()[1];
+    auto dst_entry = JSONGraphNodeEntry(nid, 0);
+
+    // Memory descriptions.
+    auto src_t = GetBNNSTensor(src_entry);
+    auto wgh_t = GetBNNSTensor(weight_entry);
+    auto dst_t = GetBNNSTensor(dst_entry);
+
+    auto src_view = TView::as_is(src_t).extract_outer_dim().with_layout(BNNSDataLayoutVector);
+    auto wgh_view = TView::as_is(wgh_t).with_layout(BNNSDataLayoutRowMajorMatrix);
+    auto dst_view = TView::as_is(dst_t).extract_outer_dim().with_layout(BNNSDataLayoutVector);
+
+    TView bias_view;
+    if (has_bias) {
+      auto bias_entry = node.GetInputs()[2];
+      auto bias_md = GetBNNSTensor(bias_entry);
+      bias_view = TView::as_is(bias_md).with_layout(BNNSDataLayoutVector);
+    }
+
+    BNNSActivation activation = {BNNSActivationFunctionIdentity};
+    if (has_gelu) {
+      activation = {BNNSActivationFunctionGELUApproximation};
+      activation.alpha = std::sqrt(2.0 / M_PI);
+      activation.beta = 0.044715;
+    }
+
+    BNNSLayerParametersFullyConnected layerParameters = {
+        src_view.get_bnns_view(),
+        wgh_view.get_bnns_view(),
+        dst_view.get_bnns_view(),
+        bias_view.get_bnns_view(),
+        activation,
+    };
+
+    auto common_filter_param = getCommonFilterParams();
+    auto filter = BNNSFilterCreateLayerFullyConnected(&layerParameters, &common_filter_param);
+    ICHECK(filter) << "BNNS primitive was not created. Unsupported attributes configuration";
+    std::vector<BNNSFilter> filters = {filter};
+    primitives_.emplace_back(std::make_shared<BNNS::Primitive>(filters, src_view, dst_view));
+  }
+
+  void MatMul(const size_t& nid) {
+    auto node = nodes_[nid];
+
+    // Setup attributes.
+    auto a_entry = node.GetInputs()[0];
+    auto b_entry = node.GetInputs()[1];
+    auto dst_entry = JSONGraphNodeEntry(nid, 0);
+    bool a_is_weighted = data_entry_[EntryID(a_entry)] != nullptr;
+    bool b_is_weighted = data_entry_[EntryID(b_entry)] != nullptr;
+
+    // Memory descriptions.
+    auto a_t = GetBNNSTensor(a_entry);
+    auto b_t = GetBNNSTensor(b_entry);
+    auto dst_t = GetBNNSTensor(dst_entry);
+
+    auto a_view = TView::as_is(a_t);
+    auto b_view = TView::as_is(b_t);
+    auto dst_view = TView::as_is(dst_t);
+
+    BNNSLayerParametersBroadcastMatMul layerParameters = {1,      // alpha
+                                                          0,      // beta
+                                                          false,  // transA
+                                                          true,   // transB
+                                                          false,  // quadratic
+                                                          a_is_weighted,
+                                                          b_is_weighted,
+                                                          a_view.get_bnns_view(),
+                                                          b_view.get_bnns_view(),
+                                                          dst_view.get_bnns_view()};
+
+    // BNNS limitation: MatMul use reverse dims values. However strides are calculated correctly
+    //    based on BNNSNDArrayDescriptor::layout value.
+    std::reverse(layerParameters.iA_desc.size, layerParameters.iA_desc.size + 3);
+    std::reverse(layerParameters.iB_desc.size, layerParameters.iB_desc.size + 3);
+    std::reverse(layerParameters.o_desc.size, layerParameters.o_desc.size + 3);
+
+    auto common_filter_param = getCommonFilterParams();
+    auto filter = BNNSFilterCreateLayerBroadcastMatMul(&layerParameters, &common_filter_param);
+    ICHECK(filter) << "BNNS primitive was not created. Unsupported attributes configuration";
+
+    std::vector<BNNSFilter> filters{filter};
+    if (a_is_weighted || b_is_weighted) {
+      auto src_view = a_is_weighted ? b_view : a_view;
+      primitives_.emplace_back(std::make_shared<BNNS::Primitive>(filters, src_view, dst_view));
+    } else {
+      primitives_.emplace_back(
+          std::make_shared<BNNS::TwoInputPrimitive>(filters, a_view, b_view, dst_view));
+    }
+  }
+
+  void InstanceNormalization(const size_t& nid) {
+    auto node = nodes_[nid];
+    size_t axis = std::stoi(node.GetAttr<std::vector<std::string>>("axis")[0]);
+    float epsilon = std::stof(node.GetAttr<std::vector<std::string>>("epsilon")[0]);
+    bool center = std::stoi(node.GetAttr<std::vector<std::string>>("center")[0]);
+    bool scale = std::stoi(node.GetAttr<std::vector<std::string>>("scale")[0]);
+
+    // Setup attributes.
+    auto src_entry = node.GetInputs()[0];
+    auto scale_entry = node.GetInputs()[1];
+    auto bias_entry = node.GetInputs()[2];
+    auto dst_entry = JSONGraphNodeEntry(nid, 0);
+
+    // Memory descriptions.
+    auto src_t = GetBNNSTensor(src_entry);
+    auto scale_t = GetBNNSTensor(scale_entry);
+    auto bias_t = GetBNNSTensor(bias_entry);
+    auto dst_t = GetBNNSTensor(dst_entry);
+
+    auto src_view = TView::as_is(src_t);
+    auto dst_view = TView::as_is(dst_t);
+    size_t src_rank = Tensor::getRank(src_view.get_bnns_view());
+    size_t dst_rank = Tensor::getRank(dst_view.get_bnns_view());
+    ICHECK_EQ(src_rank, dst_rank);
+    ICHECK_LE(src_rank, 4);
+    if (src_rank < 4) {
+      src_view = src_view.unsqueeze(4);
+      dst_view = dst_view.unsqueeze(4);
+    }
+    src_view = src_view.extract_outer_dim().with_layout(BNNSDataLayoutImageCHW);
+    dst_view = dst_view.extract_outer_dim().with_layout(BNNSDataLayoutImageCHW);
+    auto scale_view = TView::as_is(scale_t).with_layout(BNNSDataLayoutVector);
+    auto bias_view = TView::as_is(bias_t).with_layout(BNNSDataLayoutVector);
+    BNNSActivation activation = {BNNSActivationFunctionIdentity};
+
+    auto b_desc = bias_view.get_bnns_view();
+    if (!center) b_desc = {};
+    auto s_desc = scale_view.get_bnns_view();
+    if (!scale) s_desc = {};
+
+    // NOTE: Axis option is ignored in BNNS. The result doesn't depends on value of axis.
+    BNNSLayerParametersNormalization layerParameters = {src_view.get_bnns_view(),  // i_desc
+                                                        dst_view.get_bnns_view(),  // o_desc
+                                                        b_desc,                    // beta_desc
+                                                        s_desc,                    // gamma_desc
+                                                        {},          // moving_mean_desc
+                                                        {},          // moving_variance_desc
+                                                        1.f,         // momentum
+                                                        epsilon,     // epsilon
+                                                        activation,  // activation
+                                                        1,           // num_groups
+                                                        axis};       // normalization_axis
+
+    BNNSFilterType filter_type = BNNSInstanceNorm;
+    auto common_filter_param = getCommonFilterParams();
+    auto filter =
+        BNNSFilterCreateLayerNormalization(filter_type, &layerParameters, &common_filter_param);
+    ICHECK(filter) << "BNNS primitive was not created. Unsupported attributes configuration";
+
+    std::vector<BNNSFilter> filters{filter};
+    primitives_.emplace_back(std::make_shared<BNNS::NormPrimitive>(filters, src_view, dst_view));
+  }
+
+  void Pooling(const size_t& nid, bool avg_pooling, bool global = false) {
+    auto node = nodes_[nid];
+
+    auto src_entry = node.GetInputs()[0];
+    auto dst_entry = JSONGraphNodeEntry(nid, 0);
+
+    // Memory descriptions.
+    auto src_t = GetBNNSTensor(src_entry);
+    auto dst_t = GetBNNSTensor(dst_entry);
+
+    auto src_view = TView::as_is(src_t);
+    auto dst_view = TView::as_is(dst_t);
+    size_t src_rank = Tensor::getRank(src_view.get_bnns_view());
+    size_t dst_rank = Tensor::getRank(dst_view.get_bnns_view());
+    ICHECK_EQ(src_rank, dst_rank);
+    ICHECK_LE(src_rank, 4);
+    if (src_rank < 4) {
+      src_view = src_view.unsqueeze(4);
+      dst_view = dst_view.unsqueeze(4);
+    }
+    src_view = src_view.extract_outer_dim().with_layout(BNNSDataLayoutImageCHW);
+    dst_view = dst_view.extract_outer_dim().with_layout(BNNSDataLayoutImageCHW);
+    BNNSActivation activation = {BNNSActivationFunctionIdentity};
+    BNNSPoolingFunction pf = {BNNSPoolingFunctionMax};
+    if (avg_pooling) pf = {BNNSPoolingFunctionAverageCountExcludePadding};
+
+    // Setup attributes.
+    size_t k_height = 0;
+    size_t k_width = 0;
+    size_t y_padding = 0;
+    size_t x_padding = 0;
+    size_t y_stride = 1;
+    size_t x_stride = 1;
+    if (!global) {
+      std::vector<std::string> pool_size = node.GetAttr<std::vector<std::string>>("pool_size");
+      std::vector<std::string> padding = node.GetAttr<std::vector<std::string>>("padding");
+      std::vector<std::string> strides = node.GetAttr<std::vector<std::string>>("strides");
+      k_height = std::stoi(pool_size[0]);
+      k_width = std::stoi(pool_size[1]);
+      y_padding = std::stoi(padding[0]);
+      x_padding = std::stoi(padding[1]);
+      y_stride = std::stoi(strides[0]);
+      x_stride = std::stoi(strides[1]);
+    } else {
+      auto sv = src_view.get_bnns_view();
+      k_height = sv.size[1];
+      k_width = sv.size[0];
+    }
+
+    BNNSLayerParametersPooling layerParameters = {src_view.get_bnns_view(),  // i_desc
+                                                  dst_view.get_bnns_view(),  // o_desc
+                                                  {},                        // bias
+                                                  activation,                // activation
+                                                  pf,                        // pooling_function
+                                                  k_width,                   // k_width
+                                                  k_height,                  // k_height
+                                                  x_stride,                  // x_stride
+                                                  y_stride,                  // y_stride
+                                                  0,                         // x_dilation_stride
+                                                  0,                         // y_dilation_stride
+                                                  x_padding,                 // x_padding
+                                                  y_padding,                 // y_padding
+                                                  {}};  // pad left, right, up, down padding
+
+    auto common_filter_param = getCommonFilterParams();
+    auto filter = BNNSFilterCreateLayerPooling(&layerParameters, &common_filter_param);
+    ICHECK(filter) << "BNNS primitive was not created. Unsupported attributes configuration";
+
+    std::vector<BNNSFilter> filters{filter};
+    primitives_.emplace_back(std::make_shared<BNNS::PoolingPrimitive>(filters, src_view, dst_view));
+  }
+
+  BNNS::Dtype convertToBNNS(const DLDataType& dl_dtype) {
+    if (dl_dtype.code == DLDataTypeCode::kDLFloat) {
+      if (dl_dtype.bits == 32) return BNNSDataTypeFloat32;
+      if (dl_dtype.bits == 16) return BNNSDataTypeFloat16;
+    }
+    if (dl_dtype.code == DLDataTypeCode::kDLInt) {
+      if (dl_dtype.bits == 32) return BNNSDataTypeInt32;
+      if (dl_dtype.bits == 16) return BNNSDataTypeInt16;
+      if (dl_dtype.bits == 8) return BNNSDataTypeInt8;
+    }
+    if (dl_dtype.code == DLDataTypeCode::kDLUInt) {
+      if (dl_dtype.bits == 32) return BNNSDataTypeUInt32;
+      if (dl_dtype.bits == 16) return BNNSDataTypeUInt16;
+      if (dl_dtype.bits == 8) return BNNSDataTypeUInt8;
+    }
+    LOG(FATAL) << "Unsupported data type for BNNS runtime";
+    return BNNS::Dtype(0);
+  }
+
+  BNNSFilterParameters getCommonFilterParams() {
+    // NOTE: To force weights tensor copy on stage of filter create
+    //       just change : BNNSFlagsUseClientPtr -> 0
+    return {BNNSFlagsUseClientPtr, default_thread_config.internalConcurrency};
+  }
+
+  /** Default threading config. Should be used if there are
+   *  no other threading specificator. */
+  const ThreadingConfig default_thread_config = getDefaultThreadingConfig();
+
+  /** Collection of all primitives in topological order */
+  std::vector<std::shared_ptr<BNNS::Primitive>> primitives_;
+
+  /** Vector with BNNS tensors. Index of tensor matched with
+   *  corresponding EntryID from base JSONRuntimeBase. */
+  std::vector<TensorPtr> tensors_eid_;
+};
+
+runtime::Module BNNSJSONRuntimeCreate(String symbol_name, String graph_json,
+                                      const Array<String>& const_names) {
+  auto n = make_object<BNNSJSONRuntime>(symbol_name, graph_json, const_names);
+  return runtime::Module(n);
+}
+
+TVM_REGISTER_GLOBAL("runtime.BNNSJSONRuntimeCreate").set_body_typed(BNNSJSONRuntimeCreate);
+
+TVM_REGISTER_GLOBAL("runtime.module.loadbinary_bnns_json")
+    .set_body_typed(BNNSJSONRuntime::LoadFromBinary<BNNSJSONRuntime>);
+
+}  // namespace contrib
+}  // namespace runtime
+}  // namespace tvm
diff --git a/src/runtime/contrib/bnns/bnns_wrp.h b/src/runtime/contrib/bnns/bnns_wrp.h
new file mode 100644
index 000000000000..b31e97e554da
--- /dev/null
+++ b/src/runtime/contrib/bnns/bnns_wrp.h
@@ -0,0 +1,495 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/**
+ * \file
+ * \brief C++ wrappers and helpers to handle BNNS objects
+ */
+
+#ifndef TVM_RUNTIME_CONTRIB_BNNS_BNNS_WRP_H_
+#define TVM_RUNTIME_CONTRIB_BNNS_BNNS_WRP_H_
+
+#include <Accelerate/Accelerate.h>
+
+#include <algorithm>
+#include <functional>
+#include <memory>
+#include <numeric>
+#include <tuple>
+#include <vector>
+
+namespace tvm {
+namespace runtime {
+namespace contrib {
+namespace BNNS {
+
+using Dim = size_t;
+using Shape = std::vector<Dim>;
+using Dtype = BNNSDataType;
+using HDL = void*;
+
+void* default_alloc(size_t size) { return malloc(size); }
+
+void default_free(void* ptr) { free(ptr); }
+
+/**
+ * Main abstraction for tensor representation
+ *
+ * Contains buffer handler and common attributes like shape and dtype.
+ */
+class Tensor {
+ public:
+  Tensor() = delete;
+  Tensor(Tensor&) = delete;
+
+  Tensor(Shape shape, Dtype dtype, void* hdl) {
+    auto rank = shape.size();
+    ICHECK(rank < BNNS_MAX_TENSOR_DIMENSION);
+
+    desc_ = {BNNSNDArrayFlags(0),
+             getPlainLayout(rank),
+             {},       // shape
+             {},       // strides
+             hdl,      // data handler
+             dtype,    // data type
+             nullptr,  // table_data (clustering case), is not used
+             dtype,
+             1.f,
+             0.f};
+    std::copy(shape.rbegin(), shape.rend(), std::begin(desc_.size));
+
+    desc_.data = hdl;
+    is_external_data = true;
+  }
+
+  ~Tensor() {
+    if (desc_.data && !is_external_data) {
+      default_free(desc_.data);
+      desc_.data = nullptr;
+    }
+  }
+
+  void allocate_memory() {
+    if (desc_.data && !is_external_data) {
+      default_free(desc_.data);
+    }
+    const size_t buff_size = getSize(desc_) * getElementSize(desc_);
+    desc_.data = default_alloc(buff_size);
+    ICHECK(desc_.data);
+    is_external_data = false;
+  }
+
+  void* get_data_hdl() const { return desc_.data; }
+
+  void set_data_hdl(void* hdl) {
+    if (desc_.data && !is_external_data) {
+      default_free(desc_.data);
+      desc_.data = nullptr;
+    }
+
+    desc_.data = hdl;
+    is_external_data = true;
+  }
+
+  const BNNSNDArrayDescriptor& get_desc() const { return desc_; }
+
+  static BNNSDataLayout getPlainLayout(size_t rank) {
+    ICHECK(rank <= BNNS_MAX_TENSOR_DIMENSION);
+    return static_cast<BNNSDataLayout>((rank << 16) | 0x8001);
+  }
+
+  static size_t getRank(BNNSDataLayout layout) { return (layout & 0xF0000) >> 16; }
+
+  static size_t getRank(BNNSNDArrayDescriptor desc) { return getRank(desc.layout); }
+
+  static size_t getSize(BNNSNDArrayDescriptor desc) {
+    auto rank = getRank(desc);
+    return std::accumulate(desc.size, desc.size + rank, 1, std::multiplies<int>());
+  }
+
+  /** return size of element in bytes */
+  static size_t getElementSize(Dtype dtype) { return (dtype & 0xFFFF) / 8; }
+
+  /** return size of element in bytes */
+  static size_t getElementSize(const BNNSNDArrayDescriptor& desc) {
+    return getElementSize(desc.data_type);
+  }
+
+ private:
+  bool is_external_data = false;
+  BNNSNDArrayDescriptor desc_;
+};
+
+using TensorPtr = std::shared_ptr<Tensor>;
+
+/**
+ * Tensor View object which represent how provided BNNS::Tensor will be considered
+ *
+ * The single BNNS::Tensor can be treated in different form depend on particular primitive
+ * expectation. More other some primitive supports only external form of batching. So we have
+ * some abstraction to describe how primitive will handle provided tensor.
+ *
+ * Batched View
+ *   View with extracted dimension as external batch value
+ *   example: Tensor [2, 3, 224, 224] -> View [3, 224, 224] with ext batch 2
+ *
+ * Party View
+ *   The collection of view on the same tensor, can be the same view or with some stride
+ *   example: Tensor [6, 5, 3, 3] -> 3 x View [2, 5, 3, 3] with stride 45
+ */
+class TView {
+ public:
+  /** Make view on provided tensor as is */
+  static TView as_is(const TensorPtr& origin) {
+    TView res;
+    res.origin_ = origin;
+    res.view_desc_ = origin->get_desc();
+    return res;
+  }
+
+  /** Extract outer dimension to separate batch field. TView will became batched view */
+  TView extract_outer_dim() const {
+    auto rank = Tensor::getRank(view_desc_);
+    TView res = *this;
+    res.batch_size_ = view_desc_.size[rank - 1];
+    res.batch_stride_ =
+        std::accumulate(view_desc_.size, view_desc_.size + rank - 1, 1, std::multiplies<>());
+    res.view_desc_.size[rank - 1] = 0;
+    res.view_desc_.layout = Tensor::getPlainLayout(rank - 1);
+    return res;
+  }
+
+  /** Squeeze all dims equal 1 */
+  TView squeeze(size_t min_rank = 1) const {
+    auto rank = Tensor::getRank(view_desc_);
+    size_t squeezed_shape[BNNS_MAX_TENSOR_DIMENSION] = {};
+    size_t squeezed_rank = 0;
+    for (int i = 0; i < rank; i++)
+      if (view_desc_.size[i] != 1) squeezed_shape[squeezed_rank++] = view_desc_.size[i];
+
+    if (min_rank > squeezed_rank) {
+      std::fill(squeezed_shape + squeezed_rank, squeezed_shape + min_rank, 1);
+      squeezed_rank = min_rank;
+    }
+
+    TView res = *this;
+    std::copy(squeezed_shape, squeezed_shape + squeezed_rank, res.view_desc_.size);
+    std::fill(res.view_desc_.size + squeezed_rank, res.view_desc_.size + rank, 0);
+    res.view_desc_.layout = Tensor::getPlainLayout(squeezed_rank);
+    return res;
+  }
+
+  /** Expand the shape of an array */
+  TView expand_dims(std::vector<size_t> axes) const {
+    auto rank = Tensor::getRank(view_desc_);
+    TView res = *this;
+    size_t unsqueezed_shape[BNNS_MAX_TENSOR_DIMENSION] = {};
+    size_t unsqueezed_rank = axes.size() + rank;
+    ICHECK_LE(unsqueezed_rank, BNNS_MAX_TENSOR_DIMENSION);
+    for (const auto& axis : axes) {
+      ICHECK_LT(axis, unsqueezed_rank);
+      unsqueezed_shape[axis] = 1;
+    }
+    for (int i = 0, orig_idx = 0; i < unsqueezed_rank; ++i) {
+      if (unsqueezed_shape[i] == 1) continue;
+      unsqueezed_shape[i] = view_desc_.size[orig_idx++];
+    }
+    std::copy(unsqueezed_shape, unsqueezed_shape + unsqueezed_rank, res.view_desc_.size);
+    res.view_desc_.layout = Tensor::getPlainLayout(unsqueezed_rank);
+    return res;
+  }
+
+  /** Unsqueeze tensor to a new rank */
+  TView unsqueeze(size_t new_rank) const {
+    ICHECK_LE(new_rank, BNNS_MAX_TENSOR_DIMENSION);
+    auto rank = Tensor::getRank(view_desc_);
+    ICHECK_GT(new_rank, rank);
+    std::vector<size_t> axes(new_rank - rank);
+    std::iota(axes.begin(), axes.end(), rank);
+    return expand_dims(axes);
+  }
+
+  /** Construct new TView with specified layout if it applicable */
+  TView with_layout(BNNSDataLayout layout) const {
+    ICHECK_EQ(Tensor::getRank(view_desc_), Tensor::getRank(layout));
+
+    TView res = *this;
+    res.view_desc_.layout = layout;
+    return res;
+  }
+
+  /** Construct party TView by splitting original TView into num parts */
+  TView party_split_n(size_t num) const {
+    ICHECK_EQ(party_size_, 1);
+
+    TView res = *this;
+    size_t rank = Tensor::getRank(view_desc_);
+    size_t size = Tensor::getSize(view_desc_);
+    res.party_size_ = num;
+    res.party_stride_ = size / num;
+
+    if (res.batch_size_ != 1) {
+      res.batch_size_ /= num;
+    } else {
+      res.view_desc_.size[rank - 1] /= num;
+      res.batch_stride_ /= num;
+    }
+    return res;
+  }
+
+  /** Construct party TView by duplicating original TView num times */
+  TView party_duplicate_n(size_t num) const {
+    ICHECK_EQ(party_size_, 1);
+
+    TView res = *this;
+    res.party_size_ = num;
+    res.party_stride_ = 0;
+
+    return res;
+  }
+
+  /** Return data buffer handler */
+  HDL get_data_hdl() const { return view_desc_.data; }
+
+  /** Return external batch dimension value */
+  size_t get_batch_size() const { return batch_size_; }
+
+  /** Return external batch dimension stride */
+  size_t get_stride() const { return batch_stride_; }
+
+  /** Return party element by index */
+  TView operator[](size_t i) const {
+    ICHECK_LT(i, party_size_);
+
+    TView res = *this;
+    res.party_size_ = 1;
+    if (origin_) {
+      auto hdl = reinterpret_cast<uint8_t*>(origin_->get_data_hdl());
+      hdl += i * party_stride_ * Tensor::getElementSize(view_desc_.data_type);
+      res.view_desc_.data = hdl;
+    }
+    return res;
+  }
+
+  /** Check if view is empty and doesn't relay to any tensor */
+  operator bool() const { return origin_ != nullptr; }
+
+  /** Get BNNS descriptor for particular View. Batch and Party attributed are ignored. */
+  const BNNSNDArrayDescriptor& get_bnns_view() const { return view_desc_; }
+
+ private:
+  /** Original tensor object to view on */
+  TensorPtr origin_;
+
+  /** Batched view parameters */
+  BNNSNDArrayDescriptor view_desc_ = {};
+  size_t batch_size_ = 1;
+  size_t batch_stride_ = 0;
+
+  /** Party representation parameters */
+  size_t party_size_ = 1;
+  size_t party_stride_ = 0;
+};
+
+/**
+ * Wrapper on top of BNNSFilter and src/dst TensorView.
+ *
+ * Support decomposed representation of filter and can execute sub primitives in parallel.
+ */
+class Primitive {
+ public:
+  Primitive(const std::vector<BNNSFilter> fs, const TView& src, const TView& dst)
+      : filters(fs), src_view(src), dst_view(dst) {}
+
+  virtual ~Primitive() {
+    for (auto& filter : filters)
+      if (filter) {
+        BNNSFilterDestroy(filter);
+        filter = nullptr;
+      }
+  }
+
+  /** Execute primitive with using specified src/dst */
+  void execute() {
+    auto res = TVMBackendParallelLaunch(run_task, this, filters.size());
+    ICHECK_EQ(res, 0) << "BNNS runtime. Primitive was not executed properly";
+  }
+
+ private:
+  virtual int execute_impl(int part_idx) {
+    const auto filter = this->filters[part_idx];
+    const auto src_view = this->src_view[part_idx];
+    const auto dst_view = this->dst_view[part_idx];
+
+    size_t mb = src_view.get_batch_size();
+
+    // NB! BNNS limitations
+    //   * Do not use simple BNNSFilterApply. There is a bug inside BNNS,
+    //     BNNSFilterApply doesn't work for grouped convolution.
+    //   * Group convolution doesn't support arbitrary stride for Batch dim.
+    //     The tensor should be dense.
+    return BNNSFilterApplyBatch(filter, mb, src_view.get_data_hdl(), src_view.get_stride(),
+                                dst_view.get_data_hdl(), dst_view.get_stride());
+  }
+
+  static int run_task(int task_id, TVMParallelGroupEnv* penv, void* cdata) {
+    auto prim = reinterpret_cast<Primitive*>(cdata);
+    return prim->execute_impl(task_id);
+  }
+
+ protected:
+  /** BNNS kernels/filters collect which will execute primitive */
+  std::vector<BNNSFilter> filters = {};
+  const TView src_view;
+  const TView dst_view;
+};
+
+/**
+ * Wrapper on top of BNNS::Primitive
+ *
+ * This primitive should be used for executing primitive with two inputs.
+ */
+class TwoInputPrimitive : public Primitive {
+ public:
+  TwoInputPrimitive(const std::vector<BNNSFilter> fs, const TView& src, const TView& src2,
+                    const TView& dst)
+      : Primitive(fs, src, dst), src2_view(src2) {}
+
+ private:
+  int execute_impl(int task_id) override {
+    const auto filter = this->filters[task_id];
+    const auto src_view = this->src_view[task_id];
+    const auto src2_view = this->src2_view[task_id];
+    const auto dst_view = this->dst_view[task_id];
+
+    size_t mb = src_view.get_batch_size();
+
+    return BNNSFilterApplyTwoInputBatch(filter, mb, src_view.get_data_hdl(), src_view.get_stride(),
+                                        src2_view.get_data_hdl(), src2_view.get_stride(),
+                                        dst_view.get_data_hdl(), dst_view.get_stride());
+  }
+
+ protected:
+  const TView src2_view;
+};
+
+/**
+ * Wrapper on top of BNNS::Primitive
+ *
+ * This primitive should be used for executing normalization filter
+ */
+class NormPrimitive : public Primitive {
+ public:
+  using Primitive::Primitive;
+
+ private:
+  int execute_impl(int task_id) override {
+    const auto filter = this->filters[task_id];
+    const auto src_view = this->src_view[task_id];
+    const auto dst_view = this->dst_view[task_id];
+
+    size_t mb = src_view.get_batch_size();
+    return BNNSNormalizationFilterApplyBatch(filter, mb, src_view.get_data_hdl(),
+                                             src_view.get_stride(), dst_view.get_data_hdl(),
+                                             dst_view.get_stride(), false);
+  }
+};
+
+/**
+ * Wrapper on top of BNNS::Primitive
+ *
+ * This primitive should be used for executing pooling filter
+ */
+class PoolingPrimitive : public Primitive {
+ public:
+  using Primitive::Primitive;
+
+ private:
+  int execute_impl(int task_id) override {
+    const auto filter = this->filters[task_id];
+    const auto src_view = this->src_view[task_id];
+    const auto dst_view = this->dst_view[task_id];
+
+    size_t mb = src_view.get_batch_size();
+    return BNNSPoolingFilterApplyBatch(filter, mb, src_view.get_data_hdl(), src_view.get_stride(),
+                                       dst_view.get_data_hdl(), dst_view.get_stride(), nullptr, 0);
+  }
+};
+
+/**
+ * Function which split primitive into sub primitives to parallel execution
+ *
+ * @param num requested num of sub primitives
+ * @param orig_conv_param original convolution descriptor
+ * @param src_view source tensor view
+ * @param wgh_view weight tensor view
+ * @param b_view bias tensor view
+ * @param dst_view destination tensor view
+ * @param num number of part to split into
+ * @return collection of Convolution descriptors plus corresponding src/dst tensors view
+ */
+static std::tuple<std::vector<BNNSLayerParametersConvolution>, TView, TView> split_to_n(
+    size_t num, const BNNSLayerParametersConvolution& orig_conv_param, const TView& src_view,
+    const TView& wgh_view, const TView& b_view, const TView& dst_view) {
+  size_t batch = src_view.get_batch_size();
+  size_t oc = dst_view.get_bnns_view().size[2];
+  size_t groups = orig_conv_param.groups;
+
+  BNNS::TView src_view_new;
+  BNNS::TView wgh_view_new;
+  BNNS::TView b_view_new;
+  BNNS::TView dst_view_new;
+
+  // TODO(apeskov): Add split by batch dim. Meanwhile we just disable it...
+  if (batch > 1 || oc % num != 0 || (groups > 1 && groups % num != 0)) {
+    return {{orig_conv_param}, src_view, dst_view};
+  }
+
+  // if groups > 1 split only by groups
+  // otherwise split inside one convolution by output channels
+  if (groups > 1) {
+    src_view_new = src_view.party_split_n(num);
+    groups = groups / num;
+  } else {
+    src_view_new = src_view.party_duplicate_n(num);
+  }
+
+  wgh_view_new = wgh_view.party_split_n(num);
+  b_view_new = b_view.party_split_n(num);
+  dst_view_new = dst_view.party_split_n(num);
+
+  std::vector<BNNSLayerParametersConvolution> res(num);
+  for (size_t i = 0; i < num; i++) {
+    auto& cur = res[i];
+    cur = orig_conv_param;
+
+    cur.i_desc = src_view_new[i].get_bnns_view();
+    cur.o_desc = dst_view_new[i].get_bnns_view();
+    cur.w_desc = wgh_view_new[i].get_bnns_view();
+    cur.bias = b_view_new[i].get_bnns_view();
+    cur.groups = groups;
+  }
+  return {res, src_view_new, dst_view_new};
+}
+
+}  // namespace BNNS
+}  // namespace contrib
+}  // namespace runtime
+}  // namespace tvm
+#endif  // TVM_RUNTIME_CONTRIB_BNNS_BNNS_WRP_H_
diff --git a/tests/cpp/contrib/bnns.cc b/tests/cpp/contrib/bnns.cc
new file mode 100644
index 000000000000..1efd487caff9
--- /dev/null
+++ b/tests/cpp/contrib/bnns.cc
@@ -0,0 +1,307 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+TEST(PackedFunc, Basic) {
+  using namespace tvm;
+  using namespace tvm::tir;
+  using namespace tvm::runtime;
+  int x = 0;
+  void* handle = &x;
+  DLTensor a;
+
+  Var v = PackedFunc([&](TVMArgs args, TVMRetValue* rv) {
+    ICHECK(args.num_args == 3);
+    ICHECK(args.values[0].v_float64 == 1.0);
+    ICHECK(args.type_codes[0] == kDLFloat);
+    ICHECK(args.values[1].v_handle == &a);
+    ICHECK(args.type_codes[1] == kTVMDLTensorHandle);
+    ICHECK(args.values[2].v_handle == &x);
+    ICHECK(args.type_codes[2] == kTVMOpaqueHandle);
+    *rv = Var("a");
+  })(1.0, &a, handle);
+  ICHECK(v->name_hint == "a");
+}
+
+TEST(PackedFunc, Node) {
+  using namespace tvm;
+  using namespace tvm::tir;
+  using namespace tvm::runtime;
+  Var x;
+  Var t = PackedFunc([&](TVMArgs args, TVMRetValue* rv) {
+    ICHECK(args.num_args == 1);
+    ICHECK(args[0].IsObjectRef<ObjectRef>());
+    Var b = args[0];
+    ICHECK(x.same_as(b));
+    *rv = b;
+  })(x);
+  ICHECK(t.same_as(x));
+}
+
+TEST(PackedFunc, NDArray) {
+  using namespace tvm;
+  using namespace tvm::runtime;
+  auto x = NDArray::Empty({}, String2DLDataType("float32"), TVMContext{kDLCPU, 0});
+  reinterpret_cast<float*>(x->data)[0] = 10.0f;
+  ICHECK(x.use_count() == 1);
+
+  PackedFunc forward([&](TVMArgs args, TVMRetValue* rv) { *rv = args[0]; });
+
+  NDArray ret = PackedFunc([&](TVMArgs args, TVMRetValue* rv) {
+    NDArray y = args[0];
+    DLTensor* ptr = args[0];
+    ICHECK(ptr == x.operator->());
+    ICHECK(x.same_as(y));
+    ICHECK(x.use_count() == 2);
+    *rv = forward(y);
+  })(x);
+  ICHECK(ret.use_count() == 2);
+  ICHECK(ret.same_as(x));
+}
+
+TEST(PackedFunc, str) {
+  using namespace tvm;
+  using namespace tvm::runtime;
+  PackedFunc([&](TVMArgs args, TVMRetValue* rv) {
+    ICHECK(args.num_args == 1);
+    std::string x = args[0];
+    ICHECK(x == "hello");
+    String y = args[0];
+    ICHECK(y == "hello");
+    *rv = x;
+  })("hello");
+
+  PackedFunc([&](TVMArgs args, TVMRetValue* rv) {
+    ICHECK(args.num_args == 1);
+    runtime::String s = args[0];
+    ICHECK(s == "hello");
+  })(runtime::String("hello"));
+}
+
+TEST(PackedFunc, func) {
+  using namespace tvm;
+  using namespace tvm::runtime;
+  PackedFunc addone([&](TVMArgs args, TVMRetValue* rv) { *rv = args[0].operator int() + 1; });
+  // function as arguments
+  int r0 = PackedFunc([](TVMArgs args, TVMRetValue* rv) {
+    PackedFunc f = args[0];
+    // TVMArgValue -> Arguments as function
+    *rv = f(args[1]).operator int();
+  })(addone, 1);
+  ICHECK_EQ(r0, 2);
+
+  int r1 = PackedFunc([](TVMArgs args, TVMRetValue* rv) {
+    // TVMArgValue -> TVMRetValue
+    *rv = args[1];
+  })(2, 100);
+  ICHECK_EQ(r1, 100);
+
+  int r2 = PackedFunc([&](TVMArgs args, TVMRetValue* rv) {
+    // re-assignment
+    *rv = args[0];
+    // TVMRetValue -> Function argument
+    *rv = addone(args[0].operator PackedFunc()(args[1], 1));
+  })(addone, 100);
+  ICHECK_EQ(r2, 102);
+}
+
+TEST(PackedFunc, Expr) {
+  using namespace tvm;
+  using namespace tvm::runtime;
+  // automatic conversion of int to expr
+  PackedFunc addone([](TVMArgs args, TVMRetValue* rv) {
+    PrimExpr x = args[0];
+    *rv = x.as<tvm::tir::IntImmNode>()->value + 1;
+  });
+  int r0 = PackedFunc([](TVMArgs args, TVMRetValue* rv) {
+    PackedFunc f = args[0];
+    // TVMArgValue -> Arguments as function
+    *rv = f(args[1]).operator int();
+  })(addone, 1);
+  ICHECK_EQ(r0, 2);
+}
+
+TEST(PackedFunc, Type) {
+  using namespace tvm;
+  using namespace tvm::runtime;
+  auto get_type = PackedFunc([](TVMArgs args, TVMRetValue* rv) {
+    DataType x = args[0];
+    *rv = x;
+  });
+  auto get_type2 = PackedFunc([](TVMArgs args, TVMRetValue* rv) { *rv = args[0]; });
+  ICHECK(get_type("int32").operator DataType() == DataType::Int(32));
+  ICHECK(get_type("float").operator DataType() == DataType::Float(32));
+  ICHECK(get_type2("float32x2").operator DataType() == DataType::Float(32, 2));
+}
+
+TEST(TypedPackedFunc, HighOrder) {
+  using namespace tvm;
+  using namespace tvm::runtime;
+  using Int1Func = TypedPackedFunc<int(int)>;
+  using Int2Func = TypedPackedFunc<int(int, int)>;
+  using BindFunc = TypedPackedFunc<Int1Func(Int2Func, int value)>;
+  BindFunc ftyped;
+  ftyped = [](Int2Func f1, int value) -> Int1Func {
+    auto binded = [f1, value](int x) { return f1(value, x); };
+    Int1Func x(binded);
+    return x;
+  };
+  auto add = [](int x, int y) { return x + y; };
+  ICHECK_EQ(ftyped(Int2Func(add), 1)(2), 3);
+  PackedFunc f = ftyped(Int2Func(add), 1);
+  ICHECK_EQ(f(3).operator int(), 4);
+  // call the type erased version.
+  Int1Func f1 = ftyped.packed()(Int2Func(add), 1);
+  ICHECK_EQ(f1(3), 4);
+}
+
+TEST(TypedPackedFunc, Deduce) {
+  using namespace tvm::runtime;
+  using tvm::runtime::detail::function_signature;
+
+  TypedPackedFunc<int(float)> x;
+  auto f = [](int x) -> int { return x + 1; };
+  std::function<void(float)> y;
+
+  static_assert(std::is_same<function_signature<decltype(x)>::FType, int(float)>::value,
+                "invariant1");
+  static_assert(std::is_same<function_signature<decltype(f)>::FType, int(int)>::value,
+                "invariant2");
+  static_assert(std::is_same<function_signature<decltype(y)>::FType, void(float)>::value,
+                "invariant3");
+}
+
+TEST(PackedFunc, ObjectConversion) {
+  using namespace tvm;
+  using namespace tvm::tir;
+  using namespace tvm::runtime;
+  TVMRetValue rv;
+  auto x = NDArray::Empty({}, String2DLDataType("float32"), TVMContext{kDLCPU, 0});
+  // assign null
+  rv = ObjectRef();
+  ICHECK_EQ(rv.type_code(), kTVMNullptr);
+
+  // Can assign NDArray to ret type
+  rv = x;
+  ICHECK_EQ(rv.type_code(), kTVMNDArrayHandle);
+  // Even if we assign base type it still shows as NDArray
+  rv = ObjectRef(x);
+  ICHECK_EQ(rv.type_code(), kTVMNDArrayHandle);
+  // Check convert back
+  ICHECK(rv.operator NDArray().same_as(x));
+  ICHECK(rv.operator ObjectRef().same_as(x));
+  ICHECK(!rv.IsObjectRef<PrimExpr>());
+
+  auto pf1 = PackedFunc([&](TVMArgs args, TVMRetValue* rv) {
+    ICHECK_EQ(args[0].type_code(), kTVMNDArrayHandle);
+    ICHECK(args[0].operator NDArray().same_as(x));
+    ICHECK(args[0].operator ObjectRef().same_as(x));
+    ICHECK(args[1].operator ObjectRef().get() == nullptr);
+    ICHECK(args[1].operator NDArray().get() == nullptr);
+    ICHECK(args[1].operator Module().get() == nullptr);
+    ICHECK(args[1].operator Array<NDArray>().get() == nullptr);
+    ICHECK(!args[0].IsObjectRef<PrimExpr>());
+  });
+  pf1(x, ObjectRef());
+  pf1(ObjectRef(x), NDArray());
+
+  // testcases for modules
+  auto* pf = tvm::runtime::Registry::Get("runtime.SourceModuleCreate");
+  ICHECK(pf != nullptr);
+  Module m = (*pf)("", "xyz");
+  rv = m;
+  ICHECK_EQ(rv.type_code(), kTVMModuleHandle);
+  // Even if we assign base type it still shows as NDArray
+  rv = ObjectRef(m);
+  ICHECK_EQ(rv.type_code(), kTVMModuleHandle);
+  // Check convert back
+  ICHECK(rv.operator Module().same_as(m));
+  ICHECK(rv.operator ObjectRef().same_as(m));
+  ICHECK(!rv.IsObjectRef<NDArray>());
+
+  auto pf2 = PackedFunc([&](TVMArgs args, TVMRetValue* rv) {
+    ICHECK_EQ(args[0].type_code(), kTVMModuleHandle);
+    ICHECK(args[0].operator Module().same_as(m));
+    ICHECK(args[0].operator ObjectRef().same_as(m));
+    ICHECK(args[1].operator ObjectRef().get() == nullptr);
+    ICHECK(args[1].operator NDArray().get() == nullptr);
+    ICHECK(args[1].operator Module().get() == nullptr);
+    ICHECK(!args[0].IsObjectRef<PrimExpr>());
+  });
+  pf2(m, ObjectRef());
+  pf2(ObjectRef(m), Module());
+}
+
+TEST(TypedPackedFunc, RValue) {
+  using namespace tvm;
+  using namespace tvm::runtime;
+  {
+    auto inspect = [](TVMArgs args, TVMRetValue* rv) {
+      for (int i = 0; i < args.size(); ++i) {
+        ICHECK_EQ(args[0].type_code(), kTVMObjectRValueRefArg);
+      }
+    };
+    PackedFunc finspect(inspect);
+    finspect(tir::Var("x"));
+  }
+  {
+    auto f = [](tir::Var x, bool move) {
+      if (move) {
+        ICHECK(x.unique());
+      } else {
+        ICHECK(!x.unique());
+      }
+      ICHECK(x->name_hint == "x");
+      return x;
+    };
+    TypedPackedFunc<tir::Var(tir::Var, bool)> tf(f);
+
+    tir::Var var("x");
+    ICHECK(var.unique());
+    tf(var, false);
+    // move the result to the function.
+    tir::Var ret = tf(std::move(var), true);
+    ICHECK(!var.defined());
+  }
+
+  {
+    // pass child class.
+    auto f = [](PrimExpr x, bool move) {
+      if (move) {
+        ICHECK(x.unique());
+      } else {
+        ICHECK(!x.unique());
+      }
+      return x;
+    };
+    TypedPackedFunc<PrimExpr(PrimExpr, bool)> tf(f);
+
+    tir::Var var("x");
+    ICHECK(var.unique());
+    tf(var, false);
+    tf(std::move(var), true);
+    // auto conversion.
+    tf(1, true);
+  }
+}
+
+int main(int argc, char** argv) {
+  testing::InitGoogleTest(&argc, argv);
+  testing::FLAGS_gtest_death_test_style = "threadsafe";
+  return RUN_ALL_TESTS();
+}
diff --git a/tests/python/contrib/test_bnns/__init__.py b/tests/python/contrib/test_bnns/__init__.py
new file mode 100644
index 000000000000..724b23f1378b
--- /dev/null
+++ b/tests/python/contrib/test_bnns/__init__.py
@@ -0,0 +1,17 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Infrastructure and tests for BNNS"""
diff --git a/tests/python/contrib/test_bnns/infrastructure.py b/tests/python/contrib/test_bnns/infrastructure.py
new file mode 100644
index 000000000000..0107de54a04f
--- /dev/null
+++ b/tests/python/contrib/test_bnns/infrastructure.py
@@ -0,0 +1,330 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from itertools import zip_longest, combinations
+import json
+import os
+import warnings
+
+import numpy as np
+
+import tvm
+from tvm import relay
+from tvm import rpc
+from tvm.contrib import graph_runtime
+from tvm.relay.op.contrib.bnns import partition_for_bnns
+from tvm.contrib import utils
+from tvm.autotvm.measure import request_remote
+from tvm.relay.analysis import analysis
+
+
+class Device:
+    """
+    Common device configuration for python tests.
+
+    Check tests/python/contrib/arm_compute_lib/ for the presence of an test_config.json file.
+    This file can be used to override the default configuration here which will attempt to run the BNNS
+    runtime tests locally if the runtime is available. Changing the configuration will allow these
+    runtime tests to be offloaded to a remote device with BNNS via a tracker for example.
+
+    Notes
+    -----
+        The test configuration will be loaded once when the the class is created. If the configuration
+        changes between tests, any changes will not be picked up.
+
+
+    Attributes
+    ----------
+    connection_type : str
+        Details the type of RPC connection to use. Options:
+        local - Use the local device,
+        tracker - Connect to a tracker to request a remote device,
+        remote - Connect to a remote device directly.
+    host : str
+        Specify IP address or hostname of remote target.
+    port : int
+        Specify port number of remote target.
+    target : str
+        The compilation target.
+    device_key : str
+        The device key of the remote target. Use when connecting to a remote device via a tracker.
+    cross_compile : str
+        Specify path to cross compiler to use when connecting a remote device from a non-arm platform.
+    """
+
+    connection_type = "local"
+    host = "localhost"
+    port = 9090
+    target = "llvm"
+    device_key = ""
+    cross_compile = ""
+
+    def __init__(self):
+        """Keep remote device for lifetime of object."""
+        self.device = self._get_remote()
+
+    @classmethod
+    def _get_remote(cls):
+        """Get a remote (or local) device to use for testing."""
+        if cls.connection_type == "tracker":
+            device = request_remote(cls.device_key, cls.host, cls.port, timeout=1000)
+        elif cls.connection_type == "remote":
+            device = rpc.connect(cls.host, cls.port)
+        elif cls.connection_type == "local":
+            device = rpc.LocalSession()
+        else:
+            raise ValueError(
+                "connection_type in test_config.json should be one of: " "local, tracker, remote."
+            )
+
+        return device
+
+    @classmethod
+    def load(cls, file_name):
+        """Load test config
+
+        Load the test configuration by looking for file_name relative
+        to the test_bnns directory.
+        """
+        location = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
+        config_file = os.path.join(location, file_name)
+        if not os.path.exists(config_file):
+            warnings.warn("Config file doesn't exist, resuming tests with default config.")
+            return
+        with open(config_file, mode="r") as config:
+            test_config = json.load(config)
+
+        cls.connection_type = test_config["connection_type"]
+        cls.host = test_config["host"]
+        cls.port = test_config["port"]
+        cls.target = test_config["target"]
+        cls.device_key = test_config.get("device_key") or ""
+        cls.cross_compile = test_config.get("cross_compile") or ""
+
+
+Device.target = "llvm"
+
+
+def skip_runtime_test():
+    """Skip test if it requires the runtime and it's not present."""
+    # BNNS codegen not present.
+    if not tvm.get_global_func("relay.ext.bnns", True):
+        print("Skip because BNNS codegen is not available.")
+        return True
+    return False
+
+
+def skip_codegen_test():
+    """Skip test if it requires the BNNS codegen and it's not present."""
+    if not tvm.get_global_func("relay.ext.bnns", True):
+        print("Skip because BNNS codegen is not available.")
+        return True
+
+
+def build_module(mod, target, params=None, enable_bnns=True, tvm_ops=0):
+    """Build module with option to build for BNNS."""
+    if isinstance(mod, tvm.relay.expr.Call):
+        mod = tvm.IRModule.from_expr(mod)
+    with tvm.transform.PassContext(opt_level=3):
+        if enable_bnns:
+            mod = partition_for_bnns(mod)
+        relay.backend.compile_engine.get().clear()
+        return relay.build(mod, target=target, target_host=target, params=params)
+
+
+def build_and_run(
+    mod,
+    inputs,
+    outputs,
+    params,
+    device,
+    enable_bnns=True,
+    no_runs=1,
+    tvm_ops=0,
+    config=None,
+):
+    """Build and run the relay module."""
+    if config is None:
+        config = {}
+
+    try:
+        lib = build_module(mod, device.target, params, enable_bnns, tvm_ops)
+    except Exception as e:
+        err_msg = "The module could not be built.\n"
+        if config:
+            err_msg += f"The test failed with the following parameters: {config}\n"
+        err_msg += str(e)
+        raise Exception(err_msg)
+
+    lib = update_lib(lib, device.device, device.cross_compile)
+    gen_module = graph_runtime.GraphModule(lib["default"](device.device.cpu(0)))
+    gen_module.set_input(**inputs)
+    out = []
+    for _ in range(no_runs):
+        gen_module.run()
+        out.append([gen_module.get_output(i) for i in range(outputs)])
+    return out
+
+
+def update_lib(lib, device, cross_compile):
+    """Export the library to the remote/local device."""
+    lib_name = "mod.so"
+    temp = utils.tempdir()
+    lib_path = temp.relpath(lib_name)
+    if cross_compile:
+        lib.export_library(lib_path, cc=cross_compile)
+    else:
+        lib.export_library(lib_path)
+    device.upload(lib_path)
+    lib = device.load_module(lib_name)
+    return lib
+
+
+def extract_bnns_modules(module):
+    """Get the BNNS module(s) from llvm module."""
+    return list(filter(lambda mod: mod.type_key == "bnns_json", module.get_lib().imported_modules))
+
+
+def verify(answers, atol, rtol, verify_saturation=False, config=None):
+    """Compare the array of answers. Each entry is a list of outputs."""
+    if config is None:
+        config = {}
+
+    if len(answers) < 2:
+        raise RuntimeError(f"No results to compare: expected at least two, found {len(answers)}")
+    for answer in zip_longest(*answers):
+        for outs in combinations(answer, 2):
+            try:
+                if verify_saturation:
+                    assert (
+                        np.count_nonzero(outs[0].asnumpy() == 255) < 0.25 * outs[0].asnumpy().size
+                    ), "Output is saturated: {}".format(outs[0])
+                    assert (
+                        np.count_nonzero(outs[0].asnumpy() == 0) < 0.25 * outs[0].asnumpy().size
+                    ), "Output is saturated: {}".format(outs[0])
+                tvm.testing.assert_allclose(
+                    outs[0].asnumpy(), outs[1].asnumpy(), rtol=rtol, atol=atol
+                )
+            except AssertionError as e:
+                err_msg = "Results not within the acceptable tolerance.\n"
+                if config:
+                    err_msg += f"The test failed with the following parameters: {config}\n"
+                err_msg += str(e)
+                raise AssertionError(err_msg)
+
+
+def verify_codegen(
+    module,
+    known_good_codegen,
+    num_bnns_modules,
+    tvm_ops=0,
+    target=Device.target,
+):
+    """Check BNNS codegen against a known good output."""
+    module = build_module(module, target, tvm_ops=tvm_ops)
+    bnns_modules = extract_bnns_modules(module)
+
+    assert len(bnns_modules) == num_bnns_modules, (
+        f"The number of BNNS modules produced ({len(bnns_modules)}) does not "
+        f"match the expected value ({num_bnns_modules})."
+    )
+
+    for mod in bnns_modules:
+        source = mod.get_source("json")
+        codegen = json.loads(source)["nodes"]
+        # remove input and const names as these cannot be predetermined
+        for node in range(len(codegen)):
+            if codegen[node]["op"] == "input" or codegen[node]["op"] == "const":
+                codegen[node]["name"] = ""
+        codegen_str = json.dumps(codegen, sort_keys=True, indent=2)
+        known_good_codegen_str = json.dumps(known_good_codegen, sort_keys=True, indent=2)
+
+        assert codegen_str == known_good_codegen_str, (
+            f"The JSON produced by codegen does not match the expected result. \n"
+            f"Actual={codegen_str} \n"
+            f"Expected={known_good_codegen_str}"
+        )
+
+
+def compare_inference_with_ref(func, params, atol=0.002, rtol=0.007):
+    """Compare scoring results for compilation with and without BNNS.
+
+    Provided function will be compiled two times with and without BNNS.
+    The scoring results for both type of compilation will be compared
+    with provided atol and rtol. The input data will be automatically
+    generated based of shape and dtype info provided for var nodes.
+
+    """
+    # Generate input tensor values
+    inputs = {}
+    for free_param in analysis.free_vars(func):
+        name = free_param.name_hint
+        dtype = free_param.type_annotation.dtype
+        shape = [s.value for s in free_param.type_annotation.shape]
+        inputs[name] = tvm.nd.array(np.random.uniform(0, 127, shape).astype(dtype))
+
+    # Run for both type of compilation
+    device = Device()
+    outputs = []
+    for bnns in [False, True]:
+        outputs.append(build_and_run(func, inputs, 1, params, device, enable_bnns=bnns)[0])
+
+    # Compare result tensors
+    verify(outputs, atol=atol, rtol=rtol)
+
+
+def generate_trials(space, r_factor=3):
+    """Generates a series of trials.
+
+    This algorithm generates a series of non-deterministic trials given a
+    space of options to test. A trial is generated by pulling a value from
+    each option in the space. On some occasions the values are shuffled to
+    ensure a different trial on each r_factor iteration. The algorithm ensures
+    that each value from an option is used at least once. The total number of
+    trials is determined by the r_factor * the option with the largest number
+    of values.
+
+    Parameters
+    ----------
+    space: List[List[Any]]
+        A list of different options with varying values to test.
+    r_factor: Optional[int]
+        The repeat factor.
+
+    Returns
+    -------
+    result: List[Tuple]
+        A list of trials specifying values for each option.
+
+    """
+    np.random.seed(0)
+    max_len = 1
+    for option in space:
+        max_len = max(max_len, len(option))
+
+    num_trials = r_factor * max_len
+    trials = []
+    for i in range(num_trials):
+        trial = []
+        for option in space:
+            if i % len(option) == 0:
+                np.random.shuffle(option)
+            trial.append(option[i % len(option)])
+
+        trials.append(trial)
+
+    return trials
diff --git a/tests/python/contrib/test_bnns/test_conv2d.py b/tests/python/contrib/test_bnns/test_conv2d.py
new file mode 100644
index 000000000000..886958cf3076
--- /dev/null
+++ b/tests/python/contrib/test_bnns/test_conv2d.py
@@ -0,0 +1,177 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""BNNS integration conv2d tests."""
+
+import numpy as np
+import pytest
+import tvm
+from tvm import relay
+
+from .infrastructure import skip_runtime_test, compare_inference_with_ref, generate_trials
+
+# TODO: Missed cases
+#   1. Bias as add with 3d const tensor. Lead to additional unsqueeze op between
+#   2. Check unsupported cases of fusion. Like bias add with axis != 1, add with broadcast by spatial dims
+#   3. Check if bias/weights is not constants. Should fallback into LLVM or decompose it
+#   4. Check if bias/weights is constants expr. Should works somehow.
+
+
+def _get_model(
+    shape,
+    kernel=(3, 3),
+    padding=(1, 1),
+    strides=(1, 1),
+    dilation=(1, 1),
+    groups=1,
+    dtype="float32",
+    channels=-1,  # -1 means same as input channels
+    bias_type="none",
+    activation_type="none",
+):
+    """Return a model and any parameters it may have"""
+    if channels == -1:
+        channels = shape[1]
+
+    a = relay.var("a", shape=shape, dtype=dtype)
+    weight_shape = (channels, shape[1] // groups, *kernel)
+    w = tvm.nd.array(np.random.uniform(-128, 127, weight_shape).astype(dtype))
+    weights = relay.const(w, dtype)
+    out = relay.nn.conv2d(
+        a,
+        weights,
+        kernel_size=kernel,
+        dilation=dilation,
+        strides=strides,
+        padding=padding,
+        groups=groups,
+        channels=channels,
+        out_dtype=dtype,
+    )
+    params = {"w": w}
+    if bias_type == "bias_add":
+        b = tvm.nd.array(np.random.uniform(-10, 10, weight_shape[0]).astype(dtype))
+        biasc = relay.const(b, dtype)
+        out = relay.nn.bias_add(out, biasc, axis=1)
+        params["b"] = b
+    elif bias_type == "add_3d" or bias_type == "add_4d":
+        bias_shape = (
+            (weight_shape[0], 1, 1) if bias_type == "add_3d" else (1, weight_shape[0], 1, 1)
+        )
+        b = tvm.nd.array(np.random.uniform(-10, 10, bias_shape).astype(dtype))
+        biasc = relay.const(b, dtype)
+        out = relay.add(out, biasc)
+        params["b"] = b
+
+    if activation_type == "relu":
+        out = relay.nn.relu(out)
+    elif activation_type == "sigmoid":
+        out = relay.op.sigmoid(out)
+    return out, params
+
+
+@pytest.mark.skipif(skip_runtime_test(), reason="Skip because BNNS codegen is not available")
+def test_conv2d():
+    np.random.seed(0)
+
+    kernel_hs = [1, 2, 3, 5]
+    kernel_ws = [1, 2, 3, 5]
+    pad = [(1, 1), (2, 2), (2, 1)]
+    strides = [(1, 1), (2, 2)]
+    dilation = [(1, 1)]
+    out_channels = [1, 4, 8, 16]
+    input_shapes = [(10, 10, 14), (12, 15, 16), (20, 20, 20)]
+    batches = [1, 2]
+    groups = [1, 2]
+    bias_kind = ["none", "add_3d", "add_4d", "bias.add"]
+    activation_kind = ["none", "relu", "sigmoid"]
+    trials = generate_trials(
+        [
+            kernel_hs,
+            kernel_ws,
+            pad,
+            strides,
+            dilation,
+            out_channels,
+            input_shapes,
+            groups,
+            batches,
+            bias_kind,
+            activation_kind,
+        ],
+        3,
+    )
+
+    for (
+        kernel_h,
+        kernel_w,
+        pad,
+        stride,
+        dilation,
+        out_channels,
+        input_shapes,
+        group,
+        batch,
+        bias,
+        activation,
+    ) in trials:
+        if out_channels % group != 0:
+            continue
+        func, params = _get_model(
+            shape=(batch, *input_shapes),
+            kernel=(kernel_h, kernel_w),
+            padding=pad,
+            strides=stride,
+            dilation=dilation,
+            groups=group,
+            channels=out_channels,
+            bias_type=bias,
+            activation_type=activation,
+        )
+        compare_inference_with_ref(func, params)
+
+
+@pytest.mark.skipif(skip_runtime_test(), reason="Skip because BNNS codegen is not available")
+def test_conv2d_dw():
+    if skip_runtime_test():
+        return
+
+    np.random.seed(0)
+    shape = [4, 5, 5]
+
+    for batch in [1, 2]:
+        mod, params = _get_model(shape=(batch, *shape), groups=shape[0])
+        compare_inference_with_ref(mod, params)
+
+
+@pytest.mark.skipif(skip_runtime_test(), reason="Skip because BNNS codegen is not available")
+def test_conv2d_with_oc1():
+    if skip_runtime_test():
+        return
+
+    np.random.seed(0)
+    shape = [3, 5, 5]
+
+    for batch in [1, 2]:
+        for bias in ["none", "add_4d"]:
+            mod, params = _get_model(shape=(batch, *shape), channels=1, bias_type=bias)
+            compare_inference_with_ref(mod, params)
+
+
+if __name__ == "__main__":
+    test_conv2d()
+    test_conv2d_dw()
+    test_conv2d_with_oc1()
diff --git a/tests/python/contrib/test_bnns/test_conv2d_patterns.py b/tests/python/contrib/test_bnns/test_conv2d_patterns.py
new file mode 100644
index 000000000000..b10504bbc961
--- /dev/null
+++ b/tests/python/contrib/test_bnns/test_conv2d_patterns.py
@@ -0,0 +1,107 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""BNNS pattern detection check"""
+
+import tvm
+from tvm import relay
+import numpy as np
+
+from tvm.relay.op.contrib.bnns import partition_for_bnns
+
+fp32 = "float32"
+
+
+def partition(exp):
+    """Apply BNNS specific partitioning transformation"""
+    mod = tvm.IRModule.from_expr(exp)
+    with tvm.transform.PassContext(opt_level=3):
+        mod = partition_for_bnns(mod)
+    return mod
+
+
+def is_op_fused(func, op_name):
+    is_fused = False
+
+    def visit(op):
+        if (
+            isinstance(op, tvm.relay.function.Function)
+            and op_name in op.attrs["PartitionedFromPattern"]
+        ):
+            nonlocal is_fused
+            is_fused = True
+
+    tvm.relay.analysis.post_order_visit(func.body, visit)
+    return is_fused
+
+
+def test_pattern_conv2d_with_bias_add():
+    for axis in (1, 2):
+        a = relay.var("a", shape=(2, 7, 8, 8), dtype=fp32)
+        w = relay.const(np.random.uniform(-10, 10, (8, 7, 3, 3)).astype(fp32))
+        res = relay.nn.conv2d(a, w, kernel_size=(3, 3), padding=(1, 1), channels=8, out_dtype=fp32)
+        b = relay.const(np.random.uniform(-10, 10, 8).astype(fp32))
+        res = relay.nn.bias_add(res, b, axis=axis)
+
+        mod = partition(res)
+        bias_is_fused = is_op_fused(mod["bnns_0"], "nn.bias_add")
+
+        assert bias_is_fused if axis == 1 else not bias_is_fused
+
+
+def test_pattern_conv2d_with_add():
+    workloads = {8: False, (8, 1): False, (8, 1, 1): True, (1, 8, 1, 1): True}
+
+    for b_shape, should_be_fused in workloads.items():
+        a = relay.var("a", shape=(2, 7, 8, 8), dtype=fp32)
+        w = relay.const(np.random.uniform(-10, 10, (8, 7, 3, 3)).astype(fp32))
+        res = relay.nn.conv2d(a, w, kernel_size=(3, 3), padding=(1, 1), channels=8, out_dtype=fp32)
+        b = relay.const(np.random.uniform(-10, 10, b_shape).astype(fp32))
+        res = relay.add(res, b)
+
+        mod = partition(res)
+        bias_is_fused = is_op_fused(mod["bnns_0"], "add")
+
+        assert bias_is_fused == should_be_fused
+
+
+def test_pattern_conv2d_with_non_cons_weights():
+    for const_weights in (True, False):
+        a = relay.var("a", shape=(2, 7, 8, 8), dtype=fp32)
+        if const_weights:
+            w = relay.const(np.random.uniform(-10, 10, (8, 7, 3, 3)).astype(fp32))
+        else:
+            w = relay.var("w", shape=(8, 7, 3, 3), dtype=fp32)
+
+        res = relay.nn.conv2d(a, w, kernel_size=(3, 3), padding=(1, 1), channels=8, out_dtype=fp32)
+
+        mod = partition(res)
+        use_bnns = len(mod.get_global_vars()) == 2  # GlobalVar: "main" and "bnns_0"
+
+        assert use_bnns == const_weights
+
+
+def test_pattern_conv2d_with_non_cons_bias():
+    a = relay.var("a", shape=[2, 7, 8, 8], dtype=fp32)
+    w = relay.const(np.random.uniform(-10, 10, (8, 7, 3, 3)).astype(fp32))
+    res = relay.nn.conv2d(a, w, kernel_size=(3, 3), padding=(1, 1), channels=8, out_dtype=fp32)
+    b = relay.var("b", shape=[8], dtype=fp32)
+    res = relay.nn.bias_add(res, b, axis=1)
+
+    mod = partition(res)
+    bias_is_fused = is_op_fused(mod["bnns_0"], "nn.bias_add")
+
+    assert not bias_is_fused
diff --git a/tests/python/contrib/test_bnns/test_dense.py b/tests/python/contrib/test_bnns/test_dense.py
new file mode 100644
index 000000000000..c2cf9bf71373
--- /dev/null
+++ b/tests/python/contrib/test_bnns/test_dense.py
@@ -0,0 +1,190 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""BNNS integration dense tests."""
+
+import numpy as np
+import math
+import pytest
+import tvm
+from tvm import relay
+from .infrastructure import (
+    Device,
+    skip_runtime_test,
+    skip_codegen_test,
+    build_and_run,
+    verify,
+    verify_codegen,
+    generate_trials,
+)
+
+
+def _get_model(shape, weight_shape, units, dtype, var_names, has_bias=False, has_gelu=False):
+    """Return a model and any parameters it may have"""
+    a = relay.var(next(var_names), shape=shape, dtype=dtype)
+    w = tvm.nd.array(np.random.uniform(-128, 127, weight_shape).astype(dtype))
+    weights = relay.const(w, dtype)
+    out = relay.nn.dense(a, weights, units=units, out_dtype=dtype)
+    params = {"w": w}
+    if has_bias:
+        b = tvm.nd.array(np.random.randint(-128, 127, weight_shape[0]).astype(dtype))
+        biasc = relay.const(b, dtype)
+        out = relay.op.add(out, biasc)
+        params["b"] = b
+    if has_gelu:
+        const1 = relay.const(0.044715)
+        const2 = relay.const(math.sqrt(2 / math.pi))
+        bias = out
+        out = relay.op.power(bias, relay.const(3.0, "float32"))
+        out = relay.op.multiply(out, const1)
+        out = relay.op.add(out, bias)
+        out = relay.op.multiply(out, const2)
+        out = relay.op.tanh(out)
+        out = relay.op.add(out, relay.const(1, "float32"))
+        out = relay.op.multiply(out, relay.const(0.5))
+        out = relay.op.multiply(out, bias)
+    return out, params
+
+
+def _get_expected_codegen(shape, weight_shape, units, dtype, has_bias=False, has_gelu=False):
+    output_shape = (shape[0], units)
+    name = "nn.dense"
+    if has_bias is True:
+        name = "bnns.dense_bias"
+    if has_bias is True and has_gelu is True:
+        name = "bnns.dense_bias_gelu"
+
+    node = {
+        "op": "kernel",
+        "name": name,
+        "inputs": [],
+        "attrs": {
+            "num_outputs": "1",
+            "out_dtype": [["float32"]],
+            "shape": [[list(output_shape)]],
+            "dtype": [[dtype]],
+            "units": [[str(units)]],
+        },
+    }
+
+    inputs = [
+        {"op": "input", "name": "", "attrs": {"shape": [[list(shape)]], "dtype": [[str(dtype)]]}},
+        {
+            "op": "const",
+            "name": "",
+            "attrs": {"shape": [[list(weight_shape)]], "dtype": [[str(dtype)]]},
+        },
+    ]
+
+    if has_bias:
+        inputs.append(
+            {
+                "op": "const",
+                "name": "",
+                "attrs": {"shape": [[[weight_shape[0]]]], "dtype": [["float32"]]},
+            }
+        )
+
+    input_idx = 0
+    for _ in range(len(inputs)):
+        node["inputs"].append([input_idx, 0, 0])
+        input_idx += 1
+    node["attrs"]["num_inputs"] = str(len(inputs))
+    inputs.append(node)
+    return inputs
+
+
+@pytest.mark.skipif(skip_runtime_test(), reason="Skip because BNNS codegen is not available")
+def test_dense():
+    device = Device()
+    np.random.seed(0)
+
+    dtype = ["float32"]
+    shape = [
+        ((1, 128), (16, 128), 16),
+        ((32, 32), (32, 32), 32),
+        ((1, 64), (1, 64), 1),
+        ((11, 2), (2, 2), 2),
+        ((2, 2), (1, 2), 1),
+    ]
+    composite = [False, True]
+    trials = generate_trials([dtype, shape, composite, composite], 3)
+
+    for dtype, (shape, weight_shape, units), with_bias, with_gelu in trials:
+        outputs = []
+        inputs = {"a": tvm.nd.array(np.random.uniform(-128, 127, shape).astype(dtype))}
+        func, params = _get_model(
+            shape,
+            weight_shape,
+            units,
+            dtype,
+            var_names=iter(inputs),
+            has_bias=with_bias,
+            has_gelu=with_gelu,
+        )
+        for bnns in [False, True]:
+            outputs.append(
+                build_and_run(
+                    func,
+                    inputs,
+                    1,
+                    params,
+                    device,
+                    enable_bnns=bnns,
+                )[0]
+            )
+
+        config = {
+            "shape": shape,
+            "weight_shape": weight_shape,
+            "units": units,
+            "dtype": dtype,
+            "with_bias": with_bias,
+            "with_gelu": with_gelu,
+        }
+        verify(outputs, atol=0.001, rtol=0.01, config=config)
+
+
+@pytest.mark.skipif(skip_codegen_test(), reason="Skip because BNNS codegen is not available")
+def test_codegen_dense():
+    np.random.seed(0)
+
+    dtype = ["float32"]
+    shape = [
+        ((1, 128), (16, 128), 16),
+        ((32, 32), (32, 32), 32),
+        ((1, 64), (1, 64), 1),
+        ((11, 2), (2, 2), 2),
+        ((2, 2), (1, 2), 1),
+    ]
+    composite = [False, True]
+    trials = generate_trials([dtype, shape, composite, composite], 3)
+
+    for dtype, (shape, weight_shape, units), with_bias, with_gelu in trials:
+        inputs = {"a"}
+
+        args = (shape, weight_shape, units, dtype)
+
+        func, params = _get_model(
+            *args, var_names=iter(inputs), has_bias=with_bias, has_gelu=with_gelu
+        )
+        exp_codegen = _get_expected_codegen(*args, has_bias=with_bias, has_gelu=with_gelu)
+        verify_codegen(func, exp_codegen, 1)
+
+
+if __name__ == "__main__":
+    test_dense()
+    test_codegen_dense()
diff --git a/tests/python/contrib/test_bnns/test_matmul.py b/tests/python/contrib/test_bnns/test_matmul.py
new file mode 100644
index 000000000000..7bf4d48f8e88
--- /dev/null
+++ b/tests/python/contrib/test_bnns/test_matmul.py
@@ -0,0 +1,113 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""BNNS integration dense tests."""
+
+import numpy as np
+import math
+import pytest
+import tvm
+from tvm import relay
+from tvm import testing
+from .infrastructure import (
+    Device,
+    skip_runtime_test,
+    skip_codegen_test,
+    verify_codegen,
+    build_and_run,
+    verify,
+    generate_trials,
+)
+
+
+def _get_model(a_shape, b_shape, dtype, var_names, is_a_constant=False, is_b_constant=False):
+    """Return a model and any parameters it may have"""
+    a = relay.var(next(var_names), shape=a_shape, dtype=dtype)
+    b = relay.var(next(var_names), shape=b_shape, dtype=dtype)
+    params = {}
+    if is_b_constant is True:
+        b = tvm.nd.array(np.random.uniform(-128, 127, b_shape).astype(dtype))
+        params["b"] = b
+        b = relay.const(b, dtype)
+    if is_a_constant is True:
+        a = tvm.nd.array(np.random.uniform(-128, 127, a_shape).astype(dtype))
+        params["a"] = a
+        a = relay.const(a, dtype)
+    out = relay.nn.batch_matmul(a, b)
+    return out, params
+
+
+@pytest.mark.skipif(skip_runtime_test(), reason="Skip because BNNS codegen is not available")
+def test_matmul():
+    device = Device()
+    np.random.seed(0)
+    dtype = "float32"
+
+    # C[N, I, J] = A[N, I, K] * B[N, J, K]
+    shapes_config = [
+        # B, I, J, K
+        [1, 4, 4, 3],
+        [1, 16, 32, 32],
+        [2, 1, 1, 3],
+        [2, 16, 32, 32],
+        [5, 1, 1, 3],
+    ]
+    data_config = [
+        # A_is_constant, B_is_constant
+        [False, True],
+        [True, False],
+        [False, False],
+    ]
+
+    for N, I, J, K in shapes_config:
+        a_shape = [N, I, K]
+        b_shape = [N, J, K]
+        for is_a_constant, is_b_constant in data_config:
+            outputs = []
+            inputs = {
+                "a": tvm.nd.array(np.random.uniform(-128, 127, a_shape).astype(dtype)),
+                "b": tvm.nd.array(np.random.uniform(-128, 127, b_shape).astype(dtype)),
+            }
+            func, params = _get_model(
+                a_shape,
+                b_shape,
+                dtype,
+                var_names=iter(inputs),
+                is_a_constant=is_a_constant,
+                is_b_constant=is_b_constant,
+            )
+            for enable_bnns in [False, True]:
+                outputs.append(
+                    build_and_run(
+                        func,
+                        inputs,
+                        1,
+                        params,
+                        device,
+                        enable_bnns=enable_bnns,
+                    )[0]
+                )
+
+            config = {
+                "a_shape": a_shape,
+                "b_shape": b_shape,
+                "dtype": dtype,
+            }
+            verify(outputs, atol=0.001, rtol=0.01, config=config)
+
+
+if __name__ == "__main__":
+    test_matmul()
diff --git a/tests/python/contrib/test_bnns/test_normalization.py b/tests/python/contrib/test_bnns/test_normalization.py
new file mode 100644
index 000000000000..094cfb041c3c
--- /dev/null
+++ b/tests/python/contrib/test_bnns/test_normalization.py
@@ -0,0 +1,201 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""BNNS integration normalization tests."""
+
+import numpy as np
+import math
+import pytest
+import tvm
+from tvm import relay
+from tvm import testing
+from .infrastructure import (
+    Device,
+    skip_runtime_test,
+    skip_codegen_test,
+    verify_codegen,
+    build_and_run,
+    verify,
+    generate_trials,
+)
+
+
+def _get_model(
+    shape, b_shape, s_shape, dtype, var_names, axis=1, epsilon=1e-5, center=True, scale=True
+):
+    """Return a model and any parameters it may have"""
+    src = relay.var(next(var_names), shape=shape, dtype=dtype)
+    params = {}
+    b = tvm.nd.array(np.random.uniform(-128, 127, b_shape).astype(dtype))
+    params["b"] = b
+    b = relay.const(b, dtype)
+    s = tvm.nd.array(np.random.uniform(-128, 127, b_shape).astype(dtype))
+    params["b"] = s
+    s = relay.const(s, dtype)
+    out = relay.nn.instance_norm(src, s, b, axis, epsilon, center, scale)
+
+    return out, params
+
+
+def _get_expected_codegen(shape, axis, center, scale, dtype, offload_on_bnns):
+    output_shape = shape
+    name = "nn.instance_norm"
+
+    node = {
+        "op": "kernel",
+        "name": name,
+        "inputs": [],
+        "attrs": {
+            "num_outputs": "1",
+            "axis": [[str(axis)]],
+            "center": [[str(int(center))]],
+            "scale": [[str(int(scale))]],
+            "shape": [[list(output_shape)]],
+            "dtype": [[dtype]],
+            "epsilon": [["1.0000000000000001e-05"]],
+        },
+    }
+
+    inputs = [
+        {"op": "input", "name": "", "attrs": {"shape": [[list(shape)]], "dtype": [[str(dtype)]]}},
+        {
+            "op": "const",
+            "name": "",
+            "attrs": {"shape": [[[shape[axis]]]], "dtype": [[str(dtype)]]},
+        },
+        {
+            "op": "const",
+            "name": "",
+            "attrs": {"shape": [[[shape[axis]]]], "dtype": [[str(dtype)]]},
+        },
+    ]
+
+    input_idx = 0
+    for _ in range(len(inputs)):
+        node["inputs"].append([input_idx, 0, 0])
+        input_idx += 1
+    node["attrs"]["num_inputs"] = str(len(inputs))
+    inputs.append(node)
+    return inputs
+
+
+@pytest.mark.skipif(skip_runtime_test(), reason="Skip because BNNS codegen is not available")
+def test_normalization():
+    device = Device()
+    np.random.seed(0)
+    dtype = "float32"
+
+    shapes_config = [
+        [1, 2, 3, 4],
+        [3, 2, 3, 4],
+        [2, 2, 3],
+        [16, 32, 32],
+        [5, 3],
+    ]
+    axes = [-1, 0, 1, 2]
+
+    for shape in shapes_config:
+        for axis in axes:
+            if len(shape) == 2 and axis != 0:
+                continue
+            for center in [False, True]:
+                for scale in [False, True]:
+                    outputs = []
+                    inputs = {
+                        "src": tvm.nd.array(np.random.uniform(-128, 127, shape).astype(dtype)),
+                    }
+                    func, params = _get_model(
+                        shape,
+                        [shape[axis]],
+                        [shape[axis]],
+                        dtype,
+                        var_names=iter(inputs),
+                        axis=axis,
+                        center=center,
+                        scale=scale,
+                    )
+                    for enable_bnns in [False, True]:
+                        outputs.append(
+                            build_and_run(
+                                func,
+                                inputs,
+                                1,
+                                params,
+                                device,
+                                enable_bnns=enable_bnns,
+                            )[0]
+                        )
+
+                    config = {
+                        "dtype": dtype,
+                    }
+                    verify(outputs, atol=0.001, rtol=0.01, config=config)
+
+
+@pytest.mark.skipif(skip_codegen_test(), reason="Skip because BNNS codegen is not available")
+def test_codegen_normalization():
+    np.random.seed(0)
+
+    dtype = "float32"
+    shapes_config = [
+        [1, 2, 3, 4],
+        [3, 2, 3, 4],
+        [2, 2, 3],
+        [16, 32, 32],
+        [5, 3],
+    ]
+    axes = [-1, 0, 1, 2]
+
+    def check_normalization(rank, axis):
+        if rank < 3 or rank > 4:
+            return False
+        if axis == 0 and rank == 3 or axis == 1 and rank == 4:
+            return True
+        return False
+
+    for shape in shapes_config:
+        for axis in axes:
+            if len(shape) == 2 and axis != 0:
+                continue
+            for center in [False, True]:
+                for scale in [False, True]:
+                    inputs = {"src"}
+
+                    args = (shape, axis, center, scale, dtype)
+
+                    func, params = _get_model(
+                        shape,
+                        [shape[axis]],
+                        [shape[axis]],
+                        dtype,
+                        var_names=iter(inputs),
+                        axis=axis,
+                        center=center,
+                        scale=scale,
+                    )
+
+                    offload_on_bnns = check_normalization(len(shape), axis)
+                    if offload_on_bnns is True:
+                        bnns_blocks = 1
+                    else:
+                        bnns_blocks = 0
+                    exp_codegen = _get_expected_codegen(*args, offload_on_bnns)
+                    verify_codegen(func, exp_codegen, bnns_blocks)
+
+
+if __name__ == "__main__":
+    test_normalization()
+    test_codegen_normalization()
diff --git a/tests/python/contrib/test_bnns/test_onnx_topologies.py b/tests/python/contrib/test_bnns/test_onnx_topologies.py
new file mode 100644
index 000000000000..86f98eb6e8de
--- /dev/null
+++ b/tests/python/contrib/test_bnns/test_onnx_topologies.py
@@ -0,0 +1,140 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""BNNS pattern detection check"""
+
+import pytest
+
+import tvm
+from tvm import relay
+from tvm.relay import transform
+from tvm.contrib import utils, graph_runtime
+from tvm.contrib.download import download_testdata
+from tvm.relay.op.contrib.bnns import partition_for_bnns
+
+import numpy as np
+
+pytest.importorskip("onnx")
+
+bnns_is_absent = tvm.get_global_func("relay.ext.bnns", True) is None
+
+TARGET = "llvm"
+INPUT_SHAPE = [1, 3, 224, 224]
+
+BASE_MODEL_URL = "https://github.com/onnx/models/raw/master/"
+MODEL_URL_COLLECTION = {
+    "BERT": "text/machine_comprehension/bert-squad/model/bertsquad-10.onnx",
+    "MobileNet-v2": "vision/classification/mobilenet/model/mobilenetv2-7.onnx",
+    "ResNet50-v1": "vision/classification/resnet/model/resnet50-v1-7.onnx",
+    "ResNet50-v2": "vision/classification/resnet/model/resnet50-v2-7.onnx",
+    "SqueezeNet-v1.1": "vision/classification/squeezenet/model/squeezenet1.1-7.onnx",
+    "SqueezeNet-v1.0": "vision/classification/squeezenet/model/squeezenet1.0-7.onnx",
+    "Inception-v1": "vision/classification/inception_and_googlenet/inception_v1/model/inception-v1-7.onnx",
+    "Inception-v2": "vision/classification/inception_and_googlenet/inception_v2/model/inception-v2-7.onnx",
+}
+
+
+def get_onnx_input_name(model):
+    inputs = [node.name for node in model.graph.input]
+    initializer = [node.name for node in model.graph.initializer]
+
+    inputs = list(set(inputs) - set(initializer))
+    return inputs
+
+
+def get_model_url(model_name):
+    return BASE_MODEL_URL + MODEL_URL_COLLECTION[model_name]
+
+
+def get_name_from_url(url):
+    return url[url.rfind("/") + 1 :].strip()
+
+
+def find_of_download(model_name):
+    model_url = get_model_url(model_name)
+    model_file_name = get_name_from_url(model_url)
+    return download_testdata(model_url, model_file_name, module="models")
+
+
+def get_model(model_name):
+    model_path = find_of_download(model_name)
+    onnx_model = onnx.load(model_path)
+    input_names = get_onnx_input_name(onnx_model)
+    input_dict = {}
+    for name in input_names:
+        input_dict[name] = INPUT_SHAPE  # TODO: hardcode
+    mod, params = relay.frontend.from_onnx(onnx_model, input_dict, freeze_params=True)
+    return mod, params, input_dict
+
+
+def simplify_model(mod):
+    """
+    Simplify execution graph
+
+    At least merge BatchNorm into convolution. For this purpose decompose BN primitive
+    into simple operation which can be calculated as const expr and after that merged
+    into nearest conv/dense primitive.
+    """
+    seq = tvm.transform.Sequential(
+        [
+            transform.InferType(),
+            transform.FoldConstant(),
+            transform.SimplifyInference(),
+            transform.FoldScaleAxis(),
+        ]
+    )
+    return seq(mod)
+
+
+def process(model_name):
+    temp = utils.tempdir()
+    model, params, input_dict = get_model(model_name)
+
+    def run(mod, target, simplify=True, with_bnns=False):
+        with tvm.transform.PassContext(opt_level=3):
+            if simplify:
+                mod = simplify_model(mod)
+            if with_bnns:
+                mod = partition_for_bnns(mod)
+            graph_module = relay.build(mod, target=target, target_host=target, params=params)
+
+        lib_name = "deploy.tar"
+        path_dso = temp.relpath(lib_name)
+        graph_module.export_library(path_dso)
+
+        ctx = tvm.cpu(0)
+        loaded_lib = tvm.runtime.load_module(path_dso)
+
+        module = graph_runtime.GraphModule(loaded_lib["default"](ctx))
+        module.run()
+        return module.get_output(0).asnumpy()
+
+    res_llvm = run(model, TARGET, simplify=True, with_bnns=False)
+    res_bnns = run(model, TARGET, simplify=True, with_bnns=True)
+
+    tvm.testing.assert_allclose(
+        res_llvm,
+        res_bnns,
+        atol=0.002,
+        rtol=0.007,
+    )
+
+
+@pytest.mark.skip(reason="Manually disabled because of huge complexity")
+@pytest.mark.skipif(bnns_is_absent, reason="BNNS runtime is absent")
+@pytest.mark.parametrize("model_name", MODEL_URL_COLLECTION.keys())
+def test_topology(model_name):
+    process(model_name)
diff --git a/tests/python/contrib/test_bnns/test_pooling.py b/tests/python/contrib/test_bnns/test_pooling.py
new file mode 100644
index 000000000000..77a78d4bf7e1
--- /dev/null
+++ b/tests/python/contrib/test_bnns/test_pooling.py
@@ -0,0 +1,289 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""BNNS integration pooling tests."""
+
+import numpy as np
+import pytest
+import tvm
+from tvm import relay
+from tvm import testing
+from .infrastructure import (
+    skip_runtime_test,
+    skip_codegen_test,
+    build_and_run,
+    verify,
+    verify_codegen,
+)
+from .infrastructure import Device
+
+
+def _calculate_output_shape(shape, sizes, padding, strides):
+    """Calculate pooling output shape."""
+    output_height = ((shape[2] - sizes[0] + padding[0] + padding[2]) / strides[0]) + 1
+    output_width = ((shape[3] - sizes[1] + padding[1] + padding[3]) / strides[1]) + 1
+    return 1, shape[1], int(output_height), int(output_width)
+
+
+def _get_pooling_model(
+    shape, dtype, typef, sizes, strides, padding, ceil_mode, count_include_pad, var_names
+):
+    """Return a model and any parameters it may have."""
+    if len(padding) == 2:
+        padding = (padding[0], padding[1], padding[0], padding[1])
+    out = relay.var(next(var_names), shape=shape, dtype=dtype)
+
+    if typef == "nn.max_pool2d":
+        out = relay.nn.max_pool2d(
+            out,
+            pool_size=sizes,
+            strides=strides,
+            padding=padding,
+            ceil_mode=ceil_mode,
+        )
+    elif typef == "nn.avg_pool2d":
+        out = relay.nn.avg_pool2d(
+            out,
+            pool_size=sizes,
+            strides=strides,
+            padding=padding,
+            ceil_mode=ceil_mode,
+            count_include_pad=count_include_pad,
+        )
+    else:
+        raise ValueError("Function not supported")
+
+    return out
+
+
+def _get_global_pooling_model(shape, dtype, typef, var_names):
+    """Return a model and any parameters it may have."""
+    out = relay.var(next(var_names), shape=shape, dtype=dtype)
+
+    if typef == "nn.global_max_pool2d":
+        out = relay.nn.global_max_pool2d(out)
+    elif typef == "nn.global_avg_pool2d":
+        out = relay.nn.global_avg_pool2d(out)
+    else:
+        raise ValueError("Function not supported")
+
+    return out
+
+
+def _get_expected_pooling_codegen(
+    shape, dtype, typef, sizes, strides, padding, ceil_mode, count_include_pad
+):
+    if len(padding) == 2:
+        padding = (padding[0], padding[1], padding[0], padding[1])
+    output_shape = _calculate_output_shape(shape, sizes, padding, strides)
+
+    node = {
+        "op": "kernel",
+        "name": typef,
+        "inputs": [[0, 0, 0]],
+        "attrs": {
+            "num_inputs": "1",
+            "num_outputs": "1",
+            "layout": [["NCHW"]],
+            "shape": [[list(output_shape)]],
+            "dtype": [[dtype]],
+            "padding": [[str(p) for p in padding]],
+            "strides": [[str(s) for s in strides]],
+            "pool_size": [[str(s) for s in sizes]],
+            "ceil_mode": [[str(1 if ceil_mode else 0)]],
+        },
+    }
+
+    if typef == "nn.avg_pool2d" or typef == "nn.l2_pool2d":
+        node["attrs"]["count_include_pad"] = [["1" if count_include_pad else "0"]]
+
+    input = {"op": "input", "name": "", "attrs": {"shape": [[list(shape)]], "dtype": [[dtype]]}}
+    return [input, node]
+
+
+def _get_expected_global_pooling_codegen(shape, dtype, typef):
+    node = {
+        "op": "kernel",
+        "name": typef,
+        "inputs": [[0, 0, 0]],
+        "attrs": {
+            "num_inputs": "1",
+            "num_outputs": "1",
+            "layout": [["NCHW"]],
+            "shape": [[[1, shape[1], 1, 1]]],
+            "dtype": [[dtype]],
+        },
+    }
+
+    input = {"op": "input", "name": "", "attrs": {"shape": [[list(shape)]], "dtype": [[dtype]]}}
+    return [input, node]
+
+
+@pytest.mark.skipif(skip_runtime_test(), reason="Skip because BNNS codegen is not available")
+def test_pooling():
+    device = Device()
+    np.random.seed(0)
+
+    dtype = "float32"
+    trials = [
+        ["nn.max_pool2d", (3, 3), (2, 2), (0, 0), False, False, (27, 27, 512)],
+        ["nn.max_pool2d", (2, 2), (2, 2), (0, 0), False, True, (16, 16, 16)],
+        ["nn.max_pool2d", (3, 3), (2, 2), (1, 1), True, True, (15, 15, 16)],
+        ["nn.max_pool2d", (2, 2), (2, 2), (0, 1), False, False, (16, 16, 16)],
+        ["nn.avg_pool2d", (2, 2), (2, 2), (1, 1), False, False, (16, 16, 16)],
+        ["nn.avg_pool2d", (2, 2), (2, 2), (0, 0), False, True, (16, 16, 16)],
+        ["nn.avg_pool2d", (3, 3), (2, 2), (0, 1), True, False, (15, 15, 16)],
+    ]
+
+    for (
+        typef,
+        size,
+        stride,
+        pad,
+        ceil_mode,
+        count_include_pad,
+        input_shape,
+    ) in trials:
+        shape = (1, *input_shape)
+        outputs = []
+        inputs = {
+            "a": tvm.nd.array(np.random.uniform(-127, 128, shape).astype(dtype)),
+        }
+
+        func = _get_pooling_model(
+            shape, dtype, typef, size, stride, pad, ceil_mode, count_include_pad, iter(inputs)
+        )
+
+        config = {
+            "size": size,
+            "stride": stride,
+            "shape": shape,
+            "pooling type": typef,
+            "dtype": dtype,
+            "padding": pad,
+            "ceil_mode": ceil_mode,
+            "count_include_pad": count_include_pad,
+            "inputs": inputs,
+        }
+
+        params = None
+        for enable_bnns in [False, True]:
+            outputs.append(
+                build_and_run(
+                    func, inputs, 1, params, device, enable_bnns=enable_bnns, config=config
+                )[0]
+            )
+
+        verify(outputs, atol=0.001, rtol=0.001, config=config)
+
+
+@pytest.mark.skipif(skip_runtime_test(), reason="Skip because BNNS codegen is not available")
+def test_global_pooling():
+    device = Device()
+    np.random.seed(0)
+
+    dtype = "float32"
+
+    trials = [
+        ["nn.global_max_pool2d", (8, 8, 16)],
+        ["nn.global_max_pool2d", (9, 9, 16)],
+        ["nn.global_max_pool2d", (8, 8, 16)],
+        ["nn.global_avg_pool2d", (8, 8, 16)],
+        ["nn.global_avg_pool2d", (8, 8, 16)],
+        ["nn.global_avg_pool2d", (9, 9, 16)],
+    ]
+
+    for typef, input_shape in trials:
+        shape = (1, *input_shape)
+        outputs = []
+        inputs = {
+            "a": tvm.nd.array(np.random.uniform(-127, 128, shape).astype(dtype)),
+        }
+
+        func = _get_global_pooling_model(shape, dtype, typef, iter(inputs))
+        config = {
+            "shape": shape,
+            "pooling type": typef,
+            "dtype": dtype,
+        }
+
+        for enable_bnns in [False, True]:
+            outputs.append(
+                build_and_run(
+                    func, inputs, 1, None, device, enable_bnns=enable_bnns, config=config
+                )[0]
+            )
+
+        verify(outputs, atol=0.001, rtol=0.001, config=config)
+
+
+@pytest.mark.skipif(skip_codegen_test(), reason="Skip because BNNS codegen is not available")
+def test_codegen_pooling():
+    dtype = "float32"
+
+    trials = [
+        ["nn.max_pool2d", (2, 2), (2, 2), (0, 0), False, True, (16, 16, 16)],
+        ["nn.max_pool2d", (3, 3), (2, 2), (1, 1), True, True, (15, 15, 16)],
+        ["nn.max_pool2d", (2, 2), (2, 2), (0, 1), False, False, (16, 16, 16)],
+        ["nn.avg_pool2d", (2, 2), (2, 2), (1, 1), False, False, (16, 16, 16)],
+        ["nn.avg_pool2d", (2, 2), (2, 2), (0, 0), False, True, (16, 16, 16)],
+        ["nn.avg_pool2d", (3, 3), (2, 2), (0, 1), True, False, (15, 15, 16)],
+    ]
+
+    for (
+        typef,
+        size,
+        stride,
+        pad,
+        ceil_mode,
+        count_include_pad,
+        input_shape,
+    ) in trials:
+        shape = (1, *input_shape)
+        inputs = {"a"}
+        args = (shape, dtype, typef, size, stride, pad, False, False)
+        func = _get_pooling_model(*args, iter(inputs))
+        exp_codegen = _get_expected_pooling_codegen(*args)
+        verify_codegen(func, exp_codegen, 1)
+
+
+@pytest.mark.skipif(skip_codegen_test(), reason="Skip because BNNS codegen is not available")
+def test_codegen_global_pooling():
+    dtype = "float32"
+
+    trials = [
+        ["nn.global_max_pool2d", (8, 8, 16)],
+        ["nn.global_max_pool2d", (9, 9, 16)],
+        ["nn.global_max_pool2d", (8, 8, 16)],
+        ["nn.global_avg_pool2d", (8, 8, 16)],
+        ["nn.global_avg_pool2d", (8, 8, 16)],
+        ["nn.global_avg_pool2d", (9, 9, 16)],
+    ]
+
+    for typef, input_shape in trials:
+        shape = (1, *input_shape)
+        inputs = {"a"}
+        args = (shape, dtype, typef)
+        func = _get_global_pooling_model(*args, iter(inputs))
+        exp_codegen = _get_expected_global_pooling_codegen(*args)
+        verify_codegen(func, exp_codegen, 1)
+
+
+if __name__ == "__main__":
+    test_pooling()
+    test_global_pooling()
+    test_codegen_pooling()
+    test_codegen_global_pooling()

From 8d08b21ec47900aef6db0c4b3fb2415776aa3e29 Mon Sep 17 00:00:00 2001
From: Tristan Konolige <tristan.konolige@gmail.com>
Date: Thu, 11 Mar 2021 18:53:15 -0800
Subject: [PATCH 328/357] [PROFILING] Combine USE_VM_PROFILER and
 USE_GRAPH_RUNTIME_DEBUG into a single flag USE_PROFILER (#7637)

---
 CMakeLists.txt                                | 36 ++++++++++---------
 cmake/config.cmake                            |  7 ++--
 tests/scripts/task_config_build_arm.sh        |  3 +-
 tests/scripts/task_config_build_cpu.sh        |  3 +-
 tests/scripts/task_config_build_gpu.sh        |  3 +-
 tests/scripts/task_config_build_gpu_vulkan.sh |  3 +-
 tests/scripts/task_config_build_i386.sh       |  3 +-
 tests/scripts/task_config_build_wasm.sh       |  3 +-
 8 files changed, 27 insertions(+), 34 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1d2e2bcb68c4..451b6a7ee2c2 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -35,7 +35,7 @@ tvm_option(USE_THREADS "Build with thread support" ON)
 tvm_option(USE_LLVM "Build with LLVM, can be set to specific llvm-config path" OFF)
 tvm_option(USE_STACKVM_RUNTIME "Include stackvm into the runtime" OFF)
 tvm_option(USE_GRAPH_RUNTIME "Build with tiny graph runtime" ON)
-tvm_option(USE_GRAPH_RUNTIME_DEBUG "Build with tiny graph runtime debug mode" OFF)
+tvm_option(USE_PROFILER "Build profiler for the VM and graph runtime" ON)
 tvm_option(USE_OPENMP "Build with OpenMP thread pool implementation" OFF)
 tvm_option(USE_RELAY_DEBUG "Building Relay in debug mode..." OFF)
 tvm_option(USE_RTTI "Build with RTTI" ON)
@@ -262,13 +262,6 @@ list(APPEND COMPILER_SRCS ${RELAY_BACKEND_SRCS})
 list(APPEND COMPILER_SRCS ${RELAY_IR_SRCS})
 list(APPEND COMPILER_SRCS ${RELAY_QNN_SRCS})
 
-
-if(USE_VM_PROFILER)
-  message(STATUS "Build compiler with Relay VM profiler support...")
-  file(GLOB BACKEND_VM_PROFILER_SRCS src/relay/backend/vm/profiler/*.cc)
-  list(APPEND COMPILER_SRCS ${BACKEND_VM_PROFILER_SRCS})
-endif(USE_VM_PROFILER)
-
 file(GLOB DATATYPE_SRCS src/target/datatype/*.cc)
 list(APPEND COMPILER_SRCS ${DATATYPE_SRCS})
 list(APPEND COMPILER_SRCS "src/target/datatype/myfloat/myfloat.cc")
@@ -315,20 +308,29 @@ if(USE_GRAPH_RUNTIME)
   file(GLOB RUNTIME_GRAPH_SRCS src/runtime/graph/*.cc)
   list(APPEND RUNTIME_SRCS ${RUNTIME_GRAPH_SRCS})
 
-  if(USE_GRAPH_RUNTIME_DEBUG)
-    message(STATUS "Build with Graph runtime debug support...")
-    file(GLOB RUNTIME_GRAPH_DEBUG_SRCS src/runtime/graph/debug/*.cc)
-    list(APPEND RUNTIME_SRCS ${RUNTIME_GRAPH_DEBUG_SRCS})
-    set_source_files_properties(${RUNTIME_GRAPH_SRCS}
-      PROPERTIES COMPILE_DEFINITIONS "TVM_GRAPH_RUNTIME_DEBUG")
-  endif(USE_GRAPH_RUNTIME_DEBUG)
 endif(USE_GRAPH_RUNTIME)
 
+# convert old options for profiler
+if(USE_GRAPH_RUNTIME_DEBUG)
+  unset(USE_GRAPH_RUNTIME_DEBUG CACHE)
+  set(USE_PROFILER ON)
+endif()
 if(USE_VM_PROFILER)
-  message(STATUS "Build with Relay VM profiler support...")
+  unset(USE_VM_PROFILER CACHE)
+  set(USE_PROFILER ON)
+endif()
+
+if(USE_PROFILER)
+  message(STATUS "Build with profiler...")
+
+  file(GLOB RUNTIME_GRAPH_DEBUG_SRCS src/runtime/graph/debug/*.cc)
+  list(APPEND RUNTIME_SRCS ${RUNTIME_GRAPH_DEBUG_SRCS})
+  set_source_files_properties(${RUNTIME_GRAPH_SRCS}
+    PROPERTIES COMPILE_DEFINITIONS "TVM_GRAPH_RUNTIME_DEBUG")
+
   file(GLOB RUNTIME_VM_PROFILER_SRCS src/runtime/vm/profiler/*.cc)
   list(APPEND RUNTIME_SRCS ${RUNTIME_VM_PROFILER_SRCS})
-endif(USE_VM_PROFILER)
+endif(USE_PROFILER)
 
 # Module rules
 include(cmake/modules/VTA.cmake)
diff --git a/cmake/config.cmake b/cmake/config.cmake
index 67370c635209..eee07c356ae6 100644
--- a/cmake/config.cmake
+++ b/cmake/config.cmake
@@ -99,11 +99,8 @@ set(USE_STACKVM_RUNTIME OFF)
 # Whether enable tiny embedded graph runtime.
 set(USE_GRAPH_RUNTIME ON)
 
-# Whether enable additional graph debug functions
-set(USE_GRAPH_RUNTIME_DEBUG OFF)
-
-# Whether enable additional vm profiler functions
-set(USE_VM_PROFILER OFF)
+# Whether to enable the profiler for the graph runtime and vm
+set(USE_PROFILER ON)
 
 # Whether enable uTVM standalone runtime
 set(USE_MICRO_STANDALONE_RUNTIME OFF)
diff --git a/tests/scripts/task_config_build_arm.sh b/tests/scripts/task_config_build_arm.sh
index 80527466c71e..b3a084aef371 100755
--- a/tests/scripts/task_config_build_arm.sh
+++ b/tests/scripts/task_config_build_arm.sh
@@ -25,10 +25,9 @@ cp ../cmake/config.cmake .
 
 echo set\(USE_SORT ON\) >> config.cmake
 echo set\(USE_RPC ON\) >> config.cmake
-echo set\(USE_GRAPH_RUNTIME_DEBUG ON\) >> config.cmake
 echo set\(USE_MICRO ON\) >> config.cmake
 echo set\(USE_MICRO_STANDALONE_RUNTIME ON\) >> config.cmake
-echo set\(USE_VM_PROFILER ON\) >> config.cmake
+echo set\(USE_PROFILER ON\) >> config.cmake
 echo set\(USE_LLVM llvm-config-8\) >> config.cmake
 echo set\(CMAKE_CXX_COMPILER g++\) >> config.cmake
 echo set\(CMAKE_CXX_FLAGS -Werror\) >> config.cmake
diff --git a/tests/scripts/task_config_build_cpu.sh b/tests/scripts/task_config_build_cpu.sh
index db636063b9e3..aa5581b0e11a 100755
--- a/tests/scripts/task_config_build_cpu.sh
+++ b/tests/scripts/task_config_build_cpu.sh
@@ -26,8 +26,7 @@ cp ../cmake/config.cmake .
 echo set\(USE_SORT ON\) >> config.cmake
 echo set\(USE_MICRO ON\) >> config.cmake
 echo set\(USE_MICRO_STANDALONE_RUNTIME ON\) >> config.cmake
-echo set\(USE_GRAPH_RUNTIME_DEBUG ON\) >> config.cmake
-echo set\(USE_VM_PROFILER ON\) >> config.cmake
+echo set\(USE_PROFILER ON\) >> config.cmake
 echo set\(USE_DNNL_CODEGEN ON\) >> config.cmake
 echo set\(USE_ARM_COMPUTE_LIB ON\) >> config.cmake
 echo set\(USE_LLVM llvm-config-11\) >> config.cmake
diff --git a/tests/scripts/task_config_build_gpu.sh b/tests/scripts/task_config_build_gpu.sh
index 155bac80533f..13dfb4136547 100755
--- a/tests/scripts/task_config_build_gpu.sh
+++ b/tests/scripts/task_config_build_gpu.sh
@@ -36,8 +36,7 @@ echo set\(USE_RPC ON\) >> config.cmake
 echo set\(USE_SORT ON\) >> config.cmake
 echo set\(USE_GRAPH_RUNTIME ON\) >> config.cmake
 echo set\(USE_STACKVM_RUNTIME ON\) >> config.cmake
-echo set\(USE_GRAPH_RUNTIME_DEBUG ON\) >> config.cmake
-echo set\(USE_VM_PROFILER ON\) >> config.cmake
+echo set\(USE_PROFILER ON\) >> config.cmake
 echo set\(USE_ANTLR ON\) >> config.cmake
 echo set\(USE_VTA_TSIM ON\) >> config.cmake
 echo set\(USE_VTA_FSIM ON\) >> config.cmake
diff --git a/tests/scripts/task_config_build_gpu_vulkan.sh b/tests/scripts/task_config_build_gpu_vulkan.sh
index 74096b1a9760..5865dc969958 100755
--- a/tests/scripts/task_config_build_gpu_vulkan.sh
+++ b/tests/scripts/task_config_build_gpu_vulkan.sh
@@ -27,7 +27,6 @@ echo set\(USE_OPENCL ON\) >> config.cmake
 echo set\(USE_ROCM ON\) >> config.cmake
 echo set\(USE_VULKAN ON\) >> config.cmake
 echo set\(USE_MICRO ON\) >> config.cmake
-echo set\(USE_GRAPH_RUNTIME_DEBUG ON\) >> config.cmake
-echo set\(USE_VM_PROFILER ON\) >> config.cmake
+echo set\(USE_PROFILER ON\) >> config.cmake
 echo set\(CMAKE_CXX_COMPILER clang-7\) >> config.cmake
 echo set\(CMAKE_CXX_FLAGS -Werror\) >> config.cmake
diff --git a/tests/scripts/task_config_build_i386.sh b/tests/scripts/task_config_build_i386.sh
index 68e61c6a039c..05acbb022124 100755
--- a/tests/scripts/task_config_build_i386.sh
+++ b/tests/scripts/task_config_build_i386.sh
@@ -25,10 +25,9 @@ cp ../cmake/config.cmake .
 
 echo set\(USE_SORT ON\) >> config.cmake
 echo set\(USE_RPC ON\) >> config.cmake
-echo set\(USE_GRAPH_RUNTIME_DEBUG ON\) >> config.cmake
 echo set\(USE_MICRO ON\) >> config.cmake
 echo set\(USE_MICRO_STANDALONE_RUNTIME ON\) >> config.cmake
-echo set\(USE_VM_PROFILER ON\) >> config.cmake
+echo set\(USE_PROFILER ON\) >> config.cmake
 echo set\(USE_LLVM llvm-config-4.0\) >> config.cmake
 echo set\(CMAKE_CXX_COMPILER g++\) >> config.cmake
 echo set\(CMAKE_CXX_FLAGS -Werror\) >> config.cmake
diff --git a/tests/scripts/task_config_build_wasm.sh b/tests/scripts/task_config_build_wasm.sh
index c37a119b0590..78dc7550028b 100755
--- a/tests/scripts/task_config_build_wasm.sh
+++ b/tests/scripts/task_config_build_wasm.sh
@@ -26,8 +26,7 @@ cp ../cmake/config.cmake .
 echo set\(USE_SORT ON\) >> config.cmake
 echo set\(USE_MICRO ON\) >> config.cmake
 echo set\(USE_MICRO_STANDALONE_RUNTIME ON\) >> config.cmake
-echo set\(USE_GRAPH_RUNTIME_DEBUG ON\) >> config.cmake
-echo set\(USE_VM_PROFILER ON\) >> config.cmake
+echo set\(USE_PROFILER ON\) >> config.cmake
 echo set\(USE_LLVM llvm-config-11\) >> config.cmake
 echo set\(USE_ANTLR ON\) >> config.cmake
 echo set\(CMAKE_CXX_COMPILER g++\) >> config.cmake

From 9d72bd051bdc02ab678d223a794ab6cb607866ba Mon Sep 17 00:00:00 2001
From: Tristan Konolige <tristan.konolige@gmail.com>
Date: Thu, 11 Mar 2021 20:25:35 -0800
Subject: [PATCH 329/357] [RUNTIME] Switch time evaluator to use device
 specific timing. (#7631)

---
 src/runtime/rpc/rpc_module.cc | 16 ++++++----------
 1 file changed, 6 insertions(+), 10 deletions(-)

diff --git a/src/runtime/rpc/rpc_module.cc b/src/runtime/rpc/rpc_module.cc
index 4f721e122a4c..34691415c1a4 100644
--- a/src/runtime/rpc/rpc_module.cc
+++ b/src/runtime/rpc/rpc_module.cc
@@ -23,6 +23,7 @@
  */
 #include <tvm/runtime/container.h>
 #include <tvm/runtime/device_api.h>
+#include <tvm/runtime/profiling.h>
 #include <tvm/runtime/registry.h>
 
 #include <cstring>
@@ -364,8 +365,6 @@ PackedFunc WrapTimeEvaluator(PackedFunc pf, TVMContext ctx, int number, int repe
       if (f_preproc != nullptr) {
         f_preproc.CallPacked(args, &temp);
       }
-      std::chrono::time_point<std::chrono::high_resolution_clock, std::chrono::nanoseconds> tbegin,
-          tend;
       double duration_ms = 0.0;
 
       do {
@@ -374,20 +373,17 @@ PackedFunc WrapTimeEvaluator(PackedFunc pf, TVMContext ctx, int number, int repe
                                              number * 1.618));  // 1.618 is chosen by random
         }
 
-        tbegin = std::chrono::high_resolution_clock::now();
+        Timer t = Timer::Start(ctx);
         // start timing
         for (int i = 0; i < number; ++i) {
           pf.CallPacked(args, &temp);
         }
-        DeviceAPI::Get(ctx)->StreamSync(ctx, nullptr);
-        tend = std::chrono::high_resolution_clock::now();
-
-        duration_ms =
-            std::chrono::duration_cast<std::chrono::duration<double>>(tend - tbegin).count() * 1000;
+        t->Stop();
+        int64_t t_nanos = t->SyncAndGetElapsedNanos();
+        duration_ms = t_nanos / 1e6;
       } while (duration_ms < min_repeat_ms);
 
-      double speed =
-          std::chrono::duration_cast<std::chrono::duration<double>>(tend - tbegin).count() / number;
+      double speed = duration_ms / 1e3 / number;
       os.write(reinterpret_cast<char*>(&speed), sizeof(speed));
     }
 

From 1a506791ba7b7f041de318b6a83c2a7f080ee1cd Mon Sep 17 00:00:00 2001
From: eleflea <dmzitnzc@gmail.com>
Date: Fri, 12 Mar 2021 14:22:04 +0800
Subject: [PATCH 330/357] fix missing qparams in aten::upsample_nearest2d
 (#7646)

---
 python/tvm/relay/frontend/qnn_torch.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/python/tvm/relay/frontend/qnn_torch.py b/python/tvm/relay/frontend/qnn_torch.py
index 2b85a1f3a1be..2dd84b650bd2 100644
--- a/python/tvm/relay/frontend/qnn_torch.py
+++ b/python/tvm/relay/frontend/qnn_torch.py
@@ -353,6 +353,7 @@ def add_input_quant_params_to_op_inputs(graph):
         "quantized::mul": 2,
         "aten::dequantize": 1,
         "aten::mean": 1,
+        "aten::upsample_nearest2d": 1,
         "aten::upsample_bilinear2d": 1,
         "aten::relu_": 1,
         "aten::relu": 1,

From b80e777f1cafe61d8886d7567906e9b018788447 Mon Sep 17 00:00:00 2001
From: Chris Hoge <chris@hogepodge.com>
Date: Fri, 12 Mar 2021 09:39:06 -0800
Subject: [PATCH 331/357] [docs] Getting Started with TVM: Auto Scheduler and
 matmul (#7644)

Moves the auto scheduler with matmul example into the tutorial,
expands to follow the flow of the larger getting started tutorial.
Indended to follow the AutoTVM tutorial on matrix multiplication.
---
 docs/conf.py                                  |   5 +-
 .../tune_matmul_x86.py                        | 136 ++++++++++--------
 2 files changed, 83 insertions(+), 58 deletions(-)
 rename tutorials/{auto_scheduler => get_started}/tune_matmul_x86.py (55%)

diff --git a/docs/conf.py b/docs/conf.py
index ad838f767f80..c9c68706998b 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -210,10 +210,11 @@
 # The unlisted files always appear after listed files.
 within_subsection_order = {
     "get_started": [
-        "relay_quick_start.py",
-        "tensor_expr_get_started.py",
         "tvmc_command_line_driver.py",
+        "tensor_expr_get_started.py",
+        "autoschedule_matmul.py",
         "cross_compilation_and_rpc.py",
+        "relay_quick_start.py",
     ],
     "frontend": [
         "from_pytorch.py",
diff --git a/tutorials/auto_scheduler/tune_matmul_x86.py b/tutorials/get_started/tune_matmul_x86.py
similarity index 55%
rename from tutorials/auto_scheduler/tune_matmul_x86.py
rename to tutorials/get_started/tune_matmul_x86.py
index 084f5ae67518..a51f01115f31 100644
--- a/tutorials/auto_scheduler/tune_matmul_x86.py
+++ b/tutorials/get_started/tune_matmul_x86.py
@@ -15,24 +15,27 @@
 # specific language governing permissions and limitations
 # under the License.
 """
-Auto-scheduling Matrix Multiplication for CPU
-=============================================
+Optimizing Operators with Auto-scheduling
+=========================================
 **Author**: `Lianmin Zheng <https://github.com/merrymercy>`_, \
             `Chengfan Jia <https://github.com/jcf94/>`_
 
-This is a tutorial on how to use the auto-scheduler for CPUs.
+In this tutorial, we will show how TVM's Auto Scheduling feature can find
+optimal schedules without the need for writing a custom template.
 
-Different from the template-based :ref:`autotvm <tutorials-autotvm-sec>` which relies on
-manual templates to define the search space, the auto-scheduler does not require any templates.
-Users only need to write the computation declaration without any schedule commands or templates.
-The auto-scheduler can automatically generate a large search space and
-find a good schedule in the space.
+Different from the template-based :ref:`<autotvm_matmul>` which relies on
+manual templates to define the search space, the auto-scheduler does not
+require any templates.  Users only need to write the computation declaration
+without any schedule commands or templates.  The auto-scheduler can
+automatically generate a large search space and find a good schedule in the
+space.
 
 We use matrix multiplication as an example in this tutorial.
 
-Note that this tutorial will not run on Windows or recent versions of macOS. To
-get it to run, you will need to wrap the body of this tutorial in a :code:`if
-__name__ == "__main__":` block.
+.. note::
+  Note that this tutorial will not run on Windows or recent versions of macOS. To
+  get it to run, you will need to wrap the body of this tutorial in a :code:`if
+  __name__ == "__main__":` block.
 """
 
 import os
@@ -41,15 +44,18 @@
 import tvm
 from tvm import te, auto_scheduler
 
-######################################################################
-# Define the computation
-# ^^^^^^^^^^^^^^^^^^^^^^
-# To begin with, let us define the computation of a matmul with bias add.
-# The function should return the list of input/output tensors.
-# From these tensors, the auto-scheduler can get the whole computational graph.
+################################################################################
+# Defining the Matrix Multiplication
+# ----------------------------------
+# To start, we define a matrix multiplication with a bias addition.  Note that
+# this uses standard operations available in TVMs Tensor Expression language.
+# The major difference is the use of the `auto_sceduler` decorator at the top
+# of the function definition.  The function should return a list of
+# input/output tensors.  From these tensors, the auto-scheduler can get the
+# whole computational graph.
 
 
-@auto_scheduler.register_workload
+@auto_scheduler.register_workload  # Note the auto_scheduler decorator
 def matmul_add(N, L, M, dtype):
     A = te.placeholder((N, L), name="A", dtype=dtype)
     B = te.placeholder((L, M), name="B", dtype=dtype)
@@ -67,12 +73,17 @@ def matmul_add(N, L, M, dtype):
     return [A, B, C, out]
 
 
-######################################################################
+################################################################################
 # Create the search task
-# ^^^^^^^^^^^^^^^^^^^^^^
-# We then create a search task with N=L=M=1024 and dtype="float32"
-# If your machine supports avx instructions, you can
+# ----------------------
+# With the function defined, we can now create the task for the auto_scheduler
+# to search against. We specify the particular parameters for this matrix
+# multiplication, in this case a multiplication of to square matricies of size
+# 1024x1024. We then create a search task with N=L=M=1024 and dtype="float32"
 #
+# .. note:: Improve performance with custom targets
+#   In order for TVM to take full advantage of specific hardware platforms,
+#   you will want to manuall specify your CPU capabilities. For example:
 #   - replace "llvm" below with "llvm -mcpu=core-avx2" to enable AVX2
 #   - replace "llvm" below with "llvm -mcpu=skylake-avx512" to enable AVX-512
 
@@ -84,15 +95,18 @@ def matmul_add(N, L, M, dtype):
 print("Computational DAG:")
 print(task.compute_dag)
 
-######################################################################
+################################################################################
+# Set Parameters for Auto-Scheduler
+# ---------------------------------
 # Next, we set parameters for the auto-scheduler.
 #
-# * :code:`num_measure_trials` is the number of measurement trials we can use during the search.
-#   We only make 10 trials in this tutorial for a fast demonstration. In practice, 1000 is a
-#   good value for the search to converge. You can do more trials according to your time budget.
-# * In addition, we use :code:`RecordToFile` to dump measurement records into a file `matmul.json`.
-#   The measurement records can be used to query the history best, resume the search,
-#   and do more analyses later.
+# * :code:`num_measure_trials` is the number of measurement trials we can use
+#   during the search.  We only make 10 trials in this tutorial for a fast
+#   demonstration. In practice, 1000 is a good value for the search to converge.
+#   You can do more trials according to your time budget.
+# * In addition, we use :code:`RecordToFile` to log measurement records into a
+#   file `matmul.json`.  The measurement records can be used to query the history
+#   best, resume the search, and do more analyses later.
 # * see :any:`auto_scheduler.TuningOptions` for more parameters
 
 log_file = "matmul.json"
@@ -102,30 +116,32 @@ def matmul_add(N, L, M, dtype):
     verbose=2,
 )
 
-######################################################################
+################################################################################
 # Run the search
-# ^^^^^^^^^^^^^^
-# Now we get all inputs ready. Pretty simple, isn't it?
-# We can kick off the search and let the auto-scheduler do its magic.
-# After some measurement trials, we can load the best schedule from the log
-# file and apply it.
+# --------------
+# Now we get all inputs ready. Pretty simple, isn't it?  We can kick off the
+# search and let the auto-scheduler do its magic.  After some measurement
+# trials, we can load the best schedule from the log file and apply it.
 
 # Run auto-tuning (search)
 task.tune(tune_option)
 # Apply the best schedule
 sch, args = task.apply_best(log_file)
 
-######################################################################
-# We can lower the schedule to see the IR after auto-scheduling.
-# The auto-scheduler correctly performs optimizations including multi-level tiling,
-# layout transformation, parallelization, vectorization, unrolling, and operator fusion.
+################################################################################
+# Inspecting the Optimized Schedule
+# ---------------------------------
+# We can lower the schedule to see the IR after auto-scheduling.  The
+# auto-scheduler correctly performs optimizations including multi-level tiling,
+# layout transformation, parallelization, vectorization, unrolling, and
+# operator fusion.
 
 print("Lowered TIR:")
 print(tvm.lower(sch, args, simple_mode=True))
 
-######################################################################
+################################################################################
 # Check correctness and evaluate performance
-# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+# ------------------------------------------
 # We build the binary and check its correctness and performance.
 
 func = tvm.build(sch, args, target)
@@ -152,26 +168,25 @@ def matmul_add(N, L, M, dtype):
 )
 
 
-######################################################################
+################################################################################
 # Using the record file
-# ^^^^^^^^^^^^^^^^^^^^^
-# During the search, all measurement records are dumped into the record
-# file "matmul.json". The measurement records can be used to re-apply search results,
-# resume the search, and perform other analyses.
-
-######################################################################
-# Here is an example where we load the best schedule from a file,
-# and print the equivalent python schedule API. This can be used for
-# debugging and learning the behavior of the auto-scheduler.
+# ---------------------
+# During the search, all measurement records are logged into the record file
+# "matmul.json". The measurement records can be used to re-apply search
+# results, resume the search, and perform other analyses.
+#
+# Here is an example where we load the best schedule from a file, and print the
+# equivalent python schedule API. This can be used for debugging and learning
+# the behavior of the auto-scheduler.
 
 print("Equivalent python schedule:")
 print(task.print_best(log_file))
 
-######################################################################
-# A more complicated example is to resume the search.
-# In this case, we need to create the search policy and cost model by ourselves
-# and resume the status of search policy and cost model with the log file.
-# In the example below we resume the status and do more 5 trials.
+################################################################################
+# A more complicated example is to resume the search.  In this case, we need to
+# create the search policy and cost model by ourselves and resume the status of
+# search policy and cost model with the log file.  In the example below we
+# resume the status and do more 5 trials.
 
 
 def resume_search(task, log_file):
@@ -188,3 +203,12 @@ def resume_search(task, log_file):
 
 
 resume_search(task, log_file)
+
+################################################################################
+# Final Notes and Summary
+# -----------------------
+# In this tutorial, we have shown how to use the TVM Auto-Scheduler to
+# automatically optimize a matrix multiplication, without the need to specify a
+# search template.  It ends a series of examples that starts from the Tensor
+# Expression (TE) language that demonstrates how TVM can optimize computational
+# operations.

From fe25b9e7c5f9c95d211f63ae544a9532eb50b398 Mon Sep 17 00:00:00 2001
From: Leandro Nunes <leandro.nunes@arm.com>
Date: Fri, 12 Mar 2021 18:03:53 +0000
Subject: [PATCH 332/357] [TVMC] Allow options on --target to contain dots.
 (#7651)

* Allow tvmc compile --target options to accept dots
 * Adds testing for dot separator in quoted and unquoted
   values
 * Add an "unquoting" conditional so that quoted and
   unquoted strings look the same when parsed
---
 python/tvm/driver/tvmc/common.py             |  7 ++++++-
 tests/python/driver/tvmc/test_tvmc_common.py | 21 ++++++++++++++++++++
 2 files changed, 27 insertions(+), 1 deletion(-)

diff --git a/python/tvm/driver/tvmc/common.py b/python/tvm/driver/tvmc/common.py
index 71bf42ae1e5c..c5cb5f29031f 100644
--- a/python/tvm/driver/tvmc/common.py
+++ b/python/tvm/driver/tvmc/common.py
@@ -133,7 +133,7 @@ def tokenize_target(target):
 
     target_pattern = (
         r"(\-{0,2}[\w\-]+\=?"
-        r"(?:[\w\+\-]+(?:,[\w\+\-])*|[\'][\w\+\-,\s]+[\']|[\"][\w\+\-,\s]+[\"])*|,)"
+        r"(?:[\w\+\-\.]+(?:,[\w\+\-\.])*|[\'][\w\+\-,\s\.]+[\']|[\"][\w\+\-,\s\.]+[\"])*|,)"
     )
 
     return re.findall(target_pattern, target)
@@ -223,6 +223,11 @@ def parse_target(target):
                 else:
                     opt = opt[1:] if opt.startswith("-") else opt
                     opt_name, opt_value = opt.split("=", maxsplit=1)
+
+                    # remove quotes from the value: quotes are only parsed if they match,
+                    # so it is safe to assume that if the string starts with quote, it ends
+                    # with quote.
+                    opt_value = opt_value[1:-1] if opt_value[0] in ('"', "'") else opt_value
             except ValueError:
                 raise ValueError(f"Error when parsing '{opt}'")
 
diff --git a/tests/python/driver/tvmc/test_tvmc_common.py b/tests/python/driver/tvmc/test_tvmc_common.py
index b272ceccea39..23ea4f46b2ff 100644
--- a/tests/python/driver/tvmc/test_tvmc_common.py
+++ b/tests/python/driver/tvmc/test_tvmc_common.py
@@ -273,3 +273,24 @@ def test_parse_multiple_target_with_opts():
     assert "myopt" in targets[0]["opts"]
     assert "value" == targets[0]["opts"]["myopt"]
     assert "llvm" == targets[1]["name"]
+
+
+def test_parse_multiple_separators_on_target():
+    targets = tvmc.common.parse_target("foo -option1=+v1.0x,+value,+bar")
+
+    assert len(targets) == 1
+    assert "+v1.0x,+value,+bar" == targets[0]["opts"]["option1"]
+
+
+def test_parse_single_quoted_multiple_separators_on_target():
+    targets = tvmc.common.parse_target("foo -option1='+v1.0x,+value'")
+
+    assert len(targets) == 1
+    assert "+v1.0x,+value" == targets[0]["opts"]["option1"]
+
+
+def test_parse_double_quoted_multiple_separators_on_target():
+    targets = tvmc.common.parse_target('foo -option1="+v1.0x,+value"')
+
+    assert len(targets) == 1
+    assert "+v1.0x,+value" == targets[0]["opts"]["option1"]

From 692da0a32bc4c6d6bdc5d219182a00d9e7945c68 Mon Sep 17 00:00:00 2001
From: YubinCao <61700216+YubinCao@users.noreply.github.com>
Date: Mon, 15 Mar 2021 15:22:35 +0800
Subject: [PATCH 333/357] [docker] fixed ci-gpu docker environment path typo.
 (#7648)

---
 docker/Dockerfile.ci_gpu | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docker/Dockerfile.ci_gpu b/docker/Dockerfile.ci_gpu
index ac76af6b0a1e..a44677f5ce56 100644
--- a/docker/Dockerfile.ci_gpu
+++ b/docker/Dockerfile.ci_gpu
@@ -107,8 +107,8 @@ ENV PATH=/usr/local/nvidia/bin:${PATH}
 ENV PATH=/usr/local/cuda/bin:${PATH}
 ENV CPLUS_INCLUDE_PATH=/usr/local/cuda/include:${CPLUS_INCLUDE_PATH}
 ENV C_INCLUDE_PATH=/usr/local/cuda/include:${C_INCLUDE_PATH}
-ENV LIBRARY_PATH=/usr/local/cuda/lib64:/usr/local/cuda/compact:${LIBRARY_PATH}
-ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64:/usr/local/cuda/compact:${LD_LIBRARY_PATH}
+ENV LIBRARY_PATH=/usr/local/cuda/lib64:/usr/local/cuda/compat:${LIBRARY_PATH}
+ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64:/usr/local/cuda/compat:${LD_LIBRARY_PATH}
 
 ENV LD_LIBRARY_PATH=/opt/rocm/lib:${LD_LIBRARY_PATH}
 ENV PATH=/node_modules/.bin:${PATH}

From 3beec22264f56f734de7d14cd6382b96e83e280a Mon Sep 17 00:00:00 2001
From: Josh Fromm <jwfromm@uw.edu>
Date: Mon, 15 Mar 2021 08:32:33 -0700
Subject: [PATCH 334/357] Fix issue when group attribute isnt defined in
 convtranspose. (#7655)

---
 python/tvm/relay/frontend/onnx.py          |  2 +-
 tests/python/frontend/onnx/test_forward.py | 68 +++++++++-------------
 2 files changed, 30 insertions(+), 40 deletions(-)

diff --git a/python/tvm/relay/frontend/onnx.py b/python/tvm/relay/frontend/onnx.py
index f31b8c927f8f..4a0a1ed09b9f 100644
--- a/python/tvm/relay/frontend/onnx.py
+++ b/python/tvm/relay/frontend/onnx.py
@@ -446,7 +446,7 @@ def _impl_v1(cls, inputs, attr, params):
         # get number of channels
         channels = infer_channels(inputs[1], True)
         attr["channels"] = channels
-        groups = attr.pop("group")
+        groups = attr.get("group", 1)
         attr["groups"] = groups
         # infer pads for auto_pad
         data = inputs[0]
diff --git a/tests/python/frontend/onnx/test_forward.py b/tests/python/frontend/onnx/test_forward.py
index 1e1341640ea0..177bed66f466 100644
--- a/tests/python/frontend/onnx/test_forward.py
+++ b/tests/python/frontend/onnx/test_forward.py
@@ -2489,42 +2489,27 @@ def verify_convtranspose_with_padding(
     dilations,
     auto_pad="NOTSET",
     unset_pad=False,
+    group=1,
 ):
-    if unset_pad:
-        node = helper.make_node(
-            "ConvTranspose",
-            inputs=["x", "W"],
-            outputs=["y"],
-            kernel_shape=kernel_shape,
-            # Default values for other attributes:
-            strides=strides,
-            dilations=dilations,
-            group=1,
-        )
-    elif padding is None:
-        node = helper.make_node(
-            "ConvTranspose",
-            inputs=["x", "W"],
-            outputs=["y"],
-            kernel_shape=kernel_shape,
-            # Default values for other attributes:
-            strides=strides,
-            dilations=dilations,
-            group=1,
-            auto_pad=auto_pad,
-        )
-    else:
-        node = helper.make_node(
-            "ConvTranspose",
-            inputs=["x", "W"],
-            outputs=["y"],
-            kernel_shape=kernel_shape,
-            # Default values for other attributes:
-            strides=strides,
-            dilations=dilations,
-            group=1,
-            pads=padding,
-        )
+    node = helper.make_node(
+        "ConvTranspose",
+        inputs=["x", "W"],
+        outputs=["y"],
+        kernel_shape=kernel_shape,
+        # Default values for other attributes:
+        strides=strides,
+        dilations=dilations,
+    )
+    if not unset_pad:
+        if padding is None:
+            pad_attr = helper.make_attribute("auto_pad", auto_pad)
+        else:
+            pad_attr = helper.make_attribute("pads", padding)
+        node.attribute.append(pad_attr)
+
+    if group is not None:
+        group_attr = helper.make_attribute("group", group)
+        node.attribute.append(group_attr)
 
     graph = helper.make_graph(
         [node],
@@ -2536,22 +2521,25 @@ def verify_convtranspose_with_padding(
         outputs=[helper.make_tensor_value_info("y", TensorProto.FLOAT, list(y_shape))],
     )
 
-    model = helper.make_model(graph, producer_name="conv_test")
+    model = helper.make_model(graph, producer_name="convtranspose_pad_test")
 
     verify_with_ort(model, [x_shape, w_shape], [y_shape], use_vm=True, convert_to_static=True)
 
 
-def verify_convtranspose(x_shape, w_shape, y_shape, p):
+def verify_convtranspose(x_shape, w_shape, y_shape, p, group=1):
     node = onnx.helper.make_node(
         "ConvTranspose",
         inputs=["x", "W"],
         outputs=["y"],
         strides=[3, 2],
-        group=1,
         kernel_shape=[3, 3],
         pads=p,
     )
 
+    if group is not None:
+        group_attr = helper.make_attribute("group", group)
+        node.attribute.append(group_attr)
+
     graph = helper.make_graph(
         [node],
         "verify_convtranspose_test",
@@ -2562,7 +2550,7 @@ def verify_convtranspose(x_shape, w_shape, y_shape, p):
         outputs=[helper.make_tensor_value_info("y", TensorProto.FLOAT, list(y_shape))],
     )
 
-    model = helper.make_model(graph, producer_name="convtranspose_trest")
+    model = helper.make_model(graph, producer_name="convtranspose_test")
     verify_with_ort(model, [x_shape, w_shape], y_shape)
 
 
@@ -2574,6 +2562,8 @@ def test_convtranspose():
     # (1, 2, 7, 3) output tensor
     # [1, 2, 1, 2] list for pads
     verify_convtranspose((1, 1, 3, 3), (1, 2, 3, 3), (1, 2, 7, 3), [1, 2, 1, 2])
+    # Test undefined groups.
+    verify_convtranspose((1, 1, 3, 3), (1, 2, 3, 3), (1, 2, 7, 3), [1, 2, 1, 2], group=None)
 
     def repeat(N, D):
         return tuple([N for _ in range(D)])

From 1878889d92bfe84c5ed9e1ca87f258fdc9826c4a Mon Sep 17 00:00:00 2001
From: Lei Wang <34334180+LeiWang1999@users.noreply.github.com>
Date: Mon, 15 Mar 2021 23:38:21 +0800
Subject: [PATCH 335/357] revert SET_LLVM flag (#7657)

Co-authored-by: Lei Wang <34334180+NjtechPrinceling@users.noreply.github.com>
---
 cmake/config.cmake | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cmake/config.cmake b/cmake/config.cmake
index eee07c356ae6..65859566a664 100644
--- a/cmake/config.cmake
+++ b/cmake/config.cmake
@@ -113,7 +113,7 @@ set(USE_MICRO_STANDALONE_RUNTIME OFF)
 # - OFF: disable llvm, note this will disable CPU codegen
 #        which is needed for most cases
 # - /path/to/llvm-config: enable specific LLVM when multiple llvm-dev is available.
-set(USE_LLVM ON)
+set(USE_LLVM OFF)
 
 #---------------------------------------------
 # Contrib libraries

From d373d25aa29038e6d7535fb620d5303a117010cb Mon Sep 17 00:00:00 2001
From: eric <eun.taik.lee@samsung.com>
Date: Tue, 16 Mar 2021 00:39:33 +0900
Subject: [PATCH 336/357] fix build break for android_rpc (#7664)

---
 apps/android_rpc/app/src/main/jni/tvm_runtime.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/apps/android_rpc/app/src/main/jni/tvm_runtime.h b/apps/android_rpc/app/src/main/jni/tvm_runtime.h
index 2005568c608c..fb5993066448 100644
--- a/apps/android_rpc/app/src/main/jni/tvm_runtime.h
+++ b/apps/android_rpc/app/src/main/jni/tvm_runtime.h
@@ -47,6 +47,7 @@
 #include "../src/runtime/module.cc"
 #include "../src/runtime/ndarray.cc"
 #include "../src/runtime/object.cc"
+#include "../src/runtime/profiling.cc"
 #include "../src/runtime/registry.cc"
 #include "../src/runtime/rpc/rpc_channel.cc"
 #include "../src/runtime/rpc/rpc_endpoint.cc"

From 10f5d17a668a1bb8fbd021cadbe6d052a180f706 Mon Sep 17 00:00:00 2001
From: Leandro Nunes <leandro.nunes@arm.com>
Date: Mon, 15 Mar 2021 15:41:31 +0000
Subject: [PATCH 337/357] [TVMC] Refactoring to document the --target regex and
 simplify test cases (#7654)

* Adds comments to document the regex being used to parse the
   --target=value string
 * Concatenate test cases without reducing the number of asserts
   or number of actual tests
---
 python/tvm/driver/tvmc/common.py             | 12 ++++++++-
 tests/python/driver/tvmc/test_tvmc_common.py | 26 ++++++++------------
 2 files changed, 21 insertions(+), 17 deletions(-)

diff --git a/python/tvm/driver/tvmc/common.py b/python/tvm/driver/tvmc/common.py
index c5cb5f29031f..fbd7bc897683 100644
--- a/python/tvm/driver/tvmc/common.py
+++ b/python/tvm/driver/tvmc/common.py
@@ -131,9 +131,19 @@ def tokenize_target(target):
         a list of parsed tokens extracted from the target string
     """
 
+    # Regex to tokenize the "--target" value. It is split into five parts
+    # to match with:
+    #  1. target and option names e.g. llvm, -mattr=, -mcpu=
+    #  2. option values, all together, without quotes e.g. -mattr=+foo,+opt
+    #  3. option values, when single quotes are used e.g. -mattr='+foo, +opt'
+    #  4. option values, when double quotes are used e.g. -mattr="+foo ,+opt"
+    #  5. commas that separate different targets e.g. "my-target, llvm"
     target_pattern = (
         r"(\-{0,2}[\w\-]+\=?"
-        r"(?:[\w\+\-\.]+(?:,[\w\+\-\.])*|[\'][\w\+\-,\s\.]+[\']|[\"][\w\+\-,\s\.]+[\"])*|,)"
+        r"(?:[\w\+\-\.]+(?:,[\w\+\-\.])*"
+        r"|[\'][\w\+\-,\s\.]+[\']"
+        r"|[\"][\w\+\-,\s\.]+[\"])*"
+        r"|,)"
     )
 
     return re.findall(target_pattern, target)
diff --git a/tests/python/driver/tvmc/test_tvmc_common.py b/tests/python/driver/tvmc/test_tvmc_common.py
index 23ea4f46b2ff..474649d8b1b3 100644
--- a/tests/python/driver/tvmc/test_tvmc_common.py
+++ b/tests/python/driver/tvmc/test_tvmc_common.py
@@ -275,22 +275,16 @@ def test_parse_multiple_target_with_opts():
     assert "llvm" == targets[1]["name"]
 
 
-def test_parse_multiple_separators_on_target():
-    targets = tvmc.common.parse_target("foo -option1=+v1.0x,+value,+bar")
-
-    assert len(targets) == 1
-    assert "+v1.0x,+value,+bar" == targets[0]["opts"]["option1"]
+def test_parse_quotes_and_separators_on_options():
+    targets_no_quote = tvmc.common.parse_target("foo -option1=+v1.0x,+value,+bar")
+    targets_single_quote = tvmc.common.parse_target("foo -option1='+v1.0x,+value'")
+    targets_double_quote = tvmc.common.parse_target('foo -option1="+v1.0x,+value"')
 
+    assert len(targets_no_quote) == 1
+    assert "+v1.0x,+value,+bar" == targets_no_quote[0]["opts"]["option1"]
 
-def test_parse_single_quoted_multiple_separators_on_target():
-    targets = tvmc.common.parse_target("foo -option1='+v1.0x,+value'")
-
-    assert len(targets) == 1
-    assert "+v1.0x,+value" == targets[0]["opts"]["option1"]
+    assert len(targets_single_quote) == 1
+    assert "+v1.0x,+value" == targets_single_quote[0]["opts"]["option1"]
 
-
-def test_parse_double_quoted_multiple_separators_on_target():
-    targets = tvmc.common.parse_target('foo -option1="+v1.0x,+value"')
-
-    assert len(targets) == 1
-    assert "+v1.0x,+value" == targets[0]["opts"]["option1"]
+    assert len(targets_double_quote) == 1
+    assert "+v1.0x,+value" == targets_double_quote[0]["opts"]["option1"]

From 5bd78b398f3658bc66bec66aea78e6aa5faf872f Mon Sep 17 00:00:00 2001
From: Leandro Nunes <leandro.nunes@arm.com>
Date: Mon, 15 Mar 2021 17:09:05 +0000
Subject: [PATCH 338/357] [TVMC] Fix to check whether a path passed to --target
 is strictly a file (#7663)

* When we use file with --target, the validation in place was only
   checking whether it was a valid path. For the case in which the
   path is a directory, it causes a crash when tvmc then tries to
   open the path.
 * This fix moved the check to be strictly for files, not only a valid
   path
---
 python/tvm/driver/tvmc/common.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/tvm/driver/tvmc/common.py b/python/tvm/driver/tvmc/common.py
index fbd7bc897683..864c3a9bddb4 100644
--- a/python/tvm/driver/tvmc/common.py
+++ b/python/tvm/driver/tvmc/common.py
@@ -280,7 +280,7 @@ def target_from_cli(target):
     """
     extra_targets = []
 
-    if os.path.exists(target):
+    if os.path.isfile(target):
         with open(target) as target_file:
             logger.debug("target input is a path: %s", target)
             target = "".join(target_file.readlines())

From 67f83973bb8e586046df745c2b70fc0a85917504 Mon Sep 17 00:00:00 2001
From: Akira Maruoka <akmaru0266@gmail.com>
Date: Tue, 16 Mar 2021 02:33:15 +0900
Subject: [PATCH 339/357] Fixed strided_slice size (#7659)

Co-authored-by: Akira Maruoka <akira.maruoka@fixstars.com>
---
 python/tvm/relay/frontend/tflite.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/tvm/relay/frontend/tflite.py b/python/tvm/relay/frontend/tflite.py
index 1b593ad8dea3..d6f704703cae 100644
--- a/python/tvm/relay/frontend/tflite.py
+++ b/python/tvm/relay/frontend/tflite.py
@@ -3093,7 +3093,7 @@ def convert_detection_postprocess(self, op):
         valid_count = ret[0]
         # keep only the top 'max_detections' rows
         ret = _op.strided_slice(
-            ret[1], [0, 0, 0], [batch_size, custom_options["max_detections"], anchor_boxes]
+            ret[1], [0, 0, 0], [batch_size, custom_options["max_detections"], 6]
         )
         # the output needs some reshaping to match tflite
         ret = _op.split(ret, 6, axis=2)

From 068fed94cf3468e3df510ac8a9aed635ed746804 Mon Sep 17 00:00:00 2001
From: Nicola Lancellotti <nicola.lancellotti@arm.com>
Date: Mon, 15 Mar 2021 19:51:26 +0000
Subject: [PATCH 340/357] Remove pytest dependency in arm_compute_lib.py
 (#7556)

* Add OpAttrContext class which allows to temporarily change an attribute of an operator

Change-Id: I19b809a105ea8769e56bd89e028e090959a08728

* Replace TempOpAttr with OpAttrContext in arm_compute_lib.py

Change-Id: I1c42dd6a29e765b06ce28192397016efeea2e82a
---
 .../tvm/relay/op/contrib/arm_compute_lib.py   | 39 +++++++++++++++++--
 1 file changed, 36 insertions(+), 3 deletions(-)

diff --git a/python/tvm/relay/op/contrib/arm_compute_lib.py b/python/tvm/relay/op/contrib/arm_compute_lib.py
index 139f25fef4fd..fabb639845b6 100644
--- a/python/tvm/relay/op/contrib/arm_compute_lib.py
+++ b/python/tvm/relay/op/contrib/arm_compute_lib.py
@@ -18,11 +18,11 @@
 """Arm Compute Library supported operators."""
 import tvm
 
+from tvm import relay
 from tvm._ffi import register_func
 from tvm.relay.expr import const
 from tvm.relay import transform
 from tvm.relay.build_module import bind_params_by_name
-from tvm.relay.testing.temp_op_attr import TempOpAttr
 
 from ...dataflow_pattern import wildcard, is_op, is_constant, is_expr
 from .register import register_pattern_table
@@ -111,9 +111,9 @@ def convert_conv(attrs, inputs, tinfos, desired_layouts):
 
         return convert_conv
 
-    with TempOpAttr(
+    with OpAttrContext(
         "nn.conv2d", "FTVMConvertOpLayout", convert_layout_conv2d(tvm.relay.nn.conv2d)
-    ), TempOpAttr(
+    ), OpAttrContext(
         "qnn.conv2d", "FTVMConvertOpLayout", convert_layout_conv2d(tvm.relay.qnn.op.conv2d)
     ):
         seq = tvm.transform.Sequential(
@@ -481,3 +481,36 @@ def qnn_add(expr):
             return False
 
     return True
+
+
+class OpAttrContext(object):
+    """ Temporarily changes the attr of an op. """
+
+    def __init__(self, op_name, attr_key, attr_value):
+        """Saves the required info for RAII pattern usage.
+
+        Parameters
+        ----------
+        op_name : str
+            The op name.
+
+        attr_key : str
+            The attribute name.
+
+        attr_value : object
+            The attribute value.
+        """
+        self.op = relay.op.get(op_name)
+        self.attr_key = attr_key
+        self.attr_value = attr_value
+
+    def __enter__(self):
+        self.older_attr = self.op.get_attr(self.attr_key)
+        self.op.reset_attr(self.attr_key)
+        self.op.set_attr(self.attr_key, self.attr_value)
+        return self
+
+    def __exit__(self, ptype, value, trace):
+        self.op.reset_attr(self.attr_key)
+        if self.older_attr:
+            self.op.set_attr(self.attr_key, self.older_attr)

From 7f969864d90ae3f57a9bad4ccf3eacd3c49e44d9 Mon Sep 17 00:00:00 2001
From: Cody Yu <comaniac0422@gmail.com>
Date: Mon, 15 Mar 2021 18:16:42 -0700
Subject: [PATCH 341/357] [Relay][Pass] Simplify consecutive
 transpose/layout_transform (#7656)

* [Relay][Pass] Simplify consecutive transpose/layout_transform

* lint

* fix

* support negative

* comment
---
 src/relay/op/make_op.h                        |  2 +
 src/relay/transforms/simplify_expr.cc         | 94 +++++++++++++++++++
 tests/python/relay/test_pass_simplify_expr.py | 58 ++++++++++++
 3 files changed, 154 insertions(+)

diff --git a/src/relay/op/make_op.h b/src/relay/op/make_op.h
index 79f7e135e29d..36a5ec1c0e72 100644
--- a/src/relay/op/make_op.h
+++ b/src/relay/op/make_op.h
@@ -75,6 +75,8 @@ Expr MakeSqueeze(Expr data, Array<Integer> axis);
 
 Expr MakeStack(Expr data, int axis);
 
+Expr MakeTranspose(Expr data, Array<Integer> axes);
+
 Expr MakeStridedSlice(Expr data, Array<Integer> begin, Array<Integer> end, Array<Integer> strides,
                       String slice_mode);
 
diff --git a/src/relay/transforms/simplify_expr.cc b/src/relay/transforms/simplify_expr.cc
index 74e48dc4bc54..3c8876ceccb5 100644
--- a/src/relay/transforms/simplify_expr.cc
+++ b/src/relay/transforms/simplify_expr.cc
@@ -82,6 +82,99 @@ class SimplifyReshape : public SimplifyPattern {
   DFPattern x_;
 };
 
+/*!
+ * \brief SimplifyTranspose matches the pattern of consecutive transpose op,
+ *   and merges or cancels them.
+ */
+class SimplifyTranspose : public SimplifyPattern {
+ public:
+  SimplifyTranspose() {
+    x_ = IsWildcard();
+    auto trans1 = IsOp("transpose") || IsOp("layout_transform");
+    auto trans2 = IsOp("transpose") || IsOp("layout_transform");
+    pattern_ = trans1({trans2({x_})});
+  }
+
+  Expr callback(const Expr& pre, const Expr& post,
+                const Map<DFPattern, Array<Expr>>& node_map) const override {
+    // Helper function to get the axes from call node attribute
+    auto get_axes_from_call = [](const Call trans_call, int ndim) {
+      std::vector<int> attr_axes;
+      if (auto attr = trans_call->attrs.as<TransposeAttrs>()) {
+        if (attr->axes.defined()) {
+          for (int i = 0; i < ndim; ++i) {
+            int64_t axis = attr->axes[i];
+            axis += (axis < 0) ? ndim : 0;
+            attr_axes.push_back(axis);
+          }
+        } else {
+          // Empty axes means reverse
+          for (int i = ndim - 1; i >= 0; --i) {
+            attr_axes.push_back(i);
+          }
+        }
+      } else if (auto attr = trans_call->attrs.as<LayoutTransformAttrs>()) {
+        Layout src_layout(attr->src_layout);
+        Layout dst_layout(attr->dst_layout);
+        for (int i = 0; i < ndim; ++i) {
+          attr_axes.push_back(src_layout.IndexOf(dst_layout[i]));
+        }
+      } else {
+        CHECK(false) << "Expected transpose or layout_transform, but got "
+                     << Downcast<Op>(trans_call->op)->name;
+      }
+      return std::move(attr_axes);
+    };
+
+    auto x = node_map[x_][0];
+
+    // Initialize axes
+    int ndim = Downcast<TensorType>(pre->checked_type())->shape.size();
+    Array<Integer> axes;
+    for (int i = 0; i < ndim; ++i) {
+      axes.push_back(i);
+    }
+
+    // Collect axes changes from the matched pattern, including two consecutive transposes.
+    std::vector<std::vector<int>> interm_axes;
+    Call trans_call = Downcast<Call>(post);
+    interm_axes.push_back(get_axes_from_call(trans_call, ndim));
+    trans_call = Downcast<Call>(trans_call->args[0]);
+    interm_axes.push_back(get_axes_from_call(trans_call, ndim));
+
+    // Calculate the final axes in reverse order (from root to output)
+    auto it = interm_axes.rbegin();
+    while (it != interm_axes.rend()) {
+      auto interm = *it;
+
+      Array<Integer> new_axes;
+      for (int i = 0; i < ndim; ++i) {
+        new_axes.push_back(axes[interm[i]]);
+      }
+      axes = new_axes;
+      it++;
+    }
+
+    // Check if the transpose is still required
+    bool need_transpose = false;
+    for (int i = 0; i < ndim; ++i) {
+      if (axes[i] != i) {
+        need_transpose = true;
+        break;
+      }
+    }
+
+    if (need_transpose) {
+      return MakeTranspose(x, axes);
+    }
+    return x;
+  }
+
+ private:
+  /*! \brief Pattern input */
+  DFPattern x_;
+};
+
 /*!
  * \brief FullArgwhere finds full followed by argwhere and turns it into an Arange op
  */
@@ -162,6 +255,7 @@ class ExprSimplifier {
  public:
   explicit ExprSimplifier(IRModule mod) : mod_(mod) {
     CreateCallback(SimplifyReshape());
+    CreateCallback(SimplifyTranspose());
     CreateCallback(FullElementwise());
   }
   template <typename T>
diff --git a/tests/python/relay/test_pass_simplify_expr.py b/tests/python/relay/test_pass_simplify_expr.py
index 9531d896b2ed..897f90b9ee2a 100644
--- a/tests/python/relay/test_pass_simplify_expr.py
+++ b/tests/python/relay/test_pass_simplify_expr.py
@@ -60,6 +60,63 @@ def symbolic():
     assert tvm.ir.structural_equal(zz, after)
 
 
+def test_simplify_transpose():
+    # Test a series of transpose and layout_transform ops
+    def before1():
+        x = relay.var("x", shape=(1, 3, 224, 224), dtype="float32")  # NCHW
+        y = relay.transpose(x, axes=[0, 2, 3, 1])  # To NHWC
+        y = relay.layout_transform(y, "NHWC", "HWCN")  # To HWCN
+        y = relay.transpose(y, axes=[3, 0, 1, 2])  # To NHWC
+        return relay.Function([x], y)
+
+    def expected1():
+        x = relay.var("x", shape=(1, 3, 224, 224), dtype="float32")  # NCHW
+        y = relay.transpose(x, axes=[0, 2, 3, 1])  # To NHWC
+        return relay.Function([x], y)
+
+    # Test that all transpose ops can be cancelled
+    def before2():
+        x = relay.var("x", shape=(1, 3, 224, 224), dtype="float32")  # NCHW
+        y = relay.nn.relu(x)
+        y = relay.transpose(y, axes=[0, 2, 3, 1])  # To NHWC
+        y = relay.transpose(y, axes=[1, 2, 3, 0])  # To HWCN
+        y = relay.transpose(y, axes=[3, 2, 0, 1])  # To NCHW
+        return relay.Function([x], y)
+
+    def expected2():
+        x = relay.var("x", shape=(1, 3, 224, 224), dtype="float32")  # NCHW
+        y = relay.nn.relu(x)
+        return relay.Function([x], y)
+
+    # Test default axis (reverse) and negative axis
+    def before3():
+        x = relay.var("x", shape=(1, 3, 224, 224), dtype="float32")  # NCHW
+        y = relay.nn.relu(x)
+        y = relay.transpose(y)  # Reverse
+        y = relay.transpose(y)  # Reverse
+        y = relay.transpose(y, axes=[0, 2, -1, 1])
+        y = relay.transpose(y)  # Reverse
+        y = relay.transpose(y)  # Reverse
+        return relay.Function([x], y)
+
+    def expected3():
+        x = relay.var("x", shape=(1, 3, 224, 224), dtype="float32")  # NCHW
+        y = relay.nn.relu(x)
+        y = relay.transpose(y, axes=[0, 2, 3, 1])
+        return relay.Function([x], y)
+
+    for before, expected in [
+        [before1(), expected1()],
+        [before2(), expected2()],
+        [before3(), expected3()],
+    ]:
+        after = run_opt_pass(before, transform.SimplifyExpr())
+        expected = run_opt_pass(expected, transform.InferType())
+        assert tvm.ir.structural_equal(after, expected), "\nafter: {} \nexpected: {}".format(
+            after, expected
+        )
+
+
 def test_simplify_full_elementwise():
     def validate(shape, value, dtype):
         def before_left(x, elem_op, full):
@@ -126,4 +183,5 @@ def after_right(x, elem_op, value):
 
 if __name__ == "__main__":
     test_simplify_reshape()
+    test_simplify_transpose()
     test_simplify_full_elementwise()

From 348d4e7532f7d5ce68d76e7634a5f11b7dc60ab8 Mon Sep 17 00:00:00 2001
From: Matthew Brookhart <mbrookhart@octoml.ai>
Date: Mon, 15 Mar 2021 23:10:44 -0600
Subject: [PATCH 342/357] init the concat tensor with 1s and then slice them
 away (#7666)

---
 python/tvm/relay/frontend/onnx.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/python/tvm/relay/frontend/onnx.py b/python/tvm/relay/frontend/onnx.py
index 4a0a1ed09b9f..391eaaab5f64 100644
--- a/python/tvm/relay/frontend/onnx.py
+++ b/python/tvm/relay/frontend/onnx.py
@@ -2629,10 +2629,10 @@ def _outer_body(i, B, C, onnx_out, nms_size_out, out):
 
         # Call the second loop, rework outputs into correct form
         init_count = _op.const(np.array([0]).astype("int64"), dtype="int64")
-        init_out = _op.const(np.array([]).reshape([0, 3]).astype("int64"), dtype="int64")
+        init_out = _op.const(np.array([1, 1, 1]).reshape([1, 3]).astype("int64"), dtype="int64")
         loop_vals = outer_loop(init_count, B, C, onnx_output, nms_size_output, init_out)
-
-        return _expr.TupleGetItem(loop_vals, 5)
+        loop_out = _expr.TupleGetItem(loop_vals, 5)
+        return _op.strided_slice(loop_out, [1, 0], shape_of(loop_out), [1, 1])
 
 
 # compatible operators that do NOT require any conversion.

From d288bbc5df3660355adbf97f2f84ecd232e269ff Mon Sep 17 00:00:00 2001
From: Matthew Brookhart <mbrookhart@octoml.ai>
Date: Mon, 15 Mar 2021 23:12:16 -0600
Subject: [PATCH 343/357] [TOPI][GPU] Mergepath sort with odd-even block sort
 (#7611)

* Mergepath sort with odd-even block sort

* fix lint, add test

* respond to review comments

* speed up tests by reducing dtype skews

* fix bad rebase

* change threading to support vulkan

* fix lint

* only sort if the data is non-empty

* fix lint again

* fix for vk

* move if to higher scope

* fix typo

Co-authored-by: Masahiro Masuda <masahi@129@gmail.com>
---
 include/tvm/tir/stmt.h               |   4 +
 python/tvm/topi/cuda/sort.py         | 604 ++++++++++++++++++++-------
 src/tir/transforms/storage_access.cc |   4 +
 tests/python/relay/test_op_level6.py |   9 +-
 4 files changed, 457 insertions(+), 164 deletions(-)

diff --git a/include/tvm/tir/stmt.h b/include/tvm/tir/stmt.h
index ac660bfb7461..6445bb1fe73f 100644
--- a/include/tvm/tir/stmt.h
+++ b/include/tvm/tir/stmt.h
@@ -1312,6 +1312,10 @@ constexpr const char* fragment_shape = "fragment_shape";
  */
 constexpr const char* fragment_layout = "fragment_layout";
 
+/*!
+ * \brief Mark that the kernel is hand threaded and doesn't need syncs inserted
+ */
+constexpr const char* hand_threaded = "hand_threaded";
 /*!
  * \brief Check if attr_key is a pragma key extension
  * \param attr_key The attr key to be compared
diff --git a/python/tvm/topi/cuda/sort.py b/python/tvm/topi/cuda/sort.py
index ca832ef0ef36..5ebd3060a6bb 100644
--- a/python/tvm/topi/cuda/sort.py
+++ b/python/tvm/topi/cuda/sort.py
@@ -57,6 +57,20 @@ def traverse(op):
     return s
 
 
+def _get_threads(ib, nthread_tx, nthread_bx, nthread_by, nthread_bz):
+    tx = te.thread_axis("threadIdx.x")
+    bx = te.thread_axis("blockIdx.x")
+    ib.scope_attr(tx, "thread_extent", nthread_tx)
+    ib.scope_attr(bx, "thread_extent", nthread_bx)
+
+    by = te.thread_axis("blockIdx.y")
+    bz = te.thread_axis("blockIdx.z")
+    ib.scope_attr(by, "thread_extent", nthread_by)
+    ib.scope_attr(bz, "thread_extent", nthread_bz)
+
+    return tx, bx, by, bz
+
+
 def _sort_init(ib, shape, axis, keys_in, keys_out, values_out=None, value_init_func=None):
     """Initialize the output buffers by copying from inputs"""
     axis_mul_before = 1
@@ -78,16 +92,8 @@ def _sort_init(ib, shape, axis, keys_in, keys_out, values_out=None, value_init_f
 
     # Copy the keys_in to initial output
     with ib.new_scope():
-        tx = te.thread_axis("threadIdx.x")
-        bx = te.thread_axis("blockIdx.x")
-        ib.scope_attr(tx, "thread_extent", nthread_tx)
-        ib.scope_attr(bx, "thread_extent", nthread_bx)
+        tx, bx, by, bz = _get_threads(ib, nthread_tx, nthread_bx, nthread_by, nthread_bz)
         tid = bx * nthread_tx + tx
-
-        by = te.thread_axis("blockIdx.y")
-        bz = te.thread_axis("blockIdx.z")
-        ib.scope_attr(by, "thread_extent", nthread_by)
-        ib.scope_attr(bz, "thread_extent", nthread_bz)
         idx = (by * shape[axis] + tid) * axis_mul_after + bz
         with ib.if_scope(tid < shape[axis]):
             keys_out[idx] = keys_in[idx]
@@ -97,6 +103,100 @@ def _sort_init(ib, shape, axis, keys_in, keys_out, values_out=None, value_init_f
     return axis_mul_before, axis_mul_after
 
 
+## TODO(mbrookhart): These are effective optimziation hyperparametrs
+## Perhaps we can autotune?
+block_size = 128
+thread_work = 4
+
+
+def _odd_even_sort(
+    ib,
+    size,
+    axis_mul_before,
+    axis_mul_after,
+    is_ascend,
+    keys,
+    keys_swap,
+    values=None,
+    values_swap=None,
+):
+
+    nthread_tx = block_size // 2
+    nthread_bx = ceil_div(size, block_size)
+    nthread_by = axis_mul_before
+    nthread_bz = axis_mul_after
+    with ib.new_scope():
+        ib.scope_attr(tvm.tir.const(0), "hand_threaded", 0)
+        tx, bx, by, bz = _get_threads(ib, nthread_tx, nthread_bx, nthread_by, nthread_bz)
+        tid = 2 * tx
+        start = bx * block_size
+
+        ## Create shared memory as syncable thread scratch space
+        tmp_keys_swap = ib.allocate(
+            keys_swap.dtype,
+            (block_size,),
+            name="temp_keys_swap",
+            scope="shared",
+        )
+        if values_swap is not None:
+            tmp_values_swap = ib.allocate(
+                values_swap.dtype,
+                (block_size,),
+                name="temp_values_swap",
+                scope="shared",
+            )
+
+        ## Create thread local data for swapping
+        temp_keys = ib.allocate(keys_swap.dtype, (1,), name="temp_keys", scope="local")
+        if values_swap is not None:
+            temp_values = ib.allocate(values_swap.dtype, (1,), name="temp_values", scope="local")
+
+        temp_cond1 = ib.allocate(keys_swap.dtype, (1,), name="temp_cond1", scope="local")
+        temp_cond2 = ib.allocate(keys_swap.dtype, (1,), name="temp_cond2", scope="local")
+        # Copy data to scratch space
+        base_idx = by * size * axis_mul_after + bz
+        with ib.for_range(0, 2) as n:
+            with ib.if_scope((tid + n + start) < size):
+                tmp_keys_swap[tid + n] = keys[base_idx + (tid + n + start) * axis_mul_after]
+                if values_swap is not None:
+                    tmp_values_swap[tid + n] = values[base_idx + (tid + n + start) * axis_mul_after]
+
+        ib.emit(tvm.tir.Call(None, "tir.tvm_storage_sync", tvm.runtime.convert(["shared"])))
+
+        idxm = tvm.tir.indexmod
+        # OddEvenTransposeSort
+        current_sort_num = tvm.tir.min(block_size, size - start)
+        with ib.for_range(0, current_sort_num) as k:
+            n = idxm(tid + k, 2)
+            with ib.if_scope(tid + n < current_sort_num - 1):
+                temp_cond1[0] = tmp_keys_swap[tid + n]
+                temp_cond2[0] = tmp_keys_swap[tid + n + 1]
+                if is_ascend:
+                    cond = temp_cond1[0] > temp_cond2[0]
+                else:
+                    cond = temp_cond1[0] < temp_cond2[0]
+                with ib.if_scope(cond):
+                    temp_keys[0] = tmp_keys_swap[tid + n]
+                    tmp_keys_swap[tid + n] = tmp_keys_swap[tid + n + 1]
+                    tmp_keys_swap[tid + n + 1] = temp_keys[0]
+                    if values_swap is not None:
+                        temp_values[0] = tmp_values_swap[tid + n]
+                        tmp_values_swap[tid + n] = tmp_values_swap[tid + n + 1]
+                        tmp_values_swap[tid + n + 1] = temp_values[0]
+            ib.emit(tvm.tir.Call(None, "tir.tvm_storage_sync", tvm.runtime.convert(["shared"])))
+
+        ## Copy sorted data to output
+        with ib.for_range(0, 2) as n:
+            with ib.if_scope(tid + n + start < size):
+                keys[base_idx + (tid + n + start) * axis_mul_after] = tmp_keys_swap[tid + n]
+                keys_swap[base_idx + (tid + n + start) * axis_mul_after] = tmp_keys_swap[tid + n]
+                if values_swap is not None:
+                    values[base_idx + (tid + n + start) * axis_mul_after] = tmp_values_swap[tid + n]
+                    values_swap[base_idx + (tid + n + start) * axis_mul_after] = tmp_values_swap[
+                        tid + n
+                    ]
+
+
 def _sort_common(
     ib,
     size,
@@ -110,22 +210,22 @@ def _sort_common(
 ):
     """Either sort only values or sort values by keys."""
 
-    ## we are looping over the array doing mergesort from the bottom up.
-    ## The outer loop runs on the host and launches a cuda kernel for each iteration
-    ## of the algorithm.
-    ## The basic idea is that at iteration 0, each thread does sort on 2 elements.
-    ## On iteration 1, each thread merges 2 sorted arrays of 2 elements,
-    ## to deal with 4 total elements.
-    ## On iteration 2, each thread merges 2 sorted arrays of 4 elements,
-    ## to deal with 8 total elements. On iteration 3, each thread deals with 16 elements, etc
-    ## On the final iteration of the algorithm, one thread will merge two sorted lists
-    ## to sort the entire array
+    ## This function performs a multi-level mergesort
+    ## For blocks of length <= block_size, it does odd-even transpose sort
+    ##    in GPU shared memory
+    ## For intermediate block sizes (>block_size, < max_threads * thread_work)
+    ##    it uses the mergpath algorthim https://arxiv.org/abs/1406.2628
+    ##    to merge blocks in parallel
+    ## At some point, the size of the blocks to be merged is too big for max_threads
+    ##    and we switch to using a dual-level mergepath where the outer mergepath
+    ##    finds the start/end locations of the inner mergepath so that we can split
+    ##    the merge into more blocks
 
     max_threads = int(tvm.target.Target.current(allow_none=False).max_num_threads)
+    nthread_by = axis_mul_before * axis_mul_after
+    nthread_bz = 1
     nthread_tx = max_threads
-    nthread_bx = ceil_div(size, max_threads)
-    nthread_by = axis_mul_before
-    nthread_bz = axis_mul_after
+    nthread_bx = ceil_div(size, nthread_tx)
 
     def compare(a, b):
         """
@@ -137,91 +237,234 @@ def compare(a, b):
             out = b <= a
         return out
 
-    def bottom_up_merge(source, dest, source_idx, dest_idx, start, middle, end, even):
-        """
-        Merge the two sections of the array assigned to this thread
-        """
-        # pylint: disable=arguments-out-of-order
-        # initialize iterators
+    # Sort the lower levels of the merge using odd-even sort, it's fast for small inputs
+    lower_lim = tvm.tir.generic.cast(
+        tvm.tir.ceil(tvm.tir.log2(tvm.tir.generic.cast(block_size, "float64"))), "int64"
+    )
+
+    _odd_even_sort(
+        ib,
+        size,
+        axis_mul_before * axis_mul_after,
+        1,
+        is_ascend,
+        keys,
+        keys_swap,
+        values,
+        values_swap,
+    )
+
+    upper_lim = tvm.tir.generic.cast(
+        tvm.tir.ceil(tvm.tir.log2(tvm.tir.generic.cast(size, "float64"))), "int64"
+    )
+
+    def get_merge_begin(source, base_idx, aCount, bCount, aStart, bStart, diag, step_count):
+        first = ib.allocate("int64", (1,), name="first", scope="local")
+        mid = ib.allocate("int64", (1,), name="mid", scope="local")
+        last = ib.allocate("int64", (1,), name="last", scope="local")
+        first[0] = tvm.te.max(0, diag - bCount)
+        last[0] = tvm.te.min(diag, aCount)
+        with ib.while_loop(first[0] < last[0]):
+            mid = (first[0] + last[0]) >> 1
+            a = source[base_idx + (aStart + mid)]
+            b = source[base_idx + (bStart + diag - 1 - mid)]
+            with ib.if_scope(compare(a, b)):
+                first[0] = mid + 1
+            with ib.else_scope():
+                last[0] = mid
+        return first[0], last[0]
+
+    def serial_merge(
+        source,
+        dest,
+        source_idx,
+        dest_idx,
+        base_idx,
+        aCount,
+        bCount,
+        aStart,
+        bStart,
+        kStart,
+        diag,
+        step_count,
+        first,
+        last,
+    ):
         i = ib.allocate("int64", (1,), name="i", scope="local")
         j = ib.allocate("int64", (1,), name="j", scope="local")
-        i[0] = start
-        j[0] = middle
-        # set up indexes
-        base_idx = by * size * axis_mul_after + bz
-        # iterate over the output loop
-        with ib.for_range(0, end - start) as k:
-            i_idx = base_idx + i[0] * axis_mul_after
-            j_idx = base_idx + j[0] * axis_mul_after
-            k_idx = base_idx + (k + start) * axis_mul_after
-
-            def swap_values(source, dest, source_idx, dest_idx):
-                def assign_i():
-                    """assign i value to current output"""
-                    dest[k_idx] = source[i_idx]
-                    if values is not None:
-                        dest_idx[k_idx] = source_idx[i_idx]
-                    i[0] += 1
-
-                def assign_j():
-                    """assign j value to current output"""
-                    dest[k_idx] = source[j_idx]
-                    if values is not None:
-                        dest_idx[k_idx] = source_idx[j_idx]
-                    j[0] += 1
-
-                ## if both of the iterators are in range
-                with ib.if_scope(tvm.tir.all(i[0] < middle, j[0] < end)):
-                    # compare them and insert whichever is next into the output
-                    with ib.if_scope(compare(source[i_idx], source[j_idx])):
-                        assign_i()
-                    with ib.else_scope():
-                        assign_j()
-                # otherwise, simply copy the remainder of the valid iterator to the output
-                with ib.else_scope():
-                    with ib.if_scope(i[0] < middle):
-                        assign_i()
-                    with ib.else_scope():
-                        assign_j()
+        i[0] = aStart + first
+        j[0] = bStart + diag - last
+        with ib.for_range(0, tvm.te.min(aCount + bCount - diag, step_count)) as count:
+            i_idx = base_idx + i[0]
+            j_idx = base_idx + j[0]
+            k_idx = base_idx + (kStart + diag + count)
+
+            def assign_i():
+                """assign i value to current output"""
+                dest[k_idx] = source[i_idx]
+                if values is not None:
+                    dest_idx[k_idx] = source_idx[i_idx]
+                i[0] += 1
 
-            # Switch which input is the source and which is the destination each iteration
-            with ib.if_scope(even):
-                swap_values(source, dest, source_idx, dest_idx)
+            def assign_j():
+                """assign j value to current output"""
+                dest[k_idx] = source[j_idx]
+                if values is not None:
+                    dest_idx[k_idx] = source_idx[j_idx]
+                j[0] += 1
+
+            ## if both of the iterators are in range
+            with ib.if_scope(tvm.tir.all(i[0] < aStart + aCount, j[0] < bStart + bCount)):
+                # compare them and insert whichever is next into the output
+                with ib.if_scope(compare(source[i_idx], source[j_idx])):
+                    assign_i()
+                with ib.else_scope():
+                    assign_j()
+            # otherwise, simply copy the remainder of the valid iterator to the output
             with ib.else_scope():
-                swap_values(dest, source, dest_idx, source_idx)
-
-    def mergesort(source, dest, source_idx, dest_idx, size, width, even):
-        # calculate the start, mid, and end points of this section
-        start = width * tid
-
-        with ib.if_scope(start < size):
-            middle = cast(tvm.te.min(start + tvm.tir.indexdiv(width, 2), size), "int64")
-            end = cast(tvm.te.min(start + width, size), "int64")
-            # merge the start->middle and middle->end arrays
-            bottom_up_merge(source, dest, source_idx, dest_idx, start, middle, end, even)
+                with ib.if_scope(i[0] < aStart + aCount):
+                    assign_i()
+                with ib.else_scope():
+                    assign_j()
 
-    lim = tvm.tir.generic.cast(
-        tvm.tir.ceil(tvm.tir.log2(tvm.tir.generic.cast(size, "float64"))), "int64"
-    )
-    with ib.for_range(0, lim, dtype="int64") as l2_width:
-        width = 2 << l2_width
+    with ib.for_range(0, upper_lim - lower_lim, dtype="int64") as l2_width:
+        width = 2 << (l2_width + lower_lim)
         # Define and launch the cuda kernel
         with ib.new_scope():
-            tx = te.thread_axis("threadIdx.x")
-            bx = te.thread_axis("blockIdx.x")
-            ib.scope_attr(tx, "thread_extent", nthread_tx)
-            # Reduce the number of blocks as the work per thread grows
-            ib.scope_attr(
-                bx,
-                "thread_extent",
-                tvm.tir.generic.cast(ceil_div(size, width * max_threads), "int32"),
-            )
-            tid = bx * nthread_tx + tx
-
-            by = te.thread_axis("blockIdx.y")
-            bz = te.thread_axis("blockIdx.z")
-            ib.scope_attr(by, "thread_extent", nthread_by)
-            ib.scope_attr(bz, "thread_extent", nthread_bz)
+            target = tvm.target.Target.current()
+            if "vulkan" in str(target):
+                # Vulkan can't handle dynamic nthread, so we thread slightly differently
+                # for vulkan. We don't do this generally because it causes a 15% perf
+                # regression on other platforms
+                ntx = max_threads
+                nbx = tvm.tir.generic.cast(ceil_div(width, max_threads * thread_work), "int32")
+                nbz = tvm.tir.generic.cast(ceil_div(size, width), "int32")
+                tx, bx, by, bz = _get_threads(ib, ntx, nbx, nthread_by, nbz)
+            else:
+                ntx = tvm.tir.generic.cast(tvm.te.min(max_threads, width), "int32")
+                nbx = tvm.tir.generic.cast(ceil_div(width, max_threads * thread_work), "int32")
+                nbz = tvm.tir.generic.cast(ceil_div(size, width), "int32")
+                tx, bx, by, bz = _get_threads(ib, ntx, nbx, nthread_by, nbz)
+
+            def mergepath(
+                source,
+                dest,
+                source_idx,
+                dest_idx,
+                aCount,
+                bCount,
+                aStart,
+                bStart,
+                kStart,
+                step_count,
+                even,
+            ):
+                # pylint: disable=arguments-out-of-order
+                def merge(source, dest, source_idx, dest_idx):
+                    diag = tx * step_count
+                    first, last = get_merge_begin(
+                        source,
+                        by * size,
+                        aCount,
+                        bCount,
+                        aStart,
+                        bStart,
+                        diag,
+                        step_count,
+                    )
+                    # iterate over the output loop
+                    serial_merge(
+                        source,
+                        dest,
+                        source_idx,
+                        dest_idx,
+                        by * size,
+                        aCount,
+                        bCount,
+                        aStart,
+                        bStart,
+                        kStart,
+                        diag,
+                        step_count,
+                        first,
+                        last,
+                    )
+
+                with ib.if_scope(even):
+                    merge(source, dest, source_idx, dest_idx)
+                with ib.else_scope():
+                    merge(dest, source, dest_idx, source_idx)
+
+            def mergesort(source, dest, source_idx, dest_idx, size, width, even):
+                # calculate the start, mid, and end points of this section
+                start = width * bz
+                middle = cast(tvm.te.min(start + tvm.tir.indexdiv(width, 2), size), "int64")
+                end = cast(tvm.te.min(start + width, size), "int64")
+                with ib.if_scope(start < size):
+                    with ib.if_scope(nbx == 1):
+                        ## merge the start->middle and middle->end arrays
+                        aCount = middle - start
+                        bCount = end - middle
+                        mergepath(
+                            source,
+                            dest,
+                            source_idx,
+                            dest_idx,
+                            aCount,
+                            bCount,
+                            start,
+                            middle,
+                            start,
+                            ceil_div(width, ntx),
+                            even,
+                        )
+                    with ib.else_scope():
+                        step_count = max_threads * thread_work
+                        diag = bx * step_count
+
+                        def do_merge(first, last):
+                            aStart = start + first
+                            bStart = middle + diag - last
+                            aCount = tvm.te.min(middle - aStart, step_count)
+                            bCount = tvm.te.min(end - bStart, step_count)
+                            mergepath(
+                                source,
+                                dest,
+                                source_idx,
+                                dest_idx,
+                                aCount,
+                                bCount,
+                                aStart,
+                                bStart,
+                                start + diag,
+                                thread_work,
+                                even,
+                            )
+
+                        with ib.if_scope(even):
+                            first, last = get_merge_begin(
+                                source,
+                                by * size,
+                                middle - start,
+                                end - middle,
+                                start,
+                                middle,
+                                diag,
+                                step_count,
+                            )
+                            do_merge(first, last)
+                        with ib.else_scope():
+                            first, last = get_merge_begin(
+                                dest,
+                                by * size,
+                                middle - start,
+                                end - middle,
+                                start,
+                                middle,
+                                diag,
+                                step_count,
+                            )
+                            do_merge(first, last)
 
             # Call the kernel
             mergesort(
@@ -233,29 +476,23 @@ def mergesort(source, dest, source_idx, dest_idx, size, width, even):
                 width,
                 tvm.tir.indexmod(l2_width, 2) == 0,
             )
-
+    nthread_by = axis_mul_before
+    nthread_bz = axis_mul_after
+    nthread_tx = max_threads
+    nthread_bx = ceil_div(size, nthread_tx)
     ## if the final sorted data ended up in the swap, copy it to the real output
-    with ib.if_scope(tvm.tir.indexmod(lim, 2) == 1):
+    with ib.if_scope(
+        tvm.tir.all(upper_lim > lower_lim, tvm.tir.indexmod(upper_lim - lower_lim, 2) == 1)
+    ):
         with ib.new_scope():
-            tx = te.thread_axis("threadIdx.x")
-            bx = te.thread_axis("blockIdx.x")
-            ib.scope_attr(tx, "thread_extent", nthread_tx)
-            ib.scope_attr(bx, "thread_extent", nthread_bx)
+            tx, bx, by, bz = _get_threads(ib, nthread_tx, nthread_bx, nthread_by, nthread_bz)
             tid = bx * nthread_tx + tx
-
-            by = te.thread_axis("blockIdx.y")
-            bz = te.thread_axis("blockIdx.z")
-            ib.scope_attr(by, "thread_extent", nthread_by)
-            ib.scope_attr(bz, "thread_extent", nthread_bz)
-            idx = (by * size + tid) * axis_mul_after + bz
+            idx = (by * axis_mul_after + bz) * size + tid
             with ib.if_scope(tid < size):
-                idx = (by * size + tid) * axis_mul_after + bz
                 keys[idx] = keys_swap[idx]
                 if values is not None:
                     values[idx] = values_swap[idx]
 
-    return ib.get()
-
 
 def sort_ir(
     data, values_out, values_out_swap, axis, is_ascend, indices_out=None, indices_out_swap=None
@@ -301,27 +538,30 @@ def sort_ir(
         assert indices_out_swap is not None
         indices_out_swap = ib.buffer_ptr(indices_out_swap)
 
-    axis_mul_before, axis_mul_after = _sort_init(
-        ib,
-        shape,
-        axis,
-        data,
-        values_out,
-        indices_out,
-        value_init_func=lambda _, tid: tvm.tir.generic.cast(tid, indices_out.dtype),
-    )
+    with ib.if_scope(shape[axis] > 0):
+        axis_mul_before, axis_mul_after = _sort_init(
+            ib,
+            shape,
+            axis,
+            data,
+            values_out,
+            indices_out,
+            value_init_func=lambda _, tid: tvm.tir.generic.cast(tid, indices_out.dtype),
+        )
+
+        _sort_common(
+            ib,
+            shape[axis],
+            axis_mul_before,
+            axis_mul_after,
+            is_ascend,
+            values_out,
+            values_out_swap,
+            values=indices_out,
+            values_swap=indices_out_swap,
+        )
 
-    return _sort_common(
-        ib,
-        shape[axis],
-        axis_mul_before,
-        axis_mul_after,
-        is_ascend,
-        values_out,
-        values_out_swap,
-        values=indices_out,
-        values_swap=indices_out_swap,
-    )
+    return ib.get()
 
 
 def sort_by_key_ir(
@@ -376,27 +616,29 @@ def sort_by_key_ir(
     values_out = ib.buffer_ptr(values_out)
     values_out_swap = ib.buffer_ptr(values_out_swap)
 
-    axis_mul_before, axis_mul_after = _sort_init(
-        ib,
-        shape,
-        axis,
-        keys_in,
-        keys_out,
-        values_out,
-        value_init_func=lambda idx, _: values_in[idx],
-    )
-
-    return _sort_common(
-        ib,
-        shape[axis],
-        axis_mul_before,
-        axis_mul_after,
-        is_ascend,
-        keys_out,
-        keys_out_swap,
-        values=values_out,
-        values_swap=values_out_swap,
-    )
+    with ib.if_scope(shape[axis] > 0):
+        axis_mul_before, axis_mul_after = _sort_init(
+            ib,
+            shape,
+            axis,
+            keys_in,
+            keys_out,
+            values_out,
+            value_init_func=lambda idx, _: values_in[idx],
+        )
+
+        _sort_common(
+            ib,
+            shape[axis],
+            axis_mul_before,
+            axis_mul_after,
+            is_ascend,
+            keys_out,
+            keys_out_swap,
+            values=values_out,
+            values_swap=values_out_swap,
+        )
+    return ib.get()
 
 
 def sort(data, axis=-1, is_ascend=1):
@@ -419,16 +661,29 @@ def sort(data, axis=-1, is_ascend=1):
     out : tvm.te.Tensor
         The output of this function.
     """
+    ndim = len(data.shape)
+    axis = ndim + axis if axis < 0 else axis
+    if axis != ndim - 1:
+        # Prepare for sorting along axis -1.
+        axes = swap(list(range(ndim)), axis)
+        data = transpose(data, axes)
+
     value_buf = tvm.tir.decl_buffer(data.shape, data.dtype, "value_buf", data_alignment=8)
     value_buf_swap = tvm.tir.decl_buffer(data.shape, data.dtype, "value_buf_swap", data_alignment=8)
+
     out = te.extern(
         [data.shape, data.shape],
         [data],
-        lambda ins, outs: sort_ir(ins[0], outs[0], outs[1], axis, is_ascend),
+        lambda ins, outs: sort_ir(ins[0], outs[0], outs[1], -1, is_ascend),
         out_buffers=[value_buf, value_buf_swap],
         name="sort_gpu",
         tag="sort_gpu",
     )[0]
+
+    if axis != ndim - 1:
+        axes = swap(list(range(ndim)), axis)
+        out = transpose(out, axes)
+
     return out
 
 
@@ -507,10 +762,18 @@ def argsort(data, axis=-1, is_ascend=1, dtype="float32"):
     out : tvm.te.Tensor
         The output of this function.
     """
+    ndim = len(data.shape)
+    axis = ndim + axis if axis < 0 else axis
+    if axis != ndim - 1:
+        # Prepare for sorting along axis -1.
+        axes = swap(list(range(ndim)), axis)
+        data = transpose(data, axes)
+
     value_buf = tvm.tir.decl_buffer(data.shape, data.dtype, "value_buf", data_alignment=8)
     value_swap_buf = tvm.tir.decl_buffer(data.shape, data.dtype, "value_swap_buf", data_alignment=8)
     indices_buf = tvm.tir.decl_buffer(data.shape, dtype, "out_buf", data_alignment=8)
     indices_swap_buf = tvm.tir.decl_buffer(data.shape, dtype, "out_swap_buf", data_alignment=8)
+
     out = te.extern(
         [data.shape, data.shape, data.shape, data.shape],
         [data],
@@ -518,7 +781,7 @@ def argsort(data, axis=-1, is_ascend=1, dtype="float32"):
             ins[0],
             outs[0],
             outs[2],
-            axis,
+            -1,
             is_ascend,
             indices_out=outs[1],
             indices_out_swap=outs[3],
@@ -527,6 +790,11 @@ def argsort(data, axis=-1, is_ascend=1, dtype="float32"):
         name="argsort_gpu",
         tag="argsort_gpu",
     )[1]
+
+    if axis != ndim - 1:
+        axes = swap(list(range(ndim)), axis)
+        out = transpose(out, axes)
+
     return out
 
 
@@ -625,21 +893,30 @@ def topk(data, k=1, axis=-1, ret_type="both", is_ascend=False, dtype="int64"):
     ndim = len(data.shape)
     axis = axis + ndim if axis < 0 else axis
     assert 0 <= axis < ndim
+    dshape = data.shape
+    if axis != ndim - 1:
+        axes = swap(list(range(ndim)), axis)
+        data = transpose(data, axes)
+
     values_buf = tvm.tir.decl_buffer(data.shape, data.dtype, "values_buf", data_alignment=8)
     values_swap_buf = tvm.tir.decl_buffer(
         data.shape, data.dtype, "values_swap_buf", data_alignment=8
     )
     indices_buf = tvm.tir.decl_buffer(data.shape, dtype, "indices_buf", data_alignment=8)
     indices_swap_buf = tvm.tir.decl_buffer(data.shape, dtype, "indies_swap_buf", data_alignment=8)
+
     if ret_type == "values":
         output = te.extern(
             [data.shape, data.shape],
             [data],
-            lambda ins, outs: sort_ir(ins[0], outs[0], outs[1], axis, is_ascend),
+            lambda ins, outs: sort_ir(ins[0], outs[0], outs[1], -1, is_ascend),
             out_buffers=[values_buf, values_swap_buf],
             name="topk_gpu",
             tag="topk_gpu",
         )[0]
+        if axis != ndim - 1:
+            axes = swap(list(range(ndim)), axis)
+            output = transpose(output, axes)
     else:
         output = te.extern(
             [data.shape, data.shape, data.shape, data.shape],
@@ -648,7 +925,7 @@ def topk(data, k=1, axis=-1, ret_type="both", is_ascend=False, dtype="int64"):
                 ins[0],
                 outs[0],
                 outs[2],
-                axis,
+                -1,
                 is_ascend,
                 indices_out=outs[1],
                 indices_out_swap=outs[3],
@@ -657,6 +934,11 @@ def topk(data, k=1, axis=-1, ret_type="both", is_ascend=False, dtype="int64"):
             name="topk_gpu",
             tag="topk_gpu",
         )[0:2]
+        if axis != ndim - 1:
+            axes = swap(list(range(ndim)), axis)
+            output[0] = transpose(output[0], axes)
+            output[1] = transpose(output[1], axes)
+
     if isinstance(k, int) and k < 1:
         if ret_type == "indices":
             return output[1]
@@ -668,7 +950,7 @@ def topk(data, k=1, axis=-1, ret_type="both", is_ascend=False, dtype="int64"):
         if i == axis:
             end.append(k if isinstance(k, int) else tvm.te.size_var("dim"))
         else:
-            end.append(data.shape[i])
+            end.append(dshape[i])
     if ret_type == "both":
         values_out, indices_out = output
         values_out = strided_slice(values_out, beg, end, strides)
diff --git a/src/tir/transforms/storage_access.cc b/src/tir/transforms/storage_access.cc
index 38143c14b021..00002d3587db 100644
--- a/src/tir/transforms/storage_access.cc
+++ b/src/tir/transforms/storage_access.cc
@@ -132,6 +132,10 @@ void StorageAccessVisitor::VisitStmt_(const AttrStmtNode* op) {
       StmtExprVisitor::VisitStmt_(op);
     }
     env_threads_.pop_back();
+  } else if (op->attr_key == attr::hand_threaded) {
+    // skip this pass on blocks that were hand_threaded
+    // this avoids control flow and read/write conflicts
+    // between hand-threaded kernels and automatic threading
   } else {
     StmtExprVisitor::VisitStmt_(op);
   }
diff --git a/tests/python/relay/test_op_level6.py b/tests/python/relay/test_op_level6.py
index 0dac69e36025..f4b785f59df8 100644
--- a/tests/python/relay/test_op_level6.py
+++ b/tests/python/relay/test_op_level6.py
@@ -26,6 +26,7 @@
 @tvm.testing.uses_gpu
 def test_sort():
     def verify_sort(shape, axis, is_ascend, is_dyn=False):
+
         if is_dyn:
             x = relay.var("x", relay.TensorType([relay.Any()] * len(shape), "float32"))
         else:
@@ -87,9 +88,11 @@ def verify_argsort(shape, axis, is_ascend, dtype, is_dyn=False):
         for dtype in ["int32", "int64", "float32", "float64"]:
             verify_argsort((2, 3, 4), axis=0, is_ascend=False, dtype=dtype, is_dyn=is_dyn)
             verify_argsort((1, 4, 6), axis=1, is_ascend=True, dtype=dtype, is_dyn=is_dyn)
-            verify_argsort((3, 5, 6), axis=-1, is_ascend=False, dtype=dtype, is_dyn=is_dyn)
-            verify_argsort((3, 2000, 6), axis=1, is_ascend=False, dtype=dtype, is_dyn=is_dyn)
-            verify_argsort((1, 122640), axis=1, is_ascend=False, dtype=dtype, is_dyn=is_dyn)
+        dtype = "int32"
+        verify_argsort((3, 5, 6), axis=-1, is_ascend=False, dtype=dtype, is_dyn=is_dyn)
+        verify_argsort((3, 6000, 6), axis=1, is_ascend=False, dtype=dtype, is_dyn=is_dyn)
+        verify_argsort((1000, 1, 1), axis=0, is_ascend=False, dtype=dtype, is_dyn=is_dyn)
+        verify_argsort((1, 122640), axis=1, is_ascend=False, dtype=dtype, is_dyn=is_dyn)
 
 
 @tvm.testing.uses_gpu

From 343b689bf894c72fbda5d8ef90d6b27b5a2613fd Mon Sep 17 00:00:00 2001
From: Chris Hoge <chris@hogepodge.com>
Date: Tue, 16 Mar 2021 12:19:38 -0700
Subject: [PATCH 344/357] [docs] Getting Started with TVM: TVMC Tutorial
 (#7640)

* Getting Started with TVM: TVMC Tutorial

An update of the TVMC tutorial, follows the introduction
and installation sections of the new getting started tutorial

* Update tutorials/get_started/tvmc_command_line_driver.py

Co-authored-by: Leandro Nunes <leandro.nunes@arm.com>

* Style and formatting fixes

Co-authored-by: Leandro Nunes <leandro.nunes@arm.com>
---
 .../get_started/tvmc_command_line_driver.py   | 552 +++++++++++-------
 1 file changed, 357 insertions(+), 195 deletions(-)

diff --git a/tutorials/get_started/tvmc_command_line_driver.py b/tutorials/get_started/tvmc_command_line_driver.py
index bcdf03e56875..fffbfbf0356f 100644
--- a/tutorials/get_started/tvmc_command_line_driver.py
+++ b/tutorials/get_started/tvmc_command_line_driver.py
@@ -15,31 +15,33 @@
 # specific language governing permissions and limitations
 # under the License.
 """
-Getting Started with TVM command line driver - TVMC
-===================================================
+Compiling and Optimizing a Model with TVMC
+==========================================
 **Authors**:
 `Leandro Nunes <https://github.com/leandron>`_,
-`Matthew Barrett <https://github.com/mbaret>`_
-
-This tutorial is an introduction to working with TVMC, the TVM command
-line driver. TVMC is a tool that exposes TVM features such as
-auto-tuning, compiling, profiling and execution of models, via a
-command line interface.
-
-In this tutorial we are going to use TVMC to compile, run and tune a
-ResNet-50 on a x86 CPU.
-
-We are going to start by downloading ResNet 50 V2. Then, we are going
-to use TVMC to compile this model into a TVM module, and use the
-compiled module to generate predictions. Finally, we are going to experiment
-with the auto-tuning options, that can be used to help the compiler to
-improve network performance.
-
-The final goal is to give an overview of TVMC's capabilities and also
-some guidance on where to look for more information.
+`Matthew Barrett <https://github.com/mbaret>`_,
+`Chris Hoge <https://github.com/hogepodge>`_
+
+In this section, we will work with TVMC, the TVM command line driver. TVMC is a
+tool that exposes TVM features such as auto-tuning, compiling, profiling and
+execution of models through a command line interface.
+
+Upon completion of this section, we will have used TVMC to accomplish the
+following tasks:
+
+* Compile a pre-trained ResNet 50 v2 model for the TVM runtime.
+* Run a real image through the compiled model, and interpret the output and
+  model performance.
+* Tune the model on a CPU using TVM.
+* Re-compile an optimized model using the tuning data collected by TVM.
+* Run the image through the optimized model, and compare the output and model
+  performance.
+
+The goal of this section is to give you an overview of TVM and TVMC's
+capabilities, and set the stage for understanding how TVM works.
 """
 
-######################################################################
+################################################################################
 # Using TVMC
 # ----------
 #
@@ -61,32 +63,35 @@
 #
 #   tvmc --help
 #
-#
-# As you can see in the help page, the main features are
-# accessible via the subcommands ``tune``, ``compile`` and ``run``.
-# To read about specific options under a given subcommand, use
-# ``tvmc <subcommand> --help``.
-#
-# In the following sections we will use TVMC to tune, compile and
-# run a model. But first, we need a model.
+# The main features of TVM available to ``tvmc`` are from subcommands
+# ``compile``, and ``run``, and ``tune``.  To read about specific options under
+# a given subcommand, use ``tvmc <subcommand> --help``. We will cover each of
+# these commands in this tutorial, but first we need to download a pre-trained
+# model to work with.
 #
 
 
-######################################################################
-# Obtaining the model
+################################################################################
+# Obtaining the Model
 # -------------------
 #
-# We are going to use ResNet-50 V2 as an example to experiment with TVMC.
-# The version below is in ONNX format. To download the file, you can use
-# the command below:
+# For this tutorial, we will be working with ResNet-50 v2. ResNet-50 is a
+# convolutional neural network that is 50-layers deep and designed to classify
+# images. The model we will be using has been pre-trained on more than a
+# million images with 1000 different classifications. The network has an input
+# image size of 224x224. If you are interested exploring more of how the
+# ResNet-50 model is structured, we recommend downloading `Netron
+# <https://netron.app>`, a freely available ML model viewer.
+#
+# For this tutorial we will be using the model in ONNX format.
 #
 # .. code-block:: bash
 #
 #   wget https://github.com/onnx/models/raw/master/vision/classification/resnet/model/resnet50-v2-7.onnx
 #
-#
 
-######################################################################
+
+################################################################################
 # .. note:: Supported model formats
 #
 #   TVMC supports models created with Keras, ONNX, TensorFlow, TFLite
@@ -96,241 +101,398 @@
 #
 
 
-######################################################################
-# Compiling the model
-# -------------------
+################################################################################
+# Compiling an ONNX Model to the TVM Runtime
+# ------------------------------------------
 #
-# The next step once we've downloaded ResNet-50, is to compile it,
-# To accomplish that, we are going to use ``tvmc compile``. The
-# output we get from the compilation process is a TAR package,
-# that can be used to run our model on the target device.
+# Once we've downloaded the ResNet-50 model, the next step is to compile it. To
+# accomplish that, we are going to use ``tvmc compile``. The output we get from
+# the compilation process is a TAR package of the model compiled to a dynamic
+# library for our target platform. We can run that model on our target device
+# using the TVM runtime.
 #
 # .. code-block:: bash
 #
 #   tvmc compile \
-#     --target "llvm" \
-#     --output compiled_module.tar \
-#     resnet50-v2-7.onnx
+#   --target "llvm" \
+#   --output resnet50-v2-7-tvm.tar \
+#   resnet50-v2-7.onnx
 #
-# Once compilation finishes, the output ``compiled_module.tar`` will be created. This
-# can be directly loaded by your application and run via the TVM runtime APIs.
+# Let's take a look at the files that ``tvmc compile`` creates in the module:
 #
+# .. code-block:: bash
+#
+# 	mkdir model
+# 	tar -xvf resnet50-v2-7-tvm.tar -C model
+# 	ls model
+#
+# You will see three files listed.
+#
+# * ``mod.so`` is the model, represented as a C++ library, that can be loaded
+#   by the TVM runtime.
+# * ``mod.json`` is a text representation of the TVM Relay computation graph.
+# * ``mod.params`` is a file containing the parameters for the pre-trained
+#   model.
+#
+# This module can be directly loaded by your application, and the model can be
+# run via the TVM runtime APIs.
 
 
-######################################################################
-# .. note:: Defining the correct target
+################################################################################
+# .. note:: Defining the Correct Target
 #
 #   Specifying the correct target (option ``--target``) can have a huge
 #   impact on the performance of the compiled module, as it can take
 #   advantage of hardware features available on the target. For more
 #   information, please refer to `Auto-tuning a convolutional network
 #   for x86 CPU <https://tvm.apache.org/docs/tutorials/autotvm/tune_relay_x86.html#define-network>`_.
+#   We recommend identifying which CPU you are running, along with optional features,
+#   and set the target appropriately.
 #
 
-
-######################################################################
-#
-# In the next step, we are going to use the compiled module, providing it
-# with some inputs, to generate some predictions.
-#
-
-
-######################################################################
-# Input pre-processing
-# --------------------
+################################################################################
+# Running the Model from The Compiled Module with TVMC
+# ----------------------------------------------------
 #
-# In order to generate predictions, we will need two things:
+# Now that we've compiled the model to this module, we can use the TVM runtime
+# to make predictions with it. TVMC has the TVM runtime built in to it,
+# allowing you to run compiled TVM models. To use TVMC to run the model and
+# make predictions, we need two things:
 #
-# - the compiled module, which we just produced;
-# - a valid input to the model
+# - The compiled module, which we just produced.
+# - Valid input to the model to make predictions on.
 #
-# Each model is particular when it comes to expected tensor shapes, formats and data
-# types. For this reason, most models require some pre and
-# post processing, to ensure the input(s) is valid and to interpret the output(s).
+# Each model is particular when it comes to expected tensor shapes, formats and
+# data types. For this reason, most models require some pre and
+# post-processing, to ensure the input is valid and to interpret the output.
+# TVMC has adopted NumPy's ``.npz`` format for both input and output data. This
+# is a well-supported NumPy format to serialize multiple arrays into a file
 #
-# In TVMC, we adopted NumPy's ``.npz`` format for both input and output data.
-# This is a well-supported NumPy format to serialize multiple arrays into a file.
-#
-# We will use the usual cat image, similar to other TVM tutorials:
+# As input for this tutorial, we will use the image of a cat, but you can feel
+# free to substitute image for any of your choosing.
 #
 # .. image:: https://s3.amazonaws.com/model-server/inputs/kitten.jpg
 #    :height: 224px
 #    :width: 224px
 #    :align: center
+
+
+################################################################################
+# Input pre-processing
+# ~~~~~~~~~~~~~~~~~~~~
 #
 # For our ResNet 50 V2 model, the input is expected to be in ImageNet format.
 # Here is an example of a script to pre-process an image for ResNet 50 V2.
 #
-from tvm.contrib.download import download_testdata
-from PIL import Image
-import numpy as np
-
-img_url = "https://s3.amazonaws.com/model-server/inputs/kitten.jpg"
-img_path = download_testdata(img_url, "imagenet_cat.png", module="data")
-
-# Resize it to 224x224
-resized_image = Image.open(img_path).resize((224, 224))
-img_data = np.asarray(resized_image).astype("float32")
-
-# ONNX expects NCHW input, so convert the array
-img_data = np.transpose(img_data, (2, 0, 1))
-
-# Normalize according to ImageNet
-imagenet_mean = np.array([0.485, 0.456, 0.406])
-imagenet_stddev = np.array([0.229, 0.224, 0.225])
-norm_img_data = np.zeros(img_data.shape).astype("float32")
-for i in range(img_data.shape[0]):
-    norm_img_data[i, :, :] = (img_data[i, :, :] / 255 - imagenet_mean[i]) / imagenet_stddev[i]
-
-# Add batch dimension
-img_data = np.expand_dims(norm_img_data, axis=0)
-
-# Save to .npz (outputs imagenet_cat.npz)
-np.savez("imagenet_cat", data=img_data)
-
+# .. code-block:: python
+#    :caption: preprocess.py
+#    :name: preprocess.py
+#
+#     #!python ./preprocess.py
+#     from tvm.contrib.download import download_testdata
+#     from PIL import Image
+#     import numpy as np
+#
+#     img_url = "https://s3.amazonaws.com/model-server/inputs/kitten.jpg"
+#     img_path = download_testdata(img_url, "imagenet_cat.png", module="data")
+#
+#     # Resize it to 224x224
+#     resized_image = Image.open(img_path).resize((224, 224))
+#     img_data = np.asarray(resized_image).astype("float32")
+#
+#     # ONNX expects NCHW input, so convert the array
+#     img_data = np.transpose(img_data, (2, 0, 1))
+#
+#     # Normalize according to ImageNet
+#     imagenet_mean = np.array([0.485, 0.456, 0.406])
+#     imagenet_stddev = np.array([0.229, 0.224, 0.225])
+#     norm_img_data = np.zeros(img_data.shape).astype("float32")
+#     for i in range(img_data.shape[0]):
+#    	    norm_img_data[i, :, :] = (img_data[i, :, :] / 255 - imagenet_mean[i]) / imagenet_stddev[i]
+#
+#     # Add batch dimension
+#     img_data = np.expand_dims(norm_img_data, axis=0)
+#
+#     # Save to .npz (outputs imagenet_cat.npz)
+#     np.savez("imagenet_cat", data=img_data)
+#
 
-######################################################################
-# Running the compiled module
-# ---------------------------
+################################################################################
+# Running the Compiled Module
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~
 #
-# With both the compiled module and input file in hand, we can run it by
-# invoking ``tvmc run``.
+# With both the model and input data in hand, we can now run TVMC to make a
+# prediction:
 #
 # .. code-block:: bash
 #
-#    tvmc run \
-#      --inputs imagenet_cat.npz \
-#      --output predictions.npz \
-#      compiled_module.tar
+#     tvmc run \
+#     --inputs imagenet_cat.npz \
+#     --output predictions.npz \
+#     resnet50-v2-7-tvm.tar
 #
-# When running the above command, a new file ``predictions.npz`` should
-# be produced. It contains the output tensors.
+# Recall that the `.tar` model file includes a C++ library, a description of
+# the Relay model, and the parameters for the model. TVMC includes the TVM
+# runtime, which can load the model and make predictions against input. When
+# running the above command, TVMC outputs a new file, ``predictions.npz``, that
+# contains the model output tensors in NumPy format.
 #
 # In this example, we are running the model on the same machine that we used
-# for compilation. In some cases we might want to run it remotely via
-# an RPC Tracker. To read more about these options please check ``tvmc
-# run --help``.
-#
+# for compilation. In some cases we might want to run it remotely via an RPC
+# Tracker. To read more about these options please check ``tvmc run --help``.
 
-######################################################################
-# Output post-processing
-# ----------------------
+################################################################################
+# Output Post-Processing
+# ~~~~~~~~~~~~~~~~~~~~~~
 #
-# As previously mentioned, each model will have its own particular way
-# of providing output tensors.
+# As previously mentioned, each model will have its own particular way of
+# providing output tensors.
 #
-# In our case, we need to run some post-processing to render the
-# outputs from ResNet 50 V2 into a more human-readable form.
+# In our case, we need to run some post-processing to render the outputs from
+# ResNet 50 V2 into a more human-readable form, using the lookup-table provided
+# for the model.
 #
-# The script below shows an example of the post-processing to extract
-# labels from the output of our compiled module.
+# The script below shows an example of the post-processing to extract labels
+# from the output of our compiled module.
 #
-import os.path
-import numpy as np
-
-from scipy.special import softmax
-
-from tvm.contrib.download import download_testdata
-
-# Download a list of labels
-labels_url = "https://s3.amazonaws.com/onnx-model-zoo/synset.txt"
-labels_path = download_testdata(labels_url, "synset.txt", module="data")
-
-with open(labels_path, "r") as f:
-    labels = [l.rstrip() for l in f]
-
-output_file = "predictions.npz"
-
-# Open the output and read the output tensor
-if os.path.exists(output_file):
-    with np.load(output_file) as data:
-        scores = softmax(data["output_0"])
-        scores = np.squeeze(scores)
-        ranks = np.argsort(scores)[::-1]
-
-        for rank in ranks[0:5]:
-            print("class='%s' with probability=%f" % (labels[rank], scores[rank]))
-
-
-########################################################################
-# When running the script, a list of predictions should be printed similar
-# the the example below.
+# .. code-block:: python
+#     :caption: postprocess.py
+#     :name: postprocess.py
+#
+#     #!python ./postprocess.py
+#     import os.path
+#     import numpy as np
+#
+#     from scipy.special import softmax
+#
+#     from tvm.contrib.download import download_testdata
+#
+#     # Download a list of labels
+#     labels_url = "https://s3.amazonaws.com/onnx-model-zoo/synset.txt"
+#     labels_path = download_testdata(labels_url, "synset.txt", module="data")
+#
+#     with open(labels_path, "r") as f:
+#         labels = [l.rstrip() for l in f]
+#
+#     output_file = "predictions.npz"
+#
+#     # Open the output and read the output tensor
+#     if os.path.exists(output_file):
+#         with np.load(output_file) as data:
+#             scores = softmax(data["output_0"])
+#             scores = np.squeeze(scores)
+#             ranks = np.argsort(scores)[::-1]
+#
+#             for rank in ranks[0:5]:
+#                 print("class='%s' with probability=%f" % (labels[rank], scores[rank]))
+#
+# Running this script should produce the following output:
 #
 # .. code-block:: bash
 #
-#   $ python post_processing.py
-#   class=n02123045 tabby, tabby cat ; probability=446.000000
-#   class=n02123159 tiger cat ; probability=675.000000
-#   class=n02124075 Egyptian cat ; probability=836.000000
-#   class=n02129604 tiger, Panthera tigris ; probability=917.000000
-#   class=n04040759 radiator ; probability=213.000000
+#     python postprocess.py
 #
+#     # class='n02123045 tabby, tabby cat' with probability=0.610553
+#     # class='n02123159 tiger cat' with probability=0.367179
+#     # class='n02124075 Egyptian cat' with probability=0.019365
+#     # class='n02129604 tiger, Panthera tigris' with probability=0.001273
+#     # class='n04040759 radiator' with probability=0.000261
+#
+# Try replacing the cat image with other images, and see what sort of
+# predictions the ResNet model makes.
 
-
-######################################################################
-# Tuning the model
-# ----------------
+################################################################################
+# Automatically Tuning the ResNet Model
+# -------------------------------------
+#
+# The previous model was compiled to work on the TVM runtime, but did not
+# include any platform specific optimization. In this section, we will show you
+# how to build an optimized model using TVMC to target your working platform.
 #
 # In some cases, we might not get the expected performance when running
-# inferences using our compiled module. In cases like this, we can make use
-# of the auto-tuner, to find a better configuration for our model and
-# get a boost in performance.
-#
-# Tuning in TVM refers to the process by which a model is optimized
-# to run faster on a given target. This differs from training or
-# fine-tuning in that it does not affect the accuracy of the model,
-# but only the runtime performance.
-#
-# As part of the tuning process, TVM will try running many different
-# operator implementation variants to see which perform best. The
-# results of these runs are stored in a tuning records file, which is
+# inferences using our compiled module.  In cases like this, we can make use of
+# the auto-tuner, to find a better configuration for our model and get a boost
+# in performance. Tuning in TVM refers to the process by which a model is
+# optimized to run faster on a given target. This differs from training or
+# fine-tuning in that it does not affect the accuracy of the model, but only
+# the runtime performance. As part of the tuning process, TVM will try running
+# many different operator implementation variants to see which perform best.
+# The results of these runs are stored in a tuning records file, which is
 # ultimately the output of the ``tune`` subcommand.
 #
 # In the simplest form, tuning requires you to provide three things:
 #
-# - the target specification of the device you intend to run this model on;
-# - the path to an output file in which the tuning records will be stored, and finally,
+# - the target specification of the device you intend to run this model on
+# - the path to an output file in which the tuning records will be stored, and
+#   finally
 # - a path to the model to be tuned.
 #
-#
 # The example below demonstrates how that works in practice:
 #
 # .. code-block:: bash
 #
-#   tvmc tune \
+#     tvmc tune \
 #     --target "llvm" \
-#     --output autotuner_records.json \
+#     --output resnet50-v2-7-autotuner_records.json \
 #     resnet50-v2-7.onnx
 #
+# In this example, you will see better results if you indicate a more specific
+# target for the `--target` flag.  For example, on an Intel i7 processor you
+# could use `--target llvm -mcpu=skylake`. For this tuning example, we are
+# tuning locally on the CPU using LLVM as the compiler for the specified
+# achitecture.
+#
+# TVMC will perform a search against the parameter space for the model, trying
+# out different configurations for operators and choosing the one that runs
+# fastest on your platform. Although this is a guided search based on the CPU
+# and model operations, it can still take several hours to complete the search.
+# The output of this search will be saved to the
+# `resnet50-v2-7-autotuner_records.json` file, which will later be used to
+# compile an optimized model.
+#
+# .. note:: Defining the Tuning Search Algorithm
+#
+#   By default this search is guided using an `XGBoost Grid` algorithm.
+#   Depending on your model complexity and amount of time avilable, you might
+#   want to choose a different algorithm. A full list is available by
+#   consulting ``tvmc tune --help``.
+#
+# The output will look something like this for a consumer-level Skylake CPU:
+#
+# .. code-block:: bash
+#
+#   tvmc tune   --target "llvm -mcpu=broadwell"   --output resnet50-v2-7-autotuner_records.json   resnet50-v2-7.onnx
+#   # [Task  1/24]  Current/Best:    9.65/  23.16 GFLOPS | Progress: (60/1000) | 130.74 s Done.
+#   # [Task  1/24]  Current/Best:    3.56/  23.16 GFLOPS | Progress: (192/1000) | 381.32 s Done.
+#   # [Task  2/24]  Current/Best:   13.13/  58.61 GFLOPS | Progress: (960/1000) | 1190.59 s Done.
+#   # [Task  3/24]  Current/Best:   31.93/  59.52 GFLOPS | Progress: (800/1000) | 727.85 s Done.
+#   # [Task  4/24]  Current/Best:   16.42/  57.80 GFLOPS | Progress: (960/1000) | 559.74 s Done.
+#   # [Task  5/24]  Current/Best:   12.42/  57.92 GFLOPS | Progress: (800/1000) | 766.63 s Done.
+#   # [Task  6/24]  Current/Best:   20.66/  59.25 GFLOPS | Progress: (1000/1000) | 673.61 s Done.
+#   # [Task  7/24]  Current/Best:   15.48/  59.60 GFLOPS | Progress: (1000/1000) | 953.04 s Done.
+#   # [Task  8/24]  Current/Best:   31.97/  59.33 GFLOPS | Progress: (972/1000) | 559.57 s Done.
+#   # [Task  9/24]  Current/Best:   34.14/  60.09 GFLOPS | Progress: (1000/1000) | 479.32 s Done.
+#   # [Task 10/24]  Current/Best:   12.53/  58.97 GFLOPS | Progress: (972/1000) | 642.34 s Done.
+#   # [Task 11/24]  Current/Best:   30.94/  58.47 GFLOPS | Progress: (1000/1000) | 648.26 s Done.
+#   # [Task 12/24]  Current/Best:   23.66/  58.63 GFLOPS | Progress: (1000/1000) | 851.59 s Done.
+#   # [Task 13/24]  Current/Best:   25.44/  59.76 GFLOPS | Progress: (1000/1000) | 534.58 s Done.
+#   # [Task 14/24]  Current/Best:   26.83/  58.51 GFLOPS | Progress: (1000/1000) | 491.67 s Done.
+#   # [Task 15/24]  Current/Best:   33.64/  58.55 GFLOPS | Progress: (1000/1000) | 529.85 s Done.
+#   # [Task 16/24]  Current/Best:   14.93/  57.94 GFLOPS | Progress: (1000/1000) | 645.55 s Done.
+#   # [Task 17/24]  Current/Best:   28.70/  58.19 GFLOPS | Progress: (1000/1000) | 756.88 s Done.
+#   # [Task 18/24]  Current/Best:   19.01/  60.43 GFLOPS | Progress: (980/1000) | 514.69 s Done.
+#   # [Task 19/24]  Current/Best:   14.61/  57.30 GFLOPS | Progress: (1000/1000) | 614.44 s Done.
+#   # [Task 20/24]  Current/Best:   10.47/  57.68 GFLOPS | Progress: (980/1000) | 479.80 s Done.
+#   # [Task 21/24]  Current/Best:   34.37/  58.28 GFLOPS | Progress: (308/1000) | 225.37 s Done.
+#   # [Task 22/24]  Current/Best:   15.75/  57.71 GFLOPS | Progress: (1000/1000) | 1024.05 s Done.
+#   # [Task 23/24]  Current/Best:   23.23/  58.92 GFLOPS | Progress: (1000/1000) | 999.34 s Done.
+#   # [Task 24/24]  Current/Best:   17.27/  55.25 GFLOPS | Progress: (1000/1000) | 1428.74 s Done.
+#
+# Tuning sessions can take a long time, so ``tvmc tune`` offers many options to customize your tuning
+# process, in terms of number of repetitions (``--repeat`` and ``--number``, for example), the tuning
+# algorithm to be used, and so on. Check ``tvmc tune --help`` for more information.
+#
+
+################################################################################
+# Compiling an Optimized Model with Tuning Data
+# ----------------------------------------------
+#
+# As an output of the tuning process above, we obtained the tuning records
+# stored in ``resnet50-v2-7-autotuner_records.json``. This file can be used in
+# two ways:
+#
+# - As input to further tuning (via ``tvmc tune --tuning-records``).
+# - As input to the compiler
+#
+# The compiler will use the results to generate high performance code for the
+# model on your specified target. To do that we can use ``tvmc compile
+# --tuning-records``. Check ``tvmc compile --help`` for more information.
+#
+# Now that tuning data for the model has been collected, we can re-compile the
+# model using optimized operators to speed up our computations.
+#
+# .. code-block:: bash
+#
+#   tvmc compile \
+#   --target "llvm" \
+#   --tuning-records resnet50-v2-7-autotuner_records.json  \
+#   --output resnet50-v2-7-tvm_autotuned.tar \
+#   resnet50-v2-7.onnx
+#
+# Verify that the optimized model runs and produces the same results:
+#
+# .. code-block:: bash
+#
+#   tvmc run \
+#   --inputs imagenet_cat.npz \
+#   --output predictions.npz \
+#   resnet50-v2-7-tvm_autotuned.tar
+#
+#   python postproccess.py
+#
+# Verifying that the predictions are the same:
+#
+# .. code-block:: bash
+#
+#   # class='n02123045 tabby, tabby cat' with probability=0.610550
+#   # class='n02123159 tiger cat' with probability=0.367181
+#   # class='n02124075 Egyptian cat' with probability=0.019365
+#   # class='n02129604 tiger, Panthera tigris' with probability=0.001273
+#   # class='n04040759 radiator' with probability=0.000261
+
+################################################################################
+# Comparing the Tuned and Untuned Models
+# --------------------------------------
+#
+# TVMC gives you tools for basic performance benchmarking between the models.
+# You can specify a number of repetitions and that TVMC report on the model run
+# time (independent of runtime startup). We can get a rough idea of how much
+# tuning has improved the model performance. For example, on a test Intel i7
+# system, we see that the tuned model runs 47% faster than the untuned model:
+#
+# .. code-block:: bash
 #
-# Tuning sessions can take a long time, so ``tvmc tune`` offers many options to
-# customize your tuning process, in terms of number of repetitions (``--repeat`` and
-# ``--number``, for example), the tuning algorithm to be use, and so on.
-# Check ``tvmc tune --help`` for more information.
+#   tvmc run \
+#   --inputs imagenet_cat.npz \
+#   --output predictions.npz  \
+#   --print-time \
+#   --repeat 100 \
+#   resnet50-v2-7-tvm_autotuned.tar
 #
-# As an output of the tuning process above, we obtained the tuning records stored
-# in ``autotuner_records.json``. This file can be used in two ways:
+#   # Execution time summary:
+#   # mean (s)   max (s)    min (s)    std (s)
+#   # 0.09219    0.11573    0.08985    0.00315
 #
-# - as an input to further tuning (via ``tvmc tune --tuning-records``), or
-# - as an input to the compiler
+#   tvmc run \
+#   --inputs imagenet_cat.npz \
+#   --output predictions.npz  \
+#   --print-time \
+#   --repeat 100 \
+#   resnet50-v2-7-tvm.tar
 #
-# The compiler will use the results to generate high performance code for the model
-# on your specified target. To do that we can use ``tvmc compile --tuning-records``.
-# Check ``tvmc compile --help`` for more information.
+#   # Execution time summary:
+#   # mean (s)   max (s)    min (s)    std (s)
+#   # 0.19332    0.21997    0.18504    0.00711
 #
 
 
-######################################################################
+################################################################################
 # Final Remarks
 # -------------
 #
-# In this tutorial, we presented TVMC, a command line driver for TVM.
-# We demonstrated how to compile, run and tune a model, as well
-# as discussed the need for pre and post processing of inputs and outputs.
+# In this tutorial, we presented TVMC, a command line driver for TVM. We
+# demonstrated how to compile, run, and tune a model. We also discussed the
+# need for pre and post-processing of inputs and outputs. After the tuning
+# process, we demonstrated how to compare the performance of the unoptimized
+# and optimize models.
 #
 # Here we presented a simple example using ResNet 50 V2 locally. However, TVMC
 # supports many more features including cross-compilation, remote execution and
 # profiling/benchmarking.
 #
-# To see what other options are available, please have a look at ``tvmc --help``.
+# To see what other options are available, please have a look at ``tvmc
+# --help``.
 #
+# In the next tutorial, `Compiling and Optimizing a Model with the Python
+# AutoScheduler <auto_tuning_with_pyton>`_, we will cover the same compilation
+# and optimization steps using the Python interface.

From b8b63cd1a40250cdb12c14050b3b1f545eeafa70 Mon Sep 17 00:00:00 2001
From: Leyuan Wang <laurawly@gmail.com>
Date: Tue, 16 Mar 2021 15:29:59 -0700
Subject: [PATCH 345/357] add nvcc support (#7668)

---
 python/tvm/contrib/cc.py                        | 16 ++++++++++------
 python/tvm/runtime/module.py                    |  3 +++
 src/relay/backend/contrib/codegen_c/codegen_c.h |  2 ++
 src/target/source/source_module.cc              |  2 +-
 4 files changed, 16 insertions(+), 7 deletions(-)

diff --git a/python/tvm/contrib/cc.py b/python/tvm/contrib/cc.py
index 59a1d11216ee..f48ae395fbcd 100644
--- a/python/tvm/contrib/cc.py
+++ b/python/tvm/contrib/cc.py
@@ -192,12 +192,16 @@ def _fcompile(outputs, objects, options=None):
 
 def _linux_compile(output, objects, options, compile_cmd="g++", compile_shared=False):
     cmd = [compile_cmd]
-    if compile_shared or output.endswith(".so") or output.endswith(".dylib"):
-        cmd += ["-shared", "-fPIC"]
-        if sys.platform == "darwin":
-            cmd += ["-undefined", "dynamic_lookup"]
-    elif output.endswith(".obj"):
-        cmd += ["-c"]
+    if compile_cmd != "nvcc":
+        if compile_shared or output.endswith(".so") or output.endswith(".dylib"):
+            cmd += ["-shared", "-fPIC"]
+            if sys.platform == "darwin":
+                cmd += ["-undefined", "dynamic_lookup"]
+        elif output.endswith(".obj"):
+            cmd += ["-c"]
+    else:
+        if compile_shared or output.endswith(".so") or output.endswith(".dylib"):
+            cmd += ["--shared"]
     cmd += ["-o", output]
     if isinstance(objects, str):
         cmd += [objects]
diff --git a/python/tvm/runtime/module.py b/python/tvm/runtime/module.py
index 53576a60f32f..09bef9ecbd6a 100644
--- a/python/tvm/runtime/module.py
+++ b/python/tvm/runtime/module.py
@@ -339,6 +339,9 @@ def export_library(self, file_name, fcompile=None, addons=None, workspace_dir=No
                 else:
                     assert module.type_key == "c"
                     object_format = "c"
+                    if "cc" in kwargs:
+                        if kwargs["cc"] == "nvcc":
+                            object_format = "cu"
                     has_c_module = True
             path_obj = os.path.join(workspace_dir, f"lib{index}.{object_format}")
             module.save(path_obj)
diff --git a/src/relay/backend/contrib/codegen_c/codegen_c.h b/src/relay/backend/contrib/codegen_c/codegen_c.h
index af835cfca02e..b81fd14b99c2 100644
--- a/src/relay/backend/contrib/codegen_c/codegen_c.h
+++ b/src/relay/backend/contrib/codegen_c/codegen_c.h
@@ -343,6 +343,8 @@ class CodegenCBase {
     std::string dtype;
     if (runtime::TypeMatch(ttype->dtype, kDLFloat, 32)) {
       dtype = "float";
+    } else if (runtime::TypeMatch(ttype->dtype, kDLFloat, 16)) {
+      dtype = "half";
     } else if (runtime::TypeMatch(ttype->dtype, kDLInt, 32)) {
       dtype = "int";
     } else if (runtime::TypeMatch(ttype->dtype, kDLInt, 64)) {
diff --git a/src/target/source/source_module.cc b/src/target/source/source_module.cc
index a7732719a699..26f1850c0e47 100644
--- a/src/target/source/source_module.cc
+++ b/src/target/source/source_module.cc
@@ -104,7 +104,7 @@ class CSourceModuleNode : public runtime::ModuleNode {
   void SaveToFile(const std::string& file_name, const std::string& format) final {
     std::string fmt = GetFileFormat(file_name, format);
     std::string meta_file = GetMetaFilePath(file_name);
-    if (fmt == "c") {
+    if (fmt == "c" || fmt == "cu") {
       ICHECK_NE(code_.length(), 0);
       SaveBinaryToFile(file_name, code_);
     } else {

From e697f03539acce7a6ed78eb7757c207398dd6b72 Mon Sep 17 00:00:00 2001
From: Alexander Pivovarov <pivovaa@amazon.com>
Date: Tue, 16 Mar 2021 15:30:32 -0700
Subject: [PATCH 346/357] Fix relay.testing.darknet convert_image (#7667)

---
 python/tvm/relay/testing/darknet.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/tvm/relay/testing/darknet.py b/python/tvm/relay/testing/darknet.py
index c0468b7ef692..e1345043c6bb 100644
--- a/python/tvm/relay/testing/darknet.py
+++ b/python/tvm/relay/testing/darknet.py
@@ -31,7 +31,7 @@
 def convert_image(image):
     """Convert the image with numpy."""
     imagex = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
-    imagex = np.array(image)
+    imagex = np.array(imagex)
     imagex = imagex.transpose((2, 0, 1))
     imagex = np.divide(imagex, 255.0)
     imagex = np.flip(imagex, 0)

From 4abbe4902e451cc5a963b8b60a70e548d48ace62 Mon Sep 17 00:00:00 2001
From: Cody Yu <comaniac0422@gmail.com>
Date: Tue, 16 Mar 2021 21:21:21 -0700
Subject: [PATCH 347/357] [Torch] Remove unnecessary reshapes for batch_matmul
 (#7675)

* [Torch] Remove unnecessary reshapes for batch_matmul

* lint

* fix

* reorder

* lint
---
 python/tvm/relay/frontend/pytorch.py | 29 +++++++++++++++++++++-------
 1 file changed, 22 insertions(+), 7 deletions(-)

diff --git a/python/tvm/relay/frontend/pytorch.py b/python/tvm/relay/frontend/pytorch.py
index c709e2b4e7bd..fd0a07e35c15 100644
--- a/python/tvm/relay/frontend/pytorch.py
+++ b/python/tvm/relay/frontend/pytorch.py
@@ -1094,8 +1094,7 @@ def instance_norm(self, inputs, input_types):
             data, gamma, beta, axis=1, epsilon=epsilon, center=center, scale=scale
         )
 
-    @staticmethod
-    def get_dims(data):
+    def get_dims(self, data):
         import torch
 
         if isinstance(data, _expr.Expr):
@@ -1575,15 +1574,31 @@ def matmul(self, inputs, input_types):
 
         # When performing a batch matmul, we need to properly handle N-dim shapes.
         if len(a_shape) > 2 or len(b_shape) > 2:
-            # Convert a and b into 3 dimensional tensors.
-            a = _op.reshape(inputs_0, [-1, a_shape[-2], a_shape[-1]])
-            b = _op.reshape(inputs_1, [-1, b_shape[-2], b_shape[-1]])
+            # Convert a into a 3 dimensional tensors.
+            need_reshape_output = False
+            if len(a_shape) != 3:
+                a = _op.reshape(inputs_0, [-1, a_shape[-2], a_shape[-1]])
+                need_reshape_output = True
+            else:
+                a = inputs_0
+
             # Transpose matrix dimensions of b.
-            b = _op.transpose(b, [0, 2, 1])
+            trans_axes = list(range(len(b_shape)))
+            trans_axes[-2], trans_axes[-1] = trans_axes[-1], trans_axes[-2]
+            b = _op.transpose(inputs_1, trans_axes)
+
+            # Convert b into a 3 dimensional tensor. Note that the last two dimensions
+            # are transposed.
+            if len(b_shape) != 3:
+                b = _op.reshape(b, [-1, b_shape[-1], b_shape[-2]])
+
             # Perform a batch matmul.
             output = _op.nn.batch_matmul(a, b)
+
             # Reshape output to original dimensions.
-            return _op.reshape(output, [*a_shape[:-2], a_shape[-2], b_shape[-1]])
+            if need_reshape_output:
+                return _op.reshape(output, [*a_shape[:-2], a_shape[-2], b_shape[-1]])
+            return output
 
         # Otherwise a simple dense op will get the job done.
         if len(b_shape) == 1:

From c55608f2a123541f8f38bf71267a20831f6f7678 Mon Sep 17 00:00:00 2001
From: masahi <masahi129@gmail.com>
Date: Wed, 17 Mar 2021 22:29:01 +0900
Subject: [PATCH 348/357] [SPIRV] Declare int64 capability by default (#7681)

---
 src/target/spirv/ir_builder.cc | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/target/spirv/ir_builder.cc b/src/target/spirv/ir_builder.cc
index 3a9de4e077dc..5a1457387ae5 100644
--- a/src/target/spirv/ir_builder.cc
+++ b/src/target/spirv/ir_builder.cc
@@ -48,6 +48,8 @@ void IRBuilder::InitHeader() {
   header_.push_back(0U);
   // shader
   ib_.Begin(spv::OpCapability).Add(spv::CapabilityShader).Commit(&header_);
+  // Declare int64 capability by default
+  ib_.Begin(spv::OpCapability).Add(spv::CapabilityInt64).Commit(&header_);
   // memory model
   ib_.Begin(spv::OpMemoryModel)
       .AddSeq(spv::AddressingModelLogical, spv::MemoryModelGLSL450)

From 60ff0c79fa90230d8e1fd4d5497229c5f0688e42 Mon Sep 17 00:00:00 2001
From: zhuochen <zhuochen@outlook.com>
Date: Thu, 18 Mar 2021 00:39:05 +0800
Subject: [PATCH 349/357] [Runtime] Extend Graph Runtime To Support Cuda Graph
 Launch (#7616)

* add graph runtime cuGraph poc

* lint format

* add unittest

* fix review comments

* Update CMakeLists.txt

Co-authored-by: Cody Yu <comaniac0422@gmail.com>

* build cuda graph runtime in gpu test

* Revert "build cuda graph runtime in gpu test"

This reverts commit f286711e4126c696860be3ec3d82400ca8542bd5.

* rename cuGraph to CUDA Graph

* rename cuda_graph

* rename cuda_graph

* lint format

* Update src/runtime/graph/graph_runtime_factory.cc

Co-authored-by: Cody Yu <comaniac0422@gmail.com>

* Update python/tvm/testing.py

Co-authored-by: Cody Yu <comaniac0422@gmail.com>

* fix lint error

* remove unnecessary warn

* add test, fix lint

* fix lint W0223

Co-authored-by: Cody Yu <comaniac0422@gmail.com>
---
 CMakeLists.txt                                |   1 +
 cmake/config.cmake                            |   3 +
 cmake/modules/CUDA.cmake                      |  11 ++
 python/tvm/contrib/cuda_graph/__init__.py     |  16 +++
 .../contrib/cuda_graph/cuda_graph_runtime.py  | 134 +++++++++++++++++
 python/tvm/contrib/nvcc.py                    |  12 ++
 python/tvm/testing.py                         |  19 +++
 .../cuda_graph/graph_runtime_cuda_graph.cc    | 135 ++++++++++++++++++
 src/runtime/graph/graph_runtime_factory.cc    |  33 +++++
 src/runtime/graph/graph_runtime_factory.h     |   8 ++
 .../unittest/test_runtime_graph_cuda_graph.py | 100 +++++++++++++
 .../test_runtime_module_based_interface.py    |  30 ++++
 12 files changed, 502 insertions(+)
 create mode 100644 python/tvm/contrib/cuda_graph/__init__.py
 create mode 100644 python/tvm/contrib/cuda_graph/cuda_graph_runtime.py
 create mode 100644 src/runtime/graph/cuda_graph/graph_runtime_cuda_graph.cc
 create mode 100644 tests/python/unittest/test_runtime_graph_cuda_graph.py

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 451b6a7ee2c2..16968ce41f70 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -35,6 +35,7 @@ tvm_option(USE_THREADS "Build with thread support" ON)
 tvm_option(USE_LLVM "Build with LLVM, can be set to specific llvm-config path" OFF)
 tvm_option(USE_STACKVM_RUNTIME "Include stackvm into the runtime" OFF)
 tvm_option(USE_GRAPH_RUNTIME "Build with tiny graph runtime" ON)
+tvm_option(USE_GRAPH_RUNTIME_CUDA_GRAPH "Build with tiny graph runtime with CUDA Graph for GPUs" OFF)
 tvm_option(USE_PROFILER "Build profiler for the VM and graph runtime" ON)
 tvm_option(USE_OPENMP "Build with OpenMP thread pool implementation" OFF)
 tvm_option(USE_RELAY_DEBUG "Building Relay in debug mode..." OFF)
diff --git a/cmake/config.cmake b/cmake/config.cmake
index 65859566a664..60c718c97bc1 100644
--- a/cmake/config.cmake
+++ b/cmake/config.cmake
@@ -99,6 +99,9 @@ set(USE_STACKVM_RUNTIME OFF)
 # Whether enable tiny embedded graph runtime.
 set(USE_GRAPH_RUNTIME ON)
 
+# Whether enable tiny graph runtime with CUDA Graph
+set(USE_GRAPH_RUNTIME_CUDA_GRAPH OFF)
+
 # Whether to enable the profiler for the graph runtime and vm
 set(USE_PROFILER ON)
 
diff --git a/cmake/modules/CUDA.cmake b/cmake/modules/CUDA.cmake
index 0ec2f1466bd1..262a4e6e7123 100644
--- a/cmake/modules/CUDA.cmake
+++ b/cmake/modules/CUDA.cmake
@@ -65,6 +65,17 @@ if(USE_CUDA)
     list(APPEND RUNTIME_SRCS ${CONTRIB_THRUST_SRC})
   endif(USE_THRUST)
 
+  if(USE_GRAPH_RUNTIME_CUDA_GRAPH)
+    if(NOT USE_GRAPH_RUNTIME)
+      message(FATAL_ERROR "CUDA Graph is only supported by graph runtime, please set USE_GRAPH_RUNTIME=ON")
+    endif()
+    if(CUDAToolkit_VERSION_MAJOR LESS "10")
+      message(FATAL_ERROR "CUDA Graph requires CUDA 10 or above, got=" ${CUDAToolkit_VERSION})
+    endif()
+    message(STATUS "Build with Graph runtime with CUDA Graph support...")
+    file(GLOB RUNTIME_CUDA_GRAPH_SRCS src/runtime/graph/cuda_graph/*.cc)
+    list(APPEND RUNTIME_SRCS ${RUNTIME_CUDA_GRAPH_SRCS})
+  endif()
 else(USE_CUDA)
   list(APPEND COMPILER_SRCS src/target/opt/build_cuda_off.cc)
 endif(USE_CUDA)
diff --git a/python/tvm/contrib/cuda_graph/__init__.py b/python/tvm/contrib/cuda_graph/__init__.py
new file mode 100644
index 000000000000..13a83393a912
--- /dev/null
+++ b/python/tvm/contrib/cuda_graph/__init__.py
@@ -0,0 +1,16 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
diff --git a/python/tvm/contrib/cuda_graph/cuda_graph_runtime.py b/python/tvm/contrib/cuda_graph/cuda_graph_runtime.py
new file mode 100644
index 000000000000..45ec89d37b3d
--- /dev/null
+++ b/python/tvm/contrib/cuda_graph/cuda_graph_runtime.py
@@ -0,0 +1,134 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Graph runtime with CUDA Graph"""
+import tvm._ffi
+
+from tvm._ffi.base import string_types
+from tvm.contrib import graph_runtime
+
+
+def create(graph_json_str, libmod, ctx):
+    """Create a runtime executor module given a graph and module.
+
+    Parameters
+    ----------
+    graph_json_str : str
+        The graph to be deployed in json format output by json graph.
+        The graph can contain operator(tvm_op) that points to the name
+        of PackedFunc in the libmod.
+
+    libmod : tvm.runtime.Module
+        The module of the corresponding function
+
+    ctx : TVMContext
+        The context to deploy the module, only supports CUDA GPU
+
+    Returns
+    -------
+    graph_module : GraphModuleCudaGraph
+        CUDA graph runtime module that can be used to execute the graph.
+
+    Note
+    ----
+    See also :py:class:`tvm.contrib.cuda_graph.cuda_graph_runtime.GraphModuleCudaGraph`
+    for examples to directly construct a GraphModuleCudaGraph from an exported
+    relay compiled library.
+    """
+    assert isinstance(graph_json_str, string_types)
+    try:
+        ctx, num_rpc_ctx, device_type_id = graph_runtime.get_device_ctx(libmod, ctx)
+        if num_rpc_ctx == len(ctx):
+            fcreate = ctx[0]._rpc_sess.get_function("tvm.graph_runtime_cuda_graph.create")
+        else:
+            fcreate = tvm._ffi.get_global_func("tvm.graph_runtime_cuda_graph.create")
+    except ValueError:
+        raise ValueError(
+            "To enable CUDA graph support (experimental), please set "
+            "'(USE_GRAPH_RUNTIME_CUGRAPH ON)' in config.cmake and rebuild TVM"
+        )
+
+    return GraphModuleCudaGraph(fcreate(graph_json_str, libmod, *device_type_id))
+
+
+class GraphModuleCudaGraph(graph_runtime.GraphModule):
+    """CUDA graph runtime module.
+
+    This is a CUDA graph runtime wrapper over the TVM runtime.
+    Runtime interfaces are wrapped with CUDA graph functionalities.
+
+    Parameters
+    ----------
+    module : Module
+        The internal tvm module that holds the actual graph functions.
+    """
+
+    def __init__(self, module):
+        self._start_capture = module["start_capture"]
+        self._end_capture = module["end_capture"]
+        self._run_cuda_graph = module["run_cuda_graph"]
+        self._cuda_graph_captured = False
+        graph_runtime.GraphModule.__init__(self, module)
+
+    def capture_cuda_graph(self):
+        """Capture a CUDA graph for tvm_op graph
+
+        This should be called before run_cuda_graph() to capture and
+        instantiate a CUDA graph instance.
+        """
+        self._run()  # call cuModuleLoadData before cudaStream API
+        self._start_capture()
+        self._run()
+        self._end_capture()
+        self._cuda_graph_captured = True
+
+    def run_cuda_graph(self):
+        """Run the CUDA graph for tvm_op graph
+
+        Run the captured CUDA graph instance instead of the
+        for-loop kernel launch of default graph runtime
+        """
+        self._run_cuda_graph()
+
+    def run(self, **input_dict):
+        """A run wrapper for graph capture / launch, user can just
+        change default graph runtime to cuda graph runtime, and
+        the first call will capture a cuda graph for future launch
+
+        Parameters
+        ----------
+        input_dict: dict of str to NDArray
+            List of input values to be feed to
+        """
+        if input_dict:
+            self.set_input(**input_dict)
+        if not self._cuda_graph_captured:
+            self.capture_cuda_graph()
+        else:
+            self._run_cuda_graph()
+
+    def debug_get_output(self, node, out):
+        """Run graph up to node and get the output to out
+
+        Parameters
+        ----------
+        node : int / str
+            The node index or name
+
+        out : NDArray
+            The output array container
+        """
+        raise NotImplementedError("Please use debugger.debug_runtime as graph_runtime instead.")
diff --git a/python/tvm/contrib/nvcc.py b/python/tvm/contrib/nvcc.py
index 7e49f55e8d32..99844f799d7a 100644
--- a/python/tvm/contrib/nvcc.py
+++ b/python/tvm/contrib/nvcc.py
@@ -349,6 +349,18 @@ def have_tensorcore(compute_version=None, target=None):
     return False
 
 
+def have_cudagraph():
+    """Either CUDA Graph support is provided"""
+    try:
+        cuda_path = find_cuda_path()
+        cuda_ver = get_cuda_version(cuda_path)
+        if cuda_ver < 10.0:
+            return False
+        return True
+    except RuntimeError:
+        return False
+
+
 def have_bf16(compute_version):
     """Either bf16 support is provided in the compute capability or not
 
diff --git a/python/tvm/testing.py b/python/tvm/testing.py
index d65ab23677b5..1cb43b29c521 100644
--- a/python/tvm/testing.py
+++ b/python/tvm/testing.py
@@ -514,6 +514,25 @@ def requires_cuda(*args):
     return _compose(args, _requires_cuda)
 
 
+def requires_cudagraph(*args):
+    """Mark a test as requiring the CUDA Graph Feature
+
+    This also marks the test as requiring cuda
+
+    Parameters
+    ----------
+    f : function
+        Function to mark
+    """
+    _requires_cudagraph = [
+        pytest.mark.skipif(
+            not nvcc.have_cudagraph(), reason="CUDA Graph is not supported in this environment"
+        ),
+        *requires_cuda(),
+    ]
+    return _compose(args, _requires_cudagraph)
+
+
 def requires_opencl(*args):
     """Mark a test as requiring the OpenCL runtime.
 
diff --git a/src/runtime/graph/cuda_graph/graph_runtime_cuda_graph.cc b/src/runtime/graph/cuda_graph/graph_runtime_cuda_graph.cc
new file mode 100644
index 000000000000..ee5e50a3b9d4
--- /dev/null
+++ b/src/runtime/graph/cuda_graph/graph_runtime_cuda_graph.cc
@@ -0,0 +1,135 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file graph_runtime_cuda_graph.cc
+ */
+
+#include <tvm/runtime/registry.h>
+
+#include "../../cuda/cuda_common.h"
+#include "../graph_runtime.h"
+
+namespace tvm {
+namespace runtime {
+
+/*!
+ * \brief Graph runtime with CUDA Graph Support.
+ *
+ *  This is the extension of GraphRuntime class used for CUDA graph launch
+ *  instead of CUDA kernel launch. CUDA graph launch requires CUDA 10.0 or
+ *  above, currently there are two ways of constructing CUDA graphs:
+ *  (1) Using CUDA stream capture API to capture a series of operations on
+ *  CUDA stream, and automatically generates a graph (2) Building a graph
+ *  using CUDA graph API manually. This implementation uses stream capture.
+ */
+class GraphRuntimeCudaGraph : public GraphRuntime {
+ public:
+  /*!
+   * \brief Begin CUDA graph capture on stream, the stream enters capture mode.
+   */
+  void StartCapture() {
+    const TVMContext& ctx = data_entry_[entry_id(0, 0)]->ctx;
+
+    TVMStreamCreate(ctx.device_type, ctx.device_id, &capture_stream_);
+    TVMSetStream(ctx.device_type, ctx.device_id, capture_stream_);
+
+    CUDA_CALL(cudaStreamBeginCapture(static_cast<cudaStream_t>(capture_stream_),
+                                     cudaStreamCaptureModeGlobal));
+  }
+
+  /*!
+   * \brief Launch the instantiated graph on stream
+   */
+  void RunCudaGraph() {
+    cudaStream_t cuStream = static_cast<cudaStream_t>(capture_stream_);
+    CUDA_CALL(cudaGraphLaunch(cuda_graph_exec_, cuStream));
+    CUDA_CALL(cudaStreamSynchronize(cuStream));
+  }
+
+  /*!
+   * \brief End CUDA graph capture on stream, a graph will be created and
+   * instantiated.
+   */
+  void EndCapture() {
+    cudaGraph_t graph;
+    CUDA_CALL(cudaStreamEndCapture(static_cast<cudaStream_t>(capture_stream_), &graph));
+
+    cudaGraphNode_t* nodes = NULL;
+    size_t numNodes = 0;
+    CUDA_CALL(cudaGraphGetNodes(graph, nodes, &numNodes));
+    LOG(INFO) << "Num of nodes in the cuda graph created using stream capture API = " << numNodes;
+
+    CUDA_CALL(cudaGraphInstantiate(&cuda_graph_exec_, graph, NULL, NULL, 0));
+  }
+
+  /*!
+   * \brief GetFunction Get the function based on input.
+   * \param name The function which needs to be invoked.
+   * \param sptr_to_self Packed function pointer.
+   */
+  PackedFunc GetFunction(const std::string& name, const ObjectPtr<Object>& sptr_to_self);
+
+ private:
+  /*! \brief The Cuda stream on which to capture a CUDA graph. */
+  TVMStreamHandle capture_stream_;
+  /*! \brief The captured CUDA graph will be instantiated to this. */
+  cudaGraphExec_t cuda_graph_exec_;
+};
+
+PackedFunc GraphRuntimeCudaGraph::GetFunction(const std::string& name,
+                                              const ObjectPtr<Object>& sptr_to_self) {
+  if (name == "run_cuda_graph") {
+    return PackedFunc(
+        [sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { this->RunCudaGraph(); });
+  } else if (name == "start_capture") {
+    return PackedFunc(
+        [sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { this->StartCapture(); });
+  } else if (name == "end_capture") {
+    return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { this->EndCapture(); });
+  } else {
+    return GraphRuntime::GetFunction(name, sptr_to_self);
+  }
+}
+
+Module GraphRuntimeCudaGraphCreate(const std::string& sym_json, const tvm::runtime::Module& m,
+                                   const std::vector<TVMContext>& ctxs,
+                                   PackedFunc lookup_linked_param_func) {
+  auto exec = make_object<GraphRuntimeCudaGraph>();
+  exec->Init(sym_json, m, ctxs, lookup_linked_param_func);
+  return Module(exec);
+}
+
+TVM_REGISTER_GLOBAL("tvm.graph_runtime_cuda_graph.create")
+    .set_body([](TVMArgs args, TVMRetValue* rv) {
+      ICHECK_GE(args.num_args, 4) << "The expected number of arguments for graph_runtime.create is "
+                                     "at least 4, but it has "
+                                  << args.num_args;
+      PackedFunc lookup_linked_param_func;
+      int ctx_start_arg = 2;
+      if (args[2].type_code() == kTVMPackedFuncHandle) {
+        lookup_linked_param_func = args[2];
+        ctx_start_arg++;
+      }
+
+      *rv = GraphRuntimeCudaGraphCreate(args[0], args[1], GetAllContext(args, ctx_start_arg),
+                                        lookup_linked_param_func);
+    });
+}  // namespace runtime
+}  // namespace tvm
diff --git a/src/runtime/graph/graph_runtime_factory.cc b/src/runtime/graph/graph_runtime_factory.cc
index 605d6b0ce892..1682afa8464a 100644
--- a/src/runtime/graph/graph_runtime_factory.cc
+++ b/src/runtime/graph/graph_runtime_factory.cc
@@ -72,6 +72,14 @@ PackedFunc GraphRuntimeFactory::GetFunction(
       exec->Import(this->imports_[0]);
       *rv = Module(exec);
     });
+  } else if (name == "cuda_graph_create") {
+    return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
+      std::vector<TVMContext> contexts;
+      for (int i = 0; i < args.num_args; ++i) {
+        contexts.emplace_back(args[i].operator TVMContext());
+      }
+      *rv = this->CudaGraphRuntimeCreate(contexts);
+    });
   } else {
     return PackedFunc();
   }
@@ -130,6 +138,31 @@ Module GraphRuntimeFactory::DebugRuntimeCreate(const std::vector<TVMContext>& ct
   return mod;
 }
 
+Module GraphRuntimeFactory::CudaGraphRuntimeCreate(const std::vector<TVMContext>& ctxs) {
+  const PackedFunc* pf = tvm::runtime::Registry::Get("tvm.graph_runtime_cuda_graph.create");
+  ICHECK(pf != nullptr) << "Cannot find function tvm.graph_runtime_cuda_graph.create in registry. "
+                           "Did you set(USE_GRAPH_RUNTIME_CUGRAPH=ON)?";
+  std::vector<int> unpacked_ctxs;
+  for (const auto& ctx : ctxs) {
+    unpacked_ctxs.emplace_back(ctx.device_type);
+    unpacked_ctxs.emplace_back(ctx.device_id);
+  }
+  size_t args_size = unpacked_ctxs.size() + 2;
+  std::vector<TVMValue> values(args_size);
+  std::vector<int> codes(args_size);
+  runtime::TVMArgsSetter setter(values.data(), codes.data());
+  setter(0, this->graph_json_);
+  setter(1, this->imports_[0]);
+  for (size_t i = 0; i < unpacked_ctxs.size(); ++i) {
+    setter(i + 2, unpacked_ctxs[i]);
+  }
+  TVMRetValue rv;
+  pf->CallPacked(TVMArgs(values.data(), codes.data(), args_size), &rv);
+  Module mod = rv.operator Module();
+  SetParams(const_cast<GraphRuntime*>(mod.as<GraphRuntime>()), this->params_);
+  return mod;
+}
+
 Module GraphRuntimeFactoryModuleLoadBinary(void* strm) {
   dmlc::Stream* stream = static_cast<dmlc::Stream*>(strm);
   std::string graph_json;
diff --git a/src/runtime/graph/graph_runtime_factory.h b/src/runtime/graph/graph_runtime_factory.h
index 98fb27c43ea2..f2f11ee66802 100644
--- a/src/runtime/graph/graph_runtime_factory.h
+++ b/src/runtime/graph/graph_runtime_factory.h
@@ -89,6 +89,14 @@ class TVM_DLL GraphRuntimeFactory : public runtime::ModuleNode {
    */
   Module DebugRuntimeCreate(const std::vector<TVMContext>& ctxs);
 
+  /*!
+   * \brief Create a specific cuda graph runtime module
+   * \param ctxs The context of the host and devices where graph nodes will be
+   *  executed on.
+   * \return created cuda graph runtime module
+   */
+  Module CudaGraphRuntimeCreate(const std::vector<TVMContext>& ctx);
+
   /*!
    * \brief Set params.
    * \param graph_runtime The graph runtime we want to set the params into.
diff --git a/tests/python/unittest/test_runtime_graph_cuda_graph.py b/tests/python/unittest/test_runtime_graph_cuda_graph.py
new file mode 100644
index 000000000000..4a31873cb93c
--- /dev/null
+++ b/tests/python/unittest/test_runtime_graph_cuda_graph.py
@@ -0,0 +1,100 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+import json
+import os
+import re
+import sys
+import time
+
+import pytest
+
+import tvm
+import tvm.testing
+from tvm import te
+import numpy as np
+
+from tvm.contrib import utils, graph_runtime
+from tvm.contrib.cuda_graph import cuda_graph_runtime
+
+
+bx = te.thread_axis("blockIdx.x")
+tx = te.thread_axis("threadIdx.x")
+
+
+@tvm.testing.requires_cudagraph
+def test_graph_simple():
+    n = 32
+    A = te.placeholder((n,), name="A")
+    B = te.compute(A.shape, lambda *i: A(*i) + 1.0, name="B")
+    s = te.create_schedule(B.op)
+    xo, xi = s[B].split(B.op.axis[0], factor=8)
+    s[B].bind(xo, bx)
+    s[B].bind(xi, tx)
+
+    node0 = {"op": "null", "name": "x", "inputs": []}
+    node1 = {
+        "op": "tvm_op",
+        "name": "add",
+        "inputs": [[0, 0, 0]],
+        "attrs": {"func_name": "myadd", "flatten_data": "1", "num_inputs": "1", "num_outputs": "1"},
+    }
+    nodes = [node0, node1]
+    arg_nodes = [0]
+    node_row_ptr = [0, 1, 2]
+    outputs = [[1, 0, 0]]
+    shape = (n,)
+    attrs = {
+        "shape": ["list_shape", [shape, shape]],
+        "dltype": ["list_str", ["float32", "float32"]],
+        "storage_id": ["list_int", [0, 1]],
+    }
+    graph = {
+        "nodes": nodes,
+        "arg_nodes": arg_nodes,
+        "node_row_ptr": node_row_ptr,
+        "heads": outputs,
+        "attrs": attrs,
+    }
+    graph = json.dumps(graph)
+
+    def check_verify():
+        mlib = tvm.build(s, [A, B], "cuda", name="myadd")
+        ctx = tvm.gpu(0)
+        try:
+            mod = cuda_graph_runtime.create(graph, mlib, ctx)
+        except ValueError:
+            return
+
+        for i in range(3):
+            a = np.random.uniform(size=(n,)).astype(A.dtype)
+            mod.run(x=a)  # The first run captured a CUDA graph
+            out = mod.get_output(0, tvm.nd.empty((n,)))
+            np.testing.assert_equal(out.asnumpy(), a + 1)
+
+        # capture / run CUDA graph manually
+        mod.capture_cuda_graph()
+        a = np.random.uniform(size=(n,)).astype(A.dtype)
+        mod.set_input(x=a)
+        mod.run_cuda_graph()
+        out = mod.get_output(0, tvm.nd.empty((n,)))
+        np.testing.assert_equal(out.asnumpy(), a + 1)
+
+    check_verify()
+
+
+if __name__ == "__main__":
+    test_graph_simple()
diff --git a/tests/python/unittest/test_runtime_module_based_interface.py b/tests/python/unittest/test_runtime_module_based_interface.py
index a34fe4a062cb..930011d4fd33 100644
--- a/tests/python/unittest/test_runtime_module_based_interface.py
+++ b/tests/python/unittest/test_runtime_module_based_interface.py
@@ -20,6 +20,7 @@
 import tvm
 from tvm.contrib import graph_runtime
 from tvm.contrib.debugger import debug_runtime
+from tvm.contrib.cuda_graph import cuda_graph_runtime
 import tvm.testing
 
 
@@ -538,6 +539,35 @@ def test_debug_graph_runtime():
     tvm.testing.assert_allclose(out, verify(data), atol=1e-5)
 
 
+@tvm.testing.requires_cudagraph
+def test_cuda_graph_runtime():
+    mod, params = relay.testing.synthetic.get_workload()
+    with tvm.transform.PassContext(opt_level=3):
+        complied_graph_lib = relay.build_module.build(mod, "cuda", params=params)
+    data = np.random.uniform(-1, 1, size=input_shape(mod)).astype("float32")
+
+    ctx = tvm.gpu()
+    try:
+        gmod = complied_graph_lib["cuda_graph_create"](ctx)
+    except:
+        print("Skip because cuda_graph not enabled")
+        return
+    set_input = gmod["set_input"]
+    run = gmod["run"]
+    get_output = gmod["get_output"]
+    set_input("data", tvm.nd.array(data))
+    run()
+    out = get_output(0).asnumpy()
+    tvm.testing.assert_allclose(out, verify(data), atol=1e-5)
+
+    # cuda graph runtime wrapper
+    cu_gmod = cuda_graph_runtime.GraphModuleCudaGraph(gmod)
+    cu_gmod.set_input("data", data)
+    cu_gmod.run()
+    out = cu_gmod.get_output(0).asnumpy()
+    tvm.testing.assert_allclose(out, verify(data), atol=1e-5)
+
+
 def test_multiple_imported_modules():
     def make_func(symbol):
         n = tvm.te.size_var("n")

From 5c460ff45a31939e8e9c7e83cf00184053809788 Mon Sep 17 00:00:00 2001
From: Yizhi Liu <liuyizhi@apache.org>
Date: Wed, 17 Mar 2021 10:59:34 -0700
Subject: [PATCH 350/357] [COMMUNITY] @areusch -> Committer (#7679)

---
 CONTRIBUTORS.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index f72220d07f16..eb2af2151acc 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -60,6 +60,7 @@ We do encourage everyone to work anything they are interested in.
 - [Thierry Moreau](https://github.com/tmoreau89) (PMC): @tmoreau89 - vta
 - [Kazutaka Morita](https://github.com/kazum): @kazum - frontends, opencl
 - [Krzysztof Parzyszek](https://github.com/kparzysz-quic): @kparzysz-quic - hexagon, llvm
+- [Andrew Reusch](https://github.com/areusch): @areusch - runtime, µTVM
 - [Jared Roesch](https://github.com/jroesch) (PMC): @jroesch - relay
 - [Siju Samuel](https://github.com/siju-samuel): @siju-samuel - frontends
 - [Siva](https://github.com/srkreddy1238): @srkreddy1238 - frontends, golang

From c871784f3eaaab960b6d29ac9b31bb287eb9e588 Mon Sep 17 00:00:00 2001
From: Chao Liu <monklof@gmail.com>
Date: Thu, 18 Mar 2021 02:45:49 +0800
Subject: [PATCH 351/357] [Frontend,TOPI] Improve dynamism for BatchMatmul and
 Dense (#7496)

* [TOPI] Dense cuda schedule support dynamic dimension

* [TOPI] batch_matmul cublas te computation support dynamism

* [Frontend] tensorflow frontend: dynamic support for BatchMatmul

* [TOPI] nn batch_matmul te computation support dynamism

* fix CI

* Update python/tvm/topi/nn/batch_matmul.py

Co-authored-by: Cody Yu <comaniac0422@gmail.com>

* Update python/tvm/topi/cuda/batch_matmul.py

Co-authored-by: Cody Yu <comaniac0422@gmail.com>

* remove concat_dynamic_shape function

* update topi dense op integer checking

* fix ci

* Update python/tvm/relay/frontend/tensorflow.py

Co-authored-by: Cody Yu <comaniac0422@gmail.com>

* Update batch_matmul.py

* [Frontend] add test for batch_matmul in dynamic shaped case

Co-authored-by: Cody Yu <comaniac0422@gmail.com>
---
 python/tvm/relay/frontend/tensorflow.py       | 54 +++++++++++++++----
 python/tvm/topi/cuda/batch_matmul.py          |  7 +--
 python/tvm/topi/cuda/dense.py                 | 11 ++--
 .../frontend/tensorflow/test_forward.py       | 52 +++++++++++++++++-
 4 files changed, 104 insertions(+), 20 deletions(-)

diff --git a/python/tvm/relay/frontend/tensorflow.py b/python/tvm/relay/frontend/tensorflow.py
index f56d187b6a63..1946223a50a4 100644
--- a/python/tvm/relay/frontend/tensorflow.py
+++ b/python/tvm/relay/frontend/tensorflow.py
@@ -44,6 +44,17 @@
 __all__ = ["from_tensorflow"]
 
 
+def check_symbolic_shape(shape):
+    return not all([isinstance(dim, (int, tvm.tir.IntImm)) for dim in shape])
+
+
+def list_shape_of(tensor, ndim):
+    shape_tensor = _op.shape_of(tensor)
+    return [
+        _op.strided_slice(shape_tensor, begin=[i], end=[i + 1], strides=[1]) for i in range(ndim)
+    ]
+
+
 def _get_pad_pair(input1d, kernel1d, stride1d):
     if input1d % stride1d == 0:
         pad = max(kernel1d - stride1d, 0)
@@ -1022,13 +1033,31 @@ def _impl(inputs, attr, params, mod):
         input_y = inputs[1]
         orig_shape_x = _infer_shape(input_x, mod)
         orig_shape_y = _infer_shape(input_y, mod)
+        ndim = len(orig_shape_x)
+
+        is_static = not check_symbolic_shape(orig_shape_x)
+
+        if ndim > 3 and not is_static:
+            shape_of_x = list_shape_of(inputs[0], ndim)
+            shape_of_y = list_shape_of(inputs[1], ndim)
 
         # reshape n-dimensional batch matmul into 3d
-        if len(orig_shape_x) > 3:
+        if ndim > 3:
             outer_dims = [orig_shape_x[i] for i in range(0, len(orig_shape_x) - 2)]
-            num_outer_elts = np.prod(outer_dims)
-            new_shape_x = (num_outer_elts, orig_shape_x[-2], orig_shape_x[-1])
-            new_shape_y = (num_outer_elts, orig_shape_y[-2], orig_shape_y[-1])
+            if is_static:
+                num_outer_elts = np.prod(outer_dims)
+                new_shape_x = (num_outer_elts, orig_shape_x[-2], orig_shape_x[-1])
+                new_shape_y = (num_outer_elts, orig_shape_y[-2], orig_shape_y[-1])
+            else:  # handle dynamic shape (dyn.reshape op)
+                # new shape = [prod(shape[:-2]), -2, -1]
+                new_shape_x = [_op.const(1), shape_of_x[-2], shape_of_x[-1]]
+                new_shape_y = [_op.const(1), shape_of_y[-2], shape_of_y[-1]]
+                for i in range(ndim - 2):
+                    new_shape_x[0] *= shape_of_x[i]
+                    new_shape_y[0] *= shape_of_y[i]
+                new_shape_x = _op.concatenate(_op.Tuple(new_shape_x), axis=0)
+                new_shape_y = _op.concatenate(_op.Tuple(new_shape_y), axis=0)
+
             input_x = _op.reshape(input_x, newshape=new_shape_x)
             input_y = _op.reshape(input_y, newshape=new_shape_y)
 
@@ -1039,12 +1068,19 @@ def _impl(inputs, attr, params, mod):
         ret = get_relay_op("batch_matmul")(input_x, input_y)
 
         # reshape result back to n-dimensional
-        if len(orig_shape_x) > 3:
-            final_shape = list(orig_shape_x)
-            final_shape[-2] = orig_shape_x[-1] if adj_x else orig_shape_x[-2]
-            final_shape[-1] = orig_shape_y[-2] if adj_y else orig_shape_y[-1]
-            ret = _op.reshape(ret, newshape=final_shape)
+        if ndim > 3:
+            if is_static:
+                final_shape = list(orig_shape_x)
+                final_shape[-2] = orig_shape_x[-1] if adj_x else orig_shape_x[-2]
+                final_shape[-1] = orig_shape_y[-2] if adj_y else orig_shape_y[-1]
+            else:
+                # calculate the resulting shape = [shape[:-2], 0, 0]
+                final_shape = list(shape_of_x)
+                final_shape[-2] = shape_of_x[-1] if adj_x else shape_of_x[-2]
+                final_shape[-1] = shape_of_y[-2] if adj_y else shape_of_y[-1]
+                final_shape = _op.concatenate(_op.Tuple(final_shape), axis=0)
 
+            ret = _op.reshape(ret, newshape=final_shape)
         return ret
 
     return _impl
diff --git a/python/tvm/topi/cuda/batch_matmul.py b/python/tvm/topi/cuda/batch_matmul.py
index 006b866d6bad..04e484f526d2 100644
--- a/python/tvm/topi/cuda/batch_matmul.py
+++ b/python/tvm/topi/cuda/batch_matmul.py
@@ -159,9 +159,10 @@ def batch_matmul_cublas(cfg, x, y, out_shape=None):
     output : tvm.te.Tensor
         3-D with shape [batch, M, N]
     """
-    b, m, k = x.shape
-    b, n, k = y.shape
-    cfg.add_flop(b * m * k * n * 2)
+    b, m, k = get_const_tuple(x.shape)
+    b, n, k = get_const_tuple(y.shape)
+    if all([isinstance(s, int) for s in [b, m, n, k]]):
+        cfg.add_flop(b * m * k * n * 2)
     return cublas.batch_matmul(x, y, False, True)
 
 
diff --git a/python/tvm/topi/cuda/dense.py b/python/tvm/topi/cuda/dense.py
index ad4882ab09f2..8adc38b84b1b 100644
--- a/python/tvm/topi/cuda/dense.py
+++ b/python/tvm/topi/cuda/dense.py
@@ -17,7 +17,7 @@
 # pylint: disable=invalid-name, unused-argument
 """Schedule for dense operator"""
 import logging
-from tvm import te, tir
+from tvm import te
 import tvm.autotvm as autotvm
 from tvm.autotvm.task.space import SplitEntity
 from tvm.contrib import cublas
@@ -39,14 +39,11 @@ def dense_cublas(cfg, data, weight, bias=None, out_dtype=None):
     if out_dtype is None:
         out_dtype = data.dtype
     assert out_dtype == data.dtype, "Mixed precision not supported."
-    batch, in_dim = data.shape
-    out_dim, _ = weight.shape
+    batch, in_dim = get_const_tuple(data.shape)
+    out_dim, _ = get_const_tuple(weight.shape)
     matmul = cublas.matmul(data, weight, False, True)
-    if isinstance(batch, int):
+    if all(isinstance(d, int) for d in [batch, in_dim, out_dim]):
         cfg.add_flop(batch * in_dim * out_dim * 2)
-    elif isinstance(batch, tir.IntImm):
-        cfg.add_flop(batch.value * in_dim * out_dim * 2)
-    # if we get a te.Var, we cannot add flop counts
     if bias is not None:
         matmul = te.compute(
             (batch, out_dim), lambda i, j: matmul[i, j] + bias[j], tag=tag.BROADCAST
diff --git a/tests/python/frontend/tensorflow/test_forward.py b/tests/python/frontend/tensorflow/test_forward.py
index fa27dee37699..22afe8f88f66 100644
--- a/tests/python/frontend/tensorflow/test_forward.py
+++ b/tests/python/frontend/tensorflow/test_forward.py
@@ -210,6 +210,7 @@ def compare_tf_with_tvm(
     mode="graph_runtime",
     cuda_layout="NCHW",
     add_shapes_to_graph_def=True,
+    targets=None,
 ):
     """Generic function to generate and compare tensorflow and TVM output"""
 
@@ -233,13 +234,18 @@ def name_without_num(name):
 
         tf_output = run_tf_graph(sess, in_data, in_name, out_name)
 
-        for device in ["llvm", "cuda"]:
+        devices = targets if targets else ["llvm", "cuda"]
+
+        for device in devices:
             ctx = tvm.context(device, 0)
             if not tvm.testing.device_enabled(device):
                 print("Skip because %s is not enabled" % device)
                 continue
             if no_gpu and device == "cuda":
                 continue
+            if "cublas" in device and not tvm.get_global_func("tvm.contrib.cublas.matmul", True):
+                print("Skip because cublas is not enabled: %s" % device)
+                continue
 
             tvm_output = run_tvm_graph(
                 final_graph_def,
@@ -1781,6 +1787,23 @@ def _test_batch_matmul(A_shape, B_shape, dtype, adjoint_a=False, adjoint_b=False
         compare_tf_with_tvm([A_np, B_np], [A.name, B.name], result.name)
 
 
+def _test_batch_matmul_dynamic(
+    A_shape, B_shape, A_np_shape, B_np_shape, dtype, adjoint_a=False, adjoint_b=False
+):
+    with tf.Graph().as_default():
+        A = tf.placeholder(shape=A_shape, dtype=dtype, name="A")
+        B = tf.placeholder(shape=B_shape, dtype=dtype, name="B")
+        result = tf.matmul(A, B, adjoint_a=adjoint_a, adjoint_b=adjoint_b, name="batchmatmul")
+
+        A_np = np.random.uniform(high=5.0, size=A_np_shape).astype(dtype)
+        B_np = np.random.uniform(high=5.0, size=B_np_shape).astype(dtype)
+        # for now, in TOPI, only cublas's implementation support dynamic shape
+        # TODO add more backends support in TOPI
+        compare_tf_with_tvm(
+            [A_np, B_np], [A.name, B.name], result.name, mode="vm", targets=["cuda -libs=cublas"]
+        )
+
+
 def test_forward_batch_matmul():
     """ TF op BatchMatMul, BatchMatMulV2 test"""
     _test_batch_matmul((3, 5, 4), (3, 4, 5), "int32")
@@ -1793,6 +1816,33 @@ def test_forward_batch_matmul():
     _test_batch_matmul((2, 3, 4, 2, 3, 4, 5, 6), (2, 3, 4, 2, 3, 4, 5, 6), "float32", False, True)
 
 
+@tvm.testing.requires_cuda
+def test_forward_batch_matmul_dynamic():
+    _test_batch_matmul_dynamic((None, 5, 4), (None, 4, 5), (3, 5, 4), (3, 4, 5), "int32")
+    _test_batch_matmul_dynamic(
+        (None, 5, 4), (None, 4, 5), (3, 5, 4), (3, 4, 5), "float32", True, True
+    )
+    _test_batch_matmul_dynamic(
+        (None, 5, 4), (None, 5, 4), (3, 5, 4), (3, 5, 4), "int32", True, False
+    )
+    _test_batch_matmul_dynamic(
+        (None, 5, 4), (None, 5, 4), (3, 5, 4), (3, 5, 4), "float32", False, True
+    )
+    _test_batch_matmul_dynamic(
+        (None, 4, 5, 6), (None, 4, 6, 5), (3, 4, 5, 6), (3, 4, 6, 5), "float32"
+    )
+    _test_batch_matmul_dynamic(
+        (None, None, 5, 6), (None, None, 6, 5), (3, 4, 5, 6), (3, 4, 6, 5), "float32"
+    )
+    _test_batch_matmul_dynamic(
+        (None, None, None, 5, 6),
+        (None, None, None, 6, 5),
+        (2, 3, 4, 5, 6),
+        (2, 3, 4, 6, 5),
+        "float32",
+    )
+
+
 #######################################################################
 # SparseTensorDenseMatMul
 # ----------------------------------

From ab86aa69e2337cb2f9ead6105f58fed90d7023e2 Mon Sep 17 00:00:00 2001
From: Josh Fromm <jwfromm@uw.edu>
Date: Wed, 17 Mar 2021 21:54:53 -0700
Subject: [PATCH 352/357] [Relay][QNN] Relax simulated qnn tests to prevent
 flakiness. (#7684)

* Relax simulated qnn tests to prevent flakiness.

* Change name of helper to make pytest happy.
---
 .../relay/test_op_qnn_simulated_dequantize.py   | 10 +++++-----
 .../relay/test_op_qnn_simulated_quantize.py     | 17 ++++++++++++-----
 tests/python/topi/python/test_topi_qnn.py       |  8 ++++++--
 3 files changed, 23 insertions(+), 12 deletions(-)

diff --git a/tests/python/relay/test_op_qnn_simulated_dequantize.py b/tests/python/relay/test_op_qnn_simulated_dequantize.py
index 0cc04e4998eb..a9333c916561 100644
--- a/tests/python/relay/test_op_qnn_simulated_dequantize.py
+++ b/tests/python/relay/test_op_qnn_simulated_dequantize.py
@@ -81,7 +81,7 @@ def verify_simulated_dequantize_simple(dtype):
     dtype = relay.var("dtype", shape=[])
     vm = build_simulated_dequantize(input_data, scale, zp, dtype)
     sim_dq_out = vm.invoke("main", input_data=data_fp, scale=scale_np, zp=zp_np, dtype=dtype_np)
-    np.testing.assert_equal(sim_dq_out.asnumpy(), dq_out)
+    np.testing.assert_allclose(sim_dq_out.asnumpy(), dq_out, rtol=1e-5)
 
 
 def test_simulated_dequantize():
@@ -112,7 +112,7 @@ def test_dynamic_channels():
     dtype = relay.var("dtype", shape=[])
     vm = build_simulated_dequantize(input_data, scale, zp, dtype, axis=0)
     sim_dq_out = vm.invoke("main", input_data=data_fp, scale=scale_np, zp=zp_np, dtype=dtype_np)
-    np.testing.assert_equal(sim_dq_out.asnumpy(), dq_out)
+    np.testing.assert_allclose(sim_dq_out.asnumpy(), dq_out, rtol=1e-5)
 
     # Now get the perchannel quantize output and compare without recompiling.
     scale_np = np.array([0.5, 0.25]).astype("float32")
@@ -128,7 +128,7 @@ def test_dynamic_channels():
     )
     # Run the simulated quantize without recompiling and confirm results match.
     sim_dq_out = vm.invoke("main", input_data=data_fp, scale=scale_np, zp=zp_np, dtype=dtype_np)
-    np.testing.assert_equal(sim_dq_out.asnumpy(), dq_out)
+    np.testing.assert_allclose(sim_dq_out.asnumpy(), dq_out, rtol=1e-5)
 
 
 def test_dynamic_dtype():
@@ -153,7 +153,7 @@ def test_dynamic_dtype():
     dtype = relay.var("dtype", shape=[])
     vm = build_simulated_dequantize(input_data, scale, zp, dtype)
     sim_dq_out = vm.invoke("main", input_data=data_fp, scale=scale_np, zp=zp_np, dtype=dtype_np)
-    np.testing.assert_equal(sim_dq_out.asnumpy(), dq_out)
+    np.testing.assert_allclose(sim_dq_out.asnumpy(), dq_out, rtol=1e-5)
 
     # Now test int8 to float32 compilation.
     data = np.random.uniform(low=0, high=255, size=[2, 5]).astype("int8")
@@ -168,7 +168,7 @@ def test_dynamic_dtype():
     # Run the simulated quantize without recompiling and confirm results match.
     dtype_np = np.int32(SQNN_DTYPE_TO_CODE["int8"])
     sim_dq_out = vm.invoke("main", input_data=data_fp, scale=scale_np, zp=zp_np, dtype=dtype_np)
-    np.testing.assert_equal(sim_dq_out.asnumpy(), dq_out)
+    np.testing.assert_allclose(sim_dq_out.asnumpy(), dq_out, rtol=1e-5)
 
 
 if __name__ == "__main__":
diff --git a/tests/python/relay/test_op_qnn_simulated_quantize.py b/tests/python/relay/test_op_qnn_simulated_quantize.py
index ee4ba209dcb8..c0fa0648d879 100644
--- a/tests/python/relay/test_op_qnn_simulated_quantize.py
+++ b/tests/python/relay/test_op_qnn_simulated_quantize.py
@@ -24,6 +24,13 @@
 from tvm.topi.nn.qnn import SQNN_DTYPE_TO_CODE
 
 
+def allclose_with_rounding(a, b):
+    # Find number of mismatches in inputs.
+    mismatch = a != b
+    # Allow some rounding errors due to GPU fp32 arithmetic.
+    assert np.sum(mismatch) <= 3
+
+
 def quantize_test_driver(in_dtype, quant_args, axis, out_dtype, in_data):
     shape = in_data.shape
     input_data = relay.var("input_data", shape=shape, dtype=in_dtype)
@@ -82,7 +89,7 @@ def verify_simulated_quantize_simple(dtype):
     dtype = relay.var("dtype", shape=[])
     vm = build_simulated_quantize(input_data, scale, zp, dtype)
     sim_q_out = vm.invoke("main", input_data=data, scale=scale_np, zp=zp_np, dtype=dtype_np)
-    np.testing.assert_equal(sim_q_out.asnumpy(), q_out)
+    allclose_with_rounding(sim_q_out.asnumpy(), q_out)
 
 
 def test_simulated_quantize():
@@ -113,7 +120,7 @@ def test_dynamic_channels():
     dtype = relay.var("dtype", shape=[])
     vm = build_simulated_quantize(input_data, scale, zp, dtype, axis=0)
     sim_q_out = vm.invoke("main", input_data=data, scale=scale_np, zp=zp_np, dtype=dtype_np)
-    np.testing.assert_equal(sim_q_out.asnumpy(), q_out)
+    allclose_with_rounding(sim_q_out.asnumpy(), q_out)
 
     # Now get the perchannel quantize output and compare without recompiling.
     scale_np = np.array([0.5, 0.25]).astype("float32")
@@ -130,7 +137,7 @@ def test_dynamic_channels():
     )
     # Run the simulated quantize without recompiling and confirm results match.
     sim_q_out = vm.invoke("main", input_data=data, scale=scale_np, zp=zp_np, dtype=dtype_np)
-    np.testing.assert_equal(sim_q_out.asnumpy(), q_out)
+    allclose_with_rounding(sim_q_out.asnumpy(), q_out)
 
 
 def test_dynamic_dtype():
@@ -155,7 +162,7 @@ def test_dynamic_dtype():
     dtype = relay.var("dtype", shape=[])
     vm = build_simulated_quantize(input_data, scale, zp, dtype)
     sim_q_out = vm.invoke("main", input_data=data, scale=scale_np, zp=zp_np, dtype=dtype_np)
-    np.testing.assert_equal(sim_q_out.asnumpy(), q_out)
+    allclose_with_rounding(sim_q_out.asnumpy(), q_out)
 
     # Now test float32 to int32 compilation.
     # Get the reference quantize output.
@@ -169,7 +176,7 @@ def test_dynamic_dtype():
     # Run the simulated quantize without recompiling and confirm results match.
     dtype_np = np.int32(SQNN_DTYPE_TO_CODE["int32"])
     sim_q_out = vm.invoke("main", input_data=data, scale=scale_np, zp=zp_np, dtype=dtype_np)
-    np.testing.assert_equal(sim_q_out.asnumpy(), q_out)
+    allclose_with_rounding(sim_q_out.asnumpy(), q_out)
 
 
 if __name__ == "__main__":
diff --git a/tests/python/topi/python/test_topi_qnn.py b/tests/python/topi/python/test_topi_qnn.py
index a63f34fe08d0..386f77335f1a 100644
--- a/tests/python/topi/python/test_topi_qnn.py
+++ b/tests/python/topi/python/test_topi_qnn.py
@@ -72,7 +72,9 @@ def check_device(device, ctx):
         func(a, d, s, z, q)
 
         # Check correctness against the true qnn output.
-        tvm.testing.assert_allclose(q.asnumpy(), real_q_out.asnumpy().astype("float32"))
+        mismatch = q.asnumpy() != real_q_out.asnumpy().astype("float32")
+        # Allow some rounding errors due to GPU fp32 arithmetic.
+        assert np.sum(mismatch) <= 3
 
     for target, ctx in tvm.testing.enabled_targets():
         check_device(target, ctx)
@@ -137,7 +139,9 @@ def check_device(device, ctx):
         func(a, d, s, z, dq)
 
         # Check correctness against the true qnn output.
-        tvm.testing.assert_allclose(dq.asnumpy(), real_dq_out.asnumpy().astype("float32"))
+        tvm.testing.assert_allclose(
+            dq.asnumpy(), real_dq_out.asnumpy().astype("float32"), rtol=1e-5
+        )
 
     for target, ctx in tvm.testing.enabled_targets():
         check_device(target, ctx)

From 4976bb2a2f90ecfdad7306257daa4e5948af74e0 Mon Sep 17 00:00:00 2001
From: Cody Yu <comaniac0422@gmail.com>
Date: Wed, 17 Mar 2021 22:08:51 -0700
Subject: [PATCH 353/357] [Relay] Add TopPattern to nn.dropout (#7685)

---
 src/relay/op/nn/nn.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/relay/op/nn/nn.cc b/src/relay/op/nn/nn.cc
index 0ea71de367fa..b2404cc1954b 100644
--- a/src/relay/op/nn/nn.cc
+++ b/src/relay/op/nn/nn.cc
@@ -590,6 +590,7 @@ The whole array is rescaled by ``1/(1-p)`` to keep the expected sum of the input
     .set_num_inputs(1)
     .add_argument("data", "Tensor", "Input to which dropout will be applied.")
     .set_support_level(1)
+    .set_attr<TOpPattern>("TOpPattern", kOpaque)
     .set_attr<FInferCorrectLayout>("FInferCorrectLayout", ElemwiseArbitraryLayout)
     .add_type_rel("Dropout", DropoutRel)
     .set_attr<TOpIsStateful>("TOpIsStateful", true);

From 38aed59f9fdddcbc9ac98afb8aa11455c81fc9de Mon Sep 17 00:00:00 2001
From: CircleSpin <2keepconnected@gmail.com>
Date: Thu, 18 Mar 2021 01:09:55 -0400
Subject: [PATCH 354/357] [TVMC] Allow optional arguments to be passed to
 importers (#7674)

* add support for optional args for frontends tvmc

* remove unnecessary comments

* Add changes suggested by Matt W. via PR

Co-authored-by: Jocelyn <jocelyn@pop-os.localdomain>
---
 python/tvm/driver/tvmc/frontends.py        | 27 +++++++++++-----------
 tests/python/driver/tvmc/test_frontends.py | 22 ++++++++++++------
 2 files changed, 29 insertions(+), 20 deletions(-)

diff --git a/python/tvm/driver/tvmc/frontends.py b/python/tvm/driver/tvmc/frontends.py
index 16e6c8eb966e..0488223c782f 100644
--- a/python/tvm/driver/tvmc/frontends.py
+++ b/python/tvm/driver/tvmc/frontends.py
@@ -54,7 +54,7 @@ def suffixes():
         """File suffixes (extensions) used by this frontend"""
 
     @abstractmethod
-    def load(self, path, shape_dict=None):
+    def load(self, path, shape_dict=None, **kwargs):
         """Load a model from a given path.
 
         Parameters
@@ -101,7 +101,7 @@ def name():
     def suffixes():
         return ["h5"]
 
-    def load(self, path, shape_dict=None):
+    def load(self, path, shape_dict=None, **kwargs):
         # pylint: disable=C0103
         tf, keras = import_keras()
 
@@ -130,7 +130,8 @@ def load(self, path, shape_dict=None):
         input_shapes = {name: x.shape for (name, x) in zip(model.input_names, inputs)}
         if shape_dict is not None:
             input_shapes.update(shape_dict)
-        return relay.frontend.from_keras(model, input_shapes, layout="NHWC")
+        kwargs.setdefault("layout", "NHWC")
+        return relay.frontend.from_keras(model, input_shapes, **kwargs)
 
     def is_sequential_p(self, model):
         _, keras = import_keras()
@@ -158,14 +159,14 @@ def name():
     def suffixes():
         return ["onnx"]
 
-    def load(self, path, shape_dict=None):
+    def load(self, path, shape_dict=None, **kwargs):
         # pylint: disable=C0415
         import onnx
 
         # pylint: disable=E1101
         model = onnx.load(path)
 
-        return relay.frontend.from_onnx(model, shape=shape_dict)
+        return relay.frontend.from_onnx(model, shape=shape_dict, **kwargs)
 
 
 class TensorflowFrontend(Frontend):
@@ -179,7 +180,7 @@ def name():
     def suffixes():
         return ["pb"]
 
-    def load(self, path, shape_dict=None):
+    def load(self, path, shape_dict=None, **kwargs):
         # pylint: disable=C0415
         import tensorflow as tf
         import tvm.relay.testing.tf as tf_testing
@@ -192,7 +193,7 @@ def load(self, path, shape_dict=None):
         graph_def = tf_testing.ProcessGraphDefParam(graph_def)
 
         logger.debug("parse TensorFlow model and convert into Relay computation graph")
-        return relay.frontend.from_tensorflow(graph_def, shape=shape_dict)
+        return relay.frontend.from_tensorflow(graph_def, shape=shape_dict, **kwargs)
 
 
 class TFLiteFrontend(Frontend):
@@ -206,7 +207,7 @@ def name():
     def suffixes():
         return ["tflite"]
 
-    def load(self, path, shape_dict=None):
+    def load(self, path, shape_dict=None, **kwargs):
         # pylint: disable=C0415
         import tflite.Model as model
 
@@ -229,7 +230,7 @@ def load(self, path, shape_dict=None):
             raise TVMCException("input file not tflite version 3")
 
         logger.debug("parse TFLite model and convert into Relay computation graph")
-        mod, params = relay.frontend.from_tflite(tflite_model, shape_dict=shape_dict)
+        mod, params = relay.frontend.from_tflite(tflite_model, shape_dict=shape_dict, **kwargs)
         return mod, params
 
 
@@ -245,7 +246,7 @@ def suffixes():
         # Torch Script is a zip file, but can be named pth
         return ["pth", "zip"]
 
-    def load(self, path, shape_dict=None):
+    def load(self, path, shape_dict=None, **kwargs):
         # pylint: disable=C0415
         import torch
 
@@ -259,7 +260,7 @@ def load(self, path, shape_dict=None):
         input_shapes = list(shape_dict.items())
 
         logger.debug("parse Torch model and convert into Relay computation graph")
-        return relay.frontend.from_pytorch(traced_model, input_shapes)
+        return relay.frontend.from_pytorch(traced_model, input_shapes, **kwargs)
 
 
 ALL_FRONTENDS = [
@@ -339,7 +340,7 @@ def guess_frontend(path):
     raise TVMCException("failed to infer the model format. Please specify --model-format")
 
 
-def load_model(path, model_format=None, shape_dict=None):
+def load_model(path, model_format=None, shape_dict=None, **kwargs):
     """Load a model from a supported framework and convert it
     into an equivalent relay representation.
 
@@ -367,6 +368,6 @@ def load_model(path, model_format=None, shape_dict=None):
     else:
         frontend = guess_frontend(path)
 
-    mod, params = frontend.load(path, shape_dict)
+    mod, params = frontend.load(path, shape_dict, **kwargs)
 
     return mod, params
diff --git a/tests/python/driver/tvmc/test_frontends.py b/tests/python/driver/tvmc/test_frontends.py
index b41f4c4dff2d..5a63c5c47933 100644
--- a/tests/python/driver/tvmc/test_frontends.py
+++ b/tests/python/driver/tvmc/test_frontends.py
@@ -115,26 +115,34 @@ def test_load_model__tflite(tflite_mobilenet_v1_1_quant):
     assert "_param_1" in params.keys()
 
 
-def test_load_model__keras(keras_resnet50):
+@pytest.mark.parametrize("load_model_kwargs", [{}, {"layout": "NCHW"}])
+def test_load_model__keras(keras_resnet50, load_model_kwargs):
     # some CI environments wont offer TensorFlow/Keras, so skip in case it is not present
     pytest.importorskip("tensorflow")
 
-    mod, params = tvmc.frontends.load_model(keras_resnet50)
+    mod, params = tvmc.frontends.load_model(keras_resnet50, **load_model_kwargs)
     assert type(mod) is IRModule
     assert type(params) is dict
     ## check whether one known value is part of the params dict
     assert "_param_1" in params.keys()
 
 
+def verify_load_model__onnx(model, **kwargs):
+    mod, params = tvmc.frontends.load_model(model, **kwargs)
+    assert type(mod) is IRModule
+    assert type(params) is dict
+    return mod, params
+
+
 def test_load_model__onnx(onnx_resnet50):
     # some CI environments wont offer onnx, so skip in case it is not present
     pytest.importorskip("onnx")
-
-    mod, params = tvmc.frontends.load_model(onnx_resnet50)
-    assert type(mod) is IRModule
-    assert type(params) is dict
-    ## check whether one known value is part of the params dict
+    mod, params = verify_load_model__onnx(onnx_resnet50)
+    # check whether one known value is part of the params dict
     assert "resnetv24_batchnorm0_gamma" in params.keys()
+    mod, params = verify_load_model__onnx(onnx_resnet50, freeze_params=True)
+    # check that the parameter dict is empty, implying that they have been folded into constants
+    assert params == {}
 
 
 def test_load_model__pb(pb_mobilenet_v1_1_quant):

From c976a07fa24efe91e8ac4f9d088a14442bb8d161 Mon Sep 17 00:00:00 2001
From: Tristan Konolige <tristan.konolige@gmail.com>
Date: Thu, 18 Mar 2021 06:08:08 -0700
Subject: [PATCH 355/357] [RUNTIME] Add libbacktrace for backtraces with line
 numbers (#7153)

* [RUNTIME] Add libbacktrace for backtraces with line numbers

Co-authored-by: Robert Kimball <bobkimball@gmail.com>
---
 .gitmodules                                   |   3 +
 3rdparty/dmlc-core                            |   2 +-
 3rdparty/libbacktrace                         |   1 +
 CMakeLists.txt                                |  62 ++-
 .../app/src/main/jni/Application.mk           |   4 +-
 .../app/src/main/jni/tvm_runtime.h            |  38 +-
 .../app/src/main/jni/Application.mk           |   2 +-
 .../app/src/main/jni/tvm_runtime.h            |   3 +
 .../app/src/main/jni/Application.mk           |   2 +-
 .../app/src/main/jni/tvm_runtime.h            |  36 +-
 apps/bundle_deploy/Makefile                   |   6 +-
 apps/dso_plugin_module/Makefile               |   3 +-
 apps/extension/Makefile                       |   3 +-
 apps/ios_rpc/tvmrpc.xcodeproj/project.pbxproj |   6 +
 apps/ios_rpc/tvmrpc/TVMRuntime.h              |   2 +-
 apps/ios_rpc/tvmrpc/TVMRuntime.mm             |  16 +-
 apps/ios_rpc/tvmrpc/ViewController.mm         |   4 +-
 cmake/config.cmake                            |   5 +
 cmake/modules/Libbacktrace.cmake              |  45 ++
 cmake/modules/VTA.cmake                       |   3 +
 golang/Makefile                               |   2 +-
 include/tvm/ir/attrs.h                        |   4 +-
 include/tvm/ir/diagnostic.h                   |   9 +
 include/tvm/ir/error.h                        |  26 +-
 include/tvm/ir/type_relation.h                |   2 +-
 include/tvm/relay/analysis.h                  |   2 +-
 include/tvm/runtime/container.h               |   1 +
 include/tvm/runtime/data_type.h               |   2 +-
 include/tvm/runtime/logging.h                 | 438 ++++++++++++++++++
 include/tvm/runtime/object.h                  |   2 +-
 include/tvm/runtime/packed_func.h             |   6 +-
 include/tvm/runtime/vm/bytecode.h             |   2 +-
 include/tvm/support/logging.h                 | 158 -------
 include/tvm/support/with.h                    |   2 +-
 licenses/LICENSE.libbacktrace.txt             |  29 ++
 python/setup.py                               |   2 +-
 python/tvm/_ffi/base.py                       |   4 +-
 python/tvm/micro/build.py                     |   2 +-
 src/auto_scheduler/compute_dag.cc             |   2 +-
 src/auto_scheduler/feature.cc                 |   2 +-
 .../search_policy/sketch_policy_rules.cc      |   4 +-
 src/auto_scheduler/transform_step.cc          |   2 +-
 src/ir/error.cc                               |   3 +-
 src/parser/parser.cc                          |   8 +-
 src/parser/span_check.h                       |   2 +-
 src/relay/analysis/annotated_region_set.cc    |   5 +-
 src/relay/analysis/kind_check.cc              |   2 +-
 src/relay/analysis/type_solver.cc             |   4 +-
 src/relay/analysis/well_formed.cc             |   2 +-
 src/relay/backend/vm/compiler.cc              |   2 +-
 src/relay/backend/vm/compiler.h               |   2 +-
 src/relay/backend/vm/inline_primitives.cc     |   2 +-
 src/relay/backend/vm/lambda_lift.cc           |   2 +-
 src/relay/backend/vm/removed_unused_funcs.cc  |   2 +-
 src/relay/op/nn/convolution.h                 |   2 +-
 src/relay/op/tensor/transform.cc              |  12 +-
 src/relay/op/tensor/transform.h               |   4 +-
 src/relay/op/type_relations.cc                |   2 +-
 src/relay/qnn/op/concatenate.cc               |  14 +-
 src/relay/transforms/fold_explicit_padding.cc |   2 +-
 src/relay/transforms/inline.cc                |   2 +-
 src/relay/transforms/memory_alloc.cc          |   2 +-
 src/relay/transforms/partial_eval.cc          |   4 +-
 src/relay/transforms/simplify_expr.cc         |   2 +-
 src/relay/transforms/to_a_normal_form.cc      |   2 +-
 .../transforms/to_basic_block_normal_form.cc  |   2 +-
 src/relay/transforms/type_infer.cc            |   2 +-
 src/runtime/c_runtime_api.cc                  |   6 +-
 src/runtime/contrib/cblas/cblas.cc            |   2 +-
 src/runtime/contrib/cblas/mkl.cc              |   2 +-
 src/runtime/contrib/cblas/mkldnn.cc           |   2 +-
 src/runtime/contrib/cublas/cublas.cc          |   2 +-
 src/runtime/contrib/cublas/cublas_utils.h     |   2 +-
 src/runtime/contrib/cudnn/cudnn_utils.h       |   2 +-
 src/runtime/contrib/miopen/miopen_utils.h     |   2 +-
 src/runtime/contrib/mps/mps_utils.h           |   2 +-
 src/runtime/contrib/nnpack/convolution.cc     |   2 +-
 src/runtime/contrib/nnpack/fully_connected.cc |   2 +-
 src/runtime/contrib/nnpack/nnpack_utils.h     |   2 +-
 .../contrib/random/mt_random_engine.cc        |   2 +-
 src/runtime/contrib/random/random.cc          |   2 +-
 src/runtime/contrib/rocblas/rocblas.cc        |   2 +-
 .../contrib/tensorrt/tensorrt_logger.h        |   2 +-
 .../contrib/vitis_ai/vitis_ai_runtime.cc      |   1 +
 src/runtime/cpu_device_api.cc                 |   2 +-
 src/runtime/crt/Makefile                      |   4 +-
 src/runtime/crt/graph_runtime/load_json.c     |   2 +-
 src/runtime/file_utils.cc                     |   2 +-
 src/runtime/graph/graph_runtime.cc            |   2 +-
 src/runtime/hexagon/hexagon_device_api.cc     |   2 +-
 src/runtime/hexagon/hexagon_module.cc         |   2 +-
 src/runtime/hexagon/hexagon_module.h          |   2 +-
 src/runtime/hexagon/sim/hexagon_device_sim.cc |   2 +-
 .../hexagon/target/hexagon_dsprpcapi.cc       |   2 +-
 .../hexagon/target/hexagon_dsprpcapi.h        |   2 +-
 src/runtime/hexagon/target/hexagon_stubapi.cc |   2 +-
 src/runtime/hexagon/target/hexagon_stubapi.h  |   2 +-
 src/runtime/logging.cc                        | 151 ++++++
 src/runtime/metal/metal_common.h              |   2 +-
 src/runtime/micro/micro_session.cc            |   2 +-
 src/runtime/minrpc/minrpc_server.h            |   2 +-
 src/runtime/ndarray.cc                        |   2 +-
 src/runtime/object.cc                         |   2 +-
 src/runtime/opencl/opencl_common.h            |   2 +-
 src/runtime/registry.cc                       |   2 +-
 src/runtime/rocm/rocm_device_api.cc           |   2 +-
 src/runtime/rpc/rpc_device_api.cc             |   4 +-
 src/runtime/rpc/rpc_endpoint.cc               |  10 +-
 src/runtime/rpc/rpc_module.cc                 |   4 +-
 src/runtime/rpc/rpc_session.cc                |   8 +-
 src/runtime/runtime_base.h                    |   6 +-
 src/runtime/thread_pool.cc                    |   2 +-
 src/runtime/threading_backend.cc              |   2 +-
 src/runtime/vm/bytecode.cc                    |   2 +-
 src/runtime/vm/vm.cc                          |   2 +-
 src/runtime/vulkan/vulkan_common.h            |   2 +-
 src/runtime/vulkan/vulkan_shader.h            |   2 +-
 src/support/base64.h                          |   2 +-
 src/support/parallel_for.cc                   |   2 +-
 src/support/pipe.h                            |   2 +-
 src/support/socket.h                          |   2 +-
 src/target/llvm/llvm_common.cc                |   2 +-
 src/target/target.cc                          |  81 ++--
 tests/cpp/ir_functor_test.cc                  |   2 +-
 tests/cpp/parallel_for_test.cc                |   2 +-
 tests/lint/check_file_type.py                 |   2 +
 tests/python/relay/test_ir_parser.py          |   8 +-
 tests/python/relay/test_ir_text_printer.py    |   5 -
 tests/scripts/task_build.sh                   |   2 +-
 tutorials/auto_scheduler/tune_network_cuda.py |   2 +-
 tutorials/auto_scheduler/tune_network_mali.py |   2 +-
 tutorials/auto_scheduler/tune_network_x86.py  |   2 +-
 web/emcc/tvmjs_support.cc                     |   8 +-
 web/emcc/wasm_runtime.cc                      |  10 +-
 web/emcc/webgpu_runtime.cc                    |  25 +-
 135 files changed, 1055 insertions(+), 419 deletions(-)
 create mode 160000 3rdparty/libbacktrace
 create mode 100644 cmake/modules/Libbacktrace.cmake
 create mode 100644 include/tvm/runtime/logging.h
 delete mode 100644 include/tvm/support/logging.h
 create mode 100644 licenses/LICENSE.libbacktrace.txt
 mode change 100755 => 100644 src/auto_scheduler/transform_step.cc
 create mode 100644 src/runtime/logging.cc

diff --git a/.gitmodules b/.gitmodules
index a1367c97b2f5..6ef740e33153 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -10,3 +10,6 @@
 [submodule "3rdparty/vta-hw"]
 	path = 3rdparty/vta-hw
 	url = https://github.com/apache/incubator-tvm-vta
+[submodule "3rdparty/libbacktrace"]
+	path = 3rdparty/libbacktrace
+	url = https://github.com/tlc-pack/libbacktrace.git
diff --git a/3rdparty/dmlc-core b/3rdparty/dmlc-core
index 6c401e242c59..21cc7de0dc9f 160000
--- a/3rdparty/dmlc-core
+++ b/3rdparty/dmlc-core
@@ -1 +1 @@
-Subproject commit 6c401e242c59a1f4c913918246591bb13fd714e7
+Subproject commit 21cc7de0dc9fd6acb796e1be6181fa8e6b6c8f41
diff --git a/3rdparty/libbacktrace b/3rdparty/libbacktrace
new file mode 160000
index 000000000000..08f7c7e69f8e
--- /dev/null
+++ b/3rdparty/libbacktrace
@@ -0,0 +1 @@
+Subproject commit 08f7c7e69f8ea61a0c4151359bc8023be8e9217b
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 16968ce41f70..1aa3e68ffd14 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -48,6 +48,11 @@ tvm_option(USE_TF_TVMDSOOP "Build with TensorFlow TVMDSOOp" OFF)
 tvm_option(USE_FALLBACK_STL_MAP "Use TVM's POD compatible Map" OFF)
 tvm_option(USE_ETHOSN "Build with Arm Ethos-N" OFF)
 tvm_option(INDEX_DEFAULT_I64 "Defaults the index datatype to int64" ON)
+set(_LIBBACKTRACE_DEFAULT OFF)
+if(CMAKE_SYSTEM_NAME MATCHES "Darwin" OR CMAKE_SYSTEM_NAME MATCHES "Linux")
+  set(_LIBBACKTRACE_DEFAULT ON)
+endif()
+tvm_option(USE_LIBBACKTRACE "Build libbacktrace to supply linenumbers on stack traces" ${_LIBBACKTRACE_DEFAULT})
 
 # 3rdparty libraries
 tvm_option(DLPACK_PATH "Path to DLPACK" "3rdparty/dlpack/include")
@@ -138,6 +143,8 @@ if(MSVC)
   add_compile_options(/wd4146)
   # 'inline': used more than once
   add_compile_options(/wd4141)
+  # unknown pragma
+  add_compile_options(/wd4068)
 else(MSVC)
   set(WARNING_FLAG -Wall)
   if ("${CMAKE_BUILD_TYPE}" STREQUAL "Debug")
@@ -389,6 +396,26 @@ set_property(TARGET tvm APPEND PROPERTY LINK_OPTIONS "${TVM_VISIBILITY_FLAG}")
 add_library(tvm_runtime SHARED $<TARGET_OBJECTS:tvm_runtime_objs>)
 set_property(TARGET tvm_runtime APPEND PROPERTY LINK_OPTIONS "${TVM_VISIBILITY_FLAG}")
 
+target_compile_definitions(tvm_objs PUBLIC DMLC_USE_LOGGING_LIBRARY=<tvm/runtime/logging.h>)
+target_compile_definitions(tvm_runtime_objs PUBLIC DMLC_USE_LOGGING_LIBRARY=<tvm/runtime/logging.h>)
+target_compile_definitions(tvm PUBLIC DMLC_USE_LOGGING_LIBRARY=<tvm/runtime/logging.h>)
+target_compile_definitions(tvm_runtime PUBLIC DMLC_USE_LOGGING_LIBRARY=<tvm/runtime/logging.h>)
+if(USE_LIBBACKTRACE)
+  message(STATUS "Building with libbacktrace...")
+  include(cmake/modules/Libbacktrace.cmake)
+  target_link_libraries(tvm PRIVATE libbacktrace)
+  target_link_libraries(tvm_runtime PRIVATE libbacktrace)
+  add_dependencies(tvm_runtime_objs libbacktrace)
+  # pre 3.12 versions of cmake cannot propagate include directories from imported targets so we set them manually
+  target_include_directories(tvm PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/libbacktrace/include")
+  target_include_directories(tvm_objs PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/libbacktrace/include")
+  target_include_directories(tvm_runtime PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/libbacktrace/include")
+  target_include_directories(tvm_runtime_objs PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/libbacktrace/include")
+else()
+  target_compile_definitions(tvm_objs PRIVATE TVM_BACKTRACE_DISABLED)
+  target_compile_definitions(tvm_runtime_objs PRIVATE TVM_BACKTRACE_DISABLED)
+endif()
+
 if(USE_MICRO)
   # NOTE: cmake doesn't track dependencies at the file level across subdirectories. For the
   # Unix Makefiles generator, need to add these explicit target-level dependency)
@@ -403,9 +430,9 @@ endif()
 if(USE_RELAY_DEBUG)
   message(STATUS "Building Relay in debug mode...")
   target_compile_definitions(tvm_objs PRIVATE "USE_RELAY_DEBUG")
-  target_compile_definitions(tvm_objs PRIVATE "DMLC_LOG_DEBUG")
+  target_compile_definitions(tvm_objs PRIVATE "TVM_LOG_DEBUG")
   target_compile_definitions(tvm_runtime_objs PRIVATE "USE_RELAY_DEBUG")
-  target_compile_definitions(tvm_runtime_objs PRIVATE "DMLC_LOG_DEBUG")
+  target_compile_definitions(tvm_runtime_objs PRIVATE "TVM_LOG_DEBUG")
 else()
   target_compile_definitions(tvm_objs PRIVATE "NDEBUG")
   target_compile_definitions(tvm_runtime_objs PRIVATE "NDEBUG")
@@ -476,6 +503,7 @@ if (HIDE_PRIVATE_SYMBOLS AND NOT ${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
   # once minimum CMake version is bumped up to 3.13 or above.
   target_link_libraries(tvm PRIVATE ${HIDE_SYMBOLS_LINKER_FLAGS})
   target_link_libraries(tvm_runtime PRIVATE ${HIDE_SYMBOLS_LINKER_FLAGS})
+  target_compile_definitions(tvm_allvisible PUBLIC DMLC_USE_LOGGING_LIBRARY=<tvm/runtime/logging.h>)
 endif()
 
 # Tests
@@ -544,3 +572,33 @@ if(MSVC)
   target_compile_definitions(tvm_objs PRIVATE -DTVM_EXPORTS)
   target_compile_definitions(tvm_runtime_objs PRIVATE -DTVM_EXPORTS)
 endif()
+
+set(TVM_IS_DEBUG_BUILD OFF)
+if(CMAKE_BUILD_TYPE STREQUAL "Debug" OR CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo" OR CMAKE_CXX_FLAGS MATCHES "-g")
+  set(TVM_IS_DEBUG_BUILD ON)
+endif()
+
+# Change relative paths in backtrace to absolute ones
+if(TVM_IS_DEBUG_BUILD)
+  set(FILE_PREFIX_MAP_FLAG "-ffile-prefix-map=..=${CMAKE_CURRENT_SOURCE_DIR}")
+  target_compile_options(tvm PRIVATE "${FILE_PREFIX_MAP_FLAG}")
+  CHECK_CXX_COMPILER_FLAG("${FILE_PREFIX_MAP_FLAG}" FILE_PREFIX_MAP_SUPPORTED)
+  if(FILE_PREFIX_MAP_SUPPORTED)
+    target_compile_options(tvm PRIVATE $<$<COMPILE_LANGUAGE:CXX>:${FILE_PREFIX_MAP_FLAG}>)
+    target_compile_options(tvm_objs PRIVATE $<$<COMPILE_LANGUAGE:CXX>:${FILE_PREFIX_MAP_FLAG}>)
+    target_compile_options(tvm_runtime PRIVATE $<$<COMPILE_LANGUAGE:CXX>:${FILE_PREFIX_MAP_FLAG}>)
+    target_compile_options(tvm_runtime_objs PRIVATE $<$<COMPILE_LANGUAGE:CXX>:${FILE_PREFIX_MAP_FLAG}>)
+  endif()
+endif()
+
+# Run dsymutil to generate debugging symbols for backtraces
+if(APPLE AND TVM_IS_DEBUG_BUILD)
+  find_program(DSYMUTIL dsymutil)
+  mark_as_advanced(DSYMUTIL)
+  add_custom_command(TARGET tvm
+      POST_BUILD
+      COMMAND ${DSYMUTIL} ARGS $<TARGET_FILE:tvm>
+      COMMENT "Running dsymutil"
+      VERBATIM
+		  )
+endif()
diff --git a/apps/android_camera/app/src/main/jni/Application.mk b/apps/android_camera/app/src/main/jni/Application.mk
index 63a79458ef94..5c8774889685 100644
--- a/apps/android_camera/app/src/main/jni/Application.mk
+++ b/apps/android_camera/app/src/main/jni/Application.mk
@@ -31,7 +31,7 @@ include $(config)
 APP_ABI ?= all
 APP_STL := c++_shared
 
-APP_CPPFLAGS += -DDMLC_LOG_STACK_TRACE=0 -DTVM4J_ANDROID=1 -std=c++14 -Oz -frtti
+APP_CPPFLAGS += -DTVM4J_ANDROID=1 -std=c++14 -Oz -frtti
 ifeq ($(USE_OPENCL), 1)
     APP_CPPFLAGS += -DTVM_OPENCL_RUNTIME=1
 endif
@@ -43,4 +43,4 @@ endif
 
 ifeq ($(USE_SORT), 1)
     APP_CPPFLAGS += -DUSE_SORT=1
-endif
\ No newline at end of file
+endif
diff --git a/apps/android_camera/app/src/main/jni/tvm_runtime.h b/apps/android_camera/app/src/main/jni/tvm_runtime.h
index 5f3db04274a1..47a3a3de6bba 100644
--- a/apps/android_camera/app/src/main/jni/tvm_runtime.h
+++ b/apps/android_camera/app/src/main/jni/tvm_runtime.h
@@ -25,17 +25,13 @@
 
 #include <fstream>
 
-/* Enable custom logging - this will cause TVM to pass every log message
- * through CustomLogMessage instead of LogMessage. By enabling this, we must
- * implement dmlc::CustomLogMessage::Log. We use this to pass TVM log
- * messages to Android logcat.
+#define DMLC_USE_LOGGING_LIBRARY <tvm/runtime/logging.h>
+#define TVM_BACKTRACE_DISABLED 1
+/* Enable custom logging - this will cause TVM to use a custom implementation
+ * of tvm::runtime::detail::LogMessage. We use this to pass TVM log messages to
+ * Android logcat.
  */
-#define DMLC_LOG_CUSTOMIZE 1
-
-/* Ensure that fatal errors are passed to the logger before throwing
- * in LogMessageFatal
- */
-#define DMLC_LOG_BEFORE_THROW 1
+#define TVM_LOG_CUSTOMIZE 1
 
 #include "../src/runtime/c_runtime_api.cc"
 #include "../src/runtime/cpu_device_api.cc"
@@ -72,8 +68,20 @@
 
 #include <android/log.h>
 
-void dmlc::CustomLogMessage::Log(const std::string& msg) {
-  // This is called for every message logged by TVM.
-  // We pass the message to logcat.
-  __android_log_write(ANDROID_LOG_DEBUG, "TVM_RUNTIME", msg.c_str());
-}
\ No newline at end of file
+namespace tvm {
+namespace runtime {
+namespace detail {
+// Override logging mechanism
+void LogFatalImpl(const std::string& file, int lineno, const std::string& message) {
+  std::string m = file + ":" + std::to_string(lineno) + ": " + message;
+  __android_log_write(ANDROID_LOG_DEBUG, "TVM_RUNTIME", m.c_str());
+  throw InternalError(file, lineno, message);
+}
+void LogMessageImpl(const std::string& file, int lineno, const std::string& message) {
+  std::string m = file + ":" + std::to_string(lineno) + ": " + message;
+  __android_log_write(ANDROID_LOG_DEBUG, "TVM_RUNTIME", m.c_str());
+}
+
+}  // namespace detail
+}  // namespace runtime
+}  // namespace tvm
diff --git a/apps/android_deploy/app/src/main/jni/Application.mk b/apps/android_deploy/app/src/main/jni/Application.mk
index a50a40bf5cd1..42c4f232a553 100644
--- a/apps/android_deploy/app/src/main/jni/Application.mk
+++ b/apps/android_deploy/app/src/main/jni/Application.mk
@@ -27,7 +27,7 @@ include $(config)
 
 APP_STL := c++_static
 
-APP_CPPFLAGS += -DDMLC_LOG_STACK_TRACE=0 -DTVM4J_ANDROID=1 -std=c++14 -Oz -frtti
+APP_CPPFLAGS += -DTVM4J_ANDROID=1 -std=c++14 -Oz -frtti
 ifeq ($(USE_OPENCL), 1)
 	APP_CPPFLAGS += -DTVM_OPENCL_RUNTIME=1
 endif
diff --git a/apps/android_deploy/app/src/main/jni/tvm_runtime.h b/apps/android_deploy/app/src/main/jni/tvm_runtime.h
index 362d278c38c4..4412e9c62e9d 100644
--- a/apps/android_deploy/app/src/main/jni/tvm_runtime.h
+++ b/apps/android_deploy/app/src/main/jni/tvm_runtime.h
@@ -25,6 +25,9 @@
 
 #include <fstream>
 
+#define DMLC_USE_LOGGING_LIBRARY <tvm/runtime/logging.h>
+#define TVM_BACKTRACE_DISABLED 1
+
 #include "../src/runtime/c_runtime_api.cc"
 #include "../src/runtime/cpu_device_api.cc"
 #include "../src/runtime/dso_library.cc"
diff --git a/apps/android_rpc/app/src/main/jni/Application.mk b/apps/android_rpc/app/src/main/jni/Application.mk
index 5f885f1c6f14..088eeed750b8 100644
--- a/apps/android_rpc/app/src/main/jni/Application.mk
+++ b/apps/android_rpc/app/src/main/jni/Application.mk
@@ -31,7 +31,7 @@ include $(config)
 APP_ABI ?= armeabi-v7a arm64-v8a x86 x86_64 mips
 APP_STL := c++_shared
 
-APP_CPPFLAGS += -DDMLC_LOG_STACK_TRACE=0 -DTVM4J_ANDROID=1 -std=c++14 -Oz -frtti
+APP_CPPFLAGS += -DTVM4J_ANDROID=1 -std=c++14 -Oz -frtti
 ifeq ($(USE_OPENCL), 1)
     APP_CPPFLAGS += -DTVM_OPENCL_RUNTIME=1
 endif
diff --git a/apps/android_rpc/app/src/main/jni/tvm_runtime.h b/apps/android_rpc/app/src/main/jni/tvm_runtime.h
index fb5993066448..40e6279fb386 100644
--- a/apps/android_rpc/app/src/main/jni/tvm_runtime.h
+++ b/apps/android_rpc/app/src/main/jni/tvm_runtime.h
@@ -25,17 +25,13 @@
 
 #include <fstream>
 
-/* Enable custom logging - this will cause TVM to pass every log message
- * through CustomLogMessage instead of LogMessage. By enabling this, we must
- * implement dmlc::CustomLogMessage::Log. We use this to pass TVM log
- * messages to Android logcat.
+#define DMLC_USE_LOGGING_LIBRARY <tvm/runtime/logging.h>
+#define TVM_BACKTRACE_DISABLED 1
+/* Enable custom logging - this will cause TVM to use a custom implementation
+ * of tvm::runtime::detail::LogMessage. We use this to pass TVM log messages to
+ * Android logcat.
  */
-#define DMLC_LOG_CUSTOMIZE 1
-
-/* Ensure that fatal errors are passed to the logger before throwing
- * in LogMessageFatal
- */
-#define DMLC_LOG_BEFORE_THROW 1
+#define TVM_LOG_CUSTOMIZE 1
 
 #include "../src/runtime/c_runtime_api.cc"
 #include "../src/runtime/cpu_device_api.cc"
@@ -81,8 +77,20 @@
 
 #include <android/log.h>
 
-void dmlc::CustomLogMessage::Log(const std::string& msg) {
-  // This is called for every message logged by TVM.
-  // We pass the message to logcat.
-  __android_log_write(ANDROID_LOG_DEBUG, "TVM_RUNTIME", msg.c_str());
+namespace tvm {
+namespace runtime {
+namespace detail {
+// Override logging mechanism
+void LogFatalImpl(const std::string& file, int lineno, const std::string& message) {
+  std::string m = file + ":" + std::to_string(lineno) + ": " + message;
+  __android_log_write(ANDROID_LOG_DEBUG, "TVM_RUNTIME", m.c_str());
+  throw InternalError(file, lineno, message);
 }
+void LogMessageImpl(const std::string& file, int lineno, const std::string& message) {
+  std::string m = file + ":" + std::to_string(lineno) + ": " + message;
+  __android_log_write(ANDROID_LOG_DEBUG, "TVM_RUNTIME", m.c_str());
+}
+
+}  // namespace detail
+}  // namespace runtime
+}  // namespace tvm
diff --git a/apps/bundle_deploy/Makefile b/apps/bundle_deploy/Makefile
index 38d9d3456d55..8e23a92afa93 100644
--- a/apps/bundle_deploy/Makefile
+++ b/apps/bundle_deploy/Makefile
@@ -32,12 +32,14 @@ PKG_CXXFLAGS = ${PKG_COMPILE_OPTS} -std=c++14 \
 	-I${TVM_ROOT}/include \
 	-I${DMLC_CORE}/include \
 	-I${TVM_ROOT}/3rdparty/dlpack/include \
-	-Icrt_config
+	-Icrt_config \
+	-DDMLC_USE_LOGGING_LIBRARY=\<tvm/runtime/logging.h\>
 PKG_CFLAGS = ${PKG_COMPILE_OPTS} \
 	-I${TVM_ROOT}/include \
 	-I${DMLC_CORE}/include \
 	-I${TVM_ROOT}/3rdparty/dlpack/include \
-	-Icrt_config
+	-Icrt_config \
+	-DDMLC_USE_LOGGING_LIBRARY=\<tvm/runtime/logging.h\>
 
 PKG_LDFLAGS = -pthread -lm
 
diff --git a/apps/dso_plugin_module/Makefile b/apps/dso_plugin_module/Makefile
index c2ce3306870a..438d9db223a8 100644
--- a/apps/dso_plugin_module/Makefile
+++ b/apps/dso_plugin_module/Makefile
@@ -19,7 +19,8 @@ TVM_ROOT=$(shell cd ../..; pwd)
 PKG_CFLAGS = -std=c++14 -O2 -fPIC\
 	-I${TVM_ROOT}/include\
 	-I${TVM_ROOT}/3rdparty/dmlc-core/include\
-	-I${TVM_ROOT}/3rdparty/dlpack/include
+	-I${TVM_ROOT}/3rdparty/dlpack/include\
+	-DDMLC_USE_LOGGING_LIBRARY=\<tvm/runtime/logging.h\>
 
 PKG_LDFLAGS =-L${TVM_ROOT}/build
 UNAME_S := $(shell uname -s)
diff --git a/apps/extension/Makefile b/apps/extension/Makefile
index 91d914aba63b..6eba941f7c98 100644
--- a/apps/extension/Makefile
+++ b/apps/extension/Makefile
@@ -20,7 +20,8 @@ TVM_ROOT=$(shell cd ../..; pwd)
 PKG_CFLAGS = -std=c++14 -O2 -fPIC\
 	-I${TVM_ROOT}/include\
 	-I${TVM_ROOT}/3rdparty/dmlc-core/include\
-	-I${TVM_ROOT}/3rdparty/dlpack/include
+	-I${TVM_ROOT}/3rdparty/dlpack/include\
+	-DDMLC_USE_LOGGING_LIBRARY=\<tvm/runtime/logging.h\>
 
 
 PKG_LDFLAGS =-L${TVM_ROOT}/build
diff --git a/apps/ios_rpc/tvmrpc.xcodeproj/project.pbxproj b/apps/ios_rpc/tvmrpc.xcodeproj/project.pbxproj
index b33c892cf002..28079e710a38 100644
--- a/apps/ios_rpc/tvmrpc.xcodeproj/project.pbxproj
+++ b/apps/ios_rpc/tvmrpc.xcodeproj/project.pbxproj
@@ -349,6 +349,8 @@
 				GCC_PREPROCESSOR_DEFINITIONS = (
 					"DEBUG=1",
 					"$(inherited)",
+					"DMLC_USE_LOGGING_LIBRARY=<tvm/runtime/logging.h>",
+					"TVM_BACKTRACE_DISABLED=1",
 				);
 				GCC_WARN_64_TO_32_BIT_CONVERSION = YES;
 				GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR;
@@ -393,6 +395,10 @@
 				ENABLE_STRICT_OBJC_MSGSEND = YES;
 				GCC_C_LANGUAGE_STANDARD = gnu99;
 				GCC_NO_COMMON_BLOCKS = YES;
+				GCC_PREPROCESSOR_DEFINITIONS = (
+					"DMLC_USE_LOGGING_LIBRARY=<tvm/runtime/logging.h>",
+					"TVM_BACKTRACE_DISABLED=1",
+				);
 				GCC_WARN_64_TO_32_BIT_CONVERSION = YES;
 				GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR;
 				GCC_WARN_UNDECLARED_SELECTOR = YES;
diff --git a/apps/ios_rpc/tvmrpc/TVMRuntime.h b/apps/ios_rpc/tvmrpc/TVMRuntime.h
index f6a6dc64c53a..0d172fc3eaa1 100644
--- a/apps/ios_rpc/tvmrpc/TVMRuntime.h
+++ b/apps/ios_rpc/tvmrpc/TVMRuntime.h
@@ -22,7 +22,7 @@
  */
 #import <Foundation/Foundation.h>
 // Customize logging mechanism, redirect to NSLOG
-#define DMLC_LOG_CUSTOMIZE 1
+#define TVM_LOG_CUSTOMIZE 1
 #define TVM_METAL_RUNTIME 1
 
 #include <tvm/runtime/packed_func.h>
diff --git a/apps/ios_rpc/tvmrpc/TVMRuntime.mm b/apps/ios_rpc/tvmrpc/TVMRuntime.mm
index fbe4850e1b57..87cb6f9b4c69 100644
--- a/apps/ios_rpc/tvmrpc/TVMRuntime.mm
+++ b/apps/ios_rpc/tvmrpc/TVMRuntime.mm
@@ -53,9 +53,19 @@
 // CoreML
 #include "../../../src/runtime/contrib/coreml/coreml_runtime.mm"
 
-namespace dmlc {
+namespace tvm {
+namespace runtime {
+namespace detail {
 // Override logging mechanism
-void CustomLogMessage::Log(const std::string& msg) { NSLog(@"%s", msg.c_str()); }
+void LogFatalImpl(const std::string& file, int lineno, const std::string& message) {
+  throw tvm::runtime::InternalError(file, lineno, message);
+}
+
+void LogMessageImpl(const std::string& file, int lineno, const std::string& message) {
+  NSLog(@"%s:%d: %s", file.c_str(), lineno, message.c_str());
+}
+}
+}
 }  // namespace dmlc
 
 namespace tvm {
@@ -69,7 +79,7 @@ size_t Send(const void* data, size_t size) final {
     ssize_t nbytes = [stream_ write:reinterpret_cast<const uint8_t*>(data) maxLength:size];
     if (nbytes < 0) {
       NSLog(@"%@", [stream_ streamError].localizedDescription);
-      throw dmlc::Error("Stream error");
+      throw tvm::Error("Stream error");
     }
     return nbytes;
   }
diff --git a/apps/ios_rpc/tvmrpc/ViewController.mm b/apps/ios_rpc/tvmrpc/ViewController.mm
index 910c650aedc1..879ed2334a84 100644
--- a/apps/ios_rpc/tvmrpc/ViewController.mm
+++ b/apps/ios_rpc/tvmrpc/ViewController.mm
@@ -100,7 +100,7 @@ - (void)onReadAvailable {
         if (flag == 2) {
           [self onShutdownReceived];
         }
-      } catch (const dmlc::Error& e) {
+      } catch (const tvm::Error& e) {
         [self close];
       }
     }
@@ -123,7 +123,7 @@ - (void)onWriteAvailable {
       if (flag == 2) {
         [self onShutdownReceived];
       }
-    } catch (const dmlc::Error& e) {
+    } catch (const tvm::Error& e) {
       [self close];
     }
   }
diff --git a/cmake/config.cmake b/cmake/config.cmake
index 60c718c97bc1..8c090dce741e 100644
--- a/cmake/config.cmake
+++ b/cmake/config.cmake
@@ -275,3 +275,8 @@ set(USE_TARGET_ONNX OFF)
 
 # Whether enable BNNS runtime
 set(USE_BNNS OFF)
+
+# Whether to use libbacktrace
+# Libbacktrace provides line and column information on stack traces from errors. It is only
+# supported on linux and macOS.
+# set(USE_LIBBACKTRACE OFF)
diff --git a/cmake/modules/Libbacktrace.cmake b/cmake/modules/Libbacktrace.cmake
new file mode 100644
index 000000000000..742855358809
--- /dev/null
+++ b/cmake/modules/Libbacktrace.cmake
@@ -0,0 +1,45 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+include(ExternalProject)
+
+ExternalProject_Add(project_libbacktrace
+  PREFIX libbacktrace
+  SOURCE_DIR ${CMAKE_CURRENT_LIST_DIR}/../../3rdparty/libbacktrace
+  BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}/libbacktrace
+  CONFIGURE_COMMAND "${CMAKE_CURRENT_LIST_DIR}/../../3rdparty/libbacktrace/configure"
+                    "--prefix=${CMAKE_CURRENT_BINARY_DIR}/libbacktrace" --with-pic
+  INSTALL_DIR "${CMAKE_CURRENT_BINARY_DIR}/libbacktrace"
+  BUILD_COMMAND make
+  INSTALL_COMMAND make install
+  BUILD_BYPRODUCTS "${CMAKE_CURRENT_BINARY_DIR}/libbacktrace/lib/libbacktrace.a"
+                   "${CMAKE_CURRENT_BINARY_DIR}/libbacktrace/include/backtrace.h"
+  )
+
+# Custom step to rebuild libbacktrace if any of the source files change
+file(GLOB LIBBACKTRACE_SRCS "${CMAKE_CURRENT_LIST_DIR}/../../3rdparty/libbacktrace/*.c")
+ExternalProject_Add_Step(project_libbacktrace checkout
+  DEPENDERS configure
+  DEPENDEES download
+  DEPENDS ${LIBBACKTRACE_SRCS}
+)
+
+add_library(libbacktrace STATIC IMPORTED)
+add_dependencies(libbacktrace project_libbacktrace)
+set_property(TARGET libbacktrace
+  PROPERTY IMPORTED_LOCATION ${CMAKE_CURRENT_BINARY_DIR}/libbacktrace/lib/libbacktrace.a)
+# create include directory so cmake doesn't complain
+file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/libbacktrace/include)
diff --git a/cmake/modules/VTA.cmake b/cmake/modules/VTA.cmake
index 115216680fff..58b58d231d83 100644
--- a/cmake/modules/VTA.cmake
+++ b/cmake/modules/VTA.cmake
@@ -60,6 +60,7 @@ elseif(PYTHON)
     # Target lib: vta_fsim
     add_library(vta_fsim SHARED ${FSIM_RUNTIME_SRCS})
     target_include_directories(vta_fsim SYSTEM PUBLIC ${VTA_HW_PATH}/include)
+    target_compile_definitions(vta_fsim PUBLIC DMLC_USE_LOGGING_LIBRARY=<tvm/runtime/logging.h>)
     foreach(__def ${VTA_DEFINITIONS})
       string(SUBSTRING ${__def} 3 -1 __strip_def)
       target_compile_definitions(vta_fsim PUBLIC ${__strip_def})
@@ -81,6 +82,7 @@ elseif(PYTHON)
     # Target lib: vta_tsim
     add_library(vta_tsim SHARED ${TSIM_RUNTIME_SRCS})
     target_include_directories(vta_tsim SYSTEM PUBLIC ${VTA_HW_PATH}/include)
+    target_compile_definitions(vta_tsim PUBLIC DMLC_USE_LOGGING_LIBRARY=<tvm/runtime/logging.h>)
     foreach(__def ${VTA_DEFINITIONS})
       string(SUBSTRING ${__def} 3 -1 __strip_def)
       target_compile_definitions(vta_tsim PUBLIC ${__strip_def})
@@ -107,6 +109,7 @@ elseif(PYTHON)
     add_library(vta SHARED ${FPGA_RUNTIME_SRCS})
     target_include_directories(vta PUBLIC vta/runtime)
     target_include_directories(vta PUBLIC ${VTA_HW_PATH}/include)
+    target_compile_definitions(vta PUBLIC DMLC_USE_LOGGING_LIBRARY=<tvm/runtime/logging.h>)
     foreach(__def ${VTA_DEFINITIONS})
       string(SUBSTRING ${__def} 3 -1 __strip_def)
       target_compile_definitions(vta PUBLIC ${__strip_def})
diff --git a/golang/Makefile b/golang/Makefile
index 6fd77996e119..137e2a488e29 100644
--- a/golang/Makefile
+++ b/golang/Makefile
@@ -25,7 +25,7 @@ NATIVE_SRC = tvm_runtime_pack.cc
 GOPATH=$(CURDIR)/gopath
 GOPATHDIR=${GOPATH}/src/${TARGET}/
 CGO_CPPFLAGS="-I. -I${TVM_BASE}/ -I${TVM_BASE}/3rdparty/dmlc-core/include -I${TVM_BASE}/include -I${TVM_BASE}/3rdparty/dlpack/include/"
-CGO_CXXFLAGS="-std=c++14"
+CGO_CXXFLAGS="-std=c++14 -DDMLC_USE_LOGGING_LIBRARY=\<tvm/runtime/logging.h\>"
 CGO_CFLAGS="-I${TVM_BASE}"
 CGO_LDFLAGS="-ldl -lm"
 
diff --git a/include/tvm/ir/attrs.h b/include/tvm/ir/attrs.h
index f05ab04c3305..da7bc12619bd 100644
--- a/include/tvm/ir/attrs.h
+++ b/include/tvm/ir/attrs.h
@@ -92,12 +92,12 @@ inline DataType NullValue<DataType>() {
 }
 
 /*! \brief Error thrown during attribute checking. */
-struct AttrError : public dmlc::Error {
+struct AttrError : public Error {
   /*!
    * \brief constructor
    * \param msg error message
    */
-  explicit AttrError(std::string msg) : dmlc::Error("AttributeError:" + msg) {}
+  explicit AttrError(std::string msg) : Error("AttributeError:" + msg) {}
 };
 
 /*!
diff --git a/include/tvm/ir/diagnostic.h b/include/tvm/ir/diagnostic.h
index 2053a295a3b8..41130a5be0aa 100644
--- a/include/tvm/ir/diagnostic.h
+++ b/include/tvm/ir/diagnostic.h
@@ -37,6 +37,15 @@ namespace tvm {
 using tvm::parser::SourceMap;
 using tvm::runtime::TypedPackedFunc;
 
+/*! \brief The diagnostic level, controls the printing of the message. */
+enum class DiagnosticLevel : int {
+  kBug = 10,
+  kError = 20,
+  kWarning = 30,
+  kNote = 40,
+  kHelp = 50,
+};
+
 class DiagnosticBuilder;
 
 /*! \brief A compiler diagnostic. */
diff --git a/include/tvm/ir/error.h b/include/tvm/ir/error.h
index ac7b96a3bd59..6ff61781ac44 100644
--- a/include/tvm/ir/error.h
+++ b/include/tvm/ir/error.h
@@ -36,11 +36,11 @@ namespace tvm {
 /*!
  * \brief A wrapper around std::stringstream to build error.
  *
- * Can be consumed by Error to construct an error.
+ * Can be consumed by CompileError to construct an error.
  *
  * \code
  *
- * void ReportError(const Error& err);
+ * void ReportError(const CompileError& err);
  *
  * void Test(int number) {
  *   // Use error reporter to construct an error.
@@ -59,13 +59,13 @@ struct ErrorBuilder {
 
  private:
   std::stringstream stream_;
-  friend class Error;
+  friend class CompileError;
 };
 
 /*!
  * \brief Custom Error class to be thrown during compilation.
  */
-class Error : public dmlc::Error {
+class CompileError : public Error {
  public:
   /*! \brief Location of the error */
   Span span;
@@ -73,20 +73,20 @@ class Error : public dmlc::Error {
    * \brief construct error from message.
    * \param msg The message
    */
-  explicit Error(const std::string& msg) : dmlc::Error(msg), span(nullptr) {}
+  explicit CompileError(const std::string& msg) : Error(msg), span(nullptr) {}
   /*!
    * \brief construct error from error builder.
    * \param err The error builder
    */
-  Error(const ErrorBuilder& err) : dmlc::Error(err.stream_.str()), span(nullptr) {}  // NOLINT(*)
+  CompileError(const ErrorBuilder& err) : Error(err.stream_.str()), span(nullptr) {}  // NOLINT(*)
   /*!
    * \brief copy constructor.
    * \param other The other ereor.
    */
-  Error(const Error& other) : dmlc::Error(other.what()), span(other.span) {}  // NOLINT(*)
+  CompileError(const CompileError& other) : Error(other.what()), span(other.span) {}  // NOLINT(*)
   /*!
    * \brief default constructor. */
-  Error() : dmlc::Error(""), span(nullptr) {}
+  CompileError() : Error(""), span(nullptr) {}
 };
 
 /*!
@@ -115,13 +115,13 @@ class ErrorReporter {
   ErrorReporter() : errors_(), node_to_error_() {}
 
   /*!
-   * \brief Report a tvm::Error.
+   * \brief Report a CompileError.
    *
    * This API is useful for reporting spanned errors.
    *
    * \param err The error to report.
    */
-  void Report(const Error& err) {
+  void Report(const CompileError& err) {
     if (!err.span.defined()) {
       throw err;
     }
@@ -143,7 +143,7 @@ class ErrorReporter {
    */
   void ReportAt(const GlobalVar& global, const ObjectRef& node, std::stringstream& err) {
     std::string err_msg = err.str();
-    this->ReportAt(global, node, Error(err_msg));
+    this->ReportAt(global, node, CompileError(err_msg));
   }
 
   /*!
@@ -158,7 +158,7 @@ class ErrorReporter {
    * \param node The expression or type to report the error at.
    * \param err The error to report.
    */
-  void ReportAt(const GlobalVar& global, const ObjectRef& node, const Error& err);
+  void ReportAt(const GlobalVar& global, const ObjectRef& node, const CompileError& err);
 
   /*!
    * \brief Render all reported errors and exit the program.
@@ -176,7 +176,7 @@ class ErrorReporter {
   inline bool AnyErrors() { return errors_.size() != 0; }
 
  private:
-  std::vector<Error> errors_;
+  std::vector<CompileError> errors_;
   std::unordered_map<ObjectRef, std::vector<size_t>, ObjectPtrHash, ObjectPtrEqual> node_to_error_;
   std::unordered_map<ObjectRef, GlobalVar, ObjectPtrHash, ObjectPtrEqual> node_to_gv_;
 };
diff --git a/include/tvm/ir/type_relation.h b/include/tvm/ir/type_relation.h
index 462588006c9b..dd6861750a10 100644
--- a/include/tvm/ir/type_relation.h
+++ b/include/tvm/ir/type_relation.h
@@ -29,7 +29,7 @@
 #include <tvm/ir/env_func.h>
 #include <tvm/ir/module.h>
 #include <tvm/ir/type.h>
-#include <tvm/support/logging.h>
+#include <tvm/runtime/logging.h>
 
 namespace tvm {
 
diff --git a/include/tvm/relay/analysis.h b/include/tvm/relay/analysis.h
index 5dd837038731..f88b04994099 100644
--- a/include/tvm/relay/analysis.h
+++ b/include/tvm/relay/analysis.h
@@ -29,7 +29,7 @@
 #include <tvm/relay/expr.h>
 #include <tvm/relay/function.h>
 #include <tvm/relay/type.h>
-#include <tvm/support/logging.h>
+#include <tvm/runtime/logging.h>
 
 #include <string>
 #include <unordered_map>
diff --git a/include/tvm/runtime/container.h b/include/tvm/runtime/container.h
index 336fef21ab88..362582f4dab9 100644
--- a/include/tvm/runtime/container.h
+++ b/include/tvm/runtime/container.h
@@ -30,6 +30,7 @@
 
 #include <dmlc/logging.h>
 #include <tvm/runtime/container.h>
+#include <tvm/runtime/logging.h>
 #include <tvm/runtime/memory.h>
 #include <tvm/runtime/object.h>
 
diff --git a/include/tvm/runtime/data_type.h b/include/tvm/runtime/data_type.h
index 7d914ce6bff9..b4fdcbff58b4 100644
--- a/include/tvm/runtime/data_type.h
+++ b/include/tvm/runtime/data_type.h
@@ -25,7 +25,7 @@
 #define TVM_RUNTIME_DATA_TYPE_H_
 
 #include <tvm/runtime/c_runtime_api.h>
-#include <tvm/support/logging.h>
+#include <tvm/runtime/logging.h>
 
 #include <string>
 #include <type_traits>
diff --git a/include/tvm/runtime/logging.h b/include/tvm/runtime/logging.h
new file mode 100644
index 000000000000..952a5ffec637
--- /dev/null
+++ b/include/tvm/runtime/logging.h
@@ -0,0 +1,438 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file tvm/runtime/logging.h
+ * \brief logging utilities
+ *
+ * We define our own CHECK and LOG macros to replace those from dmlc-core.
+ * These macros are then injected into dmlc-core via the
+ * DMLC_USE_LOGGING_LIBRARY define. dmlc-core will #include this file wherever
+ * it needs logging.
+ */
+#ifndef TVM_RUNTIME_LOGGING_H_
+#define TVM_RUNTIME_LOGGING_H_
+
+#include <dmlc/common.h>
+
+#include <ctime>
+#include <iomanip>
+#include <iostream>
+#include <sstream>
+#include <string>
+
+#include "tvm/runtime/c_runtime_api.h"
+
+// a technique that enables overriding macro names on the number of parameters. This is used
+// to define other macros below
+#define GET_MACRO(_1, _2, _3, _4, _5, NAME, ...) NAME
+
+/*!
+ * \brief COND_X calls COND_X_N where N is the number of parameters passed to COND_X
+ * X can be any of CHECK_GE, CHECK_EQ, CHECK, or LOG COND_X (but not COND_X_N)
+ * are supposed to be used outside this file.
+ * The first parameter of COND_X (and therefore, COND_X_N), which we call 'quit_on_assert',
+ * is a boolean. The rest of the parameters of COND_X is the same as the parameters of X.
+ * quit_on_assert determines the overall behavior of COND_X. If it's true COND_X
+ * quits the program on assertion failure. If it's false, then it moves on and somehow reports
+ * the assertion failure back to the macro caller in an appropriate manner (e.g, 'return false'
+ * in a function, or 'continue' or 'break' in a loop)
+ * The default behavior when quit_on_assertion is false, is to 'return false'. If this is not
+ * desirable, the macro caller can pass one more last parameter to COND_X to tell COND_X what
+ * to do when when quit_on_assertion is false and the assertion fails.
+ *
+ * Rationale: These macros were designed to implement functions that have two behaviors
+ * in a concise way. Those behaviors are quitting on assertion failures, or trying to
+ * move on from assertion failures. Note that these macros hide lots of control flow in them,
+ * and therefore, makes the logic of the whole code slightly harder to understand. However,
+ * in pieces of code that use these macros frequently, it will significantly shorten the
+ * amount of code needed to be read, and we won't need to clutter the main logic of the
+ * function by repetitive control flow structure. The first problem
+ * mentioned will be improved over time as the developer gets used to the macro.
+ *
+ * Here is an example of how to use it
+ * \code
+ * bool f(..., bool quit_on_assertion) {
+ *   int a = 0, b = 0;
+ *   ...
+ *   a = ...
+ *   b = ...
+ *   // if quit_on_assertion is true, if a==b, continue, otherwise quit.
+ *   // if quit_on_assertion is false, if a==b, continue, otherwise 'return false' (default
+ * behaviour) COND_CHECK_EQ(quit_on_assertion, a, b) << "some error message when  quiting"
+ *   ...
+ *   for (int i = 0; i < N; i++) {
+ *     a = ...
+ *     b = ...
+ *     // if quit_on_assertion is true, if a==b, continue, otherwise quit.
+ *     // if quit_on_assertion is false, if a==b, continue, otherwise 'break' (non-default
+ *     // behaviour, therefore, has to be explicitly specified)
+ *     COND_CHECK_EQ(quit_on_assertion, a, b, break) << "some error message when  quiting"
+ *   }
+ * }
+ * \endcode
+ */
+#define COND_CHECK_GE(...) \
+  GET_MACRO(__VA_ARGS__, COND_CHECK_GE_5, COND_CHECK_GE_4, COND_CHECK_GE_3)(__VA_ARGS__)
+#define COND_CHECK_EQ(...) \
+  GET_MACRO(__VA_ARGS__, COND_CHECK_EQ_5, COND_CHECK_EQ_4, COND_CHECK_EQ_3)(__VA_ARGS__)
+#define COND_CHECK(...) \
+  GET_MACRO(__VA_ARGS__, COND_CHECK_5, COND_CHECK_4, COND_CHECK_3, COND_CHECK_2)(__VA_ARGS__)
+#define COND_LOG(...) \
+  GET_MACRO(__VA_ARGS__, COND_LOG_5, COND_LOG_4, COND_LOG_3, COND_LOG_2)(__VA_ARGS__)
+
+// Not supposed to be used by users directly.
+#define COND_CHECK_OP(quit_on_assert, x, y, what, op) \
+  if (!quit_on_assert) {                              \
+    if (!((x)op(y))) what;                            \
+  } else /* NOLINT(*) */                              \
+    CHECK_##op(x, y)
+
+#define COND_CHECK_EQ_4(quit_on_assert, x, y, what) COND_CHECK_OP(quit_on_assert, x, y, what, ==)
+#define COND_CHECK_GE_4(quit_on_assert, x, y, what) COND_CHECK_OP(quit_on_assert, x, y, what, >=)
+
+#define COND_CHECK_3(quit_on_assert, x, what) \
+  if (!quit_on_assert) {                      \
+    if (!(x)) what;                           \
+  } else /* NOLINT(*) */                      \
+    CHECK(x)
+
+#define COND_LOG_3(quit_on_assert, x, what) \
+  if (!quit_on_assert) {                    \
+    what;                                   \
+  } else /* NOLINT(*) */                    \
+    LOG(x)
+
+#define COND_CHECK_EQ_3(quit_on_assert, x, y) COND_CHECK_EQ_4(quit_on_assert, x, y, return false)
+#define COND_CHECK_GE_3(quit_on_assert, x, y) COND_CHECK_GE_4(quit_on_assert, x, y, return false)
+#define COND_CHECK_2(quit_on_assert, x) COND_CHECK_3(quit_on_assert, x, return false)
+#define COND_LOG_2(quit_on_assert, x) COND_LOG_3(quit_on_assert, x, return false)
+
+#ifdef _MSC_VER
+#define TVM_THROW_EXCEPTION noexcept(false) __declspec(noreturn)
+#else
+#define TVM_THROW_EXCEPTION noexcept(false)
+#endif
+
+namespace tvm {
+namespace runtime {
+
+/* \brief Generate a backtrace when called.
+ * \return A multiline string of the backtrace. There will be either one or two lines per frame.
+ */
+std::string Backtrace();
+
+/*! \brief Base error type for TVM. Wraps a string message. */
+class Error : public ::dmlc::Error {  // for backwards compatibility
+ public:
+  /*! \brief Construct an error.
+   * \param s The message to be displayed with the error.
+   */
+  explicit Error(const std::string& s) : ::dmlc::Error(s) {}
+};
+
+/*! \brief Error type for errors from CHECK, ICHECK, and LOG(FATAL). This error
+ * contains a backtrace of where it occured.
+ */
+class InternalError : public Error {
+ public:
+  /*! \brief Construct an error. Not recommended to use directly. Instead use LOG(FATAL).
+   *
+   * \param file The file where the error occurred.
+   * \param lineno The line number where the error occurred.
+   * \param message The error message to display.
+   * \param time The time at which the error occurred. This should be in local time.
+   * \param backtrace Backtrace from when the error occurred.
+   */
+  InternalError(std::string file, int lineno, std::string message,
+                std::time_t time = std::time(nullptr), std::string backtrace = Backtrace())
+      : Error(""),
+        file_(file),
+        lineno_(lineno),
+        message_(message),
+        time_(time),
+        backtrace_(backtrace) {
+    std::ostringstream s;
+    // XXX: Do not change this format, otherwise all error handling in python will break (because it
+    // parses the message to reconstruct the error type).
+    // TODO(tkonolige): Convert errors to Objects, so we can avoid the mess of formatting/parsing
+    // error messages correctly.
+    s << "[" << std::put_time(std::localtime(&time), "%H:%M:%S") << "] " << file << ":" << lineno
+      << ": " << message << std::endl;
+    if (backtrace.size() > 0) {
+      s << backtrace << std::endl;
+    }
+    full_message_ = s.str();
+  }
+  /*! \return The file in which the error occurred. */
+  const std::string& file() const { return file_; }
+  /*! \return The message associated with this error. */
+  const std::string& message() const { return message_; }
+  /*! \return Formatted error message including file, linenumber, backtrace, and message. */
+  const std::string& full_message() const { return full_message_; }
+  /*! \return The backtrace from where this error occurred. */
+  const std::string& backtrace() const { return backtrace_; }
+  /*! \return The time at which this error occurred. */
+  const std::time_t& time() const { return time_; }
+  /*! \return The line number at which this error occurred. */
+  int lineno() const { return lineno_; }
+  virtual const char* what() const noexcept { return full_message_.c_str(); }
+
+ private:
+  std::string file_;
+  int lineno_;
+  std::string message_;
+  std::time_t time_;
+  std::string backtrace_;
+  std::string full_message_;  // holds the full error string
+};
+
+namespace detail {
+#ifndef TVM_LOG_CUSTOMIZE
+
+/*! \brief Class to accumulate an error message and throw it. Do not use
+ * directly, instead use LOG(FATAL).
+ */
+class LogFatal {
+ public:
+  LogFatal(const std::string& file, int lineno) : file_(file), lineno_(lineno) {}
+#ifdef _MSC_VER
+#pragma disagnostic push
+#pragma warning(disable : 4722)
+#endif
+  ~LogFatal() noexcept(false) { throw InternalError(file_, lineno_, stream_.str()); }
+#ifdef _MSC_VER
+#pragma disagnostic pop
+#endif
+  std::ostringstream& stream() { return stream_; }
+
+ private:
+  std::ostringstream stream_;
+  std::string file_;
+  int lineno_;
+};
+
+/*! \brief Class to accumulate an log message. Do not use directly, instead use
+ * LOG(INFO), LOG(WARNING), LOG(ERROR).
+ */
+class LogMessage {
+ public:
+  LogMessage(const std::string& file, int lineno) {
+    std::time_t t = std::time(nullptr);
+    stream_ << "[" << std::put_time(std::localtime(&t), "%H:%M:%S") << "] " << file << ":" << lineno
+            << ": ";
+  }
+  ~LogMessage() { std::cerr << stream_.str() << std::endl; }
+  std::ostringstream& stream() { return stream_; }
+
+ private:
+  std::ostringstream stream_;
+};
+#else
+// Custom implementations of LogFatal and LogMessage that allow the user to
+// override handling of the message. The user must implement LogFatalImpl and LogMessageImpl
+void LogFatalImpl(const std::string& file, int lineno, const std::string& message);
+class LogFatal {
+ public:
+  LogFatal(const std::string& file, int lineno) : file_(file), lineno_(lineno) {}
+  ~LogFatal() TVM_THROW_EXCEPTION { LogFatalImpl(file_, lineno_, stream_.str()); }
+  std::ostringstream& stream() { return stream_; }
+
+ private:
+  std::ostringstream stream_;
+  std::string file_;
+  int lineno_;
+};
+
+void LogMessageImpl(const std::string& file, int lineno, const std::string& message);
+class LogMessage {
+ public:
+  LogMessage(const std::string& file, int lineno) : file_(file), lineno_(lineno) {}
+  ~LogMessage() { LogMessageImpl(file_, lineno_, stream_.str()); }
+  std::ostringstream& stream() { return stream_; }
+
+ private:
+  std::string file_;
+  int lineno_;
+  std::ostringstream stream_;
+};
+#endif
+
+// Below is from dmlc-core
+// This class is used to explicitly ignore values in the conditional
+// logging macros.  This avoids compiler warnings like "value computed
+// is not used" and "statement has no effect".
+class LogMessageVoidify {
+ public:
+  LogMessageVoidify() {}
+  // This has to be an operator with a precedence lower than << but
+  // higher than "?:". See its usage.
+  void operator&(std::ostream&) {}
+};
+
+// Also from dmlc-core
+inline bool DebugLoggingEnabled() {
+  static int state = 0;
+  if (state == 0) {
+    if (auto var = std::getenv("TVM_LOG_DEBUG")) {
+      if (std::string(var) == "1") {
+        state = 1;
+      } else {
+        state = -1;
+      }
+    } else {
+      // by default hide debug logging.
+      state = -1;
+    }
+  }
+  return state == 1;
+}
+
+constexpr const char* kTVM_INTERNAL_ERROR_MESSAGE =
+    "---------------------------------------------------------------\n"
+    "An internal invariant was violated during the execution of TVM.\n"
+    "Please read TVM's error reporting guidelines.\n"
+    "More details can be found here: https://discuss.tvm.ai/t/error-reporting/7793.\n"
+    "---------------------------------------------------------------\n";
+
+// Inline _Pragma in macros does not work reliably on old version of MVSC and
+// GCC. We wrap all comparisons in a function so that we can use #pragma to
+// silence bad comparison warnings.
+#define TVM_CHECK_FUNC(name, op)                                   \
+  template <typename A, typename B>                                \
+  DMLC_ALWAYS_INLINE bool LogCheck##name(const A& a, const B& b) { \
+    return a op b;                                                 \
+  }
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wsign-compare"
+TVM_CHECK_FUNC(_LT, <)
+TVM_CHECK_FUNC(_GT, >)
+TVM_CHECK_FUNC(_LE, <=)
+TVM_CHECK_FUNC(_GE, >=)
+TVM_CHECK_FUNC(_EQ, ==)
+TVM_CHECK_FUNC(_NE, !=)
+#pragma GCC diagnostic pop
+}  // namespace detail
+
+#define LOG(level) LOG_##level
+#define LOG_FATAL ::tvm::runtime::detail::LogFatal(__FILE__, __LINE__).stream()
+#define LOG_INFO ::tvm::runtime::detail::LogMessage(__FILE__, __LINE__).stream()
+#define LOG_ERROR (::tvm::runtime::detail::LogMessage(__FILE__, __LINE__).stream() << "error: ")
+#define LOG_WARNING (::tvm::runtime::detail::LogMessage(__FILE__, __LINE__).stream() << "warning: ")
+
+#define TVM_CHECK_BINARY_OP(name, op, x, y)                     \
+  if (!::tvm::runtime::detail::LogCheck##name(x, y))            \
+  ::tvm::runtime::detail::LogFatal(__FILE__, __LINE__).stream() \
+      << "Check failed: " << #x " " #op " " #y << ": "
+
+#define CHECK(x)                                                \
+  if (!(x))                                                     \
+  ::tvm::runtime::detail::LogFatal(__FILE__, __LINE__).stream() \
+      << "Check failed: " #x << " == false: "
+
+#define CHECK_LT(x, y) TVM_CHECK_BINARY_OP(_LT, <, x, y)
+#define CHECK_GT(x, y) TVM_CHECK_BINARY_OP(_GT, >, x, y)
+#define CHECK_LE(x, y) TVM_CHECK_BINARY_OP(_LE, <=, x, y)
+#define CHECK_GE(x, y) TVM_CHECK_BINARY_OP(_GE, >=, x, y)
+#define CHECK_EQ(x, y) TVM_CHECK_BINARY_OP(_EQ, ==, x, y)
+#define CHECK_NE(x, y) TVM_CHECK_BINARY_OP(_NE, !=, x, y)
+#define CHECK_NOTNULL(x)                                                          \
+  ((x) == nullptr ? ::tvm::runtime::detail::LogFatal(__FILE__, __LINE__).stream() \
+                        << "Check not null: " #x << ' ',                          \
+   (x) : (x))  // NOLINT(*)
+
+#define LOG_IF(severity, condition) \
+  !(condition) ? (void)0 : ::tvm::runtime::detail::LogMessageVoidify() & LOG(severity)
+
+#if TVM_LOG_DEBUG
+
+#define LOG_DFATAL LOG_FATAL
+#define DFATAL FATAL
+#define DLOG(severity) LOG_IF(severity, ::tvm::runtime::detail::DebugLoggingEnabled())
+#define DLOG_IF(severity, condition) \
+  LOG_IF(severity, ::tvm::runtime::detail::DebugLoggingEnabled() && (condition))
+
+#else
+
+#define LOG_DFATAL LOG_ERROR
+#define DFATAL ERROR
+#define DLOG(severity) true ? (void)0 : ::tvm::runtime::detail::LogMessageVoidify() & LOG(severity)
+#define DLOG_IF(severity, condition) \
+  (true || !(condition)) ? (void)0 : ::tvm::runtime::detail::LogMessageVoidify() & LOG(severity)
+
+#endif
+
+#if TVM_LOG_DEBUG
+#define DCHECK(x) \
+  while (false) CHECK(x)
+#define DCHECK_LT(x, y) \
+  while (false) CHECK((x) < (y))
+#define DCHECK_GT(x, y) \
+  while (false) CHECK((x) > (y))
+#define DCHECK_LE(x, y) \
+  while (false) CHECK((x) <= (y))
+#define DCHECK_GE(x, y) \
+  while (false) CHECK((x) >= (y))
+#define DCHECK_EQ(x, y) \
+  while (false) CHECK((x) == (y))
+#define DCHECK_NE(x, y) \
+  while (false) CHECK((x) != (y))
+#else
+#define DCHECK(x) CHECK(x)
+#define DCHECK_LT(x, y) CHECK((x) < (y))
+#define DCHECK_GT(x, y) CHECK((x) > (y))
+#define DCHECK_LE(x, y) CHECK((x) <= (y))
+#define DCHECK_GE(x, y) CHECK((x) >= (y))
+#define DCHECK_EQ(x, y) CHECK((x) == (y))
+#define DCHECK_NE(x, y) CHECK((x) != (y))
+#endif
+
+#define TVM_ICHECK_INDENT "  "
+
+#define ICHECK_BINARY_OP(name, op, x, y)                                  \
+  if (!::tvm::runtime::detail::LogCheck##name(x, y))                      \
+  ::tvm::runtime::detail::LogFatal(__FILE__, __LINE__).stream()           \
+      << ::tvm::runtime::detail::kTVM_INTERNAL_ERROR_MESSAGE << std::endl \
+      << TVM_ICHECK_INDENT << "Check failed: " << #x " " #op " " #y << ": "
+
+#define ICHECK(x)                                                                 \
+  if (!(x))                                                                       \
+  ::tvm::runtime::detail::LogFatal(__FILE__, __LINE__).stream()                   \
+      << ::tvm::runtime::detail::kTVM_INTERNAL_ERROR_MESSAGE << TVM_ICHECK_INDENT \
+      << "Check failed: " #x << " == false: "
+
+#define ICHECK_LT(x, y) ICHECK_BINARY_OP(_LT, <, x, y)
+#define ICHECK_GT(x, y) ICHECK_BINARY_OP(_GT, >, x, y)
+#define ICHECK_LE(x, y) ICHECK_BINARY_OP(_LE, <=, x, y)
+#define ICHECK_GE(x, y) ICHECK_BINARY_OP(_GE, >=, x, y)
+#define ICHECK_EQ(x, y) ICHECK_BINARY_OP(_EQ, ==, x, y)
+#define ICHECK_NE(x, y) ICHECK_BINARY_OP(_NE, !=, x, y)
+#define ICHECK_NOTNULL(x)                                                         \
+  ((x) == nullptr ? ::tvm::runtime::detail::LogFatal(__FILE__, __LINE__).stream() \
+                        << ::tvm::runtime::detail::kTVM_INTERNAL_ERROR_MESSAGE    \
+                        << TVM_ICHECK_INDENT << "Check not null: " #x << ' ',     \
+   (x) : (x))  // NOLINT(*)
+
+}  // namespace runtime
+// Re-export error types
+using runtime::Error;
+using runtime::InternalError;
+}  // namespace tvm
+#endif  // TVM_RUNTIME_LOGGING_H_
diff --git a/include/tvm/runtime/object.h b/include/tvm/runtime/object.h
index 47788394126e..048fc1d5af54 100644
--- a/include/tvm/runtime/object.h
+++ b/include/tvm/runtime/object.h
@@ -24,7 +24,7 @@
 #define TVM_RUNTIME_OBJECT_H_
 
 #include <tvm/runtime/c_runtime_api.h>
-#include <tvm/support/logging.h>
+#include <tvm/runtime/logging.h>
 
 #include <string>
 #include <type_traits>
diff --git a/include/tvm/runtime/packed_func.h b/include/tvm/runtime/packed_func.h
index 751a435c734a..7113863a6fb3 100644
--- a/include/tvm/runtime/packed_func.h
+++ b/include/tvm/runtime/packed_func.h
@@ -24,10 +24,10 @@
 #ifndef TVM_RUNTIME_PACKED_FUNC_H_
 #define TVM_RUNTIME_PACKED_FUNC_H_
 
-#include <dmlc/logging.h>
 #include <tvm/runtime/c_runtime_api.h>
 #include <tvm/runtime/container.h>
 #include <tvm/runtime/data_type.h>
+#include <tvm/runtime/logging.h>
 #include <tvm/runtime/module.h>
 #include <tvm/runtime/ndarray.h>
 #include <tvm/runtime/object.h>
@@ -1086,7 +1086,7 @@ struct PackedFuncValueConverter {
       Function(::tvm::runtime::TVMArgs(args, type_code, num_args), &rv);                    \
       rv.MoveToCHost(out_value, out_type_code);                                             \
       return 0;                                                                             \
-    } catch (const ::std::runtime_error& _except_) {                                        \
+    } catch (const ::std::exception& _except_) {                                            \
       TVMAPISetLastError(_except_.what());                                                  \
       return -1;                                                                            \
     }                                                                                       \
@@ -1140,7 +1140,7 @@ struct PackedFuncValueConverter {
           f, ::tvm::runtime::TVMArgs(args, type_code, num_args), &rv);                      \
       rv.MoveToCHost(out_value, out_type_code);                                             \
       return 0;                                                                             \
-    } catch (const ::std::runtime_error& _except_) {                                        \
+    } catch (const ::std::exception& _except_) {                                            \
       TVMAPISetLastError(_except_.what());                                                  \
       return -1;                                                                            \
     }                                                                                       \
diff --git a/include/tvm/runtime/vm/bytecode.h b/include/tvm/runtime/vm/bytecode.h
index e858c4458054..72a557fa93b1 100644
--- a/include/tvm/runtime/vm/bytecode.h
+++ b/include/tvm/runtime/vm/bytecode.h
@@ -25,7 +25,7 @@
 #define TVM_RUNTIME_VM_BYTECODE_H_
 
 #include <tvm/runtime/data_type.h>
-#include <tvm/support/logging.h>
+#include <tvm/runtime/logging.h>
 
 #include <iostream>
 #include <vector>
diff --git a/include/tvm/support/logging.h b/include/tvm/support/logging.h
deleted file mode 100644
index ced1902a1bd1..000000000000
--- a/include/tvm/support/logging.h
+++ /dev/null
@@ -1,158 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \file tvm/support/logging.h
- * \brief logging utilities on top of dmlc-core
- */
-#ifndef TVM_SUPPORT_LOGGING_H_
-#define TVM_SUPPORT_LOGGING_H_
-
-#include <dmlc/logging.h>
-
-// a technique that enables overriding macro names on the number of parameters. This is used
-// to define other macros below
-#define GET_MACRO(_1, _2, _3, _4, _5, NAME, ...) NAME
-
-/*!
- * \brief COND_X calls COND_X_N where N is the number of parameters passed to COND_X
- * X can be any of CHECK_GE, CHECK_EQ, CHECK, or LOG (defined dmlc-core/include/dmlc/logging.h.)
- * COND_X (but not COND_X_N) are supposed to be used outside this file.
- * The first parameter of COND_X (and therefore, COND_X_N), which we call 'quit_on_assert',
- * is a boolean. The rest of the parameters of COND_X is the same as the parameters of X.
- * quit_on_assert determines the overall behaviour of COND_X. If it's true COND_X
- * quits the program on assertion failure. If it's false, then it moves on and somehow reports
- * the assertion failure back to the macro caller in an appropriate manner (e.g, 'return false'
- * in a function, or 'continue' or 'break' in a loop)
- * The default behavior when quit_on_assertion is false, is to 'return false'. If this is not
- * desirable, the macro caller can pass one more last parameter to COND_X to tell COND_X what
- * to do when when quit_on_assertion is false and the assertion fails.
- *
- * Rationale: These macros were designed to implement functions that have two behaviours
- * in a concise way. Those behaviours are quitting on assertion failures, or trying to
- * move on from assertion failures. Note that these macros hide lots of control flow in them,
- * and therefore, makes the logic of the whole code slightly harder to understand. However,
- * in pieces of code that use these macros frequently, it will significantly shorten the
- * amount of code needed to be read, and we won't need to clutter the main logic of the
- * function by repetitive control flow structure. The first problem
- * mentioned will be improved over time as the developer gets used to the macro.
- *
- * Here is an example of how to use it
- * \code
- * bool f(..., bool quit_on_assertion) {
- *   int a = 0, b = 0;
- *   ...
- *   a = ...
- *   b = ...
- *   // if quit_on_assertion is true, if a==b, continue, otherwise quit.
- *   // if quit_on_assertion is false, if a==b, continue, otherwise 'return false' (default
- * behaviour) COND_CHECK_EQ(quit_on_assertion, a, b) << "some error message when  quiting"
- *   ...
- *   for (int i = 0; i < N; i++) {
- *     a = ...
- *     b = ...
- *     // if quit_on_assertion is true, if a==b, continue, otherwise quit.
- *     // if quit_on_assertion is false, if a==b, continue, otherwise 'break' (non-default
- *     // behaviour, therefore, has to be explicitly specified)
- *     COND_CHECK_EQ(quit_on_assertion, a, b, break) << "some error message when  quiting"
- *   }
- * }
- * \endcode
- */
-#define COND_CHECK_GE(...) \
-  GET_MACRO(__VA_ARGS__, COND_CHECK_GE_5, COND_CHECK_GE_4, COND_CHECK_GE_3)(__VA_ARGS__)
-#define COND_CHECK_EQ(...) \
-  GET_MACRO(__VA_ARGS__, COND_CHECK_EQ_5, COND_CHECK_EQ_4, COND_CHECK_EQ_3)(__VA_ARGS__)
-#define COND_CHECK(...) \
-  GET_MACRO(__VA_ARGS__, COND_CHECK_5, COND_CHECK_4, COND_CHECK_3, COND_CHECK_2)(__VA_ARGS__)
-#define COND_LOG(...) \
-  GET_MACRO(__VA_ARGS__, COND_LOG_5, COND_LOG_4, COND_LOG_3, COND_LOG_2)(__VA_ARGS__)
-
-// Not supposed to be used by users directly.
-#define COND_CHECK_OP(quit_on_assert, x, y, what, op) \
-  if (!quit_on_assert) {                              \
-    if (!((x)op(y))) what;                            \
-  } else /* NOLINT(*) */                              \
-    CHECK_##op(x, y)
-
-#define COND_CHECK_EQ_4(quit_on_assert, x, y, what) COND_CHECK_OP(quit_on_assert, x, y, what, ==)
-#define COND_CHECK_GE_4(quit_on_assert, x, y, what) COND_CHECK_OP(quit_on_assert, x, y, what, >=)
-
-#define COND_CHECK_3(quit_on_assert, x, what) \
-  if (!quit_on_assert) {                      \
-    if (!(x)) what;                           \
-  } else /* NOLINT(*) */                      \
-    CHECK(x)
-
-#define COND_LOG_3(quit_on_assert, x, what) \
-  if (!quit_on_assert) {                    \
-    what;                                   \
-  } else /* NOLINT(*) */                    \
-    LOG(x)
-
-#define COND_CHECK_EQ_3(quit_on_assert, x, y) COND_CHECK_EQ_4(quit_on_assert, x, y, return false)
-#define COND_CHECK_GE_3(quit_on_assert, x, y) COND_CHECK_GE_4(quit_on_assert, x, y, return false)
-#define COND_CHECK_2(quit_on_assert, x) COND_CHECK_3(quit_on_assert, x, return false)
-#define COND_LOG_2(quit_on_assert, x) COND_LOG_3(quit_on_assert, x, return false)
-
-namespace tvm {
-
-constexpr const char* kTVM_INTERNAL_ERROR_MESSAGE =
-    "\n---------------------------------------------------------------\n"
-    "An internal invariant was violated during the execution of TVM.\n"
-    "Please read TVM's error reporting guidelines.\n"
-    "More details can be found here: https://discuss.tvm.ai/t/error-reporting/7793.\n"
-    "---------------------------------------------------------------\n";
-
-#define ICHECK_INDENT "  "
-
-#define ICHECK_BINARY_OP(name, op, x, y)                           \
-  if (dmlc::LogCheckError _check_err = dmlc::LogCheck##name(x, y)) \
-  dmlc::LogMessageFatal(__FILE__, __LINE__).stream()               \
-      << tvm::kTVM_INTERNAL_ERROR_MESSAGE << std::endl             \
-      << ICHECK_INDENT << "Check failed: " << #x " " #op " " #y << *(_check_err.str) << ": "
-
-#define ICHECK(x)                                    \
-  if (!(x))                                          \
-  dmlc::LogMessageFatal(__FILE__, __LINE__).stream() \
-      << tvm::kTVM_INTERNAL_ERROR_MESSAGE << ICHECK_INDENT << "Check failed: " #x << " == false: "
-
-#define ICHECK_LT(x, y) ICHECK_BINARY_OP(_LT, <, x, y)
-#define ICHECK_GT(x, y) ICHECK_BINARY_OP(_GT, >, x, y)
-#define ICHECK_LE(x, y) ICHECK_BINARY_OP(_LE, <=, x, y)
-#define ICHECK_GE(x, y) ICHECK_BINARY_OP(_GE, >=, x, y)
-#define ICHECK_EQ(x, y) ICHECK_BINARY_OP(_EQ, ==, x, y)
-#define ICHECK_NE(x, y) ICHECK_BINARY_OP(_NE, !=, x, y)
-#define ICHECK_NOTNULL(x)                                                    \
-  ((x) == nullptr ? dmlc::LogMessageFatal(__FILE__, __LINE__).stream()       \
-                        << tvm::kTVM_INTERNAL_ERROR_MESSAGE << ICHECK_INDENT \
-                        << "Check not null: " #x << ' ',                     \
-   (x) : (x))  // NOLINT(*)
-
-/*! \brief The diagnostic level, controls the printing of the message. */
-enum class DiagnosticLevel : int {
-  kBug = 10,
-  kError = 20,
-  kWarning = 30,
-  kNote = 40,
-  kHelp = 50,
-};
-
-}  // namespace tvm
-#endif  // TVM_SUPPORT_LOGGING_H_
diff --git a/include/tvm/support/with.h b/include/tvm/support/with.h
index 90c82c4f3a06..d4547a304e8f 100644
--- a/include/tvm/support/with.h
+++ b/include/tvm/support/with.h
@@ -25,7 +25,7 @@
 #ifndef TVM_SUPPORT_WITH_H_
 #define TVM_SUPPORT_WITH_H_
 
-#include <dmlc/logging.h>
+#include <dmlc/common.h>
 
 #include <utility>
 
diff --git a/licenses/LICENSE.libbacktrace.txt b/licenses/LICENSE.libbacktrace.txt
new file mode 100644
index 000000000000..097d2774e5df
--- /dev/null
+++ b/licenses/LICENSE.libbacktrace.txt
@@ -0,0 +1,29 @@
+# Copyright (C) 2012-2016 Free Software Foundation, Inc.
+
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met:
+
+#     (1) Redistributions of source code must retain the above copyright
+#     notice, this list of conditions and the following disclaimer. 
+
+#     (2) Redistributions in binary form must reproduce the above copyright
+#     notice, this list of conditions and the following disclaimer in
+#     the documentation and/or other materials provided with the
+#     distribution.  
+    
+#     (3) The name of the author may not be used to
+#     endorse or promote products derived from this software without
+#     specific prior written permission.
+
+# THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
+# INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+# IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
diff --git a/python/setup.py b/python/setup.py
index e02369e97777..b47e5b14f6a7 100644
--- a/python/setup.py
+++ b/python/setup.py
@@ -94,7 +94,7 @@ def config_cython():
             subdir = "_cy2"
         ret = []
         path = "tvm/_ffi/_cython"
-        extra_compile_args = ["-std=c++14"]
+        extra_compile_args = ["-std=c++14", "-DDMLC_USE_LOGGING_LIBRARY=<tvm/runtime/logging.h>"]
         if os.name == "nt":
             library_dirs = ["tvm", "../build/Release", "../build"]
             libraries = ["tvm"]
diff --git a/python/tvm/_ffi/base.py b/python/tvm/_ffi/base.py
index 397090618ade..0496195fd73f 100644
--- a/python/tvm/_ffi/base.py
+++ b/python/tvm/_ffi/base.py
@@ -253,7 +253,9 @@ def c2pyerror(err_msg):
     message = []
     for line in arr:
         if trace_mode:
-            if line.startswith("  "):
+            if line.startswith("        "):
+                stack_trace[-1] += "\n" + line
+            elif line.startswith("  "):
                 stack_trace.append(line)
             else:
                 trace_mode = False
diff --git a/python/tvm/micro/build.py b/python/tvm/micro/build.py
index 3837d423f8bd..d95f14f0349e 100644
--- a/python/tvm/micro/build.py
+++ b/python/tvm/micro/build.py
@@ -118,7 +118,7 @@ def get_runtime_libs() -> str:
 RUNTIME_SRC_REGEX = re.compile(r"^.*\.cc?$", re.IGNORECASE)
 
 
-_COMMON_CFLAGS = ["-Wall", "-Werror"]
+_COMMON_CFLAGS = ["-Wall", "-Werror", "-DDMLC_USE_LOGGING_LIBRARY=<tvm/runtime/logging.h>"]
 
 
 def _build_default_compiler_options(standalone_crt_dir: typing.Optional[str] = None) -> str:
diff --git a/src/auto_scheduler/compute_dag.cc b/src/auto_scheduler/compute_dag.cc
index 4e7fb05660a4..abbcba234848 100644
--- a/src/auto_scheduler/compute_dag.cc
+++ b/src/auto_scheduler/compute_dag.cc
@@ -1367,7 +1367,7 @@ Array<State> ComputeDAG::InferBound(const Array<State>& states) const {
   support::parallel_for(0, states.size(), [this, &states, &out_states](int i) {
     try {
       out_states.Set(i, (states[i].defined()) ? this->InferBound(states[i]) : states[i]);
-    } catch (dmlc::Error& e) {
+    } catch (Error& e) {
       LOG(WARNING) << "InferBound fails on the state:\n"
                    << states[i] << "\n"
                    << "with: " << e.what() << std::endl;
diff --git a/src/auto_scheduler/feature.cc b/src/auto_scheduler/feature.cc
index d93218c0208c..b3c62f01c7c8 100755
--- a/src/auto_scheduler/feature.cc
+++ b/src/auto_scheduler/feature.cc
@@ -1328,7 +1328,7 @@ void GetPerStoreFeaturesWorkerFunc(const SearchTask& task, const State& state, i
     const auto& prim_func = (*it).second.as<PrimFuncNode>();
     GetPerStoreFeature(prim_func->body, task->hardware_params->cache_line_bytes, max_n_bufs,
                        feature);
-  } catch (dmlc::Error& e) {
+  } catch (Error& e) {
     (*error_ct)++;
   }
 }
diff --git a/src/auto_scheduler/search_policy/sketch_policy_rules.cc b/src/auto_scheduler/search_policy/sketch_policy_rules.cc
index 110be6bd6f68..8eaf80321456 100644
--- a/src/auto_scheduler/search_policy/sketch_policy_rules.cc
+++ b/src/auto_scheduler/search_policy/sketch_policy_rules.cc
@@ -1106,7 +1106,7 @@ PopulationGenerationRule::ResultKind MutateComputeLocation::Apply(SketchPolicyNo
     }
     try {
       StepApplyToState(tmp_s->transform_steps.back(), &tmp_s, policy->search_task->compute_dag);
-    } catch (dmlc::Error& e) {
+    } catch (Error& e) {
       return ResultKind::kInvalid;
     }
   }
@@ -1228,7 +1228,7 @@ PopulationGenerationRule::ResultKind MutateParallel::Apply(SketchPolicyNode* pol
     tmp_s.CopyOnWrite()->transform_steps.push_back(step);
     try {
       StepApplyToState(tmp_s->transform_steps.back(), &tmp_s, policy->search_task->compute_dag);
-    } catch (dmlc::Error& e) {
+    } catch (Error& e) {
       return ResultKind::kInvalid;
     }
   }
diff --git a/src/auto_scheduler/transform_step.cc b/src/auto_scheduler/transform_step.cc
old mode 100755
new mode 100644
index 5ba3eee07098..b67d5cdd7bd9
--- a/src/auto_scheduler/transform_step.cc
+++ b/src/auto_scheduler/transform_step.cc
@@ -26,8 +26,8 @@
 #include <tvm/auto_scheduler/compute_dag.h>
 #include <tvm/auto_scheduler/loop_state.h>
 #include <tvm/auto_scheduler/transform_step.h>
+#include <tvm/runtime/logging.h>
 #include <tvm/runtime/registry.h>
-#include <tvm/support/logging.h>
 #include <tvm/te/operation.h>
 
 #include <string>
diff --git a/src/ir/error.cc b/src/ir/error.cc
index 5d3978dda4ff..0089f55a4da8 100644
--- a/src/ir/error.cc
+++ b/src/ir/error.cc
@@ -132,7 +132,8 @@ void ErrorReporter::RenderErrors(const IRModule& module, bool use_color) {
   LOG(FATAL) << annotated_prog.str() << std::endl;
 }
 
-void ErrorReporter::ReportAt(const GlobalVar& global, const ObjectRef& node, const Error& err) {
+void ErrorReporter::ReportAt(const GlobalVar& global, const ObjectRef& node,
+                             const CompileError& err) {
   size_t index_to_insert = this->errors_.size();
   this->errors_.push_back(err);
   auto it = this->node_to_error_.find(node);
diff --git a/src/parser/parser.cc b/src/parser/parser.cc
index 3061735eff7c..c7d8e025848a 100644
--- a/src/parser/parser.cc
+++ b/src/parser/parser.cc
@@ -28,9 +28,9 @@
 #include <tvm/relay/expr.h>
 #include <tvm/relay/function.h>
 #include <tvm/relay/transform.h>
+#include <tvm/runtime/logging.h>
 #include <tvm/runtime/object.h>
 #include <tvm/runtime/registry.h>
-#include <tvm/support/logging.h>
 
 #include <fstream>
 
@@ -172,8 +172,8 @@ class ScopeStack {
   void PopStack() { this->scope_stack.pop_back(); }
 };
 
-struct DuplicateKeyError : public dmlc::Error {
-  explicit DuplicateKeyError(const std::string& msg) : dmlc::Error(msg) {}
+struct DuplicateKeyError : public Error {
+  explicit DuplicateKeyError(const std::string& msg) : Error(msg) {}
 };
 
 /*! \brief A table of interning strings as global function and type names. */
@@ -1492,7 +1492,7 @@ class Parser {
     DLOG(INFO) << "op_name=" << op_name << " span=" << span;
     try {
       return Op::Get(op_name);
-    } catch (const dmlc::Error& e) {
+    } catch (const Error& e) {
       // we can relax this, but probably need to relax checks or return non-null here.
       this->diag_ctx.EmitFatal(Diagnostic::Error(span)
                                << "operator `" << op_name
diff --git a/src/parser/span_check.h b/src/parser/span_check.h
index 9a887474fe67..ab71d30a54f5 100644
--- a/src/parser/span_check.h
+++ b/src/parser/span_check.h
@@ -30,8 +30,8 @@
 #include <tvm/relay/expr.h>
 #include <tvm/relay/expr_functor.h>
 #include <tvm/runtime/container.h>
+#include <tvm/runtime/logging.h>
 #include <tvm/runtime/object.h>
-#include <tvm/support/logging.h>
 
 #include <fstream>
 #include <string>
diff --git a/src/relay/analysis/annotated_region_set.cc b/src/relay/analysis/annotated_region_set.cc
index 04a18c4b7351..85a9c51a2fa8 100644
--- a/src/relay/analysis/annotated_region_set.cc
+++ b/src/relay/analysis/annotated_region_set.cc
@@ -157,8 +157,9 @@ class AnnotatedRegionSet::Creator : protected MixedModeVisitor {
       // Check if the argument already belongs to a region
       auto region = region_set_->GetRegion(call->args[0]);
       if (!region.defined()) {
-        throw Error(ErrorBuilder() << "Cannot find the corresponding region for end annotation:\n"
-                                   << AsText(GetRef<Call>(call), false));
+        throw CompileError(ErrorBuilder()
+                           << "Cannot find the corresponding region for end annotation:\n"
+                           << AsText(GetRef<Call>(call), false));
       } else {
         // If the argument is belonged to a region, it must have the same target.
         // Otherwise we should see a region_begin op.
diff --git a/src/relay/analysis/kind_check.cc b/src/relay/analysis/kind_check.cc
index c7c5a0a9f083..65b8516cb16c 100644
--- a/src/relay/analysis/kind_check.cc
+++ b/src/relay/analysis/kind_check.cc
@@ -139,7 +139,7 @@ struct KindChecker : TypeFunctor<Kind(const Type&)> {
                   << "Expected " << data->type_vars.size() << "arguments for " << tc << "; got "
                   << op->args.size());
       }
-    } catch (const dmlc::Error& err) {
+    } catch (const Error& err) {
       // TODO(@jroesch): can probably relax to just emit
       EmitFatal(Diagnostic::Error(op->span)
                 << "the type variable : `" << var->name_hint << "` is undefined");
diff --git a/src/relay/analysis/type_solver.cc b/src/relay/analysis/type_solver.cc
index cc1ada677c65..22e2e9a71040 100644
--- a/src/relay/analysis/type_solver.cc
+++ b/src/relay/analysis/type_solver.cc
@@ -617,10 +617,10 @@ bool TypeSolver::Solve() {
       }
 
       rnode->resolved = resolved;
-    } catch (const Error& err) {
+    } catch (const CompileError& err) {
       this->diag_ctx_.Emit(Diagnostic::Error(rnode->span) << err.what());
       rnode->resolved = false;
-    } catch (const dmlc::Error& e) {
+    } catch (const Error& e) {
       ICHECK(false) << e.what();
     }
 
diff --git a/src/relay/analysis/well_formed.cc b/src/relay/analysis/well_formed.cc
index 856c5dc7aac1..acc1a9adc9f4 100644
--- a/src/relay/analysis/well_formed.cc
+++ b/src/relay/analysis/well_formed.cc
@@ -24,7 +24,7 @@
 #include <tvm/relay/analysis.h>
 #include <tvm/relay/expr_functor.h>
 #include <tvm/relay/pattern_functor.h>
-#include <tvm/support/logging.h>
+#include <tvm/runtime/logging.h>
 
 #include <unordered_set>
 
diff --git a/src/relay/backend/vm/compiler.cc b/src/relay/backend/vm/compiler.cc
index 251a55f10b72..9d3ffc558aae 100644
--- a/src/relay/backend/vm/compiler.cc
+++ b/src/relay/backend/vm/compiler.cc
@@ -33,8 +33,8 @@
 #include <tvm/relay/interpreter.h>
 #include <tvm/relay/qnn/transform.h>
 #include <tvm/relay/transform.h>
+#include <tvm/runtime/logging.h>
 #include <tvm/runtime/vm/vm.h>
-#include <tvm/support/logging.h>
 #include <tvm/te/operation.h>
 
 #include <iostream>
diff --git a/src/relay/backend/vm/compiler.h b/src/relay/backend/vm/compiler.h
index 615a8181b387..9c813a4f561c 100644
--- a/src/relay/backend/vm/compiler.h
+++ b/src/relay/backend/vm/compiler.h
@@ -29,8 +29,8 @@
 #include <tvm/relay/expr_functor.h>
 #include <tvm/relay/interpreter.h>
 #include <tvm/relay/transform.h>
+#include <tvm/runtime/logging.h>
 #include <tvm/runtime/vm/vm.h>
-#include <tvm/support/logging.h>
 #include <tvm/tir/function.h>
 
 #include <iostream>
diff --git a/src/relay/backend/vm/inline_primitives.cc b/src/relay/backend/vm/inline_primitives.cc
index eb848eb7a828..05fb2a120620 100644
--- a/src/relay/backend/vm/inline_primitives.cc
+++ b/src/relay/backend/vm/inline_primitives.cc
@@ -25,7 +25,7 @@
 #include <tvm/relay/expr.h>
 #include <tvm/relay/expr_functor.h>
 #include <tvm/relay/transform.h>
-#include <tvm/support/logging.h>
+#include <tvm/runtime/logging.h>
 
 #include <iostream>
 #include <vector>
diff --git a/src/relay/backend/vm/lambda_lift.cc b/src/relay/backend/vm/lambda_lift.cc
index cc530a10188e..c768a2c300ec 100644
--- a/src/relay/backend/vm/lambda_lift.cc
+++ b/src/relay/backend/vm/lambda_lift.cc
@@ -28,7 +28,7 @@
 #include <tvm/relay/expr.h>
 #include <tvm/relay/expr_functor.h>
 #include <tvm/relay/transform.h>
-#include <tvm/support/logging.h>
+#include <tvm/runtime/logging.h>
 
 #include <iostream>
 #include <vector>
diff --git a/src/relay/backend/vm/removed_unused_funcs.cc b/src/relay/backend/vm/removed_unused_funcs.cc
index cdf898fca756..5e9b1b7978f9 100644
--- a/src/relay/backend/vm/removed_unused_funcs.cc
+++ b/src/relay/backend/vm/removed_unused_funcs.cc
@@ -26,7 +26,7 @@
 #include <tvm/relay/expr.h>
 #include <tvm/relay/expr_functor.h>
 #include <tvm/relay/transform.h>
-#include <tvm/support/logging.h>
+#include <tvm/runtime/logging.h>
 
 #include <iostream>
 #include <unordered_set>
diff --git a/src/relay/op/nn/convolution.h b/src/relay/op/nn/convolution.h
index 2a49a2e251f8..379fa3fa71d3 100644
--- a/src/relay/op/nn/convolution.h
+++ b/src/relay/op/nn/convolution.h
@@ -25,7 +25,7 @@
 #define TVM_RELAY_OP_NN_CONVOLUTION_H_
 
 #include <tvm/auto_scheduler/compute_dag.h>
-#include <tvm/support/logging.h>
+#include <tvm/runtime/logging.h>
 #include <tvm/tir/analysis.h>
 
 #include <string>
diff --git a/src/relay/op/tensor/transform.cc b/src/relay/op/tensor/transform.cc
index e3929bf8b77e..b65068bd0506 100644
--- a/src/relay/op/tensor/transform.cc
+++ b/src/relay/op/tensor/transform.cc
@@ -312,7 +312,7 @@ bool StackRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
       if (first->shape[j].as<AnyNode>() || e->shape[j].as<AnyNode>() ||
           reporter->AssertEQ(first->shape[j], e->shape[j]))
         continue;
-      throw Error(
+      throw CompileError(
           "relay.stack requires all tensors have the same shape "
           "on non-stacking axes");
     }
@@ -483,7 +483,7 @@ Array<Array<Layout>> TransposeInferCorrectLayout(const Attrs& attrs,
     }
     try {
       return Array<Array<Layout>>({{Layout(in_layout_str)}, {Layout(out_layout_str)}});
-    } catch (const dmlc::Error& e) {
+    } catch (const tvm::Error& e) {
       // If the layout string is invalid for any reason, give up.
       return Array<Array<Layout>>({{Layout::Undef()}, {Layout::Undef()}});
     }
@@ -1691,8 +1691,8 @@ bool MeshgridRel(const Array<Type>& types, int num_inputs, const Attrs& raw_attr
   const MeshgridAttrs* attrs = raw_attrs.as<MeshgridAttrs>();
   const auto* tensor_tuple = types[0].as<TupleTypeNode>();
   if (tensor_tuple == nullptr) {
-    throw Error(
-        ErrorBuilder() << "meshgrid requires a tuple of tensors as the first argument, found "
+    throw CompileError(ErrorBuilder()
+                       << "meshgrid requires a tuple of tensors as the first argument, found "
                        << PrettyPrint(types[0]));
   } else if (types[0].as<IncompleteTypeNode>() != nullptr) {
     return false;
@@ -1714,14 +1714,14 @@ bool MeshgridRel(const Array<Type>& types, int num_inputs, const Attrs& raw_attr
     int e_ndim = static_cast<int>(e->shape.size());
     const DataType& e_dtype = e->dtype;
     if (e_dtype != dtype) {
-      throw Error("relay.meshgrid requires all tensors have the same dtype");
+      throw CompileError("relay.meshgrid requires all tensors have the same dtype");
     }
     if (e_ndim == 0) {
       grid_shape.emplace_back(1);
     } else if (e_ndim == 1) {
       grid_shape.emplace_back(e->shape[0]);
     } else {
-      throw Error("relay.meshgrid requires all tensors be either scalars or 1-D vectors.");
+      throw CompileError("relay.meshgrid requires all tensors be either scalars or 1-D vectors.");
     }
   }
 
diff --git a/src/relay/op/tensor/transform.h b/src/relay/op/tensor/transform.h
index dbf8537e0dad..3c670bcaaa51 100644
--- a/src/relay/op/tensor/transform.h
+++ b/src/relay/op/tensor/transform.h
@@ -78,8 +78,8 @@ bool ConcatenateRel(const Array<Type>& types, int num_inputs, const Attrs& attrs
   // Sanity check: axis
   int axis = param->axis;
   if (!(-ndim <= axis && axis < ndim)) {
-    throw Error(ErrorBuilder() << "concatenate only accepts `axis` in [-ndim, ndim)"
-                               << ", but got axis = " << axis << ", and ndim = " << ndim);
+    throw CompileError(ErrorBuilder() << "concatenate only accepts `axis` in [-ndim, ndim)"
+                                      << ", but got axis = " << axis << ", and ndim = " << ndim);
   }
   axis = axis < 0 ? ndim + axis : axis;
 
diff --git a/src/relay/op/type_relations.cc b/src/relay/op/type_relations.cc
index 7b30aea2eb57..6e30ad9624c4 100644
--- a/src/relay/op/type_relations.cc
+++ b/src/relay/op/type_relations.cc
@@ -85,7 +85,7 @@ TensorType ConcreteBroadcast(const TensorType& t1, const TensorType& t2, DataTyp
     } else if (EqualCheck(s1, s2)) {
       oshape.push_back(s1);
     } else {
-      throw Error(ErrorBuilder() << "Incompatible broadcast type " << t1 << " and " << t2);
+      throw CompileError(ErrorBuilder() << "Incompatible broadcast type " << t1 << " and " << t2);
     }
   }
 
diff --git a/src/relay/qnn/op/concatenate.cc b/src/relay/qnn/op/concatenate.cc
index 59a519d66436..eb0f83836a54 100644
--- a/src/relay/qnn/op/concatenate.cc
+++ b/src/relay/qnn/op/concatenate.cc
@@ -51,9 +51,10 @@ bool QnnConcatenateRel(const Array<Type>& types, int num_inputs, const Attrs& at
     if (types[1].as<IncompleteTypeNode>()) {
       return false;
     } else {
-      throw Error(ErrorBuilder()
-                  << "qnn concatenate requires a tuple of scales as the second argument, found "
-                  << PrettyPrint(types[1]));
+      throw CompileError(
+          ErrorBuilder()
+          << "qnn concatenate requires a tuple of scales as the second argument, found "
+          << PrettyPrint(types[1]));
     }
   }
   for (const auto& input_scale : input_scales_tuple->fields) {
@@ -68,9 +69,10 @@ bool QnnConcatenateRel(const Array<Type>& types, int num_inputs, const Attrs& at
     if (types[2].as<IncompleteTypeNode>()) {
       return false;
     } else {
-      throw Error(ErrorBuilder()
-                  << "qnn concatenate requires a tuple of zero_points as the third argument, found "
-                  << PrettyPrint(types[2]));
+      throw CompileError(
+          ErrorBuilder()
+          << "qnn concatenate requires a tuple of zero_points as the third argument, found "
+          << PrettyPrint(types[2]));
     }
   }
   for (const auto& input_zero_point : input_zero_points_tuple->fields) {
diff --git a/src/relay/transforms/fold_explicit_padding.cc b/src/relay/transforms/fold_explicit_padding.cc
index bab8b814df05..d959e5b75e40 100644
--- a/src/relay/transforms/fold_explicit_padding.cc
+++ b/src/relay/transforms/fold_explicit_padding.cc
@@ -26,7 +26,7 @@
 #include <tvm/relay/expr.h>
 #include <tvm/relay/expr_functor.h>
 #include <tvm/relay/transform.h>
-#include <tvm/support/logging.h>
+#include <tvm/runtime/logging.h>
 
 #include "../op/tensor/transform.h"
 #include "pattern_utils.h"
diff --git a/src/relay/transforms/inline.cc b/src/relay/transforms/inline.cc
index dae34674de77..6e6505b28dc6 100644
--- a/src/relay/transforms/inline.cc
+++ b/src/relay/transforms/inline.cc
@@ -36,7 +36,7 @@
 #include <tvm/relay/expr.h>
 #include <tvm/relay/expr_functor.h>
 #include <tvm/relay/transform.h>
-#include <tvm/support/logging.h>
+#include <tvm/runtime/logging.h>
 
 #include <string>
 #include <unordered_set>
diff --git a/src/relay/transforms/memory_alloc.cc b/src/relay/transforms/memory_alloc.cc
index b8c87909a025..f75b7ba1fc75 100644
--- a/src/relay/transforms/memory_alloc.cc
+++ b/src/relay/transforms/memory_alloc.cc
@@ -31,7 +31,7 @@
 #include <tvm/relay/expr_functor.h>
 #include <tvm/relay/op.h>
 #include <tvm/relay/transform.h>
-#include <tvm/support/logging.h>
+#include <tvm/runtime/logging.h>
 #include <tvm/target/target.h>
 
 #include <cstdint>
diff --git a/src/relay/transforms/partial_eval.cc b/src/relay/transforms/partial_eval.cc
index fa080a7ff22c..3a87aa8ed498 100644
--- a/src/relay/transforms/partial_eval.cc
+++ b/src/relay/transforms/partial_eval.cc
@@ -861,8 +861,8 @@ class PartialEvaluator : public ExprFunctor<PStatic(const Expr& e, LetList* ll)>
     return VisitFunc(GetRef<Function>(op), ll);
   }
 
-  struct ReflectError : dmlc::Error {
-    ReflectError() : dmlc::Error("static value not found") {}
+  struct ReflectError : Error {
+    ReflectError() : Error("static value not found") {}
   };
 
   Expr Reflect(const PStatic& st) {
diff --git a/src/relay/transforms/simplify_expr.cc b/src/relay/transforms/simplify_expr.cc
index 3c8876ceccb5..b4f4cc16e9df 100644
--- a/src/relay/transforms/simplify_expr.cc
+++ b/src/relay/transforms/simplify_expr.cc
@@ -26,7 +26,7 @@
 #include <tvm/relay/expr.h>
 #include <tvm/relay/expr_functor.h>
 #include <tvm/relay/transform.h>
-#include <tvm/support/logging.h>
+#include <tvm/runtime/logging.h>
 
 #include "../op/tensor/transform.h"
 #include "pattern_utils.h"
diff --git a/src/relay/transforms/to_a_normal_form.cc b/src/relay/transforms/to_a_normal_form.cc
index 05844477cc5b..91e8d90c1232 100644
--- a/src/relay/transforms/to_a_normal_form.cc
+++ b/src/relay/transforms/to_a_normal_form.cc
@@ -26,7 +26,7 @@
 #include <tvm/relay/analysis.h>
 #include <tvm/relay/expr_functor.h>
 #include <tvm/relay/transform.h>
-#include <tvm/support/logging.h>
+#include <tvm/runtime/logging.h>
 
 #include "../../support/arena.h"
 #include "../analysis/dependency_graph.h"
diff --git a/src/relay/transforms/to_basic_block_normal_form.cc b/src/relay/transforms/to_basic_block_normal_form.cc
index 1aab367cf22a..79157bba1918 100644
--- a/src/relay/transforms/to_basic_block_normal_form.cc
+++ b/src/relay/transforms/to_basic_block_normal_form.cc
@@ -26,7 +26,7 @@
 #include <tvm/relay/analysis.h>
 #include <tvm/relay/expr_functor.h>
 #include <tvm/relay/transform.h>
-#include <tvm/support/logging.h>
+#include <tvm/runtime/logging.h>
 
 #include "../../support/arena.h"
 #include "../analysis/dependency_graph.h"
diff --git a/src/relay/transforms/type_infer.cc b/src/relay/transforms/type_infer.cc
index b4ccd1659865..4c6013792426 100644
--- a/src/relay/transforms/type_infer.cc
+++ b/src/relay/transforms/type_infer.cc
@@ -166,7 +166,7 @@ class TypeInferencer : private ExprFunctor<Type(const Expr&)>,
              bool assign_rhs = true) {
     try {
       return solver_.Unify(t1, t2, span, assign_lhs, assign_rhs);
-    } catch (const dmlc::Error& e) {
+    } catch (const Error& e) {
       this->EmitFatal(Diagnostic::Error(span)
                       << "Error unifying `" << t1 << "` and `" << t2 << "`: " << e.what());
       return Type();
diff --git a/src/runtime/c_runtime_api.cc b/src/runtime/c_runtime_api.cc
index 7fd27cba6136..150d7f215da5 100644
--- a/src/runtime/c_runtime_api.cc
+++ b/src/runtime/c_runtime_api.cc
@@ -384,7 +384,7 @@ typedef dmlc::ThreadLocalStore<TVMRuntimeEntry> TVMAPIRuntimeStore;
 
 const char* TVMGetLastError() { return TVMAPIRuntimeStore::Get()->last_error.c_str(); }
 
-int TVMAPIHandleException(const std::runtime_error& e) {
+int TVMAPIHandleException(const std::exception& e) {
   TVMAPISetLastError(NormalizeError(e.what()).c_str());
   return -1;
 }
@@ -518,7 +518,7 @@ int TVMFuncCreateFromCFunc(TVMPackedCFunc func, void* resource_handle, TVMPacked
       int ret = func(const_cast<TVMValue*>(args.values), const_cast<int*>(args.type_codes),
                      args.num_args, rv, resource_handle);
       if (ret != 0) {
-        throw dmlc::Error(TVMGetLastError() + ::dmlc::StackTrace());
+        throw tvm::Error(TVMGetLastError() + tvm::runtime::Backtrace());
       }
     });
   } else {
@@ -529,7 +529,7 @@ int TVMFuncCreateFromCFunc(TVMPackedCFunc func, void* resource_handle, TVMPacked
       int ret = func(const_cast<TVMValue*>(args.values), const_cast<int*>(args.type_codes),
                      args.num_args, rv, rpack.get());
       if (ret != 0) {
-        throw dmlc::Error(TVMGetLastError() + ::dmlc::StackTrace());
+        throw tvm::Error(TVMGetLastError() + tvm::runtime::Backtrace());
       }
     });
   }
diff --git a/src/runtime/contrib/cblas/cblas.cc b/src/runtime/contrib/cblas/cblas.cc
index 16496e06aae3..fbac6222488d 100644
--- a/src/runtime/contrib/cblas/cblas.cc
+++ b/src/runtime/contrib/cblas/cblas.cc
@@ -21,8 +21,8 @@
  * \file Use external cblas library call.
  */
 #include <tvm/runtime/data_type.h>
+#include <tvm/runtime/logging.h>
 #include <tvm/runtime/registry.h>
-#include <tvm/support/logging.h>
 
 extern "C" {
 #include <cblas.h>
diff --git a/src/runtime/contrib/cblas/mkl.cc b/src/runtime/contrib/cblas/mkl.cc
index 273aa45367dd..4323878db276 100644
--- a/src/runtime/contrib/cblas/mkl.cc
+++ b/src/runtime/contrib/cblas/mkl.cc
@@ -21,8 +21,8 @@
  * \file Use external mkl library call.
  */
 #include <tvm/runtime/data_type.h>
+#include <tvm/runtime/logging.h>
 #include <tvm/runtime/registry.h>
-#include <tvm/support/logging.h>
 
 extern "C" {
 #include <mkl_cblas.h>
diff --git a/src/runtime/contrib/cblas/mkldnn.cc b/src/runtime/contrib/cblas/mkldnn.cc
index 1c3fa023dcc7..31abd317c6a4 100644
--- a/src/runtime/contrib/cblas/mkldnn.cc
+++ b/src/runtime/contrib/cblas/mkldnn.cc
@@ -21,8 +21,8 @@
  * \file Use external cblas library call.
  */
 #include <tvm/runtime/data_type.h>
+#include <tvm/runtime/logging.h>
 #include <tvm/runtime/registry.h>
-#include <tvm/support/logging.h>
 
 extern "C" {
 #include <dnnl.h>
diff --git a/src/runtime/contrib/cublas/cublas.cc b/src/runtime/contrib/cublas/cublas.cc
index b12992f57159..9af1602cf3c0 100644
--- a/src/runtime/contrib/cublas/cublas.cc
+++ b/src/runtime/contrib/cublas/cublas.cc
@@ -21,8 +21,8 @@
  * \file Use external cblas library call.
  */
 #include <tvm/runtime/data_type.h>
+#include <tvm/runtime/logging.h>
 #include <tvm/runtime/registry.h>
-#include <tvm/support/logging.h>
 
 #include "../cblas/gemm_common.h"
 #include "cublas_utils.h"
diff --git a/src/runtime/contrib/cublas/cublas_utils.h b/src/runtime/contrib/cublas/cublas_utils.h
index 32c3b03ddbb0..3edb8300be88 100644
--- a/src/runtime/contrib/cublas/cublas_utils.h
+++ b/src/runtime/contrib/cublas/cublas_utils.h
@@ -28,7 +28,7 @@
 #include <cuda_runtime.h>
 #include <cuda_runtime_api.h>
 #include <dlpack/dlpack.h>
-#include <tvm/support/logging.h>
+#include <tvm/runtime/logging.h>
 
 #include <cstdint>
 #if CUDART_VERSION >= 10010
diff --git a/src/runtime/contrib/cudnn/cudnn_utils.h b/src/runtime/contrib/cudnn/cudnn_utils.h
index 528298b75187..9b8e9fb33f98 100644
--- a/src/runtime/contrib/cudnn/cudnn_utils.h
+++ b/src/runtime/contrib/cudnn/cudnn_utils.h
@@ -26,7 +26,7 @@
 
 #include <cudnn.h>
 #include <tvm/runtime/device_api.h>
-#include <tvm/support/logging.h>
+#include <tvm/runtime/logging.h>
 
 #include "../../cuda/cuda_common.h"
 
diff --git a/src/runtime/contrib/miopen/miopen_utils.h b/src/runtime/contrib/miopen/miopen_utils.h
index 9982f0914f6b..e5a769a974f0 100644
--- a/src/runtime/contrib/miopen/miopen_utils.h
+++ b/src/runtime/contrib/miopen/miopen_utils.h
@@ -26,7 +26,7 @@
 
 #include <miopen/miopen.h>
 #include <tvm/runtime/device_api.h>
-#include <tvm/support/logging.h>
+#include <tvm/runtime/logging.h>
 
 #include <string>
 
diff --git a/src/runtime/contrib/mps/mps_utils.h b/src/runtime/contrib/mps/mps_utils.h
index d1c49732318a..c2b7e3c7aa99 100644
--- a/src/runtime/contrib/mps/mps_utils.h
+++ b/src/runtime/contrib/mps/mps_utils.h
@@ -28,8 +28,8 @@
 #include <dmlc/thread_local.h>
 #include <tvm/runtime/data_type.h>
 #include <tvm/runtime/device_api.h>
+#include <tvm/runtime/logging.h>
 #include <tvm/runtime/registry.h>
-#include <tvm/support/logging.h>
 
 #include <vector>
 
diff --git a/src/runtime/contrib/nnpack/convolution.cc b/src/runtime/contrib/nnpack/convolution.cc
index b3ea6c891d43..0d6359495902 100644
--- a/src/runtime/contrib/nnpack/convolution.cc
+++ b/src/runtime/contrib/nnpack/convolution.cc
@@ -23,8 +23,8 @@
 #include <nnpack.h>
 #include <tvm/runtime/data_type.h>
 #include <tvm/runtime/device_api.h>
+#include <tvm/runtime/logging.h>
 #include <tvm/runtime/registry.h>
-#include <tvm/support/logging.h>
 
 #include "nnpack_utils.h"
 
diff --git a/src/runtime/contrib/nnpack/fully_connected.cc b/src/runtime/contrib/nnpack/fully_connected.cc
index 8b72eb38e08c..28570026ada3 100644
--- a/src/runtime/contrib/nnpack/fully_connected.cc
+++ b/src/runtime/contrib/nnpack/fully_connected.cc
@@ -22,8 +22,8 @@
  */
 #include <nnpack.h>
 #include <tvm/runtime/data_type.h>
+#include <tvm/runtime/logging.h>
 #include <tvm/runtime/registry.h>
-#include <tvm/support/logging.h>
 
 #include "nnpack_utils.h"
 
diff --git a/src/runtime/contrib/nnpack/nnpack_utils.h b/src/runtime/contrib/nnpack/nnpack_utils.h
index 231309baaa8e..4396ea0bcde6 100644
--- a/src/runtime/contrib/nnpack/nnpack_utils.h
+++ b/src/runtime/contrib/nnpack/nnpack_utils.h
@@ -25,8 +25,8 @@
 #include <dmlc/thread_local.h>
 #include <nnpack.h>
 #include <tvm/runtime/data_type.h>
+#include <tvm/runtime/logging.h>
 #include <tvm/runtime/registry.h>
-#include <tvm/support/logging.h>
 
 namespace tvm {
 namespace contrib {
diff --git a/src/runtime/contrib/random/mt_random_engine.cc b/src/runtime/contrib/random/mt_random_engine.cc
index 49bc056dcafb..699f6bbcf376 100644
--- a/src/runtime/contrib/random/mt_random_engine.cc
+++ b/src/runtime/contrib/random/mt_random_engine.cc
@@ -22,8 +22,8 @@
  * \brief mt19937 random engine
  */
 #include <tvm/runtime/device_api.h>
+#include <tvm/runtime/logging.h>
 #include <tvm/runtime/ndarray.h>
-#include <tvm/support/logging.h>
 
 #include <algorithm>
 #include <ctime>
diff --git a/src/runtime/contrib/random/random.cc b/src/runtime/contrib/random/random.cc
index edcd20883369..2d111bc322ab 100644
--- a/src/runtime/contrib/random/random.cc
+++ b/src/runtime/contrib/random/random.cc
@@ -22,8 +22,8 @@
  */
 #include <dmlc/thread_local.h>
 #include <tvm/runtime/data_type.h>
+#include <tvm/runtime/logging.h>
 #include <tvm/runtime/registry.h>
-#include <tvm/support/logging.h>
 
 #include <algorithm>
 
diff --git a/src/runtime/contrib/rocblas/rocblas.cc b/src/runtime/contrib/rocblas/rocblas.cc
index dca1ebc6ed83..d977b1a211b0 100644
--- a/src/runtime/contrib/rocblas/rocblas.cc
+++ b/src/runtime/contrib/rocblas/rocblas.cc
@@ -23,8 +23,8 @@
 #include "rocblas.h"
 
 #include <tvm/runtime/data_type.h>
+#include <tvm/runtime/logging.h>
 #include <tvm/runtime/registry.h>
-#include <tvm/support/logging.h>
 
 namespace tvm {
 namespace contrib {
diff --git a/src/runtime/contrib/tensorrt/tensorrt_logger.h b/src/runtime/contrib/tensorrt/tensorrt_logger.h
index 087cb010189c..eb0164210dbb 100644
--- a/src/runtime/contrib/tensorrt/tensorrt_logger.h
+++ b/src/runtime/contrib/tensorrt/tensorrt_logger.h
@@ -25,7 +25,7 @@
 #ifndef TVM_RUNTIME_CONTRIB_TENSORRT_TENSORRT_LOGGER_H_
 #define TVM_RUNTIME_CONTRIB_TENSORRT_TENSORRT_LOGGER_H_
 
-#include <tvm/support/logging.h>
+#include <tvm/runtime/logging.h>
 
 #include "NvInfer.h"
 #include "tensorrt_utils.h"
diff --git a/src/runtime/contrib/vitis_ai/vitis_ai_runtime.cc b/src/runtime/contrib/vitis_ai/vitis_ai_runtime.cc
index 37dc767d31af..0e5e2ce4c4fa 100755
--- a/src/runtime/contrib/vitis_ai/vitis_ai_runtime.cc
+++ b/src/runtime/contrib/vitis_ai/vitis_ai_runtime.cc
@@ -25,6 +25,7 @@
 
 #include <tvm/runtime/registry.h>
 
+#include <cassert>
 #include <fstream>
 #include <streambuf>
 #include <string>
diff --git a/src/runtime/cpu_device_api.cc b/src/runtime/cpu_device_api.cc
index b745be33b456..133bb01d7d13 100644
--- a/src/runtime/cpu_device_api.cc
+++ b/src/runtime/cpu_device_api.cc
@@ -22,8 +22,8 @@
  */
 #include <dmlc/thread_local.h>
 #include <tvm/runtime/device_api.h>
+#include <tvm/runtime/logging.h>
 #include <tvm/runtime/registry.h>
-#include <tvm/support/logging.h>
 
 #include <cstdlib>
 #include <cstring>
diff --git a/src/runtime/crt/Makefile b/src/runtime/crt/Makefile
index 0f3e3096e319..d707d0c63b81 100644
--- a/src/runtime/crt/Makefile
+++ b/src/runtime/crt/Makefile
@@ -45,8 +45,8 @@ QUIET ?= @
 CRT_PREFIX = $(wildcard src/crt)
 
 INCLUDES ?= -isystem include -iquote $(dir ${CRT_CONFIG})
-CFLAGS += ${INCLUDES} -Werror -g $(EXTRA_CFLAGS)
-CXXFLAGS += ${INCLUDES} -std=c++11 -Werror -g $(EXTRA_CXXFLAGS)
+CFLAGS += ${INCLUDES} -Werror -g $(EXTRA_CFLAGS) -DDMLC_USE_LOGGING_LIBRARY=\<tvm/runtime/logging.h\>
+CXXFLAGS += ${INCLUDES} -std=c++11 -Werror -g $(EXTRA_CXXFLAGS) -DDMLC_USE_LOGGING_LIBRARY=\<tvm/runtime/logging.h\>
 LDFLAGS += -Werror -g $(EXTRA_LDFLAGS)
 
 ${BUILD_DIR}/%.o: src/%.c $(CRT_CONFIG)
diff --git a/src/runtime/crt/graph_runtime/load_json.c b/src/runtime/crt/graph_runtime/load_json.c
index 6de49a3f9789..3d1fb601a355 100644
--- a/src/runtime/crt/graph_runtime/load_json.c
+++ b/src/runtime/crt/graph_runtime/load_json.c
@@ -173,7 +173,7 @@ char JSONReader_PeekNextNonSpace(JSONReader* reader) {
  * \param out_str the output string. NULL to merely consume input and discard it.
  * \param out_str_size Number of bytes available to write starting from out_str. Includes
  *      terminating \0.
- * \throw dmlc::Error when next token is not string
+ * \throw tvm::Error when next token is not string
  */
 int JSONReader_ReadString(JSONReader* reader, char* out_str, size_t out_str_size) {
   int status = 0;
diff --git a/src/runtime/file_utils.cc b/src/runtime/file_utils.cc
index 92c398b559d2..32dd1d8020c9 100644
--- a/src/runtime/file_utils.cc
+++ b/src/runtime/file_utils.cc
@@ -24,9 +24,9 @@
 
 #include <dmlc/json.h>
 #include <dmlc/memory_io.h>
+#include <tvm/runtime/logging.h>
 #include <tvm/runtime/registry.h>
 #include <tvm/runtime/serializer.h>
-#include <tvm/support/logging.h>
 
 #include <fstream>
 #include <unordered_map>
diff --git a/src/runtime/graph/graph_runtime.cc b/src/runtime/graph/graph_runtime.cc
index 6c51e711aef1..7e98acb6fb3e 100644
--- a/src/runtime/graph/graph_runtime.cc
+++ b/src/runtime/graph/graph_runtime.cc
@@ -491,7 +491,7 @@ PackedFunc GraphRuntime::GetFunction(const std::string& name,
   } else if (name == "share_params") {
     return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
       const auto& module = args[0].operator Module();
-      ICHECK_EQ(module.operator->()->type_key(), "GraphRuntime");
+      ICHECK_EQ(module.operator->()->type_key(), std::string("GraphRuntime"));
       const auto& param_blob = args[1].operator std::string();
       dmlc::MemoryStringStream strm(const_cast<std::string*>(&param_blob));
       this->ShareParams(dynamic_cast<const GraphRuntime&>(*module.operator->()), &strm);
diff --git a/src/runtime/hexagon/hexagon_device_api.cc b/src/runtime/hexagon/hexagon_device_api.cc
index 70cebf5afa44..a01c9def5d5d 100644
--- a/src/runtime/hexagon/hexagon_device_api.cc
+++ b/src/runtime/hexagon/hexagon_device_api.cc
@@ -18,8 +18,8 @@
  */
 
 #include <tvm/runtime/device_api.h>
+#include <tvm/runtime/logging.h>
 #include <tvm/runtime/registry.h>
-#include <tvm/support/logging.h>
 
 #include <algorithm>
 #include <cstring>
diff --git a/src/runtime/hexagon/hexagon_module.cc b/src/runtime/hexagon/hexagon_module.cc
index 994e24b99084..f6a57ff55355 100644
--- a/src/runtime/hexagon/hexagon_module.cc
+++ b/src/runtime/hexagon/hexagon_module.cc
@@ -22,8 +22,8 @@
 #ifdef __ANDROID__
 #include <android/log.h>
 #endif
+#include <tvm/runtime/logging.h>
 #include <tvm/runtime/registry.h>
-#include <tvm/support/logging.h>
 
 #include <memory>
 #include <set>
diff --git a/src/runtime/hexagon/hexagon_module.h b/src/runtime/hexagon/hexagon_module.h
index e558997b7a4c..02ed7d2541c2 100644
--- a/src/runtime/hexagon/hexagon_module.h
+++ b/src/runtime/hexagon/hexagon_module.h
@@ -20,8 +20,8 @@
 #ifndef TVM_RUNTIME_HEXAGON_HEXAGON_MODULE_H_
 #define TVM_RUNTIME_HEXAGON_HEXAGON_MODULE_H_
 
+#include <tvm/runtime/logging.h>
 #include <tvm/runtime/module.h>
-#include <tvm/support/logging.h>
 
 #include <array>
 #include <memory>
diff --git a/src/runtime/hexagon/sim/hexagon_device_sim.cc b/src/runtime/hexagon/sim/hexagon_device_sim.cc
index 6cc7dcf3209f..1d3f0fd1006f 100644
--- a/src/runtime/hexagon/sim/hexagon_device_sim.cc
+++ b/src/runtime/hexagon/sim/hexagon_device_sim.cc
@@ -22,7 +22,7 @@
 #include <llvm/ADT/StringRef.h>
 #include <llvm/Support/FileSystem.h>
 #include <llvm/Support/Process.h>
-#include <tvm/support/logging.h>
+#include <tvm/runtime/logging.h>
 
 #include <algorithm>
 #include <deque>
diff --git a/src/runtime/hexagon/target/hexagon_dsprpcapi.cc b/src/runtime/hexagon/target/hexagon_dsprpcapi.cc
index d494db82e2c7..a089684c4188 100644
--- a/src/runtime/hexagon/target/hexagon_dsprpcapi.cc
+++ b/src/runtime/hexagon/target/hexagon_dsprpcapi.cc
@@ -22,7 +22,7 @@
 
 #include <dlfcn.h>
 #include <stdint.h>
-#include <tvm/support/logging.h>
+#include <tvm/runtime/logging.h>
 
 #include "hexagon_target_log.h"
 
diff --git a/src/runtime/hexagon/target/hexagon_dsprpcapi.h b/src/runtime/hexagon/target/hexagon_dsprpcapi.h
index c0e40805ecbf..e4711e3da584 100644
--- a/src/runtime/hexagon/target/hexagon_dsprpcapi.h
+++ b/src/runtime/hexagon/target/hexagon_dsprpcapi.h
@@ -22,7 +22,7 @@
 
 #ifdef __ANDROID__
 #include <stdint.h>
-#include <tvm/support/logging.h>
+#include <tvm/runtime/logging.h>
 
 #include "remote.h"
 #include "remote64.h"
diff --git a/src/runtime/hexagon/target/hexagon_stubapi.cc b/src/runtime/hexagon/target/hexagon_stubapi.cc
index 5428ae7c1cff..1fb7d942e968 100644
--- a/src/runtime/hexagon/target/hexagon_stubapi.cc
+++ b/src/runtime/hexagon/target/hexagon_stubapi.cc
@@ -23,7 +23,7 @@
 #include <dlfcn.h>
 #include <stdint.h>
 #include <sys/stat.h>
-#include <tvm/support/logging.h>
+#include <tvm/runtime/logging.h>
 
 #include "hexagon_target_log.h"
 
diff --git a/src/runtime/hexagon/target/hexagon_stubapi.h b/src/runtime/hexagon/target/hexagon_stubapi.h
index cc5b7b7413ca..fba22b10247c 100644
--- a/src/runtime/hexagon/target/hexagon_stubapi.h
+++ b/src/runtime/hexagon/target/hexagon_stubapi.h
@@ -24,7 +24,7 @@
 #include <AEEStdErr.h>
 #include <rpcmem.h>
 #include <stdint.h>
-#include <tvm/support/logging.h>
+#include <tvm/runtime/logging.h>
 
 #include <tuple>
 
diff --git a/src/runtime/logging.cc b/src/runtime/logging.cc
new file mode 100644
index 000000000000..8a44ec04532c
--- /dev/null
+++ b/src/runtime/logging.cc
@@ -0,0 +1,151 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#ifdef TVM_BACKTRACE_DISABLED
+#include <string>
+
+// TODO(bkimball,tkonolige) This inline function is to work around a linking error I am having when
+// using MSVC If the function definition is in logging.cc then the linker can't find it no matter
+// what kind of attributes (dllexport) I decorate it with. This is temporary and will be addressed
+// when we get backtrace working on Windows.
+namespace tvm {
+namespace runtime {
+__declspec(dllexport) std::string Backtrace() { return ""; }
+}  // namespace runtime
+}  // namespace tvm
+#else
+
+#include <backtrace.h>
+#include <cxxabi.h>
+#include <tvm/runtime/logging.h>
+
+#include <iomanip>
+#include <iostream>
+#include <sstream>
+#include <string>
+#include <vector>
+
+namespace tvm {
+namespace runtime {
+namespace {
+
+struct BacktraceInfo {
+  std::vector<std::string> lines;
+  size_t max_size;
+  std::string error_message;
+};
+
+void BacktraceCreateErrorCallback(void* data, const char* msg, int errnum) {
+  std::cerr << "Could not initialize backtrace state: " << msg << std::endl;
+}
+
+backtrace_state* BacktraceCreate() {
+  return backtrace_create_state(nullptr, 1, BacktraceCreateErrorCallback, nullptr);
+}
+
+static backtrace_state* _bt_state = BacktraceCreate();
+
+std::string DemangleName(std::string name) {
+  int status = 0;
+  size_t length = name.size();
+  std::unique_ptr<char, void (*)(void* __ptr)> demangled_name = {
+      abi::__cxa_demangle(name.c_str(), nullptr, &length, &status), &std::free};
+  if (demangled_name && status == 0 && length > 0) {
+    return demangled_name.get();
+  } else {
+    return name;
+  }
+}
+
+void BacktraceErrorCallback(void* data, const char* msg, int errnum) {
+  // do nothing
+}
+
+void BacktraceSyminfoCallback(void* data, uintptr_t pc, const char* symname, uintptr_t symval,
+                              uintptr_t symsize) {
+  auto str = reinterpret_cast<std::string*>(data);
+
+  if (symname != nullptr) {
+    std::string tmp(symname, symsize);
+    *str = DemangleName(tmp.c_str());
+  } else {
+    std::ostringstream s;
+    s << "0x" << std::setfill('0') << std::setw(sizeof(uintptr_t) * 2) << std::hex << pc;
+    *str = s.str();
+  }
+}
+
+int BacktraceFullCallback(void* data, uintptr_t pc, const char* filename, int lineno,
+                          const char* symbol) {
+  auto stack_trace = reinterpret_cast<BacktraceInfo*>(data);
+  std::stringstream s;
+
+  std::unique_ptr<std::string> symbol_str = std::make_unique<std::string>("<unknown>");
+  if (symbol != nullptr) {
+    *symbol_str = DemangleName(symbol);
+  } else {
+    // see if syminfo gives anything
+    backtrace_syminfo(_bt_state, pc, BacktraceSyminfoCallback, BacktraceErrorCallback,
+                      symbol_str.get());
+  }
+  s << *symbol_str;
+
+  if (filename != nullptr) {
+    s << std::endl << "        at " << filename;
+    if (lineno != 0) {
+      s << ":" << lineno;
+    }
+  }
+  // Skip tvm::backtrace and tvm::LogFatal::~LogFatal at the beginning of the trace as they don't
+  // add anything useful to the backtrace.
+  if (!(stack_trace->lines.size() == 0 &&
+        (symbol_str->find("tvm::runtime::Backtrace", 0) == 0 ||
+         symbol_str->find("tvm::runtime::detail::LogFatal", 0) == 0))) {
+    stack_trace->lines.push_back(s.str());
+  }
+  // TVMFuncCall denotes the API boundary so we stop there. Exceptions should be caught there.
+  if (*symbol_str == "TVMFuncCall" || stack_trace->lines.size() >= stack_trace->max_size) {
+    return 1;
+  }
+  return 0;
+}
+}  // namespace
+
+std::string Backtrace() {
+  BacktraceInfo bt;
+  bt.max_size = 100;
+  if (_bt_state == nullptr) {
+    return "";
+  }
+  // libbacktrace eats memory if run on multiple threads at the same time, so we guard against it
+  static std::mutex m;
+  std::lock_guard<std::mutex> lock(m);
+  backtrace_full(_bt_state, 0, BacktraceFullCallback, BacktraceErrorCallback, &bt);
+
+  std::ostringstream s;
+  s << "Stack trace:\n";
+  for (size_t i = 0; i < bt.lines.size(); i++) {
+    s << "  " << i << ": " << bt.lines[i] << "\n";
+  }
+
+  return s.str();
+}
+}  // namespace runtime
+}  // namespace tvm
+#endif
diff --git a/src/runtime/metal/metal_common.h b/src/runtime/metal/metal_common.h
index bd07dbfde9d0..b5d06192396b 100644
--- a/src/runtime/metal/metal_common.h
+++ b/src/runtime/metal/metal_common.h
@@ -32,8 +32,8 @@
 #import <Metal/MTLLibrary.h>
 #include <tvm/runtime/c_runtime_api.h>
 #include <tvm/runtime/device_api.h>
+#include <tvm/runtime/logging.h>
 #include <tvm/runtime/packed_func.h>
-#include <tvm/support/logging.h>
 
 #include <memory>
 #include <mutex>
diff --git a/src/runtime/micro/micro_session.cc b/src/runtime/micro/micro_session.cc
index 6c0d0c4c40fe..cd916d46971d 100644
--- a/src/runtime/micro/micro_session.cc
+++ b/src/runtime/micro/micro_session.cc
@@ -25,8 +25,8 @@
 
 #include <tvm/runtime/crt/rpc_common/framing.h>
 #include <tvm/runtime/crt/rpc_common/session.h>
+#include <tvm/runtime/logging.h>
 #include <tvm/runtime/registry.h>
-#include <tvm/support/logging.h>
 
 #include <algorithm>
 #include <chrono>
diff --git a/src/runtime/minrpc/minrpc_server.h b/src/runtime/minrpc/minrpc_server.h
index d5c61eccfd6d..3b9772f2fb60 100644
--- a/src/runtime/minrpc/minrpc_server.h
+++ b/src/runtime/minrpc/minrpc_server.h
@@ -46,7 +46,7 @@
 #endif
 
 #if TVM_MINRPC_ENABLE_LOGGING
-#include <tvm/support/logging.h>
+#include <tvm/runtime/logging.h>
 #endif
 
 namespace tvm {
diff --git a/src/runtime/ndarray.cc b/src/runtime/ndarray.cc
index d3ddbf8c0229..d46f0868a2ea 100644
--- a/src/runtime/ndarray.cc
+++ b/src/runtime/ndarray.cc
@@ -23,9 +23,9 @@
  */
 #include <tvm/runtime/c_runtime_api.h>
 #include <tvm/runtime/device_api.h>
+#include <tvm/runtime/logging.h>
 #include <tvm/runtime/ndarray.h>
 #include <tvm/runtime/registry.h>
-#include <tvm/support/logging.h>
 
 #include "runtime_base.h"
 
diff --git a/src/runtime/object.cc b/src/runtime/object.cc
index ad68c70698ea..c9a9669671e6 100644
--- a/src/runtime/object.cc
+++ b/src/runtime/object.cc
@@ -20,9 +20,9 @@
  * \file src/runtime/object.cc
  * \brief Object type management system.
  */
+#include <tvm/runtime/logging.h>
 #include <tvm/runtime/object.h>
 #include <tvm/runtime/registry.h>
-#include <tvm/support/logging.h>
 
 #include <iostream>
 #include <mutex>
diff --git a/src/runtime/opencl/opencl_common.h b/src/runtime/opencl/opencl_common.h
index 2e7f05f91020..3fca368c758b 100644
--- a/src/runtime/opencl/opencl_common.h
+++ b/src/runtime/opencl/opencl_common.h
@@ -26,8 +26,8 @@
 
 #include <tvm/runtime/c_runtime_api.h>
 #include <tvm/runtime/device_api.h>
+#include <tvm/runtime/logging.h>
 #include <tvm/runtime/packed_func.h>
-#include <tvm/support/logging.h>
 
 /* There are many OpenCL platforms that do not yet support OpenCL 2.0,
  * hence we use 1.2 APIs, some of which are now deprecated.  In order
diff --git a/src/runtime/registry.cc b/src/runtime/registry.cc
index a65235090bfd..bb5a794a030b 100644
--- a/src/runtime/registry.cc
+++ b/src/runtime/registry.cc
@@ -22,8 +22,8 @@
  * \brief The global registry of packed function.
  */
 #include <dmlc/thread_local.h>
+#include <tvm/runtime/logging.h>
 #include <tvm/runtime/registry.h>
-#include <tvm/support/logging.h>
 
 #include <array>
 #include <memory>
diff --git a/src/runtime/rocm/rocm_device_api.cc b/src/runtime/rocm/rocm_device_api.cc
index 5f24ce0eec48..5d03374a4571 100644
--- a/src/runtime/rocm/rocm_device_api.cc
+++ b/src/runtime/rocm/rocm_device_api.cc
@@ -25,9 +25,9 @@
 #include <hip/hip_runtime_api.h>
 #include <hsa/hsa.h>
 #include <tvm/runtime/device_api.h>
+#include <tvm/runtime/logging.h>
 #include <tvm/runtime/profiling.h>
 #include <tvm/runtime/registry.h>
-#include <tvm/support/logging.h>
 
 #include "rocm_common.h"
 
diff --git a/src/runtime/rpc/rpc_device_api.cc b/src/runtime/rpc/rpc_device_api.cc
index 06737f99a4de..cdeeb368f5a2 100644
--- a/src/runtime/rpc/rpc_device_api.cc
+++ b/src/runtime/rpc/rpc_device_api.cc
@@ -21,8 +21,8 @@
  * \file rpc_device_api.cc
  */
 #include <tvm/runtime/device_api.h>
+#include <tvm/runtime/logging.h>
 #include <tvm/runtime/registry.h>
-#include <tvm/support/logging.h>
 
 #include <utility>
 
@@ -72,7 +72,7 @@ class RPCDeviceAPI final : public DeviceAPI {
     auto remote_ctx = RemoveRPCSessionMask(ctx);
     try {
       GetSess(ctx)->GetDeviceAPI(remote_ctx)->FreeDataSpace(remote_ctx, space->data);
-    } catch (const dmlc::Error& e) {
+    } catch (const Error& e) {
       // fault tolerance to remote close.
     }
     delete space;
diff --git a/src/runtime/rpc/rpc_endpoint.cc b/src/runtime/rpc/rpc_endpoint.cc
index 8716355fd68f..5e2bba88921e 100644
--- a/src/runtime/rpc/rpc_endpoint.cc
+++ b/src/runtime/rpc/rpc_endpoint.cc
@@ -526,7 +526,7 @@ class RPCEndpoint::EventHandler : public dmlc::Stream {
 
       try {
         fconstructor->CallPacked(constructor_args, &con_ret);
-      } catch (const dmlc::Error& e) {
+      } catch (const Error& e) {
         LOG(FATAL) << "Server[" << name_ << "]:"
                    << " Error caught from session constructor " << constructor_name << ":\n"
                    << e.what();
@@ -540,7 +540,7 @@ class RPCEndpoint::EventHandler : public dmlc::Stream {
       ICHECK_EQ(tkey, "rpc") << "Constructor " << constructor_name << " to return an RPCModule";
       serving_session_ = RPCModuleGetSession(mod);
       this->ReturnVoid();
-    } catch (const std::runtime_error& e) {
+    } catch (const std::exception& e) {
       this->ReturnException(e.what());
     }
 
@@ -562,7 +562,7 @@ class RPCEndpoint::EventHandler : public dmlc::Stream {
         }
         this->SwitchToState(kRecvPacketNumBytes);
       });
-    } catch (const std::runtime_error& e) {
+    } catch (const std::exception& e) {
       this->ReturnException(e.what());
       this->SwitchToState(kRecvPacketNumBytes);
     }
@@ -581,7 +581,7 @@ class RPCEndpoint::EventHandler : public dmlc::Stream {
       setter(0, rv);
 
       this->ReturnPackedSeq(TVMArgs(&ret_value, &ret_tcode, 1));
-    } catch (const std::runtime_error& e) {
+    } catch (const std::exception& e) {
       this->ReturnException(e.what());
     }
     this->SwitchToState(kRecvPacketNumBytes);
@@ -719,7 +719,7 @@ void RPCEndpoint::Shutdown() {
             writer_.bytes_available());
         if (n == 0) break;
       }
-    } catch (const dmlc::Error& e) {
+    } catch (const Error& e) {
     }
     channel_.reset(nullptr);
   }
diff --git a/src/runtime/rpc/rpc_module.cc b/src/runtime/rpc/rpc_module.cc
index 34691415c1a4..46e1be794520 100644
--- a/src/runtime/rpc/rpc_module.cc
+++ b/src/runtime/rpc/rpc_module.cc
@@ -130,7 +130,7 @@ class RPCWrappedFunc : public Object {
   ~RPCWrappedFunc() {
     try {
       sess_->FreeHandle(handle_, kTVMPackedFuncHandle);
-    } catch (const dmlc::Error& e) {
+    } catch (const Error& e) {
       // fault tolerance to remote close
     }
   }
@@ -165,7 +165,7 @@ class RPCModuleNode final : public ModuleNode {
     if (module_handle_ != nullptr) {
       try {
         sess_->FreeHandle(module_handle_, kTVMModuleHandle);
-      } catch (const dmlc::Error& e) {
+      } catch (const Error& e) {
         // fault tolerance to remote close
       }
       module_handle_ = nullptr;
diff --git a/src/runtime/rpc/rpc_session.cc b/src/runtime/rpc/rpc_session.cc
index 0ac5b8dc74ef..2b75018099d5 100644
--- a/src/runtime/rpc/rpc_session.cc
+++ b/src/runtime/rpc/rpc_session.cc
@@ -46,7 +46,7 @@ void RPCSession::AsyncCallFunc(PackedFuncHandle func, const TVMValue* arg_values
   try {
     this->CallFunc(func, arg_values, arg_type_codes, num_args,
                    [&callback](TVMArgs args) { callback(RPCCode::kReturn, args); });
-  } catch (const std::runtime_error& e) {
+  } catch (const std::exception& e) {
     this->SendException(callback, e.what());
   }
 }
@@ -60,7 +60,7 @@ void RPCSession::AsyncCopyToRemote(void* local_from_bytes, DLTensor* remote_to,
   try {
     this->CopyToRemote(local_from_bytes, remote_to, nbytes);
     callback(RPCCode::kReturn, TVMArgs(&value, &tcode, 1));
-  } catch (const std::runtime_error& e) {
+  } catch (const std::exception& e) {
     this->SendException(callback, e.what());
   }
 }
@@ -74,7 +74,7 @@ void RPCSession::AsyncCopyFromRemote(DLTensor* remote_from, void* local_to_bytes
   try {
     this->CopyFromRemote(remote_from, local_to_bytes, nbytes);
     callback(RPCCode::kReturn, TVMArgs(&value, &tcode, 1));
-  } catch (const std::runtime_error& e) {
+  } catch (const std::exception& e) {
     this->SendException(callback, e.what());
   }
 }
@@ -88,7 +88,7 @@ void RPCSession::AsyncStreamWait(TVMContext ctx, TVMStreamHandle stream,
   try {
     this->GetDeviceAPI(ctx)->StreamSync(ctx, stream);
     callback(RPCCode::kReturn, TVMArgs(&value, &tcode, 1));
-  } catch (const std::runtime_error& e) {
+  } catch (const std::exception& e) {
     this->SendException(callback, e.what());
   }
 }
diff --git a/src/runtime/runtime_base.h b/src/runtime/runtime_base.h
index 21601df1ad39..7abb32935a2b 100644
--- a/src/runtime/runtime_base.h
+++ b/src/runtime/runtime_base.h
@@ -34,7 +34,7 @@
      and finishes with API_END() or API_END_HANDLE_ERROR */
 #define API_END()                           \
   }                                         \
-  catch (std::runtime_error & _except_) {   \
+  catch (std::exception & _except_) {       \
     return TVMAPIHandleException(_except_); \
   }                                         \
   return 0;  // NOLINT(*)
@@ -45,7 +45,7 @@
  */
 #define API_END_HANDLE_ERROR(Finalize)      \
   }                                         \
-  catch (std::runtime_error & _except_) {   \
+  catch (std::exception & _except_) {       \
     Finalize;                               \
     return TVMAPIHandleException(_except_); \
   }                                         \
@@ -56,6 +56,6 @@
  * \param e the exception
  * \return the return value of API after exception is handled
  */
-int TVMAPIHandleException(const std::runtime_error& e);
+int TVMAPIHandleException(const std::exception& e);
 
 #endif  // TVM_RUNTIME_RUNTIME_BASE_H_
diff --git a/src/runtime/thread_pool.cc b/src/runtime/thread_pool.cc
index 5f5a811c2d30..cab04ec0db4a 100644
--- a/src/runtime/thread_pool.cc
+++ b/src/runtime/thread_pool.cc
@@ -24,10 +24,10 @@
 #include <dmlc/thread_local.h>
 #include <tvm/runtime/c_backend_api.h>
 #include <tvm/runtime/c_runtime_api.h>
+#include <tvm/runtime/logging.h>
 #include <tvm/runtime/packed_func.h>
 #include <tvm/runtime/registry.h>
 #include <tvm/runtime/threading_backend.h>
-#include <tvm/support/logging.h>
 #if TVM_THREADPOOL_USE_OPENMP
 #include <omp.h>
 #endif
diff --git a/src/runtime/threading_backend.cc b/src/runtime/threading_backend.cc
index 2527f4799086..7f9cfaa8730c 100644
--- a/src/runtime/threading_backend.cc
+++ b/src/runtime/threading_backend.cc
@@ -21,8 +21,8 @@
  * \file threading_backend.cc
  * \brief Native threading backend
  */
+#include <tvm/runtime/logging.h>
 #include <tvm/runtime/threading_backend.h>
-#include <tvm/support/logging.h>
 
 #include <algorithm>
 #include <thread>
diff --git a/src/runtime/vm/bytecode.cc b/src/runtime/vm/bytecode.cc
index f82d708468f7..09b928fa1e39 100644
--- a/src/runtime/vm/bytecode.cc
+++ b/src/runtime/vm/bytecode.cc
@@ -22,8 +22,8 @@
  * \brief The bytecode for Relay virtual machine.
  */
 
+#include <tvm/runtime/logging.h>
 #include <tvm/runtime/vm/bytecode.h>
-#include <tvm/support/logging.h>
 
 #include <sstream>
 
diff --git a/src/runtime/vm/vm.cc b/src/runtime/vm/vm.cc
index 6d121aa67733..4683398b01d4 100644
--- a/src/runtime/vm/vm.cc
+++ b/src/runtime/vm/vm.cc
@@ -24,10 +24,10 @@
 
 #include <dmlc/memory_io.h>
 #include <tvm/runtime/container.h>
+#include <tvm/runtime/logging.h>
 #include <tvm/runtime/memory.h>
 #include <tvm/runtime/object.h>
 #include <tvm/runtime/vm/vm.h>
-#include <tvm/support/logging.h>
 
 #include <algorithm>
 #include <chrono>
diff --git a/src/runtime/vulkan/vulkan_common.h b/src/runtime/vulkan/vulkan_common.h
index 9cd1f257f091..3083ba6f9ce4 100644
--- a/src/runtime/vulkan/vulkan_common.h
+++ b/src/runtime/vulkan/vulkan_common.h
@@ -22,8 +22,8 @@
 
 #include <tvm/runtime/c_runtime_api.h>
 #include <tvm/runtime/device_api.h>
+#include <tvm/runtime/logging.h>
 #include <tvm/runtime/packed_func.h>
-#include <tvm/support/logging.h>
 #include <vulkan/vulkan.h>
 
 #include <memory>
diff --git a/src/runtime/vulkan/vulkan_shader.h b/src/runtime/vulkan/vulkan_shader.h
index c9fbb13e938d..513e3bccc36e 100644
--- a/src/runtime/vulkan/vulkan_shader.h
+++ b/src/runtime/vulkan/vulkan_shader.h
@@ -22,8 +22,8 @@
 
 #include <tvm/runtime/c_runtime_api.h>
 #include <tvm/runtime/device_api.h>
+#include <tvm/runtime/logging.h>
 #include <tvm/runtime/packed_func.h>
-#include <tvm/support/logging.h>
 
 #include <vector>
 
diff --git a/src/support/base64.h b/src/support/base64.h
index 901922db8edc..3aac9920a075 100644
--- a/src/support/base64.h
+++ b/src/support/base64.h
@@ -26,7 +26,7 @@
 #ifndef TVM_SUPPORT_BASE64_H_
 #define TVM_SUPPORT_BASE64_H_
 
-#include <tvm/support/logging.h>
+#include <tvm/runtime/logging.h>
 
 #include <cctype>
 #include <cstdio>
diff --git a/src/support/parallel_for.cc b/src/support/parallel_for.cc
index f4756c29adeb..4ced0df6ddf3 100644
--- a/src/support/parallel_for.cc
+++ b/src/support/parallel_for.cc
@@ -21,7 +21,7 @@
  * \file parallel_for.cc
  * \brief An implementation to run loop in parallel.
  */
-#include <tvm/support/logging.h>
+#include <tvm/runtime/logging.h>
 #include <tvm/support/parallel_for.h>
 
 #include <future>
diff --git a/src/support/pipe.h b/src/support/pipe.h
index 3c1356ba174c..a2803638e1f3 100644
--- a/src/support/pipe.h
+++ b/src/support/pipe.h
@@ -25,7 +25,7 @@
 #define TVM_SUPPORT_PIPE_H_
 
 #include <dmlc/io.h>
-#include <tvm/support/logging.h>
+#include <tvm/runtime/logging.h>
 
 #ifdef _WIN32
 #include <windows.h>
diff --git a/src/support/socket.h b/src/support/socket.h
index 16fba6b58e3d..11060ae8aae1 100644
--- a/src/support/socket.h
+++ b/src/support/socket.h
@@ -49,7 +49,7 @@ using ssize_t = int;
 #include <sys/socket.h>
 #include <unistd.h>
 #endif
-#include <tvm/support/logging.h>
+#include <tvm/runtime/logging.h>
 
 #include <cstring>
 #include <string>
diff --git a/src/target/llvm/llvm_common.cc b/src/target/llvm/llvm_common.cc
index 35bfc8dc2e5b..61dd7024ff05 100644
--- a/src/target/llvm/llvm_common.cc
+++ b/src/target/llvm/llvm_common.cc
@@ -24,7 +24,7 @@
 
 #include "llvm_common.h"
 
-#include <tvm/support/logging.h>
+#include <tvm/runtime/logging.h>
 #include <tvm/target/target.h>
 
 #include <atomic>
diff --git a/src/target/target.cc b/src/target/target.cc
index b5ca4c38bbb9..55ef5f1a4e24 100644
--- a/src/target/target.cc
+++ b/src/target/target.cc
@@ -79,7 +79,7 @@ static const TObj* ObjTypeCheck(const ObjectRef& obj, const std::string& expecte
     std::ostringstream os;
     os << ": Expects type \"" << expected_type << "\", but gets \"" << obj->GetTypeKey()
        << "\" for object: " << obj;
-    throw dmlc::Error(os.str());
+    throw Error(os.str());
   }
   return ptr;
 }
@@ -87,7 +87,7 @@ static const TObj* ObjTypeCheck(const ObjectRef& obj, const std::string& expecte
 static TargetKind GetTargetKind(const String& name) {
   Optional<TargetKind> kind = TargetKind::Get(name);
   if (!kind.defined()) {
-    throw dmlc::Error(": Target kind \"" + name + "\" is not defined");
+    throw Error(": Target kind \"" + name + "\" is not defined");
   }
   return kind.value();
 }
@@ -98,10 +98,10 @@ static std::string RemovePrefixDashes(const std::string& s) {
   for (; n_dashes < len && s[n_dashes] == '-'; ++n_dashes) {
   }
   if (n_dashes == 0) {
-    throw dmlc::Error(": Attribute keys should start with '-', not an attribute key: " + s);
+    throw Error(": Attribute keys should start with '-', not an attribute key: " + s);
   }
   if (n_dashes >= len) {
-    throw dmlc::Error(": Not an attribute key: " + s);
+    throw Error(": Not an attribute key: " + s);
   }
   return s.substr(n_dashes);
 }
@@ -133,7 +133,7 @@ static int ParseKVPair(const std::string& s, const std::string& s_next, std::str
     result_k = s.substr(0, pos);
     result_v = s.substr(pos + 1);
     if (result_k.empty() || result_v.empty()) {
-      throw dmlc::Error(": Empty attribute key or value in \"" + s + "\"");
+      throw Error(": Empty attribute key or value in \"" + s + "\"");
     }
     return 1;
   } else if (!s_next.empty() && s_next[0] != '-') {
@@ -163,7 +163,7 @@ const TargetKindNode::ValueTypeInfo& TargetInternal::FindTypeInfo(const TargetKi
       }
       os << kv.first;
     }
-    throw dmlc::Error(os.str());
+    throw Error(os.str());
   }
   return it->second;
 }
@@ -177,14 +177,14 @@ ObjectRef TargetInternal::ParseType(const std::string& str,
     // Parsing integer
     int v;
     if (!(is >> v)) {
-      throw dmlc::Error(": Cannot parse into type \"Integer\" from string: " + str);
+      throw Error(": Cannot parse into type \"Integer\" from string: " + str);
     }
     return Integer(v);
   } else if (info.type_index == String::ContainerType::_GetOrAllocRuntimeTypeIndex()) {
     // Parsing string
     std::string v;
     if (!(is >> v)) {
-      throw dmlc::Error(": Cannot parse into type \"String\" from string: " + str);
+      throw Error(": Cannot parse into type \"String\" from string: " + str);
     }
     return String(v);
   } else if (info.type_index == Target::ContainerType::_GetOrAllocRuntimeTypeIndex()) {
@@ -197,14 +197,14 @@ ObjectRef TargetInternal::ParseType(const std::string& str,
       try {
         ObjectRef parsed = TargetInternal::ParseType(substr, *info.key);
         result.push_back(parsed);
-      } catch (const dmlc::Error& e) {
+      } catch (const Error& e) {
         std::string index = "[" + std::to_string(result.size()) + "]";
-        throw dmlc::Error(index + e.what());
+        throw Error(index + e.what());
       }
     }
     return Array<ObjectRef>(result);
   }
-  throw dmlc::Error(": Unsupported type \"" + info.type_key + "\" for parsing from string: " + str);
+  throw Error(": Unsupported type \"" + info.type_key + "\" for parsing from string: " + str);
 }
 
 ObjectRef TargetInternal::ParseType(const ObjectRef& obj,
@@ -224,15 +224,14 @@ ObjectRef TargetInternal::ParseType(const ObjectRef& obj,
     } else if (const auto* ptr = obj.as<MapNode>()) {
       for (const auto& kv : *ptr) {
         if (!kv.first->IsInstance<StringObj>()) {
-          throw dmlc::Error(": Target object requires key of dict to be str, but get: " +
-                            kv.first->GetTypeKey());
+          throw Error(": Target object requires key of dict to be str, but get: " +
+                      kv.first->GetTypeKey());
         }
       }
       Map<String, ObjectRef> config = GetRef<Map<String, ObjectRef>>(ptr);
       return Target(TargetInternal::FromConfig({config.begin(), config.end()}));
     }
-    throw dmlc::Error(": Expect type 'dict' or 'str' to construct Target, but get: " +
-                      obj->GetTypeKey());
+    throw Error(": Expect type 'dict' or 'str' to construct Target, but get: " + obj->GetTypeKey());
   } else if (info.type_index == ArrayNode::_GetOrAllocRuntimeTypeIndex()) {
     // Parsing array
     const auto* array = ObjTypeCheck<ArrayNode>(obj, "Array");
@@ -240,9 +239,9 @@ ObjectRef TargetInternal::ParseType(const ObjectRef& obj,
     for (const ObjectRef& e : *array) {
       try {
         result.push_back(TargetInternal::ParseType(e, *info.key));
-      } catch (const dmlc::Error& e) {
+      } catch (const Error& e) {
         std::string index = '[' + std::to_string(result.size()) + ']';
-        throw dmlc::Error(index + e.what());
+        throw Error(index + e.what());
       }
     }
     return Array<ObjectRef>(result);
@@ -254,17 +253,17 @@ ObjectRef TargetInternal::ParseType(const ObjectRef& obj,
       ObjectRef key, val;
       try {
         key = TargetInternal::ParseType(kv.first, *info.key);
-      } catch (const dmlc::Error& e) {
+      } catch (const Error& e) {
         std::ostringstream os;
         os << "'s key \"" << key << "\"" << e.what();
-        throw dmlc::Error(os.str());
+        throw Error(os.str());
       }
       try {
         val = TargetInternal::ParseType(kv.second, *info.val);
-      } catch (const dmlc::Error& e) {
+      } catch (const Error& e) {
         std::ostringstream os;
         os << "[\"" << key << "\"]" << e.what();
-        throw dmlc::Error(os.str());
+        throw Error(os.str());
       }
       result[key] = val;
     }
@@ -275,7 +274,7 @@ ObjectRef TargetInternal::ParseType(const ObjectRef& obj,
     os << ": Parsing type \"" << info.type_key
        << "\" is not supported for the given object of type \"" << obj->GetTypeKey()
        << "\". The object is: " << obj;
-    throw dmlc::Error(os.str());
+    throw Error(os.str());
   }
   return obj;
 }
@@ -355,7 +354,7 @@ Target::Target(const String& tag_or_config_or_target_str) {
   ObjectPtr<Object> target;
   try {
     target = TargetInternal::FromString(tag_or_config_or_target_str);
-  } catch (const dmlc::Error& e) {
+  } catch (const Error& e) {
     LOG(FATAL) << "ValueError" << e.what()
                << ". Target creation from string failed: " << tag_or_config_or_target_str;
   }
@@ -366,7 +365,7 @@ Target::Target(const Map<String, ObjectRef>& config) {
   ObjectPtr<Object> target;
   try {
     target = TargetInternal::FromConfig({config.begin(), config.end()});
-  } catch (const dmlc::Error& e) {
+  } catch (const Error& e) {
     LOG(FATAL) << "ValueError" << e.what()
                << ". Target creation from config dict failed: " << config;
   }
@@ -496,7 +495,7 @@ ObjectPtr<Object> TargetInternal::FromConfigString(const String& config_str) {
                     "if the python module is properly loaded";
   Optional<Map<String, ObjectRef>> config = (*loader)(config_str);
   if (!config.defined()) {
-    throw dmlc::Error(": Cannot load config dict with python JSON loader");
+    throw Error(": Cannot load config dict with python JSON loader");
   }
   return TargetInternal::FromConfig({config.value().begin(), config.value().end()});
 }
@@ -514,7 +513,7 @@ ObjectPtr<Object> TargetInternal::FromRawString(const String& target_str) {
     }
   }
   if (name.empty()) {
-    throw dmlc::Error(": Cannot parse empty target string");
+    throw Error(": Cannot parse empty target string");
   }
   // Create the target config
   std::unordered_map<String, ObjectRef> config = {{"kind", String(name)}};
@@ -525,17 +524,17 @@ ObjectPtr<Object> TargetInternal::FromRawString(const String& target_str) {
       // Parse key-value pair
       std::string s_next = (iter + 1 < options.size()) ? options[iter + 1] : "";
       iter += ParseKVPair(RemovePrefixDashes(options[iter]), s_next, &key, &value);
-    } catch (const dmlc::Error& e) {
-      throw dmlc::Error(": Error when parsing target" + std::string(e.what()));
+    } catch (const Error& e) {
+      throw Error(": Error when parsing target" + std::string(e.what()));
     }
     try {
       // check if `key` has been used
       if (config.count(key)) {
-        throw dmlc::Error(": The key \"" + key + "\" appears more than once");
+        throw Error(": The key \"" + key + "\" appears more than once");
       }
       config[key] = TargetInternal::ParseType(value, TargetInternal::FindTypeInfo(kind, key));
-    } catch (const dmlc::Error& e) {
-      throw dmlc::Error(": Error when parsing target[\"" + key + "\"]" + e.what());
+    } catch (const Error& e) {
+      throw Error(": Error when parsing target[\"" + key + "\"]" + e.what());
     }
   }
   return TargetInternal::FromConfig(config);
@@ -554,11 +553,11 @@ ObjectPtr<Object> TargetInternal::FromConfig(std::unordered_map<String, ObjectRe
       target->kind = GetTargetKind(GetRef<String>(kind));
       config.erase(kKind);
     } else {
-      throw dmlc::Error(": Expect type of field \"kind\" is String, but get type: " +
-                        config[kKind]->GetTypeKey());
+      throw Error(": Expect type of field \"kind\" is String, but get type: " +
+                  config[kKind]->GetTypeKey());
     }
   } else {
-    throw dmlc::Error(": Field \"kind\" is not found");
+    throw Error(": Field \"kind\" is not found");
   }
   // parse "tag"
   if (config.count(kTag)) {
@@ -566,8 +565,8 @@ ObjectPtr<Object> TargetInternal::FromConfig(std::unordered_map<String, ObjectRe
       target->tag = GetRef<String>(tag);
       config.erase(kTag);
     } else {
-      throw dmlc::Error(": Expect type of field \"tag\" is String, but get type: " +
-                        config[kTag]->GetTypeKey());
+      throw Error(": Expect type of field \"tag\" is String, but get type: " +
+                  config[kTag]->GetTypeKey());
     }
   } else {
     target->tag = "";
@@ -582,15 +581,15 @@ ObjectPtr<Object> TargetInternal::FromConfig(std::unordered_map<String, ObjectRe
           if (const auto* key = e.as<StringObj>()) {
             keys.push_back(GetRef<String>(key));
           } else {
-            throw dmlc::Error(
+            throw Error(
                 ": Expect 'keys' to be an array of strings, but it "
                 "contains an element of type: " +
                 e->GetTypeKey());
           }
         }
       } else {
-        throw dmlc::Error(": Expect type of field \"keys\" is Array, but get type: " +
-                          config[kKeys]->GetTypeKey());
+        throw Error(": Expect type of field \"keys\" is Array, but get type: " +
+                    config[kKeys]->GetTypeKey());
       }
     }
     // add device name
@@ -615,8 +614,8 @@ ObjectPtr<Object> TargetInternal::FromConfig(std::unordered_map<String, ObjectRe
     try {
       const TargetKindNode::ValueTypeInfo& info = TargetInternal::FindTypeInfo(target->kind, key);
       attrs[key] = TargetInternal::ParseType(value, info);
-    } catch (const dmlc::Error& e) {
-      throw dmlc::Error(": Error when parsing target[\"" + key + "\"]" + e.what());
+    } catch (const Error& e) {
+      throw Error(": Error when parsing target[\"" + key + "\"]" + e.what());
     }
   }
   // parse host
diff --git a/tests/cpp/ir_functor_test.cc b/tests/cpp/ir_functor_test.cc
index 1f7d18f747ea..9e8595d6809c 100644
--- a/tests/cpp/ir_functor_test.cc
+++ b/tests/cpp/ir_functor_test.cc
@@ -125,7 +125,7 @@ TEST(IRF, ExprTransform) {
   try {
     f(z - 1, 2);
     LOG(FATAL) << "should fail";
-  } catch (dmlc::Error&) {
+  } catch (Error&) {
   }
 }
 
diff --git a/tests/cpp/parallel_for_test.cc b/tests/cpp/parallel_for_test.cc
index bf5fe94b83ff..a4549344bd11 100644
--- a/tests/cpp/parallel_for_test.cc
+++ b/tests/cpp/parallel_for_test.cc
@@ -19,7 +19,7 @@
 
 #include <dmlc/logging.h>
 #include <gtest/gtest.h>
-#include <tvm/support/logging.h>
+#include <tvm/runtime/logging.h>
 #include <tvm/support/parallel_for.h>
 
 #include <vector>
diff --git a/tests/lint/check_file_type.py b/tests/lint/check_file_type.py
index ab51b6c79c83..f5c0de0a50b0 100644
--- a/tests/lint/check_file_type.py
+++ b/tests/lint/check_file_type.py
@@ -131,6 +131,8 @@
     # microTVM Virtual Machines
     "apps/microtvm/reference-vm/zephyr/Vagrantfile",
     "apps/microtvm/reference-vm/zephyr/base-box/Vagrantfile.packer-template",
+    # patch file for libbacktrace
+    "cmake/modules/libbacktrace_macos.patch",
 }
 
 
diff --git a/tests/python/relay/test_ir_parser.py b/tests/python/relay/test_ir_parser.py
index 62e52abefeb4..8b6b39e3df15 100644
--- a/tests/python/relay/test_ir_parser.py
+++ b/tests/python/relay/test_ir_parser.py
@@ -827,8 +827,8 @@ def test_import_grad():
     mod.import_from_std("gradient.rly")
 
 
-def test_resnet():
-    mod, _ = relay.testing.resnet.get_workload()
+def test_mlp():
+    mod, _ = relay.testing.mlp.get_workload(1)
     text = mod.astext()
     parsed_mod = tvm.parser.parse(text)
     tvm.ir.assert_structural_equal(mod, parsed_mod)
@@ -850,8 +850,8 @@ def inline_params(mod, params):
     return mod
 
 
-def test_resnet_inlined_params():
-    mod, params = relay.testing.resnet.get_workload()
+def test_mlp_inlined_params():
+    mod, params = relay.testing.mlp.get_workload(1)
     mod = inline_params(mod, params)
     mod = relay.transform.InferType()(mod)
     text = mod.astext()
diff --git a/tests/python/relay/test_ir_text_printer.py b/tests/python/relay/test_ir_text_printer.py
index 72a243dbbb67..b2ae28649e6a 100644
--- a/tests/python/relay/test_ir_text_printer.py
+++ b/tests/python/relay/test_ir_text_printer.py
@@ -181,11 +181,6 @@ def test_squeezenet():
         astext(net)
 
 
-def test_vgg():
-    net, _ = tvm.relay.testing.vgg.get_workload(batch_size=1)
-    astext(net)
-
-
 def test_densenet():
     net, _ = tvm.relay.testing.densenet.get_workload(batch_size=1)
     astext(net)
diff --git a/tests/scripts/task_build.sh b/tests/scripts/task_build.sh
index d8e35ebd4de3..845b7153ae20 100755
--- a/tests/scripts/task_build.sh
+++ b/tests/scripts/task_build.sh
@@ -16,4 +16,4 @@
 # specific language governing permissions and limitations
 # under the License.
 export VTA_HW_PATH=`pwd`/3rdparty/vta-hw
-cd $1 && cmake .. && make $2 && cd ..
+cd $1 && cmake .. -DCMAKE_BUILD_TYPE=RelWithDebInfo && make $2 && cd ..
diff --git a/tutorials/auto_scheduler/tune_network_cuda.py b/tutorials/auto_scheduler/tune_network_cuda.py
index 5ed3ceef5ba0..bc88457f94f9 100644
--- a/tutorials/auto_scheduler/tune_network_cuda.py
+++ b/tutorials/auto_scheduler/tune_network_cuda.py
@@ -252,7 +252,7 @@ def run_tuning():
 #   The last line also prints the total number of measurement trials,
 #   total time spent on auto-tuning and the id of the next task to tune.
 #
-#   There will also be some "dmlc::Error"s and CUDA errors, because the
+#   There will also be some "tvm::Error"s and CUDA errors, because the
 #   auto-scheduler will try some invalid schedules.
 #   You can safely ignore them if the tuning can continue, because these
 #   errors are isolated from the main process.
diff --git a/tutorials/auto_scheduler/tune_network_mali.py b/tutorials/auto_scheduler/tune_network_mali.py
index ca1067b27c80..2bce968771e3 100644
--- a/tutorials/auto_scheduler/tune_network_mali.py
+++ b/tutorials/auto_scheduler/tune_network_mali.py
@@ -329,7 +329,7 @@ def tune_and_evaluate():
 #   The last line also prints the total number of measurement trials,
 #   total time spent on auto-tuning and the id of the next task to tune.
 #
-#   There will also be some "dmlc::Error"s errors, because the
+#   There will also be some "tvm::Error"s errors, because the
 #   auto-scheduler will try some invalid schedules.
 #   You can safely ignore them if the tuning can continue, because these
 #   errors are isolated from the main process.
diff --git a/tutorials/auto_scheduler/tune_network_x86.py b/tutorials/auto_scheduler/tune_network_x86.py
index 8526abbbe6ca..2b47c64729e0 100644
--- a/tutorials/auto_scheduler/tune_network_x86.py
+++ b/tutorials/auto_scheduler/tune_network_x86.py
@@ -251,7 +251,7 @@ def run_tuning():
 #   The last line also prints the total number of measurement trials,
 #   total time spent on auto-tuning and the id of the next task to tune.
 #
-#   There will also be some "dmlc::Error"s errors, because the
+#   There will also be some "tvm::Error"s errors, because the
 #   auto-scheduler will try some invalid schedules.
 #   You can safely ignore them if the tuning can continue, because these
 #   errors are isolated from the main process.
diff --git a/web/emcc/tvmjs_support.cc b/web/emcc/tvmjs_support.cc
index b72caad1e3df..12f930f491a5 100644
--- a/web/emcc/tvmjs_support.cc
+++ b/web/emcc/tvmjs_support.cc
@@ -25,11 +25,9 @@
  */
 
 // configurations for the dmlc log.
-#define DMLC_LOG_CUSTOMIZE 0
-#define DMLC_LOG_STACK_TRACE 0
-#define DMLC_LOG_DEBUG 0
-#define DMLC_LOG_NODATE 1
-#define DMLC_LOG_FATAL_THROW 0
+#define TVM_LOG_DEBUG 0
+#define DMLC_USE_LOGGING_LIBRARY <tvm/runtime/logging.h>
+#define TVM_BACKTRACE_DISABLED 1
 
 #include <tvm/runtime/c_runtime_api.h>
 #include <tvm/runtime/container.h>
diff --git a/web/emcc/wasm_runtime.cc b/web/emcc/wasm_runtime.cc
index 214c1883f874..0b14ef6476d2 100644
--- a/web/emcc/wasm_runtime.cc
+++ b/web/emcc/wasm_runtime.cc
@@ -23,14 +23,12 @@
  */
 
 // configurations for the dmlc log.
-#define DMLC_LOG_CUSTOMIZE 0
-#define DMLC_LOG_STACK_TRACE 0
-#define DMLC_LOG_DEBUG 0
-#define DMLC_LOG_NODATE 1
-#define DMLC_LOG_FATAL_THROW 0
+#define TVM_LOG_DEBUG 0
+#define DMLC_USE_LOGGING_LIBRARY <tvm/runtime/logging.h>
+#define TVM_BACKTRACE_DISABLED 1
 
-#include <dmlc/logging.h>
 #include <tvm/runtime/c_runtime_api.h>
+#include <tvm/runtime/logging.h>
 
 #include "src/runtime/c_runtime_api.cc"
 #include "src/runtime/cpu_device_api.cc"
diff --git a/web/emcc/webgpu_runtime.cc b/web/emcc/webgpu_runtime.cc
index 62b87af01774..01e42ef3faa8 100644
--- a/web/emcc/webgpu_runtime.cc
+++ b/web/emcc/webgpu_runtime.cc
@@ -22,12 +22,10 @@
  * \brief WebGPU runtime based on the TVM JS.
  */
 
-// configurations for the dmlc log.
-#define DMLC_LOG_CUSTOMIZE 0
-#define DMLC_LOG_STACK_TRACE 0
-#define DMLC_LOG_DEBUG 0
-#define DMLC_LOG_NODATE 1
-#define DMLC_LOG_FATAL_THROW 0
+// configurations for tvm logging.
+#define TVM_LOG_DEBUG 0
+#define DMLC_USE_LOGGING_LIBRARY <tvm/runtime/logging.h>
+#define TVM_BACKTRACE_DISABLED 1
 
 #include <dmlc/thread_local.h>
 #include <tvm/runtime/c_runtime_api.h>
@@ -35,12 +33,27 @@
 #include <tvm/runtime/packed_func.h>
 #include <tvm/runtime/registry.h>
 
+#include <iostream>
+#include <string>
+
 #include "../../src/runtime/meta_data.h"
 #include "../../src/runtime/vulkan/vulkan_shader.h"
 #include "../../src/runtime/workspace_pool.h"
 
 namespace tvm {
 namespace runtime {
+namespace detail {
+// Override logging mechanism
+void LogFatalImpl(const std::string& file, int lineno, const std::string& message) {
+  std::cerr << file << ":" << lineno << ": " << message << std::endl;
+  abort();
+}
+
+void LogMessageImpl(const std::string& file, int lineno, const std::string& message) {
+  std::cerr << file << ":" << lineno << ": " << message << std::endl;
+}
+
+}  // namespace detail
 
 /*! \brief Thread local workspace */
 class WebGPUThreadEntry {

From 45442edcf4d1e9d3e8a37b4530d8f634a6f0cc05 Mon Sep 17 00:00:00 2001
From: Altan Haan <ahaan@octoml.ai>
Date: Thu, 18 Mar 2021 07:04:48 -0700
Subject: [PATCH 356/357] [Relay][Training][Pass] Factor out first-order AD to
 a module pass (#7677)

---
 python/tvm/relay/transform/transform.py       |  26 +-
 src/relay/transforms/first_order_gradient.cc  | 309 ++++++++++++++++++
 src/relay/transforms/gradient.h               |  54 +++
 .../{gradient.cc => higher_order_gradient.cc} | 274 +---------------
 4 files changed, 391 insertions(+), 272 deletions(-)
 create mode 100644 src/relay/transforms/first_order_gradient.cc
 create mode 100644 src/relay/transforms/gradient.h
 rename src/relay/transforms/{gradient.cc => higher_order_gradient.cc} (64%)

diff --git a/python/tvm/relay/transform/transform.py b/python/tvm/relay/transform/transform.py
index b61f209505d8..5b0e480f5f28 100644
--- a/python/tvm/relay/transform/transform.py
+++ b/python/tvm/relay/transform/transform.py
@@ -800,12 +800,36 @@ def gradient(expr, mod=None, mode="higher_order"):
       The transformed expression.
     """
     if mode == "first_order":
-        return _ffi_api.first_order_gradient(expr, mod)
+        warnings.warn(
+            "using transform.gradient for first-order AD is deprecated, please use the"
+            "FirstOrderGradient module pass",
+            DeprecationWarning,
+        )
+        if mod is not None:
+            raise RuntimeError(
+                "to run first-order AD on a module, please use the FirstOrderGradient module pass."
+            )
+        return FirstOrderGradient()(tvm.IRModule.from_expr(expr))["main"]
     if mode == "higher_order":
         return _ffi_api.gradient(expr, mod)
     raise Exception("unknown mode")
 
 
+def FirstOrderGradient():
+    """
+    Transforms all global functions in the module to return the original result, paired with the
+    gradients of the inputs. This pass transforms each global function independently and does not
+    support interprocedural AD. Additionally, this pass does not support any control-flow or
+    references, and should only be used on pure data-flow graphs.
+
+    Returns
+    -------
+    ret : tvm.transform.Pass
+        The registered FirstOrderGradient pass.
+    """
+    return _ffi_api.FirstOrderGradient()
+
+
 def Defunctionalization(func, mod):
     """
     Performs defunctionalization on func,
diff --git a/src/relay/transforms/first_order_gradient.cc b/src/relay/transforms/first_order_gradient.cc
new file mode 100644
index 000000000000..55714592ded7
--- /dev/null
+++ b/src/relay/transforms/first_order_gradient.cc
@@ -0,0 +1,309 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file first_order_gradient.cc
+ * \brief First-order Automatic Differentiation in Relay for pure dataflow graphs.
+ */
+#include <tvm/ir/type_functor.h>
+#include <tvm/relay/analysis.h>
+#include <tvm/relay/dataflow_matcher.h>
+#include <tvm/relay/expr_functor.h>
+#include <tvm/relay/feature.h>
+#include <tvm/relay/transform.h>
+#include <tvm/te/operation.h>
+
+#include "gradient.h"
+#include "let_list.h"
+#include "pass_utils.h"
+#include "pattern_utils.h"
+
+namespace tvm {
+namespace relay {
+
+template <typename F>
+Expr MultiFactory(const Type& t, F factory, DiagnosticContext diag_ctx) {
+  if (auto* tt = t.as<TensorTypeNode>()) {
+    return factory(tt->shape, tt->dtype);
+  } else if (auto* tt = t.as<TupleTypeNode>()) {
+    std::vector<Expr> res;
+    for (size_t i = 0; i < tt->fields.size(); i++) {
+      res.push_back(MultiFactory(tt->fields[i], factory, diag_ctx));
+    }
+    return Tuple(res);
+  } else {
+    diag_ctx.EmitFatal(Diagnostic::Error(t->span)
+                       << "could not build tensors using factory for type " << PrettyPrint(t));
+    throw;
+  }
+}
+
+template <typename F, typename F2>
+Expr MultiFactoryLike(const Expr& e, const Type& t, F factory, F2 factory_like,
+                      DiagnosticContext diag_ctx) {
+  if (t.as<TensorTypeNode>()) {
+    return factory_like(e);
+  } else if (auto* tt = t.as<TupleTypeNode>()) {
+    return MultiFactory(t, factory, diag_ctx);
+  } else {
+    diag_ctx.EmitFatal(Diagnostic::Error(t->span)
+                       << "could not build tensors using factory for type " << PrettyPrint(t));
+    throw;
+  }
+}
+
+/*! \brief A fragment of the program being built by the automatic differentation
+ *  pass.
+ */
+struct ADValueNode {
+  virtual ~ADValueNode() {}
+  template <typename T>
+  T& get() {
+    auto ret = dynamic_cast<T*>(this);
+    ICHECK(ret) << "cannot downcast";
+    return *ret;
+  }
+};
+
+using ADValue = std::shared_ptr<ADValueNode>;
+
+/*! \brief AD over a program which generates a tensor output. */
+struct ADTensor : ADValueNode {
+  Expr forward;
+  mutable Expr reverse;  // must be a variable to avoid duplication
+  ADTensor(LetList* ll, const Expr& forward, DiagnosticContext diag_ctx)
+      : forward(ll->Push(forward)),
+        reverse(ll->Push(
+            MultiFactoryLike(this->forward, forward->checked_type(), Zeros, ZerosLike, diag_ctx))) {
+    this->forward->checked_type_ = forward->checked_type();
+  }
+};
+
+/*! \brief A staged representation of the program, we reflect
+ * Relay functions into a function over fragments of AD. We
+ * can compute away this function to obtain a reverse mode program.
+ */
+struct ADFunction : ADValueNode {
+  // (ad_args, orig) -> ad_ret
+  using ADFunctionType = ADValue(const std::vector<ADValue>&, const Call&);
+  std::function<ADFunctionType> func;
+  explicit ADFunction(const std::function<ADFunctionType>& func) : func(func) {}
+};
+
+struct FirstOrderReverseAD : ExprFunctor<ADValue(const Expr&)> {
+  const OpAttrMap<FPrimalGradient> rev_map = Op::GetAttrMap<FPrimalGradient>("FPrimalGradient");
+  std::vector<std::function<void(LetList* ll)>> backprop_actions;
+  // we assume no closure so no need for lexical scoping
+  std::unordered_map<Expr, ADValue, ObjectPtrHash, ObjectPtrEqual> env;
+  LetList* ll;
+  DiagnosticContext diag_ctx;
+
+  FirstOrderReverseAD(LetList* ll, DiagnosticContext diag_ctx) : ll(ll), diag_ctx(diag_ctx) {}
+
+  ADValue VisitExpr(const Expr& n) final {
+    if (env.count(n)) {
+      return env.at(n);
+    }
+    auto ret = ExprFunctor::VisitExpr(n);
+    env[n] = ret;
+    return ret;
+  }
+
+  static Expr LiftedAdd(const Type& t, const Expr& x, const Expr& y, LetList* ll) {
+    if (t.as<TensorTypeNode>()) {
+      return ll->Push(Add(x, y));
+    } else if (auto* tt = t.as<TupleTypeNode>()) {
+      Array<Expr> fields;
+      for (size_t i = 0; i < tt->fields.size(); ++i) {
+        fields.push_back(
+            LiftedAdd(tt->fields[i], ll->Push(GetField(x, i)), ll->Push(GetField(y, i)), ll));
+      }
+      return ll->Push(Tuple(fields));
+    } else {
+      LOG(FATAL) << "cannot lift addition for type " << PrettyPrint(t);
+      throw;
+    }
+  }
+
+  ADValue VisitExpr_(const OpNode* op) final {
+    Op op_ref = GetRef<Op>(op);
+    if (!rev_map.count(op_ref)) {
+      diag_ctx.EmitFatal(Diagnostic::Error(op->span)
+                         << "the operator " << op->name << " does not have a registered gradient.");
+    }
+    return std::make_shared<ADFunction>([this, op_ref](const std::vector<ADValue>& ad_args,
+                                                       const Call& orig) {
+      std::vector<Expr> orig_args;
+      for (const ADValue& adval : ad_args) {
+        orig_args.push_back(adval->get<ADTensor>().forward);
+      }
+      auto orig_new = Call(op_ref, orig_args, orig->attrs, orig->type_args);
+      orig_new->checked_type_ = orig->checked_type();
+      auto ret = std::make_shared<ADTensor>(ll, orig_new, diag_ctx);
+      backprop_actions.push_back([this, ad_args, orig_new, ret, op_ref](LetList* ll) {
+        tvm::Array<Expr> rev = rev_map[op_ref](orig_new, ret->reverse);
+        if (ad_args.size() != rev.size()) {
+          diag_ctx.EmitFatal(Diagnostic::Error(op_ref->span)
+                             << "arity mismatch for operator " << op_ref->name
+                             << " and its registered gradient: expected " << ad_args.size()
+                             << " but got " << rev.size() << " gradients.");
+        }
+        for (size_t i = 0; i < ad_args.size(); ++i) {
+          auto& ad_arg = ad_args[i]->get<ADTensor>();
+          ad_arg.reverse = LiftedAdd(ad_arg.forward->checked_type(), ad_arg.reverse, rev[i], ll);
+        }
+      });
+      return ret;
+    });
+  }
+
+  ADValue VisitExpr_(const TupleGetItemNode* op) final {
+    Expr e = GetRef<Expr>(op);
+    ADValue tup = VisitExpr(op->tuple);
+    auto tt = op->tuple->checked_type().as<TupleTypeNode>();
+    size_t idx = op->index;
+    auto ret = std::make_shared<ADTensor>(ll, e, diag_ctx);
+    backprop_actions.push_back([tup, tt, idx, ret](LetList* ll) {
+      auto& ad_tup = tup->get<ADTensor>();
+      std::vector<Expr> updated_grads;
+      for (size_t i = 0; i < tt->fields.size(); ++i) {
+        Expr grad_pre = GetField(ad_tup.reverse, i);
+        updated_grads.push_back(i != idx ? grad_pre
+                                         : LiftedAdd(tt->fields[i], grad_pre, ret->reverse, ll));
+      }
+      ad_tup.reverse = ll->Push(Tuple(updated_grads));
+    });
+    return ret;
+  }
+
+  ADValue VisitExpr_(const TupleNode* op) final {
+    Expr e = GetRef<Expr>(op);
+    std::vector<ADValue> fields;
+    for (const auto& f : op->fields) {
+      fields.push_back(VisitExpr(f));
+    }
+    auto tt = op->checked_type().as<TupleTypeNode>();
+    auto ret = std::make_shared<ADTensor>(ll, e, diag_ctx);
+    backprop_actions.push_back([fields, tt, ret](LetList* ll) {
+      for (size_t i = 0; i < fields.size(); ++i) {
+        auto& ad_field = fields[i]->get<ADTensor>();
+        ad_field.reverse =
+            LiftedAdd(tt->fields[i], ad_field.reverse, GetField(ret->reverse, i), ll);
+      }
+    });
+    return ret;
+  }
+
+  ADValue VisitExpr_(const ConstantNode* op) final {
+    Expr e = GetRef<Expr>(op);
+    return std::make_shared<ADTensor>(ll, e, diag_ctx);
+  }
+
+  ADValue VisitExpr_(const CallNode* op) final {
+    ADValue f = VisitExpr(op->op);
+    std::vector<ADValue> args;
+    for (const auto& arg : op->args) {
+      args.push_back(VisitExpr(arg));
+    }
+    return f->get<ADFunction>().func(args, GetRef<Call>(op));
+  }
+
+  ADValue VisitExpr_(const FunctionNode* op) final {
+    Function f = GetRef<Function>(op);
+    // todo: assert no closure
+    return std::make_shared<ADFunction>(
+        [this, f](const std::vector<ADValue>& ad_args, const Call& orig) {
+          ICHECK_EQ(f->params.size(), ad_args.size());
+          for (size_t i = 0; i < f->params.size(); ++i) {
+            env[f->params[i]] = ad_args[i];
+          }
+          return VisitExpr(f->body);
+        });
+  }
+
+  // Var will always be in env, handled in VisitExpr (without _), so we don't need
+  // to implement its VisitExpr_.
+};
+
+namespace transform {
+
+Pass FirstOrderGradient() {
+  runtime::TypedPackedFunc<IRModule(IRModule, PassContext)> f = [](IRModule mod, PassContext ctx) {
+    CheckFeature(
+        mod, FeatureSet({fVar, fConstant, fTuple, fTupleGetItem, fFunction, fOp, fCall, fGraph}));
+    IRModule ad_mod = GetRef<IRModule>(mod.CopyOnWrite());
+    DiagnosticContext diag_ctx = DiagnosticContext::Default(ad_mod);
+
+    if (mod->functions.size() > 1) {
+      LOG(WARNING) << "IRModule contains multiple global functions: first-order AD will transform "
+                      "them indepedently!";
+    }
+
+    for (const auto& pr : mod->functions) {
+      const FunctionNode* func = pr.second.as<FunctionNode>();
+      if (!func) {
+        diag_ctx.Emit(Diagnostic::Warning(pr.second->span)
+                      << "AD can only be performed on Relay functions, skipping "
+                      << PrettyPrint(pr.first));
+      }
+      if (func->type_params.size() > 0) {
+        diag_ctx.EmitFatal(Diagnostic::Error(pr.second->span)
+                           << "first-order AD does not support polymorphism yet.");
+      }
+      Expr body = LetList::With([&](LetList* ll) {
+        FirstOrderReverseAD reverse_ad(ll, diag_ctx);
+        ADValue rev = reverse_ad(pr.second);
+        std::vector<ADValue> args;
+        for (const auto& p : func->params) {
+          args.push_back(std::make_shared<ADTensor>(ll, p, diag_ctx));
+        }
+        Call placeholder = Call(GetRef<Function>(func), {});
+        placeholder->checked_type_ = func->checked_type().as<FuncTypeNode>()->ret_type;
+        auto grad_call = rev->get<ADFunction>().func(args, placeholder);
+        auto& res = grad_call->get<ADTensor>();
+        Expr grad_tuple = LetList::With([&](LetList* ll) {
+          res.reverse =
+              MultiFactoryLike(res.forward, res.forward->checked_type(), Ones, OnesLike, diag_ctx);
+          for (auto it = reverse_ad.backprop_actions.rbegin();
+               it != reverse_ad.backprop_actions.rend(); ++it) {
+            (*it)(ll);
+          }
+          std::vector<Expr> grads;
+          for (const auto& a : args) {
+            grads.push_back(a->get<ADTensor>().reverse);
+          }
+          return Tuple(grads);
+        });
+        return Pair(res.forward, grad_tuple);
+      });
+      ad_mod->Update(pr.first,
+                     Function(func->params, body, GradRetType(GetRef<Function>(func)), {}));
+    }
+
+    return ad_mod;
+  };
+  return CreateModulePass(f, 0, "FirstOrderGradient", {});
+}
+
+TVM_REGISTER_GLOBAL("relay._transform.FirstOrderGradient").set_body_typed(FirstOrderGradient);
+
+}  // namespace transform
+
+}  // namespace relay
+}  // namespace tvm
diff --git a/src/relay/transforms/gradient.h b/src/relay/transforms/gradient.h
new file mode 100644
index 000000000000..2e6ffbcc7c9e
--- /dev/null
+++ b/src/relay/transforms/gradient.h
@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file gradient.h
+ * \brief Utility functions for Automatic Differentiation in Relay.
+ */
+#ifndef TVM_RELAY_TRANSFORMS_GRADIENT_H_
+#define TVM_RELAY_TRANSFORMS_GRADIENT_H_
+
+#include <tvm/relay/expr.h>
+#include <tvm/relay/function.h>
+
+#include <vector>
+
+namespace tvm {
+namespace relay {
+
+inline Type GradRetType(const Function& f) {
+  // if type annotations are provided, we will construct a ret type;
+  // otherwise, leave it to be inferred
+  if (!f->ret_type.defined()) {
+    return Type();
+  }
+  std::vector<Type> vt;
+  for (const auto& p : f->params) {
+    if (!p->type_annotation.defined()) {
+      return Type();
+    }
+    vt.push_back(p->type_annotation);
+  }
+
+  return TupleType({f->ret_type, TupleType(vt)});
+}
+
+}  // namespace relay
+}  // namespace tvm
+#endif  // TVM_RELAY_TRANSFORMS_GRADIENT_H_
diff --git a/src/relay/transforms/gradient.cc b/src/relay/transforms/higher_order_gradient.cc
similarity index 64%
rename from src/relay/transforms/gradient.cc
rename to src/relay/transforms/higher_order_gradient.cc
index cd3a99655341..202275626d5d 100644
--- a/src/relay/transforms/gradient.cc
+++ b/src/relay/transforms/higher_order_gradient.cc
@@ -18,8 +18,8 @@
  */
 
 /*!
- * \file gradient.cc
- * \brief API for Automatic Differentiation for the Relay IR.
+ * \file higher_order_gradient.cc
+ * \brief Higher-order Automatic Differentiation in Relay IR, for non-graph programs.
  */
 #include <tvm/ir/type_functor.h>
 #include <tvm/relay/analysis.h>
@@ -28,6 +28,7 @@
 #include <tvm/relay/transform.h>
 #include <tvm/te/operation.h>
 
+#include "gradient.h"
 #include "let_list.h"
 #include "pass_utils.h"
 #include "pattern_utils.h"
@@ -64,13 +65,6 @@ using namespace tvm::runtime;
  * output. There are multiple implementation of AD in relay, with different characteristic. However,
  * they all transform the input expr according to WithGradientType.
  */
-Type WithGradientType(const Type&);
-
-/*! return an expression that represent differentiation of e (according to WithGradientType).
- *  This version only work on first order code without control flow.
- */
-Expr FirstOrderGradient(const Expr& e, const Optional<IRModule>& mod);
-
 Type WithGradientType(const Type& t) {
   // TODO(@M.K.): stricter checking
   auto ty = t.as<FuncTypeNode>();
@@ -94,268 +88,6 @@ Expr DeGlobal(const Optional<IRModule>& mod, const Expr& e) {
   }
 }
 
-/*! \brief A fragment of the program being built by the automatic differentation
- *  pass.
- */
-struct ADValueNode {
-  virtual ~ADValueNode() {}
-  template <typename T>
-  T& get() {
-    auto ret = dynamic_cast<T*>(this);
-    ICHECK(ret) << "cannot downcast";
-    return *ret;
-  }
-};
-
-template <typename F>
-Expr MultiFactory(const Type& t, F factory) {
-  if (auto* tt = t.as<TensorTypeNode>()) {
-    return factory(tt->shape, tt->dtype);
-  } else if (auto* tt = t.as<TupleTypeNode>()) {
-    std::vector<Expr> res;
-    for (size_t i = 0; i < tt->fields.size(); i++) {
-      res.push_back(MultiFactory(tt->fields[i], factory));
-    }
-    return Tuple(res);
-  } else {
-    LOG(FATAL) << "unsupported type to create tensors of: " << tt;
-    throw;
-  }
-}
-
-template <typename F, typename F2>
-Expr MultiFactoryLike(const Expr& e, const Type& t, F factory, F2 factory_like) {
-  if (t.as<TensorTypeNode>()) {
-    return factory_like(e);
-  } else if (auto* tt = t.as<TupleTypeNode>()) {
-    return MultiFactory(t, factory);
-  } else {
-    LOG(FATAL) << "unsupported type to tensors of: " << tt;
-    throw;
-  }
-}
-
-using ADValue = std::shared_ptr<ADValueNode>;
-
-/*! \brief AD over a program which generates a tensor output. */
-struct ADTensor : ADValueNode {
-  Expr forward;
-  mutable Expr reverse;  // must be a variable to avoid duplication
-  ADTensor(LetList* ll, const Expr& forward)
-      : forward(ll->Push(forward)),
-        reverse(
-            ll->Push(MultiFactoryLike(this->forward, forward->checked_type(), Zeros, ZerosLike))) {
-    this->forward->checked_type_ = forward->checked_type();
-  }
-};
-
-/*! \brief A staged representation of the program, we reflect
- * Relay functions into a function over fragments of AD. We
- * can compute away this function to obtain a reverse mode program.
- */
-struct ADFunction : ADValueNode {
-  std::function<ADValue(const Type&, const std::vector<ADValue>&, const Attrs&,
-                        const tvm::Array<Type>&)>
-      func;
-  explicit ADFunction(const std::function<ADValue(const Type&, const std::vector<ADValue>&,
-                                                  const Attrs&, const tvm::Array<Type>&)>& func)
-      : func(func) {}
-};
-
-struct FirstOrderReverseAD : ExprFunctor<ADValue(const Expr&)> {
-  using TBase = ExprFunctor<ADValue(const Expr&)>;
-  const OpAttrMap<FPrimalGradient> rev_map = Op::GetAttrMap<FPrimalGradient>("FPrimalGradient");
-  std::vector<std::function<void(LetList* ll)>> backprop_actions;
-  // we assume no closure so no need for lexical scoping
-  std::unordered_map<Expr, ADValue, ObjectPtrHash, ObjectPtrEqual> env;
-  LetList* ll;
-
-  FirstOrderReverseAD(LetList* ll) : ll(ll) {}
-
-  ADValue VisitExpr(const Expr& n) final {
-    if (env.count(n)) {
-      return env.at(n);
-    }
-    auto ret = TBase::VisitExpr(n);
-    env[n] = ret;
-    return ret;
-  }
-
-  Expr UpdateGrad(const Type& t, const Expr& arg, const Expr& grad, LetList* ll) {
-    if (t.as<TensorTypeNode>()) {
-      return ll->Push(Add(arg, grad));
-    } else if (auto* tt = t.as<TupleTypeNode>()) {
-      Array<Expr> updates;
-      for (size_t i = 0; i < tt->fields.size(); ++i) {
-        updates.push_back(this->UpdateGrad(tt->fields[i], ll->Push(GetField(arg, i)),
-                                           ll->Push(GetField(grad, i)), ll));
-      }
-      return ll->Push(Tuple(updates));
-    } else {
-      LOG(FATAL) << "unsupported arg type of operator: " << t;
-      throw;
-    }
-  }
-
-  ADValue VisitExpr_(const OpNode* op) final {
-    Op op_ref = GetRef<Op>(op);
-    ICHECK(rev_map.count(op_ref)) << op->name << " does not have reverse mode defined";
-    return std::make_shared<ADFunction>(
-        [this, op_ref](const Type& orig_type, const std::vector<ADValue>& args, const Attrs& attrs,
-                       const tvm::Array<Type>& type_args) {
-          std::vector<Expr> call_args;
-          for (const ADValue& adval : args) {
-            call_args.push_back(adval->get<ADTensor>().forward);
-          }
-          auto orig = Call(op_ref, call_args, attrs, type_args);
-          orig->checked_type_ = orig_type;
-          auto ret = std::make_shared<ADTensor>(ll, orig);
-          backprop_actions.push_back([this, args, orig, ret, op_ref](LetList* ll) {
-            tvm::Array<Expr> rev = rev_map[op_ref](orig, ret->reverse);
-            ICHECK(args.size() == rev.size());
-            for (size_t i = 0; i < args.size(); ++i) {
-              auto ad_arg = args[i]->get<ADTensor>();
-              auto ad_arg_type = ad_arg.forward->checked_type();
-              args[i]->get<ADTensor>().reverse =
-                  this->UpdateGrad(ad_arg_type, ad_arg.reverse, rev[i], ll);
-            }
-          });
-          return ret;
-        });
-  }
-
-  ADValue VisitExpr_(const TupleGetItemNode* op) final {
-    Expr e = GetRef<Expr>(op);
-    ADValue tup = VisitExpr(op->tuple);
-    auto tt = op->tuple->checked_type().as<TupleTypeNode>();
-    size_t size = tt->fields.size();
-    size_t idx = op->index;
-    auto ret = std::make_shared<ADTensor>(ll, e);
-    backprop_actions.push_back([tup, idx, size, ret](LetList* ll) {
-      auto rev = tup->get<ADTensor>().reverse;
-      // special-case Tuple, to avoid long chains of GetItem/Tuple,
-      // but we might have functions using tuples, so we don't know
-      // that the reverse node is always a tuple
-      std::vector<Expr> grfields;
-      if (auto tup_node = rev.as<TupleNode>()) {
-        for (size_t i = 0; i < size; ++i) {
-          grfields.push_back(i != idx ? tup_node->fields[i]
-                                      : Add(tup_node->fields[i], ret->reverse));
-        }
-      } else {
-        for (size_t i = 0; i < size; ++i) {
-          grfields.push_back(i != idx ? TupleGetItem(rev, i)
-                                      : Add(TupleGetItem(rev, i), ret->reverse));
-        }
-      }
-      tup->get<ADTensor>().reverse = ll->Push(Tuple(grfields));
-    });
-    return ret;
-  }
-
-  ADValue VisitExpr_(const TupleNode* op) final {
-    Expr e = GetRef<Expr>(op);
-    std::vector<ADValue> fields;
-    for (const auto& f : op->fields) {
-      fields.push_back(VisitExpr(f));
-    }
-    auto ret = std::make_shared<ADTensor>(ll, e);
-    backprop_actions.push_back([fields, ret](LetList* ll) {
-      for (size_t i = 0; i < fields.size(); ++i) {
-        fields[i]->get<ADTensor>().reverse =
-            ll->Push(Add(fields[i]->get<ADTensor>().reverse, TupleGetItem(ret->reverse, i)));
-      }
-    });
-    return ret;
-  }
-
-  ADValue VisitExpr_(const ConstantNode* op) final {
-    Expr e = GetRef<Expr>(op);
-    return std::make_shared<ADTensor>(ll, e);
-  }
-
-  ADValue VisitExpr_(const CallNode* op) final {
-    ADValue f = VisitExpr(op->op);
-    std::vector<ADValue> args;
-    for (const auto& arg : op->args) {
-      args.push_back(VisitExpr(arg));
-    }
-    return f->get<ADFunction>().func(op->checked_type(), args, op->attrs, op->type_args);
-  }
-
-  ADValue VisitExpr_(const FunctionNode* op) final {
-    Function f = GetRef<Function>(op);
-    // todo: assert no closure
-    return std::make_shared<ADFunction>(
-        [this, f](const Type& orig_type, const std::vector<ADValue>& args, const Attrs& attrs,
-                  const tvm::Array<Type>& type_args) {
-          ICHECK_EQ(f->params.size(), args.size());
-          for (size_t i = 0; i < f->params.size(); ++i) {
-            env[f->params[i]] = args[i];
-          }
-          return VisitExpr(f->body);
-        });
-  }
-
-  // Var will always be in env, handled in VisitExpr (without _), so we don't need
-  // to implement its VisitExpr_.
-};
-
-Type GradRetType(const Function& f) {
-  // if type annotations are provided, we will construct a ret type;
-  // otherwise, leave it to be inferred
-  if (!f->ret_type.defined()) {
-    return Type();
-  }
-  std::vector<Type> vt;
-  for (const auto& p : f->params) {
-    if (!p->type_annotation.defined()) {
-      return Type();
-    }
-    vt.push_back(p->type_annotation);
-  }
-
-  return TupleType({f->ret_type, TupleType(vt)});
-}
-
-Expr FirstOrderGradient(const Expr& re, const Optional<IRModule>& mod) {
-  // Currently we first remove any global functions for the first
-  // order case.
-  auto e = DeGlobal(mod, re);
-  auto f = e.as<FunctionNode>();
-  ICHECK(f) << "FOWithGradient expects its argument to be a function: " << f;
-  ICHECK(f->type_params.size() == 0) << "no polymorphism supported for now";
-
-  // We will then build a sequence of lets which implement reverse mode.
-  Expr body = LetList::With([&](LetList* ll) {
-    FirstOrderReverseAD reverse_ad(ll);
-    ADValue rev = reverse_ad(e);
-    std::vector<ADValue> args;
-    for (const auto& p : f->params) {
-      args.push_back(std::make_shared<ADTensor>(ll, p));
-    }
-    auto c = rev->get<ADFunction>().func(f->checked_type(), args, Attrs(), {});
-    const auto& res = c->get<ADTensor>();
-    Expr grad = LetList::With([&](LetList* ll) {
-      res.reverse = MultiFactoryLike(res.forward, res.forward->checked_type(), Ones, OnesLike);
-      for (auto it = reverse_ad.backprop_actions.rbegin(); it != reverse_ad.backprop_actions.rend();
-           ++it) {
-        (*it)(ll);
-      }
-      std::vector<Expr> grad_res;
-      for (const auto& a : args) {
-        grad_res.push_back(a->get<ADTensor>().reverse);
-      }
-      return Tuple(grad_res);
-    });
-    return Pair(res.forward, grad);
-  });
-
-  return Function(f->params, body, GradRetType(GetRef<Function>(f)), {});
-}
-
-TVM_REGISTER_GLOBAL("relay._transform.first_order_gradient").set_body_typed(FirstOrderGradient);
-
 static Type bpt = RelayRefType(FuncType({}, TupleType(Array<Type>()), {}, {}));
 
 struct ReverseADType : TypeMutator {

From 431a7d6c0b7e5ae71b411c500836b136322f9fbf Mon Sep 17 00:00:00 2001
From: Alexander Pivovarov <pivovaa@amazon.com>
Date: Thu, 18 Mar 2021 16:44:45 -0700
Subject: [PATCH 357/357] Default value for graph_runtime Init
 lookup_linked_param_func (#7676)

---
 src/runtime/graph/graph_runtime.cc | 5 +++--
 src/runtime/graph/graph_runtime.h  | 5 +++--
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/src/runtime/graph/graph_runtime.cc b/src/runtime/graph/graph_runtime.cc
index 7e98acb6fb3e..5c7b75696168 100644
--- a/src/runtime/graph/graph_runtime.cc
+++ b/src/runtime/graph/graph_runtime.cc
@@ -66,10 +66,11 @@ void GraphRuntime::Run() {
  * processor.
  * \param ctxs The context of the host and devices where graph nodes will be
  * executed on.
- * \param lookup_linked_param_func Linked parameter lookup function.
+ * \param lookup_linked_param_func Linked parameter lookup function. Default is nullptr.
  */
 void GraphRuntime::Init(const std::string& graph_json, tvm::runtime::Module module,
-                        const std::vector<TVMContext>& ctxs, PackedFunc lookup_linked_param_func) {
+                        const std::vector<TVMContext>& ctxs,
+                        const PackedFunc lookup_linked_param_func) {
   std::istringstream is(graph_json);
   dmlc::JSONReader reader(&is);
   this->Load(&reader);
diff --git a/src/runtime/graph/graph_runtime.h b/src/runtime/graph/graph_runtime.h
index a1e2ee3b5d74..e417d2aa4bfc 100644
--- a/src/runtime/graph/graph_runtime.h
+++ b/src/runtime/graph/graph_runtime.h
@@ -93,11 +93,12 @@ class TVM_DLL GraphRuntime : public ModuleNode {
    *  executed on.
    * \param lookup_linked_param_func If given, a PackedFunc invoked to lookup linked parameters
    *  by storage_id. If not given, linked parameters are looked-up using an internal implementation,
-   *  which is not compatible with RPCModules.
+   *  which is not compatible with RPCModules. Default is nullptr.
    */
 
   void Init(const std::string& graph_json, tvm::runtime::Module module,
-            const std::vector<TVMContext>& ctxs, const PackedFunc lookup_linked_param_func);
+            const std::vector<TVMContext>& ctxs,
+            const PackedFunc lookup_linked_param_func = nullptr);
 
   /*!
    * \brief Get the input index given the name of input.